diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index a644a77459..707ada7b01 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -11,6 +11,132 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr - **Added `amdsmi_get_gpu_revision()` to Python API** - This function retrieves the GPU revision ID. Available in `amdsmi_interface.py` as `amdsmi_get_gpu_revision()`. +- **Added gpuboard and baseboard temperatures to `amd-smi metric` command**. + - The metric command has been updated with various gpuboard and baseboard temperatures in degrees Celsius. Users can access these + values through the `-G/--gpu-board` or `-b/--base-board` options or obtain all of them as normal using the `amd-smi metric` command without + any options. If the hardware does not support gpuboard or baseboard temperatures, then the values will be hidden from the default `metric` view. + +```console +% amd-smi metric -b +GPU: 0 + BASEBOARD: + TEMPERATURE: + BASEBOARD_FIRST: 78 + BASEBOARD_UBB_FRONT: 55 + BASEBOARD_UBB_BACK: 49 + BASEBOARD_UBB_OAM7: 86 + BASEBOARD_UBB_IBC: 94 + BASEBOARD_UBB_UFPGA: 49 + BASEBOARD_UBB_OAM1: 78 + BASEBOARD_OAM_0_1_HSC: 54 + BASEBOARD_OAM_2_3_HSC: 32 + BASEBOARD_OAM_4_5_HSC: 14 + BASEBOARD_OAM_6_7_HSC: 85 + BASEBOARD_UBB_FPGA_0V72_VR: 43 + BASEBOARD_UBB_FPGA_3V3_VR: 41 + BASEBOARD_RETIMER_0_1_2_3_1V2_VR: 64 + BASEBOARD_RETIMER_4_5_6_7_1V2_VR: 56 + BASEBOARD_RETIMER_0_1_0V9_VR: 74 + BASEBOARD_RETIMER_4_5_0V9_VR: 34 + BASEBOARD_RETIMER_2_3_0V9_VR: 85 + BASEBOARD_RETIMER_6_7_0V9_VR: 92 + BASEBOARD_OAM_0_1_2_3_3V3_VR: 29 + BASEBOARD_OAM_4_5_6_7_3V3_VR: 13 + BASEBOARD_IBC_HSC: 41 + BASEBOARD_IBC: 43 + +% amd-smi metric -G +GPU: 0 + GPUBOARD: + TEMPERATURE: + GPUBOARD_NODE_FIRST: 43 + GPUBOARD_NODE_OAM_X_IBC: 24 + GPUBOARD_NODE_OAM_X_IBC_2: 56 + GPUBOARD_NODE_OAM_X_VDD18_VR: 34 + GPUBOARD_NODE_OAM_X_04_HBM_B_VR: 53 + GPUBOARD_NODE_OAM_X_04_HBM_D_VR: 47 + GPUBOARD_VR_FIRST: 58 + 
GPUBOARD_VDDCR_VDD1: 78 + GPUBOARD_VDDCR_VDD2: 35 + GPUBOARD_VDDCR_VDD3: 73 + GPUBOARD_VDDCR_SOC_A: 12 + GPUBOARD_VDDCR_SOC_C: 57 + GPUBOARD_VDDCR_SOCIO_A: 39 + GPUBOARD_VDDCR_SOCIO_C: 75 + GPUBOARD_VDD_085_HBM: 64 + GPUBOARD_VDDCR_11_HBM_B: 92 + GPUBOARD_VDDCR_11_HBM_D: 87 + GPUBOARD_VDD_USR: 46 + GPUBOARD_VDDIO_11_E32: 98 + +% amd-smi metric +GPU: 0 + USAGE: + GFX_ACTIVITY: 0 % + UMC_ACTIVITY: 0 % + ... + POWER: + SOCKET_POWER: 140 W + GFX_VOLTAGE: N/A + ... + CLOCK: + GFX_0: + CLK: 132 MHz + MIN_CLK: 500 MHz + ... + TEMPERATURE: + EDGE: N/A + HOTSPOT: 37 °C + ... + PCIE: + WIDTH: 16 + SPEED: 32 GT/s + ... + GPUBOARD: + TEMPERATURE: + GPUBOARD_NODE_FIRST: 43 + GPUBOARD_NODE_OAM_X_IBC: 24 + ... + BASEBOARD: + TEMPERATURE: + BASEBOARD_FIRST: 78 + BASEBOARD_UBB_FRONT: 55 + ... + ECC: + TOTAL_CORRECTABLE_COUNT: 0 + TOTAL_UNCORRECTABLE_COUNT: 0 + ... + ECC_BLOCKS: + UMC: + CORRECTABLE_COUNT: 0 + UNCORRECTABLE_COUNT: 0 + ... + FAN: + SPEED: N/A + MAX: N/A + ... + VOLTAGE_CURVE: + POINT_0_FREQUENCY: N/A + POINT_0_VOLTAGE: N/A + ... + OVERDRIVE: N/A + MEM_OVERDRIVE: N/A + PERF_LEVEL: AMDSMI_DEV_PERF_LEVEL_AUTO + XGMI_ERR: N/A + VOLTAGE: + VDDBOARD: N/A + ENERGY: + TOTAL_ENERGY_CONSUMPTION: 14292727.274 J + MEM_USAGE: + TOTAL_VRAM: 196592 MB + USED_VRAM: 283 MB + ... + THROTTLE: + ACCUMULATION_COUNTER: 100936627 + PROCHOT_ACCUMULATED: 0 + ... 
+``` + ### Changed ### Removed diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 5624f7e87b..a9335d200f 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -1461,7 +1461,7 @@ class AMDSMICommands(): fan=None, voltage_curve=None, overdrive=None, perf_level=None, xgmi_err=None, energy=None, mem_usage=None, voltage=None, schedule=None, guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None, - ): + base_board=None, gpu_board=None): """Get Metric information for target gpu Args: @@ -1523,6 +1523,10 @@ class AMDSMICommands(): if self.helpers.is_hypervisor() or self.helpers.is_baremetal() or self.helpers.is_linux(): if usage: args.usage = usage + if base_board: + args.base_board = base_board + if gpu_board: + args.gpu_board = gpu_board if power: args.power = power if clock: @@ -1537,10 +1541,10 @@ class AMDSMICommands(): args.ecc = ecc if ecc_blocks: args.ecc_blocks = ecc_blocks - current_platform_args += ["usage", "power", "clock", "temperature", "voltage", "pcie", "ecc", "ecc_blocks"] + current_platform_args += ["usage", "power", "clock", "temperature", "voltage", "pcie", "ecc", "ecc_blocks", "base_board","gpu_board"] current_platform_values += [args.usage, args.power, args.clock, args.temperature, args.voltage, args.pcie] - current_platform_values += [args.ecc, args.ecc_blocks] + current_platform_values += [args.ecc, args.ecc_blocks, args.base_board, args.gpu_board] if self.helpers.is_baremetal() and self.helpers.is_linux(): if fan: @@ -2245,6 +2249,99 @@ class AMDSMICommands(): if args.pcie: values_dict['pcie'] = pcie_dict + if "gpu_board" in current_platform_args: + if args.gpu_board: + gpu_board_temp_dict = {} + gpu_board_temp_types = [ + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_RETIMER_X, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_IBC, + 
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_IBC_2, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_VDD18_VR, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_04_HBM_B_VR, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_04_HBM_D_VR, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD0, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD1, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD2, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD3, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOC_A, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOC_C, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOCIO_A, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOCIO_C, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDD_085_HBM, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_11_HBM_B, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_11_HBM_D, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDD_USR, + amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDIO_11_E32 + ] + for type in gpu_board_temp_types: + type_name = type.name.replace("GPUBOARD", "GPU_BOARD") + try: + gpu_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) + if gpu_board_temp_holder != "N/A": + gpu_board_temp_dict[f'{type_name}'] = self.helpers.unit_format(self.logger, + gpu_board_temp_holder, + '\N{DEGREE SIGN}C') + else: + gpu_board_temp_dict[f'{type_name}'] = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + gpu_board_temp_dict[f'{type_name}'] = "N/A" + logging.debug("Failed to get gpu_board %s for gpu %s | %s", type_name, gpu_id, e.get_error_info()) + # if every value is N/A, then we don't want to display the values unless explicitly told to + # all args_list being True indicates that this gpu_board is not explicitly called itself + args_list = [getattr(args, arg) for arg in 
current_platform_args] + if all(value == "N/A" for value in gpu_board_temp_dict.values()) and all(arg == True for arg in args_list): + gpu_board_temp_dict = {} + if gpu_board_temp_dict: + values_dict['gpu_board'] = {'temperature':gpu_board_temp_dict} + if "base_board" in current_platform_args: + if args.base_board: + base_board_temp_dict = {} + base_board_temp_types = [ + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FRONT, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_BACK, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_OAM7, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_IBC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_UFPGA, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_OAM1, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_0_1_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_2_3_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_4_5_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_6_7_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA_0V72_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA_3V3_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_0_1_2_3_1V2_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_4_5_6_7_1V2_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_0_1_0V9_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_4_5_0V9_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_2_3_0V9_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_6_7_0V9_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_0_1_2_3_3V3_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_4_5_6_7_3V3_VR, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_IBC_HSC, + amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_IBC + ] + for type in base_board_temp_types: + type_name = type.name.replace("BASEBOARD", "BASE_BOARD") + try: + 
base_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) + if base_board_temp_holder != "N/A": + + base_board_temp_dict[f'{type_name}'] = self.helpers.unit_format(self.logger, + base_board_temp_holder, + '\N{DEGREE SIGN}C') + else: + base_board_temp_dict[f'{type_name}'] = "N/A" + except amdsmi_exception.AmdSmiLibraryException as e: + base_board_temp_dict[f'{type_name}'] = "N/A" + logging.debug("Failed to get base_board %s for gpu %s | %s", type_name, gpu_id, e.get_error_info()) + # if every value is N/A, then we don't want to display the values unless explicitly told to + # all args_list being True indicates that this base_board is not explicitly called itself + args_list = [getattr(args, arg) for arg in current_platform_args] + if all(value == "N/A" for value in base_board_temp_dict.values()) and all(arg == True for arg in args_list): + base_board_temp_dict = {} + if base_board_temp_dict: + values_dict['base_board'] = {'temperature':base_board_temp_dict} if "ecc" in current_platform_args: if args.ecc: ecc_count = {} @@ -3035,7 +3132,7 @@ class AMDSMICommands(): cpu_temp=None, cpu_dimm_temp_range_rate=None, cpu_dimm_pow_consumption=None, cpu_dimm_thermal_sensor=None, core=None, core_boost_limit=None, core_curr_active_freq_core_limit=None, - core_energy=None, throttle=None): + core_energy=None, throttle=None, base_board=None, gpu_board=None): """Get Metric information for target gpu Args: @@ -3110,7 +3207,7 @@ class AMDSMICommands(): gpu_attributes = ["usage", "watch", "watch_time", "iterations", "power", "clock", "temperature", "ecc", "ecc_blocks", "pcie", "fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "voltage", "schedule", - "guard", "guest_data", "fb_usage", "xgmi", "throttle"] + "guard", "guest_data", "fb_usage", "xgmi", "throttle", "base_board", "gpu_board"] for attr in gpu_attributes: if hasattr(args, attr): if getattr(args, attr): @@ -3184,7 
+3281,7 @@ class AMDSMICommands(): fan, voltage_curve, overdrive, perf_level, xgmi_err, energy, mem_usage, voltage, schedule, guard, guest_data, fb_usage, xgmi, throttle, - ) + base_board, gpu_board) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: # If no args are set, print out all CPU and Core metrics info @@ -3219,7 +3316,7 @@ class AMDSMICommands(): clock, temperature, ecc, ecc_blocks, pcie, fan, voltage_curve, overdrive, perf_level, xgmi_err, energy, mem_usage, voltage, schedule, throttle, - ) + base_board, gpu_board) if self.logger.is_json_format(): self.logger.combine_arrays_to_json() diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 5daf63f6d7..d32526b13e 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -938,6 +938,8 @@ class AMDSMIParser(argparse.ArgumentParser): ecc_blocks_help = "Number of ECC errors per block" pcie_help = "Current PCIe speed, width, and replay count" voltage_help = "GPU voltage" + base_board_help = "base_board temperatures" + gpu_board_help = "gpu_board temperatures" # Help text for Arguments only on Linux Baremetal platforms fan_help = "Current fan speed" @@ -1004,6 +1006,8 @@ class AMDSMIParser(argparse.ArgumentParser): metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help) metric_parser.add_argument('-V', '--voltage', action='store_true', required=False, help=voltage_help) + metric_parser.add_argument('-b', '--base-board', action='store_true', required=False, help=base_board_help, default=False) + metric_parser.add_argument('-G', '--gpu-board', action='store_true', required=False, help=gpu_board_help, default=False) # Options that only apply to Hypervisors and Baremetal Linux if self.helpers.is_hypervisor() 
or (self.helpers.is_baremetal() and self.helpers.is_linux()):