Added gpuboard and baseboard temperatures to amd-smi metric (#617)
* Added gpu-board and base-board temperatures to amd-smi metric
* Updated Changelog and adjusted the metric base-board/gpu-board output
* Adjusted output of metric to hide base/gpu-board when not relevant
---------
Signed-off-by: gabrpham_amdeng <Gabriel.Pham@amd.com>
[ROCm/amdsmi commit: b13fc16d60]
Šī revīzija ir iekļauta:
revīziju iesūtīja
GitHub
vecāks
671612471d
revīzija
3ef5bfef94
@@ -11,6 +11,132 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
- **Added `amdsmi_get_gpu_revision()` to Python API**
|
||||
- This function retrieves the GPU revision ID. Available in `amdsmi_interface.py` as `amdsmi_get_gpu_revision()`.
|
||||
|
||||
- **Added gpuboard and baseboard temperatures to `amd-smi metric` command**.
|
||||
- The metric command has been updated with various gpuboard and baseboard temperatures in degrees Celsius. Users can access these
|
||||
values through the `-G/--gpuboard` or `-b/--baseboard` options or obtain all of them as normal using the `amd-smi metric` command without
|
||||
any options. If the hardware does not support gpuboard or baseboard temperatures, then the values will be hidden from the default `metric` view.
|
||||
|
||||
```conosle
|
||||
% amd-smi metric -b
|
||||
GPU: 0
|
||||
BASEBOARD:
|
||||
TEMPERATURE:
|
||||
BASEBOARD_FIRST: 78
|
||||
BASEBOARD_UBB_FRONT: 55
|
||||
BASEBOARD_UBB_BACK: 49
|
||||
BASEBOARD_UBB_OAM7: 86
|
||||
BASEBOARD_UBB_IBC: 94
|
||||
BASEBOARD_UBB_UFPGA: 49
|
||||
BASEBOARD_UBB_OAM1: 78
|
||||
BASEBOARD_OAM_0_1_HSC: 54
|
||||
BASEBOARD_OAM_2_3_HSC: 32
|
||||
BASEBOARD_OAM_4_5_HSC: 14
|
||||
BASEBOARD_OAM_6_7_HSC: 85
|
||||
BASEBOARD_UBB_FPGA_0V72_VR: 43
|
||||
BASEBOARD_UBB_FPGA_3V3_VR: 41
|
||||
BASEBOARD_RETIMER_0_1_2_3_1V2_VR: 64
|
||||
BASEBOARD_RETIMER_4_5_6_7_1V2_VR: 56
|
||||
BASEBOARD_RETIMER_0_1_0V9_VR: 74
|
||||
BASEBOARD_RETIMER_4_5_0V9_VR: 34
|
||||
BASEBOARD_RETIMER_2_3_0V9_VR: 85
|
||||
BASEBOARD_RETIMER_6_7_0V9_VR: 92
|
||||
BASEBOARD_OAM_0_1_2_3_3V3_VR: 29
|
||||
BASEBOARD_OAM_4_5_6_7_3V3_VR: 13
|
||||
BASEBOARD_IBC_HSC: 41
|
||||
BASEBOARD_IBC: 43
|
||||
|
||||
% amd-smi metric -G
|
||||
GPU: 0
|
||||
GPUBOARD:
|
||||
TEMPERATURE:
|
||||
GPUBOARD_NODE_FIRST: 43
|
||||
GPUBOARD_NODE_OAM_X_IBC: 24
|
||||
GPUBOARD_NODE_OAM_X_IBC_2: 56
|
||||
GPUBOARD_NODE_OAM_X_VDD18_VR: 34
|
||||
GPUBOARD_NODE_OAM_X_04_HBM_B_VR: 53
|
||||
GPUBOARD_NODE_OAM_X_04_HBM_D_VR: 47
|
||||
GPUBOARD_VR_FIRST: 58
|
||||
GPUBOARD_VDDCR_VDD1: 78
|
||||
GPUBOARD_VDDCR_VDD2: 35
|
||||
GPUBOARD_VDDCR_VDD3: 73
|
||||
GPUBOARD_VDDCR_SOC_A: 12
|
||||
GPUBOARD_VDDCR_SOC_C: 57
|
||||
GPUBOARD_VDDCR_SOCIO_A: 39
|
||||
GPUBOARD_VDDCR_SOCIO_C: 75
|
||||
GPUBOARD_VDD_085_HBM: 64
|
||||
GPUBOARD_VDDCR_11_HBM_B: 92
|
||||
GPUBOARD_VDDCR_11_HBM_D: 87
|
||||
GPUBOARD_VDD_USR: 46
|
||||
GPUBOARD_VDDIO_11_E32: 98
|
||||
|
||||
% amd-smi metric
|
||||
GPU: 0
|
||||
USAGE:
|
||||
GFX_ACTIVITY: 0 %
|
||||
UMC_ACTIVITY: 0 %
|
||||
...
|
||||
POWER:
|
||||
SOCKET_POWER: 140 W
|
||||
GFX_VOLTAGE: N/A
|
||||
...
|
||||
CLOCK:
|
||||
GFX_0:
|
||||
CLK: 132 MHz
|
||||
MIN_CLK: 500 MHz
|
||||
...
|
||||
TEMPERATURE:
|
||||
EDGE: N/A
|
||||
HOTSPOT: 37 °C
|
||||
...
|
||||
PCIE:
|
||||
WIDTH: 16
|
||||
SPEED: 32 GT/s
|
||||
...
|
||||
GPUBOARD:
|
||||
TEMPERATURE:
|
||||
GPUBOARD_NODE_FIRST: 43
|
||||
GPUBOARD_NODE_OAM_X_IBC: 24
|
||||
...
|
||||
BASEBOARD:
|
||||
TEMPERATURE:
|
||||
BASEBOARD_FIRST: 78
|
||||
BASEBOARD_UBB_FRONT: 55
|
||||
...
|
||||
ECC:
|
||||
TOTAL_CORRECTABLE_COUNT: 0
|
||||
TOTAL_UNCORRECTABLE_COUNT: 0
|
||||
...
|
||||
ECC_BLOCKS:
|
||||
UMC:
|
||||
CORRECTABLE_COUNT: 0
|
||||
UNCORRECTABLE_COUNT: 0
|
||||
...
|
||||
FAN:
|
||||
SPEED: N/A
|
||||
MAX: N/A
|
||||
...
|
||||
VOLTAGE_CURVE:
|
||||
POINT_0_FREQUENCY: N/A
|
||||
POINT_0_VOLTAGE: N/A
|
||||
...
|
||||
OVERDRIVE: N/A
|
||||
MEM_OVERDRIVE: N/A
|
||||
PERF_LEVEL: AMDSMI_DEV_PERF_LEVEL_AUTO
|
||||
XGMI_ERR: N/A
|
||||
VOLTAGE:
|
||||
VDDBOARD: N/A
|
||||
ENERGY:
|
||||
TOTAL_ENERGY_CONSUMPTION: 14292727.274 J
|
||||
MEM_USAGE:
|
||||
TOTAL_VRAM: 196592 MB
|
||||
USED_VRAM: 283 MB
|
||||
...
|
||||
THROTTLE:
|
||||
ACCUMULATION_COUNTER: 100936627
|
||||
PROCHOT_ACCUMULATED: 0
|
||||
...
|
||||
```
|
||||
|
||||
### Changed
|
||||
|
||||
### Removed
|
||||
|
||||
@@ -1461,7 +1461,7 @@ class AMDSMICommands():
|
||||
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
|
||||
xgmi_err=None, energy=None, mem_usage=None, voltage=None, schedule=None,
|
||||
guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None,
|
||||
):
|
||||
base_board=None, gpu_board=None):
|
||||
"""Get Metric information for target gpu
|
||||
|
||||
Args:
|
||||
@@ -1523,6 +1523,10 @@ class AMDSMICommands():
|
||||
if self.helpers.is_hypervisor() or self.helpers.is_baremetal() or self.helpers.is_linux():
|
||||
if usage:
|
||||
args.usage = usage
|
||||
if base_board:
|
||||
args.base_board = base_board
|
||||
if gpu_board:
|
||||
args.gpu_board = gpu_board
|
||||
if power:
|
||||
args.power = power
|
||||
if clock:
|
||||
@@ -1537,10 +1541,10 @@ class AMDSMICommands():
|
||||
args.ecc = ecc
|
||||
if ecc_blocks:
|
||||
args.ecc_blocks = ecc_blocks
|
||||
current_platform_args += ["usage", "power", "clock", "temperature", "voltage", "pcie", "ecc", "ecc_blocks"]
|
||||
current_platform_args += ["usage", "power", "clock", "temperature", "voltage", "pcie", "ecc", "ecc_blocks", "base_board","gpu_board"]
|
||||
current_platform_values += [args.usage, args.power, args.clock,
|
||||
args.temperature, args.voltage, args.pcie]
|
||||
current_platform_values += [args.ecc, args.ecc_blocks]
|
||||
current_platform_values += [args.ecc, args.ecc_blocks, args.base_board, args.gpu_board]
|
||||
|
||||
if self.helpers.is_baremetal() and self.helpers.is_linux():
|
||||
if fan:
|
||||
@@ -2245,6 +2249,99 @@ class AMDSMICommands():
|
||||
if args.pcie:
|
||||
values_dict['pcie'] = pcie_dict
|
||||
|
||||
if "gpu_board" in current_platform_args:
|
||||
if args.gpu_board:
|
||||
gpu_board_temp_dict = {}
|
||||
gpu_board_temp_types = [
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_RETIMER_X,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_IBC,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_IBC_2,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_VDD18_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_04_HBM_B_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_NODE_OAM_X_04_HBM_D_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD0,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD1,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD2,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_VDD3,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOC_A,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOC_C,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOCIO_A,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_SOCIO_C,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDD_085_HBM,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_11_HBM_B,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDCR_11_HBM_D,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDD_USR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.GPUBOARD_VDDIO_11_E32
|
||||
]
|
||||
for type in gpu_board_temp_types:
|
||||
type_name = type.name.replace("GPUBOARD", "GPU_BOARD")
|
||||
try:
|
||||
gpu_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT)
|
||||
if gpu_board_temp_holder != "N/A":
|
||||
gpu_board_temp_dict[f'{type_name}'] = self.helpers.unit_format(self.logger,
|
||||
gpu_board_temp_holder,
|
||||
'\N{DEGREE SIGN}C')
|
||||
else:
|
||||
gpu_board_temp_dict[f'{type_name}'] = "N/A"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
gpu_board_temp_dict[f'{type_name}'] = "N/A"
|
||||
logging.debug("Failed to get gpu_board %s for gpu %s | %s", type_name, gpu_id, e.get_error_info())
|
||||
# if every value is N/A, then we don't want to display the values unless explicitly told to
|
||||
# all args_list being True indicates that this gpu_board is not explicitly called itself
|
||||
args_list = [getattr(args, arg) for arg in current_platform_args]
|
||||
if all(value == "N/A" for value in gpu_board_temp_dict.values()) and all(arg == True for arg in args_list):
|
||||
gpu_board_temp_dict = {}
|
||||
if gpu_board_temp_dict:
|
||||
values_dict['gpu_board'] = {'temperature':gpu_board_temp_dict}
|
||||
if "base_board" in current_platform_args:
|
||||
if args.base_board:
|
||||
base_board_temp_dict = {}
|
||||
base_board_temp_types = [
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FRONT,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_BACK,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_OAM7,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_IBC,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_UFPGA,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_OAM1,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_0_1_HSC,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_2_3_HSC,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_4_5_HSC,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_6_7_HSC,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA_0V72_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_UBB_FPGA_3V3_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_0_1_2_3_1V2_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_4_5_6_7_1V2_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_0_1_0V9_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_4_5_0V9_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_2_3_0V9_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_RETIMER_6_7_0V9_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_0_1_2_3_3V3_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_OAM_4_5_6_7_3V3_VR,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_IBC_HSC,
|
||||
amdsmi_interface.AmdSmiTemperatureType.BASEBOARD_IBC
|
||||
]
|
||||
for type in base_board_temp_types:
|
||||
type_name = type.name.replace("BASEBOARD", "BASE_BOARD")
|
||||
try:
|
||||
base_board_temp_holder = amdsmi_interface.amdsmi_get_temp_metric(args.gpu, type, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT)
|
||||
if base_board_temp_holder != "N/A":
|
||||
|
||||
base_board_temp_dict[f'{type_name}'] = self.helpers.unit_format(self.logger,
|
||||
base_board_temp_holder,
|
||||
'\N{DEGREE SIGN}C')
|
||||
else:
|
||||
base_board_temp_dict[f'{type_name}'] = "N/A"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
base_board_temp_dict[f'{type_name}'] = "N/A"
|
||||
logging.debug("Failed to get base_board %s for gpu %s | %s", type_name, gpu_id, e.get_error_info())
|
||||
# if every value is N/A, then we don't want to display the values unless explicitly told to
|
||||
# all args_list being True indicates that this base_board is not explicitly called itself
|
||||
args_list = [getattr(args, arg) for arg in current_platform_args]
|
||||
if all(value == "N/A" for value in base_board_temp_dict.values()) and all(arg == True for arg in args_list):
|
||||
base_board_temp_dict = {}
|
||||
if base_board_temp_dict:
|
||||
values_dict['base_board'] = {'temperature':base_board_temp_dict}
|
||||
if "ecc" in current_platform_args:
|
||||
if args.ecc:
|
||||
ecc_count = {}
|
||||
@@ -3035,7 +3132,7 @@ class AMDSMICommands():
|
||||
cpu_temp=None, cpu_dimm_temp_range_rate=None, cpu_dimm_pow_consumption=None,
|
||||
cpu_dimm_thermal_sensor=None,
|
||||
core=None, core_boost_limit=None, core_curr_active_freq_core_limit=None,
|
||||
core_energy=None, throttle=None):
|
||||
core_energy=None, throttle=None, base_board=None, gpu_board=None):
|
||||
"""Get Metric information for target gpu
|
||||
|
||||
Args:
|
||||
@@ -3110,7 +3207,7 @@ class AMDSMICommands():
|
||||
gpu_attributes = ["usage", "watch", "watch_time", "iterations", "power", "clock",
|
||||
"temperature", "ecc", "ecc_blocks", "pcie", "fan", "voltage_curve",
|
||||
"overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "voltage", "schedule",
|
||||
"guard", "guest_data", "fb_usage", "xgmi", "throttle"]
|
||||
"guard", "guest_data", "fb_usage", "xgmi", "throttle", "base_board", "gpu_board"]
|
||||
for attr in gpu_attributes:
|
||||
if hasattr(args, attr):
|
||||
if getattr(args, attr):
|
||||
@@ -3184,7 +3281,7 @@ class AMDSMICommands():
|
||||
fan, voltage_curve, overdrive, perf_level,
|
||||
xgmi_err, energy, mem_usage, voltage, schedule,
|
||||
guard, guest_data, fb_usage, xgmi, throttle,
|
||||
)
|
||||
base_board, gpu_board)
|
||||
elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized
|
||||
if args.cpu == None and args.core == None:
|
||||
# If no args are set, print out all CPU and Core metrics info
|
||||
@@ -3219,7 +3316,7 @@ class AMDSMICommands():
|
||||
clock, temperature, ecc, ecc_blocks, pcie,
|
||||
fan, voltage_curve, overdrive, perf_level,
|
||||
xgmi_err, energy, mem_usage, voltage, schedule, throttle,
|
||||
)
|
||||
base_board, gpu_board)
|
||||
if self.logger.is_json_format():
|
||||
self.logger.combine_arrays_to_json()
|
||||
|
||||
|
||||
@@ -938,6 +938,8 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
ecc_blocks_help = "Number of ECC errors per block"
|
||||
pcie_help = "Current PCIe speed, width, and replay count"
|
||||
voltage_help = "GPU voltage"
|
||||
base_board_help = "base_board temperatures"
|
||||
gpu_board_help = "gpu_board temperatures"
|
||||
|
||||
# Help text for Arguments only on Linux Baremetal platforms
|
||||
fan_help = "Current fan speed"
|
||||
@@ -1004,6 +1006,8 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
|
||||
metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help)
|
||||
metric_parser.add_argument('-V', '--voltage', action='store_true', required=False, help=voltage_help)
|
||||
metric_parser.add_argument('-b', '--base-board', action='store_true', required=False, help=base_board_help, default=False)
|
||||
metric_parser.add_argument('-G', '--gpu-board', action='store_true', required=False, help=gpu_board_help, default=False)
|
||||
|
||||
# Options that only apply to Hypervisors and Baremetal Linux
|
||||
if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()):
|
||||
|
||||
Atsaukties uz šo jaunā problēmā
Block a user