[SWDEV-511822] Created default command for amdsmi (#348)

* Added degree symbol and fixed power usage
* Added degree symbol and fixed power usage
* fixed default command

---------

Signed-off-by: gabrpham_amdeng <Gabriel.Pham@amd.com>
This commit is contained in:
Pham, Gabriel
2025-05-29 17:14:58 -05:00
committed by GitHub
parent 945e4a159c
commit bc158d2b51
5 changed files with 314 additions and 5 deletions
+36
View File
@@ -72,6 +72,42 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
- **Added bad page threshold count**.
- Added `amdsmi_get_gpu_bad_page_threshold` to Python API and CLI; root/sudo permissions required to display the count.
- **Added the Default command**.
- A default view has been added. The default view provides a snapshot of commonly requested information such as BDF, current partition mode, version information, and more. Users can access that information by simply typing `amd-smi` with no additional commands or arguments. Users may also obtain this information through alternate output formats such as JSON or CSV by using the default command with the respective output format: `amd-smi default --json` or `amd-smi default --csv`.
```shell
+------------------------------------------------------------------------------+
| AMD SMI 25.4.1+a0ac51... amdgpu version: 6.14.5 ROCm version: 7.0.0 |
|--------------------------------------+---------------------------------------|
| BDF GPU-Name | Mem-Util Temp UECC Power-Usage |
| GPU HIP-ID OAM-ID Partition-Mode | GFX-Util Fan Memory-Usage |
|======================================+=======================================|
| 0000:0c:00.0 AMD Instinct MI300X | 0 % 37 °C 0 141/750 W |
| 0 0 2 SPX/NPS1 | 0 % N/A 283/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:22:00.0 AMD Instinct MI300X | 0 % 40 °C 0 155/750 W |
| 1 1 1 SPX/NPS1 | 0 % N/A 284/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:38:00.0 AMD Instinct MI300X | 0 % 37 °C 0 141/750 W |
| 2 2 0 SPX/NPS1 | 0 % N/A 283/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:5c:00.0 AMD Instinct MI300X | 0 % 37 °C 0 139/750 W |
| 3 3 3 SPX/NPS1 | 0 % N/A 283/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:9f:00.0 AMD Instinct MI300X | 0 % 37 °C 0 140/750 W |
| 4 4 7 SPX/NPS1 | 0 % N/A 283/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:af:00.0 AMD Instinct MI300X | 0 % 37 °C 0 142/750 W |
| 5 5 5 SPX/NPS1 | 0 % N/A 283/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:bf:00.0 AMD Instinct MI300X | 0 % 36 °C 0 138/750 W |
| 6 6 4 SPX/NPS1 | 0 % N/A 283/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:df:00.0 AMD Instinct MI300X | 0 % 40 °C 0 138/750 W |
| 7 7 6 SPX/NPS1 | 0 % N/A 283/196592 MB |
+--------------------------------------+---------------------------------------+
```
### Changed
- **The `amd-smi topology` command has been enabled for Guest environments**.
+3 -2
View File
@@ -95,7 +95,8 @@ if __name__ == "__main__":
amd_smi_commands.monitor,
amd_smi_commands.xgmi,
amd_smi_commands.partition,
amd_smi_commands.ras)
amd_smi_commands.ras,
amd_smi_commands.default)
try:
try:
argcomplete.autocomplete(amd_smi_parser)
@@ -109,7 +110,7 @@ if __name__ == "__main__":
sys.argv = [arg.lower() if arg.startswith('--') or not arg.startswith('-')
else arg for arg in sys.argv]
if len(sys.argv) == 1:
args = amd_smi_parser.parse_args(args=['--help'])
args = amd_smi_parser.parse_args(args=['default'])
elif sys.argv[1] in valid_commands:
args = amd_smi_parser.parse_args(args=None)
else:
+161
View File
@@ -6418,6 +6418,167 @@ class AMDSMICommands():
break
time.sleep(1)
def default(self, args):
    """Display the default amdsmi view when no args are given.

    Gathers version information and a per-GPU snapshot (BDF, market name,
    GPU/HIP/OAM IDs, partition mode, utilization, temperature, power,
    VRAM usage, uncorrectable ECC count and fan usage) and hands the
    result to the logger. JSON and CSV formats use the logger's regular
    output paths; any other format goes through ``print_default_output``.

    Every value that cannot be queried is reported as ``"N/A"`` instead
    of aborting the whole view.

    Args:
        args: Parsed command-line namespace; not read by this method.
    """
    processors = amdsmi_interface.amdsmi_get_processor_handles()

    # The fallbacks keep the shapes print_default_output indexes into:
    # a (success, version) pair for ROCm and a dict carrying
    # 'driver_version' for the amdgpu driver.
    version_info = {"amd-smi": f'{__version__}',
                    "amdgpu version": {"driver_version": "N/A"},
                    "rocm version": (False, "N/A")}
    try:
        version_info['rocm version'] = amdsmi_interface.amdsmi_get_rocm_version()
    except amdsmi_exception.AmdSmiLibraryException as e:
        logging.debug("Failed to get rocm version | %s", e.get_error_info())
    # Driver info is read from the first GPU; with no GPUs there is nothing to ask.
    if processors:
        try:
            version_info["amdgpu version"] = amdsmi_interface.amdsmi_get_gpu_driver_info(processors[0])
        except amdsmi_exception.AmdSmiLibraryException as e:
            logging.debug("Failed to get driver info | %s", e.get_error_info())

    default_table_info_dict = {"version_info": version_info}
    gpu_info_list = []

    # TODO: create new logger function to display table? or modify table?
    # get info for each processor to display in default output
    for processor in processors:
        gpu_info_dict = {}
        gpu_id = self.helpers.get_gpu_id_from_device_handle(processor)
        gpu_info_dict["gpu_id"] = gpu_id

        # get common gpu_metrics first
        try:
            gpu_metrics = amdsmi_interface.amdsmi_get_gpu_metrics_info(processor)
        except amdsmi_exception.AmdSmiLibraryException:
            gpu_metrics = "N/A"

        # partition info
        try:
            current_mem = amdsmi_interface.amdsmi_get_gpu_memory_partition(processor)
        except amdsmi_exception.AmdSmiLibraryException:
            current_mem = "N/A"
        try:
            current_comp = amdsmi_interface.amdsmi_get_gpu_compute_partition(processor)
        except amdsmi_exception.AmdSmiLibraryException:
            current_comp = "N/A"
        if current_comp == "N/A" or current_mem == "N/A":
            partition_mode = "N/A"
        else:
            partition_mode = f"{current_comp}/{current_mem}"
        gpu_info_dict["partition_mode"] = partition_mode

        # GPU market name and OAM ID
        try:
            asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(processor)
            market_name = asic_info['market_name']
            oam_id = asic_info['oam_id']
        except amdsmi_exception.AmdSmiLibraryException:
            market_name = "N/A"
            oam_id = "N/A"
        gpu_info_dict["market_name"] = market_name
        gpu_info_dict["oam_id"] = oam_id

        # bdf
        try:
            bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(processor)
        except amdsmi_exception.AmdSmiLibraryException:
            bdf = "N/A"
        gpu_info_dict["bdf"] = bdf

        # HIP ID
        try:
            enum_info = amdsmi_interface.amdsmi_get_gpu_enumeration_info(processor)
            hip_id = enum_info['hip_id']
        except amdsmi_exception.AmdSmiLibraryException:
            hip_id = "N/A"
        gpu_info_dict["hip_id"] = hip_id

        # mem utilization, GPU utilization, power usage, and temperature
        if gpu_metrics != "N/A":
            # Individual metric fields may themselves be "N/A" (see the
            # socket power check below), so only round real numbers.
            mem_util = gpu_metrics['average_umc_activity']
            if mem_util != "N/A":
                mem_util = round(mem_util)
            gfx_util = gpu_metrics['average_gfx_activity']
            if gfx_util != "N/A":
                gfx_util = round(gfx_util)
            # Prefer the current socket power; fall back to the average.
            if gpu_metrics['current_socket_power'] != "N/A":
                current_power = gpu_metrics['current_socket_power']
            else:
                current_power = gpu_metrics['average_socket_power']
            temperature = gpu_metrics['temperature_hotspot']
        else:
            mem_util = "N/A"
            gfx_util = "N/A"
            current_power = "N/A"
            temperature = "N/A"
        gpu_info_dict["mem_util"] = mem_util
        gpu_info_dict["gfx_util"] = gfx_util
        gpu_info_dict["temp"] = temperature

        # rest of power usage info
        try:
            power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(processor)
            socket_power_limit = self.helpers.convert_SI_unit(power_cap_info['power_cap'], AMDSMIHelpers.SI_Unit.MICRO)
            power_usage = {"current_power": current_power, "power_limit": socket_power_limit}
        except amdsmi_exception.AmdSmiLibraryException:
            power_usage = "N/A"
        gpu_info_dict["power_usage"] = power_usage

        # memory usage, converted from bytes with integer division by 1024*1024
        try:
            total_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(processor, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024)
            used_vram = amdsmi_interface.amdsmi_get_gpu_memory_usage(processor, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024)
            mem_usage = {"used_vram": used_vram, "total_vram": total_vram}
        except amdsmi_exception.AmdSmiLibraryException:
            mem_usage = "N/A"
        gpu_info_dict["mem_usage"] = mem_usage

        # uncorrectable ECC errors
        try:
            ecc_count = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(processor)
            uncorrectable = ecc_count.pop('uncorrectable_count')
        except amdsmi_exception.AmdSmiLibraryException:
            uncorrectable = "N/A"
        gpu_info_dict["uncorr_ecc"] = uncorrectable

        # Fan usage, as a percentage of the maximum fan speed
        try:
            fan_speed = amdsmi_interface.amdsmi_get_gpu_fan_speed(processor, 0)
        except amdsmi_exception.AmdSmiLibraryException as e:
            logging.debug("Failed to get fan speed for gpu %s | %s", processor, e.get_error_info())
            fan_speed = "N/A"
        try:
            fan_max = amdsmi_interface.amdsmi_get_gpu_fan_speed_max(processor, 0)
            fan_usage = "N/A"
            if fan_max > 0 and fan_speed != "N/A":
                fan_usage = round((float(fan_speed) / float(fan_max)) * 100, 2)
        except amdsmi_exception.AmdSmiLibraryException as e:
            logging.debug("Failed to get max fan speed for gpu %s | %s", processor, e.get_error_info())
            fan_usage = "N/A"
        gpu_info_dict["fan"] = fan_usage

        gpu_info_list.append(gpu_info_dict)

    # TODO: add a per-GPU running-process list ("processes") to the default view.
    default_table_info_dict["gpu_info_list"] = gpu_info_list

    if self.logger.is_json_format():
        self.logger.output = default_table_info_dict
        self.logger.print_output()
    elif self.logger.is_csv_format():
        self.logger.multiple_device_output = default_table_info_dict
        self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True)
    else:
        self.logger.print_default_output(default_table_info_dict)
def _event_thread(self, commands, i):
devices = commands.device_handles
if len(devices) == 0:
+99
View File
@@ -971,3 +971,102 @@ class AMDSMILogger():
with self.destination.open('a', encoding="utf-8") as output_file:
output_file.write(primary_table + '\n')
output_file.write(secondary_table)
def print_default_output(self, output: Dict):
    """Print the default view as a fixed-width, two-column table on stdout.

    Args:
        output: Dict with two keys:
            ``'version_info'`` -- dict with ``'amd-smi'`` (version string),
            ``'amdgpu version'`` (dict carrying ``'driver_version'``) and
            ``'rocm version'`` (``(success, version)`` pair). The plain
            ``"N/A"`` placeholder is accepted for either of the latter two.
            ``'gpu_info_list'`` -- list of per-GPU dicts with the keys
            ``bdf``, ``market_name``, ``mem_util``, ``temp``, ``uncorr_ecc``,
            ``power_usage``, ``gpu_id``, ``hip_id``, ``oam_id``,
            ``partition_mode``, ``gfx_util``, ``fan`` and ``mem_usage``.
            ``power_usage`` and ``mem_usage`` are either ``"N/A"`` or dicts
            (``current_power``/``power_limit`` and
            ``used_vram``/``total_vram`` respectively).
    """
    # some template lines
    # TODO: adjust column lines to give market name more space
    top_border = "+------------------------------------------------------------------------------+"
    row_separator = "|--------------------------------------+---------------------------------------|"
    header_separator = "|======================================+=======================================|"
    bottom_border = "+--------------------------------------+---------------------------------------+"

    def with_unit(value, unit):
        """Return 'value unit', or the "N/A" placeholder unchanged."""
        if value == "N/A":
            return "N/A"
        return f"{value} {unit}"

    # print the version information first
    versions = output['version_info']
    amd_smi_version = str(versions['amd-smi'])
    if len(amd_smi_version) > 16:
        amd_smi_version = amd_smi_version[:13] + "..."

    # Only index into the ROCm entry when it really is a (success, version)
    # pair; indexing the "N/A" placeholder string would yield "/".
    rocm_info = versions['rocm version']
    rocm_version = "N/A"
    if isinstance(rocm_info, (tuple, list)) and len(rocm_info) > 1 and rocm_info[0]:
        rocm_version = str(rocm_info[1])

    # Likewise the driver entry may be the "N/A" placeholder instead of a dict.
    driver_info = versions['amdgpu version']
    if isinstance(driver_info, dict):
        amdgpu_version = str(driver_info.get('driver_version', "N/A"))
    else:
        amdgpu_version = str(driver_info)

    # print GPU info
    print(top_border)
    print("| AMD SMI {0:16s} amdgpu version: {1:8s} ROCm version: {2:8s} |".format(amd_smi_version, amdgpu_version, rocm_version))
    print(row_separator)
    print("| BDF GPU-Name | Mem-Util Temp UECC Power-Usage |")
    print("| GPU HIP-ID OAM-ID Partition-Mode | GFX-Util Fan Memory-Usage |")
    print(header_separator)

    gpu_rows = output['gpu_info_list']
    last_index = len(gpu_rows) - 1
    for index, gpu_info in enumerate(gpu_rows):
        # first line of the GPU entry
        bdf = str(gpu_info['bdf']).ljust(12)
        market_name = str(gpu_info['market_name'])
        if len(market_name) > 22:
            # keep the tail of an over-long name
            market_name = "..." + market_name[-19:]
        market_name = market_name.rjust(22)
        mem_util = with_unit(gpu_info['mem_util'], "%").rjust(8)
        temp = with_unit(gpu_info['temp'], "\u00b0C").rjust(6)
        u_ecc = str(gpu_info['uncorr_ecc']).rjust(5)
        power_usage = gpu_info['power_usage']
        if power_usage != "N/A":
            power_usage = f"{power_usage['current_power']}/{power_usage['power_limit']} W"
        power_usage = str(power_usage).rjust(12)
        print("| {0:12s} {1:22s} | {2:8s} {3:6s} {4:5s} {5:12s} |".format(bdf, market_name, mem_util, temp, u_ecc, power_usage))

        # second line of the GPU entry
        gpu_id = str(gpu_info['gpu_id']).rjust(3)
        hip_id = str(gpu_info['hip_id']).rjust(6)
        oam_id = str(gpu_info['oam_id']).rjust(7)
        partition_modes = str(gpu_info['partition_mode']).rjust(14)
        gfx_util = with_unit(gpu_info['gfx_util'], "%").rjust(8)
        fan = with_unit(gpu_info['fan'], "%").rjust(7)
        mem_usage = gpu_info['mem_usage']
        if mem_usage != "N/A":
            mem_usage = f"{mem_usage['used_vram']}/{mem_usage['total_vram']} MB"
        mem_usage = str(mem_usage).rjust(19)
        print("| {0:3s} {1:6s} {2:7s} {3:14s} | {4:8s} {5:7s} {6:19s} |".format(gpu_id, hip_id, oam_id, partition_modes, gfx_util, fan, mem_usage))

        # separator between GPUs, but not after the last one
        if index < last_index:
            print(row_separator)
    print(bottom_border)
    # TODO: print a process table for all GPUs once 'processes' is part of the output dict.
+15 -3
View File
@@ -69,7 +69,7 @@ class AMDSMIParser(argparse.ArgumentParser):
"""
def __init__(self, version, list, static, firmware, bad_pages, metric,
process, profile, event, topology, set_value, reset, monitor,
xgmi, partition, ras):
xgmi, partition, ras, default):
# Helper variables
self.helpers = AMDSMIHelpers()
@@ -115,7 +115,7 @@ class AMDSMIParser(argparse.ArgumentParser):
# Store possible subcommands & aliases for later errors
self.possible_commands = ['version', 'list', 'static', 'firmware', 'ucode', 'bad-pages',
'metric', 'process', 'profile', 'event', 'topology', 'set',
'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras']
'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras', 'default']
# Add all subparsers
self._add_version_parser(self.subparsers, version)
@@ -135,8 +135,9 @@ class AMDSMIParser(argparse.ArgumentParser):
self._add_partition_parser(self.subparsers, partition)
self._add_ras_parser(self.subparsers, ras)
# the default command
self._add_default_parser(self.subparsers, default)
### Parser Validators and Helpers###
def _not_negative_int(self, int_value, sub_arg=None):
# Argument type validator
if int_value.isdigit(): # Is digit doesn't work on negative numbers
@@ -637,6 +638,17 @@ class AMDSMIParser(argparse.ArgumentParser):
return watch_arguments_group
def _add_default_parser(self, subparsers: argparse._SubParsersAction, func):
    """Register the ``default`` subcommand on *subparsers*.

    The subcommand defines no arguments of its own; it only receives the
    command modifiers added by ``self._add_command_modifiers``.

    Args:
        subparsers: Sub-parser action the ``default`` command is added to.
        func: Callable stored as the parser's ``func`` default.
    """
    parser = subparsers.add_parser(
        'default',
        help="Display the default information panel?",
        description=None,
    )
    parser.set_defaults(func=func)
    parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog)
    parser._optionals.title = None

    # Add Universal Arguments
    self._add_command_modifiers(parser)
def _add_version_parser(self, subparsers: argparse._SubParsersAction, func):
# Subparser help text