[SWDEV-511822] Created default command for amdsmi (#348)
* Added degree symbol and fixed power usage * Added degree symbol and fixed power usage * fixed default command --------- Signed-off-by: gabrpham_amdeng <Gabriel.Pham@amd.com>
This commit is contained in:
@@ -72,6 +72,42 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
- **Added bad page threshold count**.
|
||||
- Added `amdsmi_get_gpu_bad_page_threshold` to Python API and CLI; root/sudo permissions required to display the count.
|
||||
|
||||
- **Added the Default command**.
|
||||
- A default view has been added. The default view provides a snapshot of commonly requested information such as BDF, current partition mode, version information, and more. Users can access that information by simply typing `amd-smi` with no additional commands or arguments. Users may also obtain this information through alternate output formats such as JSON or CSV by using the default command with the respective output format: `amd-smi default --json` or `amd-smi default --csv`.
|
||||
|
||||
```shell
|
||||
+------------------------------------------------------------------------------+
|
||||
| AMD SMI 25.4.1+a0ac51... amdgpu version: 6.14.5 ROCm version: 7.0.0 |
|
||||
|--------------------------------------+---------------------------------------|
|
||||
| BDF GPU-Name | Mem-Util Temp UECC Power-Usage |
|
||||
| GPU HIP-ID OAM-ID Partition-Mode | GFX-Util Fan Memory-Usage |
|
||||
|======================================+=======================================|
|
||||
| 0000:0c:00.0 AMD Instinct MI300X | 0 % 37 °C 0 141/750 W |
|
||||
| 0 0 2 SPX/NPS1 | 0 % N/A 283/196592 MB |
|
||||
|--------------------------------------+---------------------------------------|
|
||||
| 0000:22:00.0 AMD Instinct MI300X | 0 % 40 °C 0 155/750 W |
|
||||
| 1 1 1 SPX/NPS1 | 0 % N/A 284/196592 MB |
|
||||
|--------------------------------------+---------------------------------------|
|
||||
| 0000:38:00.0 AMD Instinct MI300X | 0 % 37 °C 0 141/750 W |
|
||||
| 2 2 0 SPX/NPS1 | 0 % N/A 283/196592 MB |
|
||||
|--------------------------------------+---------------------------------------|
|
||||
| 0000:5c:00.0 AMD Instinct MI300X | 0 % 37 °C 0 139/750 W |
|
||||
| 3 3 3 SPX/NPS1 | 0 % N/A 283/196592 MB |
|
||||
|--------------------------------------+---------------------------------------|
|
||||
| 0000:9f:00.0 AMD Instinct MI300X | 0 % 37 °C 0 140/750 W |
|
||||
| 4 4 7 SPX/NPS1 | 0 % N/A 283/196592 MB |
|
||||
|--------------------------------------+---------------------------------------|
|
||||
| 0000:af:00.0 AMD Instinct MI300X | 0 % 37 °C 0 142/750 W |
|
||||
| 5 5 5 SPX/NPS1 | 0 % N/A 283/196592 MB |
|
||||
|--------------------------------------+---------------------------------------|
|
||||
| 0000:bf:00.0 AMD Instinct MI300X | 0 % 36 °C 0 138/750 W |
|
||||
| 6 6 4 SPX/NPS1 | 0 % N/A 283/196592 MB |
|
||||
|--------------------------------------+---------------------------------------|
|
||||
| 0000:df:00.0 AMD Instinct MI300X | 0 % 40 °C 0 138/750 W |
|
||||
| 7 7 6 SPX/NPS1 | 0 % N/A 283/196592 MB |
|
||||
+--------------------------------------+---------------------------------------+
|
||||
```
|
||||
|
||||
### Changed
|
||||
|
||||
- **The `amd-smi topology` command has been enabled for Guest environments**.
|
||||
|
||||
@@ -95,7 +95,8 @@ if __name__ == "__main__":
|
||||
amd_smi_commands.monitor,
|
||||
amd_smi_commands.xgmi,
|
||||
amd_smi_commands.partition,
|
||||
amd_smi_commands.ras)
|
||||
amd_smi_commands.ras,
|
||||
amd_smi_commands.default)
|
||||
try:
|
||||
try:
|
||||
argcomplete.autocomplete(amd_smi_parser)
|
||||
@@ -109,7 +110,7 @@ if __name__ == "__main__":
|
||||
sys.argv = [arg.lower() if arg.startswith('--') or not arg.startswith('-')
|
||||
else arg for arg in sys.argv]
|
||||
if len(sys.argv) == 1:
|
||||
args = amd_smi_parser.parse_args(args=['--help'])
|
||||
args = amd_smi_parser.parse_args(args=['default'])
|
||||
elif sys.argv[1] in valid_commands:
|
||||
args = amd_smi_parser.parse_args(args=None)
|
||||
else:
|
||||
|
||||
@@ -6418,6 +6418,167 @@ class AMDSMICommands():
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
def default(self, args):
    """Display the default amdsmi view when no args are given.

    Collects version information plus a per-processor summary (BDF, market
    name, partition mode, utilization, temperature, power, VRAM, uncorrectable
    ECC count and fan usage) and hands the result to the logger in JSON, CSV
    or the human-readable table format.

    Args:
        args: Parsed command-line namespace. It is not read by this method.

    Any value whose library query raises AmdSmiLibraryException is reported
    as the string "N/A" instead of aborting the whole view.
    """
    processors = amdsmi_interface.amdsmi_get_processor_handles()

    # Fallback keeps the dict shape that print_default_output indexes
    # ('driver_version'), so the table can still be rendered.
    driver_info = {"driver_version": "N/A"}
    if processors:
        # Guard against an empty handle list (processors[0] would raise
        # IndexError) and against a failing driver query.
        try:
            driver_info = amdsmi_interface.amdsmi_get_gpu_driver_info(processors[0])
        except amdsmi_exception.AmdSmiLibraryException as e:
            logging.debug("Failed to get driver info | %s", e.get_error_info())

    version_info = {"amd-smi": f'{__version__}',
                    "amdgpu version": driver_info,
                    "rocm version": amdsmi_interface.amdsmi_get_rocm_version()}

    default_table_info_dict = {"version_info": version_info}

    gpu_info_list = []

    # TODO: create new logger function to display table? or modify table?
    # get info for each processor to display in default output
    for processor in processors:
        # NOTE: keys are inserted in a fixed order on purpose; the JSON/CSV
        # output follows dict insertion order.
        gpu_info_dict = {}

        gpu_id = self.helpers.get_gpu_id_from_device_handle(processor)
        gpu_info_dict["gpu_id"] = gpu_id

        # get common gpu_metrics first
        try:
            gpu_metrics = amdsmi_interface.amdsmi_get_gpu_metrics_info(processor)
        except amdsmi_exception.AmdSmiLibraryException:
            gpu_metrics = "N/A"

        # partition info
        try:
            current_mem = amdsmi_interface.amdsmi_get_gpu_memory_partition(processor)
        except amdsmi_exception.AmdSmiLibraryException:
            current_mem = "N/A"
        try:
            current_comp = amdsmi_interface.amdsmi_get_gpu_compute_partition(processor)
        except amdsmi_exception.AmdSmiLibraryException:
            current_comp = "N/A"
        # Only report a combined mode when both halves are known.
        if current_comp == "N/A" or current_mem == "N/A":
            partition_mode = "N/A"
        else:
            partition_mode = f"{current_comp}/{current_mem}"
        gpu_info_dict["partition_mode"] = partition_mode

        # GPU name market name and OAM ID
        try:
            asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(processor)
            market_name = asic_info['market_name']
            oam_id = asic_info['oam_id']
        except amdsmi_exception.AmdSmiLibraryException:
            market_name = "N/A"
            oam_id = "N/A"
        gpu_info_dict["market_name"] = market_name
        gpu_info_dict["oam_id"] = oam_id

        # bdf
        try:
            bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(processor)
        except amdsmi_exception.AmdSmiLibraryException:
            bdf = "N/A"
        gpu_info_dict["bdf"] = bdf

        # HIP ID
        try:
            enum_info = amdsmi_interface.amdsmi_get_gpu_enumeration_info(processor)
            hip_id = enum_info['hip_id']
        except amdsmi_exception.AmdSmiLibraryException:
            hip_id = "N/A"
        gpu_info_dict["hip_id"] = hip_id

        # mem utilization, GPU utilization, power usage, and temperature
        if gpu_metrics != "N/A":
            # Individual metric fields can themselves be "N/A" (see the
            # current_socket_power check below); round() on that string
            # would raise TypeError, so only round real numbers.
            mem_util = gpu_metrics['average_umc_activity']
            if mem_util != "N/A":
                mem_util = round(mem_util)
            gfx_util = gpu_metrics['average_gfx_activity']
            if gfx_util != "N/A":
                gfx_util = round(gfx_util)
            # Prefer the instantaneous reading, fall back to the average.
            if gpu_metrics['current_socket_power'] != "N/A":
                current_power = gpu_metrics['current_socket_power']
            else:
                current_power = gpu_metrics['average_socket_power']
            temperature = gpu_metrics['temperature_hotspot']
        else:
            mem_util = "N/A"
            gfx_util = "N/A"
            current_power = "N/A"
            temperature = "N/A"
        gpu_info_dict["mem_util"] = mem_util
        gpu_info_dict["gfx_util"] = gfx_util
        gpu_info_dict["temp"] = temperature

        # rest of power usage info
        try:
            power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(processor)
            socket_power_limit = self.helpers.convert_SI_unit(power_cap_info['power_cap'], AMDSMIHelpers.SI_Unit.MICRO)
            power_usage = {"current_power": current_power, "power_limit": socket_power_limit}
        except amdsmi_exception.AmdSmiLibraryException:
            power_usage = "N/A"
        gpu_info_dict["power_usage"] = power_usage

        # memory usage (the library values are divided by 1024*1024 for display)
        try:
            total_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(processor, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024)
            used_vram = amdsmi_interface.amdsmi_get_gpu_memory_usage(processor, amdsmi_interface.AmdSmiMemoryType.VRAM) // (1024*1024)
            mem_usage = {"used_vram": used_vram, "total_vram": total_vram}
        except amdsmi_exception.AmdSmiLibraryException:
            mem_usage = "N/A"
        gpu_info_dict["mem_usage"] = mem_usage

        # uncorrectable ECC errors
        try:
            ecc_count = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(processor)
            uncorrectable = ecc_count['uncorrectable_count']
        except amdsmi_exception.AmdSmiLibraryException:
            uncorrectable = "N/A"
        gpu_info_dict["uncorr_ecc"] = uncorrectable

        # Fan usage, expressed as a percentage of the maximum fan speed
        try:
            fan_speed = amdsmi_interface.amdsmi_get_gpu_fan_speed(processor, 0)
        except amdsmi_exception.AmdSmiLibraryException as e:
            logging.debug("Failed to get fan speed for gpu %s | %s", processor, e.get_error_info())
            fan_speed = "N/A"
        fan_usage = "N/A"
        try:
            fan_max = amdsmi_interface.amdsmi_get_gpu_fan_speed_max(processor, 0)
            if fan_max > 0 and fan_speed != "N/A":
                fan_usage = round((float(fan_speed) / float(fan_max)) * 100, 2)
        except amdsmi_exception.AmdSmiLibraryException as e:
            logging.debug("Failed to get max fan speed for gpu %s | %s", processor, e.get_error_info())
        gpu_info_dict["fan"] = fan_usage

        gpu_info_list.append(gpu_info_dict)

        # TODO: the per-GPU running process list (amdsmi_get_gpu_process_list)
        # is not part of the default view yet.

    default_table_info_dict["gpu_info_list"] = gpu_info_list

    if self.logger.is_json_format():
        self.logger.output = default_table_info_dict
        self.logger.print_output()
    elif self.logger.is_csv_format():
        self.logger.multiple_device_output = default_table_info_dict
        self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True)
    else:
        self.logger.print_default_output(default_table_info_dict)
|
||||
|
||||
|
||||
def _event_thread(self, commands, i):
|
||||
devices = commands.device_handles
|
||||
if len(devices) == 0:
|
||||
|
||||
@@ -971,3 +971,102 @@ class AMDSMILogger():
|
||||
with self.destination.open('a', encoding="utf-8") as output_file:
|
||||
output_file.write(primary_table + '\n')
|
||||
output_file.write(secondary_table)
|
||||
|
||||
|
||||
def print_default_output(self, output: Dict):
    """Print the human-readable default view table to stdout.

    Args:
        output: Dict with two keys read here:
            'version_info'  -> dict with 'amd-smi' (str), 'amdgpu version'
                               (dict carrying 'driver_version', or a
                               placeholder such as "N/A") and 'rocm version'
                               (a (found, version) pair, or a placeholder).
            'gpu_info_list' -> list of per-GPU dicts with the keys 'bdf',
                               'market_name', 'mem_util', 'temp',
                               'uncorr_ecc', 'power_usage', 'gpu_id',
                               'hip_id', 'oam_id', 'partition_mode',
                               'gfx_util', 'fan' and 'mem_usage'.

    Unknown values are expected as the string "N/A" and are printed as-is.
    """
    # some template lines
    # TODO: adjust column lines to give market name more space
    # NOTE(review): as written here the header/row templates below are
    # narrower than these border lines - confirm the intended column padding.
    default_line_1 = "+------------------------------------------------------------------------------+"
    default_line_2 = "|--------------------------------------+---------------------------------------|"
    default_line_3 = "|======================================+=======================================|"
    default_line_4 = "+--------------------------------------+---------------------------------------+"
    # default_line_5 = "|==============================================================================|"

    def with_unit(value, unit):
        # Append the unit only to real readings; "N/A" stays bare.
        if value != "N/A":
            return str(value) + unit
        return str(value)

    version_info = output['version_info']

    # print the version information first
    amd_smi_version = str(version_info['amd-smi'])
    if len(amd_smi_version) > 16:
        amd_smi_version = amd_smi_version[:13] + "..."

    # 'rocm version' is a (found, version) pair; anything else (for example
    # a bare "N/A" placeholder) must not be indexed as if it were that pair.
    rocm_version = "N/A"
    rocm_info = version_info['rocm version']
    if isinstance(rocm_info, (tuple, list)) and len(rocm_info) > 1 and rocm_info[0]:
        rocm_version = str(rocm_info[1])

    # 'amdgpu version' is normally the driver-info dict, but may still be a
    # placeholder string when the driver query was not possible.
    amdgpu_version = "N/A"
    driver_info = version_info['amdgpu version']
    if isinstance(driver_info, dict):
        amdgpu_version = str(driver_info.get('driver_version', "N/A"))

    # print GPU info
    print(default_line_1)
    print("| AMD SMI {0:16s} amdgpu version: {1:8s} ROCm version: {2:8s} |".format(amd_smi_version, amdgpu_version, rocm_version.ljust(8)))
    print(default_line_2)
    print("| BDF GPU-Name | Mem-Util Temp UECC Power-Usage |")
    print("| GPU HIP-ID OAM-ID Partition-Mode | GFX-Util Fan Memory-Usage |")
    print(default_line_3)

    gpu_info_list = output['gpu_info_list']
    last_index = len(gpu_info_list) - 1

    for index, gpu_info in enumerate(gpu_info_list):
        # --- first row: BDF, name, memory utilization, temp, UECC, power ---
        bdf = str(gpu_info['bdf']).ljust(12)

        market_name = str(gpu_info['market_name'])
        if len(market_name) > 22:
            # Keep the tail of an over-long name (the model designation).
            market_name = ("..." + market_name[-19:])
        market_name = market_name.rjust(22)

        mem_util = with_unit(gpu_info['mem_util'], " %").rjust(8)
        temp = with_unit(gpu_info['temp'], " \u00b0C").rjust(6)
        u_ecc = str(gpu_info['uncorr_ecc']).rjust(5)

        power_usage = gpu_info['power_usage']
        if power_usage != "N/A":
            power_usage = f"{power_usage['current_power']}/{power_usage['power_limit']} W"
        power_usage = str(power_usage).rjust(12)

        print("| {0:12s} {1:22s} | {2:8s} {3:6s} {4:5s} {5:12s} |".format(bdf, market_name, mem_util, temp, u_ecc, power_usage))

        # --- second row: ids, partition mode, gfx utilization, fan, VRAM ---
        gpu_id = str(gpu_info['gpu_id']).rjust(3)
        hip_id = str(gpu_info['hip_id']).rjust(6)
        oam_id = str(gpu_info['oam_id']).rjust(7)
        partition_modes = str(gpu_info['partition_mode']).rjust(14)

        gfx_util = with_unit(gpu_info['gfx_util'], " %").rjust(8)
        fan = with_unit(gpu_info['fan'], " %").rjust(7)

        mem_usage = gpu_info['mem_usage']
        if mem_usage != "N/A":
            mem_usage = f"{mem_usage['used_vram']}/{mem_usage['total_vram']} MB"
        mem_usage = str(mem_usage).rjust(19)
        print("| {0:3s} {1:6s} {2:7s} {3:14s} | {4:8s} {5:7s} {6:19s} |".format(gpu_id, hip_id, oam_id, partition_modes, gfx_util, fan, mem_usage))

        # Separator between GPUs, but not after the last one.
        if index < last_index:
            print(default_line_2)

    print(default_line_4)
|
||||
|
||||
# # print process list of all GPUs last
|
||||
# print(default_line_1)
|
||||
# print("| Processes: |")
|
||||
# print("| GPU PID Process name VRAM_MEM |")
|
||||
# print(default_line_5)
|
||||
# if len(output['processes']) != 0:
|
||||
# for process in output['processes']:
|
||||
# gpu_id = str(process['gpu']).rjust(4)
|
||||
# pid = str(process['pid']).ljust(7)
|
||||
# process_name = str(process['name']).ljust(25)
|
||||
# vram_mem = str(process['vram']).rjust(18)
|
||||
# print("| {0:4s} {1:7s} {2:25s} {3:18s} |".format(gpu_id, pid, process_name, vram_mem))
|
||||
# else:
|
||||
# print("| No running processes found |")
|
||||
# print(default_line_1)
|
||||
@@ -69,7 +69,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
"""
|
||||
def __init__(self, version, list, static, firmware, bad_pages, metric,
|
||||
process, profile, event, topology, set_value, reset, monitor,
|
||||
xgmi, partition, ras):
|
||||
xgmi, partition, ras, default):
|
||||
|
||||
# Helper variables
|
||||
self.helpers = AMDSMIHelpers()
|
||||
@@ -115,7 +115,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
# Store possible subcommands & aliases for later errors
|
||||
self.possible_commands = ['version', 'list', 'static', 'firmware', 'ucode', 'bad-pages',
|
||||
'metric', 'process', 'profile', 'event', 'topology', 'set',
|
||||
'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras']
|
||||
'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras', 'default']
|
||||
|
||||
# Add all subparsers
|
||||
self._add_version_parser(self.subparsers, version)
|
||||
@@ -135,8 +135,9 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
self._add_partition_parser(self.subparsers, partition)
|
||||
self._add_ras_parser(self.subparsers, ras)
|
||||
|
||||
# the default command
|
||||
self._add_default_parser(self.subparsers, default)
|
||||
|
||||
### Parser Validators and Helpers###
|
||||
def _not_negative_int(self, int_value, sub_arg=None):
|
||||
# Argument type validator
|
||||
if int_value.isdigit(): # Is digit doesn't work on negative numbers
|
||||
@@ -637,6 +638,17 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
|
||||
return watch_arguments_group
|
||||
|
||||
def _add_default_parser(self, subparsers: argparse._SubParsersAction, func):
    """Register the ``default`` subcommand on *subparsers*.

    Args:
        subparsers: The sub-parser action the new command is attached to.
        func: Callable stored as the ``func`` default of the parsed
            namespace for this subcommand.
    """
    # The command has no arguments of its own; the subparser exists so the
    # later dispatch logic can treat it like every other command.
    parser = subparsers.add_parser(
        'default',
        help="Display the default information panel?",
        description=None,
    )
    parser._optionals.title = None
    parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog)
    parser.set_defaults(func=func)

    # Add Universal Arguments
    self._add_command_modifiers(parser)
|
||||
|
||||
|
||||
def _add_version_parser(self, subparsers: argparse._SubParsersAction, func):
|
||||
# Subparser help text
|
||||
|
||||
Reference in New Issue
Block a user