From e9f43bc3ddc672705bd85e0df21d5ee2e89205be Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Mon, 4 Nov 2024 15:36:48 -0600 Subject: [PATCH] Added ras and ecc counting back to Linux VMs Signed-off-by: Maisam Arif Change-Id: Ie981f7fe8f481f2137e95dda2e200d00ab4d92c8 [ROCm/amdsmi commit: abee26d4ab740d54a5a47b5833f10cd3501a1253] --- projects/amdsmi/CHANGELOG.md | 13 +++---- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 37 ++++--------------- projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 12 +++--- 3 files changed, 18 insertions(+), 44 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index bfe31f066f..fb43dbc8c0 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -8,6 +8,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ### Added +- **Added support for `amd-smi metric --ecc` & `amd-smi metric --ecc-blocks` on Guest VMs**. +Guest VMs now support getting current ECC counts and RAS information from the Host cards. + - **Added support for GPU metrics 1.6 to `amdsmi_get_gpu_metrics_info()`**. Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to include new fields for PVIOL / TVIOL, XCP (Graphics Compute Partitions) stats, and pcie_lc_perf_other_end_recovery: - `uint64_t accumulation_counter` - used for all throttled calculations @@ -494,7 +497,7 @@ GPU: 0 ### Changed -- **Updated BDF commands to look use KFD SYSFS for BDF: `amdsmi_get_gpu_device_bdf()`**. +- **Updated BDF commands to use KFD SYSFS for BDF: `amdsmi_get_gpu_device_bdf()`**. This aligns BDF output with ROCm SMI. See below for overview as seen from `rsmi_dev_pci_id_get()` now provides partition ID. See API for better detail. Previously these bits were reserved bits (right before domain) and partition id was within function.
- bits [63:32] = domain @@ -502,7 +505,7 @@ See below for overview as seen from `rsmi_dev_pci_id_get()` now provides partiti - bits [27:16] = reserved - bits [15: 0] = pci bus/device/function -- **Moved python tests directory path install location**. +- **Moved python tests directory path install location**. - `/opt//share/amd_smi/pytest/..` to `/opt//share/amd_smi/tests/python_unittest/..` - On amd-smi-lib-tests uninstall, the amd_smi tests folder is removed - Removed pytest dependency, our python testing now only depends on the unittest framework. @@ -580,12 +583,6 @@ GPU: 0 - **Removed usage of _validate_positive in Parser and replaced with _positive_int and _not_negative_int as appropriate**. - This will allow 0 to be a valid input for several options in setting CPUs where appropriate (for example, as a mode or NBIOID) -- **Removed `--ras` option from `amd-smi static` command in Guest environments**. - - VMs don't have permission from Hosts to obtain RAS information, so this option was made invalid on Guest environments. - -- **Removed `--ecc` option from `amd-smi monitor` command in Guest environments**. - - Guest VMs do not support getting current ECC counts from the Host cards. - ### Optimized - **Adjusted ordering of gpu_metrics calls to ensure that pcie_bw values remain stable in `amd-smi metric` & `amd-smi monitor`**. 
diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 03538c1f84..045865a854 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -311,6 +311,8 @@ class AMDSMICommands(): args.board = board if driver: args.driver = driver + if ras: + args.ras = ras if vram: args.vram = vram if cache: @@ -319,14 +321,12 @@ class AMDSMICommands(): args.process_isolation = process_isolation # Store args that are applicable to the current platform - current_platform_args = ["asic", "bus", "vbios", "driver", + current_platform_args = ["asic", "bus", "vbios", "driver", "ras", "vram", "cache", "board", "process_isolation"] - current_platform_values = [args.asic, args.bus, args.vbios, args.driver, + current_platform_values = [args.asic, args.bus, args.vbios, args.driver, args.ras, args.vram, args.cache, args.board, args.process_isolation] if self.helpers.is_linux() and self.helpers.is_baremetal(): - if ras: - args.ras = ras if partition: args.partition = partition if limit: @@ -336,8 +336,7 @@ class AMDSMICommands(): if xgmi_plpd: args.xgmi_plpd = xgmi_plpd current_platform_args += ["ras", "limit", "partition", "soc_pstate", "xgmi_plpd"] - current_platform_values += [args.ras, args.limit, args.partition, - args.soc_pstate, args.xgmi_plpd] + current_platform_values += [args.ras, args.limit, args.partition, args.soc_pstate, args.xgmi_plpd] if self.helpers.is_linux() and not self.helpers.is_virtual_os(): if numa: @@ -1250,17 +1249,13 @@ class AMDSMICommands(): args.temperature = temperature if pcie: args.pcie = pcie - current_platform_args += ["usage", "power", "clock", "temperature", "pcie"] - current_platform_values += [args.usage, args.power, args.clock, - args.temperature, args.pcie] - - # Only args that are applicable to Hypervisors and BM Linux - if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()): if ecc: args.ecc = ecc if 
ecc_blocks: args.ecc_blocks = ecc_blocks - current_platform_args += ["ecc", "ecc_blocks"] + current_platform_args += ["usage", "power", "clock", "temperature", "pcie", "ecc", "ecc_blocks"] + current_platform_values += [args.usage, args.power, args.clock, + args.temperature, args.pcie] current_platform_values += [args.ecc, args.ecc_blocks] if self.helpers.is_baremetal() and self.helpers.is_linux(): @@ -4493,22 +4488,6 @@ class AMDSMICommands(): if args.gpu == None: args.gpu = self.device_handles - # handle platform for ecc - if self.helpers.is_virtual_os(): - args.ecc = False - if not any([args.power_usage, args.temperature, args.gfx, args.mem, - args.encoder, args.decoder, args.vram_usage, args.pcie, args.violation]): - args.power_usage = args.temperature = args.gfx = args.mem = \ - args.encoder = args.decoder = \ - args.vram_usage = args.pcie = args.violation = True - else: - if not any([args.power_usage, args.temperature, args.gfx, args.mem, - args.encoder, args.decoder, args.ecc, - args.vram_usage, args.pcie, args.violation]): - args.power_usage = args.temperature = args.gfx = args.mem = \ - args.encoder = args.decoder = args.ecc = \ - args.vram_usage = args.pcie = args.violation = True - # If all arguments are False, the print all values # Don't include process in this logic as it's an optional edge case if not any([args.power_usage, args.temperature, args.gfx, args.mem, diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 4a069b4ca1..63e864d0b2 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -635,10 +635,10 @@ class AMDSMIParser(argparse.ArgumentParser): static_parser.add_argument('-c', '--cache', action='store_true', required=False, help=cache_help) static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help) static_parser.add_argument('-R', '--process-isolation', action='store_true', required=False, 
help=process_isolation_help) + static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) # Options to display on Hypervisors and Baremetal if self.helpers.is_hypervisor() or self.helpers.is_baremetal(): - static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) static_parser.add_argument('-P', '--soc-pstate', action='store_true', required=False, help=soc_pstate_help) @@ -820,11 +820,12 @@ class AMDSMIParser(argparse.ArgumentParser): metric_parser.add_argument('-c', '--clock', action='store_true', required=False, help=clock_help) metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help) + metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) + metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help) # Options that only apply to Hypervisors and Baremetal Linux if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()): - metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) - metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help) + pass # Optional Args for Linux Baremetal Systems if self.helpers.is_baremetal() and self.helpers.is_linux(): @@ -1203,10 +1204,7 @@ class AMDSMIParser(argparse.ArgumentParser): monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help) monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help) 
monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help) - - if not self.helpers.is_virtual_os(): - monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) - + monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help) monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_bandwidth_help) monitor_parser.add_argument('-q', '--process', action='store_true', required=False, help=process_help)