Added RAS and ECC counting back to Linux VMs

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Change-Id: Ie981f7fe8f481f2137e95dda2e200d00ab4d92c8


[ROCm/amdsmi commit: abee26d4ab]
Tento commit je obsažen v:
Maisam Arif
2024-11-04 15:36:48 -06:00
rodič 468a223acc
revize e9f43bc3dd
3 změnil soubory, kde provedl 18 přidání a 44 odebrání
+5 -8
Zobrazit soubor
@@ -8,6 +8,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
### Added
- **Added support for `amd-smi metric --ecc` & `amd-smi metric --ecc-blocks` on Guest VMs**.
Guest VMs now support getting current ECC counts and RAS information from the Host cards.
- **Added support for GPU metrics 1.6 to `amdsmi_get_gpu_metrics_info()`**.
Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to include new fields for PVIOL / TVIOL, XCP (Graphics Compute Partitions) stats, and pcie_lc_perf_other_end_recovery:
- `uint64_t accumulation_counter` - used for all throttled calculations
@@ -494,7 +497,7 @@ GPU: 0
### Changed
- **Updated BDF commands to use KFD SYSFS for BDF: `amdsmi_get_gpu_device_bdf()`**.
- **Updated BDF commands to use KFD SYSFS for BDF: `amdsmi_get_gpu_device_bdf()`**.
This aligns BDF output with ROCm SMI.
See below for an overview: the BDF, as seen from `rsmi_dev_pci_id_get()`, now provides the partition ID. See the API for more detail. Previously these bits were reserved bits (right before the domain) and the partition ID was within the function field.
- bits [63:32] = domain
@@ -502,7 +505,7 @@ See below for overview as seen from `rsmi_dev_pci_id_get()` now provides partiti
- bits [27:16] = reserved
- bits [15: 0] = pci bus/device/function
- **Moved python tests directory path install location**.
- **Moved python tests directory path install location**.
- `/opt/<rocm-path>/share/amd_smi/pytest/..` to `/opt/<rocm-path>/share/amd_smi/tests/python_unittest/..`
- On amd-smi-lib-tests uninstall, the amd_smi tests folder is removed
- Removed pytest dependency, our python testing now only depends on the unittest framework.
@@ -580,12 +583,6 @@ GPU: 0
- **Removed usage of _validate_positive in Parser and replaced with _positive_int and _not_negative_int as appropriate**.
- This will allow 0 to be a valid input for several options in setting CPUs where appropriate (for example, as a mode or NBIOID)
- **Removed `--ras` option from `amd-smi static` command in Guest environments**.
- VMs don't have permission from Hosts to obtain RAS information, so this option was made invalid on Guest environments.
- **Removed `--ecc` option from `amd-smi monitor` command in Guest environments**.
- Guest VMs do not support getting current ECC counts from the Host cards.
### Optimized
- **Adjusted ordering of gpu_metrics calls to ensure that pcie_bw values remain stable in `amd-smi metric` & `amd-smi monitor`**.
+8 -29
Zobrazit soubor
@@ -311,6 +311,8 @@ class AMDSMICommands():
args.board = board
if driver:
args.driver = driver
if ras:
args.ras = ras
if vram:
args.vram = vram
if cache:
@@ -319,14 +321,12 @@ class AMDSMICommands():
args.process_isolation = process_isolation
# Store args that are applicable to the current platform
current_platform_args = ["asic", "bus", "vbios", "driver",
current_platform_args = ["asic", "bus", "vbios", "driver", "ras",
"vram", "cache", "board", "process_isolation"]
current_platform_values = [args.asic, args.bus, args.vbios, args.driver,
current_platform_values = [args.asic, args.bus, args.vbios, args.driver, args.ras,
args.vram, args.cache, args.board, args.process_isolation]
if self.helpers.is_linux() and self.helpers.is_baremetal():
if ras:
args.ras = ras
if partition:
args.partition = partition
if limit:
@@ -336,8 +336,7 @@ class AMDSMICommands():
if xgmi_plpd:
args.xgmi_plpd = xgmi_plpd
current_platform_args += ["ras", "limit", "partition", "soc_pstate", "xgmi_plpd"]
current_platform_values += [args.ras, args.limit, args.partition,
args.soc_pstate, args.xgmi_plpd]
current_platform_values += [args.ras, args.limit, args.partition, args.soc_pstate, args.xgmi_plpd]
if self.helpers.is_linux() and not self.helpers.is_virtual_os():
if numa:
@@ -1250,17 +1249,13 @@ class AMDSMICommands():
args.temperature = temperature
if pcie:
args.pcie = pcie
current_platform_args += ["usage", "power", "clock", "temperature", "pcie"]
current_platform_values += [args.usage, args.power, args.clock,
args.temperature, args.pcie]
# Only args that are applicable to Hypervisors and BM Linux
if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()):
if ecc:
args.ecc = ecc
if ecc_blocks:
args.ecc_blocks = ecc_blocks
current_platform_args += ["ecc", "ecc_blocks"]
current_platform_args += ["usage", "power", "clock", "temperature", "pcie", "ecc", "ecc_blocks"]
current_platform_values += [args.usage, args.power, args.clock,
args.temperature, args.pcie]
current_platform_values += [args.ecc, args.ecc_blocks]
if self.helpers.is_baremetal() and self.helpers.is_linux():
@@ -4493,22 +4488,6 @@ class AMDSMICommands():
if args.gpu == None:
args.gpu = self.device_handles
# handle platform for ecc
if self.helpers.is_virtual_os():
args.ecc = False
if not any([args.power_usage, args.temperature, args.gfx, args.mem,
args.encoder, args.decoder, args.vram_usage, args.pcie, args.violation]):
args.power_usage = args.temperature = args.gfx = args.mem = \
args.encoder = args.decoder = \
args.vram_usage = args.pcie = args.violation = True
else:
if not any([args.power_usage, args.temperature, args.gfx, args.mem,
args.encoder, args.decoder, args.ecc,
args.vram_usage, args.pcie, args.violation]):
args.power_usage = args.temperature = args.gfx = args.mem = \
args.encoder = args.decoder = args.ecc = \
args.vram_usage = args.pcie = args.violation = True
# If all arguments are False, then print all values
# Don't include process in this logic as it's an optional edge case
if not any([args.power_usage, args.temperature, args.gfx, args.mem,
+5 -7
Zobrazit soubor
@@ -635,10 +635,10 @@ class AMDSMIParser(argparse.ArgumentParser):
static_parser.add_argument('-c', '--cache', action='store_true', required=False, help=cache_help)
static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help)
static_parser.add_argument('-R', '--process-isolation', action='store_true', required=False, help=process_isolation_help)
static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help)
# Options to display on Hypervisors and Baremetal
if self.helpers.is_hypervisor() or self.helpers.is_baremetal():
static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help)
static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help)
static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help)
static_parser.add_argument('-P', '--soc-pstate', action='store_true', required=False, help=soc_pstate_help)
@@ -820,11 +820,12 @@ class AMDSMIParser(argparse.ArgumentParser):
metric_parser.add_argument('-c', '--clock', action='store_true', required=False, help=clock_help)
metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help)
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help)
# Options that only apply to Hypervisors and Baremetal Linux
if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()):
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help)
pass
# Optional Args for Linux Baremetal Systems
if self.helpers.is_baremetal() and self.helpers.is_linux():
@@ -1203,10 +1204,7 @@ class AMDSMIParser(argparse.ArgumentParser):
monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help)
monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help)
monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help)
if not self.helpers.is_virtual_os():
monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help)
monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_bandwidth_help)
monitor_parser.add_argument('-q', '--process', action='store_true', required=False, help=process_help)