Added ras and ecc counting back to Linux VMs
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Change-Id: Ie981f7fe8f481f2137e95dda2e200d00ab4d92c8
[ROCm/amdsmi commit: abee26d4ab]
Tento commit je obsažen v:
@@ -8,6 +8,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
|
||||
### Added
|
||||
|
||||
- **Added support for `amd-smi metric --ecc` & `amd-smi metric --ecc-blocks` on Guest VMs**.
|
||||
Guest VMs now support getting current ECC counts and RAS information from the Host cards.
|
||||
|
||||
- **Added support for GPU metrics 1.6 to `amdsmi_get_gpu_metrics_info()`**.
|
||||
Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to include new fields for PVIOL / TVIOL, XCP (Graphics Compute Partitions) stats, and pcie_lc_perf_other_end_recovery:
|
||||
- `uint64_t accumulation_counter` - used for all throttled calculations
|
||||
@@ -494,7 +497,7 @@ GPU: 0
|
||||
|
||||
### Changed
|
||||
|
||||
- **Updated BDF commands to use KFD SYSFS for BDF: `amdsmi_get_gpu_device_bdf()`**.
|
||||
- **Updated BDF commands to use KFD SYSFS for BDF: `amdsmi_get_gpu_device_bdf()`**.
|
||||
This aligns BDF output with ROCm SMI.
|
||||
See below for an overview of the BDF layout as seen from `rsmi_dev_pci_id_get()`, which now provides the partition ID. See the API documentation for more detail. Previously, these bits were reserved (right before domain) and the partition ID was within the function bits.
|
||||
- bits [63:32] = domain
|
||||
@@ -502,7 +505,7 @@ See below for overview as seen from `rsmi_dev_pci_id_get()` now provides partiti
|
||||
- bits [27:16] = reserved
|
||||
- bits [15: 0] = pci bus/device/function
|
||||
|
||||
- **Moved python tests directory path install location**.
|
||||
- **Moved python tests directory path install location**.
|
||||
- `/opt/<rocm-path>/share/amd_smi/pytest/..` to `/opt/<rocm-path>/share/amd_smi/tests/python_unittest/..`
|
||||
- On amd-smi-lib-tests uninstall, the amd_smi tests folder is removed
|
||||
- Removed pytest dependency, our python testing now only depends on the unittest framework.
|
||||
@@ -580,12 +583,6 @@ GPU: 0
|
||||
- **Removed usage of _validate_positive in Parser and replaced with _positive_int and _not_negative_int as appropriate**.
|
||||
- This will allow 0 to be a valid input for several options in setting CPUs where appropriate (for example, as a mode or NBIOID)
|
||||
|
||||
- **Removed `--ras` option from `amd-smi static` command in Guest environments**.
|
||||
- VMs don't have permission from Hosts to obtain RAS information, so this option was made invalid on Guest environments.
|
||||
|
||||
- **Removed `--ecc` option from `amd-smi monitor` command in Guest environments**.
|
||||
- Guest VMs do not support getting current ECC counts from the Host cards.
|
||||
|
||||
### Optimized
|
||||
|
||||
- **Adjusted ordering of gpu_metrics calls to ensure that pcie_bw values remain stable in `amd-smi metric` & `amd-smi monitor`**.
|
||||
|
||||
@@ -311,6 +311,8 @@ class AMDSMICommands():
|
||||
args.board = board
|
||||
if driver:
|
||||
args.driver = driver
|
||||
if ras:
|
||||
args.ras = ras
|
||||
if vram:
|
||||
args.vram = vram
|
||||
if cache:
|
||||
@@ -319,14 +321,12 @@ class AMDSMICommands():
|
||||
args.process_isolation = process_isolation
|
||||
|
||||
# Store args that are applicable to the current platform
|
||||
current_platform_args = ["asic", "bus", "vbios", "driver",
|
||||
current_platform_args = ["asic", "bus", "vbios", "driver", "ras",
|
||||
"vram", "cache", "board", "process_isolation"]
|
||||
current_platform_values = [args.asic, args.bus, args.vbios, args.driver,
|
||||
current_platform_values = [args.asic, args.bus, args.vbios, args.driver, args.ras,
|
||||
args.vram, args.cache, args.board, args.process_isolation]
|
||||
|
||||
if self.helpers.is_linux() and self.helpers.is_baremetal():
|
||||
if ras:
|
||||
args.ras = ras
|
||||
if partition:
|
||||
args.partition = partition
|
||||
if limit:
|
||||
@@ -336,8 +336,7 @@ class AMDSMICommands():
|
||||
if xgmi_plpd:
|
||||
args.xgmi_plpd = xgmi_plpd
|
||||
current_platform_args += ["ras", "limit", "partition", "soc_pstate", "xgmi_plpd"]
|
||||
current_platform_values += [args.ras, args.limit, args.partition,
|
||||
args.soc_pstate, args.xgmi_plpd]
|
||||
current_platform_values += [args.ras, args.limit, args.partition, args.soc_pstate, args.xgmi_plpd]
|
||||
|
||||
if self.helpers.is_linux() and not self.helpers.is_virtual_os():
|
||||
if numa:
|
||||
@@ -1250,17 +1249,13 @@ class AMDSMICommands():
|
||||
args.temperature = temperature
|
||||
if pcie:
|
||||
args.pcie = pcie
|
||||
current_platform_args += ["usage", "power", "clock", "temperature", "pcie"]
|
||||
current_platform_values += [args.usage, args.power, args.clock,
|
||||
args.temperature, args.pcie]
|
||||
|
||||
# Only args that are applicable to Hypervisors and BM Linux
|
||||
if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()):
|
||||
if ecc:
|
||||
args.ecc = ecc
|
||||
if ecc_blocks:
|
||||
args.ecc_blocks = ecc_blocks
|
||||
current_platform_args += ["ecc", "ecc_blocks"]
|
||||
current_platform_args += ["usage", "power", "clock", "temperature", "pcie", "ecc", "ecc_blocks"]
|
||||
current_platform_values += [args.usage, args.power, args.clock,
|
||||
args.temperature, args.pcie]
|
||||
current_platform_values += [args.ecc, args.ecc_blocks]
|
||||
|
||||
if self.helpers.is_baremetal() and self.helpers.is_linux():
|
||||
@@ -4493,22 +4488,6 @@ class AMDSMICommands():
|
||||
if args.gpu == None:
|
||||
args.gpu = self.device_handles
|
||||
|
||||
# handle platform for ecc
|
||||
if self.helpers.is_virtual_os():
|
||||
args.ecc = False
|
||||
if not any([args.power_usage, args.temperature, args.gfx, args.mem,
|
||||
args.encoder, args.decoder, args.vram_usage, args.pcie, args.violation]):
|
||||
args.power_usage = args.temperature = args.gfx = args.mem = \
|
||||
args.encoder = args.decoder = \
|
||||
args.vram_usage = args.pcie = args.violation = True
|
||||
else:
|
||||
if not any([args.power_usage, args.temperature, args.gfx, args.mem,
|
||||
args.encoder, args.decoder, args.ecc,
|
||||
args.vram_usage, args.pcie, args.violation]):
|
||||
args.power_usage = args.temperature = args.gfx = args.mem = \
|
||||
args.encoder = args.decoder = args.ecc = \
|
||||
args.vram_usage = args.pcie = args.violation = True
|
||||
|
||||
# If all arguments are False, then print all values
|
||||
# Don't include process in this logic as it's an optional edge case
|
||||
if not any([args.power_usage, args.temperature, args.gfx, args.mem,
|
||||
|
||||
@@ -635,10 +635,10 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
static_parser.add_argument('-c', '--cache', action='store_true', required=False, help=cache_help)
|
||||
static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help)
|
||||
static_parser.add_argument('-R', '--process-isolation', action='store_true', required=False, help=process_isolation_help)
|
||||
static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help)
|
||||
|
||||
# Options to display on Hypervisors and Baremetal
|
||||
if self.helpers.is_hypervisor() or self.helpers.is_baremetal():
|
||||
static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help)
|
||||
static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help)
|
||||
static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help)
|
||||
static_parser.add_argument('-P', '--soc-pstate', action='store_true', required=False, help=soc_pstate_help)
|
||||
@@ -820,11 +820,12 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
metric_parser.add_argument('-c', '--clock', action='store_true', required=False, help=clock_help)
|
||||
metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
|
||||
metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help)
|
||||
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
|
||||
metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help)
|
||||
|
||||
# Options that only apply to Hypervisors and Baremetal Linux
|
||||
if self.helpers.is_hypervisor() or (self.helpers.is_baremetal() and self.helpers.is_linux()):
|
||||
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
|
||||
metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help)
|
||||
pass
|
||||
|
||||
# Optional Args for Linux Baremetal Systems
|
||||
if self.helpers.is_baremetal() and self.helpers.is_linux():
|
||||
@@ -1203,10 +1204,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
monitor_parser.add_argument('-m', '--mem', action='store_true', required=False, help=mem_util_help)
|
||||
monitor_parser.add_argument('-n', '--encoder', action='store_true', required=False, help=encoder_util_help)
|
||||
monitor_parser.add_argument('-d', '--decoder', action='store_true', required=False, help=decoder_util_help)
|
||||
|
||||
if not self.helpers.is_virtual_os():
|
||||
monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
|
||||
|
||||
monitor_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
|
||||
monitor_parser.add_argument('-v', '--vram-usage', action='store_true', required=False, help=mem_usage_help)
|
||||
monitor_parser.add_argument('-r', '--pcie', action='store_true', required=False, help=pcie_bandwidth_help)
|
||||
monitor_parser.add_argument('-q', '--process', action='store_true', required=False, help=process_help)
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele