Moved the PCIe replay count from its own -r/--replay-count flag into the output of metric --pcie

Change-Id: I86564af04526df9b1a8cdae14da39450d924805d
Signed-off-by: Maisam Arif <maisarif@amd.com>


[ROCm/amdsmi commit: 901b9e8331]
Αυτή η υποβολή περιλαμβάνεται σε:
Maisam Arif
2023-09-22 06:34:47 -05:00
γονέας a904b6503e
υποβολή 7463a84bd6
3 αρχεία άλλαξαν με 21 προσθήκες και 33 διαγραφές
@@ -202,12 +202,11 @@ Metric arguments:
-c, --clock Average, max, and current clock frequencies
-t, --temperature Current temperatures
-e, --ecc Number of ECC errors
-P, --pcie Current PCIe speed and width
-P, --pcie Current PCIe speed, width, and replay count
-f, --fan Current fan speed
-C, --voltage-curve Display voltage curve
-o, --overdrive Current GPU clock overdrive level
-l, --perf-level Current DPM performance level
-r, --replay-count PCIe replay count
-x, --xgmi-err XGMI error information since last read
-E, --energy Amount of energy consumed
-m, --mem-usage Memory usage per block
@@ -696,7 +696,7 @@ class AMDSMICommands():
usage=None, watch=None, watch_time=None, iterations=None, power=None,
clock=None, temperature=None, ecc=None, ecc_block=None, pcie=None,
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
replay_count=None, xgmi_err=None, energy=None, mem_usage=None):
xgmi_err=None, energy=None, mem_usage=None):
"""Get Metric information for target gpu
Args:
@@ -718,7 +718,6 @@ class AMDSMICommands():
voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None.
overdrive (bool, optional): Value override for args.overdrive. Defaults to None.
perf_level (bool, optional): Value override for args.perf_level. Defaults to None.
replay_count (bool, optional): Value override for args.replay_count. Defaults to None.
xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None.
energy (bool, optional): Value override for args.energy. Defaults to None.
mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None.
@@ -742,10 +741,6 @@ class AMDSMICommands():
if mem_usage:
args.mem_usage = mem_usage
if not self.helpers.is_virtual_os():
if replay_count:
args.replay_count = replay_count
if self.helpers.is_linux() and self.helpers.is_baremetal():
if usage:
args.usage = usage
@@ -822,12 +817,11 @@ class AMDSMICommands():
if self.helpers.is_linux() and self.helpers.is_baremetal():
if not any([args.usage, args.mem_usage, args.power, args.clock, args.temperature,
args.ecc, args.ecc_block, args.pcie, args.fan, args.voltage_curve,
args.overdrive, args.perf_level, args.replay_count, args.xgmi_err,
args.energy]):
args.overdrive, args.perf_level, args.xgmi_err, args.energy]):
args.usage = args.mem_usage = args.power = args.clock = args.temperature = \
args.ecc = args.ecc_block = args.pcie = args.fan = args.voltage_curve = \
args.overdrive = args.perf_level = args.replay_count = args.xgmi_err = \
args.energy = self.all_arguments = True
args.overdrive = args.perf_level = args.xgmi_err = args.energy = \
self.all_arguments = True
# Add timestamp and store values for specified arguments
values_dict = {}
@@ -1003,6 +997,10 @@ class AMDSMICommands():
values_dict['ecc_block'] = "N/A"
logging.debug("Failed to get ecc block features for gpu %s | %s", args.gpu, e.get_error_info())
if args.pcie:
pcie_dict = {'current_width': "N/A",
'current_speed': "N/A",
'replay_count' : "N/A"}
try:
pcie_link_status = amdsmi_interface.amdsmi_get_pcie_link_status(args.gpu)
@@ -1011,20 +1009,22 @@ class AMDSMICommands():
else:
pcie_speed_GTs_value = round(pcie_link_status['pcie_speed'] / 1000)
pcie_link_status['pcie_speed'] = pcie_speed_GTs_value
# The interface version should not be displayed as it is based on the current speed
del pcie_link_status['pcie_interface_version']
pcie_dict['current_speed'] = pcie_speed_GTs_value
pcie_dict['current_width'] = pcie_link_status['pcie_lanes']
if self.logger.is_human_readable_format():
unit = 'GT/s'
pcie_link_status['pcie_speed'] = f"{pcie_link_status['pcie_speed']} {unit}"
pcie_link_status['current_width'] = pcie_link_status.pop('pcie_lanes')
pcie_link_status['current_speed'] = pcie_link_status.pop('pcie_speed')
values_dict['pcie'] = pcie_link_status
pcie_link_status['current_speed'] = f"{pcie_link_status['pcie_speed']} {unit}"
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['pcie'] = "N/A"
logging.debug("Failed to get pcie link status for gpu %s | %s", args.gpu, e.get_error_info())
try:
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu)
pcie_dict['replay_count'] = pci_replay_counter
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get pci replay counter for gpu %s | %s", args.gpu, e.get_error_info())
values_dict['pcie'] = pcie_dict
if args.fan:
try:
fan_speed = amdsmi_interface.amdsmi_get_gpu_fan_speed(args.gpu, 0)
@@ -1094,15 +1094,6 @@ class AMDSMICommands():
values_dict['perf_level'] = "N/A"
logging.debug("Failed to get perf level for gpu %s | %s", args.gpu, e.get_error_info())
if not self.helpers.is_virtual_os():
if args.replay_count:
try:
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu)
values_dict['replay_count'] = pci_replay_counter
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['replay_count'] = "N/A"
logging.debug("Failed to get pci replay counter for gpu %s | %s", args.gpu, e.get_error_info())
if self.helpers.is_linux() and self.helpers.is_baremetal():
if args.xgmi_err:
try:
@@ -425,14 +425,13 @@ class AMDSMIParser(argparse.ArgumentParser):
temperature_help = "Current temperatures"
ecc_help = "Number of ECC errors"
ecc_block_help = "Number of ECC errors per block"
pcie_help = "Current PCIe speed and width"
pcie_help = "Current PCIe speed, width, and replay count"
# Help text for Arguments only on Linux Baremetal platforms
fan_help = "Current fan speed"
vc_help = "Display voltage curve"
overdrive_help = "Current GPU clock overdrive level"
perf_level_help = "Current DPM performance level"
replay_count_help = "PCIe replay count"
xgmi_err_help = "XGMI error information since last read"
energy_help = "Amount of energy consumed"
@@ -466,7 +465,6 @@ class AMDSMIParser(argparse.ArgumentParser):
metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
metric_parser.add_argument('-k', '--ecc-block', action='store_true', required=False, help=ecc_block_help)
metric_parser.add_argument('-r', '--replay-count', action='store_true', required=False, help=replay_count_help)
metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help)
# Optional Args for Linux Baremetal Systems