Moved the PCIe replay count from the standalone -r/--replay-count option into the output of metric --pcie
Change-Id: I86564af04526df9b1a8cdae14da39450d924805d
Signed-off-by: Maisam Arif <maisarif@amd.com>
[ROCm/amdsmi commit: 901b9e8331]
Αυτή η υποβολή περιλαμβάνεται σε:
@@ -202,12 +202,11 @@ Metric arguments:
|
||||
-c, --clock Average, max, and current clock frequencies
|
||||
-t, --temperature Current temperatures
|
||||
-e, --ecc Number of ECC errors
|
||||
-P, --pcie Current PCIe speed and width
|
||||
-P, --pcie Current PCIe speed, width, and replay count
|
||||
-f, --fan Current fan speed
|
||||
-C, --voltage-curve Display voltage curve
|
||||
-o, --overdrive Current GPU clock overdrive level
|
||||
-l, --perf-level Current DPM performance level
|
||||
-r, --replay-count PCIe replay count
|
||||
-x, --xgmi-err XGMI error information since last read
|
||||
-E, --energy Amount of energy consumed
|
||||
-m, --mem-usage Memory usage per block
|
||||
|
||||
@@ -696,7 +696,7 @@ class AMDSMICommands():
|
||||
usage=None, watch=None, watch_time=None, iterations=None, power=None,
|
||||
clock=None, temperature=None, ecc=None, ecc_block=None, pcie=None,
|
||||
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
|
||||
replay_count=None, xgmi_err=None, energy=None, mem_usage=None):
|
||||
xgmi_err=None, energy=None, mem_usage=None):
|
||||
"""Get Metric information for target gpu
|
||||
|
||||
Args:
|
||||
@@ -718,7 +718,6 @@ class AMDSMICommands():
|
||||
voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None.
|
||||
overdrive (bool, optional): Value override for args.overdrive. Defaults to None.
|
||||
perf_level (bool, optional): Value override for args.perf_level. Defaults to None.
|
||||
replay_count (bool, optional): Value override for args.replay_count. Defaults to None.
|
||||
xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None.
|
||||
energy (bool, optional): Value override for args.energy. Defaults to None.
|
||||
mem_usage (bool, optional): Value override for args.mem_usage. Defaults to None.
|
||||
@@ -742,10 +741,6 @@ class AMDSMICommands():
|
||||
if mem_usage:
|
||||
args.mem_usage = mem_usage
|
||||
|
||||
if not self.helpers.is_virtual_os():
|
||||
if replay_count:
|
||||
args.replay_count = replay_count
|
||||
|
||||
if self.helpers.is_linux() and self.helpers.is_baremetal():
|
||||
if usage:
|
||||
args.usage = usage
|
||||
@@ -822,12 +817,11 @@ class AMDSMICommands():
|
||||
if self.helpers.is_linux() and self.helpers.is_baremetal():
|
||||
if not any([args.usage, args.mem_usage, args.power, args.clock, args.temperature,
|
||||
args.ecc, args.ecc_block, args.pcie, args.fan, args.voltage_curve,
|
||||
args.overdrive, args.perf_level, args.replay_count, args.xgmi_err,
|
||||
args.energy]):
|
||||
args.overdrive, args.perf_level, args.xgmi_err, args.energy]):
|
||||
args.usage = args.mem_usage = args.power = args.clock = args.temperature = \
|
||||
args.ecc = args.ecc_block = args.pcie = args.fan = args.voltage_curve = \
|
||||
args.overdrive = args.perf_level = args.replay_count = args.xgmi_err = \
|
||||
args.energy = self.all_arguments = True
|
||||
args.overdrive = args.perf_level = args.xgmi_err = args.energy = \
|
||||
self.all_arguments = True
|
||||
|
||||
# Add timestamp and store values for specified arguments
|
||||
values_dict = {}
|
||||
@@ -1003,6 +997,10 @@ class AMDSMICommands():
|
||||
values_dict['ecc_block'] = "N/A"
|
||||
logging.debug("Failed to get ecc block features for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
if args.pcie:
|
||||
pcie_dict = {'current_width': "N/A",
|
||||
'current_speed': "N/A",
|
||||
'replay_count' : "N/A"}
|
||||
|
||||
try:
|
||||
pcie_link_status = amdsmi_interface.amdsmi_get_pcie_link_status(args.gpu)
|
||||
|
||||
@@ -1011,20 +1009,22 @@ class AMDSMICommands():
|
||||
else:
|
||||
pcie_speed_GTs_value = round(pcie_link_status['pcie_speed'] / 1000)
|
||||
|
||||
pcie_link_status['pcie_speed'] = pcie_speed_GTs_value
|
||||
# The interface version should not be displayed as it is based on the current speed
|
||||
del pcie_link_status['pcie_interface_version']
|
||||
pcie_dict['current_speed'] = pcie_speed_GTs_value
|
||||
pcie_dict['current_width'] = pcie_link_status['pcie_lanes']
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
unit = 'GT/s'
|
||||
pcie_link_status['pcie_speed'] = f"{pcie_link_status['pcie_speed']} {unit}"
|
||||
pcie_link_status['current_width'] = pcie_link_status.pop('pcie_lanes')
|
||||
pcie_link_status['current_speed'] = pcie_link_status.pop('pcie_speed')
|
||||
|
||||
values_dict['pcie'] = pcie_link_status
|
||||
pcie_link_status['current_speed'] = f"{pcie_link_status['pcie_speed']} {unit}"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
values_dict['pcie'] = "N/A"
|
||||
logging.debug("Failed to get pcie link status for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
|
||||
try:
|
||||
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu)
|
||||
pcie_dict['replay_count'] = pci_replay_counter
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get pci replay counter for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
|
||||
values_dict['pcie'] = pcie_dict
|
||||
if args.fan:
|
||||
try:
|
||||
fan_speed = amdsmi_interface.amdsmi_get_gpu_fan_speed(args.gpu, 0)
|
||||
@@ -1094,15 +1094,6 @@ class AMDSMICommands():
|
||||
values_dict['perf_level'] = "N/A"
|
||||
logging.debug("Failed to get perf level for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
|
||||
if not self.helpers.is_virtual_os():
|
||||
if args.replay_count:
|
||||
try:
|
||||
pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu)
|
||||
values_dict['replay_count'] = pci_replay_counter
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
values_dict['replay_count'] = "N/A"
|
||||
logging.debug("Failed to get pci replay counter for gpu %s | %s", args.gpu, e.get_error_info())
|
||||
|
||||
if self.helpers.is_linux() and self.helpers.is_baremetal():
|
||||
if args.xgmi_err:
|
||||
try:
|
||||
|
||||
@@ -425,14 +425,13 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
temperature_help = "Current temperatures"
|
||||
ecc_help = "Number of ECC errors"
|
||||
ecc_block_help = "Number of ECC errors per block"
|
||||
pcie_help = "Current PCIe speed and width"
|
||||
pcie_help = "Current PCIe speed, width, and replay count"
|
||||
|
||||
# Help text for Arguments only on Linux Baremetal platforms
|
||||
fan_help = "Current fan speed"
|
||||
vc_help = "Display voltage curve"
|
||||
overdrive_help = "Current GPU clock overdrive level"
|
||||
perf_level_help = "Current DPM performance level"
|
||||
replay_count_help = "PCIe replay count"
|
||||
xgmi_err_help = "XGMI error information since last read"
|
||||
energy_help = "Amount of energy consumed"
|
||||
|
||||
@@ -466,7 +465,6 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
|
||||
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
|
||||
metric_parser.add_argument('-k', '--ecc-block', action='store_true', required=False, help=ecc_block_help)
|
||||
metric_parser.add_argument('-r', '--replay-count', action='store_true', required=False, help=replay_count_help)
|
||||
metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help)
|
||||
|
||||
# Optional Args for Linux Baremetal Systems
|
||||
|
||||
Αναφορά σε νέο ζήτημα
Block a user