2
0

SWDEV-445664 - Aligned Metric Command with Host

Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: I905ee72272bb4c5ccde3e237d2663ec6e0e55034


[ROCm/amdsmi commit: 542bfc0c77]
Este cometimento está contido em:
Maisam Arif
2024-02-22 02:39:31 -06:00
cometido por Maisam Arif
ascendente 49dd38f117
cometimento 8c4518eb66
2 ficheiros modificados com 197 adições e 186 eliminações
+194 -183
Ver ficheiro
@@ -353,6 +353,7 @@ class AMDSMICommands():
try:
link_caps = amdsmi_interface.amdsmi_get_pcie_info(args.gpu)
bus_info['max_pcie_width'] = link_caps['pcie_static']['max_pcie_width']
bus_info['max_pcie_speed'] = link_caps['pcie_static']['max_pcie_speed']
bus_info['pcie_slot_type'] = link_caps['pcie_static']['slot_type']
bus_info['pcie_interface_version'] = link_caps['pcie_static']['pcie_interface_version']
@@ -374,11 +375,18 @@ class AMDSMICommands():
else:
bus_info['slot_type'] = "N/A"
if bus_info['pcie_interface_version'] > 0:
bus_info['pcie_interface_version'] = f"Gen {bus_info['pcie_interface_version']}"
# Set the unit for pcie_speed
pcie_speed_unit ='GT/s'
if self.logger.is_human_readable_format():
unit ='GT/s'
bus_info['max_pcie_speed'] = f"{bus_info['max_pcie_speed']} {unit}"
if bus_info['pcie_interface_version'] > 0:
bus_info['pcie_interface_version'] = f"Gen {bus_info['pcie_interface_version']}"
bus_info['max_pcie_speed'] = f"{bus_info['max_pcie_speed']} {pcie_speed_unit}"
if self.logger.is_json_format():
bus_info['max_pcie_speed'] = {"value" : bus_info['max_pcie_speed'],
"unit" : pcie_speed_unit}
except amdsmi_exception.AmdSmiLibraryException as e:
bus_info = "N/A"
logging.debug("Failed to get bus info for gpu %s | %s", gpu_id, e.get_error_info())
@@ -395,21 +403,6 @@ class AMDSMICommands():
except amdsmi_exception.AmdSmiLibraryException as e:
static_dict['vbios'] = "N/A"
logging.debug("Failed to get vbios info for gpu %s | %s", gpu_id, e.get_error_info())
if args.board:
static_dict['board'] = {"model_number": "N/A",
"product_serial": "N/A",
"fru_id": "N/A",
"product_name": "N/A",
"manufacturer_name": "N/A"}
try:
board_info = amdsmi_interface.amdsmi_get_gpu_board_info(args.gpu)
for key, value in board_info.items():
if isinstance(value, str):
if value.strip() == '':
board_info[key] = "N/A"
static_dict['board'] = board_info
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get board info for gpu %s | %s", gpu_id, e.get_error_info())
if 'limit' in current_platform_args:
if args.limit:
# Power limits
@@ -417,11 +410,11 @@ class AMDSMICommands():
power_limit_error = False
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
max_power_limit = power_cap_info['max_power_cap']
current_power_limit = power_cap_info['power_cap']
socket_power_limit = power_cap_info['power_cap']
except amdsmi_exception.AmdSmiLibraryException as e:
power_limit_error = True
max_power_limit = "N/A"
current_power_limit = "N/A"
socket_power_limit = "N/A"
logging.debug("Failed to get power cap info for gpu %s | %s", gpu_id, e.get_error_info())
# Edge temperature limits
@@ -491,29 +484,29 @@ class AMDSMICommands():
logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info())
if self.logger.is_human_readable_format():
unit = 'W'
pcie_speed_unit = 'W'
if not power_limit_error:
max_power_limit = f"{max_power_limit} {unit}"
current_power_limit = f"{current_power_limit} {unit}"
max_power_limit = f"{max_power_limit} {pcie_speed_unit}"
socket_power_limit = f"{socket_power_limit} {pcie_speed_unit}"
unit = '\N{DEGREE SIGN}C'
pcie_speed_unit = '\N{DEGREE SIGN}C'
if not slowdown_temp_edge_limit_error:
slowdown_temp_edge_limit = f"{slowdown_temp_edge_limit} {unit}"
slowdown_temp_edge_limit = f"{slowdown_temp_edge_limit} {pcie_speed_unit}"
if not slowdown_temp_hotspot_limit_error:
slowdown_temp_hotspot_limit = f"{slowdown_temp_hotspot_limit} {unit}"
slowdown_temp_hotspot_limit = f"{slowdown_temp_hotspot_limit} {pcie_speed_unit}"
if not slowdown_temp_vram_limit_error:
slowdown_temp_vram_limit = f"{slowdown_temp_vram_limit} {unit}"
slowdown_temp_vram_limit = f"{slowdown_temp_vram_limit} {pcie_speed_unit}"
if not shutdown_temp_edge_limit_error:
shutdown_temp_edge_limit = f"{shutdown_temp_edge_limit} {unit}"
shutdown_temp_edge_limit = f"{shutdown_temp_edge_limit} {pcie_speed_unit}"
if not shutdown_temp_hotspot_limit_error:
shutdown_temp_hotspot_limit = f"{shutdown_temp_hotspot_limit} {unit}"
shutdown_temp_hotspot_limit = f"{shutdown_temp_hotspot_limit} {pcie_speed_unit}"
if not shutdown_temp_vram_limit_error:
shutdown_temp_vram_limit = f"{shutdown_temp_vram_limit} {unit}"
shutdown_temp_vram_limit = f"{shutdown_temp_vram_limit} {pcie_speed_unit}"
limit_info = {}
# Power limits
limit_info['max_power'] = max_power_limit
limit_info['current_power'] = current_power_limit
limit_info['socket_power'] = socket_power_limit
# Shutdown limits
limit_info['slowdown_edge_temperature'] = slowdown_temp_edge_limit
@@ -535,6 +528,93 @@ class AMDSMICommands():
logging.debug("Failed to get driver info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['driver'] = driver_info_dict
if args.board:
static_dict['board'] = {"model_number": "N/A",
"product_serial": "N/A",
"fru_id": "N/A",
"product_name": "N/A",
"manufacturer_name": "N/A"}
try:
board_info = amdsmi_interface.amdsmi_get_gpu_board_info(args.gpu)
for key, value in board_info.items():
if isinstance(value, str):
if value.strip() == '':
board_info[key] = "N/A"
static_dict['board'] = board_info
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get board info for gpu %s | %s", gpu_id, e.get_error_info())
if 'ras' in current_platform_args:
if args.ras:
ras_dict = {"eeprom_version": "N/A",
"parity_schema" : "N/A",
"single_bit_schema" : "N/A",
"double_bit_schema" : "N/A",
"poison_schema" : "N/A",
"ecc_block_state": "N/A"}
try:
ras_info = amdsmi_interface.amdsmi_get_gpu_ras_feature_info(args.gpu)
for key, value in ras_info.items():
if isinstance(value, int):
if value == 65535:
logging.debug(f"Failed to get ras {key} for gpu {gpu_id}")
ras_info[key] = "N/A"
continue
if key != "eeprom_version":
if value:
ras_info[key] = "ENABLED"
else:
ras_info[key] = "DISABLED"
ras_dict.update(ras_info)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get ras info for gpu %s | %s", gpu_id, e.get_error_info())
try:
ras_states = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu)
ecc_block_state_dict = {}
for state in ras_states:
ecc_block_state_dict[state["block"]] = state["status"]
ras_dict["ecc_block_state"] = ecc_block_state_dict
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info())
static_dict["ras"] = ras_dict
if 'partition' in current_platform_args:
if args.partition:
try:
compute_partition = amdsmi_interface.amdsmi_get_gpu_compute_partition(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
compute_partition = "N/A"
logging.debug("Failed to get compute partition info for gpu %s | %s", gpu_id, e.get_error_info())
try:
memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
memory_partition = "N/A"
logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['partition'] = {"compute_partition": compute_partition,
"memory_partition": memory_partition}
if 'numa' in current_platform_args:
if args.numa:
try:
numa_node_number = amdsmi_interface.amdsmi_topo_get_numa_node_number(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
numa_node_number = "N/A"
logging.debug("Failed to get numa node number for gpu %s | %s", gpu_id, e.get_error_info())
try:
numa_affinity = amdsmi_interface.amdsmi_get_gpu_topo_numa_affinity(args.gpu)
# -1 means No numa node is assigned to the GPU, so there is no numa affinity
if self.logger.is_human_readable_format() and numa_affinity == -1:
numa_affinity = "NONE"
except amdsmi_exception.AmdSmiLibraryException as e:
numa_affinity = "N/A"
logging.debug("Failed to get numa affinity for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['numa'] = {'node' : numa_node_number,
'affinity' : numa_affinity}
if args.vram:
vram_info_dict = {"type" : "N/A",
"vendor" : "N/A",
@@ -612,78 +692,6 @@ class AMDSMICommands():
logging.debug("Failed to get cache info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['cache_info'] = cache_info_list
if 'ras' in current_platform_args:
if args.ras:
ras_dict = {"eeprom_version": "N/A",
"parity_schema" : "N/A",
"single_bit_schema" : "N/A",
"double_bit_schema" : "N/A",
"poison_schema" : "N/A",
"ecc_block_state": "N/A"}
try:
ras_info = amdsmi_interface.amdsmi_get_gpu_ras_feature_info(args.gpu)
for key, value in ras_info.items():
if isinstance(value, int):
if value == 65535:
logging.debug(f"Failed to get ras {key} for gpu {gpu_id}")
ras_info[key] = "N/A"
continue
if key != "eeprom_version":
if value:
ras_info[key] = "ENABLED"
else:
ras_info[key] = "DISABLED"
ras_dict.update(ras_info)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get ras info for gpu %s | %s", gpu_id, e.get_error_info())
try:
ras_states = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu)
ecc_block_state_dict = {}
for state in ras_states:
ecc_block_state_dict[state["block"]] = state["status"]
ras_dict["ecc_block_state"] = ecc_block_state_dict
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info())
static_dict["ras"] = ras_dict
if 'partition' in current_platform_args:
if args.partition:
try:
compute_partition = amdsmi_interface.amdsmi_get_gpu_compute_partition(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
compute_partition = "N/A"
logging.debug("Failed to get compute partition info for gpu %s | %s", gpu_id, e.get_error_info())
try:
memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
memory_partition = "N/A"
logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['partition'] = {"compute_partition": compute_partition,
"memory_partition": memory_partition}
if 'numa' in current_platform_args:
if args.numa:
try:
numa_node_number = amdsmi_interface.amdsmi_topo_get_numa_node_number(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
numa_node_number = "N/A"
logging.debug("Failed to get numa node number for gpu %s | %s", gpu_id, e.get_error_info())
try:
numa_affinity = amdsmi_interface.amdsmi_get_gpu_topo_numa_affinity(args.gpu)
# -1 means No numa node is assigned to the GPU, so there is no numa affinity
if self.logger.is_human_readable_format() and numa_affinity == -1:
numa_affinity = "NONE"
except amdsmi_exception.AmdSmiLibraryException as e:
numa_affinity = "N/A"
logging.debug("Failed to get numa affinity for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['numa'] = {'node' : numa_node_number,
'affinity' : numa_affinity}
# Convert and store output by pid for csv format
multiple_devices_csv_override = False
@@ -1008,7 +1016,7 @@ class AMDSMICommands():
def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=None,
usage=None, watch=None, watch_time=None, iterations=None, power=None,
clock=None, temperature=None, ecc=None, ecc_block=None, pcie=None,
clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None,
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
xgmi_err=None, energy=None, mem_usage=None, schedule=None,
guard=None, guest_data=None, fb_usage=None, xgmi=None,):
@@ -1027,7 +1035,7 @@ class AMDSMICommands():
clock (bool, optional): Value override for args.clock. Defaults to None.
temperature (bool, optional): Value override for args.temperature. Defaults to None.
ecc (bool, optional): Value override for args.ecc. Defaults to None.
ecc_block (bool, optional): Value override for args.ecc. Defaults to None.
ecc_blocks (bool, optional): Value override for args.ecc. Defaults to None.
pcie (bool, optional): Value override for args.pcie. Defaults to None.
fan (bool, optional): Value override for args.fan. Defaults to None.
voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None.
@@ -1085,8 +1093,8 @@ class AMDSMICommands():
current_platform_values += [args.usage, args.power, args.clock, args.temperature, args.ecc, args.pcie]
if self.helpers.is_baremetal() and self.helpers.is_linux():
if ecc_block:
args.ecc_block = ecc_block
if ecc_blocks:
args.ecc_blocks = ecc_blocks
if fan:
args.fan = fan
if voltage_curve:
@@ -1099,8 +1107,8 @@ class AMDSMICommands():
args.xgmi_err = xgmi_err
if energy:
args.energy = energy
current_platform_args += ["ecc_block", "fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy"]
current_platform_values += [args.ecc_block, args.fan, args.voltage_curve, args.overdrive, args.perf_level, args.xgmi_err, args.energy]
current_platform_args += ["ecc_blocks", "fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy"]
current_platform_values += [args.ecc_blocks, args.fan, args.voltage_curve, args.overdrive, args.perf_level, args.xgmi_err, args.energy]
if self.helpers.is_hypervisor():
if schedule:
@@ -1209,11 +1217,10 @@ class AMDSMICommands():
logging.debug("Failed to get gpu activity for gpu %s | %s", gpu_id, e.get_error_info())
if "power" in current_platform_args:
if args.power:
power_dict = {'current_power': "N/A",
'current_gfx_voltage': "N/A",
'current_soc_voltage': "N/A",
'current_mem_voltage': "N/A",
'power_limit': "N/A",
power_dict = {'socket_power': "N/A",
'gfx_voltage': "N/A",
'soc_voltage': "N/A",
'mem_voltage': "N/A",
'power_management': "N/A",
'throttle_status': "N/A"}
@@ -1228,16 +1235,15 @@ class AMDSMICommands():
elif "power" in key:
power_info[key] = f"{value} W"
power_dict['current_power'] = power_info['current_socket_power']
power_dict['socket_power'] = power_info['current_socket_power']
if power_dict['current_power'] == "N/A":
if power_dict['socket_power'] == "N/A":
# For older gpu's when current power doesn't populate we use the average socket power instead
power_dict['current_power'] = power_info['average_socket_power']
power_dict['socket_power'] = power_info['average_socket_power']
power_dict['current_gfx_voltage'] = power_info['gfx_voltage']
power_dict['current_soc_voltage'] = power_info['soc_voltage']
power_dict['current_mem_voltage'] = power_info['mem_voltage']
power_dict['power_limit'] = power_info['power_limit']
power_dict['gfx_voltage'] = power_info['gfx_voltage']
power_dict['soc_voltage'] = power_info['soc_voltage']
power_dict['mem_voltage'] = power_info['mem_voltage']
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get power info for gpu %s | %s", gpu_id, e.get_error_info())
@@ -1282,7 +1288,11 @@ class AMDSMICommands():
vlck0_clock_name = clock_name
try:
clock_info = amdsmi_interface.amdsmi_get_clock_info(args.gpu, clock_type)
clock_info_dict = amdsmi_interface.amdsmi_get_clock_info(args.gpu, clock_type)
clock_info = {"clk" : clock_info_dict["cur_clk"]}
del clock_info_dict["cur_clk"]
clock_info.update(clock_info_dict)
if clock_info['sleep_clk'] == 0xFFFFFFFF:
clock_info['sleep_clk'] = "N/A"
@@ -1303,28 +1313,28 @@ class AMDSMICommands():
continue
else:
# Handle all other failed to get clock info
clocks[clock_name] = {"cur_clk": "N/A",
clocks[clock_name] = {"clk": "N/A",
"max_clk": "N/A",
"min_clk": "N/A",
"sleep_clk": "N/A"}
logging.debug("Failed to get %s clock info for gpu %s | %s", clock_name, gpu_id, e.get_error_info())
try:
is_clk_locked = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['gfxclk_lock_status']
if is_clk_locked != "N/A":
if is_clk_locked:
is_clk_locked = "LOCKED"
gfxclk_lock_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['gfxclk_lock_status']
if gfxclk_lock_status != "N/A":
if gfxclk_lock_status:
gfxclk_lock_status = "ENABLED"
else:
is_clk_locked = "UNLOCKED"
gfxclk_lock_status = "DISABLED"
except amdsmi_exception.AmdSmiLibraryException as e:
is_clk_locked = "N/A"
gfxclk_lock_status = "N/A"
logging.debug("Failed to get gfx clock lock status info for gpu %s | %s", gpu_id, e.get_error_info())
if "gfx" in clocks:
if isinstance(clocks['gfx'], dict):
clocks['gfx']['is_clk_locked'] = is_clk_locked
clocks['gfx']['clk_locked'] = gfxclk_lock_status
else:
clocks['gfx'] = {"is_clk_locked": is_clk_locked}
clocks['gfx'] = {"clk_locked": gfxclk_lock_status}
values_dict['clock'] = clocks
if "temperature" in current_platform_args:
@@ -1372,37 +1382,10 @@ class AMDSMICommands():
temperatures[temperature_key] = f"{temperature_value} {unit}"
values_dict['temperature'] = temperatures
if "ecc" in current_platform_args:
if args.ecc:
ecc_count = {}
try:
ecc_count = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu)
ecc_count['total_correctable'] = ecc_count.pop('correctable_count')
ecc_count['total_uncorrectable'] = ecc_count.pop('uncorrectable_count')
except amdsmi_exception.AmdSmiLibraryException as e:
ecc_count['total_correctable'] = "N/A"
ecc_count['total_uncorrectable'] = "N/A"
ecc_count['cache_correctable'] = "N/A"
ecc_count['cache_uncorrectable'] = "N/A"
logging.debug("Failed to get total ecc count for gpu %s | %s", gpu_id, e.get_error_info())
if ecc_count['total_correctable'] != "N/A":
# Get the UMC error count for getting total cache correctable errors
umc_block = amdsmi_interface.AmdSmiGpuBlock['UMC']
try:
umc_count = amdsmi_interface.amdsmi_get_gpu_ecc_count(args.gpu, umc_block)
ecc_count['cache_correctable'] = ecc_count['total_correctable'] - umc_count['correctable_count']
ecc_count['cache_uncorrectable'] = ecc_count['total_uncorrectable'] - umc_count['uncorrectable_count']
except amdsmi_exception.AmdSmiLibraryException as e:
ecc_count['cache_correctable'] = "N/A"
ecc_count['cache_uncorrectable'] = "N/A"
logging.debug("Failed to get cache ecc count for gpu %s at block %s | %s", gpu_id, umc_block, e.get_error_info())
values_dict['ecc'] = ecc_count
if "pcie" in current_platform_args:
if args.pcie:
pcie_dict = {"current_lanes": "N/A",
"current_speed": "N/A",
pcie_dict = {"width": "N/A",
"speed": "N/A",
"replay_count" : "N/A",
"l0_to_recovery_count" : "N/A",
"replay_roll_over_count" : "N/A",
@@ -1420,13 +1403,13 @@ class AMDSMICommands():
else:
pcie_speed_GTs_value = round(pcie_link_status['pcie_metric']['pcie_speed'] / 1000)
pcie_dict['current_speed'] = pcie_speed_GTs_value
pcie_dict['current_lanes'] = pcie_link_status['pcie_metric']['pcie_width']
pcie_dict['width'] = pcie_link_status['pcie_metric']['pcie_width']
pcie_dict['speed'] = pcie_speed_GTs_value
if self.logger.is_human_readable_format():
unit = 'GT/s'
pcie_dict['current_lanes'] = f"{pcie_link_status['pcie_metric']['pcie_width']} lanes"
pcie_dict['current_speed'] = f"{pcie_dict['current_speed']} GT/s"
pcie_dict['width'] = f"{pcie_link_status['pcie_metric']['pcie_width']} lanes"
pcie_dict['speed'] = f"{pcie_dict['speed']} GT/s"
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info())
@@ -1492,8 +1475,35 @@ class AMDSMICommands():
logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info())
values_dict['pcie'] = pcie_dict
if "ecc_block" in current_platform_args:
if args.ecc_block:
if "ecc" in current_platform_args:
if args.ecc:
ecc_count = {}
try:
ecc_count = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu)
ecc_count['total_correctable_count'] = ecc_count.pop('correctable_count')
ecc_count['total_uncorrectable_count'] = ecc_count.pop('uncorrectable_count')
except amdsmi_exception.AmdSmiLibraryException as e:
ecc_count['total_correctable_count'] = "N/A"
ecc_count['total_uncorrectable_count'] = "N/A"
ecc_count['cache_correctable_count'] = "N/A"
ecc_count['cache_uncorrectable_count'] = "N/A"
logging.debug("Failed to get total ecc count for gpu %s | %s", gpu_id, e.get_error_info())
if ecc_count['total_correctable_count'] != "N/A":
# Get the UMC error count for getting total cache correctable errors
umc_block = amdsmi_interface.AmdSmiGpuBlock['UMC']
try:
umc_count = amdsmi_interface.amdsmi_get_gpu_ecc_count(args.gpu, umc_block)
ecc_count['cache_correctable_count'] = ecc_count['total_correctable_count'] - umc_count['correctable_count']
ecc_count['cache_uncorrectable_count'] = ecc_count['total_uncorrectable_count'] - umc_count['uncorrectable_count']
except amdsmi_exception.AmdSmiLibraryException as e:
ecc_count['cache_correctable_count'] = "N/A"
ecc_count['cache_uncorrectable_count'] = "N/A"
logging.debug("Failed to get cache ecc count for gpu %s at block %s | %s", gpu_id, umc_block, e.get_error_info())
values_dict['ecc'] = ecc_count
if "ecc_blocks" in current_platform_args:
if args.ecc_blocks:
ecc_dict = {}
uncountable_blocks = ["ATHUB", "DF", "SMN", "SEM", "MP0", "MP1", "FUSE"]
try:
@@ -1505,16 +1515,16 @@ class AMDSMICommands():
if gpu_block.name not in uncountable_blocks:
try:
ecc_count = amdsmi_interface.amdsmi_get_gpu_ecc_count(args.gpu, gpu_block)
ecc_dict[state['block']] = {'correctable' : ecc_count['correctable_count'],
'uncorrectable': ecc_count['uncorrectable_count']}
ecc_dict[state['block']] = {'correctable_count' : ecc_count['correctable_count'],
'uncorrectable_count' : ecc_count['uncorrectable_count']}
except amdsmi_exception.AmdSmiLibraryException as e:
ecc_dict[state['block']] = {'correctable' : "N/A",
'uncorrectable': "N/A"}
ecc_dict[state['block']] = {'correctable_count' : "N/A",
'uncorrectable_count' : "N/A"}
logging.debug("Failed to get ecc count for gpu %s at block %s | %s", gpu_id, gpu_block, e.get_error_info())
values_dict['ecc_block'] = ecc_dict
values_dict['ecc_blocks'] = ecc_dict
except amdsmi_exception.AmdSmiLibraryException as e:
values_dict['ecc_block'] = "N/A"
values_dict['ecc_blocks'] = "N/A"
logging.debug("Failed to get ecc block features for gpu %s | %s", gpu_id, e.get_error_info())
if "fan" in current_platform_args:
if args.fan:
@@ -2074,7 +2084,7 @@ class AMDSMICommands():
def metric(self, args, multiple_devices=False, watching_output=False, gpu=None,
usage=None, watch=None, watch_time=None, iterations=None, power=None,
clock=None, temperature=None, ecc=None, ecc_block=None, pcie=None,
clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None,
fan=None, voltage_curve=None, overdrive=None, perf_level=None,
xgmi_err=None, energy=None, mem_usage=None, schedule=None,
guard=None, guest_data=None, fb_usage=None, xgmi=None,
@@ -2101,7 +2111,7 @@ class AMDSMICommands():
clock (bool, optional): Value override for args.clock. Defaults to None.
temperature (bool, optional): Value override for args.temperature. Defaults to None.
ecc (bool, optional): Value override for args.ecc. Defaults to None.
ecc_block (bool, optional): Value override for args.ecc. Defaults to None.
ecc_blocks (bool, optional): Value override for args.ecc. Defaults to None.
pcie (bool, optional): Value override for args.pcie. Defaults to None.
fan (bool, optional): Value override for args.fan. Defaults to None.
voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None.
@@ -2158,7 +2168,7 @@ class AMDSMICommands():
# Check if a GPU argument has been set
gpu_args_enabled = False
gpu_attributes = ["usage", "watch", "watch_time", "iterations", "power", "clock",
"temperature", "ecc", "ecc_block", "pcie", "fan", "voltage_curve",
"temperature", "ecc", "ecc_blocks", "pcie", "fan", "voltage_curve",
"overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "schedule",
"guard", "guest_data", "fb_usage", "xgmi"]
for attr in gpu_attributes:
@@ -2183,12 +2193,13 @@ class AMDSMICommands():
if hasattr(args, attr):
core_args_enabled |= bool(getattr(args, attr))
logging.debug("gpu_args_enabled: %s, cpu_args_enabled: %s, core_args_enabled: %s",
gpu_args_enabled, cpu_args_enabled, core_args_enabled)
logging.debug("args.gpu: %s, args.cpu: %s, args.core: %s", args.gpu, args.cpu, args.core)
# Handle CPU and GPU driver intialization cases
if self.helpers.is_amd_hsmp_initialized() and self.helpers.is_amdgpu_initialized():
logging.debug("gpu_args_enabled: %s, cpu_args_enabled: %s, core_args_enabled: %s",
gpu_args_enabled, cpu_args_enabled, core_args_enabled)
logging.debug("args.gpu: %s, args.cpu: %s, args.core: %s", args.gpu, args.cpu, args.core)
# If a GPU or CPU argument is provided only print out the specified device.
if args.cpu == None and args.gpu == None and args.core == None:
# If no args are set, print out all CPU, GPU, and Core metrics info
@@ -2223,7 +2234,7 @@ class AMDSMICommands():
self.logger.clear_multiple_devices_ouput()
self.metric_gpu(args, multiple_devices, watching_output, gpu,
usage, watch, watch_time, iterations, power,
clock, temperature, ecc, ecc_block, pcie,
clock, temperature, ecc, ecc_blocks, pcie,
fan, voltage_curve, overdrive, perf_level,
xgmi_err, energy, mem_usage, schedule,
guard, guest_data, fb_usage, xgmi)
@@ -2258,7 +2269,7 @@ class AMDSMICommands():
self.logger.clear_multiple_devices_ouput()
self.metric_gpu(args, multiple_devices, watching_output, gpu,
usage, watch, watch_time, iterations, power,
clock, temperature, ecc, ecc_block, pcie,
clock, temperature, ecc, ecc_blocks, pcie,
fan, voltage_curve, overdrive, perf_level,
xgmi_err, energy, mem_usage, schedule)
@@ -3206,7 +3217,7 @@ class AMDSMICommands():
# Only allow one device's arguments to be set at a time
if gpu_args_enabled == cpu_args_enabled == core_args_enabled == False:
raise ValueError('No GPU, CPU, or CORE arguments provided, specific target(s) are needed')
raise ValueError('No GPU, CPU, or CORE arguments provided, specific arguments are needed')
elif gpu_args_enabled == cpu_args_enabled == core_args_enabled == True:
raise ValueError('Cannot set GPU, CPU, and CORE arguments at the same time')
elif not (gpu_args_enabled ^ cpu_args_enabled ^ core_args_enabled):
+3 -3
Ver ficheiro
@@ -686,7 +686,7 @@ class AMDSMIParser(argparse.ArgumentParser):
clock_help = "Average, max, and current clock frequencies"
temperature_help = "Current temperatures"
ecc_help = "Total number of ECC errors"
ecc_block_help = "Number of ECC errors per block"
ecc_blocks_help = "Number of ECC errors per block"
pcie_help = "Current PCIe speed, width, and replay count"
# Help text for Arguments only on Linux Baremetal platforms
@@ -755,12 +755,12 @@ class AMDSMIParser(argparse.ArgumentParser):
metric_parser.add_argument('-p', '--power', action='store_true', required=False, help=power_help)
metric_parser.add_argument('-c', '--clock', action='store_true', required=False, help=clock_help)
metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help)
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help)
metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help)
# Optional Args for Linux Baremetal Systems
if self.helpers.is_baremetal() and self.helpers.is_linux():
metric_parser.add_argument('-k', '--ecc-block', action='store_true', required=False, help=ecc_block_help)
metric_parser.add_argument('-k', '--ecc-blocks', action='store_true', required=False, help=ecc_blocks_help)
metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help)
metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help)
metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help)