[SWDEV-553557] Add bad_page_threshold_exceeded to RAS (#677)
Added the bad_page_threshold_exceeded field to the RAS output. It compares the retired page count against the bad page threshold and displays True if the retired pages exceed the threshold, False if they are within the threshold, or N/A if the threshold data is unavailable. --------- Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com> Signed-off-by: Arif, Maisam <Maisam.Arif@amd.com> Co-authored-by: Arif, Maisam <Maisam.Arif@amd.com>
This commit is contained in:
committed by
GitHub
parent
4a8ee27225
commit
edaae978a2
@@ -767,6 +767,7 @@ class AMDSMICommands():
|
||||
if args.ras:
|
||||
ras_dict = {"eeprom_version": "N/A",
|
||||
"bad_page_threshold": "N/A",
|
||||
"bad_page_threshold_exceeded": "N/A",
|
||||
"parity_schema" : "N/A",
|
||||
"single_bit_schema" : "N/A",
|
||||
"double_bit_schema" : "N/A",
|
||||
@@ -794,6 +795,23 @@ class AMDSMICommands():
|
||||
ras_dict["bad_page_threshold"] = amdsmi_interface.amdsmi_get_gpu_bad_page_threshold(args.gpu)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get bad page threshold count for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
try:
|
||||
bad_page_info = amdsmi_interface.amdsmi_get_gpu_bad_page_info(args.gpu)
|
||||
retired_pages = 0
|
||||
if bad_page_info:
|
||||
for bad_page in bad_page_info:
|
||||
if bad_page["status"] == amdsmi_interface.AmdSmiMemoryPageStatus.RESERVED:
|
||||
retired_pages += 1
|
||||
# default to N/A
|
||||
ras_dict["bad_page_threshold_exceeded"] = "N/A"
|
||||
# If this is an int, then default to False
|
||||
if isinstance(ras_dict["bad_page_threshold"], int):
|
||||
ras_dict["bad_page_threshold_exceeded"] = "False"
|
||||
if retired_pages > ras_dict["bad_page_threshold"]:
|
||||
# If there are more retired pages than the threshold, set to True
|
||||
ras_dict["bad_page_threshold_exceeded"] = "True"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get retired pages count for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
ras_states = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu)
|
||||
|
||||
Reference in New Issue
Block a user