[SWDEV-553557] Add bad_page_threshold_exceeded to RAS (#677)

Added a bad_page_threshold_exceeded field to the RAS output. It
compares the count of retired pages against the bad page threshold.
The field displays True if the retired pages exceed the
threshold, False if they are within the threshold, or N/A if
threshold data is unavailable.

---------

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
Signed-off-by: Arif, Maisam <Maisam.Arif@amd.com>
Co-authored-by: Arif, Maisam <Maisam.Arif@amd.com>
This commit is contained in:
Kanangot Balakrishnan, Bindhiya
2025-09-09 09:15:37 -05:00
committed by GitHub
parent 4a8ee27225
commit edaae978a2
+18
View File
@@ -767,6 +767,7 @@ class AMDSMICommands():
if args.ras:
ras_dict = {"eeprom_version": "N/A",
"bad_page_threshold": "N/A",
"bad_page_threshold_exceeded": "N/A",
"parity_schema" : "N/A",
"single_bit_schema" : "N/A",
"double_bit_schema" : "N/A",
@@ -794,6 +795,23 @@ class AMDSMICommands():
ras_dict["bad_page_threshold"] = amdsmi_interface.amdsmi_get_gpu_bad_page_threshold(args.gpu)
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get bad page threshold count for gpu %s | %s", gpu_id, e.get_error_info())
# Derive ras_dict["bad_page_threshold_exceeded"] by comparing the number of
# retired bad pages against ras_dict["bad_page_threshold"].
# NOTE(review): indentation is not preserved in this view; the nesting implied
# by the comments below is presumed from statement order -- confirm in the file.
try:
bad_page_info = amdsmi_interface.amdsmi_get_gpu_bad_page_info(args.gpu)
# Count only the entries whose "status" is RESERVED; these are what this
# block treats as retired pages. A falsy result leaves the count at 0.
retired_pages = 0
if bad_page_info:
for bad_page in bad_page_info:
if bad_page["status"] == amdsmi_interface.AmdSmiMemoryPageStatus.RESERVED:
retired_pages += 1
# Start from "N/A"; this value is kept whenever the threshold is not an int.
ras_dict["bad_page_threshold_exceeded"] = "N/A"
# With an int threshold a comparison is possible, so default to "False".
# The result is stored as the strings "True"/"False", not as booleans.
if isinstance(ras_dict["bad_page_threshold"], int):
ras_dict["bad_page_threshold_exceeded"] = "False"
if retired_pages > ras_dict["bad_page_threshold"]:
# Strictly more retired pages than the threshold: report "True".
# A count equal to the threshold is not reported as exceeded.
ras_dict["bad_page_threshold_exceeded"] = "True"
except amdsmi_exception.AmdSmiLibraryException as e:
# On a library error only a debug message is logged; the field keeps
# whatever value it already held.
logging.debug("Failed to get retired pages count for gpu %s | %s", gpu_id, e.get_error_info())
try:
ras_states = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu)