[SWDEV-530385] show afids on each line of printout (#422)

* show afids on each line of printout
* clean up afids and cper code
---------

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Co-authored-by: Maisam Arif <Maisam.Arif@amd.com>

[ROCm/amdsmi commit: fab13c5b60]
Этот коммит содержится в:
Saeed, Oosman
2025-06-02 17:22:10 -05:00
коммит произвёл GitHub
родитель 3d75b7881a
Коммит 877c7b1bda
7 изменённых файлов: 107 добавлений и 197 удалений
+4 -1
Просмотреть файл
@@ -6476,7 +6476,10 @@ class AMDSMICommands():
args.gpu = self.device_handles
if args.afid and args.cper_file:
self.helpers.pvtDumpAfids(args.cper_file)
afids = self.helpers.pvtDumpAfids(args.cper_file)
for afid in afids:
print(afid, end=" ")
print("")
return
if not self.group_check_printed:
+13 -18
Просмотреть файл
@@ -1096,7 +1096,7 @@ class AMDSMIHelpers():
# Header
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12}", end="")
if folder:
print(f" {'file_name':<17}", end="")
print(f" {'file_name':<17} {'afid'}", end="")
print("")
self._cper_display_initialized = True
@@ -1122,13 +1122,19 @@ class AMDSMIHelpers():
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12}", end="")
if folder:
print(f" {cper_data_file:<17}", end="")
afids = self.pvtDumpAfids(cper_data_file)
for afid in afids:
print(afid, end=" ")
print("")
self.increment_cper_count()
def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None):
# Onetime header
if not getattr(self, "_cper_display_initialized", False):
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} ", end="")
if folder:
print(f"{'file_name':<17} {'afid'}", end="")
print("")
self._cper_display_initialized = True
if folder:
@@ -1201,7 +1207,9 @@ class AMDSMIHelpers():
to_print = printed_rows
for ts, gid, prefix, fname in to_print:
print(f"{ts:<20} {gid:<7} {prefix:<12} {fname:<17}")
cper_path = folder / cper_name
afids = self.pvtDumpAfids(cper_path)
print(f"{ts:<20} {gid:<7} {prefix:<12} {fname:<17} {afids}")
else:
print(json.dumps(
@@ -1278,13 +1286,9 @@ class AMDSMIHelpers():
else:
# assume it's already bytes
raw = raw_data
size = len(raw)
self.hexdump_to_string(raw)
afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(raw)
print(f"AFIDS: ", end="")
for afid in afids:
print(afid, end=" ")
print("")
return afids
def ras_cper(self, args, device_handle, logger, gpu_idx):
# Parse severity mask dynamically from the --severity option.
@@ -1351,16 +1355,7 @@ class AMDSMIHelpers():
if len(entries) == 0:
break
if args.folder:
if args.follow:
if device_handle:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
else:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
else:
if device_handle:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
else:
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
break
else:
self.display_cper_files_generated(entries, device_handle, args.folder, args.follow)
+14 -15
Просмотреть файл
@@ -304,26 +304,25 @@ class AMDSMIParser(argparse.ArgumentParser):
def _check_cper_file_path(self):
""" Argument action validator:
Returns a path to a file from the input file path provided.
If the file doesn't exist or is empty raise error
If the file doesn't exist, is empty, or is invalid, raise an error.
"""
class _CheckInputFilePath(argparse.Action):
# Checks the values
def __call__(self, parser, args, values, option_string=None):
path = Path(values)
if not path.exists():
raise FileNotFoundError(f"CPER file could not be read. Make sure the path '{path}' is correct. ")
if path.is_dir():
raise argparse.ArgumentTypeError(
f"Invalid Path: {path} is directory when it needs to be a specific file")
if path.is_file():
if os.stat(values).st_size == 0:
raise argparse.ArgumentTypeError(f"Invalid Path: {path} Input file is empty")
setattr(args, self.dest, path)
else:
raise argparse.ArgumentTypeError(
f"Invalid path:{path} Could not determine if value given is a valid path")
try:
if not path.exists():
raise FileNotFoundError(f"CPER file could not be read. Make sure the path '{path}' is correct.")
if path.is_dir():
raise IsADirectoryError(f"Invalid Path: {path} is a directory when it needs to be a specific file.")
if path.is_file():
if os.stat(values).st_size == 0:
raise ValueError(f"Invalid Path: {path} Input file is empty.")
setattr(args, self.dest, path)
else:
raise FileNotFoundError(f"Invalid Path: {path} Could not determine if the value given is a valid path.")
except Exception as root_cause:
raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, _CheckInputFilePath.outputformat) from root_cause
return _CheckInputFilePath
+72 -70
Просмотреть файл
@@ -511,6 +511,45 @@ except AmdSmiException as e:
print(e)
```
### amdsmi_get_gpu_board_info
Description: Returns board info for the given GPU
Input parameters:
* `processor_handle` device which to query
Output: Dictionary with the fields listed below
Field | Description
---|---
`model_number` | Board model number
`product_serial` | Product serial
`fru_id` | FRU ID
`product_name` | Product name
`manufacturer_name` | Manufacturer name
Exceptions that can be thrown by `amdsmi_get_gpu_board_info` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
```python
try:
device = amdsmi_get_processor_handle_from_bdf("0000:23.00.0")
board_info = amdsmi_get_gpu_board_info(device)
print(board_info["model_number"])
print(board_info["product_serial"])
print(board_info["fru_id"])
print(board_info["product_name"])
print(board_info["manufacturer_name"])
except AmdSmiException as e:
print(e)
```
### amdsmi_get_gpu_cache_info
Description: Returns a list of dictionaries containing cache information for the given GPU.
@@ -1185,6 +1224,7 @@ except AmdSmiException as e:
Description: Dump the CPER entries for a given GPU to a file, using the CPER header file from the RAS tool.
Input parameters:
* `processor_handle` device which to query
* `severity_mask` the severity mask of the entries to be retrieved
* `buffer_size` pointer to a variable that specifies the size of the cper_data
@@ -1218,53 +1258,65 @@ Example:
```python
for device in devices:
entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
print("CPER entries for device", device)
print("CPER entries for device", device)
for key, entry in entries.items():
print("Entry", key)
print(" Error Severity:", entry.get("error_severity", "Unknown"))
print(" Notify Type:", entry.get("notify_type", "Unknown"))
print(" Timestamp:", entry.get("timestamp", ""))
print()
print()
print("New Cursor Position:", new_cursor)
except AmdSmiException as e:
print(e)
```
### amdsmi_get_gpu_board_info
### amdsmi_get_afids_from_cper
Description: Returns board info for the given GPU
Description: Get the AFIDs from CPER buffer
Input parameters:
* `processor_handle` device which to query
* `severity_mask` the severity mask of the entries to be retrieved
* `buffer_size` pointer to a variable that specifies the size of the cper_data
* `cursor` pointer to a variable that will contain the cursor for the next call
Output: Dictionary with fields correctable and uncorrectable
Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data
Field | Description
---|---
`model_number` | Board serial number
`product_serial` | Product serial
`fru_id` | FRU ID
`product_name` | Product name
`manufacturer_name` | Manufacturer name
`error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. |
`notify_type` | The notification type associated with the CPER entry. |
`timestamp` | The time when the CPER entry was recorded, formatted as `YYYY/MM/DD HH:MM:SS`. |
`signature` | A 4-byte signature identifying the entry, typically `CPER`. |
`revision` | The revision number of the CPER record format. |
`signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. |
`sec_cnt` | The count of sections included in the CPER entry. |
`record_length` | The total length in bytes of the CPER entry. |
`platform_id` | A character array identifying the GPU or platform. |
`creator_id` | A character array indicating the creator of the CPER entry. |
`record_id` | A unique identifier for the CPER entry. |
`flags` | Reserved flags related to the CPER entry. |
`persistence_info` | Reserved information related to persistence. |
Exceptions that can be thrown by `amdsmi_get_gpu_board_info` function:
Exceptions that can be thrown by `amdsmi_get_afids_from_cper` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
```python
try:
device = amdsmi_get_processor_handle_from_bdf("0000:23.00.0")
board_info = amdsmi_get_gpu_board_info(device)
print(board_info["model_number"])
print(board_info["product_serial"])
print(board_info["fru_id"])
print(board_info["product_name"])
print(board_info["manufacturer_name"])
for device in devices:
entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
print("CPER entries for device", device)
for key, entry in entries.items():
print("Entry", key)
print(" Error Severity:", entry.get("error_severity", "Unknown"))
print(" Notify Type:", entry.get("notify_type", "Unknown"))
print(" Timestamp:", entry.get("timestamp", ""))
print()
print("New Cursor Position:", new_cursor)
except AmdSmiException as e:
print(e)
```
@@ -5390,53 +5442,3 @@ try:
except AmdSmiException as e:
print(e)
```
### amdsmi_get_afids_from_cper
Description: Get the AFIDs from CPER buffer
Input parameters:
* `processor_handle` device which to query
* `severity_mask` the severity mask of the entries to be retrieved
* `buffer_size` pointer to a variable that specifies the size of the cper_data
* `cursor` pointer to a variable that will contain the cursor for the next call
Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data
Field | Description
---|---
`error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. |
`notify_type` | The notification type associated with the CPER entry. |
`timestamp` | The time when the CPER entry was recorded, formatted as `YYYY/MM/DD HH:MM:SS`. |
`signature` | A 4-byte signature identifying the entry, typically `CPER`. |
`revision` | The revision number of the CPER record format. |
`signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. |
`sec_cnt` | The count of sections included in the CPER entry. |
`record_length` | The total length in bytes of the CPER entry. |
`platform_id` | A character array identifying the GPU or platform. |
`creator_id` | A character array indicating the creator of the CPER entry. |
`record_id` | A unique identifier for the CPER entry. |
`flags` | Reserved flags related to the CPER entry. |
`persistence_info` | Reserved information related to persistence. |
Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function:
* `AmdSmiLibraryException`
* `AmdSmiParameterException`
Example:
```python
for device in devices:
entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
print("CPER entries for device", device)
for key, entry in entries.items():
print("Entry", key)
print(" Error Severity:", entry.get("error_severity", "Unknown"))
print(" Notify Type:", entry.get("notify_type", "Unknown"))
print(" Timestamp:", entry.get("timestamp", ""))
print()
print("New Cursor Position:", new_cursor)
except AmdSmiException as e:
print(e)
```
+1 -1
Просмотреть файл
@@ -4145,7 +4145,7 @@ amdsmi_status_t amdsmi_get_afids_from_cper(
return AMDSMI_STATUS_INVAL;
}
int i = 0;
uint32_t i = 0;
for(int afid: cper_decode(cper)) {
if(i < *num_afids) {
afids[i] = afid;
+3 -5
Просмотреть файл
@@ -213,7 +213,6 @@ static void* cper_get_sec_desc_offset(const amdsmi_cper_hdr_t *hdr, int idx)
static void* cper_get_sec_offset(const amdsmi_cper_hdr_t *hdr, int idx)
{
struct cper_sec_desc *tmp_desc;
char *offset;
if (idx >= hdr->sec_cnt)
return 0;
@@ -313,14 +312,13 @@ static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err)
std::ostringstream ss;
struct cper_sec_nonstd_err_body *body;
char *offset;
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~NON STANDARD SECTION~~~\n";
ss << "[NonSTD SEC] Err Info Count = 0x" << std::hex << nonstd_err->hdr.valid_bits.err_info_cnt << "\n";
ss << "[NonSTD SEC] Err Context Count = 0x" << std::hex << nonstd_err->hdr.valid_bits.err_context_cnt << "\n";
if (nonstd_err->hdr.valid_bits.err_context_cnt != nonstd_err->hdr.valid_bits.err_context_cnt) {
if (nonstd_err->hdr.valid_bits.err_info_cnt != nonstd_err->hdr.valid_bits.err_context_cnt) {
ss << "~~~~Malformed Non Standard Section!~~~~\n\n";
goto exit;
}
@@ -554,8 +552,8 @@ std::vector<int> cper_decode(const amdsmi_cper_hdr_t *cper) {
}
else {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Unknown error type!!\n";
for(int i = 0; i < sizeof(sec_guid->b); ++i) {
ss << std::hex << static_cast<int>(sec_guid->b[i]) << ":";
for(size_t j = 0; j < sizeof(sec_guid->b); ++j) {
ss << std::hex << static_cast<int>(sec_guid->b[j]) << ":";
}
ss << "\n";
LOG_ERROR(ss);
-87
Просмотреть файл
@@ -937,100 +937,13 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index(
return AMDSMI_STATUS_API_FAILED;
}
static std::vector<const amdsmi_cper_hdr_t *>
amdsmi_get_gpu_cper_headers(const char *buffer, size_t buffer_sz) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
<< "[CPER] buffer_sz: " << buffer_sz;
LOG_DEBUG(ss);
std::vector<const amdsmi_cper_hdr_t *> headers;
if(!buffer) {
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
<< "[CPER] buffer is null";
LOG_ERROR(ss);
return headers;
}
static constexpr char cper_signature[] = "CPER";
static constexpr size_t cper_signature_size = sizeof(cper_signature) - 1;
for(size_t data_idx = 0;
buffer_sz >= cper_signature_size &&
data_idx < buffer_sz - cper_signature_size;
++data_idx) {
const amdsmi_cper_hdr_t *hdr = reinterpret_cast<const amdsmi_cper_hdr_t *>(
&buffer[data_idx]);
if(hdr->signature[0] != 'C' || hdr->signature[1] != 'P' ||
hdr->signature[2] != 'E' || hdr->signature[3] != 'R' ) {
continue;
}
if(hdr->signature_end != 0xFFFFFFFF) {
continue;
}
if(hdr->record_length > buffer_sz) {
continue;
}
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
<< "[CPER] add header at data_idx: " << data_idx
<< ", sig: " << hdr->signature[0] << hdr->signature[1] << hdr->signature[2] << hdr->signature[3];
LOG_DEBUG(ss);
headers.emplace_back(hdr);
}
return headers;
}
/// Result bundle produced when a CPER file is loaded into memory.
struct CperFileCtx {
    /// Outcome of the load; starts out as a file error until set otherwise.
    amdsmi_status_t status{AMDSMI_STATUS_FILE_ERROR};
    /// Owning storage for the file contents; null until allocated.
    std::unique_ptr<char[]> buffer{};
    /// Size in bytes associated with `buffer`.
    long file_size{0};
};
/// Load the contents of a CPER file into a freshly allocated buffer.
///
/// @param filepath  Path of the file to read.
/// @return A CperFileCtx whose `status` is:
///         - AMDSMI_STATUS_SUCCESS when at least one byte was read; `buffer`
///           then holds the data and `file_size` the number of bytes read;
///         - AMDSMI_STATUS_NOT_SUPPORTED when `stat` on the path fails;
///         - AMDSMI_STATUS_FILE_ERROR when the path is not a regular file, or
///           opening/reading it fails (an empty file counts as a read failure).
static auto amdsmi_read_cper_file(const std::string &filepath) -> CperFileCtx {
    std::ostringstream ss;
    CperFileCtx ctx;
    ctx.status = AMDSMI_STATUS_FILE_ERROR;
    ctx.file_size = 0;

    struct stat file_stats;
    if (stat(filepath.c_str(), &file_stats) != 0) {
        // Capture errno immediately, before any other call can overwrite it.
        const int stat_errno = errno;
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] file does not exist: "
           << filepath << ", errno(" << stat_errno << "): " << strerror(stat_errno);
        LOG_ERROR(ss);
        ctx.status = AMDSMI_STATUS_NOT_SUPPORTED;
        return ctx;
    }
    if (!S_ISREG(file_stats.st_mode)) {
        // stat succeeded here, so errno carries no information about this case.
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] file is not a regular file: "
           << filepath;
        LOG_ERROR(ss);
        return ctx;
    }

    ctx.file_size = file_stats.st_size;
    ctx.buffer = std::make_unique<char[]>(ctx.file_size);

    const int file = open(filepath.c_str(), O_RDONLY);
    if (file == -1) {
        const int open_errno = errno;
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] failed to open file: "
           << filepath << ", errno(" << open_errno << "): " << strerror(open_errno);
        LOG_ERROR(ss);
        return ctx;
    }

    // read() may return fewer bytes than requested, so keep reading until the
    // expected size is reached, end-of-file is hit, or a real error occurs.
    long bytes_read = 0;
    bool read_failed = false;
    while (bytes_read < ctx.file_size) {
        const ssize_t chunk = read(file, ctx.buffer.get() + bytes_read,
                                   static_cast<size_t>(ctx.file_size - bytes_read));
        if (chunk < 0) {
            if (errno == EINTR) {
                continue;  // interrupted by a signal before any data: retry
            }
            read_failed = true;
            break;
        }
        if (chunk == 0) {
            break;  // end of file reached earlier than the stat size suggested
        }
        bytes_read += chunk;
    }
    // Close on every path; previously the descriptor leaked when read failed.
    close(file);

    if (read_failed || bytes_read <= 0) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
           << "[CPER] failed to read complete file, read only "
           << bytes_read << " of " << ctx.file_size << " bytes";
        LOG_ERROR(ss);
        return ctx;
    }

    ctx.status = AMDSMI_STATUS_SUCCESS;
    ctx.file_size = bytes_read;
    return ctx;
}
void amdsmi_wait_for_user_input(void) {
for (;;) {
std::cout << "\n\t**Press any key to continue**" << std::endl;