[SWDEV-530385] show afids on each line of printout (#422)
* show afids on each line of printout
* clean up afids and cper code
---------
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Co-authored-by: Maisam Arif <Maisam.Arif@amd.com>
[ROCm/amdsmi commit: fab13c5b60]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
3d75b7881a
Коммит
877c7b1bda
@@ -6476,7 +6476,10 @@ class AMDSMICommands():
|
||||
args.gpu = self.device_handles
|
||||
|
||||
if args.afid and args.cper_file:
|
||||
self.helpers.pvtDumpAfids(args.cper_file)
|
||||
afids = self.helpers.pvtDumpAfids(args.cper_file)
|
||||
for afid in afids:
|
||||
print(afid, end=" ")
|
||||
print("")
|
||||
return
|
||||
|
||||
if not self.group_check_printed:
|
||||
|
||||
@@ -1096,7 +1096,7 @@ class AMDSMIHelpers():
|
||||
# Header
|
||||
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12}", end="")
|
||||
if folder:
|
||||
print(f" {'file_name':<17}", end="")
|
||||
print(f" {'file_name':<17} {'afid'}", end="")
|
||||
print("")
|
||||
self._cper_display_initialized = True
|
||||
|
||||
@@ -1122,13 +1122,19 @@ class AMDSMIHelpers():
|
||||
print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12}", end="")
|
||||
if folder:
|
||||
print(f" {cper_data_file:<17}", end="")
|
||||
afids = self.pvtDumpAfids(cper_data_file)
|
||||
for afid in afids:
|
||||
print(afid, end=" ")
|
||||
print("")
|
||||
self.increment_cper_count()
|
||||
|
||||
def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None):
|
||||
# One‐time header
|
||||
if not getattr(self, "_cper_display_initialized", False):
|
||||
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}")
|
||||
print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} ", end="")
|
||||
if folder:
|
||||
print(f"{'file_name':<17} {'afid'}", end="")
|
||||
print("")
|
||||
self._cper_display_initialized = True
|
||||
|
||||
if folder:
|
||||
@@ -1201,7 +1207,9 @@ class AMDSMIHelpers():
|
||||
to_print = printed_rows
|
||||
|
||||
for ts, gid, prefix, fname in to_print:
|
||||
print(f"{ts:<20} {gid:<7} {prefix:<12} {fname:<17}")
|
||||
cper_path = folder / cper_name
|
||||
afids = self.pvtDumpAfids(cper_path)
|
||||
print(f"{ts:<20} {gid:<7} {prefix:<12} {fname:<17} {afids}")
|
||||
|
||||
else:
|
||||
print(json.dumps(
|
||||
@@ -1278,13 +1286,9 @@ class AMDSMIHelpers():
|
||||
else:
|
||||
# assume it's already bytes
|
||||
raw = raw_data
|
||||
size = len(raw)
|
||||
self.hexdump_to_string(raw)
|
||||
afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(raw)
|
||||
print(f"AFIDS: ", end="")
|
||||
for afid in afids:
|
||||
print(afid, end=" ")
|
||||
print("")
|
||||
return afids
|
||||
|
||||
def ras_cper(self, args, device_handle, logger, gpu_idx):
|
||||
# Parse severity mask dynamically from the --severity option.
|
||||
@@ -1351,16 +1355,7 @@ class AMDSMIHelpers():
|
||||
if len(entries) == 0:
|
||||
break
|
||||
if args.folder:
|
||||
if args.follow:
|
||||
if device_handle:
|
||||
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
|
||||
else:
|
||||
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
|
||||
else:
|
||||
if device_handle:
|
||||
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
|
||||
else:
|
||||
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
|
||||
self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit)
|
||||
break
|
||||
else:
|
||||
self.display_cper_files_generated(entries, device_handle, args.folder, args.follow)
|
||||
|
||||
@@ -304,26 +304,25 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
def _check_cper_file_path(self):
|
||||
""" Argument action validator:
|
||||
Returns a path to a file from the input file path provided.
|
||||
If the file doesn't exist or is empty raise error
|
||||
If the file doesn't exist, is empty, or is invalid, raise an error.
|
||||
"""
|
||||
class _CheckInputFilePath(argparse.Action):
|
||||
# Checks the values
|
||||
def __call__(self, parser, args, values, option_string=None):
|
||||
path = Path(values)
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"CPER file could not be read. Make sure the path '{path}' is correct. ")
|
||||
|
||||
if path.is_dir():
|
||||
raise argparse.ArgumentTypeError(
|
||||
f"Invalid Path: {path} is directory when it needs to be a specific file")
|
||||
|
||||
if path.is_file():
|
||||
if os.stat(values).st_size == 0:
|
||||
raise argparse.ArgumentTypeError(f"Invalid Path: {path} Input file is empty")
|
||||
setattr(args, self.dest, path)
|
||||
else:
|
||||
raise argparse.ArgumentTypeError(
|
||||
f"Invalid path:{path} Could not determine if value given is a valid path")
|
||||
try:
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"CPER file could not be read. Make sure the path '{path}' is correct.")
|
||||
if path.is_dir():
|
||||
raise IsADirectoryError(f"Invalid Path: {path} is a directory when it needs to be a specific file.")
|
||||
if path.is_file():
|
||||
if os.stat(values).st_size == 0:
|
||||
raise ValueError(f"Invalid Path: {path} Input file is empty.")
|
||||
setattr(args, self.dest, path)
|
||||
else:
|
||||
raise FileNotFoundError(f"Invalid Path: {path} Could not determine if the value given is a valid path.")
|
||||
except Exception as root_cause:
|
||||
raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, _CheckInputFilePath.outputformat) from root_cause
|
||||
return _CheckInputFilePath
|
||||
|
||||
|
||||
|
||||
@@ -511,6 +511,45 @@ except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
### amdsmi_get_gpu_board_info
|
||||
|
||||
Description: Returns board info for the given GPU
|
||||
|
||||
Input parameters:
|
||||
|
||||
* `processor_handle` device which to query
|
||||
|
||||
Output: Dictionary with the fields listed in the table below
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`model_number` | Board serial number
|
||||
`product_serial` | Product serial
|
||||
`fru_id` | FRU ID
|
||||
`product_name` | Product name
|
||||
`manufacturer_name` | Manufacturer name
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_board_info` function:
|
||||
|
||||
* `AmdSmiLibraryException`
|
||||
* `AmdSmiRetryException`
|
||||
* `AmdSmiParameterException`
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
try:
|
||||
device = amdsmi_get_processor_handle_from_bdf("0000:23.00.0")
|
||||
board_info = amdsmi_get_gpu_board_info(device)
|
||||
print(board_info["model_number"])
|
||||
print(board_info["product_serial"])
|
||||
print(board_info["fru_id"])
|
||||
print(board_info["product_name"])
|
||||
print(board_info["manufacturer_name"])
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
### amdsmi_get_gpu_cache_info
|
||||
|
||||
Description: Returns a list of dictionaries containing cache information for the given GPU.
|
||||
@@ -1185,6 +1224,7 @@ except AmdSmiException as e:
|
||||
Description: Dump the CPER entries for a given GPU to a file, using the CPER header file from the RAS tool.
|
||||
|
||||
Input parameters:
|
||||
|
||||
* `processor_handle` device which to query
|
||||
* `severity_mask` the severity mask of the entries to be retrieved
|
||||
* `buffer_size` pointer to a variable that specifies the size of the cper_data
|
||||
@@ -1218,53 +1258,65 @@ Example:
|
||||
```python
|
||||
for device in devices:
|
||||
entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
|
||||
print("CPER entries for device", device)
|
||||
print("CPER entries for device", device)
|
||||
for key, entry in entries.items():
|
||||
print("Entry", key)
|
||||
print(" Error Severity:", entry.get("error_severity", "Unknown"))
|
||||
print(" Notify Type:", entry.get("notify_type", "Unknown"))
|
||||
print(" Timestamp:", entry.get("timestamp", ""))
|
||||
print()
|
||||
print()
|
||||
print("New Cursor Position:", new_cursor)
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
### amdsmi_get_gpu_board_info
|
||||
### amdsmi_get_afids_from_cper
|
||||
|
||||
Description: Returns board info for the given GPU
|
||||
Description: Get the AFIDs from CPER buffer
|
||||
|
||||
Input parameters:
|
||||
|
||||
* `processor_handle` device which to query
|
||||
* `severity_mask` the severity mask of the entries to be retrieved
|
||||
* `buffer_size` pointer to a variable that specifies the size of the cper_data
|
||||
* `cursor` pointer to a variable that will contain the cursor for the next call
|
||||
|
||||
Output: Dictionary with fields correctable and uncorrectable
|
||||
Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`model_number` | Board serial number
|
||||
`product_serial` | Product serial
|
||||
`fru_id` | FRU ID
|
||||
`product_name` | Product name
|
||||
`manufacturer_name` | Manufacturer name
|
||||
`error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. |
|
||||
`notify_type` | The notification type associated with the CPER entry. |
|
||||
`timestamp` | The time when the CPER entry was recorded, formatted as `YYYY/MM/DD HH:MM:SS`. |
|
||||
`signature` | A 4-byte signature identifying the entry, typically `CPER`. |
|
||||
`revision` | The revision number of the CPER record format. |
|
||||
`signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. |
|
||||
`sec_cnt` | The count of sections included in the CPER entry. |
|
||||
`record_length` | The total length in bytes of the CPER entry. |
|
||||
`platform_id` | A character array identifying the GPU or platform. |
|
||||
`creator_id` | A character array indicating the creator of the CPER entry. |
|
||||
`record_id` | A unique identifier for the CPER entry. |
|
||||
`flags` | Reserved flags related to the CPER entry. |
|
||||
`persistence_info` | Reserved information related to persistence. |
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_board_info` function:
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function:
|
||||
|
||||
* `AmdSmiLibraryException`
|
||||
* `AmdSmiRetryException`
|
||||
* `AmdSmiParameterException`
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
try:
|
||||
device = amdsmi_get_processor_handle_from_bdf("0000:23.00.0")
|
||||
board_info = amdsmi_get_gpu_board_info(device)
|
||||
print(board_info["model_number"])
|
||||
print(board_info["product_serial"])
|
||||
print(board_info["fru_id"])
|
||||
print(board_info["product_name"])
|
||||
print(board_info["manufacturer_name"])
|
||||
for device in devices:
|
||||
entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
|
||||
print("CPER entries for device", device)
|
||||
for key, entry in entries.items():
|
||||
print("Entry", key)
|
||||
print(" Error Severity:", entry.get("error_severity", "Unknown"))
|
||||
print(" Notify Type:", entry.get("notify_type", "Unknown"))
|
||||
print(" Timestamp:", entry.get("timestamp", ""))
|
||||
print()
|
||||
print("New Cursor Position:", new_cursor)
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
@@ -5390,53 +5442,3 @@ try:
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
### amdsmi_get_afids_from_cper
|
||||
|
||||
Description: Get the AFIDs from CPER buffer
|
||||
|
||||
Input parameters:
|
||||
* `processor_handle` device which to query
|
||||
* `severity_mask` the severity mask of the entries to be retrieved
|
||||
* `buffer_size` pointer to a variable that specifies the size of the cper_data
|
||||
* `cursor` pointer to a variable that will contain the cursor for the next call
|
||||
|
||||
Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. |
|
||||
`notify_type` | The notification type associated with the CPER entry. |
|
||||
`timestamp` | The time when the CPER entry was recorded, formatted as `YYYY/MM/DD HH:MM:SS`. |
|
||||
`signature` | A 4-byte signature identifying the entry, typically `CPER`. |
|
||||
`revision` | The revision number of the CPER record format. |
|
||||
`signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. |
|
||||
`sec_cnt` | The count of sections included in the CPER entry. |
|
||||
`record_length` | The total length in bytes of the CPER entry. |
|
||||
`platform_id` | A character array identifying the GPU or platform. |
|
||||
`creator_id` | A character array indicating the creator of the CPER entry. |
|
||||
`record_id` | A unique identifier for the CPER entry. |
|
||||
`flags` | Reserved flags related to the CPER entry. |
|
||||
`persistence_info` | Reserved information related to persistence. |
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function:
|
||||
|
||||
* `AmdSmiLibraryException`
|
||||
* `AmdSmiParameterException`
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
for device in devices:
|
||||
entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor)
|
||||
print("CPER entries for device", device)
|
||||
for key, entry in entries.items():
|
||||
print("Entry", key)
|
||||
print(" Error Severity:", entry.get("error_severity", "Unknown"))
|
||||
print(" Notify Type:", entry.get("notify_type", "Unknown"))
|
||||
print(" Timestamp:", entry.get("timestamp", ""))
|
||||
print()
|
||||
print("New Cursor Position:", new_cursor)
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
@@ -4145,7 +4145,7 @@ amdsmi_status_t amdsmi_get_afids_from_cper(
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
uint32_t i = 0;
|
||||
for(int afid: cper_decode(cper)) {
|
||||
if(i < *num_afids) {
|
||||
afids[i] = afid;
|
||||
|
||||
@@ -213,7 +213,6 @@ static void* cper_get_sec_desc_offset(const amdsmi_cper_hdr_t *hdr, int idx)
|
||||
static void* cper_get_sec_offset(const amdsmi_cper_hdr_t *hdr, int idx)
|
||||
{
|
||||
struct cper_sec_desc *tmp_desc;
|
||||
char *offset;
|
||||
|
||||
if (idx >= hdr->sec_cnt)
|
||||
return 0;
|
||||
@@ -313,14 +312,13 @@ static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err)
|
||||
std::ostringstream ss;
|
||||
|
||||
struct cper_sec_nonstd_err_body *body;
|
||||
char *offset;
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~NON STANDARD SECTION~~~\n";
|
||||
|
||||
ss << "[NonSTD SEC] Err Info Count = 0x" << std::hex << nonstd_err->hdr.valid_bits.err_info_cnt << "\n";
|
||||
ss << "[NonSTD SEC] Err Context Count = 0x" << std::hex << nonstd_err->hdr.valid_bits.err_context_cnt << "\n";
|
||||
|
||||
if (nonstd_err->hdr.valid_bits.err_context_cnt != nonstd_err->hdr.valid_bits.err_context_cnt) {
|
||||
if (nonstd_err->hdr.valid_bits.err_info_cnt != nonstd_err->hdr.valid_bits.err_context_cnt) {
|
||||
ss << "~~~~Malformed Non Standard Section!~~~~\n\n";
|
||||
goto exit;
|
||||
}
|
||||
@@ -554,8 +552,8 @@ std::vector<int> cper_decode(const amdsmi_cper_hdr_t *cper) {
|
||||
}
|
||||
else {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Unknown error type!!\n";
|
||||
for(int i = 0; i < sizeof(sec_guid->b); ++i) {
|
||||
ss << std::hex << static_cast<int>(sec_guid->b[i]) << ":";
|
||||
for(size_t j = 0; j < sizeof(sec_guid->b); ++j) {
|
||||
ss << std::hex << static_cast<int>(sec_guid->b[j]) << ":";
|
||||
}
|
||||
ss << "\n";
|
||||
LOG_ERROR(ss);
|
||||
|
||||
@@ -937,100 +937,13 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index(
|
||||
return AMDSMI_STATUS_API_FAILED;
|
||||
}
|
||||
|
||||
static std::vector<const amdsmi_cper_hdr_t *>
|
||||
amdsmi_get_gpu_cper_headers(const char *buffer, size_t buffer_sz) {
|
||||
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
|
||||
<< "[CPER] buffer_sz: " << buffer_sz;
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
std::vector<const amdsmi_cper_hdr_t *> headers;
|
||||
if(!buffer) {
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
|
||||
<< "[CPER] buffer is null";
|
||||
LOG_ERROR(ss);
|
||||
return headers;
|
||||
}
|
||||
static constexpr char cper_signature[] = "CPER";
|
||||
static constexpr size_t cper_signature_size = sizeof(cper_signature) - 1;
|
||||
for(size_t data_idx = 0;
|
||||
buffer_sz >= cper_signature_size &&
|
||||
data_idx < buffer_sz - cper_signature_size;
|
||||
++data_idx) {
|
||||
|
||||
const amdsmi_cper_hdr_t *hdr = reinterpret_cast<const amdsmi_cper_hdr_t *>(
|
||||
&buffer[data_idx]);
|
||||
if(hdr->signature[0] != 'C' || hdr->signature[1] != 'P' ||
|
||||
hdr->signature[2] != 'E' || hdr->signature[3] != 'R' ) {
|
||||
continue;
|
||||
}
|
||||
if(hdr->signature_end != 0xFFFFFFFF) {
|
||||
continue;
|
||||
}
|
||||
if(hdr->record_length > buffer_sz) {
|
||||
continue;
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
|
||||
<< "[CPER] add header at data_idx: " << data_idx
|
||||
<< ", sig: " << hdr->signature[0] << hdr->signature[1] << hdr->signature[2] << hdr->signature[3];
|
||||
LOG_DEBUG(ss);
|
||||
headers.emplace_back(hdr);
|
||||
}
|
||||
return headers;
|
||||
}
|
||||
|
||||
struct CperFileCtx {
|
||||
amdsmi_status_t status = AMDSMI_STATUS_FILE_ERROR;
|
||||
std::unique_ptr<char[]> buffer;
|
||||
long file_size = 0;
|
||||
};
|
||||
|
||||
/**
 * @brief Load the contents of a CPER file into a freshly allocated buffer.
 *
 * @param filepath  Path of the file to read.
 * @return A CperFileCtx whose status is:
 *         - AMDSMI_STATUS_SUCCESS: buffer holds the data and file_size is the
 *           number of bytes actually read;
 *         - AMDSMI_STATUS_NOT_SUPPORTED: stat() on the path failed;
 *         - AMDSMI_STATUS_FILE_ERROR: the path is not a regular file, could
 *           not be opened, is empty, or could not be read.
 *
 * The descriptor is closed on every path. Short reads and EINTR are retried
 * until the size reported by stat() has been read or end-of-file is reached.
 */
static auto amdsmi_read_cper_file(const std::string &filepath) -> CperFileCtx {
    std::ostringstream ss;

    CperFileCtx ctx;
    ctx.status = AMDSMI_STATUS_FILE_ERROR;
    ctx.file_size = 0;

    struct stat file_stats;
    if (stat(filepath.c_str(), &file_stats) != 0) {
        // As before, a path that cannot be stat'ed is reported through the
        // status code only; nothing is logged for it.
        ctx.status = AMDSMI_STATUS_NOT_SUPPORTED;
        return ctx;
    }
    if (!S_ISREG(file_stats.st_mode)) {
        // stat() succeeded, so errno carries no information here and is
        // deliberately left out of the message.
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
           << "[CPER] file is not a regular file: " << filepath;
        LOG_ERROR(ss);
        return ctx;
    }

    ctx.file_size = file_stats.st_size;
    ctx.buffer = std::make_unique<char[]>(ctx.file_size);

    const int file = open(filepath.c_str(), O_RDONLY);
    if (file == -1) {
        // Capture errno before any stream operation has a chance to change it.
        const int open_errno = errno;
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] failed to open file: "
           << filepath << ", errno:()" << open_errno << "): " << strerror(open_errno);
        LOG_ERROR(ss);
        return ctx;
    }

    // read() may return fewer bytes than requested, so keep reading until the
    // expected size is reached, EOF is hit, or a real error occurs.
    long bytes_read = 0;
    while (bytes_read < ctx.file_size) {
        const ssize_t chunk = read(file, ctx.buffer.get() + bytes_read,
                                   static_cast<size_t>(ctx.file_size - bytes_read));
        if (chunk > 0) {
            bytes_read += static_cast<long>(chunk);
            continue;
        }
        if (chunk == 0) {
            break;  // EOF earlier than stat() predicted (file shrank).
        }
        if (errno == EINTR) {
            continue;  // Interrupted by a signal; retry the same read.
        }
        bytes_read = -1;  // Hard read error: treat the whole load as failed.
        break;
    }
    // Close unconditionally so the descriptor is not leaked on a failed read.
    close(file);

    if (bytes_read <= 0) {
        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__
           << "[CPER] failed to read complete file, read only "
           << bytes_read << " of " << ctx.file_size << " bytes";
        LOG_ERROR(ss);
        return ctx;
    }

    ctx.status = AMDSMI_STATUS_SUCCESS;
    ctx.file_size = bytes_read;
    return ctx;
}
|
||||
void amdsmi_wait_for_user_input(void) {
|
||||
for (;;) {
|
||||
std::cout << "\n\t**Press any key to continue**" << std::endl;
|
||||
|
||||
Ссылка в новой задаче
Block a user