[SWDEV_562432] update inband CPER meta data to be more consistent with OOB (#824)
* Added Product Serial Number to the raw_bytes cper entries
* Added Product Serial Number to the Python API return
---------
Signed-off-by: Saeed, Oosman <Oosman.Saeed@amd.com>
Signed-off-by: Arif, Maisam <Maisam.Arif@amd.com>
[ROCm/amdsmi commit: 05ea00dcc4]
This commit is contained in:
@@ -1310,12 +1310,8 @@ Input parameters:
|
||||
* `cursor` the zero based index at which to start retrieving cper entries; default value is 0; for example, if there are 10 cper entries available, then with a cursor value of 8, it will retrieve the last two cper entries only
|
||||
|
||||
Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data, status_code
|
||||
status_code:
|
||||
AMDSMI_STATUS_SUCCESS: If all entries were retrieved successfully
|
||||
AMDSMI_STATUS_MORE_DATA: If some of the entries were retrieved and:
|
||||
* A subsequent call to the API with the updated cursor will fetch the next batch of entries, or
|
||||
* Increasing the input buffer_size will allow more entries to be fetched with the same cursor
|
||||
|
||||
Output1: Dictionary with fields
|
||||
Field | Description
|
||||
---|---
|
||||
`error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. |
|
||||
@@ -1326,12 +1322,25 @@ Field | Description
|
||||
`signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. |
|
||||
`sec_cnt` | The count of sections included in the CPER entry. |
|
||||
`record_length` | The total length in bytes of the CPER entry. |
|
||||
`serial_number` | The product serial number. In the C++ API it is embedded in the raw CPER entries. |
|
||||
`platform_id` | A character array identifying the GPU or platform. |
|
||||
`creator_id` | A character array indicating the creator of the CPER entry. |
|
||||
`record_id` | A unique identifier for the CPER entry. |
|
||||
`flags` | Reserved flags related to the CPER entry. |
|
||||
`persistence_info` | Reserved information related to persistence. |
|
||||
|
||||
Output2: Updated cursor (int type)
|
||||
* The cursor is the index of the next CPER entry in the GPU ring buffer. For example, if 10 entries were fetched successfully, the value of the cursor will be 11 upon return from the API. A subsequent call to the API with a cursor value of 11 should fetch the next entry.
|
||||
|
||||
Output3: A list of dictionaries, each dictionary containing the CPER record and its size:
|
||||
* {"bytes": <raw bytes>, "size": <number of bytes>}
|
||||
|
||||
Output4: status_code
|
||||
AMDSMI_STATUS_SUCCESS: If all entries were retrieved successfully
|
||||
AMDSMI_STATUS_MORE_DATA: If some of the entries were retrieved and:
|
||||
* A subsequent call to the API with the updated cursor will fetch the next batch of entries, or
|
||||
* Increasing the input buffer_size will allow more entries to be fetched with the same cursor
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function:
|
||||
|
||||
* `AmdSmiLibraryException`
|
||||
|
||||
@@ -220,5 +220,5 @@ struct cper_1_0 {
|
||||
|
||||
amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(const char *amdgpu_ring_cper_file, uint32_t severity_mask,
|
||||
char *cper_data, uint64_t *buf_size, amdsmi_cper_hdr_t **cper_hdrs,
|
||||
uint64_t *entry_count, uint64_t *cursor);
|
||||
uint64_t *entry_count, uint64_t *cursor, uint64_t product_serial);
|
||||
std::vector<int> cper_decode(const amdsmi_cper_hdr_t *cper);
|
||||
|
||||
@@ -187,4 +187,15 @@ void fill_2d_array(A& arr, T value) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the product serial number given the processor handle.
|
||||
*
|
||||
* @param[in] processor_handle handle of the processor whose product
|
||||
* serial number is requested
|
||||
*
|
||||
* @return The product serial number, or
|
||||
* 0 if it cannot be determined
|
||||
*/
|
||||
uint64_t get_product_serial_number(amdsmi_processor_handle processor_handle);
|
||||
|
||||
#endif // AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_
|
||||
|
||||
@@ -2776,6 +2776,13 @@ def amdsmi_get_gpu_cper_entries(
|
||||
f"{entry_ptr.contents.timestamp.seconds:02d}"
|
||||
)
|
||||
|
||||
serial_number = ""
|
||||
if isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
|
||||
try:
|
||||
board_info = amdsmi_get_gpu_board_info(processor_handle)
|
||||
serial_number = board_info.get('product_serial', "")
|
||||
except Exception:
|
||||
serial_number = ""
|
||||
# Create a dictionary for the CPER entry.
|
||||
cper_entry = {
|
||||
"error_severity": amdsmi_wrapper.amdsmi_cper_sev_t__enumvalues.get(
|
||||
@@ -2788,6 +2795,7 @@ def amdsmi_get_gpu_cper_entries(
|
||||
"signature_end": hex(entry_ptr.contents.signature_end),
|
||||
"sec_cnt": entry_ptr.contents.sec_cnt,
|
||||
"record_length": entry_ptr.contents.record_length,
|
||||
"serial_number": serial_number,
|
||||
"platform_id": entry_ptr.contents.platform_id,
|
||||
"creator_id": entry_ptr.contents.creator_id,
|
||||
"record_id": entry_ptr.contents.record_id,
|
||||
|
||||
@@ -4414,11 +4414,11 @@ amdsmi_get_gpu_cper_entries(
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
return status;
|
||||
}
|
||||
|
||||
std::string path = std::string("/sys/kernel/debug/dri/") +
|
||||
std::to_string(gpu_device->get_card_id()) +
|
||||
"/amdgpu_ring_cper";
|
||||
|
||||
|
||||
return amdsmi_get_gpu_cper_entries_by_path(
|
||||
path.c_str(),
|
||||
severity_mask,
|
||||
@@ -4426,7 +4426,9 @@ amdsmi_get_gpu_cper_entries(
|
||||
buf_size,
|
||||
cper_hdrs,
|
||||
entry_count,
|
||||
cursor);
|
||||
cursor,
|
||||
get_product_serial_number(processor_handle)
|
||||
);
|
||||
}
|
||||
|
||||
amdsmi_status_t amdsmi_get_afids_from_cper(
|
||||
|
||||
@@ -338,6 +338,15 @@ static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump, const c
|
||||
return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major, crashdump->data.reg_ctx_type);
|
||||
}
|
||||
|
||||
/**
 * @brief Stamp the product serial number into every section descriptor of a CPER record.
 *
 * The serial number is rendered as a decimal string and copied into the
 * fru_id field of each section descriptor of @p cper. The copy is truncated
 * if needed so that fru_id is always NUL-terminated.
 *
 * @param[in,out] cper            CPER record whose section descriptors are modified in place
 * @param[in]     product_serial  serial number to write into each fru_id
 */
static void inject_product_serial_number(amdsmi_cper_hdr_t *cper, uint64_t product_serial) {
  // The rendered text is the same for every section, so format it once
  // instead of rebuilding the string on each loop iteration.
  const std::string serial_text = std::to_string(product_serial);

  // NOTE(review): a product_serial of 0 is written as "0" like any other
  // value; confirm whether an unknown serial should overwrite fru_id.
  for (size_t i = 0; i < cper_num_sec(cper); i++) {
    void *sec_desc_offset = cper_get_sec_desc_offset(cper, i);
    auto *sec_desc = static_cast<struct cper_sec_desc *>(sec_desc_offset);

    // strncpy does not terminate the destination when the source is too
    // long, so copy at most size-1 bytes and terminate explicitly.
    strncpy(sec_desc->fru_id, serial_text.c_str(), sizeof(sec_desc->fru_id) - 1);
    sec_desc->fru_id[sizeof(sec_desc->fru_id) - 1] = '\0';
  }
}
|
||||
|
||||
} //namespace
|
||||
|
||||
amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
|
||||
@@ -347,7 +356,8 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
|
||||
uint64_t *buf_size,
|
||||
amdsmi_cper_hdr_t **cper_hdrs,
|
||||
uint64_t *entry_count,
|
||||
uint64_t *cursor) {
|
||||
uint64_t *cursor,
|
||||
uint64_t product_serial) {
|
||||
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] begin\n"
|
||||
@@ -461,6 +471,7 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(
|
||||
&cper_data[data_idx],
|
||||
reinterpret_cast<const char*>(header),
|
||||
header->record_length);
|
||||
inject_product_serial_number(reinterpret_cast<amdsmi_cper_hdr_t*>(&cper_data[data_idx]), product_serial);
|
||||
data_idx += header->record_length;
|
||||
}
|
||||
*entry_count = num_headers_copied;
|
||||
|
||||
@@ -1004,3 +1004,41 @@ struct CperFileCtx {
|
||||
std::unique_ptr<char[]> buffer;
|
||||
long file_size = 0;
|
||||
};
|
||||
|
||||
|
||||
/**
 * @brief Get the product serial number of a processor as an integer.
 *
 * Queries the board info for @p processor_handle and parses its
 * product_serial string as a base-10 unsigned integer. Every failure path
 * emits a debug log message and yields 0.
 *
 * @param[in] processor_handle handle passed to amdsmi_get_gpu_board_info
 *
 * @return The parsed serial number, or 0 when the board info query fails,
 *         the serial string is empty, or the string cannot be converted
 *         (not numeric or out of range for an unsigned 64-bit value).
 */
uint64_t get_product_serial_number(amdsmi_processor_handle processor_handle) {
  uint64_t serial_number = 0;
  amdsmi_board_info_t board_info = {};

  const amdsmi_status_t status = amdsmi_get_gpu_board_info(processor_handle, &board_info);
  if (status != AMDSMI_STATUS_SUCCESS) {
    std::ostringstream ss;
    // Leading space keeps the line number and the message from running
    // together, matching the other messages in this function.
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ <<
      " Failed to retrieve product serial number! error: " <<
      static_cast<int>(status);
    LOG_DEBUG(ss);
    return serial_number;
  }

  if (!board_info.product_serial || !*board_info.product_serial) {
    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ <<
      " Product serial string is empty.";
    LOG_DEBUG(ss);
    return serial_number;
  }

  try {
    // NOTE(review): std::stoull stops at the first non-digit, so a string
    // with a numeric prefix (e.g. "123ABC") is accepted as 123 — confirm
    // this is acceptable for the serial formats seen in practice.
    serial_number = std::stoull(board_info.product_serial, nullptr, 10);
  } catch (const std::invalid_argument& e) {
    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ <<
      " Invalid product serial string. Exception: " << e.what();
    LOG_DEBUG(ss);
    serial_number = 0;
  } catch (const std::out_of_range& e) {
    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ <<
      " Product serial out of range, Exception: " << e.what();
    LOG_DEBUG(ss);
    serial_number = 0;
  }

  return serial_number;
}
|
||||
|
||||
مرجع در شماره جدید
Block a user