diff --git a/docs/reference/amdsmi-py-api.md b/docs/reference/amdsmi-py-api.md index 54649ffb3b..7ccf0ca870 100644 --- a/docs/reference/amdsmi-py-api.md +++ b/docs/reference/amdsmi-py-api.md @@ -1310,12 +1310,8 @@ Input parameters: * `cursor` the zero based index at which to start retrieving cper entries; default value is 0; for example, if there are 10 cper entries available, then with a cursor value of 8, it will retrieve the last two cper entries only Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data, status_code - status_code: - AMDSMI_STATUS_SUCCESS: If all entries were retrieved successfully - AMDSMI_STATUS_MORE_DATA: If some of the entries were retrieved and: - * A subsequent call to the API with the updated cursor will result in the fetching the next batch of entries, or - * Increasing the input buffer_size will allow more entries to be fetched with the same cursor +Output1: Dictionary with fields Field | Description ---|--- `error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. | @@ -1326,12 +1322,25 @@ Field | Description `signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. | `sec_cnt` | The count of sections included in the CPER entry. | `record_length` | The total length in bytes of the CPER entry. | +`serial_number` | The product serial number. Exists in raw entries in C++ API | `platform_id` | A character array identifying the GPU or platform. | `creator_id` | A character array indicating the creator of the CPER entry. | `record_id` | A unique identifier for the CPER entry. | `flags` | Reserved flags related to the CPER entry. | `persistence_info` | Reserved information related to persistence. | +Output2: Updated cursor (int type) +* Cursor is the index of the next cper entry in the GPU ring buffer. For example, if 10 entries were fetched successfully, the value of cursor will be 11 upon return from the API. 
Subsequent call to the API with cursor value of 11 should fetch the next entry + +Output3: A list of dictionaries, each dictionary containing the CPER record and its size: +* `{"bytes": <CPER record bytes>, "size": <record size in bytes>}` + +Output4: status_code + AMDSMI_STATUS_SUCCESS: If all entries were retrieved successfully + AMDSMI_STATUS_MORE_DATA: If some of the entries were retrieved and: + * A subsequent call to the API with the updated cursor will fetch the next batch of entries, or + * Increasing the input buffer_size will allow more entries to be fetched with the same cursor + Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function: * `AmdSmiLibraryException` diff --git a/include/amd_smi/impl/amd_smi_cper.h b/include/amd_smi/impl/amd_smi_cper.h index ef4c0ad7ce..0bf3905163 100644 --- a/include/amd_smi/impl/amd_smi_cper.h +++ b/include/amd_smi/impl/amd_smi_cper.h @@ -220,5 +220,5 @@ struct cper_1_0 { amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(const char *amdgpu_ring_cper_file, uint32_t severity_mask, char *cper_data, uint64_t *buf_size, amdsmi_cper_hdr_t **cper_hdrs, - uint64_t *entry_count, uint64_t *cursor); + uint64_t *entry_count, uint64_t *cursor, uint64_t product_serial); std::vector cper_decode(const amdsmi_cper_hdr_t *cper); diff --git a/include/amd_smi/impl/amd_smi_utils.h b/include/amd_smi/impl/amd_smi_utils.h index cd2ec9d919..97593f1e7a 100644 --- a/include/amd_smi/impl/amd_smi_utils.h +++ b/include/amd_smi/impl/amd_smi_utils.h @@ -187,4 +187,15 @@ void fill_2d_array(A& arr, T value) { } } +/** + * @brief Get the product serial number given the processor handle.
+ * + * @param[in] processor_handle handle of the processor whose product + * serial number is requested + * + * @return the product serial number parsed as a decimal integer, + * or 0 if it cannot be determined + */ +uint64_t get_product_serial_number(amdsmi_processor_handle processor_handle); + #endif // AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_ diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 6146262cac..97bd458865 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -2776,6 +2776,13 @@ def amdsmi_get_gpu_cper_entries( f"{entry_ptr.contents.timestamp.seconds:02d}" ) + serial_number = "" + if isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + try: + board_info = amdsmi_get_gpu_board_info(processor_handle) + serial_number = board_info.get('product_serial', "") + except Exception: + serial_number = "" # Create a dictionary for the CPER entry. cper_entry = { "error_severity": amdsmi_wrapper.amdsmi_cper_sev_t__enumvalues.get( @@ -2788,6 +2795,7 @@ def amdsmi_get_gpu_cper_entries( "signature_end": hex(entry_ptr.contents.signature_end), "sec_cnt": entry_ptr.contents.sec_cnt, "record_length": entry_ptr.contents.record_length, + "serial_number": serial_number, "platform_id": entry_ptr.contents.platform_id, "creator_id": entry_ptr.contents.creator_id, "record_id": entry_ptr.contents.record_id, diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 3e7be704b9..82c6ddc8c8 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -4414,11 +4414,11 @@ amdsmi_get_gpu_cper_entries( if (status != AMDSMI_STATUS_SUCCESS) { return status; } + std::string path = std::string("/sys/kernel/debug/dri/") + std::to_string(gpu_device->get_card_id()) + "/amdgpu_ring_cper"; - return amdsmi_get_gpu_cper_entries_by_path( path.c_str(), severity_mask, @@ -4426,7 +4426,9 @@ amdsmi_get_gpu_cper_entries( buf_size, cper_hdrs, entry_count, - cursor); + cursor, +
get_product_serial_number(processor_handle) + ); } amdsmi_status_t amdsmi_get_afids_from_cper( diff --git a/src/amd_smi/amd_smi_cper.cc b/src/amd_smi/amd_smi_cper.cc index 79516f63f6..370f3ba970 100644 --- a/src/amd_smi/amd_smi_cper.cc +++ b/src/amd_smi/amd_smi_cper.cc @@ -338,6 +338,33 @@ static int cper_dump_cr_boot(const struct cper_sec_crashdump *crashdump, const c return aca_decode_fatal(crashdump->data, section->flags_mask, section->revision_major, crashdump->data.reg_ctx_type); }
+/**
+ * @brief Write the product serial number into every section descriptor of a CPER record.
+ *
+ * The serial is stored as decimal text in each descriptor's fru_id field. A serial
+ * with more digits than sizeof(fru_id) - 1 is truncated to fit.
+ *
+ * @param[in,out] cper CPER record whose section descriptors are updated in place;
+ *                     a null pointer is ignored
+ * @param[in] product_serial serial number to write (0 is written as "0")
+ */
+static void inject_product_serial_number(amdsmi_cper_hdr_t *cper, uint64_t product_serial) {
+    if (cper == nullptr) {
+        return;
+    }
+    // Loop-invariant: format the serial once instead of once per section.
+    const std::string serial_text = std::to_string(product_serial);
+    for (size_t idx = 0; idx < cper_num_sec(cper); ++idx) {
+        struct cper_sec_desc *sec_desc = static_cast<struct cper_sec_desc *>(cper_get_sec_desc_offset(cper, idx));
+        if (sec_desc == nullptr) {
+            continue;
+        }
+        // strncpy NUL-pads short serials; the explicit terminator covers truncation.
+        strncpy(sec_desc->fru_id, serial_text.c_str(), sizeof(sec_desc->fru_id) - 1);
+        sec_desc->fru_id[sizeof(sec_desc->fru_id) - 1] = '\0';
+    }
+}
+
 } //namespace amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path( @@ -347,7 +374,8 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path( uint64_t *buf_size, amdsmi_cper_hdr_t **cper_hdrs, uint64_t *entry_count, - uint64_t *cursor) { + uint64_t *cursor, + uint64_t product_serial) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] begin\n" @@ -461,6 +489,7 @@ amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path( &cper_data[data_idx], reinterpret_cast(header), header->record_length); + inject_product_serial_number(reinterpret_cast(&cper_data[data_idx]), product_serial); data_idx += header->record_length; } *entry_count = num_headers_copied; diff --git a/src/amd_smi/amd_smi_utils.cc b/src/amd_smi/amd_smi_utils.cc index 22230f7693..0d92421c93 100644 --- a/src/amd_smi/amd_smi_utils.cc +++ b/src/amd_smi/amd_smi_utils.cc @@ -1004,3 +1004,65 @@ struct CperFileCtx { std::unique_ptr buffer; long file_size = 0; };
+
+
+/**
+ * @brief Read the board's product serial number and convert it to an integer.
+ *
+ * The serial string is accepted only if, apart from surrounding whitespace, it
+ * consists solely of decimal digits. std::stoull on its own would accept a sign
+ * ("-1" wraps around) and stop at the first non-digit ("123ABC" parses as 123),
+ * yielding a value that is not the real serial.
+ *
+ * @param[in] processor_handle handle passed to amdsmi_get_gpu_board_info
+ *
+ * @return the serial number, or 0 if the board info cannot be read or the serial
+ *         is empty, not a decimal number, or out of range for unsigned long long
+ */
+uint64_t get_product_serial_number(amdsmi_processor_handle processor_handle) {
+    amdsmi_board_info_t board_info = {};
+    const amdsmi_status_t status = amdsmi_get_gpu_board_info(processor_handle, &board_info);
+    if (status != AMDSMI_STATUS_SUCCESS) {
+        std::ostringstream ss;
+        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ <<
+            "Failed to retrieve product serial number! error: " <<
+            static_cast<int>(status);
+        LOG_DEBUG(ss);
+        return 0;
+    }
+
+    // Test through a pointer variable so the null check is meaningful whether
+    // product_serial is an array (where `!board_info.product_serial` is always
+    // false) or a pointer.
+    const char *raw_serial = board_info.product_serial;
+    const std::string serial_text = (raw_serial != nullptr) ? raw_serial : "";
+    const char *const kWhitespace = " \t\r\n";
+    const size_t first = serial_text.find_first_not_of(kWhitespace);
+    if (first == std::string::npos) {
+        std::ostringstream ss;
+        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ <<
+            " Product serial string is empty.";
+        LOG_DEBUG(ss);
+        return 0;
+    }
+    const size_t last = serial_text.find_last_not_of(kWhitespace);
+    const std::string digits = serial_text.substr(first, last - first + 1);
+
+    if (digits.find_first_not_of("0123456789") != std::string::npos) {
+        std::ostringstream ss;
+        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ <<
+            " Invalid product serial string: not a decimal number.";
+        LOG_DEBUG(ss);
+        return 0;
+    }
+
+    try {
+        return std::stoull(digits, nullptr, 10);
+    } catch (const std::out_of_range& e) {
+        // All-digit input can only fail by exceeding the range of unsigned long long.
+        std::ostringstream ss;
+        ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ <<
+            " Product serial out of range, Exception: " << e.what();
+        LOG_DEBUG(ss);
+    }
+    return 0;
+}