diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index bb031b6725..8b84395bbb 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -6476,7 +6476,10 @@ class AMDSMICommands(): args.gpu = self.device_handles if args.afid and args.cper_file: - self.helpers.pvtDumpAfids(args.cper_file) + afids = self.helpers.pvtDumpAfids(args.cper_file) + for afid in afids: + print(afid, end=" ") + print("") return if not self.group_check_printed: diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index cb563ed33c..f29f6eb26f 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -1096,7 +1096,7 @@ class AMDSMIHelpers(): # Header print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12}", end="") if folder: - print(f" {'file_name':<17}", end="") + print(f" {'file_name':<17} {'afid'}", end="") print("") self._cper_display_initialized = True @@ -1122,13 +1122,19 @@ class AMDSMIHelpers(): print(f"{timestamp:<20} {gpu_id:<7} {prefix:<12}", end="") if folder: print(f" {cper_data_file:<17}", end="") + afids = self.pvtDumpAfids(cper_data_file) + for afid in afids: + print(afid, end=" ") print("") self.increment_cper_count() def dump_cper_entries(self, folder, entries, cper_data, device_handle, file_limit=None): # One‐time header if not getattr(self, "_cper_display_initialized", False): - print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} {'file_name':<17}") + print(f"{'timestamp':<20} {'gpu_id':<7} {'severity':<12} ", end="") + if folder: + print(f"{'file_name':<17} {'afid'}", end="") + print("") self._cper_display_initialized = True if folder: @@ -1201,7 +1207,9 @@ class AMDSMIHelpers(): to_print = printed_rows for ts, gid, prefix, fname in to_print: - print(f"{ts:<20} {gid:<7} {prefix:<12} {fname:<17}") + cper_path = folder / cper_name + afids = 
self.pvtDumpAfids(cper_path) + print(f"{ts:<20} {gid:<7} {prefix:<12} {fname:<17} {afids}") else: print(json.dumps( @@ -1278,13 +1286,9 @@ class AMDSMIHelpers(): else: # assume it's already bytes raw = raw_data - size = len(raw) self.hexdump_to_string(raw) afids, num_afids = amdsmi_interface.amdsmi_get_afids_from_cper(raw) - print(f"AFIDS: ", end="") - for afid in afids: - print(afid, end=" ") - print("") + return afids def ras_cper(self, args, device_handle, logger, gpu_idx): # Parse severity mask dynamically from the --severity option. @@ -1351,16 +1355,7 @@ class AMDSMIHelpers(): if len(entries) == 0: break if args.folder: - if args.follow: - if device_handle: - self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit) - else: - self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit) - else: - if device_handle: - self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit) - else: - self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit) + self.dump_cper_entries(args.folder, entries, cper_data, device_handle, args.file_limit) break else: self.display_cper_files_generated(entries, device_handle, args.folder, args.follow) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index b035905d6b..0ca762e717 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -304,26 +304,25 @@ class AMDSMIParser(argparse.ArgumentParser): def _check_cper_file_path(self): """ Argument action validator: Returns a path to a file from the input file path provided. - If the file doesn't exist or is empty raise error + If the file doesn't exist, is empty, or is invalid, raise an error. 
""" class _CheckInputFilePath(argparse.Action): # Checks the values def __call__(self, parser, args, values, option_string=None): path = Path(values) - if not path.exists(): - raise FileNotFoundError(f"CPER file could not be read. Make sure the path '{path}' is correct. ") - - if path.is_dir(): - raise argparse.ArgumentTypeError( - f"Invalid Path: {path} is directory when it needs to be a specific file") - - if path.is_file(): - if os.stat(values).st_size == 0: - raise argparse.ArgumentTypeError(f"Invalid Path: {path} Input file is empty") - setattr(args, self.dest, path) - else: - raise argparse.ArgumentTypeError( - f"Invalid path:{path} Could not determine if value given is a valid path") + try: + if not path.exists(): + raise FileNotFoundError(f"CPER file could not be read. Make sure the path '{path}' is correct.") + if path.is_dir(): + raise IsADirectoryError(f"Invalid Path: {path} is a directory when it needs to be a specific file.") + if path.is_file(): + if os.stat(values).st_size == 0: + raise ValueError(f"Invalid Path: {path} Input file is empty.") + setattr(args, self.dest, path) + else: + raise FileNotFoundError(f"Invalid Path: {path} Could not determine if the value given is a valid path.") + except Exception as root_cause: + raise amdsmi_cli_exceptions.AmdSmiInvalidFilePathException(path, _CheckInputFilePath.outputformat) from root_cause return _CheckInputFilePath diff --git a/projects/amdsmi/docs/reference/amdsmi-py-api.md b/projects/amdsmi/docs/reference/amdsmi-py-api.md index 8040c27e66..414630c55f 100644 --- a/projects/amdsmi/docs/reference/amdsmi-py-api.md +++ b/projects/amdsmi/docs/reference/amdsmi-py-api.md @@ -511,6 +511,45 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_get_gpu_board_info + +Description: Returns board info for the given GPU + +Input parameters: + +* `processor_handle` device which to query + +Output: Dictionary with fields correctable and uncorrectable + +Field | Description +---|--- +`model_number` | Board serial 
number +`product_serial` | Product serial +`fru_id` | FRU ID +`product_name` | Product name +`manufacturer_name` | Manufacturer name + +Exceptions that can be thrown by `amdsmi_get_gpu_board_info` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + device = amdsmi_get_processor_handle_from_bdf("0000:23.00.0") + board_info = amdsmi_get_gpu_board_info(device) + print(board_info["model_number"]) + print(board_info["product_serial"]) + print(board_info["fru_id"]) + print(board_info["product_name"]) + print(board_info["manufacturer_name"]) +except AmdSmiException as e: + print(e) +``` + ### amdsmi_get_gpu_cache_info Description: Returns a list of dictionaries containing cache information for the given GPU. @@ -1185,6 +1224,7 @@ except AmdSmiException as e: Description: Dump CPER entries for a given GPU in a file using from CPER header file from RAS tool. Input parameters: + * `processor_handle` device which to query * `severity_mask` the severity mask of the entries to be retrieved * `buffer_size` pointer to a variable that specifies the size of the cper_data @@ -1218,53 +1258,65 @@ Example: ```python for device in devices: entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor) - print("CPER entries for device", device) + print("CPER entries for device", device) for key, entry in entries.items(): print("Entry", key) print(" Error Severity:", entry.get("error_severity", "Unknown")) print(" Notify Type:", entry.get("notify_type", "Unknown")) print(" Timestamp:", entry.get("timestamp", "")) - print() + print() print("New Cursor Position:", new_cursor) except AmdSmiException as e: print(e) ``` -### amdsmi_get_gpu_board_info +### amdsmi_get_afids_from_cper -Description: Returns board info for the given GPU +Description: Get the AFIDs from CPER buffer Input parameters: * `processor_handle` device which to query +* `severity_mask` the severity 
mask of the entries to be retrieved +* `buffer_size` pointer to a variable that specifies the size of the cper_data +* `cursor` pointer to a variable that will contain the cursor for the next call -Output: Dictionary with fields correctable and uncorrectable +Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data Field | Description ---|--- -`model_number` | Board serial number -`product_serial` | Product serial -`fru_id` | FRU ID -`product_name` | Product name -`manufacturer_name` | Manufacturer name +`error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. | +`notify_type` | The notification type associated with the CPER entry. | +`timestamp` | The time when the CPER entry was recorded, formatted as `YYYY/MM/DD HH:MM:SS`. | +`signature` | A 4-byte signature identifying the entry, typically `CPER`. | +`revision` | The revision number of the CPER record format. | +`signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. | +`sec_cnt` | The count of sections included in the CPER entry. | +`record_length` | The total length in bytes of the CPER entry. | +`platform_id` | A character array identifying the GPU or platform. | +`creator_id` | A character array indicating the creator of the CPER entry. | +`record_id` | A unique identifier for the CPER entry. | +`flags` | Reserved flags related to the CPER entry. | +`persistence_info` | Reserved information related to persistence. 
| -Exceptions that can be thrown by `amdsmi_get_gpu_board_info` function: +Exceptions that can be thrown by `amdsmi_get_afids_from_cper` function: * `AmdSmiLibraryException` -* `AmdSmiRetryException` * `AmdSmiParameterException` Example: ```python -try: - device = amdsmi_get_processor_handle_from_bdf("0000:23.00.0") - board_info = amdsmi_get_gpu_board_info(device) - print(board_info["model_number"]) - print(board_info["product_serial"]) - print(board_info["fru_id"]) - print(board_info["product_name"]) - print(board_info["manufacturer_name"]) +for device in devices: + entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor) + print("CPER entries for device", device) + for key, entry in entries.items(): + print("Entry", key) + print(" Error Severity:", entry.get("error_severity", "Unknown")) + print(" Notify Type:", entry.get("notify_type", "Unknown")) + print(" Timestamp:", entry.get("timestamp", "")) + print() + print("New Cursor Position:", new_cursor) except AmdSmiException as e: print(e) ``` @@ -5390,53 +5442,3 @@ try: except AmdSmiException as e: print(e) ``` - -### amdsmi_get_afids_from_cper - -Description: Get the AFIDs from CPER buffer - -Input parameters: -* `processor_handle` device which to query -* `severity_mask` the severity mask of the entries to be retrieved -* `buffer_size` pointer to a variable that specifies the size of the cper_data -* `cursor` pointer to a variable that will contain the cursor for the next call - -Output: Dictionary with fields, updated cursor, and a dictionary of the cper_data - -Field | Description ----|--- -`error_severity` | The severity of the CPER error ex: `non_fatal_uncorrected`, `fatal`, `non_fatal_corrected`. | -`notify_type` | The notification type associated with the CPER entry. | -`timestamp` | The time when the CPER entry was recorded, formatted as `YYYY/MM/DD HH:MM:SS`. | -`signature` | A 4-byte signature identifying the entry, typically `CPER`.
| -`revision` | The revision number of the CPER record format. | -`signature_end` | A marker value (typically `0xFFFFFFFF`) confirming the integrity of the signature. | -`sec_cnt` | The count of sections included in the CPER entry. | -`record_length` | The total length in bytes of the CPER entry. | -`platform_id` | A character array identifying the GPU or platform. | -`creator_id` | A character array indicating the creator of the CPER entry. | -`record_id` | A unique identifier for the CPER entry. | -`flags` | Reserved flags related to the CPER entry. | -`persistence_info` | Reserved information related to persistence. | - -Exceptions that can be thrown by `amdsmi_get_gpu_cper_entries` function: - -* `AmdSmiLibraryException` -* `AmdSmiParameterException` - -Example: - -```python -for device in devices: - entries, new_cursor, cper_data = amdsmi_get_gpu_cper_entries(device, severity_mask, buffer_size, initial_cursor) - print("CPER entries for device", device) - for key, entry in entries.items(): - print("Entry", key) - print(" Error Severity:", entry.get("error_severity", "Unknown")) - print(" Notify Type:", entry.get("notify_type", "Unknown")) - print(" Timestamp:", entry.get("timestamp", "")) - print() - print("New Cursor Position:", new_cursor) -except AmdSmiException as e: - print(e) -``` diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 6eeac910c9..c8af0d020f 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -4145,7 +4145,7 @@ amdsmi_status_t amdsmi_get_afids_from_cper( return AMDSMI_STATUS_INVAL; } - int i = 0; + uint32_t i = 0; for(int afid: cper_decode(cper)) { if(i < *num_afids) { afids[i] = afid; diff --git a/projects/amdsmi/src/amd_smi/amd_smi_cper.cc b/projects/amdsmi/src/amd_smi/amd_smi_cper.cc index aa4a118ff9..91f5b3166b 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_cper.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_cper.cc @@ -213,7 +213,6 @@ static 
void* cper_get_sec_desc_offset(const amdsmi_cper_hdr_t *hdr, int idx) static void* cper_get_sec_offset(const amdsmi_cper_hdr_t *hdr, int idx) { struct cper_sec_desc *tmp_desc; - char *offset; if (idx >= hdr->sec_cnt) return 0; @@ -313,14 +312,13 @@ static int cper_dump_nonstd_err(const struct cper_sec_nonstd_err *nonstd_err) std::ostringstream ss; struct cper_sec_nonstd_err_body *body; - char *offset; ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS]\n~~~~NON STANDARD SECTION~~~\n"; ss << "[NonSTD SEC] Err Info Count = 0x" << std::hex << nonstd_err->hdr.valid_bits.err_info_cnt << "\n"; ss << "[NonSTD SEC] Err Context Count = 0x" << std::hex << nonstd_err->hdr.valid_bits.err_context_cnt << "\n"; - if (nonstd_err->hdr.valid_bits.err_context_cnt != nonstd_err->hdr.valid_bits.err_context_cnt) { + if (nonstd_err->hdr.valid_bits.err_info_cnt != nonstd_err->hdr.valid_bits.err_context_cnt) { ss << "~~~~Malformed Non Standard Section!~~~~\n\n"; goto exit; } @@ -554,8 +552,8 @@ std::vector cper_decode(const amdsmi_cper_hdr_t *cper) { } else { ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] Unknown error type!!\n"; - for(int i = 0; i < sizeof(sec_guid->b); ++i) { - ss << std::hex << static_cast(sec_guid->b[i]) << ":"; + for(size_t j = 0; j < sizeof(sec_guid->b); ++j) { + ss << std::hex << static_cast(sec_guid->b[j]) << ":"; } ss << "\n"; LOG_ERROR(ss); diff --git a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc index 7cf27ccb92..95b9083466 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc @@ -937,100 +937,13 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index( return AMDSMI_STATUS_API_FAILED; } -static std::vector -amdsmi_get_gpu_cper_headers(const char *buffer, size_t buffer_sz) { - - std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ - << "[CPER] buffer_sz: " << buffer_sz; - LOG_DEBUG(ss); - - std::vector headers; - 
if(!buffer) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ - << "[CPER] buffer is null"; - LOG_ERROR(ss); - return headers; - } - static constexpr char cper_signature[] = "CPER"; - static constexpr size_t cper_signature_size = sizeof(cper_signature) - 1; - for(size_t data_idx = 0; - buffer_sz >= cper_signature_size && - data_idx < buffer_sz - cper_signature_size; - ++data_idx) { - - const amdsmi_cper_hdr_t *hdr = reinterpret_cast( - &buffer[data_idx]); - if(hdr->signature[0] != 'C' || hdr->signature[1] != 'P' || - hdr->signature[2] != 'E' || hdr->signature[3] != 'R' ) { - continue; - } - if(hdr->signature_end != 0xFFFFFFFF) { - continue; - } - if(hdr->record_length > buffer_sz) { - continue; - } - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ - << "[CPER] add header at data_idx: " << data_idx - << ", sig: " << hdr->signature[0] << hdr->signature[1] << hdr->signature[2] << hdr->signature[3]; - LOG_DEBUG(ss); - headers.emplace_back(hdr); - } - return headers; -} - struct CperFileCtx { amdsmi_status_t status = AMDSMI_STATUS_FILE_ERROR; std::unique_ptr buffer; long file_size = 0; }; -static auto amdsmi_read_cper_file(const std::string &filepath) -> CperFileCtx { - std::ostringstream ss; - - CperFileCtx ctx; - ctx.status = AMDSMI_STATUS_FILE_ERROR; - ctx.file_size = 0; - - struct stat file_stats; - if (stat(filepath.c_str(), &file_stats) == 0) { - if (!S_ISREG(file_stats.st_mode)) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] file is not a regular file: " - << filepath << ", errno: " << errno << "): " << strerror(errno); - return ctx; - } - } else { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[CPER] file does not exist: " - << filepath << ", errno: " << errno << "): " << strerror(errno); - ctx.status = AMDSMI_STATUS_NOT_SUPPORTED; - return ctx; - } - - ctx.file_size = file_stats.st_size; - ctx.buffer = std::make_unique(ctx.file_size); - int file = open(filepath.c_str(), O_RDONLY); - if (file == -1) { - ss << __PRETTY_FUNCTION__ << "\n:" << 
__LINE__ << "[CPER] failed to open file: " - << filepath << ", errno:()" << errno << "): " << strerror(errno); - LOG_ERROR(ss); - return ctx; - } - long bytes_read = read(file, ctx.buffer.get(), ctx.file_size); - if (bytes_read <= 0) { - ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ - << "[CPER] failed to read complete file, read only " - << bytes_read << " of " << ctx.file_size << " bytes"; - LOG_ERROR(ss); - return ctx; - } - close(file); - - ctx.status = AMDSMI_STATUS_SUCCESS; - ctx.file_size = bytes_read; - return ctx; -} void amdsmi_wait_for_user_input(void) { for (;;) { std::cout << "\n\t**Press any key to continue**" << std::endl;