From 30ffa6feb04f2b3246bf5f859aa2b8d11115e5b8 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Fri, 26 Apr 2024 02:54:25 -0500 Subject: [PATCH] SWDEV-453493 - Fix Null pointer reference in amd-smi bad-pages Signed-off-by: Maisam Arif Change-Id: I10a1278b68cbb464dd0fb38a2de50413f6f43959 [ROCm/amdsmi commit: e6054be6e7547207df68f4fcfa09176783453655] --- projects/amdsmi/CHANGELOG.md | 14 +++-- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 35 ++++++++--- projects/amdsmi/py-interface/README.md | 51 ++++++++++++++- .../amdsmi/py-interface/amdsmi_interface.py | 63 ++++++++++--------- 4 files changed, 119 insertions(+), 44 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index c0afea2e29..970604dd4c 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -12,6 +12,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ### Changed +- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** +Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty. + - **Updated `amd-smi monitor --pcie` output** The source for pcie bandwidth monitor output was a legacy file we no longer support and was causing delays within the monitor command. The output is no longer using TX/RX but instantaneous bandwidth from gpu_metrics instead; updated output: @@ -332,18 +335,21 @@ $ /opt/rocm/bin/amd-smi topology -a -t --json ### Fixed -- **Fix for GPU reset error on non-amdgpu cards** +- **Fixed python interface call amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** +Previously python interface calls to populated bad pages resulted in a `ValueError: NULL pointer access`. This fixes the bad-pages subcommand CLI subcommand as well. + +- **Fix for GPU reset error on non-amdgpu cards** Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix updates CLI to target only AMD ASICs. -- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards** +- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards** Updated API to include `amdsmi_card_form_factor_t.AMDSMI_CARD_FORM_FACTOR_CEM`. Prevously, this would report "UNKNOWN". This fix provides the correct board `SLOT_TYPE` associated with these ASICs (and other Navi cards). -- **Fix for `amd-smi process`** +- **Fix for `amd-smi process`** Fixed output results when getting processes running on a device. -- **Improved Error handling for `amd-smi process`** +- **Improved Error handling for `amd-smi process`** Fixed Attribute Error when getting process in csv format ### Known issues diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 492cba6572..72fe9f96f1 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -1015,14 +1015,19 @@ class AMDSMICommands(): # Get gpu_id for logging gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + bad_pages_not_found = "No bad pages found." try: bad_page_info = amdsmi_interface.amdsmi_get_gpu_bad_page_info(args.gpu) + # If bad_page_info is an empty list overwrite with not found error statement + if bad_page_info == []: + bad_page_info = bad_pages_not_found + bad_page_error = True + else: + bad_page_error = False except amdsmi_exception.AmdSmiLibraryException as e: bad_page_info = "N/A" - logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info()) - - if bad_page_info == "N/A" or bad_page_info == "No bad pages found.": bad_page_error = True + logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info()) if args.retired: if bad_page_error: @@ -1034,13 +1039,17 @@ class AMDSMICommands(): bad_page_info_entry = {} bad_page_info_entry["page_address"] = bad_page["page_address"] bad_page_info_entry["page_size"] = bad_page["page_size"] - bad_page_info_entry["status"] = bad_page["status"].name + status_string = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[bad_page["status"]] + bad_page_info_entry["status"] = status_string.replace("AMDSMI_MEM_PAGE_STATUS_", "") bad_page_info_output.append(bad_page_info_entry) # Remove brackets if there is only one value if len(bad_page_info_output) == 1: bad_page_info_output = bad_page_info_output[0] - values_dict['retired'] = bad_page_info_output + if bad_page_info_output == []: + values_dict['retired'] = bad_pages_not_found + else: + values_dict['retired'] = bad_page_info_output if args.pending: if bad_page_error: @@ -1052,13 +1061,17 @@ class AMDSMICommands(): bad_page_info_entry = {} bad_page_info_entry["page_address"] = bad_page["page_address"] bad_page_info_entry["page_size"] = bad_page["page_size"] - bad_page_info_entry["status"] = bad_page["status"].name + status_string = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[bad_page["status"]] + bad_page_info_entry["status"] = status_string.replace("AMDSMI_MEM_PAGE_STATUS_", "") bad_page_info_output.append(bad_page_info_entry) # Remove brackets if there is only one value if len(bad_page_info_output) == 1: bad_page_info_output = bad_page_info_output[0] - values_dict['pending'] = bad_page_info_output + if bad_page_info_output == []: + values_dict['pending'] = bad_pages_not_found + else: + values_dict['pending'] = bad_page_info_output if args.un_res: if bad_page_error: @@ -1070,13 +1083,17 @@ class AMDSMICommands(): bad_page_info_entry = {} bad_page_info_entry["page_address"] = bad_page["page_address"] bad_page_info_entry["page_size"] = bad_page["page_size"] - bad_page_info_entry["status"] = bad_page["status"].name + status_string = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[bad_page["status"]] + bad_page_info_entry["status"] = status_string.replace("AMDSMI_MEM_PAGE_STATUS_", "") bad_page_info_output.append(bad_page_info_entry) # Remove brackets if there is only one value if len(bad_page_info_output) == 1: bad_page_info_output = bad_page_info_output[0] - values_dict['un_res'] = bad_page_info_output + if bad_page_info_output == []: + values_dict['un_res'] = bad_pages_not_found + else: + values_dict['un_res'] = bad_page_info_output # Store values in logger.output self.logger.store_output(args.gpu, 'values', values_dict) diff --git a/projects/amdsmi/py-interface/README.md b/projects/amdsmi/py-interface/README.md index dae8d0ad1b..4cc6ecaf0d 100644 --- a/projects/amdsmi/py-interface/README.md +++ b/projects/amdsmi/py-interface/README.md @@ -843,7 +843,7 @@ Input parameters: * `processor_handle` device which to query -Output: List consisting of dictionaries with fields for each bad page found +Output: List consisting of dictionaries with fields for each bad page found; can be an empty list Field | Description ---|--- @@ -868,7 +868,7 @@ try: else: for device in devices: bad_page_info = amdsmi_get_gpu_bad_page_info(device) - if not len(bad_page_info): + if not bad_page_info: # Can be empty list print("No bad pages found") continue for bad_page in bad_page_info: @@ -880,6 +880,53 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_get_gpu_memory_reserved_pages + +Description: Returns reserved memory page info for the given GPU. +It is not supported on virtual machine guest + +Input parameters: + +* `processor_handle` device which to query + +Output: List consisting of dictionaries with fields for each reserved memory page found; can be an empty list + +Field | Description +---|--- +`value` | Value of memory reserved page +`page_address` | Address of memory reserved page +`page_size` | Size of memory reserved page +`status` | Status of memory reserved page + +Exceptions that can be thrown by `amdsmi_get_gpu_memory_reserved_pages` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + reserved_memory_page_info = amdsmi_get_gpu_memory_reserved_pages(device) + if not reserved_memory_page_info: # Can be empty list + print("No memory reserved pages found") + continue + for reserved_memory_page in reserved_memory_page_info: + print(reserved_memory_page["value"]) + print(reserved_memory_page["page_address"]) + print(reserved_memory_page["page_size"]) + print(reserved_memory_page["status"]) +except AmdSmiException as e: + print(e) +``` + + ### amdsmi_get_gpu_process_list Description: Returns the list of processes running on the target GPU; May require root level access diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 2cff4bf777..793ffdec61 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -440,21 +440,22 @@ def _format_bad_page_info(bad_page_info, bad_page_count: ctypes.c_uint32) -> Lis Format bad page info data retrieved. Parameters: - bad_page_info(`POINTER(amdsmi_retired_page_record_t)`): Pointer to bad page info - retrieved. + bad_page_info(`amdsmi_retired_page_record_t`): A populated list of amdsmi_retired_page_record_t(s) + retrieved. Ex: (amdsmi_wrapper.amdsmi_retired_page_record_t * #)() bad_page_count(`c_uint32`): Bad page count. Returns: - `list`: List containing formatted bad pages. + `list`: List containing formatted bad pages. Can be empty """ - if not isinstance( - bad_page_info, ctypes.POINTER( - amdsmi_wrapper.amdsmi_retired_page_record_t) - ): - raise AmdSmiParameterException( - bad_page_info, ctypes.POINTER( - amdsmi_wrapper.amdsmi_retired_page_record_t) - ) + if bad_page_count == 0: + return [] + + # Check if each struct within bad_page_info is valid + for bad_page in bad_page_info: + if not isinstance(bad_page, amdsmi_wrapper.amdsmi_retired_page_record_t): + raise AmdSmiParameterException( + bad_page, amdsmi_wrapper.amdsmi_retired_page_record_t + ) table_records = [] for i in range(bad_page_count.value): @@ -1803,23 +1804,24 @@ def amdsmi_get_gpu_bad_page_info( ) num_pages = ctypes.c_uint32() - retired_page_record = ctypes.POINTER( - amdsmi_wrapper.amdsmi_retired_page_record_t)() - + nullptr = ctypes.POINTER(amdsmi_wrapper.amdsmi_retired_page_record_t)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_bad_page_info( - processor_handle, ctypes.byref(num_pages), retired_page_record + processor_handle, ctypes.byref(num_pages), nullptr ) ) - table_records = _format_bad_page_info(retired_page_record, num_pages) - if num_pages.value == 0: - return "No bad pages found." - else: - table_records = _format_bad_page_info(retired_page_record, num_pages) + return [] - return table_records + bad_pages = (amdsmi_wrapper.amdsmi_retired_page_record_t * num_pages.value)() + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_bad_page_info( + processor_handle, ctypes.byref(num_pages), bad_pages + ) + ) + + return _format_bad_page_info(bad_pages, num_pages) def amdsmi_get_gpu_total_ecc_count( @@ -3859,21 +3861,24 @@ def amdsmi_get_gpu_memory_reserved_pages( ) num_pages = ctypes.c_uint32() - retired_page_record = ctypes.POINTER( - amdsmi_wrapper.amdsmi_retired_page_record_t)() + nullptr = ctypes.POINTER(amdsmi_wrapper.amdsmi_retired_page_record_t)() _check_res( amdsmi_wrapper.amdsmi_get_gpu_memory_reserved_pages( - processor_handle, ctypes.byref(num_pages), retired_page_record + processor_handle, ctypes.byref(num_pages), nullptr ) ) - table_records = _format_bad_page_info(retired_page_record, num_pages) if num_pages.value == 0: - return "No bad pages found." - else: - table_records = _format_bad_page_info(retired_page_record, num_pages) + return [] - return table_records + mem_reserved_pages = (amdsmi_wrapper.amdsmi_retired_page_record_t * num_pages)() + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_memory_reserved_pages( + processor_handle, ctypes.byref(num_pages), mem_reserved_pages + ) + ) + + return _format_bad_page_info(mem_reserved_pages, num_pages) def amdsmi_get_gpu_metrics_header_info(