SWDEV-453493 - Fix Null pointer reference in amd-smi bad-pages

Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: I10a1278b68cbb464dd0fb38a2de50413f6f43959


[ROCm/amdsmi commit: e6054be6e7]
Bu işleme şunda yer alıyor:
Maisam Arif
2024-04-26 02:54:25 -05:00
ebeveyn 83d5f67750
işleme 30ffa6feb0
4 değiştirilmiş dosya ile 119 ekleme ve 44 silme
+10 -4
Dosyayı Görüntüle
@@ -12,6 +12,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](
### Changed
- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info**
Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty.
- **Updated `amd-smi monitor --pcie` output**
The source for pcie bandwidth monitor output was a legacy file we no longer support and was causing delays within the monitor command. The output is no longer using TX/RX but instantaneous bandwidth from gpu_metrics instead; updated output:
@@ -332,18 +335,21 @@ $ /opt/rocm/bin/amd-smi topology -a -t --json
### Fixed
- **Fix for GPU reset error on non-amdgpu cards**
- **Fixed python interface call amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info**
Previously python interface calls to populated bad pages resulted in a `ValueError: NULL pointer access`. This fixes the bad-pages subcommand CLI subcommand as well.
- **Fix for GPU reset error on non-amdgpu cards**
Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix
updates CLI to target only AMD ASICs.
- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards**
- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards**
Updated API to include `amdsmi_card_form_factor_t.AMDSMI_CARD_FORM_FACTOR_CEM`. Prevously, this would report "UNKNOWN". This fix
provides the correct board `SLOT_TYPE` associated with these ASICs (and other Navi cards).
- **Fix for `amd-smi process`**
- **Fix for `amd-smi process`**
Fixed output results when getting processes running on a device.
- **Improved Error handling for `amd-smi process`**
- **Improved Error handling for `amd-smi process`**
Fixed Attribute Error when getting process in csv format
### Known issues
+26 -9
Dosyayı Görüntüle
@@ -1015,14 +1015,19 @@ class AMDSMICommands():
# Get gpu_id for logging
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
bad_pages_not_found = "No bad pages found."
try:
bad_page_info = amdsmi_interface.amdsmi_get_gpu_bad_page_info(args.gpu)
# If bad_page_info is an empty list overwrite with not found error statement
if bad_page_info == []:
bad_page_info = bad_pages_not_found
bad_page_error = True
else:
bad_page_error = False
except amdsmi_exception.AmdSmiLibraryException as e:
bad_page_info = "N/A"
logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info())
if bad_page_info == "N/A" or bad_page_info == "No bad pages found.":
bad_page_error = True
logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info())
if args.retired:
if bad_page_error:
@@ -1034,13 +1039,17 @@ class AMDSMICommands():
bad_page_info_entry = {}
bad_page_info_entry["page_address"] = bad_page["page_address"]
bad_page_info_entry["page_size"] = bad_page["page_size"]
bad_page_info_entry["status"] = bad_page["status"].name
status_string = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[bad_page["status"]]
bad_page_info_entry["status"] = status_string.replace("AMDSMI_MEM_PAGE_STATUS_", "")
bad_page_info_output.append(bad_page_info_entry)
# Remove brackets if there is only one value
if len(bad_page_info_output) == 1:
bad_page_info_output = bad_page_info_output[0]
values_dict['retired'] = bad_page_info_output
if bad_page_info_output == []:
values_dict['retired'] = bad_pages_not_found
else:
values_dict['retired'] = bad_page_info_output
if args.pending:
if bad_page_error:
@@ -1052,13 +1061,17 @@ class AMDSMICommands():
bad_page_info_entry = {}
bad_page_info_entry["page_address"] = bad_page["page_address"]
bad_page_info_entry["page_size"] = bad_page["page_size"]
bad_page_info_entry["status"] = bad_page["status"].name
status_string = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[bad_page["status"]]
bad_page_info_entry["status"] = status_string.replace("AMDSMI_MEM_PAGE_STATUS_", "")
bad_page_info_output.append(bad_page_info_entry)
# Remove brackets if there is only one value
if len(bad_page_info_output) == 1:
bad_page_info_output = bad_page_info_output[0]
values_dict['pending'] = bad_page_info_output
if bad_page_info_output == []:
values_dict['pending'] = bad_pages_not_found
else:
values_dict['pending'] = bad_page_info_output
if args.un_res:
if bad_page_error:
@@ -1070,13 +1083,17 @@ class AMDSMICommands():
bad_page_info_entry = {}
bad_page_info_entry["page_address"] = bad_page["page_address"]
bad_page_info_entry["page_size"] = bad_page["page_size"]
bad_page_info_entry["status"] = bad_page["status"].name
status_string = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[bad_page["status"]]
bad_page_info_entry["status"] = status_string.replace("AMDSMI_MEM_PAGE_STATUS_", "")
bad_page_info_output.append(bad_page_info_entry)
# Remove brackets if there is only one value
if len(bad_page_info_output) == 1:
bad_page_info_output = bad_page_info_output[0]
values_dict['un_res'] = bad_page_info_output
if bad_page_info_output == []:
values_dict['un_res'] = bad_pages_not_found
else:
values_dict['un_res'] = bad_page_info_output
# Store values in logger.output
self.logger.store_output(args.gpu, 'values', values_dict)
+49 -2
Dosyayı Görüntüle
@@ -843,7 +843,7 @@ Input parameters:
* `processor_handle` device which to query
Output: List consisting of dictionaries with fields for each bad page found
Output: List consisting of dictionaries with fields for each bad page found; can be an empty list
Field | Description
---|---
@@ -868,7 +868,7 @@ try:
else:
for device in devices:
bad_page_info = amdsmi_get_gpu_bad_page_info(device)
if not len(bad_page_info):
if not bad_page_info: # Can be empty list
print("No bad pages found")
continue
for bad_page in bad_page_info:
@@ -880,6 +880,53 @@ except AmdSmiException as e:
print(e)
```
### amdsmi_get_gpu_memory_reserved_pages
Description: Returns reserved memory page info for the given GPU.
It is not supported on virtual machine guest
Input parameters:
* `processor_handle` device which to query
Output: List consisting of dictionaries with fields for each reserved memory page found; can be an empty list
Field | Description
---|---
`value` | Value of memory reserved page
`page_address` | Address of memory reserved page
`page_size` | Size of memory reserved page
`status` | Status of memory reserved page
Exceptions that can be thrown by `amdsmi_get_gpu_memory_reserved_pages` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
```python
try:
devices = amdsmi_get_processor_handles()
if len(devices) == 0:
print("No GPUs on machine")
else:
for device in devices:
reserved_memory_page_info = amdsmi_get_gpu_memory_reserved_pages(device)
if not reserved_memory_page_info: # Can be empty list
print("No memory reserved pages found")
continue
for reserved_memory_page in reserved_memory_page_info:
print(reserved_memory_page["value"])
print(reserved_memory_page["page_address"])
print(reserved_memory_page["page_size"])
print(reserved_memory_page["status"])
except AmdSmiException as e:
print(e)
```
### amdsmi_get_gpu_process_list
Description: Returns the list of processes running on the target GPU; May require root level access
+34 -29
Dosyayı Görüntüle
@@ -440,21 +440,22 @@ def _format_bad_page_info(bad_page_info, bad_page_count: ctypes.c_uint32) -> Lis
Format bad page info data retrieved.
Parameters:
bad_page_info(`POINTER(amdsmi_retired_page_record_t)`): Pointer to bad page info
retrieved.
bad_page_info(`amdsmi_retired_page_record_t`): A populated list of amdsmi_retired_page_record_t(s)
retrieved. Ex: (amdsmi_wrapper.amdsmi_retired_page_record_t * #)()
bad_page_count(`c_uint32`): Bad page count.
Returns:
`list`: List containing formatted bad pages.
`list`: List containing formatted bad pages. Can be empty
"""
if not isinstance(
bad_page_info, ctypes.POINTER(
amdsmi_wrapper.amdsmi_retired_page_record_t)
):
raise AmdSmiParameterException(
bad_page_info, ctypes.POINTER(
amdsmi_wrapper.amdsmi_retired_page_record_t)
)
if bad_page_count == 0:
return []
# Check if each struct within bad_page_info is valid
for bad_page in bad_page_info:
if not isinstance(bad_page, amdsmi_wrapper.amdsmi_retired_page_record_t):
raise AmdSmiParameterException(
bad_page, amdsmi_wrapper.amdsmi_retired_page_record_t
)
table_records = []
for i in range(bad_page_count.value):
@@ -1803,23 +1804,24 @@ def amdsmi_get_gpu_bad_page_info(
)
num_pages = ctypes.c_uint32()
retired_page_record = ctypes.POINTER(
amdsmi_wrapper.amdsmi_retired_page_record_t)()
nullptr = ctypes.POINTER(amdsmi_wrapper.amdsmi_retired_page_record_t)()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_bad_page_info(
processor_handle, ctypes.byref(num_pages), retired_page_record
processor_handle, ctypes.byref(num_pages), nullptr
)
)
table_records = _format_bad_page_info(retired_page_record, num_pages)
if num_pages.value == 0:
return "No bad pages found."
else:
table_records = _format_bad_page_info(retired_page_record, num_pages)
return []
return table_records
bad_pages = (amdsmi_wrapper.amdsmi_retired_page_record_t * num_pages.value)()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_bad_page_info(
processor_handle, ctypes.byref(num_pages), bad_pages
)
)
return _format_bad_page_info(bad_pages, num_pages)
def amdsmi_get_gpu_total_ecc_count(
@@ -3859,21 +3861,24 @@ def amdsmi_get_gpu_memory_reserved_pages(
)
num_pages = ctypes.c_uint32()
retired_page_record = ctypes.POINTER(
amdsmi_wrapper.amdsmi_retired_page_record_t)()
nullptr = ctypes.POINTER(amdsmi_wrapper.amdsmi_retired_page_record_t)()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_memory_reserved_pages(
processor_handle, ctypes.byref(num_pages), retired_page_record
processor_handle, ctypes.byref(num_pages), nullptr
)
)
table_records = _format_bad_page_info(retired_page_record, num_pages)
if num_pages.value == 0:
return "No bad pages found."
else:
table_records = _format_bad_page_info(retired_page_record, num_pages)
return []
return table_records
mem_reserved_pages = (amdsmi_wrapper.amdsmi_retired_page_record_t * num_pages)()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_memory_reserved_pages(
processor_handle, ctypes.byref(num_pages), mem_reserved_pages
)
)
return _format_bad_page_info(mem_reserved_pages, num_pages)
def amdsmi_get_gpu_metrics_header_info(