SWDEV-453493 - Fix Null pointer reference in amd-smi bad-pages
Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: I10a1278b68cbb464dd0fb38a2de50413f6f43959
[ROCm/amdsmi commit: e6054be6e7]
Bu işleme şunda yer alıyor:
@@ -12,6 +12,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](
|
||||
|
||||
### Changed
|
||||
|
||||
- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info**
|
||||
Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty.
|
||||
|
||||
- **Updated `amd-smi monitor --pcie` output**
|
||||
The source for pcie bandwidth monitor output was a legacy file we no longer support and was causing delays within the monitor command. The output is no longer using TX/RX but instantaneous bandwidth from gpu_metrics instead; updated output:
|
||||
|
||||
@@ -332,18 +335,21 @@ $ /opt/rocm/bin/amd-smi topology -a -t --json
|
||||
|
||||
### Fixed
|
||||
|
||||
- **Fix for GPU reset error on non-amdgpu cards**
|
||||
- **Fixed python interface call amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info**
|
||||
Previously python interface calls to populated bad pages resulted in a `ValueError: NULL pointer access`. This fixes the bad-pages subcommand CLI subcommand as well.
|
||||
|
||||
- **Fix for GPU reset error on non-amdgpu cards**
|
||||
Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix
|
||||
updates CLI to target only AMD ASICs.
|
||||
|
||||
- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards**
|
||||
- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards**
|
||||
Updated API to include `amdsmi_card_form_factor_t.AMDSMI_CARD_FORM_FACTOR_CEM`. Prevously, this would report "UNKNOWN". This fix
|
||||
provides the correct board `SLOT_TYPE` associated with these ASICs (and other Navi cards).
|
||||
|
||||
- **Fix for `amd-smi process`**
|
||||
- **Fix for `amd-smi process`**
|
||||
Fixed output results when getting processes running on a device.
|
||||
|
||||
- **Improved Error handling for `amd-smi process`**
|
||||
- **Improved Error handling for `amd-smi process`**
|
||||
Fixed Attribute Error when getting process in csv format
|
||||
|
||||
### Known issues
|
||||
|
||||
@@ -1015,14 +1015,19 @@ class AMDSMICommands():
|
||||
# Get gpu_id for logging
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
|
||||
bad_pages_not_found = "No bad pages found."
|
||||
try:
|
||||
bad_page_info = amdsmi_interface.amdsmi_get_gpu_bad_page_info(args.gpu)
|
||||
# If bad_page_info is an empty list overwrite with not found error statement
|
||||
if bad_page_info == []:
|
||||
bad_page_info = bad_pages_not_found
|
||||
bad_page_error = True
|
||||
else:
|
||||
bad_page_error = False
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
bad_page_info = "N/A"
|
||||
logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if bad_page_info == "N/A" or bad_page_info == "No bad pages found.":
|
||||
bad_page_error = True
|
||||
logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
if args.retired:
|
||||
if bad_page_error:
|
||||
@@ -1034,13 +1039,17 @@ class AMDSMICommands():
|
||||
bad_page_info_entry = {}
|
||||
bad_page_info_entry["page_address"] = bad_page["page_address"]
|
||||
bad_page_info_entry["page_size"] = bad_page["page_size"]
|
||||
bad_page_info_entry["status"] = bad_page["status"].name
|
||||
status_string = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[bad_page["status"]]
|
||||
bad_page_info_entry["status"] = status_string.replace("AMDSMI_MEM_PAGE_STATUS_", "")
|
||||
bad_page_info_output.append(bad_page_info_entry)
|
||||
# Remove brackets if there is only one value
|
||||
if len(bad_page_info_output) == 1:
|
||||
bad_page_info_output = bad_page_info_output[0]
|
||||
|
||||
values_dict['retired'] = bad_page_info_output
|
||||
if bad_page_info_output == []:
|
||||
values_dict['retired'] = bad_pages_not_found
|
||||
else:
|
||||
values_dict['retired'] = bad_page_info_output
|
||||
|
||||
if args.pending:
|
||||
if bad_page_error:
|
||||
@@ -1052,13 +1061,17 @@ class AMDSMICommands():
|
||||
bad_page_info_entry = {}
|
||||
bad_page_info_entry["page_address"] = bad_page["page_address"]
|
||||
bad_page_info_entry["page_size"] = bad_page["page_size"]
|
||||
bad_page_info_entry["status"] = bad_page["status"].name
|
||||
status_string = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[bad_page["status"]]
|
||||
bad_page_info_entry["status"] = status_string.replace("AMDSMI_MEM_PAGE_STATUS_", "")
|
||||
bad_page_info_output.append(bad_page_info_entry)
|
||||
# Remove brackets if there is only one value
|
||||
if len(bad_page_info_output) == 1:
|
||||
bad_page_info_output = bad_page_info_output[0]
|
||||
|
||||
values_dict['pending'] = bad_page_info_output
|
||||
if bad_page_info_output == []:
|
||||
values_dict['pending'] = bad_pages_not_found
|
||||
else:
|
||||
values_dict['pending'] = bad_page_info_output
|
||||
|
||||
if args.un_res:
|
||||
if bad_page_error:
|
||||
@@ -1070,13 +1083,17 @@ class AMDSMICommands():
|
||||
bad_page_info_entry = {}
|
||||
bad_page_info_entry["page_address"] = bad_page["page_address"]
|
||||
bad_page_info_entry["page_size"] = bad_page["page_size"]
|
||||
bad_page_info_entry["status"] = bad_page["status"].name
|
||||
status_string = amdsmi_interface.amdsmi_wrapper.amdsmi_memory_page_status_t__enumvalues[bad_page["status"]]
|
||||
bad_page_info_entry["status"] = status_string.replace("AMDSMI_MEM_PAGE_STATUS_", "")
|
||||
bad_page_info_output.append(bad_page_info_entry)
|
||||
# Remove brackets if there is only one value
|
||||
if len(bad_page_info_output) == 1:
|
||||
bad_page_info_output = bad_page_info_output[0]
|
||||
|
||||
values_dict['un_res'] = bad_page_info_output
|
||||
if bad_page_info_output == []:
|
||||
values_dict['un_res'] = bad_pages_not_found
|
||||
else:
|
||||
values_dict['un_res'] = bad_page_info_output
|
||||
|
||||
# Store values in logger.output
|
||||
self.logger.store_output(args.gpu, 'values', values_dict)
|
||||
|
||||
@@ -843,7 +843,7 @@ Input parameters:
|
||||
|
||||
* `processor_handle` device which to query
|
||||
|
||||
Output: List consisting of dictionaries with fields for each bad page found
|
||||
Output: List consisting of dictionaries with fields for each bad page found; can be an empty list
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
@@ -868,7 +868,7 @@ try:
|
||||
else:
|
||||
for device in devices:
|
||||
bad_page_info = amdsmi_get_gpu_bad_page_info(device)
|
||||
if not len(bad_page_info):
|
||||
if not bad_page_info: # Can be empty list
|
||||
print("No bad pages found")
|
||||
continue
|
||||
for bad_page in bad_page_info:
|
||||
@@ -880,6 +880,53 @@ except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
### amdsmi_get_gpu_memory_reserved_pages
|
||||
|
||||
Description: Returns reserved memory page info for the given GPU.
|
||||
It is not supported on virtual machine guest
|
||||
|
||||
Input parameters:
|
||||
|
||||
* `processor_handle` device which to query
|
||||
|
||||
Output: List consisting of dictionaries with fields for each reserved memory page found; can be an empty list
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`value` | Value of memory reserved page
|
||||
`page_address` | Address of memory reserved page
|
||||
`page_size` | Size of memory reserved page
|
||||
`status` | Status of memory reserved page
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_memory_reserved_pages` function:
|
||||
|
||||
* `AmdSmiLibraryException`
|
||||
* `AmdSmiRetryException`
|
||||
* `AmdSmiParameterException`
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
try:
|
||||
devices = amdsmi_get_processor_handles()
|
||||
if len(devices) == 0:
|
||||
print("No GPUs on machine")
|
||||
else:
|
||||
for device in devices:
|
||||
reserved_memory_page_info = amdsmi_get_gpu_memory_reserved_pages(device)
|
||||
if not reserved_memory_page_info: # Can be empty list
|
||||
print("No memory reserved pages found")
|
||||
continue
|
||||
for reserved_memory_page in reserved_memory_page_info:
|
||||
print(reserved_memory_page["value"])
|
||||
print(reserved_memory_page["page_address"])
|
||||
print(reserved_memory_page["page_size"])
|
||||
print(reserved_memory_page["status"])
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
|
||||
### amdsmi_get_gpu_process_list
|
||||
|
||||
Description: Returns the list of processes running on the target GPU; May require root level access
|
||||
|
||||
@@ -440,21 +440,22 @@ def _format_bad_page_info(bad_page_info, bad_page_count: ctypes.c_uint32) -> Lis
|
||||
Format bad page info data retrieved.
|
||||
|
||||
Parameters:
|
||||
bad_page_info(`POINTER(amdsmi_retired_page_record_t)`): Pointer to bad page info
|
||||
retrieved.
|
||||
bad_page_info(`amdsmi_retired_page_record_t`): A populated list of amdsmi_retired_page_record_t(s)
|
||||
retrieved. Ex: (amdsmi_wrapper.amdsmi_retired_page_record_t * #)()
|
||||
bad_page_count(`c_uint32`): Bad page count.
|
||||
|
||||
Returns:
|
||||
`list`: List containing formatted bad pages.
|
||||
`list`: List containing formatted bad pages. Can be empty
|
||||
"""
|
||||
if not isinstance(
|
||||
bad_page_info, ctypes.POINTER(
|
||||
amdsmi_wrapper.amdsmi_retired_page_record_t)
|
||||
):
|
||||
raise AmdSmiParameterException(
|
||||
bad_page_info, ctypes.POINTER(
|
||||
amdsmi_wrapper.amdsmi_retired_page_record_t)
|
||||
)
|
||||
if bad_page_count == 0:
|
||||
return []
|
||||
|
||||
# Check if each struct within bad_page_info is valid
|
||||
for bad_page in bad_page_info:
|
||||
if not isinstance(bad_page, amdsmi_wrapper.amdsmi_retired_page_record_t):
|
||||
raise AmdSmiParameterException(
|
||||
bad_page, amdsmi_wrapper.amdsmi_retired_page_record_t
|
||||
)
|
||||
|
||||
table_records = []
|
||||
for i in range(bad_page_count.value):
|
||||
@@ -1803,23 +1804,24 @@ def amdsmi_get_gpu_bad_page_info(
|
||||
)
|
||||
|
||||
num_pages = ctypes.c_uint32()
|
||||
retired_page_record = ctypes.POINTER(
|
||||
amdsmi_wrapper.amdsmi_retired_page_record_t)()
|
||||
|
||||
nullptr = ctypes.POINTER(amdsmi_wrapper.amdsmi_retired_page_record_t)()
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_bad_page_info(
|
||||
processor_handle, ctypes.byref(num_pages), retired_page_record
|
||||
processor_handle, ctypes.byref(num_pages), nullptr
|
||||
)
|
||||
)
|
||||
|
||||
table_records = _format_bad_page_info(retired_page_record, num_pages)
|
||||
|
||||
if num_pages.value == 0:
|
||||
return "No bad pages found."
|
||||
else:
|
||||
table_records = _format_bad_page_info(retired_page_record, num_pages)
|
||||
return []
|
||||
|
||||
return table_records
|
||||
bad_pages = (amdsmi_wrapper.amdsmi_retired_page_record_t * num_pages.value)()
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_bad_page_info(
|
||||
processor_handle, ctypes.byref(num_pages), bad_pages
|
||||
)
|
||||
)
|
||||
|
||||
return _format_bad_page_info(bad_pages, num_pages)
|
||||
|
||||
|
||||
def amdsmi_get_gpu_total_ecc_count(
|
||||
@@ -3859,21 +3861,24 @@ def amdsmi_get_gpu_memory_reserved_pages(
|
||||
)
|
||||
|
||||
num_pages = ctypes.c_uint32()
|
||||
retired_page_record = ctypes.POINTER(
|
||||
amdsmi_wrapper.amdsmi_retired_page_record_t)()
|
||||
nullptr = ctypes.POINTER(amdsmi_wrapper.amdsmi_retired_page_record_t)()
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_memory_reserved_pages(
|
||||
processor_handle, ctypes.byref(num_pages), retired_page_record
|
||||
processor_handle, ctypes.byref(num_pages), nullptr
|
||||
)
|
||||
)
|
||||
|
||||
table_records = _format_bad_page_info(retired_page_record, num_pages)
|
||||
if num_pages.value == 0:
|
||||
return "No bad pages found."
|
||||
else:
|
||||
table_records = _format_bad_page_info(retired_page_record, num_pages)
|
||||
return []
|
||||
|
||||
return table_records
|
||||
mem_reserved_pages = (amdsmi_wrapper.amdsmi_retired_page_record_t * num_pages)()
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_memory_reserved_pages(
|
||||
processor_handle, ctypes.byref(num_pages), mem_reserved_pages
|
||||
)
|
||||
)
|
||||
|
||||
return _format_bad_page_info(mem_reserved_pages, num_pages)
|
||||
|
||||
|
||||
def amdsmi_get_gpu_metrics_header_info(
|
||||
|
||||
Yeni konuda referans
Bir kullanıcı engelle