SWDEV-455131 - Updated process APIs

- Removed amdsmi_get_gpu_process_info from python API
  - Updated documentation
  - Aligned process --json output format to unit & value format

Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: I82bba1b6df71020b4a5995ff63b9aa62611ce4fe


[ROCm/amdsmi commit: c551c3caed]
Этот коммит содержится в:
Maisam Arif
2024-04-11 05:11:54 -05:00
родитель 6d2aa6f7f8
Коммит 9b4f0f1d2b
5 изменённых файлов: 53 добавлений и 89 удалений
+3
Просмотреть файл
@@ -12,6 +12,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](
### Changed
- **Removed `amdsmi_get_gpu_process_info` from python library**
`amdsmi_get_gpu_process_info` was removed from the C library in an earlier build, but the API was still exposed in the Python interface.
- **Updated metrics --clocks**
Output for `amd-smi metric --clock` is updated to report each engine and includes bug fixes for the clock lock status and deep sleep status.
+22 -12
Просмотреть файл
@@ -2576,28 +2576,38 @@ class AMDSMICommands():
raise e
filtered_process_values = []
for process in process_list:
try:
process_info = amdsmi_interface.amdsmi_get_gpu_process_info(args.gpu, process)
except amdsmi_exception.AmdSmiLibraryException as e:
process_info = "N/A"
logging.debug("Failed to get process info for process %s on gpu %s | %s", process, gpu_id, e.get_error_info())
filtered_process_values.append({'process_info': process_info})
continue
for process_info in process_list:
process_info['mem_usage'] = process_info.pop('mem')
process_info['usage'] = process_info.pop('engine_usage')
engine_usage_unit = "ns"
memory_usage_unit = "B"
if self.logger.is_human_readable_format():
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
engine_usage_unit = "ns"
for usage_metric in process_info['usage']:
process_info['usage'][usage_metric] = f"{process_info['usage'][usage_metric]} {engine_usage_unit}"
for usage_metric in process_info['memory_usage']:
process_info['memory_usage'][usage_metric] = self.helpers.convert_bytes_to_readable(process_info['memory_usage'][usage_metric])
elif self.logger.is_json_format():
process_info['mem_usage'] = {"value" : process_info['mem_usage'],
"unit" : memory_usage_unit}
for usage_metric in process_info['usage']:
process_info['usage'][usage_metric] = {"value" : process_info['usage'][usage_metric],
"unit" : engine_usage_unit}
for usage_metric in process_info['memory_usage']:
process_info['memory_usage'][usage_metric] = {"value" : process_info['memory_usage'][usage_metric],
"unit" : memory_usage_unit}
filtered_process_values.append({'process_info': process_info})
if not filtered_process_values:
process_info = "N/A"
logging.debug("Failed to detect any process on gpu %s", gpu_id)
filtered_process_values.append({'process_info': process_info})
# Arguments will filter the populated processes
@@ -2641,7 +2651,7 @@ class AMDSMICommands():
# Convert and store output by pid for csv format
if self.logger.is_csv_format():
# Check for empty list first
if filtered_process_values == []:
if not filtered_process_values:
self.logger.store_output(args.gpu, 'process_info', 'No running processes detected')
else:
for process_info in filtered_process_values:
@@ -2660,7 +2670,7 @@ class AMDSMICommands():
self.logger.store_output(args.gpu, 'timestamp', int(time.time()))
# Store values in logger.output
if filtered_process_values == []:
if not filtered_process_values:
self.logger.store_output(args.gpu, 'process_info', 'No running processes detected')
else:
for process_info in filtered_process_values:
+11 -46
Просмотреть файл
@@ -882,13 +882,21 @@ except AmdSmiException as e:
### amdsmi_get_gpu_process_list
Description: Returns the list of processes running on the target GPU.
Description: Returns the list of processes running on the target GPU; may require root-level access
Input parameters:
* `processor_handle` device which to query
Output: List of `amdsmi_proc_info_t` process objects running on the target GPU; can be empty
Output: List of dictionaries with the corresponding fields; empty list if no running processes are detected
Field | Description
---|---
`name` | Name of process
`pid` | Process ID
`mem` | Process memory usage
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>
Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
@@ -910,50 +918,7 @@ try:
print("No processes running on this GPU")
else:
for process in processes:
print(amdsmi_get_gpu_process_info(device, process))
except AmdSmiException as e:
print(e)
```
### amdsmi_get_gpu_process_info
Description: Returns info about process given the target GPU and the corresponding `amdsmi_proc_info_t` object
Input parameters:
* `processor_handle` device which to query
Output: Dictionary with fields
Field | Description
---|---
`name` | Name of process
`pid` | Process ID
`mem` | Process memory usage
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>
Exceptions that can be thrown by `amdsmi_get_gpu_process_info` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
```python
try:
devices = amdsmi_get_processor_handles()
if len(devices) == 0:
print("No GPUs on machine")
else:
for device in devices:
processes = amdsmi_get_gpu_process_list(device)
if len(processes) == 0:
print("No processes running on this GPU")
else:
for process in processes:
print(amdsmi_get_gpu_process_info(device, process))
print(process)
except AmdSmiException as e:
print(e)
```
-1
Просмотреть файл
@@ -108,7 +108,6 @@ from .amdsmi_interface import amdsmi_get_gpu_bad_page_info
# # Process Information
from .amdsmi_interface import amdsmi_get_gpu_process_list
from .amdsmi_interface import amdsmi_get_gpu_process_info
# # ECC Error Information
from .amdsmi_interface import amdsmi_get_gpu_total_ecc_count
+17 -30
Просмотреть файл
@@ -1799,12 +1799,15 @@ def amdsmi_get_gpu_bad_page_info(
num_pages = ctypes.c_uint32()
retired_page_record = ctypes.POINTER(
amdsmi_wrapper.amdsmi_retired_page_record_t)()
_check_res(
amdsmi_wrapper.amdsmi_get_gpu_bad_page_info(
processor_handle, ctypes.byref(num_pages), retired_page_record
)
)
table_records = _format_bad_page_info(retired_page_record, num_pages)
if num_pages.value == 0:
return "No bad pages found."
else:
@@ -1942,39 +1945,23 @@ def amdsmi_get_gpu_process_list(
result = []
for index in range(max_processes.value):
result.append(process_list[index])
result.append({
"name": process_list[index].name.decode("utf-8"),
"pid": process_list[index].pid,
"mem": process_list[index].mem,
"engine_usage": {
"gfx": process_list[index].engine_usage.gfx,
"enc": process_list[index].engine_usage.enc
},
"memory_usage": {
"gtt_mem": process_list[index].memory_usage.gtt_mem,
"cpu_mem": process_list[index].memory_usage.cpu_mem,
"vram_mem": process_list[index].memory_usage.vram_mem,
},
})
return result
def amdsmi_get_gpu_process_info(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
    process: amdsmi_wrapper.amdsmi_proc_info_t,
) -> Dict[str, Any]:
    """Convert a process record into a plain dictionary.

    Both arguments are type-checked before the record is read; the
    processor handle is only validated and is not otherwise used here.

    Args:
        processor_handle: Handle that must be an instance of
            ``amdsmi_wrapper.amdsmi_processor_handle``.
        process: Record that must be an instance of
            ``amdsmi_wrapper.amdsmi_proc_info_t``.

    Returns:
        A dictionary with the keys ``name`` (decoded as UTF-8), ``pid``,
        ``mem``, ``engine_usage`` (``gfx``, ``enc``) and ``memory_usage``
        (``gtt_mem``, ``cpu_mem``, ``vram_mem``).

    Raises:
        AmdSmiParameterException: If either argument is not of the
            expected wrapper type; the handle is checked first.
    """
    # Validate in declaration order so the first offending argument is reported.
    expected_types = (
        (processor_handle, amdsmi_wrapper.amdsmi_processor_handle),
        (process, amdsmi_wrapper.amdsmi_proc_info_t),
    )
    for argument, required_type in expected_types:
        if not isinstance(argument, required_type):
            raise AmdSmiParameterException(argument, required_type)

    info: Dict[str, Any] = {}
    info["name"] = process.name.decode("utf-8")
    info["pid"] = process.pid
    info["mem"] = process.mem

    engine = process.engine_usage
    info["engine_usage"] = {
        "gfx": engine.gfx,
        "enc": engine.enc,
    }

    memory = process.memory_usage
    info["memory_usage"] = {
        "gtt_mem": memory.gtt_mem,
        "cpu_mem": memory.cpu_mem,
        "vram_mem": memory.vram_mem,
    }
    return info
def amdsmi_get_gpu_device_uuid(processor_handle: amdsmi_wrapper.amdsmi_processor_handle) -> str:
if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
raise AmdSmiParameterException(