SWDEV-455131 - Updated process APIs
- Removed amdsmi_get_gpu_process_info from python API
- Updated documentation
- Aligned process --json output format to unit & value format
Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: I82bba1b6df71020b4a5995ff63b9aa62611ce4fe
[ROCm/amdsmi commit: c551c3caed]
Этот коммит содержится в:
@@ -12,6 +12,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/](
|
||||
|
||||
### Changed
|
||||
|
||||
- **Removed `amdsmi_get_gpu_process_info` from python library**
|
||||
`amdsmi_get_gpu_process_info` was removed from the C library in an earlier build, but the API was still exposed in the Python interface.
|
||||
|
||||
- **Updated metrics --clocks**
|
||||
Output for `amd-smi metric --clock` now reflects each engine and includes bug fixes for the clock lock status and deep sleep status.
|
||||
|
||||
|
||||
@@ -2576,28 +2576,38 @@ class AMDSMICommands():
|
||||
raise e
|
||||
|
||||
filtered_process_values = []
|
||||
for process in process_list:
|
||||
try:
|
||||
process_info = amdsmi_interface.amdsmi_get_gpu_process_info(args.gpu, process)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
process_info = "N/A"
|
||||
logging.debug("Failed to get process info for process %s on gpu %s | %s", process, gpu_id, e.get_error_info())
|
||||
filtered_process_values.append({'process_info': process_info})
|
||||
continue
|
||||
|
||||
for process_info in process_list:
|
||||
process_info['mem_usage'] = process_info.pop('mem')
|
||||
process_info['usage'] = process_info.pop('engine_usage')
|
||||
|
||||
engine_usage_unit = "ns"
|
||||
memory_usage_unit = "B"
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
|
||||
|
||||
engine_usage_unit = "ns"
|
||||
for usage_metric in process_info['usage']:
|
||||
process_info['usage'][usage_metric] = f"{process_info['usage'][usage_metric]} {engine_usage_unit}"
|
||||
|
||||
for usage_metric in process_info['memory_usage']:
|
||||
process_info['memory_usage'][usage_metric] = self.helpers.convert_bytes_to_readable(process_info['memory_usage'][usage_metric])
|
||||
elif self.logger.is_json_format():
|
||||
process_info['mem_usage'] = {"value" : process_info['mem_usage'],
|
||||
"unit" : memory_usage_unit}
|
||||
|
||||
for usage_metric in process_info['usage']:
|
||||
process_info['usage'][usage_metric] = {"value" : process_info['usage'][usage_metric],
|
||||
"unit" : engine_usage_unit}
|
||||
|
||||
for usage_metric in process_info['memory_usage']:
|
||||
process_info['memory_usage'][usage_metric] = {"value" : process_info['memory_usage'][usage_metric],
|
||||
"unit" : memory_usage_unit}
|
||||
|
||||
filtered_process_values.append({'process_info': process_info})
|
||||
|
||||
if not filtered_process_values:
|
||||
process_info = "N/A"
|
||||
logging.debug("Failed to detect any process on gpu %s", gpu_id)
|
||||
filtered_process_values.append({'process_info': process_info})
|
||||
|
||||
# Arguments will filter the populated processes
|
||||
@@ -2641,7 +2651,7 @@ class AMDSMICommands():
|
||||
# Convert and store output by pid for csv format
|
||||
if self.logger.is_csv_format():
|
||||
# Check for empty list first
|
||||
if filtered_process_values == []:
|
||||
if not filtered_process_values:
|
||||
self.logger.store_output(args.gpu, 'process_info', 'No running processes detected')
|
||||
else:
|
||||
for process_info in filtered_process_values:
|
||||
@@ -2660,7 +2670,7 @@ class AMDSMICommands():
|
||||
self.logger.store_output(args.gpu, 'timestamp', int(time.time()))
|
||||
|
||||
# Store values in logger.output
|
||||
if filtered_process_values == []:
|
||||
if not filtered_process_values:
|
||||
self.logger.store_output(args.gpu, 'process_info', 'No running processes detected')
|
||||
else:
|
||||
for process_info in filtered_process_values:
|
||||
|
||||
@@ -882,13 +882,21 @@ except AmdSmiException as e:
|
||||
|
||||
### amdsmi_get_gpu_process_list
|
||||
|
||||
Description: Returns the list of processes running on the target GPU.
|
||||
Description: Returns the list of processes running on the target GPU. May require root-level access.
|
||||
|
||||
Input parameters:
|
||||
|
||||
* `processor_handle` device which to query
|
||||
|
||||
Output: List of `amdsmi_proc_info_t` process objects running on the target GPU; can be empty
|
||||
Output: List of dictionaries with the corresponding fields; empty list if no running processes are detected
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`name` | Name of process
|
||||
`pid` | Process ID
|
||||
`mem` | Process memory usage
|
||||
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
|
||||
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
|
||||
|
||||
@@ -910,50 +918,7 @@ try:
|
||||
print("No processes running on this GPU")
|
||||
else:
|
||||
for process in processes:
|
||||
print(amdsmi_get_gpu_process_info(device, process))
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
### amdsmi_get_gpu_process_info
|
||||
|
||||
Description: Returns info about process given the target GPU and the corresponding `amdsmi_proc_info_t` object
|
||||
|
||||
Input parameters:
|
||||
|
||||
* `processor_handle` device which to query
|
||||
|
||||
Output: Dictionary with fields
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`name` | Name of process
|
||||
`pid` | Process ID
|
||||
`mem` | Process memory usage
|
||||
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
|
||||
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_process_info` function:
|
||||
|
||||
* `AmdSmiLibraryException`
|
||||
* `AmdSmiRetryException`
|
||||
* `AmdSmiParameterException`
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
try:
|
||||
devices = amdsmi_get_processor_handles()
|
||||
if len(devices) == 0:
|
||||
print("No GPUs on machine")
|
||||
else:
|
||||
for device in devices:
|
||||
processes = amdsmi_get_gpu_process_list(device)
|
||||
if len(processes) == 0:
|
||||
print("No processes running on this GPU")
|
||||
else:
|
||||
for process in processes:
|
||||
print(amdsmi_get_gpu_process_info(device, process))
|
||||
print(process)
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
@@ -108,7 +108,6 @@ from .amdsmi_interface import amdsmi_get_gpu_bad_page_info
|
||||
|
||||
# # Process Information
|
||||
from .amdsmi_interface import amdsmi_get_gpu_process_list
|
||||
from .amdsmi_interface import amdsmi_get_gpu_process_info
|
||||
|
||||
# # ECC Error Information
|
||||
from .amdsmi_interface import amdsmi_get_gpu_total_ecc_count
|
||||
|
||||
@@ -1799,12 +1799,15 @@ def amdsmi_get_gpu_bad_page_info(
|
||||
num_pages = ctypes.c_uint32()
|
||||
retired_page_record = ctypes.POINTER(
|
||||
amdsmi_wrapper.amdsmi_retired_page_record_t)()
|
||||
|
||||
_check_res(
|
||||
amdsmi_wrapper.amdsmi_get_gpu_bad_page_info(
|
||||
processor_handle, ctypes.byref(num_pages), retired_page_record
|
||||
)
|
||||
)
|
||||
|
||||
table_records = _format_bad_page_info(retired_page_record, num_pages)
|
||||
|
||||
if num_pages.value == 0:
|
||||
return "No bad pages found."
|
||||
else:
|
||||
@@ -1942,39 +1945,23 @@ def amdsmi_get_gpu_process_list(
|
||||
|
||||
result = []
|
||||
for index in range(max_processes.value):
|
||||
result.append(process_list[index])
|
||||
result.append({
|
||||
"name": process_list[index].name.decode("utf-8"),
|
||||
"pid": process_list[index].pid,
|
||||
"mem": process_list[index].mem,
|
||||
"engine_usage": {
|
||||
"gfx": process_list[index].engine_usage.gfx,
|
||||
"enc": process_list[index].engine_usage.enc
|
||||
},
|
||||
"memory_usage": {
|
||||
"gtt_mem": process_list[index].memory_usage.gtt_mem,
|
||||
"cpu_mem": process_list[index].memory_usage.cpu_mem,
|
||||
"vram_mem": process_list[index].memory_usage.vram_mem,
|
||||
},
|
||||
})
|
||||
return result
|
||||
|
||||
|
||||
def amdsmi_get_gpu_process_info(
|
||||
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
|
||||
process: amdsmi_wrapper.amdsmi_proc_info_t,
|
||||
) -> Dict[str, Any]:
|
||||
if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
|
||||
raise AmdSmiParameterException(
|
||||
processor_handle, amdsmi_wrapper.amdsmi_processor_handle
|
||||
)
|
||||
|
||||
if not isinstance(process, amdsmi_wrapper.amdsmi_proc_info_t):
|
||||
raise AmdSmiParameterException(
|
||||
process, amdsmi_wrapper.amdsmi_proc_info_t)
|
||||
|
||||
return {
|
||||
"name": process.name.decode("utf-8"),
|
||||
"pid": process.pid,
|
||||
"mem": process.mem,
|
||||
"engine_usage": {
|
||||
"gfx": process.engine_usage.gfx,
|
||||
"enc": process.engine_usage.enc
|
||||
},
|
||||
"memory_usage": {
|
||||
"gtt_mem": process.memory_usage.gtt_mem,
|
||||
"cpu_mem": process.memory_usage.cpu_mem,
|
||||
"vram_mem": process.memory_usage.vram_mem,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def amdsmi_get_gpu_device_uuid(processor_handle: amdsmi_wrapper.amdsmi_processor_handle) -> str:
|
||||
if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
|
||||
raise AmdSmiParameterException(
|
||||
|
||||
Ссылка в новой задаче
Block a user