From dc98babe3466264fd0dcb4ee67d8869dd1765177 Mon Sep 17 00:00:00 2001 From: Vladimir Stempen Date: Wed, 24 Jan 2024 15:33:04 -0500 Subject: [PATCH] Fix [Not supported] status for get_compute_process_info_by_pid On some systems [rocm-smi --showpids] reports get_compute_process_info_by_pid, Not supported on the given system [PID] [PROCESS NAME] 1 UNKNOWN UNKNOWN UNKNOWN get_compute_process_info_by_pid fails because cu_occupancy debugfs method is not provided on some graphics cards and GFX revisions by design Proposing a change to return success status when only cu_occupancy debugfs method is not found and provide cu_occupancy invalidation value to mark only this parameter as UNKNOWN Change-Id: Iae37070d9bd19483b4e6c8ee24c7d9a4c92f00d7 Signed-off-by: Vladimir Stempen Reviewed-by: Galantsev, Dmitrii [ROCm/rocm_smi_lib commit: 677433b367c5738e165c74ac07bdb7ab26d22949] --- projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h | 2 ++ projects/rocm-smi-lib/python_smi_tools/rocm_smi.py | 7 +++++-- projects/rocm-smi-lib/src/rocm_smi_kfd.cc | 4 +++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index 28f0bd795b..77710ba7b3 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -1134,6 +1134,8 @@ typedef struct { uint32_t cu_occupancy; //!< Compute Unit usage in percent } rsmi_process_info_t; +//! CU occupancy invalidation value for the GFX revisions not providing cu_occupancy debugfs method +#define CU_OCCUPANCY_INVALID 0xFFFFFFFF /** * @brief Opaque handle to function-support object diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py index e910b0bc30..f6897897d8 100755 --- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py +++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py @@ -2476,6 +2476,7 @@ def showPids(verbose): vramUsage = 'UNKNOWN' sdmaUsage = 'UNKNOWN' cuOccupancy = 'UNKNOWN' + cuOccupancyInvalid = 0xFFFFFFFF dv_indices = (c_uint32 * num_devices.value)() ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), None, byref(num_devices)) if rsmi_ret_ok(ret, metric='get_gpu_compute_process'): @@ -2491,7 +2492,8 @@ def showPids(verbose): if rsmi_ret_ok(ret, metric='get_compute_process_info_by_pid'): vramUsage = proc.vram_usage sdmaUsage = proc.sdma_usage - cuOccupancy = proc.cu_occupancy + if proc.cu_occupancy != cuOccupancyInvalid: + cuOccupancy = proc.cu_occupancy else: logging.debug('Unable to fetch process info by PID') dataArray.append([pid, getProcessName(pid), str(gpuNumber), str(vramUsage), str(sdmaUsage), str(cuOccupancy)]) @@ -2500,7 +2502,8 @@ def showPids(verbose): if rsmi_ret_ok(ret, metric='get_compute_process_info_by_pid'): vramUsage = proc.vram_usage sdmaUsage = proc.sdma_usage - cuOccupancy = proc.cu_occupancy + if proc.cu_occupancy != cuOccupancyInvalid: + cuOccupancy = proc.cu_occupancy else: logging.debug('Unable to fetch process info by PID') dataArray.append([pid, getProcessName(pid), str(gpuNumber), str(vramUsage), str(sdmaUsage), str(cuOccupancy)]) diff --git a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc index 3109781e39..a4eaf43137 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc @@ -507,7 +507,9 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, // Collect count of compute units cu_count += kfd_node_map[gpu_id]->cu_count(); } else { - return err; + //Some GFX revisions do not provide cu_occupancy debugfs method + proc->cu_occupancy = CU_OCCUPANCY_INVALID; + cu_count = 0; } }