Fix [Not supported] status for get_compute_process_info_by_pid

On some systems, [rocm-smi --showpids] reports:
get_compute_process_info_by_pid, Not supported on the given system
[PID] [PROCESS NAME] 1 UNKNOWN UNKNOWN UNKNOWN

get_compute_process_info_by_pid fails because the cu_occupancy debugfs method
is, by design, not provided on some graphics cards and GFX revisions.

This change returns a success status when only the cu_occupancy debugfs method
is missing, and provides a cu_occupancy invalidation value so that only
this parameter is marked as UNKNOWN.

Change-Id: Iae37070d9bd19483b4e6c8ee24c7d9a4c92f00d7
Signed-off-by: Vladimir Stempen <Vladimir.Stempen@amd.com>
Reviewed-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rocm_smi_lib commit: 677433b367]
This commit is contained in:
Vladimir Stempen
2024-01-24 15:33:04 -05:00
committed by Dmitrii Galantsev
parent 3564c1a430
commit dc98babe34
3 changed files with 10 additions and 3 deletions
@@ -1134,6 +1134,8 @@ typedef struct {
uint32_t cu_occupancy; //!< Compute Unit usage in percent
} rsmi_process_info_t;
//! Sentinel stored in rsmi_process_info_t::cu_occupancy when the GFX revision
//! does not provide the cu_occupancy debugfs method, so no value could be read.
#define CU_OCCUPANCY_INVALID 0xFFFFFFFF
/**
* @brief Opaque handle to function-support object
@@ -2476,6 +2476,7 @@ def showPids(verbose):
vramUsage = 'UNKNOWN'
sdmaUsage = 'UNKNOWN'
cuOccupancy = 'UNKNOWN'
cuOccupancyInvalid = 0xFFFFFFFF
dv_indices = (c_uint32 * num_devices.value)()
ret = rocmsmi.rsmi_compute_process_gpus_get(int(pid), None, byref(num_devices))
if rsmi_ret_ok(ret, metric='get_gpu_compute_process'):
@@ -2491,7 +2492,8 @@ def showPids(verbose):
if rsmi_ret_ok(ret, metric='get_compute_process_info_by_pid'):
vramUsage = proc.vram_usage
sdmaUsage = proc.sdma_usage
cuOccupancy = proc.cu_occupancy
if proc.cu_occupancy != cuOccupancyInvalid:
cuOccupancy = proc.cu_occupancy
else:
logging.debug('Unable to fetch process info by PID')
dataArray.append([pid, getProcessName(pid), str(gpuNumber), str(vramUsage), str(sdmaUsage), str(cuOccupancy)])
@@ -2500,7 +2502,8 @@ def showPids(verbose):
if rsmi_ret_ok(ret, metric='get_compute_process_info_by_pid'):
vramUsage = proc.vram_usage
sdmaUsage = proc.sdma_usage
cuOccupancy = proc.cu_occupancy
if proc.cu_occupancy != cuOccupancyInvalid:
cuOccupancy = proc.cu_occupancy
else:
logging.debug('Unable to fetch process info by PID')
dataArray.append([pid, getProcessName(pid), str(gpuNumber), str(vramUsage), str(sdmaUsage), str(cuOccupancy)])
+3 -1
View File
@@ -507,7 +507,9 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
// Collect count of compute units
cu_count += kfd_node_map[gpu_id]->cu_count();
} else {
return err;
//Some GFX revisions do not provide cu_occupancy debugfs method
proc->cu_occupancy = CU_OCCUPANCY_INVALID;
cu_count = 0;
}
}