[SWDEV-357472] Add evicted_ms metric (#620)

- **Added evicted_time metric for kfd processes**.  
  - Time that queues are evicted on a GPU in milliseconds
  - Added to CLI in `amd-smi monitor -q` and `amd-smi process`
  - Added to C API and Python API:
    - amdsmi_get_gpu_process_list()
    - amdsmi_get_gpu_compute_process_info()
    - amdsmi_get_gpu_compute_process_info_by_pid()

---------

Signed-off-by: Pryor, Adam <Adam.Pryor@amd.com>

[ROCm/amdsmi commit: 2144cfbba4]
Этот коммит содержится в:
Pryor, Adam
2025-10-28 14:49:03 -05:00
коммит произвёл GitHub
родитель f36affe4d5
Коммит 354886f4ff
11 изменённых файлов: 95 добавлений и 31 удалений
+8
Просмотреть файл
@@ -24,6 +24,14 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
- The entry `policies` is added to the end of the dictionary to match API definition.
- The entry `plpds` is marked for deprecation as it has the same information as `policies`.
- **Added evicted_time metric for kfd processes**.
- Time that queues are evicted on a GPU in milliseconds
- Added to CLI in `amd-smi monitor -q` and `amd-smi process`
- Added to C API and Python API:
- amdsmi_get_gpu_process_list()
- amdsmi_get_gpu_compute_process_info()
- amdsmi_get_gpu_compute_process_info_by_pid()
### Changed
- N/A
+16 -3
Просмотреть файл
@@ -3400,11 +3400,13 @@ class AMDSMICommands():
"gfx": process_info["engine_usage"]["gfx"],
"enc": process_info["engine_usage"]["enc"],
},
"cu_occupancy": process_info["cu_occupancy"]
"cu_occupancy": process_info["cu_occupancy"],
"evicted_time": process_info["evicted_time"]
}
engine_usage_unit = "ns"
memory_usage_unit = "B"
evicted_time_unit = "ms"
if self.logger.is_human_readable_format():
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
@@ -3415,6 +3417,10 @@ class AMDSMICommands():
process_info['mem_usage'] = self.helpers.unit_format(self.logger,
process_info['mem_usage'],
memory_usage_unit)
process_info['evicted_time'] = self.helpers.unit_format(self.logger,
process_info['evicted_time'],
evicted_time_unit)
for usage_metric in process_info['usage']:
process_info['usage'][usage_metric] = self.helpers.unit_format(self.logger,
@@ -6130,8 +6136,10 @@ class AMDSMICommands():
process_info.pop('engine_usage') # Remove 'engine_usage' value
process_info['mem_usage'] = process_info.pop('mem')
process_info['cu_occupancy'] = process_info.pop('cu_occupancy')
process_info['evicted_time'] = process_info.pop('evicted_time')
memory_usage_unit = "B"
evicted_time_unit = "ms"
if self.logger.is_human_readable_format():
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
@@ -6143,6 +6151,10 @@ class AMDSMICommands():
process_info['mem_usage'],
memory_usage_unit)
process_info['evicted_time'] = self.helpers.unit_format(self.logger,
process_info['evicted_time'],
evicted_time_unit)
for usage_metric in process_info['memory_usage']:
process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger,
process_info['memory_usage'][usage_metric],
@@ -6176,7 +6188,7 @@ class AMDSMICommands():
# Build the process table's title and header
self.logger.secondary_table_title = "PROCESS INFO"
self.logger.secondary_table_header = 'GPU'.rjust(3) + "NAME".rjust(19) + "PID".rjust(9) + "GTT_MEM".rjust(10) + \
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USG".rjust(10) + "CU%".rjust(9)
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USG".rjust(10) + "CU%".rjust(9) + "EVICT".rjust(10)
if watching_output:
self.logger.secondary_table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.secondary_table_header
@@ -7334,7 +7346,7 @@ class AMDSMICommands():
try:
raw_process_list = amdsmi_interface.amdsmi_get_gpu_process_list(processor)
for proc in raw_process_list:
proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A","gtt": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A"}
proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A","gtt": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A", "evicted_time" : "N/A"}
proc_info_dict['gpu'] = gpu_id
proc_info_dict['pid'] = proc['pid']
proc_info_dict['name'] = proc['name']
@@ -7350,6 +7362,7 @@ class AMDSMICommands():
proc_info_dict['cu_occupancy'] = {"current_cu": "N/A", "total_num_cu": total_num_cu}
except (ValueError, TypeError):
proc_info_dict['cu_occupancy'] = {"current_cu": "N/A", "total_num_cu": total_num_cu}
proc_info_dict['evicted_time'] = proc['evicted_time']
all_process_list.append(proc_info_dict)
except amdsmi_exception.AmdSmiLibraryException as e:
+6 -3
Просмотреть файл
@@ -230,7 +230,7 @@ class AMDSMILogger():
# Add N/A for empty process_info
table_values += "N/A".rjust(17) + "N/A".rjust(9) + "N/A".rjust(10) + \
"N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(10) + \
"N/A".rjust(9) + '\n'
"N/A".rjust(9) + "N/A".rjust(10) + '\n'
else:
#Fix this herre
for process_key, process_value in process_dict['process_info'].items():
@@ -251,6 +251,8 @@ class AMDSMILogger():
table_values += string_process_value.rjust(10)
elif process_key == "cu_occupancy":
table_values += string_process_value.rjust(9)
elif process_key == "evicted_time":
table_values += string_process_value.rjust(9)
# Add the stored gpu and stored timestamp to the next line
table_values += '\n'
if stored_timestamp:
@@ -1124,8 +1126,9 @@ class AMDSMILogger():
cu_occupancy = (str(round(process['cu_occupancy']['current_cu'] / process['cu_occupancy']['total_num_cu'] * 100, 1)) + " %").rjust(7)
else:
cu_occupancy = "N/A"
print("| {0:4.4s} {1:9.9s} {2:19.19s} {3:8.8s} {4:8.8s} {5:9.9s} {6:7.7s} |".format(
gpu_id, pid, process_name, gtt_mem, vram_mem, mem_usage, cu_occupancy))
evicted_time = str(process['evicted_time']).rjust(9)
print("| {0:4.4s} {1:9.9s} {2:19.19s} {3:8.8s} {4:8.8s} {5:9.9s} {6:7.7s} {7:9.9s} |".format(
gpu_id, pid, process_name, gtt_mem, vram_mem, mem_usage, cu_occupancy, evicted_time))
if process['name'] == "N/A":
elevated_permission_error = True
else:
+3
Просмотреть файл
@@ -1186,6 +1186,7 @@ Field | Description
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage in Bytes</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage in Bytes</td></tr><tr><td>`vram_mem`</td><td>Process VRAM memory usage in Bytes</td></tr> </tbody></table>
`cu_occupancy` | Number of Compute Units utilized
`evicted_time` | Time that queues are evicted on a GPU in milliseconds
Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
@@ -3534,6 +3535,7 @@ Field | Description
`vram_usage` | VRAM usage
`sdma_usage` | SDMA usage in microseconds
`cu_occupancy` | Compute Unit usage in percents
`evicted_time` | Time that queues are evicted on a GPU in milliseconds
Exceptions that can be thrown by `amdsmi_get_gpu_compute_process_info` function:
@@ -3568,6 +3570,7 @@ Field | Description
`vram_usage` | VRAM usage
`sdma_usage` | SDMA usage in microseconds
`cu_occupancy` | Compute Unit usage in percents
`evicted_time` | Time that queues are evicted on a GPU in milliseconds
Exceptions that can be thrown by `amdsmi_get_gpu_compute_process_info_by_pid` function:
+3 -1
Просмотреть файл
@@ -1159,7 +1159,8 @@ typedef struct {
} memory_usage; //!< In Bytes
char container_name[AMDSMI_MAX_STRING_LENGTH];
uint32_t cu_occupancy; //!< Num CUs utilized
uint32_t reserved[11];
uint32_t evicted_time; //!< Time that queues are evicted on a GPU in milliseconds
uint32_t reserved[10];
} amdsmi_proc_info_t;
/**
@@ -2085,6 +2086,7 @@ typedef struct {
uint64_t vram_usage; //!< VRAM usage in MB
uint64_t sdma_usage; //!< SDMA usage in microseconds
uint32_t cu_occupancy; //!< Compute Unit usage in percent
uint32_t evicted_time; //!< Time that queues are evicted on a GPU in milliseconds
} amdsmi_process_info_t;
/**
+4 -1
Просмотреть файл
@@ -2953,7 +2953,8 @@ def amdsmi_get_gpu_process_list(
"cpu_mem": process_list[index].memory_usage.cpu_mem,
"vram_mem": process_list[index].memory_usage.vram_mem,
},
"cu_occupancy": _validate_if_max_uint(process_list[index].cu_occupancy, MaxUIntegerTypes.UINT32_T)
"cu_occupancy": _validate_if_max_uint(process_list[index].cu_occupancy, MaxUIntegerTypes.UINT32_T),
"evicted_time": _validate_if_max_uint(process_list[index].evicted_time, MaxUIntegerTypes.UINT32_T)
})
return result
@@ -5303,6 +5304,7 @@ def amdsmi_get_gpu_compute_process_info() -> List[Dict[str, int]]:
"vram_usage": proc.vram_usage,
"sdma_usage": proc.sdma_usage,
"cu_occupancy": proc.cu_occupancy,
"evicted_time": proc.evicted_time,
}
for proc in procs
]
@@ -5324,6 +5326,7 @@ def amdsmi_get_gpu_compute_process_info_by_pid(pid: int) -> Dict[str, int]:
"vram_usage": proc.vram_usage,
"sdma_usage": proc.sdma_usage,
"cu_occupancy": proc.cu_occupancy,
"evicted_time": proc.evicted_time,
}
+10 -8
Просмотреть файл
@@ -904,22 +904,22 @@ amdsmi_frequency_range_t = struct_amdsmi_frequency_range_t
class union_amdsmi_bdf_t(Union):
pass
class struct_amdsmi_bdf_t(Structure):
class struct_bdf_(Structure):
pass
struct_amdsmi_bdf_t._pack_ = 1 # source:False
struct_amdsmi_bdf_t._fields_ = [
struct_bdf_._pack_ = 1 # source:False
struct_bdf_._fields_ = [
('function_number', ctypes.c_uint64, 3),
('device_number', ctypes.c_uint64, 5),
('bus_number', ctypes.c_uint64, 8),
('domain_number', ctypes.c_uint64, 48),
]
class struct_bdf_(Structure):
class struct_amdsmi_bdf_t(Structure):
pass
struct_bdf_._pack_ = 1 # source:False
struct_bdf_._fields_ = [
struct_amdsmi_bdf_t._pack_ = 1 # source:False
struct_amdsmi_bdf_t._fields_ = [
('function_number', ctypes.c_uint64, 3),
('device_number', ctypes.c_uint64, 5),
('bus_number', ctypes.c_uint64, 8),
@@ -1397,7 +1397,9 @@ struct_amdsmi_proc_info_t._fields_ = [
('memory_usage', struct_memory_usage_),
('container_name', ctypes.c_char * 256),
('cu_occupancy', ctypes.c_uint32),
('reserved', ctypes.c_uint32 * 11),
('evicted_time', ctypes.c_uint32),
('reserved', ctypes.c_uint32 * 10),
('PADDING_1', ctypes.c_ubyte * 4),
]
amdsmi_proc_info_t = struct_amdsmi_proc_info_t
@@ -2196,7 +2198,7 @@ struct_amdsmi_process_info_t._fields_ = [
('vram_usage', ctypes.c_uint64),
('sdma_usage', ctypes.c_uint64),
('cu_occupancy', ctypes.c_uint32),
('PADDING_1', ctypes.c_ubyte * 4),
('evicted_time', ctypes.c_uint32),
]
amdsmi_process_info_t = struct_amdsmi_process_info_t
+2 -1
Просмотреть файл
@@ -1471,10 +1471,11 @@ typedef struct {
uint64_t vram_usage; //!< VRAM usage
uint64_t sdma_usage; //!< SDMA usage in microseconds
uint32_t cu_occupancy; //!< Compute Unit usage in percent
uint32_t evicted_time; //!< Time that queues are evicted on a GPU in milliseconds
} rsmi_process_info_t;
//! CU occupancy invalidation value for the GFX revisions not providing cu_occupancy debugfs method
#define CU_OCCUPANCY_INVALID 0xFFFFFFFF
#define KFD_STATS_INVALID 0xFFFFFFFF
/**
* @brief Opaque handle to function-support object
+39 -13
Просмотреть файл
@@ -440,6 +440,28 @@ static int CheckValidProcessInfoData(const std::string& s, int sysfs_ret){
return sysfs_ret;
}
/**
 * @brief Read a single per-process KFD statistic from a sysfs file.
 *
 * The file at @p path is read with ReadSysfsStr() and the outcome is vetted
 * by CheckValidProcessInfoData(). A missing file (ENOENT) is not treated as
 * a failure: some GFX revisions do not provide the KFD stats debugfs method,
 * so the value is reported as KFD_STATS_INVALID instead.
 *
 * @param[in]  path Full path of the stat file to read.
 * @param[out] val  Receives the parsed value on success, or
 *                  KFD_STATS_INVALID when the file does not exist. Left
 *                  untouched when an error code is returned.
 *
 * @return 0 when @p val was written (including the ENOENT case); otherwise
 *         the non-zero code produced by CheckValidProcessInfoData().
 */
static int GetProcessKFDStats(const std::string& path, uint32_t& val) {
  std::string contents;
  const int read_status = ReadSysfsStr(path, &contents);
  const int status = CheckValidProcessInfoData(contents, read_status);

  if (status == ENOENT) {
    // Some GFX revisions do not provide KFD stats debugfs method
    // which may cause ENOENT
    val = KFD_STATS_INVALID;
    return 0;
  }

  if (status != 0) {
    return status;
  }

  // Update KFD stat by the process.
  // NOTE(review): std::stoul throws on non-numeric or out-of-range text;
  // presumably CheckValidProcessInfoData() has already rejected such input
  // -- confirm. The cast narrows the parsed value to 32 bits.
  val = static_cast<uint32_t>(std::stoul(contents));
  return 0;
}
int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
std::unordered_set<uint64_t> *gpu_set) {
assert(proc != nullptr);
@@ -447,6 +469,7 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
int err;
std::string tmp;
std::unordered_set<uint64_t>::iterator itr;
uint32_t kfd_stat;
std::string proc_str_path = kKFDProcPathRoot;
proc_str_path += "/";
@@ -460,6 +483,7 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
proc->vram_usage = 0;
proc->sdma_usage = 0;
proc->cu_occupancy = 0;
proc->evicted_time = 0;
for (itr = gpu_set->begin(); itr != gpu_set->end(); itr++) {
uint64_t gpu_id = (*itr);
@@ -502,21 +526,23 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
cu_occupancy_path += std::to_string(gpu_id);
cu_occupancy_path += "/cu_occupancy";
err = ReadSysfsStr(cu_occupancy_path, &tmp);
sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
err = GetProcessKFDStats(cu_occupancy_path, kfd_stat);
if (err != 0){
return err;
}
proc->cu_occupancy = kfd_stat;
if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
return sysfs_data_errcode;
}
else if(sysfs_data_errcode==0){
// Update CU usage by the process
proc->cu_occupancy = std::stoi(tmp);
}
else {
// Some GFX revisions do not provide cu_occupancy debugfs method
// which may cause ENOENT
proc->cu_occupancy = CU_OCCUPANCY_INVALID;
std::string evicted_time_path = proc_str_path;
evicted_time_path += "/stats_";
evicted_time_path += std::to_string(gpu_id);
evicted_time_path += "/evicted_ms";
err = GetProcessKFDStats(evicted_time_path, kfd_stat);
if (err != 0){
return err;
}
proc->evicted_time = kfd_stat;
}
return 0;
+2 -1
Просмотреть файл
@@ -178,8 +178,9 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
amdsmi_proc_info.memory_usage.vram_mem = rsmi_proc_info.vram_usage;
}
// Copy the cu occupancy from rsmi_process_info_t to amdsmi_proc_info_t
// Copy the kfd stats from rsmi_process_info_t to amdsmi_proc_info_t
amdsmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy;
amdsmi_proc_info.evicted_time = rsmi_proc_info.evicted_time;
// Safely handle KFD processes to get total memory_usage of the process
uint64_t kfd_gpu_id = get_kfd_gpu_id();
+2
Просмотреть файл
@@ -184,6 +184,8 @@ void TestProcInfoRead::Run(void) {
proc_info.sdma_usage <<
" Compute Unit Usage: " <<
proc_info.cu_occupancy <<
" Evicted Time: " <<
proc_info.evicted_time << std::endl <<
std::endl;
}
}