[SWDEV-357472] Add evicted_ms metric (#620)
- **Added evicted_time metric for kfd processes**.
- Time that queues are evicted on a GPU in milliseconds
- Added to CLI in `amd-smi monitor -q` and `amd-smi process`
- Added to C API and Python API:
- amdsmi_get_gpu_process_list()
- amdsmi_get_gpu_compute_process_info()
- amdsmi_get_gpu_compute_process_info_by_pid()
---------
Signed-off-by: Pryor, Adam <Adam.Pryor@amd.com>
This commit is contained in:
@@ -24,6 +24,14 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
- The entry `policies` is added to the end of the dictionary to match API definition.
|
||||
- The entry `plpds` is marked for deprecation as it has the same information as `policies`.
|
||||
|
||||
- **Added evicted_time metric for kfd processes**.
|
||||
- Time that queues are evicted on a GPU in milliseconds
|
||||
- Added to CLI in `amd-smi monitor -q` and `amd-smi process`
|
||||
- Added to C API and Python API:
|
||||
- amdsmi_get_gpu_process_list()
|
||||
- amdsmi_get_gpu_compute_process_info()
|
||||
- amdsmi_get_gpu_compute_process_info_by_pid()
|
||||
|
||||
### Changed
|
||||
|
||||
- N/A
|
||||
|
||||
@@ -3400,11 +3400,13 @@ class AMDSMICommands():
|
||||
"gfx": process_info["engine_usage"]["gfx"],
|
||||
"enc": process_info["engine_usage"]["enc"],
|
||||
},
|
||||
"cu_occupancy": process_info["cu_occupancy"]
|
||||
"cu_occupancy": process_info["cu_occupancy"],
|
||||
"evicted_time": process_info["evicted_time"]
|
||||
}
|
||||
|
||||
engine_usage_unit = "ns"
|
||||
memory_usage_unit = "B"
|
||||
evicted_time_unit = "ms"
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
|
||||
@@ -3415,6 +3417,10 @@ class AMDSMICommands():
|
||||
process_info['mem_usage'] = self.helpers.unit_format(self.logger,
|
||||
process_info['mem_usage'],
|
||||
memory_usage_unit)
|
||||
|
||||
process_info['evicted_time'] = self.helpers.unit_format(self.logger,
|
||||
process_info['evicted_time'],
|
||||
evicted_time_unit)
|
||||
|
||||
for usage_metric in process_info['usage']:
|
||||
process_info['usage'][usage_metric] = self.helpers.unit_format(self.logger,
|
||||
@@ -6130,8 +6136,10 @@ class AMDSMICommands():
|
||||
process_info.pop('engine_usage') # Remove 'engine_usage' value
|
||||
process_info['mem_usage'] = process_info.pop('mem')
|
||||
process_info['cu_occupancy'] = process_info.pop('cu_occupancy')
|
||||
process_info['evicted_time'] = process_info.pop('evicted_time')
|
||||
|
||||
memory_usage_unit = "B"
|
||||
evicted_time_unit = "ms"
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
|
||||
@@ -6143,6 +6151,10 @@ class AMDSMICommands():
|
||||
process_info['mem_usage'],
|
||||
memory_usage_unit)
|
||||
|
||||
process_info['evicted_time'] = self.helpers.unit_format(self.logger,
|
||||
process_info['evicted_time'],
|
||||
evicted_time_unit)
|
||||
|
||||
for usage_metric in process_info['memory_usage']:
|
||||
process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger,
|
||||
process_info['memory_usage'][usage_metric],
|
||||
@@ -6176,7 +6188,7 @@ class AMDSMICommands():
|
||||
# Build the process table's title and header
|
||||
self.logger.secondary_table_title = "PROCESS INFO"
|
||||
self.logger.secondary_table_header = 'GPU'.rjust(3) + "NAME".rjust(19) + "PID".rjust(9) + "GTT_MEM".rjust(10) + \
|
||||
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USG".rjust(10) + "CU%".rjust(9)
|
||||
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USG".rjust(10) + "CU%".rjust(9) + "EVICT".rjust(10)
|
||||
|
||||
if watching_output:
|
||||
self.logger.secondary_table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.secondary_table_header
|
||||
@@ -7334,7 +7346,7 @@ class AMDSMICommands():
|
||||
try:
|
||||
raw_process_list = amdsmi_interface.amdsmi_get_gpu_process_list(processor)
|
||||
for proc in raw_process_list:
|
||||
proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A","gtt": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A"}
|
||||
proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A","gtt": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A", "evicted_time" : "N/A"}
|
||||
proc_info_dict['gpu'] = gpu_id
|
||||
proc_info_dict['pid'] = proc['pid']
|
||||
proc_info_dict['name'] = proc['name']
|
||||
@@ -7350,6 +7362,7 @@ class AMDSMICommands():
|
||||
proc_info_dict['cu_occupancy'] = {"current_cu": "N/A", "total_num_cu": total_num_cu}
|
||||
except (ValueError, TypeError):
|
||||
proc_info_dict['cu_occupancy'] = {"current_cu": "N/A", "total_num_cu": total_num_cu}
|
||||
proc_info_dict['evicted_time'] = proc['evicted_time']
|
||||
|
||||
all_process_list.append(proc_info_dict)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
|
||||
@@ -230,7 +230,7 @@ class AMDSMILogger():
|
||||
# Add N/A for empty process_info
|
||||
table_values += "N/A".rjust(17) + "N/A".rjust(9) + "N/A".rjust(10) + \
|
||||
"N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(10) + \
|
||||
"N/A".rjust(9) + '\n'
|
||||
"N/A".rjust(9) + "N/A".rjust(10) + '\n'
|
||||
else:
|
||||
# Fix this here
|
||||
for process_key, process_value in process_dict['process_info'].items():
|
||||
@@ -251,6 +251,8 @@ class AMDSMILogger():
|
||||
table_values += string_process_value.rjust(10)
|
||||
elif process_key == "cu_occupancy":
|
||||
table_values += string_process_value.rjust(9)
|
||||
elif process_key == "evicted_time":
|
||||
table_values += string_process_value.rjust(9)
|
||||
# Add the stored gpu and stored timestamp to the next line
|
||||
table_values += '\n'
|
||||
if stored_timestamp:
|
||||
@@ -1124,8 +1126,9 @@ class AMDSMILogger():
|
||||
cu_occupancy = (str(round(process['cu_occupancy']['current_cu'] / process['cu_occupancy']['total_num_cu'] * 100, 1)) + " %").rjust(7)
|
||||
else:
|
||||
cu_occupancy = "N/A"
|
||||
print("| {0:4.4s} {1:9.9s} {2:19.19s} {3:8.8s} {4:8.8s} {5:9.9s} {6:7.7s} |".format(
|
||||
gpu_id, pid, process_name, gtt_mem, vram_mem, mem_usage, cu_occupancy))
|
||||
evicted_time = str(process['evicted_time']).rjust(9)
|
||||
print("| {0:4.4s} {1:9.9s} {2:19.19s} {3:8.8s} {4:8.8s} {5:9.9s} {6:7.7s} {7:9.9s} |".format(
|
||||
gpu_id, pid, process_name, gtt_mem, vram_mem, mem_usage, cu_occupancy, evicted_time))
|
||||
if process['name'] == "N/A":
|
||||
elevated_permission_error = True
|
||||
else:
|
||||
|
||||
@@ -1186,6 +1186,7 @@ Field | Description
|
||||
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
|
||||
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage in Bytes</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage in Bytes</td></tr><tr><td>`vram_mem`</td><td>Process VRAM memory usage in Bytes</td></tr> </tbody></table>
|
||||
`cu_occupancy` | Number of Compute Units utilized
|
||||
`evicted_time` | Time that queues are evicted on a GPU in milliseconds
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
|
||||
|
||||
@@ -3534,6 +3535,7 @@ Field | Description
|
||||
`vram_usage` | VRAM usage
|
||||
`sdma_usage` | SDMA usage in microseconds
|
||||
`cu_occupancy` | Compute Unit usage in percent
|
||||
`evicted_time` | Time that queues are evicted on a GPU in milliseconds
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_compute_process_info` function:
|
||||
|
||||
@@ -3568,6 +3570,7 @@ Field | Description
|
||||
`vram_usage` | VRAM usage
|
||||
`sdma_usage` | SDMA usage in microseconds
|
||||
`cu_occupancy` | Compute Unit usage in percent
|
||||
`evicted_time` | Time that queues are evicted on a GPU in milliseconds
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_compute_process_info_by_pid` function:
|
||||
|
||||
|
||||
@@ -1159,7 +1159,8 @@ typedef struct {
|
||||
} memory_usage; //!< In Bytes
|
||||
char container_name[AMDSMI_MAX_STRING_LENGTH];
|
||||
uint32_t cu_occupancy; //!< Num CUs utilized
|
||||
uint32_t reserved[11];
|
||||
uint32_t evicted_time; //!< Time that queues are evicted on a GPU in milliseconds
|
||||
uint32_t reserved[10];
|
||||
} amdsmi_proc_info_t;
|
||||
|
||||
/**
|
||||
@@ -2085,6 +2086,7 @@ typedef struct {
|
||||
uint64_t vram_usage; //!< VRAM usage in MB
|
||||
uint64_t sdma_usage; //!< SDMA usage in microseconds
|
||||
uint32_t cu_occupancy; //!< Compute Unit usage in percent
|
||||
uint32_t evicted_time; //!< Time that queues are evicted on a GPU in milliseconds
|
||||
} amdsmi_process_info_t;
|
||||
|
||||
/**
|
||||
|
||||
@@ -2953,7 +2953,8 @@ def amdsmi_get_gpu_process_list(
|
||||
"cpu_mem": process_list[index].memory_usage.cpu_mem,
|
||||
"vram_mem": process_list[index].memory_usage.vram_mem,
|
||||
},
|
||||
"cu_occupancy": _validate_if_max_uint(process_list[index].cu_occupancy, MaxUIntegerTypes.UINT32_T)
|
||||
"cu_occupancy": _validate_if_max_uint(process_list[index].cu_occupancy, MaxUIntegerTypes.UINT32_T),
|
||||
"evicted_time": _validate_if_max_uint(process_list[index].evicted_time, MaxUIntegerTypes.UINT32_T)
|
||||
})
|
||||
|
||||
return result
|
||||
@@ -5303,6 +5304,7 @@ def amdsmi_get_gpu_compute_process_info() -> List[Dict[str, int]]:
|
||||
"vram_usage": proc.vram_usage,
|
||||
"sdma_usage": proc.sdma_usage,
|
||||
"cu_occupancy": proc.cu_occupancy,
|
||||
"evicted_time": proc.evicted_time,
|
||||
}
|
||||
for proc in procs
|
||||
]
|
||||
@@ -5324,6 +5326,7 @@ def amdsmi_get_gpu_compute_process_info_by_pid(pid: int) -> Dict[str, int]:
|
||||
"vram_usage": proc.vram_usage,
|
||||
"sdma_usage": proc.sdma_usage,
|
||||
"cu_occupancy": proc.cu_occupancy,
|
||||
"evicted_time": proc.evicted_time,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -904,22 +904,22 @@ amdsmi_frequency_range_t = struct_amdsmi_frequency_range_t
|
||||
class union_amdsmi_bdf_t(Union):
|
||||
pass
|
||||
|
||||
class struct_amdsmi_bdf_t(Structure):
|
||||
class struct_bdf_(Structure):
|
||||
pass
|
||||
|
||||
struct_amdsmi_bdf_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_bdf_t._fields_ = [
|
||||
struct_bdf_._pack_ = 1 # source:False
|
||||
struct_bdf_._fields_ = [
|
||||
('function_number', ctypes.c_uint64, 3),
|
||||
('device_number', ctypes.c_uint64, 5),
|
||||
('bus_number', ctypes.c_uint64, 8),
|
||||
('domain_number', ctypes.c_uint64, 48),
|
||||
]
|
||||
|
||||
class struct_bdf_(Structure):
|
||||
class struct_amdsmi_bdf_t(Structure):
|
||||
pass
|
||||
|
||||
struct_bdf_._pack_ = 1 # source:False
|
||||
struct_bdf_._fields_ = [
|
||||
struct_amdsmi_bdf_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_bdf_t._fields_ = [
|
||||
('function_number', ctypes.c_uint64, 3),
|
||||
('device_number', ctypes.c_uint64, 5),
|
||||
('bus_number', ctypes.c_uint64, 8),
|
||||
@@ -1397,7 +1397,9 @@ struct_amdsmi_proc_info_t._fields_ = [
|
||||
('memory_usage', struct_memory_usage_),
|
||||
('container_name', ctypes.c_char * 256),
|
||||
('cu_occupancy', ctypes.c_uint32),
|
||||
('reserved', ctypes.c_uint32 * 11),
|
||||
('evicted_time', ctypes.c_uint32),
|
||||
('reserved', ctypes.c_uint32 * 10),
|
||||
('PADDING_1', ctypes.c_ubyte * 4),
|
||||
]
|
||||
|
||||
amdsmi_proc_info_t = struct_amdsmi_proc_info_t
|
||||
@@ -2196,7 +2198,7 @@ struct_amdsmi_process_info_t._fields_ = [
|
||||
('vram_usage', ctypes.c_uint64),
|
||||
('sdma_usage', ctypes.c_uint64),
|
||||
('cu_occupancy', ctypes.c_uint32),
|
||||
('PADDING_1', ctypes.c_ubyte * 4),
|
||||
('evicted_time', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
amdsmi_process_info_t = struct_amdsmi_process_info_t
|
||||
|
||||
@@ -1471,10 +1471,11 @@ typedef struct {
|
||||
uint64_t vram_usage; //!< VRAM usage
|
||||
uint64_t sdma_usage; //!< SDMA usage in microseconds
|
||||
uint32_t cu_occupancy; //!< Compute Unit usage in percent
|
||||
uint32_t evicted_time; //!< Time that queues are evicted on a GPU in milliseconds
|
||||
} rsmi_process_info_t;
|
||||
|
||||
//! CU occupancy invalidation value for the GFX revisions not providing cu_occupancy debugfs method
|
||||
#define CU_OCCUPANCY_INVALID 0xFFFFFFFF
|
||||
#define KFD_STATS_INVALID 0xFFFFFFFF
|
||||
|
||||
/**
|
||||
* @brief Opaque handle to function-support object
|
||||
|
||||
@@ -440,6 +440,28 @@ static int CheckValidProcessInfoData(const std::string& s, int sysfs_ret){
|
||||
return sysfs_ret;
|
||||
}
|
||||
|
||||
static int GetProcessKFDStats(std::string path, uint32_t& val){
|
||||
|
||||
std::string tmp;
|
||||
int err = ReadSysfsStr(path, &tmp);
|
||||
auto sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
|
||||
|
||||
if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
|
||||
return sysfs_data_errcode;
|
||||
}
|
||||
else if(sysfs_data_errcode==0){
|
||||
// Update KFD stat by the process
|
||||
val = static_cast<uint32_t>(std::stoul(tmp));
|
||||
}
|
||||
else {
|
||||
// Some GFX revisions do not provide KFD stats debugfs method
|
||||
// which may cause ENOENT
|
||||
val = KFD_STATS_INVALID;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
|
||||
std::unordered_set<uint64_t> *gpu_set) {
|
||||
assert(proc != nullptr);
|
||||
@@ -447,6 +469,7 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
|
||||
int err;
|
||||
std::string tmp;
|
||||
std::unordered_set<uint64_t>::iterator itr;
|
||||
uint32_t kfd_stat;
|
||||
|
||||
std::string proc_str_path = kKFDProcPathRoot;
|
||||
proc_str_path += "/";
|
||||
@@ -460,6 +483,7 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
|
||||
proc->vram_usage = 0;
|
||||
proc->sdma_usage = 0;
|
||||
proc->cu_occupancy = 0;
|
||||
proc->evicted_time = 0;
|
||||
|
||||
for (itr = gpu_set->begin(); itr != gpu_set->end(); itr++) {
|
||||
uint64_t gpu_id = (*itr);
|
||||
@@ -502,21 +526,23 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
|
||||
cu_occupancy_path += std::to_string(gpu_id);
|
||||
cu_occupancy_path += "/cu_occupancy";
|
||||
|
||||
err = ReadSysfsStr(cu_occupancy_path, &tmp);
|
||||
sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
|
||||
err = GetProcessKFDStats(cu_occupancy_path, kfd_stat);
|
||||
if (err != 0){
|
||||
return err;
|
||||
}
|
||||
proc->cu_occupancy = kfd_stat;
|
||||
|
||||
if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
|
||||
return sysfs_data_errcode;
|
||||
}
|
||||
else if(sysfs_data_errcode==0){
|
||||
// Update CU usage by the process
|
||||
proc->cu_occupancy = std::stoi(tmp);
|
||||
}
|
||||
else {
|
||||
// Some GFX revisions do not provide cu_occupancy debugfs method
|
||||
// which may cause ENOENT
|
||||
proc->cu_occupancy = CU_OCCUPANCY_INVALID;
|
||||
std::string evicted_time_path = proc_str_path;
|
||||
evicted_time_path += "/stats_";
|
||||
evicted_time_path += std::to_string(gpu_id);
|
||||
evicted_time_path += "/evicted_ms";
|
||||
|
||||
err = GetProcessKFDStats(evicted_time_path, kfd_stat);
|
||||
if (err != 0){
|
||||
return err;
|
||||
}
|
||||
proc->evicted_time = kfd_stat;
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -178,8 +178,9 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
|
||||
amdsmi_proc_info.memory_usage.vram_mem = rsmi_proc_info.vram_usage;
|
||||
}
|
||||
|
||||
// Copy the cu occupancy from rsmi_process_info_t to amdsmi_proc_info_t
|
||||
// Copy the kfd stats from rsmi_process_info_t to amdsmi_proc_info_t
|
||||
amdsmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy;
|
||||
amdsmi_proc_info.evicted_time = rsmi_proc_info.evicted_time;
|
||||
|
||||
// Safely handle KFD processes to get total memory_usage of the process
|
||||
uint64_t kfd_gpu_id = get_kfd_gpu_id();
|
||||
|
||||
@@ -184,6 +184,8 @@ void TestProcInfoRead::Run(void) {
|
||||
proc_info.sdma_usage <<
|
||||
" Compute Unit Usage: " <<
|
||||
proc_info.cu_occupancy <<
|
||||
" Evicted Time: " <<
|
||||
proc_info.evicted_time << std::endl <<
|
||||
std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user