Skip missing vram_str_path and sdma_str_path if sysfs files not created when passing some, but not all, GPUs to a docker image.
- This fix addresses SWDEV-456049 and probably SWDEV-442181 which
have the same apparent root cause of an early exiting
loop while enumerating GPU stats
Change-Id: I517329e06fa2c53205d8b6e002895e648ebf521c
[ROCm/rocm_smi_lib commit: 35496cabc4]
This commit is contained in:
committed by
Maisam Arif
orang tua
ddba959395
melakukan
be18b69b33
@@ -415,6 +415,13 @@ int GetProcessGPUs(uint32_t pid, std::unordered_set<uint64_t> *gpu_set) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int CheckValidProcessInfoData(const std::string& s, int sysfs_ret){
|
||||
if(sysfs_ret==0 && !is_number(s)){
|
||||
return EINVAL;
|
||||
}
|
||||
return sysfs_ret;
|
||||
}
|
||||
|
||||
int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
|
||||
std::unordered_set<uint64_t> *gpu_set) {
|
||||
assert(proc != nullptr);
|
||||
@@ -464,30 +471,31 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
|
||||
vram_str_path += std::to_string(gpu_id);
|
||||
|
||||
err = ReadSysfsStr(vram_str_path, &tmp);
|
||||
if (err) {
|
||||
return err;
|
||||
}
|
||||
auto sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
|
||||
|
||||
if (!is_number(tmp)) {
|
||||
return EINVAL;
|
||||
// Report all errors, except ENOENT (2), which should be ignored
|
||||
// and the proc->vram_usage should be unmodified
|
||||
if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
|
||||
return sysfs_data_errcode;
|
||||
}
|
||||
// Do not store any invalid values
|
||||
else if (sysfs_data_errcode == 0) {
|
||||
proc->vram_usage += std::stoull(tmp);
|
||||
}
|
||||
|
||||
proc->vram_usage += std::stoull(tmp);
|
||||
|
||||
std::string sdma_str_path = proc_str_path;
|
||||
sdma_str_path += "/sdma_";
|
||||
sdma_str_path += std::to_string(gpu_id);
|
||||
|
||||
err = ReadSysfsStr(sdma_str_path, &tmp);
|
||||
if (err) {
|
||||
return err;
|
||||
}
|
||||
sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
|
||||
|
||||
if (!is_number(tmp)) {
|
||||
return EINVAL;
|
||||
if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
|
||||
return sysfs_data_errcode;
|
||||
}
|
||||
else if (sysfs_data_errcode == 0) {
|
||||
proc->sdma_usage += std::stoull(tmp);
|
||||
}
|
||||
|
||||
proc->sdma_usage += std::stoull(tmp);
|
||||
|
||||
// Build the path and read from Sysfs file, info that
|
||||
// encodes Compute Unit usage by a process of interest
|
||||
@@ -497,17 +505,20 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
|
||||
cu_occupancy_path += "/cu_occupancy";
|
||||
|
||||
err = ReadSysfsStr(cu_occupancy_path, &tmp);
|
||||
if (err == 0) {
|
||||
if (!is_number(tmp)) {
|
||||
return EINVAL;
|
||||
}
|
||||
sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
|
||||
|
||||
if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
|
||||
return sysfs_data_errcode;
|
||||
}
|
||||
else if(sysfs_data_errcode==0){
|
||||
// Update CU usage by the process
|
||||
proc->cu_occupancy += std::stoi(tmp);
|
||||
|
||||
// Collect count of compute units
|
||||
cu_count += kfd_node_map[gpu_id]->cu_count();
|
||||
} else {
|
||||
//Some GFX revisions do not provide cu_occupancy debugfs method
|
||||
}
|
||||
else {
|
||||
// Some GFX revisions do not provide cu_occupancy debugfs method
|
||||
// which may cause ENOENT
|
||||
proc->cu_occupancy = CU_OCCUPANCY_INVALID;
|
||||
cu_count = 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user