Skip the vram_str_path and sdma_str_path reads when their sysfs files have not been created, which happens when some, but not all, GPUs are passed to a Docker container.

- This fix addresses SWDEV-456049 and probably SWDEV-442181, which
	appear to share the same root cause: a loop that exits early
	while enumerating GPU stats

Change-Id: I517329e06fa2c53205d8b6e002895e648ebf521c


[ROCm/rocm_smi_lib commit: 35496cabc4]
This commit is contained in:
James Xu
2024-09-16 10:57:33 -04:00
committed by Maisam Arif
orang tua ddba959395
melakukan be18b69b33
+32 -21
Melihat File
@@ -415,6 +415,13 @@ int GetProcessGPUs(uint32_t pid, std::unordered_set<uint64_t> *gpu_set) {
return 0;
}
// Combines the status of a sysfs read with a validity check of the text
// that was read.
//
// s         - string to validate (callers pass the text read from sysfs).
// sysfs_ret - status code returned by the preceding read; 0 means success.
//
// Returns sysfs_ret unchanged when it is non-zero. When the read succeeded,
// returns 0 if is_number(s) accepts the string and EINVAL otherwise.
static int CheckValidProcessInfoData(const std::string& s, int sysfs_ret) {
  // A failed read is reported as-is; the string is not inspected in that case.
  if (sysfs_ret != 0) {
    return sysfs_ret;
  }
  return is_number(s) ? 0 : EINVAL;
}
int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
std::unordered_set<uint64_t> *gpu_set) {
assert(proc != nullptr);
@@ -464,30 +471,31 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
vram_str_path += std::to_string(gpu_id);
err = ReadSysfsStr(vram_str_path, &tmp);
if (err) {
return err;
}
auto sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
if (!is_number(tmp)) {
return EINVAL;
// Report all errors, except ENOENT (2), which should be ignored
// and the proc->vram_usage should be unmodified
if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
return sysfs_data_errcode;
}
// Do not store any invalid values
else if (sysfs_data_errcode == 0) {
proc->vram_usage += std::stoull(tmp);
}
proc->vram_usage += std::stoull(tmp);
std::string sdma_str_path = proc_str_path;
sdma_str_path += "/sdma_";
sdma_str_path += std::to_string(gpu_id);
err = ReadSysfsStr(sdma_str_path, &tmp);
if (err) {
return err;
}
sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
if (!is_number(tmp)) {
return EINVAL;
if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
return sysfs_data_errcode;
}
else if (sysfs_data_errcode == 0) {
proc->sdma_usage += std::stoull(tmp);
}
proc->sdma_usage += std::stoull(tmp);
// Build the path and read from Sysfs file, info that
// encodes Compute Unit usage by a process of interest
@@ -497,17 +505,20 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
cu_occupancy_path += "/cu_occupancy";
err = ReadSysfsStr(cu_occupancy_path, &tmp);
if (err == 0) {
if (!is_number(tmp)) {
return EINVAL;
}
sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
return sysfs_data_errcode;
}
else if(sysfs_data_errcode==0){
// Update CU usage by the process
proc->cu_occupancy += std::stoi(tmp);
// Collect count of compute units
cu_count += kfd_node_map[gpu_id]->cu_count();
} else {
//Some GFX revisions do not provide cu_occupancy debugfs method
}
else {
// Some GFX revisions do not provide cu_occupancy debugfs method
// which may cause ENOENT
proc->cu_occupancy = CU_OCCUPANCY_INVALID;
cu_count = 0;
}