diff --git a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc index 5c501388b8..ae1f9bc25a 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc @@ -415,6 +415,13 @@ int GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_set) { return 0; } +static int CheckValidProcessInfoData(const std::string& s, int sysfs_ret){ + if(sysfs_ret==0 && !is_number(s)){ + return EINVAL; + } + return sysfs_ret; +} + int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, std::unordered_set *gpu_set) { assert(proc != nullptr); @@ -464,30 +471,31 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, vram_str_path += std::to_string(gpu_id); err = ReadSysfsStr(vram_str_path, &tmp); - if (err) { - return err; - } + auto sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); - if (!is_number(tmp)) { - return EINVAL; + // Report all errors, except ENOENT (2), which should be ignored + // and the proc->vram_usage should be unmodified + if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ + return sysfs_data_errcode; + } + // Do not store any invalid values + else if (sysfs_data_errcode == 0) { + proc->vram_usage += std::stoull(tmp); } - - proc->vram_usage += std::stoull(tmp); std::string sdma_str_path = proc_str_path; sdma_str_path += "/sdma_"; sdma_str_path += std::to_string(gpu_id); err = ReadSysfsStr(sdma_str_path, &tmp); - if (err) { - return err; - } + sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); - if (!is_number(tmp)) { - return EINVAL; + if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ + return sysfs_data_errcode; + } + else if (sysfs_data_errcode == 0) { + proc->sdma_usage += std::stoull(tmp); } - - proc->sdma_usage += std::stoull(tmp); // Build the path and read from Sysfs file, info that // encodes Compute Unit usage by a process of interest @@ -497,17 +505,20 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, cu_occupancy_path += "/cu_occupancy"; err = ReadSysfsStr(cu_occupancy_path, &tmp); - if (err == 0) { - if (!is_number(tmp)) { - return EINVAL; - } + sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); + + if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ + return sysfs_data_errcode; + } + else if(sysfs_data_errcode==0){ // Update CU usage by the process proc->cu_occupancy += std::stoi(tmp); - // Collect count of compute units cu_count += kfd_node_map[gpu_id]->cu_count(); - } else { - //Some GFX revisions do not provide cu_occupancy debugfs method + } + else { + // Some GFX revisions do not provide cu_occupancy debugfs method + // which may cause ENOENT proc->cu_occupancy = CU_OCCUPANCY_INVALID; cu_count = 0; }