From be18b69b33d71cd4ac886984cf69d553fb0f2c5e Mon Sep 17 00:00:00 2001 From: James Xu Date: Mon, 16 Sep 2024 10:57:33 -0400 Subject: [PATCH] Skip missing vram_str_path and sdma_str_path if the sysfs files were not created when passing some, but not all, GPUs to a Docker image. - This fix addresses SWDEV-456049 and probably SWDEV-442181, which have the same apparent root cause: a loop that exits early while enumerating GPU stats. Change-Id: I517329e06fa2c53205d8b6e002895e648ebf521c [ROCm/rocm_smi_lib commit: 35496cabc4fcb194e77f952a66b2904132a5b870] --- projects/rocm-smi-lib/src/rocm_smi_kfd.cc | 53 ++++++++++++++--------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc index 5c501388b8..ae1f9bc25a 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc @@ -415,6 +415,13 @@ int GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_set) { return 0; } +static int CheckValidProcessInfoData(const std::string& s, int sysfs_ret){ + if(sysfs_ret==0 && !is_number(s)){ + return EINVAL; + } + return sysfs_ret; +} + int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, std::unordered_set *gpu_set) { assert(proc != nullptr); @@ -464,30 +471,31 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, vram_str_path += std::to_string(gpu_id); err = ReadSysfsStr(vram_str_path, &tmp); - if (err) { - return err; - } + auto sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); - if (!is_number(tmp)) { - return EINVAL; + // Report all errors, except ENOENT (2), which should be ignored + // and the proc->vram_usage should be unmodified + if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ + return sysfs_data_errcode; + } + // Do not store any invalid values + else if (sysfs_data_errcode == 0) { + proc->vram_usage += std::stoull(tmp); } - - proc->vram_usage += std::stoull(tmp); std::string sdma_str_path = 
proc_str_path; sdma_str_path += "/sdma_"; sdma_str_path += std::to_string(gpu_id); err = ReadSysfsStr(sdma_str_path, &tmp); - if (err) { - return err; - } + sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); - if (!is_number(tmp)) { - return EINVAL; + if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ + return sysfs_data_errcode; + } + else if (sysfs_data_errcode == 0) { + proc->sdma_usage += std::stoull(tmp); } - - proc->sdma_usage += std::stoull(tmp); // Build the path and read from Sysfs file, info that // encodes Compute Unit usage by a process of interest @@ -497,17 +505,20 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, cu_occupancy_path += "/cu_occupancy"; err = ReadSysfsStr(cu_occupancy_path, &tmp); - if (err == 0) { - if (!is_number(tmp)) { - return EINVAL; - } + sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); + + if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ + return sysfs_data_errcode; + } + else if(sysfs_data_errcode==0){ // Update CU usage by the process proc->cu_occupancy += std::stoi(tmp); - // Collect count of compute units cu_count += kfd_node_map[gpu_id]->cu_count(); - } else { - //Some GFX revisions do not provide cu_occupancy debugfs method + } + else { + // Some GFX revisions do not provide cu_occupancy debugfs method + // which may cause ENOENT proc->cu_occupancy = CU_OCCUPANCY_INVALID; cu_count = 0; }