From 8bc7216a651956296bde9754b75f8f560fd7729d Mon Sep 17 00:00:00 2001 From: "Pryor, Adam" Date: Thu, 2 Oct 2025 10:57:08 -0500 Subject: [PATCH] [SWDEV-525336] Use KFD to determine process start/stop (#723) * Used KFD to determine linking between GPUs and PIDs rather than depend on fdinfo's per pid single gpu bdf info that we were getting. Signed-off-by: adapryor --------- Signed-off-by: adapryor Signed-off-by: Arif, Maisam [ROCm/amdsmi commit: c967aead580632544eff97839af547ab95336496] --- projects/amdsmi/CHANGELOG.md | 2 +- projects/amdsmi/include/amd_smi/impl/fdinfo.h | 1 - .../rocm_smi/include/rocm_smi/rocm_smi_kfd.h | 4 +- projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc | 70 +++++++------- projects/amdsmi/src/amd_smi/fdinfo.cc | 92 +++++++++++-------- 5 files changed, 96 insertions(+), 73 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index e7791666c4..820a422dbc 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -186,7 +186,7 @@ GPU: 0 ### Optimized -- N/A +- **Optimized the way `amd-smi process` validates which processes are running on a GPU**. 
### Resolved Issues diff --git a/projects/amdsmi/include/amd_smi/impl/fdinfo.h b/projects/amdsmi/include/amd_smi/impl/fdinfo.h index 3fe313944f..aa9db74864 100644 --- a/projects/amdsmi/include/amd_smi/impl/fdinfo.h +++ b/projects/amdsmi/include/amd_smi/impl/fdinfo.h @@ -29,7 +29,6 @@ extern "C" { #endif -amdsmi_status_t gpuvsmi_get_pids(const amdsmi_bdf_t &bdf, std::vector &pids, uint64_t *size); amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid, amdsmi_proc_info_t &info); #ifdef __cplusplus diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_kfd.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_kfd.h index 88cc594111..eb3b1ccab8 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_kfd.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_kfd.h @@ -116,6 +116,8 @@ int read_node_properties(uint32_t node, std::string property_name, uint64_t *val); int get_gpu_id(uint32_t node, uint64_t *gpu_id); -} // namespace amd::smi +int GetKfdGpuIdsForPid(long pid, std::unordered_set* out); + +} // namespace amd::smi #endif // INCLUDE_ROCM_SMI_ROCM_SMI_KFD_H_ diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc index b85b6902b1..20f67ed6d5 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc @@ -312,6 +312,42 @@ int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated, return 0; } +int GetKfdGpuIdsForPid(long pid, std::unordered_set* out){ + + if (!out) return EINVAL; + out->clear(); + + std::string pdir = std::string(kKFDProcPathRoot) + "/" + std::to_string(pid); + DIR* d = opendir(pdir.c_str()); + + if (!d) { + perror(("Unable to open KFD process directory for process " + std::to_string(pid)).c_str()); + return errno ? errno : ESRCH; + } + + struct dirent* e; + + while ((e = readdir(d))) { + + if (e->d_name[0] == '.') continue; // skip "."/".." 
and hidden entries + + // Grab KFD GPU id from one of these fields + if (!strncmp(e->d_name, "stats_", 6)) { + out->insert(strtoull(e->d_name + 6, nullptr, 10)); + } else if (!strncmp(e->d_name, "vram_", 5)) { + out->insert(strtoull(e->d_name + 5, nullptr, 10)); + } else if (!strncmp(e->d_name, "counters_", 9)) { + out->insert(strtoull(e->d_name + 9, nullptr, 10)); + } else if (!strncmp(e->d_name, "sdma_", 5)) { + out->insert(strtoull(e->d_name + 5, nullptr, 10)); + } + } + + closedir(d); + return 0; + +} + // Read the gpuid files found in all the dirs and put them in // gpus_found. // Directory structure: @@ -330,8 +366,6 @@ int GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_set) { return 0; } - errno = 0; - std::string queues_dir = kKFDProcPathRoot; queues_dir += "/"; queues_dir += std::to_string(pid); @@ -387,35 +421,9 @@ int GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_set) { } // if no queues were present, fallback to grab KFD GPU IDs from parent dir names - if (gpu_set->empty()) { - - std::string pdir = std::string(kKFDProcPathRoot) + "/" + std::to_string(pid); - auto queues_dir_kfd = opendir(pdir.c_str()); - - if (queues_dir_kfd == nullptr) { - std::string err_str = "Unable to open KFD process directory for process "; - err_str += std::to_string(pid); - perror(err_str.c_str()); - return ESRCH; - } - - struct dirent* e; - - while ((e = readdir(queues_dir_kfd))) { - - // These files encode the KFD GPU ID when process is running - if (!strncmp(e->d_name, "stats_", 6)) { - gpu_set->insert(strtoull(e->d_name + 6, nullptr, 10)); - } else if (!strncmp(e->d_name, "vram_", 5)) { - gpu_set->insert(strtoull(e->d_name + 5, nullptr, 10)); - } else if (!strncmp(e->d_name, "counters_", 9)) { - gpu_set->insert(strtoull(e->d_name + 9, nullptr, 10)); - } else if (!strncmp(e->d_name, "sdma_", 5)) { - gpu_set->insert(strtoull(e->d_name + 5, nullptr, 10)); - } - } - - closedir(queues_dir_kfd); + int kfd_ret = GetKfdGpuIdsForPid(pid, gpu_set); + if (kfd_ret != 0) { 
+ return kfd_ret; } errno = 0; diff --git a/projects/amdsmi/src/amd_smi/fdinfo.cc b/projects/amdsmi/src/amd_smi/fdinfo.cc index 17478e2514..7a4e209894 100644 --- a/projects/amdsmi/src/amd_smi/fdinfo.cc +++ b/projects/amdsmi/src/amd_smi/fdinfo.cc @@ -33,6 +33,7 @@ #include "amd_smi/amdsmi.h" #include "amd_smi/impl/amd_smi_utils.h" +#include "rocm_smi/rocm_smi_kfd.h" extern "C" { @@ -67,48 +68,55 @@ amdsmi_status_t gpuvsmi_pid_is_gpu(const std::string &path, const char *bdf) { return AMDSMI_STATUS_NOT_FOUND; } -amdsmi_status_t gpuvsmi_get_pids(const amdsmi_bdf_t &bdf, std::vector &pids, - uint64_t *size) { - char bdf_str[13]; - DIR *d; - struct dirent *dir; +// Determine via kfd whether pid uses specified gpu +amdsmi_status_t gpu_is_in_kfd_pid(const amdsmi_bdf_t &bdf, long pid) { - /* 0000:00:00.0 */ - snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32, - static_cast(bdf.domain_number & 0xffff), - static_cast(bdf.bus_number & 0xff), - static_cast(bdf.device_number & 0x1f), - static_cast(bdf.function_number & 0x7)); + // pack (domain,bus,device,function) to the same 64-bit key + // (DOMAIN << 32) | (BUS << 8) | (DEVICE << 3) | FUNCTION + auto pack_bdf_to_kfd_bdfid = [](const amdsmi_bdf_t& b) -> uint64_t { + const uint64_t domain = static_cast(b.domain_number & 0xffffu); + const uint64_t bus = static_cast(b.bus_number & 0xffu); + const uint64_t dev = static_cast(b.device_number & 0x1fu); + const uint64_t func = static_cast(b.function_number & 0x7u); + const uint64_t loc = (bus << 8) | (dev << 3) | func; + return (domain << 32) | loc; + }; - d = opendir("/proc"); - if (!d) return AMDSMI_STATUS_NO_PERM; + // Build map of KFD nodes + std::map> nodes; + int ret = DiscoverKFDNodes(&nodes); - pids.clear(); - /* Find the pid folders in /proc/ that we have access to */ - while ((dir = readdir(d)) != NULL) { - if (dir->d_type == DT_DIR) { - /* Try to cast the name of the folder to a - * number, if it fails, it is not */ - char *p; - long int pid; - - pid 
= strtol(dir->d_name, &p, 10); - if (*p != 0) continue; - - /* Check if fdinfo is accesible */ - std::string path = "/proc/" + std::string(dir->d_name) + "/fdinfo/"; - - if (access(path.c_str(), R_OK)) continue; - - /* check if GPU is present */ - if (gpuvsmi_pid_is_gpu(path, bdf_str)) continue; - pids.push_back(pid); - } + if (ret != 0) { + return AMDSMI_STATUS_API_FAILED; } - closedir(d); - *size = pids.size(); - return AMDSMI_STATUS_SUCCESS; + // Convert bdf and find node + const uint64_t key = pack_bdf_to_kfd_bdfid(bdf); + auto it = nodes.find(key); + + if (it == nodes.end()) { + return AMDSMI_STATUS_NOT_FOUND; + } + + // Grab gpu id and ensure not cpu + const uint64_t target_gid = it->second->gpu_id(); + if (target_gid == 0) { + return AMDSMI_STATUS_NOT_FOUND; + } + + // Get all KFD GPU ids for pid + std::unordered_set pid_gids; + ret = amd::smi::GetKfdGpuIdsForPid(pid, &pid_gids); + if (ret != 0) { + if (ret == EACCES) { + return AMDSMI_STATUS_NO_PERM; + } + return AMDSMI_STATUS_NOT_FOUND; + } + + // Return success if gpu id is in pid gpu ids + return (pid_gids.count(target_gid) ? AMDSMI_STATUS_SUCCESS + : AMDSMI_STATUS_NOT_FOUND); } amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid, @@ -128,8 +136,14 @@ amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid, std::string name_path = "/proc/" + std::to_string(pid) + "/exe"; std::string cgroup_path = "/proc/" + std::to_string(pid) + "/cgroup"; - if (gpuvsmi_pid_is_gpu(path.c_str(), bdf_str)) { - return AMDSMI_STATUS_INVAL; + amdsmi_status_t ret = gpu_is_in_kfd_pid(bdf, pid); + + if (ret != AMDSMI_STATUS_SUCCESS) { + // If kfd process detection fails, fallback on old bdf code + ret = gpuvsmi_pid_is_gpu(path.c_str(), bdf_str); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } } d = opendir(path.c_str());