[SWDEV-525336] Use KFD to determine process start/stop (#723)
* Use KFD to determine the mapping between GPUs and PIDs instead of relying on the single per-PID GPU BDF that was previously read from fdinfo.
Signed-off-by: adapryor <Adam.pryor@amd.com>
---------
Signed-off-by: adapryor <Adam.pryor@amd.com>
Signed-off-by: Arif, Maisam <Maisam.Arif@amd.com>
[ROCm/amdsmi commit: c967aead58]
Tento commit je obsažen v:
@@ -186,7 +186,7 @@ GPU: 0
|
||||
|
||||
### Optimized
|
||||
|
||||
- N/A
|
||||
- **Optimized the way `amd-smi process` validates which processes are running on a GPU**.
|
||||
|
||||
### Resolved Issues
|
||||
|
||||
|
||||
@@ -29,7 +29,6 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
amdsmi_status_t gpuvsmi_get_pids(const amdsmi_bdf_t &bdf, std::vector<long int> &pids, uint64_t *size);
|
||||
amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid, amdsmi_proc_info_t &info);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@@ -116,6 +116,8 @@ int read_node_properties(uint32_t node, std::string property_name,
|
||||
uint64_t *val);
|
||||
int get_gpu_id(uint32_t node, uint64_t *gpu_id);
|
||||
|
||||
} // namespace amd::smi
|
||||
int GetKfdGpuIdsForPid(long pid, std::unordered_set<uint64_t>* out);
|
||||
|
||||
} // namespace amd::smi
|
||||
|
||||
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_KFD_H_
|
||||
|
||||
@@ -312,6 +312,42 @@ int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int GetKfdGpuIdsForPid(long pid, std::unordered_set<uint64_t>* out){
|
||||
|
||||
if (!out) return EINVAL;
|
||||
out->clear();
|
||||
|
||||
std::string pdir = std::string(kKFDProcPathRoot) + "/" + std::to_string(pid);
|
||||
DIR* d = opendir(pdir.c_str());
|
||||
|
||||
if (!d) {
|
||||
perror(("Unable to open KFD process directory for process " + std::to_string(pid)).c_str());
|
||||
return errno ? errno : ESRCH;
|
||||
}
|
||||
|
||||
struct dirent* e;
|
||||
|
||||
while ((e = readdir(d))) {
|
||||
|
||||
if (e->d_name[0] == '.') continue; // skip "."/".." and hidden entries
|
||||
|
||||
// Grab KFD GPU id from one of these fields
|
||||
if (!strncmp(e->d_name, "stats_", 6)) {
|
||||
out->insert(strtoull(e->d_name + 6, nullptr, 10));
|
||||
} else if (!strncmp(e->d_name, "vram_", 5)) {
|
||||
out->insert(strtoull(e->d_name + 5, nullptr, 10));
|
||||
} else if (!strncmp(e->d_name, "counters_", 9)) {
|
||||
out->insert(strtoull(e->d_name + 9, nullptr, 10));
|
||||
} else if (!strncmp(e->d_name, "sdma_", 5)) {
|
||||
out->insert(strtoull(e->d_name + 5, nullptr, 10));
|
||||
}
|
||||
}
|
||||
|
||||
closedir(d);
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
// Read the gpuid files found in all the <queue id> dirs and put them in
|
||||
// gpus_found.
|
||||
// Directory structure:
|
||||
@@ -330,8 +366,6 @@ int GetProcessGPUs(uint32_t pid, std::unordered_set<uint64_t> *gpu_set) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
errno = 0;
|
||||
|
||||
std::string queues_dir = kKFDProcPathRoot;
|
||||
queues_dir += "/";
|
||||
queues_dir += std::to_string(pid);
|
||||
@@ -387,35 +421,9 @@ int GetProcessGPUs(uint32_t pid, std::unordered_set<uint64_t> *gpu_set) {
|
||||
}
|
||||
|
||||
// if no queues were present, fallback to grab KFD GPU IDs from parent dir names
|
||||
if (gpu_set->empty()) {
|
||||
|
||||
std::string pdir = std::string(kKFDProcPathRoot) + "/" + std::to_string(pid);
|
||||
auto queues_dir_kfd = opendir(pdir.c_str());
|
||||
|
||||
if (queues_dir_kfd == nullptr) {
|
||||
std::string err_str = "Unable to open KFD process directory for process ";
|
||||
err_str += std::to_string(pid);
|
||||
perror(err_str.c_str());
|
||||
return ESRCH;
|
||||
}
|
||||
|
||||
struct dirent* e;
|
||||
|
||||
while ((e = readdir(queues_dir_kfd))) {
|
||||
|
||||
// These files encode the KFD GPU ID when process is running
|
||||
if (!strncmp(e->d_name, "stats_", 6)) {
|
||||
gpu_set->insert(strtoull(e->d_name + 6, nullptr, 10));
|
||||
} else if (!strncmp(e->d_name, "vram_", 5)) {
|
||||
gpu_set->insert(strtoull(e->d_name + 5, nullptr, 10));
|
||||
} else if (!strncmp(e->d_name, "counters_", 9)) {
|
||||
gpu_set->insert(strtoull(e->d_name + 9, nullptr, 10));
|
||||
} else if (!strncmp(e->d_name, "sdma_", 5)) {
|
||||
gpu_set->insert(strtoull(e->d_name + 5, nullptr, 10));
|
||||
}
|
||||
}
|
||||
|
||||
closedir(queues_dir_kfd);
|
||||
int kfd_ret = GetKfdGpuIdsForPid(pid, gpu_set);
|
||||
if (kfd_ret != 0) {
|
||||
return kfd_ret;
|
||||
}
|
||||
|
||||
errno = 0;
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "amd_smi/impl/amd_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_kfd.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
@@ -67,48 +68,55 @@ amdsmi_status_t gpuvsmi_pid_is_gpu(const std::string &path, const char *bdf) {
|
||||
return AMDSMI_STATUS_NOT_FOUND;
|
||||
}
|
||||
|
||||
amdsmi_status_t gpuvsmi_get_pids(const amdsmi_bdf_t &bdf, std::vector<long int> &pids,
|
||||
uint64_t *size) {
|
||||
char bdf_str[13];
|
||||
DIR *d;
|
||||
struct dirent *dir;
|
||||
// Determine via kfd whether pid uses specified gpu
|
||||
amdsmi_status_t gpu_is_in_kfd_pid(const amdsmi_bdf_t &bdf, long pid) {
|
||||
|
||||
/* 0000:00:00.0 */
|
||||
snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32,
|
||||
static_cast<uint32_t>(bdf.domain_number & 0xffff),
|
||||
static_cast<uint32_t>(bdf.bus_number & 0xff),
|
||||
static_cast<uint32_t>(bdf.device_number & 0x1f),
|
||||
static_cast<uint32_t>(bdf.function_number & 0x7));
|
||||
// pack (domain,bus,device,function) to the same 64-bit key
|
||||
// (DOMAIN << 32) | (BUS << 8) | (DEVICE << 3) | FUNCTION
|
||||
auto pack_bdf_to_kfd_bdfid = [](const amdsmi_bdf_t& b) -> uint64_t {
|
||||
const uint64_t domain = static_cast<uint64_t>(b.domain_number & 0xffffu);
|
||||
const uint64_t bus = static_cast<uint64_t>(b.bus_number & 0xffu);
|
||||
const uint64_t dev = static_cast<uint64_t>(b.device_number & 0x1fu);
|
||||
const uint64_t func = static_cast<uint64_t>(b.function_number & 0x7u);
|
||||
const uint64_t loc = (bus << 8) | (dev << 3) | func;
|
||||
return (domain << 32) | loc;
|
||||
};
|
||||
|
||||
d = opendir("/proc");
|
||||
if (!d) return AMDSMI_STATUS_NO_PERM;
|
||||
// Build map of KFD nodes
|
||||
std::map<uint64_t, std::shared_ptr<amd::smi::KFDNode>> nodes;
|
||||
int ret = DiscoverKFDNodes(&nodes);
|
||||
|
||||
pids.clear();
|
||||
/* Find the pid folders in /proc/ that we have access to */
|
||||
while ((dir = readdir(d)) != NULL) {
|
||||
if (dir->d_type == DT_DIR) {
|
||||
/* Try to cast the name of the folder to a
|
||||
* number, if it fails, it is not */
|
||||
char *p;
|
||||
long int pid;
|
||||
|
||||
pid = strtol(dir->d_name, &p, 10);
|
||||
if (*p != 0) continue;
|
||||
|
||||
/* Check if fdinfo is accesible */
|
||||
std::string path = "/proc/" + std::string(dir->d_name) + "/fdinfo/";
|
||||
|
||||
if (access(path.c_str(), R_OK)) continue;
|
||||
|
||||
/* check if GPU is present */
|
||||
if (gpuvsmi_pid_is_gpu(path, bdf_str)) continue;
|
||||
pids.push_back(pid);
|
||||
}
|
||||
if (ret != 0) {
|
||||
return AMDSMI_STATUS_API_FAILED;
|
||||
}
|
||||
closedir(d);
|
||||
|
||||
*size = pids.size();
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
// Convert bdf and find node
|
||||
const uint64_t key = pack_bdf_to_kfd_bdfid(bdf);
|
||||
auto it = nodes.find(key);
|
||||
|
||||
if (it == nodes.end()) {
|
||||
return AMDSMI_STATUS_NOT_FOUND;
|
||||
}
|
||||
|
||||
// Grab gpu id and ensure not cpu
|
||||
const uint64_t target_gid = it->second->gpu_id();
|
||||
if (target_gid == 0) {
|
||||
return AMDSMI_STATUS_NOT_FOUND;
|
||||
}
|
||||
|
||||
// Get all KFD GPU ids for pid
|
||||
std::unordered_set<uint64_t> pid_gids;
|
||||
ret = amd::smi::GetKfdGpuIdsForPid(pid, &pid_gids);
|
||||
if (ret != 0) {
|
||||
if (ret == EACCES) {
|
||||
return AMDSMI_STATUS_NO_PERM;
|
||||
}
|
||||
return AMDSMI_STATUS_NOT_FOUND;
|
||||
}
|
||||
|
||||
// Return success if gpu id is in pid gpu ids
|
||||
return (pid_gids.count(target_gid) ? AMDSMI_STATUS_SUCCESS
|
||||
: AMDSMI_STATUS_NOT_FOUND);
|
||||
}
|
||||
|
||||
amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid,
|
||||
@@ -128,8 +136,14 @@ amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid,
|
||||
std::string name_path = "/proc/" + std::to_string(pid) + "/exe";
|
||||
std::string cgroup_path = "/proc/" + std::to_string(pid) + "/cgroup";
|
||||
|
||||
if (gpuvsmi_pid_is_gpu(path.c_str(), bdf_str)) {
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
amdsmi_status_t ret = gpu_is_in_kfd_pid(bdf, pid);
|
||||
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
// If kfd process detection fails, fallback on old bdf code
|
||||
ret = gpuvsmi_pid_is_gpu(path.c_str(), bdf_str);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
d = opendir(path.c_str());
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele