Merge amd-staging into amd-master 20240924

Signed-off-by: Zhang Ava <niandong.zhang@amd.com>
Change-Id: I8a3bc054f4fa4ca3d3de789db78fd34954bb0d88


[ROCm/rocm_smi_lib commit: fa5312dacc]
This commit is contained in:
Zhang Ava
2024-09-26 18:36:12 +08:00
کامیت 7a75c65b38
2فایلهای تغییر یافته به همراه33 افزوده شده و 21 حذف شده
@@ -45,6 +45,7 @@
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_COMMON_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_COMMON_H_
#include <cstdint>
#include <memory>
#include <map>
#include <vector>
@@ -415,6 +415,13 @@ int GetProcessGPUs(uint32_t pid, std::unordered_set<uint64_t> *gpu_set) {
return 0;
}
static int CheckValidProcessInfoData(const std::string& s, int sysfs_ret){
if(sysfs_ret==0 && !is_number(s)){
return EINVAL;
}
return sysfs_ret;
}
int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
std::unordered_set<uint64_t> *gpu_set) {
assert(proc != nullptr);
@@ -464,30 +471,31 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
vram_str_path += std::to_string(gpu_id);
err = ReadSysfsStr(vram_str_path, &tmp);
if (err) {
return err;
}
auto sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
if (!is_number(tmp)) {
return EINVAL;
// Report all errors, except ENOENT (2), which should be ignored
// and the proc->vram_usage should be unmodified
if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
return sysfs_data_errcode;
}
// Do not store any invalid values
else if (sysfs_data_errcode == 0) {
proc->vram_usage += std::stoull(tmp);
}
proc->vram_usage += std::stoull(tmp);
std::string sdma_str_path = proc_str_path;
sdma_str_path += "/sdma_";
sdma_str_path += std::to_string(gpu_id);
err = ReadSysfsStr(sdma_str_path, &tmp);
if (err) {
return err;
}
sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
if (!is_number(tmp)) {
return EINVAL;
if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
return sysfs_data_errcode;
}
else if (sysfs_data_errcode == 0) {
proc->sdma_usage += std::stoull(tmp);
}
proc->sdma_usage += std::stoull(tmp);
// Build the path and read from Sysfs file, info that
// encodes Compute Unit usage by a process of interest
@@ -497,17 +505,20 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
cu_occupancy_path += "/cu_occupancy";
err = ReadSysfsStr(cu_occupancy_path, &tmp);
if (err == 0) {
if (!is_number(tmp)) {
return EINVAL;
}
sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
return sysfs_data_errcode;
}
else if(sysfs_data_errcode==0){
// Update CU usage by the process
proc->cu_occupancy += std::stoi(tmp);
// Collect count of compute units
cu_count += kfd_node_map[gpu_id]->cu_count();
} else {
//Some GFX revisions do not provide cu_occupancy debugfs method
}
else {
// Some GFX revisions do not provide cu_occupancy debugfs method
// which may cause ENOENT
proc->cu_occupancy = CU_OCCUPANCY_INVALID;
cu_count = 0;
}