[SWDEV-488303] Adjusted process vram_mem data source (#411)

* [SWDEV-488303] Adjusted process vram_mem data source
* Standardized sscanf format strings

---------

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Co-authored-by: gabrpham_amdeng <Gabriel.Pham@amd.com>
This commit is contained in:
Arif, Maisam
2025-05-29 23:26:12 -05:00
zatwierdzone przez GitHub
rodzic 876f3976e0
commit 42441c78ea
6 zmienionych plików z 91 dodań i 27 usunięć
+2 -2
Wyświetl plik
@@ -1107,9 +1107,9 @@ Field | Description
---|---
`name` | Name of process. If user does not have permission this will be "N/A"
`pid` | Process ID
`mem` | Process memory usage
`mem` | Process memory usage in Bytes
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage in Bytes</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage in Bytes</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage in Bytes</td></tr> </tbody></table>
`cu_occupancy` | Number of Compute Units utilized
Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
+5 -5
Wyświetl plik
@@ -1080,18 +1080,18 @@ typedef uint32_t amdsmi_process_handle_t;
typedef struct {
char name[AMDSMI_MAX_STRING_LENGTH];
amdsmi_process_handle_t pid;
uint64_t mem; //!< In bytes
uint64_t mem; //!< In Bytes
struct engine_usage_ {
uint64_t gfx; //!< In nano-secs
uint64_t enc; //!< In nano-secs
uint32_t reserved[12];
} engine_usage; //!< time the process spends using these engines in ns
struct memory_usage_ {
uint64_t gtt_mem; //!< In MB
uint64_t cpu_mem; //!< In MB
uint64_t vram_mem; //!< In MB
uint64_t gtt_mem; //!< In Bytes
uint64_t cpu_mem; //!< In Bytes
uint64_t vram_mem; //!< In Bytes
uint32_t reserved[10];
} memory_usage; //!< in bytes
} memory_usage; //!< In Bytes
char container_name[AMDSMI_MAX_STRING_LENGTH];
uint32_t cu_occupancy; //!< Num CUs utilized
uint32_t reserved[11];
@@ -62,6 +62,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
uint32_t get_gpu_fd() const;
uint32_t get_card_id(); // -e feature + we can get card_id for our internal functions
uint32_t get_drm_render_minor(); // -e feature + we can get drm_render_minor for our internal functions
uint64_t get_kfd_gpu_id(); // Used to decode vram usage for KFD processes
std::string& get_gpu_path();
amdsmi_bdf_t get_bdf();
bool check_if_drm_is_supported() { return drm_.check_if_drm_is_supported(); }
@@ -84,6 +85,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
AMDSmiDrm& drm_;
uint32_t card_index_;
uint32_t drm_render_minor_;
uint64_t kfd_gpu_id_; // Used to decode vram usage for KFD processes
GPUComputeProcessList_t compute_process_list_;
int32_t get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list,
ComputeProcessListType_t list_type);
+13 -14
Wyświetl plik
@@ -1290,7 +1290,7 @@ struct_amdsmi_proc_info_t._fields_ = [
('memory_usage', struct_memory_usage_),
('container_name', ctypes.c_char * 256),
('cu_occupancy', ctypes.c_uint32),
('PADDING_1', ctypes.c_ubyte * 4),
('reserved', ctypes.c_uint32 * 11),
]
amdsmi_proc_info_t = struct_amdsmi_proc_info_t
@@ -3163,19 +3163,18 @@ __all__ = \
'AMDSMI_VOLT_LAST', 'AMDSMI_VOLT_LOWEST', 'AMDSMI_VOLT_MAX',
'AMDSMI_VOLT_MAX_CRIT', 'AMDSMI_VOLT_MIN', 'AMDSMI_VOLT_MIN_CRIT',
'AMDSMI_VOLT_TYPE_FIRST', 'AMDSMI_VOLT_TYPE_INVALID',
'AMDSMI_VOLT_TYPE_LAST', 'AMDSMI_VOLT_TYPE_VDDGFX',
'AMDSMI_VOLT_TYPE_VDDBOARD',
'AMDSMI_VRAM_TYPE_DDR2', 'AMDSMI_VRAM_TYPE_DDR3',
'AMDSMI_VRAM_TYPE_DDR4', 'AMDSMI_VRAM_TYPE_GDDR1',
'AMDSMI_VRAM_TYPE_GDDR2', 'AMDSMI_VRAM_TYPE_GDDR3',
'AMDSMI_VRAM_TYPE_GDDR4', 'AMDSMI_VRAM_TYPE_GDDR5',
'AMDSMI_VRAM_TYPE_GDDR6', 'AMDSMI_VRAM_TYPE_GDDR7',
'AMDSMI_VRAM_TYPE_HBM', 'AMDSMI_VRAM_TYPE_HBM2',
'AMDSMI_VRAM_TYPE_HBM2E', 'AMDSMI_VRAM_TYPE_HBM3',
'AMDSMI_VRAM_TYPE_UNKNOWN', 'AMDSMI_VRAM_TYPE__MAX',
'AMDSMI_XGMI_LINK_DISABLE', 'AMDSMI_XGMI_LINK_DOWN',
'AMDSMI_XGMI_LINK_UP', 'AMDSMI_XGMI_STATUS_ERROR',
'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS',
'AMDSMI_VOLT_TYPE_LAST', 'AMDSMI_VOLT_TYPE_VDDBOARD',
'AMDSMI_VOLT_TYPE_VDDGFX', 'AMDSMI_VRAM_TYPE_DDR2',
'AMDSMI_VRAM_TYPE_DDR3', 'AMDSMI_VRAM_TYPE_DDR4',
'AMDSMI_VRAM_TYPE_GDDR1', 'AMDSMI_VRAM_TYPE_GDDR2',
'AMDSMI_VRAM_TYPE_GDDR3', 'AMDSMI_VRAM_TYPE_GDDR4',
'AMDSMI_VRAM_TYPE_GDDR5', 'AMDSMI_VRAM_TYPE_GDDR6',
'AMDSMI_VRAM_TYPE_GDDR7', 'AMDSMI_VRAM_TYPE_HBM',
'AMDSMI_VRAM_TYPE_HBM2', 'AMDSMI_VRAM_TYPE_HBM2E',
'AMDSMI_VRAM_TYPE_HBM3', 'AMDSMI_VRAM_TYPE_UNKNOWN',
'AMDSMI_VRAM_TYPE__MAX', 'AMDSMI_XGMI_LINK_DISABLE',
'AMDSMI_XGMI_LINK_DOWN', 'AMDSMI_XGMI_LINK_UP',
'AMDSMI_XGMI_STATUS_ERROR', 'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS',
'AMDSMI_XGMI_STATUS_NO_ERRORS', 'CLK_LIMIT_MAX', 'CLK_LIMIT_MIN',
'RD_BW0', 'WR_BW0', 'amd_metrics_table_header_t',
'amdsmi_accelerator_partition_profile_config_t',
+64 -1
Wyświetl plik
@@ -100,6 +100,35 @@ uint32_t AMDSmiGPUDevice::get_drm_render_minor() {
return this->drm_render_minor_;
}
uint64_t AMDSmiGPUDevice::get_kfd_gpu_id() {
std::ostringstream ss;
// Should never return not_supported, but just in case
rsmi_status_t ret = rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED;
uint32_t gpu_index = this->get_gpu_id();
rsmi_device_identifiers_t identifiers = rsmi_device_identifiers_t{};
ret = rsmi_dev_device_identifiers_get(gpu_index, &identifiers);
if (ret != rsmi_status_t::RSMI_STATUS_SUCCESS) {
this->kfd_gpu_id_ = std::numeric_limits<uint64_t>::max();
} else {
this->kfd_gpu_id_ = identifiers.kfd_gpu_id;
}
ss << __PRETTY_FUNCTION__
<< " | rsmi_dev_identifiers_get status: " << getRSMIStatusString(ret, false) << "\n"
<< " | gpu_id_: " << gpu_id_ << "\n"
<< " | identifiers.card_index: " << identifiers.card_index << "\n"
<< " | identifiers.drm_render_minor: " << identifiers.drm_render_minor << "\n"
<< " | identifiers.bdfid: " << std::hex << "0x" << identifiers.bdfid << "\n"
<< " | identifiers.kfd_gpu_id: " << std::dec << identifiers.kfd_gpu_id << "\n"
<< " | identifiers.partition_id: " << identifiers.partition_id << "\n"
<< " | identifiers.smi_device_id: " << identifiers.smi_device_id << "\n"
<< " | returning kfd_gpu_id_: "
<< this->kfd_gpu_id_ << std::endl;
// std::cout << ss.str();
LOG_DEBUG(ss);
return this->kfd_gpu_id_;
}
uint32_t AMDSmiGPUDevice::get_gpu_fd() const {
return fd_;
}
@@ -159,7 +188,6 @@ pthread_mutex_t* AMDSmiGPUDevice::get_mutex() {
return amd::smi::GetMutex(gpu_id_);
}
int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list,
ComputeProcessListType_t list_type)
{
@@ -231,6 +259,41 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
// Copy the cu occupancy from rsmi_process_info_t to amdsmi_proc_info_t
asmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy;
// Safely handle KFD file access
uint64_t kfd_gpu_id = get_kfd_gpu_id();
std::string kfd_path = "/sys/class/kfd/kfd/proc/" +
std::to_string(rsmi_proc_info.process_id) +
"/vram_" + std::to_string(kfd_gpu_id);
// Check if the file exists before attempting to open it
if (access(kfd_path.c_str(), R_OK) == 0) {
std::ifstream kfd_file(kfd_path.c_str());
if (kfd_file.is_open()) {
std::string line;
if (std::getline(kfd_file, line)) {
try {
uint64_t vram_bytes = std::stoull(line);
asmi_proc_info.mem = vram_bytes; // Already in bytes
asmi_proc_info.memory_usage.vram_mem = vram_bytes; // Already in bytes
} catch (const std::exception& e) {
// Handle conversion error gracefully
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | Failed to parse VRAM value from KFD: " << e.what();
LOG_DEBUG(ss);
}
}
kfd_file.close();
} else {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | Failed to open KFD file: " << kfd_path;
LOG_DEBUG(ss);
}
} else {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | KFD file not accessible: " << kfd_path;
LOG_DEBUG(ss);
}
return status_code;
};
+5 -5
Wyświetl plik
@@ -166,26 +166,26 @@ amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid,
if (it == pasids.end()) pasids.push_back(pasid);
} else if (line.find("drm-memory-gtt:") != std::string::npos) {
unsigned long mem;
if (sscanf(line.c_str(), "drm-memory-gtt: %lu", &mem) != 1) continue;
if (sscanf(line.c_str(), "drm-memory-gtt: %" PRIu32, &mem) != 1) continue;
info.mem += mem * 1000;
info.memory_usage.gtt_mem += mem * 1000;
} else if (line.find("drm-memory-cpu:") != std::string::npos) {
unsigned long mem;
if (sscanf(line.c_str(), "drm-memory-cpu: %lu", &mem) != 1) continue;
if (sscanf(line.c_str(), "drm-memory-cpu: %" PRIu32, &mem) != 1) continue;
info.mem += mem * 1000;
info.memory_usage.cpu_mem += mem * 1000;
} else if (line.find("drm-memory-vram:") != std::string::npos) {
unsigned long mem;
if (sscanf(line.c_str(), "drm-memory-vram: %lu", &mem) != 1) continue;\
if (sscanf(line.c_str(), "drm-memory-vram: %" PRIu32, &mem) != 1) continue;
info.mem += mem * 1000;
info.memory_usage.vram_mem += mem * 1000;
} else if (line.find("drm-engine-gfx") != std::string::npos) {
uint64_t engine_gfx;
if (sscanf(line.c_str(), "drm-engine-gfx: %lu", &engine_gfx) != 1) continue;
if (sscanf(line.c_str(), "drm-engine-gfx: %" PRIu32, &engine_gfx) != 1) continue;
info.engine_usage.gfx = engine_gfx;
} else if (line.find("drm-engine-enc") != std::string::npos) {
uint64_t engine_enc;
if (sscanf(line.c_str(), "drm-engine-enc: %lu", &engine_enc) != 1) continue;
if (sscanf(line.c_str(), "drm-engine-enc: %" PRIu32, &engine_enc) != 1) continue;
info.engine_usage.enc = engine_enc;
}
}