[SWDEV-488303] Adjusted process vram_mem data source (#411)
* [SWDEV-488303] Adjusted process vram_mem data source
* Standardized sscanf format strings

---------

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Co-authored-by: gabrpham_amdeng <Gabriel.Pham@amd.com>
This commit is contained in:
zatwierdzone przez
GitHub
rodzic
876f3976e0
commit
42441c78ea
@@ -1107,9 +1107,9 @@ Field | Description
|
||||
---|---
|
||||
`name` | Name of the process. If the user does not have permission, this will be "N/A"
|
||||
`pid` | Process ID
|
||||
`mem` | Process memory usage
|
||||
`mem` | Process memory usage in Bytes
|
||||
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
|
||||
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>
|
||||
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage in Bytes</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage in Bytes</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage in Bytes</td></tr> </tbody></table>
|
||||
`cu_occupancy` | Number of Compute Units utilized
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
|
||||
|
||||
@@ -1080,18 +1080,18 @@ typedef uint32_t amdsmi_process_handle_t;
|
||||
typedef struct {
|
||||
char name[AMDSMI_MAX_STRING_LENGTH];
|
||||
amdsmi_process_handle_t pid;
|
||||
uint64_t mem; //!< In bytes
|
||||
uint64_t mem; //!< In Bytes
|
||||
struct engine_usage_ {
|
||||
uint64_t gfx; //!< In nano-secs
|
||||
uint64_t enc; //!< In nano-secs
|
||||
uint32_t reserved[12];
|
||||
} engine_usage; //!< time the process spends using these engines in ns
|
||||
struct memory_usage_ {
|
||||
uint64_t gtt_mem; //!< In MB
|
||||
uint64_t cpu_mem; //!< In MB
|
||||
uint64_t vram_mem; //!< In MB
|
||||
uint64_t gtt_mem; //!< In Bytes
|
||||
uint64_t cpu_mem; //!< In Bytes
|
||||
uint64_t vram_mem; //!< In Bytes
|
||||
uint32_t reserved[10];
|
||||
} memory_usage; //!< in bytes
|
||||
} memory_usage; //!< In Bytes
|
||||
char container_name[AMDSMI_MAX_STRING_LENGTH];
|
||||
uint32_t cu_occupancy; //!< Num CUs utilized
|
||||
uint32_t reserved[11];
|
||||
|
||||
@@ -62,6 +62,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
|
||||
uint32_t get_gpu_fd() const;
|
||||
uint32_t get_card_id(); // -e feature + we can get card_id for our internal functions
|
||||
uint32_t get_drm_render_minor(); // -e feature + we can get card_id for our internal functions
|
||||
uint64_t get_kfd_gpu_id(); // Used to decode vram usage for KFD processes
|
||||
std::string& get_gpu_path();
|
||||
amdsmi_bdf_t get_bdf();
|
||||
bool check_if_drm_is_supported() { return drm_.check_if_drm_is_supported(); }
|
||||
@@ -84,6 +85,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
|
||||
AMDSmiDrm& drm_;
|
||||
uint32_t card_index_;
|
||||
uint32_t drm_render_minor_;
|
||||
uint64_t kfd_gpu_id_; // Used to decode vram usage for KFD processes
|
||||
GPUComputeProcessList_t compute_process_list_;
|
||||
int32_t get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list,
|
||||
ComputeProcessListType_t list_type);
|
||||
|
||||
@@ -1290,7 +1290,7 @@ struct_amdsmi_proc_info_t._fields_ = [
|
||||
('memory_usage', struct_memory_usage_),
|
||||
('container_name', ctypes.c_char * 256),
|
||||
('cu_occupancy', ctypes.c_uint32),
|
||||
('PADDING_1', ctypes.c_ubyte * 4),
|
||||
('reserved', ctypes.c_uint32 * 11),
|
||||
]
|
||||
|
||||
amdsmi_proc_info_t = struct_amdsmi_proc_info_t
|
||||
@@ -3163,19 +3163,18 @@ __all__ = \
|
||||
'AMDSMI_VOLT_LAST', 'AMDSMI_VOLT_LOWEST', 'AMDSMI_VOLT_MAX',
|
||||
'AMDSMI_VOLT_MAX_CRIT', 'AMDSMI_VOLT_MIN', 'AMDSMI_VOLT_MIN_CRIT',
|
||||
'AMDSMI_VOLT_TYPE_FIRST', 'AMDSMI_VOLT_TYPE_INVALID',
|
||||
'AMDSMI_VOLT_TYPE_LAST', 'AMDSMI_VOLT_TYPE_VDDGFX',
|
||||
'AMDSMI_VOLT_TYPE_VDDBOARD',
|
||||
'AMDSMI_VRAM_TYPE_DDR2', 'AMDSMI_VRAM_TYPE_DDR3',
|
||||
'AMDSMI_VRAM_TYPE_DDR4', 'AMDSMI_VRAM_TYPE_GDDR1',
|
||||
'AMDSMI_VRAM_TYPE_GDDR2', 'AMDSMI_VRAM_TYPE_GDDR3',
|
||||
'AMDSMI_VRAM_TYPE_GDDR4', 'AMDSMI_VRAM_TYPE_GDDR5',
|
||||
'AMDSMI_VRAM_TYPE_GDDR6', 'AMDSMI_VRAM_TYPE_GDDR7',
|
||||
'AMDSMI_VRAM_TYPE_HBM', 'AMDSMI_VRAM_TYPE_HBM2',
|
||||
'AMDSMI_VRAM_TYPE_HBM2E', 'AMDSMI_VRAM_TYPE_HBM3',
|
||||
'AMDSMI_VRAM_TYPE_UNKNOWN', 'AMDSMI_VRAM_TYPE__MAX',
|
||||
'AMDSMI_XGMI_LINK_DISABLE', 'AMDSMI_XGMI_LINK_DOWN',
|
||||
'AMDSMI_XGMI_LINK_UP', 'AMDSMI_XGMI_STATUS_ERROR',
|
||||
'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS',
|
||||
'AMDSMI_VOLT_TYPE_LAST', 'AMDSMI_VOLT_TYPE_VDDBOARD',
|
||||
'AMDSMI_VOLT_TYPE_VDDGFX', 'AMDSMI_VRAM_TYPE_DDR2',
|
||||
'AMDSMI_VRAM_TYPE_DDR3', 'AMDSMI_VRAM_TYPE_DDR4',
|
||||
'AMDSMI_VRAM_TYPE_GDDR1', 'AMDSMI_VRAM_TYPE_GDDR2',
|
||||
'AMDSMI_VRAM_TYPE_GDDR3', 'AMDSMI_VRAM_TYPE_GDDR4',
|
||||
'AMDSMI_VRAM_TYPE_GDDR5', 'AMDSMI_VRAM_TYPE_GDDR6',
|
||||
'AMDSMI_VRAM_TYPE_GDDR7', 'AMDSMI_VRAM_TYPE_HBM',
|
||||
'AMDSMI_VRAM_TYPE_HBM2', 'AMDSMI_VRAM_TYPE_HBM2E',
|
||||
'AMDSMI_VRAM_TYPE_HBM3', 'AMDSMI_VRAM_TYPE_UNKNOWN',
|
||||
'AMDSMI_VRAM_TYPE__MAX', 'AMDSMI_XGMI_LINK_DISABLE',
|
||||
'AMDSMI_XGMI_LINK_DOWN', 'AMDSMI_XGMI_LINK_UP',
|
||||
'AMDSMI_XGMI_STATUS_ERROR', 'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS',
|
||||
'AMDSMI_XGMI_STATUS_NO_ERRORS', 'CLK_LIMIT_MAX', 'CLK_LIMIT_MIN',
|
||||
'RD_BW0', 'WR_BW0', 'amd_metrics_table_header_t',
|
||||
'amdsmi_accelerator_partition_profile_config_t',
|
||||
|
||||
@@ -100,6 +100,35 @@ uint32_t AMDSmiGPUDevice::get_drm_render_minor() {
|
||||
return this->drm_render_minor_;
|
||||
}
|
||||
|
||||
uint64_t AMDSmiGPUDevice::get_kfd_gpu_id() {
|
||||
std::ostringstream ss;
|
||||
// Should never return not_supported, but just in case
|
||||
rsmi_status_t ret = rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED;
|
||||
uint32_t gpu_index = this->get_gpu_id();
|
||||
rsmi_device_identifiers_t identifiers = rsmi_device_identifiers_t{};
|
||||
ret = rsmi_dev_device_identifiers_get(gpu_index, &identifiers);
|
||||
if (ret != rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
this->kfd_gpu_id_ = std::numeric_limits<uint64_t>::max();
|
||||
} else {
|
||||
this->kfd_gpu_id_ = identifiers.kfd_gpu_id;
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | rsmi_dev_identifiers_get status: " << getRSMIStatusString(ret, false) << "\n"
|
||||
<< " | gpu_id_: " << gpu_id_ << "\n"
|
||||
<< " | identifiers.card_index: " << identifiers.card_index << "\n"
|
||||
<< " | identifiers.drm_render_minor: " << identifiers.drm_render_minor << "\n"
|
||||
<< " | identifiers.bdfid: " << std::hex << "0x" << identifiers.bdfid << "\n"
|
||||
<< " | identifiers.kfd_gpu_id: " << std::dec << identifiers.kfd_gpu_id << "\n"
|
||||
<< " | identifiers.partition_id: " << identifiers.partition_id << "\n"
|
||||
<< " | identifiers.smi_device_id: " << identifiers.smi_device_id << "\n"
|
||||
<< " | returning kfd_gpu_id_: "
|
||||
<< this->kfd_gpu_id_ << std::endl;
|
||||
// std::cout << ss.str();
|
||||
LOG_DEBUG(ss);
|
||||
return this->kfd_gpu_id_;
|
||||
}
|
||||
|
||||
uint32_t AMDSmiGPUDevice::get_gpu_fd() const {
|
||||
return fd_;
|
||||
}
|
||||
@@ -159,7 +188,6 @@ pthread_mutex_t* AMDSmiGPUDevice::get_mutex() {
|
||||
return amd::smi::GetMutex(gpu_id_);
|
||||
}
|
||||
|
||||
|
||||
int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list,
|
||||
ComputeProcessListType_t list_type)
|
||||
{
|
||||
@@ -231,6 +259,41 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
|
||||
// Copy the cu occupancy from rsmi_process_info_t to amdsmi_proc_info_t
|
||||
asmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy;
|
||||
|
||||
// Safely handle KFD file access
|
||||
uint64_t kfd_gpu_id = get_kfd_gpu_id();
|
||||
std::string kfd_path = "/sys/class/kfd/kfd/proc/" +
|
||||
std::to_string(rsmi_proc_info.process_id) +
|
||||
"/vram_" + std::to_string(kfd_gpu_id);
|
||||
|
||||
// Check if the file exists before attempting to open it
|
||||
if (access(kfd_path.c_str(), R_OK) == 0) {
|
||||
std::ifstream kfd_file(kfd_path.c_str());
|
||||
if (kfd_file.is_open()) {
|
||||
std::string line;
|
||||
if (std::getline(kfd_file, line)) {
|
||||
try {
|
||||
uint64_t vram_bytes = std::stoull(line);
|
||||
asmi_proc_info.mem = vram_bytes; // Already in bytes
|
||||
asmi_proc_info.memory_usage.vram_mem = vram_bytes; // Already in bytes
|
||||
} catch (const std::exception& e) {
|
||||
// Handle conversion error gracefully
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | Failed to parse VRAM value from KFD: " << e.what();
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
}
|
||||
kfd_file.close();
|
||||
} else {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | Failed to open KFD file: " << kfd_path;
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
} else {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | KFD file not accessible: " << kfd_path;
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
return status_code;
|
||||
};
|
||||
|
||||
|
||||
@@ -166,26 +166,26 @@ amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid,
|
||||
if (it == pasids.end()) pasids.push_back(pasid);
|
||||
} else if (line.find("drm-memory-gtt:") != std::string::npos) {
|
||||
unsigned long mem;
|
||||
if (sscanf(line.c_str(), "drm-memory-gtt: %lu", &mem) != 1) continue;
|
||||
if (sscanf(line.c_str(), "drm-memory-gtt: %" PRIu32, &mem) != 1) continue;
|
||||
info.mem += mem * 1000;
|
||||
info.memory_usage.gtt_mem += mem * 1000;
|
||||
} else if (line.find("drm-memory-cpu:") != std::string::npos) {
|
||||
unsigned long mem;
|
||||
if (sscanf(line.c_str(), "drm-memory-cpu: %lu", &mem) != 1) continue;
|
||||
if (sscanf(line.c_str(), "drm-memory-cpu: %" PRIu32, &mem) != 1) continue;
|
||||
info.mem += mem * 1000;
|
||||
info.memory_usage.cpu_mem += mem * 1000;
|
||||
} else if (line.find("drm-memory-vram:") != std::string::npos) {
|
||||
unsigned long mem;
|
||||
if (sscanf(line.c_str(), "drm-memory-vram: %lu", &mem) != 1) continue;\
|
||||
if (sscanf(line.c_str(), "drm-memory-vram: %" PRIu32, &mem) != 1) continue;
|
||||
info.mem += mem * 1000;
|
||||
info.memory_usage.vram_mem += mem * 1000;
|
||||
} else if (line.find("drm-engine-gfx") != std::string::npos) {
|
||||
uint64_t engine_gfx;
|
||||
if (sscanf(line.c_str(), "drm-engine-gfx: %lu", &engine_gfx) != 1) continue;
|
||||
if (sscanf(line.c_str(), "drm-engine-gfx: %" PRIu32, &engine_gfx) != 1) continue;
|
||||
info.engine_usage.gfx = engine_gfx;
|
||||
} else if (line.find("drm-engine-enc") != std::string::npos) {
|
||||
uint64_t engine_enc;
|
||||
if (sscanf(line.c_str(), "drm-engine-enc: %lu", &engine_enc) != 1) continue;
|
||||
if (sscanf(line.c_str(), "drm-engine-enc: %" PRIu32, &engine_enc) != 1) continue;
|
||||
info.engine_usage.enc = engine_enc;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user