[SWDEV-488303] Updated CU occupancy for per-process retrieval (#243)

Change-Id: I2990597c6dd4b2e8cf3e11ce60f72049ebdd9a8c
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
This commit is contained in:
Arif, Maisam
2025-05-29 20:35:27 -05:00
committed by GitHub
parent fba62e2270
commit 0fdaebdbaa
12 changed files with 232 additions and 259 deletions
+7 -5
View File
@@ -12,6 +12,11 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
### Changed
- **Added Compute Unit Occupancy information per process**
Measuring compute unit occupancy is currently the best way to determine GFX usage on a per-process basis.
- Added `CU_OCCUPANCY` to `amd-smi process` output.
- Added `CU%` to `amd-smi monitor -q`
- **Expanded Violation Status tracking for GPU metrics 1.8.**
- The driver will no longer support the existing single-value GFX Clk Below Host Limit fields (`acc_gfx_clk_below_host_limit`, `per_gfx_clk_below_host_limit`, `active_gfx_clk_below_host_limit`); they are replaced by new per-XCP/XCC arrays.
- Added new fields to `amdsmi_violation_status_t` and related interfaces for enhanced violation breakdown:
@@ -54,11 +59,8 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
### Resolved issues
- N/A
### Upcoming changes
- N/A
- **Corrected VRAM memory calculation in `amdsmi_get_gpu_process_list`.**
- Previously, the VRAM memory usage reported by `amdsmi_get_gpu_process_list` was inaccurate due to a KB versus KiB mismatch in the calculation.
### Known issues
+38 -18
View File
@@ -3300,8 +3300,21 @@ class AMDSMICommands():
filtered_process_values = []
for process_info in process_list:
process_info['mem_usage'] = process_info.pop('mem')
process_info['usage'] = process_info.pop('engine_usage')
process_info = {
"name": process_info["name"],
"pid": process_info["pid"],
"memory_usage": {
"gtt_mem": process_info["memory_usage"]["gtt_mem"],
"cpu_mem": process_info["memory_usage"]["cpu_mem"],
"vram_mem": process_info["memory_usage"]["vram_mem"],
},
"mem_usage": process_info["mem"],
"usage": {
"gfx": process_info["engine_usage"]["gfx"],
"enc": process_info["engine_usage"]["enc"],
},
"cu_occupancy": process_info["cu_occupancy"]
}
engine_usage_unit = "ns"
memory_usage_unit = "B"
@@ -5714,35 +5727,43 @@ class AMDSMICommands():
logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info())
raise e
try:
num_compute_units = amdsmi_interface.amdsmi_get_gpu_asic_info(args.gpu)['num_compute_units']
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
num_compute_units = "N/A"
logging.debug("Failed to get num compute units for gpu %s | %s", gpu_id, e.get_error_info())
# Clean processes dictionary
filtered_process_values = []
for process_info in process_list:
process_info['mem_usage'] = process_info.pop('mem')
process_info['usage'] = process_info.pop('engine_usage')
process_info.pop('mem') # Remove 'mem' value
process_info.pop('engine_usage') # Remove 'engine_usage' value
engine_usage_unit = "ns"
memory_usage_unit = "B"
if self.logger.is_human_readable_format():
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
for usage_metric in process_info['memory_usage']:
process_info["memory_usage"][usage_metric] = self.helpers.convert_bytes_to_readable(process_info["memory_usage"][usage_metric])
memory_usage_unit = ""
process_info['mem_usage'] = self.helpers.unit_format(self.logger,
process_info['mem_usage'],
memory_usage_unit)
for usage_metric in process_info['usage']:
process_info['usage'][usage_metric] = self.helpers.unit_format(self.logger,
process_info['usage'][usage_metric],
engine_usage_unit)
for usage_metric in process_info['memory_usage']:
process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger,
process_info['memory_usage'][usage_metric],
memory_usage_unit)
if 'cu_occupancy' in process_info:
try:
cu_occupancy = process_info['cu_occupancy']
if num_compute_units != "N/A" and num_compute_units > 0:
cu_percentage = round((cu_occupancy / num_compute_units) * 100, 1)
process_info['cu_occupancy'] = self.helpers.unit_format(self.logger,
cu_percentage,
'%')
else:
process_info['cu_occupancy'] = "N/A"
except Exception as e:
process_info['cu_occupancy'] = "N/A"
logging.debug("Failed to calculate cu_occupancy percentage for GPU %s | %s", gpu_id, str(e))
filtered_process_values.append({'process_info': process_info})
# If no processes are populated then we populate an N/A placeholder
@@ -5757,8 +5778,7 @@ class AMDSMICommands():
# Build the process table's title and header
self.logger.secondary_table_title = "PROCESS INFO"
self.logger.secondary_table_header = 'GPU'.rjust(3) + "NAME".rjust(22) + "PID".rjust(9) + "GTT_MEM".rjust(10) + \
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USAGE".rjust(11) + \
"GFX".rjust(8) + "ENC".rjust(8)
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "CU%".rjust(9)
if watching_output:
self.logger.secondary_table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.secondary_table_header
+4 -21
View File
@@ -214,9 +214,9 @@ class AMDSMILogger():
if process_dict['process_info'] == "No running processes detected":
# Add N/A for empty process_info
table_values += "N/A".rjust(20) + "N/A".rjust(9) + "N/A".rjust(10) + \
"N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(11) + \
"N/A".rjust(8) + "N/A".rjust(8) + '\n'
"N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(9) + '\n'
else:
# TODO: Fix this here
for process_key, process_value in process_dict['process_info'].items():
string_process_value = str(process_value)
if process_key == "name":
@@ -230,11 +230,8 @@ class AMDSMILogger():
elif process_key == "memory_usage":
for memory_key, memory_value in process_value.items():
table_values += str(memory_value).rjust(10)
elif process_key == "mem_usage":
table_values += string_process_value.rjust(11)
elif process_key == "usage":
for usage_key, usage_value in process_value.items():
table_values += str(usage_value).rjust(8)
elif process_key == "cu_occupancy":
table_values += string_process_value.rjust(9)
# Add the stored gpu and stored timestamp to the next line
table_values += '\n'
if stored_timestamp:
@@ -486,20 +483,6 @@ class AMDSMILogger():
raise amdsmi_cli_exceptions(self, "Invalid output format given, only json, csv, and human_readable supported")
def _store_output_rocmsmi(self, gpu_id, argument, data):
if self.is_json_format():
# put output into self.json_output
pass
elif self.is_csv_format():
# put output into self.csv_output
pass
elif self.is_human_readable_format():
# put output into self.human_readable_output
pass
else:
raise amdsmi_cli_exceptions(self, "Invalid output format given, only json, csv, and human_readable supported")
def store_multiple_device_output(self):
""" Store the current output into the multiple_device_output
then clear the current output
+1 -1
View File
@@ -1093,7 +1093,6 @@ except AmdSmiException as e:
print(e)
```
### amdsmi_get_gpu_process_list
Description: Returns the list of processes running on the target GPU. Root-level access is required to display root process names; otherwise "N/A" is returned instead.
@@ -1111,6 +1110,7 @@ Field | Description
`mem` | Process memory usage
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>
`cu_occupancy` | Number of Compute Units utilized
Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
+12 -8
View File
@@ -817,6 +817,7 @@ int main() {
amdsmi_proc_info_t process = {};
uint64_t mem = 0, gtt_mem = 0, cpu_mem = 0, vram_mem = 0;
uint64_t gfx = 0, enc = 0;
uint32_t cu_occupancy = 0;
char bdf_str[20];
sprintf(bdf_str, "%04" PRIx64 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32,
static_cast<uint64_t>(bdf.domain_number),
@@ -837,7 +838,7 @@ int main() {
printf(
"| pid | name | user | gpu bdf | "
"fb usage | gtt memory | cpu memory | vram memory | "
"engine usage (ns) |\n");
"engine usage (ns) | cu occupancy |\n");
printf("| | | | "
"| | | | "
" | gfx enc |\n");
@@ -855,30 +856,34 @@ int main() {
pwd = getpwuid(st.st_uid);
if (!pwd)
printf("| %5d | %16s | %10d | %s | %7ld KiB | %7ld KiB "
"| %7ld KiB | %7ld KiB | %lu %lu |\n",
"| %7ld KiB | %7ld KiB | %lu %lu | %u |\n",
process_info_list[it].pid, process_info_list[it].name, st.st_uid,
bdf_str, process_info_list[it].mem / 1024,
process_info_list[it].memory_usage.gtt_mem / 1024,
process_info_list[it].memory_usage.cpu_mem / 1024,
process_info_list[it].memory_usage.vram_mem / 1024,
process_info_list[it].engine_usage.gfx,
process_info_list[it].engine_usage.enc);
process_info_list[it].engine_usage.enc,
process_info_list[it].cu_occupancy);
else
printf("| %5d | %16s | %10s | %s | %7ld KiB | %7ld KiB "
"| %7ld KiB | %7ld KiB | %lu %lu |\n",
"| %7ld KiB | %7ld KiB | %lu %lu | %u |\n",
process_info_list[it].pid, process_info_list[it].name,
pwd->pw_name, bdf_str, process_info_list[it].mem / 1024,
process_info_list[it].memory_usage.gtt_mem / 1024,
process_info_list[it].memory_usage.cpu_mem / 1024,
process_info_list[it].memory_usage.vram_mem / 1024,
process_info_list[it].engine_usage.gfx,
process_info_list[it].engine_usage.enc);
process_info_list[it].engine_usage.enc,
process_info_list[it].cu_occupancy);
mem += process_info_list[it].mem / 1024;
gtt_mem += process_info_list[it].memory_usage.gtt_mem / 1024;
cpu_mem += process_info_list[it].memory_usage.cpu_mem / 1024;
vram_mem += process_info_list[it].memory_usage.vram_mem / 1024;
gfx = process_info_list[it].engine_usage.gfx;
enc = process_info_list[it].engine_usage.enc;
cu_occupancy = process_info_list[it].cu_occupancy;
printf(
"+-------+------------------+------------+-------------"
"-+-------------+-------------+-------------+----------"
@@ -887,10 +892,9 @@ int main() {
// TODO: To remove compiler warning, the last 3 values in this printf were
// set to 0L. Need to find out what these values need to be.
printf("| TOTAL:| %s | %7ld "
"KiB | %7ld KiB | %7ld KiB | %7ld KiB | %lu %lu "
"%lu %lu %lu |\n",
"KiB | %7ld KiB | %7ld KiB | %7ld KiB | %lu %lu | %u |\n",
bdf_str, mem, gtt_mem, cpu_mem, vram_mem, gfx,
enc, 0L, 0L, 0L);
enc, cu_occupancy, 0L);
printf("+=======+==================+============+=============="
"+=============+=============+=============+============"
"=+==========================================+\n");
+2 -1
View File
@@ -1093,7 +1093,8 @@ typedef struct {
uint32_t reserved[10];
} memory_usage; //!< in bytes
char container_name[AMDSMI_MAX_STRING_LENGTH];
uint32_t reserved[12];
uint32_t cu_occupancy; //!< Num CUs utilized
uint32_t reserved[11];
} amdsmi_proc_info_t;
/**
+1
View File
@@ -2691,6 +2691,7 @@ def amdsmi_get_gpu_process_list(
"cpu_mem": process_list[index].memory_usage.cpu_mem,
"vram_mem": process_list[index].memory_usage.vram_mem,
},
"cu_occupancy": process_list[index].cu_occupancy
})
return result
+2 -1
View File
@@ -1289,7 +1289,8 @@ struct_amdsmi_proc_info_t._fields_ = [
('engine_usage', struct_engine_usage_),
('memory_usage', struct_memory_usage_),
('container_name', ctypes.c_char * 256),
('reserved', ctypes.c_uint32 * 12),
('cu_occupancy', ctypes.c_uint32),
('PADDING_1', ctypes.c_ubyte * 4),
]
amdsmi_proc_info_t = struct_amdsmi_proc_info_t
+1 -10
View File
@@ -456,7 +456,6 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
proc->sdma_usage = 0;
proc->cu_occupancy = 0;
uint32_t cu_count = 0;
static amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
static std::map<uint64_t, std::shared_ptr<KFDNode>>& kfd_node_map =
smi.kfd_node_map();
@@ -510,23 +509,15 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
}
else if(sysfs_data_errcode==0){
// Update CU usage by the process
proc->cu_occupancy += std::stoi(tmp);
// Collect count of compute units
cu_count += kfd_node_map[gpu_id]->cu_count();
proc->cu_occupancy = std::stoi(tmp);
}
else {
// Some GFX revisions do not provide cu_occupancy debugfs method
// which may cause ENOENT
proc->cu_occupancy = CU_OCCUPANCY_INVALID;
cu_count = 0;
}
}
// Adjust CU occupancy to percent.
if (cu_count > 0) {
proc->cu_occupancy = ((proc->cu_occupancy * 100) / cu_count);
}
return 0;
}
+6
View File
@@ -216,6 +216,9 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
/**
* Complete the process information
* This is where we copy rsmi_process_info_t into the larger amdsmi_proc_info_t
* Then populate the remaining fields with the gpuvsmi_get_pid_info()
* TODO FIX HERE TO GRAB KFD VRAM if /proc is inconsistent
*/
auto get_process_info = [&](const rsmi_process_info_t& rsmi_proc_info, amdsmi_proc_info_t& asmi_proc_info) {
auto status_code = gpuvsmi_get_pid_info(get_bdf(), rsmi_proc_info.process_id, asmi_proc_info);
@@ -225,6 +228,9 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
asmi_proc_info.memory_usage.vram_mem = rsmi_proc_info.vram_usage;
}
// Copy the cu occupancy from rsmi_process_info_t to amdsmi_proc_info_t
asmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy;
return status_code;
};
+157 -190
View File
@@ -20,16 +20,17 @@
* THE SOFTWARE.
*/
#include <sys/types.h>
#include <dirent.h>
#include <inttypes.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <memory>
#include <vector>
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <algorithm>
#include <string.h>
#include <memory>
#include <vector>
#include "amd_smi/amdsmi.h"
#include "amd_smi/impl/amd_smi_utils.h"
@@ -37,230 +38,196 @@
extern "C" {
static const char *container_type_name[AMDSMI_MAX_CONTAINER_TYPE] = {
[AMDSMI_CONTAINER_LXC] = "lxc",
[AMDSMI_CONTAINER_DOCKER] = "docker",
[AMDSMI_CONTAINER_LXC] = "lxc",
[AMDSMI_CONTAINER_DOCKER] = "docker",
};
amdsmi_status_t gpuvsmi_pid_is_gpu(const std::string &path, const char *bdf)
{
DIR *d;
struct dirent *dir;
amdsmi_status_t gpuvsmi_pid_is_gpu(const std::string &path, const char *bdf) {
DIR *d;
struct dirent *dir;
d = opendir(path.c_str());
if (!d)
return AMDSMI_STATUS_NO_PERM;
d = opendir(path.c_str());
if (!d) return AMDSMI_STATUS_NO_PERM;
/* iterate through all the fds, try to find
* a match for the GPU bdf
*/
while ((dir = readdir(d)) != NULL) {
std::string file = path + dir->d_name;
std::ifstream fdinfo(file.c_str());
for (std::string line; std::getline(fdinfo, line);) {
if (line.find(bdf) != std::string::npos) {
closedir(d);
return AMDSMI_STATUS_SUCCESS;
}
}
}
/* iterate through all the fds, try to find
* a match for the GPU bdf
*/
while ((dir = readdir(d)) != NULL) {
std::string file = path + dir->d_name;
std::ifstream fdinfo(file.c_str());
for (std::string line; std::getline(fdinfo, line);) {
if (line.find(bdf) != std::string::npos) {
closedir(d);
return AMDSMI_STATUS_SUCCESS;
}
}
}
closedir(d);
closedir(d);
return AMDSMI_STATUS_NOT_FOUND;
return AMDSMI_STATUS_NOT_FOUND;
}
amdsmi_status_t gpuvsmi_get_pids(const amdsmi_bdf_t &bdf, std::vector<long int> &pids, uint64_t *size)
{
char bdf_str[13];
DIR *d;
struct dirent *dir;
amdsmi_status_t gpuvsmi_get_pids(const amdsmi_bdf_t &bdf, std::vector<long int> &pids,
uint64_t *size) {
char bdf_str[13];
DIR *d;
struct dirent *dir;
/* 0000:00:00.0 */
snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32,
static_cast<uint32_t>(bdf.domain_number & 0xffff),
static_cast<uint32_t>(bdf.bus_number & 0xff),
static_cast<uint32_t>(bdf.device_number & 0x1f),
static_cast<uint32_t>(bdf.function_number & 0x7));
/* 0000:00:00.0 */
snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32,
static_cast<uint32_t>(bdf.domain_number & 0xffff),
static_cast<uint32_t>(bdf.bus_number & 0xff),
static_cast<uint32_t>(bdf.device_number & 0x1f),
static_cast<uint32_t>(bdf.function_number & 0x7));
d = opendir("/proc");
if (!d)
return AMDSMI_STATUS_NO_PERM;
d = opendir("/proc");
if (!d) return AMDSMI_STATUS_NO_PERM;
pids.clear();
/* Find the pid folders in /proc/ that we have access to */
while ((dir = readdir(d)) != NULL) {
if (dir->d_type == DT_DIR) {
/* Try to parse the name of the folder as a
* number; if that fails, it is not a pid folder */
char *p;
long int pid;
pids.clear();
/* Find the pid folders in /proc/ that we have access to */
while ((dir = readdir(d)) != NULL) {
if (dir->d_type == DT_DIR) {
/* Try to parse the name of the folder as a
* number; if that fails, it is not a pid folder */
char *p;
long int pid;
pid = strtol(dir->d_name, &p, 10);
if (*p != 0)
continue;
pid = strtol(dir->d_name, &p, 10);
if (*p != 0) continue;
/* Check if fdinfo is accessible */
std::string path = "/proc/" + std::string(dir->d_name) + "/fdinfo/";
/* Check if fdinfo is accessible */
std::string path = "/proc/" + std::string(dir->d_name) + "/fdinfo/";
if (access(path.c_str(), R_OK))
continue;
if (access(path.c_str(), R_OK)) continue;
/* check if GPU is present */
if (gpuvsmi_pid_is_gpu(path, bdf_str))
continue;
pids.push_back(pid);
}
}
closedir(d);
/* check if GPU is present */
if (gpuvsmi_pid_is_gpu(path, bdf_str)) continue;
pids.push_back(pid);
}
}
closedir(d);
*size = pids.size();
return AMDSMI_STATUS_SUCCESS;
*size = pids.size();
return AMDSMI_STATUS_SUCCESS;
}
amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid,
amdsmi_proc_info_t &info)
{
char bdf_str[13];
DIR *d;
struct dirent *dir;
amdsmi_proc_info_t &info) {
char bdf_str[13];
DIR *d;
struct dirent *dir;
/* 0000:00:00.0 */
snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32,
static_cast<uint32_t>(bdf.domain_number & 0xffff),
static_cast<uint32_t>(bdf.bus_number & 0xff),
static_cast<uint32_t>(bdf.device_number & 0x1f),
static_cast<uint32_t>(bdf.function_number & 0x7));
/* 0000:00:00.0 */
snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32,
static_cast<uint32_t>(bdf.domain_number & 0xffff),
static_cast<uint32_t>(bdf.bus_number & 0xff),
static_cast<uint32_t>(bdf.device_number & 0x1f),
static_cast<uint32_t>(bdf.function_number & 0x7));
std::string path = "/proc/" + std::to_string(pid) + "/fdinfo/";
std::string name_path = "/proc/" + std::to_string(pid) + "/comm";
std::string cgroup_path = "/proc/" + std::to_string(pid) + "/cgroup";
std::string path = "/proc/" + std::to_string(pid) + "/fdinfo/";
std::string name_path = "/proc/" + std::to_string(pid) + "/comm";
std::string cgroup_path = "/proc/" + std::to_string(pid) + "/cgroup";
if (gpuvsmi_pid_is_gpu(path.c_str(), bdf_str)) {
return AMDSMI_STATUS_INVAL;
}
if (gpuvsmi_pid_is_gpu(path.c_str(), bdf_str)) {
return AMDSMI_STATUS_INVAL;
}
d = opendir(path.c_str());
if (!d)
return AMDSMI_STATUS_NO_PERM;
d = opendir(path.c_str());
if (!d) return AMDSMI_STATUS_NO_PERM;
/* Vector used to check for repeated fd pasids */
// TODO remove pasid Not working in ROCm 6.4+, deprecating in 7.0
std::vector<int> pasids;
/* Vector used to check for repeated fd pasids */
// TODO remove pasid Not working in ROCm 6.4+, deprecating in 7.0
std::vector<int> pasids;
memset(&info, 0, sizeof(info));
/* Iterate through all fdinfos */
while ((dir = readdir(d)) != NULL) {
memset(&info, 0, sizeof(info));
/* Iterate through all fdinfos */
while ((dir = readdir(d)) != NULL) {
std::string file = path + dir->d_name;
std::ifstream fdinfo(file.c_str());
std::string file = path + dir->d_name;
std::ifstream fdinfo(file.c_str());
for (std::string bdfline; getline(fdinfo, bdfline);) {
if (bdfline.find("drm-pdev:") != std::string::npos) {
char fd_bdf_str[13];
for (std::string bdfline; getline(fdinfo, bdfline);) {
if (bdfline.find("drm-pdev:") != std::string::npos) {
char fd_bdf_str[13];
/* Only check against fdinfo files that contain a bdf */
if (sscanf(bdfline.c_str(), "drm-pdev: %s", &fd_bdf_str[0]) != 1) continue;
/* Only check against fdinfo files that contain a bdf */
if (sscanf(bdfline.c_str(), "drm-pdev: %s", &fd_bdf_str[0]) != 1)
continue;
/* Populate amdsmi_proc_info_t struct only if the bdf in
* the fdinfo file matches the passed bdf */
if (strncmp(bdf_str, fd_bdf_str, 13) == 0) {
std::ifstream fdinfo(file.c_str());
/* Populate amdsmi_proc_info_t struct only if the bdf in
* the fdinfo file matches the passed bdf */
if (strncmp(bdf_str, fd_bdf_str, 13) == 0){
std::ifstream fdinfo(file.c_str());
for (std::string line; getline(fdinfo, line);) {
if (line.find("pasid:") != std::string::npos) {
int pasid;
if (sscanf(line.c_str(), "pasid: %d", &pasid) != 1) continue;
auto it = std::find(pasids.begin(), pasids.end(), pasid);
if (it == pasids.end()) pasids.push_back(pasid);
} else if (line.find("drm-memory-gtt:") != std::string::npos) {
unsigned long mem;
if (sscanf(line.c_str(), "drm-memory-gtt: %lu", &mem) != 1) continue;
info.mem += mem * 1000;
info.memory_usage.gtt_mem += mem * 1000;
} else if (line.find("drm-memory-cpu:") != std::string::npos) {
unsigned long mem;
if (sscanf(line.c_str(), "drm-memory-cpu: %lu", &mem) != 1) continue;
info.mem += mem * 1000;
info.memory_usage.cpu_mem += mem * 1000;
} else if (line.find("drm-memory-vram:") != std::string::npos) {
unsigned long mem;
if (sscanf(line.c_str(), "drm-memory-vram: %lu", &mem) != 1) continue;\
info.mem += mem * 1000;
info.memory_usage.vram_mem += mem * 1000;
} else if (line.find("drm-engine-gfx") != std::string::npos) {
uint64_t engine_gfx;
if (sscanf(line.c_str(), "drm-engine-gfx: %lu", &engine_gfx) != 1) continue;
info.engine_usage.gfx = engine_gfx;
} else if (line.find("drm-engine-enc") != std::string::npos) {
uint64_t engine_enc;
if (sscanf(line.c_str(), "drm-engine-enc: %lu", &engine_enc) != 1) continue;
info.engine_usage.enc = engine_enc;
}
}
}
}
}
}
for (std::string line; getline(fdinfo, line);) {
if (line.find("pasid:") != std::string::npos) {
int pasid;
if (sscanf(line.c_str(), "pasid: %d", &pasid) != 1)
continue;
auto it = std::find(pasids.begin(), pasids.end(), pasid);
if (it == pasids.end())
pasids.push_back(pasid);
} else if (line.find("drm-memory-gtt:") != std::string::npos) {
unsigned long mem;
if (sscanf(line.c_str(), "drm-memory-gtt: %lu", &mem) != 1)
continue;
info.mem += mem * 1024;
info.memory_usage.gtt_mem += mem * 1024;
} else if (line.find("drm-memory-cpu:") != std::string::npos) {
unsigned long mem;
if (sscanf(line.c_str(), "drm-memory-cpu: %lu", &mem) != 1)
continue;
info.mem += mem * 1024;
info.memory_usage.cpu_mem += mem * 1024;
} else if (line.find("drm-memory-vram:") != std::string::npos) {
unsigned long mem;
if (sscanf(line.c_str(), "drm-memory-vram: %lu", &mem) != 1)
continue;
info.mem += mem * 1024;
info.memory_usage.vram_mem += mem * 1024;
} else if (line.find("drm-engine-gfx") != std::string::npos) {
uint64_t engine_gfx;
if (sscanf(line.c_str(), "drm-engine-gfx: %lu", &engine_gfx) != 1)
continue;
info.engine_usage.gfx = engine_gfx;
} else if (line.find("drm-engine-enc") != std::string::npos) {
uint64_t engine_enc;
if (sscanf(line.c_str(), "drm-engine-enc: %lu", &engine_enc) != 1)
continue;
info.engine_usage.enc = engine_enc;
}
}
}
}
}
}
closedir(d);
closedir(d);
// Note: If possible at all, try to get the name of the process/container.
// In case the other info fail, get at least something.
std::ifstream filename(name_path.c_str());
std::string name;
std::ifstream filename(name_path.c_str());
std::string name;
getline(filename, name);
getline(filename, name);
if (name.empty())
return AMDSMI_STATUS_API_FAILED;
if (name.empty()) return AMDSMI_STATUS_API_FAILED;
strncpy(info.name, name.c_str(), std::min(
(unsigned long) AMDSMI_MAX_STRING_LENGTH,
name.length()));
strncpy(info.name, name.c_str(),
std::min((unsigned long)AMDSMI_MAX_STRING_LENGTH, name.length()));
for (int i = 0; i < AMDSMI_MAX_CONTAINER_TYPE; i++) {
std::ifstream cgroup_info(cgroup_path.c_str());
std::string container_id;
for (std::string line; getline(cgroup_info, line);) {
if (line.find(container_type_name[i]) != std::string::npos) {
container_id = line.substr(line.find(container_type_name[i]) +
strlen(container_type_name[i]) + 1, 16);
strcpy(info.container_name, container_id.c_str());
break;
}
}
if (strlen(info.container_name) > 0)
break;
}
info.pid = (uint32_t)pid;
for (int i = 0; i < AMDSMI_MAX_CONTAINER_TYPE; i++) {
std::ifstream cgroup_info(cgroup_path.c_str());
std::string container_id;
for (std::string line; getline(cgroup_info, line);) {
if (line.find(container_type_name[i]) != std::string::npos) {
container_id = line.substr(line.find(container_type_name[i]) +
strlen(container_type_name[i]) + 1, 16);
strcpy(info.container_name, container_id.c_str());
break;
}
}
if (strlen(info.container_name) > 0) break;
}
info.pid = (uint32_t)pid;
if (!pasids.size()) {
return AMDSMI_STATUS_NOT_FOUND;
if (!pasids.size()) {
return AMDSMI_STATUS_NOT_FOUND;
}
return AMDSMI_STATUS_SUCCESS;
return AMDSMI_STATUS_SUCCESS;
}
} // extern "C"
} // extern "C"
+1 -4
View File
@@ -44,15 +44,12 @@ try:
from amdsmi_logger import AMDSMILogger
from amdsmi_parser import AMDSMIParser
import amdsmi_cli_exceptions
helpers = AMDSMIHelpers()
except ImportError as e:
print(f"Failed to import amdsmi cli libs: {e}")
print("Ensure that you have installed amdsmi's package.")
helpers = AMDSMIHelpers()
# Make exit & quit work without parens because it's annoying
type(exit).__repr__ = sys.exit