[SWDEV-488303] Updated CU occupancy for per-process retrieval (#243)
Change-Id: I2990597c6dd4b2e8cf3e11ce60f72049ebdd9a8c
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
[ROCm/amdsmi commit: 0fdaebdbaa]
此提交包含在:
@@ -12,6 +12,11 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
|
||||
### Changed
|
||||
|
||||
- **Added Compute Unit Occupancy information per process**
|
||||
Measuring compute units is currently the best way to determine gfx usage on a per-process basis
|
||||
- Added `CU_OCCUPANCY` to `amd-smi process` output.
|
||||
- Added `CU%` to `amd-smi monitor -q`
|
||||
|
||||
- **Expanded Violation Status tracking for GPU metrics 1.8.**
|
||||
- The driver no longer supports the existing single-value GFX Clk Below Host Limit fields (`acc_gfx_clk_below_host_limit`, `per_gfx_clk_below_host_limit`, `active_gfx_clk_below_host_limit`); they have been replaced by new per-XCP/XCC arrays.
|
||||
- Added new fields to `amdsmi_violation_status_t` and related interfaces for enhanced violation breakdown:
|
||||
@@ -54,11 +59,8 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
|
||||
### Resolved issues
|
||||
|
||||
- N/A
|
||||
|
||||
### Upcoming changes
|
||||
|
||||
- N/A
|
||||
- **Corrected VRAM memory calculation in `amdsmi_get_gpu_process_list`.**
|
||||
- Previously, the VRAM memory usage reported by `amdsmi_get_gpu_process_list` was inaccurate because the calculation mixed up KB and KiB units.
|
||||
|
||||
### Known issues
|
||||
|
||||
|
||||
@@ -3300,8 +3300,21 @@ class AMDSMICommands():
|
||||
|
||||
filtered_process_values = []
|
||||
for process_info in process_list:
|
||||
process_info['mem_usage'] = process_info.pop('mem')
|
||||
process_info['usage'] = process_info.pop('engine_usage')
|
||||
process_info = {
|
||||
"name": process_info["name"],
|
||||
"pid": process_info["pid"],
|
||||
"memory_usage": {
|
||||
"gtt_mem": process_info["memory_usage"]["gtt_mem"],
|
||||
"cpu_mem": process_info["memory_usage"]["cpu_mem"],
|
||||
"vram_mem": process_info["memory_usage"]["vram_mem"],
|
||||
},
|
||||
"mem_usage": process_info["mem"],
|
||||
"usage": {
|
||||
"gfx": process_info["engine_usage"]["gfx"],
|
||||
"enc": process_info["engine_usage"]["enc"],
|
||||
},
|
||||
"cu_occupancy": process_info["cu_occupancy"]
|
||||
}
|
||||
|
||||
engine_usage_unit = "ns"
|
||||
memory_usage_unit = "B"
|
||||
@@ -5714,35 +5727,43 @@ class AMDSMICommands():
|
||||
logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
raise e
|
||||
|
||||
try:
|
||||
num_compute_units = amdsmi_interface.amdsmi_get_gpu_asic_info(args.gpu)['num_compute_units']
|
||||
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
|
||||
num_compute_units = "N/A"
|
||||
logging.debug("Failed to get num compute units for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
# Clean processes dictionary
|
||||
filtered_process_values = []
|
||||
for process_info in process_list:
|
||||
process_info['mem_usage'] = process_info.pop('mem')
|
||||
process_info['usage'] = process_info.pop('engine_usage')
|
||||
process_info.pop('mem') # Remove 'mem' value
|
||||
process_info.pop('engine_usage') # Remove 'engine_usage' value
|
||||
|
||||
engine_usage_unit = "ns"
|
||||
memory_usage_unit = "B"
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
|
||||
for usage_metric in process_info['memory_usage']:
|
||||
process_info["memory_usage"][usage_metric] = self.helpers.convert_bytes_to_readable(process_info["memory_usage"][usage_metric])
|
||||
memory_usage_unit = ""
|
||||
|
||||
process_info['mem_usage'] = self.helpers.unit_format(self.logger,
|
||||
process_info['mem_usage'],
|
||||
memory_usage_unit)
|
||||
|
||||
for usage_metric in process_info['usage']:
|
||||
process_info['usage'][usage_metric] = self.helpers.unit_format(self.logger,
|
||||
process_info['usage'][usage_metric],
|
||||
engine_usage_unit)
|
||||
|
||||
for usage_metric in process_info['memory_usage']:
|
||||
process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger,
|
||||
process_info['memory_usage'][usage_metric],
|
||||
memory_usage_unit)
|
||||
|
||||
if 'cu_occupancy' in process_info:
|
||||
try:
|
||||
cu_occupancy = process_info['cu_occupancy']
|
||||
if num_compute_units != "N/A" and num_compute_units > 0:
|
||||
cu_percentage = round((cu_occupancy / num_compute_units) * 100, 1)
|
||||
process_info['cu_occupancy'] = self.helpers.unit_format(self.logger,
|
||||
cu_percentage,
|
||||
'%')
|
||||
else:
|
||||
process_info['cu_occupancy'] = "N/A"
|
||||
except Exception as e:
|
||||
process_info['cu_occupancy'] = "N/A"
|
||||
logging.debug("Failed to calculate cu_occupancy percentage for GPU %s | %s", gpu_id, str(e))
|
||||
|
||||
filtered_process_values.append({'process_info': process_info})
|
||||
|
||||
# If no processes are populated then we populate an N/A placeholder
|
||||
@@ -5757,8 +5778,7 @@ class AMDSMICommands():
|
||||
# Build the process table's title and header
|
||||
self.logger.secondary_table_title = "PROCESS INFO"
|
||||
self.logger.secondary_table_header = 'GPU'.rjust(3) + "NAME".rjust(22) + "PID".rjust(9) + "GTT_MEM".rjust(10) + \
|
||||
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USAGE".rjust(11) + \
|
||||
"GFX".rjust(8) + "ENC".rjust(8)
|
||||
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "CU%".rjust(9)
|
||||
|
||||
if watching_output:
|
||||
self.logger.secondary_table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.secondary_table_header
|
||||
|
||||
@@ -214,9 +214,9 @@ class AMDSMILogger():
|
||||
if process_dict['process_info'] == "No running processes detected":
|
||||
# Add N/A for empty process_info
|
||||
table_values += "N/A".rjust(20) + "N/A".rjust(9) + "N/A".rjust(10) + \
|
||||
"N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(11) + \
|
||||
"N/A".rjust(8) + "N/A".rjust(8) + '\n'
|
||||
"N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(9) + '\n'
|
||||
else:
|
||||
#Fix this herre
|
||||
for process_key, process_value in process_dict['process_info'].items():
|
||||
string_process_value = str(process_value)
|
||||
if process_key == "name":
|
||||
@@ -230,11 +230,8 @@ class AMDSMILogger():
|
||||
elif process_key == "memory_usage":
|
||||
for memory_key, memory_value in process_value.items():
|
||||
table_values += str(memory_value).rjust(10)
|
||||
elif process_key == "mem_usage":
|
||||
table_values += string_process_value.rjust(11)
|
||||
elif process_key == "usage":
|
||||
for usage_key, usage_value in process_value.items():
|
||||
table_values += str(usage_value).rjust(8)
|
||||
elif process_key == "cu_occupancy":
|
||||
table_values += string_process_value.rjust(9)
|
||||
# Add the stored gpu and stored timestamp to the next line
|
||||
table_values += '\n'
|
||||
if stored_timestamp:
|
||||
@@ -486,20 +483,6 @@ class AMDSMILogger():
|
||||
raise amdsmi_cli_exceptions(self, "Invalid output format given, only json, csv, and human_readable supported")
|
||||
|
||||
|
||||
def _store_output_rocmsmi(self, gpu_id, argument, data):
|
||||
if self.is_json_format():
|
||||
# put output into self.json_output
|
||||
pass
|
||||
elif self.is_csv_format():
|
||||
# put output into self.csv_output
|
||||
pass
|
||||
elif self.is_human_readable_format():
|
||||
# put output into self.human_readable_output
|
||||
pass
|
||||
else:
|
||||
raise amdsmi_cli_exceptions(self, "Invalid output format given, only json, csv, and human_readable supported")
|
||||
|
||||
|
||||
def store_multiple_device_output(self):
|
||||
""" Store the current output into the multiple_device_output
|
||||
then clear the current output
|
||||
|
||||
@@ -1093,7 +1093,6 @@ except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
|
||||
### amdsmi_get_gpu_process_list
|
||||
|
||||
Description: Returns the list of processes running on the target GPU. Root-level access is required to display root process names; otherwise "N/A" is returned for them.
|
||||
@@ -1111,6 +1110,7 @@ Field | Description
|
||||
`mem` | Process memory usage
|
||||
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
|
||||
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>
|
||||
`cu_occupancy` | Number of Compute Units utilized
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
|
||||
|
||||
|
||||
@@ -817,6 +817,7 @@ int main() {
|
||||
amdsmi_proc_info_t process = {};
|
||||
uint64_t mem = 0, gtt_mem = 0, cpu_mem = 0, vram_mem = 0;
|
||||
uint64_t gfx = 0, enc = 0;
|
||||
uint32_t cu_occupancy = 0;
|
||||
char bdf_str[20];
|
||||
sprintf(bdf_str, "%04" PRIx64 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32,
|
||||
static_cast<uint64_t>(bdf.domain_number),
|
||||
@@ -837,7 +838,7 @@ int main() {
|
||||
printf(
|
||||
"| pid | name | user | gpu bdf | "
|
||||
"fb usage | gtt memory | cpu memory | vram memory | "
|
||||
"engine usage (ns) |\n");
|
||||
"engine usage (ns) | cu occupancy |\n");
|
||||
printf("| | | | "
|
||||
"| | | | "
|
||||
" | gfx enc |\n");
|
||||
@@ -855,30 +856,34 @@ int main() {
|
||||
pwd = getpwuid(st.st_uid);
|
||||
if (!pwd)
|
||||
printf("| %5d | %16s | %10d | %s | %7ld KiB | %7ld KiB "
|
||||
"| %7ld KiB | %7ld KiB | %lu %lu |\n",
|
||||
"| %7ld KiB | %7ld KiB | %lu %lu | %u |\n",
|
||||
process_info_list[it].pid, process_info_list[it].name, st.st_uid,
|
||||
bdf_str, process_info_list[it].mem / 1024,
|
||||
process_info_list[it].memory_usage.gtt_mem / 1024,
|
||||
process_info_list[it].memory_usage.cpu_mem / 1024,
|
||||
process_info_list[it].memory_usage.vram_mem / 1024,
|
||||
process_info_list[it].engine_usage.gfx,
|
||||
process_info_list[it].engine_usage.enc);
|
||||
process_info_list[it].engine_usage.enc,
|
||||
process_info_list[it].cu_occupancy);
|
||||
else
|
||||
printf("| %5d | %16s | %10s | %s | %7ld KiB | %7ld KiB "
|
||||
"| %7ld KiB | %7ld KiB | %lu %lu |\n",
|
||||
"| %7ld KiB | %7ld KiB | %lu %lu | %u |\n",
|
||||
process_info_list[it].pid, process_info_list[it].name,
|
||||
pwd->pw_name, bdf_str, process_info_list[it].mem / 1024,
|
||||
process_info_list[it].memory_usage.gtt_mem / 1024,
|
||||
process_info_list[it].memory_usage.cpu_mem / 1024,
|
||||
process_info_list[it].memory_usage.vram_mem / 1024,
|
||||
process_info_list[it].engine_usage.gfx,
|
||||
process_info_list[it].engine_usage.enc);
|
||||
process_info_list[it].engine_usage.enc,
|
||||
process_info_list[it].cu_occupancy);
|
||||
|
||||
mem += process_info_list[it].mem / 1024;
|
||||
gtt_mem += process_info_list[it].memory_usage.gtt_mem / 1024;
|
||||
cpu_mem += process_info_list[it].memory_usage.cpu_mem / 1024;
|
||||
vram_mem += process_info_list[it].memory_usage.vram_mem / 1024;
|
||||
gfx = process_info_list[it].engine_usage.gfx;
|
||||
enc = process_info_list[it].engine_usage.enc;
|
||||
cu_occupancy = process_info_list[it].cu_occupancy;
|
||||
printf(
|
||||
"+-------+------------------+------------+-------------"
|
||||
"-+-------------+-------------+-------------+----------"
|
||||
@@ -887,10 +892,9 @@ int main() {
|
||||
// TODO: To remove compiler warning, the last 3 values in this printf were
|
||||
// set to 0L. Need to find out what these values need to be.
|
||||
printf("| TOTAL:| %s | %7ld "
|
||||
"KiB | %7ld KiB | %7ld KiB | %7ld KiB | %lu %lu "
|
||||
"%lu %lu %lu |\n",
|
||||
"KiB | %7ld KiB | %7ld KiB | %7ld KiB | %lu %lu | %u |\n",
|
||||
bdf_str, mem, gtt_mem, cpu_mem, vram_mem, gfx,
|
||||
enc, 0L, 0L, 0L);
|
||||
enc, cu_occupancy, 0L);
|
||||
printf("+=======+==================+============+=============="
|
||||
"+=============+=============+=============+============"
|
||||
"=+==========================================+\n");
|
||||
|
||||
@@ -1093,7 +1093,8 @@ typedef struct {
|
||||
uint32_t reserved[10];
|
||||
} memory_usage; //!< in bytes
|
||||
char container_name[AMDSMI_MAX_STRING_LENGTH];
|
||||
uint32_t reserved[12];
|
||||
uint32_t cu_occupancy; //!< Num CUs utilized
|
||||
uint32_t reserved[11];
|
||||
} amdsmi_proc_info_t;
|
||||
|
||||
/**
|
||||
|
||||
@@ -2691,6 +2691,7 @@ def amdsmi_get_gpu_process_list(
|
||||
"cpu_mem": process_list[index].memory_usage.cpu_mem,
|
||||
"vram_mem": process_list[index].memory_usage.vram_mem,
|
||||
},
|
||||
"cu_occupancy": process_list[index].cu_occupancy
|
||||
})
|
||||
|
||||
return result
|
||||
|
||||
@@ -1289,7 +1289,8 @@ struct_amdsmi_proc_info_t._fields_ = [
|
||||
('engine_usage', struct_engine_usage_),
|
||||
('memory_usage', struct_memory_usage_),
|
||||
('container_name', ctypes.c_char * 256),
|
||||
('reserved', ctypes.c_uint32 * 12),
|
||||
('cu_occupancy', ctypes.c_uint32),
|
||||
('PADDING_1', ctypes.c_ubyte * 4),
|
||||
]
|
||||
|
||||
amdsmi_proc_info_t = struct_amdsmi_proc_info_t
|
||||
|
||||
@@ -456,7 +456,6 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
|
||||
proc->sdma_usage = 0;
|
||||
proc->cu_occupancy = 0;
|
||||
|
||||
uint32_t cu_count = 0;
|
||||
static amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
|
||||
static std::map<uint64_t, std::shared_ptr<KFDNode>>& kfd_node_map =
|
||||
smi.kfd_node_map();
|
||||
@@ -510,23 +509,15 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
|
||||
}
|
||||
else if(sysfs_data_errcode==0){
|
||||
// Update CU usage by the process
|
||||
proc->cu_occupancy += std::stoi(tmp);
|
||||
// Collect count of compute units
|
||||
cu_count += kfd_node_map[gpu_id]->cu_count();
|
||||
proc->cu_occupancy = std::stoi(tmp);
|
||||
}
|
||||
else {
|
||||
// Some GFX revisions do not provide cu_occupancy debugfs method
|
||||
// which may cause ENOENT
|
||||
proc->cu_occupancy = CU_OCCUPANCY_INVALID;
|
||||
cu_count = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Adjust CU occupancy to percent.
|
||||
if (cu_count > 0) {
|
||||
proc->cu_occupancy = ((proc->cu_occupancy * 100) / cu_count);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -216,6 +216,9 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
|
||||
|
||||
/**
|
||||
* Complete the process information
|
||||
* This is where we copy rsmi_process_info_t into the larger amdsmi_proc_info_t
|
||||
* Then populate the remaining fields with the gpuvsmi_get_pid_info()
|
||||
* TODO FIX HERE TO GRAB KFD VRAM if /proc is inconsistent
|
||||
*/
|
||||
auto get_process_info = [&](const rsmi_process_info_t& rsmi_proc_info, amdsmi_proc_info_t& asmi_proc_info) {
|
||||
auto status_code = gpuvsmi_get_pid_info(get_bdf(), rsmi_proc_info.process_id, asmi_proc_info);
|
||||
@@ -225,6 +228,9 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
|
||||
asmi_proc_info.memory_usage.vram_mem = rsmi_proc_info.vram_usage;
|
||||
}
|
||||
|
||||
// Copy the cu occupancy from rsmi_process_info_t to amdsmi_proc_info_t
|
||||
asmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy;
|
||||
|
||||
return status_code;
|
||||
};
|
||||
|
||||
|
||||
+157
-190
@@ -20,16 +20,17 @@
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <dirent.h>
|
||||
#include <inttypes.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
#include <string.h>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "amd_smi/impl/amd_smi_utils.h"
|
||||
@@ -37,230 +38,196 @@
|
||||
extern "C" {
|
||||
|
||||
static const char *container_type_name[AMDSMI_MAX_CONTAINER_TYPE] = {
|
||||
[AMDSMI_CONTAINER_LXC] = "lxc",
|
||||
[AMDSMI_CONTAINER_DOCKER] = "docker",
|
||||
[AMDSMI_CONTAINER_LXC] = "lxc",
|
||||
[AMDSMI_CONTAINER_DOCKER] = "docker",
|
||||
};
|
||||
|
||||
amdsmi_status_t gpuvsmi_pid_is_gpu(const std::string &path, const char *bdf)
|
||||
{
|
||||
DIR *d;
|
||||
struct dirent *dir;
|
||||
amdsmi_status_t gpuvsmi_pid_is_gpu(const std::string &path, const char *bdf) {
|
||||
DIR *d;
|
||||
struct dirent *dir;
|
||||
|
||||
d = opendir(path.c_str());
|
||||
if (!d)
|
||||
return AMDSMI_STATUS_NO_PERM;
|
||||
d = opendir(path.c_str());
|
||||
if (!d) return AMDSMI_STATUS_NO_PERM;
|
||||
|
||||
/* iterate through all the fds, try to find
|
||||
* a match for the GPU bdf
|
||||
*/
|
||||
while ((dir = readdir(d)) != NULL) {
|
||||
std::string file = path + dir->d_name;
|
||||
std::ifstream fdinfo(file.c_str());
|
||||
for (std::string line; std::getline(fdinfo, line);) {
|
||||
if (line.find(bdf) != std::string::npos) {
|
||||
closedir(d);
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* iterate through all the fds, try to find
|
||||
* a match for the GPU bdf
|
||||
*/
|
||||
while ((dir = readdir(d)) != NULL) {
|
||||
std::string file = path + dir->d_name;
|
||||
std::ifstream fdinfo(file.c_str());
|
||||
for (std::string line; std::getline(fdinfo, line);) {
|
||||
if (line.find(bdf) != std::string::npos) {
|
||||
closedir(d);
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
closedir(d);
|
||||
closedir(d);
|
||||
|
||||
return AMDSMI_STATUS_NOT_FOUND;
|
||||
return AMDSMI_STATUS_NOT_FOUND;
|
||||
}
|
||||
|
||||
amdsmi_status_t gpuvsmi_get_pids(const amdsmi_bdf_t &bdf, std::vector<long int> &pids, uint64_t *size)
|
||||
{
|
||||
char bdf_str[13];
|
||||
DIR *d;
|
||||
struct dirent *dir;
|
||||
amdsmi_status_t gpuvsmi_get_pids(const amdsmi_bdf_t &bdf, std::vector<long int> &pids,
|
||||
uint64_t *size) {
|
||||
char bdf_str[13];
|
||||
DIR *d;
|
||||
struct dirent *dir;
|
||||
|
||||
/* 0000:00:00.0 */
|
||||
snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32,
|
||||
static_cast<uint32_t>(bdf.domain_number & 0xffff),
|
||||
static_cast<uint32_t>(bdf.bus_number & 0xff),
|
||||
static_cast<uint32_t>(bdf.device_number & 0x1f),
|
||||
static_cast<uint32_t>(bdf.function_number & 0x7));
|
||||
/* 0000:00:00.0 */
|
||||
snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32,
|
||||
static_cast<uint32_t>(bdf.domain_number & 0xffff),
|
||||
static_cast<uint32_t>(bdf.bus_number & 0xff),
|
||||
static_cast<uint32_t>(bdf.device_number & 0x1f),
|
||||
static_cast<uint32_t>(bdf.function_number & 0x7));
|
||||
|
||||
d = opendir("/proc");
|
||||
if (!d)
|
||||
return AMDSMI_STATUS_NO_PERM;
|
||||
d = opendir("/proc");
|
||||
if (!d) return AMDSMI_STATUS_NO_PERM;
|
||||
|
||||
pids.clear();
|
||||
/* Find the pid folders in /proc/ that we have access to */
|
||||
while ((dir = readdir(d)) != NULL) {
|
||||
if (dir->d_type == DT_DIR) {
|
||||
/* Try to cast the name of the folder to a
|
||||
* number, if it fails, it is not */
|
||||
char *p;
|
||||
long int pid;
|
||||
pids.clear();
|
||||
/* Find the pid folders in /proc/ that we have access to */
|
||||
while ((dir = readdir(d)) != NULL) {
|
||||
if (dir->d_type == DT_DIR) {
|
||||
/* Try to cast the name of the folder to a
|
||||
* number, if it fails, it is not */
|
||||
char *p;
|
||||
long int pid;
|
||||
|
||||
pid = strtol(dir->d_name, &p, 10);
|
||||
if (*p != 0)
|
||||
continue;
|
||||
pid = strtol(dir->d_name, &p, 10);
|
||||
if (*p != 0) continue;
|
||||
|
||||
/* Check if fdinfo is accesible */
|
||||
std::string path = "/proc/" + std::string(dir->d_name) + "/fdinfo/";
|
||||
/* Check if fdinfo is accesible */
|
||||
std::string path = "/proc/" + std::string(dir->d_name) + "/fdinfo/";
|
||||
|
||||
if (access(path.c_str(), R_OK))
|
||||
continue;
|
||||
if (access(path.c_str(), R_OK)) continue;
|
||||
|
||||
/* check if GPU is present */
|
||||
if (gpuvsmi_pid_is_gpu(path, bdf_str))
|
||||
continue;
|
||||
pids.push_back(pid);
|
||||
}
|
||||
}
|
||||
closedir(d);
|
||||
/* check if GPU is present */
|
||||
if (gpuvsmi_pid_is_gpu(path, bdf_str)) continue;
|
||||
pids.push_back(pid);
|
||||
}
|
||||
}
|
||||
closedir(d);
|
||||
|
||||
*size = pids.size();
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
*size = pids.size();
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid,
|
||||
amdsmi_proc_info_t &info)
|
||||
{
|
||||
char bdf_str[13];
|
||||
DIR *d;
|
||||
struct dirent *dir;
|
||||
amdsmi_proc_info_t &info) {
|
||||
char bdf_str[13];
|
||||
DIR *d;
|
||||
struct dirent *dir;
|
||||
|
||||
/* 0000:00:00.0 */
|
||||
snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32,
|
||||
static_cast<uint32_t>(bdf.domain_number & 0xffff),
|
||||
static_cast<uint32_t>(bdf.bus_number & 0xff),
|
||||
static_cast<uint32_t>(bdf.device_number & 0x1f),
|
||||
static_cast<uint32_t>(bdf.function_number & 0x7));
|
||||
/* 0000:00:00.0 */
|
||||
snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32,
|
||||
static_cast<uint32_t>(bdf.domain_number & 0xffff),
|
||||
static_cast<uint32_t>(bdf.bus_number & 0xff),
|
||||
static_cast<uint32_t>(bdf.device_number & 0x1f),
|
||||
static_cast<uint32_t>(bdf.function_number & 0x7));
|
||||
|
||||
std::string path = "/proc/" + std::to_string(pid) + "/fdinfo/";
|
||||
std::string name_path = "/proc/" + std::to_string(pid) + "/comm";
|
||||
std::string cgroup_path = "/proc/" + std::to_string(pid) + "/cgroup";
|
||||
std::string path = "/proc/" + std::to_string(pid) + "/fdinfo/";
|
||||
std::string name_path = "/proc/" + std::to_string(pid) + "/comm";
|
||||
std::string cgroup_path = "/proc/" + std::to_string(pid) + "/cgroup";
|
||||
|
||||
if (gpuvsmi_pid_is_gpu(path.c_str(), bdf_str)) {
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
if (gpuvsmi_pid_is_gpu(path.c_str(), bdf_str)) {
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
|
||||
d = opendir(path.c_str());
|
||||
if (!d)
|
||||
return AMDSMI_STATUS_NO_PERM;
|
||||
d = opendir(path.c_str());
|
||||
if (!d) return AMDSMI_STATUS_NO_PERM;
|
||||
|
||||
/* Vectors to check if repated fd pasid */
|
||||
// TODO remove pasid Not working in ROCm 6.4+, deprecating in 7.0
|
||||
std::vector<int> pasids;
|
||||
/* Vectors to check if repated fd pasid */
|
||||
// TODO remove pasid Not working in ROCm 6.4+, deprecating in 7.0
|
||||
std::vector<int> pasids;
|
||||
|
||||
memset(&info, 0, sizeof(info));
|
||||
/* Iterate through all fdinfos */
|
||||
while ((dir = readdir(d)) != NULL) {
|
||||
memset(&info, 0, sizeof(info));
|
||||
/* Iterate through all fdinfos */
|
||||
while ((dir = readdir(d)) != NULL) {
|
||||
std::string file = path + dir->d_name;
|
||||
std::ifstream fdinfo(file.c_str());
|
||||
|
||||
std::string file = path + dir->d_name;
|
||||
std::ifstream fdinfo(file.c_str());
|
||||
for (std::string bdfline; getline(fdinfo, bdfline);) {
|
||||
if (bdfline.find("drm-pdev:") != std::string::npos) {
|
||||
char fd_bdf_str[13];
|
||||
|
||||
for (std::string bdfline; getline(fdinfo, bdfline);) {
|
||||
if (bdfline.find("drm-pdev:") != std::string::npos) {
|
||||
char fd_bdf_str[13];
|
||||
/* Only check against fdinfo files that contain a bdf */
|
||||
if (sscanf(bdfline.c_str(), "drm-pdev: %s", &fd_bdf_str[0]) != 1) continue;
|
||||
|
||||
/* Only check against fdinfo files that contain a bdf */
|
||||
if (sscanf(bdfline.c_str(), "drm-pdev: %s", &fd_bdf_str[0]) != 1)
|
||||
continue;
|
||||
/* Populate amdsmi_proc_info_t struct only if the bdf in
|
||||
* the fdinfo file matches the passed bdf */
|
||||
if (strncmp(bdf_str, fd_bdf_str, 13) == 0) {
|
||||
std::ifstream fdinfo(file.c_str());
|
||||
|
||||
/* Populate amdsmi_proc_info_t struct only if the bdf in
|
||||
* the fdinfo file matches the passed bdf */
|
||||
if (strncmp(bdf_str, fd_bdf_str, 13) == 0){
|
||||
std::ifstream fdinfo(file.c_str());
|
||||
for (std::string line; getline(fdinfo, line);) {
|
||||
if (line.find("pasid:") != std::string::npos) {
|
||||
int pasid;
|
||||
if (sscanf(line.c_str(), "pasid: %d", &pasid) != 1) continue;
|
||||
auto it = std::find(pasids.begin(), pasids.end(), pasid);
|
||||
if (it == pasids.end()) pasids.push_back(pasid);
|
||||
} else if (line.find("drm-memory-gtt:") != std::string::npos) {
|
||||
unsigned long mem;
|
||||
if (sscanf(line.c_str(), "drm-memory-gtt: %lu", &mem) != 1) continue;
|
||||
info.mem += mem * 1000;
|
||||
info.memory_usage.gtt_mem += mem * 1000;
|
||||
} else if (line.find("drm-memory-cpu:") != std::string::npos) {
|
||||
unsigned long mem;
|
||||
if (sscanf(line.c_str(), "drm-memory-cpu: %lu", &mem) != 1) continue;
|
||||
info.mem += mem * 1000;
|
||||
info.memory_usage.cpu_mem += mem * 1000;
|
||||
} else if (line.find("drm-memory-vram:") != std::string::npos) {
|
||||
unsigned long mem;
|
||||
if (sscanf(line.c_str(), "drm-memory-vram: %lu", &mem) != 1) continue;\
|
||||
info.mem += mem * 1000;
|
||||
info.memory_usage.vram_mem += mem * 1000;
|
||||
} else if (line.find("drm-engine-gfx") != std::string::npos) {
|
||||
uint64_t engine_gfx;
|
||||
if (sscanf(line.c_str(), "drm-engine-gfx: %lu", &engine_gfx) != 1) continue;
|
||||
info.engine_usage.gfx = engine_gfx;
|
||||
} else if (line.find("drm-engine-enc") != std::string::npos) {
|
||||
uint64_t engine_enc;
|
||||
if (sscanf(line.c_str(), "drm-engine-enc: %lu", &engine_enc) != 1) continue;
|
||||
info.engine_usage.enc = engine_enc;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (std::string line; getline(fdinfo, line);) {
|
||||
if (line.find("pasid:") != std::string::npos) {
|
||||
int pasid;
|
||||
|
||||
if (sscanf(line.c_str(), "pasid: %d", &pasid) != 1)
|
||||
continue;
|
||||
|
||||
auto it = std::find(pasids.begin(), pasids.end(), pasid);
|
||||
|
||||
if (it == pasids.end())
|
||||
pasids.push_back(pasid);
|
||||
} else if (line.find("drm-memory-gtt:") != std::string::npos) {
|
||||
unsigned long mem;
|
||||
|
||||
if (sscanf(line.c_str(), "drm-memory-gtt: %lu", &mem) != 1)
|
||||
continue;
|
||||
|
||||
info.mem += mem * 1024;
|
||||
info.memory_usage.gtt_mem += mem * 1024;
|
||||
} else if (line.find("drm-memory-cpu:") != std::string::npos) {
|
||||
unsigned long mem;
|
||||
|
||||
if (sscanf(line.c_str(), "drm-memory-cpu: %lu", &mem) != 1)
|
||||
continue;
|
||||
|
||||
info.mem += mem * 1024;
|
||||
info.memory_usage.cpu_mem += mem * 1024;
|
||||
} else if (line.find("drm-memory-vram:") != std::string::npos) {
|
||||
unsigned long mem;
|
||||
|
||||
if (sscanf(line.c_str(), "drm-memory-vram: %lu", &mem) != 1)
|
||||
continue;
|
||||
|
||||
info.mem += mem * 1024;
|
||||
info.memory_usage.vram_mem += mem * 1024;
|
||||
} else if (line.find("drm-engine-gfx") != std::string::npos) {
|
||||
uint64_t engine_gfx;
|
||||
|
||||
if (sscanf(line.c_str(), "drm-engine-gfx: %lu", &engine_gfx) != 1)
|
||||
continue;
|
||||
|
||||
info.engine_usage.gfx = engine_gfx;
|
||||
} else if (line.find("drm-engine-enc") != std::string::npos) {
|
||||
uint64_t engine_enc;
|
||||
|
||||
if (sscanf(line.c_str(), "drm-engine-enc: %lu", &engine_enc) != 1)
|
||||
continue;
|
||||
|
||||
info.engine_usage.enc = engine_enc;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
closedir(d);
|
||||
closedir(d);
|
||||
|
||||
// Note: If possible at all, try to get the name of the process/container.
|
||||
// In case the other info fail, get at least something.
|
||||
std::ifstream filename(name_path.c_str());
|
||||
std::string name;
|
||||
std::ifstream filename(name_path.c_str());
|
||||
std::string name;
|
||||
|
||||
getline(filename, name);
|
||||
getline(filename, name);
|
||||
|
||||
if (name.empty())
|
||||
return AMDSMI_STATUS_API_FAILED;
|
||||
if (name.empty()) return AMDSMI_STATUS_API_FAILED;
|
||||
|
||||
strncpy(info.name, name.c_str(), std::min(
|
||||
(unsigned long) AMDSMI_MAX_STRING_LENGTH,
|
||||
name.length()));
|
||||
strncpy(info.name, name.c_str(),
|
||||
std::min((unsigned long)AMDSMI_MAX_STRING_LENGTH, name.length()));
|
||||
|
||||
for (int i = 0; i < AMDSMI_MAX_CONTAINER_TYPE; i++) {
|
||||
std::ifstream cgroup_info(cgroup_path.c_str());
|
||||
std::string container_id;
|
||||
for (std::string line; getline(cgroup_info, line);) {
|
||||
if (line.find(container_type_name[i]) != std::string::npos) {
|
||||
container_id = line.substr(line.find(container_type_name[i]) +
|
||||
strlen(container_type_name[i]) + 1, 16);
|
||||
strcpy(info.container_name, container_id.c_str());
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (strlen(info.container_name) > 0)
|
||||
break;
|
||||
}
|
||||
info.pid = (uint32_t)pid;
|
||||
for (int i = 0; i < AMDSMI_MAX_CONTAINER_TYPE; i++) {
|
||||
std::ifstream cgroup_info(cgroup_path.c_str());
|
||||
std::string container_id;
|
||||
for (std::string line; getline(cgroup_info, line);) {
|
||||
if (line.find(container_type_name[i]) != std::string::npos) {
|
||||
container_id = line.substr(line.find(container_type_name[i]) +
|
||||
strlen(container_type_name[i]) + 1, 16);
|
||||
strcpy(info.container_name, container_id.c_str());
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (strlen(info.container_name) > 0) break;
|
||||
}
|
||||
info.pid = (uint32_t)pid;
|
||||
|
||||
if (!pasids.size()) {
|
||||
return AMDSMI_STATUS_NOT_FOUND;
|
||||
if (!pasids.size()) {
|
||||
return AMDSMI_STATUS_NOT_FOUND;
|
||||
}
|
||||
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
} // extern "C"
|
||||
} // extern "C"
|
||||
|
||||
@@ -44,15 +44,12 @@ try:
|
||||
from amdsmi_logger import AMDSMILogger
|
||||
from amdsmi_parser import AMDSMIParser
|
||||
import amdsmi_cli_exceptions
|
||||
helpers = AMDSMIHelpers()
|
||||
except ImportError as e:
|
||||
print(f"Failed to import amdsmi cli libs: {e}")
|
||||
print("Ensure that you have installed amdsmi's package.")
|
||||
|
||||
|
||||
helpers = AMDSMIHelpers()
|
||||
|
||||
|
||||
|
||||
# Make exit & quit work without parens because it's annoying
|
||||
type(exit).__repr__ = sys.exit
|
||||
|
||||
|
||||
新增問題並參考
封鎖使用者