[SWDEV-488303] Fixed process list information source
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com> Change-Id: Iec3416cb5ca1bdd806c3225b514bbf3dbf8c0d2e
This commit is contained in:
committed by
Arif, Maisam
parent
cc4dfd834f
commit
cebb0799cb
@@ -2165,14 +2165,14 @@ class AMDSMICommands():
|
||||
clk_value = int(clk.get("value", 0))
|
||||
else:
|
||||
if isinstance(clk, str):
|
||||
clk_value = int(str(clk).split()[0])
|
||||
clk_value = int(str(clk).split()[0])
|
||||
else:
|
||||
clk_value = int(clk)
|
||||
if isinstance(min_clk, dict):
|
||||
min_clk_value = int(min_clk.get("value", 0))
|
||||
else:
|
||||
if isinstance(min_clk, str):
|
||||
min_clk_value = int(str(min_clk).split()[0])
|
||||
min_clk_value = int(str(min_clk).split()[0])
|
||||
else:
|
||||
min_clk_value = int(min_clk)
|
||||
# If the clk value is less than the min_clk value, then deep sleep is enabled
|
||||
@@ -5755,15 +5755,22 @@ class AMDSMICommands():
|
||||
# Clean processes dictionary
|
||||
filtered_process_values = []
|
||||
for process_info in process_list:
|
||||
process_info.pop('mem') # Remove 'mem' value
|
||||
process_info.pop('engine_usage') # Remove 'engine_usage' value
|
||||
process_info['mem_usage'] = process_info.pop('mem')
|
||||
process_info['cu_occupancy'] = process_info.pop('cu_occupancy')
|
||||
|
||||
memory_usage_unit = "B"
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
|
||||
for usage_metric in process_info['memory_usage']:
|
||||
process_info["memory_usage"][usage_metric] = self.helpers.convert_bytes_to_readable(process_info["memory_usage"][usage_metric])
|
||||
memory_usage_unit = ""
|
||||
|
||||
process_info['mem_usage'] = self.helpers.unit_format(self.logger,
|
||||
process_info['mem_usage'],
|
||||
memory_usage_unit)
|
||||
|
||||
for usage_metric in process_info['memory_usage']:
|
||||
process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger,
|
||||
process_info['memory_usage'][usage_metric],
|
||||
@@ -5796,8 +5803,8 @@ class AMDSMICommands():
|
||||
|
||||
# Build the process table's title and header
|
||||
self.logger.secondary_table_title = "PROCESS INFO"
|
||||
self.logger.secondary_table_header = 'GPU'.rjust(3) + "NAME".rjust(22) + "PID".rjust(9) + "GTT_MEM".rjust(10) + \
|
||||
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "CU%".rjust(9)
|
||||
self.logger.secondary_table_header = 'GPU'.rjust(3) + "NAME".rjust(19) + "PID".rjust(9) + "GTT_MEM".rjust(10) + \
|
||||
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USG".rjust(10) + "CU%".rjust(9)
|
||||
|
||||
if watching_output:
|
||||
self.logger.secondary_table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.secondary_table_header
|
||||
@@ -6446,7 +6453,7 @@ class AMDSMICommands():
|
||||
else:
|
||||
with self.logger.destination.open('a', encoding="utf-8") as output_file:
|
||||
output_file.write(legend_output + '\n')
|
||||
|
||||
|
||||
|
||||
def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None,
|
||||
severity=None, folder=None, file_limit=None, cper_file=None, follow=None):
|
||||
@@ -6494,7 +6501,7 @@ class AMDSMICommands():
|
||||
|
||||
if not args.gpu:
|
||||
return
|
||||
|
||||
|
||||
if not isinstance(args.gpu, list):
|
||||
args.gpu = [args.gpu]
|
||||
|
||||
@@ -6506,6 +6513,7 @@ class AMDSMICommands():
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
|
||||
def default(self, args):
|
||||
"""Display the default amdsmi view when no args are given."""
|
||||
|
||||
@@ -6560,7 +6568,7 @@ class AMDSMICommands():
|
||||
else:
|
||||
partition_mode = f"{current_comp}/{current_mem}"
|
||||
gpu_info_dict.update({"partition_mode": partition_mode})
|
||||
|
||||
|
||||
# GPU name market name and OAM ID
|
||||
try:
|
||||
asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(processor)
|
||||
|
||||
@@ -1102,7 +1102,6 @@ class AMDSMIHelpers():
|
||||
|
||||
# Loop through all entries in the dictionary.
|
||||
for entry_index, entry in enumerate(entries.values()):
|
||||
|
||||
# Assume 'entry' is a dictionary with keys: "error_severity" and "notify_type".
|
||||
error_severity = entry.get("error_severity", "Unknown")
|
||||
notify_type = entry.get("notify_type", "Unknown")
|
||||
|
||||
@@ -213,23 +213,26 @@ class AMDSMILogger():
|
||||
for process_dict in value:
|
||||
if process_dict['process_info'] == "No running processes detected":
|
||||
# Add N/A for empty process_info
|
||||
table_values += "N/A".rjust(20) + "N/A".rjust(9) + "N/A".rjust(10) + \
|
||||
"N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(9) + '\n'
|
||||
table_values += "N/A".rjust(17) + "N/A".rjust(9) + "N/A".rjust(10) + \
|
||||
"N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(10) + \
|
||||
"N/A".rjust(9) + '\n'
|
||||
else:
|
||||
# Fix this here
|
||||
for process_key, process_value in process_dict['process_info'].items():
|
||||
string_process_value = str(process_value)
|
||||
if process_key == "name":
|
||||
# Truncate name if too long
|
||||
process_name = string_process_value[:20]
|
||||
process_name = string_process_value[:17]
|
||||
if process_name == "":
|
||||
process_name = "N/A"
|
||||
table_values += process_name.rjust(20)
|
||||
table_values += process_name.rjust(17)
|
||||
elif process_key == "pid":
|
||||
table_values += string_process_value.rjust(9)
|
||||
elif process_key == "memory_usage":
|
||||
for memory_key, memory_value in process_value.items():
|
||||
table_values += str(memory_value).rjust(10)
|
||||
elif process_key == "mem_usage":
|
||||
table_values += string_process_value.rjust(10)
|
||||
elif process_key == "cu_occupancy":
|
||||
table_values += string_process_value.rjust(9)
|
||||
# Add the stored gpu and stored timestamp to the next line
|
||||
|
||||
@@ -1107,9 +1107,9 @@ Field | Description
|
||||
---|---
|
||||
`name` | Name of process. If user does not have permission this will be "N/A"
|
||||
`pid` | Process ID
|
||||
`mem` | Process memory usage in Bytes
|
||||
`mem` | Total memory used by the process on the GPU, in Bytes
|
||||
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
|
||||
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage in Bytes</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage in Bytes</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage in Bytes</td></tr> </tbody></table>
|
||||
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage in Bytes</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage in Bytes</td></tr><tr><td>`vram_mem`</td><td>Process VRAM memory usage in Bytes</td></tr> </tbody></table>
|
||||
`cu_occupancy` | Number of Compute Units utilized
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
|
||||
|
||||
@@ -855,8 +855,8 @@ int main() {
|
||||
continue;
|
||||
pwd = getpwuid(st.st_uid);
|
||||
if (!pwd)
|
||||
printf("| %5d | %16s | %10d | %s | %7ld KiB | %7ld KiB "
|
||||
"| %7ld KiB | %7ld KiB | %lu %lu | %u |\n",
|
||||
printf("| %5d | %16s | %10d | %s | %7ld KB | %7ld KB "
|
||||
"| %7ld KB | %7ld KB | %lu %lu | %u |\n",
|
||||
process_info_list[it].pid, process_info_list[it].name, st.st_uid,
|
||||
bdf_str, process_info_list[it].mem / 1024,
|
||||
process_info_list[it].memory_usage.gtt_mem / 1024,
|
||||
@@ -866,8 +866,8 @@ int main() {
|
||||
process_info_list[it].engine_usage.enc,
|
||||
process_info_list[it].cu_occupancy);
|
||||
else
|
||||
printf("| %5d | %16s | %10s | %s | %7ld KiB | %7ld KiB "
|
||||
"| %7ld KiB | %7ld KiB | %lu %lu | %u |\n",
|
||||
printf("| %5d | %16s | %10s | %s | %7ld KB | %7ld KB "
|
||||
"| %7ld KB | %7ld KB | %lu %lu | %u |\n",
|
||||
process_info_list[it].pid, process_info_list[it].name,
|
||||
pwd->pw_name, bdf_str, process_info_list[it].mem / 1024,
|
||||
process_info_list[it].memory_usage.gtt_mem / 1024,
|
||||
@@ -892,9 +892,9 @@ int main() {
|
||||
// TODO: To remove compiler warning, the last 3 values in this printf were
|
||||
// set to 0L. Need to find out what these values need to be.
|
||||
printf("| TOTAL:| %s | %7ld "
|
||||
"KiB | %7ld KiB | %7ld KiB | %7ld KiB | %lu %lu | %u |\n",
|
||||
"KB | %7ld KB | %7ld KB | %7ld KB | %lu %lu | %u |\n",
|
||||
bdf_str, mem, gtt_mem, cpu_mem, vram_mem, gfx,
|
||||
enc, cu_occupancy, 0L);
|
||||
enc, cu_occupancy);
|
||||
printf("+=======+==================+============+=============="
|
||||
"+=============+=============+=============+============"
|
||||
"=+==========================================+\n");
|
||||
|
||||
@@ -68,9 +68,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
|
||||
bool check_if_drm_is_supported() { return drm_.check_if_drm_is_supported(); }
|
||||
uint32_t get_vendor_id();
|
||||
const GPUComputeProcessList_t& amdgpu_get_compute_process_list(ComputeProcessListType_t list_type = ComputeProcessListType_t::kAllProcessesOnDevice);
|
||||
const GPUComputeProcessList_t& amdgpu_get_all_compute_process_list() {
|
||||
return amdgpu_get_compute_process_list(ComputeProcessListType_t::kAllProcesses);
|
||||
}
|
||||
|
||||
|
||||
// New methods for -e feature
|
||||
std::string bdf_to_string() const; // -e feature
|
||||
|
||||
+23
-13
@@ -4122,46 +4122,56 @@ amdsmi_status_t amdsmi_get_afids_from_cper(
|
||||
amdsmi_status_t
|
||||
amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_proc_info_t *list) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
|
||||
// Validate the max_processes pointer
|
||||
if (!max_processes) {
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
|
||||
// Retrieve the GPU device associated with the processor handle
|
||||
amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
|
||||
amdsmi_status_t status_code = get_gpu_device_from_handle(processor_handle, &gpu_device);
|
||||
if (status_code != amdsmi_status_t::AMDSMI_STATUS_SUCCESS) {
|
||||
if (status_code != AMDSMI_STATUS_SUCCESS) {
|
||||
return status_code;
|
||||
}
|
||||
|
||||
// Get the list of compute processes running on the GPU
|
||||
auto compute_process_list = gpu_device->amdgpu_get_compute_process_list();
|
||||
|
||||
// If max_processes is 0, return the number of processes currently running
|
||||
// If compute_process_list is empty, return success with max_processes set to 0
|
||||
if ((*max_processes == 0) || compute_process_list.empty()) {
|
||||
*max_processes = static_cast<uint32_t>(compute_process_list.size());
|
||||
return amdsmi_status_t::AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
if (!list) {
|
||||
return amdsmi_status_t::AMDSMI_STATUS_INVAL;
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Validate the list pointer
|
||||
if (!list) {
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
|
||||
// Store the original size of max_processes
|
||||
const auto max_processes_original_size(*max_processes);
|
||||
auto idx = uint32_t(0);
|
||||
|
||||
// Populate the list with process information
|
||||
for (auto& process : compute_process_list) {
|
||||
if (idx < *max_processes) {
|
||||
// Iterate over the map of processes and store the amdsmi_proc_info_t in the list
|
||||
list[idx++] = static_cast<amdsmi_proc_info_t>(process.second);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Note: If the reserved size for processes is smaller than the number of
|
||||
// actual processes running, AMDSMI_STATUS_OUT_OF_RESOURCES is
|
||||
// an indication the caller should handle the situation (resize).
|
||||
// The max_processes is always changed to reflect the actual size of
|
||||
// list of processes running, so the caller knows where it is at.
|
||||
// Holding a copy of max_processes before it is passed in will be helpful
|
||||
// for the caller.
|
||||
// Update max_processes to reflect the actual number of running processes
|
||||
*max_processes = static_cast<uint32_t>(compute_process_list.size());
|
||||
|
||||
// Check if the caller-provided size for processes is sufficient to store all running processes
|
||||
return (max_processes_original_size >= static_cast<uint32_t>(compute_process_list.size()))
|
||||
? AMDSMI_STATUS_SUCCESS : amdsmi_status_t::AMDSMI_STATUS_OUT_OF_RESOURCES;
|
||||
? AMDSMI_STATUS_SUCCESS : AMDSMI_STATUS_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
amdsmi_status_t
|
||||
amdsmi_get_power_info(amdsmi_processor_handle processor_handle, amdsmi_power_info_t *info) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
|
||||
@@ -198,71 +198,69 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
|
||||
compute_process_list.clear();
|
||||
|
||||
/**
|
||||
* The first call to GetProcessInfo() helps to find the size it needs,
|
||||
* so we can create a tailored size list.
|
||||
* The first call to rsmi_compute_process_info_get() to find the number of
|
||||
* rsmi_process_info_t currently running on the system.
|
||||
*/
|
||||
auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
|
||||
auto list_process_running_size = uint32_t(0);
|
||||
auto num_running_processes = uint32_t(0);
|
||||
auto list_process_allocation_size = uint32_t(0);
|
||||
|
||||
status_code = rsmi_compute_process_info_get(nullptr, &list_process_running_size);
|
||||
if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (list_process_running_size <= 0)) {
|
||||
status_code = rsmi_compute_process_info_get(nullptr, &num_running_processes);
|
||||
if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (num_running_processes <= 0)) {
|
||||
return status_code;
|
||||
}
|
||||
|
||||
/**
|
||||
* The second call to GetProcessInfo() helps to set proper sizes for both,
|
||||
* the raw array of processes (amdsmi_process_info_t) and list of processes (amdsmi_proc_info_t).
|
||||
* Make a type safe pointer, then
|
||||
*
|
||||
* second call to rsmi_compute_process_info_get() fills
|
||||
* the allocated rsmi_process_info_t array.
|
||||
*/
|
||||
using RsmiDeviceList_t = uint32_t[];
|
||||
using RsmiProcessList_t = rsmi_process_info_t[];
|
||||
std::unique_ptr<RsmiProcessList_t> list_all_processes_ptr = std::make_unique<RsmiProcessList_t>(list_process_running_size);
|
||||
std::unique_ptr<RsmiProcessList_t> list_all_processes_ptr = std::make_unique<RsmiProcessList_t>(num_running_processes);
|
||||
|
||||
list_process_allocation_size = list_process_running_size;
|
||||
status_code = rsmi_compute_process_info_get(list_all_processes_ptr.get(), &list_process_allocation_size);
|
||||
if (status_code) {
|
||||
status_code = rsmi_compute_process_info_get(list_all_processes_ptr.get(), &num_running_processes);
|
||||
if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
return status_code;
|
||||
}
|
||||
|
||||
// Restore the original size to read
|
||||
list_process_running_size = list_process_allocation_size;
|
||||
if (list_process_running_size <= 0) {
|
||||
return rsmi_status_t::RSMI_STATUS_NOT_FOUND;
|
||||
if (num_running_processes <= 0) {
|
||||
return rsmi_status_t::RSMI_STATUS_SUCCESS; // No processes running
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Setup for the cases where the process list is by device.
|
||||
* Check that you have devices that are able to be monitored, ie excluding CPUs
|
||||
*/
|
||||
auto list_device_running_size = uint32_t(0);
|
||||
auto num_running_devices = uint32_t(0);
|
||||
auto list_device_allocation_size = uint32_t(0);
|
||||
status_code = rsmi_num_monitor_devices(&list_device_running_size);
|
||||
if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (list_device_running_size <= 0)) {
|
||||
status_code = rsmi_num_monitor_devices(&num_running_devices);
|
||||
if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (num_running_devices <= 0)) {
|
||||
return status_code;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Complete the process information
|
||||
* This is where we copy rsmi_process_info_t into the larger amdsmi_proc_info_t
|
||||
* Then populate the remaining fields with the gpuvsmi_get_pid_info()
|
||||
* TODO FIX HERE TO GRAB KFD VRAM if /proc is inconsistent
|
||||
* Populate process information for the given AMDSmiGPUDevice reference.
|
||||
* This function retrieves the process information given in rsmi_proc_info_t
|
||||
* and populates the amdsmi_proc_info_t structure.
|
||||
*/
|
||||
auto get_process_info = [&](const rsmi_process_info_t& rsmi_proc_info, amdsmi_proc_info_t& asmi_proc_info) {
|
||||
auto status_code = gpuvsmi_get_pid_info(get_bdf(), rsmi_proc_info.process_id, asmi_proc_info);
|
||||
auto get_process_info = [&](const rsmi_process_info_t& rsmi_proc_info, amdsmi_proc_info_t& amdsmi_proc_info) {
|
||||
// amdsmi_proc_info_t gets populated with /proc information from gpuvsmi_get_pid_info()
|
||||
|
||||
auto status_code = gpuvsmi_get_pid_info(get_bdf(), rsmi_proc_info.process_id, amdsmi_proc_info);
|
||||
// If we cannot get the info from sysfs, save the minimum info
|
||||
if (status_code != amdsmi_status_t::AMDSMI_STATUS_SUCCESS) {
|
||||
asmi_proc_info.pid = rsmi_proc_info.process_id;
|
||||
asmi_proc_info.memory_usage.vram_mem = rsmi_proc_info.vram_usage;
|
||||
amdsmi_proc_info.pid = rsmi_proc_info.process_id;
|
||||
amdsmi_proc_info.memory_usage.vram_mem = rsmi_proc_info.vram_usage;
|
||||
}
|
||||
|
||||
// Copy the cu occupancy from rsmi_process_info_t to amdsmi_proc_info_t
|
||||
asmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy;
|
||||
amdsmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy;
|
||||
|
||||
// Safely handle KFD file access
|
||||
// Safely handle KFD processes to get total memory_usage of the process
|
||||
uint64_t kfd_gpu_id = get_kfd_gpu_id();
|
||||
std::string kfd_path = "/sys/class/kfd/kfd/proc/" +
|
||||
std::to_string(rsmi_proc_info.process_id) +
|
||||
std::string kfd_path = "/sys/class/kfd/kfd/proc/" +
|
||||
std::to_string(rsmi_proc_info.process_id) +
|
||||
"/vram_" + std::to_string(kfd_gpu_id);
|
||||
|
||||
// Check if the file exists before attempting to open it
|
||||
@@ -273,8 +271,7 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
|
||||
if (std::getline(kfd_file, line)) {
|
||||
try {
|
||||
uint64_t vram_bytes = std::stoull(line);
|
||||
asmi_proc_info.mem = vram_bytes; // Already in bytes
|
||||
asmi_proc_info.memory_usage.vram_mem = vram_bytes; // Already in bytes
|
||||
amdsmi_proc_info.mem = vram_bytes; // Already in bytes
|
||||
} catch (const std::exception& e) {
|
||||
// Handle conversion error gracefully
|
||||
std::ostringstream ss;
|
||||
@@ -297,36 +294,15 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
|
||||
return status_code;
|
||||
};
|
||||
|
||||
/**
|
||||
* Get process information
|
||||
*/
|
||||
auto update_list_by_running_process = [&](const uint32_t process_id) {
|
||||
auto status_result(true);
|
||||
rsmi_process_info_t rsmi_proc_info{};
|
||||
auto status_code = rsmi_compute_process_info_by_pid_get(process_id, &rsmi_proc_info);
|
||||
if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
status_result = false;
|
||||
return status_result;
|
||||
}
|
||||
|
||||
amdsmi_proc_info_t tmp_asmi_proc_info{};
|
||||
get_process_info(rsmi_proc_info, tmp_asmi_proc_info);
|
||||
compute_process_list.emplace(process_id, tmp_asmi_proc_info);
|
||||
|
||||
return status_result;
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Devices used by a process.
|
||||
*/
|
||||
auto update_list_by_running_device = [&](const uint32_t process_id,
|
||||
const uint32_t proc_addr_id) {
|
||||
// Get all devices running this process
|
||||
auto update_list_by_running_device = [&](rsmi_process_info_t rsmi_proc_info) {
|
||||
// Get all devices running this process into list_device_ptr
|
||||
auto status_result(true);
|
||||
std::unique_ptr<RsmiDeviceList_t> list_device_ptr = std::make_unique<RsmiDeviceList_t>(list_device_running_size);
|
||||
list_device_allocation_size = list_device_running_size;
|
||||
auto status_code = rsmi_compute_process_gpus_get(process_id, list_device_ptr.get(), &list_device_allocation_size);
|
||||
std::unique_ptr<RsmiDeviceList_t> list_device_ptr = std::make_unique<RsmiDeviceList_t>(num_running_devices);
|
||||
list_device_allocation_size = num_running_devices;
|
||||
auto status_code = rsmi_compute_process_gpus_get(rsmi_proc_info.process_id, list_device_ptr.get(), &list_device_allocation_size);
|
||||
if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
status_result = false;
|
||||
return status_result;
|
||||
@@ -335,16 +311,13 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
|
||||
for (auto device_idx = uint32_t(0); device_idx < list_device_allocation_size; ++device_idx) {
|
||||
// Is this device running this process?
|
||||
if (list_device_ptr[device_idx] == get_gpu_id()) {
|
||||
rsmi_process_info_t rsmi_dev_proc_info{};
|
||||
// TODO remove pasid Not working in ROCm 6.4+, deprecating in 7.0
|
||||
auto status_code = rsmi_compute_process_info_by_device_get(process_id, list_device_ptr[device_idx], &rsmi_dev_proc_info);
|
||||
if ((status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) &&
|
||||
((rsmi_dev_proc_info.process_id == process_id) && (rsmi_dev_proc_info.pasid == proc_addr_id))) {
|
||||
amdsmi_proc_info_t tmp_asmi_proc_info{};
|
||||
get_process_info(rsmi_dev_proc_info, tmp_asmi_proc_info);
|
||||
compute_process_list.emplace(process_id, tmp_asmi_proc_info);
|
||||
}
|
||||
}
|
||||
std::unordered_set<uint64_t> gpu_set;
|
||||
gpu_set.insert(get_kfd_gpu_id());
|
||||
GetProcessInfoForPID(rsmi_proc_info.process_id, &rsmi_proc_info, &gpu_set);
|
||||
amdsmi_proc_info_t tmp_amdsmi_proc_info{};
|
||||
get_process_info(rsmi_proc_info, tmp_amdsmi_proc_info);
|
||||
compute_process_list.emplace(rsmi_proc_info.process_id, tmp_amdsmi_proc_info);
|
||||
}
|
||||
}
|
||||
|
||||
return status_result;
|
||||
@@ -355,15 +328,10 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
|
||||
* Transfer/Save the ones linked to this device.
|
||||
*/
|
||||
compute_process_list.clear();
|
||||
for (auto process_idx = uint32_t(0); process_idx < list_process_running_size; ++process_idx) {
|
||||
if (list_type == ComputeProcessListType_t::kAllProcesses) {
|
||||
if (update_list_by_running_process(list_all_processes_ptr[process_idx].process_id)) {
|
||||
}
|
||||
}
|
||||
|
||||
if (list_type == ComputeProcessListType_t::kAllProcessesOnDevice) {
|
||||
if (update_list_by_running_device(list_all_processes_ptr[process_idx].process_id,
|
||||
list_all_processes_ptr[process_idx].pasid)) {
|
||||
for (auto process_idx = uint32_t(0); process_idx < num_running_processes; ++process_idx) {
|
||||
if (list_type == ComputeProcessListType_t::kAllProcesses ||
|
||||
list_type == ComputeProcessListType_t::kAllProcessesOnDevice) {
|
||||
if (update_list_by_running_device(list_all_processes_ptr[process_idx])) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -166,26 +166,26 @@ amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid,
|
||||
if (it == pasids.end()) pasids.push_back(pasid);
|
||||
} else if (line.find("drm-memory-gtt:") != std::string::npos) {
|
||||
unsigned long mem;
|
||||
if (sscanf(line.c_str(), "drm-memory-gtt: %" PRIu32, &mem) != 1) continue;
|
||||
if (sscanf(line.c_str(), "drm-memory-gtt: %" PRIu64, &mem) != 1) continue;
|
||||
info.mem += mem * 1000;
|
||||
info.memory_usage.gtt_mem += mem * 1000;
|
||||
} else if (line.find("drm-memory-cpu:") != std::string::npos) {
|
||||
unsigned long mem;
|
||||
if (sscanf(line.c_str(), "drm-memory-cpu: %" PRIu32, &mem) != 1) continue;
|
||||
if (sscanf(line.c_str(), "drm-memory-cpu: %" PRIu64, &mem) != 1) continue;
|
||||
info.mem += mem * 1000;
|
||||
info.memory_usage.cpu_mem += mem * 1000;
|
||||
} else if (line.find("drm-memory-vram:") != std::string::npos) {
|
||||
unsigned long mem;
|
||||
if (sscanf(line.c_str(), "drm-memory-vram: %" PRIu32, &mem) != 1) continue;
|
||||
if (sscanf(line.c_str(), "drm-memory-vram: %" PRIu64, &mem) != 1) continue;
|
||||
info.mem += mem * 1000;
|
||||
info.memory_usage.vram_mem += mem * 1000;
|
||||
} else if (line.find("drm-engine-gfx") != std::string::npos) {
|
||||
uint64_t engine_gfx;
|
||||
if (sscanf(line.c_str(), "drm-engine-gfx: %" PRIu32, &engine_gfx) != 1) continue;
|
||||
if (sscanf(line.c_str(), "drm-engine-gfx: %" PRIu64, &engine_gfx) != 1) continue;
|
||||
info.engine_usage.gfx = engine_gfx;
|
||||
} else if (line.find("drm-engine-enc") != std::string::npos) {
|
||||
uint64_t engine_enc;
|
||||
if (sscanf(line.c_str(), "drm-engine-enc: %" PRIu32, &engine_enc) != 1) continue;
|
||||
if (sscanf(line.c_str(), "drm-engine-enc: %" PRIu64, &engine_enc) != 1) continue;
|
||||
info.engine_usage.enc = engine_enc;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user