From 465f2e6a417f441e800682d56a61acea77d5d55d Mon Sep 17 00:00:00 2001 From: "Arif, Maisam" Date: Thu, 29 May 2025 20:35:27 -0500 Subject: [PATCH] [SWDEV-488303] Updated CU occupancy for per-process retrieval (#243) Change-Id: I2990597c6dd4b2e8cf3e11ce60f72049ebdd9a8c Signed-off-by: Maisam Arif [ROCm/amdsmi commit: 0fdaebdbaaf272443d8044cd6d9081f45cb1fee6] --- projects/amdsmi/CHANGELOG.md | 12 +- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 56 ++- projects/amdsmi/amdsmi_cli/amdsmi_logger.py | 25 +- .../amdsmi/docs/reference/amdsmi-py-api.md | 2 +- .../amdsmi/example/amd_smi_drm_example.cc | 20 +- projects/amdsmi/include/amd_smi/amdsmi.h | 3 +- .../amdsmi/py-interface/amdsmi_interface.py | 1 + .../amdsmi/py-interface/amdsmi_wrapper.py | 3 +- projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc | 11 +- .../amdsmi/src/amd_smi/amd_smi_gpu_device.cc | 6 + projects/amdsmi/src/amd_smi/fdinfo.cc | 347 ++++++++---------- projects/amdsmi/tools/amdsmi_quick_start.py | 5 +- 12 files changed, 232 insertions(+), 259 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index cbe903cdf5..9cf0efeacc 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -12,6 +12,11 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ### Changed +- **Added Compute Unit Occupancy information per process** + Measuring compute units are the best way currently to determine gfx usage on a per process basis + - Added `CU_OCCUPANCY` to `amd-smi process` output. + - Added `CU%` to `amd-smi monitor -q` + - **Expanded Violation Status tracking for GPU metrics 1.8.** - The driver will no longer be supporting existing single-value GFX Clk Below Host Limit fields (`acc_gfx_clk_below_host_limit`, `per_gfx_clk_below_host_limit`, `active_gfx_clk_below_host_limit`), they are now changed in favor of new per-XCP/XCC arrays. 
- Added new fields to `amdsmi_violation_status_t` and related interfaces for enhanced violation breakdown: @@ -54,11 +59,8 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ### Resolved issues -- N/A - -### Upcoming changes - -- N/A +- **Corrected VRAM memory calculation in `amdsmi_get_gpu_process_list`.** + - Previously, the VRAM memory usage reported by `amdsmi_get_gpu_process_list` was inaccurate and calculated using KB vs KiB. ### Known issues diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index ffc91c34e1..a330e2b3d8 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -3300,8 +3300,21 @@ class AMDSMICommands(): filtered_process_values = [] for process_info in process_list: - process_info['mem_usage'] = process_info.pop('mem') - process_info['usage'] = process_info.pop('engine_usage') + process_info = { + "name": process_info["name"], + "pid": process_info["pid"], + "memory_usage": { + "gtt_mem": process_info["memory_usage"]["gtt_mem"], + "cpu_mem": process_info["memory_usage"]["cpu_mem"], + "vram_mem": process_info["memory_usage"]["vram_mem"], + }, + "mem_usage": process_info["mem"], + "usage": { + "gfx": process_info["engine_usage"]["gfx"], + "enc": process_info["engine_usage"]["enc"], + }, + "cu_occupancy": process_info["cu_occupancy"] + } engine_usage_unit = "ns" memory_usage_unit = "B" @@ -5714,35 +5727,43 @@ class AMDSMICommands(): logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info()) raise e + try: + num_compute_units = amdsmi_interface.amdsmi_get_gpu_asic_info(args.gpu)['num_compute_units'] + except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: + num_compute_units = "N/A" + logging.debug("Failed to get num compute units for gpu %s | %s", gpu_id, e.get_error_info()) + # Clean processes dictionary filtered_process_values = [] for process_info in 
process_list: - process_info['mem_usage'] = process_info.pop('mem') - process_info['usage'] = process_info.pop('engine_usage') + process_info.pop('mem') # Remove 'mem' value + process_info.pop('engine_usage') # Remove 'engine_usage' value - engine_usage_unit = "ns" memory_usage_unit = "B" - if self.logger.is_human_readable_format(): - process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage']) for usage_metric in process_info['memory_usage']: process_info["memory_usage"][usage_metric] = self.helpers.convert_bytes_to_readable(process_info["memory_usage"][usage_metric]) memory_usage_unit = "" - process_info['mem_usage'] = self.helpers.unit_format(self.logger, - process_info['mem_usage'], - memory_usage_unit) - - for usage_metric in process_info['usage']: - process_info['usage'][usage_metric] = self.helpers.unit_format(self.logger, - process_info['usage'][usage_metric], - engine_usage_unit) - for usage_metric in process_info['memory_usage']: process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger, process_info['memory_usage'][usage_metric], memory_usage_unit) + if 'cu_occupancy' in process_info: + try: + cu_occupancy = process_info['cu_occupancy'] + if num_compute_units != "N/A" and num_compute_units > 0: + cu_percentage = round((cu_occupancy / num_compute_units) * 100, 1) + process_info['cu_occupancy'] = self.helpers.unit_format(self.logger, + cu_percentage, + '%') + else: + process_info['cu_occupancy'] = "N/A" + except Exception as e: + process_info['cu_occupancy'] = "N/A" + logging.debug("Failed to calculate cu_occupancy percentage for GPU %s | %s", gpu_id, str(e)) + filtered_process_values.append({'process_info': process_info}) # If no processes are populated then we populate an N/A placeholder @@ -5757,8 +5778,7 @@ class AMDSMICommands(): # Build the process table's title and header self.logger.secondary_table_title = "PROCESS INFO" self.logger.secondary_table_header = 'GPU'.rjust(3) + "NAME".rjust(22) + 
"PID".rjust(9) + "GTT_MEM".rjust(10) + \ - "CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USAGE".rjust(11) + \ - "GFX".rjust(8) + "ENC".rjust(8) + "CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "CU%".rjust(9) if watching_output: self.logger.secondary_table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.secondary_table_header diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index 571e4a2a9e..06f8c930ee 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -214,9 +214,9 @@ class AMDSMILogger(): if process_dict['process_info'] == "No running processes detected": # Add N/A for empty process_info table_values += "N/A".rjust(20) + "N/A".rjust(9) + "N/A".rjust(10) + \ - "N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(11) + \ - "N/A".rjust(8) + "N/A".rjust(8) + '\n' + "N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(9) + '\n' else: + #Fix this herre for process_key, process_value in process_dict['process_info'].items(): string_process_value = str(process_value) if process_key == "name": @@ -230,11 +230,8 @@ class AMDSMILogger(): elif process_key == "memory_usage": for memory_key, memory_value in process_value.items(): table_values += str(memory_value).rjust(10) - elif process_key == "mem_usage": - table_values += string_process_value.rjust(11) - elif process_key == "usage": - for usage_key, usage_value in process_value.items(): - table_values += str(usage_value).rjust(8) + elif process_key == "cu_occupancy": + table_values += string_process_value.rjust(9) # Add the stored gpu and stored timestamp to the next line table_values += '\n' if stored_timestamp: @@ -486,20 +483,6 @@ class AMDSMILogger(): raise amdsmi_cli_exceptions(self, "Invalid output format given, only json, csv, and human_readable supported") - def _store_output_rocmsmi(self, gpu_id, argument, data): - if self.is_json_format(): - # put output into self.json_output - pass - elif 
self.is_csv_format(): - # put output into self.csv_output - pass - elif self.is_human_readable_format(): - # put output into self.human_readable_output - pass - else: - raise amdsmi_cli_exceptions(self, "Invalid output format given, only json, csv, and human_readable supported") - - def store_multiple_device_output(self): """ Store the current output into the multiple_device_output then clear the current output diff --git a/projects/amdsmi/docs/reference/amdsmi-py-api.md b/projects/amdsmi/docs/reference/amdsmi-py-api.md index a30f2a1f11..fff571ace7 100644 --- a/projects/amdsmi/docs/reference/amdsmi-py-api.md +++ b/projects/amdsmi/docs/reference/amdsmi-py-api.md @@ -1093,7 +1093,6 @@ except AmdSmiException as e: print(e) ``` - ### amdsmi_get_gpu_process_list Description: Returns the list of processes running on the target GPU; Requires root level access to display root process names; otherwise will return "N/A" @@ -1111,6 +1110,7 @@ Field | Description `mem` | Process memory usage `engine_usage` |
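<table><thead><tr><th>Subfield</th><th>Description</th></tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>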
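`memory_usage` | <table><thead><tr><th>Subfield</th><th>Description</th></tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr></tbody></table>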
+`cu_occupancy` | Number of Compute Units utilized Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function: diff --git a/projects/amdsmi/example/amd_smi_drm_example.cc b/projects/amdsmi/example/amd_smi_drm_example.cc index 6698e6c799..ded6dd9113 100644 --- a/projects/amdsmi/example/amd_smi_drm_example.cc +++ b/projects/amdsmi/example/amd_smi_drm_example.cc @@ -817,6 +817,7 @@ int main() { amdsmi_proc_info_t process = {}; uint64_t mem = 0, gtt_mem = 0, cpu_mem = 0, vram_mem = 0; uint64_t gfx = 0, enc = 0; + uint32_t cu_occupancy = 0; char bdf_str[20]; sprintf(bdf_str, "%04" PRIx64 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32, static_cast(bdf.domain_number), @@ -837,7 +838,7 @@ int main() { printf( "| pid | name | user | gpu bdf | " "fb usage | gtt memory | cpu memory | vram memory | " - "engine usage (ns) |\n"); + "engine usage (ns) | cu occupancy |\n"); printf("| | | | " "| | | | " " | gfx enc |\n"); @@ -855,30 +856,34 @@ int main() { pwd = getpwuid(st.st_uid); if (!pwd) printf("| %5d | %16s | %10d | %s | %7ld KiB | %7ld KiB " - "| %7ld KiB | %7ld KiB | %lu %lu |\n", + "| %7ld KiB | %7ld KiB | %lu %lu | %u |\n", process_info_list[it].pid, process_info_list[it].name, st.st_uid, bdf_str, process_info_list[it].mem / 1024, process_info_list[it].memory_usage.gtt_mem / 1024, process_info_list[it].memory_usage.cpu_mem / 1024, process_info_list[it].memory_usage.vram_mem / 1024, process_info_list[it].engine_usage.gfx, - process_info_list[it].engine_usage.enc); + process_info_list[it].engine_usage.enc, + process_info_list[it].cu_occupancy); else printf("| %5d | %16s | %10s | %s | %7ld KiB | %7ld KiB " - "| %7ld KiB | %7ld KiB | %lu %lu |\n", + "| %7ld KiB | %7ld KiB | %lu %lu | %u |\n", process_info_list[it].pid, process_info_list[it].name, pwd->pw_name, bdf_str, process_info_list[it].mem / 1024, process_info_list[it].memory_usage.gtt_mem / 1024, process_info_list[it].memory_usage.cpu_mem / 1024, process_info_list[it].memory_usage.vram_mem / 1024, 
process_info_list[it].engine_usage.gfx, - process_info_list[it].engine_usage.enc); + process_info_list[it].engine_usage.enc, + process_info_list[it].cu_occupancy); + mem += process_info_list[it].mem / 1024; gtt_mem += process_info_list[it].memory_usage.gtt_mem / 1024; cpu_mem += process_info_list[it].memory_usage.cpu_mem / 1024; vram_mem += process_info_list[it].memory_usage.vram_mem / 1024; gfx = process_info_list[it].engine_usage.gfx; enc = process_info_list[it].engine_usage.enc; + cu_occupancy = process_info_list[it].cu_occupancy; printf( "+-------+------------------+------------+-------------" "-+-------------+-------------+-------------+----------" @@ -887,10 +892,9 @@ int main() { // TODO: To remove compiler warning, the last 3 values in this printf were // set to 0L. Need to find out what these values need to be. printf("| TOTAL:| %s | %7ld " - "KiB | %7ld KiB | %7ld KiB | %7ld KiB | %lu %lu " - "%lu %lu %lu |\n", + "KiB | %7ld KiB | %7ld KiB | %7ld KiB | %lu %lu | %u |\n", bdf_str, mem, gtt_mem, cpu_mem, vram_mem, gfx, - enc, 0L, 0L, 0L); + enc, cu_occupancy, 0L); printf("+=======+==================+============+==============" "+=============+=============+=============+============" "=+==========================================+\n"); diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 02a0117772..d5cf983b93 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -1093,7 +1093,8 @@ typedef struct { uint32_t reserved[10]; } memory_usage; //!< in bytes char container_name[AMDSMI_MAX_STRING_LENGTH]; - uint32_t reserved[12]; + uint32_t cu_occupancy; //!< Num CUs utilized + uint32_t reserved[11]; } amdsmi_proc_info_t; /** diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index fc942257b0..6e7f2a3b01 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ 
b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -2691,6 +2691,7 @@ def amdsmi_get_gpu_process_list( "cpu_mem": process_list[index].memory_usage.cpu_mem, "vram_mem": process_list[index].memory_usage.vram_mem, }, + "cu_occupancy": process_list[index].cu_occupancy }) return result diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index c915ba8467..565688cfd6 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -1289,7 +1289,8 @@ struct_amdsmi_proc_info_t._fields_ = [ ('engine_usage', struct_engine_usage_), ('memory_usage', struct_memory_usage_), ('container_name', ctypes.c_char * 256), - ('reserved', ctypes.c_uint32 * 12), + ('cu_occupancy', ctypes.c_uint32), + ('PADDING_1', ctypes.c_ubyte * 4), ] amdsmi_proc_info_t = struct_amdsmi_proc_info_t diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc index 070e7a6f7b..cbdf93f539 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc @@ -456,7 +456,6 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, proc->sdma_usage = 0; proc->cu_occupancy = 0; - uint32_t cu_count = 0; static amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); static std::map>& kfd_node_map = smi.kfd_node_map(); @@ -510,23 +509,15 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, } else if(sysfs_data_errcode==0){ // Update CU usage by the process - proc->cu_occupancy += std::stoi(tmp); - // Collect count of compute units - cu_count += kfd_node_map[gpu_id]->cu_count(); + proc->cu_occupancy = std::stoi(tmp); } else { // Some GFX revisions do not provide cu_occupancy debugfs method // which may cause ENOENT proc->cu_occupancy = CU_OCCUPANCY_INVALID; - cu_count = 0; } } - // Adjust CU occupancy to percent. 
- if (cu_count > 0) { - proc->cu_occupancy = ((proc->cu_occupancy * 100) / cu_count); - } - return 0; } diff --git a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc index acaa94e627..602041674d 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc @@ -216,6 +216,9 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t& /** * Complete the process information + * This is where we copy rsmi_process_info_t into the larger amdsmi_proc_info_t + * Then populate the remaining fields with the gpuvsmi_get_pid_info() + * TODO FIX HERE TO GRAB KFD VRAM if /proc is inconsistent */ auto get_process_info = [&](const rsmi_process_info_t& rsmi_proc_info, amdsmi_proc_info_t& asmi_proc_info) { auto status_code = gpuvsmi_get_pid_info(get_bdf(), rsmi_proc_info.process_id, asmi_proc_info); @@ -225,6 +228,9 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t& asmi_proc_info.memory_usage.vram_mem = rsmi_proc_info.vram_usage; } + // Copy the cu occupancy from rsmi_process_info_t to amdsmi_proc_info_t + asmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy; + return status_code; }; diff --git a/projects/amdsmi/src/amd_smi/fdinfo.cc b/projects/amdsmi/src/amd_smi/fdinfo.cc index 30f1be27a9..13454a3b29 100644 --- a/projects/amdsmi/src/amd_smi/fdinfo.cc +++ b/projects/amdsmi/src/amd_smi/fdinfo.cc @@ -20,16 +20,17 @@ * THE SOFTWARE. 
*/ -#include #include #include +#include +#include #include -#include -#include + +#include #include #include -#include -#include +#include +#include #include "amd_smi/amdsmi.h" #include "amd_smi/impl/amd_smi_utils.h" @@ -37,230 +38,196 @@ extern "C" { static const char *container_type_name[AMDSMI_MAX_CONTAINER_TYPE] = { - [AMDSMI_CONTAINER_LXC] = "lxc", - [AMDSMI_CONTAINER_DOCKER] = "docker", + [AMDSMI_CONTAINER_LXC] = "lxc", + [AMDSMI_CONTAINER_DOCKER] = "docker", }; -amdsmi_status_t gpuvsmi_pid_is_gpu(const std::string &path, const char *bdf) -{ - DIR *d; - struct dirent *dir; +amdsmi_status_t gpuvsmi_pid_is_gpu(const std::string &path, const char *bdf) { + DIR *d; + struct dirent *dir; - d = opendir(path.c_str()); - if (!d) - return AMDSMI_STATUS_NO_PERM; + d = opendir(path.c_str()); + if (!d) return AMDSMI_STATUS_NO_PERM; - /* iterate through all the fds, try to find - * a match for the GPU bdf - */ - while ((dir = readdir(d)) != NULL) { - std::string file = path + dir->d_name; - std::ifstream fdinfo(file.c_str()); - for (std::string line; std::getline(fdinfo, line);) { - if (line.find(bdf) != std::string::npos) { - closedir(d); - return AMDSMI_STATUS_SUCCESS; - } - } - } + /* iterate through all the fds, try to find + * a match for the GPU bdf + */ + while ((dir = readdir(d)) != NULL) { + std::string file = path + dir->d_name; + std::ifstream fdinfo(file.c_str()); + for (std::string line; std::getline(fdinfo, line);) { + if (line.find(bdf) != std::string::npos) { + closedir(d); + return AMDSMI_STATUS_SUCCESS; + } + } + } - closedir(d); + closedir(d); - return AMDSMI_STATUS_NOT_FOUND; + return AMDSMI_STATUS_NOT_FOUND; } -amdsmi_status_t gpuvsmi_get_pids(const amdsmi_bdf_t &bdf, std::vector &pids, uint64_t *size) -{ - char bdf_str[13]; - DIR *d; - struct dirent *dir; +amdsmi_status_t gpuvsmi_get_pids(const amdsmi_bdf_t &bdf, std::vector &pids, + uint64_t *size) { + char bdf_str[13]; + DIR *d; + struct dirent *dir; - /* 0000:00:00.0 */ - snprintf(bdf_str, 13, 
"%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32, - static_cast(bdf.domain_number & 0xffff), - static_cast(bdf.bus_number & 0xff), - static_cast(bdf.device_number & 0x1f), - static_cast(bdf.function_number & 0x7)); + /* 0000:00:00.0 */ + snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32, + static_cast(bdf.domain_number & 0xffff), + static_cast(bdf.bus_number & 0xff), + static_cast(bdf.device_number & 0x1f), + static_cast(bdf.function_number & 0x7)); - d = opendir("/proc"); - if (!d) - return AMDSMI_STATUS_NO_PERM; + d = opendir("/proc"); + if (!d) return AMDSMI_STATUS_NO_PERM; - pids.clear(); - /* Find the pid folders in /proc/ that we have access to */ - while ((dir = readdir(d)) != NULL) { - if (dir->d_type == DT_DIR) { - /* Try to cast the name of the folder to a - * number, if it fails, it is not */ - char *p; - long int pid; + pids.clear(); + /* Find the pid folders in /proc/ that we have access to */ + while ((dir = readdir(d)) != NULL) { + if (dir->d_type == DT_DIR) { + /* Try to cast the name of the folder to a + * number, if it fails, it is not */ + char *p; + long int pid; - pid = strtol(dir->d_name, &p, 10); - if (*p != 0) - continue; + pid = strtol(dir->d_name, &p, 10); + if (*p != 0) continue; - /* Check if fdinfo is accesible */ - std::string path = "/proc/" + std::string(dir->d_name) + "/fdinfo/"; + /* Check if fdinfo is accesible */ + std::string path = "/proc/" + std::string(dir->d_name) + "/fdinfo/"; - if (access(path.c_str(), R_OK)) - continue; + if (access(path.c_str(), R_OK)) continue; - /* check if GPU is present */ - if (gpuvsmi_pid_is_gpu(path, bdf_str)) - continue; - pids.push_back(pid); - } - } - closedir(d); + /* check if GPU is present */ + if (gpuvsmi_pid_is_gpu(path, bdf_str)) continue; + pids.push_back(pid); + } + } + closedir(d); - *size = pids.size(); - return AMDSMI_STATUS_SUCCESS; + *size = pids.size(); + return AMDSMI_STATUS_SUCCESS; } amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long 
int pid, - amdsmi_proc_info_t &info) -{ - char bdf_str[13]; - DIR *d; - struct dirent *dir; + amdsmi_proc_info_t &info) { + char bdf_str[13]; + DIR *d; + struct dirent *dir; - /* 0000:00:00.0 */ - snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32, - static_cast(bdf.domain_number & 0xffff), - static_cast(bdf.bus_number & 0xff), - static_cast(bdf.device_number & 0x1f), - static_cast(bdf.function_number & 0x7)); + /* 0000:00:00.0 */ + snprintf(bdf_str, 13, "%04" PRIx32 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32, + static_cast(bdf.domain_number & 0xffff), + static_cast(bdf.bus_number & 0xff), + static_cast(bdf.device_number & 0x1f), + static_cast(bdf.function_number & 0x7)); - std::string path = "/proc/" + std::to_string(pid) + "/fdinfo/"; - std::string name_path = "/proc/" + std::to_string(pid) + "/comm"; - std::string cgroup_path = "/proc/" + std::to_string(pid) + "/cgroup"; + std::string path = "/proc/" + std::to_string(pid) + "/fdinfo/"; + std::string name_path = "/proc/" + std::to_string(pid) + "/comm"; + std::string cgroup_path = "/proc/" + std::to_string(pid) + "/cgroup"; - if (gpuvsmi_pid_is_gpu(path.c_str(), bdf_str)) { - return AMDSMI_STATUS_INVAL; - } + if (gpuvsmi_pid_is_gpu(path.c_str(), bdf_str)) { + return AMDSMI_STATUS_INVAL; + } - d = opendir(path.c_str()); - if (!d) - return AMDSMI_STATUS_NO_PERM; + d = opendir(path.c_str()); + if (!d) return AMDSMI_STATUS_NO_PERM; - /* Vectors to check if repated fd pasid */ - // TODO remove pasid Not working in ROCm 6.4+, deprecating in 7.0 - std::vector pasids; + /* Vectors to check if repated fd pasid */ + // TODO remove pasid Not working in ROCm 6.4+, deprecating in 7.0 + std::vector pasids; - memset(&info, 0, sizeof(info)); - /* Iterate through all fdinfos */ - while ((dir = readdir(d)) != NULL) { + memset(&info, 0, sizeof(info)); + /* Iterate through all fdinfos */ + while ((dir = readdir(d)) != NULL) { + std::string file = path + dir->d_name; + std::ifstream fdinfo(file.c_str()); - 
std::string file = path + dir->d_name; - std::ifstream fdinfo(file.c_str()); + for (std::string bdfline; getline(fdinfo, bdfline);) { + if (bdfline.find("drm-pdev:") != std::string::npos) { + char fd_bdf_str[13]; - for (std::string bdfline; getline(fdinfo, bdfline);) { - if (bdfline.find("drm-pdev:") != std::string::npos) { - char fd_bdf_str[13]; + /* Only check against fdinfo files that contain a bdf */ + if (sscanf(bdfline.c_str(), "drm-pdev: %s", &fd_bdf_str[0]) != 1) continue; - /* Only check against fdinfo files that contain a bdf */ - if (sscanf(bdfline.c_str(), "drm-pdev: %s", &fd_bdf_str[0]) != 1) - continue; + /* Populate amdsmi_proc_info_t struct only if the bdf in + * the fdinfo file matches the passed bdf */ + if (strncmp(bdf_str, fd_bdf_str, 13) == 0) { + std::ifstream fdinfo(file.c_str()); - /* Populate amdsmi_proc_info_t struct only if the bdf in - * the fdinfo file matches the passed bdf */ - if (strncmp(bdf_str, fd_bdf_str, 13) == 0){ - std::ifstream fdinfo(file.c_str()); + for (std::string line; getline(fdinfo, line);) { + if (line.find("pasid:") != std::string::npos) { + int pasid; + if (sscanf(line.c_str(), "pasid: %d", &pasid) != 1) continue; + auto it = std::find(pasids.begin(), pasids.end(), pasid); + if (it == pasids.end()) pasids.push_back(pasid); + } else if (line.find("drm-memory-gtt:") != std::string::npos) { + unsigned long mem; + if (sscanf(line.c_str(), "drm-memory-gtt: %lu", &mem) != 1) continue; + info.mem += mem * 1000; + info.memory_usage.gtt_mem += mem * 1000; + } else if (line.find("drm-memory-cpu:") != std::string::npos) { + unsigned long mem; + if (sscanf(line.c_str(), "drm-memory-cpu: %lu", &mem) != 1) continue; + info.mem += mem * 1000; + info.memory_usage.cpu_mem += mem * 1000; + } else if (line.find("drm-memory-vram:") != std::string::npos) { + unsigned long mem; + if (sscanf(line.c_str(), "drm-memory-vram: %lu", &mem) != 1) continue;\ + info.mem += mem * 1000; + info.memory_usage.vram_mem += mem * 1000; + } else if 
(line.find("drm-engine-gfx") != std::string::npos) { + uint64_t engine_gfx; + if (sscanf(line.c_str(), "drm-engine-gfx: %lu", &engine_gfx) != 1) continue; + info.engine_usage.gfx = engine_gfx; + } else if (line.find("drm-engine-enc") != std::string::npos) { + uint64_t engine_enc; + if (sscanf(line.c_str(), "drm-engine-enc: %lu", &engine_enc) != 1) continue; + info.engine_usage.enc = engine_enc; + } + } + } + } + } + } - for (std::string line; getline(fdinfo, line);) { - if (line.find("pasid:") != std::string::npos) { - int pasid; - - if (sscanf(line.c_str(), "pasid: %d", &pasid) != 1) - continue; - - auto it = std::find(pasids.begin(), pasids.end(), pasid); - - if (it == pasids.end()) - pasids.push_back(pasid); - } else if (line.find("drm-memory-gtt:") != std::string::npos) { - unsigned long mem; - - if (sscanf(line.c_str(), "drm-memory-gtt: %lu", &mem) != 1) - continue; - - info.mem += mem * 1024; - info.memory_usage.gtt_mem += mem * 1024; - } else if (line.find("drm-memory-cpu:") != std::string::npos) { - unsigned long mem; - - if (sscanf(line.c_str(), "drm-memory-cpu: %lu", &mem) != 1) - continue; - - info.mem += mem * 1024; - info.memory_usage.cpu_mem += mem * 1024; - } else if (line.find("drm-memory-vram:") != std::string::npos) { - unsigned long mem; - - if (sscanf(line.c_str(), "drm-memory-vram: %lu", &mem) != 1) - continue; - - info.mem += mem * 1024; - info.memory_usage.vram_mem += mem * 1024; - } else if (line.find("drm-engine-gfx") != std::string::npos) { - uint64_t engine_gfx; - - if (sscanf(line.c_str(), "drm-engine-gfx: %lu", &engine_gfx) != 1) - continue; - - info.engine_usage.gfx = engine_gfx; - } else if (line.find("drm-engine-enc") != std::string::npos) { - uint64_t engine_enc; - - if (sscanf(line.c_str(), "drm-engine-enc: %lu", &engine_enc) != 1) - continue; - - info.engine_usage.enc = engine_enc; - } - } - } - } - } - } - - closedir(d); + closedir(d); // Note: If possible at all, try to get the name of the process/container. 
// In case the other info fail, get at least something. - std::ifstream filename(name_path.c_str()); - std::string name; + std::ifstream filename(name_path.c_str()); + std::string name; - getline(filename, name); + getline(filename, name); - if (name.empty()) - return AMDSMI_STATUS_API_FAILED; + if (name.empty()) return AMDSMI_STATUS_API_FAILED; - strncpy(info.name, name.c_str(), std::min( - (unsigned long) AMDSMI_MAX_STRING_LENGTH, - name.length())); + strncpy(info.name, name.c_str(), + std::min((unsigned long)AMDSMI_MAX_STRING_LENGTH, name.length())); - for (int i = 0; i < AMDSMI_MAX_CONTAINER_TYPE; i++) { - std::ifstream cgroup_info(cgroup_path.c_str()); - std::string container_id; - for (std::string line; getline(cgroup_info, line);) { - if (line.find(container_type_name[i]) != std::string::npos) { - container_id = line.substr(line.find(container_type_name[i]) + - strlen(container_type_name[i]) + 1, 16); - strcpy(info.container_name, container_id.c_str()); - break; - } - } - if (strlen(info.container_name) > 0) - break; - } - info.pid = (uint32_t)pid; + for (int i = 0; i < AMDSMI_MAX_CONTAINER_TYPE; i++) { + std::ifstream cgroup_info(cgroup_path.c_str()); + std::string container_id; + for (std::string line; getline(cgroup_info, line);) { + if (line.find(container_type_name[i]) != std::string::npos) { + container_id = line.substr(line.find(container_type_name[i]) + + strlen(container_type_name[i]) + 1, 16); + strcpy(info.container_name, container_id.c_str()); + break; + } + } + if (strlen(info.container_name) > 0) break; + } + info.pid = (uint32_t)pid; - if (!pasids.size()) { - return AMDSMI_STATUS_NOT_FOUND; + if (!pasids.size()) { + return AMDSMI_STATUS_NOT_FOUND; } - return AMDSMI_STATUS_SUCCESS; + return AMDSMI_STATUS_SUCCESS; } - -} // extern "C" +} // extern "C" diff --git a/projects/amdsmi/tools/amdsmi_quick_start.py b/projects/amdsmi/tools/amdsmi_quick_start.py index 4c0c35a4b6..2440fbeff0 100644 --- a/projects/amdsmi/tools/amdsmi_quick_start.py +++ 
b/projects/amdsmi/tools/amdsmi_quick_start.py @@ -44,15 +44,12 @@ try: from amdsmi_logger import AMDSMILogger from amdsmi_parser import AMDSMIParser import amdsmi_cli_exceptions + helpers = AMDSMIHelpers() except ImportError as e: print(f"Failed to import amdsmi cli libs: {e}") print("Ensure that you have installed amdsmi's package.") -helpers = AMDSMIHelpers() - - - # Make exit & quit work without parens because it's annoying type(exit).__repr__ = sys.exit