From dfaf8386fa5f1043224e1bd02e84d110628ef378 Mon Sep 17 00:00:00 2001 From: "Pham, Gabriel" Date: Fri, 13 Jun 2025 16:43:56 -0500 Subject: [PATCH] Added GTT Memory to default output process table (#480) * Added GTT Memory to default command and adjusted table format --------- Signed-off-by: gabrpham [ROCm/amdsmi commit: 940ece68135e6026fbea89aebf9cc1f0214668a3] --- projects/amdsmi/CHANGELOG.md | 94 +++++++++---------- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 5 +- projects/amdsmi/amdsmi_cli/amdsmi_logger.py | 42 +++++---- 3 files changed, 68 insertions(+), 73 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 98e15b368a..c3ca787184 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -13,57 +13,49 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ```console $ amd-smi - +------------------------------------------------------------------------------+ - | AMD-SMI 26.10.10+42441c78 amdgpu version: 6.15.5 ROCm version: 7.0.0 | - |--------------------------------------+---------------------------------------| - | BDF GPU-Name | Mem-Util Temp UECC Power-Usage | - | GPU HIP-ID OAM-ID Partition-Mode | GFX-Util Fan Memory-Usage | - |======================================+=======================================| - | 0000:0c:00.0 AMD Instinct MI300X | 0 % 37 °C 0 141/750 W | - | 0 0 2 SPX/NPS1 | 0 % N/A 283/196592 MB | - |--------------------------------------+---------------------------------------| - | 0000:22:00.0 AMD Instinct MI300X | 0 % 40 °C 0 155/750 W | - | 1 1 1 SPX/NPS1 | 0 % N/A 284/196592 MB | - |--------------------------------------+---------------------------------------| - | 0000:38:00.0 AMD Instinct MI300X | 0 % 37 °C 0 141/750 W | - | 2 2 0 SPX/NPS1 | 0 % N/A 283/196592 MB | - |--------------------------------------+---------------------------------------| - | 0000:5c:00.0 AMD Instinct MI300X | 0 % 37 °C 0 139/750 W | - | 3 3 3 SPX/NPS1 | 0 % N/A 283/196592 MB | - |--------------------------------------+---------------------------------------| - | 0000:9f:00.0 AMD Instinct MI300X | 0 % 37 °C 0 140/750 W | - | 4 4 7 SPX/NPS1 | 0 % N/A 283/196592 MB | - |--------------------------------------+---------------------------------------| - | 0000:af:00.0 AMD Instinct MI300X | 0 % 37 °C 0 142/750 W | - | 5 5 5 SPX/NPS1 | 0 % N/A 283/196592 MB | - |--------------------------------------+---------------------------------------| - | 0000:bf:00.0 AMD Instinct MI300X | 0 % 36 °C 0 138/750 W | - | 6 6 4 SPX/NPS1 | 0 % N/A 283/196592 MB | - |--------------------------------------+---------------------------------------| - | 0000:df:00.0 AMD Instinct MI300X | 0 % 40 °C 0 138/750 W | - | 7 7 6 SPX/NPS1 | 0 % N/A 283/196592 MB | - +--------------------------------------+---------------------------------------+ - +------------------------------------------------------------------------------+ - | Processes: | - | GPU PID Process Name VRAM_MEM MEM_USAGE NUM_CU | - |==============================================================================| - | 0 269867 rvs 17.9 GB 19.2 GB 38 | - | 0 269888 rvs 17.9 GB 19.2 GB 38 | - | 1 269867 rvs 17.9 GB 19.2 GB 38 | - | 1 269888 rvs 17.9 GB 19.2 GB 38 | - | 2 269867 rvs 17.9 GB 19.2 GB 38 | - | 2 269888 rvs 17.9 GB 19.2 GB 38 | - | 3 269867 rvs 17.9 GB 19.2 GB 76 | - | 3 269888 rvs 17.9 GB 19.2 GB 0 | - | 4 269867 rvs 17.9 GB 19.0 GB 37 | - | 4 269888 rvs 17.9 GB 19.2 GB 36 | - | 5 269867 rvs 17.9 GB 19.0 GB 76 | - | 5 269888 rvs 17.9 GB 19.2 GB 0 | - | 6 269867 rvs 17.9 GB 19.0 GB 76 | - | 6 269888 rvs 17.9 GB 19.2 GB 0 | - | 7 269867 rvs 17.9 GB 19.2 GB 34 | - | 7 269888 rvs 17.9 GB 19.2 GB 38 | - +------------------------------------------------------------------------------+ ++------------------------------------------------------------------------------+ +| AMD-SMI 26.0.0+eaa54ecc amdgpu version: 6.12.12 ROCm version: 7.0.0 | +|-------------------------------------+----------------------------------------| +| BDF GPU-Name | Mem-Uti Temp UEC Power-Usage | +| GPU HIP-ID OAM-ID Partition-Mode | GFX-Uti Fan Mem-Usage | +|=====================================+========================================| +| 0000:0c:00.0 AMD Instinct MI300X | 13 % 60 °C 0 734/750 W | +| 0 0 2 SPX/NPS1 | 98 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:22:00.0 AMD Instinct MI300X | 10 % 60 °C 0 652/750 W | +| 1 1 1 SPX/NPS1 | 83 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:38:00.0 AMD Instinct MI300X | 5 % 55 °C 0 376/750 W | +| 2 2 0 SPX/NPS1 | 34 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:5c:00.0 AMD Instinct MI300X | 2 % 57 °C 0 234/750 W | +| 3 3 3 SPX/NPS1 | 12 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:9f:00.0 AMD Instinct MI300X | 1 % 57 °C 0 219/750 W | +| 4 4 7 SPX/NPS1 | 11 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:af:00.0 AMD Instinct MI300X | 3 % 61 °C 0 295/750 W | +| 5 5 5 SPX/NPS1 | 23 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:bf:00.0 AMD Instinct MI300X | 5 % 58 °C 0 367/750 W | +| 6 6 4 SPX/NPS1 | 36 % N/A 4976/196592 MB | +|-------------------------------------+----------------------------------------| +| 0000:df:00.0 AMD Instinct MI300X | 6 % 62 °C 0 434/750 W | +| 7 7 6 SPX/NPS1 | 47 % N/A 4976/196592 MB | ++-------------------------------------+----------------------------------------+ ++------------------------------------------------------------------------------+ +| Processes: | +| GPU PID Process Name GTT_MEM VRAM_MEM MEM_USAGE NUM_CU | +|==============================================================================| +| 0 1253994 rvs 2.0 MB 2.4 GB 4.6 GB 0 | +| 1 1253994 rvs 2.0 MB 2.4 GB 4.6 GB 0 | +| 2 1253994 rvs 2.0 MB 2.5 GB 4.6 GB 0 | +| 3 1253994 rvs 2.0 MB 2.5 GB 4.6 GB 0 | +| 4 1253994 rvs 2.0 MB 2.4 GB 4.6 GB 114 | +| 5 1253994 rvs 2.0 MB 2.4 GB 4.6 GB 114 | +| 6 1253994 rvs 2.0 MB 2.4 GB 4.6 GB 114 | +| 7 1253994 rvs 2.0 MB 2.4 GB 4.6 GB 0 | ++------------------------------------------------------------------------------+ ``` - **Added support for GPU metrics 1.8**. diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index ac67a2aedd..027ab4212a 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -6614,7 +6614,7 @@ class AMDSMICommands(): hip_id = "N/A" gpu_info_dict.update({"hip_id": hip_id}) - # mem utilization, GPU utilization, power usage, and temperature + # mem utilization, GPU utilization, power usage, and temperature from gpu_metrics if gpu_metrics != "N/A": mem_util = gpu_metrics['average_umc_activity'] mem_util = round(mem_util) @@ -6683,10 +6683,11 @@ class AMDSMICommands(): try: raw_process_list = amdsmi_interface.amdsmi_get_gpu_process_list(processor) for proc in raw_process_list: - proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A"} + proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A","gtt": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A"} proc_info_dict['gpu'] = gpu_id proc_info_dict['pid'] = proc['pid'] proc_info_dict['name'] = proc['name'] + proc_info_dict['gtt'] = self.helpers.convert_bytes_to_readable(proc['memory_usage']['gtt_mem']) proc_info_dict['vram'] = self.helpers.convert_bytes_to_readable(proc['memory_usage']['vram_mem']) proc_info_dict['mem_usage'] = self.helpers.convert_bytes_to_readable(proc['mem']) proc_info_dict['cu_occupancy'] = str(proc['cu_occupancy']) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index fc4d88f4c4..8b4697582c 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -986,9 +986,9 @@ class AMDSMILogger(): def print_default_output(self, output: Dict): # some template lines default_line_1 = "+------------------------------------------------------------------------------+" - default_line_2 = "|--------------------------------------+---------------------------------------|" - default_line_3 = "|======================================+=======================================|" - default_line_4 = "+--------------------------------------+---------------------------------------+" + default_line_2 = "|-------------------------------------+----------------------------------------|" + default_line_3 = "|=====================================+========================================|" + default_line_4 = "+-------------------------------------+----------------------------------------+" default_line_5 = "|==============================================================================|" # print the version information first @@ -1008,8 +1008,8 @@ class AMDSMILogger(): print(default_line_1) print("| AMD-SMI {0:20s} amdgpu version: {1:8s} ROCm version: {2:8s} |".format(amd_smi_version.ljust(20), amdgpu_version, rocm_version)) print(default_line_2) - print("| BDF GPU-Name | Mem-Util Temp UECC Power-Usage |") - print("| GPU HIP-ID OAM-ID Partition-Mode | GFX-Util Fan Memory-Usage |") + print("| BDF GPU-Name | Mem-Uti Temp UEC Power-Usage |") + print("| GPU HIP-ID OAM-ID Partition-Mode | GFX-Uti Fan Mem-Usage |") print(default_line_3) line_count = 0 @@ -1026,41 +1026,42 @@ class AMDSMILogger(): mem_util = gpu_info['mem_util'] if mem_util != "N/A": mem_util = str(mem_util) + " %" - mem_util = mem_util.rjust(8) + mem_util = mem_util.ljust(5) temp = gpu_info['temp'] if temp != "N/A": temp = str(temp) + " \u00b0C" temp = temp.rjust(6) - u_ecc = str(gpu_info['uncorr_ecc']).rjust(5) + u_ecc = str(gpu_info['uncorr_ecc']).ljust(5) power_usage = gpu_info['power_usage'] if power_usage != "N/A": power_usage = f"{gpu_info['power_usage']['current_power']}/{gpu_info['power_usage']['power_limit']} W" - power_usage = str(power_usage).rjust(12) - print("| {0:12.12s} {1:22.22s} | {2:8.8s} {3:6.6s} {4:5.5s} {5:12.12s} |".format(bdf, market_name, mem_util, temp, u_ecc, power_usage)) - + power_usage = str(power_usage).rjust(13) + gpu_id = str(gpu_info['gpu_id']).rjust(3) hip_id = str(gpu_info['hip_id']).rjust(6) - oam_id = str(gpu_info['oam_id']).rjust(7) + oam_id = str(gpu_info['oam_id']).rjust(6) partition_modes = str(gpu_info['partition_mode']).rjust(14) gfx_util = gpu_info['gfx_util'] if gfx_util != "N/A": gfx_util = str(gfx_util) + " %" - gfx_util = gfx_util.rjust(8) + gfx_util = gfx_util.ljust(5) fan = gpu_info['fan'] if fan != "N/A": fan = str(fan) + " %" - fan = fan.rjust(7) + fan = fan.rjust(6) mem_usage = gpu_info['mem_usage'] if mem_usage != "N/A": mem_usage = f"{gpu_info['mem_usage']['used_vram']}/{gpu_info['mem_usage']['total_vram']} MB" - mem_usage = mem_usage.rjust(19) - print("| {0:3.3s} {1:6.6s} {2:7.7s} {3:14.14s} | {4:8.8s} {5:7.7s} {6:19.19s} |".format(gpu_id, hip_id, oam_id, partition_modes, gfx_util, fan, mem_usage)) + mem_usage = mem_usage.rjust(21) + + print("| {0:12.12s} {1:22.22s} | {2:5.5s} {3:6.6s} {4:5.5s} {5:13.13s} |".format(bdf, market_name, mem_util, temp, u_ecc, power_usage)) + print("| {0:3.3s} {1:6.6s} {2:6.6s} {3:14.14s} | {4:5.5s} {5:6.6s} {6:21.21s} |".format(gpu_id, hip_id, oam_id, partition_modes, gfx_util, fan, mem_usage)) if line_count < end: print(default_line_2) @@ -1071,18 +1072,19 @@ class AMDSMILogger(): # print process list of all GPUs last print(default_line_1) print("| Processes: |") - print("| GPU PID Process Name VRAM_MEM MEM_USAGE NUM_CU |") + print("| GPU PID Process Name GTT_MEM VRAM_MEM MEM_USAGE NUM_CU |") print(default_line_5) if len(output['processes']) != 0: for process in output['processes']: gpu_id = str(process['gpu']).rjust(4) pid = str(process['pid']).rjust(9) - process_name = str(process['name']).ljust(29) - vram_mem = str(process['vram']).rjust(9) + process_name = str(process['name']).ljust(20) + gtt_mem = str(process['gtt']).rjust(8) + vram_mem = str(process['vram']).rjust(8) mem_usage = str(process['mem_usage']).rjust(9) cu_occupancy = str(process['cu_occupancy']).rjust(6) - print("| {0:4s} {1:9s} {2:29s} {3:9s} {4:9s} {5:6s} |".format( - gpu_id, pid, process_name, vram_mem, mem_usage, cu_occupancy)) + print("| {0:4s} {1:9s} {2:20s} {3:8s} {4:8s} {5:9s} {6:6s} |".format( + gpu_id, pid, process_name, gtt_mem, vram_mem, mem_usage, cu_occupancy)) else: print("| No running processes found |") print(default_line_1) \ No newline at end of file