Added GTT Memory to default output process table (#480)

* Added a GTT memory column to the process table of the default `amd-smi` command output and adjusted the table format to fit

---------

Signed-off-by: gabrpham <Gabriel.Pham@amd.com>

[ROCm/amdsmi commit: 940ece6813]
Šī revīzija ir iekļauta:
Pham, Gabriel
2025-06-13 16:43:56 -05:00
revīziju iesūtīja GitHub
vecāks b1753ad3b3
revīzija dfaf8386fa
3 mainīti faili ar 68 papildinājumiem un 73 dzēšanām
+43 -51
Parādīt failu
@@ -13,57 +13,49 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
```console
$ amd-smi
+------------------------------------------------------------------------------+
| AMD-SMI 26.10.10+42441c78 amdgpu version: 6.15.5 ROCm version: 7.0.0 |
|--------------------------------------+---------------------------------------|
| BDF GPU-Name | Mem-Util Temp UECC Power-Usage |
| GPU HIP-ID OAM-ID Partition-Mode | GFX-Util Fan Memory-Usage |
|======================================+=======================================|
| 0000:0c:00.0 AMD Instinct MI300X | 0 % 37 °C 0 141/750 W |
| 0 0 2 SPX/NPS1 | 0 % N/A 283/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:22:00.0 AMD Instinct MI300X | 0 % 40 °C 0 155/750 W |
| 1 1 1 SPX/NPS1 | 0 % N/A 284/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:38:00.0 AMD Instinct MI300X | 0 % 37 °C 0 141/750 W |
| 2 2 0 SPX/NPS1 | 0 % N/A 283/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:5c:00.0 AMD Instinct MI300X | 0 % 37 °C 0 139/750 W |
| 3 3 3 SPX/NPS1 | 0 % N/A 283/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:9f:00.0 AMD Instinct MI300X | 0 % 37 °C 0 140/750 W |
| 4 4 7 SPX/NPS1 | 0 % N/A 283/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:af:00.0 AMD Instinct MI300X | 0 % 37 °C 0 142/750 W |
| 5 5 5 SPX/NPS1 | 0 % N/A 283/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:bf:00.0 AMD Instinct MI300X | 0 % 36 °C 0 138/750 W |
| 6 6 4 SPX/NPS1 | 0 % N/A 283/196592 MB |
|--------------------------------------+---------------------------------------|
| 0000:df:00.0 AMD Instinct MI300X | 0 % 40 °C 0 138/750 W |
| 7 7 6 SPX/NPS1 | 0 % N/A 283/196592 MB |
+--------------------------------------+---------------------------------------+
+------------------------------------------------------------------------------+
| Processes: |
| GPU PID Process Name VRAM_MEM MEM_USAGE NUM_CU |
|==============================================================================|
| 0 269867 rvs 17.9 GB 19.2 GB 38 |
| 0 269888 rvs 17.9 GB 19.2 GB 38 |
| 1 269867 rvs 17.9 GB 19.2 GB 38 |
| 1 269888 rvs 17.9 GB 19.2 GB 38 |
| 2 269867 rvs 17.9 GB 19.2 GB 38 |
| 2 269888 rvs 17.9 GB 19.2 GB 38 |
| 3 269867 rvs 17.9 GB 19.2 GB 76 |
| 3 269888 rvs 17.9 GB 19.2 GB 0 |
| 4 269867 rvs 17.9 GB 19.0 GB 37 |
| 4 269888 rvs 17.9 GB 19.2 GB 36 |
| 5 269867 rvs 17.9 GB 19.0 GB 76 |
| 5 269888 rvs 17.9 GB 19.2 GB 0 |
| 6 269867 rvs 17.9 GB 19.0 GB 76 |
| 6 269888 rvs 17.9 GB 19.2 GB 0 |
| 7 269867 rvs 17.9 GB 19.2 GB 34 |
| 7 269888 rvs 17.9 GB 19.2 GB 38 |
+------------------------------------------------------------------------------+
+------------------------------------------------------------------------------+
| AMD-SMI 26.0.0+eaa54ecc amdgpu version: 6.12.12 ROCm version: 7.0.0 |
|-------------------------------------+----------------------------------------|
| BDF GPU-Name | Mem-Uti Temp UEC Power-Usage |
| GPU HIP-ID OAM-ID Partition-Mode | GFX-Uti Fan Mem-Usage |
|=====================================+========================================|
| 0000:0c:00.0 AMD Instinct MI300X | 13 % 60 °C 0 734/750 W |
| 0 0 2 SPX/NPS1 | 98 % N/A 4976/196592 MB |
|-------------------------------------+----------------------------------------|
| 0000:22:00.0 AMD Instinct MI300X | 10 % 60 °C 0 652/750 W |
| 1 1 1 SPX/NPS1 | 83 % N/A 4976/196592 MB |
|-------------------------------------+----------------------------------------|
| 0000:38:00.0 AMD Instinct MI300X | 5 % 55 °C 0 376/750 W |
| 2 2 0 SPX/NPS1 | 34 % N/A 4976/196592 MB |
|-------------------------------------+----------------------------------------|
| 0000:5c:00.0 AMD Instinct MI300X | 2 % 57 °C 0 234/750 W |
| 3 3 3 SPX/NPS1 | 12 % N/A 4976/196592 MB |
|-------------------------------------+----------------------------------------|
| 0000:9f:00.0 AMD Instinct MI300X | 1 % 57 °C 0 219/750 W |
| 4 4 7 SPX/NPS1 | 11 % N/A 4976/196592 MB |
|-------------------------------------+----------------------------------------|
| 0000:af:00.0 AMD Instinct MI300X | 3 % 61 °C 0 295/750 W |
| 5 5 5 SPX/NPS1 | 23 % N/A 4976/196592 MB |
|-------------------------------------+----------------------------------------|
| 0000:bf:00.0 AMD Instinct MI300X | 5 % 58 °C 0 367/750 W |
| 6 6 4 SPX/NPS1 | 36 % N/A 4976/196592 MB |
|-------------------------------------+----------------------------------------|
| 0000:df:00.0 AMD Instinct MI300X | 6 % 62 °C 0 434/750 W |
| 7 7 6 SPX/NPS1 | 47 % N/A 4976/196592 MB |
+-------------------------------------+----------------------------------------+
+------------------------------------------------------------------------------+
| Processes: |
| GPU PID Process Name GTT_MEM VRAM_MEM MEM_USAGE NUM_CU |
|==============================================================================|
| 0 1253994 rvs 2.0 MB 2.4 GB 4.6 GB 0 |
| 1 1253994 rvs 2.0 MB 2.4 GB 4.6 GB 0 |
| 2 1253994 rvs 2.0 MB 2.5 GB 4.6 GB 0 |
| 3 1253994 rvs 2.0 MB 2.5 GB 4.6 GB 0 |
| 4 1253994 rvs 2.0 MB 2.4 GB 4.6 GB 114 |
| 5 1253994 rvs 2.0 MB 2.4 GB 4.6 GB 114 |
| 6 1253994 rvs 2.0 MB 2.4 GB 4.6 GB 114 |
| 7 1253994 rvs 2.0 MB 2.4 GB 4.6 GB 0 |
+------------------------------------------------------------------------------+
```
- **Added support for GPU metrics 1.8**.
@@ -6614,7 +6614,7 @@ class AMDSMICommands():
hip_id = "N/A"
gpu_info_dict.update({"hip_id": hip_id})
# mem utilization, GPU utilization, power usage, and temperature
# mem utilization, GPU utilization, power usage, and temperature from gpu_metrics
if gpu_metrics != "N/A":
mem_util = gpu_metrics['average_umc_activity']
mem_util = round(mem_util)
@@ -6683,10 +6683,11 @@ class AMDSMICommands():
try:
raw_process_list = amdsmi_interface.amdsmi_get_gpu_process_list(processor)
for proc in raw_process_list:
proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A"}
proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A","gtt": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A"}
proc_info_dict['gpu'] = gpu_id
proc_info_dict['pid'] = proc['pid']
proc_info_dict['name'] = proc['name']
proc_info_dict['gtt'] = self.helpers.convert_bytes_to_readable(proc['memory_usage']['gtt_mem'])
proc_info_dict['vram'] = self.helpers.convert_bytes_to_readable(proc['memory_usage']['vram_mem'])
proc_info_dict['mem_usage'] = self.helpers.convert_bytes_to_readable(proc['mem'])
proc_info_dict['cu_occupancy'] = str(proc['cu_occupancy'])
@@ -986,9 +986,9 @@ class AMDSMILogger():
def print_default_output(self, output: Dict):
# some template lines
default_line_1 = "+------------------------------------------------------------------------------+"
default_line_2 = "|--------------------------------------+---------------------------------------|"
default_line_3 = "|======================================+=======================================|"
default_line_4 = "+--------------------------------------+---------------------------------------+"
default_line_2 = "|-------------------------------------+----------------------------------------|"
default_line_3 = "|=====================================+========================================|"
default_line_4 = "+-------------------------------------+----------------------------------------+"
default_line_5 = "|==============================================================================|"
# print the version information first
@@ -1008,8 +1008,8 @@ class AMDSMILogger():
print(default_line_1)
print("| AMD-SMI {0:20s} amdgpu version: {1:8s} ROCm version: {2:8s} |".format(amd_smi_version.ljust(20), amdgpu_version, rocm_version))
print(default_line_2)
print("| BDF GPU-Name | Mem-Util Temp UECC Power-Usage |")
print("| GPU HIP-ID OAM-ID Partition-Mode | GFX-Util Fan Memory-Usage |")
print("| BDF GPU-Name | Mem-Uti Temp UEC Power-Usage |")
print("| GPU HIP-ID OAM-ID Partition-Mode | GFX-Uti Fan Mem-Usage |")
print(default_line_3)
line_count = 0
@@ -1026,41 +1026,42 @@ class AMDSMILogger():
mem_util = gpu_info['mem_util']
if mem_util != "N/A":
mem_util = str(mem_util) + " %"
mem_util = mem_util.rjust(8)
mem_util = mem_util.ljust(5)
temp = gpu_info['temp']
if temp != "N/A":
temp = str(temp) + " \u00b0C"
temp = temp.rjust(6)
u_ecc = str(gpu_info['uncorr_ecc']).rjust(5)
u_ecc = str(gpu_info['uncorr_ecc']).ljust(5)
power_usage = gpu_info['power_usage']
if power_usage != "N/A":
power_usage = f"{gpu_info['power_usage']['current_power']}/{gpu_info['power_usage']['power_limit']} W"
power_usage = str(power_usage).rjust(12)
print("| {0:12.12s} {1:22.22s} | {2:8.8s} {3:6.6s} {4:5.5s} {5:12.12s} |".format(bdf, market_name, mem_util, temp, u_ecc, power_usage))
power_usage = str(power_usage).rjust(13)
gpu_id = str(gpu_info['gpu_id']).rjust(3)
hip_id = str(gpu_info['hip_id']).rjust(6)
oam_id = str(gpu_info['oam_id']).rjust(7)
oam_id = str(gpu_info['oam_id']).rjust(6)
partition_modes = str(gpu_info['partition_mode']).rjust(14)
gfx_util = gpu_info['gfx_util']
if gfx_util != "N/A":
gfx_util = str(gfx_util) + " %"
gfx_util = gfx_util.rjust(8)
gfx_util = gfx_util.ljust(5)
fan = gpu_info['fan']
if fan != "N/A":
fan = str(fan) + " %"
fan = fan.rjust(7)
fan = fan.rjust(6)
mem_usage = gpu_info['mem_usage']
if mem_usage != "N/A":
mem_usage = f"{gpu_info['mem_usage']['used_vram']}/{gpu_info['mem_usage']['total_vram']} MB"
mem_usage = mem_usage.rjust(19)
print("| {0:3.3s} {1:6.6s} {2:7.7s} {3:14.14s} | {4:8.8s} {5:7.7s} {6:19.19s} |".format(gpu_id, hip_id, oam_id, partition_modes, gfx_util, fan, mem_usage))
mem_usage = mem_usage.rjust(21)
print("| {0:12.12s} {1:22.22s} | {2:5.5s} {3:6.6s} {4:5.5s} {5:13.13s} |".format(bdf, market_name, mem_util, temp, u_ecc, power_usage))
print("| {0:3.3s} {1:6.6s} {2:6.6s} {3:14.14s} | {4:5.5s} {5:6.6s} {6:21.21s} |".format(gpu_id, hip_id, oam_id, partition_modes, gfx_util, fan, mem_usage))
if line_count < end:
print(default_line_2)
@@ -1071,18 +1072,19 @@ class AMDSMILogger():
# print process list of all GPUs last
print(default_line_1)
print("| Processes: |")
print("| GPU PID Process Name VRAM_MEM MEM_USAGE NUM_CU |")
print("| GPU PID Process Name GTT_MEM VRAM_MEM MEM_USAGE NUM_CU |")
print(default_line_5)
if len(output['processes']) != 0:
for process in output['processes']:
gpu_id = str(process['gpu']).rjust(4)
pid = str(process['pid']).rjust(9)
process_name = str(process['name']).ljust(29)
vram_mem = str(process['vram']).rjust(9)
process_name = str(process['name']).ljust(20)
gtt_mem = str(process['gtt']).rjust(8)
vram_mem = str(process['vram']).rjust(8)
mem_usage = str(process['mem_usage']).rjust(9)
cu_occupancy = str(process['cu_occupancy']).rjust(6)
print("| {0:4s} {1:9s} {2:29s} {3:9s} {4:9s} {5:6s} |".format(
gpu_id, pid, process_name, vram_mem, mem_usage, cu_occupancy))
print("| {0:4s} {1:9s} {2:20s} {3:8s} {4:8s} {5:9s} {6:6s} |".format(
gpu_id, pid, process_name, gtt_mem, vram_mem, mem_usage, cu_occupancy))
else:
print("| No running processes found |")
print(default_line_1)