diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md
index 02877694a1..873aa0426e 100644
--- a/projects/amdsmi/CHANGELOG.md
+++ b/projects/amdsmi/CHANGELOG.md
@@ -24,6 +24,14 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
- The entry `policies` is added to the end of the dictionary to match API definition.
- The entry `plpds` is marked for deprecation as it has the same information as `policies`.
+- **Added `evicted_time` metric for KFD processes**.
+ - Time, in milliseconds, that a process's queues have been evicted on a GPU
+ - Added to CLI in `amd-smi monitor -q` and `amd-smi process`
+ - Added to C API and Python API:
+ - amdsmi_get_gpu_process_list()
+ - amdsmi_get_gpu_compute_process_info()
+ - amdsmi_get_gpu_compute_process_info_by_pid()
+
### Changed
- N/A
diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py
index 5164075960..daf08577fa 100644
--- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py
+++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py
@@ -3400,11 +3400,13 @@ class AMDSMICommands():
"gfx": process_info["engine_usage"]["gfx"],
"enc": process_info["engine_usage"]["enc"],
},
- "cu_occupancy": process_info["cu_occupancy"]
+ "cu_occupancy": process_info["cu_occupancy"],
+ "evicted_time": process_info["evicted_time"]
}
engine_usage_unit = "ns"
memory_usage_unit = "B"
+ evicted_time_unit = "ms"
if self.logger.is_human_readable_format():
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
@@ -3415,6 +3417,10 @@ class AMDSMICommands():
process_info['mem_usage'] = self.helpers.unit_format(self.logger,
process_info['mem_usage'],
memory_usage_unit)
+
+ process_info['evicted_time'] = self.helpers.unit_format(self.logger,
+ process_info['evicted_time'],
+ evicted_time_unit)
for usage_metric in process_info['usage']:
process_info['usage'][usage_metric] = self.helpers.unit_format(self.logger,
@@ -6130,8 +6136,10 @@ class AMDSMICommands():
process_info.pop('engine_usage') # Remove 'engine_usage' value
process_info['mem_usage'] = process_info.pop('mem')
process_info['cu_occupancy'] = process_info.pop('cu_occupancy')
+ process_info['evicted_time'] = process_info.pop('evicted_time')
memory_usage_unit = "B"
+ evicted_time_unit = "ms"
if self.logger.is_human_readable_format():
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
@@ -6143,6 +6151,10 @@ class AMDSMICommands():
process_info['mem_usage'],
memory_usage_unit)
+ process_info['evicted_time'] = self.helpers.unit_format(self.logger,
+ process_info['evicted_time'],
+ evicted_time_unit)
+
for usage_metric in process_info['memory_usage']:
process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger,
process_info['memory_usage'][usage_metric],
@@ -6176,7 +6188,7 @@ class AMDSMICommands():
# Build the process table's title and header
self.logger.secondary_table_title = "PROCESS INFO"
self.logger.secondary_table_header = 'GPU'.rjust(3) + "NAME".rjust(19) + "PID".rjust(9) + "GTT_MEM".rjust(10) + \
- "CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USG".rjust(10) + "CU%".rjust(9)
+ "CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USG".rjust(10) + "CU%".rjust(9) + "EVICT".rjust(10)
if watching_output:
self.logger.secondary_table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.secondary_table_header
@@ -7334,7 +7346,7 @@ class AMDSMICommands():
try:
raw_process_list = amdsmi_interface.amdsmi_get_gpu_process_list(processor)
for proc in raw_process_list:
- proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A","gtt": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A"}
+ proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A","gtt": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A", "evicted_time" : "N/A"}
proc_info_dict['gpu'] = gpu_id
proc_info_dict['pid'] = proc['pid']
proc_info_dict['name'] = proc['name']
@@ -7350,6 +7362,7 @@ class AMDSMICommands():
proc_info_dict['cu_occupancy'] = {"current_cu": "N/A", "total_num_cu": total_num_cu}
except (ValueError, TypeError):
proc_info_dict['cu_occupancy'] = {"current_cu": "N/A", "total_num_cu": total_num_cu}
+ proc_info_dict['evicted_time'] = proc['evicted_time']
all_process_list.append(proc_info_dict)
except amdsmi_exception.AmdSmiLibraryException as e:
diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py
index 7f23153aab..3efa82c2c0 100644
--- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py
+++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py
@@ -230,7 +230,7 @@ class AMDSMILogger():
# Add N/A for empty process_info
table_values += "N/A".rjust(17) + "N/A".rjust(9) + "N/A".rjust(10) + \
"N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(10) + \
- "N/A".rjust(9) + '\n'
+ "N/A".rjust(9) + "N/A".rjust(10) + '\n'
else:
#Fix this herre
for process_key, process_value in process_dict['process_info'].items():
@@ -251,6 +251,8 @@ class AMDSMILogger():
table_values += string_process_value.rjust(10)
elif process_key == "cu_occupancy":
table_values += string_process_value.rjust(9)
+ elif process_key == "evicted_time":
+ table_values += string_process_value.rjust(10)  # same width as the "EVICT" header and the N/A row
# Add the stored gpu and stored timestamp to the next line
table_values += '\n'
if stored_timestamp:
@@ -1124,8 +1126,9 @@ class AMDSMILogger():
cu_occupancy = (str(round(process['cu_occupancy']['current_cu'] / process['cu_occupancy']['total_num_cu'] * 100, 1)) + " %").rjust(7)
else:
cu_occupancy = "N/A"
- print("| {0:4.4s} {1:9.9s} {2:19.19s} {3:8.8s} {4:8.8s} {5:9.9s} {6:7.7s} |".format(
- gpu_id, pid, process_name, gtt_mem, vram_mem, mem_usage, cu_occupancy))
+ evicted_time = str(process['evicted_time']).rjust(9)
+ print("| {0:4.4s} {1:9.9s} {2:19.19s} {3:8.8s} {4:8.8s} {5:9.9s} {6:7.7s} {7:9.9s} |".format(
+ gpu_id, pid, process_name, gtt_mem, vram_mem, mem_usage, cu_occupancy, evicted_time))
if process['name'] == "N/A":
elevated_permission_error = True
else:
diff --git a/projects/amdsmi/docs/reference/amdsmi-py-api.md b/projects/amdsmi/docs/reference/amdsmi-py-api.md
index 968fde8432..ca66acc6e1 100644
--- a/projects/amdsmi/docs/reference/amdsmi-py-api.md
+++ b/projects/amdsmi/docs/reference/amdsmi-py-api.md
@@ -1186,6 +1186,7 @@ Field | Description
`engine_usage` |
| Subfield | Description |
| `gfx` | GFX engine usage in ns |
| `enc` | Encode engine usage in ns |
`memory_usage` | | Subfield | Description |
| `gtt_mem` | GTT memory usage in Bytes |
| `cpu_mem` | CPU memory usage in Bytes |
| `vram_mem` | Process VRAM memory usage in Bytes |
`cu_occupancy` | Number of Compute Units utilized
+`evicted_time` | Time that queues are evicted on a GPU in milliseconds
Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
@@ -3534,6 +3535,7 @@ Field | Description
`vram_usage` | VRAM usage
`sdma_usage` | SDMA usage in microseconds
`cu_occupancy` | Compute Unit usage in percents
+`evicted_time` | Time that queues are evicted on a GPU in milliseconds
Exceptions that can be thrown by `amdsmi_get_gpu_compute_process_info` function:
@@ -3568,6 +3570,7 @@ Field | Description
`vram_usage` | VRAM usage
`sdma_usage` | SDMA usage in microseconds
`cu_occupancy` | Compute Unit usage in percents
+`evicted_time` | Time that queues are evicted on a GPU in milliseconds
Exceptions that can be thrown by `amdsmi_get_gpu_compute_process_info_by_pid` function:
diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h
index 8a7214ff9f..4c8a377971 100644
--- a/projects/amdsmi/include/amd_smi/amdsmi.h
+++ b/projects/amdsmi/include/amd_smi/amdsmi.h
@@ -1159,7 +1159,8 @@ typedef struct {
} memory_usage; //!< In Bytes
char container_name[AMDSMI_MAX_STRING_LENGTH];
uint32_t cu_occupancy; //!< Num CUs utilized
- uint32_t reserved[11];
+ uint32_t evicted_time; //!< Time that queues are evicted on a GPU in milliseconds
+ uint32_t reserved[10];
} amdsmi_proc_info_t;
/**
@@ -2085,6 +2086,7 @@ typedef struct {
uint64_t vram_usage; //!< VRAM usage in MB
uint64_t sdma_usage; //!< SDMA usage in microseconds
uint32_t cu_occupancy; //!< Compute Unit usage in percent
+ uint32_t evicted_time; //!< Time that queues are evicted on a GPU in milliseconds
} amdsmi_process_info_t;
/**
diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py
index 348a341fc8..6b611f2f28 100644
--- a/projects/amdsmi/py-interface/amdsmi_interface.py
+++ b/projects/amdsmi/py-interface/amdsmi_interface.py
@@ -2953,7 +2953,8 @@ def amdsmi_get_gpu_process_list(
"cpu_mem": process_list[index].memory_usage.cpu_mem,
"vram_mem": process_list[index].memory_usage.vram_mem,
},
- "cu_occupancy": _validate_if_max_uint(process_list[index].cu_occupancy, MaxUIntegerTypes.UINT32_T)
+ "cu_occupancy": _validate_if_max_uint(process_list[index].cu_occupancy, MaxUIntegerTypes.UINT32_T),
+ "evicted_time": _validate_if_max_uint(process_list[index].evicted_time, MaxUIntegerTypes.UINT32_T)
})
return result
@@ -5303,6 +5304,7 @@ def amdsmi_get_gpu_compute_process_info() -> List[Dict[str, int]]:
"vram_usage": proc.vram_usage,
"sdma_usage": proc.sdma_usage,
"cu_occupancy": proc.cu_occupancy,
+ "evicted_time": proc.evicted_time,
}
for proc in procs
]
@@ -5324,6 +5326,7 @@ def amdsmi_get_gpu_compute_process_info_by_pid(pid: int) -> Dict[str, int]:
"vram_usage": proc.vram_usage,
"sdma_usage": proc.sdma_usage,
"cu_occupancy": proc.cu_occupancy,
+ "evicted_time": proc.evicted_time,
}
diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py
index 7c4ae3817b..8bbffd2b1e 100644
--- a/projects/amdsmi/py-interface/amdsmi_wrapper.py
+++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py
@@ -904,22 +904,22 @@ amdsmi_frequency_range_t = struct_amdsmi_frequency_range_t
class union_amdsmi_bdf_t(Union):
pass
-class struct_amdsmi_bdf_t(Structure):
+class struct_bdf_(Structure):
pass
-struct_amdsmi_bdf_t._pack_ = 1 # source:False
-struct_amdsmi_bdf_t._fields_ = [
+struct_bdf_._pack_ = 1 # source:False
+struct_bdf_._fields_ = [
('function_number', ctypes.c_uint64, 3),
('device_number', ctypes.c_uint64, 5),
('bus_number', ctypes.c_uint64, 8),
('domain_number', ctypes.c_uint64, 48),
]
-class struct_bdf_(Structure):
+class struct_amdsmi_bdf_t(Structure):
pass
-struct_bdf_._pack_ = 1 # source:False
-struct_bdf_._fields_ = [
+struct_amdsmi_bdf_t._pack_ = 1 # source:False
+struct_amdsmi_bdf_t._fields_ = [
('function_number', ctypes.c_uint64, 3),
('device_number', ctypes.c_uint64, 5),
('bus_number', ctypes.c_uint64, 8),
@@ -1397,7 +1397,9 @@ struct_amdsmi_proc_info_t._fields_ = [
('memory_usage', struct_memory_usage_),
('container_name', ctypes.c_char * 256),
('cu_occupancy', ctypes.c_uint32),
- ('reserved', ctypes.c_uint32 * 11),
+ ('evicted_time', ctypes.c_uint32),
+ ('reserved', ctypes.c_uint32 * 10),
+ ('PADDING_1', ctypes.c_ubyte * 4),
]
amdsmi_proc_info_t = struct_amdsmi_proc_info_t
@@ -2196,7 +2198,7 @@ struct_amdsmi_process_info_t._fields_ = [
('vram_usage', ctypes.c_uint64),
('sdma_usage', ctypes.c_uint64),
('cu_occupancy', ctypes.c_uint32),
- ('PADDING_1', ctypes.c_ubyte * 4),
+ ('evicted_time', ctypes.c_uint32),
]
amdsmi_process_info_t = struct_amdsmi_process_info_t
diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h
index 3e8a5c7b67..847a4d0092 100644
--- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h
+++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h
@@ -1471,10 +1471,11 @@ typedef struct {
uint64_t vram_usage; //!< VRAM usage
uint64_t sdma_usage; //!< SDMA usage in microseconds
uint32_t cu_occupancy; //!< Compute Unit usage in percent
+ uint32_t evicted_time; //!< Time that queues are evicted on a GPU in milliseconds
} rsmi_process_info_t;
//! CU occupancy invalidation value for the GFX revisions not providing cu_occupancy debugfs method
-#define CU_OCCUPANCY_INVALID 0xFFFFFFFF
+#define KFD_STATS_INVALID 0xFFFFFFFF
/**
* @brief Opaque handle to function-support object
diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc
index 20f67ed6d5..86ffc315b6 100644
--- a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc
+++ b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc
@@ -440,6 +440,28 @@ static int CheckValidProcessInfoData(const std::string& s, int sysfs_ret){
return sysfs_ret;
}
+// Read one per-process KFD stat file at `path` into `val`. Returns 0 on success or when
+// the check reports ENOENT (val = KFD_STATS_INVALID); otherwise returns that error code.
+static int GetProcessKFDStats(const std::string& path, uint32_t& val){
+  std::string tmp;
+  int err = ReadSysfsStr(path, &tmp);
+  auto sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
+
+  if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
+    return sysfs_data_errcode;
+  }
+  else if(sysfs_data_errcode==0){
+    // Update KFD stat by the process, narrowed to the uint32_t out-parameter
+    val = static_cast<uint32_t>(std::stoul(tmp));
+  }
+  else {
+    // Some GFX revisions do not provide KFD stats debugfs method
+    // which may cause ENOENT
+    val = KFD_STATS_INVALID;
+  }
+  return 0;
+}
+
int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
std::unordered_set *gpu_set) {
assert(proc != nullptr);
@@ -447,6 +469,7 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
int err;
std::string tmp;
std::unordered_set::iterator itr;
+ uint32_t kfd_stat;
std::string proc_str_path = kKFDProcPathRoot;
proc_str_path += "/";
@@ -460,6 +483,7 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
proc->vram_usage = 0;
proc->sdma_usage = 0;
proc->cu_occupancy = 0;
+ proc->evicted_time = 0;
for (itr = gpu_set->begin(); itr != gpu_set->end(); itr++) {
uint64_t gpu_id = (*itr);
@@ -502,21 +526,23 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
cu_occupancy_path += std::to_string(gpu_id);
cu_occupancy_path += "/cu_occupancy";
- err = ReadSysfsStr(cu_occupancy_path, &tmp);
- sysfs_data_errcode = CheckValidProcessInfoData(tmp, err);
+ err = GetProcessKFDStats(cu_occupancy_path, kfd_stat);
+ if (err != 0){
+ return err;
+ }
+ proc->cu_occupancy = kfd_stat;
- if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){
- return sysfs_data_errcode;
- }
- else if(sysfs_data_errcode==0){
- // Update CU usage by the process
- proc->cu_occupancy = std::stoi(tmp);
- }
- else {
- // Some GFX revisions do not provide cu_occupancy debugfs method
- // which may cause ENOENT
- proc->cu_occupancy = CU_OCCUPANCY_INVALID;
+ std::string evicted_time_path = proc_str_path;
+ evicted_time_path += "/stats_";
+ evicted_time_path += std::to_string(gpu_id);
+ evicted_time_path += "/evicted_ms";
+
+ err = GetProcessKFDStats(evicted_time_path, kfd_stat);
+ if (err != 0){
+ return err;
}
+ proc->evicted_time = kfd_stat;
+
}
return 0;
diff --git a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc
index 107d215898..eb1e183a4b 100644
--- a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc
+++ b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc
@@ -178,8 +178,9 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
amdsmi_proc_info.memory_usage.vram_mem = rsmi_proc_info.vram_usage;
}
- // Copy the cu occupancy from rsmi_process_info_t to amdsmi_proc_info_t
+ // Copy the kfd stats from rsmi_process_info_t to amdsmi_proc_info_t
amdsmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy;
+ amdsmi_proc_info.evicted_time = rsmi_proc_info.evicted_time;
// Safely handle KFD processes to get total memory_usage of the process
uint64_t kfd_gpu_id = get_kfd_gpu_id();
diff --git a/projects/amdsmi/tests/amd_smi_test/functional/process_info_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/process_info_read.cc
index 6feeb902e9..a1e9b72d88 100644
--- a/projects/amdsmi/tests/amd_smi_test/functional/process_info_read.cc
+++ b/projects/amdsmi/tests/amd_smi_test/functional/process_info_read.cc
@@ -184,6 +184,8 @@ void TestProcInfoRead::Run(void) {
proc_info.sdma_usage <<
" Compute Unit Usage: " <<
proc_info.cu_occupancy <<
+ " Evicted Time: " <<
+ proc_info.evicted_time << std::endl <<
std::endl;
}
}