diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 02877694a1..873aa0426e 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -24,6 +24,14 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr - The entry `policies` is added to the end of the dictionary to match API definition. - The entry `plpds` is marked for deprecation as it has the same information as `policies`. +- **Added evicted_time metric for kfd processes**. + - Time that queues are evicted on a GPU in milliseconds + - Added to CLI in `amd-smi monitor -q` and `amd-smi process` + - Added to C API and Python API: + - amdsmi_get_gpu_process_list() + - amdsmi_get_gpu_compute_process_info() + - amdsmi_get_gpu_compute_process_info_by_pid() + ### Changed - N/A diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 5164075960..daf08577fa 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -3400,11 +3400,13 @@ class AMDSMICommands(): "gfx": process_info["engine_usage"]["gfx"], "enc": process_info["engine_usage"]["enc"], }, - "cu_occupancy": process_info["cu_occupancy"] + "cu_occupancy": process_info["cu_occupancy"], + "evicted_time": process_info["evicted_time"] } engine_usage_unit = "ns" memory_usage_unit = "B" + evicted_time_unit = "ms" if self.logger.is_human_readable_format(): process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage']) @@ -3415,6 +3417,10 @@ class AMDSMICommands(): process_info['mem_usage'] = self.helpers.unit_format(self.logger, process_info['mem_usage'], memory_usage_unit) + + process_info['evicted_time'] = self.helpers.unit_format(self.logger, + process_info['evicted_time'], + evicted_time_unit) for usage_metric in process_info['usage']: process_info['usage'][usage_metric] = self.helpers.unit_format(self.logger, @@ -6130,8 +6136,10 @@ class 
AMDSMICommands(): process_info.pop('engine_usage') # Remove 'engine_usage' value process_info['mem_usage'] = process_info.pop('mem') process_info['cu_occupancy'] = process_info.pop('cu_occupancy') + process_info['evicted_time'] = process_info.pop('evicted_time') memory_usage_unit = "B" + evicted_time_unit = "ms" if self.logger.is_human_readable_format(): process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage']) @@ -6143,6 +6151,10 @@ class AMDSMICommands(): process_info['mem_usage'], memory_usage_unit) + process_info['evicted_time'] = self.helpers.unit_format(self.logger, + process_info['evicted_time'], + evicted_time_unit) + for usage_metric in process_info['memory_usage']: process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger, process_info['memory_usage'][usage_metric], @@ -6176,7 +6188,7 @@ class AMDSMICommands(): # Build the process table's title and header self.logger.secondary_table_title = "PROCESS INFO" self.logger.secondary_table_header = 'GPU'.rjust(3) + "NAME".rjust(19) + "PID".rjust(9) + "GTT_MEM".rjust(10) + \ - "CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USG".rjust(10) + "CU%".rjust(9) + "CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USG".rjust(10) + "CU%".rjust(9) + "EVICT".rjust(10) if watching_output: self.logger.secondary_table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.secondary_table_header @@ -7334,7 +7346,7 @@ class AMDSMICommands(): try: raw_process_list = amdsmi_interface.amdsmi_get_gpu_process_list(processor) for proc in raw_process_list: - proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A","gtt": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A"} + proc_info_dict = {"gpu": "N/A", "pid": "N/A", "name": "N/A","gtt": "N/A", "vram": "N/A", "mem_usage": "N/A", "cu_occupancy": "N/A", "evicted_time" : "N/A"} proc_info_dict['gpu'] = gpu_id proc_info_dict['pid'] = proc['pid'] proc_info_dict['name'] = proc['name'] @@ -7350,6 +7362,7 @@ class 
AMDSMICommands(): proc_info_dict['cu_occupancy'] = {"current_cu": "N/A", "total_num_cu": total_num_cu} except (ValueError, TypeError): proc_info_dict['cu_occupancy'] = {"current_cu": "N/A", "total_num_cu": total_num_cu} + proc_info_dict['evicted_time'] = proc['evicted_time'] all_process_list.append(proc_info_dict) except amdsmi_exception.AmdSmiLibraryException as e: diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index 7f23153aab..3efa82c2c0 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -230,7 +230,7 @@ class AMDSMILogger(): # Add N/A for empty process_info table_values += "N/A".rjust(17) + "N/A".rjust(9) + "N/A".rjust(10) + \ "N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(10) + \ - "N/A".rjust(9) + '\n' + "N/A".rjust(9) + "N/A".rjust(10) + '\n' else: #Fix this herre for process_key, process_value in process_dict['process_info'].items(): @@ -251,6 +251,8 @@ class AMDSMILogger(): table_values += string_process_value.rjust(10) elif process_key == "cu_occupancy": table_values += string_process_value.rjust(9) + elif process_key == "evicted_time": + table_values += string_process_value.rjust(10) # Add the stored gpu and stored timestamp to the next line table_values += '\n' if stored_timestamp: @@ -1124,8 +1126,9 @@ class AMDSMILogger(): cu_occupancy = (str(round(process['cu_occupancy']['current_cu'] / process['cu_occupancy']['total_num_cu'] * 100, 1)) + " %").rjust(7) else: cu_occupancy = "N/A" - print("| {0:4.4s} {1:9.9s} {2:19.19s} {3:8.8s} {4:8.8s} {5:9.9s} {6:7.7s} |".format( - gpu_id, pid, process_name, gtt_mem, vram_mem, mem_usage, cu_occupancy)) + evicted_time = str(process['evicted_time']).rjust(9) + print("| {0:4.4s} {1:9.9s} {2:19.19s} {3:8.8s} {4:8.8s} {5:9.9s} {6:7.7s} {7:9.9s} |".format( + gpu_id, pid, process_name, gtt_mem, vram_mem, mem_usage, cu_occupancy, evicted_time)) if process['name'] == "N/A": elevated_permission_error = True 
else: diff --git a/projects/amdsmi/docs/reference/amdsmi-py-api.md b/projects/amdsmi/docs/reference/amdsmi-py-api.md index 968fde8432..ca66acc6e1 100644 --- a/projects/amdsmi/docs/reference/amdsmi-py-api.md +++ b/projects/amdsmi/docs/reference/amdsmi-py-api.md @@ -1186,6 +1186,7 @@ Field | Description `engine_usage` |
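<table><thead><tr><th> Subfield </th><th> Description</th></tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>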
`memory_usage` |
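<table><thead><tr><th> Subfield </th><th> Description</th></tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage in Bytes</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage in Bytes</td></tr><tr><td>`vram_mem`</td><td>Process VRAM memory usage in Bytes</td></tr></tbody></table>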
`cu_occupancy` | Number of Compute Units utilized +`evicted_time` | Time that queues are evicted on a GPU in milliseconds Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function: @@ -3534,6 +3535,7 @@ Field | Description `vram_usage` | VRAM usage `sdma_usage` | SDMA usage in microseconds `cu_occupancy` | Compute Unit usage in percents +`evicted_time` | Time that queues are evicted on a GPU in milliseconds Exceptions that can be thrown by `amdsmi_get_gpu_compute_process_info` function: @@ -3568,6 +3570,7 @@ Field | Description `vram_usage` | VRAM usage `sdma_usage` | SDMA usage in microseconds `cu_occupancy` | Compute Unit usage in percents +`evicted_time` | Time that queues are evicted on a GPU in milliseconds Exceptions that can be thrown by `amdsmi_get_gpu_compute_process_info_by_pid` function: diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 8a7214ff9f..4c8a377971 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -1159,7 +1159,8 @@ typedef struct { } memory_usage; //!< In Bytes char container_name[AMDSMI_MAX_STRING_LENGTH]; uint32_t cu_occupancy; //!< Num CUs utilized - uint32_t reserved[11]; + uint32_t evicted_time; //!< Time that queues are evicted on a GPU in milliseconds + uint32_t reserved[10]; } amdsmi_proc_info_t; /** @@ -2085,6 +2086,7 @@ typedef struct { uint64_t vram_usage; //!< VRAM usage in MB uint64_t sdma_usage; //!< SDMA usage in microseconds uint32_t cu_occupancy; //!< Compute Unit usage in percent + uint32_t evicted_time; //!< Time that queues are evicted on a GPU in milliseconds } amdsmi_process_info_t; /** diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 348a341fc8..6b611f2f28 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -2953,7 +2953,8 @@ def amdsmi_get_gpu_process_list( 
"cpu_mem": process_list[index].memory_usage.cpu_mem, "vram_mem": process_list[index].memory_usage.vram_mem, }, - "cu_occupancy": _validate_if_max_uint(process_list[index].cu_occupancy, MaxUIntegerTypes.UINT32_T) + "cu_occupancy": _validate_if_max_uint(process_list[index].cu_occupancy, MaxUIntegerTypes.UINT32_T), + "evicted_time": _validate_if_max_uint(process_list[index].evicted_time, MaxUIntegerTypes.UINT32_T) }) return result @@ -5303,6 +5304,7 @@ def amdsmi_get_gpu_compute_process_info() -> List[Dict[str, int]]: "vram_usage": proc.vram_usage, "sdma_usage": proc.sdma_usage, "cu_occupancy": proc.cu_occupancy, + "evicted_time": proc.evicted_time, } for proc in procs ] @@ -5324,6 +5326,7 @@ def amdsmi_get_gpu_compute_process_info_by_pid(pid: int) -> Dict[str, int]: "vram_usage": proc.vram_usage, "sdma_usage": proc.sdma_usage, "cu_occupancy": proc.cu_occupancy, + "evicted_time": proc.evicted_time, } diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index 7c4ae3817b..8bbffd2b1e 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -904,22 +904,22 @@ amdsmi_frequency_range_t = struct_amdsmi_frequency_range_t class union_amdsmi_bdf_t(Union): pass -class struct_amdsmi_bdf_t(Structure): +class struct_bdf_(Structure): pass -struct_amdsmi_bdf_t._pack_ = 1 # source:False -struct_amdsmi_bdf_t._fields_ = [ +struct_bdf_._pack_ = 1 # source:False +struct_bdf_._fields_ = [ ('function_number', ctypes.c_uint64, 3), ('device_number', ctypes.c_uint64, 5), ('bus_number', ctypes.c_uint64, 8), ('domain_number', ctypes.c_uint64, 48), ] -class struct_bdf_(Structure): +class struct_amdsmi_bdf_t(Structure): pass -struct_bdf_._pack_ = 1 # source:False -struct_bdf_._fields_ = [ +struct_amdsmi_bdf_t._pack_ = 1 # source:False +struct_amdsmi_bdf_t._fields_ = [ ('function_number', ctypes.c_uint64, 3), ('device_number', ctypes.c_uint64, 5), ('bus_number', ctypes.c_uint64, 8), @@ 
-1397,7 +1397,9 @@ struct_amdsmi_proc_info_t._fields_ = [ ('memory_usage', struct_memory_usage_), ('container_name', ctypes.c_char * 256), ('cu_occupancy', ctypes.c_uint32), - ('reserved', ctypes.c_uint32 * 11), + ('evicted_time', ctypes.c_uint32), + ('reserved', ctypes.c_uint32 * 10), + ('PADDING_1', ctypes.c_ubyte * 4), ] amdsmi_proc_info_t = struct_amdsmi_proc_info_t @@ -2196,7 +2198,7 @@ struct_amdsmi_process_info_t._fields_ = [ ('vram_usage', ctypes.c_uint64), ('sdma_usage', ctypes.c_uint64), ('cu_occupancy', ctypes.c_uint32), - ('PADDING_1', ctypes.c_ubyte * 4), + ('evicted_time', ctypes.c_uint32), ] amdsmi_process_info_t = struct_amdsmi_process_info_t diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index 3e8a5c7b67..847a4d0092 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -1471,10 +1471,11 @@ typedef struct { uint64_t vram_usage; //!< VRAM usage uint64_t sdma_usage; //!< SDMA usage in microseconds uint32_t cu_occupancy; //!< Compute Unit usage in percent + uint32_t evicted_time; //!< Time that queues are evicted on a GPU in milliseconds } rsmi_process_info_t; //! 
CU occupancy invalidation value for the GFX revisions not providing cu_occupancy debugfs method -#define CU_OCCUPANCY_INVALID 0xFFFFFFFF +#define KFD_STATS_INVALID 0xFFFFFFFF /** * @brief Opaque handle to function-support object diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc index 20f67ed6d5..86ffc315b6 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc @@ -440,6 +440,28 @@ static int CheckValidProcessInfoData(const std::string& s, int sysfs_ret){ return sysfs_ret; } +static int GetProcessKFDStats(const std::string& path, uint32_t& val){ + + std::string tmp; + int err = ReadSysfsStr(path, &tmp); + auto sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); + + if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ + return sysfs_data_errcode; + } + else if(sysfs_data_errcode==0){ + // Update KFD stat by the process + val = static_cast<uint32_t>(std::stoul(tmp)); + } + else { + // Some GFX revisions do not provide KFD stats debugfs method + // which may cause ENOENT + val = KFD_STATS_INVALID; + } + + return 0; +} + int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, std::unordered_set<uint64_t> *gpu_set) { assert(proc != nullptr); @@ -447,6 +469,7 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, int err; std::string tmp; std::unordered_set<uint64_t>::iterator itr; + uint32_t kfd_stat; std::string proc_str_path = kKFDProcPathRoot; proc_str_path += "/"; @@ -460,6 +483,7 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, proc->vram_usage = 0; proc->sdma_usage = 0; proc->cu_occupancy = 0; + proc->evicted_time = 0; for (itr = gpu_set->begin(); itr != gpu_set->end(); itr++) { uint64_t gpu_id = (*itr); @@ -502,21 +526,23 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, cu_occupancy_path += std::to_string(gpu_id); cu_occupancy_path += "/cu_occupancy"; - err = ReadSysfsStr(cu_occupancy_path, &tmp); - 
sysfs_data_errcode = CheckValidProcessInfoData(tmp, err); + err = GetProcessKFDStats(cu_occupancy_path, kfd_stat); + if (err != 0){ + return err; + } + proc->cu_occupancy = kfd_stat; - if (!(sysfs_data_errcode == 0 || sysfs_data_errcode == ENOENT)){ - return sysfs_data_errcode; - } - else if(sysfs_data_errcode==0){ - // Update CU usage by the process - proc->cu_occupancy = std::stoi(tmp); - } - else { - // Some GFX revisions do not provide cu_occupancy debugfs method - // which may cause ENOENT - proc->cu_occupancy = CU_OCCUPANCY_INVALID; + std::string evicted_time_path = proc_str_path; + evicted_time_path += "/stats_"; + evicted_time_path += std::to_string(gpu_id); + evicted_time_path += "/evicted_ms"; + + err = GetProcessKFDStats(evicted_time_path, kfd_stat); + if (err != 0){ + return err; } + proc->evicted_time = kfd_stat; + } return 0; diff --git a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc index 107d215898..eb1e183a4b 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc @@ -178,8 +178,9 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t& amdsmi_proc_info.memory_usage.vram_mem = rsmi_proc_info.vram_usage; } - // Copy the cu occupancy from rsmi_process_info_t to amdsmi_proc_info_t + // Copy the kfd stats from rsmi_process_info_t to amdsmi_proc_info_t amdsmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy; + amdsmi_proc_info.evicted_time = rsmi_proc_info.evicted_time; // Safely handle KFD processes to get total memory_usage of the process uint64_t kfd_gpu_id = get_kfd_gpu_id(); diff --git a/projects/amdsmi/tests/amd_smi_test/functional/process_info_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/process_info_read.cc index 6feeb902e9..a1e9b72d88 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/process_info_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/process_info_read.cc @@ 
-184,6 +184,8 @@ void TestProcInfoRead::Run(void) { proc_info.sdma_usage << " Compute Unit Usage: " << proc_info.cu_occupancy << + " Evicted Time: " << + proc_info.evicted_time << std::endl << std::endl; } }