diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 287351d562..226ff81be2 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -88,6 +88,40 @@ ASIC products. This requires users to update any ABIs using this structure. ### Fixes +- **Fixed multiple processes not being registered in `amd-smi process` with json and csv format**. +Multiple process outputs in the CLI tool were not being registered correctly. The json output did not handle multiple processes and is now in a new valid json format: + +```shell +[ + { + "gpu": 0, + "process_list": [ + { + "process_info": { + "name": "TransferBench", + "pid": 420157, + "mem_usage": { + "value": 0, + "unit": "B" + } + } + }, + { + "process_info": { + "name": "rvs", + "pid": 420315, + "mem_usage": { + "value": 0, + "unit": "B" + } + } + } + ] + } +] +``` + + - **Removed `throttle-status` from `amd-smi monitor` as it is no longer reliably supported**. Throttle status may work for older ASICs, but will be replaced with PVIOL and TVIOL metrics for future ASIC support. It remains a field in the gpu_metrics API and in `amd-smi metric --power`. diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 49b6b694ff..d063698921 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -2706,47 +2706,47 @@ class AMDSMICommands(): process_names.append(process_info) filtered_process_values = process_names + # If the name or pid args filter processes out then insert an N/A placeholder + if not filtered_process_values: + filtered_process_values.append({'process_info': "N/A"}) + logging.debug(f"Process Info for GPU {gpu_id} | {filtered_process_values}") - multiple_devices_csv_override = False - # Convert and store output by pid for csv format - if self.logger.is_csv_format(): - # Check for empty list first - if not filtered_process_values: - self.logger.store_output(args.gpu, 'process_info', 'No running processes detected') - else: - for process_info in filtered_process_values: - if process_info['process_info'] == "N/A": - self.logger.store_output(args.gpu, 'process_info', 'No running processes detected') - else: - for key, value in process_info['process_info'].items(): - multiple_devices_csv_override = True - if watching_output: - self.logger.store_output(args.gpu, 'timestamp', int(time.time())) - self.logger.store_output(args.gpu, key, value) + for index, process in enumerate(filtered_process_values): + if process['process_info'] == "N/A": + filtered_process_values[index]['process_info'] = "No running processes detected" - self.logger.store_multiple_device_output() - else: + if self.logger.is_json_format(): if watching_output: self.logger.store_output(args.gpu, 'timestamp', int(time.time())) + self.logger.store_output(args.gpu, 'process_list', filtered_process_values) - # Store values in logger.output - if not filtered_process_values: - self.logger.store_output(args.gpu, 'process_info', 'No running processes detected') - else: - for process_info in filtered_process_values: - if process_info['process_info'] == "N/A": - process_info['process_info'] = 'No running processes detected' - self.logger.store_output(args.gpu, 'process_info', process_info['process_info']) + if self.logger.is_human_readable_format(): + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', int(time.time())) + # When we print out process_info we remove the index + # The removal is needed only for human readable process format to align with Host + for index, process in enumerate(filtered_process_values): + self.logger.store_output(args.gpu, f'process_info_{index}', process['process_info']) + + multiple_devices_csv_override = False + if self.logger.is_csv_format(): + multiple_devices_csv_override = True + for process in filtered_process_values: + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', int(time.time())) + self.logger.store_output(args.gpu, 'process_info', process['process_info']) + self.logger.store_multiple_device_output() if multiple_devices: self.logger.store_multiple_device_output() return # Skip printing when there are multiple devices - self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override, watching_output=watching_output) + multiple_devices = multiple_devices or multiple_devices_csv_override + self.logger.print_output(multiple_device_enabled=multiple_devices, watching_output=watching_output) if watching_output: # End of single gpu add to watch_output - self.logger.store_watch_output(multiple_device_enabled=multiple_devices_csv_override) + self.logger.store_watch_output(multiple_device_enabled=multiple_devices) def profile(self, args): diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index 49747694f0..5b6e35457b 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -72,9 +72,11 @@ class AMDSMILogger(): def is_human_readable_format(self): return self.format == self.LoggerFormat.human_readable.value + def clear_multiple_devices_ouput(self): self.multiple_device_output.clear() + def _capitalize_keys(self, input_dict): output_dict = {} for key in input_dict.keys(): @@ -163,6 +165,9 @@ class AMDSMILogger(): yaml_output = yaml_output.replace("AMDSMI_SPACING_REMOVAL:\n", "") yaml_output = yaml_output.replace("'", "") # Remove '' + # Remove process_info indicies for Host parity: + yaml_output = re.sub(r'PROCESS_INFO_[0-9]+:', 'PROCESS_INFO:', yaml_output) + clean_yaml_output = '' for line in yaml_output.splitlines(): line = line.split(':') diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index a5ccb6db76..278f594df8 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -148,6 +148,16 @@ class AMDSMIParser(argparse.ArgumentParser): raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(int_value, outputformat) + def _is_valid_string(self, string_value): + # Argument type validator + # This is for triggering a cli exception if an empty string is detected + if string_value: + return string_value + + outputformat = self.helpers.get_output_format() + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(string_value, outputformat) + + def _check_output_file_path(self): """ Argument action validator: Returns a path to a file from the output file path provided. @@ -863,7 +873,7 @@ class AMDSMIParser(argparse.ArgumentParser): process_parser.add_argument('-G', '--general', action='store_true', required=False, help=general_help) process_parser.add_argument('-e', '--engine', action='store_true', required=False, help=engine_help) process_parser.add_argument('-p', '--pid', action='store', type=self._not_negative_int, required=False, help=pid_help) - process_parser.add_argument('-n', '--name', action='store', required=False, help=name_help) + process_parser.add_argument('-n', '--name', action='store', type=self._is_valid_string, required=False, help=name_help) def _add_profile_parser(self, subparsers, func): diff --git a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc index 72c5cc4d3c..650227d9fd 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc @@ -158,6 +158,12 @@ amdsmi_status_t AMDSmiGPUDevice::amdgpu_query_vbios(void *info) const { int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t& compute_process_list, ComputeProcessListType_t list_type) { + + /** + * Clear the compute_process_list before starting. + */ + compute_process_list.clear(); + /** * The first call to GetProcessInfo() helps to find the size it needs, * so we can create a tailored size list.