diff --git a/projects/amdsmi/amdsmi_cli/_version.py b/projects/amdsmi/amdsmi_cli/_version.py index 27fdca497c..81f0fdeccf 100644 --- a/projects/amdsmi/amdsmi_cli/_version.py +++ b/projects/amdsmi/amdsmi_cli/_version.py @@ -1 +1 @@ -__version__ = "0.0.3" +__version__ = "0.0.4" diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 2debab8041..37fbfff22c 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -20,16 +20,8 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # -"""AMDSMICommands - -This class contains all the commands corresponding to AMDSMIParser -Each command function will interact with AMDSMILogger to handle -displaying the output to the specified compatibility, format, and -destination. - -""" - import threading +import time from _version import __version__ from amdsmi_helpers import AMDSMIHelpers @@ -39,6 +31,11 @@ from amdsmi import amdsmi_exception class AMDSMICommands(): + """This class contains all the commands corresponding to AMDSMIParser + Each command function will interact with AMDSMILogger to handle + displaying the output to the specified compatibility, format, and + destination. 
+ """ def __init__(self, compatibility='amdsmi', format='human_readable', destination='stdout') -> None: @@ -143,7 +140,7 @@ class AMDSMICommands(): # compatibility with gpuvsmi needs a list for single gpu if self.logger.is_gpuvsmi_compatibility() and not multiple_devices: self.logger.store_multiple_device_output() - self.logger.print_output(multiple_device_output=True) + self.logger.print_output(multiple_device_enabled=True) else: self.logger.print_output() @@ -206,7 +203,7 @@ class AMDSMICommands(): if not any([args.asic, args.bus, args.vbios, args.limit, args.driver, args.caps, args.ras, args.board]): args.asic = args.bus = args.vbios = args.limit = args.driver = args.caps = args.ras = args.board = self.all_arguments = True - values_dict = {} + static_dict = {} if args.asic: try: @@ -218,9 +215,9 @@ class AMDSMICommands(): if asic_info['asic_serial'] != '': asic_info['asic_serial'] = '0x' + asic_info['asic_serial'] - values_dict['asic'] = asic_info + static_dict['asic'] = asic_info except amdsmi_exception.AmdSmiLibraryException as e: - values_dict['asic'] = e.get_error_info() + static_dict['asic'] = e.get_error_info() if not self.all_arguments: raise e if args.bus: @@ -245,7 +242,7 @@ class AMDSMICommands(): raise e bus_output_info.update(bus_info) - values_dict['bus'] = bus_output_info + static_dict['bus'] = bus_output_info if args.vbios: try: vbios_info = amdsmi_interface.amdsmi_get_vbios_info(args.gpu) @@ -255,9 +252,9 @@ class AMDSMICommands(): vbios_info['part_number'] = vbios_info.pop('part_number') vbios_info['vbios_version'] = vbios_info.pop('vbios_version') - values_dict['vbios'] = vbios_info + static_dict['vbios'] = vbios_info except amdsmi_exception.AmdSmiLibraryException as e: - values_dict['vbios'] = e.get_error_info() + static_dict['vbios'] = e.get_error_info() if not self.all_arguments: raise e if args.board: @@ -270,9 +267,9 @@ class AMDSMICommands(): board_info['product_number'] = board_info.pop('product_serial') board_info['product_name'] = 
board_info.pop('product_name') - values_dict['board'] = board_info + static_dict['board'] = board_info except amdsmi_exception.AmdSmiLibraryException as e: - values_dict['board'] = e.get_error_info() + static_dict['board'] = e.get_error_info() if not self.all_arguments: raise e if args.limit: @@ -322,22 +319,26 @@ class AMDSMICommands(): limit_info['temperature_junction'] = temp_junction_limit limit_info['temperature_vram'] = temp_vram_limit - values_dict['limit'] = limit_info + static_dict['limit'] = limit_info if args.driver: try: driver_info = {} driver_info['driver_version'] = amdsmi_interface.amdsmi_get_driver_version(args.gpu) - values_dict['driver'] = driver_info + static_dict['driver'] = driver_info except amdsmi_exception.AmdSmiLibraryException as e: - values_dict['driver'] = e.get_error_info() + static_dict['driver'] = e.get_error_info() if not self.all_arguments: raise e if args.ras: try: - values_dict['ras'] = amdsmi_interface.amdsmi_get_ras_block_features_enabled(args.gpu) + if self.helpers.has_ras_support(args.gpu): + static_dict['ras'] = amdsmi_interface.amdsmi_get_ras_block_features_enabled(args.gpu) + else: + static_dict['ras'] = 'N/A' + except amdsmi_exception.AmdSmiLibraryException as e: - values_dict['ras'] = e.get_error_info() + static_dict['ras'] = e.get_error_info() if not self.all_arguments: raise e if args.caps: @@ -348,21 +349,44 @@ class AMDSMICommands(): for capability_name, capability_value in caps_info.items(): if isinstance(capability_value, list): caps_info[capability_name] = f"{capability_value}" + if isinstance(capability_value, bool): + caps_info[capability_name] = f"{bool(capability_value)}" - values_dict['caps'] = caps_info + if self.logger.is_csv_format() and self.logger.is_gpuvsmi_compatibility(): + if 'mm_ip_list' in caps_info: + if caps_info['mm_ip_list']: # Don't index if it's not populated + caps_info['mm_ip_list'] = caps_info['mm_ip_list'][0] + + static_dict['caps'] = caps_info except 
amdsmi_exception.AmdSmiLibraryException as e: - values_dict['caps'] = e.get_error_info() + static_dict['caps'] = e.get_error_info() if not self.all_arguments: raise e - # Store values in logger.output - self.logger.store_output(args.gpu, 'values', values_dict) + multiple_devices_csv_override = False + # Convert and store output by pid for csv format + if self.logger.is_csv_format() and args.ras: + # expand if ras blocks are populated + if isinstance(static_dict['ras'], list): + ras_dicts = static_dict.pop('ras') + multiple_devices_csv_override = True + for ras_dict in ras_dicts: + for key, value in ras_dict.items(): + self.logger.store_output(args.gpu, key, value) + self.logger.store_output(args.gpu, 'values', static_dict) + self.logger.store_multiple_device_output() + else: + # Store values if ras has an error + self.logger.store_output(args.gpu, 'values', static_dict) + else: + # Store values in logger.output + self.logger.store_output(args.gpu, 'values', static_dict) if multiple_devices: self.logger.store_multiple_device_output() return # Skip printing when there are multiple devices - self.logger.print_output() + self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override) def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True): @@ -445,7 +469,7 @@ class AMDSMICommands(): self.logger.store_multiple_device_output() return # Skip printing when there are multiple devices - self.logger.print_output(multiple_device_output=multiple_devices_csv_override) + self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override) def bad_pages(self, args, multiple_devices=False, gpu=None, retired=None, pending=None, un_res=None): @@ -456,7 +480,7 @@ class AMDSMICommands(): multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. gpu (device_handle, optional): device_handle for target device. Defaults to None. 
retired (bool, optional) - Value override for args.retired - pending (bool, optional) - Value override for args.pending + pending (bool, optional) - Value override for args.pending un_res (bool, optional) - Value override for args.un_res Raises: @@ -611,7 +635,6 @@ class AMDSMICommands(): Returns: None: Print output via AMDSMILogger to destination """ - # Set args.* to passed in arguments if gpu: args.gpu = gpu @@ -662,20 +685,33 @@ class AMDSMICommands(): # Handle watch logic, will only enter this block once if args.watch: - self.helpers.handle_watch(args=args, subcommand=self.metric) - self.logger.print_output(watch_output=True) # Print at the end of watch ( final flush ) + self.helpers.handle_watch(args=args, subcommand=self.metric, logger=self.logger) + return # Handle multiple GPUs if isinstance(args.gpu, list): if len(args.gpu) > 1: - for device_handle in args.gpu: - # Handle multiple_devices to print all output at once - self.metric(args, multiple_devices=True, watching_output=False, gpu=device_handle) - self.logger.print_output(multiple_device_output=True) + # Deepcopy gpus as recursion will destroy the gpu list + stored_gpus = [] + for gpu in args.gpu: + stored_gpus.append(gpu) - # End of multiple gpus add to watch_output + # Store output from multiple devices + for device_handle in args.gpu: + self.metric(args, multiple_devices=True, watching_output=watching_output, gpu=device_handle) + + # Reload original gpus + args.gpu = stored_gpus + + # Print multiple device output + self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output) + + # Add output to total watch output and clear multiple device output if watching_output: - self.logger.store_watch_output(multiple_devices=True) + self.logger.store_watch_output(multiple_device_enabled=True) + + # Flush the watching output + self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output) return elif len(args.gpu) == 1: @@ -822,14 +858,15 @@ class 
AMDSMICommands(): if args.ecc: ecc_dict = {} try: - ras_states = amdsmi_interface.amdsmi_get_ras_block_features_enabled(args.gpu) - for state in ras_states: - if state['status'] == amdsmi_interface.AmdSmiRasErrState.ENABLED: - gpu_block = amdsmi_interface.AmdSmiGpuBlock[state['block']] - ecc_count = amdsmi_interface.amdsmi_get_ecc_error_count(args.gpu, gpu_block) - ecc_dict[state['block']] = {'correctable' : ecc_count['correctable_count'], - 'uncorrectable': ecc_count['uncorrectable_count']} - if ecc_dict == {}: + if self.helpers.has_ras_support(args.gpu): + ras_states = amdsmi_interface.amdsmi_get_ras_block_features_enabled(args.gpu) + for state in ras_states: + if state['status'] == amdsmi_interface.AmdSmiRasErrState.ENABLED: + gpu_block = amdsmi_interface.AmdSmiGpuBlock[state['block']] + ecc_count = amdsmi_interface.amdsmi_dev_get_ecc_count(args.gpu, gpu_block) + ecc_dict[state['block']] = {'correctable' : ecc_count['correctable_count'], + 'uncorrectable': ecc_count['uncorrectable_count']} + if not ecc_dict: ecc_dict['correctable'] = 'N/A' ecc_dict['uncorrectable'] = 'N/A' @@ -1021,17 +1058,19 @@ class AMDSMICommands(): values_dict['mem_usage'] = memory_total - # Store values in logger.output + # Store timestamp first if watching_output is enabled + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', int(time.time())) self.logger.store_output(args.gpu, 'values', values_dict) if multiple_devices: self.logger.store_multiple_device_output() return # Skip printing when there are multiple devices - self.logger.print_output() + self.logger.print_output(watching_output=watching_output) if watching_output: # End of single gpu add to watch_output - self.logger.store_watch_output(multiple_devices=False) + self.logger.store_watch_output(multiple_device_enabled=False) def process(self, args, multiple_devices=False, watching_output=False, @@ -1082,21 +1121,33 @@ class AMDSMICommands(): # Handle watch logic, will only enter this block once if args.watch: - 
args = self.helpers.handle_watch(args=args, subcommand=self.process) - self.logger.print_output(watch_output=True) # Print at the end of watch ( final flush ) + self.helpers.handle_watch(args=args, subcommand=self.process, logger=self.logger) return # Handle multiple GPUs if isinstance(args.gpu, list): if len(args.gpu) > 1: - for device_handle in args.gpu: - # Handle multiple_devices to print all output at once - self.process(args, multiple_devices=True, watching_output=False, gpu=device_handle) - self.logger.print_output(multiple_device_output=True) + # Deepcopy gpus as recursion will destroy the gpu list + stored_gpus = [] + for gpu in args.gpu: + stored_gpus.append(gpu) - # End of multiple gpus add to watch_output + # Store output from multiple devices + for device_handle in args.gpu: + self.process(args, multiple_devices=True, watching_output=watching_output, gpu=device_handle) + + # Reload original gpus + args.gpu = stored_gpus + + # Print multiple device output + self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output) + + # Add output to total watch output and clear multiple device output if watching_output: - self.logger.store_watch_output(multiple_devices=True) + self.logger.store_watch_output(multiple_device_enabled=True) + + # Flush the watching output + self.logger.print_output(multiple_device_enabled=True, watching_output=watching_output) return elif len(args.gpu) == 1: @@ -1126,7 +1177,7 @@ class AMDSMICommands(): mem_usage_mb = (process_info['mem_usage']//1024) // 1024 if mem_usage_mb < 0: - process_info['mem_usage'] = (process_info['mem_usage']//1024) + process_info['mem_usage'] = process_info['mem_usage']//1024 mem_usage_unit = 'B' else: process_info['mem_usage'] = mem_usage_mb @@ -1180,13 +1231,20 @@ class AMDSMICommands(): for process_info in filtered_process_values: for key, value in process_info['process_info'].items(): multiple_devices_csv_override = True + + if watching_output: + 
self.logger.store_output(args.gpu, 'timestamp', int(time.time())) self.logger.store_output(args.gpu, key, value) + self.logger.store_multiple_device_output() else: # Remove brackets if there is only one value if len(filtered_process_values) == 1: filtered_process_values = filtered_process_values[0] + if watching_output: + self.logger.store_output(args.gpu, 'timestamp', int(time.time())) + # Store values in logger.output if filtered_process_values == []: self.logger.store_output(args.gpu, 'values', {'process_info': 'Not Found'}) @@ -1197,10 +1255,10 @@ class AMDSMICommands(): self.logger.store_multiple_device_output() return # Skip printing when there are multiple devices - self.logger.print_output(multiple_device_output=multiple_devices_csv_override) + self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override, watching_output=watching_output) if watching_output: # End of single gpu add to watch_output - self.logger.store_watch_output(multiple_devices=False) + self.logger.store_watch_output(multiple_device_enabled=multiple_devices_csv_override) def profile(self, args): @@ -1262,23 +1320,31 @@ class AMDSMICommands(): if args.gpu is None: args.gpu = self.device_handles + # Handle multiple GPUs + handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.topology) + if handled_multiple_gpus: + return # This function is recursive + # Handle all args being false if not any([args.access, args.weight, args.hops, args.type, args.numa, args.numa_bw]): args.access = args.weight = args.hops = args.type = args.numa = args.numa_bw = True - topo_json = {} - topo_table = [] - + topo_dict = {} if args.access: - pass + topo_dict['access'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info + if args.weight: - pass + topo_dict['weight'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info + if args.hops: - pass + topo_dict['hops'] = 
amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info + if args.type: - pass + topo_dict['type'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info + if args.numa: - pass + topo_dict['numa'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info + # numa_numbers = c_uint32() # for device in deviceList: # ret = rocmsmi.rsmi_get_numa_node_number(device, byref(numa_numbers)) @@ -1293,7 +1359,17 @@ class AMDSMICommands(): # else: # printErrLog(device, 'Cannot read Numa Affinity') if args.numa_bw: - pass + topo_dict['numa_bw'] = amdsmi_exception.AmdSmiLibraryException(amdsmi_exception.AmdSmiRetCode.NOT_IMPLEMENTED).err_info + + + # Store values in logger.output + self.logger.store_output(args.gpu, 'values', topo_dict) + + if multiple_devices: + self.logger.store_multiple_device_output() + return # Skip printing when there are multiple devices + + self.logger.print_output() def set_value(self, args, multiple_devices=False, gpu=None, clock=None, sclk=None, mclk=None, @@ -1306,22 +1382,22 @@ class AMDSMICommands(): args (Namespace): Namespace containing the parsed CLI args multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. gpu (device_handle, optional): device_handle for target device. Defaults to None. - clock (bool, optional): Value over ride for args.clock. Defaults to None. - sclk (bool, optional): Value over ride for args.sclk. Defaults to None. - mclk (bool, optional): Value over ride for args.mclk. Defaults to None. - pcie (bool, optional): Value over ride for args.pcie. Defaults to None. - slevel (bool, optional): Value over ride for args.slevel. Defaults to None. - mlevel (bool, optional): Value over ride for args.mlevel. Defaults to None. - vc (bool, optional): Value over ride for args.vc. Defaults to None. - srange (bool, optional): Value over ride for args.srange. Defaults to None. 
- mrange (bool, optional): Value over ride for args.mrange. Defaults to None. - fan (bool, optional): Value over ride for args.fan. Defaults to None. - perflevel (bool, optional): Value over ride for args.perflevel. Defaults to None. - overdrive (bool, optional): Value over ride for args.overdrive. Defaults to None. - memoverdrive (bool, optional): Value over ride for args.memoverdrive. Defaults to None. - poweroverdrive (bool, optional): Value over ride for args.poweroverdrive. Defaults to None. - profile (bool, optional): Value over ride for args.profile. Defaults to None. - perfdeterminism (bool, optional): Value over ride for args.perfdeterminism. Defaults to None. + clock ((amdsmi_interface.AmdSmiClkType, int), optional): Value override for args.clock. Defaults to None. + sclk (int, optional): Value override for args.sclk. Defaults to None. + mclk (int, optional): Value override for args.mclk. Defaults to None. + pcie (int, optional): Value override for args.pcie. Defaults to None. + slevel ((amdsmi_interface.AmdSmiFreqInd, int), optional): Value override for args.slevel. Defaults to None. + mlevel ((amdsmi_interface.AmdSmiFreqInd), optional): Value override for args.mlevel. Defaults to None. + vc ((int, int, int), optional): Value override for args.vc. Defaults to None. + srange ((int, int), optional): Value override for args.srange. Defaults to None. + mrange ((int, int), optional): Value override for args.mrange. Defaults to None. + fan (int, optional): Value override for args.fan. Defaults to None. + perflevel (amdsmi_interface.AmdSmiDevPerfLevel, optional): Value override for args.perflevel. Defaults to None. + overdrive (int, optional): Value override for args.overdrive. Defaults to None. + memoverdrive (int, optional): Value override for args.memoverdrive. Defaults to None. + poweroverdrive (int, optional): Value override for args.poweroverdrive. Defaults to None. + profile (bool, optional): Value override for args.profile. Defaults to None. 
+ perfdeterminism (int, optional): Value override for args.perfdeterminism. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -1397,7 +1473,7 @@ class AMDSMICommands(): perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to get performance level of {gpu_string}") from e if 'manual' in perf_level.lower(): @@ -1405,7 +1481,7 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e if clock_type != amdsmi_interface.AmdSmiClkType.PCIE: @@ -1413,18 +1489,17 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type, freq_bitmask) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e else: try: amdsmi_interface.amdsmi_dev_set_pci_bandwidth(args.gpu, freq_bitmask) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") 
from e self.logger.store_output(args.gpu, 'clock', f'Successfully set clock frequency bitmask for {clock_type}') - if isinstance(args.sclk, int): freq_bitmask = args.sclk clock_type = amdsmi_interface.AmdSmiClkType.SYS @@ -1433,7 +1508,7 @@ class AMDSMICommands(): perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to get performance level of {gpu_string}") from e if 'manual' in perf_level.lower(): @@ -1441,14 +1516,14 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e try: amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type, freq_bitmask) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e self.logger.store_output(args.gpu, 'sclk', 'Successfully set clock frequency bitmask') @@ -1460,7 +1535,7 @@ class AMDSMICommands(): perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e 
raise ValueError(f"Unable to get performance level of {gpu_string}") from e if 'manual' in perf_level.lower(): @@ -1468,14 +1543,14 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e try: amdsmi_interface.amdsmi_dev_set_clk_freq(args.gpu, clock_type, freq_bitmask) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e self.logger.store_output(args.gpu, 'mclk', 'Successfully set clock frequency bitmask') @@ -1487,7 +1562,7 @@ class AMDSMICommands(): perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to get performance level of {gpu_string}") from e if 'manual' in perf_level.lower(): @@ -1495,17 +1570,18 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set the performance level of {gpu_string} 
to manual") from e try: amdsmi_interface.amdsmi_dev_set_pci_bandwidth(args.gpu, freq_bitmask) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set the {clock_type} clock frequency on {gpu_string}") from e self.logger.store_output(args.gpu, 'pcie', 'Successfully set clock frequency bitmask') if isinstance(args.slevel, int): + level, value = args.slevel level = amdsmi_interface.AmdSmiFreqInd(level) clock_type = amdsmi_interface.AmdSmiClkType.SYS @@ -1513,7 +1589,7 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_od_clk_info(args.gpu, level, value, clock_type) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to change the {clock_type} clock frequency in the PowerPlay table on {gpu_string}") from e self.logger.store_output(args.gpu, 'slevel', 'Successfully changed clock frequency') @@ -1525,7 +1601,7 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_od_clk_info(args.gpu, level, value, clock_type) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to change the {clock_type} clock frequency in the PowerPlay table on {gpu_string}") from e self.logger.store_output(args.gpu, 'mlevel', 'Successfully changed clock frequency') @@ -1535,7 +1611,7 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_od_volt_info(args.gpu, point, clk, volt) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == 
amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set the Voltage Curve point {point} to {clk}(MHz) {volt}(mV) on {gpu_string}") from e self.logger.store_output(args.gpu, 'vc', f'Successfully set voltage point {point} to {clk}(MHz) {volt}(mV)') @@ -1546,7 +1622,7 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_clk_range(args.gpu, min_value, max_value, clock_type) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {gpu_string}") from e self.logger.store_output(args.gpu, 'srange', f"Successfully set {clock_type} from {min_value}(MHz) to {max_value}(MHz)") @@ -1557,7 +1633,7 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_clk_range(args.gpu, min_value, max_value, clock_type) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set {clock_type} from {min_value}(MHz) to {max_value}(MHz) on {gpu_string}") from e self.logger.store_output(args.gpu, 'mrange', f"Successfully set {clock_type} from {min_value}(MHz) to {max_value}(MHz)") @@ -1566,7 +1642,7 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_fan_speed(args.gpu, 0, args.fan) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set fan speed {args.fan} on 
{gpu_string}") from e self.logger.store_output(args.gpu, 'fan', f"Successfully set fan speed {args.fan}") @@ -1576,7 +1652,7 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, perf_level) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set performance level {args.perflevel} on {gpu_string}") from e self.logger.store_output(args.gpu, 'perflevel', f"Successfully set performance level {args.perflevel}") @@ -1586,7 +1662,7 @@ class AMDSMICommands(): perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to get performance level of {gpu_string}") from e if 'manual' in perf_level.lower(): @@ -1594,14 +1670,14 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e try: amdsmi_interface.amdsmi_dev_set_overdrive_level_v1(args.gpu, args.overdrive) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set overdrive {args.overdrive} to {gpu_string}") from e 
self.logger.store_output(args.gpu, 'overdrive', f"Successfully to set overdrive level to {args.overdrive}") @@ -1611,7 +1687,7 @@ class AMDSMICommands(): perf_level = amdsmi_interface.amdsmi_dev_get_perf_level(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to get performance level of {gpu_string}") from e if 'manual' in perf_level.lower(): @@ -1619,7 +1695,7 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_perf_level_v1(args.gpu, amdsmi_interface.AmdSmiDevPerfLevel.MANUAL) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set the performance level of {gpu_string} to manual") from e self.logger.store_output(args.gpu, 'memoverdrive', f"Successfully to set memoverdrive level to {args.memoverdrive}") @@ -1629,7 +1705,7 @@ class AMDSMICommands(): power_caps = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to get the power cap info for {gpu_string}") from e if overdrive_power_cap == 0: overdrive_power_cap = power_caps['power_cap_default'] @@ -1649,14 +1725,14 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_dev_set_power_cap(args.gpu, 0, overdrive_power_cap) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise 
PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set power cap to {overdrive_power_cap} on {gpu_string}") from e try: power_caps = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to get the power cap info for {gpu_string} post set") from e if power_caps['power_cap'] == overdrive_power_cap: @@ -1670,7 +1746,7 @@ class AMDSMICommands(): amdsmi_interface.amdsmi_set_perf_determinism_mode(args.gpu, args.perfdeterminism) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e raise ValueError(f"Unable to set performance determinism and clock frequency to {args.perfdeterminism} on {gpu_string}") from e self.logger.store_output(args.gpu, 'perfdeterminism', f"Successfully enabled performance determinism and set GFX clock frequency to {args.perfdeterminism}") @@ -1691,13 +1767,13 @@ class AMDSMICommands(): args (Namespace): Namespace containing the parsed CLI args multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. gpu (device_handle, optional): device_handle for target device. Defaults to None. - gpureset (bool, optional): Value over ride for args.gpureset. Defaults to None. - clocks (bool, optional): Value over ride for args.clocks. Defaults to None. - fans (bool, optional): Value over ride for args.fans. Defaults to None. - profile (bool, optional): Value over ride for args.profile. Defaults to None. - poweroverdrive (bool, optional): Value over ride for args.poweroverdrive. Defaults to None. - xgmierr (bool, optional): Value over ride for args.xgmierr. 
Defaults to None. - perfdeterminism (bool, optional): Value over ride for args.perfdeterminism. Defaults to None. + gpureset (bool, optional): Value override for args.gpureset. Defaults to None. + clocks (bool, optional): Value override for args.clocks. Defaults to None. + fans (bool, optional): Value override for args.fans. Defaults to None. + profile (bool, optional): Value override for args.profile. Defaults to None. + poweroverdrive (bool, optional): Value override for args.poweroverdrive. Defaults to None. + xgmierr (bool, optional): Value override for args.xgmierr. Defaults to None. + perfdeterminism (bool, optional): Value override for args.perfdeterminism. Defaults to None. Raises: ValueError: Value error if no gpu value is provided @@ -1742,14 +1818,13 @@ class AMDSMICommands(): result = 'Successfully reset GPU' except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e result = e.get_error_info() else: result = 'Unable to reset non-amd GPU' self.logger.store_output(args.gpu, 'gpu_reset', result) if args.clocks: - # rsmi_string = ' Reset Clocks ' reset_clocks_results = {'overdrive' : '', 'clocks' : '', 'performance': ''} @@ -1758,7 +1833,7 @@ class AMDSMICommands(): reset_clocks_results['overdrive'] = 'Overdrive set to 0' except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e reset_clocks_results['overdrive'] = e.get_error_info() try: @@ -1767,7 +1842,7 @@ class AMDSMICommands(): reset_clocks_results['clocks'] = 'Successfully reset clocks' except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires 
elevation') + raise PermissionError('Command requires elevation') from e reset_clocks_results['clocks'] = e.get_error_info() try: @@ -1776,7 +1851,7 @@ class AMDSMICommands(): reset_clocks_results['performance'] = 'Performance level reset to auto' except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e reset_clocks_results['performance'] = e.get_error_info() self.logger.store_output(args.gpu, 'reset_clocks', reset_clocks_results) @@ -1786,7 +1861,7 @@ class AMDSMICommands(): result = 'Successfully reset fan speed to driver control' except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e result = e.get_error_info() self.logger.store_output(args.gpu, 'reset_fans', result) @@ -1799,7 +1874,7 @@ class AMDSMICommands(): reset_profile_results['power_profile'] = 'Successfully reset Power Profile' except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e reset_profile_results['power_profile'] = e.get_error_info() try: @@ -1808,7 +1883,7 @@ class AMDSMICommands(): reset_profile_results['performance_level'] = 'Successfully reset Performance Level' except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e reset_profile_results['performance_level'] = e.get_error_info() self.logger.store_output(args.gpu, 'reset_profile', reset_profile_results) @@ -1818,7 +1893,7 @@ class 
AMDSMICommands(): result = 'Successfully reset XGMI Error count' except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e result = e.get_error_info() self.logger.store_output(args.gpu, 'reset_xgmi_err', result) if args.perfdeterminism: @@ -1828,7 +1903,7 @@ class AMDSMICommands(): result = 'Successfully disabled performance determinism' except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_exception.AmdSmiRetCode.ERR_NO_PERM: - raise PermissionError('Command requires elevation') + raise PermissionError('Command requires elevation') from e result = e.get_error_info() self.logger.store_output(args.gpu, 'reset_perf_determinism', result) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index 152c554e69..054c3cae47 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -229,7 +229,7 @@ class AMDSMIHelpers(): for device_handle in args.gpu: # Handle multiple_devices to print all output at once subcommand(args, multiple_devices=True, gpu=device_handle) - logger.print_output(multiple_device_output=True) + logger.print_output(multiple_device_enabled=True) return True, args.gpu elif len(args.gpu) == 1: args.gpu = args.gpu[0] @@ -240,13 +240,14 @@ class AMDSMIHelpers(): return False, args.gpu - def handle_watch(self, args, subcommand): + def handle_watch(self, args, subcommand, logger): """This function will run the subcommand multiple times based on the passed watch, watch_time, and iterations passed in. 
params: args - argparser args to pass to subcommand subcommand (AMDSMICommands) - Function that can handle watching output (Currently: metric & process) + logger (AMDSMILogger) - Logger for accessing config values return: Nothing """ @@ -260,6 +261,8 @@ class AMDSMIHelpers(): args.watch_time = None args.iterations = None + # Set the signal handler to flush a delimiter to file if the format is json + print("'CTRL' + 'C' to stop watching output:") if watch_time: # Run for set amount of time iterations_ran = 0 end_time = time.time() + watch_time @@ -267,11 +270,11 @@ class AMDSMIHelpers(): subcommand(args, watching_output=True) # Handle iterations limit iterations_ran += 1 - if iterations: - if iterations >= iterations_ran: + if iterations is not None: + if iterations <= iterations_ran: break time.sleep(watch) - elif iterations: # Run for a set amount of iterations + elif iterations is not None: # Run for a set amount of iterations for iteration in range(iterations): subcommand(args, watching_output=True) if iteration == iterations - 1: # Break on iteration completion @@ -386,3 +389,15 @@ class AMDSMIHelpers(): return True, profile_presets[profile] else: return False, profile_presets.values() + + + def has_ras_support(self, device_handle): + try: + caps_info = amdsmi_interface.amdsmi_get_caps_info(device_handle) + + if caps_info['ras_supported']: + return True + else: + return False + except amdsmi_exception.AmdSmiLibraryException: + return False diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index a31ea7a6a9..fc6a86599d 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -39,7 +39,7 @@ class AMDSMILogger(): self.compatibility = compatibility # amd-smi, gpuv-smi, or rocm-smi self.format = format # csv, json, or human_readable self.destination = destination # stdout, path to a file (append) - self.amd_smi_helpers = AMDSMIHelpers() + self.helpers =
AMDSMIHelpers() class LoggerFormat(Enum): @@ -182,8 +182,12 @@ class AMDSMILogger(): value_with_parent_key = {} for parent_key, child_dict in value.items(): if isinstance(child_dict, dict): - for child_key, value1 in child_dict.items(): - value_with_parent_key[parent_key + '_' + child_key] = value1 + if parent_key in ('gfx'): + for child_key, value1 in child_dict.items(): + value_with_parent_key[child_key] = value1 + else: + for child_key, value1 in child_dict.items(): + value_with_parent_key[parent_key + '_' + child_key] = value1 else: value_with_parent_key[parent_key] = child_dict value = value_with_parent_key @@ -212,7 +216,7 @@ class AMDSMILogger(): return: Nothing """ - gpu_id = self.amd_smi_helpers.get_gpu_id_from_device_handle(device_handle) + gpu_id = self.helpers.get_gpu_id_from_device_handle(device_handle) if self.is_amdsmi_compatibility(): self._store_output_amdsmi(gpu_id=gpu_id, argument=argument, data=data) elif self.is_rocmsmi_compatibility(): @@ -222,6 +226,9 @@ class AMDSMILogger(): def _store_output_amdsmi(self, gpu_id, argument, data): + if argument == 'timestamp': # Make sure timestamp is the first element in the output + self.output['timestamp'] = int(time.time()) + if self.is_json_format() or self.is_human_readable_format(): self.output['gpu'] = int(gpu_id) if argument == 'values' and isinstance(data, dict): @@ -237,7 +244,6 @@ class AMDSMILogger(): self.output.update(flat_dict) else: self.output[argument] = data - else: raise amdsmi_cli_exceptions(self, "Invalid output format given, only json, csv, and human_readable supported") @@ -257,6 +263,9 @@ class AMDSMILogger(): def _store_output_gpuvsmi(self, gpu_id, argument, data): + if argument == 'timestamp': # Make sure timestamp is the first element in the output + self.output['timestamp'] = int(time.time()) + if self.is_json_format() or self.is_human_readable_format(): self.output['gpu'] = int(gpu_id) self.output[argument] = data @@ -299,60 +308,68 @@ class AMDSMILogger(): """ if not 
self.output: return + output = {} + for key, value in self.output.items(): + output[key] = value - self.multiple_device_output.append(self.output) + self.multiple_device_output.append(output) self.output = {} - def store_watch_output(self, multiple_devices=False): + def store_watch_output(self, multiple_device_enabled=False): """ Add the current output or multiple_devices_output params: - multiple_devices (bool) - True if watching multiple devices + multiple_device_enabled (bool) - True if watching multiple devices return: Nothing """ - values = self.output - if multiple_devices: - values = self.multiple_device_output + if multiple_device_enabled: + for output in self.multiple_device_output: + self.watch_output.append(output) - self.watch_output.append({'timestamp': int(time.time()), - 'values': values}) + self.multiple_device_output = [] + else: + output = {} + + for key, value in self.output.items(): + output[key] = value + self.watch_output.append(output) + + self.output = {} - def print_output(self, multiple_device_output=False, watch_output=False): + def print_output(self, multiple_device_enabled=False, watching_output=False): """ Print current output acording to format and then destination params: - multiple_device_output (bool) - True if printing output from + multiple_device_enabled (bool) - True if printing output from multiple devices - watch_output (bool) - True if printing watch output + watching_output (bool) - True if printing watch output return: Nothing """ if self.is_json_format(): - self._print_json_output(multiple_device_output=multiple_device_output, - watch_output=watch_output) + self._print_json_output(multiple_device_enabled=multiple_device_enabled, + watching_output=watching_output) elif self.is_csv_format(): - self._print_csv_output(multiple_device_output=multiple_device_output, - watch_output=watch_output) + self._print_csv_output(multiple_device_enabled=multiple_device_enabled, + watching_output=watching_output) elif 
self.is_human_readable_format(): - self._print_human_readable_output(multiple_device_output=multiple_device_output, - watch_output=watch_output) + self._print_human_readable_output(multiple_device_enabled=multiple_device_enabled, + watching_output=watching_output) - def _print_json_output(self, multiple_device_output=False, watch_output=False): - if multiple_device_output: + def _print_json_output(self, multiple_device_enabled=False, watching_output=False): + if multiple_device_enabled: json_output = self.multiple_device_output else: json_output = self.output if self.destination == 'stdout': - if watch_output: - return # We don't need to print to stdout at the end of watch - else: - json_std_output = json.dumps(json_output, indent = 4) + if json_output: + json_std_output = json.dumps(json_output, indent=4) print(json_std_output) else: # Write output to file - if watch_output: # Flush the full JSON output to the file on watch command completion + if watching_output: # Flush the full JSON output to the file on watch command completion with self.destination.open('w') as output_file: json.dump(self.watch_output, output_file, indent=4) else: @@ -360,43 +377,42 @@ class AMDSMILogger(): json.dump(json_output, output_file, indent=4) - def _print_csv_output(self, multiple_device_output=False, watch_output=False): - if watch_output: # Don't print output if it's for watch - return - - if multiple_device_output: + def _print_csv_output(self, multiple_device_enabled=False, watching_output=False): + if multiple_device_enabled: stored_csv_output = self.multiple_device_output else: if not isinstance(self.output, list): stored_csv_output = [self.output] if self.destination == 'stdout': - csv_header = stored_csv_output[0].keys() - csv_stdout_output = self.CsvStdoutBuilder() - writer = csv.DictWriter(csv_stdout_output, csv_header) - writer.writeheader() - writer.writerows(stored_csv_output) - - if self.is_gpuvsmi_compatibility(): - print(str(csv_stdout_output).replace('"','')) - 
else: - print(str(csv_stdout_output)) - else: - with self.destination.open('a', newline = '') as output_file: + if stored_csv_output: csv_header = stored_csv_output[0].keys() - writer = csv.DictWriter(output_file, csv_header) + csv_stdout_output = self.CsvStdoutBuilder() + writer = csv.DictWriter(csv_stdout_output, csv_header) writer.writeheader() writer.writerows(stored_csv_output) + print(str(csv_stdout_output)) + else: + if watching_output: + with self.destination.open('w', newline = '') as output_file: + if self.watch_output: + csv_header = self.watch_output[0].keys() + writer = csv.DictWriter(output_file, csv_header) + writer.writeheader() + writer.writerows(self.watch_output) + else: + with self.destination.open('a', newline = '') as output_file: + csv_header = stored_csv_output[0].keys() + writer = csv.DictWriter(output_file, csv_header) + writer.writeheader() + writer.writerows(stored_csv_output) - def _print_human_readable_output(self, multiple_device_output=False, watch_output=False): - if watch_output: # Don't print output if it's for watch - return - - if multiple_device_output: + def _print_human_readable_output(self, multiple_device_enabled=False, watching_output=False): + if multiple_device_enabled: human_readable_output = '' for output in self.multiple_device_output: - human_readable_output += (self._convert_json_to_human_readable(output)) + human_readable_output += self._convert_json_to_human_readable(output) else: human_readable_output = self._convert_json_to_human_readable(self.output) @@ -408,5 +424,12 @@ class AMDSMILogger(): # print as ascii, ignore incompatible characters print(human_readable_output.encode('ascii', 'ignore').decode('ascii')) else: - with self.destination.open('a') as output_file: - output_file.write(human_readable_output) + if watching_output: + with self.destination.open('w') as output_file: + human_readable_output = '' + for output in self.watch_output: + human_readable_output += self._convert_json_to_human_readable(output) 
+ output_file.write(human_readable_output) + else: + with self.destination.open('a') as output_file: + output_file.write(human_readable_output) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 4c2361076f..026c3d6dd2 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -118,7 +118,7 @@ class AMDSMIParser(argparse.ArgumentParser): elif args.csv: file_name += ".csv" else: - file_name += "txt" + file_name += ".txt" path = path / file_name path.touch() setattr(args, self.dest, path) @@ -169,6 +169,7 @@ class AMDSMIParser(argparse.ArgumentParser): setattr(args, self.dest, values) return WatchSelectedAction + def _gpu_select(self, gpu_choices): """ Custom argparse action to return the device handle(s) for the gpu(s) selected This will set the destination (args.gpu) to a list of 1 or more device handles @@ -279,8 +280,8 @@ class AMDSMIParser(argparse.ArgumentParser): def _add_static_parser(self, subparsers, func): # Subparser help text static_help = "Gets static information about the specified GPU" - static_subcommand_help = "If no argument is provided, return static information for all GPUs on the system.\ - \nIf no static argument is specified all static information will be displayed." + static_subcommand_help = "If no GPU is specified, returns static information for all GPUs on the system.\ + \nIf no static argument is provided, all static information will be displayed." static_optionals_title = "Static Arguments" # Optional arguments help text @@ -334,7 +335,7 @@ class AMDSMIParser(argparse.ArgumentParser): def _add_firmware_parser(self, subparsers, func): # Subparser help text firmware_help = "Gets firmware information about the specified GPU" - firmware_subcommand_help = "If no argument is provided, return firmware information for all GPUs on the system." 
+ firmware_subcommand_help = "If no GPU is specified, return firmware information for all GPUs on the system." firmware_optionals_title = "Firmware Arguments" # Optional arguments help text @@ -366,8 +367,8 @@ class AMDSMIParser(argparse.ArgumentParser): # Subparser help text bad_pages_help = "Gets bad page information about the specified GPU" - bad_pages_subcommand_help = "If no argument is provided, return bad page information for all GPUs on the system." - bad_pages_optionals_title = "Bad pages Arguments" + bad_pages_subcommand_help = "If no GPU is specified, return bad page information for all GPUs on the system." + bad_pages_optionals_title = "Bad Pages Arguments" # Optional arguments help text pending_help = "Displays all pending retired pages" @@ -393,8 +394,8 @@ class AMDSMIParser(argparse.ArgumentParser): def _add_metric_parser(self, subparsers, func): # Subparser help text metric_help = "Gets metric/performance information about the specified GPU" - metric_subcommand_help = "If no argument is provided, return metric information for all GPUs on the system.\ - \nIf no metric argument is specified all metric information will be displayed." + metric_subcommand_help = "If no GPU is specified, returns metric information for all GPUs on the system.\ + \nIf no metric argument is provided all metric information will be displayed." metric_optionals_title = "Metric arguments" # Optional arguments help text @@ -483,8 +484,8 @@ class AMDSMIParser(argparse.ArgumentParser): # Subparser help text process_help = "Lists general process information running on the specified GPU" - process_subcommand_help = "If no argument is provided, returns information for all GPUs on the system.\ - \nIf no argument is provided all process information will be displayed." + process_subcommand_help = "If no GPU is specified, returns information for all GPUs on the system.\ + \nIf no process argument is provided all process information will be displayed." 
process_optionals_title = "Process arguments" # Optional Arguments help text @@ -522,7 +523,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Subparser help text profile_help = "Displays information about all profiles and current profile" - profile_subcommand_help = "If no argument is provided, returns information for all GPUs on the system." + profile_subcommand_help = "If no GPU is specified, returns information for all GPUs on the system." profile_optionals_title = "Profile Arguments" # Create profile subparser @@ -543,7 +544,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Subparser help text event_help = "Displays event information for the given GPU" - event_subcommand_help = "If no argument is provided, returns event information for all GPUs on the system." + event_subcommand_help = "If no GPU is specified, returns event information for all GPUs on the system." event_optionals_title = "Event Arguments" # Create event subparser @@ -558,14 +559,14 @@ class AMDSMIParser(argparse.ArgumentParser): def _add_topology_parser(self, subparsers, func): - return if not(self.helpers.is_baremetal() and self.helpers.is_linux()): # This subparser is only applicable to Baremetal Linux return # Subparser help text topology_help = "Displays topology information of the devices." - topology_subcommand_help = "If no argument is provided, returns information for all GPUs on the system." + topology_subcommand_help = "If no GPU is specified, returns information for all GPUs on the system.\ + \nIf no topology argument is provided all topology information will be displayed." topology_optionals_title = "Topology arguments" # Help text for Arguments only on Guest and BM platforms @@ -602,7 +603,8 @@ class AMDSMIParser(argparse.ArgumentParser): # Subparser help text set_value_help = "Set options for devices." - set_value_subcommand_help = "The user must specify one of the options for the set configuration." 
+ set_value_subcommand_help = "A GPU must be specified to set a configuration.\ + \nA set argument must be provided; Multiple set arguments are accepted" set_value_optionals_title = "Set Arguments" # Help text for Arguments only on Guest and BM platforms @@ -649,7 +651,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_parser.add_argument('-O', '--memoverdrive', action=self._validate_overdrive_percent(), required=False, help=set_mem_overdrive_help, metavar='%') set_value_parser.add_argument('-w', '--poweroverdrive', action=self._prompt_spec_warning(), type=self._positive_int, required=False, help=set_power_overdrive_help, metavar="WATTS") set_value_parser.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='SETPROFILE') - set_value_parser.add_argument('-d', '--perfdeterminism', action='store', type=self._positive_int, required=False, help=set_perf_det_help, metavar='SCLK') + set_value_parser.add_argument('-d', '--perfdeterminism', action='store', type=self._positive_int, required=False, help=set_perf_det_help, metavar='SCLKMAX') def _validate_set_clock(self, validate_clock_type=True): @@ -752,7 +754,8 @@ class AMDSMIParser(argparse.ArgumentParser): # Subparser help text reset_help = "Reset options for devices." - reset_subcommand_help = "The user must specify one of the options to reset devices." 
+ reset_subcommand_help = "A GPU must be specified to reset a configuration.\ + \nA reset argument must be provided; Multiple reset arguments are accepted" reset_optionals_title = "Reset Arguments" # Help text for Arguments only on Guest and BM platforms @@ -788,7 +791,7 @@ class AMDSMIParser(argparse.ArgumentParser): return # Subparser help text rocm_smi_help = "Legacy rocm_smi commands ported for backward compatibility" - rocm_smi_subcommand_help = "If no argument is provided, return showall and print the information for all\ + rocm_smi_subcommand_help = "If no GPU is specified, returns showall and prints the information for all\ GPUs on the system." rocm_smi_optionals_title = "rocm_smi Arguments"