From f58613561c3c52197e484ad0d6fa02e218c3b68e Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Wed, 21 Feb 2024 03:48:09 -0600 Subject: [PATCH] Refactor ESMI Initialization and Argument Parsing Signed-off-by: Maisam Arif Change-Id: Iefab3a8110e0d3c525ee0cef1bdef9101550e9de --- amdsmi_cli/README.md | 16 +- amdsmi_cli/amdsmi_commands.py | 1701 ++++++++++++++++++-------------- amdsmi_cli/amdsmi_helpers.py | 22 +- amdsmi_cli/amdsmi_init.py | 58 +- amdsmi_cli/amdsmi_logger.py | 3 +- amdsmi_cli/amdsmi_parser.py | 1089 ++++++++++---------- include/amd_smi/amdsmi.h | 4 +- py-interface/README.md | 9 +- py-interface/amdsmi_wrapper.py | 4 +- src/amd_smi/amd_smi_system.cc | 4 +- 10 files changed, 1615 insertions(+), 1295 deletions(-) diff --git a/amdsmi_cli/README.md b/amdsmi_cli/README.md index 470d8de6eb..0fa72b8534 100644 --- a/amdsmi_cli/README.md +++ b/amdsmi_cli/README.md @@ -183,7 +183,7 @@ Static Arguments: -l, --limit All limit metric values (i.e. power and thermal limits) -u, --numa All numa node information -CPU Option: +CPU Arguments: -s, --smu All SMU FW information -i, --interface_ver Displays hsmp interface version @@ -276,7 +276,7 @@ usage: amd-smi metric [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [--cpu_lclk_dpm_level NBIOID] [--cpu_pwr_svi_telemtry_rails] [--cpu_io_bandwidth IO_BW LINKID_NAME] [--cpu_xgmi_bandwidth XGMI_BW LINKID_NAME] [--cpu_enable_apb] - [--cpu_disable_apb DF_PSTATE] [--set_cpu_pow_limit POW_LIMIT] + [--cpu_disable_apb DF_PSTATE] [--set_cpu_pwr_limit PWR_LIMIT] [--set_cpu_xgmi_link_width MIN_WIDTH MAX_WIDTH] [--set_cpu_lclk_dpm_level NBIOID MIN_DPM MAX_DPM] [--core_boost_limit] [--core_curr_active_freq_core_limit] @@ -285,10 +285,10 @@ usage: amd-smi metric [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL] [--cpu_metrics_table] [--core_energy] [--socket_energy] [--set_cpu_pwr_eff_mode MODE] [--cpu_ddr_bandwidth] [--cpu_temp] [--cpu_dimm_temp_range_rate DIMM_ADDR] - [--cpu_dimm_pow_conumption DIMM_ADDR] + [--cpu_dimm_pow_consumption DIMM_ADDR] [--cpu_dimm_thermal_sensor DIMM_ADDR] [--set_cpu_gmi3_link_width MIN_LW MAX_LW] - [--set_cpu_pcie_lnk_rate LINK_RATE] + [--set_cpu_pcie_link_rate LINK_RATE] [--set_cpu_df_pstate_range MAX_PSTATE MIN_PSTATE] If no GPU is specified, returns metric information for all GPUs on the system. @@ -329,7 +329,7 @@ Metric arguments: -x, --xgmi-err XGMI error information since last read -E, --energy Amount of energy consumed -CPU Option: +CPU Arguments: --cpu_power_metrics Cpu power metrics --cpu_prochot Displays prochot status --cpu_freq_metrics Displays currentFclkMemclk frequencies and cclk frequency limit @@ -353,18 +353,18 @@ CPU Option: --cpu_ddr_bandwidth Displays per socket max ddr bw, current utilized bw and current utilized ddr bw in percentage --cpu_temp Displays cpu socket temperature --cpu_dimm_temp_range_rate DIMM_ADDR Displays dimm temperature range and refresh rate - --cpu_dimm_pow_conumption DIMM_ADDR Displays dimm power consumption + --cpu_dimm_pow_consumption DIMM_ADDR Displays dimm power consumption --cpu_dimm_thermal_sensor DIMM_ADDR Displays dimm thermal sensor Set Options: - --set_cpu_pow_limit POW_LIMIT Set power limit for the given socket. Input parameter is power limit value. + --set_cpu_pwr_limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value. --set_cpu_xgmi_link_width MIN_WIDTH MAX_WIDTH Set max and Min linkwidth. Input parameters are min and max link width values --set_cpu_lclk_dpm_level NBIOID MIN_DPM MAX_DPM Sets the max and min dpm level on a given NBIO. Inpur parameters are die_index, min dpm, max dpm. --set_soc_boost_limit BOOST_LIMIT Sets the boost limit for the given socket. Input parameter is socket limit value --set_core_boost_limit BOOST_LIMIT Sets the boost limit for the given core. Input parameter is core limit value --set_cpu_pwr_eff_mode MODE Sets the power efficency mode policy. Input parameter is mode. --set_cpu_gmi3_link_width MIN_LW MAX_LW Sets max and min gmi3 link width range - --set_cpu_pcie_lnk_rate LINK_RATE Sets pcie link rate + --set_cpu_pcie_link_rate LINK_RATE Sets pcie link rate --set_cpu_df_pstate_range MAX_PSTATE MIN_PSTATE Sets max and min df-pstates Command Modifiers: diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index dfc108ae25..bb717e498d 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -46,44 +46,55 @@ class AMDSMICommands(): self.device_handles = [] self.cpu_handles = [] self.core_handles = [] - try: - self.device_handles = amdsmi_interface.amdsmi_get_processor_handles() - except amdsmi_exception.AmdSmiLibraryException as e: - if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, - amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED): - logging.error('Unable to get devices, driver not initialized (amdgpu not found in modules)') - else: - raise e - - if len(self.device_handles) == 0: - logging.info('Unable to detect any devices, check if driver is initialized (amdgpu not found in modules)') - - # Fetch CPU handles - try: - self.cpu_handles = amdsmi_interface.amdsmi_get_cpusocket_handles() - except amdsmi_exception.AmdSmiLibraryException as e: - if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, - amdsmi_interface.amdsmi_wrapper.AMDSMI_NO_DRV): - - logging.info('Unable to get CPU devices, hsmp driver not loaded') - else: - raise e - - # core handles - try: - self.core_handles = amdsmi_interface.amdsmi_get_cpucore_handles() - except amdsmi_exception.AmdSmiLibraryException as e: - if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, - amdsmi_interface.amdsmi_wrapper.AMDSMI_NO_DRV): - logging.info('Unable to get CORE devices, hsmp driver not loaded') - else: - raise e - - if (len(self.device_handles) == 0 and len(self.cpu_handles) == 0 and len(self.core_handles) == 0): - logging.error('Unable to detect any devices, check if amdgpu and hsmp drivers are initialized') - sys.exit(-1) self.stop = '' + amdsmi_init_flag = self.helpers.get_amdsmi_init_flag() + logging.debug(f"AMDSMI Init Flag: {amdsmi_init_flag}") + exit_flag = False + + if self.helpers.is_amdgpu_initialized(): + try: + self.device_handles = amdsmi_interface.amdsmi_get_processor_handles() + except amdsmi_exception.AmdSmiLibraryException as e: + if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, + amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED): + logging.error('Unable to get devices, driver not initialized (amdgpu not found in modules)') + else: + raise e + + if len(self.device_handles) == 0: + # No GPU's found post amdgpu driver initialization + logging.error('Unable to detect any GPU devices, check amdgpu version and module status') + exit_flag = True + + if self.helpers.is_amd_hsmp_initialized(): + try: + self.cpu_handles = amdsmi_interface.amdsmi_get_cpusocket_handles() + except amdsmi_exception.AmdSmiLibraryException as e: + if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, + amdsmi_interface.amdsmi_wrapper.AMDSMI_NO_DRV): + logging.info('Unable to get CPU devices, amd_hsmp driver not loaded') + else: + raise e + + # core handles + try: + self.core_handles = amdsmi_interface.amdsmi_get_cpucore_handles() + except amdsmi_exception.AmdSmiLibraryException as e: + if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, + amdsmi_interface.amdsmi_wrapper.AMDSMI_NO_DRV): + logging.info('Unable to get CORE devices, amd_hsmp driver not loaded') + else: + raise e + + if len(self.cpu_handles) == 0 and len(self.core_handles) == 0: + # No CPU's found post amd_hsmp driver initialization + logging.error('Unable to detect any CPU devices, check amd_hsmp version and module status') + exit_flag = True + + if exit_flag: + sys.exit(-1) + def version(self, args): """Print Version String @@ -164,7 +175,7 @@ class AMDSMICommands(): self.logger.print_output() - def get_static_cpu(self, args, multiple_devices=False, cpu=None): + def static_cpu(self, args, multiple_devices=False, cpu=None, interface_ver=None): """Get Static information for target cpu Args: @@ -176,57 +187,61 @@ class AMDSMICommands(): None: Print output via AMDSMILogger to destination """ - if (cpu): + if cpu: args.cpu = cpu + if interface_ver: + args.interface_ver = interface_ver - #store cpu args that are applicable to the current platform + # Store cpu args that are applicable to the current platform curr_platform_cpu_args = ["smu", "interface_ver"] curr_platform_cpu_values = [args.smu, args.interface_ver] - if (not any(curr_platform_cpu_values)): + # If no cpu options are passed, return all available args + if not any(curr_platform_cpu_values): for arg in curr_platform_cpu_args: setattr(args, arg, True) - if (len(self.cpu_handles)): - handled_multiple_cpus, device_handle = self.helpers.handle_cpus(args, - self.logger, - self.get_static_cpu) - if handled_multiple_cpus: - return # This function is recursive - args.cpu = device_handle - # get cpu id for logging - cpu_id = self.helpers.get_cpu_id_from_device_handle(args.cpu) - logging.debug(f"Static Arg information for CPU {cpu_id} on {self.helpers.os_info()}") + # Handle multiple CPUs + handled_multiple_cpus, device_handle = self.helpers.handle_cpus(args, + self.logger, + self.static_cpu) + if handled_multiple_cpus: + return # This function is recursive + args.cpu = device_handle - static_dict = {} + # Get cpu id for logging + cpu_id = self.helpers.get_cpu_id_from_device_handle(args.cpu) + logging.debug(f"Static Arg information for CPU {cpu_id} on {self.helpers.os_info()}") - if (args.smu): - try: - smu = amdsmi_interface.amdsmi_get_cpu_smu_fw_version(args.cpu) - static_dict["smu"] = {"FW_VERSION" : f"{ smu['smu_fw_major_ver_num']}" - f".{smu['smu_fw_minor_ver_num']}.{smu['smu_fw_debug_ver_num']}"} - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["smu"] = "N/A" - logging.debug("Failed to get SMU FW for cpu %s | %s", cpu_id, e.get_error_info()) + static_dict = {} - if (args.interface_ver): - static_dict["interface_version"] = {} - try: - intf_ver = amdsmi_interface.amdsmi_get_cpu_hsmp_proto_ver(args.cpu) - static_dict["interface_version"]["proto version"] = intf_ver - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["interface_version"]["proto version"] = "N/A" - logging.debug("Failed to get proto version for cpu %s | %s", cpu_id, e.get_error_info()) + if args.smu: + try: + smu = amdsmi_interface.amdsmi_get_cpu_smu_fw_version(args.cpu) + static_dict["smu"] = {"FW_VERSION" : f"{smu['smu_fw_major_ver_num']}." + f"{smu['smu_fw_minor_ver_num']}.{smu['smu_fw_debug_ver_num']}"} + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["smu"] = "N/A" + logging.debug("Failed to get SMU FW for cpu %s | %s", cpu_id, e.get_error_info()) - multiple_devices_csv_override = False - self.logger.store_cpu_output(args.cpu, 'values', static_dict) - if multiple_devices: - self.logger.store_multiple_device_output() - return # Skip printing when there are multiple devices - self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override) + if args.interface_ver: + static_dict["interface_version"] = {} + try: + intf_ver = amdsmi_interface.amdsmi_get_cpu_hsmp_proto_ver(args.cpu) + static_dict["interface_version"]["proto version"] = intf_ver + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["interface_version"]["proto version"] = "N/A" + logging.debug("Failed to get proto version for cpu %s | %s", cpu_id, e.get_error_info()) + + multiple_devices_csv_override = False + self.logger.store_cpu_output(args.cpu, 'values', static_dict) + if multiple_devices: + self.logger.store_multiple_device_output() + return # Skip printing when there are multiple devices + self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override) - def get_static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, + def static_gpu(self, args, multiple_devices=False, gpu=None, asic=None, bus=None, vbios=None, limit=None, driver=None, ras=None, board=None, numa=None, vram=None, cache=None, partition=None, dfc_ucode=None, fb_info=None, num_vf=None): """Get Static information for target gpu @@ -303,11 +318,11 @@ class AMDSMICommands(): current_platform_args += ["dfc_ucode", "fb_info", "num_vf"] current_platform_values += [args.dfc_ucode, args.fb_info, args.num_vf] - if (not any(current_platform_values)): + if not any(current_platform_values): for arg in current_platform_args: setattr(args, arg, True) - handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.get_static_gpu) + handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.static_gpu) if handled_multiple_gpus: return # This function is recursive args.gpu = device_handle @@ -318,8 +333,8 @@ class AMDSMICommands(): logging.debug(f"Applicable Args: {current_platform_args}") logging.debug(f"Arg Values: {current_platform_values}") + # Populate static dictionary for each enabled argument static_dict = {} - if args.asic: try: asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(args.gpu) @@ -651,8 +666,8 @@ class AMDSMICommands(): static_dict['numa'] = {'node' : numa_node_number, 'affinity' : numa_affinity} - multiple_devices_csv_override = False # Convert and store output by pid for csv format + multiple_devices_csv_override = False if self.logger.is_csv_format(): # expand if ras blocks are populated if self.helpers.is_linux() and self.helpers.is_baremetal() and args.ras: @@ -715,55 +730,66 @@ class AMDSMICommands(): Returns: None: Print output via AMDSMILogger to destination """ - # Set args.* to passed in arguments - if gpu: - args.gpu = gpu + # Mutually exclusive arguments if cpu: args.cpu = cpu - if interface_ver: - args.interface_ver = interface_ver + if gpu: + args.gpu = gpu - gpus = args.gpu - cpus = args.cpu + # Check if a CPU argument has been set + cpu_args_enabled = False + cpu_attributes = ["smu", "interface_ver"] + for attr in cpu_attributes: + if hasattr(args, attr): + cpu_args_enabled |= bool(getattr(args, attr)) - gpu_options = any([args.gpu, args.asic, args.bus, args.vbios, args.driver, args.vram, args.cache, args.board]) - cpu_options = any([args.smu, args.interface_ver]) + # Check if a GPU argument has been set + gpu_args_enabled = False + gpu_attributes = ["asic", "bus", "vbios", "limit", "driver", "ras", + "board", "numa", "vram", "cache", "partition", + "dfc_ucode", "fb_info", "num_vf"] + for attr in gpu_attributes: + if hasattr(args, attr): + gpu_args_enabled |= bool(getattr(args, attr)) - # Handle No GPU passed - if args.gpu == None: - args.gpu = self.device_handles + # Handle CPU and GPU intialization cases + if self.helpers.is_amd_hsmp_initialized() and self.helpers.is_amdgpu_initialized(): + # Print out all CPU and all GPU static info only if no device was specified. + # If a GPU or CPU argument is provided only print out the specified device. + if args.cpu == None and args.gpu == None: + if not cpu_args_enabled and not gpu_args_enabled: + args.cpu = self.cpu_handles + args.gpu = self.device_handles - # Handle No CPU passed - if args.cpu == None: - args.cpu = self.cpu_handles + # Handle cases where the user has only specified an argument and no specific device + if args.gpu == None and gpu_args_enabled: + args.gpu = self.device_handles + if args.cpu == None and cpu_args_enabled: + args.cpu = self.cpu_handles - if (len(self.cpu_handles) and ((((not gpus) and (not cpus)) or cpus) - and not gpu_options)): - self.get_static_cpu(args, cpu) - else: - logging.info("No CPU devices present") + if args.cpu: + self.static_cpu(args, multiple_devices, cpu, interface_ver) + if args.gpu: + self.logger.output = {} + self.logger.clear_multiple_devices_ouput() + self.static_gpu(args, multiple_devices, gpu, asic, + bus, vbios, limit, driver, ras, + board, numa, vram, cache, partition, + dfc_ucode, fb_info, num_vf) + elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized + if args.cpu == None: + args.cpu = self.cpu_handles - if (cpu_options and (len(self.cpu_handles) == 0)): - logging.error("No CPU devices present") - sys.exit(-1) + self.static_cpu(args, multiple_devices, cpu, interface_ver) + elif self.helpers.is_amdgpu_initialized(): # Only GPU is initialized + if args.gpu == None: + args.gpu = self.device_handles - if (len(self.device_handles) and ((((not gpus) and (not cpus)) or gpus) - and not cpu_options)): self.logger.clear_multiple_devices_ouput() - self.get_static_gpu(args, multiple_devices, gpu, asic, + self.static_gpu(args, multiple_devices, gpu, asic, bus, vbios, limit, driver, ras, board, numa, vram, cache, partition, dfc_ucode, fb_info, num_vf) - else: - logging.info("No GPU devices present") - - if (gpu_options and (len(self.device_handles) == 0)): - logging.error("No GPU devices present") - sys.exit(-1) - - if (len(self.cpu_handles) == 0 and len(self.device_handles) == 0): - logging.error("No CPU and GPU devices present") - sys.exit(-1) def firmware(self, args, multiple_devices=False, gpu=None, fw_list=True): @@ -972,7 +998,7 @@ class AMDSMICommands(): Args: args (Namespace): Namespace containing the parsed CLI args multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. - watching_output (bool, optional): True if watch option has been set. Defaults to False. + watching_output (bool, optional): True if watch argument has been set. Defaults to False. gpu (device_handle, optional): device_handle for target device. Defaults to None. usage (bool, optional): Value override for args.usage. Defaults to None. watch (Positive int, optional): Value override for args.watch. Defaults to None. @@ -1077,7 +1103,7 @@ class AMDSMICommands(): # Handle watch logic, will only enter this block once if args.watch: - self.helpers.handle_watch(args=args, subcommand=self.metric, logger=self.logger) + self.helpers.handle_watch(args=args, subcommand=self.metric_gpu, logger=self.logger) return # Handle multiple GPUs @@ -1125,6 +1151,7 @@ class AMDSMICommands(): logging.debug(f"Metric Arg information for GPU {gpu_id} on {self.helpers.os_info()}") logging.debug(f"Args: {current_platform_args}") logging.debug(f"Values: {current_platform_values}") + # Set the platform applicable args to True if no args are set if not any(current_platform_values): for arg in current_platform_args: @@ -1653,14 +1680,12 @@ class AMDSMICommands(): self.logger.store_watch_output(multiple_device_enabled=False) - def metric_cpu(self, args, multiple_devices=False, cpu=None, power_metrics=None, prochot=None, - freq_metrics=None, c0_res=None, lclk_dpm_level=None,pwr_svi_telemtry_rails=None, - io_bandwidth=None, xgmi_bandwidth=None, enable_apb=None, disable_apb=None, - set_pow_limit=None, set_xgmi_link_width=None, set_lclk_dpm_level=None, - set_soc_boost_limit=None, metrics_ver=None, metrics_table=None, socket_energy=None, - set_pwr_eff_mode=None, ddr_bandwidth=None, cpu_temp=None, dimm_temp_range_rate=None, - dimm_pow_conumption=None, dimm_thermal_sensor=None, set_gmi3_link_width=None, - set_pcie_lnk_rate=None, set_df_pstate_range=None): + def metric_cpu(self, args, multiple_devices=False, cpu=None, cpu_power_metrics=None, cpu_prochot=None, + cpu_freq_metrics=None, cpu_c0_res=None, cpu_lclk_dpm_level=None, + cpu_pwr_svi_telemtry_rails=None, cpu_io_bandwidth=None, cpu_xgmi_bandwidth=None, + cpu_metrics_ver=None, cpu_metrics_table=None, cpu_socket_energy=None, + cpu_ddr_bandwidth=None, cpu_temp=None, cpu_dimm_temp_range_rate=None, + cpu_dimm_pow_consumption=None, cpu_dimm_thermal_sensor=None): """Get Metric information for target cpu Args: @@ -1668,531 +1693,364 @@ class AMDSMICommands(): multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. cpu (cpu_handle, optional): device_handle for target device. Defaults to None. cpu_power_metrics (bool, optional): Value override for args.cpu_power_metrics. Defaults to None - prochot (bool, optional): Value override for args.prochot. Defaults to None. - freq_metrics (bool, optional): Value override for args.freq_metrics. Defaults to None. - c0_res (bool, optional): Value override for args.c0_res. Defaults to None - lclk_dpm_level (list, optional): Value override for args.lclk_dpm_level. Defaults to None - pwr_svi_telemtry_rails (list, optional): value override for args.pwr_svi_telemtry_rails. Defaults to None - io_bandwidth (list, optional): value override for args.io_bandwidth. Defaults to None - xgmi_bandwidth (list, optional): value override for args.xgmi_bandwidth. Defaults to None - enable_apb (bool, optional): Value override for args.enable_apb. Defaults to None - disable_apb (bool, optional): Value override for args.disable_apb. Defaults to None - set_pow_limit (int, optional): Value override for args.cpu_set_pow_limit. Defaults to None - set_xgmi_link_width (list, optional): Value override for args.set_cpu_xgmi_link_width. Defaults to None - set_lclk_dpm_level (list, optional): Value override for args.set_cpu_lclk_dpm_level. Defaults to None - set_soc_boost_limit (list, optional): Value override for args.set_soc_boost_limit. Defaults to None - metrics_ver (bool, optional): Value override for args.cpu_metrics_ver. Defaults to None - metrics_table (bool, optional): Value override for args.cpu_metrics_table. Defaults to None - socket_energy (bool, optional): Value override for args.socket_energy. Defaults to None - set_pwr_eff_mode (list, optional): Value override for args.set_cpu_pwr_eff_mode. Defaults to None - ddr_bandwidth (bool, optional): Value override for args.ddr_bandwidth. Defaults to None + cpu_prochot (bool, optional): Value override for args.cpu_prochot. Defaults to None. + cpu_freq_metrics (bool, optional): Value override for args.cpu_freq_metrics. Defaults to None. + cpu_c0_res (bool, optional): Value override for args.cpu_c0_res. Defaults to None + cpu_lclk_dpm_level (list, optional): Value override for args.cpu_lclk_dpm_level. Defaults to None + cpu_pwr_svi_telemtry_rails (list, optional): value override for args.cpu_pwr_svi_telemtry_rails. Defaults to None + cpu_io_bandwidth (list, optional): value override for args.cpu_io_bandwidth. Defaults to None + cpu_xgmi_bandwidth (list, optional): value override for args.cpu_xgmi_bandwidth. Defaults to None + cpu_metrics_ver (bool, optional): Value override for args.cpu_metrics_ver. Defaults to None + cpu_metrics_table (bool, optional): Value override for args.cpu_metrics_table. Defaults to None + cpu_socket_energy (bool, optional): Value override for args.cpu_socket_energy. Defaults to None + cpu_ddr_bandwidth (bool, optional): Value override for args.cpu_ddr_bandwidth. Defaults to None cpu_temp (bool, optional): Value override for args.cpu_temp. Defaults to None - dimm_temp_range_rate (list, optional): Dimm address.Value override for args.cpu_dimm_temp_range_rate. Defaults to None - dimm_pow_conumption (list, optional): Dimm address. Value override for args.cpu_dimm_pow_conumption. Defaults to None - dimm_thermal_sensor (list, optional): Dimm address. Value override for args.cpu_dimm_thermal_sensor. Defaults to None - set_gmi3_link_width (list, optional): Min and Max link wdiths.Value override for args.set_cpu_gmi3_link_width. Defaults to None - set_pcie_lnk_rate (list, optional): Link rate.Value override for args.set_cpu_pcie_lnk_rate. Defaults to None - set_df_pstate_range (list, optional): Max and Min pstates.Value override for args.set_cpu_df_pstate_range. Defaults to None + cpu_dimm_temp_range_rate (list, optional): Dimm address. Value override for args.cpu_dimm_temp_range_rate. Defaults to None + cpu_dimm_pow_consumption (list, optional): Dimm address. Value override for args.cpu_dimm_pow_consumption. Defaults to None + cpu_dimm_thermal_sensor (list, optional): Dimm address. Value override for args.cpu_dimm_thermal_sensor. Defaults to None + Returns: None: Print output via AMDSMILogger to destination """ - if (cpu): + if cpu: args.cpu = cpu - if (power_metrics): - args.cpu_power_metrics = power_metrics - if (prochot): - args.cpu_prochot = prochot - if (freq_metrics): - args.cpu_freq_metrics = freq_metrics - if (c0_res): - args.cpu_c0_res = c0_res - if (lclk_dpm_level): - args.cpu_lclk_dpm_level = lclk_dpm_level - if (pwr_svi_telemtry_rails): - args.cpu_pwr_svi_telemtry_rails = pwr_svi_telemtry_rails - if (io_bandwidth): - args.cpu_io_bandwidth = io_bandwidth - if (xgmi_bandwidth): - args.cpu_xgmi_bandwidth = xgmi_bandwidth - if (enable_apb): - args.cpu_enable_apb = enable_apb - if (disable_apb): - args.cpu_disable_apb = disable_apb - if (set_pow_limit): - args.set_cpu_pow_limit = set_pow_limit - if (set_xgmi_link_width): - args.set_xgmi_link_width = set_xgmi_link_width - if (set_lclk_dpm_level): - args.set_lclk_dpm_level = set_lclk_dpm_level - if (set_soc_boost_limit): - args.set_soc_boost_limit = set_soc_boost_limit - if (metrics_ver): - args.cpu_metrics_ver = metrics_ver - if (metrics_table): - args.cpu_metrics_table = metrics_table - if (socket_energy): - args.socket_energy = socket_energy - if (set_pwr_eff_mode): - args.set_cpu_pwr_eff_mode = set_pwr_eff_mode - if (ddr_bandwidth): - args.set_cpu_pwr_eff_mode = ddr_bandwidth - if (cpu_temp): + if cpu_power_metrics: + args.cpu_power_metrics = cpu_power_metrics + if cpu_prochot: + args.cpu_prochot = cpu_prochot + if cpu_freq_metrics: + args.cpu_freq_metrics = cpu_freq_metrics + if cpu_c0_res: + args.cpu_c0_res = cpu_c0_res + if cpu_lclk_dpm_level: + args.cpu_lclk_dpm_level = cpu_lclk_dpm_level + if cpu_pwr_svi_telemtry_rails: + args.cpu_pwr_svi_telemtry_rails = cpu_pwr_svi_telemtry_rails + if cpu_io_bandwidth: + args.cpu_io_bandwidth = cpu_io_bandwidth + if cpu_xgmi_bandwidth: + args.cpu_xgmi_bandwidth = cpu_xgmi_bandwidth + if cpu_metrics_ver: + args.cpu_metrics_ver = cpu_metrics_ver + if cpu_metrics_table: + args.cpu_metrics_table = cpu_metrics_table + if cpu_socket_energy: + args.cpu_socket_energy = cpu_socket_energy + if cpu_ddr_bandwidth: + args.cpu_ddr_bandwidth = cpu_ddr_bandwidth + if cpu_temp: args.cpu_temp = cpu_temp - if (dimm_temp_range_rate): - args.cpu_dimm_temp_range_rate = dimm_temp_range_rate - if (dimm_pow_conumption): - args.cpu_dimm_pow_conumption = dimm_pow_conumption - if (dimm_thermal_sensor): - args.cpu_dimm_thermal_sensor = dimm_thermal_sensor - if (set_gmi3_link_width): - args.set_cpu_gmi3_link_width = set_gmi3_link_width - if (set_pcie_lnk_rate): - args.set_cpu_pcie_lnk_rate = set_pcie_lnk_rate - if (set_df_pstate_range): - args.set_cpu_df_pstate_range = set_df_pstate_range - + if cpu_dimm_temp_range_rate: + args.cpu_dimm_temp_range_rate = cpu_dimm_temp_range_rate + if cpu_dimm_pow_consumption: + args.cpu_dimm_pow_consumption = cpu_dimm_pow_consumption + if cpu_dimm_thermal_sensor: + args.cpu_dimm_thermal_sensor = cpu_dimm_thermal_sensor #store cpu args that are applicable to the current platform curr_platform_cpu_args = ["cpu_power_metrics", "cpu_prochot", "cpu_freq_metrics", "cpu_c0_res", "cpu_lclk_dpm_level", "cpu_pwr_svi_telemtry_rails", - "cpu_io_bandwidth", "cpu_xgmi_bandwidth", "cpu_disable_apb", - "set_cpu_pow_limit","set_cpu_xgmi_link_width", "set_cpu_lclk_dpm_level", - "set_soc_boost_limit", "cpu_metrics_ver", "cpu_metrics_table", - "socket_energy", "set_cpu_pwr_eff_mode", "cpu_ddr_bandwidth", - "cpu_temp", "cpu_dimm_temp_range_rate", "cpu_dimm_pow_conumption", - "cpu_dimm_thermal_sensor", "set_cpu_gmi3_link_width", "set_cpu_pcie_lnk_rate", - "set_cpu_df_pstate_range", "cpu_enable_apb"] + "cpu_io_bandwidth", "cpu_xgmi_bandwidth", "cpu_metrics_ver", + "cpu_metrics_table", "cpu_socket_energy", "cpu_ddr_bandwidth", + "cpu_temp", "cpu_dimm_temp_range_rate", "cpu_dimm_pow_consumption", + "cpu_dimm_thermal_sensor"] curr_platform_cpu_values = [args.cpu_power_metrics, args.cpu_prochot, args.cpu_freq_metrics, args.cpu_c0_res, args.cpu_lclk_dpm_level, args.cpu_pwr_svi_telemtry_rails, - args.cpu_io_bandwidth, args.cpu_xgmi_bandwidth, args.cpu_disable_apb, - args.set_cpu_pow_limit, args.set_cpu_xgmi_link_width, args.set_cpu_lclk_dpm_level, - args.set_soc_boost_limit, args.cpu_metrics_ver, args.cpu_metrics_table, - args.socket_energy, args.set_cpu_pwr_eff_mode, args.cpu_ddr_bandwidth, - args.cpu_temp, args.cpu_dimm_temp_range_rate, args.cpu_dimm_pow_conumption, - args.cpu_dimm_thermal_sensor, args.set_cpu_gmi3_link_width, args.set_cpu_pcie_lnk_rate, - args.set_cpu_df_pstate_range, args.cpu_enable_apb] + args.cpu_io_bandwidth, args.cpu_xgmi_bandwidth, args.cpu_metrics_ver, + args.cpu_metrics_table, args.cpu_socket_energy, args.cpu_ddr_bandwidth, + args.cpu_temp, args.cpu_dimm_temp_range_rate, args.cpu_dimm_pow_consumption, + args.cpu_dimm_thermal_sensor] - - # Handle No CPU passed + # Handle No CPU passed (fall back as this should be defined in metric()) if args.cpu == None: args.cpu = self.cpu_handles - if (not any(curr_platform_cpu_values)): + if not any(curr_platform_cpu_values): for arg in curr_platform_cpu_args: - if arg not in("cpu_lclk_dpm_level", "cpu_io_bandwidth", "cpu_xgmi_bandwidth", "cpu_disable_apb", - "set_cpu_pow_limit", "set_cpu_xgmi_link_width", "set_cpu_lclk_dpm_level", - "set_soc_boost_limit", "set_cpu_pwr_eff_mode", "cpu_dimm_temp_range_rate", - "cpu_dimm_temp_range_rate", "cpu_dimm_pow_conumption", "cpu_dimm_thermal_sensor", - "set_cpu_gmi3_link_width", "set_cpu_pcie_lnk_rate", "set_cpu_df_pstate_range", - "cpu_enable_apb"): + if arg not in("cpu_lclk_dpm_level", "cpu_io_bandwidth", "cpu_xgmi_bandwidth", + "cpu_dimm_temp_range_rate", "cpu_dimm_pow_consumption", "cpu_dimm_thermal_sensor"): setattr(args, arg, True) - if (len(self.cpu_handles)): - handled_multiple_cpus, device_handle = self.helpers.handle_cpus(args, - self.logger, - self.metric_cpu) - if handled_multiple_cpus: - return # This function is recursive - args.cpu = device_handle - # get cpu id for logging - cpu_id = self.helpers.get_cpu_id_from_device_handle(args.cpu) - logging.debug(f"Metric Arg information for CPU {cpu_id} on {self.helpers.os_info()}") + handled_multiple_cpus, device_handle = self.helpers.handle_cpus(args, + self.logger, + self.metric_cpu) + if handled_multiple_cpus: + return # This function is recursive + args.cpu = device_handle + # get cpu id for logging + cpu_id = self.helpers.get_cpu_id_from_device_handle(args.cpu) + logging.debug(f"Metric Arg information for CPU {cpu_id} on {self.helpers.os_info()}") - static_dict = {} - if (args.cpu_power_metrics): - static_dict["power_metrics"] = {} - try: - soc_pow = amdsmi_interface.amdsmi_get_cpu_socket_power(args.cpu) - static_dict["power_metrics"]["socket power"] = soc_pow - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["power_metrics"]["socket power"] = "N/A" - logging.debug("Failed to get socket power for cpu %s | %s", cpu_id, e.get_error_info()) + static_dict = {} + if args.cpu_power_metrics: + static_dict["power_metrics"] = {} + try: + soc_pow = amdsmi_interface.amdsmi_get_cpu_socket_power(args.cpu) + static_dict["power_metrics"]["socket power"] = soc_pow + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["power_metrics"]["socket power"] = "N/A" + logging.debug("Failed to get socket power for cpu %s | %s", cpu_id, e.get_error_info()) - try: - soc_pow_limit = amdsmi_interface.amdsmi_get_cpu_socket_power_cap(args.cpu) - static_dict["power_metrics"]["socket power limit"] = soc_pow_limit - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["power_metrics"]["socket power limit"] = "N/A" - logging.debug("Failed to get socket power limit for cpu %s | %s", cpu_id, e.get_error_info()) + try: + soc_pwr_limit = amdsmi_interface.amdsmi_get_cpu_socket_power_cap(args.cpu) + static_dict["power_metrics"]["socket power limit"] = soc_pwr_limit + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["power_metrics"]["socket power limit"] = "N/A" + logging.debug("Failed to get socket power limit for cpu %s | %s", cpu_id, e.get_error_info()) - try: - soc_max_pow_limit = amdsmi_interface.amdsmi_get_cpu_socket_power_cap_max(args.cpu) - static_dict["power_metrics"]["socket max power limit"] = soc_max_pow_limit - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["power_metrics"]["socket max power limit"] = "N/A" - logging.debug("Failed to get max socket power limit for cpu %s | %s", cpu_id, e.get_error_info()) + try: + soc_max_pwr_limit = amdsmi_interface.amdsmi_get_cpu_socket_power_cap_max(args.cpu) + static_dict["power_metrics"]["socket max power limit"] = soc_max_pwr_limit + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["power_metrics"]["socket max power limit"] = "N/A" + logging.debug("Failed to get max socket power limit for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_prochot: + static_dict["prochot"] = {} + try: + proc_status = amdsmi_interface.amdsmi_get_cpu_prochot_status(args.cpu) + static_dict["prochot"]["prochot_status"] = proc_status + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["prochot"]["prochot_status"] = "N/A" + logging.debug("Failed to get prochot status for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_freq_metrics: + static_dict["freq_metrics"] = {} + try: + fclk_mclk = amdsmi_interface.amdsmi_get_cpu_fclk_mclk(args.cpu) + static_dict["freq_metrics"]["fclkmemclk"] = fclk_mclk + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["freq_metrics"]["fclkmemclk"] = "N/A" + logging.debug("Failed to get current fclkmemclk freq for cpu %s | %s", cpu_id, e.get_error_info()) - if (args.cpu_prochot): - static_dict["prochot"] = {} - try: - proc_status = amdsmi_interface.amdsmi_get_cpu_prochot_status(args.cpu) - static_dict["prochot"]["prochot_status"] = proc_status - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["prochot"]["prochot_status"] = "N/A" - logging.debug("Failed to get prochot status for cpu %s | %s", cpu_id, e.get_error_info()) + try: + cclk_freq = amdsmi_interface.amdsmi_get_cpu_cclk_limit(args.cpu) + static_dict["freq_metrics"]["cclkfreqlimit"] = cclk_freq + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["freq_metrics"]["cclkfreqlimit"] = "N/A" + logging.debug("Failed to get current cclk freq for cpu %s | %s", cpu_id, e.get_error_info()) - if (args.cpu_freq_metrics): - static_dict["freq_metrics"] = {} - try: - fclk_mclk = amdsmi_interface.amdsmi_get_cpu_fclk_mclk(args.cpu) - static_dict["freq_metrics"]["fclkmemclk"] = fclk_mclk - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["freq_metrics"]["fclkmemclk"] = "N/A" - logging.debug("Failed to get current fclkmemclk freq for cpu %s | %s", cpu_id, e.get_error_info()) + try: + soc_cur_freq_limit = amdsmi_interface.amdsmi_get_cpu_socket_current_active_freq_limit(args.cpu) + static_dict["freq_metrics"]["soc_current_active_freq_limit"] = soc_cur_freq_limit + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["freq_metrics"]["soc_current_active_freq_limit"] = "N/A" + logging.debug("Failed to get socket current freq limit for cpu %s | %s", cpu_id, e.get_error_info()) - try: - cclk_freq = amdsmi_interface.amdsmi_get_cpu_cclk_limit(args.cpu) - static_dict["freq_metrics"]["cclkfreqlimit"] = cclk_freq - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["freq_metrics"]["cclkfreqlimit"] = "N/A" - logging.debug("Failed to get current cclk freq for cpu %s | %s", cpu_id, e.get_error_info()) + try: + soc_freq_range = amdsmi_interface.amdsmi_get_cpu_socket_freq_range(args.cpu) + static_dict["freq_metrics"]["soc_freq_range"] = soc_freq_range + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["freq_metrics"]["soc_freq_range"] = "N/A" + logging.debug("Failed to get socket freq range for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_c0_res: + static_dict["c0_residency"] = {} + try: + residency = amdsmi_interface.amdsmi_get_cpu_socket_c0_residency(args.cpu) + static_dict["c0_residency"]["residency"] = residency + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["c0_residency"]["residency"] = "N/A" + logging.debug("Failed to get C0 residency for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_lclk_dpm_level: + static_dict["socket_dpm"] = {} + try: + dpm_val = amdsmi_interface.amdsmi_get_cpu_socket_lclk_dpm_level(args.cpu, + args.cpu_lclk_dpm_level[0][0]) + static_dict["socket_dpm"]["dpml_level_range"] = dpm_val + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["socket_dpm"]["dpml_level_range"] = "N/A" + logging.debug("Failed to get socket dpm level range for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_pwr_svi_telemtry_rails: + static_dict["svi_telemetry_all_rails"] = {} + try: + power = amdsmi_interface.amdsmi_get_cpu_pwr_svi_telemetry_all_rails(args.cpu) + static_dict["svi_telemetry_all_rails"]["power"] = power + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["c0_residency"]["residency"] = "N/A" + logging.debug("Failed to get svi telemetry all rails for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_io_bandwidth: + static_dict["io_bandwidth"] = {} + try: + bandwidth = amdsmi_interface.amdsmi_get_cpu_current_io_bandwidth(args.cpu, + int(args.cpu_io_bandwidth[0][0]), + args.cpu_io_bandwidth[0][1]) + static_dict["io_bandwidth"]["band_width"] = bandwidth + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["io_bandwidth"]["band_width"] = "N/A" + logging.debug("Failed to get io bandwidth for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_xgmi_bandwidth: + static_dict["xgmi_bandwidth"] = {} + try: + bandwidth = amdsmi_interface.amdsmi_get_cpu_current_xgmi_bw(args.cpu, + int(args.cpu_xgmi_bandwidth[0][0]), + args.cpu_xgmi_bandwidth[0][1]) + static_dict["xgmi_bandwidth"]["band_width"] = bandwidth + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["xgmi_bandwidth"]["band_width"] = "N/A" + logging.debug("Failed to get xgmi bandwidth for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_metrics_ver: + static_dict["metric_version"] = {} + try: + version = amdsmi_interface.amdsmi_get_hsmp_metrics_table_version(args.cpu) + static_dict["metric_version"]["version"] = version + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["metric_version"]["version"] = "N/A" + logging.debug("Failed to get metrics table version for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_metrics_table: + static_dict["metrics_table"] = {} + try: + cpu_fam = amdsmi_interface.amdsmi_get_cpu_family() + static_dict["metrics_table"]["cpu_family"] = cpu_fam + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["metrics_table"]["cpu_family"] = "N/A" + logging.debug("Failed to get cpu family | %s", e.get_error_info()) + try: + cpu_mod = amdsmi_interface.amdsmi_get_cpu_model() + static_dict["metrics_table"]["cpu_model"] = cpu_mod + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["metrics_table"]["cpu_model"] = "N/A" + logging.debug("Failed to get cpu model | %s", e.get_error_info()) + try: + cpu_metrics_table = amdsmi_interface.amdsmi_get_hsmp_metrics_table(args.cpu) + static_dict["metrics_table"]["response"] = cpu_metrics_table + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["metrics_table"]["response"] = "N/A" + logging.debug("Failed to get metrics table for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_socket_energy: + static_dict["socket_energy"] = {} + try: + energy = amdsmi_interface.amdsmi_get_cpu_socket_energy(args.cpu) + static_dict["socket_energy"]["response"] = energy + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["socket_energy"]["response"] = "N/A" + logging.debug("Failed to get socket energy for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_ddr_bandwidth: + static_dict["ddr_bandwidth"] = {} + try: + resp = amdsmi_interface.amdsmi_get_cpu_ddr_bw(args.cpu) + static_dict["ddr_bandwidth"]["response"] = resp + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["ddr_bandwidth"]["response"] = "N/A" + logging.debug("Failed to get ddr bandwdith for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_temp: + static_dict["cpu_temp"] = {} + try: + resp = amdsmi_interface.amdsmi_get_cpu_socket_temperature(args.cpu) + static_dict["cpu_temp"]["response"] = resp + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["cpu_temp"]["response"] = "N/A" + logging.debug("Failed to get cpu temperature for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_dimm_temp_range_rate: + static_dict["dimm_temp_range_rate"] = {} + try: + resp = amdsmi_interface.amdsmi_get_cpu_dimm_temp_range_and_refresh_rate(args.cpu, args.cpu_dimm_temp_range_rate[0][0]) + static_dict["dimm_temp_range_rate"]["response"] = resp + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["dimm_temp_range_rate"]["response"] = "N/A" + logging.debug("Failed to get dimm temperature range and refresh rate for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_dimm_pow_consumption: + static_dict["dimm_pow_consumption"] = {} + try: + resp = amdsmi_interface.amdsmi_get_cpu_dimm_power_consumption(args.cpu, args.cpu_dimm_pow_consumption[0][0]) + static_dict["dimm_pow_consumption"]["response"] = resp + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["dimm_pow_consumption"]["response"] = "N/A" + logging.debug("Failed to get dimm temperature range and refresh rate for cpu %s | %s", cpu_id, e.get_error_info()) + if args.cpu_dimm_thermal_sensor: + static_dict["dimm_thermal_sensor"] = {} + try: + resp = amdsmi_interface.amdsmi_get_cpu_dimm_thermal_sensor(args.cpu, args.cpu_dimm_thermal_sensor[0][0]) + static_dict["dimm_thermal_sensor"]["response"] = resp + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["dimm_thermal_sensor"]["response"] = "N/A" + logging.debug("Failed to get dimm temperature range and refresh rate for cpu %s | %s", cpu_id, e.get_error_info()) - try: - soc_cur_freq_limit = amdsmi_interface.amdsmi_get_cpu_socket_current_active_freq_limit(args.cpu) - static_dict["freq_metrics"]["soc_current_active_freq_limit"] = soc_cur_freq_limit - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["freq_metrics"]["soc_current_active_freq_limit"] = "N/A" - logging.debug("Failed to get socket current freq limit for cpu %s | %s", cpu_id, e.get_error_info()) - - try: - soc_freq_range = amdsmi_interface.amdsmi_get_cpu_socket_freq_range(args.cpu) - static_dict["freq_metrics"]["soc_freq_range"] = soc_freq_range - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["freq_metrics"]["soc_freq_range"] = "N/A" - logging.debug("Failed to get socket freq range for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.cpu_c0_res): - static_dict["c0_residency"] = {} - try: - residency = amdsmi_interface.amdsmi_get_cpu_socket_c0_residency(args.cpu) - static_dict["c0_residency"]["residency"] = residency - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["c0_residency"]["residency"] = "N/A" - logging.debug("Failed to get C0 residency for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.cpu_lclk_dpm_level): - static_dict["socket_dpm"] = {} - try: - dpm_val = amdsmi_interface.amdsmi_get_cpu_socket_lclk_dpm_level(args.cpu, - args.cpu_lclk_dpm_level[0][0]) - static_dict["socket_dpm"]["dpml_level_range"] = dpm_val - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["socket_dpm"]["dpml_level_range"] = "N/A" - logging.debug("Failed to get socket dpm level range for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.cpu_pwr_svi_telemtry_rails): - static_dict["svi_telemetry_all_rails"] = {} - try: - power = amdsmi_interface.amdsmi_get_cpu_pwr_svi_telemetry_all_rails(args.cpu) - static_dict["svi_telemetry_all_rails"]["power"] = power - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["c0_residency"]["residency"] = "N/A" - logging.debug("Failed to get svi telemetry all rails for cpu %s | %s", cpu_id, e.get_error_info()) - if (args.cpu_io_bandwidth): - static_dict["io_bandwidth"] = {} - try: - bandwidth = amdsmi_interface.amdsmi_get_cpu_current_io_bandwidth(args.cpu, - int(args.cpu_io_bandwidth[0][0]), - args.cpu_io_bandwidth[0][1]) - static_dict["io_bandwidth"]["band_width"] = bandwidth - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["io_bandwidth"]["band_width"] = "N/A" - logging.debug("Failed to get io bandwidth for cpu %s | %s", cpu_id, e.get_error_info()) - if (args.cpu_xgmi_bandwidth): - static_dict["xgmi_bandwidth"] = {} - try: - bandwidth = amdsmi_interface.amdsmi_get_cpu_current_xgmi_bw(args.cpu, - int(args.cpu_xgmi_bandwidth[0][0]), - args.cpu_xgmi_bandwidth[0][1]) - static_dict["xgmi_bandwidth"]["band_width"] = bandwidth - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["xgmi_bandwidth"]["band_width"] = "N/A" - logging.debug("Failed to get xgmi bandwidth for cpu %s | %s", cpu_id, e.get_error_info()) - if (args.cpu_enable_apb): - static_dict["apbenable"] = {} - try: - amdsmi_interface.amdsmi_cpu_apb_enable(args.cpu) - static_dict["apbenable"]["state"] = "Enabled DF - Pstate performance boost algorithm" - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["apbenable"]["state"] = "N/A" - logging.debug("Failed to enable APB for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.cpu_disable_apb): - static_dict["apbdisable"] = {} - try: - amdsmi_interface.amdsmi_cpu_apb_disable(args.cpu, args.cpu_disable_apb[0][0]) - static_dict["apbdisable"]["state"] = "Disabled DF - Pstate performance boost algorithm" - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["apbdisable"]["state"] = "N/A" - logging.debug("Failed to enable APB for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.set_cpu_pow_limit): - static_dict["set_pow_limit"] = {} - try: - amdsmi_interface.amdsmi_set_cpu_socket_power_cap(args.cpu, args.set_cpu_pow_limit[0][0]) - static_dict["set_pow_limit"]["Response"] = "Set Operation successful" - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["set_pow_limit"]["Response"] = "N/A" - logging.debug("Failed to set power limit for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.set_cpu_xgmi_link_width): - static_dict["set_xgmi_link_width"] = {} - try: - amdsmi_interface.amdsmi_set_cpu_xgmi_width(args.cpu, args.set_cpu_xgmi_link_width[0][0], - args.set_cpu_xgmi_link_width[0][1]) - static_dict["set_xgmi_link_width"]["Response"] = "Set Operation successful" - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["set_xgmi_link_width"]["Response"] = "N/A" - logging.debug("Failed to set xgmi link width for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.set_cpu_lclk_dpm_level): - static_dict["set_lclk_dpm_level"] = {} - try: - amdsmi_interface.amdsmi_set_cpu_socket_lclk_dpm_level(args.cpu, args.set_cpu_lclk_dpm_level[0][0], - args.set_cpu_lclk_dpm_level[0][1], - args.set_cpu_lclk_dpm_level[0][2]) - static_dict["set_lclk_dpm_level"]["Response"] = "Set Operation successful" - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["set_lclk_dpm_level"]["Response"] = "N/A" - logging.debug("Failed to set lclk dpm level for cpu %s | %s", cpu_id, e.get_error_info()) - if (args.set_soc_boost_limit): - static_dict["set_soc_boost_limit"] = {} - try: - amdsmi_interface.amdsmi_set_cpu_socket_boostlimit(args.cpu, args.set_soc_boost_limit[0][0]) - static_dict["set_soc_boost_limit"]["Response"] = "Set Operation successful" - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["set_soc_boost_limit"]["Response"] = "N/A" - logging.debug("Failed to set socket boost limit for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.cpu_metrics_ver): - static_dict["metric_version"] = {} - try: - version = amdsmi_interface.amdsmi_get_hsmp_metrics_table_version(args.cpu) - static_dict["metric_version"]["version"] = version - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["metric_version"]["version"] = "N/A" - logging.debug("Failed to get metrics table version for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.cpu_metrics_table): - static_dict["metrics_table"] = {} - try: - cpu_fam = amdsmi_interface.amdsmi_get_cpu_family() - static_dict["metrics_table"]["cpu_family"] = cpu_fam - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["metrics_table"]["cpu_family"] = "N/A" - logging.debug("Failed to get cpu family | %s", e.get_error_info()) - try: - cpu_mod = amdsmi_interface.amdsmi_get_cpu_model() - static_dict["metrics_table"]["cpu_model"] = cpu_mod - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["metrics_table"]["cpu_model"] = "N/A" - logging.debug("Failed to get cpu model | %s", e.get_error_info()) - try: - metrics_table = amdsmi_interface.amdsmi_get_hsmp_metrics_table(args.cpu) - static_dict["metrics_table"]["response"] = metrics_table - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["metrics_table"]["response"] = "N/A" - logging.debug("Failed to get metrics table for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.socket_energy): - static_dict["socket_energy"] = {} - try: - energy = amdsmi_interface.amdsmi_get_cpu_socket_energy(args.cpu) - static_dict["socket_energy"]["response"] = energy - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["socket_energy"]["response"] = "N/A" - logging.debug("Failed to get socket energy for cpu %s | %s", cpu_id, e.get_error_info()) - - if(args.set_cpu_pwr_eff_mode): - static_dict["set_pwr_eff_mode"] = {} - try: - amdsmi_interface.amdsmi_set_cpu_pwr_efficiency_mode(args.cpu, args.set_cpu_pwr_eff_mode[0][0]) - static_dict["set_pwr_eff_mode"]["Response"] = "Set Operation successful" - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["set_pwr_eff_mode"]["Response"] = "N/A" - logging.debug("Failed to set power efficiency mode for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.cpu_ddr_bandwidth): - static_dict["ddr_bandwidth"] = {} - try: - resp = amdsmi_interface.amdsmi_get_cpu_ddr_bw(args.cpu) - static_dict["ddr_bandwidth"]["response"] = resp - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["ddr_bandwidth"]["response"] = "N/A" - logging.debug("Failed to get ddr bandwdith for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.cpu_temp): - static_dict["cpu_temp"] = {} - try: - resp = amdsmi_interface.amdsmi_get_cpu_socket_temperature(args.cpu) - static_dict["cpu_temp"]["response"] = resp - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["cpu_temp"]["response"] = "N/A" - logging.debug("Failed to get cpu temperature for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.cpu_dimm_temp_range_rate): - static_dict["dimm_temp_range_rate"] = {} - try: - resp = amdsmi_interface.amdsmi_get_cpu_dimm_temp_range_and_refresh_rate(args.cpu, args.cpu_dimm_temp_range_rate[0][0]) - static_dict["dimm_temp_range_rate"]["response"] = resp - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["dimm_temp_range_rate"]["response"] = "N/A" - logging.debug("Failed to get dimm temperature range and refresh rate for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.cpu_dimm_pow_conumption): - static_dict["dimm_pow_conumption"] = {} - try: - resp = amdsmi_interface.amdsmi_get_cpu_dimm_power_consumption(args.cpu, args.cpu_dimm_pow_conumption[0][0]) - static_dict["dimm_pow_conumption"]["response"] = resp - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["dimm_pow_conumption"]["response"] = "N/A" - logging.debug("Failed to get dimm temperature range and refresh rate for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.cpu_dimm_thermal_sensor): - static_dict["dimm_thermal_sensor"] = {} - try: - resp = amdsmi_interface.amdsmi_get_cpu_dimm_thermal_sensor(args.cpu, args.cpu_dimm_thermal_sensor[0][0]) - static_dict["dimm_thermal_sensor"]["response"] = resp - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["dimm_thermal_sensor"]["response"] = "N/A" - logging.debug("Failed to get dimm temperature range and refresh rate for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.set_cpu_gmi3_link_width): - static_dict["set_gmi3_link_width"] = {} - try: - amdsmi_interface.amdsmi_set_cpu_gmi3_link_width_range(args.cpu, args.set_cpu_gmi3_link_width[0][0], - args.set_cpu_gmi3_link_width[0][1]) - static_dict["set_gmi3_link_width"]["response"] = "Set Operation successful" - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["set_gmi3_link_width"]["response"] = "N/A" - logging.debug("Failed to set gmi3 link width for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.set_cpu_pcie_lnk_rate): - static_dict["set_pcie_lnk_rate"] = {} - try: - resp = amdsmi_interface.amdsmi_set_cpu_pcie_link_rate(args.cpu, args.set_cpu_pcie_lnk_rate[0][0]) - static_dict["set_pcie_lnk_rate"]["prev_mode"] = resp - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["set_pcie_lnk_rate"]["prev_mode"] = "N/A" - logging.debug("Failed to set pcie link rate for cpu %s | %s", cpu_id, e.get_error_info()) - - if (args.set_cpu_df_pstate_range): - static_dict["set_df_pstate_range"] = {} - try: - amdsmi_interface.amdsmi_set_cpu_df_pstate_range(args.cpu, args.set_cpu_df_pstate_range[0][0], - args.set_cpu_df_pstate_range[0][1]) - static_dict["set_df_pstate_range"]["response"] = "Set Operation successful" - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["set_df_pstate_range"]["response"] = "N/A" - logging.debug("Failed to set df pstate range for cpu %s | %s", cpu_id, e.get_error_info()) - - multiple_devices_csv_override = False - self.logger.store_cpu_output(args.cpu, 'values', static_dict) - if multiple_devices: - self.logger.store_multiple_device_output() - return # Skip printing when there are multiple devices - self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override) + multiple_devices_csv_override = False + self.logger.store_cpu_output(args.cpu, 'values', static_dict) + if multiple_devices: + self.logger.store_multiple_device_output() + return # Skip printing when there are multiple devices + self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override) - def metric_core(self, args, multiple_devices=False, core=None, boost_limit=None, - curr_active_freq_core_limit=None, set_core_boost_limit=None, core_energy=None): + def metric_core(self, args, multiple_devices=False, core=None, core_boost_limit=None, + core_curr_active_freq_core_limit=None, core_energy=None): """Get Static information for target core Args: args (Namespace): Namespace containing the parsed CLI args multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. - core (device_handle, optional): device_handle for target device. Defaults to None. - boost_limit (bool, optional): Value override for args.boostlimit. Defaults to None - curr_active_freq_core_limit (bool, optional): Value override for args.boostlimit. Defaults to None - set_core_boost_limit(list, optional): boost limit value.Value override for args.set_core_boost_limit. Defaults to None + core (device_handle, optional): device_handle for target core. Defaults to None. + core_boost_limit (bool, optional): Value override for args.core_boost_limit. Defaults to None + core_curr_active_freq_core_limit (bool, optional): Value override for args.core_curr_active_freq_core_limit. Defaults to None core_energy (bool, optional): Value override for args.core_energy. Defaults to None Returns: None: Print output via AMDSMILogger to destination """ if core: args.core = core - if boost_limit: - args.core_boost_limit = boost_limit - if curr_active_freq_core_limit: - args.core_curr_active_freq_core_limit = curr_active_freq_core_limit - if set_core_boost_limit: - args.set_core_boost_limit = boost_limit + if core_boost_limit: + args.core_boost_limit = core_boost_limit + if core_curr_active_freq_core_limit: + args.core_curr_active_freq_core_limit = core_curr_active_freq_core_limit if core_energy: args.core_energy = core_energy #store core args that are applicable to the current platform - curr_platform_core_args = ["core_boost_limit", "core_curr_active_freq_core_limit", - "set_core_boost_limit","core_energy"] - curr_platform_core_values = [args.core_boost_limit, args.core_curr_active_freq_core_limit, - args.set_core_boost_limit, args.core_energy] + curr_platform_core_args = ["core_boost_limit", "core_curr_active_freq_core_limit", "core_energy"] + curr_platform_core_values = [args.core_boost_limit, args.core_curr_active_freq_core_limit, args.core_energy] - # Handle No core passed + # Handle No cores passed if args.core == None: args.core = self.core_handles - if (not any(curr_platform_core_values)): + if not any(curr_platform_core_values): for arg in curr_platform_core_args: - if arg not in (["set_core_boost_limit"]): - setattr(args, arg, True) + setattr(args, arg, True) - if (len(self.core_handles)): - handled_multiple_cores, device_handle = self.helpers.handle_cores(args, - self.logger, - self.metric_core) - if handled_multiple_cores: - return # This function is recursive - args.core = device_handle - # get core id for logging - core_id = self.helpers.get_core_id_from_device_handle(args.core) - logging.debug(f"Static Arg information for Core {core_id} on {self.helpers.os_info()}") + handled_multiple_cores, device_handle = self.helpers.handle_cores(args, + self.logger, + self.metric_core) + if handled_multiple_cores: + return # This function is recursive + args.core = device_handle + # get core id for logging + core_id = self.helpers.get_core_id_from_device_handle(args.core) + logging.debug(f"Static Arg information for Core {core_id} on {self.helpers.os_info()}") - static_dict = {} - if (args.core_boost_limit): - static_dict["boost_limit"] ={} + static_dict = {} + if args.core_boost_limit: + static_dict["boost_limit"] ={} - try: - boost_limit = amdsmi_interface.amdsmi_get_cpu_core_boostlimit(args.core) - static_dict["boost_limit"]["value"] = boost_limit - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["boost_limit"]["value"] = "N/A" - logging.debug("Failed to get core boost limit for core %s | %s", core_id, e.get_error_info()) - if (args.core_curr_active_freq_core_limit): - static_dict["curr_active_freq_core_limit"] = {} + try: + core_boost_limit = amdsmi_interface.amdsmi_get_cpu_core_boostlimit(args.core) + static_dict["boost_limit"]["value"] = core_boost_limit + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["boost_limit"]["value"] = "N/A" + logging.debug("Failed to get core boost limit for core %s | %s", core_id, e.get_error_info()) + if args.core_curr_active_freq_core_limit: + static_dict["curr_active_freq_core_limit"] = {} - try: - freq = amdsmi_interface.amdsmi_get_cpu_core_current_freq_limit(args.core) - static_dict["curr_active_freq_core_limit"]["value"] = freq - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["curr_active_freq_core_limit"]["value"] = "N/A" - logging.debug("Failed to get current active frequency core for core %s | %s", core_id, e.get_error_info()) + try: + freq = amdsmi_interface.amdsmi_get_cpu_core_current_freq_limit(args.core) + static_dict["curr_active_freq_core_limit"]["value"] = freq + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["curr_active_freq_core_limit"]["value"] = "N/A" + logging.debug("Failed to get current active frequency core for core %s | %s", core_id, e.get_error_info()) + if args.core_energy: + static_dict["core_energy"] ={} + try: + energy = amdsmi_interface.amdsmi_get_cpu_core_energy(args.core) + static_dict["core_energy"]["value"] = energy + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["core_energy"]["value"] = "N/A" + logging.debug("Failed to get core energy for core %s | %s", core_id, e.get_error_info()) - if (args.set_core_boost_limit): - static_dict["set_core_boost_limit"] = {} - try: - amdsmi_interface.amdsmi_set_cpu_core_boostlimit(args.core, args.set_core_boost_limit[0][0]) - static_dict["set_core_boost_limit"]["Response"] = "Set Operation successful" - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["set_core_boost_limit"]["Response"] = "N/A" - logging.debug("Failed to set core boost limit for cpu %s | %s", core_id, e.get_error_info()) - - - if (args.core_energy): - static_dict["core_energy"] ={} - try: - energy = amdsmi_interface.amdsmi_get_cpu_core_energy(args.core) - static_dict["core_energy"]["value"] = energy - except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["core_energy"]["value"] = "N/A" - logging.debug("Failed to get core energy for core %s | %s", core_id, e.get_error_info()) - - - multiple_devices_csv_override = False - self.logger.store_core_output(args.core, 'values', static_dict) - if multiple_devices: - self.logger.store_multiple_device_output() - return # Skip printing when there are multiple devices - self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override) + multiple_devices_csv_override = False + self.logger.store_core_output(args.core, 'values', static_dict) + if multiple_devices: + self.logger.store_multiple_device_output() + return # Skip printing when there are multiple devices + self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override) def metric(self, args, multiple_devices=False, watching_output=False, gpu=None, @@ -2200,22 +2058,21 @@ class AMDSMICommands(): clock=None, temperature=None, ecc=None, ecc_block=None, pcie=None, fan=None, voltage_curve=None, overdrive=None, perf_level=None, xgmi_err=None, energy=None, mem_usage=None, schedule=None, - guard=None, guest_data=None, fb_usage=None, xgmi=None,cpu=None, - cpu_power_metrics=None, prochot=None, freq_metrics=None, c0_res=None, - lclk_dpm_level=None,pwr_svi_telemtry_rails=None, io_bandwidth=None, - xgmi_bandwidth=None, enable_apb=None, disable_apb=None,set_pow_limit=None, - set_xgmi_link_width=None, set_lclk_dpm_level=None, set_soc_boost_limit=None, - metrics_ver=None, metrics_table=None, socket_energy=None,set_pwr_eff_mode=None, - ddr_bandwidth=None, cpu_temp=None, dimm_temp_range_rate=None,dimm_pow_conumption=None, - dimm_thermal_sensor=None, set_gmi3_link_width=None, set_pcie_lnk_rate=None, - set_df_pstate_range=None, core=None, boost_limit=None, - curr_active_freq_core_limit=None, set_core_boost_limit=None, core_energy=None): + guard=None, guest_data=None, fb_usage=None, xgmi=None, + cpu=None, cpu_power_metrics=None, cpu_prochot=None, cpu_freq_metrics=None, + cpu_c0_res=None, cpu_lclk_dpm_level=None, cpu_pwr_svi_telemtry_rails=None, + cpu_io_bandwidth=None, cpu_xgmi_bandwidth=None, cpu_metrics_ver=None, + cpu_metrics_table=None, cpu_socket_energy=None, cpu_ddr_bandwidth=None, + cpu_temp=None, cpu_dimm_temp_range_rate=None, cpu_dimm_pow_consumption=None, + cpu_dimm_thermal_sensor=None, + core=None, core_boost_limit=None, core_curr_active_freq_core_limit=None, + core_energy=None): """Get Metric information for target gpu Args: args (Namespace): Namespace containing the parsed CLI args multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. - watching_output (bool, optional): True if watch option has been set. Defaults to False. + watching_output (bool, optional): True if watch argument has been set. Defaults to False. gpu (device_handle, optional): device_handle for target device. Defaults to None. usage (bool, optional): Value override for args.usage. Defaults to None. watch (Positive int, optional): Value override for args.watch. Defaults to None. @@ -2239,38 +2096,29 @@ class AMDSMICommands(): guest_data (bool, optional): Value override for args.guest_data. Defaults to None. fb_usage (bool, optional): Value override for args.fb_usage. Defaults to None. xgmi (bool, optional): Value override for args.xgmi. Defaults to None. - cpu (device_handle, optional): cpu index. Defaults to None - cpu_power_metrics (bool, optional): Value override for args.cpu_power_metrics. Defaults to None - prochot (bool, optional): Value override for args.prochot. Defaults to None. - freq_metrics (bool, optional): Value override for args.freq_metrics. Defaults to None. - c0_res (bool, optional): Value override for args.c0_res. Defaults to None - lclk_dpm_level (list, optional): Value override for args.lclk_dpm_level. Defaults to None - pwr_svi_telemtry_rails (list, optional): value override for args.pwr_svi_telemtry_rails. Defaults to None - io_bandwidth (list, optional): value override for args.io_bandwidth. Defaults to None - xgmi_bandwidth (list, optional): value override for args.xgmi_bandwidth. Defaults to None - enable_apb (bool, optional): Value override for args.enable_apb. Defaults to None - disable_apb (bool, optional): Value override for args.disable_apb. Defaults to None - set_pow_limit (int, optional): Value override for args.cpu_set_pow_limit. Defaults to None - set_xgmi_link_width (list, optional): Value override for args.set_cpu_xgmi_link_width. Defaults to None - set_lclk_dpm_level (list, optional): Value override for args.set_cpu_lclk_dpm_level. Defaults to None - set_soc_boost_limit (list, optional): Value override for args.set_soc_boost_limit. Defaults to None - metrics_ver (bool, optional): Value override for args.cpu_metrics_ver. Defaults to None - metrics_table (bool, optional): Value override for args.cpu_metrics_table. Defaults to None - socket_energy (bool, optional): Value override for args.socket_energy. Defaults to None - set_pwr_eff_mode (list, optional): Value override for args.set_cpu_pwr_eff_mode. Defaults to None - ddr_bandwidth (bool, optional): Value override for args.ddr_bandwidth. Defaults to None - cpu_temp (bool, optional): Value override for args.cpu_temp. Defaults to None - dimm_temp_range_rate (list, optional): Dimm address.Value override for args.cpu_dimm_temp_range_rate. Defaults to None - dimm_pow_conumption (list, optional): Dimm address. Value override for args.cpu_dimm_pow_conumption. Defaults to None - dimm_thermal_sensor (list, optional): Dimm address. Value override for args.cpu_dimm_thermal_sensor. Defaults to None - set_gmi3_link_width (list, optional): Min and Max link wdiths.Value override for args.set_cpu_gmi3_link_width. Defaults to None - set_pcie_lnk_rate (list, optional): Link rate.Value override for args.set_cpu_pcie_lnk_rate. Defaults to None - set_df_pstate_range (list, optional): Max and Min pstates.Value override for args.set_cpu_df_pstate_range. Defaults to None - core (int, optional): core index. Value override for args.core.Defaults to None - boost_limit (bool, optional): Value override for args.boostlimit. Defaults to None - curr_active_freq_core_limit (bool, optional): Value override for args.boostlimit. Defaults to None - set_core_boost_limit(list, optional): boost limit value.Value override for args.set_core_boost_limit. Defaults to None + cpu (cpu_handle, optional): device_handle for target device. Defaults to None. + cpu_power_metrics (bool, optional): Value override for args.cpu_power_metrics. Defaults to None + cpu_prochot (bool, optional): Value override for args.cpu_prochot. Defaults to None. + cpu_freq_metrics (bool, optional): Value override for args.cpu_freq_metrics. Defaults to None. + cpu_c0_res (bool, optional): Value override for args.cpu_c0_res. Defaults to None + cpu_lclk_dpm_level (list, optional): Value override for args.cpu_lclk_dpm_level. Defaults to None + cpu_pwr_svi_telemtry_rails (list, optional): value override for args.cpu_pwr_svi_telemtry_rails. Defaults to None + cpu_io_bandwidth (list, optional): value override for args.cpu_io_bandwidth. Defaults to None + cpu_xgmi_bandwidth (list, optional): value override for args.cpu_xgmi_bandwidth. Defaults to None + cpu_metrics_ver (bool, optional): Value override for args.cpu_metrics_ver. Defaults to None + cpu_metrics_table (bool, optional): Value override for args.cpu_metrics_table. Defaults to None + cpu_socket_energy (bool, optional): Value override for args.cpu_socket_energy. Defaults to None + cpu_ddr_bandwidth (bool, optional): Value override for args.cpu_ddr_bandwidth. Defaults to None + cpu_temp (bool, optional): Value override for args.cpu_temp. Defaults to None + cpu_dimm_temp_range_rate (list, optional): Dimm address. Value override for args.cpu_dimm_temp_range_rate. Defaults to None + cpu_dimm_pow_consumption (list, optional): Dimm address. Value override for args.cpu_dimm_pow_consumption. Defaults to None + cpu_dimm_thermal_sensor (list, optional): Dimm address. Value override for args.cpu_dimm_thermal_sensor. Defaults to None + + core (device_handle, optional): device_handle for target core. Defaults to None. + core_boost_limit (bool, optional): Value override for args.core_boost_limit. Defaults to None + core_curr_active_freq_core_limit (bool, optional): Value override for args.core_curr_active_freq_core_limit. Defaults to None + core_energy (bool, optional): Value override for args.core_energy. Defaults to None Raises: IndexError: Index error if gpu list is empty @@ -2278,119 +2126,123 @@ class AMDSMICommands(): Returns: None: Print output via AMDSMILogger to destination """ - gpus = args.gpu - cpus= args.cpu - cores = args.core + # TODO Move watch logic into here and make it driver agnostic or enable it for CPU arguments - # GPU Options check against each attribute - gpu_options = False - if hasattr(args, 'gpu'): - gpu_options |= bool(args.gpu) - if hasattr(args, 'usage'): - gpu_options |= bool(args.usage) - if hasattr(args, 'watch'): - gpu_options |= bool(args.watch) - if hasattr(args, 'watch_time'): - gpu_options |= bool(args.watch_time) - if hasattr(args, 'iterations'): - gpu_options |= bool(args.iterations) - if hasattr(args, 'power'): - gpu_options |= bool(args.power) - if hasattr(args, 'clock'): - gpu_options |= bool(args.clock) - if hasattr(args, 'temperature'): - gpu_options |= bool(args.temperature) - if hasattr(args, 'ecc'): - gpu_options |= bool(args.ecc) - if hasattr(args, 'ecc_block'): - gpu_options |= bool(args.ecc_block) - if hasattr(args, 'pcie'): - gpu_options |= bool(args.pcie) - if hasattr(args, 'fan'): - gpu_options |= bool(args.fan) - if hasattr(args, 'voltage_curve'): - gpu_options |= bool(args.voltage_curve) - if hasattr(args, 'overdrive'): - gpu_options |= bool(args.overdrive) - if hasattr(args, 'perf_level'): - gpu_options |= bool(args.perf_level) - if hasattr(args, 'xgmi_err'): - gpu_options |= bool(args.xgmi_err) - if hasattr(args, 'energy'): - gpu_options |= bool(args.energy) - if hasattr(args, 'mem_usage'): - gpu_options |= bool(args.mem_usage) - if hasattr(args, 'schedule'): - gpu_options |= bool(args.schedule) - if hasattr(args, 'guard'): - gpu_options |= bool(args.guard) - if hasattr(args, 'guest_data'): - gpu_options |= bool(args.guest_data) - if hasattr(args, 'fb_usage'): - gpu_options |= bool(args.fb_usage) - if hasattr(args, 'xgmi'): - gpu_options |= bool(args.xgmi) + # Mutually exculsive args + if gpu: + args.gpu = gpu + if cpu: + args.cpu = cpu + if core: + args.core = core - cpu_options = any([args.cpu, args.cpu_power_metrics, args.cpu_prochot, - args.cpu_freq_metrics, args.cpu_c0_res, args.cpu_lclk_dpm_level, - args.cpu_pwr_svi_telemtry_rails, args.cpu_io_bandwidth, args.cpu_xgmi_bandwidth, - args.cpu_enable_apb, args.cpu_disable_apb, args.set_cpu_pow_limit, - args.set_cpu_xgmi_link_width, args.set_cpu_lclk_dpm_level, - args.set_soc_boost_limit,args.cpu_metrics_ver, args.cpu_metrics_table, - args.socket_energy, args.set_cpu_pwr_eff_mode,args.cpu_ddr_bandwidth, - args.cpu_temp, args.cpu_dimm_temp_range_rate, args.cpu_dimm_pow_conumption, - args.cpu_dimm_thermal_sensor, args.set_cpu_gmi3_link_width, - args.set_cpu_pcie_lnk_rate, args.set_cpu_df_pstate_range]) + # Check if a GPU argument has been set + gpu_args_enabled = False + gpu_attributes = ["usage", "watch", "watch_time", "iterations", "power", "clock", + "temperature", "ecc", "ecc_block", "pcie", "fan", "voltage_curve", + "overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "schedule", + "guard", "guest_data", "fb_usage", "xgmi"] + for attr in gpu_attributes: + if hasattr(args, attr): + gpu_args_enabled |= bool(getattr(args, attr)) - core_options = any([args.core_boost_limit, args.core_curr_active_freq_core_limit, - args.set_core_boost_limit, args.core_energy]) + # Check if a CPU argument has been set + cpu_args_enabled = False + cpu_attributes = ["cpu_power_metrics", "cpu_prochot", "cpu_freq_metrics", "cpu_c0_res", + "cpu_lclk_dpm_level", "cpu_pwr_svi_telemtry_rails", "cpu_io_bandwidth", + "cpu_xgmi_bandwidth", "cpu_metrics_ver", "cpu_metrics_table", + "cpu_socket_energy", "cpu_ddr_bandwidth", "cpu_temp", "cpu_dimm_temp_range_rate", + "cpu_dimm_pow_consumption", "cpu_dimm_thermal_sensor"] + for attr in cpu_attributes: + if hasattr(args, attr): + cpu_args_enabled |= bool(getattr(args, attr)) - if gpu_options and len(self.device_handles) == 0: - logging.error("No GPU devices present") - sys.exit(-1) + # Check if a Core argument has been set + core_args_enabled = False + core_attributes = ["core_boost_limit", "core_curr_active_freq_core_limit", "core_energy"] + for attr in core_attributes: + if hasattr(args, attr): + core_args_enabled |= bool(getattr(args, attr)) + logging.debug("gpu_args_enabled: %s, cpu_args_enabled: %s, core_args_enabled: %s", + gpu_args_enabled, cpu_args_enabled, core_args_enabled) + logging.debug("args.gpu: %s, args.cpu: %s, args.core: %s", args.gpu, args.cpu, args.core) - if ((len(self.device_handles) and ((((not gpus) and (not cpus) and (not cores)) or gpus) - and not cpu_options and not core_options))): - self.metric_gpu(args, multiple_devices, watching_output, gpu, - usage, watch, watch_time, iterations, power, - clock, temperature, ecc, ecc_block, pcie, - fan, voltage_curve, overdrive, perf_level, - xgmi_err, energy, mem_usage, schedule, - guard, guest_data, fb_usage, xgmi) + # Handle CPU and GPU driver intialization cases + if self.helpers.is_amd_hsmp_initialized() and self.helpers.is_amdgpu_initialized(): + # If a GPU or CPU argument is provided only print out the specified device. + if args.cpu == None and args.gpu == None and args.core == None: + # If no args are set, print out all CPU, GPU, and Core metrics info + if not gpu_args_enabled and not cpu_args_enabled and not core_args_enabled: + args.cpu = self.cpu_handles + args.gpu = self.device_handles + args.core = self.core_handles + # Handle cases where the user has only specified an argument and no specific device + if args.gpu == None and gpu_args_enabled: + args.gpu = self.device_handles + if args.cpu == None and cpu_args_enabled: + args.cpu = self.cpu_handles + if args.core == None and core_args_enabled: + args.core = self.core_handles - if ((len(self.cpu_handles) and ((((not gpus) and (not cpus) and (not cores)) or cpus) - and not gpu_options and not core_options))): - self.logger.clear_multiple_devices_ouput() - self.metric_cpu(args, multiple_devices, cpu, cpu_power_metrics, prochot, - freq_metrics, c0_res, lclk_dpm_level, pwr_svi_telemtry_rails, - io_bandwidth, xgmi_bandwidth, enable_apb, disable_apb, - set_pow_limit,set_xgmi_link_width, set_lclk_dpm_level, - set_soc_boost_limit, metrics_ver, metrics_table, socket_energy, - set_pwr_eff_mode,ddr_bandwidth, cpu_temp, dimm_temp_range_rate, - dimm_pow_conumption,dimm_thermal_sensor, set_gmi3_link_width, - set_pcie_lnk_rate, set_df_pstate_range) - - if (cpu_options and (len(self.cpu_handles) == 0)): - logging.error("No CPU devices present") - sys.exit(-1) - - if ((len(self.core_handles) and ((((not gpus) and (not cpus) and (not cores)) or cores) - and not gpu_options and not cpu_options))): + # Print out CPU first + if args.cpu: + self.metric_cpu(args, multiple_devices, cpu, cpu_power_metrics, cpu_prochot, + cpu_freq_metrics, cpu_c0_res, cpu_lclk_dpm_level, + cpu_pwr_svi_telemtry_rails, cpu_io_bandwidth, cpu_xgmi_bandwidth, + cpu_metrics_ver, cpu_metrics_table, cpu_socket_energy, + cpu_ddr_bandwidth, cpu_temp, cpu_dimm_temp_range_rate, + cpu_dimm_pow_consumption, cpu_dimm_thermal_sensor) + if args.core: + self.logger.output = {} self.logger.clear_multiple_devices_ouput() - self.metric_core(args, multiple_devices, core, boost_limit, - curr_active_freq_core_limit, set_core_boost_limit, - core_energy) + self.metric_core(args, multiple_devices, core, core_boost_limit, + core_curr_active_freq_core_limit, core_energy) + if args.gpu: + self.logger.output = {} + self.logger.clear_multiple_devices_ouput() + self.metric_gpu(args, multiple_devices, watching_output, gpu, + usage, watch, watch_time, iterations, power, + clock, temperature, ecc, ecc_block, pcie, + fan, voltage_curve, overdrive, perf_level, + xgmi_err, energy, mem_usage, schedule, + guard, guest_data, fb_usage, xgmi) + elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized + if args.cpu == None and args.core == None: + # If no args are set, print out all CPU and Core metrics info + if not cpu_args_enabled and not core_args_enabled: + args.cpu = self.cpu_handles + args.core = self.core_handles - if (core_options and (len(self.cpu_handles) == 0)): - logging.error("No Core devices present") - sys.exit(-1) + if args.cpu == None and cpu_args_enabled: + args.cpu = self.cpu_handles + if args.core == None and core_args_enabled: + args.core = self.core_handles + + if args.cpu: + self.metric_cpu(args, multiple_devices, cpu, cpu_power_metrics, cpu_prochot, + cpu_freq_metrics, cpu_c0_res, cpu_lclk_dpm_level, + cpu_pwr_svi_telemtry_rails, cpu_io_bandwidth, cpu_xgmi_bandwidth, + cpu_metrics_ver, cpu_metrics_table, cpu_socket_energy, + cpu_ddr_bandwidth, cpu_temp, cpu_dimm_temp_range_rate, + cpu_dimm_pow_consumption, cpu_dimm_thermal_sensor) + if args.core: + self.logger.output = {} + self.logger.clear_multiple_devices_ouput() + self.metric_core(args, multiple_devices, core, core_boost_limit, + core_curr_active_freq_core_limit, core_energy) + elif self.helpers.is_amdgpu_initialized(): # Only GPU is initialized + if args.gpu == None: + args.gpu = self.device_handles + + self.logger.clear_multiple_devices_ouput() + self.metric_gpu(args, multiple_devices, watching_output, gpu, + usage, watch, watch_time, iterations, power, + clock, temperature, ecc, ecc_block, pcie, + fan, voltage_curve, overdrive, perf_level, + xgmi_err, energy, mem_usage, schedule) - if (len(self.cpu_handles) == 0 and len(self.device_handles) == 0 and - len(self.core_handles) == 0): - logging.error("No CPU and GPU devices present") - sys.exit(-1) def process(self, args, multiple_devices=False, watching_output=False, gpu=None, general=None, engine=None, pid=None, name=None, @@ -2400,7 +2252,7 @@ class AMDSMICommands(): Args: args (Namespace): Namespace containing the parsed CLI args multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. - watching_output (bool, optional): True if watch option has been set. Defaults to False. + watching_output (bool, optional): True if watch argument has been set. Defaults to False. gpu (device_handle, optional): device_handle for target device. Defaults to None. general (bool, optional): Value override for args.general. Defaults to None. engine (bool, optional): Value override for args.engine. Defaults to None. @@ -2862,7 +2714,246 @@ class AMDSMICommands(): self.logger.print_output(multiple_device_enabled=True) - def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, + def set_core(self, args, multiple_devices=False, core=None, core_boost_limit=None): + """Issue set commands to target core(s) + + Args: + args (Namespace): Namespace containing the parsed CLI args + multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. + core (device_handle, optional): device_handle for target device. Defaults to None. + core_boost_limit (list, optional): Value override for args.core_boost_limit. Defaults to None. Defaults to None. + + Raises: + ValueError: Value error if no core value is provided + IndexError: Index error if core list is empty + + Return: + Nothing + """ + if core: + args.core = core + if core_boost_limit: + args.core_boost_limit = core_boost_limit + + if args.core == None: + raise ValueError('No Core provided, specific Core targets(S) are needed') + + # Handle multiple cores + handled_multiple_cores, device_handle = self.helpers.handle_cores(args, self.logger, self.set_core) + if handled_multiple_cores: + return # This function is recursive + + # Error if no subcommand args are passed + if not any([args.core_boost_limit]): + command = " ".join(sys.argv[1:]) + raise AmdSmiRequiredCommandException(command, self.logger.format) + + args.core = device_handle + # build core string for errors + try: + core_id = self.helpers.get_core_id_from_device_handle(args.core) + except IndexError: + core_id = f'ID Unavailable for {args.core}' + + static_dict = {} + if args.core_boost_limit: + static_dict["set_core_boost_limit"] = {} + try: + amdsmi_interface.amdsmi_set_cpu_core_boostlimit(args.core, args.core_boost_limit[0][0]) + static_dict["set_core_boost_limit"]["Response"] = "Set Operation successful" + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["set_core_boost_limit"]["Response"] = f"Error occured for Core {core_id} - {e.get_error_info()}" + logging.debug("Failed to set core boost limit for cpu %s | %s", core_id, e.get_error_info()) + + multiple_devices_csv_override = False + self.logger.store_core_output(args.core, 'values', static_dict) + if multiple_devices: + self.logger.store_multiple_device_output() + return # Skip printing when there are multiple devices + self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override) + + + def set_cpu(self, args, multiple_devices=False, cpu=None, cpu_pwr_limit=None, + cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None, cpu_pwr_eff_mode=None, + cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, cpu_df_pstate_range=None, + cpu_enable_apb=None, cpu_disable_apb=None, soc_boost_limit=None): + """Issue set commands to target cpu(s) + + Args: + args (Namespace): Namespace containing the parsed CLI args + multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. + cpu (cpu_handle, optional): device_handle for target device. Defaults to None. + cpu_pwr_limit (int, optional): Value override for args.cpu_pwr_limit. Defaults to None. + cpu_xgmi_link_width (List[int], optional): Value override for args.cpu_xgmi_link_width. Defaults to None. + cpu_lclk_dpm_level (List[int], optional): Value override for args.cpu_lclk_dpm_level. Defaults to None. + cpu_pwr_eff_mode (int, optional): Value override for args.cpu_pwr_eff_mode. Defaults to None. + cpu_gmi3_link_width (List[int], optional): Value override for args.cpu_gmi3_link_width. Defaults to None. + cpu_pcie_link_rate (int, optional): Value override for args.cpu_pcie_link_rate. Defaults to None. + cpu_df_pstate_range (List[int], optional): Value override for args.cpu_df_pstate_range. Defaults to None. + cpu_enable_apb (bool, optional): Value override for args.cpu_enable_apb. Defaults to None. + cpu_disable_apb (int, optional): Value override for args.cpu_disable_apb. Defaults to None. + soc_boost_limit (int, optional): Value override for args.soc_boost_limit. Defaults to None. + + Raises: + ValueError: Value error if no cpu value is provided + IndexError: Index error if cpu list is empty + + Return: + Nothing + """ + if cpu: + args.cpu = cpu + if cpu_pwr_limit: + args.cpu_pwr_limit = cpu_pwr_limit + if cpu_xgmi_link_width: + args.cpu_xgmi_link_width = cpu_xgmi_link_width + if cpu_lclk_dpm_level: + args.cpu_lclk_dpm_level = cpu_lclk_dpm_level + if cpu_pwr_eff_mode: + args.cpu_pwr_eff_mode = cpu_pwr_eff_mode + if cpu_gmi3_link_width: + args.cpu_gmi3_link_width = cpu_gmi3_link_width + if cpu_pcie_link_rate: + args.cpu_pcie_link_rate = cpu_pcie_link_rate + if cpu_df_pstate_range: + args.cpu_df_pstate_range = cpu_df_pstate_range + if cpu_enable_apb: + args.cpu_enable_apb = cpu_enable_apb + if cpu_disable_apb: + args.cpu_disable_apb = cpu_disable_apb + if soc_boost_limit: + args.soc_boost_limit = soc_boost_limit + + if args.cpu == None: + raise ValueError('No CPU provided, specific CPU targets(S) are needed') + + #Handle multiple CPU's + handled_multiple_cpus, device_handle = self.helpers.handle_cpus(args, self.logger, self.set_cpu) + if handled_multiple_cpus: + return # This function is recursive + + args.cpu = device_handle + #Error if no subcommand args are passed + if not any([args.cpu_pwr_limit, args.cpu_xgmi_link_width, args.cpu_lclk_dpm_level, + args.cpu_pwr_eff_mode, args.cpu_gmi3_link_width, args.cpu_pcie_link_rate, + args.cpu_df_pstate_range, args.cpu_enable_apb, args.cpu_disable_apb, + args.soc_boost_limit]): + command = " ".join(sys.argv[1:]) + raise AmdSmiRequiredCommandException(command, self.logger.format) + + # Build CPU string for errors + try: + cpu_id = self.helpers.get_cpu_id_from_device_handle(args.cpu) + except IndexError: + cpu_id = f'ID Unavailable for {args.cpu}' + + static_dict = {} + + if args.cpu_pwr_limit: + static_dict["set_pwr_limit"] = {} + try: + amdsmi_interface.amdsmi_set_cpu_socket_power_cap(args.cpu, args.cpu_pwr_limit[0][0]) + static_dict["set_pwr_limit"]["Response"] = "Set Operation successful" + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["set_pwr_limit"]["Response"] = f"Error occured for CPU {cpu_id} - {e.get_error_info()}" + logging.debug("Failed to set power limit for cpu %s | %s", cpu_id, e.get_error_info()) + + if args.cpu_xgmi_link_width: + static_dict["set_xgmi_link_width"] = {} + try: + amdsmi_interface.amdsmi_set_cpu_xgmi_width(args.cpu, args.cpu_xgmi_link_width[0][0], + args.cpu_xgmi_link_width[0][1]) + static_dict["set_xgmi_link_width"]["Response"] = "Set Operation successful" + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["set_xgmi_link_width"]["Response"] = f"Error occured for CPU {cpu_id} - {e.get_error_info()}" + logging.debug("Failed to set xgmi link width for cpu %s | %s", cpu_id, e.get_error_info()) + + if args.cpu_lclk_dpm_level: + static_dict["set_lclk_dpm_level"] = {} + try: + amdsmi_interface.amdsmi_set_cpu_socket_lclk_dpm_level(args.cpu, args.cpu_lclk_dpm_level[0][0], + args.cpu_lclk_dpm_level[0][1], + args.cpu_lclk_dpm_level[0][2]) + static_dict["set_lclk_dpm_level"]["Response"] = "Set Operation successful" + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["set_lclk_dpm_level"]["Response"] = f"Error occured for CPU {cpu_id} - {e.get_error_info()}" + logging.debug("Failed to set lclk dpm level for cpu %s | %s", cpu_id, e.get_error_info()) + + if args.cpu_pwr_eff_mode: + static_dict["set_pwr_eff_mode"] = {} + try: + amdsmi_interface.amdsmi_set_cpu_pwr_efficiency_mode(args.cpu, args.cpu_pwr_eff_mode[0][0]) + static_dict["set_pwr_eff_mode"]["Response"] = "Set Operation successful" + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["set_pwr_eff_mode"]["Response"] = f"Error occured for CPU {cpu_id} - {e.get_error_info()}" + logging.debug("Failed to set power efficiency mode for cpu %s | %s", cpu_id, e.get_error_info()) + + if args.cpu_gmi3_link_width: + static_dict["set_gmi3_link_width"] = {} + try: + amdsmi_interface.amdsmi_set_cpu_gmi3_link_width_range(args.cpu, args.cpu_gmi3_link_width[0][0], + args.cpu_gmi3_link_width[0][1]) + static_dict["set_gmi3_link_width"]["response"] = "Set Operation successful" + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["set_gmi3_link_width"]["response"] = f"Error occured for CPU {cpu_id} - {e.get_error_info()}" + logging.debug("Failed to set gmi3 link width for cpu %s | %s", cpu_id, e.get_error_info()) + + if args.cpu_pcie_link_rate: + static_dict["set_pcie_link_rate"] = {} + try: + resp = amdsmi_interface.amdsmi_set_cpu_pcie_link_rate(args.cpu, args.cpu_pcie_link_rate[0][0]) + static_dict["set_pcie_link_rate"]["prev_mode"] = resp + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["set_pcie_link_rate"]["prev_mode"] = f"Error occured for CPU {cpu_id} - {e.get_error_info()}" + logging.debug("Failed to set pcie link rate for cpu %s | %s", cpu_id, e.get_error_info()) + + if args.cpu_df_pstate_range: + static_dict["set_df_pstate_range"] = {} + try: + amdsmi_interface.amdsmi_set_cpu_df_pstate_range(args.cpu, args.cpu_df_pstate_range[0][0], + args.cpu_df_pstate_range[0][1]) + static_dict["set_df_pstate_range"]["response"] = "Set Operation successful" + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["set_df_pstate_range"]["response"] = f"Error occured for CPU {cpu_id} - {e.get_error_info()}" + logging.debug("Failed to set df pstate range for cpu %s | %s", cpu_id, e.get_error_info()) + + if args.cpu_enable_apb: + static_dict["apbenable"] = {} + try: + amdsmi_interface.amdsmi_cpu_apb_enable(args.cpu) + static_dict["apbenable"]["state"] = "Enabled DF - Pstate performance boost algorithm" + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["apbenable"]["state"] = "N/A" + logging.debug("Failed to enable APB for cpu %s | %s", cpu_id, e.get_error_info()) + + if args.cpu_disable_apb: + static_dict["apbdisable"] = {} + try: + amdsmi_interface.amdsmi_cpu_apb_disable(args.cpu, args.cpu_disable_apb[0][0]) + static_dict["apbdisable"]["state"] = "Disabled DF - Pstate performance boost algorithm" + except amdsmi_exception.AmdSmiLibraryException as e: + static_dict["apbdisable"]["state"] = "N/A" + logging.debug("Failed to enable APB for cpu %s | %s", cpu_id, e.get_error_info()) + + if args.soc_boost_limit: + static_dict["set_soc_boost_limit"] = {} + try: + amdsmi_interface.amdsmi_set_cpu_socket_boostlimit(args.cpu, args.soc_boost_limit[0][0]) + static_dict["set_soc_boost_limit"]["Response"] = "Set Operation successful" + except amdsmi_exception.AmdSmiLibraryException as e: + #static_dict["set_soc_boost_limit"]["Response"] = "N/A" + static_dict["set_soc_boost_limit"]["Response"] = f"Error occured for CPU {cpu_id} - {e.get_error_info()}" + logging.debug("Failed to set socket boost limit for cpu %s | %s", cpu_id, e.get_error_info()) + + multiple_devices_csv_override = False + self.logger.store_cpu_output(args.cpu, 'values', static_dict) + if multiple_devices: + self.logger.store_multiple_device_output() + return # Skip printing when there are multiple devices + self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override) + + + def set_gpu(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, profile=None, perf_determinism=None, compute_partition=None, memory_partition=None, power_cap=None): """Issue reset commands to target gpu(s) @@ -2909,7 +3000,7 @@ class AMDSMICommands(): raise ValueError('No GPU provided, specific GPU target(s) are needed') # Handle multiple GPUs - handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.set_value) + handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.set_gpu) if handled_multiple_gpus: return # This function is recursive @@ -3019,6 +3110,132 @@ class AMDSMICommands(): self.logger.print_output() + def set_value(self, args, multiple_devices=False, gpu=None, fan=None, perf_level=None, + profile=None, perf_determinism=None, compute_partition=None, + memory_partition=None, power_cap=None, + cpu=None, cpu_pwr_limit=None, cpu_xgmi_link_width=None, cpu_lclk_dpm_level=None, + cpu_pwr_eff_mode=None, cpu_gmi3_link_width=None, cpu_pcie_link_rate=None, + cpu_df_pstate_range=None, cpu_enable_apb=None, cpu_disable_apb=None, + soc_boost_limit=None, core=None, core_boost_limit=None): + """Issue reset commands to target gpu(s) + + Args: + args (Namespace): Namespace containing the parsed CLI args + multiple_devices (bool, optional): True if checking for multiple devices. Defaults to False. + gpu (device_handle, optional): device_handle for target device. Defaults to None. + fan (int, optional): Value override for args.fan. Defaults to None. + perf_level (amdsmi_interface.AmdSmiDevPerfLevel, optional): Value override for args.perf_level. Defaults to None. + profile (bool, optional): Value override for args.profile. Defaults to None. + perf_determinism (int, optional): Value override for args.perf_determinism. Defaults to None. + compute_partition (amdsmi_interface.AmdSmiComputePartitionType, optional): Value override for args.compute_partition. Defaults to None. + memory_partition (amdsmi_interface.AmdSmiMemoryPartitionType, optional): Value override for args.memory_partition. Defaults to None. + power_cap (int, optional): Value override for args.power_cap. Defaults to None. + + cpu (cpu_handle, optional): device_handle for target device. Defaults to None. + cpu_pwr_limit (int, optional): Value override for args.cpu_pwr_limit. Defaults to None. + cpu_xgmi_link_width (List[int], optional): Value override for args.cpu_xgmi_link_width. Defaults to None. + cpu_lclk_dpm_level (List[int], optional): Value override for args.cpu_lclk_dpm_level. Defaults to None. + cpu_pwr_eff_mode (int, optional): Value override for args.cpu_pwr_eff_mode. Defaults to None. + cpu_gmi3_link_width (List[int], optional): Value override for args.cpu_gmi3_link_width. Defaults to None. + cpu_pcie_link_rate (int, optional): Value override for args.cpu_pcie_link_rate. Defaults to None. + cpu_df_pstate_range (List[int], optional): Value override for args.cpu_df_pstate_range. Defaults to None. + cpu_enable_apb (bool, optional): Value override for args.cpu_enable_apb. Defaults to None. + cpu_disable_apb (int, optional): Value override for args.cpu_disable_apb. Defaults to None. + soc_boost_limit (int, optional): Value override for args.soc_boost_limit. Defaults to None. + + core (device_handle, optional): device_handle for target core. Defaults to None. + core_boost_limit (int, optional): Value override for args.core_boost_limit. Defaults to None + + Raises: + ValueError: Value error if no gpu value is provided + IndexError: Index error if gpu list is empty + + Return: + Nothing + """ + # Mutually exculsive args + if gpu: + args.gpu = gpu + if cpu: + args.cpu = cpu + if core: + args.core = core + + # Check if a GPU argument has been set + gpu_args_enabled = False + gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition", + "memory_partition", "power_cap"] + for attr in gpu_attributes: + if hasattr(args, attr): + gpu_args_enabled |= bool(getattr(args, attr)) + + # Check if a CPU argument has been set + cpu_args_enabled = False + cpu_attributes = ["cpu_pwr_limit", "cpu_xgmi_link_width", "cpu_lclk_dpm_level", "cpu_pwr_eff_mode", + "cpu_gmi3_link_width", "cpu_pcie_link_rate", "cpu_df_pstate_range", + "cpu_enable_apb", "cpu_disable_apb", "soc_boost_limit"] + for attr in cpu_attributes: + if hasattr(args, attr): + cpu_args_enabled |= bool(getattr(args, attr)) + + # Check if a Core argument has been set + core_args_enabled = False + core_attributes = ["core_boost_limit"] + for attr in core_attributes: + if hasattr(args, attr): + core_args_enabled |= bool(getattr(args, attr)) + + # Only allow one device's arguments to be set at a time + if gpu_args_enabled == cpu_args_enabled == core_args_enabled == False: + raise ValueError('No GPU, CPU, or CORE arguments provided, specific target(s) are needed') + elif gpu_args_enabled == cpu_args_enabled == core_args_enabled == True: + raise ValueError('Cannot set GPU, CPU, and CORE arguments at the same time') + elif not (gpu_args_enabled ^ cpu_args_enabled ^ core_args_enabled): + raise ValueError('Cannot set GPU, CPU, or CORE arguments at the same time') + + # Handle CPU and GPU intialization cases + if self.helpers.is_amd_hsmp_initialized() and self.helpers.is_amdgpu_initialized(): + # Print out all CPU and all GPU static info only if no device was specified. + # If a GPU or CPU argument is provided only print out the specified device. + if args.cpu == None and args.gpu == None and args.core == None: + raise ValueError('No GPU, CPU, or CORE provided, specific target(s) are needed') + + if args.cpu: + self.set_cpu(args, multiple_devices, cpu, cpu_pwr_limit, + cpu_xgmi_link_width, cpu_lclk_dpm_level, cpu_pwr_eff_mode, + cpu_gmi3_link_width, cpu_pcie_link_rate, cpu_df_pstate_range, + cpu_enable_apb, cpu_disable_apb, soc_boost_limit) + if args.core: + self.logger.output = {} + self.logger.clear_multiple_devices_ouput() + self.set_core(args, multiple_devices, core, core_boost_limit) + if args.gpu: + self.logger.output = {} + self.logger.clear_multiple_devices_ouput() + self.set_gpu(args, multiple_devices, gpu, fan, perf_level, + profile, perf_determinism, compute_partition, + memory_partition, power_cap) + elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized + if args.cpu == None and args.core == None: + raise ValueError('No CPU or CORE provided, specific target(s) are needed') + if args.cpu: + self.set_cpu(args, multiple_devices, cpu, cpu_pwr_limit, + cpu_xgmi_link_width, cpu_lclk_dpm_level, cpu_pwr_eff_mode, + cpu_gmi3_link_width, cpu_pcie_link_rate, cpu_df_pstate_range, + cpu_enable_apb, cpu_disable_apb, soc_boost_limit) + if args.core: + self.logger.output = {} + self.logger.clear_multiple_devices_ouput() + self.set_core(args, multiple_devices, core, core_boost_limit) + elif self.helpers.is_amdgpu_initialized(): # Only GPU is initialized + if args.gpu == None: + raise ValueError('No GPU provided, specific GPU target(s) are needed') + self.logger.clear_multiple_devices_ouput() + self.set_gpu(args, multiple_devices, gpu, fan, perf_level, + profile, perf_determinism, compute_partition, + memory_partition, power_cap) + + def reset(self, args, multiple_devices=False, gpu=None, gpureset=None, clocks=None, fans=None, profile=None, xgmierr=None, perf_determinism=None, compute_partition=None, memory_partition=None, power_cap=None): diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index 23ec7ce58d..a685a7e8dd 100644 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -116,6 +116,18 @@ class AMDSMIHelpers(): return self._is_windows + def get_amdsmi_init_flag(self): + return AMDSMI_INIT_FLAG + + + def is_amdgpu_initialized(self): + return AMDSMI_INIT_FLAG & amdsmi_interface.amdsmi_wrapper.AMDSMI_INIT_AMD_GPUS + + + def is_amd_hsmp_initialized(self): + return AMDSMI_INIT_FLAG & amdsmi_interface.amdsmi_wrapper.AMDSMI_INIT_AMD_CPUS + + def get_cpu_choices(self): """Return dictionary of possible CPU choices and string of the output: Dictionary will be in format: cpus[ID]: Device Handle) @@ -136,11 +148,11 @@ class AMDSMIHelpers(): except amdsmi_interface.AmdSmiLibraryException as e: if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED): - logging.info('Unable to get device choices, driver not initialized (amdhsmp not found in modules)') + logging.info('Unable to get device choices, driver not initialized (amd_hsmp not found in modules)') else: raise e if len(cpu_handles) == 0: - logging.info('Unable to find any devices, check if driver is initialized (amdhsmp not found in modules)') + logging.info('Unable to find any devices, check if driver is initialized (amd_hsmp not found in modules)') else: # Handle spacing for the gpu_choices_str max_padding = int(math.log10(len(cpu_handles))) + 1 @@ -181,11 +193,11 @@ class AMDSMIHelpers(): except amdsmi_interface.AmdSmiLibraryException as e: if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED): - logging.info('Unable to get device choices, driver not initialized (amdhsmp not found in modules)') + logging.info('Unable to get device choices, driver not initialized (amd_hsmp not found in modules)') else: raise e if len(core_handles) == 0: - logging.info('Unable to find any devices, check if driver is initialized (amdhsmp not found in modules)') + logging.info('Unable to find any devices, check if driver is initialized (amd_hsmp not found in modules)') else: # Handle spacing for the gpu_choices_str max_padding = int(math.log10(len(core_handles))) + 1 @@ -463,6 +475,7 @@ class AMDSMIHelpers(): else: return False, args.cpu + def handle_cores(self, args, logger, subcommand): """This function will run execute the subcommands based on the number of cores passed in via args. @@ -567,6 +580,7 @@ class AMDSMIHelpers(): amdsmi_interface.amdsmi_wrapper.amdsmi_processor_handle, "Unable to find cpu ID from device_handle") + def get_core_id_from_device_handle(self, input_device_handle): """Get the core index from the device_handle. amdsmi_interface.amdsmi_get_cpusocket_handles() returns the list of device_handles in order of cpu_index diff --git a/amdsmi_cli/amdsmi_init.py b/amdsmi_cli/amdsmi_init.py index dc742d091a..e5b7484422 100644 --- a/amdsmi_cli/amdsmi_init.py +++ b/amdsmi_cli/amdsmi_init.py @@ -42,6 +42,7 @@ sys.tracebacklimit = -1 # Disable traceback when raising errors # On initial import set initialized variable AMDSMI_INITIALIZED = False +AMDSMI_INIT_FLAG = amdsmi_interface.AmdSmiInitFlags.INIT_ALL_PROCESSORS AMD_VENDOR_ID = 4098 def check_amdgpu_driver(): @@ -53,8 +54,8 @@ def check_amdgpu_driver(): return False -def check_amdhsmp_driver(): - """ Returns true if amd hsmp is found in the list of initialized modules """ +def check_amd_hsmp_driver(): + """ Returns true if amd_hsmp is found in the list of initialized modules """ amd_cpu_status_file = Path("/sys/module/amd_hsmp/initstate") if amd_cpu_status_file.exists(): if amd_cpu_status_file.read_text(encoding="ascii").strip() == "live": @@ -62,32 +63,36 @@ def check_amdhsmp_driver(): return False -def init_amdsmi(flag=amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS): +def init_amdsmi(): """ Initializes AMDSMI - Raises: - err: AmdSmiLibraryException if not successful - """ - gpu_flag = False; - cpu_flag = False; + Checks for the presence of the amdgpu and amd_hsmp drivers and initializes the + AMD SMI library based on the live drivers found. - # Check if both the amdgpu and amdhsmp driver is up and handle error gracefully - if check_amdgpu_driver() and check_amdhsmp_driver(): - # init AMD APUS + Return: + init_flag: the flag used to initialize the AMD SMI library without error + + Raises: + err: AmdSmiLibraryException if not successful in initializing any drivers + """ + init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_ALL_PROCESSORS + if check_amdgpu_driver() and check_amd_hsmp_driver(): + init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_AMD_APUS + logging.debug("Both amdgpu and amd_hsmp driver's initstate is live") try: - amdsmi_interface.amdsmi_init(amdsmi_interface.AmdSmiInitFlags.INIT_AMD_APUS) + amdsmi_interface.amdsmi_init(init_flag) except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e: if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED): - logging.error("Drivers not loaded (amdgpu and hsmp drivers not found in modules)") + logging.error("Drivers not loaded (amdgpu and amd_hsmp drivers not found in modules)") sys.exit(-1) else: raise e - # # Check if amdgpu driver is up & Handle error gracefully elif check_amdgpu_driver(): - # Only init AMD GPUs for now, waiting for future support for AMD CPUs + init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS + logging.debug("amdgpu driver initstate is live") try: - amdsmi_interface.amdsmi_init(amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS) + amdsmi_interface.amdsmi_init(init_flag) except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e: if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED): @@ -95,25 +100,24 @@ def init_amdsmi(flag=amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS): sys.exit(-1) else: raise e - logging.debug("AMDSMI initialized successfully, but initstate was not live") - - elif check_amdhsmp_driver(): - # Only init AMD CPUs + logging.debug("amdgpu driver initialized successfully, but amd_hsmp initstate was not live") + elif check_amd_hsmp_driver(): + init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_AMD_CPUS + logging.debug("amd_hsmp driver initstate is live") try: - amdsmi_interface.amdsmi_init(amdsmi_interface.AmdSmiInitFlags.INIT_AMD_CPUS) - cpu_flag = True + amdsmi_interface.amdsmi_init(init_flag) except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e: if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT, amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED): - logging.error("Driver not loaded (hsmp not found in modules)") + logging.error("Driver not loaded (amd_hsmp not found in modules)") sys.exit(-1) else: raise e - else: - pass + logging.debug("amd_hsmp driver initialized successfully, but amdgpu initstate was not live") - logging.debug("AMDSMI initialized successfully") + logging.debug(f"AMDSMI initialized with atleast one driver successfully | init flag: {init_flag}") + return init_flag def shut_down_amdsmi(): """Shutdown AMDSMI instance @@ -134,7 +138,7 @@ def signal_handler(sig, frame): if not AMDSMI_INITIALIZED: - init_amdsmi() + AMDSMI_INIT_FLAG = init_amdsmi() AMDSMI_INITIALIZED = True signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) diff --git a/amdsmi_cli/amdsmi_logger.py b/amdsmi_cli/amdsmi_logger.py index 0b65cc250d..4bbd71724f 100644 --- a/amdsmi_cli/amdsmi_logger.py +++ b/amdsmi_cli/amdsmi_logger.py @@ -25,8 +25,8 @@ import json import re import time from typing import Dict -import yaml from enum import Enum +import yaml from amdsmi_helpers import AMDSMIHelpers import amdsmi_cli_exceptions @@ -255,6 +255,7 @@ class AMDSMILogger(): core_id = self.helpers.get_core_id_from_device_handle(device_handle) self._store_core_output_amdsmi(core_id=core_id, argument=argument, data=data) + def _store_core_output_amdsmi(self, core_id, argument, data): if argument == 'timestamp': # Make sure timestamp is the first element in the output self.output['timestamp'] = int(time.time()) diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index d29e691ad5..2db2d7661f 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -72,9 +72,23 @@ class AMDSMIParser(argparse.ArgumentParser): # Helper variables self.helpers = AMDSMIHelpers() - self.gpu_choices, self.gpu_choices_str = self.helpers.get_gpu_choices() - self.cpu_choices, self.cpu_choices_str = self.helpers.get_cpu_choices() - self.core_choices, self.core_choices_str = self.helpers.get_core_choices() + + # Get choices based on driver initialized + if self.helpers.is_amdgpu_initialized(): + self.gpu_choices, self.gpu_choices_str = self.helpers.get_gpu_choices() + else: + self.gpu_choices = {} + self.gpu_choices_str = "" + + if self.helpers.is_amd_hsmp_initialized(): + self.cpu_choices, self.cpu_choices_str = self.helpers.get_cpu_choices() + self.core_choices, self.core_choices_str = self.helpers.get_core_choices() + else: + self.cpu_choices = {} + self.cpu_choices_str = "" + self.core_choices = {} + self.core_choices_str = "" + self.vf_choices = ['3', '2', '1'] version_string = f"Version: {__version__}" @@ -324,16 +338,28 @@ class AMDSMIParser(argparse.ArgumentParser): def _validate_cpu_core(self, value): - if (int(value) < 0): - outputformat = self.helpers.get_output_format() - raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(value, outputformat) + if isinstance(value, str): + if value.lower() == "all": + return value + if value.isdigit(): + if int(value) < 0: + outputformat = self.helpers.get_output_format() + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(value, outputformat) + else: + outputformat = self.helpers.get_output_format() + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(value, outputformat) + + if isinstance(value, int): + if int(value) < 0: + outputformat = self.helpers.get_output_format() + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(value, outputformat) return value def _validate_positive(self, value): i_value = int(value) - if (i_value < 0): + if i_value < 0: outputformat = self.helpers.get_output_format() raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(i_value, outputformat) @@ -351,510 +377,24 @@ class AMDSMIParser(argparse.ArgumentParser): # Mutually Exclusive Args within the subparser device_args = subcommand_parser.add_mutually_exclusive_group(required=required) - device_args.add_argument('-g', '--gpu', action=self._gpu_select(self.gpu_choices), - nargs='+', help=gpu_help) - device_args.add_argument('-U', '--cpu', type=self._validate_cpu_core, - action=self._cpu_select(self.cpu_choices), - nargs='+', help=cpu_help) - device_args.add_argument('-O', '--core', type=self._validate_cpu_core, - action=self._core_select(self.core_choices), - nargs='+', help=core_help) + + if self.helpers.is_amdgpu_initialized(): + device_args.add_argument('-g', '--gpu', action=self._gpu_select(self.gpu_choices), + nargs='+', help=gpu_help) + + if self.helpers.is_amd_hsmp_initialized(): + device_args.add_argument('-U', '--cpu', type=self._validate_cpu_core, + action=self._cpu_select(self.cpu_choices), + nargs='+', help=cpu_help) + device_args.add_argument('-O', '--core', type=self._validate_cpu_core, + action=self._core_select(self.core_choices), + nargs='+', help=core_help) if self.helpers.is_hypervisor(): device_args.add_argument('-v', '--vf', action='store', nargs='+', help=vf_help, choices=self.vf_choices) - def _add_version_parser(self, subparsers, func): - # Subparser help text - version_help = "Display version information" - - # Create version subparser - version_parser = subparsers.add_parser('version', help=version_help, description=None) - version_parser._optionals.title = None - version_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) - version_parser.set_defaults(func=func) - - # Add Universal Arguments - self._add_command_modifiers(version_parser) - - - def _add_list_parser(self, subparsers, func): - # Subparser help text - list_help = "List GPU information" - list_subcommand_help = "Lists all the devices on the system and the links between devices.\ - \nLists all the sockets and for each socket, GPUs and/or CPUs associated to\ - \nthat socket alongside some basic information for each device.\ - \nIn virtualization environments, it can also list VFs associated to each\ - \nGPU with some basic information for each VF." - - # Create list subparser - list_parser = subparsers.add_parser('list', help=list_help, description=list_subcommand_help) - list_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) - list_parser.set_defaults(func=func) - - # Add Universal Arguments - self._add_command_modifiers(list_parser) - self._add_device_arguments(list_parser, required=False) - - - def _add_static_parser(self, subparsers, func): - # Subparser help text - static_help = "Gets static information about the specified GPU" - static_subcommand_help = "If no GPU is specified, returns static information for all GPUs on the system.\ - \nIf no static argument is provided, all static information will be displayed." - static_optionals_title = "Static Arguments" - - # Optional arguments help text - asic_help = "All asic information" - bus_help = "All bus information" - vbios_help = "All video bios information (if available)" - limit_help = "All limit metric values (i.e. power and thermal limits)" - driver_help = "Displays driver version" - vram_help = "All vram information" - cache_help = "All cache information" - board_help = "All board information" - - # Options arguments help text for Hypervisors and Baremetal - ras_help = "Displays RAS features information" - numa_help = "All numa node information" # Linux Baremetal only - partition_help = "Partition information" - - # Options arguments help text for Hypervisors - dfc_help = "All DFC FW table information" - fb_help = "Displays Frame Buffer information" - num_vf_help = "Displays number of supported and enabled VFs" - - # Options arguments help text for cpu - smu_help = "All SMU FW information" - interface_help = "Displays hsmp interface version" - - # Create static subparser - static_parser = subparsers.add_parser('static', help=static_help, description=static_subcommand_help) - static_parser._optionals.title = static_optionals_title - static_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) - static_parser.set_defaults(func=func) - cpu_group = static_parser.add_argument_group("CPU Option") - - # Add Universal Arguments - self._add_command_modifiers(static_parser) - self._add_device_arguments(static_parser, required=False) - - # Optional Args - static_parser.add_argument('-a', '--asic', action='store_true', required=False, help=asic_help) - static_parser.add_argument('-b', '--bus', action='store_true', required=False, help=bus_help) - static_parser.add_argument('-V', '--vbios', action='store_true', required=False, help=vbios_help) - static_parser.add_argument('-d', '--driver', action='store_true', required=False, help=driver_help) - static_parser.add_argument('-v', '--vram', action='store_true', required=False, help=vram_help) - static_parser.add_argument('-c', '--cache', action='store_true', required=False, help=cache_help) - static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help) - cpu_group.add_argument('-s', '--smu', action='store_true', required=False, help=smu_help) - cpu_group.add_argument('-i', '--interface_ver', action='store_true', required=False, help=interface_help) - # Options to display on Hypervisors and Baremetal - if self.helpers.is_hypervisor() or self.helpers.is_baremetal(): - static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) - static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) - static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) - - if self.helpers.is_linux() and not self.helpers.is_virtual_os(): - static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) - - # Options to only display on a Hypervisor - if self.helpers.is_hypervisor(): - static_parser.add_argument('-d', '--dfc-ucode', action='store_true', required=False, help=dfc_help) - static_parser.add_argument('-f', '--fb-info', action='store_true', required=False, help=fb_help) - static_parser.add_argument('-n', '--num-vf', action='store_true', required=False, help=num_vf_help) - - - def _add_firmware_parser(self, subparsers, func): - # Subparser help text - firmware_help = "Gets firmware information about the specified GPU" - firmware_subcommand_help = "If no GPU is specified, return firmware information for all GPUs on the system." - firmware_optionals_title = "Firmware Arguments" - - # Optional arguments help text - fw_list_help = "All FW list information" - err_records_help = "All error records information" - - # Create firmware subparser - firmware_parser = subparsers.add_parser('firmware', help=firmware_help, description=firmware_subcommand_help, aliases=['ucode']) - firmware_parser._optionals.title = firmware_optionals_title - firmware_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) - firmware_parser.set_defaults(func=func) - - # Add Universal Arguments - self._add_command_modifiers(firmware_parser) - self._add_device_arguments(firmware_parser, required=False) - - # Optional Args - firmware_parser.add_argument('-f', '--ucode-list', '--fw-list', dest='fw_list', action='store_true', required=False, help=fw_list_help, default=True) - - # Options to only display on a Hypervisor - if self.helpers.is_hypervisor(): - firmware_parser.add_argument('-e', '--error-records', action='store_true', required=False, help=err_records_help) - - - def _add_bad_pages_parser(self, subparsers, func): - if not (self.helpers.is_baremetal() and self.helpers.is_linux()): - # The bad_pages subcommand is only applicable to Linux Baremetal systems - return - - # Subparser help text - bad_pages_help = "Gets bad page information about the specified GPU" - bad_pages_subcommand_help = "If no GPU is specified, return bad page information for all GPUs on the system." - bad_pages_optionals_title = "Bad Pages Arguments" - - # Optional arguments help text - pending_help = "Displays all pending retired pages" - retired_help = "Displays retired pages" - un_res_help = "Displays unreservable pages" - - # Create bad_pages subparser - bad_pages_parser = subparsers.add_parser('bad-pages', help=bad_pages_help, description=bad_pages_subcommand_help) - bad_pages_parser._optionals.title = bad_pages_optionals_title - bad_pages_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) - bad_pages_parser.set_defaults(func=func) - - # Add Universal Arguments - self._add_command_modifiers(bad_pages_parser) - self._add_device_arguments(bad_pages_parser, required=False) - - # Optional Args - bad_pages_parser.add_argument('-p', '--pending', action='store_true', required=False, help=pending_help) - bad_pages_parser.add_argument('-r', '--retired', action='store_true', required=False, help=retired_help) - bad_pages_parser.add_argument('-u', '--un-res', action='store_true', required=False, help=un_res_help) - - - def _add_metric_parser(self, subparsers, func): - # Subparser help text - metric_help = "Gets metric/performance information about the specified GPU" - metric_subcommand_help = "If no GPU is specified, returns metric information for all GPUs on the system.\ - \nIf no metric argument is provided all metric information will be displayed." - metric_optionals_title = "Metric arguments" - - # Optional arguments help text - usage_help = "Displays engine usage information" - - # Help text for Arguments only Available on Linux Virtual OS and Baremetal platforms - mem_usage_help = "Memory usage per block" - - # Help text for Arguments only on Hypervisor and Baremetal platforms - power_help = "Current power usage" - clock_help = "Average, max, and current clock frequencies" - temperature_help = "Current temperatures" - ecc_help = "Total number of ECC errors" - ecc_block_help = "Number of ECC errors per block" - pcie_help = "Current PCIe speed, width, and replay count" - - # Help text for Arguments only on Linux Baremetal platforms - fan_help = "Current fan speed" - vc_help = "Display voltage curve" - overdrive_help = "Current GPU clock overdrive level" - perf_level_help = "Current DPM performance level" - xgmi_err_help = "XGMI error information since last read" - energy_help = "Amount of energy consumed" - - # Help text for Arguments only on Hypervisors - schedule_help = "All scheduling information" - guard_help = "All guard information" - guest_data_help = "All guest data information" - fb_usage_help = "Displays total and used Frame Buffer usage information" - xgmi_help = "Table of current XGMI metrics information" - - # Help text for cpu options - cpu_power_metrics_help = "Cpu power metrics" - cpu_proc_help = "Displays prochot status" - cpu_freq_help = "Displays currentFclkMemclk frequencies and cclk frequency limit" - cpu_c0_res_help = "Displays C0 residency" - cpu_lclk_dpm_help = "Displays lclk dpm level range. Requires socket ID and nbio id as inputs" - cpu_pwr_svi_telemtry_rails_help = "Displays svi based telemetry for all rails" - cpu_io_bandwidth_help = "Displays current IO bandwidth for the selected CPU.\ - \n input parameters are bandwidth type(1) and link ID encodings\ - \n i.e. P2, P3, G0 - G7" - cpu_xgmi_bandwidth_help = "Displays current XGMI bandwidth for the selected CPU\ - \n input parameters are bandwidth type(1,2,4) and link ID encodings\ - \n i.e. P2, P3, G0 - G7" - cpu_enable_apb_help = "Enables the DF p-state performance boost algorithm" - cpu_disable_apb_help = "Disables the DF p-state performance boost algorithm." - "Input parameter is DFPstate (0 -3 )" - set_cpu_pow_limit_help = "Set power limit for the given socket. Input parameter is \ -power limit value." - set_cpu_xgmi_link_width_help = "Set max and Min linkwidth. Input parameters are \ -min and max link width values" - set_cpu_lclk_dpm_level_help = "Sets the max and min dpm level on a given NBIO. Inpur parameters are \ -die_index, min dpm, max dpm." - core_boost_limit_help = "Get booslimit for the selected cores" - core_curr_active_freq_core_limit_help = "Get Current CCLK limit set per Core" - set_soc_boost_limit_help = "Sets the boost limit for the given socket. Input parameter is \ -socket limit value" - set_core_boost_limit_help = "Sets the boost limit for the given core. Input parameter is \ -core limit value" - cpu_metrics_ver_help = "Displays metrics table version" - cpu_metrics_table_help = "Displays metric table" - core_energy_help = "Displays core energy for the selected core" - socket_energy_help = "Displays socket energy for the selected socket" - set_cpu_pwr_eff_mode_help = "Sets the power efficency mode policy. Input parameter is mode." - cpu_ddr_bandwidth_help = "Displays per socket max ddr bw, current utilized bw and current utilized ddr bw in percentage" - cpu_temp_help = "Displays cpu socket temperature" - cpu_dimm_temp_range_rate_help = "Displays dimm temperature range and refresh rate" - cpu_dimm_pow_conumption_help = "Displays dimm power consumption" - cpu_dimm_thermal_sensor_help = "Displays dimm thermal sensor" - set_cpu_gmi3_link_width_help = "Sets max and min gmi3 link width range" - set_cpu_pcie_lnk_rate_help = "Sets pcie link rate" - set_cpu_df_pstate_range_help = "Sets max and min df-pstates" - - # Create metric subparser - metric_parser = subparsers.add_parser('metric', help=metric_help, description=metric_subcommand_help) - metric_parser._optionals.title = metric_optionals_title - metric_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) - metric_parser.set_defaults(func=func) - cpu_group = metric_parser.add_argument_group("CPU Option") - set_group = metric_parser.add_argument_group("Set Options") - - # Add Universal Arguments - self._add_command_modifiers(metric_parser) - self._add_device_arguments(metric_parser, required=False) - - # Add Watch args - self._add_watch_arguments(metric_parser) - - # Optional Args for Linux Virtual OS and Baremetal systems - if not self.helpers.is_hypervisor() and not self.helpers.is_windows(): - metric_parser.add_argument('-m', '--mem-usage', action='store_true', required=False, help=mem_usage_help) - - # Optional Args for Hypervisors and Baremetal systems - if self.helpers.is_hypervisor() or self.helpers.is_baremetal() or self.helpers.is_linux(): - metric_parser.add_argument('-u', '--usage', action='store_true', required=False, help=usage_help) - metric_parser.add_argument('-p', '--power', action='store_true', required=False, help=power_help) - metric_parser.add_argument('-c', '--clock', action='store_true', required=False, help=clock_help) - metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) - metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) - metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help) - - # Optional Args for Linux Baremetal Systems - if self.helpers.is_baremetal() and self.helpers.is_linux(): - metric_parser.add_argument('-k', '--ecc-block', action='store_true', required=False, help=ecc_block_help) - metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help) - metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help) - metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help) - metric_parser.add_argument('-l', '--perf-level', action='store_true', required=False, help=perf_level_help) - metric_parser.add_argument('-x', '--xgmi-err', action='store_true', required=False, help=xgmi_err_help) - metric_parser.add_argument('-E', '--energy', action='store_true', required=False, help=energy_help) - - # Options to only display to Hypervisors - if self.helpers.is_hypervisor(): - metric_parser.add_argument('-s', '--schedule', action='store_true', required=False, help=schedule_help) - metric_parser.add_argument('-G', '--guard', action='store_true', required=False, help=guard_help) - metric_parser.add_argument('-u', '--guest-data', action='store_true', required=False, help=guest_data_help) - metric_parser.add_argument('-f', '--fb_usage', action='store_true', required=False, help=fb_usage_help) - metric_parser.add_argument('-m', '--xgmi', action='store_true', required=False, help=xgmi_help) - - cpu_group.add_argument('--cpu_power_metrics', action='store_true', required=False, help=cpu_power_metrics_help) - cpu_group.add_argument('--cpu_prochot', action='store_true', required=False, help=cpu_proc_help) - cpu_group.add_argument('--cpu_freq_metrics', action='store_true', required=False, help=cpu_freq_help) - cpu_group.add_argument('--cpu_c0_res', action='store_true', required=False, help=cpu_c0_res_help) - cpu_group.add_argument('--cpu_lclk_dpm_level', action='append', required=False, type=self._validate_positive, - nargs=1, metavar=("NBIOID"), help=cpu_lclk_dpm_help) - cpu_group.add_argument('--cpu_pwr_svi_telemtry_rails', action='store_true', required=False, - help=cpu_pwr_svi_telemtry_rails_help) - cpu_group.add_argument('--cpu_io_bandwidth', action='append', required=False, nargs=2, - metavar=("IO_BW","LINKID_NAME"), help=cpu_io_bandwidth_help) - cpu_group.add_argument('--cpu_xgmi_bandwidth', action='append', required=False, nargs=2, - metavar=("XGMI_BW","LINKID_NAME"), help=cpu_xgmi_bandwidth_help) - cpu_group.add_argument('--cpu_enable_apb', action='store_true', required=False, help=cpu_enable_apb_help) - cpu_group.add_argument('--cpu_disable_apb', action='append', required=False, type=self._validate_positive, - nargs=1, metavar=("DF_PSTATE"), help=cpu_disable_apb_help) - set_group.add_argument('--set_cpu_pow_limit', action='append', required=False, type=self._validate_positive, - nargs=1, metavar=("POW_LIMIT"),help=set_cpu_pow_limit_help) - set_group.add_argument('--set_cpu_xgmi_link_width', action='append', required=False, type=self._validate_positive, - nargs=2, metavar=("MIN_WIDTH", "MAX_WIDTH"), help=set_cpu_xgmi_link_width_help) - set_group.add_argument('--set_cpu_lclk_dpm_level', action='append', required=False, type=self._validate_positive, - nargs=3, metavar=("NBIOID", "MIN_DPM", "MAX_DPM"),help=set_cpu_lclk_dpm_level_help) - cpu_group.add_argument('--core_boost_limit', action='store_true', required=False, help=core_boost_limit_help) - cpu_group.add_argument('--core_curr_active_freq_core_limit', action='store_true', required=False, - help=core_curr_active_freq_core_limit_help) - set_group.add_argument('--set_soc_boost_limit', action='append', required=False, type=self._validate_positive, - nargs=1, metavar=("BOOST_LIMIT"), help=set_soc_boost_limit_help) - set_group.add_argument('--set_core_boost_limit', action='append', required=False, type=self._validate_positive, - nargs=1, metavar=("BOOST_LIMIT"), help=set_core_boost_limit_help) - cpu_group.add_argument('--cpu_metrics_ver', action='store_true', required=False, help=cpu_metrics_ver_help) - cpu_group.add_argument('--cpu_metrics_table', action='store_true', required=False, help=cpu_metrics_table_help) - cpu_group.add_argument('--core_energy', action='store_true', required=False, help=core_energy_help) - cpu_group.add_argument('--socket_energy', action='store_true', required=False, help=socket_energy_help) - set_group.add_argument('--set_cpu_pwr_eff_mode', action='append', required=False, type=self._validate_positive, - nargs=1, metavar=("MODE"), help=set_cpu_pwr_eff_mode_help) - cpu_group.add_argument('--cpu_ddr_bandwidth', action='store_true', required=False, help=cpu_ddr_bandwidth_help) - cpu_group.add_argument('--cpu_temp', action='store_true', required=False, help=cpu_temp_help) - cpu_group.add_argument('--cpu_dimm_temp_range_rate', action='append', required=False, type=lambda x: int(x, 0), - nargs=1, metavar=("DIMM_ADDR"), help=cpu_dimm_temp_range_rate_help) - cpu_group.add_argument('--cpu_dimm_pow_conumption', action='append', required=False, type=lambda x: int(x, 0), - nargs=1, metavar=("DIMM_ADDR"), help=cpu_dimm_pow_conumption_help) - cpu_group.add_argument('--cpu_dimm_thermal_sensor', action='append', required=False, type=lambda x: int(x, 0), - nargs=1, metavar=("DIMM_ADDR"), help=cpu_dimm_thermal_sensor_help) - set_group.add_argument('--set_cpu_gmi3_link_width', action='append', required=False, type=self._validate_positive, - nargs=2, metavar=("MIN_LW", "MAX_LW"), help=set_cpu_gmi3_link_width_help) - set_group.add_argument('--set_cpu_pcie_lnk_rate', action='append', required=False, type=self._validate_positive, - nargs=1, metavar=("LINK_RATE"), help=set_cpu_pcie_lnk_rate_help) - set_group.add_argument('--set_cpu_df_pstate_range', action='append', required=False, type=self._validate_positive, - nargs=2, metavar=("MAX_PSTATE", "MIN_PSTATE"), help=set_cpu_df_pstate_range_help) - - def _add_process_parser(self, subparsers, func): - if self.helpers.is_hypervisor(): - # Don't add this subparser on Hypervisors - # This subparser is only available to Guest and Baremetal systems - return - - # Subparser help text - process_help = "Lists general process information running on the specified GPU" - process_subcommand_help = "If no GPU is specified, returns information for all GPUs on the system.\ - \nIf no process argument is provided all process information will be displayed." - process_optionals_title = "Process arguments" - - # Optional Arguments help text - general_help = "pid, process name, memory usage" - engine_help = "All engine usages" - pid_help = "Gets all process information about the specified process based on Process ID" - name_help = "Gets all process information about the specified process based on Process Name.\ - \nIf multiple processes have the same name information is returned for all of them." - - - # Create process subparser - process_parser = subparsers.add_parser('process', help=process_help, description=process_subcommand_help) - process_parser._optionals.title = process_optionals_title - process_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) - process_parser.set_defaults(func=func) - - # Add Universal Arguments - self._add_command_modifiers(process_parser) - self._add_device_arguments(process_parser, required=False) - - # Add Watch args - self._add_watch_arguments(process_parser) - - # Optional Args - process_parser.add_argument('-G', '--general', action='store_true', required=False, help=general_help) - process_parser.add_argument('-e', '--engine', action='store_true', required=False, help=engine_help) - process_parser.add_argument('-p', '--pid', action='store', type=self._not_negative_int, required=False, help=pid_help) - process_parser.add_argument('-n', '--name', action='store', required=False, help=name_help) - - - def _add_profile_parser(self, subparsers, func): - if not (self.helpers.is_windows() and self.helpers.is_hypervisor()): - # This subparser only applies to Hypervisors - return - - # Subparser help text - profile_help = "Displays information about all profiles and current profile" - profile_subcommand_help = "If no GPU is specified, returns information for all GPUs on the system." - profile_optionals_title = "Profile Arguments" - - # Create profile subparser - profile_parser = subparsers.add_parser('profile', help=profile_help, description=profile_subcommand_help) - profile_parser._optionals.title = profile_optionals_title - profile_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) - profile_parser.set_defaults(func=func) - - # Add Universal Arguments - self._add_command_modifiers(profile_parser) - self._add_device_arguments(profile_parser, required=False) - - - def _add_event_parser(self, subparsers, func): - # Subparser help text - event_help = "Displays event information for the given GPU" - event_subcommand_help = "If no GPU is specified, returns event information for all GPUs on the system." - event_optionals_title = "Event Arguments" - - # Create event subparser - event_parser = subparsers.add_parser('event', help=event_help, description=event_subcommand_help) - event_parser._optionals.title = event_optionals_title - event_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) - event_parser.set_defaults(func=func) - - # Add Universal Arguments - self._add_command_modifiers(event_parser) - self._add_device_arguments(event_parser, required=False) - - - def _add_topology_parser(self, subparsers, func): - if not(self.helpers.is_baremetal() and self.helpers.is_linux()): - # This subparser is only applicable to Baremetal Linux - return - - # Subparser help text - topology_help = "Displays topology information of the devices" - topology_subcommand_help = "If no GPU is specified, returns information for all GPUs on the system.\ - \nIf no topology argument is provided all topology information will be displayed." - topology_optionals_title = "Topology arguments" - - # Help text for Arguments only on Guest and BM platforms - access_help = "Displays link accessibility between GPUs" - weight_help = "Displays relative weight between GPUs" - hops_help = "Displays the number of hops between GPUs" - link_type_help = "Displays the link type between GPUs" - numa_bw_help = "Display max and min bandwidth between nodes" - - # Create topology subparser - topology_parser = subparsers.add_parser('topology', help=topology_help, description=topology_subcommand_help) - topology_parser._optionals.title = topology_optionals_title - topology_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) - topology_parser.set_defaults(func=func) - - # Add Universal Arguments - self._add_command_modifiers(topology_parser) - self._add_device_arguments(topology_parser, required=False) - - # Optional Args - topology_parser.add_argument('-a', '--access', action='store_true', required=False, help=access_help) - topology_parser.add_argument('-w', '--weight', action='store_true', required=False, help=weight_help) - topology_parser.add_argument('-o', '--hops', action='store_true', required=False, help=hops_help) - topology_parser.add_argument('-t', '--link-type', action='store_true', required=False, help=link_type_help) - topology_parser.add_argument('-b', '--numa-bw', action='store_true', required=False, help=numa_bw_help) - - - def _add_set_value_parser(self, subparsers, func): - if not(self.helpers.is_baremetal() and self.helpers.is_linux()): - # This subparser is only applicable to Baremetal Linux - return - - # Subparser help text - set_value_help = "Set options for devices" - set_value_subcommand_help = "A GPU must be specified to set a configuration.\ - \nA set argument must be provided; Multiple set arguments are accepted" - set_value_optionals_title = "Set Arguments" - - # Help text for Arguments only on BM platforms - set_fan_help = "Set GPU fan speed (0-255 or 0-100%%)" - set_perf_level_help = "Set performance level" - set_profile_help = "Set power profile level (#) or a quoted string of custom profile attributes" - set_perf_det_help = "Set GPU clock frequency limit and performance level to determinism to get minimal performance variation" - compute_partition_choices_str = ", ".join(self.helpers.get_compute_partition_types()) - memory_partition_choices_str = ", ".join(self.helpers.get_memory_partition_types()) - set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}" - set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" - set_power_cap_help = "Set power capacity limit" - - # Create set_value subparser - set_value_parser = subparsers.add_parser('set', help=set_value_help, description=set_value_subcommand_help) - set_value_parser._optionals.title = set_value_optionals_title - set_value_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) - set_value_parser.set_defaults(func=func) - - # Add Universal Arguments - self._add_command_modifiers(set_value_parser) - # Device args are required as safeguard from the user applying the operation to all gpus unintentionally - self._add_device_arguments(set_value_parser, required=True) - - # Optional Args - set_value_parser.add_argument('-f', '--fan', action=self._validate_fan_speed(), required=False, help=set_fan_help, metavar='%') - set_value_parser.add_argument('-l', '--perf-level', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL') - set_value_parser.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='SETPROFILE') - set_value_parser.add_argument('-d', '--perf-determinism', action='store', type=self._not_negative_int, required=False, help=set_perf_det_help, metavar='SCLKMAX') - set_value_parser.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION') - set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') - set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') - - def _validate_set_clock(self, validate_clock_type=True): """ Validate Clock input""" amdsmi_helpers = self.helpers @@ -948,11 +488,546 @@ core limit value" return _ValidateOverdrivePercent + def _add_version_parser(self, subparsers, func): + # Subparser help text + version_help = "Display version information" + + # Create version subparser + version_parser = subparsers.add_parser('version', help=version_help, description=None) + version_parser._optionals.title = None + version_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + version_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_command_modifiers(version_parser) + + + def _add_list_parser(self, subparsers, func): + if not self.helpers.is_amdgpu_initialized(): + # The list subcommand is only applicable to systems with amdgpu initialized + return + + # Subparser help text + list_help = "List GPU information" + list_subcommand_help = "Lists all the devices on the system and the links between devices.\ + \nLists all the sockets and for each socket, GPUs and/or CPUs associated to\ + \nthat socket alongside some basic information for each device.\ + \nIn virtualization environments, it can also list VFs associated to each\ + \nGPU with some basic information for each VF." + + # Create list subparser + list_parser = subparsers.add_parser('list', help=list_help, description=list_subcommand_help) + list_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + list_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_command_modifiers(list_parser) + self._add_device_arguments(list_parser, required=False) + + + def _add_static_parser(self, subparsers, func): + # Subparser help text + static_help = "Gets static information about the specified GPU" + static_subcommand_help = "If no GPU is specified, returns static information for all GPUs on the system.\ + \nIf no static argument is provided, all static information will be displayed." + static_optionals_title = "Static Arguments" + + # Optional arguments help text + asic_help = "All asic information" + bus_help = "All bus information" + vbios_help = "All video bios information (if available)" + limit_help = "All limit metric values (i.e. power and thermal limits)" + driver_help = "Displays driver version" + vram_help = "All vram information" + cache_help = "All cache information" + board_help = "All board information" + + # Options arguments help text for Hypervisors and Baremetal + ras_help = "Displays RAS features information" + numa_help = "All numa node information" # Linux Baremetal only + partition_help = "Partition information" + + # Options arguments help text for Hypervisors + dfc_help = "All DFC FW table information" + fb_help = "Displays Frame Buffer information" + num_vf_help = "Displays number of supported and enabled VFs" + + # Options arguments help text for CPUs + smu_help = "All SMU FW information" + interface_help = "Displays hsmp interface version" + + # Create static subparser + static_parser = subparsers.add_parser('static', help=static_help, description=static_subcommand_help) + static_parser._optionals.title = static_optionals_title + static_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + static_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_device_arguments(static_parser, required=False) + + # Handle GPU Options + if self.helpers.is_amdgpu_initialized(): + static_parser.add_argument('-a', '--asic', action='store_true', required=False, help=asic_help) + static_parser.add_argument('-b', '--bus', action='store_true', required=False, help=bus_help) + static_parser.add_argument('-V', '--vbios', action='store_true', required=False, help=vbios_help) + static_parser.add_argument('-d', '--driver', action='store_true', required=False, help=driver_help) + static_parser.add_argument('-v', '--vram', action='store_true', required=False, help=vram_help) + static_parser.add_argument('-c', '--cache', action='store_true', required=False, help=cache_help) + static_parser.add_argument('-B', '--board', action='store_true', required=False, help=board_help) + + # Options to display on Hypervisors and Baremetal + if self.helpers.is_hypervisor() or self.helpers.is_baremetal(): + static_parser.add_argument('-r', '--ras', action='store_true', required=False, help=ras_help) + static_parser.add_argument('-p', '--partition', action='store_true', required=False, help=partition_help) + static_parser.add_argument('-l', '--limit', action='store_true', required=False, help=limit_help) + + if self.helpers.is_linux() and not self.helpers.is_virtual_os(): + static_parser.add_argument('-u', '--numa', action='store_true', required=False, help=numa_help) + + # Options to only display on a Hypervisor TODO: Add hypervisor driver check + if self.helpers.is_hypervisor(): + static_parser.add_argument('-d', '--dfc-ucode', action='store_true', required=False, help=dfc_help) + static_parser.add_argument('-f', '--fb-info', action='store_true', required=False, help=fb_help) + static_parser.add_argument('-n', '--num-vf', action='store_true', required=False, help=num_vf_help) + + # Handle CPU Options + if self.helpers.is_amd_hsmp_initialized(): + cpu_group = static_parser.add_argument_group("CPU Arguments") + cpu_group.add_argument('-s', '--smu', action='store_true', required=False, help=smu_help) + cpu_group.add_argument('-i', '--interface-ver', action='store_true', required=False, help=interface_help) + + # Add command modifiers to the bottom + self._add_command_modifiers(static_parser) + + + def _add_firmware_parser(self, subparsers, func): + if not self.helpers.is_amdgpu_initialized(): + # The firmware subcommand is only applicable to systems with amdgpu initialized + return + + # Subparser help text + firmware_help = "Gets firmware information about the specified GPU" + firmware_subcommand_help = "If no GPU is specified, return firmware information for all GPUs on the system." + firmware_optionals_title = "Firmware Arguments" + + # Optional arguments help text + fw_list_help = "All FW list information" + err_records_help = "All error records information" + + # Create firmware subparser + firmware_parser = subparsers.add_parser('firmware', help=firmware_help, description=firmware_subcommand_help, aliases=['ucode']) + firmware_parser._optionals.title = firmware_optionals_title + firmware_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + firmware_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_command_modifiers(firmware_parser) + self._add_device_arguments(firmware_parser, required=False) + + # Optional Args + firmware_parser.add_argument('-f', '--ucode-list', '--fw-list', dest='fw_list', action='store_true', required=False, help=fw_list_help, default=True) + + # Options to only display on a Hypervisor + if self.helpers.is_hypervisor(): + firmware_parser.add_argument('-e', '--error-records', action='store_true', required=False, help=err_records_help) + + + def _add_bad_pages_parser(self, subparsers, func): + if not (self.helpers.is_baremetal() and self.helpers.is_linux()): + # The bad_pages subcommand is only applicable to Linux Baremetal systems + return + + if not self.helpers.is_amdgpu_initialized(): + # The bad_pages subcommand is only applicable to systems with amdgpu initialized + return + + + # Subparser help text + bad_pages_help = "Gets bad page information about the specified GPU" + bad_pages_subcommand_help = "If no GPU is specified, return bad page information for all GPUs on the system." + bad_pages_optionals_title = "Bad Pages Arguments" + + # Optional arguments help text + pending_help = "Displays all pending retired pages" + retired_help = "Displays retired pages" + un_res_help = "Displays unreservable pages" + + # Create bad_pages subparser + bad_pages_parser = subparsers.add_parser('bad-pages', help=bad_pages_help, description=bad_pages_subcommand_help) + bad_pages_parser._optionals.title = bad_pages_optionals_title + bad_pages_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + bad_pages_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_command_modifiers(bad_pages_parser) + self._add_device_arguments(bad_pages_parser, required=False) + + # Optional Args + bad_pages_parser.add_argument('-p', '--pending', action='store_true', required=False, help=pending_help) + bad_pages_parser.add_argument('-r', '--retired', action='store_true', required=False, help=retired_help) + bad_pages_parser.add_argument('-u', '--un-res', action='store_true', required=False, help=un_res_help) + + + def _add_metric_parser(self, subparsers, func): + # Subparser help text + metric_help = "Gets metric/performance information about the specified GPU" + metric_subcommand_help = "If no GPU is specified, returns metric information for all GPUs on the system.\ + \nIf no metric argument is provided all metric information will be displayed." + metric_optionals_title = "Metric arguments" + + # Optional arguments help text + usage_help = "Displays engine usage information" + + # Help text for Arguments only Available on Linux Virtual OS and Baremetal platforms + mem_usage_help = "Memory usage per block" + + # Help text for Arguments only on Hypervisor and Baremetal platforms + power_help = "Current power usage" + clock_help = "Average, max, and current clock frequencies" + temperature_help = "Current temperatures" + ecc_help = "Total number of ECC errors" + ecc_block_help = "Number of ECC errors per block" + pcie_help = "Current PCIe speed, width, and replay count" + + # Help text for Arguments only on Linux Baremetal platforms + fan_help = "Current fan speed" + vc_help = "Display voltage curve" + overdrive_help = "Current GPU clock overdrive level" + perf_level_help = "Current DPM performance level" + xgmi_err_help = "XGMI error information since last read" + energy_help = "Amount of energy consumed" + + # Help text for Arguments only on Hypervisors + schedule_help = "All scheduling information" + guard_help = "All guard information" + guest_data_help = "All guest data information" + fb_usage_help = "Displays total and used Frame Buffer usage information" + xgmi_help = "Table of current XGMI metrics information" + + # Help text for cpu options + cpu_power_metrics_help = "CPU power metrics" + cpu_proc_help = "Displays prochot status" + cpu_freq_help = "Displays currentFclkMemclk frequencies and cclk frequency limit" + cpu_c0_res_help = "Displays C0 residency" + cpu_lclk_dpm_help = "Displays lclk dpm level range. Requires socket ID and NBOID as inputs" + cpu_pwr_svi_telemtry_rails_help = "Displays svi based telemetry for all rails" + cpu_io_bandwidth_help = "Displays current IO bandwidth for the selected CPU.\ + \n input parameters are bandwidth type(1) and link ID encodings\ + \n i.e. P2, P3, G0 - G7" + cpu_xgmi_bandwidth_help = "Displays current XGMI bandwidth for the selected CPU\ + \n input parameters are bandwidth type(1,2,4) and link ID encodings\ + \n i.e. P2, P3, G0 - G7" + cpu_metrics_ver_help = "Displays metrics table version" + cpu_metrics_table_help = "Displays metric table" + cpu_socket_energy_help = "Displays socket energy for the selected CPU socket" + cpu_ddr_bandwidth_help = "Displays per socket max ddr bw, current utilized bw,\ + \n and current utilized ddr bw in percentage" + cpu_temp_help = "Displays cpu socket temperature" + cpu_dimm_temp_range_rate_help = "Displays dimm temperature range and refresh rate" + cpu_dimm_pow_consumption_help = "Displays dimm power consumption" + cpu_dimm_thermal_sensor_help = "Displays dimm thermal sensor" + + # Help text for core options + core_energy_help = "Displays core energy for the selected core" + core_boost_limit_help = "Get boost limit for the selected cores" + core_curr_active_freq_core_limit_help = "Get Current CCLK limit set per Core" + + # Create metric subparser + metric_parser = subparsers.add_parser('metric', help=metric_help, description=metric_subcommand_help) + metric_parser._optionals.title = metric_optionals_title + metric_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + metric_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_device_arguments(metric_parser, required=False) + + # Add Watch args + self._add_watch_arguments(metric_parser) + + # Optional Args for Linux Virtual OS and Baremetal systems + if not self.helpers.is_hypervisor() and not self.helpers.is_windows(): + metric_parser.add_argument('-m', '--mem-usage', action='store_true', required=False, help=mem_usage_help) + + if self.helpers.is_amdgpu_initialized(): + # Optional Args for Hypervisors and Baremetal systems + if self.helpers.is_hypervisor() or self.helpers.is_baremetal() or self.helpers.is_linux(): + metric_parser.add_argument('-u', '--usage', action='store_true', required=False, help=usage_help) + metric_parser.add_argument('-p', '--power', action='store_true', required=False, help=power_help) + metric_parser.add_argument('-c', '--clock', action='store_true', required=False, help=clock_help) + metric_parser.add_argument('-t', '--temperature', action='store_true', required=False, help=temperature_help) + metric_parser.add_argument('-e', '--ecc', action='store_true', required=False, help=ecc_help) + metric_parser.add_argument('-P', '--pcie', action='store_true', required=False, help=pcie_help) + + # Optional Args for Linux Baremetal Systems + if self.helpers.is_baremetal() and self.helpers.is_linux(): + metric_parser.add_argument('-k', '--ecc-block', action='store_true', required=False, help=ecc_block_help) + metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help) + metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help) + metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help) + metric_parser.add_argument('-l', '--perf-level', action='store_true', required=False, help=perf_level_help) + metric_parser.add_argument('-x', '--xgmi-err', action='store_true', required=False, help=xgmi_err_help) + metric_parser.add_argument('-E', '--energy', action='store_true', required=False, help=energy_help) + + # Options to only display to Hypervisors + if self.helpers.is_hypervisor(): + metric_parser.add_argument('-s', '--schedule', action='store_true', required=False, help=schedule_help) + metric_parser.add_argument('-G', '--guard', action='store_true', required=False, help=guard_help) + metric_parser.add_argument('-u', '--guest-data', action='store_true', required=False, help=guest_data_help) + metric_parser.add_argument('-f', '--fb-usage', action='store_true', required=False, help=fb_usage_help) + metric_parser.add_argument('-m', '--xgmi', action='store_true', required=False, help=xgmi_help) + + if self.helpers.is_amd_hsmp_initialized(): + # Optional Args for CPUs + cpu_group = metric_parser.add_argument_group("CPU Arguments") + cpu_group.add_argument('--cpu-power-metrics', action='store_true', required=False, help=cpu_power_metrics_help) + cpu_group.add_argument('--cpu-prochot', action='store_true', required=False, help=cpu_proc_help) + cpu_group.add_argument('--cpu-freq-metrics', action='store_true', required=False, help=cpu_freq_help) + cpu_group.add_argument('--cpu-c0-res', action='store_true', required=False, help=cpu_c0_res_help) + cpu_group.add_argument('--cpu-lclk-dpm-level', action='append', required=False, type=self._validate_positive, + nargs=1, metavar=("NBIOID"), help=cpu_lclk_dpm_help) + cpu_group.add_argument('--cpu-pwr-svi-telemtry-rails', action='store_true', required=False, + help=cpu_pwr_svi_telemtry_rails_help) + cpu_group.add_argument('--cpu-io-bandwidth', action='append', required=False, nargs=2, + metavar=("IO_BW", "LINKID_NAME"), help=cpu_io_bandwidth_help) + cpu_group.add_argument('--cpu-xgmi-bandwidth', action='append', required=False, nargs=2, + metavar=("XGMI_BW", "LINKID_NAME"), help=cpu_xgmi_bandwidth_help) + cpu_group.add_argument('--cpu-metrics-ver', action='store_true', required=False, help=cpu_metrics_ver_help) + cpu_group.add_argument('--cpu-metrics-table', action='store_true', required=False, help=cpu_metrics_table_help) + cpu_group.add_argument('--cpu-socket-energy', action='store_true', required=False, help=cpu_socket_energy_help) + cpu_group.add_argument('--cpu-ddr-bandwidth', action='store_true', required=False, help=cpu_ddr_bandwidth_help) + cpu_group.add_argument('--cpu-temp', action='store_true', required=False, help=cpu_temp_help) + cpu_group.add_argument('--cpu-dimm-temp-range-rate', action='append', required=False, type=lambda x: int(x, 0), + nargs=1, metavar=("DIMM_ADDR"), help=cpu_dimm_temp_range_rate_help) + cpu_group.add_argument('--cpu-dimm-pow-consumption', action='append', required=False, type=lambda x: int(x, 0), + nargs=1, metavar=("DIMM_ADDR"), help=cpu_dimm_pow_consumption_help) + cpu_group.add_argument('--cpu-dimm-thermal-sensor', action='append', required=False, type=lambda x: int(x, 0), + nargs=1, metavar=("DIMM_ADDR"), help=cpu_dimm_thermal_sensor_help) + + # Optional Args for CPU cores + core_group = metric_parser.add_argument_group("CPU Core Arguments") + core_group.add_argument('--core-boost-limit', action='store_true', required=False, help=core_boost_limit_help) + core_group.add_argument('--core-curr-active-freq-core-limit', action='store_true', required=False, + help=core_curr_active_freq_core_limit_help) + core_group.add_argument('--core-energy', action='store_true', required=False, help=core_energy_help) + + # Add command modifiers to the bottom + self._add_command_modifiers(metric_parser) + + + def _add_process_parser(self, subparsers, func): + if self.helpers.is_hypervisor(): + # Don't add this subparser on Hypervisors + # This subparser is only available to Guest and Baremetal systems + return + + if not self.helpers.is_amdgpu_initialized(): + # The process subcommand is currently only applicable to systems with amdgpu initialized + return + + # Subparser help text + process_help = "Lists general process information running on the specified GPU" + process_subcommand_help = "If no GPU is specified, returns information for all GPUs on the system.\ + \nIf no process argument is provided all process information will be displayed." + process_optionals_title = "Process arguments" + + # Optional Arguments help text + general_help = "pid, process name, memory usage" + engine_help = "All engine usages" + pid_help = "Gets all process information about the specified process based on Process ID" + name_help = "Gets all process information about the specified process based on Process Name.\ + \nIf multiple processes have the same name information is returned for all of them." + + + # Create process subparser + process_parser = subparsers.add_parser('process', help=process_help, description=process_subcommand_help) + process_parser._optionals.title = process_optionals_title + process_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + process_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_command_modifiers(process_parser) + self._add_device_arguments(process_parser, required=False) + + # Add Watch args + self._add_watch_arguments(process_parser) + + # Optional Args + process_parser.add_argument('-G', '--general', action='store_true', required=False, help=general_help) + process_parser.add_argument('-e', '--engine', action='store_true', required=False, help=engine_help) + process_parser.add_argument('-p', '--pid', action='store', type=self._not_negative_int, required=False, help=pid_help) + process_parser.add_argument('-n', '--name', action='store', required=False, help=name_help) + + + def _add_profile_parser(self, subparsers, func): + if not (self.helpers.is_windows() and self.helpers.is_hypervisor()): + # This subparser only applies to Hypervisors + return + + # Subparser help text + profile_help = "Displays information about all profiles and current profile" + profile_subcommand_help = "If no GPU is specified, returns information for all GPUs on the system." + profile_optionals_title = "Profile Arguments" + + # Create profile subparser + profile_parser = subparsers.add_parser('profile', help=profile_help, description=profile_subcommand_help) + profile_parser._optionals.title = profile_optionals_title + profile_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + profile_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_command_modifiers(profile_parser) + self._add_device_arguments(profile_parser, required=False) + + + def _add_event_parser(self, subparsers, func): + if not self.helpers.is_amdgpu_initialized(): + # The event subcommand is only applicable to systems with amdgpu initialized + return + + # Subparser help text + event_help = "Displays event information for the given GPU" + event_subcommand_help = "If no GPU is specified, returns event information for all GPUs on the system." + event_optionals_title = "Event Arguments" + + # Create event subparser + event_parser = subparsers.add_parser('event', help=event_help, description=event_subcommand_help) + event_parser._optionals.title = event_optionals_title + event_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + event_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_command_modifiers(event_parser) + self._add_device_arguments(event_parser, required=False) + + + def _add_topology_parser(self, subparsers, func): + if not(self.helpers.is_baremetal() and self.helpers.is_linux()): + # This subparser is only applicable to Baremetal Linux + return + + if not self.helpers.is_amdgpu_initialized(): + # The topology subcommand is only applicable to systems with amdgpu initialized + return + + # Subparser help text + topology_help = "Displays topology information of the devices" + topology_subcommand_help = "If no GPU is specified, returns information for all GPUs on the system.\ + \nIf no topology argument is provided all topology information will be displayed." + topology_optionals_title = "Topology arguments" + + # Help text for Arguments only on Guest and BM platforms + access_help = "Displays link accessibility between GPUs" + weight_help = "Displays relative weight between GPUs" + hops_help = "Displays the number of hops between GPUs" + link_type_help = "Displays the link type between GPUs" + numa_bw_help = "Display max and min bandwidth between nodes" + + # Create topology subparser + topology_parser = subparsers.add_parser('topology', help=topology_help, description=topology_subcommand_help) + topology_parser._optionals.title = topology_optionals_title + topology_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + topology_parser.set_defaults(func=func) + + # Add Universal Arguments + self._add_command_modifiers(topology_parser) + self._add_device_arguments(topology_parser, required=False) + + # Optional Args + topology_parser.add_argument('-a', '--access', action='store_true', required=False, help=access_help) + topology_parser.add_argument('-w', '--weight', action='store_true', required=False, help=weight_help) + topology_parser.add_argument('-o', '--hops', action='store_true', required=False, help=hops_help) + topology_parser.add_argument('-t', '--link-type', action='store_true', required=False, help=link_type_help) + topology_parser.add_argument('-b', '--numa-bw', action='store_true', required=False, help=numa_bw_help) + + + def _add_set_value_parser(self, subparsers, func): + if not(self.helpers.is_baremetal() and self.helpers.is_linux()): + # This subparser is only applicable to Baremetal Linux + return + + # Subparser help text + set_value_help = "Set options for devices" + set_value_subcommand_help = "A GPU must be specified to set a configuration.\ + \nA set argument must be provided; Multiple set arguments are accepted" + set_value_optionals_title = "Set Arguments" + + # Help text for Arguments only on BM platforms + set_fan_help = "Set GPU fan speed (0-255 or 0-100%%)" + set_perf_level_help = "Set performance level" + set_profile_help = "Set power profile level (#) or a quoted string of custom profile attributes" + set_perf_det_help = "Set GPU clock frequency limit and performance level to determinism to get minimal performance variation" + compute_partition_choices_str = ", ".join(self.helpers.get_compute_partition_types()) + memory_partition_choices_str = ", ".join(self.helpers.get_memory_partition_types()) + set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}" + set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" + set_power_cap_help = "Set power capacity limit" + + # Help text for CPU set options + set_cpu_pwr_limit_help = "Set power limit for the given socket. Input parameter is power limit value." + set_cpu_xgmi_link_width_help = "Set max and Min linkwidth. Input parameters are min and max link width values" + set_cpu_lclk_dpm_level_help = "Sets the max and min dpm level on a given NBIO.\ + \n Input parameters are die_index, min dpm, max dpm." + set_cpu_pwr_eff_mode_help = "Sets the power efficency mode policy. Input parameter is mode." + set_cpu_gmi3_link_width_help = "Sets max and min gmi3 link width range" + set_cpu_pcie_link_rate_help = "Sets pcie link rate" + set_cpu_df_pstate_range_help = "Sets max and min df-pstates" + set_cpu_enable_apb_help = "Enables the DF p-state performance boost algorithm" + set_cpu_disable_apb_help = "Disables the DF p-state performance boost algorithm. Input parameter is DFPstate (0-3)" + set_soc_boost_limit_help = "Sets the boost limit for the given socket. Input parameter is socket BOOST_LIMIT value" + + # Help text for CPU Core set options + set_core_boost_limit_help = "Sets the boost limit for the given core. Input parameter is core BOOST_LIMIT value" + + # Create set_value subparser + set_value_parser = subparsers.add_parser('set', help=set_value_help, description=set_value_subcommand_help) + set_value_parser._optionals.title = set_value_optionals_title + set_value_parser.formatter_class=lambda prog: AMDSMISubparserHelpFormatter(prog) + set_value_parser.set_defaults(func=func) + + # Device args are required as safeguard from the user applying the operation to all gpus unintentionally + self._add_device_arguments(set_value_parser, required=True) + + if self.helpers.is_amdgpu_initialized(): + # Optional GPU Args + set_value_parser.add_argument('-f', '--fan', action=self._validate_fan_speed(), required=False, help=set_fan_help, metavar='%') + set_value_parser.add_argument('-l', '--perf-level', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL') + set_value_parser.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='SETPROFILE') + set_value_parser.add_argument('-d', '--perf-determinism', action='store', type=self._not_negative_int, required=False, help=set_perf_det_help, metavar='SCLKMAX') + set_value_parser.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION') + set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') + set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') + + if self.helpers.is_amd_hsmp_initialized(): + # Optional CPU Args + cpu_group = set_value_parser.add_argument_group("CPU Arguments") + cpu_group.add_argument('--cpu-pwr-limit', action='append', required=False, type=self._validate_positive, nargs=1, metavar=("PWR_LIMIT"), help=set_cpu_pwr_limit_help) + cpu_group.add_argument('--cpu-xgmi-link-width', action='append', required=False, type=self._validate_positive, nargs=2, metavar=("MIN_WIDTH", "MAX_WIDTH"), help=set_cpu_xgmi_link_width_help) + cpu_group.add_argument('--cpu-lclk-dpm-level', action='append', required=False, type=self._validate_positive, nargs=3, metavar=("NBIOID", "MIN_DPM", "MAX_DPM"), help=set_cpu_lclk_dpm_level_help) + cpu_group.add_argument('--cpu-pwr-eff-mode', action='append', required=False, type=self._validate_positive, nargs=1, metavar=("MODE"), help=set_cpu_pwr_eff_mode_help) + cpu_group.add_argument('--cpu-gmi3-link-width', action='append', required=False, type=self._validate_positive, nargs=2, metavar=("MIN_LW", "MAX_LW"), help=set_cpu_gmi3_link_width_help) + cpu_group.add_argument('--cpu-pcie-link-rate', action='append', required=False, type=self._validate_positive, nargs=1, metavar=("LINK_RATE"), help=set_cpu_pcie_link_rate_help) + cpu_group.add_argument('--cpu-df-pstate-range', action='append', required=False, type=self._validate_positive, nargs=2, metavar=("MAX_PSTATE", "MIN_PSTATE"), help=set_cpu_df_pstate_range_help) + cpu_group.add_argument('--cpu-enable-apb', action='store_true', required=False, help=set_cpu_enable_apb_help) + cpu_group.add_argument('--cpu-disable-apb', action='append', required=False, type=self._validate_positive, nargs=1, metavar=("DF_PSTATE"), help=set_cpu_disable_apb_help) + cpu_group.add_argument('--soc-boost-limit', action='append', required=False, type=self._validate_positive, nargs=1, metavar=("BOOST_LIMIT"), help=set_soc_boost_limit_help) + + # Optional CPU Core Args + core_group = set_value_parser.add_argument_group("CPU Core Arguments") + core_group.add_argument('--core-boost-limit', action='append', required=False, type=self._validate_positive, nargs=1, metavar=("BOOST_LIMIT"), help=set_core_boost_limit_help) + + # Add command modifiers to the bottom + self._add_command_modifiers(set_value_parser) + + def _add_reset_parser(self, subparsers, func): if not(self.helpers.is_baremetal() and self.helpers.is_linux()): # This subparser is only applicable to Baremetal Linux return + if not self.helpers.is_amdgpu_initialized(): + # The reset subcommand is only applicable to systems with amdgpu initialized + return + # Subparser help text reset_help = "Reset options for devices" reset_subcommand_help = "A GPU must be specified to reset a configuration.\ @@ -998,6 +1073,10 @@ core limit value" # This subparser is only applicable to Linux return + if not self.helpers.is_amdgpu_initialized(): + # The monitor subcommand is only applicable to systems with amdgpu initialized + return + # Subparser help text monitor_help = "Monitor metrics for target devices" monitor_subcommand_help = "Monitor a target device for the specified arguments.\ diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 0233c3cf24..1b599d5632 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -67,12 +67,12 @@ extern "C" { * Initialization flags may be OR'd together and passed to ::amdsmi_init(). */ typedef enum { - AMDSMI_INIT_ALL_PROCESSORS = 0x0, // Default option + AMDSMI_INIT_ALL_PROCESSORS = 0xFFFFFFFF, //!< Initialize all processors AMDSMI_INIT_AMD_CPUS = (1 << 0), AMDSMI_INIT_AMD_GPUS = (1 << 1), AMDSMI_INIT_NON_AMD_CPUS = (1 << 2), AMDSMI_INIT_NON_AMD_GPUS = (1 << 3), - AMDSMI_INIT_AMD_APUS = (AMDSMI_INIT_AMD_CPUS | AMDSMI_INIT_AMD_GPUS) + AMDSMI_INIT_AMD_APUS = (AMDSMI_INIT_AMD_CPUS | AMDSMI_INIT_AMD_GPUS) // Default option } amdsmi_init_flags_t; /* Maximum size definitions AMDSMI */ diff --git a/py-interface/README.md b/py-interface/README.md index b4c79a351d..a1747f31a6 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -73,7 +73,7 @@ except AmdSmiException as e: ### amdsmi_init -Description: Initialize amdsmi lib and connect to driver +Description: Dynamically initialize amdsmi with amd_hsmp and amdgpu drivers Input parameters: `None` @@ -87,7 +87,12 @@ Example: ```python try: - amdsmi_init() + init_flag = amdsmi_init() + # Print out integer bitmask of initialized drivers + # 1 is for amd_hsmp + # 2 is for amdgpu + # 3 is for amd_hsmp and amdgpu + print(init_flag) # continue with amdsmi except AmdSmiException as e: print("Init failed") diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 8b49fd9b2c..7656b9fa8b 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -196,14 +196,14 @@ _libraries['FIXME_STUB'] = FunctionFactoryStub() # ctypes.CDLL('FIXME_STUB') # values for enumeration 'amdsmi_init_flags_t' amdsmi_init_flags_t__enumvalues = { - 0: 'AMDSMI_INIT_ALL_PROCESSORS', + 4294967295: 'AMDSMI_INIT_ALL_PROCESSORS', 1: 'AMDSMI_INIT_AMD_CPUS', 2: 'AMDSMI_INIT_AMD_GPUS', 4: 'AMDSMI_INIT_NON_AMD_CPUS', 8: 'AMDSMI_INIT_NON_AMD_GPUS', 3: 'AMDSMI_INIT_AMD_APUS', } -AMDSMI_INIT_ALL_PROCESSORS = 0 +AMDSMI_INIT_ALL_PROCESSORS = 4294967295 AMDSMI_INIT_AMD_CPUS = 1 AMDSMI_INIT_AMD_GPUS = 2 AMDSMI_INIT_NON_AMD_CPUS = 4 diff --git a/src/amd_smi/amd_smi_system.cc b/src/amd_smi/amd_smi_system.cc index e46eb71784..7b0da49bf5 100644 --- a/src/amd_smi/amd_smi_system.cc +++ b/src/amd_smi/amd_smi_system.cc @@ -261,7 +261,7 @@ amdsmi_status_t AMDSmiSystem::cleanup() { processors_.clear(); sockets_.clear(); esmi_exit(); - init_flag_ = AMDSMI_INIT_ALL_PROCESSORS; + init_flag_ &= ~AMDSMI_INIT_AMD_CPUS; } #endif if (init_flag_ & AMDSMI_INIT_AMD_GPUS) { @@ -270,7 +270,7 @@ amdsmi_status_t AMDSmiSystem::cleanup() { } processors_.clear(); sockets_.clear(); - init_flag_ = AMDSMI_INIT_ALL_PROCESSORS; + init_flag_ &= ~AMDSMI_INIT_AMD_GPUS; rsmi_status_t ret = rsmi_shut_down(); if (ret != RSMI_STATUS_SUCCESS) { return amd::smi::rsmi_to_amdsmi_status(ret);