diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 2eb9fcd45a..02bcc45f35 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -3903,19 +3903,24 @@ class AMDSMICommands(): args.gpu = device_handle # Error if no subcommand args are passed - if not any([args.fan is not None, - args.perf_level, - args.profile, - args.compute_partition, - args.memory_partition, - args.perf_determinism is not None, - args.power_cap is not None, - args.soc_pstate is not None, - args.xgmi_plpd is not None, - args.process_isolation is not None, - args.clk_limit is not None]): - command = " ".join(sys.argv[1:]) - raise AmdSmiRequiredCommandException(command, self.logger.format) + if self.helpers.is_baremetal(): + if not any([args.fan is not None, + args.perf_level, + args.profile, + args.compute_partition, + args.memory_partition, + args.perf_determinism is not None, + args.power_cap is not None, + args.soc_pstate is not None, + args.xgmi_plpd is not None, + args.clk_limit is not None, + args.process_isolation is not None]): + command = " ".join(sys.argv[1:]) + raise AmdSmiRequiredCommandException(command, self.logger.format) + else: + if not any([args.process_isolation is not None]): + command = " ".join(sys.argv[1:]) + raise AmdSmiRequiredCommandException(command, self.logger.format) # Build GPU string for errors try: @@ -3929,100 +3934,113 @@ class AMDSMICommands(): gpu_string = f"GPU ID: {gpu_id} BDF:{gpu_bdf}" # Handle args - if isinstance(args.fan, int): - try: - amdsmi_interface.amdsmi_set_gpu_fan_speed(args.gpu, 0, args.fan) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set fan speed {args.fan} on {gpu_string}") from e - - self.logger.store_output(args.gpu, 'fan', f"Successfully set fan speed {args.fan}") - if args.perf_level: - perf_level = amdsmi_interface.AmdSmiDevPerfLevel[args.perf_level] - try: - amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, perf_level) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set performance level {args.perf_level} on {gpu_string}") from e - - self.logger.store_output(args.gpu, 'perflevel', f"Successfully set performance level {args.perf_level}") - if args.profile: - self.logger.store_output(args.gpu, 'profile', "Not Yet Implemented") - if isinstance(args.perf_determinism, int): - try: - amdsmi_interface.amdsmi_set_gpu_perf_determinism_mode(args.gpu, args.perf_determinism) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set performance determinism and clock frequency to {args.perf_determinism} on {gpu_string}") from e - - self.logger.store_output(args.gpu, 'perfdeterminism', f"Successfully enabled performance determinism and set GFX clock frequency to {args.perf_determinism}") - if args.compute_partition: - compute_partition = amdsmi_interface.AmdSmiComputePartitionType[args.compute_partition] - try: - amdsmi_interface.amdsmi_set_gpu_compute_partition(args.gpu, compute_partition) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set compute partition to {args.compute_partition} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'computepartition', f"Successfully set compute partition to {args.compute_partition}") - if args.memory_partition: - memory_partition = amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition] - try: - amdsmi_interface.amdsmi_set_gpu_memory_partition(args.gpu, memory_partition) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}") - if isinstance(args.power_cap, int): - try: - power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) - logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") - min_power_cap = power_cap_info["min_power_cap"] - min_power_cap = self.helpers.convert_SI_unit(min_power_cap, AMDSMIHelpers.SI_Unit.MICRO) - max_power_cap = power_cap_info["max_power_cap"] - max_power_cap = self.helpers.convert_SI_unit(max_power_cap, AMDSMIHelpers.SI_Unit.MICRO) - current_power_cap = power_cap_info["power_cap"] - current_power_cap = self.helpers.convert_SI_unit(current_power_cap, AMDSMIHelpers.SI_Unit.MICRO) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(f"Unable to get power cap info from {gpu_string}") from e - - if args.power_cap == current_power_cap: - self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {args.power_cap}") - elif args.power_cap >= min_power_cap and args.power_cap <= max_power_cap: + if self.helpers.is_baremetal(): + if isinstance(args.fan, int): try: - new_power_cap = self.helpers.convert_SI_unit(args.power_cap, AMDSMIHelpers.SI_Unit.BASE, - AMDSMIHelpers.SI_Unit.MICRO) - amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, new_power_cap) + amdsmi_interface.amdsmi_set_gpu_fan_speed(args.gpu, 0, args.fan) except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set power cap to {args.power_cap} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {args.power_cap}") - else: - # setting power cap to 0 will return the current power cap so the technical minimum value is 1 - if min_power_cap == 0: - min_power_cap = 1 - self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap} and {max_power_cap}") - if isinstance(args.soc_pstate, int): - try: - amdsmi_interface.amdsmi_set_soc_pstate(args.gpu, args.soc_pstate) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set dpm soc pstate policy to {args.soc_pstate} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'socpstate', f"Successfully soc pstate dpm policy to id {args.soc_pstate}") - if isinstance(args.xgmi_plpd, int): - try: - amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.xgmi_plpd}") + raise ValueError(f"Unable to set fan speed {args.fan} on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'fan', f"Successfully set fan speed {args.fan}") + if args.perf_level: + perf_level = amdsmi_interface.AmdSmiDevPerfLevel[args.perf_level] + try: + amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, perf_level) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set performance level {args.perf_level} on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'perflevel', f"Successfully set performance level {args.perf_level}") + if args.profile: + self.logger.store_output(args.gpu, 'profile', "Not Yet Implemented") + if isinstance(args.perf_determinism, int): + try: + amdsmi_interface.amdsmi_set_gpu_perf_determinism_mode(args.gpu, args.perf_determinism) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set performance determinism and clock frequency to {args.perf_determinism} on {gpu_string}") from e + + self.logger.store_output(args.gpu, 'perfdeterminism', f"Successfully enabled performance determinism and set GFX clock frequency to {args.perf_determinism}") + if args.compute_partition: + compute_partition = amdsmi_interface.AmdSmiComputePartitionType[args.compute_partition] + try: + amdsmi_interface.amdsmi_set_gpu_compute_partition(args.gpu, compute_partition) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set compute partition to {args.compute_partition} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'computepartition', f"Successfully set compute partition to {args.compute_partition}") + if args.memory_partition: + memory_partition = amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition] + try: + amdsmi_interface.amdsmi_set_gpu_memory_partition(args.gpu, memory_partition) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}") + if isinstance(args.power_cap, int): + try: + power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) + logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") + min_power_cap = power_cap_info["min_power_cap"] + min_power_cap = self.helpers.convert_SI_unit(min_power_cap, AMDSMIHelpers.SI_Unit.MICRO) + max_power_cap = power_cap_info["max_power_cap"] + max_power_cap = self.helpers.convert_SI_unit(max_power_cap, AMDSMIHelpers.SI_Unit.MICRO) + current_power_cap = power_cap_info["power_cap"] + current_power_cap = self.helpers.convert_SI_unit(current_power_cap, AMDSMIHelpers.SI_Unit.MICRO) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to get power cap info from {gpu_string}") from e + + if args.power_cap == current_power_cap: + self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {args.power_cap}") + elif args.power_cap >= min_power_cap and args.power_cap <= max_power_cap: + try: + new_power_cap = self.helpers.convert_SI_unit(args.power_cap, AMDSMIHelpers.SI_Unit.BASE, + AMDSMIHelpers.SI_Unit.MICRO) + amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, new_power_cap) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set power cap to {args.power_cap} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {args.power_cap}") + else: + # setting power cap to 0 will return the current power cap so the technical minimum value is 1 + if min_power_cap == 0: + min_power_cap = 1 + self.logger.store_output(args.gpu, 'powercap', f"Power cap must be between {min_power_cap} and {max_power_cap}") + if isinstance(args.soc_pstate, int): + try: + amdsmi_interface.amdsmi_set_soc_pstate(args.gpu, args.soc_pstate) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set dpm soc pstate policy to {args.soc_pstate} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'socpstate', f"Successfully soc pstate dpm policy to id {args.soc_pstate}") + if isinstance(args.xgmi_plpd, int): + try: + amdsmi_interface.amdsmi_set_xgmi_plpd(args.gpu, args.xgmi_plpd) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set XGMI policy to {args.xgmi_plpd} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'xgmiplpd', f"Successfully set per-link power down policy to id {args.xgmi_plpd}") + if isinstance(args.clk_limit, tuple): + try: + clk_type = args.clk_limit.clk_type + lim_type = args.clk_limit.lim_type + val = args.clk_limit.val + amdsmi_interface.amdsmi_set_gpu_clk_limit(args.gpu, clk_type, lim_type, val) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to set {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'clk_limit', f"Successfully changed {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val}") + if isinstance(args.process_isolation, int): status_string = "Enabled" if args.process_isolation else "Disabled" result = f"Requested process isolation to {status_string}" # This should not print out @@ -4039,17 +4057,6 @@ class AMDSMICommands(): raise ValueError(f"Unable to set process isolation to {status_string} on {gpu_string}") from e self.logger.store_output(args.gpu, 'process_isolation', result) - if isinstance(args.clk_limit, tuple): - try: - clk_type = args.clk_limit.clk_type - lim_type = args.clk_limit.lim_type - val = args.clk_limit.val - amdsmi_interface.amdsmi_set_gpu_clk_limit(args.gpu, clk_type, lim_type, val) - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'clk_limit', f"Successfully changed {args.clk_limit.lim_type} of {args.clk_limit.clk_type} to {args.clk_limit.val}") if multiple_devices: self.logger.store_multiple_device_output() @@ -4262,159 +4269,163 @@ class AMDSMICommands(): gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) # Error if no subcommand args are passed - if not any([args.gpureset, args.clocks, args.fans, args.profile, args.xgmierr, \ - args.perf_determinism, args.compute_partition, args.memory_partition, \ - args.power_cap, args.clean_local_data]): - command = " ".join(sys.argv[1:]) - raise AmdSmiRequiredCommandException(command, self.logger.format) + if self.helpers.is_baremetal(): + if not any([args.gpureset, args.clocks, args.fans, args.profile, args.xgmierr, \ + args.perf_determinism, args.compute_partition, args.memory_partition, \ + args.power_cap, args.clean_local_data]): + command = " ".join(sys.argv[1:]) + raise AmdSmiRequiredCommandException(command, self.logger.format) + else: + if not any([args.clean_local_data]): + command = " ".join(sys.argv[1:]) + raise AmdSmiRequiredCommandException(command, self.logger.format) - if args.gpureset: - if self.helpers.is_amd_device(args.gpu): + if self.helpers.is_baremetal(): + if args.gpureset: + if self.helpers.is_amd_device(args.gpu): + try: + amdsmi_interface.amdsmi_reset_gpu(args.gpu) + result = 'Successfully reset GPU' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + result = "Failed to reset GPU" + else: + result = 'Unable to reset non-amd GPU' + self.logger.store_output(args.gpu, 'gpu_reset', result) + if args.clocks: + reset_clocks_results = {'overdrive': '', + 'clocks': '', + 'performance': ''} try: - amdsmi_interface.amdsmi_reset_gpu(args.gpu) - result = 'Successfully reset GPU' + amdsmi_interface.amdsmi_set_gpu_overdrive_level(args.gpu, 0) + reset_clocks_results['overdrive'] = 'Overdrive set to 0' except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e - result = "Failed to reset GPU" - else: - result = 'Unable to reset non-amd GPU' + reset_clocks_results['overdrive'] = "N/A" + logging.debug("Failed to reset overdrive on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.store_output(args.gpu, 'gpu_reset', result) - if args.clocks: - reset_clocks_results = {'overdrive': '', - 'clocks': '', - 'performance': ''} - try: - amdsmi_interface.amdsmi_set_gpu_overdrive_level(args.gpu, 0) - reset_clocks_results['overdrive'] = 'Overdrive set to 0' - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - reset_clocks_results['overdrive'] = "N/A" - logging.debug("Failed to reset overdrive on gpu %s | %s", gpu_id, e.get_error_info()) - - try: - level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO - amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto) - reset_clocks_results['clocks'] = 'Successfully reset clocks' - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - reset_clocks_results['clocks'] = "N/A" - logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) - - try: - level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO - amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto) - reset_clocks_results['performance'] = 'Performance level reset to auto' - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - reset_clocks_results['performance'] = "N/A" - logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) - - self.logger.store_output(args.gpu, 'reset_clocks', reset_clocks_results) - if args.fans: - try: - amdsmi_interface.amdsmi_reset_gpu_fan(args.gpu, 0) - result = 'Successfully reset fan speed to driver control' - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - result = "N/A" - logging.debug("Failed to reset fans on gpu %s | %s", gpu_id, e.get_error_info()) - - self.logger.store_output(args.gpu, 'reset_fans', result) - if args.profile: - reset_profile_results = {'power_profile' : '', - 'performance_level': ''} - try: - power_profile_mask = amdsmi_interface.AmdSmiPowerProfilePresetMasks.BOOTUP_DEFAULT - amdsmi_interface.amdsmi_set_gpu_power_profile(args.gpu, 0, power_profile_mask) - reset_profile_results['power_profile'] = 'Successfully reset Power Profile' - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - reset_profile_results['power_profile'] = "N/A" - logging.debug("Failed to reset power profile on gpu %s | %s", gpu_id, e.get_error_info()) - - try: - level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO - amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto) - reset_profile_results['performance_level'] = 'Successfully reset Performance Level' - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - reset_profile_results['performance_level'] = "N/A" - logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) - - self.logger.store_output(args.gpu, 'reset_profile', reset_profile_results) - if args.xgmierr: - try: - amdsmi_interface.amdsmi_reset_gpu_xgmi_error(args.gpu) - result = 'Successfully reset XGMI Error count' - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - result = "N/A" - logging.debug("Failed to reset xgmi error count on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.store_output(args.gpu, 'reset_xgmi_err', result) - if args.perf_determinism: - try: - level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO - amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto) - result = 'Successfully disabled performance determinism' - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - result = "N/A" - logging.debug("Failed to set perf level on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.store_output(args.gpu, 'reset_perf_determinism', result) - if args.compute_partition: - try: - amdsmi_interface.amdsmi_reset_gpu_compute_partition(args.gpu) - result = 'Successfully reset compute partition' - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - result = "N/A" - logging.debug("Failed to reset compute partition on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.store_output(args.gpu, 'reset_compute_partition', result) - if args.memory_partition: - try: - amdsmi_interface.amdsmi_reset_gpu_memory_partition(args.gpu) - result = 'Successfully reset memory partition' - except amdsmi_exception.AmdSmiLibraryException as e: - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - result = "N/A" - logging.debug("Failed to reset memory partition on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.store_output(args.gpu, 'reset_memory_partition', result) - if args.power_cap: - try: - power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) - logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") - default_power_cap_in_w = power_cap_info["default_power_cap"] - default_power_cap_in_w = self.helpers.convert_SI_unit(default_power_cap_in_w, AMDSMIHelpers.SI_Unit.MICRO) - current_power_cap_in_w = power_cap_info["power_cap"] - current_power_cap_in_w = self.helpers.convert_SI_unit(current_power_cap_in_w, AMDSMIHelpers.SI_Unit.MICRO) - except amdsmi_exception.AmdSmiLibraryException as e: - raise ValueError(f"Unable to get power cap info from {gpu_id}") from e - - if current_power_cap_in_w == default_power_cap_in_w: - self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {default_power_cap_in_w}") - else: try: - default_power_cap_in_uw = self.helpers.convert_SI_unit(default_power_cap_in_w, - AMDSMIHelpers.SI_Unit.BASE, - AMDSMIHelpers.SI_Unit.MICRO) - amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, default_power_cap_in_uw) + level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO + amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto) + reset_clocks_results['clocks'] = 'Successfully reset clocks' except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to reset power cap to {default_power_cap_in_w} on GPU {gpu_id}") from e - self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {default_power_cap_in_w}") + reset_clocks_results['clocks'] = "N/A" + logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) + + try: + level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO + amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto) + reset_clocks_results['performance'] = 'Performance level reset to auto' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + reset_clocks_results['performance'] = "N/A" + logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.store_output(args.gpu, 'reset_clocks', reset_clocks_results) + if args.fans: + try: + amdsmi_interface.amdsmi_reset_gpu_fan(args.gpu, 0) + result = 'Successfully reset fan speed to driver control' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + result = "N/A" + logging.debug("Failed to reset fans on gpu %s | %s", gpu_id, e.get_error_info()) + self.logger.store_output(args.gpu, 'reset_fans', result) + if args.profile: + reset_profile_results = {'power_profile' : '', + 'performance_level': ''} + try: + power_profile_mask = amdsmi_interface.AmdSmiPowerProfilePresetMasks.BOOTUP_DEFAULT + amdsmi_interface.amdsmi_set_gpu_power_profile(args.gpu, 0, power_profile_mask) + reset_profile_results['power_profile'] = 'Successfully reset Power Profile' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + reset_profile_results['power_profile'] = "N/A" + logging.debug("Failed to reset power profile on gpu %s | %s", gpu_id, e.get_error_info()) + + try: + level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO + amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto) + reset_profile_results['performance_level'] = 'Successfully reset Performance Level' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + reset_profile_results['performance_level'] = "N/A" + logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) + self.logger.store_output(args.gpu, 'reset_profile', reset_profile_results) + if args.xgmierr: + try: + amdsmi_interface.amdsmi_reset_gpu_xgmi_error(args.gpu) + result = 'Successfully reset XGMI Error count' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + result = "N/A" + logging.debug("Failed to reset xgmi error count on gpu %s | %s", gpu_id, e.get_error_info()) + self.logger.store_output(args.gpu, 'reset_xgmi_err', result) + if args.perf_determinism: + try: + level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO + amdsmi_interface.amdsmi_set_gpu_perf_level(args.gpu, level_auto) + result = 'Successfully disabled performance determinism' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + result = "N/A" + logging.debug("Failed to set perf level on gpu %s | %s", gpu_id, e.get_error_info()) + self.logger.store_output(args.gpu, 'reset_perf_determinism', result) + if args.compute_partition: + try: + amdsmi_interface.amdsmi_reset_gpu_compute_partition(args.gpu) + result = 'Successfully reset compute partition' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + result = "N/A" + logging.debug("Failed to reset compute partition on gpu %s | %s", gpu_id, e.get_error_info()) + self.logger.store_output(args.gpu, 'reset_compute_partition', result) + if args.memory_partition: + try: + amdsmi_interface.amdsmi_reset_gpu_memory_partition(args.gpu) + result = 'Successfully reset memory partition' + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + result = "N/A" + logging.debug("Failed to reset memory partition on gpu %s | %s", gpu_id, e.get_error_info()) + self.logger.store_output(args.gpu, 'reset_memory_partition', result) + if args.power_cap: + try: + power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) + logging.debug(f"Power cap info for gpu {gpu_id} | {power_cap_info}") + default_power_cap_in_w = power_cap_info["default_power_cap"] + default_power_cap_in_w = self.helpers.convert_SI_unit(default_power_cap_in_w, AMDSMIHelpers.SI_Unit.MICRO) + current_power_cap_in_w = power_cap_info["power_cap"] + current_power_cap_in_w = self.helpers.convert_SI_unit(current_power_cap_in_w, AMDSMIHelpers.SI_Unit.MICRO) + except amdsmi_exception.AmdSmiLibraryException as e: + raise ValueError(f"Unable to get power cap info from {gpu_id}") from e + + if current_power_cap_in_w == default_power_cap_in_w: + self.logger.store_output(args.gpu, 'powercap', f"Power cap is already set to {default_power_cap_in_w}") + else: + try: + default_power_cap_in_uw = self.helpers.convert_SI_unit(default_power_cap_in_w, + AMDSMIHelpers.SI_Unit.BASE, + AMDSMIHelpers.SI_Unit.MICRO) + amdsmi_interface.amdsmi_set_power_cap(args.gpu, 0, default_power_cap_in_uw) + except amdsmi_exception.AmdSmiLibraryException as e: + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + raise ValueError(f"Unable to reset power cap to {default_power_cap_in_w} on GPU {gpu_id}") from e + self.logger.store_output(args.gpu, 'powercap', f"Successfully set power cap to {default_power_cap_in_w}") + if args.clean_local_data: try: amdsmi_interface.amdsmi_clean_gpu_local_data(args.gpu) diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index 9965c486b5..3f4059a86f 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -1077,8 +1077,8 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_parser.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION') set_value_parser.add_argument('-M', '--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') set_value_parser.add_argument('-o', '--power-cap', action='store', type=self._positive_int, required=False, help=set_power_cap_help, metavar='WATTS') - set_value_parser.add_argument('-p', '--soc-pstate', action='store', required=False, type=self._not_negative_int, help=set_soc_pstate_help, metavar='POLICY_ID') - set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID') + set_value_parser.add_argument('-p', '--soc-pstate', action='store', required=False, type=self._not_negative_int, help=set_soc_pstate_help, metavar='POLICY_ID') + set_value_parser.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=self._not_negative_int, help=set_xgmi_plpd_help, metavar='POLICY_ID') set_value_parser.add_argument('-L', '--clk-limit', action=self._limit_select(), nargs=3, required=False, help=set_clk_limit_help, metavar=('CLK_TYPE', 'LIM_TYPE', 'VALUE')) set_value_parser.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=self._not_negative_int, required=False, help=set_process_isolation_help, metavar='STATUS')