Added set --pcie command and added more pcie info to static --bus output (#481)

* Added amd-smi set --pcie command
* Removed current pcie level due to it not being static
* Added pcie information to static --bus

---------

Signed-off-by: Pham, Gabriel <Gabriel.Pham@amd.com>
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>

[ROCm/amdsmi commit: 9e3537d778]
This commit is contained in:
Pham, Gabriel
2025-10-28 14:55:55 -05:00
committed by GitHub
parent 354886f4ff
commit 87b2fd73b8
3 changed files with 54 additions and 9 deletions
+23 -4
View File
@@ -8,7 +8,7 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
### Added
- **Added the following API's to amdsmi_interface.py**.
- **Added the following C APIs to amdsmi_interface.py**.
- amdsmi_get_cpu_handle()
- amdsmi_get_esmi_err_msg()
- amdsmi_get_gpu_event_notification()
@@ -24,6 +24,25 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
- The entry `policies` is added to the end of the dictionary to match API definition.
- The entry `plpds` is marked for deprecation as it has the same information as `policies`.
- **Added pcie levels to `amd-smi static --bus` command**.
- The `static --bus` option now lists the range of PCIe levels that a device can be set to.
- Each level is a 2-tuple composed of the PCIe speed and lane width.
```console
$ amd-smi static --bus
GPU: 0
BUS:
BDF: 0000:43:00.0
MAX_PCIE_WIDTH: 16
MAX_PCIE_SPEED: 16 GT/s
PCIE_LEVELS:
0: (2.5 GT/s, 1)
1: (5.0 GT/s, 4)
2: (16.0 GT/s, 16)
PCIE_INTERFACE_VERSION: Gen 4
SLOT_TYPE: CEM
```
- **Added evicted_time metric for kfd processes**.
- Time that queues are evicted on a GPU in milliseconds
- Added to CLI in `amd-smi monitor -q` and `amd-smi process`
@@ -1219,9 +1238,9 @@ Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to
GPU: 0
CLK_LEVEL: Successfully changed sclk perf level(s) to 5, 6
GPU: 1
CLK_LEVEL: level(s) 5, 6 is/are greater than performance levels supported for device
```
GPU: 1
CLK_LEVEL: clock level(s) 5, 6 is/are greater than sclk frequency levels supported for device GPU ID: 1 BDF:0000:46:00.0
```
- **Added new command `amd-smi static -C/--clock`**.
- This new command displays the clock frequency performance levels for the selected GPUs and clocks.
+28 -3
View File
@@ -428,6 +428,7 @@ class AMDSMICommands():
args.partition = partition
if clock:
args.clock = clock
# args.clock defaults to False, so if it was overwritten with an empty list, the option was given as an argument but with no values
if args.clock == []:
args.clock = True
@@ -534,6 +535,7 @@ class AMDSMICommands():
'bdf': "N/A",
'max_pcie_width': "N/A",
'max_pcie_speed': "N/A",
'pcie_levels': "N/A",
'pcie_interface_version': "N/A",
'slot_type': "N/A"
}
@@ -572,6 +574,21 @@ class AMDSMICommands():
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get bus info for gpu %s | %s", gpu_id, e.get_error_info())
try:
pcie_info = amdsmi_interface.amdsmi_get_gpu_pci_bandwidth(args.gpu)
num_supported = pcie_info['transfer_rate']['num_supported']
if num_supported != 0:
bus_info['pcie_levels'] = {}
for level in range(0, num_supported):
speed = str(self.helpers.convert_SI_unit(float(pcie_info['transfer_rate']['frequency'][level]), AMDSMIHelpers.SI_Unit.NANO)) + " GT/s"
width = pcie_info['lanes'][level]
level_values = (speed, width)
bus_info['pcie_levels'].update({level: level_values})
else:
bus_info['pcie_levels'] = "N/A"
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get pci bandwidth info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['bus'] = bus_info
if args.vbios:
try:
@@ -1018,7 +1035,6 @@ class AMDSMICommands():
logging.debug("Failed to get cache info for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['cache_info'] = cache_info_list
# default to printing all clocks, if in current_platform_args; otherwise print specific clocks
if 'clock' in current_platform_args and (args.clock == True or isinstance(args.clock, list)):
original_clock_args = args.clock #save original args.clock value, so we can reset for multiple devices
@@ -1056,8 +1072,15 @@ class AMDSMICommands():
try:
frequencies = amdsmi_interface.amdsmi_get_clk_freq(args.gpu, clk_type_conversion)
# some clocks may have a sysfs file but no frequencies for whatever reason.
if len(frequencies['frequency']) == 0:
freq_dict = "N/A"
continue
freq_dict = {}
freq_dict.update({'current level':frequencies['current']})
current_level = frequencies['current']
freq_dict.update({'current_level':current_level})
current_frequency = str(self.helpers.convert_SI_unit(frequencies['frequency'][current_level], AMDSMIHelpers.SI_Unit.MICRO)) + "MHz"
freq_dict.update({'current_frequency':current_frequency})
freq_dict.update({'frequency_levels':{}})
if frequencies["num_supported"] != 0:
for level in range(len(frequencies['frequency'])):
@@ -1070,6 +1093,7 @@ class AMDSMICommands():
freq_dict = "N/A"
except amdsmi_exception.AmdSmiLibraryException as e:
freq_dict = "N/A"
logging.debug("Failed to get clock info for gpu %s | %s", gpu_id, e.get_error_info())
clk_dict[clk] = freq_dict
static_dict['clock'] = clk_dict
@@ -4563,6 +4587,7 @@ class AMDSMICommands():
args.power_cap is not None,
args.soc_pstate is not None,
args.xgmi_plpd is not None,
args.pcie is not None,
args.clk_level is not None,
args.clk_limit is not None,
args.process_isolation is not None]):
@@ -5087,7 +5112,7 @@ class AMDSMICommands():
gpu_args_enabled = False
gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition",
"memory_partition", "power_cap", "soc_pstate", "xgmi_plpd",
"process_isolation", "clk_limit", "clk_level"]
"process_isolation", "clk_limit", "clk_level", "pcie"]
for attr in gpu_attributes:
if hasattr(args, attr):
if getattr(args, attr) is not None:
+3 -2
View File
@@ -1233,7 +1233,7 @@ class AMDSMIParser(argparse.ArgumentParser):
set_soc_pstate_help = f"Set the GPU soc pstate policy using policy id, an integer. Valid id's include:\n\t{soc_pstate_help_info}"
xgmi_plpd_help_info = ", ".join(self.helpers.get_xgmi_plpd_policies())
set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id, an integer. Valid id's include:\n\t{xgmi_plpd_help_info}"
set_clock_freq_help = "Set one or more sclk (aka gfxclk), mclk, fclk, pcie, or socclk frequency levels.\n\tUse `amd-smi static --clock` to find acceptable levels."
set_clock_freq_help = "Set one or more sclk (aka gfxclk), mclk, fclk, pcie, or socclk frequency levels.\n\tUse `amd-smi static --clock` to find acceptable levels.\n\tUse `amd-smi static --bus` to find acceptable pcie levels."
power_cap_min, power_cap_max = self.helpers.get_power_caps()
if power_cap_max != "N/A":
power_cap_max = self.helpers.convert_SI_unit(power_cap_max, AMDSMIHelpers.SI_Unit.MICRO)
@@ -1284,7 +1284,8 @@ class AMDSMIParser(argparse.ArgumentParser):
if self.helpers.is_baremetal():
set_value_exclusive_group.add_argument('-p', '--soc-pstate', action='store', required=False, type=lambda value: self._not_negative_int(value, '--soc-pstate'), help=set_soc_pstate_help, metavar='POLICY_ID')
set_value_exclusive_group.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=lambda value: self._not_negative_int(value, '--xgmi-plpd'), help=set_xgmi_plpd_help, metavar='POLICY_ID')
set_value_exclusive_group.add_argument('-c', '--clk-level', action=self._level_select(), nargs='+', required=False, help=set_clock_freq_help, metavar=('CLK_TYPE', 'FREQ_LEVELS'))
set_value_exclusive_group.add_argument('-c', '--clk-level', action=self._level_select(), nargs='+', required=False, help=set_clock_freq_help, metavar=('CLK_TYPE', 'PERF_LEVELS'))
set_value_exclusive_group.add_argument('-L', '--clk-limit', action=self._limit_select(), nargs=3, required=False, help=set_clk_limit_help, metavar=('CLK_TYPE', 'LIM_TYPE', 'VALUE'))
set_value_exclusive_group.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=lambda value: self._not_negative_int(value, '--process-isolation'), required=False, help=set_process_isolation_help, metavar='STATUS')