Added set --pcie command and added more pcie info to static --bus output (#481)
* Added amd-smi set --pcie command
* Removed current pcie level due to it not being static
* Added pcie information to static --bus
---------
Signed-off-by: Pham, Gabriel <Gabriel.Pham@amd.com>
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
[ROCm/amdsmi commit: 9e3537d778]
This commit is contained in:
@@ -8,7 +8,7 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
|
||||
### Added
|
||||
|
||||
- **Added the following API's to amdsmi_interface.py**.
|
||||
- **Added the following C APIs to amdsmi_interface.py**.
|
||||
- amdsmi_get_cpu_handle()
|
||||
- amdsmi_get_esmi_err_msg()
|
||||
- amdsmi_get_gpu_event_notification()
|
||||
@@ -24,6 +24,25 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
- The entry `policies` is added to the end of the dictionary to match API definition.
|
||||
- The entry `plpds` is marked for deprecation as it has the same information as `policies`.
|
||||
|
||||
- **Added pcie levels to `amd-smi static --bus` command**.
|
||||
- The `static --bus` option now lists the range of PCIe levels that a device may be set to.
|
||||
- Each level is a 2-tuple composed of the PCIe speed and the lane width.
|
||||
|
||||
```console
|
||||
$ amd-smi static --bus
|
||||
GPU: 0
|
||||
BUS:
|
||||
BDF: 0000:43:00.0
|
||||
MAX_PCIE_WIDTH: 16
|
||||
MAX_PCIE_SPEED: 16 GT/s
|
||||
PCIE_LEVELS:
|
||||
0: (2.5 GT/s, 1)
|
||||
1: (5.0 GT/s, 4)
|
||||
2: (16.0 GT/s, 16)
|
||||
PCIE_INTERFACE_VERSION: Gen 4
|
||||
SLOT_TYPE: CEM
|
||||
```
|
||||
|
||||
- **Added evicted_time metric for kfd processes**.
|
||||
- Time that queues are evicted on a GPU in milliseconds
|
||||
- Added to CLI in `amd-smi monitor -q` and `amd-smi process`
|
||||
@@ -1219,9 +1238,9 @@ Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to
|
||||
GPU: 0
|
||||
CLK_LEVEL: Successfully changed sclk perf level(s) to 5, 6
|
||||
|
||||
GPU: 1
|
||||
CLK_LEVEL: level(s) 5, 6 is/are greater than performance levels supported for device
|
||||
```
|
||||
GPU: 1
|
||||
CLK_LEVEL: clock level(s) 5, 6 is/are greater than sclk frequency levels supported for device GPU ID: 1 BDF:0000:46:00.0
|
||||
```
|
||||
|
||||
- **Added new command `amd-smi static -C/--clock`**.
|
||||
- This new command displays the clock frequency performance levels for the selected GPUs and clocks.
|
||||
|
||||
@@ -428,6 +428,7 @@ class AMDSMICommands():
|
||||
args.partition = partition
|
||||
if clock:
|
||||
args.clock = clock
|
||||
|
||||
# args.clock defaults to False, so if it was overwritten with an empty list, the option was given as an argument but without any values
|
||||
if args.clock == []:
|
||||
args.clock = True
|
||||
@@ -534,6 +535,7 @@ class AMDSMICommands():
|
||||
'bdf': "N/A",
|
||||
'max_pcie_width': "N/A",
|
||||
'max_pcie_speed': "N/A",
|
||||
'pcie_levels': "N/A",
|
||||
'pcie_interface_version': "N/A",
|
||||
'slot_type': "N/A"
|
||||
}
|
||||
@@ -572,6 +574,21 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get bus info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
pcie_info = amdsmi_interface.amdsmi_get_gpu_pci_bandwidth(args.gpu)
|
||||
num_supported = pcie_info['transfer_rate']['num_supported']
|
||||
if num_supported != 0:
|
||||
bus_info['pcie_levels'] = {}
|
||||
for level in range(0, num_supported):
|
||||
speed = str(self.helpers.convert_SI_unit(float(pcie_info['transfer_rate']['frequency'][level]), AMDSMIHelpers.SI_Unit.NANO)) + " GT/s"
|
||||
width = pcie_info['lanes'][level]
|
||||
level_values = (speed, width)
|
||||
bus_info['pcie_levels'].update({level: level_values})
|
||||
else:
|
||||
bus_info['pcie_levels'] = "N/A"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
logging.debug("Failed to get pci bandwidth info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
static_dict['bus'] = bus_info
|
||||
if args.vbios:
|
||||
try:
|
||||
@@ -1018,7 +1035,6 @@ class AMDSMICommands():
|
||||
logging.debug("Failed to get cache info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
static_dict['cache_info'] = cache_info_list
|
||||
|
||||
# default to printing all clocks, if in current_platform_args; otherwise print specific clocks
|
||||
if 'clock' in current_platform_args and (args.clock == True or isinstance(args.clock, list)):
|
||||
original_clock_args = args.clock #save original args.clock value, so we can reset for multiple devices
|
||||
@@ -1056,8 +1072,15 @@ class AMDSMICommands():
|
||||
|
||||
try:
|
||||
frequencies = amdsmi_interface.amdsmi_get_clk_freq(args.gpu, clk_type_conversion)
|
||||
# some clocks may have a sysfs file but no frequencies for whatever reason.
|
||||
if len(frequencies['frequency']) == 0:
|
||||
freq_dict = "N/A"
|
||||
continue
|
||||
freq_dict = {}
|
||||
freq_dict.update({'current level':frequencies['current']})
|
||||
current_level = frequencies['current']
|
||||
freq_dict.update({'current_level':current_level})
|
||||
current_frequency = str(self.helpers.convert_SI_unit(frequencies['frequency'][current_level], AMDSMIHelpers.SI_Unit.MICRO)) + "MHz"
|
||||
freq_dict.update({'current_frequency':current_frequency})
|
||||
freq_dict.update({'frequency_levels':{}})
|
||||
if frequencies["num_supported"] != 0:
|
||||
for level in range(len(frequencies['frequency'])):
|
||||
@@ -1070,6 +1093,7 @@ class AMDSMICommands():
|
||||
freq_dict = "N/A"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
freq_dict = "N/A"
|
||||
logging.debug("Failed to get clock info for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
clk_dict[clk] = freq_dict
|
||||
|
||||
static_dict['clock'] = clk_dict
|
||||
@@ -4563,6 +4587,7 @@ class AMDSMICommands():
|
||||
args.power_cap is not None,
|
||||
args.soc_pstate is not None,
|
||||
args.xgmi_plpd is not None,
|
||||
args.pcie is not None,
|
||||
args.clk_level is not None,
|
||||
args.clk_limit is not None,
|
||||
args.process_isolation is not None]):
|
||||
@@ -5087,7 +5112,7 @@ class AMDSMICommands():
|
||||
gpu_args_enabled = False
|
||||
gpu_attributes = ["fan", "perf_level", "profile", "perf_determinism", "compute_partition",
|
||||
"memory_partition", "power_cap", "soc_pstate", "xgmi_plpd",
|
||||
"process_isolation", "clk_limit", "clk_level"]
|
||||
"process_isolation", "clk_limit", "clk_level", "pcie"]
|
||||
for attr in gpu_attributes:
|
||||
if hasattr(args, attr):
|
||||
if getattr(args, attr) is not None:
|
||||
|
||||
@@ -1233,7 +1233,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
set_soc_pstate_help = f"Set the GPU soc pstate policy using policy id, an integer. Valid id's include:\n\t{soc_pstate_help_info}"
|
||||
xgmi_plpd_help_info = ", ".join(self.helpers.get_xgmi_plpd_policies())
|
||||
set_xgmi_plpd_help = f"Set the GPU XGMI per-link power down policy using policy id, an integer. Valid id's include:\n\t{xgmi_plpd_help_info}"
|
||||
set_clock_freq_help = "Set one or more sclk (aka gfxclk), mclk, fclk, pcie, or socclk frequency levels.\n\tUse `amd-smi static --clock` to find acceptable levels."
|
||||
set_clock_freq_help = "Set one or more sclk (aka gfxclk), mclk, fclk, pcie, or socclk frequency levels.\n\tUse `amd-smi static --clock` to find acceptable levels.\n\tUse `amd-smi static --bus` to find acceptable pcie levels."
|
||||
power_cap_min, power_cap_max = self.helpers.get_power_caps()
|
||||
if power_cap_max != "N/A":
|
||||
power_cap_max = self.helpers.convert_SI_unit(power_cap_max, AMDSMIHelpers.SI_Unit.MICRO)
|
||||
@@ -1284,7 +1284,8 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
if self.helpers.is_baremetal():
|
||||
set_value_exclusive_group.add_argument('-p', '--soc-pstate', action='store', required=False, type=lambda value: self._not_negative_int(value, '--soc-pstate'), help=set_soc_pstate_help, metavar='POLICY_ID')
|
||||
set_value_exclusive_group.add_argument('-x', '--xgmi-plpd', action='store', required=False, type=lambda value: self._not_negative_int(value, '--xgmi-plpd'), help=set_xgmi_plpd_help, metavar='POLICY_ID')
|
||||
set_value_exclusive_group.add_argument('-c', '--clk-level', action=self._level_select(), nargs='+', required=False, help=set_clock_freq_help, metavar=('CLK_TYPE', 'FREQ_LEVELS'))
|
||||
set_value_exclusive_group.add_argument('-c', '--clk-level', action=self._level_select(), nargs='+', required=False, help=set_clock_freq_help, metavar=('CLK_TYPE', 'PERF_LEVELS'))
|
||||
|
||||
set_value_exclusive_group.add_argument('-L', '--clk-limit', action=self._limit_select(), nargs=3, required=False, help=set_clk_limit_help, metavar=('CLK_TYPE', 'LIM_TYPE', 'VALUE'))
|
||||
set_value_exclusive_group.add_argument('-R', '--process-isolation', action='store', choices=[0,1], type=lambda value: self._not_negative_int(value, '--process-isolation'), required=False, help=set_process_isolation_help, metavar='STATUS')
|
||||
|
||||
|
||||
Reference in New Issue
Block a user