Refactor ESMI Initialization and Argument Parsing

Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: Iefab3a8110e0d3c525ee0cef1bdef9101550e9de
This commit is contained in:
Maisam Arif
2024-02-21 03:48:09 -06:00
committed by Maisam Arif
parent 180f893791
commit f58613561c
10 changed files with 1615 additions and 1295 deletions
+8 -8
View File
@@ -183,7 +183,7 @@ Static Arguments:
-l, --limit All limit metric values (i.e. power and thermal limits)
-u, --numa All numa node information
CPU Option<s>:
CPU Arguments:
-s, --smu All SMU FW information
-i, --interface_ver Displays hsmp interface version
@@ -276,7 +276,7 @@ usage: amd-smi metric [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
[--cpu_lclk_dpm_level NBIOID] [--cpu_pwr_svi_telemtry_rails]
[--cpu_io_bandwidth IO_BW LINKID_NAME]
[--cpu_xgmi_bandwidth XGMI_BW LINKID_NAME] [--cpu_enable_apb]
[--cpu_disable_apb DF_PSTATE] [--set_cpu_pow_limit POW_LIMIT]
[--cpu_disable_apb DF_PSTATE] [--set_cpu_pwr_limit PWR_LIMIT]
[--set_cpu_xgmi_link_width MIN_WIDTH MAX_WIDTH]
[--set_cpu_lclk_dpm_level NBIOID MIN_DPM MAX_DPM]
[--core_boost_limit] [--core_curr_active_freq_core_limit]
@@ -285,10 +285,10 @@ usage: amd-smi metric [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
[--cpu_metrics_table] [--core_energy] [--socket_energy]
[--set_cpu_pwr_eff_mode MODE] [--cpu_ddr_bandwidth] [--cpu_temp]
[--cpu_dimm_temp_range_rate DIMM_ADDR]
[--cpu_dimm_pow_conumption DIMM_ADDR]
[--cpu_dimm_pow_consumption DIMM_ADDR]
[--cpu_dimm_thermal_sensor DIMM_ADDR]
[--set_cpu_gmi3_link_width MIN_LW MAX_LW]
[--set_cpu_pcie_lnk_rate LINK_RATE]
[--set_cpu_pcie_link_rate LINK_RATE]
[--set_cpu_df_pstate_range MAX_PSTATE MIN_PSTATE]
If no GPU is specified, returns metric information for all GPUs on the system.
@@ -329,7 +329,7 @@ Metric arguments:
-x, --xgmi-err XGMI error information since last read
-E, --energy Amount of energy consumed
CPU Option<s>:
CPU Arguments:
--cpu_power_metrics Cpu power metrics
--cpu_prochot Displays prochot status
--cpu_freq_metrics Displays currentFclkMemclk frequencies and cclk frequency limit
@@ -353,18 +353,18 @@ CPU Option<s>:
--cpu_ddr_bandwidth Displays per socket max ddr bw, current utilized bw and current utilized ddr bw in percentage
--cpu_temp Displays cpu socket temperature
--cpu_dimm_temp_range_rate DIMM_ADDR Displays dimm temperature range and refresh rate
--cpu_dimm_pow_conumption DIMM_ADDR Displays dimm power consumption
--cpu_dimm_pow_consumption DIMM_ADDR Displays dimm power consumption
--cpu_dimm_thermal_sensor DIMM_ADDR Displays dimm thermal sensor
Set Options<s>:
--set_cpu_pow_limit POW_LIMIT Set power limit for the given socket. Input parameter is power limit value.
--set_cpu_pwr_limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value.
--set_cpu_xgmi_link_width MIN_WIDTH MAX_WIDTH Set max and Min linkwidth. Input parameters are min and max link width values
--set_cpu_lclk_dpm_level NBIOID MIN_DPM MAX_DPM Sets the max and min dpm level on a given NBIO. Input parameters are die_index, min dpm, max dpm.
--set_soc_boost_limit BOOST_LIMIT Sets the boost limit for the given socket. Input parameter is socket limit value
--set_core_boost_limit BOOST_LIMIT Sets the boost limit for the given core. Input parameter is core limit value
--set_cpu_pwr_eff_mode MODE Sets the power efficiency mode policy. Input parameter is mode.
--set_cpu_gmi3_link_width MIN_LW MAX_LW Sets max and min gmi3 link width range
--set_cpu_pcie_lnk_rate LINK_RATE Sets pcie link rate
--set_cpu_pcie_link_rate LINK_RATE Sets pcie link rate
--set_cpu_df_pstate_range MAX_PSTATE MIN_PSTATE Sets max and min df-pstates
Command Modifiers:
File diff suppressed because it is too large Load Diff
+18 -4
View File
@@ -116,6 +116,18 @@ class AMDSMIHelpers():
return self._is_windows
def get_amdsmi_init_flag(self):
    """Return the AMDSMI_INIT_FLAG value unchanged.

    NOTE(review): AMDSMI_INIT_FLAG is not defined in the visible part of this
    file; presumably it is the flag recorded when the library was initialized
    and is brought in from the init module -- confirm.
    """
    return AMDSMI_INIT_FLAG
def is_amdgpu_initialized(self):
    """Return AMDSMI_INIT_FLAG masked with the AMDSMI_INIT_AMD_GPUS bit.

    The result is the raw bitwise-AND value, not a strict bool: it is
    non-zero (truthy) when the GPU bit is set in the flag and 0 otherwise.
    """
    return AMDSMI_INIT_FLAG & amdsmi_interface.amdsmi_wrapper.AMDSMI_INIT_AMD_GPUS
def is_amd_hsmp_initialized(self):
    """Return AMDSMI_INIT_FLAG masked with the AMDSMI_INIT_AMD_CPUS bit.

    The result is the raw bitwise-AND value, not a strict bool: it is
    non-zero (truthy) when the CPU bit is set in the flag and 0 otherwise.
    """
    return AMDSMI_INIT_FLAG & amdsmi_interface.amdsmi_wrapper.AMDSMI_INIT_AMD_CPUS
def get_cpu_choices(self):
"""Return dictionary of possible CPU choices and string of the output:
Dictionary will be in format: cpus[ID]: Device Handle)
@@ -136,11 +148,11 @@ class AMDSMIHelpers():
except amdsmi_interface.AmdSmiLibraryException as e:
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
logging.info('Unable to get device choices, driver not initialized (amdhsmp not found in modules)')
logging.info('Unable to get device choices, driver not initialized (amd_hsmp not found in modules)')
else:
raise e
if len(cpu_handles) == 0:
logging.info('Unable to find any devices, check if driver is initialized (amdhsmp not found in modules)')
logging.info('Unable to find any devices, check if driver is initialized (amd_hsmp not found in modules)')
else:
# Handle spacing for the gpu_choices_str
max_padding = int(math.log10(len(cpu_handles))) + 1
@@ -181,11 +193,11 @@ class AMDSMIHelpers():
except amdsmi_interface.AmdSmiLibraryException as e:
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
logging.info('Unable to get device choices, driver not initialized (amdhsmp not found in modules)')
logging.info('Unable to get device choices, driver not initialized (amd_hsmp not found in modules)')
else:
raise e
if len(core_handles) == 0:
logging.info('Unable to find any devices, check if driver is initialized (amdhsmp not found in modules)')
logging.info('Unable to find any devices, check if driver is initialized (amd_hsmp not found in modules)')
else:
# Handle spacing for the gpu_choices_str
max_padding = int(math.log10(len(core_handles))) + 1
@@ -463,6 +475,7 @@ class AMDSMIHelpers():
else:
return False, args.cpu
def handle_cores(self, args, logger, subcommand):
"""This function will run execute the subcommands based on the number
of cores passed in via args.
@@ -567,6 +580,7 @@ class AMDSMIHelpers():
amdsmi_interface.amdsmi_wrapper.amdsmi_processor_handle,
"Unable to find cpu ID from device_handle")
def get_core_id_from_device_handle(self, input_device_handle):
"""Get the core index from the device_handle.
amdsmi_interface.amdsmi_get_cpusocket_handles() returns the list of device_handles in order of cpu_index
+31 -27
View File
@@ -42,6 +42,7 @@ sys.tracebacklimit = -1 # Disable traceback when raising errors
# On initial import set initialized variable
AMDSMI_INITIALIZED = False
AMDSMI_INIT_FLAG = amdsmi_interface.AmdSmiInitFlags.INIT_ALL_PROCESSORS
AMD_VENDOR_ID = 4098
def check_amdgpu_driver():
@@ -53,8 +54,8 @@ def check_amdgpu_driver():
return False
def check_amdhsmp_driver():
""" Returns true if amd hsmp is found in the list of initialized modules """
def check_amd_hsmp_driver():
""" Returns true if amd_hsmp is found in the list of initialized modules """
amd_cpu_status_file = Path("/sys/module/amd_hsmp/initstate")
if amd_cpu_status_file.exists():
if amd_cpu_status_file.read_text(encoding="ascii").strip() == "live":
@@ -62,32 +63,36 @@ def check_amdhsmp_driver():
return False
def init_amdsmi(flag=amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS):
def init_amdsmi():
""" Initializes AMDSMI
Raises:
err: AmdSmiLibraryException if not successful
"""
gpu_flag = False;
cpu_flag = False;
Checks for the presence of the amdgpu and amd_hsmp drivers and initializes the
AMD SMI library based on the live drivers found.
# Check if both the amdgpu and amdhsmp driver is up and handle error gracefully
if check_amdgpu_driver() and check_amdhsmp_driver():
# init AMD APUS
Return:
init_flag: the flag used to initialize the AMD SMI library without error
Raises:
err: AmdSmiLibraryException if not successful in initializing any drivers
"""
init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_ALL_PROCESSORS
if check_amdgpu_driver() and check_amd_hsmp_driver():
init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_AMD_APUS
logging.debug("Both amdgpu and amd_hsmp driver's initstate is live")
try:
amdsmi_interface.amdsmi_init(amdsmi_interface.AmdSmiInitFlags.INIT_AMD_APUS)
amdsmi_interface.amdsmi_init(init_flag)
except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e:
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
logging.error("Drivers not loaded (amdgpu and hsmp drivers not found in modules)")
logging.error("Drivers not loaded (amdgpu and amd_hsmp drivers not found in modules)")
sys.exit(-1)
else:
raise e
# # Check if amdgpu driver is up & Handle error gracefully
elif check_amdgpu_driver():
# Only init AMD GPUs for now, waiting for future support for AMD CPUs
init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS
logging.debug("amdgpu driver initstate is live")
try:
amdsmi_interface.amdsmi_init(amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS)
amdsmi_interface.amdsmi_init(init_flag)
except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e:
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
@@ -95,25 +100,24 @@ def init_amdsmi(flag=amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS):
sys.exit(-1)
else:
raise e
logging.debug("AMDSMI initialized successfully, but initstate was not live")
elif check_amdhsmp_driver():
# Only init AMD CPUs
logging.debug("amdgpu driver initialized successfully, but amd_hsmp initstate was not live")
elif check_amd_hsmp_driver():
init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_AMD_CPUS
logging.debug("amd_hsmp driver initstate is live")
try:
amdsmi_interface.amdsmi_init(amdsmi_interface.AmdSmiInitFlags.INIT_AMD_CPUS)
cpu_flag = True
amdsmi_interface.amdsmi_init(init_flag)
except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e:
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
logging.error("Driver not loaded (hsmp not found in modules)")
logging.error("Driver not loaded (amd_hsmp not found in modules)")
sys.exit(-1)
else:
raise e
else:
pass
logging.debug("amd_hsmp driver initialized successfully, but amdgpu initstate was not live")
logging.debug("AMDSMI initialized successfully")
logging.debug(f"AMDSMI initialized with atleast one driver successfully | init flag: {init_flag}")
return init_flag
def shut_down_amdsmi():
"""Shutdown AMDSMI instance
@@ -134,7 +138,7 @@ def signal_handler(sig, frame):
if not AMDSMI_INITIALIZED:
init_amdsmi()
AMDSMI_INIT_FLAG = init_amdsmi()
AMDSMI_INITIALIZED = True
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
+2 -1
View File
@@ -25,8 +25,8 @@ import json
import re
import time
from typing import Dict
import yaml
from enum import Enum
import yaml
from amdsmi_helpers import AMDSMIHelpers
import amdsmi_cli_exceptions
@@ -255,6 +255,7 @@ class AMDSMILogger():
core_id = self.helpers.get_core_id_from_device_handle(device_handle)
self._store_core_output_amdsmi(core_id=core_id, argument=argument, data=data)
def _store_core_output_amdsmi(self, core_id, argument, data):
if argument == 'timestamp': # Make sure timestamp is the first element in the output
self.output['timestamp'] = int(time.time())
File diff suppressed because it is too large Load Diff
+2 -2
View File
@@ -67,12 +67,12 @@ extern "C" {
* Initialization flags may be OR'd together and passed to ::amdsmi_init().
*/
typedef enum {
AMDSMI_INIT_ALL_PROCESSORS = 0x0, // Default option
AMDSMI_INIT_ALL_PROCESSORS = 0xFFFFFFFF, //!< Initialize all processors
AMDSMI_INIT_AMD_CPUS = (1 << 0),
AMDSMI_INIT_AMD_GPUS = (1 << 1),
AMDSMI_INIT_NON_AMD_CPUS = (1 << 2),
AMDSMI_INIT_NON_AMD_GPUS = (1 << 3),
AMDSMI_INIT_AMD_APUS = (AMDSMI_INIT_AMD_CPUS | AMDSMI_INIT_AMD_GPUS)
AMDSMI_INIT_AMD_APUS = (AMDSMI_INIT_AMD_CPUS | AMDSMI_INIT_AMD_GPUS) // Default option
} amdsmi_init_flags_t;
/* Maximum size definitions AMDSMI */
+7 -2
View File
@@ -73,7 +73,7 @@ except AmdSmiException as e:
### amdsmi_init
Description: Initialize amdsmi lib and connect to driver
Description: Dynamically initialize amdsmi with amd_hsmp and amdgpu drivers
Input parameters: `None`
@@ -87,7 +87,12 @@ Example:
```python
try:
amdsmi_init()
init_flag = amdsmi_init()
# Print out integer bitmask of initialized drivers
# 1 is for amd_hsmp
# 2 is for amdgpu
# 3 is for amd_hsmp and amdgpu
print(init_flag)
# continue with amdsmi
except AmdSmiException as e:
print("Init failed")
+2 -2
View File
@@ -196,14 +196,14 @@ _libraries['FIXME_STUB'] = FunctionFactoryStub() # ctypes.CDLL('FIXME_STUB')
# values for enumeration 'amdsmi_init_flags_t'
amdsmi_init_flags_t__enumvalues = {
0: 'AMDSMI_INIT_ALL_PROCESSORS',
4294967295: 'AMDSMI_INIT_ALL_PROCESSORS',
1: 'AMDSMI_INIT_AMD_CPUS',
2: 'AMDSMI_INIT_AMD_GPUS',
4: 'AMDSMI_INIT_NON_AMD_CPUS',
8: 'AMDSMI_INIT_NON_AMD_GPUS',
3: 'AMDSMI_INIT_AMD_APUS',
}
AMDSMI_INIT_ALL_PROCESSORS = 0
AMDSMI_INIT_ALL_PROCESSORS = 4294967295
AMDSMI_INIT_AMD_CPUS = 1
AMDSMI_INIT_AMD_GPUS = 2
AMDSMI_INIT_NON_AMD_CPUS = 4
+2 -2
View File
@@ -261,7 +261,7 @@ amdsmi_status_t AMDSmiSystem::cleanup() {
processors_.clear();
sockets_.clear();
esmi_exit();
init_flag_ = AMDSMI_INIT_ALL_PROCESSORS;
init_flag_ &= ~AMDSMI_INIT_AMD_CPUS;
}
#endif
if (init_flag_ & AMDSMI_INIT_AMD_GPUS) {
@@ -270,7 +270,7 @@ amdsmi_status_t AMDSmiSystem::cleanup() {
}
processors_.clear();
sockets_.clear();
init_flag_ = AMDSMI_INIT_ALL_PROCESSORS;
init_flag_ &= ~AMDSMI_INIT_AMD_GPUS;
rsmi_status_t ret = rsmi_shut_down();
if (ret != RSMI_STATUS_SUCCESS) {
return amd::smi::rsmi_to_amdsmi_status(ret);