Refactor ESMI Initialization and Argument Parsing
Signed-off-by: Maisam Arif <maisarif@amd.com> Change-Id: Iefab3a8110e0d3c525ee0cef1bdef9101550e9de
This commit is contained in:
@@ -183,7 +183,7 @@ Static Arguments:
|
||||
-l, --limit All limit metric values (i.e. power and thermal limits)
|
||||
-u, --numa All numa node information
|
||||
|
||||
CPU Option<s>:
|
||||
CPU Arguments:
|
||||
-s, --smu All SMU FW information
|
||||
-i, --interface_ver Displays hsmp interface version
|
||||
|
||||
@@ -276,7 +276,7 @@ usage: amd-smi metric [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
|
||||
[--cpu_lclk_dpm_level NBIOID] [--cpu_pwr_svi_telemtry_rails]
|
||||
[--cpu_io_bandwidth IO_BW LINKID_NAME]
|
||||
[--cpu_xgmi_bandwidth XGMI_BW LINKID_NAME] [--cpu_enable_apb]
|
||||
[--cpu_disable_apb DF_PSTATE] [--set_cpu_pow_limit POW_LIMIT]
|
||||
[--cpu_disable_apb DF_PSTATE] [--set_cpu_pwr_limit PWR_LIMIT]
|
||||
[--set_cpu_xgmi_link_width MIN_WIDTH MAX_WIDTH]
|
||||
[--set_cpu_lclk_dpm_level NBIOID MIN_DPM MAX_DPM]
|
||||
[--core_boost_limit] [--core_curr_active_freq_core_limit]
|
||||
@@ -285,10 +285,10 @@ usage: amd-smi metric [-h] [--json | --csv] [--file FILE] [--loglevel LEVEL]
|
||||
[--cpu_metrics_table] [--core_energy] [--socket_energy]
|
||||
[--set_cpu_pwr_eff_mode MODE] [--cpu_ddr_bandwidth] [--cpu_temp]
|
||||
[--cpu_dimm_temp_range_rate DIMM_ADDR]
|
||||
[--cpu_dimm_pow_conumption DIMM_ADDR]
|
||||
[--cpu_dimm_pow_consumption DIMM_ADDR]
|
||||
[--cpu_dimm_thermal_sensor DIMM_ADDR]
|
||||
[--set_cpu_gmi3_link_width MIN_LW MAX_LW]
|
||||
[--set_cpu_pcie_lnk_rate LINK_RATE]
|
||||
[--set_cpu_pcie_link_rate LINK_RATE]
|
||||
[--set_cpu_df_pstate_range MAX_PSTATE MIN_PSTATE]
|
||||
|
||||
If no GPU is specified, returns metric information for all GPUs on the system.
|
||||
@@ -329,7 +329,7 @@ Metric arguments:
|
||||
-x, --xgmi-err XGMI error information since last read
|
||||
-E, --energy Amount of energy consumed
|
||||
|
||||
CPU Option<s>:
|
||||
CPU Arguments:
|
||||
--cpu_power_metrics Cpu power metrics
|
||||
--cpu_prochot Displays prochot status
|
||||
--cpu_freq_metrics Displays current Fclk and Memclk frequencies and Cclk frequency limit
|
||||
@@ -353,18 +353,18 @@ CPU Option<s>:
|
||||
--cpu_ddr_bandwidth Displays per socket max ddr bw, current utilized bw and current utilized ddr bw in percentage
|
||||
--cpu_temp Displays cpu socket temperature
|
||||
--cpu_dimm_temp_range_rate DIMM_ADDR Displays dimm temperature range and refresh rate
|
||||
--cpu_dimm_pow_conumption DIMM_ADDR Displays dimm power consumption
|
||||
--cpu_dimm_pow_consumption DIMM_ADDR Displays dimm power consumption
|
||||
--cpu_dimm_thermal_sensor DIMM_ADDR Displays dimm thermal sensor
|
||||
|
||||
Set Options<s>:
|
||||
--set_cpu_pow_limit POW_LIMIT Set power limit for the given socket. Input parameter is power limit value.
|
||||
--set_cpu_pwr_limit PWR_LIMIT Set power limit for the given socket. Input parameter is power limit value.
|
||||
--set_cpu_xgmi_link_width MIN_WIDTH MAX_WIDTH Sets max and min link width. Input parameters are min and max link width values
|
||||
--set_cpu_lclk_dpm_level NBIOID MIN_DPM MAX_DPM Sets the max and min dpm level on a given NBIO. Input parameters are die_index, min dpm, max dpm.
|
||||
--set_soc_boost_limit BOOST_LIMIT Sets the boost limit for the given socket. Input parameter is socket limit value
|
||||
--set_core_boost_limit BOOST_LIMIT Sets the boost limit for the given core. Input parameter is core limit value
|
||||
--set_cpu_pwr_eff_mode MODE Sets the power efficiency mode policy. Input parameter is mode.
|
||||
--set_cpu_gmi3_link_width MIN_LW MAX_LW Sets max and min gmi3 link width range
|
||||
--set_cpu_pcie_lnk_rate LINK_RATE Sets pcie link rate
|
||||
--set_cpu_pcie_link_rate LINK_RATE Sets pcie link rate
|
||||
--set_cpu_df_pstate_range MAX_PSTATE MIN_PSTATE Sets max and min df-pstates
|
||||
|
||||
Command Modifiers:
|
||||
|
||||
+959
-742
File diff suppressed because it is too large
Load Diff
@@ -116,6 +116,18 @@ class AMDSMIHelpers():
|
||||
return self._is_windows
|
||||
|
||||
|
||||
def get_amdsmi_init_flag(self):
|
||||
return AMDSMI_INIT_FLAG
|
||||
|
||||
|
||||
def is_amdgpu_initialized(self):
|
||||
return AMDSMI_INIT_FLAG & amdsmi_interface.amdsmi_wrapper.AMDSMI_INIT_AMD_GPUS
|
||||
|
||||
|
||||
def is_amd_hsmp_initialized(self):
|
||||
return AMDSMI_INIT_FLAG & amdsmi_interface.amdsmi_wrapper.AMDSMI_INIT_AMD_CPUS
|
||||
|
||||
|
||||
def get_cpu_choices(self):
|
||||
"""Return dictionary of possible CPU choices and string of the output:
|
||||
Dictionary will be in format: cpus[ID]: Device Handle)
|
||||
@@ -136,11 +148,11 @@ class AMDSMIHelpers():
|
||||
except amdsmi_interface.AmdSmiLibraryException as e:
|
||||
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
|
||||
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
|
||||
logging.info('Unable to get device choices, driver not initialized (amdhsmp not found in modules)')
|
||||
logging.info('Unable to get device choices, driver not initialized (amd_hsmp not found in modules)')
|
||||
else:
|
||||
raise e
|
||||
if len(cpu_handles) == 0:
|
||||
logging.info('Unable to find any devices, check if driver is initialized (amdhsmp not found in modules)')
|
||||
logging.info('Unable to find any devices, check if driver is initialized (amd_hsmp not found in modules)')
|
||||
else:
|
||||
# Handle spacing for the gpu_choices_str
|
||||
max_padding = int(math.log10(len(cpu_handles))) + 1
|
||||
@@ -181,11 +193,11 @@ class AMDSMIHelpers():
|
||||
except amdsmi_interface.AmdSmiLibraryException as e:
|
||||
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
|
||||
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
|
||||
logging.info('Unable to get device choices, driver not initialized (amdhsmp not found in modules)')
|
||||
logging.info('Unable to get device choices, driver not initialized (amd_hsmp not found in modules)')
|
||||
else:
|
||||
raise e
|
||||
if len(core_handles) == 0:
|
||||
logging.info('Unable to find any devices, check if driver is initialized (amdhsmp not found in modules)')
|
||||
logging.info('Unable to find any devices, check if driver is initialized (amd_hsmp not found in modules)')
|
||||
else:
|
||||
# Handle spacing for the gpu_choices_str
|
||||
max_padding = int(math.log10(len(core_handles))) + 1
|
||||
@@ -463,6 +475,7 @@ class AMDSMIHelpers():
|
||||
else:
|
||||
return False, args.cpu
|
||||
|
||||
|
||||
def handle_cores(self, args, logger, subcommand):
|
||||
"""This function will run execute the subcommands based on the number
|
||||
of cores passed in via args.
|
||||
@@ -567,6 +580,7 @@ class AMDSMIHelpers():
|
||||
amdsmi_interface.amdsmi_wrapper.amdsmi_processor_handle,
|
||||
"Unable to find cpu ID from device_handle")
|
||||
|
||||
|
||||
def get_core_id_from_device_handle(self, input_device_handle):
|
||||
"""Get the core index from the device_handle.
|
||||
amdsmi_interface.amdsmi_get_cpusocket_handles() returns the list of device_handles in order of cpu_index
|
||||
|
||||
+31
-27
@@ -42,6 +42,7 @@ sys.tracebacklimit = -1 # Disable traceback when raising errors
|
||||
|
||||
# On initial import set initialized variable
|
||||
AMDSMI_INITIALIZED = False
|
||||
AMDSMI_INIT_FLAG = amdsmi_interface.AmdSmiInitFlags.INIT_ALL_PROCESSORS
|
||||
AMD_VENDOR_ID = 4098
|
||||
|
||||
def check_amdgpu_driver():
|
||||
@@ -53,8 +54,8 @@ def check_amdgpu_driver():
|
||||
return False
|
||||
|
||||
|
||||
def check_amdhsmp_driver():
|
||||
""" Returns true if amd hsmp is found in the list of initialized modules """
|
||||
def check_amd_hsmp_driver():
|
||||
""" Returns true if amd_hsmp is found in the list of initialized modules """
|
||||
amd_cpu_status_file = Path("/sys/module/amd_hsmp/initstate")
|
||||
if amd_cpu_status_file.exists():
|
||||
if amd_cpu_status_file.read_text(encoding="ascii").strip() == "live":
|
||||
@@ -62,32 +63,36 @@ def check_amdhsmp_driver():
|
||||
return False
|
||||
|
||||
|
||||
def init_amdsmi(flag=amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS):
|
||||
def init_amdsmi():
|
||||
""" Initializes AMDSMI
|
||||
|
||||
Raises:
|
||||
err: AmdSmiLibraryException if not successful
|
||||
"""
|
||||
gpu_flag = False;
|
||||
cpu_flag = False;
|
||||
Checks for the presence of the amdgpu and amd_hsmp drivers and initializes the
|
||||
AMD SMI library based on the live drivers found.
|
||||
|
||||
# Check if both the amdgpu and amdhsmp driver is up and handle error gracefully
|
||||
if check_amdgpu_driver() and check_amdhsmp_driver():
|
||||
# init AMD APUS
|
||||
Return:
|
||||
init_flag: the flag used to initialize the AMD SMI library without error
|
||||
|
||||
Raises:
|
||||
err: AmdSmiLibraryException if not successful in initializing any drivers
|
||||
"""
|
||||
init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_ALL_PROCESSORS
|
||||
if check_amdgpu_driver() and check_amd_hsmp_driver():
|
||||
init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_AMD_APUS
|
||||
logging.debug("Both amdgpu and amd_hsmp driver's initstate is live")
|
||||
try:
|
||||
amdsmi_interface.amdsmi_init(amdsmi_interface.AmdSmiInitFlags.INIT_AMD_APUS)
|
||||
amdsmi_interface.amdsmi_init(init_flag)
|
||||
except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e:
|
||||
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
|
||||
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
|
||||
logging.error("Drivers not loaded (amdgpu and hsmp drivers not found in modules)")
|
||||
logging.error("Drivers not loaded (amdgpu and amd_hsmp drivers not found in modules)")
|
||||
sys.exit(-1)
|
||||
else:
|
||||
raise e
|
||||
# # Check if amdgpu driver is up & Handle error gracefully
|
||||
elif check_amdgpu_driver():
|
||||
# Only init AMD GPUs for now, waiting for future support for AMD CPUs
|
||||
init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS
|
||||
logging.debug("amdgpu driver initstate is live")
|
||||
try:
|
||||
amdsmi_interface.amdsmi_init(amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS)
|
||||
amdsmi_interface.amdsmi_init(init_flag)
|
||||
except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e:
|
||||
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
|
||||
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
|
||||
@@ -95,25 +100,24 @@ def init_amdsmi(flag=amdsmi_interface.AmdSmiInitFlags.INIT_AMD_GPUS):
|
||||
sys.exit(-1)
|
||||
else:
|
||||
raise e
|
||||
logging.debug("AMDSMI initialized successfully, but initstate was not live")
|
||||
|
||||
elif check_amdhsmp_driver():
|
||||
# Only init AMD CPUs
|
||||
logging.debug("amdgpu driver initialized successfully, but amd_hsmp initstate was not live")
|
||||
elif check_amd_hsmp_driver():
|
||||
init_flag = amdsmi_interface.AmdSmiInitFlags.INIT_AMD_CPUS
|
||||
logging.debug("amd_hsmp driver initstate is live")
|
||||
try:
|
||||
amdsmi_interface.amdsmi_init(amdsmi_interface.AmdSmiInitFlags.INIT_AMD_CPUS)
|
||||
cpu_flag = True
|
||||
amdsmi_interface.amdsmi_init(init_flag)
|
||||
except (amdsmi_interface.AmdSmiLibraryException, amdsmi_interface.AmdSmiParameterException) as e:
|
||||
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT,
|
||||
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED):
|
||||
logging.error("Driver not loaded (hsmp not found in modules)")
|
||||
logging.error("Driver not loaded (amd_hsmp not found in modules)")
|
||||
sys.exit(-1)
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
pass
|
||||
logging.debug("amd_hsmp driver initialized successfully, but amdgpu initstate was not live")
|
||||
|
||||
logging.debug("AMDSMI initialized successfully")
|
||||
logging.debug(f"AMDSMI initialized with atleast one driver successfully | init flag: {init_flag}")
|
||||
|
||||
return init_flag
|
||||
|
||||
def shut_down_amdsmi():
|
||||
"""Shutdown AMDSMI instance
|
||||
@@ -134,7 +138,7 @@ def signal_handler(sig, frame):
|
||||
|
||||
|
||||
if not AMDSMI_INITIALIZED:
|
||||
init_amdsmi()
|
||||
AMDSMI_INIT_FLAG = init_amdsmi()
|
||||
AMDSMI_INITIALIZED = True
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
@@ -25,8 +25,8 @@ import json
|
||||
import re
|
||||
import time
|
||||
from typing import Dict
|
||||
import yaml
|
||||
from enum import Enum
|
||||
import yaml
|
||||
|
||||
from amdsmi_helpers import AMDSMIHelpers
|
||||
import amdsmi_cli_exceptions
|
||||
@@ -255,6 +255,7 @@ class AMDSMILogger():
|
||||
core_id = self.helpers.get_core_id_from_device_handle(device_handle)
|
||||
self._store_core_output_amdsmi(core_id=core_id, argument=argument, data=data)
|
||||
|
||||
|
||||
def _store_core_output_amdsmi(self, core_id, argument, data):
|
||||
if argument == 'timestamp': # Make sure timestamp is the first element in the output
|
||||
self.output['timestamp'] = int(time.time())
|
||||
|
||||
+584
-505
File diff suppressed because it is too large
Load Diff
@@ -67,12 +67,12 @@ extern "C" {
|
||||
* Initialization flags may be OR'd together and passed to ::amdsmi_init().
|
||||
*/
|
||||
typedef enum {
|
||||
AMDSMI_INIT_ALL_PROCESSORS = 0x0, // Default option
|
||||
AMDSMI_INIT_ALL_PROCESSORS = 0xFFFFFFFF, //!< Initialize all processors
|
||||
AMDSMI_INIT_AMD_CPUS = (1 << 0),
|
||||
AMDSMI_INIT_AMD_GPUS = (1 << 1),
|
||||
AMDSMI_INIT_NON_AMD_CPUS = (1 << 2),
|
||||
AMDSMI_INIT_NON_AMD_GPUS = (1 << 3),
|
||||
AMDSMI_INIT_AMD_APUS = (AMDSMI_INIT_AMD_CPUS | AMDSMI_INIT_AMD_GPUS)
|
||||
AMDSMI_INIT_AMD_APUS = (AMDSMI_INIT_AMD_CPUS | AMDSMI_INIT_AMD_GPUS) // Default option
|
||||
} amdsmi_init_flags_t;
|
||||
|
||||
/* Maximum size definitions AMDSMI */
|
||||
|
||||
@@ -73,7 +73,7 @@ except AmdSmiException as e:
|
||||
|
||||
### amdsmi_init
|
||||
|
||||
Description: Initialize amdsmi lib and connect to driver
|
||||
Description: Dynamically initialize amdsmi with amd_hsmp and amdgpu drivers
|
||||
|
||||
Input parameters: `None`
|
||||
|
||||
@@ -87,7 +87,12 @@ Example:
|
||||
|
||||
```python
|
||||
try:
|
||||
amdsmi_init()
|
||||
init_flag = amdsmi_init()
|
||||
# Print out integer bitmask of initialized drivers
|
||||
# 1 is for amd_hsmp
|
||||
# 2 is for amdgpu
|
||||
# 3 is for amd_hsmp and amdgpu
|
||||
print(init_flag)
|
||||
# continue with amdsmi
|
||||
except AmdSmiException as e:
|
||||
print("Init failed")
|
||||
|
||||
@@ -196,14 +196,14 @@ _libraries['FIXME_STUB'] = FunctionFactoryStub() # ctypes.CDLL('FIXME_STUB')
|
||||
|
||||
# values for enumeration 'amdsmi_init_flags_t'
|
||||
amdsmi_init_flags_t__enumvalues = {
|
||||
0: 'AMDSMI_INIT_ALL_PROCESSORS',
|
||||
4294967295: 'AMDSMI_INIT_ALL_PROCESSORS',
|
||||
1: 'AMDSMI_INIT_AMD_CPUS',
|
||||
2: 'AMDSMI_INIT_AMD_GPUS',
|
||||
4: 'AMDSMI_INIT_NON_AMD_CPUS',
|
||||
8: 'AMDSMI_INIT_NON_AMD_GPUS',
|
||||
3: 'AMDSMI_INIT_AMD_APUS',
|
||||
}
|
||||
AMDSMI_INIT_ALL_PROCESSORS = 0
|
||||
AMDSMI_INIT_ALL_PROCESSORS = 4294967295
|
||||
AMDSMI_INIT_AMD_CPUS = 1
|
||||
AMDSMI_INIT_AMD_GPUS = 2
|
||||
AMDSMI_INIT_NON_AMD_CPUS = 4
|
||||
|
||||
@@ -261,7 +261,7 @@ amdsmi_status_t AMDSmiSystem::cleanup() {
|
||||
processors_.clear();
|
||||
sockets_.clear();
|
||||
esmi_exit();
|
||||
init_flag_ = AMDSMI_INIT_ALL_PROCESSORS;
|
||||
init_flag_ &= ~AMDSMI_INIT_AMD_CPUS;
|
||||
}
|
||||
#endif
|
||||
if (init_flag_ & AMDSMI_INIT_AMD_GPUS) {
|
||||
@@ -270,7 +270,7 @@ amdsmi_status_t AMDSmiSystem::cleanup() {
|
||||
}
|
||||
processors_.clear();
|
||||
sockets_.clear();
|
||||
init_flag_ = AMDSMI_INIT_ALL_PROCESSORS;
|
||||
init_flag_ &= ~AMDSMI_INIT_AMD_GPUS;
|
||||
rsmi_status_t ret = rsmi_shut_down();
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
return amd::smi::rsmi_to_amdsmi_status(ret);
|
||||
|
||||
Reference in New Issue
Block a user