[SWDEV-538483] Add NPM API's and CLI (#817)
* Added Python & C API's for new node devices. Currently these are functional for node 0 only. - amdsmi_get_node_handle - amdsmi_get_npm_info * Added `amd-smi node` CLI for Node Power Management --------- Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com> Signed-off-by: Maisam Arif <Maisam.Arif@amd.com> Co-authored-by: Maisam Arif <Maisam.Arif@amd.com>
This commit is contained in:
committad av
GitHub
förälder
00a893d299
incheckning
f8e4771363
+4
-2
@@ -251,7 +251,8 @@ set(CMN_SRC_LIST
|
||||
"${ROCM_SRC_DIR}/rocm_smi_logger.cc"
|
||||
"${SHR_MUTEX_DIR}/shared_mutex.cc"
|
||||
"${ROCM_SRC_DIR}/rocm_smi_binary_parser.cc"
|
||||
"${ROCM_SRC_DIR}/rocm_smi_board_temp.cc")
|
||||
"${ROCM_SRC_DIR}/rocm_smi_board_temp.cc"
|
||||
"${ROCM_SRC_DIR}/rocm_smi_npm.cc")
|
||||
|
||||
if(ENABLE_ESMI_LIB)
|
||||
list(APPEND CMN_SRC_LIST ${ESMI_SRC_DIR}/e_smi.c)
|
||||
@@ -277,7 +278,8 @@ set(CMN_INC_LIST
|
||||
"${ROCM_INC_DIR}/rocm_smi_logger.h"
|
||||
"${SHR_MUTEX_DIR}/shared_mutex.h"
|
||||
"${ROCM_INC_DIR}/rocm_smi_binary_parser.h"
|
||||
"${ROCM_INC_DIR}/rocm_smi_board_temp.h")
|
||||
"${ROCM_INC_DIR}/rocm_smi_board_temp.h"
|
||||
"${ROCM_INC_DIR}/rocm_smi_npm.h")
|
||||
|
||||
add_subdirectory("rocm_smi")
|
||||
add_subdirectory("src")
|
||||
|
||||
@@ -159,6 +159,7 @@ if __name__ == "__main__":
|
||||
amd_smi_commands.xgmi,
|
||||
amd_smi_commands.partition,
|
||||
amd_smi_commands.ras,
|
||||
amd_smi_commands.node,
|
||||
amd_smi_commands.default,
|
||||
sys_argv=sys.argv,
|
||||
helpers=amd_smi_helpers)
|
||||
|
||||
@@ -53,6 +53,7 @@ class AMDSMICommands():
|
||||
self.device_handles = []
|
||||
self.cpu_handles = []
|
||||
self.core_handles = []
|
||||
self.node_handle = None
|
||||
self.stop = ''
|
||||
self.group_check_printed = False
|
||||
|
||||
@@ -75,6 +76,20 @@ class AMDSMICommands():
|
||||
logging.error('Unable to detect any GPU devices, check amdgpu version and module status (sudo modprobe amdgpu)')
|
||||
exit_flag = True
|
||||
|
||||
# Resolve the node handle.
|
||||
for dev in self.device_handles:
|
||||
try:
|
||||
nh = amdsmi_interface.amdsmi_get_node_handle(dev)
|
||||
if nh is not None:
|
||||
self.node_handle = nh
|
||||
continue
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED,
|
||||
amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL):
|
||||
logging.debug("Unable to get node handle: %s", e.get_error_info())
|
||||
else:
|
||||
raise e
|
||||
|
||||
if self.helpers.is_amd_hsmp_initialized():
|
||||
try:
|
||||
self.cpu_handles = amdsmi_interface.amdsmi_get_cpusocket_handles()
|
||||
@@ -7231,6 +7246,71 @@ class AMDSMICommands():
|
||||
time.sleep(1)
|
||||
|
||||
|
||||
def node(self, args, multiple_devices=False, nodes=None, power_management=None):
    """Display node power management (NPM) information.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices.
            Defaults to False.
        nodes (optional): Node handle to query. When neither this argument nor
            ``args.nodes`` is set, ``self.node_handle`` is used.
        power_management (optional): Value stored in ``args.power_management``
            when truthy. Defaults to None.

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # Value the library leaves in 'limit' when the node power limit could not
    # be read (UINT64_MAX); it must be reported as "N/A", not as a wattage.
    limit_unavailable = 0xFFFFFFFFFFFFFFFF
    power_unit = "W"

    # Set args.* to passed in arguments
    if nodes:
        args.nodes = nodes
    if power_management:
        args.power_management = power_management
    if getattr(args, 'nodes', None) is None:
        args.nodes = self.node_handle

    if not self.group_check_printed:
        self.helpers.check_required_groups(check_render=True, check_video=False)
        self.group_check_printed = True

    # Get NPM info; any failure degrades to "N/A" instead of aborting the command.
    npm_info = "N/A"
    if args.nodes is not None:
        try:
            npm_info = amdsmi_interface.amdsmi_get_npm_info(args.nodes)
        except amdsmi_exception.AmdSmiLibraryException as e:
            logging.debug("amdsmi_get_npm_info failed: %s", e.get_error_info())
    else:
        logging.debug('No node handle available to query NPM info')

    limit = "N/A"
    status = "N/A"
    if isinstance(npm_info, dict):
        limit = npm_info.get('limit', "N/A")
        if limit == limit_unavailable:
            limit = "N/A"

        raw_status = npm_info.get('status', npm_info.get('current', "N/A"))
        if raw_status == amdsmi_interface.amdsmi_wrapper.AMDSMI_NPM_STATUS_DISABLED:
            status = "DISABLED"
        elif raw_status != "N/A":
            # Only a status that was actually reported may be shown as enabled.
            status = "ENABLED"

    npm_dict = {"limit": limit, "status": status}

    # Log outputs
    if self.logger.is_human_readable_format() and self.logger.destination == 'stdout':
        # Append the unit only when there is a real value to qualify.
        limit_text = limit if limit == "N/A" else f"{limit} {power_unit}"
        print(f"NODE:\n POWER_MANAGEMENT:\n LIMIT: {limit_text}\n STATUS: {status}")
        return

    if self.logger.is_csv_format():
        self.logger.output = {'limit': limit, 'status': status}
    else:
        # For JSON and human readable format with file output
        npm_dict["limit"] = self.helpers.unit_format(self.logger, limit, power_unit)
        self.logger.output = {'node': {'power_management': npm_dict}}

    if multiple_devices:
        self.logger.store_multiple_device_output()
        return
    self.logger.print_output()
|
||||
|
||||
|
||||
def default(self, args):
|
||||
"""Display the default amdsmi view when no args are given."""
|
||||
|
||||
|
||||
@@ -624,6 +624,41 @@ class AMDSMIHelpers():
|
||||
return False, args.core
|
||||
|
||||
|
||||
# The below handle_nodes function is currently unused as only node 0 is supported.
|
||||
# Marked as a private function until it is needed in the future.
|
||||
def _handle_nodes(self, args, logger, subcommand):
|
||||
"""This function will run execute the subcommands based on the number
|
||||
of nodes passed in via args.
|
||||
params:
|
||||
args - argparser args to pass to subcommand
|
||||
current_platform_args (list) - GPU supported platform arguments
|
||||
current_platform_values (list) - GPU supported values for the arguments
|
||||
logger (AMDSMILogger) - Logger to print out output
|
||||
subcommand (AMDSMICommands) - Function that can handle multiple gpus
|
||||
|
||||
return:
|
||||
tuple(bool, device_handle) :
|
||||
bool - True if executed subcommand for multiple devices
|
||||
device_handle - Return the device_handle if the list of devices is a length of 1
|
||||
(handled_multiple_nodes, device_handle)
|
||||
|
||||
"""
|
||||
if isinstance(args.node, list):
|
||||
if len(args.node) > 1:
|
||||
for node_handle in args.node:
|
||||
# Handle multiple_devices to print all output at once
|
||||
subcommand(args, multiple_devices=True, node=node_handle)
|
||||
logger.print_output(multiple_device_enabled=True)
|
||||
return True, args.node
|
||||
elif len(args.node) == 1:
|
||||
args.node = args.node[0]
|
||||
return False, args.node
|
||||
else:
|
||||
logging.debug("args.node has an empty list")
|
||||
else:
|
||||
return False, args.node
|
||||
|
||||
|
||||
def handle_watch(self, args, subcommand, logger):
|
||||
"""This function will run the subcommand multiple times based
|
||||
on the passed watch, watch_time, and iterations passed in.
|
||||
|
||||
@@ -70,7 +70,8 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
"""
|
||||
def __init__(self, version, list, static, firmware, bad_pages, metric,
|
||||
process, profile, event, topology, set_value, reset, monitor,
|
||||
xgmi, partition, ras, default, sys_argv=None, helpers=None):
|
||||
xgmi, partition, ras, node, default, sys_argv=None,
|
||||
helpers=None):
|
||||
|
||||
# Helper variables
|
||||
if helpers is None:
|
||||
@@ -122,7 +123,8 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
# Store possible subcommands & aliases for later errors
|
||||
self.possible_commands = ['version', 'list', 'static', 'firmware', 'ucode', 'bad-pages',
|
||||
'metric', 'process', 'profile', 'event', 'topology', 'set',
|
||||
'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras', 'default']
|
||||
'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras',
|
||||
'node', 'default']
|
||||
|
||||
# Add all subparsers
|
||||
if sys_argv is not None:
|
||||
@@ -143,6 +145,7 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
self._add_xgmi_parser(self.subparsers, xgmi)
|
||||
self._add_partition_parser(self.subparsers, partition)
|
||||
self._add_ras_parser(self.subparsers, ras)
|
||||
self._add_node_parser(self.subparsers, node)
|
||||
elif any(arg in sys_argv for arg in ['version']):
|
||||
self._add_version_parser(self.subparsers, version)
|
||||
elif any(arg in sys_argv for arg in ['list']):
|
||||
@@ -175,6 +178,8 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
self._add_partition_parser(self.subparsers, partition)
|
||||
elif any(arg in sys_argv for arg in ['ras']):
|
||||
self._add_ras_parser(self.subparsers, ras)
|
||||
elif any(arg in sys_argv for arg in ['node']):
|
||||
self._add_node_parser(self.subparsers, node)
|
||||
else:
|
||||
# If no subcommand is given, add the default parser
|
||||
self._add_default_parser(self.subparsers, default)
|
||||
@@ -1564,6 +1569,32 @@ class AMDSMIParser(argparse.ArgumentParser):
|
||||
self._add_command_modifiers(ras_parser)
|
||||
|
||||
|
||||
def _add_node_parser(self, subparsers: argparse._SubParsersAction, func):
|
||||
if self.helpers.is_virtual_os():
|
||||
# This subparser is only available to Guest and Hypervisor systems
|
||||
return
|
||||
|
||||
# Subparser help text
|
||||
node_help = "Gets power information for the node"
|
||||
node_subcommand_help = f"{self.description}\n\nReturns information for node 0 on the system.\
|
||||
\nIf no node argument is provided, all node information will be displayed."
|
||||
node_optionals_title = "Node arguments"
|
||||
|
||||
# Help text for Node arguments
|
||||
power_management_help = "Displays power management information"
|
||||
|
||||
node_parser = subparsers.add_parser("node", help=node_help, description=node_subcommand_help)
|
||||
node_parser._optionals.title = node_optionals_title
|
||||
node_parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog)
|
||||
node_parser.set_defaults(func=func)
|
||||
|
||||
# Optional Args
|
||||
node_parser.add_argument('-p', '--power-management', action='store_true', required=False, help=power_management_help)
|
||||
|
||||
# Add Universal Arguments
|
||||
self._add_command_modifiers(node_parser)
|
||||
|
||||
|
||||
def error(self, message):
|
||||
outputformat = self.helpers.get_output_format()
|
||||
|
||||
|
||||
@@ -245,6 +245,13 @@ typedef enum {
|
||||
typedef void *amdsmi_processor_handle;
|
||||
typedef void *amdsmi_socket_handle;
|
||||
|
||||
/**
|
||||
* @brief Opaque handle that points to the underlying node implementation
|
||||
*
|
||||
* @cond @tag{gpu_bm_linux} @tag{host} @endcond
|
||||
*/
|
||||
typedef void *amdsmi_node_handle;
|
||||
|
||||
#ifdef ENABLE_ESMI_LIB
|
||||
|
||||
/**
|
||||
@@ -2141,6 +2148,27 @@ typedef enum {
|
||||
AMDSMI_AFFINITY_SCOPE_SOCKET //!< socket affinity
|
||||
} amdsmi_affinity_scope_t;
|
||||
|
||||
/**
 * @brief Node power management (NPM) status
 *
 * @cond @tag{gpu_bm_linux} @tag{host} @endcond
 */
typedef enum {
    AMDSMI_NPM_STATUS_DISABLED = 0,  //!< NPM is disabled
    AMDSMI_NPM_STATUS_ENABLED = 1    //!< NPM is enabled
} amdsmi_npm_status_t;

/**
 * @brief Node power management (NPM) information
 *
 * @cond @tag{gpu_bm_linux} @tag{host} @endcond
 */
typedef struct {
    amdsmi_npm_status_t status;  //!< NPM status (enabled/disabled).
    uint64_t limit;              //!< Node-level power limit in Watts.
    uint64_t reserved[6];        //!< Reserved.
} amdsmi_npm_info_t;
|
||||
|
||||
#ifdef ENABLE_ESMI_LIB
|
||||
|
||||
/**
|
||||
@@ -2625,6 +2653,28 @@ amdsmi_status_t amdsmi_get_processor_handles(amdsmi_socket_handle socket_handle,
|
||||
uint32_t *processor_count,
|
||||
amdsmi_processor_handle* processor_handles);
|
||||
|
||||
/**
|
||||
* @brief Get the node handle associated with processor handle.
|
||||
*
|
||||
* @ingroup tagProcDiscovery
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host}
|
||||
*
|
||||
* @details This function retrieves the node handle of a processor handler. The
|
||||
* @p processor_handle must be provided for the processor.
|
||||
* Currently, only AMD GPUs are supported.
|
||||
*
|
||||
* @param[in] processor_handle A pointer to a ::amdsmi_processor_handle, this
|
||||
* is required to be OAM ID 0 otherwise the API will fail. OAM ID is sourced
|
||||
* from amdsmi_get_gpu_asic_info API.
|
||||
*
|
||||
* @param[out] amdsmi_node_handle* A pointer to a block of memory where amdsmi_node_handle
|
||||
* will be written.
|
||||
*
|
||||
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
|
||||
*/
|
||||
amdsmi_status_t amdsmi_get_node_handle(amdsmi_processor_handle processor_handle, amdsmi_node_handle *node_handle);
|
||||
|
||||
|
||||
#ifdef ENABLE_ESMI_LIB
|
||||
/**
|
||||
@@ -6220,6 +6270,25 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a
|
||||
amdsmi_status_t amdsmi_get_gpu_xcd_counter(amdsmi_processor_handle processor_handle,
|
||||
uint16_t *xcd_count);
|
||||
|
||||
/**
|
||||
* @brief Retrieves node power management (NPM) status and power limit for the specified node.
|
||||
*
|
||||
* @ingroup tagNodeInfo
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host}
|
||||
*
|
||||
* @details This function queries the NPM controller for the given node and returns whether NPM is enabled,
|
||||
* along with the current node-level power limit in Watts. The NPM status and limit are set out-of-band
|
||||
* and reported via this API.
|
||||
*
|
||||
* @param[in] node_handle Handle to the Node to query.
|
||||
* @param[out] info Pointer to amdsmi_npm_info_t structure to receive NPM status and limit.
|
||||
* Must be allocated by the user.
|
||||
*
|
||||
* @return ::AMDSMI_STATUS_SUCCESS on success, non-zero on failure.
|
||||
*/
|
||||
amdsmi_status_t amdsmi_get_npm_info(amdsmi_node_handle node_handle, amdsmi_npm_info_t *info);
|
||||
|
||||
/** @} End tagAsicBoardInfo */
|
||||
|
||||
/*****************************************************************************/
|
||||
@@ -6482,6 +6551,7 @@ amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *
|
||||
|
||||
/** @} End tagProcessInfo */
|
||||
|
||||
|
||||
/*****************************************************************************/
|
||||
/** @defgroup tagDriverControl Driver control mechanisms
|
||||
* These functions provide control over the driver. Users should use with
|
||||
|
||||
@@ -4487,6 +4487,37 @@ def amdsmi_get_gpu_fan_speed_max(
|
||||
return fan_speed.value
|
||||
|
||||
|
||||
def amdsmi_get_node_handle(processor_handle):
    """Return the node handle associated with a processor handle.

    Args:
        processor_handle: An ``amdsmi_wrapper.amdsmi_processor_handle``.

    Returns:
        amdsmi_wrapper.amdsmi_node_handle: Handle filled in by the library.

    Raises:
        AmdSmiParameterException: If ``processor_handle`` is not a processor handle.
    """
    expected_type = amdsmi_wrapper.amdsmi_processor_handle
    if isinstance(processor_handle, expected_type):
        handle = amdsmi_wrapper.amdsmi_node_handle()
        # The library status is validated by _check_res before the handle is returned.
        status = amdsmi_wrapper.amdsmi_get_node_handle(processor_handle, ctypes.byref(handle))
        _check_res(status)
        return handle

    raise AmdSmiParameterException(processor_handle, expected_type)
|
||||
|
||||
|
||||
def amdsmi_get_npm_info(node_handle: amdsmi_wrapper.amdsmi_node_handle) -> Dict[str, Any]:
    """Query node power management (NPM) information for a node.

    Args:
        node_handle: Node handle, as returned by ``amdsmi_get_node_handle``.

    Returns:
        Dict[str, Any]: ``{"limit": <int>, "status": <int>}``. ``limit`` is the
        node-level power limit in Watts and ``status`` is the raw
        ``amdsmi_npm_status_t`` value. ``limit`` is 0xFFFFFFFFFFFFFFFF when the
        library could not read the limit.

    Raises:
        AmdSmiParameterException: If ``node_handle`` is not a node handle.
    """
    if not isinstance(node_handle, amdsmi_wrapper.amdsmi_node_handle):
        raise AmdSmiParameterException(node_handle, amdsmi_wrapper.amdsmi_node_handle)

    npm_info = amdsmi_wrapper.amdsmi_npm_info_t()
    # The library status is validated by _check_res before the fields are read.
    _check_res(
        amdsmi_wrapper.amdsmi_get_npm_info(
            node_handle, ctypes.byref(npm_info)
        )
    )

    return {
        "limit": npm_info.limit,
        "status": npm_info.status,
    }
|
||||
|
||||
|
||||
def amdsmi_get_temp_metric(
|
||||
processor_handle: processor_handle_t,
|
||||
sensor_type: AmdSmiTemperatureType,
|
||||
|
||||
@@ -261,6 +261,7 @@ AMDSMI_CONTAINER_DOCKER = 1
|
||||
amdsmi_container_types_t = ctypes.c_uint32 # enum
|
||||
amdsmi_processor_handle = ctypes.POINTER(None)
|
||||
amdsmi_socket_handle = ctypes.POINTER(None)
|
||||
amdsmi_node_handle = ctypes.POINTER(None)
|
||||
amdsmi_cpusocket_handle = ctypes.POINTER(None)
|
||||
class struct_amdsmi_hsmp_driver_version_t(Structure):
|
||||
pass
|
||||
@@ -2259,6 +2260,27 @@ amdsmi_affinity_scope_t__enumvalues = {
|
||||
AMDSMI_AFFINITY_SCOPE_NODE = 0
|
||||
AMDSMI_AFFINITY_SCOPE_SOCKET = 1
|
||||
amdsmi_affinity_scope_t = ctypes.c_uint32 # enum
|
||||
|
||||
# Enumeration 'amdsmi_npm_status_t': numeric values and their reverse name lookup.
AMDSMI_NPM_STATUS_DISABLED = 0
AMDSMI_NPM_STATUS_ENABLED = 1
amdsmi_npm_status_t__enumvalues = {
    AMDSMI_NPM_STATUS_DISABLED: 'AMDSMI_NPM_STATUS_DISABLED',
    AMDSMI_NPM_STATUS_ENABLED: 'AMDSMI_NPM_STATUS_ENABLED',
}
amdsmi_npm_status_t = ctypes.c_uint32 # enum


class struct_amdsmi_npm_info_t(Structure):
    # Packed layout with explicit padding so that 'limit' sits at byte offset 8.
    _pack_ = 1 # source:False
    _fields_ = [
        ('status', amdsmi_npm_status_t),
        ('PADDING_0', ctypes.c_ubyte * 4),
        ('limit', ctypes.c_uint64),
        ('reserved', ctypes.c_uint64 * 6),
    ]


amdsmi_npm_info_t = struct_amdsmi_npm_info_t
|
||||
class struct_amdsmi_smu_fw_version_t(Structure):
|
||||
pass
|
||||
|
||||
@@ -2489,6 +2511,9 @@ amdsmi_get_processor_handles_by_type.argtypes = [amdsmi_socket_handle, processor
|
||||
amdsmi_get_processor_handles = _libraries['libamd_smi.so'].amdsmi_get_processor_handles
|
||||
amdsmi_get_processor_handles.restype = amdsmi_status_t
|
||||
amdsmi_get_processor_handles.argtypes = [amdsmi_socket_handle, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.POINTER(None))]
|
||||
amdsmi_get_node_handle = _libraries['libamd_smi.so'].amdsmi_get_node_handle
|
||||
amdsmi_get_node_handle.restype = amdsmi_status_t
|
||||
amdsmi_get_node_handle.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.POINTER(None))]
|
||||
amdsmi_get_cpucore_handles = _libraries['libamd_smi.so'].amdsmi_get_cpucore_handles
|
||||
amdsmi_get_cpucore_handles.restype = amdsmi_status_t
|
||||
amdsmi_get_cpucore_handles.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.POINTER(None))]
|
||||
@@ -2966,6 +2991,9 @@ amdsmi_get_violation_status.argtypes = [amdsmi_processor_handle, ctypes.POINTER(
|
||||
amdsmi_get_gpu_process_list = _libraries['libamd_smi.so'].amdsmi_get_gpu_process_list
|
||||
amdsmi_get_gpu_process_list.restype = amdsmi_status_t
|
||||
amdsmi_get_gpu_process_list.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(struct_amdsmi_proc_info_t)]
|
||||
amdsmi_get_npm_info = _libraries['libamd_smi.so'].amdsmi_get_npm_info
|
||||
amdsmi_get_npm_info.restype = amdsmi_status_t
|
||||
amdsmi_get_npm_info.argtypes = [amdsmi_node_handle, ctypes.POINTER(struct_amdsmi_npm_info_t)]
|
||||
amdsmi_gpu_driver_reload = _libraries['libamd_smi.so'].amdsmi_gpu_driver_reload
|
||||
amdsmi_gpu_driver_reload.restype = amdsmi_status_t
|
||||
amdsmi_gpu_driver_reload.argtypes = []
|
||||
@@ -3231,7 +3259,8 @@ __all__ = \
|
||||
'AMDSMI_MEM_TYPE_GTT', 'AMDSMI_MEM_TYPE_LAST',
|
||||
'AMDSMI_MEM_TYPE_VIS_VRAM', 'AMDSMI_MEM_TYPE_VRAM',
|
||||
'AMDSMI_MM_UVD', 'AMDSMI_MM_VCE', 'AMDSMI_MM_VCN',
|
||||
'AMDSMI_MM__MAX', 'AMDSMI_POWER_CAP_TYPE_PPT0',
|
||||
'AMDSMI_MM__MAX', 'AMDSMI_NPM_STATUS_DISABLED',
|
||||
'AMDSMI_NPM_STATUS_ENABLED', 'AMDSMI_POWER_CAP_TYPE_PPT0',
|
||||
'AMDSMI_POWER_CAP_TYPE_PPT1', 'AMDSMI_PROCESSOR_TYPE_AMD_APU',
|
||||
'AMDSMI_PROCESSOR_TYPE_AMD_CPU',
|
||||
'AMDSMI_PROCESSOR_TYPE_AMD_CPU_CORE',
|
||||
@@ -3471,6 +3500,7 @@ __all__ = \
|
||||
'amdsmi_get_hsmp_metrics_table_version', 'amdsmi_get_lib_version',
|
||||
'amdsmi_get_link_metrics', 'amdsmi_get_link_topology_nearest',
|
||||
'amdsmi_get_minmax_bandwidth_between_processors',
|
||||
'amdsmi_get_node_handle', 'amdsmi_get_npm_info',
|
||||
'amdsmi_get_pcie_info', 'amdsmi_get_power_cap_info',
|
||||
'amdsmi_get_power_info',
|
||||
'amdsmi_get_processor_count_from_handles',
|
||||
@@ -3499,7 +3529,8 @@ __all__ = \
|
||||
'amdsmi_link_type_t', 'amdsmi_memory_page_status_t',
|
||||
'amdsmi_memory_partition_config_t',
|
||||
'amdsmi_memory_partition_type_t', 'amdsmi_memory_type_t',
|
||||
'amdsmi_mm_ip_t', 'amdsmi_name_value_t', 'amdsmi_nps_caps_t',
|
||||
'amdsmi_mm_ip_t', 'amdsmi_name_value_t', 'amdsmi_node_handle',
|
||||
'amdsmi_npm_info_t', 'amdsmi_npm_status_t', 'amdsmi_nps_caps_t',
|
||||
'amdsmi_od_vddc_point_t', 'amdsmi_od_volt_curve_t',
|
||||
'amdsmi_od_volt_freq_data_t', 'amdsmi_p2p_capability_t',
|
||||
'amdsmi_pcie_bandwidth_t', 'amdsmi_pcie_info_t',
|
||||
@@ -3570,8 +3601,8 @@ __all__ = \
|
||||
'struct_amdsmi_hsmp_metrics_table_t', 'struct_amdsmi_kfd_info_t',
|
||||
'struct_amdsmi_link_id_bw_type_t', 'struct_amdsmi_link_metrics_t',
|
||||
'struct_amdsmi_memory_partition_config_t',
|
||||
'struct_amdsmi_name_value_t', 'struct_amdsmi_od_vddc_point_t',
|
||||
'struct_amdsmi_od_volt_curve_t',
|
||||
'struct_amdsmi_name_value_t', 'struct_amdsmi_npm_info_t',
|
||||
'struct_amdsmi_od_vddc_point_t', 'struct_amdsmi_od_volt_curve_t',
|
||||
'struct_amdsmi_od_volt_freq_data_t',
|
||||
'struct_amdsmi_p2p_capability_t',
|
||||
'struct_amdsmi_pcie_bandwidth_t', 'struct_amdsmi_pcie_info_t',
|
||||
|
||||
@@ -596,6 +596,26 @@ typedef enum {
|
||||
RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
|
||||
} rsmi_temperature_type_t;
|
||||
|
||||
/**
 * @brief Node power management (NPM) status
 */
typedef enum {
  RSMI_NPM_STATUS_DISABLED = 0,  //!< NPM is disabled
  RSMI_NPM_STATUS_ENABLED = 1    //!< NPM is enabled
} rsmi_npm_status_t;

/**
 * @brief Node power management (NPM) information: status and power limit.
 */
typedef struct {
  rsmi_npm_status_t status;  //!< NPM status (enabled/disabled).
  uint64_t limit;            //!< Node-level power limit in Watts.
  uint64_t reserved[6];      //!< Reserved.
} rsmi_npm_info_t;
|
||||
|
||||
/**
|
||||
* @brief Activity (Utilization) Metrics. This enum is used to identify
|
||||
* various activity metrics.
|
||||
@@ -2892,6 +2912,9 @@ rsmi_status_t rsmi_dev_fan_speed_get(uint32_t dv_ind,
|
||||
rsmi_status_t rsmi_dev_fan_speed_max_get(uint32_t dv_ind,
|
||||
uint32_t sensor_ind, uint64_t *max_speed);
|
||||
|
||||
/**
 *  @brief Get the node power management (NPM) status and power limit.
 *
 *  @param[in] dv_ind a device index
 *
 *  @param[in] node_handle node handle passed as an integer; it is interpreted
 *  as the address of a std::string holding the board directory path. A value
 *  of 0 is rejected.
 *
 *  @param[inout] npm_info a pointer to an ::rsmi_npm_info_t that receives the
 *  NPM status and limit. The limit is UINT64_MAX when it could not be read.
 *
 *  @retval ::RSMI_STATUS_SUCCESS call was successful
 *  @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
 */
rsmi_status_t rsmi_dev_npm_info_get(uint32_t dv_ind, uintptr_t node_handle,
                                    rsmi_npm_info_t *npm_info);
|
||||
|
||||
/**
|
||||
* @brief Get the temperature metric value for the specified metric, from the
|
||||
* specified temperature sensor on the specified device.
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_NPM_H_
|
||||
#define ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_NPM_H_
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include <string>
|
||||
|
||||
|
||||
namespace amd::smi {
|
||||
|
||||
rsmi_status_t get_npm_board_status(const std::string &board_path,
|
||||
bool *enabled);
|
||||
|
||||
rsmi_status_t get_npm_board_limit(const std::string &board_path,
|
||||
uint64_t *limit);
|
||||
|
||||
}
|
||||
#endif // ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_NPM_H_
|
||||
@@ -55,6 +55,7 @@
|
||||
#include "rocm_smi/rocm_smi64Config.h"
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
#include "rocm_smi/rocm_smi_board_temp.h"
|
||||
#include "rocm_smi/rocm_smi_npm.h"
|
||||
|
||||
using amd::smi::monitorTypesToString;
|
||||
using amd::smi::getRSMIStatusString;
|
||||
@@ -3258,6 +3259,67 @@ rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent,
|
||||
CATCH
|
||||
}
|
||||
|
||||
// Fills *npm_info with the NPM status and power limit read from the board
// directory whose path is stored in the std::string that node_handle points to.
// A failure to read the status is returned to the caller; a failure to read
// the limit is only logged and reported as a UINT64_MAX limit.
rsmi_status_t
rsmi_dev_npm_info_get(uint32_t dv_ind, uintptr_t node_handle,
                      rsmi_npm_info_t *npm_info) {
  TRY
  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << "| ======= start =======, dv_ind=" << dv_ind;
  LOG_TRACE(ss);

  if (npm_info == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  CHK_SUPPORT_NAME_ONLY(npm_info)

  DEVICE_MUTEX

  if (node_handle == 0) {
    ss << __PRETTY_FUNCTION__ << " | node_handle == 0 -> returning "
       << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS);
    LOG_ERROR(ss);
    return RSMI_STATUS_INVALID_ARGS;
  }

  // The handle carries the address of the string that holds the board path.
  const std::string *node_board_path = reinterpret_cast<std::string*>(node_handle);
  const bool path_usable = node_board_path != nullptr && !node_board_path->empty();
  if (!path_usable) {
    ss << __PRETTY_FUNCTION__ << " | invalid/empty board path in node_handle";
    LOG_ERROR(ss);
    return RSMI_STATUS_INVALID_ARGS;
  }

  // The status is mandatory: without it the whole query fails.
  bool npm_enabled = false;
  rsmi_status_t rc = amd::smi::get_npm_board_status(*node_board_path, &npm_enabled);
  if (rc != RSMI_STATUS_SUCCESS) {
    ss << __PRETTY_FUNCTION__ << " | get_npm_board_status failed: "
       << getRSMIStatusString(rc);
    LOG_INFO(ss);
    return rc;
  }

  // The limit is optional: UINT64_MAX marks a limit that could not be read.
  uint64_t power_limit = UINT64_MAX;
  rc = amd::smi::get_npm_board_limit(*node_board_path, &power_limit);
  if (rc != RSMI_STATUS_SUCCESS) {
    ss << __PRETTY_FUNCTION__ << " | get_npm_board_limit returned "
       << getRSMIStatusString(rc) << " ; using sentinel limit";
    LOG_DEBUG(ss);
    power_limit = UINT64_MAX;
  }

  // Zero the whole structure (including reserved) before publishing results.
  std::memset(npm_info, 0, sizeof(*npm_info));
  npm_info->limit = power_limit;
  if (npm_enabled) {
    npm_info->status = RSMI_NPM_STATUS_ENABLED;
  } else {
    npm_info->status = RSMI_NPM_STATUS_DISABLED;
  }

  ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning "
     << getRSMIStatusString(RSMI_STATUS_SUCCESS);
  LOG_TRACE(ss);
  return RSMI_STATUS_SUCCESS;
  CATCH
}
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|
||||
rsmi_temperature_metric_t metric, int64_t *temperature) {
|
||||
@@ -7899,3 +7961,4 @@ rsmi_test_refcount(uint64_t refcnt_type) {
|
||||
return static_cast<int32_t>(smi.ref_count());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,100 @@
|
||||
/*
|
||||
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "rocm_smi/rocm_smi_npm.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi_logger.h"
#include <cerrno>
#include <charconv>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <map>
#include <sstream>
#include <string>
#include <system_error>
|
||||
|
||||
using amd::smi::getRSMIStatusString;
|
||||
|
||||
namespace amd::smi {
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
rsmi_status_t read_npm_file(const fs::path &path, std::string &out) {
|
||||
std::ifstream ifs(path);
|
||||
if (!ifs.is_open()) {
|
||||
return RSMI_STATUS_FILE_ERROR;
|
||||
}
|
||||
std::string line;
|
||||
if (!std::getline(ifs, line)) {
|
||||
return RSMI_STATUS_NO_DATA;
|
||||
}
|
||||
out = line;
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
rsmi_status_t get_npm_board_status(const std::string &board_path, bool *enabled) {
|
||||
if (enabled == nullptr) return RSMI_STATUS_INVALID_ARGS;
|
||||
if (board_path.empty()) return RSMI_STATUS_INVALID_ARGS;
|
||||
|
||||
fs::path bd(board_path);
|
||||
if (!fs::exists(bd) || !fs::is_directory(bd)) return RSMI_STATUS_NOT_SUPPORTED;
|
||||
|
||||
std::string s;
|
||||
rsmi_status_t r = read_npm_file(bd / "npm_status", s);
|
||||
if (r != RSMI_STATUS_SUCCESS) return RSMI_STATUS_NOT_SUPPORTED;
|
||||
|
||||
if (s == "enabled") {
|
||||
*enabled = true;
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
if (s == "disabled") {
|
||||
*enabled = false;
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
rsmi_status_t get_npm_board_limit(const std::string &board_path, uint64_t *limit) {
|
||||
if (limit == nullptr) return RSMI_STATUS_INVALID_ARGS;
|
||||
if (board_path.empty()) return RSMI_STATUS_INVALID_ARGS;
|
||||
|
||||
fs::path bd(board_path);
|
||||
if (!fs::exists(bd) || !fs::is_directory(bd)) return RSMI_STATUS_NOT_SUPPORTED;
|
||||
|
||||
fs::path p = bd / "cur_node_power_limit";
|
||||
if (!fs::exists(p) || !fs::is_regular_file(p)) return RSMI_STATUS_NOT_SUPPORTED;
|
||||
|
||||
std::string s;
|
||||
rsmi_status_t r = read_npm_file(p, s);
|
||||
if (r != RSMI_STATUS_SUCCESS) return RSMI_STATUS_NOT_SUPPORTED;
|
||||
|
||||
try {
|
||||
size_t idx = 0;
|
||||
unsigned long long v = std::stoull(s, &idx, 10);
|
||||
if (idx != s.size()) return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
*limit = static_cast<uint64_t>(v);
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
} catch (...) {
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
}
|
||||
|
||||
} // end namespace
|
||||
@@ -477,6 +477,78 @@ amdsmi_status_t amdsmi_get_processor_handles(amdsmi_socket_handle socket_handle,
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Resolves the node handle of a GPU processor handle. The handle is the address
// of a registry-owned std::string holding the board directory path
// /sys/class/drm/renderD<N>/device/board. Only a processor with OAM ID 0 whose
// board directory exposes npm_status is accepted; every other case yields
// AMDSMI_STATUS_NOT_SUPPORTED.
amdsmi_status_t amdsmi_get_node_handle(amdsmi_processor_handle processor_handle,
                                       amdsmi_node_handle *node_handle) {
    AMDSMI_CHECK_INIT();

    if (node_handle == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Node handles are only handed out for the processor with OAM ID 0.
    amdsmi_asic_info_t asic_info;
    amdsmi_status_t status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }
    if (asic_info.oam_id != 0) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    // The DRM render index selects the sysfs device directory.
    amdsmi_enumeration_info_t enumeration_info;
    status = amdsmi_get_gpu_enumeration_info(processor_handle, &enumeration_info);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    namespace fs = std::filesystem;

    const std::string render_node = "renderD" + std::to_string(enumeration_info.drm_render);
    const fs::path device_dir = fs::path("/sys/class/drm") / render_node / "device";

    fs::path board_dir;
    bool board_has_npm = false;
    try {
        board_dir = device_dir / "board";
        board_has_npm = fs::exists(board_dir)
                        && fs::is_directory(board_dir)
                        && fs::exists(board_dir / "npm_status");
    } catch (...) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }
    if (!board_has_npm) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    // The registry owns one string per board path so that a handle handed out
    // once stays valid for the lifetime of the library.
    static std::mutex registry_mutex;
    static std::map<std::string, std::unique_ptr<std::string>> registry;

    const std::string key = board_dir.string();
    {
        std::lock_guard<std::mutex> guard(registry_mutex);
        auto entry = registry.find(key);
        if (entry == registry.end()) {
            entry = registry.emplace(key, std::make_unique<std::string>(key)).first;
        }
        *node_handle = reinterpret_cast<amdsmi_node_handle>(entry->second.get());
    }

    return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
#ifdef ENABLE_ESMI_LIB
|
||||
amdsmi_status_t amdsmi_get_processor_count_from_handles(amdsmi_processor_handle* processor_handles,
|
||||
uint32_t* processor_count, uint32_t* nr_cpusockets,
|
||||
@@ -879,6 +951,36 @@ amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle
|
||||
return amdsmi_status;
|
||||
}
|
||||
|
||||
// Queries NPM status and power limit for the node behind node_handle and
// writes them to *npm_info. *npm_info is left untouched when the query fails.
// Returns AMDSMI_STATUS_INVAL for a null argument or an empty board path,
// otherwise the translated status of rsmi_dev_npm_info_get.
amdsmi_status_t amdsmi_get_npm_info(amdsmi_node_handle node_handle,
                                    amdsmi_npm_info_t *npm_info) {
    AMDSMI_CHECK_INIT();

    if (node_handle == nullptr || npm_info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // A node handle is the address of the string holding the board path.
    const auto *board_path = static_cast<const std::string *>(node_handle);
    if (board_path->empty()) {
        return AMDSMI_STATUS_INVAL;
    }

    rsmi_npm_info_t rsmi_npm_info = {};
    const rsmi_status_t rstatus =
        rsmi_dev_npm_info_get(0, reinterpret_cast<uintptr_t>(node_handle), &rsmi_npm_info);
    const amdsmi_status_t amdsmi_status = amd::smi::rsmi_to_amdsmi_status(rstatus);
    if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
        return amdsmi_status;
    }

    // Translate field by field instead of copying raw bytes, so the result does
    // not depend on both structures sharing one memory layout.
    amdsmi_npm_info_t translated = {};
    translated.status = (rsmi_npm_info.status == RSMI_NPM_STATUS_ENABLED)
                            ? AMDSMI_NPM_STATUS_ENABLED
                            : AMDSMI_NPM_STATUS_DISABLED;
    translated.limit = rsmi_npm_info.limit;
    *npm_info = translated;

    return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle,
|
||||
amdsmi_vram_usage_t *vram_info) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
|
||||
Referens i nytt ärende
Block a user