[SWDEV-538483] Add NPM API's and CLI (#817)

* Added Python & C API's for new node devices. Currently these are functional for node 0 only.
 - amdsmi_get_node_handle
 - amdsmi_get_npm_info
* Added `amd-smi node` CLI for Node Power Management

---------

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Co-authored-by: Maisam Arif <Maisam.Arif@amd.com>
This commit is contained in:
Kanangot Balakrishnan, Bindhiya
2025-11-13 21:51:31 -06:00
committad av GitHub
förälder 00a893d299
incheckning f8e4771363
13 ändrade filer med 616 tillägg och 8 borttagningar
+4 -2
Visa fil
@@ -251,7 +251,8 @@ set(CMN_SRC_LIST
"${ROCM_SRC_DIR}/rocm_smi_logger.cc"
"${SHR_MUTEX_DIR}/shared_mutex.cc"
"${ROCM_SRC_DIR}/rocm_smi_binary_parser.cc"
"${ROCM_SRC_DIR}/rocm_smi_board_temp.cc")
"${ROCM_SRC_DIR}/rocm_smi_board_temp.cc"
"${ROCM_SRC_DIR}/rocm_smi_npm.cc")
if(ENABLE_ESMI_LIB)
list(APPEND CMN_SRC_LIST ${ESMI_SRC_DIR}/e_smi.c)
@@ -277,7 +278,8 @@ set(CMN_INC_LIST
"${ROCM_INC_DIR}/rocm_smi_logger.h"
"${SHR_MUTEX_DIR}/shared_mutex.h"
"${ROCM_INC_DIR}/rocm_smi_binary_parser.h"
"${ROCM_INC_DIR}/rocm_smi_board_temp.h")
"${ROCM_INC_DIR}/rocm_smi_board_temp.h"
"${ROCM_INC_DIR}/rocm_smi_npm.h")
add_subdirectory("rocm_smi")
add_subdirectory("src")
+1
Visa fil
@@ -159,6 +159,7 @@ if __name__ == "__main__":
amd_smi_commands.xgmi,
amd_smi_commands.partition,
amd_smi_commands.ras,
amd_smi_commands.node,
amd_smi_commands.default,
sys_argv=sys.argv,
helpers=amd_smi_helpers)
+80
Visa fil
@@ -53,6 +53,7 @@ class AMDSMICommands():
self.device_handles = []
self.cpu_handles = []
self.core_handles = []
self.node_handle = None
self.stop = ''
self.group_check_printed = False
@@ -75,6 +76,20 @@ class AMDSMICommands():
logging.error('Unable to detect any GPU devices, check amdgpu version and module status (sudo modprobe amdgpu)')
exit_flag = True
# Resolve the node handle. Only a single node (node 0) is supported, so stop
# at the first device that yields a handle instead of querying the rest.
for dev in self.device_handles:
    try:
        nh = amdsmi_interface.amdsmi_get_node_handle(dev)
    except amdsmi_exception.AmdSmiLibraryException as e:
        # Devices that cannot provide a node handle are expected; skip them.
        if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED,
                          amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL):
            logging.debug("Unable to get node handle: %s", e.get_error_info())
            continue
        # Anything else is unexpected: re-raise with the original traceback.
        raise
    if nh is not None:
        self.node_handle = nh
        break
if self.helpers.is_amd_hsmp_initialized():
try:
self.cpu_handles = amdsmi_interface.amdsmi_get_cpusocket_handles()
@@ -7231,6 +7246,71 @@ class AMDSMICommands():
time.sleep(1)
def node(self, args, multiple_devices=False, nodes=None, power_management=None):
    """Display node power management (NPM) information.

    Args:
        args (Namespace): Namespace containing the parsed CLI args
        multiple_devices (bool, optional): True if checking for multiple devices.
            When True the output is stored on the logger instead of printed.
            Defaults to False.
        nodes (optional): Node handle to query. When given it overrides
            args.nodes; when args.nodes is missing or None the handle resolved
            at start-up (self.node_handle) is used.
        power_management (bool, optional): When truthy, stored on
            args.power_management. Defaults to None.

    Returns:
        None: Print output via AMDSMILogger to destination
    """
    # The library reports UINT64_MAX in 'limit' when the power limit could not
    # be read; that value must not be shown as a real wattage.
    unknown_limit = 0xFFFFFFFFFFFFFFFF
    power_unit = "W"

    # Set args.* to passed in arguments
    if nodes:
        args.nodes = nodes
    if power_management:
        args.power_management = power_management
    if getattr(args, 'nodes', None) is None:
        args.nodes = self.node_handle

    if not self.group_check_printed:
        self.helpers.check_required_groups(check_render=True, check_video=False)
        self.group_check_printed = True

    # Get NPM info; any failure degrades to "N/A" rather than aborting the command
    npm_info = "N/A"
    if args.nodes is not None:
        try:
            npm_info = amdsmi_interface.amdsmi_get_npm_info(args.nodes)
        except amdsmi_exception.AmdSmiLibraryException as e:
            logging.debug("amdsmi_get_npm_info failed: %s", e.get_error_info())
    else:
        logging.debug('No node handle available to query NPM info')

    # Translate the raw API values into display values
    limit = "N/A"
    status = "N/A"
    if isinstance(npm_info, dict):
        wrapper = amdsmi_interface.amdsmi_wrapper
        raw_limit = npm_info.get('limit', "N/A")
        if raw_limit != unknown_limit:
            limit = raw_limit
        raw_status = npm_info.get('status', npm_info.get('current', "N/A"))
        # Only the two known enum values are named; anything else stays "N/A"
        # instead of being reported as ENABLED.
        if raw_status == wrapper.AMDSMI_NPM_STATUS_DISABLED:
            status = "DISABLED"
        elif raw_status == wrapper.AMDSMI_NPM_STATUS_ENABLED:
            status = "ENABLED"
    npm_dict = {"limit": limit, "status": status}

    # Log outputs
    if self.logger.is_human_readable_format() and self.logger.destination == 'stdout':
        # Do not append the unit to a missing value ("N/A W")
        limit_text = f"{limit} {power_unit}" if limit != "N/A" else "N/A"
        print(f"NODE:\n POWER_MANAGEMENT:\n LIMIT: {limit_text}\n STATUS: {status}")
        return

    if self.logger.is_csv_format():
        self.logger.output = dict(npm_dict)
    else:
        # For JSON and human readable format with file output
        npm_dict["limit"] = self.helpers.unit_format(self.logger, limit, power_unit)
        self.logger.output = {'node': {'power_management': npm_dict}}

    if multiple_devices:
        self.logger.store_multiple_device_output()
        return
    self.logger.print_output()
def default(self, args):
"""Display the default amdsmi view when no args are given."""
+35
Visa fil
@@ -624,6 +624,41 @@ class AMDSMIHelpers():
return False, args.core
# The below handle_nodes function is currently unused as only node 0 is supported.
# Marked as a private function until it is needed in the future.
def _handle_nodes(self, args, logger, subcommand):
"""This function will run execute the subcommands based on the number
of nodes passed in via args.
params:
args - argparser args to pass to subcommand
current_platform_args (list) - GPU supported platform arguments
current_platform_values (list) - GPU supported values for the arguments
logger (AMDSMILogger) - Logger to print out output
subcommand (AMDSMICommands) - Function that can handle multiple gpus
return:
tuple(bool, device_handle) :
bool - True if executed subcommand for multiple devices
device_handle - Return the device_handle if the list of devices is a length of 1
(handled_multiple_nodes, device_handle)
"""
if isinstance(args.node, list):
if len(args.node) > 1:
for node_handle in args.node:
# Handle multiple_devices to print all output at once
subcommand(args, multiple_devices=True, node=node_handle)
logger.print_output(multiple_device_enabled=True)
return True, args.node
elif len(args.node) == 1:
args.node = args.node[0]
return False, args.node
else:
logging.debug("args.node has an empty list")
else:
return False, args.node
def handle_watch(self, args, subcommand, logger):
"""This function will run the subcommand multiple times based
on the passed watch, watch_time, and iterations passed in.
+33 -2
Visa fil
@@ -70,7 +70,8 @@ class AMDSMIParser(argparse.ArgumentParser):
"""
def __init__(self, version, list, static, firmware, bad_pages, metric,
process, profile, event, topology, set_value, reset, monitor,
xgmi, partition, ras, default, sys_argv=None, helpers=None):
xgmi, partition, ras, node, default, sys_argv=None,
helpers=None):
# Helper variables
if helpers is None:
@@ -122,7 +123,8 @@ class AMDSMIParser(argparse.ArgumentParser):
# Store possible subcommands & aliases for later errors
self.possible_commands = ['version', 'list', 'static', 'firmware', 'ucode', 'bad-pages',
'metric', 'process', 'profile', 'event', 'topology', 'set',
'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras', 'default']
'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras',
'node', 'default']
# Add all subparsers
if sys_argv is not None:
@@ -143,6 +145,7 @@ class AMDSMIParser(argparse.ArgumentParser):
self._add_xgmi_parser(self.subparsers, xgmi)
self._add_partition_parser(self.subparsers, partition)
self._add_ras_parser(self.subparsers, ras)
self._add_node_parser(self.subparsers, node)
elif any(arg in sys_argv for arg in ['version']):
self._add_version_parser(self.subparsers, version)
elif any(arg in sys_argv for arg in ['list']):
@@ -175,6 +178,8 @@ class AMDSMIParser(argparse.ArgumentParser):
self._add_partition_parser(self.subparsers, partition)
elif any(arg in sys_argv for arg in ['ras']):
self._add_ras_parser(self.subparsers, ras)
elif any(arg in sys_argv for arg in ['node']):
self._add_node_parser(self.subparsers, node)
else:
# If no subcommand is given, add the default parser
self._add_default_parser(self.subparsers, default)
@@ -1564,6 +1569,32 @@ class AMDSMIParser(argparse.ArgumentParser):
self._add_command_modifiers(ras_parser)
def _add_node_parser(self, subparsers: argparse._SubParsersAction, func):
    """Register the ``node`` subcommand.

    Args:
        subparsers (argparse._SubParsersAction): Subparser collection the
            ``node`` parser is added to.
        func: Callable stored as the parser's ``func`` default; it is what
            runs when the ``node`` subcommand is selected.

    Returns:
        None
    """
    # The node subcommand is not registered when the helpers report a virtual OS;
    # the underlying C API is tagged for bare-metal Linux and host platforms only.
    if self.helpers.is_virtual_os():
        return

    # Subparser description shown by `amd-smi node --help`
    description = (
        f"{self.description}\n\nReturns information for node 0 on the system."
        "\nIf no node argument is provided, all node information will be displayed."
    )

    parser = subparsers.add_parser(
        "node",
        help="Gets power information for the node",
        description=description,
    )
    parser._optionals.title = "Node arguments"
    parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog)
    parser.set_defaults(func=func)

    # Optional Args
    parser.add_argument(
        '-p', '--power-management',
        action='store_true',
        required=False,
        help="Displays power management information",
    )

    # Add Universal Arguments
    self._add_command_modifiers(parser)
def error(self, message):
outputformat = self.helpers.get_output_format()
+70
Visa fil
@@ -245,6 +245,13 @@ typedef enum {
typedef void *amdsmi_processor_handle;
typedef void *amdsmi_socket_handle;
/**
 * @brief Opaque handle that identifies a node.
 *
 * A value of this type is produced by amdsmi_get_node_handle() and consumed by
 * amdsmi_get_npm_info(). Callers must treat it as opaque.
 *
 * @cond @tag{gpu_bm_linux} @tag{host} @endcond
 */
typedef void *amdsmi_node_handle;
#ifdef ENABLE_ESMI_LIB
/**
@@ -2141,6 +2148,27 @@ typedef enum {
AMDSMI_AFFINITY_SCOPE_SOCKET //!< socket affinity
} amdsmi_affinity_scope_t;
/**
 * @brief Node power management (NPM) status
 *
 * @cond @tag{gpu_bm_linux} @tag{host} @endcond
 */
typedef enum {
AMDSMI_NPM_STATUS_DISABLED, //!< NPM is disabled (value 0)
AMDSMI_NPM_STATUS_ENABLED //!< NPM is enabled (value 1)
} amdsmi_npm_status_t;
/**
 * @brief Node power management (NPM) information reported by amdsmi_get_npm_info()
 *
 * @cond @tag{gpu_bm_linux} @tag{host} @endcond
 */
typedef struct {
amdsmi_npm_status_t status; //!< NPM status (enabled/disabled).
uint64_t limit; //!< Node-level power limit in Watts; UINT64_MAX when the limit could not be read.
uint64_t reserved[6]; //!< Reserved; zero-filled by amdsmi_get_npm_info().
} amdsmi_npm_info_t;
#ifdef ENABLE_ESMI_LIB
/**
@@ -2625,6 +2653,28 @@ amdsmi_status_t amdsmi_get_processor_handles(amdsmi_socket_handle socket_handle,
uint32_t *processor_count,
amdsmi_processor_handle* processor_handles);
/**
 * @brief Get the node handle associated with a processor handle.
 *
 * @ingroup tagProcDiscovery
 *
 * @platform{gpu_bm_linux} @platform{host}
 *
 * @details This function retrieves the node handle associated with a processor
 * handle. Currently, only AMD GPUs are supported. The returned handle is owned
 * by the library and must not be freed by the caller.
 *
 * @param[in] processor_handle The ::amdsmi_processor_handle to look up. It is
 * required to report OAM ID 0, otherwise the API fails with
 * ::AMDSMI_STATUS_NOT_SUPPORTED. The OAM ID is sourced from the
 * amdsmi_get_gpu_asic_info API.
 *
 * @param[out] node_handle A pointer to a block of memory where the
 * ::amdsmi_node_handle will be written. Must not be NULL.
 *
 * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail.
 * ::AMDSMI_STATUS_INVAL is returned when @p node_handle is NULL, and
 * ::AMDSMI_STATUS_NOT_SUPPORTED when the processor is not OAM ID 0 or its
 * device exposes no board directory containing an npm_status entry.
 */
amdsmi_status_t amdsmi_get_node_handle(amdsmi_processor_handle processor_handle, amdsmi_node_handle *node_handle);
#ifdef ENABLE_ESMI_LIB
/**
@@ -6220,6 +6270,25 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a
amdsmi_status_t amdsmi_get_gpu_xcd_counter(amdsmi_processor_handle processor_handle,
uint16_t *xcd_count);
/**
 * @brief Retrieves node power management (NPM) status and power limit for the specified node.
 *
 * @ingroup tagNodeInfo
 *
 * @platform{gpu_bm_linux} @platform{host}
 *
 * @details This function queries the NPM controller for the given node and returns whether NPM is enabled,
 * along with the current node-level power limit in Watts. The NPM status and limit are set out-of-band
 * and reported via this API. If the status can be read but the limit cannot, the call still succeeds and
 * the limit is reported as UINT64_MAX.
 *
 * @param[in] node_handle Handle to the Node to query, as returned by amdsmi_get_node_handle().
 * Must not be NULL.
 * @param[out] info Pointer to amdsmi_npm_info_t structure to receive NPM status and limit.
 * Must be allocated by the user and must not be NULL.
 *
 * @return ::AMDSMI_STATUS_SUCCESS on success, non-zero on failure
 * (::AMDSMI_STATUS_INVAL when either argument is NULL).
 */
amdsmi_status_t amdsmi_get_npm_info(amdsmi_node_handle node_handle, amdsmi_npm_info_t *info);
/** @} End tagAsicBoardInfo */
/*****************************************************************************/
@@ -6482,6 +6551,7 @@ amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *
/** @} End tagProcessInfo */
/*****************************************************************************/
/** @defgroup tagDriverControl Driver control mechanisms
* These functions provide control over the driver. Users should use with
+31
Visa fil
@@ -4487,6 +4487,37 @@ def amdsmi_get_gpu_fan_speed_max(
return fan_speed.value
def amdsmi_get_node_handle(processor_handle):
    """Return the node handle associated with ``processor_handle``.

    Parameters:
        processor_handle: an ``amdsmi_wrapper.amdsmi_processor_handle``.

    Returns:
        The ``amdsmi_wrapper.amdsmi_node_handle`` filled in by the library.

    Raises:
        AmdSmiParameterException: if ``processor_handle`` has the wrong type.
        The library status is passed to ``_check_res``, which is expected to
        raise when the call did not succeed.
    """
    expected_type = amdsmi_wrapper.amdsmi_processor_handle
    if not isinstance(processor_handle, expected_type):
        raise AmdSmiParameterException(processor_handle, expected_type)

    handle = amdsmi_wrapper.amdsmi_node_handle()
    status = amdsmi_wrapper.amdsmi_get_node_handle(processor_handle, ctypes.byref(handle))
    _check_res(status)
    return handle
def amdsmi_get_npm_info(node_handle: "amdsmi_wrapper.amdsmi_node_handle") -> Dict[str, Any]:
    """Return node power management (NPM) information for ``node_handle``.

    Parameters:
        node_handle: an ``amdsmi_wrapper.amdsmi_node_handle``, as returned by
            ``amdsmi_get_node_handle``.

    Returns:
        dict with two keys:
            "limit"  - node-level power limit in Watts as reported by the
                       library (UINT64_MAX when the limit could not be read)
            "status" - raw ``amdsmi_npm_status_t`` value
                       (``AMDSMI_NPM_STATUS_DISABLED`` = 0,
                       ``AMDSMI_NPM_STATUS_ENABLED`` = 1)

    Raises:
        AmdSmiParameterException: if ``node_handle`` has the wrong type.
        The library status is passed to ``_check_res``, which is expected to
        raise when the call did not succeed.
    """
    if not isinstance(node_handle, amdsmi_wrapper.amdsmi_node_handle):
        raise AmdSmiParameterException(node_handle, amdsmi_wrapper.amdsmi_node_handle)

    npm_info = amdsmi_wrapper.amdsmi_npm_info_t()
    _check_res(
        amdsmi_wrapper.amdsmi_get_npm_info(node_handle, ctypes.byref(npm_info))
    )

    return {
        "limit": npm_info.limit,
        "status": npm_info.status,
    }
def amdsmi_get_temp_metric(
processor_handle: processor_handle_t,
sensor_type: AmdSmiTemperatureType,
+35 -4
Visa fil
@@ -261,6 +261,7 @@ AMDSMI_CONTAINER_DOCKER = 1
amdsmi_container_types_t = ctypes.c_uint32 # enum
amdsmi_processor_handle = ctypes.POINTER(None)
amdsmi_socket_handle = ctypes.POINTER(None)
amdsmi_node_handle = ctypes.POINTER(None)
amdsmi_cpusocket_handle = ctypes.POINTER(None)
class struct_amdsmi_hsmp_driver_version_t(Structure):
pass
@@ -2259,6 +2260,27 @@ amdsmi_affinity_scope_t__enumvalues = {
AMDSMI_AFFINITY_SCOPE_NODE = 0
AMDSMI_AFFINITY_SCOPE_SOCKET = 1
amdsmi_affinity_scope_t = ctypes.c_uint32 # enum
# values for enumeration 'amdsmi_npm_status_t'
amdsmi_npm_status_t__enumvalues = {
0: 'AMDSMI_NPM_STATUS_DISABLED',
1: 'AMDSMI_NPM_STATUS_ENABLED',
}
AMDSMI_NPM_STATUS_DISABLED = 0
AMDSMI_NPM_STATUS_ENABLED = 1
amdsmi_npm_status_t = ctypes.c_uint32 # enum
class struct_amdsmi_npm_info_t(Structure):
pass
struct_amdsmi_npm_info_t._pack_ = 1 # source:False
struct_amdsmi_npm_info_t._fields_ = [
('status', amdsmi_npm_status_t),
('PADDING_0', ctypes.c_ubyte * 4),
('limit', ctypes.c_uint64),
('reserved', ctypes.c_uint64 * 6),
]
amdsmi_npm_info_t = struct_amdsmi_npm_info_t
class struct_amdsmi_smu_fw_version_t(Structure):
pass
@@ -2489,6 +2511,9 @@ amdsmi_get_processor_handles_by_type.argtypes = [amdsmi_socket_handle, processor
amdsmi_get_processor_handles = _libraries['libamd_smi.so'].amdsmi_get_processor_handles
amdsmi_get_processor_handles.restype = amdsmi_status_t
amdsmi_get_processor_handles.argtypes = [amdsmi_socket_handle, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.POINTER(None))]
amdsmi_get_node_handle = _libraries['libamd_smi.so'].amdsmi_get_node_handle
amdsmi_get_node_handle.restype = amdsmi_status_t
amdsmi_get_node_handle.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.POINTER(None))]
amdsmi_get_cpucore_handles = _libraries['libamd_smi.so'].amdsmi_get_cpucore_handles
amdsmi_get_cpucore_handles.restype = amdsmi_status_t
amdsmi_get_cpucore_handles.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.POINTER(None))]
@@ -2966,6 +2991,9 @@ amdsmi_get_violation_status.argtypes = [amdsmi_processor_handle, ctypes.POINTER(
amdsmi_get_gpu_process_list = _libraries['libamd_smi.so'].amdsmi_get_gpu_process_list
amdsmi_get_gpu_process_list.restype = amdsmi_status_t
amdsmi_get_gpu_process_list.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(struct_amdsmi_proc_info_t)]
amdsmi_get_npm_info = _libraries['libamd_smi.so'].amdsmi_get_npm_info
amdsmi_get_npm_info.restype = amdsmi_status_t
amdsmi_get_npm_info.argtypes = [amdsmi_node_handle, ctypes.POINTER(struct_amdsmi_npm_info_t)]
amdsmi_gpu_driver_reload = _libraries['libamd_smi.so'].amdsmi_gpu_driver_reload
amdsmi_gpu_driver_reload.restype = amdsmi_status_t
amdsmi_gpu_driver_reload.argtypes = []
@@ -3231,7 +3259,8 @@ __all__ = \
'AMDSMI_MEM_TYPE_GTT', 'AMDSMI_MEM_TYPE_LAST',
'AMDSMI_MEM_TYPE_VIS_VRAM', 'AMDSMI_MEM_TYPE_VRAM',
'AMDSMI_MM_UVD', 'AMDSMI_MM_VCE', 'AMDSMI_MM_VCN',
'AMDSMI_MM__MAX', 'AMDSMI_POWER_CAP_TYPE_PPT0',
'AMDSMI_MM__MAX', 'AMDSMI_NPM_STATUS_DISABLED',
'AMDSMI_NPM_STATUS_ENABLED', 'AMDSMI_POWER_CAP_TYPE_PPT0',
'AMDSMI_POWER_CAP_TYPE_PPT1', 'AMDSMI_PROCESSOR_TYPE_AMD_APU',
'AMDSMI_PROCESSOR_TYPE_AMD_CPU',
'AMDSMI_PROCESSOR_TYPE_AMD_CPU_CORE',
@@ -3471,6 +3500,7 @@ __all__ = \
'amdsmi_get_hsmp_metrics_table_version', 'amdsmi_get_lib_version',
'amdsmi_get_link_metrics', 'amdsmi_get_link_topology_nearest',
'amdsmi_get_minmax_bandwidth_between_processors',
'amdsmi_get_node_handle', 'amdsmi_get_npm_info',
'amdsmi_get_pcie_info', 'amdsmi_get_power_cap_info',
'amdsmi_get_power_info',
'amdsmi_get_processor_count_from_handles',
@@ -3499,7 +3529,8 @@ __all__ = \
'amdsmi_link_type_t', 'amdsmi_memory_page_status_t',
'amdsmi_memory_partition_config_t',
'amdsmi_memory_partition_type_t', 'amdsmi_memory_type_t',
'amdsmi_mm_ip_t', 'amdsmi_name_value_t', 'amdsmi_nps_caps_t',
'amdsmi_mm_ip_t', 'amdsmi_name_value_t', 'amdsmi_node_handle',
'amdsmi_npm_info_t', 'amdsmi_npm_status_t', 'amdsmi_nps_caps_t',
'amdsmi_od_vddc_point_t', 'amdsmi_od_volt_curve_t',
'amdsmi_od_volt_freq_data_t', 'amdsmi_p2p_capability_t',
'amdsmi_pcie_bandwidth_t', 'amdsmi_pcie_info_t',
@@ -3570,8 +3601,8 @@ __all__ = \
'struct_amdsmi_hsmp_metrics_table_t', 'struct_amdsmi_kfd_info_t',
'struct_amdsmi_link_id_bw_type_t', 'struct_amdsmi_link_metrics_t',
'struct_amdsmi_memory_partition_config_t',
'struct_amdsmi_name_value_t', 'struct_amdsmi_od_vddc_point_t',
'struct_amdsmi_od_volt_curve_t',
'struct_amdsmi_name_value_t', 'struct_amdsmi_npm_info_t',
'struct_amdsmi_od_vddc_point_t', 'struct_amdsmi_od_volt_curve_t',
'struct_amdsmi_od_volt_freq_data_t',
'struct_amdsmi_p2p_capability_t',
'struct_amdsmi_pcie_bandwidth_t', 'struct_amdsmi_pcie_info_t',
+23
Visa fil
@@ -596,6 +596,26 @@ typedef enum {
RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
} rsmi_temperature_type_t;
/**
 * @brief Node power management (NPM) status
 *
 */
typedef enum {
RSMI_NPM_STATUS_DISABLED, //!< NPM is disabled (value 0)
RSMI_NPM_STATUS_ENABLED //!< NPM is enabled (value 1)
} rsmi_npm_status_t;
/**
 * @brief NPM info including status, limit. Filled in by rsmi_dev_npm_info_get().
 *
 */
typedef struct
{
rsmi_npm_status_t status; //!< NPM status (enabled/disabled).
uint64_t limit; //!< Node-level power limit in Watts; UINT64_MAX when the limit could not be read.
uint64_t reserved[6]; //!< Reserved; zero-filled by rsmi_dev_npm_info_get().
} rsmi_npm_info_t;
/**
* @brief Activity (Utilization) Metrics. This enum is used to identify
* various activity metrics.
@@ -2892,6 +2912,9 @@ rsmi_status_t rsmi_dev_fan_speed_get(uint32_t dv_ind,
rsmi_status_t rsmi_dev_fan_speed_max_get(uint32_t dv_ind,
uint32_t sensor_ind, uint64_t *max_speed);
/**
 * @brief Get the node power management (NPM) status and power limit.
 *
 * @details Reads the NPM status and the current node power limit from the
 * board directory referenced by @p node_handle and writes them to @p npm_info.
 * When the status is read successfully but the limit is not, the call still
 * succeeds and the limit is reported as UINT64_MAX.
 *
 * @param[in] dv_ind a device index
 *
 * @param[in] node_handle address of a std::string holding the board directory
 * path; must be non-zero and the string must not be empty.
 *
 * @param[inout] npm_info a pointer to an ::rsmi_npm_info_t that is zeroed and
 * then filled in on success. Must not be NULL.
 *
 * @retval ::RSMI_STATUS_SUCCESS call was successful
 * @retval ::RSMI_STATUS_INVALID_ARGS @p npm_info is NULL, @p node_handle is 0,
 * or the referenced board path is empty
 */
rsmi_status_t rsmi_dev_npm_info_get(uint32_t dv_ind,
uintptr_t node_handle, rsmi_npm_info_t *npm_info);
/**
* @brief Get the temperature metric value for the specified metric, from the
* specified temperature sensor on the specified device.
+39
Visa fil
@@ -0,0 +1,39 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_NPM_H_
#define ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_NPM_H_
#include "rocm_smi/rocm_smi.h"
#include <string>
namespace amd::smi {
// Reads "<board_path>/npm_status" and stores true in *enabled when the file
// says "enabled", false when it says "disabled".
// Returns RSMI_STATUS_INVALID_ARGS for a null `enabled` or empty `board_path`,
// RSMI_STATUS_NOT_SUPPORTED when the board directory is missing or the file
// cannot be read, and RSMI_STATUS_UNEXPECTED_DATA for any other content.
rsmi_status_t get_npm_board_status(const std::string &board_path,
bool *enabled);
// Reads "<board_path>/cur_node_power_limit", parses it as a base-10 unsigned
// integer and stores it in *limit (left untouched on failure).
// Returns RSMI_STATUS_INVALID_ARGS for a null `limit` or empty `board_path`,
// RSMI_STATUS_NOT_SUPPORTED when the board directory or the file is missing or
// cannot be read, and RSMI_STATUS_UNEXPECTED_DATA when the content is not a
// valid number.
rsmi_status_t get_npm_board_limit(const std::string &board_path,
uint64_t *limit);
}
#endif // ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_NPM_H_
+63
Visa fil
@@ -55,6 +55,7 @@
#include "rocm_smi/rocm_smi64Config.h"
#include "rocm_smi/rocm_smi_logger.h"
#include "rocm_smi/rocm_smi_board_temp.h"
#include "rocm_smi/rocm_smi_npm.h"
using amd::smi::monitorTypesToString;
using amd::smi::getRSMIStatusString;
@@ -3258,6 +3259,67 @@ rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent,
CATCH
}
// Reads the node power management (NPM) status and power limit for the board
// directory referenced by node_handle and stores them in *npm_info.
//
// dv_ind      - device index; logged here and available to the project macros
//               below (presumably used by them to resolve and lock the device
//               - confirm against their definitions).
// node_handle - address of a std::string holding the board directory path.
// npm_info    - output structure; zeroed and filled only on success.
//
// Returns RSMI_STATUS_INVALID_ARGS for a null npm_info, a zero node_handle or
// an empty board path; otherwise the status of get_npm_board_status() on
// failure, or RSMI_STATUS_SUCCESS.
rsmi_status_t
rsmi_dev_npm_info_get(uint32_t dv_ind, uintptr_t node_handle,
rsmi_npm_info_t *npm_info) {
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "| ======= start =======, dv_ind=" << dv_ind;
LOG_TRACE(ss);
// Reject a missing output buffer before any device lookup happens.
if (npm_info == nullptr) {
return RSMI_STATUS_INVALID_ARGS;
}
// Project macros; their expansion is not visible here, so statement order is kept as is.
CHK_SUPPORT_NAME_ONLY(npm_info)
DEVICE_MUTEX
if (node_handle == 0) {
ss << __PRETTY_FUNCTION__ << " | node_handle == 0 -> returning "
<< getRSMIStatusString(RSMI_STATUS_INVALID_ARGS);
LOG_ERROR(ss);
return RSMI_STATUS_INVALID_ARGS;
}
// The handle is the address of a std::string that stores the board directory path.
std::string *board_path_str = reinterpret_cast<std::string*>(node_handle);
if (board_path_str == nullptr || board_path_str->empty()) {
ss << __PRETTY_FUNCTION__ << " | invalid/empty board path in node_handle";
LOG_ERROR(ss);
return RSMI_STATUS_INVALID_ARGS;
}
bool npm_status = false;
// UINT64_MAX doubles as the "limit unavailable" sentinel reported to callers.
uint64_t npm_limit = UINT64_MAX;
// The status is mandatory: a failure here fails the whole call.
rsmi_status_t ret = amd::smi::get_npm_board_status(*board_path_str, &npm_status);
if (ret != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__ << " | get_npm_board_status failed: "
<< getRSMIStatusString(ret);
LOG_INFO(ss);
return ret;
}
// The limit is optional: on failure the sentinel is reported and the call still succeeds.
ret = amd::smi::get_npm_board_limit(*board_path_str, &npm_limit);
if (ret != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__ << " | get_npm_board_limit returned "
<< getRSMIStatusString(ret) << " ; using sentinel limit";
LOG_DEBUG(ss);
npm_limit = UINT64_MAX;
}
// fill output (the memset also clears the reserved fields)
std::memset(npm_info, 0, sizeof(*npm_info));
npm_info->status = npm_status ? RSMI_NPM_STATUS_ENABLED : RSMI_NPM_STATUS_DISABLED;
npm_info->limit = npm_limit;
ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning "
<< getRSMIStatusString(RSMI_STATUS_SUCCESS);
LOG_TRACE(ss);
return RSMI_STATUS_SUCCESS;
CATCH
}
rsmi_status_t
rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
rsmi_temperature_metric_t metric, int64_t *temperature) {
@@ -7899,3 +7961,4 @@ rsmi_test_refcount(uint64_t refcnt_type) {
return static_cast<int32_t>(smi.ref_count());
}
+100
Visa fil
@@ -0,0 +1,100 @@
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "rocm_smi/rocm_smi_npm.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi_logger.h"
#include <cerrno>
#include <charconv>
#include <cstdint>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <map>
#include <sstream>
#include <string>
#include <system_error>
#include <utility>
using amd::smi::getRSMIStatusString;
namespace amd::smi {

namespace fs = std::filesystem;

namespace {

// Characters ignored around a value read from a board file.
constexpr const char *kNpmWhitespace = " \t\r\n\f\v";

// Returns `text` without leading and trailing whitespace.
std::string trim_npm_value(const std::string &text) {
  const std::string::size_type first = text.find_first_not_of(kNpmWhitespace);
  if (first == std::string::npos) {
    return std::string();
  }
  const std::string::size_type last = text.find_last_not_of(kNpmWhitespace);
  return text.substr(first, last - first + 1);
}

// True when `dir` exists and is a directory. Uses the non-throwing overload so
// that file-system errors are reported as "not there" instead of an exception.
bool is_npm_board_dir(const fs::path &dir) {
  std::error_code ec;
  return fs::is_directory(dir, ec);
}

// True when `file` exists and is a regular file; never throws.
bool is_npm_regular_file(const fs::path &file) {
  std::error_code ec;
  return fs::is_regular_file(file, ec);
}

}  // namespace

// Reads the first line of `path` into `out` (without the line terminator).
// Returns RSMI_STATUS_FILE_ERROR when the file cannot be opened and
// RSMI_STATUS_NO_DATA when no line can be read; `out` is only written on success.
rsmi_status_t read_npm_file(const fs::path &path, std::string &out) {
  std::ifstream ifs(path);
  if (!ifs.is_open()) {
    return RSMI_STATUS_FILE_ERROR;
  }

  std::string line;
  if (!std::getline(ifs, line)) {
    return RSMI_STATUS_NO_DATA;
  }

  out = std::move(line);
  return RSMI_STATUS_SUCCESS;
}

// Reads "<board_path>/npm_status" and reports whether NPM is enabled.
// Returns RSMI_STATUS_INVALID_ARGS for a null `enabled` or empty `board_path`,
// RSMI_STATUS_NOT_SUPPORTED when the board directory is missing or the file
// cannot be read, and RSMI_STATUS_UNEXPECTED_DATA when the content is neither
// "enabled" nor "disabled". `*enabled` is only written on success.
rsmi_status_t get_npm_board_status(const std::string &board_path, bool *enabled) {
  if (enabled == nullptr || board_path.empty()) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  const fs::path board_dir(board_path);
  if (!is_npm_board_dir(board_dir)) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  std::string raw;
  if (read_npm_file(board_dir / "npm_status", raw) != RSMI_STATUS_SUCCESS) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  // Tolerate stray whitespace (for example a trailing '\r') around the keyword.
  const std::string value = trim_npm_value(raw);
  if (value == "enabled") {
    *enabled = true;
    return RSMI_STATUS_SUCCESS;
  }
  if (value == "disabled") {
    *enabled = false;
    return RSMI_STATUS_SUCCESS;
  }
  return RSMI_STATUS_UNEXPECTED_DATA;
}

// Reads "<board_path>/cur_node_power_limit" and parses it as a base-10
// unsigned integer.
// Returns RSMI_STATUS_INVALID_ARGS for a null `limit` or empty `board_path`,
// RSMI_STATUS_NOT_SUPPORTED when the board directory or the file is missing or
// cannot be read, and RSMI_STATUS_UNEXPECTED_DATA when the content is not a
// plain decimal number that fits in 64 bits. `*limit` is only written on success.
rsmi_status_t get_npm_board_limit(const std::string &board_path, uint64_t *limit) {
  if (limit == nullptr || board_path.empty()) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  const fs::path board_dir(board_path);
  if (!is_npm_board_dir(board_dir)) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  const fs::path limit_file = board_dir / "cur_node_power_limit";
  if (!is_npm_regular_file(limit_file)) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  std::string raw;
  if (read_npm_file(limit_file, raw) != RSMI_STATUS_SUCCESS) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  // std::from_chars accepts digits only: unlike std::stoull it rejects a sign,
  // so "-1" can no longer wrap around into a huge limit, and it reports
  // overflow through the error code instead of an exception.
  const std::string value = trim_npm_value(raw);
  const char *const first = value.data();
  const char *const last = first + value.size();
  uint64_t parsed = 0;
  const std::from_chars_result res = std::from_chars(first, last, parsed, 10);
  if (res.ec != std::errc() || res.ptr != last) {
    return RSMI_STATUS_UNEXPECTED_DATA;
  }

  *limit = parsed;
  return RSMI_STATUS_SUCCESS;
}

} // end namespace
+102
Visa fil
@@ -477,6 +477,78 @@ amdsmi_status_t amdsmi_get_processor_handles(amdsmi_socket_handle socket_handle,
return AMDSMI_STATUS_SUCCESS;
}
// Resolves the node handle for `processor_handle`.
//
// A handle is only produced for the GPU that reports OAM ID 0 and whose DRM
// render device exposes a "board" directory containing "npm_status". The
// handle is the address of a std::string (the board path) owned by a
// function-local registry, so it must not be freed by the caller.
//
// Returns AMDSMI_STATUS_INVAL for a null `node_handle`, the failing status of
// the asic/enumeration queries, AMDSMI_STATUS_NOT_SUPPORTED when the GPU is not
// OAM ID 0 or the board directory is unavailable, AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_node_handle(amdsmi_processor_handle processor_handle,
                                       amdsmi_node_handle *node_handle) {
    AMDSMI_CHECK_INIT();

    if (node_handle == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Only the GPU with OAM ID 0 can provide a node handle.
    amdsmi_asic_info_t asic_info;
    amdsmi_status_t status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }
    if (asic_info.oam_id != 0) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    // The DRM render minor selects the sysfs device directory.
    amdsmi_enumeration_info_t enumeration_info;
    status = amdsmi_get_gpu_enumeration_info(processor_handle, &enumeration_info);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    namespace fs = std::filesystem;

    // /sys/class/drm/renderD<N>/device
    const std::string render_node = "renderD" + std::to_string(enumeration_info.drm_render);
    fs::path drm_device_path = fs::path("/sys/class/drm") / render_node / "device";

    fs::path found_board;
    try {
        const fs::path board_dir = drm_device_path / "board";
        const fs::path npm_status = board_dir / "npm_status";

        // The board counts as present only when both the directory and npm_status exist.
        const bool board_present =
            fs::exists(board_dir) && fs::is_directory(board_dir) && fs::exists(npm_status);
        if (board_present) {
            found_board = board_dir;
        }
    } catch (...) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    if (found_board.empty()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    // One heap-allocated path string per board; its address is the handle.
    static std::mutex registry_mutex;
    static std::map<std::string, std::unique_ptr<std::string>> registry;

    const std::string board_path = found_board.string();

    std::lock_guard<std::mutex> guard(registry_mutex);
    std::unique_ptr<std::string> &slot = registry[board_path];
    if (!slot) {
        slot = std::make_unique<std::string>(board_path);
    }
    *node_handle = reinterpret_cast<amdsmi_node_handle>(slot.get());

    return AMDSMI_STATUS_SUCCESS;
}
#ifdef ENABLE_ESMI_LIB
amdsmi_status_t amdsmi_get_processor_count_from_handles(amdsmi_processor_handle* processor_handles,
uint32_t* processor_count, uint32_t* nr_cpusockets,
@@ -879,6 +951,36 @@ amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle
return amdsmi_status;
}
// Queries node power management (NPM) status and power limit for `node_handle`.
//
// `node_handle` must be a handle produced by amdsmi_get_node_handle(); it is
// the address of a std::string holding the board directory path. `npm_info`
// is overwritten only when the query succeeds.
//
// Returns AMDSMI_STATUS_INVAL for a null argument or an empty board path, the
// translated ROCm SMI status when the query fails, AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_npm_info(amdsmi_node_handle node_handle,
                                    amdsmi_npm_info_t *npm_info) {
    AMDSMI_CHECK_INIT();

    if (node_handle == nullptr || npm_info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Verify board path from node_handle
    const auto *board_path_str = static_cast<const std::string *>(node_handle);
    if (board_path_str->empty()) {
        return AMDSMI_STATUS_INVAL;
    }

    rsmi_npm_info_t rsmi_npm_info = {};
    const rsmi_status_t rstatus =
        rsmi_dev_npm_info_get(0, reinterpret_cast<uintptr_t>(node_handle), &rsmi_npm_info);
    const amdsmi_status_t amdsmi_status = amd::smi::rsmi_to_amdsmi_status(rstatus);
    if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
        return amdsmi_status;
    }

    // Convert field by field instead of memcpy-ing between two distinct struct
    // types, so the result does not depend on both layouts staying identical.
    *npm_info = amdsmi_npm_info_t{};  // also zeroes the reserved fields
    npm_info->status = (rsmi_npm_info.status == RSMI_NPM_STATUS_ENABLED)
                           ? AMDSMI_NPM_STATUS_ENABLED
                           : AMDSMI_NPM_STATUS_DISABLED;
    npm_info->limit = rsmi_npm_info.limit;

    return AMDSMI_STATUS_SUCCESS;
}
amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle,
amdsmi_vram_usage_t *vram_info) {
AMDSMI_CHECK_INIT();