diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b7375ff0d..dd6374b6c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -251,7 +251,8 @@ set(CMN_SRC_LIST "${ROCM_SRC_DIR}/rocm_smi_logger.cc" "${SHR_MUTEX_DIR}/shared_mutex.cc" "${ROCM_SRC_DIR}/rocm_smi_binary_parser.cc" - "${ROCM_SRC_DIR}/rocm_smi_board_temp.cc") + "${ROCM_SRC_DIR}/rocm_smi_board_temp.cc" + "${ROCM_SRC_DIR}/rocm_smi_npm.cc") if(ENABLE_ESMI_LIB) list(APPEND CMN_SRC_LIST ${ESMI_SRC_DIR}/e_smi.c) @@ -277,7 +278,8 @@ set(CMN_INC_LIST "${ROCM_INC_DIR}/rocm_smi_logger.h" "${SHR_MUTEX_DIR}/shared_mutex.h" "${ROCM_INC_DIR}/rocm_smi_binary_parser.h" - "${ROCM_INC_DIR}/rocm_smi_board_temp.h") + "${ROCM_INC_DIR}/rocm_smi_board_temp.h" + "${ROCM_INC_DIR}/rocm_smi_npm.h") add_subdirectory("rocm_smi") add_subdirectory("src") diff --git a/amdsmi_cli/amdsmi_cli.py b/amdsmi_cli/amdsmi_cli.py index daaeb6b644..1c43d48c8c 100755 --- a/amdsmi_cli/amdsmi_cli.py +++ b/amdsmi_cli/amdsmi_cli.py @@ -159,6 +159,7 @@ if __name__ == "__main__": amd_smi_commands.xgmi, amd_smi_commands.partition, amd_smi_commands.ras, + amd_smi_commands.node, amd_smi_commands.default, sys_argv=sys.argv, helpers=amd_smi_helpers) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index fd7b38b8cf..ad35028681 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -53,6 +53,7 @@ class AMDSMICommands(): self.device_handles = [] self.cpu_handles = [] self.core_handles = [] + self.node_handle = None self.stop = '' self.group_check_printed = False @@ -75,6 +76,20 @@ class AMDSMICommands(): logging.error('Unable to detect any GPU devices, check amdgpu version and module status (sudo modprobe amdgpu)') exit_flag = True + # Resolve the node handle. 
+        for dev in self.device_handles:
+            try:
+                nh = amdsmi_interface.amdsmi_get_node_handle(dev)
+                if nh is not None:
+                    self.node_handle = nh
+                    break  # a single node handle is kept, so stop at the first one
+            except amdsmi_exception.AmdSmiLibraryException as e:
+                if e.err_code in (amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED,
+                                  amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL):
+                    logging.debug("Unable to get node handle: %s", e.get_error_info())
+                else:
+                    raise  # bare raise keeps the original traceback
+
         if self.helpers.is_amd_hsmp_initialized():
             try:
                 self.cpu_handles = amdsmi_interface.amdsmi_get_cpusocket_handles()
@@ -7231,6 +7246,74 @@ class AMDSMICommands():
             time.sleep(1)
 
 
+    def node(self, args, multiple_devices=False, nodes=None, power_management=None):
+        """Display node power management (NPM) information.
+
+        Args:
+            args (Namespace): Namespace containing the parsed CLI args
+            multiple_devices (bool, optional): True if checking for multiple devices.
+                Defaults to False.
+            nodes (optional): Node handle to query; overrides args.nodes when given.
+            power_management (bool, optional): Overrides args.power_management when given.
+
+        Returns:
+            None: Print output via AMDSMILogger to destination
+        """
+        # Set args.* to passed in arguments
+        if nodes:
+            args.nodes = nodes
+        if power_management:
+            args.power_management = power_management
+        if getattr(args, 'nodes', None) is None:
+            args.nodes = self.node_handle
+
+        if not self.group_check_printed:
+            self.helpers.check_required_groups(check_render=True, check_video=False)
+            self.group_check_printed = True
+
+        # Get NPM info; on any failure npm_info stays None and every field reads "N/A"
+        npm_info = None
+        if args.nodes is not None:
+            try:
+                npm_info = amdsmi_interface.amdsmi_get_npm_info(args.nodes)
+            except amdsmi_exception.AmdSmiLibraryException as e:
+                logging.debug("amdsmi_get_npm_info failed: %s", e.get_error_info())
+        else:
+            logging.debug('No node handle available to query NPM info')
+
+        # Log outputs
+        power_unit = "W"
+        limit = "N/A"
+        status = "N/A"
+        if isinstance(npm_info, dict):
+            raw_limit = npm_info.get('limit', "N/A")
+            # UINT64_MAX is the library's sentinel for "limit could not be read"
+            if raw_limit not in ("N/A", 0xFFFFFFFFFFFFFFFF):
+                limit = raw_limit
+            raw_status = npm_info.get('status', npm_info.get('current', "N/A"))
+            # Translate known enum values only; anything else stays "N/A"
+            status = {
+                amdsmi_interface.amdsmi_wrapper.AMDSMI_NPM_STATUS_DISABLED: "DISABLED",
+                amdsmi_interface.amdsmi_wrapper.AMDSMI_NPM_STATUS_ENABLED: "ENABLED",
+            }.get(raw_status, "N/A")
+        npm_dict = {"limit": limit, "status": status}
+
+        if self.logger.is_human_readable_format() and self.logger.destination == 'stdout':
+            limit_text = limit if limit == "N/A" else f"{limit} {power_unit}"
+            print(f"NODE:\n    POWER_MANAGEMENT:\n        LIMIT: {limit_text}\n        STATUS: {status}")
+        else:
+            if self.logger.is_csv_format():
+                self.logger.output = dict(npm_dict)
+            else:
+                # For JSON and human readable format with file output
+                npm_dict["limit"] = self.helpers.unit_format(self.logger, limit, power_unit)
+                self.logger.output = {'node': {'power_management': npm_dict}}
+            if multiple_devices:
+                self.logger.store_multiple_device_output()
+                return
+            self.logger.print_output()
+
+
     def default(self, args):
         """Display the default amdsmi view when no args are given."""
 
diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py
index 9f10d5b9df..b166a75f6c 100755
--- a/amdsmi_cli/amdsmi_helpers.py
+++ b/amdsmi_cli/amdsmi_helpers.py
@@ -624,6 +624,41 @@ class AMDSMIHelpers():
             return False, args.core
 
 
+    # The below handle_nodes function is currently unused as only node 0 is supported.
+    # Marked as a private function until it is needed in the future.
+    def _handle_nodes(self, args, logger, subcommand):
+        """This function will run execute the subcommands based on the number
+        of nodes passed in via args.
+        params:
+            args - argparser args to pass to subcommand; args.nodes holds the node handle(s)
+            logger (AMDSMILogger) - Logger to print out output
+            subcommand (AMDSMICommands) - Function that can handle multiple nodes
+
+        return:
+            tuple(bool, node_handle) :
+                bool - True if executed subcommand for multiple nodes
+                node_handle - the single node handle when exactly one was passed,
+                    the list when several were, or None for an empty list
+            (handled_multiple_nodes, node_handle)
+
+        """
+        if not isinstance(args.nodes, list):
+            return False, args.nodes
+        if len(args.nodes) > 1:
+            for node_handle in args.nodes:
+                # Handle multiple_devices to print all output at once
+                # (the keyword has to match the subcommand's `nodes` parameter)
+                subcommand(args, multiple_devices=True, nodes=node_handle)
+            logger.print_output(multiple_device_enabled=True)
+            return True, args.nodes
+        if len(args.nodes) == 1:
+            args.nodes = args.nodes[0]
+            return False, args.nodes
+        # Empty list: nothing to run, but still return a tuple so callers can unpack it
+        logging.debug("args.nodes has an empty list")
+        return False, None
+
+
     def handle_watch(self, args, subcommand, logger):
         """This function will run the subcommand multiple times based
             on the passed watch, watch_time, and iterations passed in.
diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py
index 558205df45..ffe8236935 100644
--- a/amdsmi_cli/amdsmi_parser.py
+++ b/amdsmi_cli/amdsmi_parser.py
@@ -70,7 +70,8 @@ class AMDSMIParser(argparse.ArgumentParser):
     """
     def __init__(self, version, list, static, firmware, bad_pages, metric,
                  process, profile, event, topology, set_value, reset, monitor,
-                 xgmi, partition, ras, default, sys_argv=None, helpers=None):
+                 xgmi, partition, ras, node, default, sys_argv=None,
+                 helpers=None):
 
         # Helper variables
         if helpers is None:
@@ -122,7 +123,8 @@ class AMDSMIParser(argparse.ArgumentParser):
         # Store possible subcommands & aliases for later errors
         self.possible_commands = ['version', 'list', 'static', 'firmware', 'ucode', 'bad-pages', 'metric',
                                   'process', 'profile', 'event', 'topology', 'set',
-                                  'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras', 'default']
+                                  'reset', 'monitor', 'dmon', 'xgmi', 'partition', 'ras',
+                                  'node', 'default']
 
         # Add all subparsers
         if sys_argv is not None:
@@ -143,6 +145,7 @@ class AMDSMIParser(argparse.ArgumentParser):
                 self._add_xgmi_parser(self.subparsers, xgmi)
                 self._add_partition_parser(self.subparsers, partition)
                 self._add_ras_parser(self.subparsers, ras)
+                self._add_node_parser(self.subparsers, node)
             elif any(arg in sys_argv for arg in ['version']):
                 self._add_version_parser(self.subparsers, version)
             elif any(arg in sys_argv for arg in ['list']):
@@ -175,6 +178,8 @@ class AMDSMIParser(argparse.ArgumentParser):
                 self._add_partition_parser(self.subparsers, partition)
             elif any(arg in sys_argv for arg in ['ras']):
                 self._add_ras_parser(self.subparsers, ras)
+            elif any(arg in sys_argv for arg in ['node']):
+                self._add_node_parser(self.subparsers, node)
             else:
                 # If no subcommand is given, add the default parser
                 self._add_default_parser(self.subparsers, default)
@@ -1564,6 +1569,32 @@ class AMDSMIParser(argparse.ArgumentParser):
         self._add_command_modifiers(ras_parser)
 
 
+    def _add_node_parser(self, subparsers: argparse._SubParsersAction, func):
+        if self.helpers.is_virtual_os():
+            # The node subcommand is not offered when running on a virtual OS
+            return
+
+        # Subparser help text
+        node_help = "Gets power information for the node"
+        node_subcommand_help = (f"{self.description}\n\nReturns information for node 0 on the system."
+                                "\nIf no node argument is provided, all node information will be displayed.")
+        node_optionals_title = "Node arguments"
+
+        # Help text for Node arguments
+        power_management_help = "Displays power management information"
+
+        node_parser = subparsers.add_parser("node", help=node_help, description=node_subcommand_help)
+        node_parser._optionals.title = node_optionals_title
+        node_parser.formatter_class = lambda prog: AMDSMISubparserHelpFormatter(prog)
+        node_parser.set_defaults(func=func)
+
+        # Optional Args
+        node_parser.add_argument('-p', '--power-management', action='store_true', required=False, help=power_management_help)
+
+        # Add Universal Arguments
+        self._add_command_modifiers(node_parser)
+
+
     def error(self, message):
         outputformat = self.helpers.get_output_format()
 
diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h
index d3b2bcdff2..da11c1ad9e 100644
--- a/include/amd_smi/amdsmi.h
+++ b/include/amd_smi/amdsmi.h
@@ -245,6 +245,13 @@ typedef enum {
 typedef void *amdsmi_processor_handle;
 typedef void *amdsmi_socket_handle;
 
+/**
+ * @brief Opaque handle that points to the underlying node implementation
+ *
+ * @cond @tag{gpu_bm_linux} @tag{host} @endcond
+ */
+typedef void *amdsmi_node_handle;
+
 #ifdef ENABLE_ESMI_LIB
 
 /**
@@ -2141,6 +2148,27 @@ typedef enum {
     AMDSMI_AFFINITY_SCOPE_SOCKET //!< socket affinity
 } amdsmi_affinity_scope_t;
 
+/**
+ * @brief NPM status
+ *
+ * @cond @tag{gpu_bm_linux} @tag{host} @endcond
+ */
+typedef enum {
+    AMDSMI_NPM_STATUS_DISABLED = 0,  //!< NPM is disabled
+    AMDSMI_NPM_STATUS_ENABLED = 1    //!< NPM is enabled
+} amdsmi_npm_status_t;
+
+/**
+ * @brief NPM info
+ *
+ * @cond @tag{gpu_bm_linux} @tag{host} @endcond
+ */
+typedef struct {
+    amdsmi_npm_status_t status;  //!< NPM status (enabled/disabled).
+    uint64_t limit;              //!< Node-level power limit in Watts; UINT64_MAX when it could not be read.
+    uint64_t reserved[6];
+} amdsmi_npm_info_t;
+
 #ifdef ENABLE_ESMI_LIB
 
 /**
@@ -2625,6 +2653,28 @@ amdsmi_status_t amdsmi_get_processor_handles(amdsmi_socket_handle socket_handle,
                                     uint32_t *processor_count,
                                     amdsmi_processor_handle* processor_handles);
 
+/**
+ * @brief Get the node handle associated with processor handle.
+ *
+ * @ingroup tagProcDiscovery
+ *
+ * @platform{gpu_bm_linux} @platform{host}
+ *
+ * @details This function retrieves the node handle of a processor handle. The
+ * @p processor_handle must be provided for the processor.
+ * Currently, only AMD GPUs are supported.
+ *
+ * @param[in] processor_handle The ::amdsmi_processor_handle to query; it is
+ * required to be OAM ID 0 otherwise the API will fail. OAM ID is sourced
+ * from amdsmi_get_gpu_asic_info API.
+ *
+ * @param[out] node_handle A pointer to a block of memory where the
+ * ::amdsmi_node_handle will be written.
+ *
+ * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
+ */
+amdsmi_status_t amdsmi_get_node_handle(amdsmi_processor_handle processor_handle, amdsmi_node_handle *node_handle);
+
 #ifdef ENABLE_ESMI_LIB
 
 /**
@@ -6220,6 +6270,25 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a
 amdsmi_status_t amdsmi_get_gpu_xcd_counter(amdsmi_processor_handle processor_handle,
                                             uint16_t *xcd_count);
 
+/**
+ * @brief Retrieves node power management (NPM) status and power limit for the specified node.
+ *
+ * @ingroup tagNodeInfo
+ *
+ * @platform{gpu_bm_linux} @platform{host}
+ *
+ * @details This function queries the NPM controller for the given node and returns whether NPM is enabled,
+ * along with the current node-level power limit in Watts. The NPM status and limit are set out-of-band
+ * and reported via this API.
+ *
+ * @param[in] node_handle Handle to the Node to query.
+ * @param[out] info Pointer to amdsmi_npm_info_t structure to receive NPM status and limit.
+ * Must be allocated by the user.
+ *
+ * @return ::AMDSMI_STATUS_SUCCESS on success, non-zero on failure.
+ */
+amdsmi_status_t amdsmi_get_npm_info(amdsmi_node_handle node_handle, amdsmi_npm_info_t *info);
+
 /** @} End tagAsicBoardInfo */
 
 /*****************************************************************************/
@@ -6482,6 +6551,7 @@ amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *
 
 /** @} End tagProcessInfo */
 
+
 /*****************************************************************************/
 /** @defgroup tagDriverControl Driver control mechanisms
  * These functions provide control over the driver. Users should use with
diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py
index 82ad0ad3b3..6146262cac 100644
--- a/py-interface/amdsmi_interface.py
+++ b/py-interface/amdsmi_interface.py
@@ -4487,6 +4487,47 @@ def amdsmi_get_gpu_fan_speed_max(
 
     return fan_speed.value
 
+
+def amdsmi_get_node_handle(
+    processor_handle: processor_handle_t,
+) -> amdsmi_wrapper.amdsmi_node_handle:
+    """Return the node handle associated with ``processor_handle``.
+
+    Raises AmdSmiParameterException when ``processor_handle`` has the wrong type;
+    the status of the library call is checked through ``_check_res``.
+    """
+    if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
+        raise AmdSmiParameterException(
+            processor_handle, amdsmi_wrapper.amdsmi_processor_handle
+        )
+
+    node_handle = amdsmi_wrapper.amdsmi_node_handle()
+    _check_res(
+        amdsmi_wrapper.amdsmi_get_node_handle(processor_handle, ctypes.byref(node_handle))
+    )
+    return node_handle
+
+
+def amdsmi_get_npm_info(node_handle: amdsmi_wrapper.amdsmi_node_handle) -> Dict[str, Any]:
+    """Return node power management (NPM) info for ``node_handle``.
+
+    The result has two keys: ``limit`` (node power limit, in Watts) and
+    ``status`` (an ``AMDSMI_NPM_STATUS_*`` value).
+    """
+    if not isinstance(node_handle, amdsmi_wrapper.amdsmi_node_handle):
+        raise AmdSmiParameterException(node_handle, amdsmi_wrapper.amdsmi_node_handle)
+
+    npm_info = amdsmi_wrapper.amdsmi_npm_info_t()
+    _check_res(
+        amdsmi_wrapper.amdsmi_get_npm_info(node_handle, ctypes.byref(npm_info))
+    )
+
+    return {
+        "limit": npm_info.limit,
+        "status": npm_info.status,
+    }
+
+
 def amdsmi_get_temp_metric(
     processor_handle: processor_handle_t,
     sensor_type: AmdSmiTemperatureType,
diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py
index 49082865d0..e703c145e6 100644
--- a/py-interface/amdsmi_wrapper.py
+++ b/py-interface/amdsmi_wrapper.py
@@ -261,6 +261,7 @@ AMDSMI_CONTAINER_DOCKER = 1
 amdsmi_container_types_t = ctypes.c_uint32 # enum
 amdsmi_processor_handle = ctypes.POINTER(None)
 amdsmi_socket_handle = ctypes.POINTER(None)
+amdsmi_node_handle = ctypes.POINTER(None)
 amdsmi_cpusocket_handle = ctypes.POINTER(None)
 class struct_amdsmi_hsmp_driver_version_t(Structure):
     pass
@@ -2259,6 +2260,27 @@ amdsmi_affinity_scope_t__enumvalues = {
 AMDSMI_AFFINITY_SCOPE_NODE = 0
 AMDSMI_AFFINITY_SCOPE_SOCKET = 1
 amdsmi_affinity_scope_t = ctypes.c_uint32 # enum
+
+# values for enumeration 'amdsmi_npm_status_t'
+amdsmi_npm_status_t__enumvalues = {
+    0: 'AMDSMI_NPM_STATUS_DISABLED',
+    1: 'AMDSMI_NPM_STATUS_ENABLED',
+}
+AMDSMI_NPM_STATUS_DISABLED = 0
+AMDSMI_NPM_STATUS_ENABLED = 1
+amdsmi_npm_status_t = ctypes.c_uint32 # enum
+class struct_amdsmi_npm_info_t(Structure):
+    pass
+
+struct_amdsmi_npm_info_t._pack_ = 1 # source:False
+struct_amdsmi_npm_info_t._fields_ = [
+    ('status', amdsmi_npm_status_t),
+    ('PADDING_0', ctypes.c_ubyte * 4),
+    ('limit', ctypes.c_uint64),
+    ('reserved', ctypes.c_uint64 * 6),
+]
+
+amdsmi_npm_info_t = struct_amdsmi_npm_info_t
 class struct_amdsmi_smu_fw_version_t(Structure):
     pass
 
@@ -2489,6 +2511,9 @@ amdsmi_get_processor_handles_by_type.argtypes = [amdsmi_socket_handle, processor
 amdsmi_get_processor_handles = _libraries['libamd_smi.so'].amdsmi_get_processor_handles
 amdsmi_get_processor_handles.restype = amdsmi_status_t
 amdsmi_get_processor_handles.argtypes = [amdsmi_socket_handle, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.POINTER(None))]
+amdsmi_get_node_handle = _libraries['libamd_smi.so'].amdsmi_get_node_handle
+amdsmi_get_node_handle.restype = amdsmi_status_t
+amdsmi_get_node_handle.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.POINTER(None))]
amdsmi_get_cpucore_handles = _libraries['libamd_smi.so'].amdsmi_get_cpucore_handles amdsmi_get_cpucore_handles.restype = amdsmi_status_t amdsmi_get_cpucore_handles.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.POINTER(None))] @@ -2966,6 +2991,9 @@ amdsmi_get_violation_status.argtypes = [amdsmi_processor_handle, ctypes.POINTER( amdsmi_get_gpu_process_list = _libraries['libamd_smi.so'].amdsmi_get_gpu_process_list amdsmi_get_gpu_process_list.restype = amdsmi_status_t amdsmi_get_gpu_process_list.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(struct_amdsmi_proc_info_t)] +amdsmi_get_npm_info = _libraries['libamd_smi.so'].amdsmi_get_npm_info +amdsmi_get_npm_info.restype = amdsmi_status_t +amdsmi_get_npm_info.argtypes = [amdsmi_node_handle, ctypes.POINTER(struct_amdsmi_npm_info_t)] amdsmi_gpu_driver_reload = _libraries['libamd_smi.so'].amdsmi_gpu_driver_reload amdsmi_gpu_driver_reload.restype = amdsmi_status_t amdsmi_gpu_driver_reload.argtypes = [] @@ -3231,7 +3259,8 @@ __all__ = \ 'AMDSMI_MEM_TYPE_GTT', 'AMDSMI_MEM_TYPE_LAST', 'AMDSMI_MEM_TYPE_VIS_VRAM', 'AMDSMI_MEM_TYPE_VRAM', 'AMDSMI_MM_UVD', 'AMDSMI_MM_VCE', 'AMDSMI_MM_VCN', - 'AMDSMI_MM__MAX', 'AMDSMI_POWER_CAP_TYPE_PPT0', + 'AMDSMI_MM__MAX', 'AMDSMI_NPM_STATUS_DISABLED', + 'AMDSMI_NPM_STATUS_ENABLED', 'AMDSMI_POWER_CAP_TYPE_PPT0', 'AMDSMI_POWER_CAP_TYPE_PPT1', 'AMDSMI_PROCESSOR_TYPE_AMD_APU', 'AMDSMI_PROCESSOR_TYPE_AMD_CPU', 'AMDSMI_PROCESSOR_TYPE_AMD_CPU_CORE', @@ -3471,6 +3500,7 @@ __all__ = \ 'amdsmi_get_hsmp_metrics_table_version', 'amdsmi_get_lib_version', 'amdsmi_get_link_metrics', 'amdsmi_get_link_topology_nearest', 'amdsmi_get_minmax_bandwidth_between_processors', + 'amdsmi_get_node_handle', 'amdsmi_get_npm_info', 'amdsmi_get_pcie_info', 'amdsmi_get_power_cap_info', 'amdsmi_get_power_info', 'amdsmi_get_processor_count_from_handles', @@ -3499,7 +3529,8 @@ __all__ = \ 'amdsmi_link_type_t', 'amdsmi_memory_page_status_t', 
'amdsmi_memory_partition_config_t', 'amdsmi_memory_partition_type_t', 'amdsmi_memory_type_t', - 'amdsmi_mm_ip_t', 'amdsmi_name_value_t', 'amdsmi_nps_caps_t', + 'amdsmi_mm_ip_t', 'amdsmi_name_value_t', 'amdsmi_node_handle', + 'amdsmi_npm_info_t', 'amdsmi_npm_status_t', 'amdsmi_nps_caps_t', 'amdsmi_od_vddc_point_t', 'amdsmi_od_volt_curve_t', 'amdsmi_od_volt_freq_data_t', 'amdsmi_p2p_capability_t', 'amdsmi_pcie_bandwidth_t', 'amdsmi_pcie_info_t', @@ -3570,8 +3601,8 @@ __all__ = \ 'struct_amdsmi_hsmp_metrics_table_t', 'struct_amdsmi_kfd_info_t', 'struct_amdsmi_link_id_bw_type_t', 'struct_amdsmi_link_metrics_t', 'struct_amdsmi_memory_partition_config_t', - 'struct_amdsmi_name_value_t', 'struct_amdsmi_od_vddc_point_t', - 'struct_amdsmi_od_volt_curve_t', + 'struct_amdsmi_name_value_t', 'struct_amdsmi_npm_info_t', + 'struct_amdsmi_od_vddc_point_t', 'struct_amdsmi_od_volt_curve_t', 'struct_amdsmi_od_volt_freq_data_t', 'struct_amdsmi_p2p_capability_t', 'struct_amdsmi_pcie_bandwidth_t', 'struct_amdsmi_pcie_info_t', diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 9474233b4f..09c91200f9 100644 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -596,6 +596,26 @@ typedef enum { RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type } rsmi_temperature_type_t; +/** + * @brief NPM status + * + */ +typedef enum { + RSMI_NPM_STATUS_DISABLED, + RSMI_NPM_STATUS_ENABLED +} rsmi_npm_status_t; + +/** + * @brief NPM info including status, limit. + * + */ +typedef struct +{ + rsmi_npm_status_t status; //!< NPM status (enabled/disabled). + uint64_t limit; //!< Node-level power limit in Watts. + uint64_t reserved[6]; +} rsmi_npm_info_t; + /** * @brief Activity (Utilization) Metrics. This enum is used to identify * various activity metrics. 
@@ -2892,6 +2912,9 @@ rsmi_status_t rsmi_dev_fan_speed_get(uint32_t dv_ind, rsmi_status_t rsmi_dev_fan_speed_max_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *max_speed); +rsmi_status_t rsmi_dev_npm_info_get(uint32_t dv_ind, + uintptr_t node_handle, rsmi_npm_info_t *npm_info); + /** * @brief Get the temperature metric value for the specified metric, from the * specified temperature sensor on the specified device. diff --git a/rocm_smi/include/rocm_smi/rocm_smi_npm.h b/rocm_smi/include/rocm_smi/rocm_smi_npm.h new file mode 100644 index 0000000000..f728fceacc --- /dev/null +++ b/rocm_smi/include/rocm_smi/rocm_smi_npm.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+#ifndef ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_NPM_H_
+#define ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_NPM_H_
+
+#include "rocm_smi/rocm_smi.h"
+#include <cstdint>
+#include <string>
+
+namespace amd::smi {
+// Reads "<board_path>/npm_status"; *enabled is written only on success.
+rsmi_status_t get_npm_board_status(const std::string &board_path,
+                                   bool *enabled);
+
+// Reads "<board_path>/cur_node_power_limit"; *limit is written only on success.
+rsmi_status_t get_npm_board_limit(const std::string &board_path,
+                                  uint64_t *limit);
+
+}  // namespace amd::smi
+#endif  // ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_NPM_H_
diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc
index 70eaab6253..8b6506c045 100644
--- a/rocm_smi/src/rocm_smi.cc
+++ b/rocm_smi/src/rocm_smi.cc
@@ -55,6 +55,7 @@
 #include "rocm_smi/rocm_smi64Config.h"
 #include "rocm_smi/rocm_smi_logger.h"
 #include "rocm_smi/rocm_smi_board_temp.h"
+#include "rocm_smi/rocm_smi_npm.h"
 
 using amd::smi::monitorTypesToString;
 using amd::smi::getRSMIStatusString;
@@ -3258,6 +3259,73 @@ rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent,
   CATCH
 }
 
+// Reports node power management (NPM) status and limit for a board directory.
+// node_handle must be the address of a live std::string holding the board's
+// sysfs path. A limit that cannot be read is returned as UINT64_MAX while the
+// call itself still succeeds.
+rsmi_status_t
+rsmi_dev_npm_info_get(uint32_t dv_ind, uintptr_t node_handle,
+                      rsmi_npm_info_t *npm_info) {
+  TRY
+  std::ostringstream ss;
+  ss << __PRETTY_FUNCTION__ << "| ======= start =======, dv_ind=" << dv_ind;
+  LOG_TRACE(ss);
+
+  if (npm_info == nullptr) {
+    return RSMI_STATUS_INVALID_ARGS;
+  }
+
+  CHK_SUPPORT_NAME_ONLY(npm_info)
+
+  DEVICE_MUTEX
+
+  if (node_handle == 0) {
+    ss << __PRETTY_FUNCTION__ << " | node_handle == 0 -> returning "
+       << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS);
+    LOG_ERROR(ss);
+    return RSMI_STATUS_INVALID_ARGS;
+  }
+
+  // node_handle is non-zero here, so the pointer itself cannot be null.
+  const auto *board_path_str =
+      reinterpret_cast<const std::string *>(node_handle);
+  if (board_path_str->empty()) {
+    ss << __PRETTY_FUNCTION__ << " | invalid/empty board path in node_handle";
+    LOG_ERROR(ss);
+    return RSMI_STATUS_INVALID_ARGS;
+  }
+
+  bool npm_status = false;
+  rsmi_status_t ret = amd::smi::get_npm_board_status(*board_path_str, &npm_status);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    ss << __PRETTY_FUNCTION__ << " | get_npm_board_status failed: "
+       << getRSMIStatusString(ret);
+    LOG_INFO(ss);
+    return ret;
+  }
+
+  // The limit is best-effort: UINT64_MAX is kept as the "unknown" sentinel.
+  uint64_t npm_limit = UINT64_MAX;
+  ret = amd::smi::get_npm_board_limit(*board_path_str, &npm_limit);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    ss << __PRETTY_FUNCTION__ << " | get_npm_board_limit returned "
+       << getRSMIStatusString(ret) << " ; using sentinel limit";
+    LOG_DEBUG(ss);
+    npm_limit = UINT64_MAX;
+  }
+
+  // fill output (value-initialise first so the reserved words are zero)
+  *npm_info = rsmi_npm_info_t{};
+  npm_info->status = npm_status ? RSMI_NPM_STATUS_ENABLED : RSMI_NPM_STATUS_DISABLED;
+  npm_info->limit = npm_limit;
+
+  ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning "
+     << getRSMIStatusString(RSMI_STATUS_SUCCESS);
+  LOG_TRACE(ss);
+  return RSMI_STATUS_SUCCESS;
+  CATCH
+}
+
 rsmi_status_t
 rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
                       rsmi_temperature_metric_t metric, int64_t *temperature) {
diff --git a/rocm_smi/src/rocm_smi_npm.cc b/rocm_smi/src/rocm_smi_npm.cc
new file mode 100644
index 0000000000..561271b583
--- /dev/null
+++ b/rocm_smi/src/rocm_smi_npm.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "rocm_smi/rocm_smi_npm.h"
+#include "rocm_smi/rocm_smi_utils.h"
+#include "rocm_smi/rocm_smi_common.h"
+#include "rocm_smi/rocm_smi_logger.h"
+#include <algorithm>
+#include <cctype>
+#include <cstdint>
+#include <exception>
+#include <filesystem>
+#include <fstream>
+#include <string>
+#include <system_error>
+#include <utility>
+
+using amd::smi::getRSMIStatusString;
+
+namespace amd::smi {
+
+namespace fs = std::filesystem;
+
+namespace {
+
+// Reads the first line of `path` into `out` with trailing whitespace removed,
+// so padded values still compare equal to the expected keywords.
+rsmi_status_t read_npm_file(const fs::path &path, std::string &out) {
+  std::ifstream ifs(path);
+  if (!ifs.is_open()) {
+    return RSMI_STATUS_FILE_ERROR;
+  }
+  std::string line;
+  if (!std::getline(ifs, line)) {
+    return RSMI_STATUS_NO_DATA;
+  }
+  const auto last = line.find_last_not_of(" \t\r\n");
+  line.erase(last == std::string::npos ? 0 : last + 1);
+  out = std::move(line);
+  return RSMI_STATUS_SUCCESS;
+}
+
+// True when `dir` exists and is a directory; any filesystem error counts as "no".
+bool is_board_dir(const fs::path &dir) {
+  std::error_code ec;
+  return fs::is_directory(dir, ec);
+}
+
+}  // namespace
+
+rsmi_status_t get_npm_board_status(const std::string &board_path, bool *enabled) {
+  if (enabled == nullptr || board_path.empty()) return RSMI_STATUS_INVALID_ARGS;
+
+  const fs::path board_dir(board_path);
+  if (!is_board_dir(board_dir)) return RSMI_STATUS_NOT_SUPPORTED;
+
+  std::string value;
+  if (read_npm_file(board_dir / "npm_status", value) != RSMI_STATUS_SUCCESS) {
+    return RSMI_STATUS_NOT_SUPPORTED;
+  }
+
+  if (value == "enabled") {
+    *enabled = true;
+    return RSMI_STATUS_SUCCESS;
+  }
+  if (value == "disabled") {
+    *enabled = false;
+    return RSMI_STATUS_SUCCESS;
+  }
+  return RSMI_STATUS_UNEXPECTED_DATA;
+}
+
+rsmi_status_t get_npm_board_limit(const std::string &board_path, uint64_t *limit) {
+  if (limit == nullptr || board_path.empty()) return RSMI_STATUS_INVALID_ARGS;
+
+  const fs::path board_dir(board_path);
+  if (!is_board_dir(board_dir)) return RSMI_STATUS_NOT_SUPPORTED;
+
+  std::error_code ec;
+  const fs::path limit_file = board_dir / "cur_node_power_limit";
+  if (!fs::is_regular_file(limit_file, ec)) return RSMI_STATUS_NOT_SUPPORTED;
+
+  std::string value;
+  if (read_npm_file(limit_file, value) != RSMI_STATUS_SUCCESS) {
+    return RSMI_STATUS_NOT_SUPPORTED;
+  }
+
+  // Accept plain decimal digits only: std::stoull by itself skips leading
+  // whitespace and wraps a negative value such as "-1" instead of failing.
+  const bool all_digits = !value.empty() &&
+      std::all_of(value.begin(), value.end(),
+                  [](unsigned char c) { return std::isdigit(c) != 0; });
+  if (!all_digits) return RSMI_STATUS_UNEXPECTED_DATA;
+
+  try {
+    *limit = static_cast<uint64_t>(std::stoull(value));
+    return RSMI_STATUS_SUCCESS;
+  } catch (const std::exception &) {  // std::out_of_range: value exceeds 64 bits
+    return RSMI_STATUS_UNEXPECTED_DATA;
+  }
+}
+
+}  // namespace amd::smi
diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc
index 4de244f22c..3e7be704b9 100644
--- a/src/amd_smi/amd_smi.cc
+++ b/src/amd_smi/amd_smi.cc
@@ -477,6 +477,63 @@ amdsmi_status_t amdsmi_get_processor_handles(amdsmi_socket_handle socket_handle,
     return AMDSMI_STATUS_SUCCESS;
 }
 
+// Resolves the node handle for a GPU. The handle is the address of a std::string
+// (kept in a function-local static registry) that holds the device's sysfs
+// "board" directory. Only a processor with OAM ID 0 whose board directory
+// exposes npm_status yields a handle; otherwise NOT_SUPPORTED is returned.
+amdsmi_status_t amdsmi_get_node_handle(amdsmi_processor_handle processor_handle,
+                                       amdsmi_node_handle *node_handle) {
+    AMDSMI_CHECK_INIT();
+
+    if (node_handle == nullptr) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    // Check if OAM ID is 0
+    amdsmi_asic_info_t asic_info = {};
+    amdsmi_status_t r = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
+    if (r != AMDSMI_STATUS_SUCCESS) {
+        return r;
+    }
+    if (asic_info.oam_id != 0) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+
+    // Get renderPath
+    amdsmi_enumeration_info_t enumeration_info = {};
+    r = amdsmi_get_gpu_enumeration_info(processor_handle, &enumeration_info);
+    if (r != AMDSMI_STATUS_SUCCESS) {
+        return r;
+    }
+
+    namespace fs = std::filesystem;
+
+    // Board directory: /sys/class/drm/renderD<N>/device/board
+    const fs::path board_dir = fs::path("/sys/class/drm") /
+        ("renderD" + std::to_string(enumeration_info.drm_render)) /
+        "device" / "board";
+
+    // error_code overloads never throw: a filesystem error means "not supported"
+    std::error_code ec;
+    if (!fs::is_directory(board_dir, ec) || !fs::exists(board_dir / "npm_status", ec)) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+
+    // Store board path so node handle remains valid for library lifetime.
+    static std::mutex g_node_mu;
+    static std::map<std::string, std::unique_ptr<std::string>> g_node_registry;
+
+    const std::string board_path = board_dir.string();
+    std::lock_guard<std::mutex> lk(g_node_mu);
+    auto &slot = g_node_registry[board_path];
+    if (!slot) {
+        slot = std::make_unique<std::string>(board_path);
+    }
+    *node_handle = reinterpret_cast<amdsmi_node_handle>(slot.get());
+
+    return AMDSMI_STATUS_SUCCESS;
+}
+
 #ifdef ENABLE_ESMI_LIB
 amdsmi_status_t amdsmi_get_processor_count_from_handles(amdsmi_processor_handle* processor_handles,
                                                         uint32_t* processor_count, uint32_t* nr_cpusockets,
@@ -879,6 +936,42 @@ amdsmi_status_t  amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle
     return amdsmi_status;
 }
 
+// Reads NPM status and node power limit for the node behind node_handle.
+// node_handle must come from amdsmi_get_node_handle(): it is interpreted as a
+// pointer to the board's sysfs path and is only checked for null/empty here.
+amdsmi_status_t amdsmi_get_npm_info(amdsmi_node_handle node_handle,
+                                    amdsmi_npm_info_t *npm_info) {
+    AMDSMI_CHECK_INIT();
+
+    if (node_handle == nullptr || npm_info == nullptr) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    // Verify board path from node_handle
+    const auto *board_path_str = reinterpret_cast<const std::string *>(node_handle);
+    if (board_path_str->empty()) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    rsmi_npm_info_t rsmi_npm_info = {};
+    const rsmi_status_t rstatus = rsmi_dev_npm_info_get(
+        0, reinterpret_cast<uintptr_t>(node_handle), &rsmi_npm_info);
+    const amdsmi_status_t amdsmi_status = amd::smi::rsmi_to_amdsmi_status(rstatus);
+    if (amdsmi_status != AMDSMI_STATUS_SUCCESS) {
+        return amdsmi_status;
+    }
+
+    // Translate field by field instead of memcpy so the result does not depend
+    // on the two structs sharing an identical layout and enum numbering.
+    *npm_info = amdsmi_npm_info_t{};
+    npm_info->status = (rsmi_npm_info.status == RSMI_NPM_STATUS_ENABLED)
+                           ? AMDSMI_NPM_STATUS_ENABLED
+                           : AMDSMI_NPM_STATUS_DISABLED;
+    npm_info->limit = rsmi_npm_info.limit;
+
+    return AMDSMI_STATUS_SUCCESS;
+}
+
 amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle,
             amdsmi_vram_usage_t *vram_info) {
     AMDSMI_CHECK_INIT();