From b999f866110b63d10e293d855de650b86bdb0fb8 Mon Sep 17 00:00:00 2001 From: "Mewar, Deepak" Date: Tue, 20 May 2025 11:36:09 +0530 Subject: [PATCH] [SWDEV-512393] Added amdsmi_get_cpu_affinity_with_scope (#198) Signed-off-by: Maisam Arif Signed-off-by: Deepak Mewar --- CHANGELOG.md | 4 + amdsmi_cli/amdsmi_commands.py | 18 +++- docs/reference/amdsmi-py-api.md | 37 +++++++ example/CMakeLists.txt | 6 +- include/amd_smi/amdsmi.h | 91 ++++++++++++++++-- include/amd_smi/impl/amd_smi_gpu_device.h | 1 + include/amd_smi/impl/amd_smi_system.h | 6 ++ include/amd_smi/impl/amd_smi_utils.h | 16 ++++ py-interface/__init__.py | 6 ++ py-interface/amdsmi_interface.py | 49 ++++++++++ py-interface/amdsmi_wrapper.py | 68 +++++++++---- src/amd_smi/amd_smi.cc | 112 +++++++++++++++++++++- src/amd_smi/amd_smi_gpu_device.cc | 34 ++++++- src/amd_smi/amd_smi_system.cc | 81 +++++++++++++++- src/amd_smi/amd_smi_utils.cc | 34 +++++-- 15 files changed, 524 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 853791a3a7..f08e1e90a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,10 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr - Added new C and Python API `amdsmi_get_cpu_model_name` - Not sourced from esmi library. +- **Added `amdsmi_get_cpu_affinity_with_scope()`**. + +### Added + - **Added support for GPU metrics 1.8**. 
- Added new fields for `amdsmi_gpu_xcp_metrics_t` including: - Adding the following metrics to allow new calculations for violation status: diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 0a483b42f0..8787a9e75f 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -832,8 +832,24 @@ class AMDSMICommands(): numa_affinity = "N/A" logging.debug("Failed to get numa affinity for gpu %s | %s", gpu_id, e.get_error_info()) + try: + cpu_set = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.NUMA_SCOPE) + except amdsmi_exception.AmdSmiLibraryException as e: + cpu_set = [] + cpu_set.append(-1) + logging.debug("Failed to get cpu affinity for gpu %s | %s", gpu_id, e.get_error_info()) + + try: + cpusockets = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.SOCKET_SCOPE) + except amdsmi_exception.AmdSmiLibraryException as e: + cpusockets = [] + cpusockets.append(-1) + logging.debug("Failed to get socket affinity for gpu %s | %s", gpu_id, e.get_error_info()) + static_dict['numa'] = {'node' : numa_node_number, - 'affinity' : numa_affinity} + 'affinity' : numa_affinity, + 'CPU affinity' : [hex(cpus) for cpus in cpu_set], + 'Socket affinity' : [socket for socket in set(cpusockets)]} if args.vram: vram_info_dict = {"type" : "N/A", "vendor" : "N/A", diff --git a/docs/reference/amdsmi-py-api.md b/docs/reference/amdsmi-py-api.md index 1ce318e3ac..6cf8f186d0 100644 --- a/docs/reference/amdsmi-py-api.md +++ b/docs/reference/amdsmi-py-api.md @@ -4180,6 +4180,42 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_get_cpu_affinity_with_scope + +Description: Returns list of bitmask information for the given GPU. 
+ +Input parameters: + +* `processor_handle` device which to query +* `scope` `AmdSmiAffinityScope` value selecting NUMA node or socket affinity + +Output: List of 64-bit integers; list length = (num of sockets * num of cores) / 64, rounded up + +Field | Description +---|--- +list entry | CPU affinity bitmask word for the NUMA scope, socket id for the socket scope + +Exceptions that can be thrown by `amdsmi_get_cpu_affinity_with_scope` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + bitmask = amdsmi_get_cpu_affinity_with_scope(device, AmdSmiAffinityScope.NUMA_SCOPE) + print([hex(mask) for mask in bitmask]) +except AmdSmiException as e: + print(e) +``` + ## CPU APIs ### amdsmi_get_processor_info @@ -5274,6 +5310,7 @@ try: except AmdSmiException as e: print(e) ``` + ### amdsmi_get_afids_from_cper Description: Get the AFIDs from CPER buffer diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 00f3ea918a..124745b2a3 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -33,8 +33,10 @@ endif() # add package search paths set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${ROCM_DIR} ../../../) -set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib ${ROCM_DIR}/lib64) - +set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib) +if(EXISTS ${ROCM_DIR}/lib64) + set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib64) +endif() find_package(amd_smi CONFIG REQUIRED) message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 1a54954c1b..cce1db89df 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -2085,6 +2085,18 @@ typedef enum { AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH } amdsmi_virtualization_mode_t; + +/** + * @brief Scope for Numa affinity or Socket affinity + * + * @cond @tag{gpu_bm_linux} @endcond + */ +typedef enum { + AMDSMI_AFFINITY_SCOPE_NODE = 0, // Memory affinity as numa node + 
AMDSMI_AFFINITY_SCOPE_SOCKET = 1 // socket affinity +} amdsmi_affinity_scope_t; + + #define AMDSMI_DEFAULT_VARIANT 0xFFFFFFFFFFFFFFFF #ifdef ENABLE_ESMI_LIB @@ -2309,6 +2321,16 @@ typedef struct { #endif +/** + * @brief cpu socket info data + * + * @cond @tag{cpu_bm} @endcond + */ +typedef struct { + uint32_t socket_id; + uint32_t cores_per_socket; +} amdsmi_sock_info_t; + /*****************************************************************************/ /** @defgroup tagInitShutdown Initialization and Shutdown * @{ @@ -2698,6 +2720,34 @@ amdsmi_get_gpu_device_uuid(amdsmi_processor_handle processor_handle, unsigned in amdsmi_status_t amdsmi_get_gpu_enumeration_info(amdsmi_processor_handle processor_handle, amdsmi_enumeration_info_t *info); +/** + * @brief Retrieves an array of uint64_t (sized to cpu_set_size) of bitmasks with the + * affinity within numa node or socket for the device. + * + * @ingroup tagProcDiscovery + * + * @platform{gpu_bm_linux} + * + * @details Given a processor handle @p processor_handle, the size of the cpu_set array @p cpu_set_size, + * and a pointer to an array of uint64_t @p cpu_set, and @p scope, this function will write the CPU affinity bitmask + * to the array pointed to by @p cpu_set. + * + * User must allocate enough memory for the cpu_set array. The size of the array is determined by the + * number of CPU cores in the system. As an example, if there are 2 CPUs and each has 112 cores, the size + * should be ceiling(2*112/64) = 4, where 64 is the bits of uint64_t. The function will write the CPU affinity bitmask + * to the array. For example, to describe the CPU cores 0-55,112-167, it will set the 0-55 and 112-167 bits + * to 1 and the rest of the bits to 0 in the cpu_set array. + * + * @param[in] processor_handle a processor handle + * @param[in] cpu_set_size Number of uint64_t elements in the cpu_set array that are safe to access + * @param[in,out] cpu_set Array reference in which to return a bitmask of CPU cores that this processor has affinity with. 
+ * @param[in] scope Scope for socket or numa affinity. + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_cpu_affinity_with_scope(amdsmi_processor_handle processor_handle, + uint32_t cpu_set_size, uint64_t *cpu_set, amdsmi_affinity_scope_t scope); + /** @} End tagProcDiscovery */ /*****************************************************************************/ @@ -6987,14 +7037,14 @@ amdsmi_status_t amdsmi_get_hsmp_metrics_table(amdsmi_processor_handle processor_ /** @} tagHSMPMetricsTable */ /*****************************************************************************/ -/** @defgroup tagAuxillary Auxillary functions +/** @defgroup cpuAuxillary Auxillary functions * @{ */ /** * @brief Get first online core on socket. * - * @ingroup tagAuxillary + * @ingroup cpuAuxillary * * @platform{cpu_bm} * @@ -7010,7 +7060,7 @@ amdsmi_status_t amdsmi_first_online_core_on_cpu_socket(amdsmi_processor_handle p /** * @brief Get CPU family. * - * @ingroup tagAuxillary + * @ingroup cpuAuxillary * * @platform{cpu_bm} * @@ -7023,7 +7073,7 @@ amdsmi_status_t amdsmi_get_cpu_family(uint32_t *cpu_family); /** * @brief Get CPU model. * - * @ingroup tagAuxillary + * @ingroup cpuAuxillary * * @platform{cpu_bm} * @@ -7036,7 +7086,7 @@ amdsmi_status_t amdsmi_get_cpu_model(uint32_t *cpu_model); /** * @brief Retrieve the CPU processor model name based on the processor index. * - * @ingroup tagAuxillary + * @ingroup cpuAuxillary * * @platform{cpu_bm} * @@ -7062,7 +7112,7 @@ amdsmi_status_t amdsmi_get_cpu_model_name(amdsmi_processor_handle processor_hand /** * @brief Get a description of provided AMDSMI error status for esmi errors. 
* - * @ingroup tagAuxillary + * @ingroup cpuAuxillary * * @platform{cpu_bm} * @@ -7078,7 +7128,33 @@ amdsmi_status_t amdsmi_get_cpu_model_name(amdsmi_processor_handle processor_hand */ amdsmi_status_t amdsmi_get_esmi_err_msg(amdsmi_status_t status, const char **status_string); -/** @} tagAuxillary */ +/** + * @brief Get cpu cores per socket from sys filesystem. + * + * @ingroup cpuAuxillary + * + * @platform{cpu_bm} + * + * @param[in] sock_count - cpu socket count + * @param[in,out] soc_info - Input buffer to return the cpu cores per socket + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_cpu_cores_per_socket(uint32_t sock_count, amdsmi_sock_info_t *soc_info); + +/** + * @brief Get CPU socket count from sys filesystem. + * + * @ingroup cpuAuxillary + * + * @platform{cpu_bm} + * + * @param[in,out] sock_count - Input buffer to return the cpu socket count + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_cpu_socket_count(uint32_t *sock_count); +/** @} cpuAuxillary */ #endif @@ -7087,4 +7163,3 @@ amdsmi_status_t amdsmi_get_esmi_err_msg(amdsmi_status_t status, const char **sta #endif // __cplusplus #endif // __AMDSMI_H__ - diff --git a/include/amd_smi/impl/amd_smi_gpu_device.h b/include/amd_smi/impl/amd_smi_gpu_device.h index 6d69e250db..1bef7182b1 100644 --- a/include/amd_smi/impl/amd_smi_gpu_device.h +++ b/include/amd_smi/impl/amd_smi_gpu_device.h @@ -73,6 +73,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor { // New methods for -e feature std::string bdf_to_string() const; // -e feature + std::vector get_bitmask_from_numa_node(int32_t node_id, uint32_t size) const; private: uint32_t gpu_id_; diff --git a/include/amd_smi/impl/amd_smi_system.h b/include/amd_smi/impl/amd_smi_system.h index 95d4db34f0..bdd90a7ce4 100644 --- a/include/amd_smi/impl/amd_smi_system.h +++ b/include/amd_smi/impl/amd_smi_system.h @@ -25,6 +25,7 @@ 
#include #include +#include #include "amd_smi/amdsmi.h" #include "amd_smi/impl/amd_smi_socket.h" #include "amd_smi/impl/amd_smi_processor.h" @@ -60,6 +61,11 @@ class AMDSmiSystem { amdsmi_status_t get_cpu_model_name(uint32_t socket_id, std::string *model_name); + std::map get_sys_cpu_cores_per_socket() ; + + amdsmi_status_t get_sys_num_of_cpu_sockets(uint32_t *sock_num); + + std::vector get_cpu_sockets_from_numa_node(int32_t numa_node); private: AMDSmiSystem() : init_flag_(AMDSMI_INIT_AMD_GPUS) {} diff --git a/include/amd_smi/impl/amd_smi_utils.h b/include/amd_smi/impl/amd_smi_utils.h index 1db60cbe7d..907597c611 100644 --- a/include/amd_smi/impl/amd_smi_utils.h +++ b/include/amd_smi/impl/amd_smi_utils.h @@ -23,10 +23,12 @@ #ifndef AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_ #define AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_ +#include #include #include #include #include +#include #include "amd_smi/amdsmi.h" #include "amd_smi/impl/amd_smi_gpu_device.h" @@ -176,4 +178,18 @@ constexpr T translate_umax_or_assign_value(U source_value, V target_value) return result; } +/** + * @brief Iterates all entries in a directory. + * + * @details Given a directory in const std::string & base_path, and a callback function + * entry_callback, this function will open the directory and iterate through all entries + * in that directory.
For each entry it will call the entry_callback function with the + * path of that entry + * + * @param[in] base_path the path of the directory to iterate in + * + * @retval ::true if the iteration was successful + * ::false if the iteration failed + */ +bool iterate_directory(const std::string &base_path, std::function entry_callback); #endif // AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_ diff --git a/py-interface/__init__.py b/py-interface/__init__.py index a1318455a4..8fa08a62c1 100644 --- a/py-interface/__init__.py +++ b/py-interface/__init__.py @@ -86,6 +86,11 @@ from .amdsmi_interface import amdsmi_get_gpu_device_bdf from .amdsmi_interface import amdsmi_get_gpu_device_uuid from .amdsmi_interface import amdsmi_get_gpu_enumeration_info +# # Functions not dependent on ESMI library +from .amdsmi_interface import amdsmi_get_cpu_socket_count +from .amdsmi_interface import amdsmi_get_cpu_cores_per_socket +from .amdsmi_interface import amdsmi_get_cpu_affinity_with_scope + # # SW Version Information from .amdsmi_interface import amdsmi_get_gpu_driver_info @@ -278,6 +283,7 @@ from .amdsmi_interface import AmdSmiProcessorType from .amdsmi_interface import AmdSmiVirtualizationMode from .amdsmi_interface import AmdSmiVramType from .amdsmi_interface import AmdSmiVramVendor +from .amdsmi_interface import AmdSmiAffinityScope # Exceptions from .amdsmi_exception import AmdSmiLibraryException diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 147c5371f0..74221bd83b 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -520,6 +520,9 @@ class AmdSmiVramVendor(IntEnum): MICRON = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_MICRON UNKNOWN = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_UNKNOWN +class AmdSmiAffinityScope(IntEnum): + NUMA_SCOPE = amdsmi_wrapper.AMDSMI_AFFINITY_SCOPE_NODE + SOCKET_SCOPE = amdsmi_wrapper.AMDSMI_AFFINITY_SCOPE_SOCKET class AmdSmiEventReader: def __init__( @@ -1738,6 +1741,23 @@ def amdsmi_get_cpu_model_name( ) return 
f"{cpu_info.model_name}"
 
+def amdsmi_get_cpu_cores_per_socket(sock_count: int):
+    # The C API writes sock_count entries (matched by socket_id), so the buffer
+    # must hold that many structs; a single struct overflows on multi-socket hosts.
+    requested = max(int(getattr(sock_count, "value", sock_count)), 0)
+    cps = (amdsmi_wrapper.amdsmi_sock_info_t * max(requested, 1))()
+    for socket_id, entry in enumerate(cps):
+        entry.socket_id = socket_id
+    _check_res(amdsmi_wrapper.amdsmi_get_cpu_cores_per_socket(requested, cps))
+    return {"socket_id": cps[0].socket_id, "cores_per_socket": cps[0].cores_per_socket}
+
+def amdsmi_get_cpu_socket_count():
+    sock_count = ctypes.c_uint32()
+    _check_res(
+        amdsmi_wrapper.amdsmi_get_cpu_socket_count(ctypes.byref(sock_count))
+    )
+    return sock_count.value
+
 def amdsmi_init(flag=AmdSmiInitFlags.INIT_AMD_GPUS):
     if not isinstance(flag, AmdSmiInitFlags):
         raise AmdSmiParameterException(flag, AmdSmiInitFlags)
@@ -1841,6 +1861,45 @@ def amdsmi_get_gpu_enumeration_info(processor_handle: amdsmi_wrapper.amdsmi_proc
     return enumeration_info
 
 
+def amdsmi_get_cpu_affinity_with_scope(
+    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
+    scope: AmdSmiAffinityScope
+) -> List[int]:
+    """Return the CPU affinity of a GPU for the requested scope.
+
+    For NUMA_SCOPE the list holds 64-bit CPU bitmask words; for SOCKET_SCOPE it
+    holds socket ids. Raises AmdSmiParameterException on a bad argument type and
+    whatever _check_res raises when the library call fails.
+    """
+    if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
+        raise AmdSmiParameterException(
+            processor_handle, amdsmi_wrapper.amdsmi_processor_handle
+        )
+
+    if not isinstance(scope, AmdSmiAffinityScope):
+        raise AmdSmiParameterException(scope, AmdSmiAffinityScope)
+
+    socket_count = amdsmi_get_cpu_socket_count()
+    sock_info = amdsmi_get_cpu_cores_per_socket(socket_count)
+    core_count = sock_info['cores_per_socket']
+
+    # One bit per CPU, rounded up to whole 64-bit words (integer ceiling division).
+    bits_per_word = ctypes.sizeof(ctypes.c_uint64) * 8
+    size = -(-(socket_count * core_count) // bits_per_word)
+    if scope == AmdSmiAffinityScope.SOCKET_SCOPE:
+        # Socket scope stores each socket id at cpu_set[id], so the array needs
+        # at least one slot per socket.
+        size = max(size, socket_count)
+    cpu_set = (ctypes.c_uint64 * size)()
+
+    _check_res(
+        amdsmi_wrapper.amdsmi_get_cpu_affinity_with_scope(
+            processor_handle, size, cpu_set, scope)
+    )
+
+    # Hand back plain Python ints, matching the List[int] annotation.
+    return list(cpu_set)
+
 def amdsmi_get_gpu_asic_info(
     processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
 ) -> Dict[str, Any]:
diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py
index 1051191c18..dad184d56b 100644
--- a/py-interface/amdsmi_wrapper.py
+++
b/py-interface/amdsmi_wrapper.py @@ -2131,6 +2131,15 @@ AMDSMI_VIRTUALIZATION_MODE_HOST = 2 AMDSMI_VIRTUALIZATION_MODE_GUEST = 3 AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH = 4 amdsmi_virtualization_mode_t = ctypes.c_uint32 # enum + +# values for enumeration 'amdsmi_affinity_scope_t' +amdsmi_affinity_scope_t__enumvalues = { + 0: 'AMDSMI_AFFINITY_SCOPE_NODE', + 1: 'AMDSMI_AFFINITY_SCOPE_SOCKET', +} +AMDSMI_AFFINITY_SCOPE_NODE = 0 +AMDSMI_AFFINITY_SCOPE_SOCKET = 1 +amdsmi_affinity_scope_t = ctypes.c_uint32 # enum class struct_amdsmi_smu_fw_version_t(Structure): pass @@ -2322,6 +2331,16 @@ struct_amdsmi_cpu_info_t._fields_ = [ ] amdsmi_cpu_info_t = struct_amdsmi_cpu_info_t +class struct_amdsmi_sock_info_t(Structure): + pass + +struct_amdsmi_sock_info_t._pack_ = 1 # source:False +struct_amdsmi_sock_info_t._fields_ = [ + ('socket_id', ctypes.c_uint32), + ('cores_per_socket', ctypes.c_uint32), +] + +amdsmi_sock_info_t = struct_amdsmi_sock_info_t uint64_t = ctypes.c_uint64 amdsmi_init = _libraries['libamd_smi.so'].amdsmi_init amdsmi_init.restype = amdsmi_status_t @@ -2369,6 +2388,10 @@ amdsmi_get_gpu_device_uuid.argtypes = [amdsmi_processor_handle, ctypes.POINTER(c amdsmi_get_gpu_enumeration_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_enumeration_info amdsmi_get_gpu_enumeration_info.restype = amdsmi_status_t amdsmi_get_gpu_enumeration_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_enumeration_info_t)] +uint32_t = ctypes.c_uint32 +amdsmi_get_cpu_affinity_with_scope = _libraries['libamd_smi.so'].amdsmi_get_cpu_affinity_with_scope +amdsmi_get_cpu_affinity_with_scope.restype = amdsmi_status_t +amdsmi_get_cpu_affinity_with_scope.argtypes = [amdsmi_processor_handle, uint32_t, ctypes.POINTER(ctypes.c_uint64), amdsmi_affinity_scope_t] amdsmi_get_gpu_id = _libraries['libamd_smi.so'].amdsmi_get_gpu_id amdsmi_get_gpu_id.restype = amdsmi_status_t amdsmi_get_gpu_id.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint16)] @@ -2378,7 +2401,6 @@ 
amdsmi_get_gpu_revision.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctyp amdsmi_get_gpu_vendor_name = _libraries['libamd_smi.so'].amdsmi_get_gpu_vendor_name amdsmi_get_gpu_vendor_name.restype = amdsmi_status_t amdsmi_get_gpu_vendor_name.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_char), size_t] -uint32_t = ctypes.c_uint32 amdsmi_get_gpu_vram_vendor = _libraries['libamd_smi.so'].amdsmi_get_gpu_vram_vendor amdsmi_get_gpu_vram_vendor.restype = amdsmi_status_t amdsmi_get_gpu_vram_vendor.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_char), uint32_t] @@ -2939,6 +2961,12 @@ amdsmi_get_cpu_model_name.argtypes = [amdsmi_processor_handle, ctypes.POINTER(st amdsmi_get_esmi_err_msg = _libraries['libamd_smi.so'].amdsmi_get_esmi_err_msg amdsmi_get_esmi_err_msg.restype = amdsmi_status_t amdsmi_get_esmi_err_msg.argtypes = [amdsmi_status_t, ctypes.POINTER(ctypes.POINTER(ctypes.c_char))] +amdsmi_get_cpu_cores_per_socket = _libraries['libamd_smi.so'].amdsmi_get_cpu_cores_per_socket +amdsmi_get_cpu_cores_per_socket.restype = amdsmi_status_t +amdsmi_get_cpu_cores_per_socket.argtypes = [uint32_t, ctypes.POINTER(struct_amdsmi_sock_info_t)] +amdsmi_get_cpu_socket_count = _libraries['libamd_smi.so'].amdsmi_get_cpu_socket_count +amdsmi_get_cpu_socket_count.restype = amdsmi_status_t +amdsmi_get_cpu_socket_count.argtypes = [ctypes.POINTER(ctypes.c_uint32)] __all__ = \ ['AGG_BW0', 'AMDSMI_ACCELERATOR_DECODER', 'AMDSMI_ACCELERATOR_DMA', 'AMDSMI_ACCELERATOR_ENCODER', @@ -2950,6 +2978,7 @@ __all__ = \ 'AMDSMI_ACCELERATOR_PARTITION_QPX', 'AMDSMI_ACCELERATOR_PARTITION_SPX', 'AMDSMI_ACCELERATOR_PARTITION_TPX', 'AMDSMI_ACCELERATOR_XCC', + 'AMDSMI_AFFINITY_SCOPE_NODE', 'AMDSMI_AFFINITY_SCOPE_SOCKET', 'AMDSMI_CACHE_PROPERTY_CPU_CACHE', 'AMDSMI_CACHE_PROPERTY_DATA_CACHE', 'AMDSMI_CACHE_PROPERTY_ENABLED', @@ -3173,18 +3202,18 @@ __all__ = \ 'amdsmi_accelerator_partition_profile_t', 'amdsmi_accelerator_partition_resource_profile_t', 
'amdsmi_accelerator_partition_resource_type_t', - 'amdsmi_accelerator_partition_type_t', 'amdsmi_asic_info_t', - 'amdsmi_bdf_t', 'amdsmi_bit_field_t', 'amdsmi_board_info_t', - 'amdsmi_cache_property_type_t', 'amdsmi_card_form_factor_t', - 'amdsmi_clean_gpu_local_data', 'amdsmi_clk_info_t', - 'amdsmi_clk_limit_type_t', 'amdsmi_clk_type_t', - 'amdsmi_compute_partition_type_t', 'amdsmi_container_types_t', - 'amdsmi_counter_command_t', 'amdsmi_counter_value_t', - 'amdsmi_cper_guid_t', 'amdsmi_cper_hdr_t', - 'amdsmi_cper_notify_type_t', 'amdsmi_cper_sev_t', - 'amdsmi_cper_timestamp_t', 'amdsmi_cper_valid_bits_t', - 'amdsmi_cpu_apb_disable', 'amdsmi_cpu_apb_enable', - 'amdsmi_cpu_info_t', 'amdsmi_cpu_util_t', + 'amdsmi_accelerator_partition_type_t', 'amdsmi_affinity_scope_t', + 'amdsmi_asic_info_t', 'amdsmi_bdf_t', 'amdsmi_bit_field_t', + 'amdsmi_board_info_t', 'amdsmi_cache_property_type_t', + 'amdsmi_card_form_factor_t', 'amdsmi_clean_gpu_local_data', + 'amdsmi_clk_info_t', 'amdsmi_clk_limit_type_t', + 'amdsmi_clk_type_t', 'amdsmi_compute_partition_type_t', + 'amdsmi_container_types_t', 'amdsmi_counter_command_t', + 'amdsmi_counter_value_t', 'amdsmi_cper_guid_t', + 'amdsmi_cper_hdr_t', 'amdsmi_cper_notify_type_t', + 'amdsmi_cper_sev_t', 'amdsmi_cper_timestamp_t', + 'amdsmi_cper_valid_bits_t', 'amdsmi_cpu_apb_disable', + 'amdsmi_cpu_apb_enable', 'amdsmi_cpu_info_t', 'amdsmi_cpu_util_t', 'amdsmi_cpusocket_handle', 'amdsmi_ddr_bw_metrics_t', 'amdsmi_dev_perf_level_t', 'amdsmi_dimm_power_t', 'amdsmi_dimm_thermal_t', 'amdsmi_dpm_level_t', @@ -3198,11 +3227,12 @@ __all__ = \ 'amdsmi_free_name_value_pairs', 'amdsmi_freq_ind_t', 'amdsmi_freq_volt_region_t', 'amdsmi_frequencies_t', 'amdsmi_frequency_range_t', 'amdsmi_fw_block_t', - 'amdsmi_fw_info_t', 'amdsmi_get_afids_from_cper', + 'amdsmi_fw_info_t', 'amdsmi_get_afids_from_cper', + 'amdsmi_get_cpu_affinity_with_scope', 'amdsmi_get_clk_freq', 'amdsmi_get_clock_info', 'amdsmi_get_cpu_cclk_limit', 
'amdsmi_get_cpu_core_boostlimit', 'amdsmi_get_cpu_core_current_freq_limit', - 'amdsmi_get_cpu_core_energy', + 'amdsmi_get_cpu_core_energy', 'amdsmi_get_cpu_cores_per_socket', 'amdsmi_get_cpu_current_io_bandwidth', 'amdsmi_get_cpu_current_xgmi_bw', 'amdsmi_get_cpu_ddr_bw', 'amdsmi_get_cpu_dimm_power_consumption', @@ -3215,6 +3245,7 @@ __all__ = \ 'amdsmi_get_cpu_pwr_svi_telemetry_all_rails', 'amdsmi_get_cpu_smu_fw_version', 'amdsmi_get_cpu_socket_c0_residency', + 'amdsmi_get_cpu_socket_count', 'amdsmi_get_cpu_socket_current_active_freq_limit', 'amdsmi_get_cpu_socket_energy', 'amdsmi_get_cpu_socket_freq_range', @@ -3332,8 +3363,9 @@ __all__ = \ 'amdsmi_set_gpu_process_isolation', 'amdsmi_set_power_cap', 'amdsmi_set_soc_pstate', 'amdsmi_set_xgmi_plpd', 'amdsmi_shut_down', 'amdsmi_smu_fw_version_t', - 'amdsmi_socket_handle', 'amdsmi_status_code_to_string', - 'amdsmi_status_t', 'amdsmi_stop_gpu_event_notification', + 'amdsmi_sock_info_t', 'amdsmi_socket_handle', + 'amdsmi_status_code_to_string', 'amdsmi_status_t', + 'amdsmi_stop_gpu_event_notification', 'amdsmi_temp_range_refresh_rate_t', 'amdsmi_temperature_metric_t', 'amdsmi_temperature_type_t', 'amdsmi_topo_get_link_type', 'amdsmi_topo_get_link_weight', 'amdsmi_topo_get_numa_node_number', @@ -3380,7 +3412,7 @@ __all__ = \ 'struct_amdsmi_proc_info_t', 'struct_amdsmi_process_info_t', 'struct_amdsmi_range_t', 'struct_amdsmi_ras_feature_t', 'struct_amdsmi_retired_page_record_t', - 'struct_amdsmi_smu_fw_version_t', + 'struct_amdsmi_smu_fw_version_t', 'struct_amdsmi_sock_info_t', 'struct_amdsmi_temp_range_refresh_rate_t', 'struct_amdsmi_topology_nearest_t', 'struct_amdsmi_utilization_counter_t', diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 63cf19b5fe..f034829512 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -3941,8 +3941,8 @@ amdsmi_get_gpu_cper_entries( std::string path = std::string("/sys/kernel/debug/dri/") + std::to_string(gpu_device->get_card_id()) + "/amdgpu_ring_cper"; - - 
+
+
     return amdsmi_get_gpu_cper_entries_by_path(
         path.c_str(),
         severity_mask,
@@ -4795,6 +4795,104 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle,
     return status;
 }
 
+// Writes the CPU affinity of the GPU behind processor_handle into cpu_set
+// (cpu_set_size 64-bit words, zeroed first):
+//  - AMDSMI_AFFINITY_SCOPE_NODE: bit n of the array is set for every CPU n
+//    listed in the cpulist of the GPU's NUMA node.
+//  - AMDSMI_AFFINITY_SCOPE_SOCKET: the socket ids owning those CPUs are stored
+//    as plain values (cpu_set[id] = id) and then sorted/compacted to the front.
+// Returns AMDSMI_STATUS_INVAL for null/empty arguments, AMDSMI_STATUS_NOT_FOUND
+// when no NUMA node or socket can be determined, and
+// AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS for an unknown scope or a socket id that
+// does not fit in cpu_set.
+amdsmi_status_t amdsmi_get_cpu_affinity_with_scope(amdsmi_processor_handle processor_handle,
+            uint32_t cpu_set_size, uint64_t *cpu_set, amdsmi_affinity_scope_t scope)
+{
+    AMDSMI_CHECK_INIT();
+
+    if (processor_handle == nullptr || cpu_set == nullptr || cpu_set_size == 0) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    // Retrieve GPU device from the processor handle
+    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
+    amdsmi_status_t status = get_gpu_device_from_handle(processor_handle, &gpu_device);
+    if (status != AMDSMI_STATUS_SUCCESS) {
+        return status;
+    }
+
+    uint32_t numa_node;
+    status = amdsmi_topo_get_numa_node_number(processor_handle, &numa_node);
+    if (status != AMDSMI_STATUS_SUCCESS) {
+        return status;
+    }
+
+    int32_t node_id = static_cast<int32_t>(numa_node);
+
+    status = amdsmi_get_gpu_topo_numa_affinity(processor_handle, &node_id);
+    if (status != AMDSMI_STATUS_SUCCESS) {
+        return status;
+    }
+
+    if (node_id < 0) {
+        return AMDSMI_STATUS_NOT_FOUND;
+    }
+
+    std::memset(cpu_set, 0, cpu_set_size * sizeof(uint64_t));
+    switch (scope) {
+        case AMDSMI_AFFINITY_SCOPE_NODE:
+        {
+            // node_id is non-negative here, so the helper cannot return its
+            // "invalid node" marker. Do not test bitmask[0] against all-ones:
+            // that value is a legitimate mask for a node owning CPUs 0-63.
+            const std::vector<uint64_t> bitmask =
+                gpu_device->get_bitmask_from_numa_node(node_id, cpu_set_size);
+            const size_t words = std::min<size_t>(bitmask.size(), cpu_set_size);
+            std::memcpy(cpu_set, bitmask.data(), words * sizeof(uint64_t));
+            break;
+        }
+
+        case AMDSMI_AFFINITY_SCOPE_SOCKET:
+        {
+            const std::vector<uint32_t> sockets =
+                amd::smi::AMDSmiSystem::getInstance().get_cpu_sockets_from_numa_node(node_id);
+
+            // An empty result means sysfs gave no socket for this node.
+            if (sockets.empty()) {
+                return AMDSMI_STATUS_NOT_FOUND;
+            }
+            if (sockets.front() == std::numeric_limits<uint32_t>::max()) {
+                return AMDSMI_STATUS_REFCOUNT_OVERFLOW;
+            }
+
+            for (uint32_t idx : sockets) {
+                // Never write past the caller's buffer.
+                if (idx >= cpu_set_size) {
+                    return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
+                }
+                cpu_set[idx] = idx;
+            }
+
+            std::sort(cpu_set, cpu_set + cpu_set_size);
+
+            // Discard duplicates, packing the distinct values at the front.
+            // NOTE(review): unused slots stay 0 and cannot be told apart from
+            // socket id 0; the interface has no way to return a count.
+            uint32_t unique_count = 0;
+            for (uint32_t i = 0; i < cpu_set_size; ++i) {
+                if (i == 0 || cpu_set[i] != cpu_set[i - 1]) {
+                    cpu_set[unique_count++] = cpu_set[i];
+                }
+            }
+            break;
+        }
+
+        default:
+            return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
+    }
+
+    return AMDSMI_STATUS_SUCCESS;
+}
 
 #ifdef ENABLE_ESMI_LIB
 static amdsmi_status_t amdsmi_errno_to_esmi_status(amdsmi_status_t status)
@@ -5905,6 +6003,43 @@ amdsmi_status_t amdsmi_get_cpu_model_name(amdsmi_processor_handle processor_hand
     return AMDSMI_STATUS_SUCCESS;
 }
 
+// Fills cores_per_socket for each of the sock_count entries of sock_info, looked
+// up by the entry's socket_id; sockets unknown to sysfs get a count of 0.
+amdsmi_status_t amdsmi_get_cpu_cores_per_socket(uint32_t sock_count, amdsmi_sock_info_t *sock_info)
+{
+    if (sock_count != 0 && sock_info == nullptr) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    const std::map<uint32_t, uint32_t> socket_core_count =
+        amd::smi::AMDSmiSystem::getInstance().get_sys_cpu_cores_per_socket();
+
+    for (uint32_t i = 0; i < sock_count; ++i) {
+        const auto it = socket_core_count.find(sock_info[i].socket_id);
+        sock_info[i].cores_per_socket = (it != socket_core_count.end()) ? it->second : 0;
+    }
+
+    return AMDSMI_STATUS_SUCCESS;
+}
+
+// Stores the number of distinct CPU sockets reported by sysfs in *sock_count.
+amdsmi_status_t amdsmi_get_cpu_socket_count(uint32_t *sock_count)
+{
+    if (sock_count == nullptr) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    uint32_t sock_num = 0;
+    const amdsmi_status_t status =
+        amd::smi::AMDSmiSystem::getInstance().get_sys_num_of_cpu_sockets(&sock_num);
+    if (status != AMDSMI_STATUS_SUCCESS) {
+        return status;
+    }
+
+    *sock_count = sock_num;
+    return AMDSMI_STATUS_SUCCESS;
+}
+
 amdsmi_status_t amdsmi_get_cpu_handles(uint32_t *cpu_count,
                                 amdsmi_processor_handle *processor_handles)
 {
diff --git a/src/amd_smi/amd_smi_gpu_device.cc b/src/amd_smi/amd_smi_gpu_device.cc
index 2879548ce7..acaa94e627 100644
--- a/src/amd_smi/amd_smi_gpu_device.cc
+++ b/src/amd_smi/amd_smi_gpu_device.cc
@@ -323,7 +323,57 @@ std::string AMDSmiGPUDevice::bdf_to_string() const {
     return oss.str();
 }
 
+// Builds a bitmask (bit n of word n/64 <=> CPU n) of the CPUs listed in
+// /sys/devices/system/node/node<node_id>/cpulist, `size` 64-bit words long.
+// A negative node_id yields a mask whose first word is all ones; an unreadable
+// cpulist yields an all-zero mask. CPU ids beyond the mask are ignored.
+std::vector<uint64_t> AMDSmiGPUDevice::get_bitmask_from_numa_node(int32_t node_id, uint32_t size) const {
+    std::vector<uint64_t> bitmask(size, 0);
+
+    if (node_id < 0) {
+        if (!bitmask.empty()) {
+            bitmask[0] = std::numeric_limits<uint64_t>::max();
+        }
+        return bitmask;
+    }
+
+    // Sets the bit of one CPU id; ids that fall outside the mask are dropped
+    // instead of being written past the end of the vector.
+    const auto mark_cpu = [&bitmask](int cpu) {
+        if (cpu < 0) {
+            return;
+        }
+        const size_t word = static_cast<size_t>(cpu) / 64;
+        if (word < bitmask.size()) {
+            bitmask[word] |= (1ULL << (cpu % 64));
+        }
+    };
+
+    std::string path = "/sys/devices/system/node/node" + std::to_string(node_id) + "/cpulist";
+    std::ifstream file(path);
+
+    if (file.is_open()) {
+        std::string info;
+        while (std::getline(file, info)) {
+            // A line is a comma separated list of ids and ranges, e.g. "0-55,112-167".
+            std::istringstream sstr(info);
+            std::string node_cpus;
+            while (std::getline(sstr, node_cpus, ',')) {
+                size_t hyphen = node_cpus.find('-');
+                if (hyphen != std::string::npos) {
+                    int start = std::stoi(node_cpus.substr(0, hyphen));
+                    int end = std::stoi(node_cpus.substr(hyphen + 1));
+                    for (int i = start; i <= end; ++i) {
+                        mark_cpu(i);
+                    }
+                } else {
+                    mark_cpu(std::stoi(node_cpus));
+                }
+            }
+        }
+    }
+    return bitmask;
+}
 
 } // namespace smi
 } // namespace amd
-
diff --git a/src/amd_smi/amd_smi_system.cc b/src/amd_smi/amd_smi_system.cc
index 98490076bd..d990925536 100644
--- a/src/amd_smi/amd_smi_system.cc
+++ b/src/amd_smi/amd_smi_system.cc
@@ -31,6 +31,7 @@
 #include "amd_smi/impl/amd_smi_utils.h"
 #include "rocm_smi/rocm_smi.h"
 #include "rocm_smi/rocm_smi_main.h"
+#include
 
 namespace amd {
 namespace smi {
@@ -103,7 +104,7 @@ amdsmi_status_t AMDSmiSystem::get_cpu_model_name(uint32_t socket_id, std::string
     if (!cpu_info.is_open()) {
         std::cerr << "Failed to open /proc/cpuinfo:" << strerror(errno) << std::endl;
         return AMDSMI_STATUS_FILE_ERROR;
-    } else {
+    } else {
         uint32_t current_socket_id = -1;
         while (std::getline(cpu_info, info)) {
             if (info.find("processor") != std::string::npos) {
@@ -126,7 +127,109 @@ amdsmi_status_t AMDSmiSystem::get_cpu_model_name(uint32_t socket_id, std::string
     }
     return AMDSMI_STATUS_SUCCESS;
 }
+
 #endif
+// Counts, per physical package (socket) id, the cpu* entries under
+// /sys/devices/system/cpu whose topology files can be read.
+std::map<uint32_t, uint32_t> AMDSmiSystem::get_sys_cpu_cores_per_socket() {
+    std::map<uint32_t, uint32_t> socket_core_count;
+    const std::string base_path = "/sys/devices/system/cpu/";
+
+    iterate_directory(base_path, [&socket_core_count, &base_path](const std::string &path) {
+        // Work from the last path component so this holds whether the callback
+        // is handed a bare entry name or a full path.
+        const std::string filename = path.substr(path.find_last_of('/') + 1);
+        if (filename.find("cpu") != std::string::npos) {
+            const std::string cpuPath = base_path + filename;
+            std::ifstream package_id_file(cpuPath + "/topology/physical_package_id");
+            std::ifstream core_id_file(cpuPath + "/topology/core_id");
+
+            if (package_id_file.is_open() && core_id_file.is_open()) {
+                uint32_t physical_id = 0;
+                // Only count the entry when a package id was actually parsed.
+                if (package_id_file >> physical_id) {
+                    socket_core_count[physical_id]++;
+                }
+            }
+        }
+    });
+
+    return socket_core_count;
+}
+
+// Stores in *sock_num the number of distinct physical package ids found under
+// /sys/devices/system/cpu. Returns AMDSMI_STATUS_INVAL when sock_num is null.
+amdsmi_status_t AMDSmiSystem::get_sys_num_of_cpu_sockets(uint32_t *sock_num) {
+    if (sock_num == nullptr) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    std::map<uint32_t, uint32_t> socket_count_map;
+    const std::string base_path = "/sys/devices/system/cpu/";
+
+    iterate_directory(base_path, [&socket_count_map, &base_path](const std::string &path) {
+        const std::string filename = path.substr(path.find_last_of('/') + 1);
+        if (filename.find("cpu") != std::string::npos) {
+            std::ifstream package_id_file(base_path + filename + "/topology/physical_package_id");
+            uint32_t physical_id = 0;
+            // A failed open or parse leaves the stream in a failed state.
+            if (package_id_file >> physical_id) {
+                socket_count_map[physical_id]++;
+            }
+        }
+    });
+
+    *sock_num = static_cast<uint32_t>(socket_count_map.size());
+
+    return AMDSMI_STATUS_SUCCESS;
+}
+
+// Returns the sorted, de-duplicated socket (physical package) ids of the CPUs
+// listed in /sys/devices/system/node/node<numa_node>/cpulist. A negative
+// numa_node yields a single UINT32_MAX marker; an unreadable cpulist yields an
+// empty vector.
+std::vector<uint32_t> AMDSmiSystem::get_cpu_sockets_from_numa_node(int32_t numa_node) {
+    std::vector<uint32_t> sockets;
+    if (numa_node < 0) {
+        // The vector is still empty here, so the marker has to be appended.
+        sockets.push_back(std::numeric_limits<uint32_t>::max());
+        return sockets;
+    }
+    std::ifstream node_info("/sys/devices/system/node/node" + std::to_string(numa_node) + "/cpulist");
+    std::string info;
+
+    if (node_info.is_open()) {
+        std::getline(node_info, info);
+        std::istringstream iss(info);
+        std::string entry;
+        // cpulist entries are comma separated and are either "N" or "N-M".
+        while (std::getline(iss, entry, ',')) {
+            std::istringstream range(entry);
+            uint32_t first = 0;
+            if (!(range >> first)) {
+                continue;
+            }
+            uint32_t last = first;
+            char dash = '\0';
+            if ((range >> dash) && dash == '-' && !(range >> last)) {
+                continue;
+            }
+            for (uint32_t index = first; index <= last; ++index) {
+                std::ifstream cpu_info("/sys/devices/system/cpu/cpu" + std::to_string(index) + "/topology/physical_package_id");
+                uint32_t socket = 0;
+                if (cpu_info >> socket) {
+                    sockets.push_back(socket);
+                }
+            }
+        }
+    }
+
+    // Discarding duplicate socket entries
+    std::sort(sockets.begin(), sockets.end());
+    sockets.erase(std::unique(sockets.begin(), sockets.end()), sockets.end());
+
+    return sockets;
+}
 
 amdsmi_status_t AMDSmiSystem::init(uint64_t flags) {
     init_flag_ = flags;
diff --git a/src/amd_smi/amd_smi_utils.cc b/src/amd_smi/amd_smi_utils.cc
index ff0de75866..d3b484d6e5 100644
--- a/src/amd_smi/amd_smi_utils.cc
+++ b/src/amd_smi/amd_smi_utils.cc
@@ -36,15 +36,15 @@
 #include
 #include
 
-#include
-#include
+#include
+#include
 #include
 #include
-#include
-#include
-#include
 #include
-#include
+#include
+#include
+#include
+#include
 
 #include "amd_smi/impl/amd_smi_utils.h"
 #include "amd_smi/impl/amd_smi_system.h"
@@ -1046,3 +1046,43 @@ void amdsmi_wait_for_user_input(void) {
     }
   }
 }
+
+// Calls entry_callback once for every entry of the directory base_path, passing
+// the entry's path (base_path joined with the entry name). Returns false when
+// the directory cannot be opened, when entry_callback is empty, or when
+// readdir() reports an error; true otherwise.
+bool iterate_directory(const std::string &base_path,
+                       std::function<void(const std::string &)> entry_callback) {
+    if (!entry_callback) {
+        return false;
+    }
+
+    DIR *dir = opendir(base_path.c_str());
+    if (!dir) {
+        return false;
+    }
+
+    // Callers build file paths from what they receive, so hand them a full
+    // path rather than the bare d_name; avoid doubling the separator.
+    std::string prefix = base_path;
+    if (!prefix.empty() && prefix.back() != '/') {
+        prefix += '/';
+    }
+
+    // readdir() returns NULL both at end of directory and on failure. errno is
+    // the only way to tell them apart, so clear it right before every call;
+    // otherwise a stale value (e.g. left by the callback) looks like an error.
+    int read_error = 0;
+    for (;;) {
+        errno = 0;
+        const struct dirent *entry = readdir(dir);
+        if (entry == nullptr) {
+            read_error = errno;
+            break;
+        }
+        entry_callback(prefix + entry->d_name);
+    }
+
+    closedir(dir);
+    return read_error == 0;
+}