[SWDEV-512393] Added amdsmi_get_cpu_affinity_with_scope (#198)

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Signed-off-by: Deepak Mewar <deepak.mewar@amd.com>
Cette révision appartient à :
Mewar, Deepak
2025-05-20 11:36:09 +05:30
révisé par GitHub
Parent 51e99965b3
révision b999f86611
15 fichiers modifiés avec 524 ajouts et 39 suppressions
+4
Voir le fichier
@@ -12,6 +12,10 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
- Added new C and Python API `amdsmi_get_cpu_model_name`
- Not sourced from esmi library.
- **Added `amdsmi_get_cpu_affinity_with_scope()`**.
### Added
- **Added support for GPU metrics 1.8**.
- Added new fields for `amdsmi_gpu_xcp_metrics_t` including:
- Adding the following metrics to allow new calculations for violation status:
+17 -1
Voir le fichier
@@ -832,8 +832,24 @@ class AMDSMICommands():
numa_affinity = "N/A"
logging.debug("Failed to get numa affinity for gpu %s | %s", gpu_id, e.get_error_info())
try:
cpu_set = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.NUMA_SCOPE)
except amdsmi_exception.AmdSmiLibraryException as e:
cpu_set = []
cpu_set.append(-1)
logging.debug("Failed to get cpu affinity for gpu %s | %s", gpu_id, e.get_error_info())
try:
cpusockets = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.SOCKET_SCOPE)
except amdsmi_exception.AmdSmiLibraryException as e:
cpusockets = []
cpusockets.append(-1)
logging.debug("Failed to get socket affinity for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['numa'] = {'node' : numa_node_number,
'affinity' : numa_affinity}
'affinity' : numa_affinity,
'CPU affinity' : [hex(cpus) for cpus in cpu_set],
'Socket affinity' : [socket for socket in set(cpusockets)]}
if args.vram:
vram_info_dict = {"type" : "N/A",
"vendor" : "N/A",
+37
Voir le fichier
@@ -4180,6 +4180,42 @@ except AmdSmiException as e:
print(e)
```
### amdsmi_get_cpu_affinity_with_scope
Description: Returns list of bitmask information for the given GPU.
Input parameters:
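* `processor_handle` device to query
* `scope` `AmdSmiAffinityScope` value selecting NUMA-node (`NUMA_SCOPE`) or CPU-socket (`SOCKET_SCOPE`) affinity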
Output: List with fields
Field | Description
---|---
`array_size` | array size = (num of sockets * num of cores)/ size of 64-bit
`scope` | enum value for numa or socket affinity
Exceptions that can be thrown by `amdsmi_get_cpu_affinity_with_scope` function:
* `AmdSmiLibraryException`
* `AmdSmiRetryException`
* `AmdSmiParameterException`
Example:
```python
try:
devices = amdsmi_get_processor_handles()
if len(devices) == 0:
print("No GPUs on machine")
else:
for device in devices:
bitmask = amdsmi_get_cpu_affinity_with_scope(device, AmdSmiAffinityScope.NUMA_SCOPE)
print([hex(word) for word in bitmask])
except AmdSmiException as e:
print(e)
```
## CPU APIs
### amdsmi_get_processor_info
@@ -5274,6 +5310,7 @@ try:
except AmdSmiException as e:
print(e)
```
### amdsmi_get_afids_from_cper
Description: Get the AFIDs from CPER buffer
+4 -2
Voir le fichier
@@ -33,8 +33,10 @@ endif()
# add package search paths
set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${ROCM_DIR} ../../../)
set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib ${ROCM_DIR}/lib64)
set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib)
if(EXISTS ${ROCM_DIR}/lib64)
set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib64)
endif()
find_package(amd_smi CONFIG REQUIRED)
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
+83 -8
Voir le fichier
@@ -2085,6 +2085,18 @@ typedef enum {
AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH
} amdsmi_virtualization_mode_t;
/**
 * @brief Scope selector for amdsmi_get_cpu_affinity_with_scope(): report the
 *        affinity of a device either per NUMA node or per CPU socket.
 *
 * @cond @tag{gpu_bm_linux} @endcond
 */
typedef enum {
    AMDSMI_AFFINITY_SCOPE_NODE = 0, // Affinity expressed against the device's NUMA node
    AMDSMI_AFFINITY_SCOPE_SOCKET = 1 // Affinity expressed against CPU sockets
} amdsmi_affinity_scope_t;
#define AMDSMI_DEFAULT_VARIANT 0xFFFFFFFFFFFFFFFF
#ifdef ENABLE_ESMI_LIB
@@ -2309,6 +2321,16 @@ typedef struct {
#endif
/**
 * @brief CPU socket info: a socket identifier paired with the number of cores
 *        reported for that socket.
 *
 * Used as the element type of the array passed to
 * amdsmi_get_cpu_cores_per_socket().
 *
 * @cond @tag{cpu_bm} @endcond
 */
typedef struct {
    uint32_t socket_id;         // Identifier of the CPU socket this entry describes
    uint32_t cores_per_socket;  // Core count reported for socket_id
} amdsmi_sock_info_t;
/*****************************************************************************/
/** @defgroup tagInitShutdown Initialization and Shutdown
* @{
@@ -2698,6 +2720,34 @@ amdsmi_get_gpu_device_uuid(amdsmi_processor_handle processor_handle, unsigned in
amdsmi_status_t
amdsmi_get_gpu_enumeration_info(amdsmi_processor_handle processor_handle, amdsmi_enumeration_info_t *info);
/**
 *  @brief Retrieves an array of uint64_t (sized to cpu_set_size) describing the
 *  CPU affinity of the device within its NUMA node or CPU socket.
 *
 *  @ingroup tagProcDiscovery
 *
 *  @platform{gpu_bm_linux}
 *
 *  @details Given a processor handle @p processor_handle, the size of the cpu_set array
 *  @p cpu_set_size, a pointer to an array of uint64_t @p cpu_set, and @p scope, this
 *  function writes the CPU affinity information to the array pointed to by @p cpu_set.
 *
 *  The caller must allocate enough memory for the cpu_set array. The size of the array is
 *  determined by the number of CPU cores in the system. As an example, if there are 2 CPUs
 *  and each has 112 cores, the size should be ceiling(2*112/64) = 4, where 64 is the number
 *  of bits in a uint64_t.
 *
 *  For ::AMDSMI_AFFINITY_SCOPE_NODE the array is a bitmask of CPU cores. For example, to
 *  describe the CPU cores 0-55,112-167, bits 0-55 and 112-167 are set to 1 and the rest of
 *  the bits are set to 0.
 *
 *  For ::AMDSMI_AFFINITY_SCOPE_SOCKET the array elements hold CPU socket IDs rather than a
 *  bitmask.
 *
 *  @param[in] processor_handle a processor handle
 *  @param[in] cpu_set_size The number of uint64_t elements in cpu_set that are safe to access
 *  @param[in,out] cpu_set Array in which to return the CPU affinity of this processor.
 *  @param[in] scope Scope for socket or numa affinity.
 *
 *  @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
 *  (::AMDSMI_STATUS_INVAL for a null handle, null cpu_set or zero cpu_set_size;
 *  ::AMDSMI_STATUS_NOT_FOUND when the device reports a negative NUMA node)
 */
amdsmi_status_t amdsmi_get_cpu_affinity_with_scope(amdsmi_processor_handle processor_handle,
        uint32_t cpu_set_size, uint64_t *cpu_set, amdsmi_affinity_scope_t scope);
/** @} End tagProcDiscovery */
/*****************************************************************************/
@@ -6987,14 +7037,14 @@ amdsmi_status_t amdsmi_get_hsmp_metrics_table(amdsmi_processor_handle processor_
/** @} tagHSMPMetricsTable */
/*****************************************************************************/
/** @defgroup tagAuxillary Auxillary functions
/** @defgroup cpuAuxillary Auxillary functions
* @{
*/
/**
* @brief Get first online core on socket.
*
* @ingroup tagAuxillary
* @ingroup cpuAuxillary
*
* @platform{cpu_bm}
*
@@ -7010,7 +7060,7 @@ amdsmi_status_t amdsmi_first_online_core_on_cpu_socket(amdsmi_processor_handle p
/**
* @brief Get CPU family.
*
* @ingroup tagAuxillary
* @ingroup cpuAuxillary
*
* @platform{cpu_bm}
*
@@ -7023,7 +7073,7 @@ amdsmi_status_t amdsmi_get_cpu_family(uint32_t *cpu_family);
/**
* @brief Get CPU model.
*
* @ingroup tagAuxillary
* @ingroup cpuAuxillary
*
* @platform{cpu_bm}
*
@@ -7036,7 +7086,7 @@ amdsmi_status_t amdsmi_get_cpu_model(uint32_t *cpu_model);
/**
* @brief Retrieve the CPU processor model name based on the processor index.
*
* @ingroup tagAuxillary
* @ingroup cpuAuxillary
*
* @platform{cpu_bm}
*
@@ -7062,7 +7112,7 @@ amdsmi_status_t amdsmi_get_cpu_model_name(amdsmi_processor_handle processor_hand
/**
* @brief Get a description of provided AMDSMI error status for esmi errors.
*
* @ingroup tagAuxillary
* @ingroup cpuAuxillary
*
* @platform{cpu_bm}
*
@@ -7078,7 +7128,33 @@ amdsmi_status_t amdsmi_get_cpu_model_name(amdsmi_processor_handle processor_hand
*/
amdsmi_status_t amdsmi_get_esmi_err_msg(amdsmi_status_t status, const char **status_string);
/** @} tagAuxillary */
/**
 *  @brief Get cpu cores per socket from sys filesystem.
 *
 *  @ingroup cpuAuxillary
 *
 *  @platform{cpu_bm}
 *
 *  @details For each of the first @p sock_count entries of @p soc_info, the entry's
 *  socket_id is looked up and its cores_per_socket field is filled in; entries whose
 *  socket_id is not found get cores_per_socket = 0.
 *
 *  @param[in] sock_count - number of entries in the soc_info array
 *  @param[in,out] soc_info - Array of at least sock_count entries; socket_id is read
 *  from each entry and cores_per_socket is written to it
 *
 *  @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
 */
amdsmi_status_t amdsmi_get_cpu_cores_per_socket(uint32_t sock_count, amdsmi_sock_info_t *soc_info);
/**
 *  @brief Get CPU socket count from sys filesystem.
 *
 *  @ingroup cpuAuxillary
 *
 *  @platform{cpu_bm}
 *
 *  @param[in,out] sock_count - Pointer to caller-provided storage that receives the
 *  CPU socket count
 *
 *  @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
 */
amdsmi_status_t amdsmi_get_cpu_socket_count(uint32_t *sock_count);
/** @} cpuAuxillary */
#endif
@@ -7087,4 +7163,3 @@ amdsmi_status_t amdsmi_get_esmi_err_msg(amdsmi_status_t status, const char **sta
#endif // __cplusplus
#endif // __AMDSMI_H__
+1
Voir le fichier
@@ -73,6 +73,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
// New methods for -e feature
std::string bdf_to_string() const; // -e feature
std::vector<uint64_t> get_bitmask_from_numa_node(int32_t node_id, uint32_t size) const;
private:
uint32_t gpu_id_;
+6
Voir le fichier
@@ -25,6 +25,7 @@
#include <vector>
#include <set>
#include <map>
#include "amd_smi/amdsmi.h"
#include "amd_smi/impl/amd_smi_socket.h"
#include "amd_smi/impl/amd_smi_processor.h"
@@ -60,6 +61,11 @@ class AMDSmiSystem {
amdsmi_status_t get_cpu_model_name(uint32_t socket_id, std::string *model_name);
std::map<uint32_t, uint32_t> get_sys_cpu_cores_per_socket() ;
amdsmi_status_t get_sys_num_of_cpu_sockets(uint32_t *sock_num);
std::vector<uint32_t> get_cpu_sockets_from_numa_node(int32_t numa_node);
private:
AMDSmiSystem() : init_flag_(AMDSMI_INIT_AMD_GPUS) {}
+16
Voir le fichier
@@ -23,10 +23,12 @@
#ifndef AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_
#define AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_
#include <dirent.h>
#include <limits>
#include <type_traits>
#include <string>
#include <utility>
#include <functional>
#include "amd_smi/amdsmi.h"
#include "amd_smi/impl/amd_smi_gpu_device.h"
@@ -176,4 +178,18 @@ constexpr T translate_umax_or_assign_value(U source_value, V target_value)
return result;
}
/**
 *  @brief Iterates over all entries in a directory.
 *
 *  @details Given a directory in @p base_path and a callback function
 *  @p entry_callback, this function opens the directory and iterates through
 *  all entries in that directory. For each entry it calls @p entry_callback
 *  with a string identifying that entry.
 *
 *  @param[in] base_path the path of the directory to iterate in
 *  @param[in] entry_callback function invoked once for every directory entry
 *
 *  @retval ::true if the iteration was successful
 *          ::false if the iteration failed
 */
bool iterate_directory(const std::string &base_path, std::function<void(const std::string &)> entry_callback);
#endif // AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_
+6
Voir le fichier
@@ -86,6 +86,11 @@ from .amdsmi_interface import amdsmi_get_gpu_device_bdf
from .amdsmi_interface import amdsmi_get_gpu_device_uuid
from .amdsmi_interface import amdsmi_get_gpu_enumeration_info
# # Functions not dependent on ESMI library
from .amdsmi_interface import amdsmi_get_cpu_socket_count
from .amdsmi_interface import amdsmi_get_cpu_cores_per_socket
from .amdsmi_interface import amdsmi_get_cpu_affinity_with_scope
# # SW Version Information
from .amdsmi_interface import amdsmi_get_gpu_driver_info
@@ -278,6 +283,7 @@ from .amdsmi_interface import AmdSmiProcessorType
from .amdsmi_interface import AmdSmiVirtualizationMode
from .amdsmi_interface import AmdSmiVramType
from .amdsmi_interface import AmdSmiVramVendor
from .amdsmi_interface import AmdSmiAffinityScope
# Exceptions
from .amdsmi_exception import AmdSmiLibraryException
+49
Voir le fichier
@@ -520,6 +520,9 @@ class AmdSmiVramVendor(IntEnum):
MICRON = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_MICRON
UNKNOWN = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_UNKNOWN
class AmdSmiAffinityScope(IntEnum):
    """Scope selector passed to amdsmi_get_cpu_affinity_with_scope.

    Mirrors the C enum amdsmi_affinity_scope_t exposed by amdsmi_wrapper.
    """
    # Affinity relative to the device's NUMA node.
    NUMA_SCOPE = amdsmi_wrapper.AMDSMI_AFFINITY_SCOPE_NODE
    # Affinity relative to CPU sockets.
    SOCKET_SCOPE = amdsmi_wrapper.AMDSMI_AFFINITY_SCOPE_SOCKET
class AmdSmiEventReader:
def __init__(
@@ -1738,6 +1741,23 @@ def amdsmi_get_cpu_model_name(
)
return f"{cpu_info.model_name}"
def amdsmi_get_cpu_cores_per_socket(sock_count: int) -> Dict[str, int]:
    """Return the core count reported for the first CPU socket.

    Parameters:
        sock_count: number of CPU sockets (a plain int or a ctypes integer).

    Returns:
        dict with keys ``socket_id`` and ``cores_per_socket`` describing the
        first entry filled in by the library.

    Raises:
        Whatever ``_check_res`` raises for a non-success library status.
    """
    # Accept both a Python int and a ctypes integer such as c_uint32.
    count = int(getattr(sock_count, "value", sock_count))

    # The C function writes `sock_count` entries, so the buffer must be an
    # array of that many structs; a single struct would be overrun on
    # multi-socket systems. Always keep at least one slot for the result.
    entries = (amdsmi_wrapper.amdsmi_sock_info_t * max(count, 1))()

    # socket_id is an input to the C call (it is used as the lookup key).
    # Assumes socket IDs are numbered 0..count-1 - TODO confirm on platforms
    # with sparse physical_package_id values.
    for index in range(len(entries)):
        entries[index].socket_id = index

    _check_res(
        amdsmi_wrapper.amdsmi_get_cpu_cores_per_socket(count, entries)
    )

    first = entries[0]
    return {"socket_id": first.socket_id,
            "cores_per_socket": first.cores_per_socket
    }
def amdsmi_get_cpu_socket_count():
    """Query the library for the number of CPU sockets and return it as an int.

    Raises whatever ``_check_res`` raises when the library call does not
    report success.
    """
    count_out = ctypes.c_uint32()
    status = amdsmi_wrapper.amdsmi_get_cpu_socket_count(ctypes.byref(count_out))
    _check_res(status)
    return count_out.value
def amdsmi_init(flag=AmdSmiInitFlags.INIT_AMD_GPUS):
if not isinstance(flag, AmdSmiInitFlags):
raise AmdSmiParameterException(flag, AmdSmiInitFlags)
@@ -1841,6 +1861,35 @@ def amdsmi_get_gpu_enumeration_info(processor_handle: amdsmi_wrapper.amdsmi_proc
return enumeration_info
def amdsmi_get_cpu_affinity_with_scope(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
    scope: AmdSmiAffinityScope
) -> List[int]:
    """Return the CPU affinity of a GPU for the requested scope.

    Parameters:
        processor_handle: handle of the device to query.
        scope: ``AmdSmiAffinityScope`` selecting NUMA-node or socket affinity.

    Returns:
        List of 64-bit integers filled in by the library. Its length is
        ceil(socket_count * cores_per_socket / 64).

    Raises:
        AmdSmiParameterException: if an argument has the wrong type.
        Whatever ``_check_res`` raises for a non-success library status.
    """
    if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
        raise AmdSmiParameterException(
            processor_handle, amdsmi_wrapper.amdsmi_processor_handle
        )
    if not isinstance(scope, AmdSmiAffinityScope):
        raise AmdSmiParameterException(scope, AmdSmiAffinityScope)

    socket_count = amdsmi_get_cpu_socket_count()
    sock_info = amdsmi_get_cpu_cores_per_socket(socket_count)
    core_count = sock_info['cores_per_socket']

    # One bit per core, packed into 64-bit words. Integer ceiling division
    # avoids the float round-trip (and the dependency on the math module).
    bits_per_word = ctypes.sizeof(ctypes.c_uint64) * 8
    word_count = (socket_count * core_count + bits_per_word - 1) // bits_per_word

    cpu_set = (ctypes.c_uint64 * word_count)()
    _check_res(
        amdsmi_wrapper.amdsmi_get_cpu_affinity_with_scope(
            processor_handle, ctypes.c_uint32(word_count), cpu_set, scope)
    )
    # Hand back a plain list, as the return annotation promises, instead of
    # leaking the ctypes array object to callers.
    return list(cpu_set)
def amdsmi_get_gpu_asic_info(
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
) -> Dict[str, Any]:
+50 -18
Voir le fichier
@@ -2131,6 +2131,15 @@ AMDSMI_VIRTUALIZATION_MODE_HOST = 2
AMDSMI_VIRTUALIZATION_MODE_GUEST = 3
AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH = 4
amdsmi_virtualization_mode_t = ctypes.c_uint32 # enum
# values for enumeration 'amdsmi_affinity_scope_t'
amdsmi_affinity_scope_t__enumvalues = {
0: 'AMDSMI_AFFINITY_SCOPE_NODE',
1: 'AMDSMI_AFFINITY_SCOPE_SOCKET',
}
AMDSMI_AFFINITY_SCOPE_NODE = 0
AMDSMI_AFFINITY_SCOPE_SOCKET = 1
amdsmi_affinity_scope_t = ctypes.c_uint32 # enum
class struct_amdsmi_smu_fw_version_t(Structure):
pass
@@ -2322,6 +2331,16 @@ struct_amdsmi_cpu_info_t._fields_ = [
]
amdsmi_cpu_info_t = struct_amdsmi_cpu_info_t
class struct_amdsmi_sock_info_t(Structure):
pass
struct_amdsmi_sock_info_t._pack_ = 1 # source:False
struct_amdsmi_sock_info_t._fields_ = [
('socket_id', ctypes.c_uint32),
('cores_per_socket', ctypes.c_uint32),
]
amdsmi_sock_info_t = struct_amdsmi_sock_info_t
uint64_t = ctypes.c_uint64
amdsmi_init = _libraries['libamd_smi.so'].amdsmi_init
amdsmi_init.restype = amdsmi_status_t
@@ -2369,6 +2388,10 @@ amdsmi_get_gpu_device_uuid.argtypes = [amdsmi_processor_handle, ctypes.POINTER(c
amdsmi_get_gpu_enumeration_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_enumeration_info
amdsmi_get_gpu_enumeration_info.restype = amdsmi_status_t
amdsmi_get_gpu_enumeration_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_enumeration_info_t)]
uint32_t = ctypes.c_uint32
amdsmi_get_cpu_affinity_with_scope = _libraries['libamd_smi.so'].amdsmi_get_cpu_affinity_with_scope
amdsmi_get_cpu_affinity_with_scope.restype = amdsmi_status_t
amdsmi_get_cpu_affinity_with_scope.argtypes = [amdsmi_processor_handle, uint32_t, ctypes.POINTER(ctypes.c_uint64), amdsmi_affinity_scope_t]
amdsmi_get_gpu_id = _libraries['libamd_smi.so'].amdsmi_get_gpu_id
amdsmi_get_gpu_id.restype = amdsmi_status_t
amdsmi_get_gpu_id.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint16)]
@@ -2378,7 +2401,6 @@ amdsmi_get_gpu_revision.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctyp
amdsmi_get_gpu_vendor_name = _libraries['libamd_smi.so'].amdsmi_get_gpu_vendor_name
amdsmi_get_gpu_vendor_name.restype = amdsmi_status_t
amdsmi_get_gpu_vendor_name.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_char), size_t]
uint32_t = ctypes.c_uint32
amdsmi_get_gpu_vram_vendor = _libraries['libamd_smi.so'].amdsmi_get_gpu_vram_vendor
amdsmi_get_gpu_vram_vendor.restype = amdsmi_status_t
amdsmi_get_gpu_vram_vendor.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_char), uint32_t]
@@ -2939,6 +2961,12 @@ amdsmi_get_cpu_model_name.argtypes = [amdsmi_processor_handle, ctypes.POINTER(st
amdsmi_get_esmi_err_msg = _libraries['libamd_smi.so'].amdsmi_get_esmi_err_msg
amdsmi_get_esmi_err_msg.restype = amdsmi_status_t
amdsmi_get_esmi_err_msg.argtypes = [amdsmi_status_t, ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
amdsmi_get_cpu_cores_per_socket = _libraries['libamd_smi.so'].amdsmi_get_cpu_cores_per_socket
amdsmi_get_cpu_cores_per_socket.restype = amdsmi_status_t
amdsmi_get_cpu_cores_per_socket.argtypes = [uint32_t, ctypes.POINTER(struct_amdsmi_sock_info_t)]
amdsmi_get_cpu_socket_count = _libraries['libamd_smi.so'].amdsmi_get_cpu_socket_count
amdsmi_get_cpu_socket_count.restype = amdsmi_status_t
amdsmi_get_cpu_socket_count.argtypes = [ctypes.POINTER(ctypes.c_uint32)]
__all__ = \
['AGG_BW0', 'AMDSMI_ACCELERATOR_DECODER',
'AMDSMI_ACCELERATOR_DMA', 'AMDSMI_ACCELERATOR_ENCODER',
@@ -2950,6 +2978,7 @@ __all__ = \
'AMDSMI_ACCELERATOR_PARTITION_QPX',
'AMDSMI_ACCELERATOR_PARTITION_SPX',
'AMDSMI_ACCELERATOR_PARTITION_TPX', 'AMDSMI_ACCELERATOR_XCC',
'AMDSMI_AFFINITY_SCOPE_NODE', 'AMDSMI_AFFINITY_SCOPE_SOCKET',
'AMDSMI_CACHE_PROPERTY_CPU_CACHE',
'AMDSMI_CACHE_PROPERTY_DATA_CACHE',
'AMDSMI_CACHE_PROPERTY_ENABLED',
@@ -3173,18 +3202,18 @@ __all__ = \
'amdsmi_accelerator_partition_profile_t',
'amdsmi_accelerator_partition_resource_profile_t',
'amdsmi_accelerator_partition_resource_type_t',
'amdsmi_accelerator_partition_type_t', 'amdsmi_asic_info_t',
'amdsmi_bdf_t', 'amdsmi_bit_field_t', 'amdsmi_board_info_t',
'amdsmi_cache_property_type_t', 'amdsmi_card_form_factor_t',
'amdsmi_clean_gpu_local_data', 'amdsmi_clk_info_t',
'amdsmi_clk_limit_type_t', 'amdsmi_clk_type_t',
'amdsmi_compute_partition_type_t', 'amdsmi_container_types_t',
'amdsmi_counter_command_t', 'amdsmi_counter_value_t',
'amdsmi_cper_guid_t', 'amdsmi_cper_hdr_t',
'amdsmi_cper_notify_type_t', 'amdsmi_cper_sev_t',
'amdsmi_cper_timestamp_t', 'amdsmi_cper_valid_bits_t',
'amdsmi_cpu_apb_disable', 'amdsmi_cpu_apb_enable',
'amdsmi_cpu_info_t', 'amdsmi_cpu_util_t',
'amdsmi_accelerator_partition_type_t', 'amdsmi_affinity_scope_t',
'amdsmi_asic_info_t', 'amdsmi_bdf_t', 'amdsmi_bit_field_t',
'amdsmi_board_info_t', 'amdsmi_cache_property_type_t',
'amdsmi_card_form_factor_t', 'amdsmi_clean_gpu_local_data',
'amdsmi_clk_info_t', 'amdsmi_clk_limit_type_t',
'amdsmi_clk_type_t', 'amdsmi_compute_partition_type_t',
'amdsmi_container_types_t', 'amdsmi_counter_command_t',
'amdsmi_counter_value_t', 'amdsmi_cper_guid_t',
'amdsmi_cper_hdr_t', 'amdsmi_cper_notify_type_t',
'amdsmi_cper_sev_t', 'amdsmi_cper_timestamp_t',
'amdsmi_cper_valid_bits_t', 'amdsmi_cpu_apb_disable',
'amdsmi_cpu_apb_enable', 'amdsmi_cpu_info_t', 'amdsmi_cpu_util_t',
'amdsmi_cpusocket_handle', 'amdsmi_ddr_bw_metrics_t',
'amdsmi_dev_perf_level_t', 'amdsmi_dimm_power_t',
'amdsmi_dimm_thermal_t', 'amdsmi_dpm_level_t',
@@ -3198,11 +3227,12 @@ __all__ = \
'amdsmi_free_name_value_pairs', 'amdsmi_freq_ind_t',
'amdsmi_freq_volt_region_t', 'amdsmi_frequencies_t',
'amdsmi_frequency_range_t', 'amdsmi_fw_block_t',
'amdsmi_fw_info_t', 'amdsmi_get_afids_from_cper',
'amdsmi_fw_info_t', 'amdsmi_get_afids_from_cper',
'amdsmi_get_cpu_affinity_with_scope',
'amdsmi_get_clk_freq', 'amdsmi_get_clock_info',
'amdsmi_get_cpu_cclk_limit', 'amdsmi_get_cpu_core_boostlimit',
'amdsmi_get_cpu_core_current_freq_limit',
'amdsmi_get_cpu_core_energy',
'amdsmi_get_cpu_core_energy', 'amdsmi_get_cpu_cores_per_socket',
'amdsmi_get_cpu_current_io_bandwidth',
'amdsmi_get_cpu_current_xgmi_bw', 'amdsmi_get_cpu_ddr_bw',
'amdsmi_get_cpu_dimm_power_consumption',
@@ -3215,6 +3245,7 @@ __all__ = \
'amdsmi_get_cpu_pwr_svi_telemetry_all_rails',
'amdsmi_get_cpu_smu_fw_version',
'amdsmi_get_cpu_socket_c0_residency',
'amdsmi_get_cpu_socket_count',
'amdsmi_get_cpu_socket_current_active_freq_limit',
'amdsmi_get_cpu_socket_energy',
'amdsmi_get_cpu_socket_freq_range',
@@ -3332,8 +3363,9 @@ __all__ = \
'amdsmi_set_gpu_process_isolation', 'amdsmi_set_power_cap',
'amdsmi_set_soc_pstate', 'amdsmi_set_xgmi_plpd',
'amdsmi_shut_down', 'amdsmi_smu_fw_version_t',
'amdsmi_socket_handle', 'amdsmi_status_code_to_string',
'amdsmi_status_t', 'amdsmi_stop_gpu_event_notification',
'amdsmi_sock_info_t', 'amdsmi_socket_handle',
'amdsmi_status_code_to_string', 'amdsmi_status_t',
'amdsmi_stop_gpu_event_notification',
'amdsmi_temp_range_refresh_rate_t', 'amdsmi_temperature_metric_t',
'amdsmi_temperature_type_t', 'amdsmi_topo_get_link_type',
'amdsmi_topo_get_link_weight', 'amdsmi_topo_get_numa_node_number',
@@ -3380,7 +3412,7 @@ __all__ = \
'struct_amdsmi_proc_info_t', 'struct_amdsmi_process_info_t',
'struct_amdsmi_range_t', 'struct_amdsmi_ras_feature_t',
'struct_amdsmi_retired_page_record_t',
'struct_amdsmi_smu_fw_version_t',
'struct_amdsmi_smu_fw_version_t', 'struct_amdsmi_sock_info_t',
'struct_amdsmi_temp_range_refresh_rate_t',
'struct_amdsmi_topology_nearest_t',
'struct_amdsmi_utilization_counter_t',
+110 -2
Voir le fichier
@@ -3941,8 +3941,8 @@ amdsmi_get_gpu_cper_entries(
std::string path = std::string("/sys/kernel/debug/dri/") +
std::to_string(gpu_device->get_card_id()) +
"/amdgpu_ring_cper";
return amdsmi_get_gpu_cper_entries_by_path(
path.c_str(),
severity_mask,
@@ -4795,6 +4795,85 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle,
return status;
}
/**
 * Writes the CPU affinity of the GPU behind @p processor_handle into @p cpu_set.
 *
 * - AMDSMI_AFFINITY_SCOPE_NODE: cpu_set receives a bitmask (bit N == CPU N) of
 *   the CPUs that belong to the GPU's NUMA node.
 * - AMDSMI_AFFINITY_SCOPE_SOCKET: cpu_set receives the sorted, de-duplicated
 *   CPU socket IDs of that NUMA node, one ID per element. Unused trailing
 *   elements repeat the last ID so they never introduce a socket (e.g. 0) the
 *   GPU is not attached to.
 *
 * Returns AMDSMI_STATUS_INVAL for null/empty arguments, AMDSMI_STATUS_NOT_FOUND
 * when no NUMA node or socket can be determined, and
 * AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS for an unknown scope or when cpu_set is too
 * small to hold every socket ID.
 */
amdsmi_status_t amdsmi_get_cpu_affinity_with_scope(amdsmi_processor_handle processor_handle,
        uint32_t cpu_set_size, uint64_t *cpu_set, amdsmi_affinity_scope_t scope)
{
    AMDSMI_CHECK_INIT();

    if (processor_handle == nullptr || cpu_set == nullptr || cpu_set_size == 0) {
        return AMDSMI_STATUS_INVAL;
    }

    // Retrieve GPU device from the processor handle
    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    amdsmi_status_t status = get_gpu_device_from_handle(processor_handle, &gpu_device);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    uint32_t numa_node = 0;
    status = amdsmi_topo_get_numa_node_number(processor_handle, &numa_node);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    int32_t node_id = static_cast<int32_t>(numa_node);
    status = amdsmi_get_gpu_topo_numa_affinity(processor_handle, &node_id);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    if (node_id < 0) {
        return AMDSMI_STATUS_NOT_FOUND;
    }

    std::memset(cpu_set, 0, cpu_set_size * sizeof(uint64_t));

    switch (scope) {
        case AMDSMI_AFFINITY_SCOPE_NODE:
        {
            // node_id is known to be non-negative here, so the helper's
            // "invalid node" sentinel cannot occur. Testing bitmask[0] against
            // it would wrongly reject the valid mask 0x7FFFFFFF (CPUs 0-30).
            const std::vector<uint64_t> bitmask =
                gpu_device->get_bitmask_from_numa_node(node_id, cpu_set_size);
            const size_t words = std::min<size_t>(bitmask.size(), cpu_set_size);
            std::copy_n(bitmask.begin(), words, cpu_set);
            break;
        }
        case AMDSMI_AFFINITY_SCOPE_SOCKET:
        {
            std::vector<uint32_t> sockets =
                amd::smi::AMDSmiSystem::getInstance().get_cpu_sockets_from_numa_node(node_id);
            if (sockets.empty()) {
                // Nothing could be read for this node; do not index an empty vector.
                return AMDSMI_STATUS_NOT_FOUND;
            }
            if (sockets[0] == static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
                return AMDSMI_STATUS_REFCOUNT_OVERFLOW;
            }

            std::sort(sockets.begin(), sockets.end());
            sockets.erase(std::unique(sockets.begin(), sockets.end()), sockets.end());

            if (sockets.size() > cpu_set_size) {
                // Caller's buffer cannot hold every socket ID; never write past it.
                return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
            }

            // Socket IDs first, then pad with the last valid ID so the zeroed
            // tail cannot be mistaken for "socket 0".
            for (uint32_t i = 0; i < cpu_set_size; ++i) {
                const size_t src = std::min<size_t>(i, sockets.size() - 1);
                cpu_set[i] = sockets[src];
            }
            break;
        }
        default:
            return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
    }

    return AMDSMI_STATUS_SUCCESS;
}
#ifdef ENABLE_ESMI_LIB
static amdsmi_status_t amdsmi_errno_to_esmi_status(amdsmi_status_t status)
@@ -5905,6 +5984,35 @@ amdsmi_status_t amdsmi_get_cpu_model_name(amdsmi_processor_handle processor_hand
return AMDSMI_STATUS_SUCCESS;
}
/**
 * Fills cores_per_socket for the first @p sock_count entries of @p sock_info.
 *
 * Each entry's socket_id is used as the lookup key into the per-socket counts
 * gathered by AMDSmiSystem::get_sys_cpu_cores_per_socket(); unknown socket IDs
 * get a count of 0. Returns AMDSMI_STATUS_INVAL when entries are requested but
 * @p sock_info is null.
 */
amdsmi_status_t amdsmi_get_cpu_cores_per_socket(uint32_t sock_count, amdsmi_sock_info_t *sock_info)
{
    if (sock_count != 0 && sock_info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    const std::map<uint32_t, uint32_t> socket_core_count =
        amd::smi::AMDSmiSystem::getInstance().get_sys_cpu_cores_per_socket();

    for (uint32_t i = 0; i < sock_count; ++i) {
        const auto it = socket_core_count.find(sock_info[i].socket_id);
        sock_info[i].cores_per_socket = (it != socket_core_count.end()) ? it->second : 0;
    }

    return AMDSMI_STATUS_SUCCESS;
}
/**
 * Stores the number of CPU sockets reported by
 * AMDSmiSystem::get_sys_num_of_cpu_sockets() in @p sock_count.
 *
 * Returns AMDSMI_STATUS_INVAL for a null output pointer; otherwise forwards
 * the status of the underlying query. @p sock_count is only written on success.
 */
amdsmi_status_t amdsmi_get_cpu_socket_count(uint32_t *sock_count)
{
    if (sock_count == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    uint32_t sock_num = 0;
    const amdsmi_status_t status =
        amd::smi::AMDSmiSystem::getInstance().get_sys_num_of_cpu_sockets(&sock_num);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    *sock_count = sock_num;
    return AMDSMI_STATUS_SUCCESS;
}
amdsmi_status_t amdsmi_get_cpu_handles(uint32_t *cpu_count,
amdsmi_processor_handle *processor_handles)
{
+33 -1
Voir le fichier
@@ -323,7 +323,39 @@ std::string AMDSmiGPUDevice::bdf_to_string() const {
return oss.str();
}
/**
 * Builds a CPU bitmask (bit N == CPU N) for NUMA node @p node_id from
 * /sys/devices/system/node/node<id>/cpulist.
 *
 * @param node_id NUMA node to query. A negative value yields the sentinel
 *                std::numeric_limits<int32_t>::max() in element 0.
 * @param size    Number of 64-bit words in the returned vector.
 * @return Vector of exactly @p size words. CPUs whose index does not fit in
 *         size * 64 bits and malformed list entries are ignored; the mask is
 *         all zero if the file cannot be opened.
 */
std::vector<uint64_t> AMDSmiGPUDevice::get_bitmask_from_numa_node(int32_t node_id, uint32_t size) const {
  std::vector<uint64_t> bitmask(size, 0);
  if (size == 0) {
    // No room for either a mask or the sentinel; never index an empty vector.
    return bitmask;
  }

  if (node_id < 0) {
    bitmask[0] = std::numeric_limits<int32_t>::max();
    return bitmask;
  }

  const uint64_t bit_capacity = static_cast<uint64_t>(size) * 64;
  const std::string path = "/sys/devices/system/node/node" + std::to_string(node_id) + "/cpulist";
  std::ifstream file(path);
  if (!file.is_open()) {
    return bitmask;
  }

  std::string line;
  while (std::getline(file, line)) {
    std::istringstream entries(line);
    std::string entry;
    // The list is comma separated; each entry is either "N" or "first-last".
    while (std::getline(entries, entry, ',')) {
      uint64_t first = 0;
      uint64_t last = 0;
      try {
        const size_t hyphen = entry.find('-');
        first = std::stoull(entry.substr(0, hyphen));
        last = (hyphen == std::string::npos) ? first : std::stoull(entry.substr(hyphen + 1));
      } catch (const std::exception &) {
        // Malformed entry: skip it rather than let the exception escape.
        continue;
      }
      // Clamp to the caller-provided mask width to avoid out-of-bounds writes.
      for (uint64_t cpu = first; cpu <= last && cpu < bit_capacity; ++cpu) {
        bitmask[cpu / 64] |= (1ULL << (cpu % 64));
      }
    }
  }

  return bitmask;
}
} // namespace smi
} // namespace amd
+80 -1
Voir le fichier
@@ -31,6 +31,7 @@
#include "amd_smi/impl/amd_smi_utils.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_main.h"
#include <map>
namespace amd {
namespace smi {
@@ -103,7 +104,7 @@ amdsmi_status_t AMDSmiSystem::get_cpu_model_name(uint32_t socket_id, std::string
if (!cpu_info.is_open()) {
std::cerr << "Failed to open /proc/cpuinfo:" << strerror(errno) << std::endl;
return AMDSMI_STATUS_FILE_ERROR;
} else {
} else {
uint32_t current_socket_id = -1;
while (std::getline(cpu_info, info)) {
if (info.find("processor") != std::string::npos) {
@@ -126,7 +127,85 @@ amdsmi_status_t AMDSmiSystem::get_cpu_model_name(uint32_t socket_id, std::string
}
return AMDSMI_STATUS_SUCCESS;
}
#endif
// Walks /sys/devices/system/cpu/ and, for every entry whose name contains
// "cpu" and whose topology/physical_package_id and topology/core_id files can
// both be opened, increments the counter of the package id read from the
// former. Returns the resulting package-id -> count map.
std::map<uint32_t, uint32_t> AMDSmiSystem::get_sys_cpu_cores_per_socket() {
  std::map<uint32_t, uint32_t> count_by_package;
  std::string cpu_root = "/sys/devices/system/cpu/";

  auto tally_entry = [&count_by_package](const std::string &path) {
    const std::string leaf(basename(path.c_str()));
    if (leaf.find("cpu") == std::string::npos) {
      return;
    }

    std::ifstream package_stream(path + "/topology/physical_package_id");
    std::ifstream core_stream(path + "/topology/core_id");
    if (!package_stream.is_open() || !core_stream.is_open()) {
      return;
    }

    uint32_t package_id, core_id;
    package_stream >> package_id;
    core_stream >> core_id;
    count_by_package[package_id]++;
  };

  iterate_directory(cpu_root, tally_entry);
  return count_by_package;
}
// Counts the distinct physical_package_id values found under
// /sys/devices/system/cpu/<entry>/topology/ for entries whose name contains
// "cpu", and stores that count in *sock_num. Always returns
// AMDSMI_STATUS_SUCCESS.
amdsmi_status_t AMDSmiSystem::get_sys_num_of_cpu_sockets(uint32_t *sock_num) {
  std::map<uint32_t, uint32_t> cpus_by_package;
  std::string cpu_root = "/sys/devices/system/cpu/";

  auto record_package = [&cpus_by_package](const std::string &path) {
    const std::string leaf(basename(path.c_str()));
    if (leaf.find("cpu") == std::string::npos) {
      return;
    }

    std::ifstream package_stream(path + "/topology/physical_package_id");
    if (!package_stream.is_open()) {
      return;
    }

    uint32_t package_id;
    package_stream >> package_id;
    cpus_by_package[package_id]++;
  };

  iterate_directory(cpu_root, record_package);

  // Each distinct key is one socket.
  *sock_num = static_cast<uint32_t>(cpus_by_package.size());
  return AMDSMI_STATUS_SUCCESS;
}
/**
 * Returns the sorted, de-duplicated CPU socket (physical package) IDs of the
 * CPUs listed in /sys/devices/system/node/node<numa_node>/cpulist.
 *
 * @param numa_node NUMA node to query. A negative value yields a one-element
 *                  vector holding the sentinel std::numeric_limits<int32_t>::max().
 * @return Socket IDs; empty if the cpulist or the per-CPU topology files
 *         cannot be read.
 */
std::vector<uint32_t> AMDSmiSystem::get_cpu_sockets_from_numa_node(int32_t numa_node) {
  std::vector<uint32_t> sockets;

  if (numa_node < 0) {
    // push_back, not sockets[0]: the vector is still empty here.
    sockets.push_back(static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
    return sockets;
  }

  std::ifstream node_info("/sys/devices/system/node/node" + std::to_string(numa_node) + "/cpulist");
  std::string cpulist;
  if (node_info.is_open() && std::getline(node_info, cpulist)) {
    std::istringstream entries(cpulist);
    std::string entry;
    // cpulist is comma separated; each entry is either "N" or "first-last".
    // Plain `stream >> number` would stop at the first '-' or ','.
    while (std::getline(entries, entry, ',')) {
      uint64_t first = 0;
      uint64_t last = 0;
      try {
        const size_t hyphen = entry.find('-');
        first = std::stoull(entry.substr(0, hyphen));
        last = (hyphen == std::string::npos) ? first : std::stoull(entry.substr(hyphen + 1));
      } catch (const std::exception &) {
        continue;  // malformed entry: ignore it
      }

      for (uint64_t cpu = first; cpu <= last; ++cpu) {
        std::ifstream cpu_info("/sys/devices/system/cpu/cpu" + std::to_string(cpu) +
                               "/topology/physical_package_id");
        uint32_t socket = 0;
        // Only record IDs that were actually read from the file.
        if (cpu_info.is_open() && (cpu_info >> socket)) {
          sockets.push_back(socket);
        }
      }
    }
  }

  // Discarding duplicate socket entries
  std::sort(sockets.begin(), sockets.end());
  sockets.erase(std::unique(sockets.begin(), sockets.end()), sockets.end());

  return sockets;
}
amdsmi_status_t AMDSmiSystem::init(uint64_t flags) {
init_flag_ = flags;
+28 -6
Voir le fichier
@@ -36,15 +36,15 @@
#include <dirent.h>
#include <sys/types.h>
#include <memory>
#include <random>
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <regex>
#include <cstdio>
#include <sstream>
#include <iterator>
#include <algorithm>
#include <memory>
#include <random>
#include <regex>
#include <sstream>
#include "amd_smi/impl/amd_smi_utils.h"
#include "amd_smi/impl/amd_smi_system.h"
@@ -1046,3 +1046,25 @@ void amdsmi_wait_for_user_input(void) {
}
}
}
/**
 * Calls @p entry_callback once for every entry of directory @p base_path
 * (including "." and ".."), passing the entry's path, i.e. @p base_path joined
 * with the entry name.
 *
 * @param base_path      Directory to iterate; a trailing '/' is optional.
 * @param entry_callback Invoked with "<base_path>/<entry name>" per entry.
 * @return true if the directory was opened and read to the end without error,
 *         false if it could not be opened or readdir() reported an error.
 */
bool iterate_directory(const std::string &base_path,
                       std::function<void(const std::string &)> entry_callback) {
  DIR *dir = opendir(base_path.c_str());
  if (dir == nullptr) {
    return false;
  }

  // Callers append "/topology/..." to the argument, so hand them a real path
  // rather than the bare entry name.
  std::string prefix = base_path;
  if (!prefix.empty() && prefix.back() != '/') {
    prefix += '/';
  }

  bool read_ok = true;
  while (true) {
    // readdir() returns NULL both at end-of-directory and on error; only errno
    // tells them apart, so clear it before every call. The callback or an
    // earlier library call may have left a stale value behind.
    errno = 0;
    const struct dirent *entry = readdir(dir);
    if (entry == nullptr) {
      read_ok = (errno == 0);
      break;
    }
    if (entry_callback) {
      entry_callback(prefix + entry->d_name);
    }
  }

  closedir(dir);
  return read_ok;
}