[SWDEV-512393] Added amdsmi_get_cpu_affinity_with_scope (#198)
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com> Signed-off-by: Deepak Mewar <deepak.mewar@amd.com>
Cette révision appartient à :
@@ -12,6 +12,10 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
- Added new C and Python API `amdsmi_get_cpu_model_name`
|
||||
- Not sourced from esmi library.
|
||||
|
||||
- **Added `amdsmi_get_cpu_affinity_with_scope()`**.
|
||||
|
||||
### Added
|
||||
|
||||
- **Added support for GPU metrics 1.8**.
|
||||
- Added new fields for `amdsmi_gpu_xcp_metrics_t` including:
|
||||
- Adding the following metrics to allow new calculations for violation status:
|
||||
|
||||
@@ -832,8 +832,24 @@ class AMDSMICommands():
|
||||
numa_affinity = "N/A"
|
||||
logging.debug("Failed to get numa affinity for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
cpu_set = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.NUMA_SCOPE)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
cpu_set = []
|
||||
cpu_set.append(-1)
|
||||
logging.debug("Failed to get cpu affinity for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
cpusockets = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.SOCKET_SCOPE)
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
cpusockets = []
|
||||
cpusockets.append(-1)
|
||||
logging.debug("Failed to get socket affinity for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
static_dict['numa'] = {'node' : numa_node_number,
|
||||
'affinity' : numa_affinity}
|
||||
'affinity' : numa_affinity,
|
||||
'CPU affinity' : [hex(cpus) for cpus in cpu_set],
|
||||
'Socket affinity' : [socket for socket in set(cpusockets)]}
|
||||
if args.vram:
|
||||
vram_info_dict = {"type" : "N/A",
|
||||
"vendor" : "N/A",
|
||||
|
||||
@@ -4180,6 +4180,42 @@ except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
### amdsmi_get_cpu_affinity_with_scope
|
||||
|
||||
Description: Returns list of bitmask information for the given GPU.
|
||||
|
||||
Input parameters:
|
||||
|
||||
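* `processor_handle` device which to query
* `scope` an `AmdSmiAffinityScope` value selecting NUMA-node (`NUMA_SCOPE`) or socket (`SOCKET_SCOPE`) affinity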
|
||||
|
||||
Output: List with fields
|
||||
|
||||
Field | Description
|
||||
---|---
|
||||
`array_size` | array size = (num of sockets * num of cores)/ size of 64-bit
|
||||
`scope` | enum value for numa or socket affinity
|
||||
|
||||
Exceptions that can be thrown by `amdsmi_get_cpu_affinity_with_scope` function:
|
||||
|
||||
* `AmdSmiLibraryException`
|
||||
* `AmdSmiRetryException`
|
||||
* `AmdSmiParameterException`
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
try:
|
||||
devices = amdsmi_get_processor_handles()
|
||||
if len(devices) == 0:
|
||||
print("No GPUs on machine")
|
||||
else:
|
||||
for device in devices:
|
||||
bitmask = amdsmi_get_cpu_affinity_with_scope(device, AmdSmiAffinityScope.NUMA_SCOPE)
print([hex(word) for word in bitmask])
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
## CPU APIs
|
||||
|
||||
### amdsmi_get_processor_info
|
||||
@@ -5274,6 +5310,7 @@ try:
|
||||
except AmdSmiException as e:
|
||||
print(e)
|
||||
```
|
||||
|
||||
### amdsmi_get_afids_from_cper
|
||||
|
||||
Description: Get the AFIDs from CPER buffer
|
||||
|
||||
@@ -33,8 +33,10 @@ endif()
|
||||
|
||||
# add package search paths
|
||||
set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${ROCM_DIR} ../../../)
|
||||
set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib ${ROCM_DIR}/lib64)
|
||||
|
||||
set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib)
|
||||
if(EXISTS ${ROCM_DIR}/lib64)
|
||||
set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} ${ROCM_DIR}/lib64)
|
||||
endif()
|
||||
find_package(amd_smi CONFIG REQUIRED)
|
||||
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
|
||||
@@ -2085,6 +2085,18 @@ typedef enum {
|
||||
AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH
|
||||
} amdsmi_virtualization_mode_t;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Scope for Numa affinity or Socket affinity
|
||||
*
|
||||
* @cond @tag{gpu_bm_linux} @endcond
|
||||
*/
|
||||
typedef enum {
|
||||
AMDSMI_AFFINITY_SCOPE_NODE = 0, // Memory affinity as numa node
|
||||
AMDSMI_AFFINITY_SCOPE_SOCKET = 1 // socket affinity
|
||||
} amdsmi_affinity_scope_t;
|
||||
|
||||
|
||||
#define AMDSMI_DEFAULT_VARIANT 0xFFFFFFFFFFFFFFFF
|
||||
|
||||
#ifdef ENABLE_ESMI_LIB
|
||||
@@ -2309,6 +2321,16 @@ typedef struct {
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief cpu socket info data
|
||||
*
|
||||
* @cond @tag{cpu_bm} @endcond
|
||||
*/
|
||||
typedef struct {
|
||||
uint32_t socket_id;
|
||||
uint32_t cores_per_socket;
|
||||
} amdsmi_sock_info_t;
|
||||
|
||||
/*****************************************************************************/
|
||||
/** @defgroup tagInitShutdown Initialization and Shutdown
|
||||
* @{
|
||||
@@ -2698,6 +2720,34 @@ amdsmi_get_gpu_device_uuid(amdsmi_processor_handle processor_handle, unsigned in
|
||||
amdsmi_status_t
|
||||
amdsmi_get_gpu_enumeration_info(amdsmi_processor_handle processor_handle, amdsmi_enumeration_info_t *info);
|
||||
|
||||
/**
|
||||
* @brief Retrieves an array of uint64_t (sized to cpu_set_size) of bitmasks with the
|
||||
* affinity within numa node or socket for the device.
|
||||
*
|
||||
* @ingroup tagProcDiscovery
|
||||
*
|
||||
* @platform{gpu_bm_linux}
|
||||
*
|
||||
* @details Given a processor handle @p processor_handle, the size of the cpu_set array @p cpu_set_size,
|
||||
* and a pointer to an array of uint64_t @p cpu_set, and @p scope, this function will write the CPU affinity bitmask
|
||||
* to the array pointed to by @p cpu_set.
|
||||
*
|
||||
* The user must allocate enough memory for the cpu_set array. The size of the array is determined by the
|
||||
* number of CPU cores in the system. As an example, if there are 2 CPUs and each has 112 cores, the size
|
||||
* should be ceiling(2*112/64) = 4, where 64 is the bits of uint64_t. The function will write the CPU affinity bitmask
|
||||
* to the array. For example, to describe the CPU cores 0-55,112-167, it will set the 0-55 and 112-167 bits
|
||||
* to 1 and the rest of the bits to 0 in the cpu_set array.
|
||||
*
|
||||
* @param[in] processor_handle a processor handle
|
||||
* @param[in] cpu_set_size The size of the cpu_set array that is safe to access
|
||||
* @param[in,out] cpu_set Array reference in which to return a bitmask of CPU cores that this processor affinities with.
|
||||
* @param[in] scope Scope for socket or numa affinity.
|
||||
*
|
||||
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
|
||||
*/
|
||||
amdsmi_status_t amdsmi_get_cpu_affinity_with_scope(amdsmi_processor_handle processor_handle,
|
||||
uint32_t cpu_set_size, uint64_t *cpu_set, amdsmi_affinity_scope_t scope);
|
||||
|
||||
/** @} End tagProcDiscovery */
|
||||
|
||||
/*****************************************************************************/
|
||||
@@ -6987,14 +7037,14 @@ amdsmi_status_t amdsmi_get_hsmp_metrics_table(amdsmi_processor_handle processor_
|
||||
/** @} tagHSMPMetricsTable */
|
||||
|
||||
/*****************************************************************************/
|
||||
/** @defgroup tagAuxillary Auxillary functions
|
||||
/** @defgroup cpuAuxillary Auxillary functions
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Get first online core on socket.
|
||||
*
|
||||
* @ingroup tagAuxillary
|
||||
* @ingroup cpuAuxillary
|
||||
*
|
||||
* @platform{cpu_bm}
|
||||
*
|
||||
@@ -7010,7 +7060,7 @@ amdsmi_status_t amdsmi_first_online_core_on_cpu_socket(amdsmi_processor_handle p
|
||||
/**
|
||||
* @brief Get CPU family.
|
||||
*
|
||||
* @ingroup tagAuxillary
|
||||
* @ingroup cpuAuxillary
|
||||
*
|
||||
* @platform{cpu_bm}
|
||||
*
|
||||
@@ -7023,7 +7073,7 @@ amdsmi_status_t amdsmi_get_cpu_family(uint32_t *cpu_family);
|
||||
/**
|
||||
* @brief Get CPU model.
|
||||
*
|
||||
* @ingroup tagAuxillary
|
||||
* @ingroup cpuAuxillary
|
||||
*
|
||||
* @platform{cpu_bm}
|
||||
*
|
||||
@@ -7036,7 +7086,7 @@ amdsmi_status_t amdsmi_get_cpu_model(uint32_t *cpu_model);
|
||||
/**
|
||||
* @brief Retrieve the CPU processor model name based on the processor index.
|
||||
*
|
||||
* @ingroup tagAuxillary
|
||||
* @ingroup cpuAuxillary
|
||||
*
|
||||
* @platform{cpu_bm}
|
||||
*
|
||||
@@ -7062,7 +7112,7 @@ amdsmi_status_t amdsmi_get_cpu_model_name(amdsmi_processor_handle processor_hand
|
||||
/**
|
||||
* @brief Get a description of provided AMDSMI error status for esmi errors.
|
||||
*
|
||||
* @ingroup tagAuxillary
|
||||
* @ingroup cpuAuxillary
|
||||
*
|
||||
* @platform{cpu_bm}
|
||||
*
|
||||
@@ -7078,7 +7128,33 @@ amdsmi_status_t amdsmi_get_cpu_model_name(amdsmi_processor_handle processor_hand
|
||||
*/
|
||||
amdsmi_status_t amdsmi_get_esmi_err_msg(amdsmi_status_t status, const char **status_string);
|
||||
|
||||
/** @} tagAuxillary */
|
||||
/**
|
||||
* @brief Get cpu cores per socket from sys filesystem.
|
||||
*
|
||||
* @ingroup cpuAuxillary
|
||||
*
|
||||
* @platform{cpu_bm}
|
||||
*
|
||||
* @param[in] sock_count - cpu socket count
|
||||
* @param[in,out] soc_info - Input buffer to return the cpu cores per socket
|
||||
*
|
||||
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
|
||||
*/
|
||||
amdsmi_status_t amdsmi_get_cpu_cores_per_socket(uint32_t sock_count, amdsmi_sock_info_t *soc_info);
|
||||
|
||||
/**
|
||||
* @brief Get CPU socket count from sys filesystem.
|
||||
*
|
||||
* @ingroup cpuAuxillary
|
||||
*
|
||||
* @platform{cpu_bm}
|
||||
*
|
||||
* @param[in,out] sock_count - Input buffer to return the cpu socket count
|
||||
*
|
||||
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
|
||||
*/
|
||||
amdsmi_status_t amdsmi_get_cpu_socket_count(uint32_t *sock_count);
|
||||
/** @} cpuAuxillary */
|
||||
|
||||
#endif
|
||||
|
||||
@@ -7087,4 +7163,3 @@ amdsmi_status_t amdsmi_get_esmi_err_msg(amdsmi_status_t status, const char **sta
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif // __AMDSMI_H__
|
||||
|
||||
|
||||
@@ -73,6 +73,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
|
||||
|
||||
// New methods for -e feature
|
||||
std::string bdf_to_string() const; // -e feature
|
||||
std::vector<uint64_t> get_bitmask_from_numa_node(int32_t node_id, uint32_t size) const;
|
||||
|
||||
private:
|
||||
uint32_t gpu_id_;
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "amd_smi/impl/amd_smi_socket.h"
|
||||
#include "amd_smi/impl/amd_smi_processor.h"
|
||||
@@ -60,6 +61,11 @@ class AMDSmiSystem {
|
||||
|
||||
amdsmi_status_t get_cpu_model_name(uint32_t socket_id, std::string *model_name);
|
||||
|
||||
std::map<uint32_t, uint32_t> get_sys_cpu_cores_per_socket() ;
|
||||
|
||||
amdsmi_status_t get_sys_num_of_cpu_sockets(uint32_t *sock_num);
|
||||
|
||||
std::vector<uint32_t> get_cpu_sockets_from_numa_node(int32_t numa_node);
|
||||
private:
|
||||
AMDSmiSystem() : init_flag_(AMDSMI_INIT_AMD_GPUS) {}
|
||||
|
||||
|
||||
@@ -23,10 +23,12 @@
|
||||
#ifndef AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_
|
||||
#define AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_
|
||||
|
||||
#include <dirent.h>
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <functional>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "amd_smi/impl/amd_smi_gpu_device.h"
|
||||
@@ -176,4 +178,18 @@ constexpr T translate_umax_or_assign_value(U source_value, V target_value)
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Iterates over all entries in a directory.
|
||||
*
|
||||
* @details Given a directory in const std::string & base_path, and a callback function
|
||||
* entry_callback, this function will open the directory and iterate through all entries
|
||||
* in that directory. For each entry it will call the entry_callback function with the
|
||||
* path of that entry
|
||||
*
|
||||
* @param[in] base_path the path of the directory to iterate in
|
||||
*
|
||||
* @retval ::true if the iteration was successful
|
||||
* ::false if the iteration failed
|
||||
*/
|
||||
bool iterate_directory(const std::string &base_path, std::function<void(const std::string &)> entry_callback);
|
||||
#endif // AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_
|
||||
|
||||
@@ -86,6 +86,11 @@ from .amdsmi_interface import amdsmi_get_gpu_device_bdf
|
||||
from .amdsmi_interface import amdsmi_get_gpu_device_uuid
|
||||
from .amdsmi_interface import amdsmi_get_gpu_enumeration_info
|
||||
|
||||
# # Functions not dependent on ESMI library
|
||||
from .amdsmi_interface import amdsmi_get_cpu_socket_count
|
||||
from .amdsmi_interface import amdsmi_get_cpu_cores_per_socket
|
||||
from .amdsmi_interface import amdsmi_get_cpu_affinity_with_scope
|
||||
|
||||
# # SW Version Information
|
||||
from .amdsmi_interface import amdsmi_get_gpu_driver_info
|
||||
|
||||
@@ -278,6 +283,7 @@ from .amdsmi_interface import AmdSmiProcessorType
|
||||
from .amdsmi_interface import AmdSmiVirtualizationMode
|
||||
from .amdsmi_interface import AmdSmiVramType
|
||||
from .amdsmi_interface import AmdSmiVramVendor
|
||||
from .amdsmi_interface import AmdSmiAffinityScope
|
||||
|
||||
# Exceptions
|
||||
from .amdsmi_exception import AmdSmiLibraryException
|
||||
|
||||
@@ -520,6 +520,9 @@ class AmdSmiVramVendor(IntEnum):
|
||||
MICRON = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_MICRON
|
||||
UNKNOWN = amdsmi_wrapper.AMDSMI_VRAM_VENDOR_UNKNOWN
|
||||
|
||||
class AmdSmiAffinityScope(IntEnum):
    """Scope selector accepted by ``amdsmi_get_cpu_affinity_with_scope``.

    Each member mirrors the matching ``amdsmi_affinity_scope_t`` value
    exported by the ctypes wrapper.
    """

    # Affinity expressed relative to the NUMA node.
    NUMA_SCOPE = amdsmi_wrapper.AMDSMI_AFFINITY_SCOPE_NODE
    # Affinity expressed relative to the CPU socket.
    SOCKET_SCOPE = amdsmi_wrapper.AMDSMI_AFFINITY_SCOPE_SOCKET
|
||||
|
||||
class AmdSmiEventReader:
|
||||
def __init__(
|
||||
@@ -1738,6 +1741,23 @@ def amdsmi_get_cpu_model_name(
|
||||
)
|
||||
return f"{cpu_info.model_name}"
|
||||
|
||||
def amdsmi_get_cpu_cores_per_socket(sock_count: int) -> Dict[str, int]:
    """Query the per-socket CPU count and return the entry for the first socket.

    Parameters:
        sock_count: number of CPU sockets to query (a plain ``int`` or a
            ``ctypes.c_uint32``).

    Returns:
        A dict with the keys ``socket_id`` and ``cores_per_socket`` taken from
        the first element filled in by the library.

    Raises:
        Whatever ``_check_res`` raises for a non-success library status.
    """
    # Accept the ctypes integer the old annotation hinted at as well as an int.
    if isinstance(sock_count, ctypes.c_uint32):
        sock_count = sock_count.value
    count = int(sock_count)

    # The C function writes `count` consecutive amdsmi_sock_info_t elements, so
    # the buffer must hold at least that many; a single struct is overrun as
    # soon as the machine has more than one socket.
    entries = (amdsmi_wrapper.amdsmi_sock_info_t * max(count, 1))()

    # The C side looks each element up by its socket_id field.
    # NOTE(review): assumes socket ids are numbered 0..count-1 -- confirm.
    for index in range(count):
        entries[index].socket_id = index

    _check_res(
        amdsmi_wrapper.amdsmi_get_cpu_cores_per_socket(count, entries)
    )

    first = entries[0]
    return {"socket_id": first.socket_id,
            "cores_per_socket": first.cores_per_socket
            }
|
||||
|
||||
def amdsmi_get_cpu_socket_count():
    """Return the CPU socket count reported by the library as a Python int.

    Raises whatever ``_check_res`` raises for a non-success library status.
    """
    socket_total = ctypes.c_uint32()
    status = amdsmi_wrapper.amdsmi_get_cpu_socket_count(ctypes.byref(socket_total))
    _check_res(status)
    return socket_total.value
|
||||
|
||||
def amdsmi_init(flag=AmdSmiInitFlags.INIT_AMD_GPUS):
|
||||
if not isinstance(flag, AmdSmiInitFlags):
|
||||
raise AmdSmiParameterException(flag, AmdSmiInitFlags)
|
||||
@@ -1841,6 +1861,35 @@ def amdsmi_get_gpu_enumeration_info(processor_handle: amdsmi_wrapper.amdsmi_proc
|
||||
|
||||
return enumeration_info
|
||||
|
||||
def amdsmi_get_cpu_affinity_with_scope(
    processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
    scope: AmdSmiAffinityScope
) -> List[int]:
    """Return the CPU affinity words of a GPU for the requested scope.

    Parameters:
        processor_handle: handle of the GPU to query.
        scope: ``AmdSmiAffinityScope`` member selecting NUMA or socket affinity.

    Returns:
        A list of Python ints, one per 64-bit word filled in by the library.

    Raises:
        AmdSmiParameterException: if an argument has the wrong type.
        Whatever ``_check_res`` raises for a non-success library status.
    """
    if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle):
        raise AmdSmiParameterException(
            processor_handle, amdsmi_wrapper.amdsmi_processor_handle
        )

    if not isinstance(scope, AmdSmiAffinityScope):
        raise AmdSmiParameterException(scope, AmdSmiAffinityScope)

    socket_count = amdsmi_get_cpu_socket_count()
    core_count = amdsmi_get_cpu_cores_per_socket(socket_count)['cores_per_socket']

    # One bit per CPU, rounded up to whole 64-bit words. Integer ceiling
    # division avoids both the float round-trip and a dependency on `math`.
    bits_per_word = ctypes.sizeof(ctypes.c_uint64) * 8
    word_count = (socket_count * core_count + bits_per_word - 1) // bits_per_word

    # A zero-length buffer is passed through unchanged; the library rejects
    # cpu_set_size == 0, which surfaces as an exception from _check_res.
    cpu_set = (ctypes.c_uint64 * word_count)()

    _check_res(
        amdsmi_wrapper.amdsmi_get_cpu_affinity_with_scope(
            processor_handle, ctypes.c_uint32(word_count), cpu_set, scope)
    )

    # Hand back plain ints, as the List[int] annotation promises, instead of
    # leaking the ctypes array object.
    return list(cpu_set)
|
||||
|
||||
def amdsmi_get_gpu_asic_info(
|
||||
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
|
||||
) -> Dict[str, Any]:
|
||||
|
||||
@@ -2131,6 +2131,15 @@ AMDSMI_VIRTUALIZATION_MODE_HOST = 2
|
||||
AMDSMI_VIRTUALIZATION_MODE_GUEST = 3
|
||||
AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH = 4
|
||||
amdsmi_virtualization_mode_t = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'amdsmi_affinity_scope_t'
|
||||
amdsmi_affinity_scope_t__enumvalues = {
|
||||
0: 'AMDSMI_AFFINITY_SCOPE_NODE',
|
||||
1: 'AMDSMI_AFFINITY_SCOPE_SOCKET',
|
||||
}
|
||||
AMDSMI_AFFINITY_SCOPE_NODE = 0
|
||||
AMDSMI_AFFINITY_SCOPE_SOCKET = 1
|
||||
amdsmi_affinity_scope_t = ctypes.c_uint32 # enum
|
||||
class struct_amdsmi_smu_fw_version_t(Structure):
|
||||
pass
|
||||
|
||||
@@ -2322,6 +2331,16 @@ struct_amdsmi_cpu_info_t._fields_ = [
|
||||
]
|
||||
|
||||
amdsmi_cpu_info_t = struct_amdsmi_cpu_info_t
|
||||
class struct_amdsmi_sock_info_t(Structure):
|
||||
pass
|
||||
|
||||
struct_amdsmi_sock_info_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_sock_info_t._fields_ = [
|
||||
('socket_id', ctypes.c_uint32),
|
||||
('cores_per_socket', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
amdsmi_sock_info_t = struct_amdsmi_sock_info_t
|
||||
uint64_t = ctypes.c_uint64
|
||||
amdsmi_init = _libraries['libamd_smi.so'].amdsmi_init
|
||||
amdsmi_init.restype = amdsmi_status_t
|
||||
@@ -2369,6 +2388,10 @@ amdsmi_get_gpu_device_uuid.argtypes = [amdsmi_processor_handle, ctypes.POINTER(c
|
||||
amdsmi_get_gpu_enumeration_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_enumeration_info
|
||||
amdsmi_get_gpu_enumeration_info.restype = amdsmi_status_t
|
||||
amdsmi_get_gpu_enumeration_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_enumeration_info_t)]
|
||||
uint32_t = ctypes.c_uint32
|
||||
amdsmi_get_cpu_affinity_with_scope = _libraries['libamd_smi.so'].amdsmi_get_cpu_affinity_with_scope
|
||||
amdsmi_get_cpu_affinity_with_scope.restype = amdsmi_status_t
|
||||
amdsmi_get_cpu_affinity_with_scope.argtypes = [amdsmi_processor_handle, uint32_t, ctypes.POINTER(ctypes.c_uint64), amdsmi_affinity_scope_t]
|
||||
amdsmi_get_gpu_id = _libraries['libamd_smi.so'].amdsmi_get_gpu_id
|
||||
amdsmi_get_gpu_id.restype = amdsmi_status_t
|
||||
amdsmi_get_gpu_id.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint16)]
|
||||
@@ -2378,7 +2401,6 @@ amdsmi_get_gpu_revision.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctyp
|
||||
amdsmi_get_gpu_vendor_name = _libraries['libamd_smi.so'].amdsmi_get_gpu_vendor_name
|
||||
amdsmi_get_gpu_vendor_name.restype = amdsmi_status_t
|
||||
amdsmi_get_gpu_vendor_name.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_char), size_t]
|
||||
uint32_t = ctypes.c_uint32
|
||||
amdsmi_get_gpu_vram_vendor = _libraries['libamd_smi.so'].amdsmi_get_gpu_vram_vendor
|
||||
amdsmi_get_gpu_vram_vendor.restype = amdsmi_status_t
|
||||
amdsmi_get_gpu_vram_vendor.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_char), uint32_t]
|
||||
@@ -2939,6 +2961,12 @@ amdsmi_get_cpu_model_name.argtypes = [amdsmi_processor_handle, ctypes.POINTER(st
|
||||
amdsmi_get_esmi_err_msg = _libraries['libamd_smi.so'].amdsmi_get_esmi_err_msg
|
||||
amdsmi_get_esmi_err_msg.restype = amdsmi_status_t
|
||||
amdsmi_get_esmi_err_msg.argtypes = [amdsmi_status_t, ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
|
||||
amdsmi_get_cpu_cores_per_socket = _libraries['libamd_smi.so'].amdsmi_get_cpu_cores_per_socket
|
||||
amdsmi_get_cpu_cores_per_socket.restype = amdsmi_status_t
|
||||
amdsmi_get_cpu_cores_per_socket.argtypes = [uint32_t, ctypes.POINTER(struct_amdsmi_sock_info_t)]
|
||||
amdsmi_get_cpu_socket_count = _libraries['libamd_smi.so'].amdsmi_get_cpu_socket_count
|
||||
amdsmi_get_cpu_socket_count.restype = amdsmi_status_t
|
||||
amdsmi_get_cpu_socket_count.argtypes = [ctypes.POINTER(ctypes.c_uint32)]
|
||||
__all__ = \
|
||||
['AGG_BW0', 'AMDSMI_ACCELERATOR_DECODER',
|
||||
'AMDSMI_ACCELERATOR_DMA', 'AMDSMI_ACCELERATOR_ENCODER',
|
||||
@@ -2950,6 +2978,7 @@ __all__ = \
|
||||
'AMDSMI_ACCELERATOR_PARTITION_QPX',
|
||||
'AMDSMI_ACCELERATOR_PARTITION_SPX',
|
||||
'AMDSMI_ACCELERATOR_PARTITION_TPX', 'AMDSMI_ACCELERATOR_XCC',
|
||||
'AMDSMI_AFFINITY_SCOPE_NODE', 'AMDSMI_AFFINITY_SCOPE_SOCKET',
|
||||
'AMDSMI_CACHE_PROPERTY_CPU_CACHE',
|
||||
'AMDSMI_CACHE_PROPERTY_DATA_CACHE',
|
||||
'AMDSMI_CACHE_PROPERTY_ENABLED',
|
||||
@@ -3173,18 +3202,18 @@ __all__ = \
|
||||
'amdsmi_accelerator_partition_profile_t',
|
||||
'amdsmi_accelerator_partition_resource_profile_t',
|
||||
'amdsmi_accelerator_partition_resource_type_t',
|
||||
'amdsmi_accelerator_partition_type_t', 'amdsmi_asic_info_t',
|
||||
'amdsmi_bdf_t', 'amdsmi_bit_field_t', 'amdsmi_board_info_t',
|
||||
'amdsmi_cache_property_type_t', 'amdsmi_card_form_factor_t',
|
||||
'amdsmi_clean_gpu_local_data', 'amdsmi_clk_info_t',
|
||||
'amdsmi_clk_limit_type_t', 'amdsmi_clk_type_t',
|
||||
'amdsmi_compute_partition_type_t', 'amdsmi_container_types_t',
|
||||
'amdsmi_counter_command_t', 'amdsmi_counter_value_t',
|
||||
'amdsmi_cper_guid_t', 'amdsmi_cper_hdr_t',
|
||||
'amdsmi_cper_notify_type_t', 'amdsmi_cper_sev_t',
|
||||
'amdsmi_cper_timestamp_t', 'amdsmi_cper_valid_bits_t',
|
||||
'amdsmi_cpu_apb_disable', 'amdsmi_cpu_apb_enable',
|
||||
'amdsmi_cpu_info_t', 'amdsmi_cpu_util_t',
|
||||
'amdsmi_accelerator_partition_type_t', 'amdsmi_affinity_scope_t',
|
||||
'amdsmi_asic_info_t', 'amdsmi_bdf_t', 'amdsmi_bit_field_t',
|
||||
'amdsmi_board_info_t', 'amdsmi_cache_property_type_t',
|
||||
'amdsmi_card_form_factor_t', 'amdsmi_clean_gpu_local_data',
|
||||
'amdsmi_clk_info_t', 'amdsmi_clk_limit_type_t',
|
||||
'amdsmi_clk_type_t', 'amdsmi_compute_partition_type_t',
|
||||
'amdsmi_container_types_t', 'amdsmi_counter_command_t',
|
||||
'amdsmi_counter_value_t', 'amdsmi_cper_guid_t',
|
||||
'amdsmi_cper_hdr_t', 'amdsmi_cper_notify_type_t',
|
||||
'amdsmi_cper_sev_t', 'amdsmi_cper_timestamp_t',
|
||||
'amdsmi_cper_valid_bits_t', 'amdsmi_cpu_apb_disable',
|
||||
'amdsmi_cpu_apb_enable', 'amdsmi_cpu_info_t', 'amdsmi_cpu_util_t',
|
||||
'amdsmi_cpusocket_handle', 'amdsmi_ddr_bw_metrics_t',
|
||||
'amdsmi_dev_perf_level_t', 'amdsmi_dimm_power_t',
|
||||
'amdsmi_dimm_thermal_t', 'amdsmi_dpm_level_t',
|
||||
@@ -3198,11 +3227,12 @@ __all__ = \
|
||||
'amdsmi_free_name_value_pairs', 'amdsmi_freq_ind_t',
|
||||
'amdsmi_freq_volt_region_t', 'amdsmi_frequencies_t',
|
||||
'amdsmi_frequency_range_t', 'amdsmi_fw_block_t',
|
||||
'amdsmi_fw_info_t', 'amdsmi_get_afids_from_cper',
|
||||
'amdsmi_fw_info_t', 'amdsmi_get_afids_from_cper',
|
||||
'amdsmi_get_cpu_affinity_with_scope',
|
||||
'amdsmi_get_clk_freq', 'amdsmi_get_clock_info',
|
||||
'amdsmi_get_cpu_cclk_limit', 'amdsmi_get_cpu_core_boostlimit',
|
||||
'amdsmi_get_cpu_core_current_freq_limit',
|
||||
'amdsmi_get_cpu_core_energy',
|
||||
'amdsmi_get_cpu_core_energy', 'amdsmi_get_cpu_cores_per_socket',
|
||||
'amdsmi_get_cpu_current_io_bandwidth',
|
||||
'amdsmi_get_cpu_current_xgmi_bw', 'amdsmi_get_cpu_ddr_bw',
|
||||
'amdsmi_get_cpu_dimm_power_consumption',
|
||||
@@ -3215,6 +3245,7 @@ __all__ = \
|
||||
'amdsmi_get_cpu_pwr_svi_telemetry_all_rails',
|
||||
'amdsmi_get_cpu_smu_fw_version',
|
||||
'amdsmi_get_cpu_socket_c0_residency',
|
||||
'amdsmi_get_cpu_socket_count',
|
||||
'amdsmi_get_cpu_socket_current_active_freq_limit',
|
||||
'amdsmi_get_cpu_socket_energy',
|
||||
'amdsmi_get_cpu_socket_freq_range',
|
||||
@@ -3332,8 +3363,9 @@ __all__ = \
|
||||
'amdsmi_set_gpu_process_isolation', 'amdsmi_set_power_cap',
|
||||
'amdsmi_set_soc_pstate', 'amdsmi_set_xgmi_plpd',
|
||||
'amdsmi_shut_down', 'amdsmi_smu_fw_version_t',
|
||||
'amdsmi_socket_handle', 'amdsmi_status_code_to_string',
|
||||
'amdsmi_status_t', 'amdsmi_stop_gpu_event_notification',
|
||||
'amdsmi_sock_info_t', 'amdsmi_socket_handle',
|
||||
'amdsmi_status_code_to_string', 'amdsmi_status_t',
|
||||
'amdsmi_stop_gpu_event_notification',
|
||||
'amdsmi_temp_range_refresh_rate_t', 'amdsmi_temperature_metric_t',
|
||||
'amdsmi_temperature_type_t', 'amdsmi_topo_get_link_type',
|
||||
'amdsmi_topo_get_link_weight', 'amdsmi_topo_get_numa_node_number',
|
||||
@@ -3380,7 +3412,7 @@ __all__ = \
|
||||
'struct_amdsmi_proc_info_t', 'struct_amdsmi_process_info_t',
|
||||
'struct_amdsmi_range_t', 'struct_amdsmi_ras_feature_t',
|
||||
'struct_amdsmi_retired_page_record_t',
|
||||
'struct_amdsmi_smu_fw_version_t',
|
||||
'struct_amdsmi_smu_fw_version_t', 'struct_amdsmi_sock_info_t',
|
||||
'struct_amdsmi_temp_range_refresh_rate_t',
|
||||
'struct_amdsmi_topology_nearest_t',
|
||||
'struct_amdsmi_utilization_counter_t',
|
||||
|
||||
@@ -3941,8 +3941,8 @@ amdsmi_get_gpu_cper_entries(
|
||||
std::string path = std::string("/sys/kernel/debug/dri/") +
|
||||
std::to_string(gpu_device->get_card_id()) +
|
||||
"/amdgpu_ring_cper";
|
||||
|
||||
|
||||
|
||||
|
||||
return amdsmi_get_gpu_cper_entries_by_path(
|
||||
path.c_str(),
|
||||
severity_mask,
|
||||
@@ -4795,6 +4795,85 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle,
|
||||
return status;
|
||||
}
|
||||
|
||||
// Fills cpu_set (cpu_set_size 64-bit words) with the CPU affinity of the GPU
// behind processor_handle.
//  - AMDSMI_AFFINITY_SCOPE_NODE:   cpu_set receives the CPU bitmask of the
//    GPU's NUMA node (bit N set == CPU N belongs to the node).
//  - AMDSMI_AFFINITY_SCOPE_SOCKET: cpu_set receives socket ids (not a bitmask),
//    sorted with duplicates squeezed to the front.
// Returns AMDSMI_STATUS_INVAL for null/empty arguments, AMDSMI_STATUS_NOT_FOUND
// when no NUMA node or socket can be determined, and
// AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS for an unknown scope or a socket id that
// does not fit in cpu_set.
amdsmi_status_t amdsmi_get_cpu_affinity_with_scope(amdsmi_processor_handle processor_handle,
            uint32_t cpu_set_size, uint64_t *cpu_set, amdsmi_affinity_scope_t scope)
{
    AMDSMI_CHECK_INIT();

    if (processor_handle == nullptr || cpu_set == nullptr || cpu_set_size == 0) {
        return AMDSMI_STATUS_INVAL;
    }

    // Retrieve GPU device from the processor handle
    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    amdsmi_status_t status = get_gpu_device_from_handle(processor_handle, &gpu_device);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    uint32_t numa_node = 0;
    status = amdsmi_topo_get_numa_node_number(processor_handle, &numa_node);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    int32_t node_id = static_cast<int32_t>(numa_node);

    status = amdsmi_get_gpu_topo_numa_affinity(processor_handle, &node_id);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    if (node_id < 0) {
        return AMDSMI_STATUS_NOT_FOUND;
    }

    std::memset(cpu_set, 0, cpu_set_size * sizeof(uint64_t));

    switch (scope) {
        case AMDSMI_AFFINITY_SCOPE_NODE:
        {
            // node_id is known to be non-negative here, so the helper cannot
            // hand back its "invalid node" marker. Comparing the first word
            // against that marker would only reject the legitimate mask
            // 0x7FFFFFFF (CPUs 0-30), so the result is copied unconditionally.
            const std::vector<uint64_t> bitmask =
                gpu_device->get_bitmask_from_numa_node(node_id, cpu_set_size);
            std::memcpy(cpu_set, bitmask.data(), cpu_set_size * sizeof(uint64_t));
            break;
        }

        case AMDSMI_AFFINITY_SCOPE_SOCKET:
        {
            const std::vector<uint32_t> sockets =
                amd::smi::AMDSmiSystem::getInstance().get_cpu_sockets_from_numa_node(node_id);

            // An empty result means the node's CPU list could not be read;
            // indexing sockets[0] would be undefined behaviour.
            if (sockets.empty()) {
                return AMDSMI_STATUS_NOT_FOUND;
            }

            // Preserve the status callers already receive for the helper's
            // "invalid node" marker.
            if (sockets[0] == static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
                return AMDSMI_STATUS_REFCOUNT_OVERFLOW;
            }

            // Validate every id before touching cpu_set so a failure leaves
            // the buffer zeroed rather than half written, and so no id can
            // index past the caller's array.
            for (uint32_t socket_id : sockets) {
                if (socket_id >= cpu_set_size) {
                    return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
                }
            }

            for (uint32_t socket_id : sockets) {
                cpu_set[socket_id] = socket_id;
            }

            std::sort(cpu_set, cpu_set + cpu_set_size);

            // Squeeze duplicates towards the front of the array.
            // NOTE(review): entries past the unique prefix keep their old
            // values and the unique count is not reported to the caller.
            uint32_t unique_count = 0;
            for (uint32_t i = 0; i < cpu_set_size; ++i) {
                if (i == 0 || cpu_set[i] != cpu_set[i - 1]) {
                    cpu_set[unique_count++] = cpu_set[i];
                }
            }
            break;
        }

        default:
            return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
    }

    return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
#ifdef ENABLE_ESMI_LIB
|
||||
static amdsmi_status_t amdsmi_errno_to_esmi_status(amdsmi_status_t status)
|
||||
@@ -5905,6 +5984,35 @@ amdsmi_status_t amdsmi_get_cpu_model_name(amdsmi_processor_handle processor_hand
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// For each of the sock_count elements of sock_info, looks up the element's
// socket_id in the per-socket CPU counts gathered from sysfs and stores the
// count in cores_per_socket (0 when the socket id is unknown).
// Returns AMDSMI_STATUS_INVAL when sock_info is null but elements were
// requested, AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_cores_per_socket(uint32_t sock_count, amdsmi_sock_info_t *sock_info)
{
    if (sock_info == nullptr && sock_count != 0) {
        return AMDSMI_STATUS_INVAL;
    }

    const std::map<uint32_t, uint32_t> socket_core_count =
        amd::smi::AMDSmiSystem::getInstance().get_sys_cpu_cores_per_socket();

    for (uint32_t i = 0; i < sock_count; ++i) {
        const auto found = socket_core_count.find(sock_info[i].socket_id);
        sock_info[i].cores_per_socket =
            (found != socket_core_count.end()) ? found->second : 0;
    }

    return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
// Stores the number of CPU sockets reported by AMDSmiSystem in *sock_count.
// Returns AMDSMI_STATUS_INVAL for a null output pointer, otherwise the status
// of the underlying query; *sock_count is written only on success.
amdsmi_status_t amdsmi_get_cpu_socket_count(uint32_t *sock_count)
{
    if (sock_count == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    uint32_t sock_num = 0;
    const amdsmi_status_t status =
        amd::smi::AMDSmiSystem::getInstance().get_sys_num_of_cpu_sockets(&sock_num);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    *sock_count = sock_num;

    return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
amdsmi_status_t amdsmi_get_cpu_handles(uint32_t *cpu_count,
|
||||
amdsmi_processor_handle *processor_handles)
|
||||
{
|
||||
|
||||
@@ -323,7 +323,39 @@ std::string AMDSmiGPUDevice::bdf_to_string() const {
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
// Builds a CPU bitmask of `size` 64-bit words for NUMA node `node_id` from
// /sys/devices/system/node/node<N>/cpulist (bit N set == CPU N is listed).
//  - A negative node_id yields a mask whose first word is INT32_MAX, the
//    marker callers compare against (left empty when size is 0).
//  - CPUs that do not fit into size * 64 bits are ignored instead of being
//    written past the end of the vector.
//  - An unreadable file or malformed entries leave the affected bits at 0.
std::vector<uint64_t> AMDSmiGPUDevice::get_bitmask_from_numa_node(int32_t node_id, uint32_t size) const {
    std::vector<uint64_t> bitmask(size, 0);

    if (node_id < 0) {
        if (!bitmask.empty()) {
            bitmask[0] = std::numeric_limits<int32_t>::max();
        }
        return bitmask;
    }

    if (bitmask.empty()) {
        return bitmask;
    }

    const uint64_t bit_capacity = static_cast<uint64_t>(size) * 64;

    const std::string path = "/sys/devices/system/node/node" + std::to_string(node_id) + "/cpulist";
    std::ifstream file(path);

    std::string line;
    while (std::getline(file, line)) {
        // A line looks like "0-55,112-167": comma separated single CPUs or
        // inclusive ranges.
        std::istringstream entries(line);
        std::string entry;
        while (std::getline(entries, entry, ',')) {
            // Stream extraction is used instead of std::stoi so that malformed
            // text is skipped rather than throwing out of the C API.
            std::istringstream fields(entry);
            uint64_t first = 0;
            if (!(fields >> first)) {
                continue;
            }

            uint64_t last = first;
            char separator = '\0';
            if (fields >> separator) {
                if (separator != '-' || !(fields >> last)) {
                    continue;
                }
            }

            if (last < first || first >= bit_capacity) {
                continue;
            }
            if (last >= bit_capacity) {
                last = bit_capacity - 1;  // clamp to what the mask can hold
            }

            for (uint64_t cpu = first; cpu <= last; ++cpu) {
                bitmask[cpu / 64] |= (1ULL << (cpu % 64));
            }
        }
    }

    return bitmask;
}
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
#include "amd_smi/impl/amd_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include <map>
|
||||
|
||||
namespace amd {
|
||||
namespace smi {
|
||||
@@ -103,7 +104,7 @@ amdsmi_status_t AMDSmiSystem::get_cpu_model_name(uint32_t socket_id, std::string
|
||||
if (!cpu_info.is_open()) {
|
||||
std::cerr << "Failed to open /proc/cpuinfo:" << strerror(errno) << std::endl;
|
||||
return AMDSMI_STATUS_FILE_ERROR;
|
||||
} else {
|
||||
} else {
|
||||
uint32_t current_socket_id = -1;
|
||||
while (std::getline(cpu_info, info)) {
|
||||
if (info.find("processor") != std::string::npos) {
|
||||
@@ -126,7 +127,85 @@ amdsmi_status_t AMDSmiSystem::get_cpu_model_name(uint32_t socket_id, std::string
|
||||
}
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
||||
// Returns a map from physical package (socket) id to the number of
// /sys/devices/system/cpu/cpu* entries that report that package id.
// NOTE(review): one entry is counted per cpu directory; whether SMT siblings
// should count as separate "cores" needs confirming against the callers.
std::map<uint32_t, uint32_t> AMDSmiSystem::get_sys_cpu_cores_per_socket() {
    std::map<uint32_t, uint32_t> socket_core_count;
    const std::string base_path = "/sys/devices/system/cpu/";

    iterate_directory(base_path, [&socket_core_count, &base_path](const std::string &path) {
        // Reduce the callback argument to the bare entry name, then rebuild
        // an absolute path from base_path. This works whether the iterator
        // hands over a bare name or a full path, and never opens files
        // relative to the current working directory.
        const std::string::size_type slash = path.find_last_of('/');
        const std::string entry_name =
            (slash == std::string::npos) ? path : path.substr(slash + 1);

        if (entry_name.find("cpu") == std::string::npos) {
            return;
        }

        const std::string topology_dir = base_path + entry_name + "/topology/";
        std::ifstream package_id_file(topology_dir + "physical_package_id");
        std::ifstream core_id_file(topology_dir + "core_id");

        // Entries such as "cpufreq" have no topology files and are skipped.
        if (!package_id_file.is_open() || !core_id_file.is_open()) {
            return;
        }

        // Count the entry only when the package id was actually parsed, so a
        // failed read is not booked against socket 0.
        uint32_t physical_id = 0;
        if (package_id_file >> physical_id) {
            ++socket_core_count[physical_id];
        }
    });

    return socket_core_count;
}
|
||||
|
||||
// Stores in *sock_num the number of distinct physical package ids found under
// /sys/devices/system/cpu/cpu*/topology/physical_package_id.
// Returns AMDSMI_STATUS_INVAL for a null output pointer, otherwise
// AMDSMI_STATUS_SUCCESS (the count is 0 when nothing could be read).
amdsmi_status_t AMDSmiSystem::get_sys_num_of_cpu_sockets(uint32_t *sock_num) {
    if (sock_num == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    std::map<uint32_t, uint32_t> socket_count_map;
    const std::string base_path = "/sys/devices/system/cpu/";

    iterate_directory(base_path, [&socket_count_map, &base_path](const std::string &path) {
        // Work from the bare entry name and rebuild an absolute path so the
        // lookup does not depend on the current working directory or on
        // whether the iterator passes names or full paths.
        const std::string::size_type slash = path.find_last_of('/');
        const std::string entry_name =
            (slash == std::string::npos) ? path : path.substr(slash + 1);

        if (entry_name.find("cpu") == std::string::npos) {
            return;
        }

        std::ifstream package_id_file(base_path + entry_name + "/topology/physical_package_id");
        if (!package_id_file.is_open()) {
            return;
        }

        // Only a successfully parsed id counts as a socket.
        uint32_t physical_id = 0;
        if (package_id_file >> physical_id) {
            ++socket_count_map[physical_id];
        }
    });

    *sock_num = static_cast<uint32_t>(socket_count_map.size());

    return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
// Returns the sorted, de-duplicated physical package (socket) ids of the CPUs
// listed in /sys/devices/system/node/node<N>/cpulist.
//  - A negative numa_node yields a single element holding INT32_MAX, the
//    marker callers compare against.
//  - An unreadable cpulist yields an empty vector.
std::vector<uint32_t> AMDSmiSystem::get_cpu_sockets_from_numa_node(int32_t numa_node) {
    std::vector<uint32_t> sockets;
    if (numa_node < 0) {
        // push_back, not sockets[0]: the vector is still empty at this point.
        sockets.push_back(static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
        return sockets;
    }

    std::ifstream node_info("/sys/devices/system/node/node" + std::to_string(numa_node) + "/cpulist");
    if (!node_info.is_open()) {
        return sockets;
    }

    std::string info;
    std::getline(node_info, info);

    // cpulist looks like "0-55,112-167": comma separated single CPUs or
    // inclusive ranges. Reading it with a plain `>>` loop stops after the
    // first number, so every range is expanded explicitly.
    std::istringstream entries(info);
    std::string entry;
    while (std::getline(entries, entry, ',')) {
        std::istringstream fields(entry);
        uint32_t first = 0;
        if (!(fields >> first)) {
            continue;
        }

        uint32_t last = first;
        char separator = '\0';
        if (fields >> separator) {
            if (separator != '-' || !(fields >> last)) {
                continue;
            }
        }
        if (last < first) {
            continue;
        }

        // 64-bit counter so `cpu <= last` cannot wrap around.
        for (uint64_t cpu = first; cpu <= last; ++cpu) {
            std::ifstream cpu_info("/sys/devices/system/cpu/cpu" + std::to_string(cpu) +
                                   "/topology/physical_package_id");
            uint32_t socket = 0;
            // Record the socket only when the id was actually read.
            if (cpu_info.is_open() && (cpu_info >> socket)) {
                sockets.push_back(socket);
            }
        }
    }

    // Discarding duplicate socket entries
    std::sort(sockets.begin(), sockets.end());
    sockets.erase(std::unique(sockets.begin(), sockets.end()), sockets.end());

    return sockets;
}
|
||||
|
||||
amdsmi_status_t AMDSmiSystem::init(uint64_t flags) {
|
||||
init_flag_ = flags;
|
||||
|
||||
@@ -36,15 +36,15 @@
|
||||
#include <dirent.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <regex>
|
||||
#include <cstdio>
|
||||
#include <sstream>
|
||||
#include <iterator>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
|
||||
#include "amd_smi/impl/amd_smi_utils.h"
|
||||
#include "amd_smi/impl/amd_smi_system.h"
|
||||
@@ -1046,3 +1046,25 @@ void amdsmi_wait_for_user_input(void) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Calls entry_callback once for every entry of the directory base_path
// (including "." and ".." when the system reports them), passing the entry's
// path formed as base_path + "/" + name.
// Returns false when the directory cannot be opened or when readdir() reports
// an error; true when the end of the directory was reached cleanly.
bool iterate_directory(const std::string &base_path,
                       std::function<void(const std::string &)> entry_callback) {
    DIR *dir = opendir(base_path.c_str());
    if (dir == nullptr) {
        return false;
    }

    // Build the prefix once; avoid a doubled '/' when base_path already ends
    // with one.
    std::string prefix = base_path;
    if (!prefix.empty() && prefix.back() != '/') {
        prefix += '/';
    }

    bool success = true;
    for (;;) {
        // readdir() returns NULL both at end-of-directory and on error, and
        // only touches errno in the error case. Clearing errno before every
        // call keeps a stale value (from earlier code or from the callback,
        // e.g. a failed file open) from being mistaken for a readdir failure.
        errno = 0;
        const struct dirent *entry = readdir(dir);
        if (entry == nullptr) {
            success = (errno == 0);
            break;
        }
        entry_callback(prefix + entry->d_name);
    }

    closedir(dir);
    return success;
}
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur