[SWDEV-542718] Correct socket_affinity (#760)

* [SWDEV-542718] Correct socket_affinity

Updated Socket affinity to show bitmask and expanded cpu list.

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>

* Update per-device local_cpulist for socket_affinity

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>

* Added amdsmi_get_cpu_affinity_from_local_cpulist API.
Updated the wrapper.

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>

* Revert "Added amdsmi_get_cpu_affinity_from_local_cpulist API."

This reverts commit 9a2ef934b1787f8aa09d3e4efe02f897b4295215.

* Moved the changes to C API.
In case of SOCKET_SCOPE, use local_cpulist first.
If it is unavailable or not readable, fall back to
numa.

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>

* Addressed review comments

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>

---------

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>

[ROCm/amdsmi commit: 09a97f02ed]
This commit is contained in:
Kanangot Balakrishnan, Bindhiya
2025-10-22 16:20:41 -05:00
committed by GitHub
parent ce19b921b0
commit 3924171d74
4 changed files with 51 additions and 23 deletions
+12 -4
View File
@@ -904,16 +904,24 @@ class AMDSMICommands():
logging.debug("Failed to get cpu affinity for gpu %s | %s", gpu_id, e.get_error_info())
try:
cpusockets = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.SOCKET_SCOPE)
cpusockets = {f'socket_{i}': socket for i, socket in enumerate(set(cpusockets))}
socket_set = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.SOCKET_SCOPE)
socket_set = [f"{cpus:016X}" for cpus in socket_set]
socket_set = {f'cpu_list_{i}': f"{cpus}" for i, cpus in enumerate(socket_set)}
socket_bitmask_ranges = self.helpers.get_bitmask_ranges(socket_set)
socket_affinity = {}
for key in socket_set:
socket_affinity[key] = {
"bitmask": socket_set[key],
"cpu_cores_affinity": socket_bitmask_ranges.get(key, "N/A")
}
except amdsmi_exception.AmdSmiLibraryException as e:
cpusockets = {}
socket_affinity = "N/A"
logging.debug("Failed to get socket affinity for gpu %s | %s", gpu_id, e.get_error_info())
static_dict['numa'] = { 'node' : numa_node_number,
'affinity' : numa_affinity,
'cpu_affinity' : cpu_affinity,
'socket_affinity' : cpusockets if cpusockets else "N/A"}
'socket_affinity' : socket_affinity}
if args.vram:
vram_info_dict = {"type" : "N/A",
"vendor" : "N/A",
@@ -72,6 +72,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
// New methods for -e feature
std::string bdf_to_string() const; // -e feature
std::vector<uint64_t> get_bitmask_from_numa_node(int32_t node_id, uint32_t size) const;
std::vector<uint64_t> get_bitmask_from_local_cpulist(uint32_t drm_card, uint32_t size) const;
private:
uint32_t gpu_id_;
+4 -19
View File
@@ -5199,27 +5199,12 @@ amdsmi_status_t amdsmi_get_cpu_affinity_with_scope(amdsmi_processor_handle proce
case AMDSMI_AFFINITY_SCOPE_SOCKET:
{
std::vector<uint32_t> sockets = amd::smi::AMDSmiSystem::getInstance().get_cpu_sockets_from_numa_node(node_id);
if(sockets[0] == std::numeric_limits<int32_t>::max()){
uint32_t drm_card = gpu_device->get_card_id();
std::vector<uint64_t> bitmask = gpu_device->get_bitmask_from_local_cpulist(drm_card, cpu_set_size);
if(bitmask[0] == std::numeric_limits<int32_t>::max()){
return AMDSMI_STATUS_REFCOUNT_OVERFLOW;
} else {
for (uint32_t idx : sockets) {
cpu_set[idx] = idx;
}
std::sort(cpu_set, cpu_set + cpu_set_size);
// Discard duplicates
uint32_t temp_size = 0;
for (uint32_t i = 0; i < cpu_set_size; ++i) {
if (i == 0 || cpu_set[i] != cpu_set[i - 1]) {
cpu_set[temp_size++] = cpu_set[i];
}
}
// Update the size to the temp size after discarding duplicates
cpu_set_size = temp_size;
std::memcpy(cpu_set, bitmask.data(), cpu_set_size * sizeof(uint64_t));
}
break;
}
@@ -310,4 +310,38 @@ std::vector<uint64_t> AMDSmiGPUDevice::get_bitmask_from_numa_node(int32_t node_i
return bitmask;
}
// Builds a CPU bitmask for a GPU from its sysfs local_cpulist attribute.
//
// Reads /sys/class/drm/card<drm_card>/device/local_cpulist, parses it as a
// comma-separated list of CPU ids and inclusive "first-last" ranges, and sets
// bit (cpu % 64) of word (cpu / 64) for every CPU listed.
//
// @param drm_card  DRM card index used to build the sysfs path. It is
//                  unsigned, so no negative-id check is needed.
// @param size      Number of 64-bit words in the returned bitmask.
// @return A vector of exactly `size` words. It is all zeros when the file
//         cannot be opened or when it lists no usable CPU id.
//
// CPU ids that do not fit in `size` words are ignored (ranges are clamped)
// and tokens without a leading number are skipped, so unexpected file content
// can neither write outside the vector nor throw.
//
// NOTE(review): an unreadable file is reported as an all-zero mask, not as a
// sentinel value - confirm that this is what callers expect.
std::vector<uint64_t> AMDSmiGPUDevice::get_bitmask_from_local_cpulist(uint32_t drm_card, uint32_t size) const {
  std::vector<uint64_t> bitmask(size, 0);
  if (bitmask.empty()) {
    return bitmask;  // No word to set a bit in; nothing worth parsing.
  }

  // Number of CPU ids the bitmask can represent.
  const uint64_t bit_capacity = static_cast<uint64_t>(size) * 64;

  // Parses the leading decimal number of `text`, skipping leading whitespace
  // and stopping at the first non-digit. Returns false when there is no
  // digit. Once the value reaches bit_capacity it stops growing: it is out of
  // range either way, and this keeps the accumulator from overflowing.
  const auto parse_cpu_id = [bit_capacity](const std::string& text, uint64_t& value) {
    size_t pos = 0;
    while (pos < text.size() &&
           (text[pos] == ' ' || text[pos] == '\t' || text[pos] == '\r' || text[pos] == '\n')) {
      ++pos;
    }
    const size_t first_digit = pos;
    value = 0;
    for (; pos < text.size() && text[pos] >= '0' && text[pos] <= '9'; ++pos) {
      if (value < bit_capacity) {
        value = value * 10 + static_cast<uint64_t>(text[pos] - '0');
      }
    }
    return pos != first_digit;
  };

  const std::string path = "/sys/class/drm/card" + std::to_string(drm_card) + "/device/local_cpulist";
  std::ifstream file(path);
  if (!file.is_open()) {
    return bitmask;
  }

  std::string line;
  while (std::getline(file, line)) {
    std::istringstream tokens(line);
    std::string token;
    while (std::getline(tokens, token, ',')) {
      uint64_t first = 0;
      uint64_t last = 0;
      const size_t hyphen = token.find('-');
      if (hyphen == std::string::npos) {
        // Single CPU id.
        if (!parse_cpu_id(token, first)) {
          continue;
        }
        last = first;
      } else if (!parse_cpu_id(token.substr(0, hyphen), first) ||
                 !parse_cpu_id(token.substr(hyphen + 1), last)) {
        // Range with a missing or non-numeric bound.
        continue;
      }

      if (first >= bit_capacity) {
        continue;  // Entirely outside the bitmask.
      }
      if (last >= bit_capacity) {
        last = bit_capacity - 1;  // Keep only the part that fits.
      }
      // A reversed range (first > last) sets nothing.
      for (uint64_t cpu = first; cpu <= last; ++cpu) {
        bitmask[cpu / 64] |= (1ULL << (cpu % 64));
      }
    }
  }
  return bitmask;
}
} // namespace amd::smi