[SWDEV-542718] Correct socket_affinity (#760)
* [SWDEV-542718] Correct socket_affinity
Updated socket affinity to show the bitmask and the expanded CPU list.
Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
* Update per-device local_cpulist for socket_affinity
Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
* Added amdsmi_get_cpu_affinity_from_local_cpulist API.
Updated the wrapper.
Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
* Revert "Added amdsmi_get_cpu_affinity_from_local_cpulist API."
This reverts commit 9a2ef934b1787f8aa09d3e4efe02f897b4295215.
* Moved the changes to C API.
In case of SOCKET_SCOPE, use local_cpulist first.
If it is unavailable or not readable, fall back to
NUMA.
Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
* Addressed review comments
Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
---------
Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
[ROCm/amdsmi commit: 09a97f02ed]
This commit is contained in:
committed by
GitHub
parent
ce19b921b0
commit
3924171d74
@@ -904,16 +904,24 @@ class AMDSMICommands():
|
||||
logging.debug("Failed to get cpu affinity for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
try:
|
||||
cpusockets = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.SOCKET_SCOPE)
|
||||
cpusockets = {f'socket_{i}': socket for i, socket in enumerate(set(cpusockets))}
|
||||
socket_set = amdsmi_interface.amdsmi_get_cpu_affinity_with_scope(args.gpu, amdsmi_interface.AmdSmiAffinityScope.SOCKET_SCOPE)
|
||||
socket_set = [f"{cpus:016X}" for cpus in socket_set]
|
||||
socket_set = {f'cpu_list_{i}': f"{cpus}" for i, cpus in enumerate(socket_set)}
|
||||
socket_bitmask_ranges = self.helpers.get_bitmask_ranges(socket_set)
|
||||
socket_affinity = {}
|
||||
for key in socket_set:
|
||||
socket_affinity[key] = {
|
||||
"bitmask": socket_set[key],
|
||||
"cpu_cores_affinity": socket_bitmask_ranges.get(key, "N/A")
|
||||
}
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
cpusockets = {}
|
||||
socket_affinity = "N/A"
|
||||
logging.debug("Failed to get socket affinity for gpu %s | %s", gpu_id, e.get_error_info())
|
||||
|
||||
static_dict['numa'] = { 'node' : numa_node_number,
|
||||
'affinity' : numa_affinity,
|
||||
'cpu_affinity' : cpu_affinity,
|
||||
'socket_affinity' : cpusockets if cpusockets else "N/A"}
|
||||
'socket_affinity' : socket_affinity}
|
||||
if args.vram:
|
||||
vram_info_dict = {"type" : "N/A",
|
||||
"vendor" : "N/A",
|
||||
|
||||
@@ -72,6 +72,7 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
|
||||
// New methods for -e feature
|
||||
std::string bdf_to_string() const; // -e feature
|
||||
std::vector<uint64_t> get_bitmask_from_numa_node(int32_t node_id, uint32_t size) const;
|
||||
std::vector<uint64_t> get_bitmask_from_local_cpulist(uint32_t drm_card, uint32_t size) const;
|
||||
|
||||
private:
|
||||
uint32_t gpu_id_;
|
||||
|
||||
@@ -5199,27 +5199,12 @@ amdsmi_status_t amdsmi_get_cpu_affinity_with_scope(amdsmi_processor_handle proce
|
||||
|
||||
case AMDSMI_AFFINITY_SCOPE_SOCKET:
|
||||
{
|
||||
std::vector<uint32_t> sockets = amd::smi::AMDSmiSystem::getInstance().get_cpu_sockets_from_numa_node(node_id);
|
||||
|
||||
if(sockets[0] == std::numeric_limits<int32_t>::max()){
|
||||
uint32_t drm_card = gpu_device->get_card_id();
|
||||
std::vector<uint64_t> bitmask = gpu_device->get_bitmask_from_local_cpulist(drm_card, cpu_set_size);
|
||||
if(bitmask[0] == std::numeric_limits<int32_t>::max()){
|
||||
return AMDSMI_STATUS_REFCOUNT_OVERFLOW;
|
||||
} else {
|
||||
for (uint32_t idx : sockets) {
|
||||
cpu_set[idx] = idx;
|
||||
}
|
||||
|
||||
std::sort(cpu_set, cpu_set + cpu_set_size);
|
||||
|
||||
// Discard duplicates
|
||||
uint32_t temp_size = 0;
|
||||
for (uint32_t i = 0; i < cpu_set_size; ++i) {
|
||||
if (i == 0 || cpu_set[i] != cpu_set[i - 1]) {
|
||||
cpu_set[temp_size++] = cpu_set[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Update the size to the temp size after discarding duplicates
|
||||
cpu_set_size = temp_size;
|
||||
std::memcpy(cpu_set, bitmask.data(), cpu_set_size * sizeof(uint64_t));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -310,4 +310,38 @@ std::vector<uint64_t> AMDSmiGPUDevice::get_bitmask_from_numa_node(int32_t node_i
|
||||
return bitmask;
|
||||
}
|
||||
|
||||
std::vector<uint64_t> AMDSmiGPUDevice::get_bitmask_from_local_cpulist(uint32_t drm_card, uint32_t size) const {
|
||||
std::vector<uint64_t> bitmask(size, 0);
|
||||
|
||||
if (drm_card < 0) {
|
||||
bitmask[0] = std::numeric_limits<int32_t>::max();
|
||||
return bitmask;
|
||||
}
|
||||
|
||||
std::string path = "/sys/class/drm/card" + std::to_string(drm_card) + "/device/local_cpulist";
|
||||
std::ifstream file(path);
|
||||
|
||||
if (file.is_open()) {
|
||||
std::string info;
|
||||
while (std::getline(file, info)) {
|
||||
std::istringstream sstr(info);
|
||||
std::string node_cpus;
|
||||
while (std::getline(sstr, node_cpus, ',')) {
|
||||
size_t hyphen = node_cpus.find('-');
|
||||
if (hyphen != std::string::npos) {
|
||||
int start = std::stoi(node_cpus.substr(0, hyphen));
|
||||
int end = std::stoi(node_cpus.substr(hyphen + 1));
|
||||
for (int i = start; i <= end; ++i) {
|
||||
bitmask[i / 64] |= (1ULL << (i % 64));
|
||||
}
|
||||
} else {
|
||||
int core = std::stoi(node_cpus);
|
||||
bitmask[core / 64] |= (1ULL << (core % 64));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return bitmask;
|
||||
}
|
||||
|
||||
} // namespace amd::smi
|
||||
|
||||
Reference in New Issue
Block a user