From 985ddbc5d55502df40cbbd18d47da122d10d2901 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 29 Nov 2023 08:23:23 -0600 Subject: [PATCH] Collect compute partition devices under the same socket The socket represents a physical device, and the partition devices should belong to the socket. The partition devices are only different in function id in BDF. Use the BD part of the BDF to identify a socket. Change-Id: I5d355a6f5db02faa7555b760a36c7351b8d8d835 [ROCm/amdsmi commit: de7e74f7db8b611780f3f9185657e6773b8e4f43] --- .../amdsmi/include/amd_smi/impl/amd_smi_system.h | 7 ++++++- projects/amdsmi/src/amd_smi/amd_smi_system.cc | 12 +++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/projects/amdsmi/include/amd_smi/impl/amd_smi_system.h b/projects/amdsmi/include/amd_smi/impl/amd_smi_system.h index 348782cf1b..596c909204 100644 --- a/projects/amdsmi/include/amd_smi/impl/amd_smi_system.h +++ b/projects/amdsmi/include/amd_smi/impl/amd_smi_system.h @@ -99,7 +99,12 @@ class AMDSmiSystem { #endif private: AMDSmiSystem() : init_flag_(AMDSMI_INIT_AMD_GPUS) {} - amdsmi_status_t get_gpu_bdf_by_index(uint32_t index, std::string& bdf); + + /* The GPU socket id is used to identify the socket, so that the XCDs + on the same physical device will be collected under the same socket. + The BD part of the BDF is used as GPU socket to represent a physical device. 
+ */ + amdsmi_status_t get_gpu_socket_id(uint32_t index, std::string& socketid); amdsmi_status_t populate_amd_gpu_devices(); uint64_t init_flag_; AMDSmiDrm drm_; diff --git a/projects/amdsmi/src/amd_smi/amd_smi_system.cc b/projects/amdsmi/src/amd_smi/amd_smi_system.cc index 939dea6f5d..e9fa857bdf 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_system.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_system.cc @@ -231,7 +231,7 @@ amdsmi_status_t AMDSmiSystem::populate_amd_gpu_devices() { for (uint32_t i=0; i < device_count; i++) { // GPU device uses the bdf as the socket id std::string socket_id; - amd_smi_status = get_gpu_bdf_by_index(i, socket_id); + amd_smi_status = get_gpu_socket_id(i, socket_id); if (amd_smi_status != AMDSMI_STATUS_SUCCESS) { return amd_smi_status; } @@ -256,8 +256,8 @@ amdsmi_status_t AMDSmiSystem::populate_amd_gpu_devices() { return AMDSMI_STATUS_SUCCESS; } -amdsmi_status_t AMDSmiSystem::get_gpu_bdf_by_index(uint32_t index, - std::string& bdf) { +amdsmi_status_t AMDSmiSystem::get_gpu_socket_id(uint32_t index, + std::string& socket_id) { uint64_t bdfid = 0; rsmi_status_t ret = rsmi_dev_pci_id_get(index, &bdfid); if (ret != RSMI_STATUS_SUCCESS) { @@ -269,11 +269,13 @@ amdsmi_status_t AMDSmiSystem::get_gpu_bdf_by_index(uint32_t index, uint64_t device_id = (bdfid >> 3) & 0x1f; uint64_t function = bdfid & 0x7; + // The BD part of the BDF is used as the socket id as it + // represents a physical device. std::stringstream ss; ss << std::setfill('0') << std::uppercase << std::hex << std::setw(4) << domain << ":" << std::setw(2) << bus << ":" - << std::setw(2) << device_id << "." << std::setw(2) << function; - bdf = ss.str(); + << std::setw(2) << device_id; + socket_id = ss.str(); return AMDSMI_STATUS_SUCCESS; }