[DEVICE] Switch to amd-smi from rocm-smi (#1759)

* Use amd-smi instead of rocm-smi for ROCM_VERSION >= 7.11.0

[ROCm/rccl commit: cd745b1f4b]
Cette révision appartient à :
Nilesh M Negi
2026-01-21 09:05:47 -06:00
révisé par GitHub
Parent 520f309bb1
révision 244047310e
9 fichiers modifiés avec 505 ajouts et 82 suppressions
+76 -29
Voir le fichier
@@ -261,26 +261,56 @@ find_package(hsa-runtime64 REQUIRED)
get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCLUDE_DIRECTORIES)
message(STATUS "HSA runtime: ${HSA_INCLUDE_PATH}")
## Check for ROCM-smi
find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
if (rocm_smi_FOUND)
message(STATUS "Found rocm_smi at ${ROCM_SMI_INCLUDE_DIR}")
else()
message(STATUS "Checking old include directory structure for rocm_smi")
set(ROCM_SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
set(ROCM_SMI_LIBRARIES rocm_smi64)
## Check for amd-smi if ROCm 7.11.0 or newer
# ROCM_VERSION is compared against "71100"; amd_smi is only searched for at or
# above that value. On success this block publishes SMI_INCLUDE_DIR,
# SMI_LIB_DIR, SMI_LIB_NAME and USE_AMDSMI as INTERNAL cache entries and sets
# SMI_LIBRARIES as a normal variable.
if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
# Not REQUIRED: when the package is missing, USE_AMDSMI is simply not set here.
find_package(amd_smi PATHS ${ROCM_PATH}/lib/cmake/amd_smi)
if(amd_smi_FOUND)
message(STATUS "amd_smi_INCLUDE_DIR: ${amd_smi_INCLUDE_DIR}")
message(STATUS "amd_smi_LIB_DIR: ${amd_smi_LIB_DIR}")
set(SMI_INCLUDE_DIR "${amd_smi_INCLUDE_DIR}" CACHE INTERNAL "amd-smi include directory")
set(SMI_LIB_DIR "${amd_smi_LIB_DIR}" CACHE INTERNAL "amd-smi library directory")
set(SMI_LIB_NAME "amd-smi-lib" CACHE INTERNAL "amd-smi-lib for packaging")
# A found package whose reported directories do not exist aborts the configure
# step. NOTE(review): the message prints only the include directory even when
# it is the library directory that is missing.
if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
message(FATAL_ERROR "amd_smi not found in ${SMI_INCLUDE_DIR}")
endif()
message(STATUS "Found amd_smi at ${SMI_INCLUDE_DIR}")
# Library name handed to target_link_libraries() further down.
set(SMI_LIBRARIES amd_smi)
# Selects the amd-smi wrapper sources and the USE_AMDSMI compile definition.
# NOTE(review): being a cache entry, this value persists across re-configures
# of the same build tree - confirm that is intended.
set(USE_AMDSMI ON CACHE INTERNAL "Use amd-smi instead of rocm-smi")
endif()
endif()
if(NOT USE_AMDSMI)
  ## Fallback to rocm-smi if amd-smi not found or ROCm < 7.11.0
  # amd_smi is only searched for on ROCm >= 7.11.0, so a "could not find"
  # warning is only accurate there. On older ROCm, rocm-smi is the expected
  # backend and a status line is enough.
  if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
    message(WARNING "Could not find amd_smi. Falling back to rocm_smi.")
  else()
    message(STATUS "ROCm version is older than 7.11.0. Using rocm_smi.")
  endif()

  find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
  if(rocm_smi_FOUND)
    set(SMI_INCLUDE_DIR "${rocm_smi_INCLUDE_DIR}" CACHE INTERNAL "rocm-smi include directory")
    set(SMI_LIB_DIR "${rocm_smi_LIB_DIR}" CACHE INTERNAL "rocm-smi library directory")
  else()
    # No package config available: probe the legacy ${ROCM_PATH}/rocm_smi layout.
    message(WARNING "CMake could not find rocm-smi. Checking old include directory structure for rocm_smi")
    set(SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
    set(SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
  endif()

  # Whichever layout was chosen, both directories must exist.
  if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
    message(FATAL_ERROR "rocm_smi not found in ${SMI_INCLUDE_DIR}")
  endif()
  message(STATUS "Found rocm_smi at ${SMI_INCLUDE_DIR}")

  set(SMI_LIB_NAME "rocm-smi-lib" CACHE INTERNAL "rocm-smi-lib for packaging")
  set(SMI_LIBRARIES rocm_smi64)

  # Records in HAVE_ROCM_SMI64CONFIG whether the versioned config header exists.
  check_include_file_cxx("${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)

  ### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
  # Detected by searching the header text for the flag name ("THRAD" is the
  # spelling that is searched for; do not "correct" it).
  file(READ "${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
  string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
  if(matchres EQUAL -1)
    message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
  else()
    message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
    set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
  endif()
endif()
check_include_file_cxx("${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
file(READ "${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
if(${matchres} EQUAL -1)
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
else()
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
endif ()
## Check for BFD library if custom backtrace is requested
if(BUILD_BFD)
@@ -570,7 +600,6 @@ set(SRC_FILES
src/include/register.h
src/include/register_inline.h
src/include/rccl_float8.h
src/include/rocm_smi_wrap.h
src/include/rocmwrap.h
src/include/roctx.h
src/include/recorder.h
@@ -694,7 +723,6 @@ set(SRC_FILES
# src/misc/nvmlwrap.cc
src/misc/nvmlwrap_stub.cc
src/misc/param.cc
src/misc/rocm_smi_wrap.cc
src/misc/rocmwrap.cc
src/misc/roctx.cc
src/misc/recorder.cc
@@ -762,6 +790,19 @@ set(SRC_FILES
src/misc/latency_profiler/CollTraceUtils.cc
)
# Pick the SMI wrapper implementation that matches the backend chosen during
# detection: the amd-smi wrapper when USE_AMDSMI is set, the rocm-smi wrapper
# otherwise. Exactly one header/source pair is appended to SRC_FILES.
if(USE_AMDSMI)
set(SMI_SOURCES
src/include/amdsmi_wrap.h
src/misc/amdsmi_wrap.cc
)
else()
set(SMI_SOURCES
src/include/rocm_smi_wrap.h
src/misc/rocm_smi_wrap.cc
)
endif()
list(APPEND SRC_FILES ${SMI_SOURCES})
if (ENABLE_MSCCL_KERNEL)
set(MSCCL_KERNEL_SOURCES
src/device/msccl_kernel_impl.h
@@ -939,11 +980,16 @@ endif()
if(ENABLE_MSCCLPP)
target_compile_definitions(rccl PRIVATE ENABLE_MSCCLPP)
endif()
if(HAVE_ROCM_SMI64CONFIG)
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
endif()
if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
# Tell the C++ sources which SMI backend was selected at configure time.
if(USE_AMDSMI)
  target_compile_definitions(rccl PRIVATE USE_AMDSMI)
else()
  if(HAVE_ROCM_SMI64CONFIG)
    # rocm_smi_wrap.h / rocm_smi_wrap.cc test HAVE_ROCM_SMI64CONFIG, which
    # check_include_file_cxx() only sets as a CMake variable, not as a
    # preprocessor macro. Define it explicitly so those checks can succeed.
    # USE_ROCM_SMI64CONFIG is kept for any code still testing the older name.
    target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG HAVE_ROCM_SMI64CONFIG)
  endif()
  if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
    target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
  endif()
endif()
if(ENABLE_WARP_SPEED)
target_compile_definitions(rccl PRIVATE ENABLE_WARP_SPEED)
@@ -1265,7 +1311,7 @@ if (FAULT_INJECTION)
endif()
## Set RCCL linked library directories
target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR})
target_link_directories(rccl PRIVATE ${SMI_LIB_DIR})
if (ROCM_VERSION VERSION_GREATER_EQUAL "60100")
option(RCCL_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
@@ -1297,7 +1343,7 @@ target_link_libraries(rccl PRIVATE Threads::Threads)
target_link_libraries(rccl INTERFACE hip::host)
target_link_libraries(rccl PRIVATE hip::device)
target_link_libraries(rccl PRIVATE dl)
target_link_libraries(rccl PRIVATE ${ROCM_SMI_LIBRARIES})
target_link_libraries(rccl PRIVATE ${SMI_LIBRARIES})
target_link_libraries(rccl PRIVATE fmt::fmt-header-only)
if(ENABLE_MSCCLPP)
target_link_libraries(rccl PRIVATE mscclpp_nccl)
@@ -1417,7 +1463,8 @@ if(BUILD_ADDRESS_SANITIZER)
else()
set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
endif()
rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "rocm-smi-lib >= 4.0.0")
rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "${SMI_LIB_NAME}")
set(CPACK_DEB_COMPONENT_INSTALL ON)
set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
set(CPACK_RPM_COMPONENT_INSTALL ON)
+39
Voir le fichier
@@ -14,7 +14,11 @@
#include "core.h"
#include "nvmlwrap.h"
#include "xml.h"
#ifdef USE_AMDSMI
#include "amdsmi_wrap.h"
#else
#include "rocm_smi_wrap.h"
#endif
#include "archinfo.h"
#if defined(__x86_64__)
#include <cpuid.h>
@@ -819,6 +823,32 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t rocmDev
const char* busId;
NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
uint32_t deviceCnt;
#ifdef USE_AMDSMI
NCCLCHECK(amd_smi_getNumDevice(&deviceCnt));
for (int i=0; i<deviceCnt; i++) {
if (i != dev) {
amdsmi_link_type_t amdsmi_type;
int hops, count;
if (amd_smi_getLinkInfo(dev, i, &amdsmi_type, &hops, &count) == ncclSuccess) {
if (amdsmi_type == AMDSMI_LINK_TYPE_XGMI && hops == 1) {
char busIdStr[] = "00000000:00:00.0";
NCCLCHECK(amd_smi_getDevicePciBusIdString(i, busIdStr, sizeof(busIdStr)));
char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
lowerId[c] = tolower(busIdStr[c]);
if (busIdStr[c] == 0) break;
}
NCCLCHECK(xmlGetSubKv(gpuNode, "xgmi", &nvlNode, "target", lowerId));
if (nvlNode == NULL) {
NCCLCHECK(xmlAddNode(xml, gpuNode, "xgmi", &nvlNode));
NCCLCHECK(xmlSetAttr(nvlNode, "target", lowerId));
NCCLCHECK(xmlSetAttrInt(nvlNode, "count", count));
}
}
}
}
}
#else
NCCLCHECK(rocm_smi_getNumDevice(&deviceCnt));
for (int i=0; i<deviceCnt; i++) {
if (i != dev) {
@@ -843,6 +873,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t rocmDev
}
}
}
#endif
#else
// NVML NVLink detection
int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : (sm < 90) ? 12 : 18;
@@ -972,11 +1003,19 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
uint32_t devIndex = 0;
#ifdef USE_AMDSMI
static int amdsmiInit = 0;
if (amdsmiInit == 0) {
NCCLCHECK(amd_smi_init());
}
NCCLCHECK(amd_smi_getDeviceIndexByPciBusId(busId, &devIndex));
#else
static int rocmsmiInit = 0;
if (rocmsmiInit == 0) {
NCCLCHECK(rocm_smi_init());
}
NCCLCHECK(rocm_smi_getDeviceIndexByPciBusId(busId, &devIndex));
#endif
NCCLCHECK(ncclTopoGetXmlFromGpu(node, devIndex, xml, gpuNode));
#else
nvmlDevice_t nvmlDev;
+17
Voir le fichier
@@ -0,0 +1,17 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// Thin wrapper around the SMI queries RCCL needs (device count, PCI bus IDs,
// inter-GPU link info). All functions return an ncclResult_t.
#ifndef AMDSMI_WRAP_H_
#define AMDSMI_WRAP_H_
#include <cstdint>
#include "amd_smi/amdsmi.h"
#include "nccl.h"
// Initializes the SMI backend used by the functions below.
ncclResult_t amd_smi_init();
// Shuts the amd-smi library down.
ncclResult_t amd_smi_shutdown();
// Writes the number of devices to *num_devs.
ncclResult_t amd_smi_getNumDevice(uint32_t* num_devs);
// Writes the PCI bus ID string of device `deviceIndex` into `pciBusId`,
// a caller-provided buffer of `len` bytes.
ncclResult_t amd_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len);
// Looks up the device index for the PCI bus ID string `pciBusId` and writes it
// to *deviceIndex.
ncclResult_t amd_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex);
// Reports the link between devices `srcDev` and `dstDev`: link type, hop count
// and link count are written to *type, *hops and *count.
ncclResult_t amd_smi_getLinkInfo(int srcDev, int dstDev, amdsmi_link_type_t* type, int *hops, int *count);
#endif
+1 -1
Voir le fichier
@@ -118,7 +118,7 @@ void rcclSetP2pNetChunkSize(struct ncclComm* comm, int& rcclP2pNetChunkSize);
ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount);
ncclResult_t commSetUnrollFactor(struct ncclComm* comm);
bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion, int firmwareVersion, const char* archName);
int parseFirmwareVersion();
int getFirmwareVersion();
bool rcclIsArchSupportedForFunc(struct ncclTaskColl* info, char const* archName);
#ifdef ENABLE_WARP_SPEED
void rcclSetWarpSpeedCUs(struct ncclComm* comm, int algo, int threadsPerBlock, int& rcclWarpSpeedChannels);
+1 -1
Voir le fichier
@@ -24,7 +24,7 @@ THE SOFTWARE.
#define ROCM_SMI_WRAP_H_
#include "rocm_smi/rocm_smi.h"
#ifdef USE_ROCM_SMI64CONFIG
#ifdef HAVE_ROCM_SMI64CONFIG
#include "rocm_smi/rocm_smi64Config.h"
#endif
#include "nccl.h"
+11 -4
Voir le fichier
@@ -56,7 +56,11 @@
#ifdef ENABLE_MSCCLPP
#include "mscclpp/mscclpp_nccl.h"
#endif
#ifdef USE_AMDSMI
#include "amdsmi_wrap.h"
#else
#include "rocm_smi_wrap.h"
#endif
#include "rccl_common.h"
// [/RCCL]
@@ -197,9 +201,7 @@ ncclResult_t checkHsaEnvSetting() {
// hipVer is an integer e.g., 6.2.41133 -> 60241133
CUDACHECK(hipRuntimeGetVersion(&hipRuntimeVersion));
// using rocm-smi API to query FW version, instead of parsing CLI output
// will switch to amd-smi API soon
const int firmwareVersion = parseFirmwareVersion();
const int firmwareVersion = getFirmwareVersion();
hipDeviceProp_t devProp;
// use GPU0 should be good enough
@@ -734,9 +736,14 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
char busId[]="0000:00:00.0";
NCCLCHECK(int64ToBusId(comm->busId, busId));
#ifdef USE_AMDSMI
NCCLCHECK(amd_smi_init());
NCCLCHECK(amd_smi_getDeviceIndexByPciBusId(busId, (unsigned int*)&comm->nvmlDev));
#else
NCCLCHECK(rocm_smi_init());
NCCLCHECK(rocm_smi_getDeviceIndexByPciBusId(busId, (unsigned int*)&comm->nvmlDev));
#endif
comm->compCap = ncclCudaCompCap();
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap);
+312
Voir le fichier
@@ -0,0 +1,312 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
#include "amdsmi_wrap.h"
#include "alt_rsmi.h"
#include "core.h"
#include "utils.h"
#include <cstdio>
#include <vector>
#include <cstring>
// WSL2 detection state: -1 = not probed yet, 0 = not WSL2, 1 = WSL2.
// Written by amd_smi_init() and read (atomically) by every wrapper function.
static int is_wsl2 = -1;
// Evaluates an amd-smi call; on any status other than AMDSMI_STATUS_SUCCESS it
// logs the status string with file/line and makes the ENCLOSING function
// return ncclInternalError.
// NOTE(review): `err` is not initialized before amdsmi_status_code_to_string();
// confirm that call always writes it.
#define AMDSMICHECK(cmd) do { \
amdsmi_status_t ret = cmd; \
if( ret != AMDSMI_STATUS_SUCCESS ) { \
const char *err; \
amdsmi_status_code_to_string(ret, &err); \
ERROR("AMD SMI failure: %s at line: %d in file: %s", err, __LINE__, __FILE__); \
return ncclInternalError; \
} \
} while(false)
// Same contract for the internal ARSMI_* helpers, which report failure as a
// non-zero int: logs the code and makes the enclosing function return
// ncclInternalError.
#define ARSMICHECK(cmd) do { \
int ret = cmd; \
if( ret != 0 ) { \
ERROR("ARSMI failure: %d", ret); \
return ncclInternalError; \
} \
} while(false)
RCCL_PARAM(UseAmdSmiLib, "USE_AMD_SMI_LIB", 0); // Opt-in parameter (default 0): use the amd-smi library instead of the internal ARSMI code
/**
 * Initialize the SMI backend.
 *
 * Probes for WSL2 on first use (presence of /dev/dxg) and caches the answer in
 * is_wsl2. Under WSL2 nothing is initialized. Otherwise the amd-smi library is
 * initialized when the UseAmdSmiLib parameter is non-zero, and the internal
 * ARSMI implementation is initialized when it is zero.
 *
 * @return ncclSuccess, or ncclInternalError when the selected backend fails to
 *         initialize.
 */
ncclResult_t amd_smi_init() {
  // First call only: record whether the WSL2 device node is present.
  if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE) == -1) {
    const int dxgMissing = (access("/dev/dxg", F_OK) == -1);
    __atomic_store_n(&is_wsl2, dxgMissing ? 0 : 1, __ATOMIC_RELEASE);
  }

  if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) {
    INFO(NCCL_INIT, "Not using amdsmi_lib due to WSL2 environment detected.");
    return ncclSuccess;
  }

  if (!rcclParamUseAmdSmiLib()) {
    // Default path: internal alternative rsmi implementation.
    ARSMICHECK(ARSMI_init());
    INFO(NCCL_INIT, "initialized internal alternative rsmi functionality");
    return ncclSuccess;
  }

  // Opt-in path: bring up amd-smi for AMD GPUs and log the library version.
  AMDSMICHECK(amdsmi_init(AMDSMI_INIT_AMD_GPUS));
  amdsmi_version_t libVersion;
  AMDSMICHECK(amdsmi_get_lib_version(&libVersion));
  INFO(NCCL_INIT, "amdsmi_lib: version %d.%d.%d.%s", libVersion.major, libVersion.minor, libVersion.release, libVersion.build);
  return ncclSuccess;
}
// Shuts the amd-smi library down.
// Returns ncclSuccess, or ncclInternalError (via AMDSMICHECK) when
// amdsmi_shut_down() reports a failure.
// NOTE(review): this is called unconditionally, whereas amd_smi_init() only
// calls amdsmi_init() when not on WSL2 and UseAmdSmiLib is set - confirm that
// amdsmi_shut_down() is safe without a prior amdsmi_init().
ncclResult_t amd_smi_shutdown() {
AMDSMICHECK(amdsmi_shut_down());
return ncclSuccess;
}
/**
 * Report the number of devices.
 *
 * - WSL2: the runtime's device count (cudaGetDeviceCount).
 * - amd-smi library enabled: the sum, over all sockets, of the processor
 *   handle counts reported by amdsmi_get_processor_handles().
 * - otherwise: the internal ARSMI device count.
 *
 * @param num_devs  Output: number of devices.
 * @return ncclSuccess, or an error from the failing backend call.
 */
ncclResult_t amd_smi_getNumDevice(uint32_t* num_devs) {
  if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) {
    CUDACHECK(cudaGetDeviceCount((int *)num_devs));
    return ncclSuccess;
  }

  if (!rcclParamUseAmdSmiLib()) {
    ARSMICHECK(ARSMI_get_num_devices(num_devs));
    return ncclSuccess;
  }

  // amd-smi has no direct device-count query (rsmi_num_monitor_devices is
  // deprecated): list the sockets, then count the processor handles in each.
  uint32_t nSockets = 0;
  AMDSMICHECK(amdsmi_get_socket_handles(&nSockets, nullptr));
  std::vector<amdsmi_socket_handle> socketHandles(nSockets);
  AMDSMICHECK(amdsmi_get_socket_handles(&nSockets, socketHandles.data()));

  uint32_t gpuTotal = 0;
  for (size_t s = 0; s < socketHandles.size(); ++s) {
    // Two-step query: size first, then fill.
    uint32_t nProcs = 0;
    AMDSMICHECK(amdsmi_get_processor_handles(socketHandles[s], &nProcs, nullptr));
    std::vector<amdsmi_processor_handle> procs(nProcs);
    AMDSMICHECK(amdsmi_get_processor_handles(socketHandles[s], &nProcs, procs.data()));
    gpuTotal += nProcs;
  }

  *num_devs = gpuTotal;
  return ncclSuccess;
}
/**
 * Write the PCI bus ID string of a device into a caller-provided buffer.
 *
 * - WSL2: delegates to cudaDeviceGetPCIBusId.
 * - amd-smi library enabled: walks sockets -> processor handles, picks the
 *   AMD GPU whose enumeration info has hip_id == deviceIndex and reads its
 *   BDF id.
 * - otherwise: reads the id from the internal ARSMI implementation.
 *
 * @param deviceIndex  Index of the device to look up.
 * @param busId        Output buffer for the "dddd:bb:dd.f" string.
 * @param len          Size of busId in bytes.
 * @return ncclSuccess; ncclInternalError when a backend call fails or no GPU
 *         with the requested index exists.
 */
ncclResult_t amd_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* busId, size_t len) {
  if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) {
    CUDACHECK(cudaDeviceGetPCIBusId(busId, len, deviceIndex));
    return ncclSuccess;
  }

  /** amd-smi's bus ID format
   * | Name     | Field   |
   * | -------- | ------- |
   * | Domain   | [63:16] |
   * | Bus      | [15: 8] |
   * | Device   | [ 7: 3] |
   * | Function | [ 2: 0] |
   **/
  // Single id shared by both backends. The lookup loop below must write to
  // THIS variable; a second declaration inside the loop would shadow it and
  // leave the value formatted at the end uninitialized.
  uint64_t id = 0;

  if (rcclParamUseAmdSmiLib()) {
    // rsmi_dev_pci_id_get is deprecated.
    // With amd-smi, first get the list of socket handles, then the processor
    // handles in each socket, and query the BDF of the GPU matching deviceIndex.
    uint32_t socket_count = 0;
    AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, nullptr));
    std::vector<amdsmi_socket_handle> sockets(socket_count);
    AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, sockets.data()));

    bool found = false;
    for (auto& socket : sockets) {
      uint32_t processor_handle_count = 0;
      AMDSMICHECK(amdsmi_get_processor_handles(socket, &processor_handle_count, nullptr));
      std::vector<amdsmi_processor_handle> processor_handles(processor_handle_count);
      AMDSMICHECK(amdsmi_get_processor_handles(socket, &processor_handle_count, processor_handles.data()));

      // Filtering by type per handle is a workaround: the by-type handle query
      // was reported as not working when this was written.
      for (auto& proc : processor_handles) {
        processor_type_t type;
        AMDSMICHECK(amdsmi_get_processor_type(proc, &type));
        if (type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) continue;

        amdsmi_enumeration_info_t info;
        AMDSMICHECK(amdsmi_get_gpu_enumeration_info(proc, &info));
        if (info.hip_id == deviceIndex) {
          AMDSMICHECK(amdsmi_get_gpu_bdf_id(proc, &id));
          found = true;
          break;
        }
      }
      // Stop scanning the remaining sockets once the device has been found.
      if (found) break;
    }

    if (!found) {
      WARN("amdsmi_lib: no GPU found for device index %u", deviceIndex);
      return ncclInternalError;
    }
  } else {
    ARSMICHECK(ARSMI_dev_pci_id_get(deviceIndex, &id));
  }

  // The id is decoded with the layout borrowed from NCCL's utils.cc:int64ToBusId
  // (domain = id >> 20, bus = bits [19:12], device = bits [11:4],
  // function = bits [3:0]), not with the amd-smi layout in the table above.
  // !! To be reconciled after discussion with amdsmi team !!
  snprintf(busId, len, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
  return ncclSuccess;
}
/**
 * Look up the device index that corresponds to a PCI bus ID string.
 *
 * - WSL2: delegates to hipDeviceGetByPCIBusId.
 * - amd-smi library enabled: converts the bus ID to an amdsmi_bdf_t, resolves
 *   the processor handle from it and returns that GPU's hip_id.
 * - otherwise: compares the bus ID against the id of every device known to
 *   the internal ARSMI implementation.
 *
 * @param pciBusId     PCI bus ID string ("dddd:bb:dd.f").
 * @param deviceIndex  Output: index of the matching device.
 * @return ncclSuccess when a device matches; ncclInternalError when none does
 *         or a backend call fails.
 */
ncclResult_t amd_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex) {
  if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) {
    CUDACHECK(hipDeviceGetByPCIBusId((int *)deviceIndex, pciBusId));
    return ncclSuccess;
  }

  // busid layout (as decoded below and by the bus ID formatter in this file):
  // domain = busid >> 20, bus = bits [19:12], device = bits [11:4],
  // function = bits [3:0].
  int64_t busid = 0;
  NCCLCHECK(busIdToInt64(pciBusId, &busid));

  /** amd-smi's bus ID format
   * | Name     | Field   |
   * | -------- | ------- |
   * | Domain   | [63:16] |
   * | Bus      | [15: 8] |
   * | Device   | [ 7: 3] |
   * | Function | [ 2: 0] |
   **/
  if (rcclParamUseAmdSmiLib()) {
    // Instead of getting the device count and comparing the bus ID to each
    // GPU's BDF, amd-smi can resolve the processor handle directly from a BDF;
    // the enumeration info of that handle then yields the index.
    amdsmi_processor_handle processor_handle = 0;
    amdsmi_bdf_t bdf = {};
    // The fields are extracted with the NCCL layout described above rather
    // than the amd-smi layout in the table, which is incompatible with the
    // format enforced by NCCL in utils.cc:int64ToBusId.
    // !! To be reconciled after discussion with amdsmi team !!
    bdf.function_number = (busid & 0xf);
    // The device field is bits [11:4]: the mask must be 0xff0. A 0xff mask
    // would drop bit 4 of the device number and mis-address devices >= 16.
    bdf.device_number = (busid & 0xff0) >> 4;
    bdf.bus_number = (busid & 0xff000) >> 12;
    bdf.domain_number = busid >> 20;
    AMDSMICHECK(amdsmi_get_processor_handle_from_bdf(bdf, &processor_handle));

    processor_type_t type;
    AMDSMICHECK(amdsmi_get_processor_type(processor_handle, &type));
    if (type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
      amdsmi_enumeration_info_t info;
      AMDSMICHECK(amdsmi_get_gpu_enumeration_info(processor_handle, &info));
      *deviceIndex = info.hip_id;
      return ncclSuccess;
    }
    ERROR("amdsmi_lib: %s device index not found", pciBusId);
    return ncclInternalError;
  }

  // ARSMI path: repack the id into the layout ARSMI_dev_pci_id_get() is
  // compared against (domain from bit 32, bus [15:8], device [7:3],
  // function [2:0]) and scan all devices for a match.
  uint32_t i, num_devs = 0;
  busid = ((busid&0xffff00000L)<<12)+((busid&0xff000L)>>4)+((busid&0xff0L)>>1)+(busid&0x7L);
  ARSMICHECK(ARSMI_get_num_devices(&num_devs));
  for (i = 0; i < num_devs; i++) {
    uint64_t bdfid;
    ARSMICHECK(ARSMI_dev_pci_id_get(i, &bdfid));
    if (bdfid == (uint64_t)busid) break;
  }
  if (i < num_devs) {
    *deviceIndex = i;
    return ncclSuccess;
  }

  WARN("ARSMI_lib: %s device index not found", pciBusId);
  return ncclInternalError;
}
/**
 * Describe the link between two devices.
 *
 * - WSL2: always reports a single one-hop PCIe link.
 * - amd-smi library enabled: resolves both indices to processor handles and
 *   queries link type and hop count; for XGMI the link count is
 *   max_bandwidth / min_bandwidth when both are non-zero.
 * - otherwise: uses the internal ARSMI topology info; hops is 1 only for an
 *   XGMI link with weight 15, 41 or 13.
 *
 * @param srcIndex  Index of the source device.
 * @param dstIndex  Index of the destination device.
 * @param type      Output: link type.
 * @param hops      Output: hop count (always written on success).
 * @param count     Output: number of links (1 unless derived from bandwidth).
 * @return ncclSuccess; ncclInternalError when a backend call fails or a
 *         device index cannot be resolved.
 */
ncclResult_t amd_smi_getLinkInfo(int srcIndex, int dstIndex, amdsmi_link_type_t* type, int *hops, int *count) {
  if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) {
    *type = AMDSMI_LINK_TYPE_PCIE;
    *hops = 1;
    *count = 1;
    return ncclSuccess;
  }

  // Defaults so both outputs are always written. Callers test `hops == 1` to
  // detect a direct link, so the default must be a value other than 1.
  *hops = 2;
  *count = 1;

  if (rcclParamUseAmdSmiLib()) {
    // rsmi_minmax_bandwidth_get is replaced by
    // amdsmi_get_minmax_bandwidth_between_processors, whose src/dst arguments
    // are processor handles rather than indices. So: list the sockets, list
    // the processor handles in each, find the handles matching the src and
    // dst index, then use them for the topology queries.
    amdsmi_processor_handle src_processor_handle = 0;
    amdsmi_processor_handle dst_processor_handle = 0;
    bool found_src = false, found_dst = false;

    uint32_t socket_count = 0;
    AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, nullptr));
    std::vector<amdsmi_socket_handle> sockets(socket_count);
    AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, sockets.data()));

    for (auto& socket : sockets) {
      uint32_t processor_handle_count = 0;
      AMDSMICHECK(amdsmi_get_processor_handles(socket, &processor_handle_count, nullptr));
      std::vector<amdsmi_processor_handle> processor_handles(processor_handle_count);
      AMDSMICHECK(amdsmi_get_processor_handles(socket, &processor_handle_count, processor_handles.data()));

      // Filtering by type per handle is a workaround: the by-type handle query
      // was reported as not working when this was written.
      for (auto& proc : processor_handles) {
        processor_type_t proc_type;
        AMDSMICHECK(amdsmi_get_processor_type(proc, &proc_type));
        if (proc_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) continue;

        amdsmi_enumeration_info_t info;
        AMDSMICHECK(amdsmi_get_gpu_enumeration_info(proc, &info));
        if (info.hip_id == (uint32_t)srcIndex) {
          src_processor_handle = proc;
          found_src = true;
        }
        if (info.hip_id == (uint32_t)dstIndex) {
          dst_processor_handle = proc;
          found_dst = true;
        }
      }
    }

    // Do not pass an unresolved (null) handle on to the topology queries.
    if (!found_src) ERROR("amd-smi could not find processor handle for srcIndex: %d", srcIndex);
    if (!found_dst) ERROR("amd-smi could not find processor handle for dstIndex: %d", dstIndex);
    if (!found_src || !found_dst) return ncclInternalError;

    amdsmi_link_type_t amdsmi_type;
    uint64_t amdsmi_hops = 1, amdsmi_weight = 0;
    AMDSMICHECK(amdsmi_topo_get_link_type(src_processor_handle, dst_processor_handle, &amdsmi_hops, &amdsmi_type));
    // The weight is queried but not used for the decision below
    // (amd-smi was observed to report weight=0 for XGMI).
    AMDSMICHECK(amdsmi_topo_get_link_weight(src_processor_handle, dst_processor_handle, &amdsmi_weight));
    if (amdsmi_type == AMDSMI_LINK_TYPE_XGMI) {
      uint64_t min_bw = 0, max_bw = 0;
      AMDSMICHECK(amdsmi_get_minmax_bandwidth_between_processors(src_processor_handle, dst_processor_handle, &min_bw, &max_bw));
      if (max_bw && min_bw) *count = max_bw/min_bw;
    }
    *type = amdsmi_type;
    *hops = amdsmi_hops;
    return ncclSuccess;
  }

  // ARSMI path.
  ARSMI_linkInfo tinfo;
  ARSMICHECK(ARSMI_topo_get_link_info(srcIndex, dstIndex, &tinfo));
  *type = (amdsmi_link_type_t) tinfo.type;
  if (*type == AMDSMI_LINK_TYPE_XGMI && (tinfo.weight == 15 ||
      tinfo.weight == 41 || tinfo.weight == 13)) {
    *hops = 1;
    if (tinfo.max_bandwidth && tinfo.min_bandwidth)
      *count = tinfo.max_bandwidth/tinfo.min_bandwidth;
  }
  return ncclSuccess;
}
+1 -1
Voir le fichier
@@ -176,7 +176,7 @@ ncclResult_t rocm_smi_getLinkInfo(int srcIndex, int dstIndex, RSMI_IO_LINK_TYPE*
rsmi_weight == 41 || rsmi_weight == 13)) {
uint64_t min_bw = 0, max_bw = 0;
*hops = 1;
#if defined USE_ROCM_SMI64CONFIG && rocm_smi_VERSION_MAJOR >= 5
#if defined HAVE_ROCM_SMI64CONFIG && rocm_smi_VERSION_MAJOR >= 5
rsmi_version_t version;
ROCMSMICHECK(rsmi_version_get(&version));
if (version.major >= 5)
+47 -46
Voir le fichier
@@ -24,8 +24,15 @@ THE SOFTWARE.
#include "comm.h"
#include "graph/topo.h"
#include "enqueue.h"
#include "rocm_smi/rocm_smi.h"
#include <algorithm>
#include "debug.h"
#ifdef USE_AMDSMI
#include "amd_smi/amdsmi.h"
#else
#include "rocm_smi/rocm_smi.h"
#endif
// Use this param to experiment pipelining new data types besides bfloat16
// Make sure you generate the device code with the new data type (i.e. in generate.py)
RCCL_PARAM(PipelineAllDTypes, "PIPELINE_ALL_DATA_TYPES", 0);
@@ -642,60 +649,54 @@ ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
return ncclSuccess;
}
std::string trimString(const std::string& s) {
int sz = s.size();
int b = 0;
int e = sz - 1;
while (b < sz && isspace(s[b])) {
b++;
}
if (b >= sz) {
return "";
}
while (e >= b && e < sz && isspace(s[e])) {
e--;
}
if (b > e) {
return "";
}
return s.substr(b, e - b + 1);
}
std::vector<std::string> splitString(const std::string& s, char delimiter) {
std::vector<std::string> tokens;
std::stringstream ss(s);
std::string token;
while (std::getline(ss, token, delimiter)) {
tokens.push_back(trimString(token));
}
return tokens;
}
int parseFirmwareVersionImpl() {
int getFirmwareVersion() {
uint64_t fw_version = -1;
// using rocm-smi APIs for now to query MEC FW version
// will switch to amd-smi APIs soon
#ifdef USE_AMDSMI
// amd-smi path: initialize the library, take the first processor handle of
// the first socket and read its firmware info. Every enumeration step is
// checked so an empty or failed result returns -1 instead of indexing an
// empty container.
amdsmi_status_t ret;
ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
if (ret != AMDSMI_STATUS_SUCCESS) {
  ERROR("Could not initialize amd-smi");
  return -1;
}
// Two-step query: size first, then fill.
uint32_t socket_count = 0;
ret = amdsmi_get_socket_handles(&socket_count, nullptr);
if (ret != AMDSMI_STATUS_SUCCESS || socket_count == 0) {
  ERROR("Could not query socket handles using amd-smi");
  return -1;
}
std::vector<amdsmi_socket_handle> sockets(socket_count);
ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
if (ret != AMDSMI_STATUS_SUCCESS || socket_count == 0) {
  ERROR("Could not query socket handles using amd-smi");
  return -1;
}
uint32_t num_gpus_per_socket = 0;
ret = amdsmi_get_processor_handles(sockets[0], &num_gpus_per_socket, nullptr);
if (ret != AMDSMI_STATUS_SUCCESS || num_gpus_per_socket == 0) {
  ERROR("Could not query processor handles using amd-smi");
  return -1;
}
std::vector<amdsmi_processor_handle> processor_handles(num_gpus_per_socket);
ret = amdsmi_get_processor_handles(sockets[0], &num_gpus_per_socket, processor_handles.data());
if (ret != AMDSMI_STATUS_SUCCESS || num_gpus_per_socket == 0) {
  ERROR("Could not query processor handles using amd-smi");
  return -1;
}
amdsmi_fw_info_t info = {};
ret = amdsmi_get_fw_info(processor_handles[0], &info);
if (ret != AMDSMI_STATUS_SUCCESS || info.num_fw_info == 0) {
  ERROR("Could not query firmware info using amd-smi");
  return -1;
}
// NOTE(review): the rocm-smi branch queries RSMI_FW_BLOCK_MEC explicitly,
// while this takes the first list entry whatever its fw_id is - confirm that
// entry 0 is the MEC firmware, or select the entry by fw_id.
fw_version = info.fw_info_list[0].fw_version;
#else
rsmi_status_t ret;
ret = rsmi_init(0);
if (ret != RSMI_STATUS_SUCCESS) return -1;
if (ret != RSMI_STATUS_SUCCESS) {
ERROR("Could not initialize rocm-smi");
return -1;
}
ret = rsmi_dev_firmware_version_get(0, RSMI_FW_BLOCK_MEC, &fw_version);
if (ret != RSMI_STATUS_SUCCESS) return -1;
if (ret != RSMI_STATUS_SUCCESS) {
ERROR("Could not query firmware info using rocm-smi");
return -1;
}
#endif
return fw_version;
}
int parseFirmwareVersion() {
int version = -1;
try {
version = parseFirmwareVersionImpl();
} catch (const std::exception& ex) {
}
return version;
}
bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion, int firmwareVersion, char const* archName) {
bool hsaScratchEnvSet = (hsaScratchEnv && strcmp(hsaScratchEnv,"1") == 0);
if (hsaScratchEnvSet) {