[DEVICE] Switch to amd-smi from rocm-smi (#1759)
* Use amd-smi instead of rocm-smi for ROCM_VERSION >= 7.11.0
[ROCm/rccl commit: cd745b1f4b]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
520f309bb1
Коммит
244047310e
@@ -261,26 +261,56 @@ find_package(hsa-runtime64 REQUIRED)
|
||||
get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCLUDE_DIRECTORIES)
|
||||
message(STATUS "HSA runtime: ${HSA_INCLUDE_PATH}")
|
||||
|
||||
## Check for ROCM-smi
|
||||
find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
|
||||
if (rocm_smi_FOUND)
|
||||
message(STATUS "Found rocm_smi at ${ROCM_SMI_INCLUDE_DIR}")
|
||||
else()
|
||||
message(STATUS "Checking old include directory structure for rocm_smi")
|
||||
set(ROCM_SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
|
||||
set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
|
||||
set(ROCM_SMI_LIBRARIES rocm_smi64)
|
||||
## Check for amd-smi if ROCm 7.11.0 or newer
|
||||
if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
|
||||
find_package(amd_smi PATHS ${ROCM_PATH}/lib/cmake/amd_smi)
|
||||
if(amd_smi_FOUND)
|
||||
message(STATUS "amd_smi_INCLUDE_DIR: ${amd_smi_INCLUDE_DIR}")
|
||||
message(STATUS "amd_smi_LIB_DIR: ${amd_smi_LIB_DIR}")
|
||||
set(SMI_INCLUDE_DIR "${amd_smi_INCLUDE_DIR}" CACHE INTERNAL "amd-smi include directory")
|
||||
set(SMI_LIB_DIR "${amd_smi_LIB_DIR}" CACHE INTERNAL "amd-smi library directory")
|
||||
set(SMI_LIB_NAME "amd-smi-lib" CACHE INTERNAL "amd-smi-lib for packaging")
|
||||
if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
|
||||
message(FATAL_ERROR "amd_smi not found in ${SMI_INCLUDE_DIR}")
|
||||
endif()
|
||||
message(STATUS "Found amd_smi at ${SMI_INCLUDE_DIR}")
|
||||
set(SMI_LIBRARIES amd_smi)
|
||||
set(USE_AMDSMI ON CACHE INTERNAL "Use amd-smi instead of rocm-smi")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT USE_AMDSMI)
|
||||
## Fallback to rocm-smi if amd-smi not found or ROCm < 7.11.0
|
||||
message(WARNING "Could not find amd_smi. Falling back to rocm_smi.")
|
||||
find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
|
||||
if(rocm_smi_FOUND)
|
||||
set(SMI_INCLUDE_DIR "${rocm_smi_INCLUDE_DIR}" CACHE INTERNAL "rocm-smi include directory")
|
||||
set(SMI_LIB_DIR "${rocm_smi_LIB_DIR}" CACHE INTERNAL "rocm-smi library directory")
|
||||
else()
|
||||
message(WARNING "CMake could not find rocm-smi. Checking old include directory structure for rocm_smi")
|
||||
set(SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
|
||||
set(SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
|
||||
endif()
|
||||
|
||||
if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
|
||||
message(FATAL_ERROR "rocm_smi not found in ${SMI_INCLUDE_DIR}")
|
||||
endif()
|
||||
message(STATUS "Found rocm_smi at ${SMI_INCLUDE_DIR}")
|
||||
set(SMI_LIB_NAME "rocm-smi-lib" CACHE INTERNAL "rocm-smi-lib for packaging")
|
||||
set(SMI_LIBRARIES rocm_smi64)
|
||||
|
||||
check_include_file_cxx("${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
|
||||
|
||||
### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
|
||||
file(READ "${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
|
||||
string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
|
||||
if(${matchres} EQUAL -1)
|
||||
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
|
||||
else()
|
||||
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
|
||||
set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
|
||||
endif ()
|
||||
endif()
|
||||
check_include_file_cxx("${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
|
||||
### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
|
||||
file(READ "${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
|
||||
string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
|
||||
if(${matchres} EQUAL -1)
|
||||
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
|
||||
else()
|
||||
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
|
||||
set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
|
||||
endif ()
|
||||
|
||||
## Check for BFD library if custom backtrace is requested
|
||||
if(BUILD_BFD)
|
||||
@@ -570,7 +600,6 @@ set(SRC_FILES
|
||||
src/include/register.h
|
||||
src/include/register_inline.h
|
||||
src/include/rccl_float8.h
|
||||
src/include/rocm_smi_wrap.h
|
||||
src/include/rocmwrap.h
|
||||
src/include/roctx.h
|
||||
src/include/recorder.h
|
||||
@@ -694,7 +723,6 @@ set(SRC_FILES
|
||||
# src/misc/nvmlwrap.cc
|
||||
src/misc/nvmlwrap_stub.cc
|
||||
src/misc/param.cc
|
||||
src/misc/rocm_smi_wrap.cc
|
||||
src/misc/rocmwrap.cc
|
||||
src/misc/roctx.cc
|
||||
src/misc/recorder.cc
|
||||
@@ -762,6 +790,19 @@ set(SRC_FILES
|
||||
src/misc/latency_profiler/CollTraceUtils.cc
|
||||
)
|
||||
|
||||
if(USE_AMDSMI)
|
||||
set(SMI_SOURCES
|
||||
src/include/amdsmi_wrap.h
|
||||
src/misc/amdsmi_wrap.cc
|
||||
)
|
||||
else()
|
||||
set(SMI_SOURCES
|
||||
src/include/rocm_smi_wrap.h
|
||||
src/misc/rocm_smi_wrap.cc
|
||||
)
|
||||
endif()
|
||||
list(APPEND SRC_FILES ${SMI_SOURCES})
|
||||
|
||||
if (ENABLE_MSCCL_KERNEL)
|
||||
set(MSCCL_KERNEL_SOURCES
|
||||
src/device/msccl_kernel_impl.h
|
||||
@@ -939,11 +980,16 @@ endif()
|
||||
if(ENABLE_MSCCLPP)
|
||||
target_compile_definitions(rccl PRIVATE ENABLE_MSCCLPP)
|
||||
endif()
|
||||
if(HAVE_ROCM_SMI64CONFIG)
|
||||
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
|
||||
endif()
|
||||
if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
|
||||
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
|
||||
|
||||
if(USE_AMDSMI)
|
||||
target_compile_definitions(rccl PRIVATE USE_AMDSMI)
|
||||
else()
|
||||
if(HAVE_ROCM_SMI64CONFIG)
|
||||
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
|
||||
endif()
|
||||
if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
|
||||
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
|
||||
endif()
|
||||
endif()
|
||||
if(ENABLE_WARP_SPEED)
|
||||
target_compile_definitions(rccl PRIVATE ENABLE_WARP_SPEED)
|
||||
@@ -1265,7 +1311,7 @@ if (FAULT_INJECTION)
|
||||
endif()
|
||||
|
||||
## Set RCCL linked library directories
|
||||
target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR})
|
||||
target_link_directories(rccl PRIVATE ${SMI_LIB_DIR})
|
||||
|
||||
if (ROCM_VERSION VERSION_GREATER_EQUAL "60100")
|
||||
option(RCCL_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
|
||||
@@ -1297,7 +1343,7 @@ target_link_libraries(rccl PRIVATE Threads::Threads)
|
||||
target_link_libraries(rccl INTERFACE hip::host)
|
||||
target_link_libraries(rccl PRIVATE hip::device)
|
||||
target_link_libraries(rccl PRIVATE dl)
|
||||
target_link_libraries(rccl PRIVATE ${ROCM_SMI_LIBRARIES})
|
||||
target_link_libraries(rccl PRIVATE ${SMI_LIBRARIES})
|
||||
target_link_libraries(rccl PRIVATE fmt::fmt-header-only)
|
||||
if(ENABLE_MSCCLPP)
|
||||
target_link_libraries(rccl PRIVATE mscclpp_nccl)
|
||||
@@ -1417,7 +1463,8 @@ if(BUILD_ADDRESS_SANITIZER)
|
||||
else()
|
||||
set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
|
||||
endif()
|
||||
rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "rocm-smi-lib >= 4.0.0")
|
||||
|
||||
rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "${SMI_LIB_NAME}")
|
||||
set(CPACK_DEB_COMPONENT_INSTALL ON)
|
||||
set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
|
||||
set(CPACK_RPM_COMPONENT_INSTALL ON)
|
||||
|
||||
@@ -14,7 +14,11 @@
|
||||
#include "core.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "xml.h"
|
||||
#ifdef USE_AMDSMI
|
||||
#include "amdsmi_wrap.h"
|
||||
#else
|
||||
#include "rocm_smi_wrap.h"
|
||||
#endif
|
||||
#include "archinfo.h"
|
||||
#if defined(__x86_64__)
|
||||
#include <cpuid.h>
|
||||
@@ -819,6 +823,32 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t rocmDev
|
||||
const char* busId;
|
||||
NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
|
||||
uint32_t deviceCnt;
|
||||
#ifdef USE_AMDSMI
|
||||
NCCLCHECK(amd_smi_getNumDevice(&deviceCnt));
|
||||
for (int i=0; i<deviceCnt; i++) {
|
||||
if (i != dev) {
|
||||
amdsmi_link_type_t amdsmi_type;
|
||||
int hops, count;
|
||||
if (amd_smi_getLinkInfo(dev, i, &amdsmi_type, &hops, &count) == ncclSuccess) {
|
||||
if (amdsmi_type == AMDSMI_LINK_TYPE_XGMI && hops == 1) {
|
||||
char busIdStr[] = "00000000:00:00.0";
|
||||
NCCLCHECK(amd_smi_getDevicePciBusIdString(i, busIdStr, sizeof(busIdStr)));
|
||||
char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
|
||||
lowerId[c] = tolower(busIdStr[c]);
|
||||
if (busIdStr[c] == 0) break;
|
||||
}
|
||||
NCCLCHECK(xmlGetSubKv(gpuNode, "xgmi", &nvlNode, "target", lowerId));
|
||||
if (nvlNode == NULL) {
|
||||
NCCLCHECK(xmlAddNode(xml, gpuNode, "xgmi", &nvlNode));
|
||||
NCCLCHECK(xmlSetAttr(nvlNode, "target", lowerId));
|
||||
NCCLCHECK(xmlSetAttrInt(nvlNode, "count", count));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
NCCLCHECK(rocm_smi_getNumDevice(&deviceCnt));
|
||||
for (int i=0; i<deviceCnt; i++) {
|
||||
if (i != dev) {
|
||||
@@ -843,6 +873,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t rocmDev
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
// NVML NVLink detection
|
||||
int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : (sm < 90) ? 12 : 18;
|
||||
@@ -972,11 +1003,19 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
|
||||
NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
uint32_t devIndex = 0;
|
||||
#ifdef USE_AMDSMI
|
||||
static int amdsmiInit = 0;
|
||||
if (amdsmiInit == 0) {
|
||||
NCCLCHECK(amd_smi_init());
|
||||
}
|
||||
NCCLCHECK(amd_smi_getDeviceIndexByPciBusId(busId, &devIndex));
|
||||
#else
|
||||
static int rocmsmiInit = 0;
|
||||
if (rocmsmiInit == 0) {
|
||||
NCCLCHECK(rocm_smi_init());
|
||||
}
|
||||
NCCLCHECK(rocm_smi_getDeviceIndexByPciBusId(busId, &devIndex));
|
||||
#endif
|
||||
NCCLCHECK(ncclTopoGetXmlFromGpu(node, devIndex, xml, gpuNode));
|
||||
#else
|
||||
nvmlDevice_t nvmlDev;
|
||||
|
||||
@@ -0,0 +1,17 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
|
||||
#ifndef AMDSMI_WRAP_H_
#define AMDSMI_WRAP_H_

#include <cstddef>  // size_t (len parameter of amd_smi_getDevicePciBusIdString)
#include <cstdint>
#include "amd_smi/amdsmi.h"
#include "nccl.h"

// Thin wrappers that report failures as ncclResult_t. Every function returns
// ncclSuccess on success.

// Detects the WSL2 environment and initializes the device-query backend.
ncclResult_t amd_smi_init();

// Shuts the amd-smi library down.
ncclResult_t amd_smi_shutdown();

// Stores the number of devices in *num_devs.
ncclResult_t amd_smi_getNumDevice(uint32_t* num_devs);

// Writes the PCI bus id string of device `deviceIndex` into `pciBusId`,
// a caller-provided buffer of `len` bytes.
ncclResult_t amd_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len);

// Looks up the device whose PCI bus id string is `pciBusId` and stores its
// index in *deviceIndex.
ncclResult_t amd_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex);

// Reports the link between devices `srcDev` and `dstDev`: its type, the hop
// count and the link count.
ncclResult_t amd_smi_getLinkInfo(int srcDev, int dstDev, amdsmi_link_type_t* type, int *hops, int *count);

#endif
|
||||
@@ -118,7 +118,7 @@ void rcclSetP2pNetChunkSize(struct ncclComm* comm, int& rcclP2pNetChunkSize);
|
||||
ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount);
|
||||
ncclResult_t commSetUnrollFactor(struct ncclComm* comm);
|
||||
bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion, int firmwareVersion, const char* archName);
|
||||
int parseFirmwareVersion();
|
||||
int getFirmwareVersion();
|
||||
bool rcclIsArchSupportedForFunc(struct ncclTaskColl* info, char const* archName);
|
||||
#ifdef ENABLE_WARP_SPEED
|
||||
void rcclSetWarpSpeedCUs(struct ncclComm* comm, int algo, int threadsPerBlock, int& rcclWarpSpeedChannels);
|
||||
|
||||
@@ -24,7 +24,7 @@ THE SOFTWARE.
|
||||
#define ROCM_SMI_WRAP_H_
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#ifdef USE_ROCM_SMI64CONFIG
|
||||
#ifdef HAVE_ROCM_SMI64CONFIG
|
||||
#include "rocm_smi/rocm_smi64Config.h"
|
||||
#endif
|
||||
#include "nccl.h"
|
||||
|
||||
@@ -56,7 +56,11 @@
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
#include "mscclpp/mscclpp_nccl.h"
|
||||
#endif
|
||||
#ifdef USE_AMDSMI
|
||||
#include "amdsmi_wrap.h"
|
||||
#else
|
||||
#include "rocm_smi_wrap.h"
|
||||
#endif
|
||||
#include "rccl_common.h"
|
||||
// [/RCCL]
|
||||
|
||||
@@ -197,9 +201,7 @@ ncclResult_t checkHsaEnvSetting() {
|
||||
// hipVer is an integer e.g., 6.2.41133 -> 60241133
|
||||
CUDACHECK(hipRuntimeGetVersion(&hipRuntimeVersion));
|
||||
|
||||
// using rocm-smi API to query FW version, instead of parsing CLI output
|
||||
// will switch to amd-smi API soon
|
||||
const int firmwareVersion = parseFirmwareVersion();
|
||||
const int firmwareVersion = getFirmwareVersion();
|
||||
|
||||
hipDeviceProp_t devProp;
|
||||
// use GPU0 should be good enough
|
||||
@@ -734,9 +736,14 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
|
||||
char busId[]="0000:00:00.0";
|
||||
NCCLCHECK(int64ToBusId(comm->busId, busId));
|
||||
|
||||
#ifdef USE_AMDSMI
|
||||
NCCLCHECK(amd_smi_init());
|
||||
NCCLCHECK(amd_smi_getDeviceIndexByPciBusId(busId, (unsigned int*)&comm->nvmlDev));
|
||||
#else
|
||||
NCCLCHECK(rocm_smi_init());
|
||||
NCCLCHECK(rocm_smi_getDeviceIndexByPciBusId(busId, (unsigned int*)&comm->nvmlDev));
|
||||
|
||||
#endif
|
||||
comm->compCap = ncclCudaCompCap();
|
||||
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap);
|
||||
|
||||
|
||||
@@ -0,0 +1,312 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
|
||||
#include "amdsmi_wrap.h"
|
||||
#include "alt_rsmi.h"
|
||||
#include "core.h"
|
||||
#include "utils.h"
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
static int is_wsl2 = -1;
|
||||
|
||||
// Runs one amd-smi call. On any status other than AMDSMI_STATUS_SUCCESS it
// logs the decoded status text together with the call site and makes the
// enclosing function return ncclInternalError.
// The status text falls back to a fixed string when the lookup itself fails,
// so the log never formats an indeterminate or null pointer with %s.
#define AMDSMICHECK(cmd) do {                                                        \
  amdsmi_status_t ret = cmd;                                                         \
  if( ret != AMDSMI_STATUS_SUCCESS ) {                                               \
    const char *err = nullptr;                                                       \
    if (amdsmi_status_code_to_string(ret, &err) != AMDSMI_STATUS_SUCCESS || !err)    \
      err = "unknown amd-smi status";                                                \
    ERROR("AMD SMI failure: %s at line: %d in file: %s", err, __LINE__, __FILE__);   \
    return ncclInternalError;                                                        \
  }                                                                                  \
} while(false)
|
||||
|
||||
// Runs one call into the internal alternative-rsmi layer. A non-zero return
// code is logged and turned into ncclInternalError for the enclosing function.
#define ARSMICHECK(cmd) do {            \
  const int ret = (cmd);                \
  if (0 != ret) {                       \
    ERROR("ARSMI failure: %d", ret);    \
    return ncclInternalError;           \
  }                                     \
} while(false)
|
||||
|
||||
RCCL_PARAM(UseAmdSmiLib, "USE_AMD_SMI_LIB", 0); // Opt-in environment variable for enabling using amd_smi_lib instead of internal code
|
||||
|
||||
// Initializes the device-query backend.
//
// The first call probes for /dev/dxg and caches the answer in is_wsl2
// (-1 = not probed yet, 0 = absent, 1 = present). When the node is present the
// function returns without initializing any SMI backend. Otherwise it
// initializes either the amd-smi library (when the USE_AMD_SMI_LIB parameter
// is set) or the internal alternative rsmi code.
ncclResult_t amd_smi_init() {
  if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE) == -1) {
    const int dxgMissing = (access("/dev/dxg", F_OK) == -1);
    __atomic_store_n(&is_wsl2, dxgMissing ? 0 : 1, __ATOMIC_RELEASE);
  }

  if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE) != 0) {
    INFO(NCCL_INIT, "Not using amdsmi_lib due to WSL2 environment detected.");
    return ncclSuccess;
  }

  if (!rcclParamUseAmdSmiLib()) {
    // Default path: the internal alternative rsmi implementation.
    ARSMICHECK(ARSMI_init());
    INFO(NCCL_INIT, "initialized internal alternative rsmi functionality");
    return ncclSuccess;
  }

  // Opt-in path: bring up amd-smi for AMD GPUs and log the library version.
  AMDSMICHECK(amdsmi_init(AMDSMI_INIT_AMD_GPUS));

  amdsmi_version_t libVersion;
  AMDSMICHECK(amdsmi_get_lib_version(&libVersion));
  INFO(NCCL_INIT, "amdsmi_lib: version %d.%d.%d.%s", libVersion.major, libVersion.minor, libVersion.release, libVersion.build);
  return ncclSuccess;
}
|
||||
|
||||
// Shuts down the amd-smi library via amdsmi_shut_down(); any failure status
// is logged by AMDSMICHECK and reported as ncclInternalError.
// NOTE(review): this runs unconditionally, while amd_smi_init() only calls
// amdsmi_init() outside WSL2 and when the USE_AMD_SMI_LIB parameter is set --
// confirm that shutting down without a matching init is harmless.
ncclResult_t amd_smi_shutdown() {
|
||||
AMDSMICHECK(amdsmi_shut_down());
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Stores the device count in *num_devs.
//
// Source of the count, in order of precedence:
//   - is_wsl2 non-zero: the runtime's cudaGetDeviceCount;
//   - USE_AMD_SMI_LIB parameter set: amd-smi, by summing the processor handle
//     counts of every socket (this replaces the deprecated
//     rsmi_num_monitor_devices);
//   - otherwise: the internal alternative rsmi code.
ncclResult_t amd_smi_getNumDevice(uint32_t* num_devs) {
  if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) {
    CUDACHECK(cudaGetDeviceCount((int *)num_devs));
    return ncclSuccess;
  }

  if (!rcclParamUseAmdSmiLib()) {
    ARSMICHECK(ARSMI_get_num_devices(num_devs));
    return ncclSuccess;
  }

  // First call sizes the socket list, second call fills it.
  uint32_t nSockets = 0;
  AMDSMICHECK(amdsmi_get_socket_handles(&nSockets, nullptr));
  std::vector<amdsmi_socket_handle> socketHandles(nSockets);
  AMDSMICHECK(amdsmi_get_socket_handles(&nSockets, socketHandles.data()));

  uint32_t gpuTotal = 0;
  for (size_t s = 0; s < socketHandles.size(); ++s) {
    // Same two-step query per socket: count first, then the handles.
    uint32_t perSocket = 0;
    AMDSMICHECK(amdsmi_get_processor_handles(socketHandles[s], &perSocket, nullptr));
    std::vector<amdsmi_processor_handle> handles(perSocket);
    AMDSMICHECK(amdsmi_get_processor_handles(socketHandles[s], &perSocket, handles.data()));
    gpuTotal += perSocket;
  }

  *num_devs = gpuTotal;
  return ncclSuccess;
}
|
||||
|
||||
// Writes the PCI bus id string ("dddd:bb:dd.f") of device `deviceIndex` into
// `busId`, a caller-provided buffer of `len` bytes.
//
// When is_wsl2 is non-zero the string comes from cudaDeviceGetPCIBusId.
// Otherwise a 64-bit id is fetched either from amd-smi (USE_AMD_SMI_LIB
// parameter set) or from the internal alternative rsmi code, and formatted.
//
// Returns ncclInternalError when a backend call fails or when amd-smi knows no
// GPU whose hip_id equals `deviceIndex`.
ncclResult_t amd_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* busId, size_t len) {
  if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) {
    CUDACHECK(cudaDeviceGetPCIBusId(busId, len, deviceIndex));
    return ncclSuccess;
  }

  // Single id variable for both backends. The amd-smi loop used to declare a
  // second `id` that shadowed this one, so the value formatted below was
  // never the one that had been queried.
  uint64_t id = 0;

  if (rcclParamUseAmdSmiLib()) {
    // rsmi_dev_pci_id_get is deprecated.
    // With amd-smi: list the socket handles, list the processor handles of
    // each socket, then query the BDF of the GPU whose hip_id matches
    // deviceIndex.
    bool found = false;

    uint32_t socket_count = 0;
    AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, nullptr));
    std::vector<amdsmi_socket_handle> sockets(socket_count);
    AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, sockets.data()));

    for (auto& socket : sockets) {
      uint32_t processor_handle_count = 0;
      AMDSMICHECK(amdsmi_get_processor_handles(socket, &processor_handle_count, nullptr));
      std::vector<amdsmi_processor_handle> processor_handles(processor_handle_count);
      AMDSMICHECK(amdsmi_get_processor_handles(socket, &processor_handle_count, processor_handles.data()));

      // Workaround: amdsmi_get_processor_handles_by_type was tried here and
      // left disabled ("this does not work?"), so GPUs are picked out by
      // asking each handle for its type.
      for (auto& proc : processor_handles) {
        processor_type_t type;
        AMDSMICHECK(amdsmi_get_processor_type(proc, &type));
        if (type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) continue;

        amdsmi_enumeration_info_t info;
        AMDSMICHECK(amdsmi_get_gpu_enumeration_info(proc, &info));
        if (info.hip_id == deviceIndex) {
          AMDSMICHECK(amdsmi_get_gpu_bdf_id(proc, &id));
          found = true;
          break;
        }
      }
      if (found) break;  // no need to scan the remaining sockets
    }

    if (!found) {
      ERROR("amdsmi_lib: no GPU found for device index %u", deviceIndex);
      return ncclInternalError;
    }
  } else {
    ARSMICHECK(ARSMI_dev_pci_id_get(deviceIndex, &id));
  }

  // The id is decoded with the layout of utils.cc:int64ToBusId
  // (function [3:0], device [11:4], bus [19:12], domain above), not with the
  // rocm-smi/amd-smi BDF layout (function [2:0], device [7:3], bus [15:8]).
  // !! To be reconciled after discussion with amdsmi team !!
  snprintf(busId, len, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
  return ncclSuccess;
}
|
||||
|
||||
|
||||
// Finds the device whose PCI bus id string is `pciBusId` and stores its index
// in *deviceIndex.
//
// When is_wsl2 is non-zero the lookup is delegated to hipDeviceGetByPCIBusId.
// Otherwise the string is converted with busIdToInt64 and resolved either
// through amd-smi (USE_AMD_SMI_LIB parameter set) or by scanning the devices
// of the internal alternative rsmi code.
//
// Returns ncclInternalError when a backend call fails or no device matches.
ncclResult_t amd_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex) {
  if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) {
    CUDACHECK(hipDeviceGetByPCIBusId((int *)deviceIndex, pciBusId));
    return ncclSuccess;
  }

  int64_t busid;
  busIdToInt64(pciBusId, &busid);

  if (rcclParamUseAmdSmiLib()) {
    // Instead of getting the device count and comparing the bus id against
    // every GPU's BDF, amd-smi can map a BDF straight to a processor handle
    // whose enumeration info carries the index.
    amdsmi_processor_handle processor_handle = 0;

    // busid is unpacked with the layout of utils.cc:int64ToBusId:
    // function [3:0], device [11:4], bus [19:12], domain from bit 20 up.
    // amd-smi's own packed layout (function [2:0], device [7:3], bus [15:8],
    // domain [63:16]) is incompatible with that and is not used here.
    // !! To be reconciled after discussion with amdsmi team !!
    amdsmi_bdf_t bdf = {};
    bdf.function_number = (busid & 0xf);
    // Mask 0xff0 selects the whole device field; the previous 0xff mask kept
    // only bits [7:4] and lost the upper part of the device number.
    bdf.device_number = (busid & 0xff0) >> 4;
    bdf.bus_number = (busid & 0xff000) >> 12;
    bdf.domain_number = busid >> 20;

    AMDSMICHECK(amdsmi_get_processor_handle_from_bdf(bdf, &processor_handle));

    processor_type_t type;
    AMDSMICHECK(amdsmi_get_processor_type(processor_handle, &type));
    if(type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
      amdsmi_enumeration_info_t info;
      AMDSMICHECK(amdsmi_get_gpu_enumeration_info(processor_handle, &info));
      *deviceIndex = info.hip_id;
      return ncclSuccess;
    }

    ERROR("amdsmi_lib: %s device index not found", pciBusId);
    return ncclInternalError;
  }

  // Internal alternative rsmi path: repack the id into the layout compared
  // against ARSMI_dev_pci_id_get (function [2:0], device from bit 3, bus from
  // bit 8, domain from bit 32), then scan all devices for a match.
  uint32_t i, num_devs = 0;
  busid = ((busid&0xffff00000L)<<12)+((busid&0xff000L)>>4)+((busid&0xff0L)>>1)+(busid&0x7L);

  ARSMICHECK(ARSMI_get_num_devices(&num_devs));
  for (i = 0; i < num_devs; i++) {
    uint64_t bdfid;
    ARSMICHECK(ARSMI_dev_pci_id_get(i, &bdfid));
    if (bdfid == busid) break;
  }
  if (i < num_devs) {
    *deviceIndex = i;
    return ncclSuccess;
  }

  WARN("ARSMI_lib: %s device index not found", pciBusId);
  return ncclInternalError;
}
|
||||
|
||||
// Reports the link between devices `srcIndex` and `dstIndex`.
//
//   *type  - link type
//   *hops  - hop count
//   *count - link count, derived as max_bandwidth / min_bandwidth for XGMI
//            links when both values are non-zero, otherwise 1
//
// When is_wsl2 is non-zero a single one-hop PCIe link is reported. Otherwise
// the data comes from amd-smi (USE_AMD_SMI_LIB parameter set) or from the
// internal alternative rsmi code.
//
// Returns ncclInternalError when a backend call fails or when amd-smi has no
// GPU for one of the two indices.
ncclResult_t amd_smi_getLinkInfo(int srcIndex, int dstIndex, amdsmi_link_type_t* type, int *hops, int *count) {
  if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) {
    *type = AMDSMI_LINK_TYPE_PCIE;
    *hops = 1;
    *count = 1;
    return ncclSuccess;
  }

  // Every output gets a defined value up front. The alternative rsmi branch
  // only writes *hops for XGMI links with a recognized weight, and callers
  // read it unconditionally. Any value other than 1 keeps such a link from
  // being taken for a direct one.
  // NOTE(review): 2 presumably mirrors the rocm-smi wrapper's default -- confirm.
  *hops = 2;
  *count = 1;

  if (rcclParamUseAmdSmiLib()) {
    // rsmi_minmax_bandwidth_get is replaced by
    // amdsmi_get_minmax_bandwidth_between_processors, which takes processor
    // handles instead of indices. So: list the socket handles, list the
    // processor handles of each socket, pick the handles whose hip_id matches
    // the src and dst index, then use them for the topology queries.
    amdsmi_link_type_t amdsmi_type;
    uint64_t amdsmi_hops = 1, amdsmi_weight;
    uint32_t socket_count = 0;
    amdsmi_processor_handle src_processor_handle = 0;
    amdsmi_processor_handle dst_processor_handle = 0;
    bool found_src = false, found_dst = false;

    AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, nullptr));
    std::vector<amdsmi_socket_handle> sockets(socket_count);
    AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, sockets.data()));

    for (auto& socket : sockets) {
      uint32_t processor_handle_count = 0;
      AMDSMICHECK(amdsmi_get_processor_handles(socket, &processor_handle_count, nullptr));
      std::vector<amdsmi_processor_handle> processor_handles(processor_handle_count);
      AMDSMICHECK(amdsmi_get_processor_handles(socket, &processor_handle_count, processor_handles.data()));

      // Workaround: amdsmi_get_processor_handles_by_type was tried here and
      // left disabled ("this does not work?"), so GPUs are picked out by
      // asking each handle for its type.
      for (auto& proc : processor_handles) {
        processor_type_t proc_type;
        AMDSMICHECK(amdsmi_get_processor_type(proc, &proc_type));
        if (proc_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) continue;

        amdsmi_enumeration_info_t info;
        AMDSMICHECK(amdsmi_get_gpu_enumeration_info(proc, &info));
        if (info.hip_id == (uint32_t)srcIndex) {
          src_processor_handle = proc;
          found_src = true;
        }
        if (info.hip_id == (uint32_t)dstIndex) {
          dst_processor_handle = proc;
          found_dst = true;
        }
      }
    }

    // Stop here instead of handing a null handle to the topology queries.
    if (!found_src) ERROR("amd-smi could not find processor handle for srcIndex: %d", srcIndex);
    if (!found_dst) ERROR("amd-smi could not find processor handle for dstIndex: %d", dstIndex);
    if (!found_src || !found_dst) return ncclInternalError;

    AMDSMICHECK(amdsmi_topo_get_link_type(src_processor_handle, dst_processor_handle, &amdsmi_hops, &amdsmi_type));
    // The weight is queried for its status only; the value is not used
    // (original note: "amd-smi reports weight=0 for XGMI ??").
    AMDSMICHECK(amdsmi_topo_get_link_weight(src_processor_handle, dst_processor_handle, &amdsmi_weight));

    if (amdsmi_type == AMDSMI_LINK_TYPE_XGMI) {
      uint64_t min_bw = 0, max_bw = 0;
      AMDSMICHECK(amdsmi_get_minmax_bandwidth_between_processors(src_processor_handle, dst_processor_handle, &min_bw, &max_bw));
      if (max_bw && min_bw) *count = max_bw/min_bw;
    }

    *type = amdsmi_type;
    *hops = amdsmi_hops;
  } else {
    ARSMI_linkInfo tinfo;
    ARSMICHECK(ARSMI_topo_get_link_info(srcIndex, dstIndex, &tinfo));

    *type = (amdsmi_link_type_t) tinfo.type;
    // Only XGMI links with one of these weights are reported as one hop.
    if (*type == AMDSMI_LINK_TYPE_XGMI && (tinfo.weight == 15 ||
        tinfo.weight == 41 || tinfo.weight == 13)) {
      *hops = 1;
      if (tinfo.max_bandwidth && tinfo.min_bandwidth)
        *count = tinfo.max_bandwidth/tinfo.min_bandwidth;
    }
  }

  return ncclSuccess;
}
|
||||
@@ -176,7 +176,7 @@ ncclResult_t rocm_smi_getLinkInfo(int srcIndex, int dstIndex, RSMI_IO_LINK_TYPE*
|
||||
rsmi_weight == 41 || rsmi_weight == 13)) {
|
||||
uint64_t min_bw = 0, max_bw = 0;
|
||||
*hops = 1;
|
||||
#if defined USE_ROCM_SMI64CONFIG && rocm_smi_VERSION_MAJOR >= 5
|
||||
#if defined HAVE_ROCM_SMI64CONFIG && rocm_smi_VERSION_MAJOR >= 5
|
||||
rsmi_version_t version;
|
||||
ROCMSMICHECK(rsmi_version_get(&version));
|
||||
if (version.major >= 5)
|
||||
|
||||
@@ -24,8 +24,15 @@ THE SOFTWARE.
|
||||
#include "comm.h"
|
||||
#include "graph/topo.h"
|
||||
#include "enqueue.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include <algorithm>
|
||||
#include "debug.h"
|
||||
|
||||
#ifdef USE_AMDSMI
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#else
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#endif
|
||||
|
||||
// Use this param to experiment pipelining new data types besides bfloat16
|
||||
// Make sure you generate the device code with the new data type (i.e. in generate.py)
|
||||
RCCL_PARAM(PipelineAllDTypes, "PIPELINE_ALL_DATA_TYPES", 0);
|
||||
@@ -642,60 +649,54 @@ ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
std::string trimString(const std::string& s) {
|
||||
int sz = s.size();
|
||||
int b = 0;
|
||||
int e = sz - 1;
|
||||
while (b < sz && isspace(s[b])) {
|
||||
b++;
|
||||
}
|
||||
if (b >= sz) {
|
||||
return "";
|
||||
}
|
||||
|
||||
while (e >= b && e < sz && isspace(s[e])) {
|
||||
e--;
|
||||
}
|
||||
if (b > e) {
|
||||
return "";
|
||||
}
|
||||
return s.substr(b, e - b + 1);
|
||||
}
|
||||
|
||||
std::vector<std::string> splitString(const std::string& s, char delimiter) {
|
||||
std::vector<std::string> tokens;
|
||||
std::stringstream ss(s);
|
||||
std::string token;
|
||||
|
||||
while (std::getline(ss, token, delimiter)) {
|
||||
tokens.push_back(trimString(token));
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
int parseFirmwareVersionImpl() {
|
||||
int getFirmwareVersion() {
|
||||
uint64_t fw_version = -1;
|
||||
|
||||
// using rocm-smi APIs for now to query MEC FW version
|
||||
// will switch to amd-smi APIs soon
|
||||
#ifdef USE_AMDSMI
|
||||
amdsmi_status_t ret;
|
||||
ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
ERROR("Could not initialize amd-smi");
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint32_t socket_count = 0;
|
||||
amdsmi_get_socket_handles(&socket_count, nullptr);
|
||||
std::vector<amdsmi_socket_handle> sockets(socket_count);
|
||||
amdsmi_get_socket_handles(&socket_count, sockets.data());
|
||||
|
||||
uint32_t num_gpus_per_socket = 0;
|
||||
amdsmi_get_processor_handles(sockets[0], &num_gpus_per_socket, nullptr);
|
||||
std::vector<amdsmi_processor_handle> processor_handles(num_gpus_per_socket);
|
||||
amdsmi_get_processor_handles(sockets[0], &num_gpus_per_socket, processor_handles.data());
|
||||
|
||||
amdsmi_fw_info_t info;
|
||||
ret = amdsmi_get_fw_info(processor_handles[0], &info);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
ERROR("Could not query firmware info using amd-smi");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fw_version = info.fw_info_list[0].fw_version;
|
||||
|
||||
#else
|
||||
rsmi_status_t ret;
|
||||
ret = rsmi_init(0);
|
||||
if (ret != RSMI_STATUS_SUCCESS) return -1;
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
ERROR("Could not initialize rocm-smi");
|
||||
return -1;
|
||||
}
|
||||
|
||||
ret = rsmi_dev_firmware_version_get(0, RSMI_FW_BLOCK_MEC, &fw_version);
|
||||
if (ret != RSMI_STATUS_SUCCESS) return -1;
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
ERROR("Could not query firmware info using rocm-smi");
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
return fw_version;
|
||||
}
|
||||
|
||||
int parseFirmwareVersion() {
|
||||
int version = -1;
|
||||
try {
|
||||
version = parseFirmwareVersionImpl();
|
||||
} catch (const std::exception& ex) {
|
||||
}
|
||||
return version;
|
||||
}
|
||||
|
||||
bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion, int firmwareVersion, char const* archName) {
|
||||
bool hsaScratchEnvSet = (hsaScratchEnv && strcmp(hsaScratchEnv,"1") == 0);
|
||||
if (hsaScratchEnvSet) {
|
||||
|
||||
Ссылка в новой задаче
Block a user