diff --git a/CMakeLists.txt b/CMakeLists.txt index 1576dba521..22fda5913d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -261,26 +261,56 @@ find_package(hsa-runtime64 REQUIRED) get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCLUDE_DIRECTORIES) message(STATUS "HSA runtime: ${HSA_INCLUDE_PATH}") -## Check for ROCM-smi -find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi) -if (rocm_smi_FOUND) - message(STATUS "Found rocm_smi at ${ROCM_SMI_INCLUDE_DIR}") -else() - message(STATUS "Checking old include directory structure for rocm_smi") - set(ROCM_SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include") - set(ROCM_SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib") - set(ROCM_SMI_LIBRARIES rocm_smi64) +## Check for amd-smi if ROCm 7.11.0 or newer +if(ROCM_VERSION VERSION_GREATER_EQUAL "71100") + find_package(amd_smi PATHS ${ROCM_PATH}/lib/cmake/amd_smi) + if(amd_smi_FOUND) + message(STATUS "amd_smi_INCLUDE_DIR: ${amd_smi_INCLUDE_DIR}") + message(STATUS "amd_smi_LIB_DIR: ${amd_smi_LIB_DIR}") + set(SMI_INCLUDE_DIR "${amd_smi_INCLUDE_DIR}" CACHE INTERNAL "amd-smi include directory") + set(SMI_LIB_DIR "${amd_smi_LIB_DIR}" CACHE INTERNAL "amd-smi library directory") + set(SMI_LIB_NAME "amd-smi-lib" CACHE INTERNAL "amd-smi-lib for packaging") + if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}") + message(FATAL_ERROR "amd_smi not found in ${SMI_INCLUDE_DIR}") + endif() + message(STATUS "Found amd_smi at ${SMI_INCLUDE_DIR}") + set(SMI_LIBRARIES amd_smi) + set(USE_AMDSMI ON) # plain variable, re-derived on every configure: a stale cached ON must not skip the rocm-smi fallback + endif() +endif() + +if(NOT USE_AMDSMI) + ## Fallback to rocm-smi if amd-smi not found or ROCm < 7.11.0 + message(STATUS "amd_smi not in use (not found, or ROCm older than 7.11.0). Falling back to rocm_smi.")
+ find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi) + if(rocm_smi_FOUND) + set(SMI_INCLUDE_DIR "${rocm_smi_INCLUDE_DIR}" CACHE INTERNAL "rocm-smi include directory") + set(SMI_LIB_DIR "${rocm_smi_LIB_DIR}" CACHE INTERNAL "rocm-smi library directory") + else() + message(WARNING "CMake could not find rocm-smi. Checking old include directory structure for rocm_smi") + set(SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include") + set(SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib") + endif() + + if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}") + message(FATAL_ERROR "rocm_smi not found in ${SMI_INCLUDE_DIR}") + endif() + message(STATUS "Found rocm_smi at ${SMI_INCLUDE_DIR}") + set(SMI_LIB_NAME "rocm-smi-lib" CACHE INTERNAL "rocm-smi-lib for packaging") + set(SMI_LIBRARIES rocm_smi64) + + check_include_file_cxx("${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG) + + ### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support + file(READ "${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl) + string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres) + if(matchres EQUAL -1) + message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported") + else() + message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported") + set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True) + endif () endif() -check_include_file_cxx("${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG) -### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support -file(READ "${ROCM_SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl) -string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres) -if(${matchres} EQUAL -1) - message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported") -else() - message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported") - set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True) -endif () ## Check for BFD library if custom backtrace is requested if(BUILD_BFD) @@ -570,7 +600,6 @@ set(SRC_FILES 
src/include/register.h src/include/register_inline.h src/include/rccl_float8.h - src/include/rocm_smi_wrap.h src/include/rocmwrap.h src/include/roctx.h src/include/recorder.h @@ -694,7 +723,6 @@ set(SRC_FILES # src/misc/nvmlwrap.cc src/misc/nvmlwrap_stub.cc src/misc/param.cc - src/misc/rocm_smi_wrap.cc src/misc/rocmwrap.cc src/misc/roctx.cc src/misc/recorder.cc @@ -762,6 +790,19 @@ set(SRC_FILES src/misc/latency_profiler/CollTraceUtils.cc ) +if(USE_AMDSMI) + set(SMI_SOURCES + src/include/amdsmi_wrap.h + src/misc/amdsmi_wrap.cc + ) +else() + set(SMI_SOURCES + src/include/rocm_smi_wrap.h + src/misc/rocm_smi_wrap.cc + ) +endif() +list(APPEND SRC_FILES ${SMI_SOURCES}) + if (ENABLE_MSCCL_KERNEL) set(MSCCL_KERNEL_SOURCES src/device/msccl_kernel_impl.h @@ -939,11 +980,16 @@ endif() if(ENABLE_MSCCLPP) target_compile_definitions(rccl PRIVATE ENABLE_MSCCLPP) endif() -if(HAVE_ROCM_SMI64CONFIG) - target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG) -endif() -if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX) - target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX) + +if(USE_AMDSMI) + target_compile_definitions(rccl PRIVATE USE_AMDSMI) +else() + if(HAVE_ROCM_SMI64CONFIG) + target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG) + endif() + if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX) + target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX) + endif() endif() if(ENABLE_WARP_SPEED) target_compile_definitions(rccl PRIVATE ENABLE_WARP_SPEED) @@ -1265,7 +1311,7 @@ if (FAULT_INJECTION) endif() ## Set RCCL linked library directories -target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR}) +target_link_directories(rccl PRIVATE ${SMI_LIB_DIR}) if (ROCM_VERSION VERSION_GREATER_EQUAL "60100") option(RCCL_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON) @@ -1297,7 +1343,7 @@ target_link_libraries(rccl PRIVATE Threads::Threads) target_link_libraries(rccl INTERFACE hip::host) target_link_libraries(rccl PRIVATE hip::device) target_link_libraries(rccl 
PRIVATE dl) -target_link_libraries(rccl PRIVATE ${ROCM_SMI_LIBRARIES}) +target_link_libraries(rccl PRIVATE ${SMI_LIBRARIES}) target_link_libraries(rccl PRIVATE fmt::fmt-header-only) if(ENABLE_MSCCLPP) target_link_libraries(rccl PRIVATE mscclpp_nccl) @@ -1417,7 +1463,8 @@ if(BUILD_ADDRESS_SANITIZER) else() set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" ) endif() -rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "rocm-smi-lib >= 4.0.0") + +rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "${SMI_LIB_NAME}") set(CPACK_DEB_COMPONENT_INSTALL ON) set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON) set(CPACK_RPM_COMPONENT_INSTALL ON) diff --git a/src/graph/xml.cc b/src/graph/xml.cc index dcdb64a08a..ecf4d7dc60 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -14,7 +14,11 @@ #include "core.h" #include "nvmlwrap.h" #include "xml.h" +#ifdef USE_AMDSMI +#include "amdsmi_wrap.h" +#else #include "rocm_smi_wrap.h" +#endif #include "archinfo.h" #if defined(__x86_64__) #include @@ -819,6 +823,32 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t rocmDev const char* busId; NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); uint32_t deviceCnt; +#ifdef USE_AMDSMI + NCCLCHECK(amd_smi_getNumDevice(&deviceCnt)); + for (int i=0; i +#include "amd_smi/amdsmi.h" +#include "nccl.h" + +ncclResult_t amd_smi_init(); +ncclResult_t amd_smi_shutdown(); +ncclResult_t amd_smi_getNumDevice(uint32_t* num_devs); +ncclResult_t amd_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len); +ncclResult_t amd_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex); +ncclResult_t amd_smi_getLinkInfo(int srcDev, int dstDev, amdsmi_link_type_t* type, int *hops, int *count); + +#endif diff --git a/src/include/rccl_common.h b/src/include/rccl_common.h index e0409e2368..435ddbe0d3 100644 --- a/src/include/rccl_common.h +++ b/src/include/rccl_common.h @@ -118,7 +118,7 @@ void rcclSetP2pNetChunkSize(struct ncclComm* 
comm, int& rcclP2pNetChunkSize); ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount); ncclResult_t commSetUnrollFactor(struct ncclComm* comm); bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion, int firmwareVersion, const char* archName); -int parseFirmwareVersion(); +int getFirmwareVersion(); bool rcclIsArchSupportedForFunc(struct ncclTaskColl* info, char const* archName); #ifdef ENABLE_WARP_SPEED void rcclSetWarpSpeedCUs(struct ncclComm* comm, int algo, int threadsPerBlock, int& rcclWarpSpeedChannels); diff --git a/src/include/rocm_smi_wrap.h b/src/include/rocm_smi_wrap.h index 417e280a83..87b1f2a72f 100644 --- a/src/include/rocm_smi_wrap.h +++ b/src/include/rocm_smi_wrap.h @@ -24,7 +24,7 @@ THE SOFTWARE. #define ROCM_SMI_WRAP_H_ #include "rocm_smi/rocm_smi.h" -#ifdef USE_ROCM_SMI64CONFIG +#if defined(USE_ROCM_SMI64CONFIG) || defined(HAVE_ROCM_SMI64CONFIG) /* the build passes USE_ROCM_SMI64CONFIG; HAVE_ROCM_SMI64CONFIG is only the CMake-side check result */ #include "rocm_smi/rocm_smi64Config.h" #endif #include "nccl.h" diff --git a/src/init.cc b/src/init.cc index e56d3b0e2d..def51b40ea 100644 --- a/src/init.cc +++ b/src/init.cc @@ -56,7 +56,11 @@ #ifdef ENABLE_MSCCLPP #include "mscclpp/mscclpp_nccl.h" #endif +#ifdef USE_AMDSMI +#include "amdsmi_wrap.h" +#else #include "rocm_smi_wrap.h" +#endif #include "rccl_common.h" // [/RCCL] @@ -197,9 +201,7 @@ ncclResult_t checkHsaEnvSetting() { // hipVer is an integer e.g., 6.2.41133 -> 60241133 CUDACHECK(hipRuntimeGetVersion(&hipRuntimeVersion)); - // using rocm-smi API to query FW version, instead of parsing CLI output - // will switch to amd-smi API soon - const int firmwareVersion = parseFirmwareVersion(); + const int firmwareVersion = getFirmwareVersion(); hipDeviceProp_t devProp; // use GPU0 should be good enough @@ -734,9 +736,14 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); char busId[]="0000:00:00.0"; NCCLCHECK(int64ToBusId(comm->busId, busId)); + +#ifdef USE_AMDSMI + 
NCCLCHECK(amd_smi_init()); + NCCLCHECK(amd_smi_getDeviceIndexByPciBusId(busId, (unsigned int*)&comm->nvmlDev)); +#else NCCLCHECK(rocm_smi_init()); NCCLCHECK(rocm_smi_getDeviceIndexByPciBusId(busId, (unsigned int*)&comm->nvmlDev)); - +#endif comm->compCap = ncclCudaCompCap(); TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx compCap %d", comm, rank, ndev, comm->cudaDev, comm->busId, comm->compCap); diff --git a/src/misc/amdsmi_wrap.cc b/src/misc/amdsmi_wrap.cc new file mode 100644 index 0000000000..e2d80410de --- /dev/null +++ b/src/misc/amdsmi_wrap.cc @@ -0,0 +1,312 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. + +#include "amdsmi_wrap.h" +#include "alt_rsmi.h" +#include "core.h" +#include "utils.h" +#include +#include +#include + +static int is_wsl2 = -1; + +#define AMDSMICHECK(cmd) do { \ + amdsmi_status_t ret = cmd; \ + if( ret != AMDSMI_STATUS_SUCCESS ) { \ + const char *err; \ + amdsmi_status_code_to_string(ret, &err); \ + ERROR("AMD SMI failure: %s at line: %d in file: %s", err, __LINE__, __FILE__); \ + return ncclInternalError; \ + } \ +} while(false) + +#define ARSMICHECK(cmd) do { \ + int ret = cmd; \ + if( ret != 0 ) { \ + ERROR("ARSMI failure: %d", ret); \ + return ncclInternalError; \ + } \ +} while(false) + +RCCL_PARAM(UseAmdSmiLib, "USE_AMD_SMI_LIB", 0); // Opt-in environment variable for enabling using amd_smi_lib instead of internal code + +ncclResult_t amd_smi_init() { + if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE) == -1) + __atomic_store_n(&is_wsl2, (access("/dev/dxg", F_OK) == -1) ? 
0 : 1, __ATOMIC_RELEASE); + if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) { + INFO(NCCL_INIT, "Not using amdsmi_lib due to WSL2 environment detected."); + return ncclSuccess; + } + + if (rcclParamUseAmdSmiLib()) { + // initialize amd-smi for AMD GPUs + AMDSMICHECK(amdsmi_init(AMDSMI_INIT_AMD_GPUS)); + + // get amd-smi version + amdsmi_version_t version; + AMDSMICHECK(amdsmi_get_lib_version(&version)); + INFO(NCCL_INIT, "amdsmi_lib: version %d.%d.%d.%s", version.major, version.minor, version.release, version.build); + } else { + // initialize alternate rsmi + ARSMICHECK(ARSMI_init()); + INFO(NCCL_INIT, "initialized internal alternative rsmi functionality"); + } + return ncclSuccess; +} + +ncclResult_t amd_smi_shutdown() { + if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE) == 0 && rcclParamUseAmdSmiLib()) AMDSMICHECK(amdsmi_shut_down()); // mirror amd_smi_init: amdsmi_init only runs on the non-WSL2, USE_AMD_SMI_LIB path + return ncclSuccess; +} + +ncclResult_t amd_smi_getNumDevice(uint32_t* num_devs) { + if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) + CUDACHECK(cudaGetDeviceCount((int *)num_devs)); + else { + if (rcclParamUseAmdSmiLib()) { + // rsmi_num_monitor_devices is deprecated + + // with amd-smi, first get list of socket handles, + // then get number of processor handles in said sockets, + // and then query no. 
of gpus in said processor handles + uint32_t socket_count = 0; + AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, nullptr)); + std::vector<amdsmi_socket_handle> sockets(socket_count); + AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, sockets.data())); + + uint32_t total_gpus = 0; + for (auto& socket : sockets) { + uint32_t num_gpus_per_socket = 0; + AMDSMICHECK(amdsmi_get_processor_handles(socket, &num_gpus_per_socket, nullptr)); + std::vector<amdsmi_processor_handle> processor_handles(num_gpus_per_socket); + AMDSMICHECK(amdsmi_get_processor_handles(socket, &num_gpus_per_socket, processor_handles.data())); + total_gpus += num_gpus_per_socket; + } + *num_devs = total_gpus; + } else { + ARSMICHECK(ARSMI_get_num_devices(num_devs)); + } + } + return ncclSuccess; +} + +ncclResult_t amd_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* busId, size_t len) { + uint64_t id = 0; // stays 0 when no GPU matches deviceIndex, instead of formatting an indeterminate value + if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) { + CUDACHECK(cudaDeviceGetPCIBusId(busId, len, deviceIndex)); + } else { + /** amd-smi's bus ID format + * | Name | Field | + * ------------- | ------- | + * | Domain | [63:32] | + * | Bus | [15: 8] | + * | Device | [ 7: 3] | + * | Function | [ 2: 0] | + **/ + if (rcclParamUseAmdSmiLib()) { + // rsmi_dev_pci_id_get is deprecated + + /// with amd-smi, first get list of socket handles, + // then get number of processor handles in said sockets, + // and then query the BDF for GPU matching deviceIndex in said processor handles + uint32_t socket_count = 0; + AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, nullptr)); + std::vector<amdsmi_socket_handle> sockets(socket_count); + AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, sockets.data())); + + for (auto& socket : sockets) { + uint32_t processor_handle_count = 0; + AMDSMICHECK(amdsmi_get_processor_handles(socket, &processor_handle_count, nullptr)); + std::vector<amdsmi_processor_handle> processor_handles(processor_handle_count); + AMDSMICHECK(amdsmi_get_processor_handles(socket, &processor_handle_count, processor_handles.data())); + + // this does not work? 
+ // AMDSMICHECK(amdsmi_get_processor_handles_by_type(socket, AMDSMI_PROCESSOR_TYPE_AMD_GPU, nullptr, &num_gpus_per_socket)); + + // workaround + for (auto& proc : processor_handles) { + processor_type_t type; + // no local 'id' here: it would shadow the function-level 'id' that snprintf formats below + + AMDSMICHECK(amdsmi_get_processor_type(proc, &type)); + if(type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) { + amdsmi_enumeration_info_t info; + AMDSMICHECK(amdsmi_get_gpu_enumeration_info(proc, &info)); + if(info.hip_id == deviceIndex) { + AMDSMICHECK(amdsmi_get_gpu_bdf_id(proc, &id)); + break; + } + } + } + } + } else { + ARSMICHECK(ARSMI_dev_pci_id_get(deviceIndex, &id)); + } + + // 'id' is in the SMI BDFID layout (domain from bit 32, bus [15:8], device [7:3], function [2:0]): + // that is the layout amd_smi_getDeviceIndexByPciBusId converts to before comparing with ARSMI_dev_pci_id_get. + + // It must not be decoded with the nibble layout of NCCL's utils.cc:int64ToBusId. + // NOTE(review): amdsmi_get_gpu_bdf_id is assumed to use the same field positions - confirm with the amdsmi team. + snprintf(busId, len, "%04lx:%02lx:%02lx.%01lx", (id) >> 32, (id & 0xff00) >> 8, (id & 0xf8) >> 3, (id & 0x7)); + } + return ncclSuccess; +} + + +ncclResult_t amd_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex) { + if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) { + CUDACHECK(hipDeviceGetByPCIBusId((int *)deviceIndex, pciBusId)); + return ncclSuccess; + } else { + int64_t busid; + + busIdToInt64(pciBusId, &busid); + /** convert to amd-smi's bus ID format + * | Name | Field | + * ------------- | ------- | + * | Domain | [63:16] | + * | Bus | [15: 8] | + * | Device | [ 7: 3] | + * | Function | [ 2: 0] | + **/ + + // instead of getting device count and then comparing the busid to each GPUs BDF + + // with amd-smi, we can use amdsmi_get_processor_handle_from_bdf, + // and then query the enumeration info for that processor_handle + if (rcclParamUseAmdSmiLib()) { + amdsmi_processor_handle processor_handle = 0; + + amdsmi_bdf_t bdf = {}; + // This is the format that matches amd-smi BDF + // bdf.function_number = (busid & 0x7); + // bdf.device_number = (busid & 0xf8) >> 3; + // 
bdf.bus_number = (busid & 0xff00) >> 8; + // bdf.domain_number = (busid & 0xffffffffffff0000) >> 16; + + // busid comes from busIdToInt64, so its fields sit at the nibble positions the ARSMI branch below also reads: + // function [3:0], device [11:4], bus [19:12], domain from bit 20 - decode those into the amd-smi BDF struct. + bdf.function_number = (busid & 0xf); + bdf.device_number = (busid & 0xff0) >> 4; + bdf.bus_number = (busid & 0xff000) >> 12; + bdf.domain_number = busid >> 20; + + AMDSMICHECK(amdsmi_get_processor_handle_from_bdf(bdf, &processor_handle)); + + processor_type_t type; + AMDSMICHECK(amdsmi_get_processor_type(processor_handle, &type)); + if(type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) { + amdsmi_enumeration_info_t info; + AMDSMICHECK(amdsmi_get_gpu_enumeration_info(processor_handle, &info)); + *deviceIndex = info.hip_id; + return ncclSuccess; + } + + ERROR("amdsmi_lib: %s device index not found", pciBusId); + } else { + uint32_t i, num_devs = 0; + busid = ((busid&0xffff00000L)<<12)+((busid&0xff000L)>>4)+((busid&0xff0L)>>1)+(busid&0x7L); + + ARSMICHECK(ARSMI_get_num_devices(&num_devs)); + for (i = 0; i < num_devs; i++) { + uint64_t bdfid; + ARSMICHECK(ARSMI_dev_pci_id_get(i, &bdfid)); + if (bdfid == busid) break; + } + if (i < num_devs) { + *deviceIndex = i; + return ncclSuccess; + } + else { + WARN("ARSMI_lib: %s device index not found", pciBusId); + } + } + return ncclInternalError; + } +} + +ncclResult_t amd_smi_getLinkInfo(int srcIndex, int dstIndex, amdsmi_link_type_t* type, int *hops, int *count) { + if (__atomic_load_n(&is_wsl2, __ATOMIC_ACQUIRE)) { + *type = AMDSMI_LINK_TYPE_PCIE; + *hops = 1; + *count = 1; + } else { + amdsmi_link_type_t amdsmi_type; + uint64_t amdsmi_hops = 1, amdsmi_weight ; + *count = 1; + + // rsmi_minmax_bandwidth_get is replaced by amdsmi_get_minmax_bandwidth_between_processors + // where the arguments for src and dst change from index to processor_handles + + // with amd-smi, first get list of socket handles, + // then get number of processor handles in said sockets, + // then get the 
processor handle matching the src and dst index, + // and then use these processor handles for amdsmi hardware topology functions + if (rcclParamUseAmdSmiLib()) { + uint32_t socket_count = 0; + amdsmi_processor_handle src_processor_handle = 0; + amdsmi_processor_handle dst_processor_handle = 0; + bool found_src = false, found_dst = false; + + AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, nullptr)); + std::vector<amdsmi_socket_handle> sockets(socket_count); + AMDSMICHECK(amdsmi_get_socket_handles(&socket_count, sockets.data())); + + for (auto& socket : sockets) { + uint32_t processor_handle_count = 0; + AMDSMICHECK(amdsmi_get_processor_handles(socket, &processor_handle_count, nullptr)); + std::vector<amdsmi_processor_handle> processor_handles(processor_handle_count); + AMDSMICHECK(amdsmi_get_processor_handles(socket, &processor_handle_count, processor_handles.data())); + + // this does not work? + // AMDSMICHECK(amdsmi_get_processor_handles_by_type(socket, AMDSMI_PROCESSOR_TYPE_AMD_GPU, nullptr, &num_gpus_per_socket)); + + // workaround + for (auto& proc : processor_handles) { + processor_type_t proc_type; + AMDSMICHECK(amdsmi_get_processor_type(proc, &proc_type)); + if(proc_type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) { + amdsmi_enumeration_info_t info; + AMDSMICHECK(amdsmi_get_gpu_enumeration_info(proc, &info)); + if(info.hip_id == srcIndex) { + src_processor_handle = proc; + found_src = true; + } + if(info.hip_id == dstIndex) { + dst_processor_handle = proc; + found_dst = true; + } + } + } + } + if (!found_src) { ERROR("amd-smi could not find processor handle for srcIndex: %d", srcIndex); return ncclInternalError; } // do not query topology with a null handle + if (!found_dst) { ERROR("amd-smi could not find processor handle for dstIndex: %d", dstIndex); return ncclInternalError; } // do not query topology with a null handle + AMDSMICHECK(amdsmi_topo_get_link_type(src_processor_handle, dst_processor_handle, &amdsmi_hops, &amdsmi_type)); + AMDSMICHECK(amdsmi_topo_get_link_weight(src_processor_handle, dst_processor_handle, &amdsmi_weight)); + + // amd-smi reports weight=0 for XGMI ?? 
+ if (amdsmi_type == AMDSMI_LINK_TYPE_XGMI) { + uint64_t min_bw = 0, max_bw = 0; + AMDSMICHECK(amdsmi_get_minmax_bandwidth_between_processors(src_processor_handle, dst_processor_handle, &min_bw, &max_bw)); + if (max_bw && min_bw) *count = max_bw/min_bw; + } + + *type = amdsmi_type; + *hops = amdsmi_hops; + } else { + ARSMI_linkInfo tinfo; + ARSMICHECK(ARSMI_topo_get_link_info(srcIndex, dstIndex, &tinfo)); + + *type = (amdsmi_link_type_t) tinfo.type; + if (*type == AMDSMI_LINK_TYPE_XGMI && (tinfo.weight == 15 || + tinfo.weight == 41 || tinfo.weight == 13)) { + *hops = 1; + if (tinfo.max_bandwidth && tinfo.min_bandwidth) + *count = tinfo.max_bandwidth/tinfo.min_bandwidth; + } + } + } + + return ncclSuccess; +} diff --git a/src/misc/rocm_smi_wrap.cc b/src/misc/rocm_smi_wrap.cc index db40f87784..804ef872eb 100644 --- a/src/misc/rocm_smi_wrap.cc +++ b/src/misc/rocm_smi_wrap.cc @@ -176,7 +176,7 @@ ncclResult_t rocm_smi_getLinkInfo(int srcIndex, int dstIndex, RSMI_IO_LINK_TYPE* rsmi_weight == 41 || rsmi_weight == 13)) { uint64_t min_bw = 0, max_bw = 0; *hops = 1; -#if defined USE_ROCM_SMI64CONFIG && rocm_smi_VERSION_MAJOR >= 5 +#if (defined USE_ROCM_SMI64CONFIG || defined HAVE_ROCM_SMI64CONFIG) && rocm_smi_VERSION_MAJOR >= 5 /* the build passes USE_ROCM_SMI64CONFIG */ rsmi_version_t version; ROCMSMICHECK(rsmi_version_get(&version)); if (version.major >= 5) diff --git a/src/rccl_wrap.cc b/src/rccl_wrap.cc index 5b3c312bdd..4d62da96d9 100644 --- a/src/rccl_wrap.cc +++ b/src/rccl_wrap.cc @@ -24,8 +24,15 @@ THE SOFTWARE. #include "comm.h" #include "graph/topo.h" #include "enqueue.h" -#include "rocm_smi/rocm_smi.h" #include +#include "debug.h" + +#ifdef USE_AMDSMI +#include "amd_smi/amdsmi.h" +#else +#include "rocm_smi/rocm_smi.h" +#endif + // Use this param to experiment pipelining new data types besides bfloat16 // Make sure you generate the device code with the new data type (i.e. 
in generate.py) RCCL_PARAM(PipelineAllDTypes, "PIPELINE_ALL_DATA_TYPES", 0); @@ -642,60 +649,54 @@ ncclResult_t commSetUnrollFactor(struct ncclComm* comm) { return ncclSuccess; } -std::string trimString(const std::string& s) { - int sz = s.size(); - int b = 0; - int e = sz - 1; - while (b < sz && isspace(s[b])) { - b++; - } - if (b >= sz) { - return ""; - } - - while (e >= b && e < sz && isspace(s[e])) { - e--; - } - if (b > e) { - return ""; - } - return s.substr(b, e - b + 1); -} - -std::vector<std::string> splitString(const std::string& s, char delimiter) { - std::vector<std::string> tokens; - std::stringstream ss(s); - std::string token; - - while (std::getline(ss, token, delimiter)) { - tokens.push_back(trimString(token)); - } - return tokens; -} - -int parseFirmwareVersionImpl() { +int getFirmwareVersion() { uint64_t fw_version = -1; - // using rocm-smi APIs for now to query MEC FW version - // will switch to amd-smi APIs soon +#ifdef USE_AMDSMI + amdsmi_status_t ret; + ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS); + if (ret != AMDSMI_STATUS_SUCCESS) { + ERROR("Could not initialize amd-smi"); + return -1; + } + + uint32_t socket_count = 0; + if (amdsmi_get_socket_handles(&socket_count, nullptr) != AMDSMI_STATUS_SUCCESS || socket_count == 0) return -1; // nothing to index below + std::vector<amdsmi_socket_handle> sockets(socket_count); + if (amdsmi_get_socket_handles(&socket_count, sockets.data()) != AMDSMI_STATUS_SUCCESS || socket_count == 0) return -1; + + uint32_t num_gpus_per_socket = 0; + if (amdsmi_get_processor_handles(sockets[0], &num_gpus_per_socket, nullptr) != AMDSMI_STATUS_SUCCESS || num_gpus_per_socket == 0) return -1; // nothing to index below + std::vector<amdsmi_processor_handle> processor_handles(num_gpus_per_socket); + if (amdsmi_get_processor_handles(sockets[0], &num_gpus_per_socket, processor_handles.data()) != AMDSMI_STATUS_SUCCESS || num_gpus_per_socket == 0) return -1; + + amdsmi_fw_info_t info; + ret = amdsmi_get_fw_info(processor_handles[0], &info); + if (ret != AMDSMI_STATUS_SUCCESS) { + ERROR("Could not query firmware info using amd-smi"); + return -1; + } + + for (uint32_t i = 0; i < info.num_fw_info; i++) if (info.fw_info_list[i].fw_id == AMDSMI_FW_ID_CP_MEC1) fw_version = info.fw_info_list[i].fw_version; // pick the MEC block like RSMI_FW_BLOCK_MEC in the rocm-smi branch; stays -1 if absent. NOTE(review): confirm AMDSMI_FW_ID_CP_MEC1 is the intended block + +#else rsmi_status_t ret; ret = rsmi_init(0); - if (ret != RSMI_STATUS_SUCCESS) return -1; + if (ret != RSMI_STATUS_SUCCESS) { + ERROR("Could not initialize rocm-smi"); + return -1; + } + ret = 
rsmi_dev_firmware_version_get(0, RSMI_FW_BLOCK_MEC, &fw_version); - if (ret != RSMI_STATUS_SUCCESS) return -1; + if (ret != RSMI_STATUS_SUCCESS) { + ERROR("Could not query firmware info using rocm-smi"); + return -1; + } +#endif return fw_version; } -int parseFirmwareVersion() { - int version = -1; - try { - version = parseFirmwareVersionImpl(); - } catch (const std::exception& ex) { - } - return version; -} - bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion, int firmwareVersion, char const* archName) { bool hsaScratchEnvSet = (hsaScratchEnv && strcmp(hsaScratchEnv,"1") == 0); if (hsaScratchEnvSet) {