21 Révisions

Auteur SHA1 Message Date
Donato Capitella a2686c9f41 Fix(critical): Prevent ncclInternalError when SMI is disabled by mocking getDeviceIndexByPciBusId 2026-02-01 12:48:18 +00:00
Donato Capitella 532214edfb Fix: Export rsmi_init shim with default visibility to be seen by PyTorch 2026-02-01 12:12:28 +00:00
Donato Capitella aec38e7dde Fix(critical): Add rsmi_init shim to satisfy PyTorch linker dependencies when SMI is disabled 2026-02-01 12:10:13 +00:00
Donato Capitella f4b6e5f450 Fix: Unconditionally include SMI headers in build list to fix hipify missing file error 2026-02-01 11:54:43 +00:00
Donato Capitella 0586700b06 fix: disable AMD SMI for gfx1151 targets in CMake and remove a debug error from the SMI wrapper header. 2026-02-01 11:49:27 +00:00
Donato Capitella 3f31d17ae7 build: Add compile-time error when ROCM_SMI is disabled. 2026-02-01 11:44:38 +00:00
Donato Capitella f227312867 Fix(Refactor): Switch SMI logic to whitelist (RCCL_SMI_ENABLED) and remove redundant fallback code 2026-02-01 11:31:39 +00:00
Donato Capitella 54de8024d3 Perf: Add NO_COMPRESS option to disable slow offload-compress 2026-02-01 11:14:25 +00:00
Donato Capitella 3bd4e81a8b Fix: Switch to add_compile_definitions for SMI_DISABLED and remove redundant target_ call 2026-02-01 11:13:07 +00:00
Donato Capitella 7504897fe4 Fix cmake syntax error: add missing endif() 2026-02-01 10:55:56 +00:00
Donato Capitella 1d5c0c1add Fix(critical): Move SMI_DISABLED logic to top of CMakeLists.txt and force via target_compile_definitions 2026-02-01 10:55:06 +00:00
Donato Capitella 2e6df33acc Fix(critical): Introduce SMI_DISABLED define to forcibly disable SMI usage in headers 2026-02-01 10:39:37 +00:00
Donato Capitella cd91b85935 Fix: Provide inline dummy SMI symbols when SMI is disabled to prevent link errors 2026-02-01 10:27:12 +00:00
Donato Capitella 484bd5bf0f Fix: Properly guard rocm_smi_wrap.cc content with USE_ROCMSMI 2026-02-01 10:13:38 +00:00
Donato Capitella 95b150d96a Fix: Do not compile rocm_smi_wrap.cc when ENABLE_AMDSMI is OFF 2026-02-01 10:11:26 +00:00
Donato Capitella 6289de70ad Force unset USE_AMDSMI internal cache variable when ENABLE_AMDSMI is OFF 2026-02-01 09:56:24 +00:00
Donato Capitella f1f0851398 Fix undefined amdsmi_init by properly guarding SMI code and adding ENABLE_AMDSMI option 2026-02-01 09:34:49 +00:00
Donato Capitella b4f25507ec Allow disabling SMI support via ENABLE_AMDSMI in cmake 2026-02-01 09:07:16 +00:00
Donato Capitella d2ea5d5d4c fix(rccl): disable symmetric kernels when GENERATE_SYM_KERNELS is OFF 2026-02-01 08:44:52 +00:00
Donato Capitella 8126402d12 fix(rccl): fix typo in ncclSymkGetKernelPtr fallback 2026-02-01 08:26:01 +00:00
Donato Capitella 0b8251289a feat(rccl): add gfx1151 support 2026-01-31 16:42:58 +00:00
11 fichiers modifiés avec 84 ajouts et 50 suppressions
+41 -44
Voir le fichier
@@ -44,6 +44,9 @@ option(TRACE "Enable additional tracing"
option(FAULT_INJECTION "Enable fault injection" ON)
option(QUIET_WARNINGS "Supress compiler warnings" OFF)
option(ENABLE_ROCSHMEM "Enable rocSHMEM support in RCCL" OFF)
option(ENABLE_AMDSMI "Enable AMD/ROCm SMI support" ON)
# Default GPU architectures to build
#==================================================================================================
@@ -58,7 +61,8 @@ set(DEFAULT_GPUS
gfx1101
gfx1102
gfx1200
gfx1201)
gfx1201
gfx1151)
# Load CMake modules
#==================================================================================================
@@ -116,9 +120,23 @@ else()
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()
set(GPU_TARGETS "${SUPPORTED_GPUS}")
set(GPU_TARGETS "${SUPPORTED_GPUS}")
message(STATUS "Compiling for ${GPU_TARGETS}")
# Auto-disable SMI for gfx1151 as it is not supported.
# The override is applied with a normal (non-cache) variable so that it only
# shadows the ENABLE_AMDSMI cache entry for this configure run. Using
# set(... CACHE ... FORCE) here would permanently overwrite the user's cached
# choice, leaving SMI disabled on later configures even after gfx1151 has been
# removed from GPU_TARGETS.
# NOTE: the match is against the whole GPU_TARGETS list, so SMI is disabled for
# every architecture in the build as soon as gfx1151 is one of the targets.
if("${GPU_TARGETS}" MATCHES "gfx1151")
  message(STATUS "Detected gfx1151 target: Forcing ENABLE_AMDSMI=OFF")
  set(ENABLE_AMDSMI OFF)
endif()

# RCCL_SMI_ENABLED is the whitelist macro the SMI wrapper headers test for; it
# is defined at directory scope only when SMI support is enabled.
if(ENABLE_AMDSMI)
  message(STATUS "SMI Support: ENABLED")
  add_compile_definitions(RCCL_SMI_ENABLED)
else()
  message(STATUS "SMI Support: DISABLED")
endif()
## NOTE: Reload rocm-cmake in order to update GPU_TARGETS
include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults
@@ -264,7 +282,7 @@ get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCL
message(STATUS "HSA runtime: ${HSA_INCLUDE_PATH}")
## Check for amd-smi if ROCm 7.11.0 or newer
if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
if(ROCM_VERSION VERSION_GREATER_EQUAL "71100" AND ENABLE_AMDSMI)
find_package(amd_smi PATHS ${ROCM_PATH}/lib/cmake/amd_smi)
if(amd_smi_FOUND)
message(STATUS "amd_smi_INCLUDE_DIR: ${amd_smi_INCLUDE_DIR}")
@@ -281,38 +299,7 @@ if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
endif()
endif()
if(NOT USE_AMDSMI)
## Fallback to rocm-smi if amd-smi not found or ROCm < 7.11.0
message(WARNING "Could not find amd_smi. Falling back to rocm_smi.")
find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
if(rocm_smi_FOUND)
set(SMI_INCLUDE_DIR "${rocm_smi_INCLUDE_DIR}" CACHE INTERNAL "rocm-smi include directory")
set(SMI_LIB_DIR "${rocm_smi_LIB_DIR}" CACHE INTERNAL "rocm-smi library directory")
else()
message(WARNING "CMake could not find rocm-smi. Checking old include directory structure for rocm_smi")
set(SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
set(SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
endif()
if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
message(FATAL_ERROR "rocm_smi not found in ${SMI_INCLUDE_DIR}")
endif()
message(STATUS "Found rocm_smi at ${SMI_INCLUDE_DIR}")
set(SMI_LIB_NAME "rocm-smi-lib" CACHE INTERNAL "rocm-smi-lib for packaging")
set(SMI_LIBRARIES rocm_smi64)
check_include_file_cxx("${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
file(READ "${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
if(${matchres} EQUAL -1)
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
else()
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
endif ()
endif()
## Check for BFD library if custom backtrace is requested
if(BUILD_BFD)
@@ -792,16 +779,26 @@ set(SRC_FILES
src/misc/latency_profiler/CollTraceUtils.cc
)
if(USE_AMDSMI)
set(SMI_SOURCES
src/include/amdsmi_wrap.h
src/misc/amdsmi_wrap.cc
)
# Unconditionally include SMI headers so they are hipified/available
set(SMI_HEADERS
src/include/rocm_smi_wrap.h
src/include/amdsmi_wrap.h
)
list(APPEND SRC_FILES ${SMI_HEADERS})
if(ENABLE_AMDSMI)
# Only compile the wrapper sources if SMI is enabled
if(USE_AMDSMI)
list(APPEND SRC_FILES src/misc/amdsmi_wrap.cc)
else()
list(APPEND SRC_FILES src/misc/rocm_smi_wrap.cc)
endif()
else()
set(SMI_SOURCES
src/include/rocm_smi_wrap.h
src/misc/rocm_smi_wrap.cc
)
# When SMI is disabled, compile the shim to provide dummy symbols (rsmi_init)
# This satisfies external dependencies (like PyTorch) that expect SMI symbols
# to be present, preventing them from failing to load or trying to load
# the broken system library.
list(APPEND SRC_FILES src/misc/smi_shim.cc)
endif()
list(APPEND SRC_FILES ${SMI_SOURCES})
@@ -1206,12 +1203,12 @@ if (HAVE_PARALLEL_JOBS)
target_compile_options(rccl PRIVATE -parallel-jobs=12)
endif()
if (ROCM_VERSION VERSION_GREATER_EQUAL "60200")
if (ROCM_VERSION VERSION_GREATER_EQUAL "60200" AND NOT NO_COMPRESS)
target_compile_options(rccl PRIVATE --offload-compress) # Compress GPU code at compile time.
target_link_libraries(rccl PRIVATE --offload-compress) # Compress GPU code at link time.
message(STATUS "--offload-compress enabled - ROCm version >= 6.2.0")
else()
message(STATUS "--offload-compress disabled - ROCm version < 6.2.0")
message(STATUS "--offload-compress disabled (ROCM < 6.2.0 or NO_COMPRESS=ON)")
endif()
target_compile_options(rccl PRIVATE -Werror=uninitialized)
+1 -1
Voir le fichier
@@ -26,7 +26,7 @@
{ __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST); }
#endif
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__)
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__)
#define __trace_hwreg() \
collTrace->data_0 = 0;
#else
+1 -1
Voir le fichier
@@ -1019,7 +1019,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm,
int rcclGetTuningIndexForArch(const char* gfxarch) {
static const std::vector<std::pair<std::string, int>> tuningIndexMap = {
{"gfx906", 0}, {"gfx908", 0}, {"gfx90a", 0}, {"gfx942", 5},
{"gfx950", 6}, {"gfx1030", 0}, {"gfx1100", 0}, {"gfx1102", 0},
{"gfx950", 6}, {"gfx1030", 0}, {"gfx1100", 0}, {"gfx1102", 0}, {"gfx1151", 0},
{"gfx1200", 7}, {"gfx1201", 7}
};
if (gfxarch == nullptr) return 0;
+9
Voir le fichier
@@ -7,11 +7,20 @@
#include "amd_smi/amdsmi.h"
#include "nccl.h"
// AMD SMI wrapper entry points.
// When both USE_AMDSMI and RCCL_SMI_ENABLED are defined, only the declarations
// are provided here. Otherwise the inline stubs below stand in for them: they
// perform no SMI calls, report success, and write a defined value to every
// output parameter.
#if defined(USE_AMDSMI) && defined(RCCL_SMI_ENABLED)
ncclResult_t amd_smi_init();
ncclResult_t amd_smi_shutdown();
ncclResult_t amd_smi_getNumDevice(uint32_t* num_devs);
ncclResult_t amd_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len);
ncclResult_t amd_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex);
ncclResult_t amd_smi_getLinkInfo(int srcDev, int dstDev, amdsmi_link_type_t* type, int *hops, int *count);
#else
// Stub: nothing to initialize.
inline ncclResult_t amd_smi_init() { return ncclSuccess; }
// Stub: nothing to shut down.
inline ncclResult_t amd_smi_shutdown() { return ncclSuccess; }
// Stub: reports zero devices.
inline ncclResult_t amd_smi_getNumDevice(uint32_t* num_devs) { *num_devs = 0; return ncclSuccess; }
// Stub: returns an empty string when the buffer has room for the terminator.
inline ncclResult_t amd_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len) { if (len > 0) pciBusId[0] = '\0'; return ncclSuccess; }
// Stub: always reports index 0 and succeeds, matching the rocm_smi stub, so a
// disabled SMI does not surface as ncclInternalError in callers.
inline ncclResult_t amd_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex) { *deviceIndex = 0; return ncclSuccess; }
// Stub: reports one hop and one link. *type is value-initialized (enumerator
// value 0) so callers never read an indeterminate value.
// NOTE(review): confirm that enumerator 0 of amdsmi_link_type_t is an
// acceptable "no link information" value for the callers.
inline ncclResult_t amd_smi_getLinkInfo(int srcDev, int dstDev, amdsmi_link_type_t* type, int *hops, int *count) { *type = amdsmi_link_type_t{}; *hops=1; *count=1; return ncclSuccess; }
#endif
#endif
+1 -1
Voir le fichier
@@ -41,7 +41,7 @@ typedef struct
} rccl_bfloat8;
// __cplusplus < 201103L || (!defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__))
#elif HIP_VERSION >= 60300000 && !(defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1030__))
#elif HIP_VERSION >= 60300000 && !(defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1151__) || defined(__gfx1030__))
#include <hip/hip_fp8.h>
+8
Voir le fichier
@@ -29,10 +29,18 @@ THE SOFTWARE.
#endif
#include "nccl.h"
// ROCm SMI wrapper entry points.
// When both USE_ROCMSMI and RCCL_SMI_ENABLED are defined, only the declarations
// are provided here. Otherwise the inline stubs below stand in for them: they
// perform no SMI calls, report success, and write a defined value to every
// output parameter.
#if defined(USE_ROCMSMI) && defined(RCCL_SMI_ENABLED)
ncclResult_t rocm_smi_init();
ncclResult_t rocm_smi_getNumDevice(uint32_t* num_devs);
ncclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len);
ncclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex);
ncclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int *hops, int *count);
#else
// Stub: nothing to initialize.
inline ncclResult_t rocm_smi_init() { return ncclSuccess; }
// Stub: reports zero devices.
inline ncclResult_t rocm_smi_getNumDevice(uint32_t* num_devs) { *num_devs = 0; return ncclSuccess; }
// Stub: returns an empty string when the buffer has room for the terminator.
inline ncclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len) { if (len > 0) pciBusId[0] = '\0'; return ncclSuccess; }
// Stub: always reports index 0 and succeeds.
inline ncclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex) { *deviceIndex = 0; return ncclSuccess; }
// Stub: reports one hop and one link. *rsmi_type is value-initialized
// (enumerator value 0) so callers never read an indeterminate value.
// NOTE(review): confirm that enumerator 0 of RSMI_IO_LINK_TYPE is an
// acceptable "no link information" value for the callers.
inline ncclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int *hops, int *count) { *rsmi_type = RSMI_IO_LINK_TYPE{}; *hops=1; *count=1; return ncclSuccess; }
#endif
#endif
+2
Voir le fichier
@@ -1,3 +1,4 @@
#if defined(USE_ROCMSMI)
/*
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
@@ -201,3 +202,4 @@ ncclResult_t rocm_smi_getLinkInfo(int srcIndex, int dstIndex, RSMI_IO_LINK_TYPE*
return ncclSuccess;
}
#endif
+11
Voir le fichier
@@ -0,0 +1,11 @@
#include <cstdint>
// Placeholder for the ROCm SMI entry point rsmi_init.
// It exists only to satisfy linker dependencies when the real ROCm SMI library
// is broken or causes Bus Errors (gfx1151). The symbol has C linkage and
// default visibility. The argument is ignored and the call always reports
// 0 (RSMI_STATUS_SUCCESS).
extern "C" __attribute__((visibility("default"))) int rsmi_init(uint64_t flags) {
  static_cast<void>(flags);  // intentionally unused
  constexpr int kStatusSuccess = 0;
  return kStatusSuccess;
}
+4 -1
Voir le fichier
@@ -742,7 +742,7 @@ int getFirmwareVersion() {
fw_version = info.fw_info_list[0].fw_version;
#else
#elif defined(USE_ROCMSMI)
rsmi_status_t ret;
ret = rsmi_init(0);
if (ret != RSMI_STATUS_SUCCESS) {
@@ -755,6 +755,9 @@ int getFirmwareVersion() {
ERROR("Could not query firmware info using rocm-smi");
return -1;
}
#else
// SMI disabled
fw_version = -1;
#endif
return fw_version;
+5 -1
Voir le fichier
@@ -260,6 +260,7 @@ static bool ncclSymkImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, nccl
}
static uint32_t ncclSymkMask(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts) {
#if defined(GENERATE_SYM_KERNELS) && GENERATE_SYM_KERNELS
uint32_t kmask = kernelMask_coll(coll);
kmask &= kernelMask_user();
@@ -299,6 +300,9 @@ static uint32_t ncclSymkMask(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDe
if (nBusBytes >= 32*(size_t(2)<<30)) kmask = 0;
return kmask;
#else
return 0;
#endif
}
bool ncclSymkAvailable(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red,
@@ -354,7 +358,7 @@ const char* ncclSymkKernelIdToString(int kernelId) {
}
#ifndef GENERATE_SYM_KERNELS
void* ncclSymGetKernelPtr(ncclSymkKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) {
void* ncclSymkGetKernelPtr(ncclSymkKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) {
return nullptr;
}
#endif
+1 -1
Voir le fichier
@@ -43,7 +43,7 @@ THE SOFTWARE.
#endif
// Macro for collecting HW_REG_HW_ID
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__NVCC__)
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1151__) || defined(__NVCC__)
#define GetHwId(val) \
val = 0
#else