Porównaj commity
21 Commity
develop
...
gfx1151-rccl
| Autor | SHA1 | Data | |
|---|---|---|---|
| a2686c9f41 | |||
| 532214edfb | |||
| aec38e7dde | |||
| f4b6e5f450 | |||
| 0586700b06 | |||
| 3f31d17ae7 | |||
| f227312867 | |||
| 54de8024d3 | |||
| 3bd4e81a8b | |||
| 7504897fe4 | |||
| 1d5c0c1add | |||
| 2e6df33acc | |||
| cd91b85935 | |||
| 484bd5bf0f | |||
| 95b150d96a | |||
| 6289de70ad | |||
| f1f0851398 | |||
| b4f25507ec | |||
| d2ea5d5d4c | |||
| 8126402d12 | |||
| 0b8251289a |
@@ -44,6 +44,9 @@ option(TRACE "Enable additional tracing"
|
||||
option(FAULT_INJECTION "Enable fault injection" ON)
|
||||
option(QUIET_WARNINGS "Supress compiler warnings" OFF)
|
||||
option(ENABLE_ROCSHMEM "Enable rocSHMEM support in RCCL" OFF)
|
||||
option(ENABLE_AMDSMI "Enable AMD/ROCm SMI support" ON)
|
||||
|
||||
|
||||
|
||||
# Default GPU architectures to build
|
||||
#==================================================================================================
|
||||
@@ -58,7 +61,8 @@ set(DEFAULT_GPUS
|
||||
gfx1101
|
||||
gfx1102
|
||||
gfx1200
|
||||
gfx1201)
|
||||
gfx1201
|
||||
gfx1151)
|
||||
|
||||
# Load CMake modules
|
||||
#==================================================================================================
|
||||
@@ -116,9 +120,23 @@ else()
|
||||
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
|
||||
endif()
|
||||
|
||||
set(GPU_TARGETS "${SUPPORTED_GPUS}")
|
||||
set(GPU_TARGETS "${SUPPORTED_GPUS}")
|
||||
message(STATUS "Compiling for ${GPU_TARGETS}")
|
||||
|
||||
# Auto-disable SMI for gfx1151 as it is not supported
|
||||
if("${GPU_TARGETS}" MATCHES "gfx1151")
|
||||
message(STATUS "Detected gfx1151 target: Forcing ENABLE_AMDSMI=OFF")
|
||||
set(ENABLE_AMDSMI OFF CACHE BOOL "Force disable SMI for gfx1151" FORCE)
|
||||
endif()
|
||||
|
||||
if(ENABLE_AMDSMI)
|
||||
message(STATUS "SMI Support: ENABLED")
|
||||
add_compile_definitions(RCCL_SMI_ENABLED)
|
||||
else()
|
||||
message(STATUS "SMI Support: DISABLED")
|
||||
endif()
|
||||
|
||||
## NOTE: Reload rocm-cmake in order to update GPU_TARGETS
|
||||
include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults
|
||||
|
||||
@@ -264,7 +282,7 @@ get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCL
|
||||
message(STATUS "HSA runtime: ${HSA_INCLUDE_PATH}")
|
||||
|
||||
## Check for amd-smi if ROCm 7.11.0 or newer
|
||||
if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
|
||||
if(ROCM_VERSION VERSION_GREATER_EQUAL "71100" AND ENABLE_AMDSMI)
|
||||
find_package(amd_smi PATHS ${ROCM_PATH}/lib/cmake/amd_smi)
|
||||
if(amd_smi_FOUND)
|
||||
message(STATUS "amd_smi_INCLUDE_DIR: ${amd_smi_INCLUDE_DIR}")
|
||||
@@ -281,38 +299,7 @@ if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT USE_AMDSMI)
|
||||
## Fallback to rocm-smi if amd-smi not found or ROCm < 7.11.0
|
||||
message(WARNING "Could not find amd_smi. Falling back to rocm_smi.")
|
||||
find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
|
||||
if(rocm_smi_FOUND)
|
||||
set(SMI_INCLUDE_DIR "${rocm_smi_INCLUDE_DIR}" CACHE INTERNAL "rocm-smi include directory")
|
||||
set(SMI_LIB_DIR "${rocm_smi_LIB_DIR}" CACHE INTERNAL "rocm-smi library directory")
|
||||
else()
|
||||
message(WARNING "CMake could not find rocm-smi. Checking old include directory structure for rocm_smi")
|
||||
set(SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
|
||||
set(SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
|
||||
endif()
|
||||
|
||||
if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
|
||||
message(FATAL_ERROR "rocm_smi not found in ${SMI_INCLUDE_DIR}")
|
||||
endif()
|
||||
message(STATUS "Found rocm_smi at ${SMI_INCLUDE_DIR}")
|
||||
set(SMI_LIB_NAME "rocm-smi-lib" CACHE INTERNAL "rocm-smi-lib for packaging")
|
||||
set(SMI_LIBRARIES rocm_smi64)
|
||||
|
||||
check_include_file_cxx("${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)
|
||||
|
||||
### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
|
||||
file(READ "${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
|
||||
string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
|
||||
if(${matchres} EQUAL -1)
|
||||
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
|
||||
else()
|
||||
message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
|
||||
set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
|
||||
endif ()
|
||||
endif()
|
||||
|
||||
## Check for BFD library if custom backtrace is requested
|
||||
if(BUILD_BFD)
|
||||
@@ -792,16 +779,26 @@ set(SRC_FILES
|
||||
src/misc/latency_profiler/CollTraceUtils.cc
|
||||
)
|
||||
|
||||
if(USE_AMDSMI)
|
||||
set(SMI_SOURCES
|
||||
src/include/amdsmi_wrap.h
|
||||
src/misc/amdsmi_wrap.cc
|
||||
)
|
||||
# Unconditionally include SMI headers so they are hipified/available
|
||||
set(SMI_HEADERS
|
||||
src/include/rocm_smi_wrap.h
|
||||
src/include/amdsmi_wrap.h
|
||||
)
|
||||
list(APPEND SRC_FILES ${SMI_HEADERS})
|
||||
|
||||
if(ENABLE_AMDSMI)
|
||||
# Only compile the wrapper sources if SMI is enabled
|
||||
if(USE_AMDSMI)
|
||||
list(APPEND SRC_FILES src/misc/amdsmi_wrap.cc)
|
||||
else()
|
||||
list(APPEND SRC_FILES src/misc/rocm_smi_wrap.cc)
|
||||
endif()
|
||||
else()
|
||||
set(SMI_SOURCES
|
||||
src/include/rocm_smi_wrap.h
|
||||
src/misc/rocm_smi_wrap.cc
|
||||
)
|
||||
# When SMI is disabled, compile the shim to provide dummy symbols (rsmi_init)
|
||||
# This satisfies external dependencies (like PyTorch) that expect SMI symbols
|
||||
# to be present, preventing them from failing to load or trying to load
|
||||
# the broken system library.
|
||||
list(APPEND SRC_FILES src/misc/smi_shim.cc)
|
||||
endif()
|
||||
list(APPEND SRC_FILES ${SMI_SOURCES})
|
||||
|
||||
@@ -1206,12 +1203,12 @@ if (HAVE_PARALLEL_JOBS)
|
||||
target_compile_options(rccl PRIVATE -parallel-jobs=12)
|
||||
endif()
|
||||
|
||||
if (ROCM_VERSION VERSION_GREATER_EQUAL "60200")
|
||||
if (ROCM_VERSION VERSION_GREATER_EQUAL "60200" AND NOT NO_COMPRESS)
|
||||
target_compile_options(rccl PRIVATE --offload-compress) # Compress GPU code at compile time.
|
||||
target_link_libraries(rccl PRIVATE --offload-compress) # Compress GPU code at link time.
|
||||
message(STATUS "--offload-compress enabled - ROCm version >= 6.2.0")
|
||||
else()
|
||||
message(STATUS "--offload-compress disabled - ROCm version < 6.2.0")
|
||||
message(STATUS "--offload-compress disabled (ROCM < 6.2.0 or NO_COMPRESS=ON)")
|
||||
endif()
|
||||
|
||||
target_compile_options(rccl PRIVATE -Werror=uninitialized)
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
{ __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST); }
|
||||
#endif
|
||||
|
||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__)
|
||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__)
|
||||
#define __trace_hwreg() \
|
||||
collTrace->data_0 = 0;
|
||||
#else
|
||||
|
||||
@@ -1019,7 +1019,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm,
|
||||
int rcclGetTuningIndexForArch(const char* gfxarch) {
|
||||
static const std::vector<std::pair<std::string, int>> tuningIndexMap = {
|
||||
{"gfx906", 0}, {"gfx908", 0}, {"gfx90a", 0}, {"gfx942", 5},
|
||||
{"gfx950", 6}, {"gfx1030", 0}, {"gfx1100", 0}, {"gfx1102", 0},
|
||||
{"gfx950", 6}, {"gfx1030", 0}, {"gfx1100", 0}, {"gfx1102", 0}, {"gfx1151", 0},
|
||||
{"gfx1200", 7}, {"gfx1201", 7}
|
||||
};
|
||||
if (gfxarch == nullptr) return 0;
|
||||
|
||||
@@ -7,11 +7,20 @@
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "nccl.h"
|
||||
|
||||
#if defined(USE_AMDSMI) && defined(RCCL_SMI_ENABLED)
|
||||
ncclResult_t amd_smi_init();
|
||||
ncclResult_t amd_smi_shutdown();
|
||||
ncclResult_t amd_smi_getNumDevice(uint32_t* num_devs);
|
||||
ncclResult_t amd_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len);
|
||||
ncclResult_t amd_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex);
|
||||
ncclResult_t amd_smi_getLinkInfo(int srcDev, int dstDev, amdsmi_link_type_t* type, int *hops, int *count);
|
||||
#else
|
||||
inline ncclResult_t amd_smi_init() { return ncclSuccess; }
|
||||
inline ncclResult_t amd_smi_shutdown() { return ncclSuccess; }
|
||||
inline ncclResult_t amd_smi_getNumDevice(uint32_t* num_devs) { *num_devs = 0; return ncclSuccess; }
|
||||
inline ncclResult_t amd_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len) { if (len > 0) pciBusId[0] = '\0'; return ncclSuccess; }
|
||||
inline ncclResult_t amd_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex) { return ncclInternalError; }
|
||||
inline ncclResult_t amd_smi_getLinkInfo(int srcDev, int dstDev, amdsmi_link_type_t* type, int *hops, int *count) { *hops=1; *count=1; return ncclSuccess; }
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -41,7 +41,7 @@ typedef struct
|
||||
} rccl_bfloat8;
|
||||
|
||||
// __cplusplus < 201103L || (!defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__))
|
||||
#elif HIP_VERSION >= 60300000 && !(defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1030__))
|
||||
#elif HIP_VERSION >= 60300000 && !(defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1151__) || defined(__gfx1030__))
|
||||
|
||||
#include <hip/hip_fp8.h>
|
||||
|
||||
|
||||
@@ -29,10 +29,18 @@ THE SOFTWARE.
|
||||
#endif
|
||||
#include "nccl.h"
|
||||
|
||||
#if defined(USE_ROCMSMI) && defined(RCCL_SMI_ENABLED)
|
||||
ncclResult_t rocm_smi_init();
|
||||
ncclResult_t rocm_smi_getNumDevice(uint32_t* num_devs);
|
||||
ncclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len);
|
||||
ncclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex);
|
||||
ncclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int *hops, int *count);
|
||||
#else
|
||||
inline ncclResult_t rocm_smi_init() { return ncclSuccess; }
|
||||
inline ncclResult_t rocm_smi_getNumDevice(uint32_t* num_devs) { *num_devs = 0; return ncclSuccess; }
|
||||
inline ncclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len) { if (len > 0) pciBusId[0] = '\0'; return ncclSuccess; }
|
||||
inline ncclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex) { *deviceIndex = 0; return ncclSuccess; }
|
||||
inline ncclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int *hops, int *count) { *hops=1; *count=1; return ncclSuccess; }
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#if defined(USE_ROCMSMI)
|
||||
/*
|
||||
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
@@ -201,3 +202,4 @@ ncclResult_t rocm_smi_getLinkInfo(int srcIndex, int dstIndex, RSMI_IO_LINK_TYPE*
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
|
||||
// Dummy implementation of rsmi_init to satisfy linker dependencies
|
||||
// when the real ROCm SMI library is broken or causes Bus Errors (gfx1151).
|
||||
// Returns 0 (RSMI_STATUS_SUCCESS).
|
||||
__attribute__((visibility("default")))
|
||||
int rsmi_init(uint64_t flags) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@@ -742,7 +742,7 @@ int getFirmwareVersion() {
|
||||
|
||||
fw_version = info.fw_info_list[0].fw_version;
|
||||
|
||||
#else
|
||||
#elif defined(USE_ROCMSMI)
|
||||
rsmi_status_t ret;
|
||||
ret = rsmi_init(0);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
@@ -755,6 +755,9 @@ int getFirmwareVersion() {
|
||||
ERROR("Could not query firmware info using rocm-smi");
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
// SMI disabled
|
||||
fw_version = -1;
|
||||
#endif
|
||||
|
||||
return fw_version;
|
||||
|
||||
@@ -260,6 +260,7 @@ static bool ncclSymkImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, nccl
|
||||
}
|
||||
|
||||
static uint32_t ncclSymkMask(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts) {
|
||||
#if defined(GENERATE_SYM_KERNELS) && GENERATE_SYM_KERNELS
|
||||
uint32_t kmask = kernelMask_coll(coll);
|
||||
kmask &= kernelMask_user();
|
||||
|
||||
@@ -299,6 +300,9 @@ static uint32_t ncclSymkMask(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDe
|
||||
if (nBusBytes >= 32*(size_t(2)<<30)) kmask = 0;
|
||||
|
||||
return kmask;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool ncclSymkAvailable(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red,
|
||||
@@ -354,7 +358,7 @@ const char* ncclSymkKernelIdToString(int kernelId) {
|
||||
}
|
||||
|
||||
#ifndef GENERATE_SYM_KERNELS
|
||||
void* ncclSymGetKernelPtr(ncclSymkKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) {
|
||||
void* ncclSymkGetKernelPtr(ncclSymkKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) {
|
||||
return nullptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -43,7 +43,7 @@ THE SOFTWARE.
|
||||
#endif
|
||||
|
||||
// Macro for collecting HW_REG_HW_ID
|
||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__NVCC__)
|
||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1151__) || defined(__NVCC__)
|
||||
#define GetHwId(val) \
|
||||
val = 0
|
||||
#else
|
||||
|
||||
Reference in New Issue
Block a user