[Device] WarpSpeed enablement and single node CU and perf opt for MI350 (#2073)

This commit is contained in:
Mustafa Abduljabbar
2025-12-11 19:04:35 -05:00
کامیت شده توسط GitHub
والد 08dd75712f
کامیت d009ab144e
22فایلهای تغییر یافته به همراه424 افزوده شده و 65 حذف شده
+3
مشاهده پرونده
@@ -8,6 +8,9 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
* RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
* Disabled `reduceCopyPacks` pipelining for `gfx950`.
* Experimental support for traffic shaping using warp specialization (also known as WarpSpeed) is now available for the Ring algorithm.
* Enabling WarpSpeed in auto mode using RCCL_WARP_SPEED_AUTO optimizes performance and reduces the CU count by 50% on a single node for AllReduce, AllGather from 64MB, and ReduceScatter from 256MB.
* The following configuration knobs control WarpSpeed behavior for debugging purposes: `RCCL_WARP_SPEED_ENABLE`, `RCCL_UNROLL_FACTOR`, `RCCL_WARP_SPEED_CU_COUNT`, and `RCCL_THREADS_PER_BLOCK`. Note that the effective unroll factor is calculated as 2 raised to the value of `RCCL_UNROLL_FACTOR`.
## Unreleased - RCCL 2.27.7 for ROCm 7.1.1
+18
مشاهده پرونده
@@ -333,6 +333,7 @@ endif()
## Currently MSCCL++ is supported only on gfx942 and gfx950, and only on Ubuntu and CentOS
set(MSCCLPP_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")
# Check if any of the supported architectures are in GPU_TARGETS
set(ARCH_MATCH_FOUND OFF)
set(MSCCLPP_GPU_TARGETS "")
@@ -355,6 +356,20 @@ if (ENABLE_MSCCLPP AND ROCM_VERSION VERSION_LESS "60200")
message(WARNING "MSCCL++ integration only supported on ROCm 6.2.0 or greater; disabling MSCCL++ build")
endif()
## Disable WARP_SPEED if the build environment is invalid
set(WARP_SPEED_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")
set(ARCH_MATCH_FOUND OFF)
foreach(ARCH IN LISTS GPU_TARGETS)
if(ARCH IN_LIST WARP_SPEED_SUPPORTED_ARCHS)
set(ARCH_MATCH_FOUND ON)
endif()
endforeach()
if (NOT ARCH_MATCH_FOUND)
set(ENABLE_WARP_SPEED OFF)
message(WARNING "Can only build WARP_SPEED for supported GPU_TARGETS: ${WARP_SPEED_SUPPORTED_ARCHS}; current GPU_TARGETS: ${GPU_TARGETS}; so disabling WARP_SPEED build")
endif()
# cmake_host_system_information(RESULT HOST_OS_ID QUERY DISTRIB_ID) ## Requires cmake 3.22
execute_process(
COMMAND bash -c "grep '^ID=' /etc/os-release | cut -d'=' -f2 | cut -d'\"' -f2"
@@ -875,6 +890,9 @@ endif()
if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
endif()
if(ENABLE_WARP_SPEED)
target_compile_definitions(rccl PRIVATE ENABLE_WARP_SPEED)
endif()
# NPKit flags
## May be better to move these to a separate file
+8 -1
مشاهده پرونده
@@ -39,6 +39,7 @@ run_tests_all=false
time_trace=false
force_reduce_pipeline=false
generate_sym_kernels=false
warp_speed_enabled=true # note that this flag will be overridden to false for non MI350/MI300 platforms
quiet_warnings=false
# #################################################
@@ -90,7 +91,7 @@ function display_help()
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ "$?" -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprtq --longoptions address-sanitizer,dependencies,debug,dump-asm,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,kernel-resource-use,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,quiet-warnings,verbose -- "$@")
GETOPT_PARSE=$(getopt --name "${0}" --options cdfhij:lprtq --longoptions address-sanitizer,dependencies,debug,dump-asm,enable-code-coverage,enable_backtrace,disable-colltrace,disable-msccl-kernel,enable-mscclpp,fast,help,install,jobs:,kernel-resource-use,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,log-trace,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,force-reduce-pipeline,generate-sym-kernels,quiet-warnings,disable-warp-speed,verbose -- "$@")
else
echo "Need a new version of getopt"
exit 1
@@ -137,6 +138,7 @@ while true; do
--verbose) build_verbose=true; shift ;;
--force-reduce-pipeline) force_reduce_pipeline=true; shift ;;
--generate-sym-kernels) generate_sym_kernels=true; shift ;;
--disable-warp-speed) warp_speed_enabled=false; shift ;;
-q | --quiet-warnings) quiet_warnings=true; shift ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
@@ -316,6 +318,11 @@ if [[ "${npkit_enabled}" == true ]]; then
cmake_common_options="${cmake_common_options} -DENABLE_NPKIT=ON"
fi
# Enable WARP_SPEED only on MI350/MI300 platforms
if [[ "${warp_speed_enabled}" == true ]]; then
cmake_common_options="${cmake_common_options} -DENABLE_WARP_SPEED=ON"
fi
# Suppress Warnings
if [[ "${quiet_warnings}" == true ]]; then
cmake_common_options="${cmake_common_options} -DQUIET_WARNINGS=ON"
+11 -2
مشاهده پرونده
@@ -20,11 +20,20 @@ namespace {
const int bid = ncclShmem.channelId - work->channelLo;
int npKitCtxIdx = bid; // unused variable - compiler warning
#endif
#ifdef ENABLE_WARP_SPEED
int warp = threadIdx.x / WARP_SIZE;
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
#else
ncclRing *ring = &ncclShmem.channel.ring;
#endif
const int *ringRanks = ring->userRanks;
const int nranks = ncclShmem.comm.nRanks;
ssize_t count, partOffset, partCount, chunkCount;
#ifdef ENABLE_WARP_SPEED
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
#else
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
#endif
ssize_t offset;
ssize_t dataOffset;
int nelem;
@@ -142,7 +151,7 @@ namespace {
#endif
// Final wait/copy.
prims.directRecv(offset, nelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
@@ -671,4 +680,4 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
return;
}
}
};
};
+10
مشاهده پرونده
@@ -20,8 +20,14 @@ namespace {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#ifdef ENABLE_WARP_SPEED
int warp = threadIdx.x / WARP_SIZE;
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
#else
ncclRing *ring = &ncclShmem.channel.ring;
#endif
int ringIx = ring->index;
const int nranks = ncclShmem.comm.nRanks;
#if defined(ENABLE_NPKIT)
const int bid = ncclShmem.channelId - work->channelLo;
@@ -31,7 +37,11 @@ namespace {
ssize_t gridOffset;
ssize_t channelCount;
ssize_t chunkCount;
#ifdef ENABLE_WARP_SPEED
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
#else
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
#endif
const ssize_t loopCount = nranks * chunkCount;
ssize_t offset;
int nelem;
+9
مشاهده پرونده
@@ -19,7 +19,12 @@ namespace {
const int bid = ncclShmem.channelId - work->channelLo;
int npKitCtxIdx = bid; // unused variable - compiler warning
#endif
#ifdef ENABLE_WARP_SPEED
int warp = threadIdx.x / WARP_SIZE;
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
#else
ncclRing *ring = &ncclShmem.channel.ring;
#endif
const int rank = ring->userRanks[0];
const int nextRank = ring->userRanks[1];
const int root = work->root;
@@ -27,7 +32,11 @@ namespace {
ssize_t chunkCount;
ssize_t channelCount;
ssize_t gridOffset;
#ifdef ENABLE_WARP_SPEED
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
#else
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &size, &gridOffset, &channelCount, &chunkCount);
#endif
size_t offset;
int nelem;
int workNthreads;
+12 -12
مشاهده پرونده
@@ -17,24 +17,24 @@ struct RunWorkNop {
__device__ void run() {}
};
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&args4K.args);
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&argsStorage.args);
}
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&args4K.args);
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&argsStorage.args);
}
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&args4K.args);
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&argsStorage.args);
}
#ifdef ENABLE_COLLTRACE
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&args4K.args);
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&argsStorage.args);
}
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&args4K.args);
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&argsStorage.args);
}
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/4>(&args4K.args);
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/4>(&argsStorage.args);
}
#endif
+68 -9
مشاهده پرونده
@@ -138,7 +138,11 @@ struct ncclShmemData {
int aborted;
alignas(16) struct ncclDevComm comm;
alignas(16) struct ncclDevChannel channel;
#ifdef ENABLE_WARP_SPEED
int warpComm;
alignas(16) struct ncclDevChannel warpChannel[NCCL_MAX_GROUPS];
int warpChannelId[NCCL_MAX_GROUPS];
#endif
int batchIx, nextBatchIx;
enum ncclDevWorkType workType;
uint8_t directMode;
@@ -442,10 +446,17 @@ struct RunWorkBatch {
if (work->nWarps != workPrev->nWarps) __syncthreads();
}
int subtn = work->nWarps*WARP_SIZE;
#ifdef ENABLE_WARP_SPEED
if (tid < subtn) {
if(ncclShmem.warpComm == 0 || Algo != NCCL_ALGO_RING) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
else if (ncclShmem.warpChannelId[tid / WARP_SIZE] >= 0) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid % WARP_SIZE, WARP_SIZE, work);
}
#else
// Coverity reports a possible thread divergence due to not all threads participating in the collective.
// However, the code ensures that the participation is on a per-warp basis.
// coverity[device_thread_diverged:FALSE]
if (tid < subtn) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
#endif
}
}
};
@@ -489,7 +500,12 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
int x = tid;
int total = 0, y;
int num = MAXCHANNELS/64 > 0 ? MAXCHANNELS/64 : 1;
#ifdef ENABLE_WARP_SPEED
int warpCount = tn / WARP_SIZE;
int localWarpId = tid / WARP_SIZE;
int globalWarpId = (warpCount * blockIdx.x) + localWarpId;
int laneId = tid % WARP_SIZE;
#endif
// Copy kernel args to shmem and then only read those. Otherwise the compiler
// will end up putting the args into thread local stack which is very wasteful.
if (tid < sizeof(ncclDevKernelArgs)/sizeof(uint32_t)) {
@@ -583,9 +599,52 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
ncclShmem.collTrace = args->comm->collTrace + COLLTRACE_NUM_ITEMS*ncclShmem.channelId;
ncclShmem.collTraceTail = args->comm->collTraceTail + ncclShmem.channelId;
}
#endif
#ifdef ENABLE_WARP_SPEED
if(tid == 0) {
ncclShmem.warpComm = args->comm->warpLevelComm;
}
#endif
__syncthreads(); // publish shmem
#ifdef ENABLE_WARP_SPEED
// Determine per-warp channel assignment for WarpSpeed enablement
total = 0;
if(ncclShmem.warpComm == 1) { // If warpComm is enabled, assign warps to channels that have the corresponding channel mask enabled
ncclShmem.warpChannelId[localWarpId] = -1;
__syncthreads();
for (int i = 0; i < num; i++) {
if (args->channelMask.masks[i] & (1ull<<laneId)) {
y = __popcll(args->channelMask.masks[i] & ((1ull<<laneId)-1));
y = total + y;
if (globalWarpId == y) {
ncclShmem.warpChannelId[localWarpId] = laneId + total;
break;
}
}
total = total + __popcll(args->channelMask.masks[i]);
}
__syncthreads();
if(ncclShmem.warpChannelId[localWarpId] >= 0) {
void* dst = &ncclShmem.warpChannel[localWarpId];
void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.warpChannelId[localWarpId]];
int bytes = sizeof(ncclDevChannel);
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
// assert((tid-localWarpId*WARP_SIZE) >= 0 && (tid-localWarpId*WARP_SIZE) < WARP_SIZE);
copyToShmem16(tid-localWarpId*WARP_SIZE, dst, src, bytes);
}
} else { // If warpComm is disabled, all warps use the same channel as the block
if(laneId == 0) {
ncclShmem.warpChannelId[localWarpId] = ncclShmem.channelId;
}
// Use all threads in the warp to copy the channel data in parallel
void* dst = &ncclShmem.warpChannel[localWarpId];
void* src = &ncclShmem.channel;
int bytes = sizeof(ncclDevChannel);
copyToShmem16(laneId, dst, src, bytes);
}
__syncthreads();
#endif
#ifdef ENABLE_PROFILING
if (tid == 0) {
ncclShmem.prof.count = 0;
@@ -648,17 +707,17 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
#endif
}
__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
__global__ void ncclDevKernel_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
#ifdef ENABLE_COLLTRACE
__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
__global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage);
#endif
#define DEFINE_ncclDevKernel_nop(suffix, coll, redop, ty, algo, proto, specializedFnId) \
__global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {}
__global__ void ncclDevKernel_##suffix(ncclDevKernelArgsDefaultStorage NCCL_GRID_CONSTANT const argsStorage) {}
#ifdef USE_INDIRECT_FUNCTION_CALL
#define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto, acc, pipeline, unroll) \
@@ -146,6 +146,9 @@ __device__ __forceinline__ void mscclRunInterpreter(
}
if (bytes) copyToShmem8(tid%WARP_SIZE, dst, src, bytes);
}
#ifdef ENABLE_WARP_SPEED
ncclShmem.warpComm = 0;
#endif
__syncthreads(); // publish shmem
#if defined(ENABLE_NPKIT)
+4
مشاهده پرونده
@@ -654,7 +654,11 @@ public:
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group), threadsPerBlock(blockDim.x),
stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
#ifdef ENABLE_WARP_SPEED
auto *channel = isMsccl(Metadata) ? &ncclShmem.channel : &ncclShmem.warpChannel[threadIdx.x / WARP_SIZE];
#else
auto *channel = &ncclShmem.channel;
#endif
barriers = &ncclShmem.groups[group].barrier;
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
int nrecv=0, nsend=0;
+4
مشاهده پرونده
@@ -579,7 +579,11 @@ public:
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), /*compiler warnings*/
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)),
warp(tid/WARP_SIZE), warpInBlock(threadIdx.x/WARP_SIZE), flagThread((tid%4)==3), group(group), threadsPerBlock(blockDim.x){
#ifdef ENABLE_WARP_SPEED
auto *channel = isMsccl(Metadata) ? &ncclShmem.channel : &ncclShmem.warpChannel[warpInBlock];
#else
auto *channel = &ncclShmem.channel;
#endif
barriers = &ncclShmem.groups[group].barrier;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
+21 -6
مشاهده پرونده
@@ -502,14 +502,22 @@ private:
public:
static inline __device__ void sendPeerNotify(int peer, int connIndex, int steps) {
#ifdef ENABLE_WARP_SPEED
ncclDevChannelPeer* peerPtr = ncclShmem.warpChannel[threadIdx.x/WARP_SIZE].peers[peer];
#else
ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer];
#endif
peerPtr->send[connIndex].step += steps;
st_relaxed_sys_global(peerPtr->send[connIndex].tail, peerPtr->send[connIndex].step);
}
static inline __device__ void recvPeerNotify(int peer, int connIndex, int steps) {
int spins = 0;
#ifdef ENABLE_WARP_SPEED
ncclDevChannelPeer* peerPtr = ncclShmem.warpChannel[threadIdx.x/WARP_SIZE].peers[peer];
#else
ncclDevChannelPeer* peerPtr = ncclShmem.channel.peers[peer];
#endif
peerPtr->recv[connIndex].step += steps;
st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step);
while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) {
@@ -770,13 +778,20 @@ public:
struct ncclDevWorkP2p* p2pWork = nullptr, int stepSize_ = 0, int mode = primsModeDefault
):
tid(tid), tidInBlock(threadIdx.x), nthreads(nthreads), /*compiler warnings*/
#ifdef ENABLE_WARP_SPEED
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_), group(ncclShmem.warpComm? tidInBlock / WARP_SIZE : group), threadsPerBlock(blockDim.x){
#else
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_), group(group), threadsPerBlock(blockDim.x){
#endif
barriers = &ncclShmem.groups[group].barrier;
// PAT uses the same barrier for each group
barriers_pat = &ncclShmem.barrier_pat;
this->nworkers = nthreads;
#ifdef ENABLE_WARP_SPEED
auto *channel = isMsccl(Metadata) ? &ncclShmem.channel : &ncclShmem.warpChannel[tidInBlock/WARP_SIZE];
#else
auto *channel = &ncclShmem.channel;
#endif
int peer = -1;
flags = 0;
index = -1;
@@ -831,9 +846,9 @@ public:
}
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(channel->peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);
if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(channel->peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);
// if (barrierAny(flags & NetDeviceUnpack)) {
// flags |= AnyNetDeviceUnpack;
@@ -861,7 +876,7 @@ public:
// Load recv peer
int recvPeer = mode == primsModePatRs ? (rank - delta + nranks) % nranks : (rank + delta) % nranks;
struct ncclPatPeer* peer = ((struct ncclPatPeer*)recvPeers)+tid;
struct ncclConnInfo* conn = peer->conn = ncclShmem.channel.peers[recvPeer]->recv+connIndexRecv;
struct ncclConnInfo* conn = peer->conn = channel->peers[recvPeer]->recv+connIndexRecv;
peer->step = conn->step;
peer->buff = conn->buffs[NCCL_PROTO_SIMPLE];
peer->stepCache = loadStepValue(peer->tailPtr = conn->tail);
@@ -871,7 +886,7 @@ public:
// Load send peer
int sendPeer = mode == primsModePatAg ? (rank - delta + nranks) % nranks : (rank + delta) % nranks;
peer = ((struct ncclPatPeer*)sendPeers)+tid;
conn = peer->conn = ncclShmem.channel.peers[sendPeer]->send+connIndexSend;
conn = peer->conn = channel->peers[sendPeer]->send+connIndexSend;
peer->step = conn->step;
peer->connFifo = conn->connFifo;
peer->buff = conn->buffs[NCCL_PROTO_SIMPLE];
+9
مشاهده پرونده
@@ -16,7 +16,12 @@ namespace {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#ifdef ENABLE_WARP_SPEED
int warp = threadIdx.x / WARP_SIZE;
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
#else
ncclRing *ring = &ncclShmem.channel.ring;
#endif
const int nranks = ncclShmem.comm.nRanks;
const int rank = ncclShmem.comm.rank;
const int prevRank = ring->userRanks[nranks-1];
@@ -24,7 +29,11 @@ namespace {
size_t chunkCount;
size_t channelCount;
size_t gridOffset;
#ifdef ENABLE_WARP_SPEED
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
#else
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
#endif
size_t offset;
int nelem;
@@ -16,14 +16,23 @@ namespace {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#ifdef ENABLE_WARP_SPEED
int warp = threadIdx.x / WARP_SIZE;
ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
#else
ncclRing *ring = &ncclShmem.channel.ring;
#endif
int const *ringRanks = ring->userRanks;
const int nranks = ncclShmem.comm.nRanks;
size_t count;
size_t gridOffset;
size_t channelCount;
size_t chunkCount;
#ifdef ENABLE_WARP_SPEED
ncclCollCbdPart(work, ncclShmem.warpChannelId[warp], Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
#else
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
#endif
size_t offset;
size_t dataOffset;
uint32_t nelem;
+35 -12
مشاهده پرونده
@@ -390,7 +390,7 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
devWork.rcclUseOneSlice = comm->rcclUseOneSlice;
//[Added-comment] opCount is missing for collDevWork, adding here
devWork.opCount = task->opCount;
devWork.isOneRPN = comm->isOneRPN;
devWork.netRegUsed = devWork.regUsed = 0;
devWork.gfx9CheapFenceOff = gfx9CheapFenceOff(devWork, comm->gfx9CheapFenceOff);
@@ -513,11 +513,11 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
WARN("%s: unsupported collective. Please ensure the collective has been enabled in build.", __func__);
return ncclInvalidUsage;
}
if (!rcclIsArchSupportedForFunc(&agg, comm->archName)) {
WARN("%s: unsupported architecture (%s) for collective %s(%s, %s, %s, %s, Acc=%d, Pipeline=%d).",
__func__, comm->archName,
ncclFuncToString(task->func), ncclAlgoToString(task->algorithm), ncclProtoToString(task->protocol),
WARN("%s: unsupported architecture (%s) for collective %s(%s, %s, %s, %s, Acc=%d, Pipeline=%d).",
__func__, comm->archName,
ncclFuncToString(task->func), ncclAlgoToString(task->algorithm), ncclProtoToString(task->protocol),
ncclDevRedOpToString(task->opDev.op), ncclDatatypeToString(task->datatype), (agg.acc != nullptr), agg.pipeline);
return ncclInvalidUsage;
}
@@ -541,6 +541,9 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
aggBeg->protocol = agg.protocol;
aggBeg->acc = agg.acc;
aggBeg->pipeline = agg.pipeline;
#ifdef ENABLE_WARP_SPEED
aggBeg->useWarpSpeed = agg.useWarpSpeed;
#endif
if (aggBeg->protocol == NCCL_PROTO_LL) aggBeg->trafficBytes *= 4;
aggBeg->nMaxChannels = agg.nMaxChannels;
aggBeg->nWarps = agg.nWarps;
@@ -764,10 +767,12 @@ static ncclResult_t scheduleCollTasksToPlan(
(countHi != 0 ? countHi : countLo) -= cells*elementsPerCell - task->count;
nChannels = (countLo!=0 ? 1 : 0) + nMidChannels + (cellsHi!=0 ? 1 : 0);
// Update number of channels propagated to the profiler
task->nChannels = (uint8_t)nChannels;
#ifdef ENABLE_WARP_SPEED
task->nChannels = nChannels;
#else
task->nChannels = (uint8_t) nChannels;
#endif
// Ensure room for worst case of one new batch per channel
if (!testBudget(budget, plan->nWorkBatches + nChannels, plan->workBytes + workNode->size)) {
return ncclSuccess;
@@ -1756,7 +1761,6 @@ NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
#endif
NCCL_PARAM(NvlinkUtilCentricSchedEnable, "NVLINK_UTIL_CENTRIC_SCHED_ENABLE", 0);
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) {
ncclResult_t ret = ncclSuccess;
struct ncclKernelPlanner* planner = &comm->planner;
@@ -1764,6 +1768,9 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
for (int i = 0; i < MAXCHANNELS/64; i++)
nChannels += countOneBits(plan->channelMask.masks[i]);
void* sym = plan->kernelFn;
#ifdef ENABLE_WARP_SPEED
rcclSetWarpSpeedSupportAndFinalCuCount(comm, plan, nChannels, plan->kernelArgs->comm->warpLevelComm, nChannels);
#endif
dim3 grid = {(unsigned)nChannels, 1, 1};
dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
int smem = rcclShmemDynamicSize(comm->cudaArch, comm->WarpSize);
@@ -1883,8 +1890,8 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern
// hostStreamPlanTask directly
NCCLCHECK(hostStreamPlanTask(comm, plan));
}
// Increment the opCount for intranode comms as well. Previously if proxyOpQueue was empty
// Increment the opCount for intranode comms as well. Previously if proxyOpQueue was empty
// opCount was not incremented because ncclProxyStart wasn't called in hostStreamPlanTask
if (!plan->persistent && ncclIntruQueueHead(&plan->proxyOpQueue) == nullptr) {
comm->opCount++;
@@ -2095,8 +2102,11 @@ static ncclResult_t topoGetAlgoInfo(
rcclSetPipelining(comm, nBytes, info);
if (simInfo) simInfo->estimatedTime = time;
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);
#ifdef ENABLE_WARP_SPEED
int nc = comm->topo->warpSpeedEnabled? comm->nChannels / 2 : comm->nChannels;
#else
int nc = comm->nChannels;
#endif
int nt = comm->maxThreads[info->algorithm][info->protocol];
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
@@ -2167,7 +2177,13 @@ static ncclResult_t topoGetAlgoInfo(
}
} else if (info->func == ncclFuncAllReduce && comm->topo->treeDefined == 1) {
info->algorithm = NCCL_ALGO_TREE;
#ifdef ENABLE_WARP_SPEED
nc = std::min(nc, 64); // Tree uses at most 64 channels as we don't support WarpSpeed Tree.
} else if (info->algorithm == NCCL_ALGO_TREE) {
nc = std::min(nc, 64); // Tree uses at most 64 channels as we don't support WarpSpeed Tree.
#else
info->nMaxChannels = nc;
#endif
} else {
info->nMaxChannels = nc;
}
@@ -2180,6 +2196,13 @@ static ncclResult_t topoGetAlgoInfo(
info->nWarps = nt/comm->WarpSize;
rcclOverrideAlgorithm(ncclAlgoStr, table, info);
rcclOverrideProtocol(ncclProtoStr, table, info);
#ifdef ENABLE_WARP_SPEED
rcclSetWarpSpeedAuto(comm, info, nBytes);
if(info->useWarpSpeed) {
rcclSetWarpSpeedCUs(comm, info->algorithm, info->nWarps * comm->WarpSize, nc);
}
info->nMaxChannels = nc;
#endif
return ncclSuccess;
}
+17 -5
مشاهده پرونده
@@ -721,6 +721,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
int shared = parent && parent->nvlsSupport && parent->shareResources;
int maxChannels;
int minNchannels, maxNchannels;
int duplicateCount = 1;
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
@@ -804,19 +805,30 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
}
}
#ifdef ENABLE_WARP_SPEED
// Only use full MAXCHANNELS for gfx942 (MI300X) and gfx950
maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") ||
IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))
? MAXCHANNELS : 2*CHANNEL_LIMIT;
#else
// Only use full MAXCHANNELS for gfx942 (MI300X) and gfx950
maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") ||
IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))
? std::min(comm->topo->nodes[GPU].nodes[0].gpu.cu, MAXCHANNELS) : 2*CHANNEL_LIMIT;
if (graphs[NCCL_ALGO_RING]->nIntraChannels > 0 || comm->nNodes > 1) {
maxChannels = std::min(64, maxChannels);
}
#endif
// Duplicate ringPrev/ringNext for ncclBuildRing
if (nChannels <= maxChannels/2) memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
if (nChannels <= maxChannels/2) memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
duplicateCount = maxChannels / nChannels;
if (duplicateCount > 1) {
int limit = duplicateCount;
for (int dup = 1; dup < limit; ++dup) {
memcpy(ringPrev + dup * nChannels * nranks, ringPrev, nChannels * nranks * sizeof(int));
memcpy(ringNext + dup * nChannels * nranks, ringNext, nChannels * nranks * sizeof(int));
}
}
// Get number of channels after duplication
maxNchannels = std::min((int)ncclMaxNchannels(), maxChannels);
nc = std::min(maxNchannels/comm->nChannels, nc);
+3
مشاهده پرونده
@@ -207,6 +207,9 @@ struct ncclTopoSystem {
int pivotA2ANumBiRings;
bool treeDefined;
bool ll128Enabled;
#ifdef ENABLE_WARP_SPEED
bool warpSpeedEnabled;
#endif
float baseBw;
bool mscclEnabled;
+7 -2
مشاهده پرونده
@@ -205,7 +205,12 @@ struct ncclTaskColl {
int chunkSteps, sliceSteps;
// Computed later:
size_t trafficBytes;
#ifdef ENABLE_WARP_SPEED
int32_t nMaxChannels:16;
bool useWarpSpeed;
#else
int32_t nMaxChannels:8;
#endif
int32_t nWarps:8;
int32_t algorithm:8, protocol:8, pipeline:8;
uint32_t isCollnet:1, isNvls:1;
@@ -550,7 +555,7 @@ struct ncclComm {
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
uint64_t minMaxLLRange[RCCL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][RCCL_PROTOCOL_ENTRY_SIZE];
uint64_t minMaxChannelThresholds[RCCL_TUNABLE_COLLS][RCCL_CHANNELS_TUNABLE_ENTRIES][3]; //for each collective, set for 5 channel-counts: 32,40,48,56,64, the two values for min/max size-threshold
uint64_t minMaxChannelThresholds[RCCL_TUNABLE_COLLS][RCCL_CHANNELS_TUNABLE_ENTRIES][3]; //for each collective, set for 5 channel-counts: 32,40,48,56,64, the two values for min/max size-threshold
/* This attribute can indicate the states of communicators and return code of
* asynchronous NCCL operations. */
@@ -719,7 +724,7 @@ struct ncclComm {
char* archName;
// multiProcessorCount from hipDeviceProp_t [RCCL]
int cuCount;
uint64_t endMagic;
};
+25 -7
مشاهده پرونده
@@ -126,8 +126,12 @@ union ncclLLFifoLine {
#define NCCL_MAX_GROUPS (NCCL_MAX_NTHREADS/WARP_SIZE)
#endif
#ifdef ENABLE_WARP_SPEED
#define MAXCHANNELS 512
#else
#define MAXCHANNELS 128
#define CHANNEL_LIMIT 16
#endif
#define CHANNEL_LIMIT 16 // this is used to limit channels for pre MI3xx GPUs
#define NCCL_MAX_LOCAL_RANKS 72
#define NCCL_MIN_NTHREADS (4*WARP_SIZE)
#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
@@ -354,7 +358,11 @@ inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int chann
struct alignas(16) ncclDevWorkColl {
// Running on channels [channelLo..channelHi], hi is inclusive.
// nChannels == (channelHi - channelLo) + 1
#ifdef ENABLE_WARP_SPEED
uint32_t channelLo:16, channelHi:16;
#else
uint32_t channelLo:8, channelHi:8;
#endif
uint32_t nWarps:8;
uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1, gfx9CheapFenceOff:1;
uint32_t root:30, connIndex:2;
@@ -573,7 +581,7 @@ struct ncclDevComm {
int p2pChunkSize;
int isAllNvlink;
int p2pnChannelsPerPeer;
int warpLevelComm;
int* collNetDenseToUserRank;
// Flag to ask NCCL kernels to abort
@@ -637,11 +645,6 @@ struct alignas(16) ncclDevKernelArgs {
// struct ncclDevWorkBatch batches[];
};
__host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) {
//return (cudaArch < 700 || cudaDriver < 12010) ? 4<<10 : (32<<10)-4;
return 4<<10;
}
template<size_t capacity>
struct alignas(16) ncclDevKernelArgsStorage {
union {
@@ -650,9 +653,24 @@ struct alignas(16) ncclDevKernelArgsStorage {
};
};
typedef ncclDevKernelArgsStorage<(5<<10)> ncclDevKernelArgs5K;
typedef ncclDevKernelArgsStorage<(4<<10)> ncclDevKernelArgs4K;
//typedef ncclDevKernelArgsStorage<(32<<10)-4> ncclDevKernelArgs31K;
#ifdef ENABLE_WARP_SPEED
// needed extra storage for accomodating more channels than 128 for WarpSpeed support
// 256 channels (i.e. 256 warps) would hang without this extra storage
// 5KB should be sufficient for now
typedef ncclDevKernelArgs5K ncclDevKernelArgsDefaultStorage;
#else
typedef ncclDevKernelArgs4K ncclDevKernelArgsDefaultStorage;
#endif
__host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) {
//return (cudaArch < 700 || cudaDriver < 12010) ? 4<<10 : (32<<10)-4;
return sizeof(ncclDevKernelArgsDefaultStorage);
}
template<typename T>
__host__ __device__ constexpr T min_constexpr(T a) { return a; }
template<typename T, typename ...Ts>
@@ -118,4 +118,9 @@ ncclResult_t commSetUnrollFactor(struct ncclComm* comm);
bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion, int firmwareVersion, const char* archName);
int parseFirmwareVersion();
bool rcclIsArchSupportedForFunc(struct ncclTaskColl* info, char const* archName);
#ifdef ENABLE_WARP_SPEED
void rcclSetWarpSpeedCUs(struct ncclComm* comm, int algo, int threadsPerBlock, int& rcclWarpSpeedChannels);
void rcclSetWarpSpeedSupportAndFinalCuCount(struct ncclComm* comm, struct ncclKernelPlan* plan, int nChannels, int& support, int &cuCount);
void rcclSetWarpSpeedAuto(struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes);
#endif
#endif
+20 -8
مشاهده پرونده
@@ -168,7 +168,7 @@ ncclResult_t checkHostUncacheMemSetting(struct ncclComm* comm) {
else {
return ncclSuccess;
}
#endif
#endif
}
static void initOnceFunc() {
@@ -1075,7 +1075,10 @@ NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 0);
NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0);
#ifdef ENABLE_WARP_SPEED
extern int64_t rcclParamWarpSpeedEnable();
extern int64_t rcclParamWarpSpeedAutoMode();
#endif
// MNNVL: Flag to indicate whether to enable Multi-Node NVLink
NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2);
@@ -1453,8 +1456,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
allGather3Data[rank].nc = 4;
}
}
#ifdef ENABLE_WARP_SPEED
comm->topo->warpSpeedEnabled = (rcclParamWarpSpeedEnable() != 0 || rcclParamWarpSpeedAutoMode() != 0);
#endif
// For single node communicators that do not uses the full xgmi links per gpu, i.e., nranks < 8
// Inflate the nChannels a bit to achieve higher b/w.
// Inflate the nChannels a bit to achieve higher b/w.
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) {
if (nranks == 2 && nNodes == 1){
allGather3Data[rank].nc = 16;
@@ -1464,7 +1471,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
allGather3Data[rank].nc = 4;
}
}
#ifdef ENABLE_WARP_SPEED
// Double default channels for WarpSpeed enabled communicators
if (comm->topo->warpSpeedEnabled) {
allGather3Data[rank].nc *= 2;
}
#endif
allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable();
comm->topo->ll128Enabled = comm->topo->ll128Enabled || rcclParamLL128ForceEnable();
allGather3Data[rank].ll128Enabled = comm->topo->ll128Enabled;
@@ -1817,8 +1829,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
// Compute time models for algorithm and protocol combinations
NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
INFO(NCCL_INIT, "comm:%p, nRanks:%d, nNodes:%d, coll channels:%d collnet channels:%d, nvls channels:%d, p2p channels:%d, p2p channels per peer:%d", comm, comm->nRanks, comm->nNodes, comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
INFO(NCCL_INIT, "comm:%p, nRanks:%d, nNodes:%d, coll channels:%d collnet channels:%d, nvls channels:%d, p2p channels:%d, p2p channels per peer:%d", comm, comm->nRanks, comm->nNodes, comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
if (comm->intraRank == 0) { // Load ncclParamLaunchMode
const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
enum ncclLaunchMode mode, modeOld;
@@ -2075,10 +2087,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
comm->cuCount = cuCount;
NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail);
// Check if using host uncached mem correctly
NCCLCHECK(checkHostUncacheMemSetting(comm));
// RCCL: determine and set unroll factor for comm
NCCLCHECK(commSetUnrollFactor(comm));
+123 -1
مشاهده پرونده
@@ -34,6 +34,14 @@ RCCL_PARAM(PipelineAllDTypes, "PIPELINE_ALL_DATA_TYPES", 0);
// Otherwise, it is automatically set for certain archs, datatypes and reduction collectives
RCCL_PARAM(disableReduceCopyPipelining, "DISABLE_REDUCE_COPY_PIPELINING", 0);
RCCL_PARAM(DirectAllGatherThreshold, "DIRECT_ALLGATHER_THRESHOLD", 75497472);
RCCL_PARAM(ThreadsPerBlock, "THREADS_PER_BLOCK", -1);
RCCL_PARAM(UnrollFactor, "UNROLL_FACTOR", -1);
#ifdef ENABLE_WARP_SPEED
RCCL_PARAM(WarpSpeedCuCount, "WARP_SPEED_CU_COUNT", 0);
RCCL_PARAM(WarpSpeedAutoMode, "WARP_SPEED_AUTO", 0);
RCCL_PARAM(WarpSpeedEnable, "WARP_SPEED_ENABLE", 0);
#endif
#define RCCL_WARP_SPEED_MIN_BYTES (1ULL << 26) // 64 MB
void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info) {
// Honor user input for protocol choice
@@ -162,6 +170,10 @@ ncclResult_t rcclOverrideChannels(struct ncclComm* comm, ncclFunc_t coll, size_t
}
}
#ifdef ENABLE_WARP_SPEED
// fallback to max 64 channels and tune warp speed channels later
nc = std::min(nc, 64);
#endif
return ncclSuccess;
}
@@ -295,7 +307,11 @@ ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t co
NCCLCHECK(getAlgoInfo(comm, &task, collNetSupport, nvlsSupport, numPipeOps));
*algo = task.algorithm;
*protocol = task.protocol;
#ifdef ENABLE_WARP_SPEED
*maxChannels = task.useWarpSpeed? task.nMaxChannels / task.nWarps : task.nMaxChannels;
#else
*maxChannels = task.nMaxChannels;
#endif
return ncclSuccess;
}
@@ -398,6 +414,88 @@ void rcclSetP2pNetChunkSize(struct ncclComm* comm, int& rcclP2pNetChunkSize) {
}
rcclP2pNetChunkSize = p2pNetChunkSize;
}
#ifdef ENABLE_WARP_SPEED
void rcclSetWarpSpeedCUs(struct ncclComm* comm, int algo, int threadsPerBlock, int& rcclWarpSpeedChannels) {
static int userChannelControlInput = RCCL_VALUE_UNSET;
int warpsPerBlock = threadsPerBlock / comm->WarpSize;
// only adjust channels for RING algorithm
if(algo != NCCL_ALGO_RING) {
return;
}
if (userChannelControlInput == RCCL_VALUE_UNSET) {
const char *inputStr = getenv("NCCL_THREAD_THRESHOLDS");
if (!inputStr) {
inputStr = getenv("NCCL_MAX_NCHANNELS");
}
if (!inputStr) {
inputStr = getenv("NCCL_MIN_NCHANNELS");
}
userChannelControlInput = !inputStr ? 0 : 1;
}
if(!userChannelControlInput && comm->topo->warpSpeedEnabled) {
if(rcclParamWarpSpeedCuCount() != 0) {
rcclWarpSpeedChannels = rcclParamWarpSpeedCuCount() * warpsPerBlock;
INFO(NCCL_INIT, "RCCL Warp CU count set to user defined %d resulting in %d channels", rcclParamWarpSpeedCuCount(), rcclWarpSpeedChannels);
return;
}
// reuse the existing channel tuning logic if possible
if (comm->nNodes == 1) {
rcclWarpSpeedChannels = rcclWarpSpeedChannels * warpsPerBlock / 2; // use 50% CUs for single node case
} else {
rcclWarpSpeedChannels = std::min(256, rcclWarpSpeedChannels * warpsPerBlock);
}
INFO(NCCL_INIT, "RCCL Warp Speed Channels set to %d", rcclWarpSpeedChannels);
}
}
void rcclSetWarpSpeedSupportAndFinalCuCount(struct ncclComm* comm, struct ncclKernelPlan* plan, int nChannels, int& support, int &cuCount) {
if(!comm->topo->warpSpeedEnabled) {
support = 0;
cuCount = nChannels;
return;
}
// WarpSpeed is not supported currently for the following cases:
// 1. if any work batch in the plan contains P2P work
// 2. or any collective task is not using RING algorithm
bool hasP2p = !ncclIntruQueueEmpty(&plan->p2pTaskQueue);
bool hasNonRing = false;
struct ncclTaskColl* task = ncclIntruQueueHead(&plan->collTaskQueue);
while (task != nullptr) {
if (task->algorithm != NCCL_ALGO_RING || !(task->useWarpSpeed)) {
hasNonRing = true;
break;
}
task = task->next;
}
int warpsPerBlock = plan->threadPerBlock / comm->WarpSize;
support = (hasP2p || hasNonRing) ? 0 : 1;
cuCount = (support == 0)? nChannels : nChannels / warpsPerBlock + ((nChannels % warpsPerBlock) != 0 ? 1 : 0); // each CU can handle warpsPerBlock
}
void rcclSetWarpSpeedAuto(struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes) {
info->useWarpSpeed = false;
if(!comm->topo->warpSpeedEnabled) {
return;
}
info->useWarpSpeed = (info->algorithm == NCCL_ALGO_RING); // Enabled by default for any RING algorithm when platform supports it
if(rcclParamWarpSpeedAutoMode() != 0 && IsArchMatch(comm->archName, "gfx950")) { // Auto mode only available for gfx950 currently
size_t minBytes = 0;
if(info->func == ncclFuncAllReduce || info->func == ncclFuncAllGather) minBytes = RCCL_WARP_SPEED_MIN_BYTES;
else if (info->func == ncclFuncReduceScatter) minBytes = RCCL_WARP_SPEED_MIN_BYTES << 2; // ReduceScatter requires higher message size to benefit from WarpSpeed
if(comm->nNodes == 1) {
if(nBytes >= minBytes && minBytes > 0) {
comm->unroll = NCCL_UNROLL_2;
info->nWarps = 4;
}
} else {
// TODO: set unroll factor per task rather than per comm
commSetUnrollFactor(comm);
info->useWarpSpeed = false;
}
}
}
#endif
void rcclGetMaxNthreads(struct ncclComm* comm, int maxNthreads[]) {
if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
@@ -411,12 +509,27 @@ void rcclGetMaxNthreads(struct ncclComm* comm, int maxNthreads[]) {
void rcclOptThreadBlockSize(struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes, int& nThreads) {
static int maxNthreads[NCCL_NUM_PROTOCOLS] = {0};
if (maxNthreads[NCCL_PROTO_SIMPLE] == 0) rcclGetMaxNthreads(comm, maxNthreads);
if(rcclParamThreadsPerBlock() != -1) {
nThreads = rcclParamThreadsPerBlock();
if(nThreads % comm->WarpSize != 0) {
nThreads = ((nThreads / comm->WarpSize) + 1) * comm->WarpSize;
INFO(NCCL_INIT, "RCCL Threads per block adjusted to %d to be multiple of warp size %d", nThreads, comm->WarpSize);
}
if(nThreads > maxNthreads[NCCL_PROTO_SIMPLE]) {
nThreads = maxNthreads[NCCL_PROTO_SIMPLE];
INFO(NCCL_INIT, "RCCL Threads per block reduced to %d to match max threads", nThreads);
} else if (nThreads < 3 * comm->WarpSize) {
nThreads = 3 * comm->WarpSize; // min requirement for tree
INFO(NCCL_INIT, "RCCL Threads per block increased to %d to be at least one warp", nThreads);
}
return;
}
if (info->algorithm == NCCL_ALGO_TREE) nThreads = maxNthreads[NCCL_PROTO_SIMPLE]; // Tree now uses all threads always.
if (info->algorithm == NCCL_ALGO_PAT) nThreads = maxNthreads[NCCL_PROTO_SIMPLE];
if (comm->nNodes == 1) nThreads = RCCL_SINGLE_NODE_MAX_NTHREADS; // For single node, we use half the number of threads for perf reasons.
// The following should be already set correctly by getNthreads
// but need to override the changes for TREE and PAT in the previous lines
if (info->protocol == NCCL_PROTO_LL) nThreads = maxNthreads[NCCL_PROTO_LL];
else if (info->protocol == NCCL_PROTO_LL) nThreads = maxNthreads[NCCL_PROTO_LL];
// ReduceScatter small count optimization
if (info->func == ncclFuncReduceScatter && divUp(nBytes, comm->nRanks) <= 524288) nThreads = maxNthreads[NCCL_PROTO_LL];
}
@@ -436,6 +549,15 @@ ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count,
}
ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
if( rcclParamUnrollFactor() != -1 ) {
comm->unroll = rcclParamUnrollFactor(); //-1 to map to 0 based indexing
if(comm->unroll < NCCL_UNROLL_1 || comm->unroll >= NCCL_NUM_UNROLLS) {
WARN("Invalid RCCL_UNROLL_FACTOR %d specified. Valid values are 0 to 2 corresponding to unroll factors of 1, 2, and 4 respectively.", comm->unroll);
return ncclInvalidArgument;
}
INFO(NCCL_INIT, "RCCL Unroll Factor (user set): %d", (int) (pow(2.0, (double)comm->unroll)));
return ncclSuccess;
}
if(IsArchMatch(comm->archName, "gfx950")) {
if(comm->nNodes == 1)
comm->unroll = NCCL_UNROLL_1;