diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt index d7c0e6e393..ddf624db1e 100644 --- a/projects/rccl/CMakeLists.txt +++ b/projects/rccl/CMakeLists.txt @@ -463,13 +463,21 @@ foreach(target ${AMDGPU_TARGETS}) endforeach() if("${HIP_COMPILER}" MATCHES "clang") - target_compile_options(rccl PRIVATE -fvisibility=hidden --hipcc-func-supp) + find_program( hipcc_executable hipcc ) + execute_process(COMMAND bash "-c" "${hipcc_executable} --version | grep 'HIP version' | awk -F\" \" '{ printf $3}' | awk -F\"-\" '{ printf $1}'" OUTPUT_VARIABLE hipcc_version_string) + message(STATUS "hipcc version: ${hipcc_version_string}") + if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201") + add_definitions(-DUSE_INDIRECT_FUNCTION_CALL) + target_compile_options(rccl PRIVATE -fvisibility=hidden) + message(STATUS "Indirect function call enabled") + else() + target_compile_options(rccl PRIVATE -fvisibility=hidden --hipcc-func-supp) + endif() foreach(target ${AMDGPU_TARGETS}) target_compile_options(rccl PRIVATE -fgpu-rdc) endforeach() target_link_libraries(rccl PRIVATE -fgpu-rdc) target_include_directories(rccl PRIVATE ${ROCM_PATH}/include) - find_program( hipcc_executable hipcc ) execute_process(COMMAND bash "-c" "${hipcc_executable} -help | grep 'parallel-jobs'" OUTPUT_VARIABLE hipcc_parallel_jobs) if("${hipcc_parallel_jobs}" MATCHES "parallel-jobs") target_compile_options(rccl PRIVATE -parallel-jobs=12 PRIVATE -Wno-format-nonliteral) diff --git a/projects/rccl/src/collectives/device/all_gather.h b/projects/rccl/src/collectives/device/all_gather.h index 9307cddee5..d86ae73dbc 100644 --- a/projects/rccl/src/collectives/device/all_gather.h +++ b/projects/rccl/src/collectives/device/all_gather.h @@ -11,7 +11,11 @@ namespace { template +#ifdef USE_INDIRECT_FUNCTION_CALL + __device__ void runRing(ncclWorkElem *args) { +#else __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { +#endif const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; diff --git a/projects/rccl/src/collectives/device/all_reduce.h b/projects/rccl/src/collectives/device/all_reduce.h index c2c40a9ad2..6eb194444e 100644 --- a/projects/rccl/src/collectives/device/all_reduce.h +++ b/projects/rccl/src/collectives/device/all_reduce.h @@ -15,7 +15,11 @@ namespace { template +#ifdef USE_INDIRECT_FUNCTION_CALL + __device__ void runRing(ncclWorkElem *args) { +#else __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { +#endif const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; @@ -219,7 +223,11 @@ namespace { } template +#ifdef USE_INDIRECT_FUNCTION_CALL + __device__ void runTreeUpDown(ncclWorkElem *args) { +#else __device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) { +#endif const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; @@ -371,7 +379,11 @@ namespace { } template +#ifdef USE_INDIRECT_FUNCTION_CALL + __device__ void runTreeSplit(ncclWorkElem *args) { +#else __device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) { +#endif const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; diff --git a/projects/rccl/src/collectives/device/alltoall_pivot.h b/projects/rccl/src/collectives/device/alltoall_pivot.h index 0ffa7caee9..93dba6d888 100644 --- a/projects/rccl/src/collectives/device/alltoall_pivot.h +++ b/projects/rccl/src/collectives/device/alltoall_pivot.h @@ -10,7 +10,11 @@ namespace { template +#ifdef USE_INDIRECT_FUNCTION_CALL + __device__ void runRing(ncclWorkElem *args) { +#else __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { +#endif const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; diff --git a/projects/rccl/src/collectives/device/broadcast.h b/projects/rccl/src/collectives/device/broadcast.h index 498143b6d9..2e1fc22496 100644 --- a/projects/rccl/src/collectives/device/broadcast.h +++ b/projects/rccl/src/collectives/device/broadcast.h @@ -10,7 +10,11 @@ namespace { template +#ifdef USE_INDIRECT_FUNCTION_CALL + __device__ void runRing(ncclWorkElem *args) { +#else __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { +#endif const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; diff --git a/projects/rccl/src/collectives/device/common.h b/projects/rccl/src/collectives/device/common.h index 9f53b305d4..9065def46e 100644 --- a/projects/rccl/src/collectives/device/common.h +++ b/projects/rccl/src/collectives/device/common.h @@ -199,6 +199,10 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs_ll128[]{ #endif }; +static_assert(FUNC_INDEX_P2P == 3610, "Wrong P2P function index"); +static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 3611, "Wrong AllToAllPivot function index"); + +#ifndef USE_INDIRECT_FUNCTION_CALL template struct Caller { static __forceinline__ __device__ __host__ @@ -216,9 +220,6 @@ struct Caller{ void call(unsigned short funcIndex) noexcept { if (u) ncclFuncs_ll128[f](); else ncclFuncs[f](); } }; -static_assert(FUNC_INDEX_P2P == 3610, "Wrong P2P function index"); -static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 3611, "Wrong AllToAllPivot function index"); - template __forceinline__ __device__ @@ -340,11 +341,16 @@ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept { } #endif } +#endif template class ncclFunction { public: +#ifdef USE_INDIRECT_FUNCTION_CALL __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {} +#else + __device__ void run(struct ncclWorkElem* args) {} +#endif }; #ifdef ENABLE_COLLTRACE @@ -663,7 +669,12 @@ __forceinline__ __device__ void ncclKernel( if (ncclShmem.work.header.funcIndex == FnIndex) { RunWork().run(&ncclShmem.work); } else { +#ifdef USE_INDIRECT_FUNCTION_CALL + if (USING_LL128) ncclFuncs_ll128[ncclShmem.work.header.funcIndex](); + else ncclFuncs[ncclShmem.work.header.funcIndex](); +#else NCCL_CALL_FUNCTIONS(ncclShmem.work.header.funcIndex); +#endif } int workIxNext = ncclShmem.work.header.workNext; @@ -714,10 +725,18 @@ __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(st // Examples : AllReduce, RING, LL, Sum, uint8 /* Functions for aggregation case */ + +#ifdef USE_INDIRECT_FUNCTION_CALL +#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \ +__device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \ + RunWork, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \ +} +#else #define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \ __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \ RunWork, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \ } +#endif // Only generate inline kernels for LL #define IMPL_COLL4(func, algo, devredop, type, ncclType) \ diff --git a/projects/rccl/src/collectives/device/onerank_reduce.cu b/projects/rccl/src/collectives/device/onerank_reduce.cu index 256b236386..569c4c7a0f 100644 --- a/projects/rccl/src/collectives/device/onerank_reduce.cu +++ b/projects/rccl/src/collectives/device/onerank_reduce.cu @@ -12,7 +12,11 @@ namespace { template +#ifdef USE_INDIRECT_FUNCTION_CALL + __device__ void oneRankReduce() { +#else __device__ __attribute__((noinline)) void oneRankReduce() { +#endif ncclWork *w = &ncclShmem.work; int tid = threadIdx.x; int tn = blockDim.x; @@ -42,10 +46,17 @@ namespace { } } +#ifdef USE_INDIRECT_FUNCTION_CALL #define INSTANTIATE(devredop, type) \ __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \ oneRankReduce>(); \ } +#else +#define INSTANTIATE(devredop, type) \ + __device__ __attribute__((noinline)) void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \ + oneRankReduce>(); \ + } +#endif INSTANTIATE(PreMulSum, int8_t) INSTANTIATE(PreMulSum, uint8_t) diff --git a/projects/rccl/src/collectives/device/reduce.h b/projects/rccl/src/collectives/device/reduce.h index ac40072921..790eca0efa 100644 --- a/projects/rccl/src/collectives/device/reduce.h +++ b/projects/rccl/src/collectives/device/reduce.h @@ -11,7 +11,11 @@ namespace { template +#ifdef USE_INDIRECT_FUNCTION_CALL + __device__ void runRing(ncclWorkElem *args) { +#else __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { +#endif const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; diff --git a/projects/rccl/src/collectives/device/reduce_scatter.h b/projects/rccl/src/collectives/device/reduce_scatter.h index 9600a900dc..212444dc28 100644 --- a/projects/rccl/src/collectives/device/reduce_scatter.h +++ b/projects/rccl/src/collectives/device/reduce_scatter.h @@ -11,7 +11,11 @@ namespace { template +#ifdef USE_INDIRECT_FUNCTION_CALL + __device__ void runRing(ncclWorkElem *args) { +#else __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { +#endif const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; diff --git a/projects/rccl/src/collectives/device/sendrecv.h b/projects/rccl/src/collectives/device/sendrecv.h index 4131f97959..6006f32fa0 100644 --- a/projects/rccl/src/collectives/device/sendrecv.h +++ b/projects/rccl/src/collectives/device/sendrecv.h @@ -175,7 +175,11 @@ struct RunWork { } } +#ifdef USE_INDIRECT_FUNCTION_CALL + __device__ void run(ncclWork *work) { +#else __device__ __attribute__((noinline)) void run(ncclWork *work) { +#endif struct ncclWorkElemP2p* args = work->p2pElems; int ngroups = args->ngroups; int tid = threadIdx.x; diff --git a/projects/rccl/src/include/collectives.h b/projects/rccl/src/include/collectives.h index 24b79fdb5e..fbad473a70 100644 --- a/projects/rccl/src/include/collectives.h +++ b/projects/rccl/src/include/collectives.h @@ -45,12 +45,21 @@ struct ncclDevRedOpFull { nccl##func##algo##proto /* Declare all collective operations */ +#ifdef USE_INDIRECT_FUNCTION_CALL +#define DECL5(func, algo, proto, devredop, type) \ + extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ + extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ + extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ + extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ + extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); +#else #define DECL5(func, algo, proto, devredop, type) \ extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); +#endif #define SINGLE_ARG(...) __VA_ARGS__ #define CONCAT(a,b) a##b diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 68d3d16e94..e0f803f4eb 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -1414,7 +1414,13 @@ fail: goto exit; } +#ifdef USE_INDIRECT_FUNCTION_CALL +NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 1); +RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 8); +#else NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0); +RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 0); +#endif struct ncclCommInitRankAsyncJob { struct ncclAsyncJob base; @@ -1440,14 +1446,17 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { int cudaDev = job->cudaDev; int virtualId = job->virtualId; ncclResult_t res = ncclSuccess; + int64_t stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes; CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); // Set the maximum kernel stack size of all kernels to avoid // a CUDA memory reconfig on load (c.f. NVSHMEM issue) - if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { - TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes); - //CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes)); +#ifdef USE_INDIRECT_FUNCTION_CALL + if (stackSize > 0 && ncclParamSetStackSize() == 1) { + INFO(NCCL_INIT, "Setting cudaLimitStackSize to %zi maxLocalSizeBytes %zi", stackSize, maxLocalSizeBytes); + CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, stackSize)); } +#endif NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank, virtualId), res, fail); NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, fail);