IFC mix build (#998)

2023-12-02 18:49:52 -07:00
@@ -18,7 +18,7 @@ option(BUILD_SHARED_LIBS                       "Build as shared library"
 option(BUILD_TESTS                             "Build unit test programs"                      OFF)
 option(COLLTRACE                               "Collective Trace Option"                       ON)
 option(ENABLE_MSCCL_KERNEL                     "Enable MSCCL while compiling"                  ON)
-option(ENABLE_IFC                              "Enable indirect function call"                 OFF)
+option(ENABLE_IFC                              "Enable indirect function call"                 ON)
 option(INSTALL_DEPENDENCIES                    "Force install dependencies"                    OFF)
 option(PROFILE                                 "Enable profiling"                              OFF)
 option(TIMETRACE                               "Enable time-trace during compilation"          OFF)
@@ -11,7 +11,7 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
  __device__ void runRing(ncclWorkElem *args) {
 #else
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
@@ -15,7 +15,7 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
  __device__ void runRing(ncclWorkElem *args) {
 #else
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
@@ -223,7 +223,7 @@ namespace {
  }

  template<typename T, typename RedOp, typename Proto>
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
  __device__ void runTreeUpDown(ncclWorkElem *args) {
 #else
  __device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) {
@@ -379,7 +379,7 @@ namespace {
  }

  template<typename T, typename RedOp, typename Proto>
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
  __device__ void runTreeSplit(ncclWorkElem *args) {
 #else
  __device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) {
@@ -10,7 +10,7 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
  __device__ void runRing(ncclWorkElem *args) {
 #else
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
@@ -10,7 +10,7 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
  __device__ void runRing(ncclWorkElem *args) {
 #else
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
@@ -130,7 +130,7 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
 static_assert(FUNC_INDEX_P2P == 5410, "Wrong P2P function index");
 static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 5411, "Wrong AllToAllPivot function index");

-#ifndef USE_INDIRECT_FUNCTION_CALL
+#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
 template<unsigned short f, unsigned short l, bool u>
 struct Caller {
  static __forceinline__ __device__ __host__
@@ -274,7 +274,7 @@ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
 template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL>
 class ncclFunction {
  public:
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
  __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
 #else
  __device__ void run(struct ncclWorkElem* args) {}
@@ -568,7 +568,7 @@ __forceinline__ __device__ void ncclKernel(
    if (ncclShmem.work.header.funcIndex == FnIndex) {
      RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
    } else {
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
      ncclFuncs[ncclShmem.work.header.funcIndex]();
 #else
 #ifdef ENABLE_LL128
@@ -627,7 +627,7 @@ __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDev
 // Examples :     AllReduce, RING, LL,    Sum,   uint8
 /* Functions for aggregation case */

-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
 #define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
 __device__  void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
  RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
@@ -12,7 +12,7 @@

 namespace {
  template<typename T, typename RedOp>
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
  __device__ void oneRankReduce() {
 #else
  __device__ __attribute__((noinline)) void oneRankReduce() {
@@ -48,7 +48,7 @@ namespace {
  }
 }

-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
 #define INSTANTIATE(devredop, type) \
  __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \
    oneRankReduce<type, Func##devredop<type>>(); \
@@ -124,7 +124,7 @@ union alignas(16) BytePack<16> {
  uint32_t u32[4];
  uint64_t u64[2];
  ulong2 ul2, native;
-#ifndef USE_INDIRECT_FUNCTION_CALL
+#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
  inline __device__ BytePack<16>& operator=(BytePack<16> other) {
    u64[0] = other.u64[0];
    u64[1] = other.u64[1];
@@ -11,7 +11,7 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
  __device__ void runRing(ncclWorkElem *args) {
 #else
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
@@ -11,7 +11,7 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
  __device__ void runRing(ncclWorkElem *args) {
 #else
  __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
@@ -174,7 +174,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
    }
  }

-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
  __device__  void run(ncclWork *work) {
 #else
  __device__  __attribute__((noinline)) void run(ncclWork *work) {
@@ -39,7 +39,7 @@ struct ncclDevRedOpFull {
  nccl##func##algo##proto

 /* Declare all collective operations */
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
 #define DECL5(func, algo, proto, devredop, type) \
  extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
  extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
@@ -1688,6 +1688,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
  int* parentRanks = NULL;
  int cudaArch;
  int64_t stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
+  hipDeviceProp_t devProp;

  CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
  CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail);
@@ -1698,7 +1699,8 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
  // Set the maximum kernel stack size of all kernels to avoid
  // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
 #ifdef USE_INDIRECT_FUNCTION_CALL
-  if (stackSize > 0 && ncclParamSetStackSize() == 1) {
+  CUDACHECK(hipGetDeviceProperties(&devProp, 0));
+  if (stackSize > 0 && ncclParamSetStackSize() == 1 && devProp.gcnArch != 940 && devProp.gcnArch != 941 && devProp.gcnArch != 942) {
    INFO(NCCL_INIT, "Setting cudaLimitStackSize to %zi maxLocalSizeBytes %zi", stackSize, maxLocalSizeBytes);
    CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, stackSize));
  }