diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt
index d7c0e6e393..ddf624db1e 100644
--- a/projects/rccl/CMakeLists.txt
+++ b/projects/rccl/CMakeLists.txt
@@ -463,13 +463,21 @@ foreach(target ${AMDGPU_TARGETS})
 endforeach()
 
 if("${HIP_COMPILER}" MATCHES "clang")
-  target_compile_options(rccl PRIVATE -fvisibility=hidden --hipcc-func-supp)
+  find_program( hipcc_executable hipcc )
+  execute_process(COMMAND bash "-c" "${hipcc_executable} --version | grep 'HIP version' | awk -F\" \" '{ printf $3}' | awk -F\"-\" '{ printf $1}'" OUTPUT_VARIABLE hipcc_version_string)
+  message(STATUS "hipcc version: ${hipcc_version_string}")
+  if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201")
+    add_definitions(-DUSE_INDIRECT_FUNCTION_CALL)
+    target_compile_options(rccl PRIVATE -fvisibility=hidden)
+    message(STATUS "Indirect function call enabled")
+  else()
+    target_compile_options(rccl PRIVATE -fvisibility=hidden --hipcc-func-supp)
+  endif()
   foreach(target ${AMDGPU_TARGETS})
     target_compile_options(rccl PRIVATE -fgpu-rdc)
   endforeach()
   target_link_libraries(rccl PRIVATE -fgpu-rdc)
   target_include_directories(rccl PRIVATE ${ROCM_PATH}/include)
-  find_program( hipcc_executable hipcc )
   execute_process(COMMAND bash "-c" "${hipcc_executable} -help | grep 'parallel-jobs'" OUTPUT_VARIABLE hipcc_parallel_jobs)
   if("${hipcc_parallel_jobs}" MATCHES "parallel-jobs")
     target_compile_options(rccl PRIVATE -parallel-jobs=12 PRIVATE -Wno-format-nonliteral)
diff --git a/projects/rccl/src/collectives/device/all_gather.h b/projects/rccl/src/collectives/device/all_gather.h
index 9307cddee5..d86ae73dbc 100644
--- a/projects/rccl/src/collectives/device/all_gather.h
+++ b/projects/rccl/src/collectives/device/all_gather.h
@@ -11,7 +11,11 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
+#ifdef USE_INDIRECT_FUNCTION_CALL
+  __device__ void runRing(ncclWorkElem *args) {
+#else
   __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
+#endif
     const int tid = threadIdx.x;
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
diff --git a/projects/rccl/src/collectives/device/all_reduce.h b/projects/rccl/src/collectives/device/all_reduce.h
index c2c40a9ad2..6eb194444e 100644
--- a/projects/rccl/src/collectives/device/all_reduce.h
+++ b/projects/rccl/src/collectives/device/all_reduce.h
@@ -15,7 +15,11 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
+#ifdef USE_INDIRECT_FUNCTION_CALL
+  __device__ void runRing(ncclWorkElem *args) {
+#else
   __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
+#endif
     const int tid = threadIdx.x;
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
@@ -219,7 +223,11 @@ namespace {
   }
 
   template<typename T, typename RedOp, typename Proto>
+#ifdef USE_INDIRECT_FUNCTION_CALL
+  __device__ void runTreeUpDown(ncclWorkElem *args) {
+#else
   __device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) {
+#endif
     const int tid = threadIdx.x;
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
@@ -371,7 +379,11 @@ namespace {
   }
 
   template<typename T, typename RedOp, typename Proto>
+#ifdef USE_INDIRECT_FUNCTION_CALL
+  __device__ void runTreeSplit(ncclWorkElem *args) {
+#else
   __device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) {
+#endif
     const int tid = threadIdx.x;
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
diff --git a/projects/rccl/src/collectives/device/alltoall_pivot.h b/projects/rccl/src/collectives/device/alltoall_pivot.h
index 0ffa7caee9..93dba6d888 100644
--- a/projects/rccl/src/collectives/device/alltoall_pivot.h
+++ b/projects/rccl/src/collectives/device/alltoall_pivot.h
@@ -10,7 +10,11 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
+#ifdef USE_INDIRECT_FUNCTION_CALL
+  __device__ void runRing(ncclWorkElem *args) {
+#else
   __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
+#endif
     const int tid = threadIdx.x;
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
diff --git a/projects/rccl/src/collectives/device/broadcast.h b/projects/rccl/src/collectives/device/broadcast.h
index 498143b6d9..2e1fc22496 100644
--- a/projects/rccl/src/collectives/device/broadcast.h
+++ b/projects/rccl/src/collectives/device/broadcast.h
@@ -10,7 +10,11 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
+#ifdef USE_INDIRECT_FUNCTION_CALL
+  __device__ void runRing(ncclWorkElem *args) {
+#else
   __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
+#endif
     const int tid = threadIdx.x;
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
diff --git a/projects/rccl/src/collectives/device/common.h b/projects/rccl/src/collectives/device/common.h
index 9f53b305d4..9065def46e 100644
--- a/projects/rccl/src/collectives/device/common.h
+++ b/projects/rccl/src/collectives/device/common.h
@@ -199,6 +199,10 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs_ll128[]{
 #endif
 };
 
+static_assert(FUNC_INDEX_P2P == 3610, "Wrong P2P function index");
+static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 3611, "Wrong AllToAllPivot function index");
+
+#ifndef USE_INDIRECT_FUNCTION_CALL
 template<unsigned short f, unsigned short l, bool u>
 struct Caller {
   static __forceinline__ __device__ __host__
@@ -216,9 +220,6 @@ struct Caller<f, f + 1, u>{
   void call(unsigned short funcIndex) noexcept { if (u) ncclFuncs_ll128[f](); else ncclFuncs[f](); }
 };
 
-static_assert(FUNC_INDEX_P2P == 3610, "Wrong P2P function index");
-static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 3611, "Wrong AllToAllPivot function index");
-
 template<bool USING_LL128>
 __forceinline__
 __device__
@@ -340,11 +341,16 @@ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
   }
 #endif
 }
+#endif
 
 template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL>
 class ncclFunction {
   public:
+#ifdef USE_INDIRECT_FUNCTION_CALL
   __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
+#else
+  __device__ void run(struct ncclWorkElem* args) {}
+#endif
 };
 
 #ifdef ENABLE_COLLTRACE
@@ -663,7 +669,12 @@ __forceinline__ __device__ void ncclKernel(
     if (ncclShmem.work.header.funcIndex == FnIndex) {
       RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
     } else {
+#ifdef USE_INDIRECT_FUNCTION_CALL
+      if (USING_LL128) ncclFuncs_ll128[ncclShmem.work.header.funcIndex]();
+      else ncclFuncs[ncclShmem.work.header.funcIndex]();
+#else
       NCCL_CALL_FUNCTIONS<USING_LL128>(ncclShmem.work.header.funcIndex);
+#endif
     }
 
     int workIxNext = ncclShmem.work.header.workNext;
@@ -714,10 +725,18 @@ __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(st
 
 // Examples :     AllReduce, RING, LL,    Sum,   uint8
 /* Functions for aggregation case */
+
+#ifdef USE_INDIRECT_FUNCTION_CALL
+#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
+__device__  void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
+  RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
+}
+#else
 #define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
 __device__  __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
   RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
 }
+#endif
 
 // Only generate inline kernels for LL
 #define IMPL_COLL4(func, algo, devredop, type, ncclType) \
diff --git a/projects/rccl/src/collectives/device/onerank_reduce.cu b/projects/rccl/src/collectives/device/onerank_reduce.cu
index 256b236386..569c4c7a0f 100644
--- a/projects/rccl/src/collectives/device/onerank_reduce.cu
+++ b/projects/rccl/src/collectives/device/onerank_reduce.cu
@@ -12,7 +12,11 @@
 
 namespace {
   template<typename T, typename RedOp>
+#ifdef USE_INDIRECT_FUNCTION_CALL
+  __device__ void oneRankReduce() {
+#else
   __device__ __attribute__((noinline)) void oneRankReduce() {
+#endif
     ncclWork *w = &ncclShmem.work;
     int tid = threadIdx.x;
     int tn = blockDim.x;
@@ -42,10 +46,17 @@ namespace {
   }
 }
 
+#ifdef USE_INDIRECT_FUNCTION_CALL
 #define INSTANTIATE(devredop, type) \
   __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \
     oneRankReduce<type, Func##devredop<type>>(); \
   }
+#else
+#define INSTANTIATE(devredop, type) \
+  __device__ __attribute__((noinline)) void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \
+    oneRankReduce<type, Func##devredop<type>>(); \
+  }
+#endif
 
 INSTANTIATE(PreMulSum, int8_t)
 INSTANTIATE(PreMulSum, uint8_t)
diff --git a/projects/rccl/src/collectives/device/reduce.h b/projects/rccl/src/collectives/device/reduce.h
index ac40072921..790eca0efa 100644
--- a/projects/rccl/src/collectives/device/reduce.h
+++ b/projects/rccl/src/collectives/device/reduce.h
@@ -11,7 +11,11 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
+#ifdef USE_INDIRECT_FUNCTION_CALL
+  __device__ void runRing(ncclWorkElem *args) {
+#else
   __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
+#endif
     const int tid = threadIdx.x;
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
diff --git a/projects/rccl/src/collectives/device/reduce_scatter.h b/projects/rccl/src/collectives/device/reduce_scatter.h
index 9600a900dc..212444dc28 100644
--- a/projects/rccl/src/collectives/device/reduce_scatter.h
+++ b/projects/rccl/src/collectives/device/reduce_scatter.h
@@ -11,7 +11,11 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
+#ifdef USE_INDIRECT_FUNCTION_CALL
+  __device__ void runRing(ncclWorkElem *args) {
+#else
   __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
+#endif
     const int tid = threadIdx.x;
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
diff --git a/projects/rccl/src/collectives/device/sendrecv.h b/projects/rccl/src/collectives/device/sendrecv.h
index 4131f97959..6006f32fa0 100644
--- a/projects/rccl/src/collectives/device/sendrecv.h
+++ b/projects/rccl/src/collectives/device/sendrecv.h
@@ -175,7 +175,11 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
     }
   }
 
+#ifdef USE_INDIRECT_FUNCTION_CALL
+  __device__  void run(ncclWork *work) {
+#else
   __device__  __attribute__((noinline)) void run(ncclWork *work) {
+#endif
     struct ncclWorkElemP2p* args = work->p2pElems;
     int ngroups = args->ngroups;
     int tid = threadIdx.x;
diff --git a/projects/rccl/src/include/collectives.h b/projects/rccl/src/include/collectives.h
index 24b79fdb5e..fbad473a70 100644
--- a/projects/rccl/src/include/collectives.h
+++ b/projects/rccl/src/include/collectives.h
@@ -45,12 +45,21 @@ struct ncclDevRedOpFull {
   nccl##func##algo##proto
 
 /* Declare all collective operations */
+#ifdef USE_INDIRECT_FUNCTION_CALL
+#define DECL5(func, algo, proto, devredop, type) \
+  extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
+  extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
+  extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
+  extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
+  extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
+#else
 #define DECL5(func, algo, proto, devredop, type) \
   extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
   extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
   extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
   extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
   extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
+#endif
 
 #define SINGLE_ARG(...) __VA_ARGS__
 #define CONCAT(a,b) a##b
diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc
index 68d3d16e94..e0f803f4eb 100644
--- a/projects/rccl/src/init.cc
+++ b/projects/rccl/src/init.cc
@@ -1414,7 +1414,13 @@ fail:
   goto exit;
 }
 
+#ifdef USE_INDIRECT_FUNCTION_CALL
+NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 1);
+RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 8);
+#else
 NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0);
+RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 0);
+#endif
 
 struct ncclCommInitRankAsyncJob {
   struct ncclAsyncJob base;
@@ -1440,14 +1446,17 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   int cudaDev = job->cudaDev;
   int virtualId = job->virtualId;
   ncclResult_t res = ncclSuccess;
+  int64_t stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
 
   CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
   // Set the maximum kernel stack size of all kernels to avoid
   // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
-  if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) {
-    TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes);
-    //CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes));
+#ifdef USE_INDIRECT_FUNCTION_CALL
+  if (stackSize > 0 && ncclParamSetStackSize() == 1) {
+    INFO(NCCL_INIT, "Setting cudaLimitStackSize to %zi maxLocalSizeBytes %zi", stackSize, maxLocalSizeBytes);
+    CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, stackSize));
   }
+#endif
   NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank, virtualId), res, fail);
   NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, fail);