Tuning the inline and unroll to reduce the scratch usage

Summary: 1. remove the noinline attribute from ncclAllReduceTreeKernel; 2. change AUTOUNROLL for the tree functions to 1 or 2. Combining 1 and 2 reduces the scratch usage from 1256 to 952 [ROCm/rccl commit: eec319038e]
2019-10-08 09:24:49 -07:00
parent de13a48f7b
commit d8a06589c9
1 changed files with 3 additions and 4 deletions
@@ -102,8 +102,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
 #endif
 }

-template<int UNROLL, class FUNC, typename T>
-__attribute__((noinline))
+template<int UNUSED, class FUNC, typename T>
 __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
  const int tid = threadIdx.x;
  const int nthreads = blockDim.x;
@@ -122,7 +121,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {

  do {
    // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
-    ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+    ncclPrimitives<1, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
      // Up
      ssize_t offset = gridOffset + bid*chunkSize;
@@ -139,7 +138,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {

  do {
    // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
-    ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+    ncclPrimitives<1, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
      // Down
      ssize_t offset = gridOffset + bid*chunkSize;