From eaf54184bfb05cb587334a5ab51eaa23afee1087 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Thu, 26 Aug 2021 18:06:09 -0700 Subject: [PATCH] Improve clique kernel performance by increasing unroll [ROCm/rccl commit: a4929465c5fc7d5ab3f6a6067d9f367d172b5b6e] --- .../src/collectives/device/common_kernel.h | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/projects/rccl/src/collectives/device/common_kernel.h b/projects/rccl/src/collectives/device/common_kernel.h index adba68190e..9351c19a76 100644 --- a/projects/rccl/src/collectives/device/common_kernel.h +++ b/projects/rccl/src/collectives/device/common_kernel.h @@ -67,7 +67,7 @@ struct MULTI { int8_t elt[8]; } u; u.pack = x; - #pragma unroll 1 + #pragma unroll for (int i=0; i < 8; i++) u.elt[i] = FuncTraits().preOp(fn, u.elt[i]); return u.pack; @@ -78,7 +78,7 @@ struct MULTI { int8_t elt[8]; } u; u.pack = x; - #pragma unroll 1 + #pragma unroll for (int i=0; i < 8; i++) u.elt[i] = FuncTraits().postOp(fn, u.elt[i]); return u.pack; @@ -113,7 +113,7 @@ struct MULTI { uint8_t elt[8]; } u; u.pack = x; - #pragma unroll 1 + #pragma unroll for (int i=0; i < 8; i++) u.elt[i] = FuncTraits().preOp(fn, u.elt[i]); return u.pack; @@ -124,7 +124,7 @@ struct MULTI { uint8_t elt[8]; } u; u.pack = x; - #pragma unroll 1 + #pragma unroll for (int i=0; i < 8; i++) u.elt[i] = FuncTraits().postOp(fn, u.elt[i]); return u.pack; @@ -578,13 +578,13 @@ __device__ __forceinline__ void ReduceCopy128bMulti(const int w, const int nw, c for (int u = 0; u < UNROLL; ++u) MULTI128().preOp(fn, vals[u]); } - #pragma unroll 1 + #pragma unroll for (int i=1; i()(fn, vals[u], vals2[u]); } - #pragma unroll 1 + #pragma unroll for (int i=MINSRCS; i().postOp(fn, vals[u]); } // Store - #pragma unroll 1 + #pragma unroll for (int i = 0; i < MINDSTS; i++) { for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]); } - #pragma unroll 1 + #pragma unroll for (int i=MINDSTS; i + ReduceCopyMulti (w, nw, t, fn, preOpSrc0, postOp, nsrcs, srcs, ndsts, dsts, offset, Nelem); Nrem -= Nelem;