diff --git a/projects/rccl/src/collectives/device/common_kernel.h b/projects/rccl/src/collectives/device/common_kernel.h index adba68190e..9351c19a76 100644 --- a/projects/rccl/src/collectives/device/common_kernel.h +++ b/projects/rccl/src/collectives/device/common_kernel.h @@ -67,7 +67,7 @@ struct MULTI { int8_t elt[8]; } u; u.pack = x; - #pragma unroll 1 + #pragma unroll for (int i=0; i < 8; i++) u.elt[i] = FuncTraits().preOp(fn, u.elt[i]); return u.pack; @@ -78,7 +78,7 @@ struct MULTI { int8_t elt[8]; } u; u.pack = x; - #pragma unroll 1 + #pragma unroll for (int i=0; i < 8; i++) u.elt[i] = FuncTraits().postOp(fn, u.elt[i]); return u.pack; @@ -113,7 +113,7 @@ struct MULTI { uint8_t elt[8]; } u; u.pack = x; - #pragma unroll 1 + #pragma unroll for (int i=0; i < 8; i++) u.elt[i] = FuncTraits().preOp(fn, u.elt[i]); return u.pack; @@ -124,7 +124,7 @@ struct MULTI { uint8_t elt[8]; } u; u.pack = x; - #pragma unroll 1 + #pragma unroll for (int i=0; i < 8; i++) u.elt[i] = FuncTraits().postOp(fn, u.elt[i]); return u.pack; @@ -578,13 +578,13 @@ __device__ __forceinline__ void ReduceCopy128bMulti(const int w, const int nw, c for (int u = 0; u < UNROLL; ++u) MULTI128().preOp(fn, vals[u]); } - #pragma unroll 1 + #pragma unroll for (int i=1; i()(fn, vals[u], vals2[u]); } - #pragma unroll 1 + #pragma unroll for (int i=MINSRCS; i().postOp(fn, vals[u]); } // Store - #pragma unroll 1 + #pragma unroll for (int i = 0; i < MINDSTS; i++) { for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]); } - #pragma unroll 1 + #pragma unroll for (int i=MINDSTS; i + ReduceCopyMulti (w, nw, t, fn, preOpSrc0, postOp, nsrcs, srcs, ndsts, dsts, offset, Nelem); Nrem -= Nelem;