From 5827a4f616fa3c856aa04d184576e625da8eb11f Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Tue, 28 May 2019 21:48:00 -0400 Subject: [PATCH] Make ncclFuncs static This is necessary to constant propagate the function pointers to eliminate the indirect function call. --- src/collectives/device/common.h | 69 ++++++++++++++++++++++++++++- src/collectives/device/functions.cu | 66 --------------------------- 2 files changed, 68 insertions(+), 67 deletions(-) diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index 9048011aa3..819f3a12ab 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -17,7 +17,74 @@ #include typedef void(*ncclKern_t)(struct CollectiveArgs* args); -extern __device__ const ncclKern_t ncclFuncs[]; +#define NCCL_FUNC4(coll, op, dtype) \ + NCCL_COLL_NAME(coll, op, dtype), \ + NCCL_COLL_NAME(coll##LL, op, dtype) \ + +// Must be consistent with ncclDataType_t +#define NCCL_FUNCS3A(coll, op) \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, u8), \ + NCCL_FUNC4(coll, op, i32), \ + NCCL_FUNC4(coll, op, u32), \ + NCCL_FUNC4(coll, op, i64), \ + NCCL_FUNC4(coll, op, u64), \ + NCCL_FUNC4(coll, op, f16), \ + NCCL_FUNC4(coll, op, f32), \ + NCCL_FUNC4(coll, op, f64) +#define NCCL_FUNCS3B(coll, op) \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8), \ + NCCL_FUNC4(coll, op, i8) + +// Must be consistent with ncclRedOp_t +#define NCCL_FUNCS2A(coll) \ + NCCL_FUNCS3A(coll, sum ), \ + NCCL_FUNCS3A(coll, prod), \ + NCCL_FUNCS3A(coll, max ), \ + NCCL_FUNCS3A(coll, min ) +#define NCCL_FUNCS2B(coll) \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy) + +// Must be consistent with ncclColl_t +#define NCCL_FUNCS() { \ + NCCL_FUNCS2B(ncclBroadcast), \ + NCCL_FUNCS2A(ncclReduce), \ + NCCL_FUNCS2B(ncclAllGather), \ + NCCL_FUNCS2A(ncclReduceScatter), \ + NCCL_FUNCS2A(ncclAllReduce) } + +// Must be consistent with the ncclFuncSet enum +using ncclKern_t = void (*)(struct CollectiveArgs*); + +static const __device__ constexpr ncclKern_t ncclFuncs[]{ +#if defined(__HIP_DEVICE_COMPILE__) + NCCL_FUNCS2B(ncclBroadcast), + NCCL_FUNCS2A(ncclReduce), + NCCL_FUNCS2B(ncclAllGather), + NCCL_FUNCS2A(ncclReduceScatter), + NCCL_FUNCS2A(ncclAllReduce) +#endif +// Don't try to initialize the host shadow copy of this device-side global +// variable. There is no host pointer to a device-side function, which +// confuses clang. This will be fixed in the next clang release. +#if __CUDA_ARCH__ + NCCL_FUNCS2B(ncclBroadcast), + NCCL_FUNCS2A(ncclReduce), + NCCL_FUNCS2B(ncclAllGather), + NCCL_FUNCS2A(ncclReduceScatter), + NCCL_FUNCS2A(ncclAllReduce) +#endif +}; template struct Caller { diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index 9226c576d4..bc7c175fc5 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -9,73 +9,7 @@ #include "collectives.h" #include "common.h" -#define NCCL_FUNC4(coll, op, dtype) \ - NCCL_COLL_NAME(coll, op, dtype), \ - NCCL_COLL_NAME(coll##LL, op, dtype) \ -// Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(coll, op) \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, u8), \ - NCCL_FUNC4(coll, op, i32), \ - NCCL_FUNC4(coll, op, u32), \ - NCCL_FUNC4(coll, op, i64), \ - NCCL_FUNC4(coll, op, u64), \ - NCCL_FUNC4(coll, op, f16), \ - NCCL_FUNC4(coll, op, f32), \ - NCCL_FUNC4(coll, op, f64) -#define NCCL_FUNCS3B(coll, op) \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8), \ - NCCL_FUNC4(coll, op, i8) - -// Must be consistent with ncclRedOp_t -#define NCCL_FUNCS2A(coll) \ - NCCL_FUNCS3A(coll, sum ), \ - NCCL_FUNCS3A(coll, prod), \ - NCCL_FUNCS3A(coll, max ), \ - NCCL_FUNCS3A(coll, min ) -#define NCCL_FUNCS2B(coll) \ - NCCL_FUNCS3B(coll, copy), \ - NCCL_FUNCS3B(coll, copy), \ - NCCL_FUNCS3B(coll, copy), \ - NCCL_FUNCS3B(coll, copy) - -// Must be consistent with ncclColl_t -#define NCCL_FUNCS() { \ - NCCL_FUNCS2B(ncclBroadcast), \ - NCCL_FUNCS2A(ncclReduce), \ - NCCL_FUNCS2B(ncclAllGather), \ - NCCL_FUNCS2A(ncclReduceScatter), \ - NCCL_FUNCS2A(ncclAllReduce) } - -// Must be consistent with the ncclFuncSet enum -using ncclKern_t = void (*)(struct CollectiveArgs*); -__device__ constexpr ncclKern_t ncclFuncs[]{ -#if defined(__HIP_DEVICE_COMPILE__) - NCCL_FUNCS2B(ncclBroadcast), - NCCL_FUNCS2A(ncclReduce), - NCCL_FUNCS2B(ncclAllGather), - NCCL_FUNCS2A(ncclReduceScatter), - NCCL_FUNCS2A(ncclAllReduce) -#endif -// Don't try to initialize the host shadow copy of this device-side global -// variable. There is no host pointer to a device-side function, which -// confuses clang. This will be fixed in the next clang release. -#if __CUDA_ARCH__ - NCCL_FUNCS2B(ncclBroadcast), - NCCL_FUNCS2A(ncclReduce), - NCCL_FUNCS2B(ncclAllGather), - NCCL_FUNCS2A(ncclReduceScatter), - NCCL_FUNCS2A(ncclAllReduce) -#endif -}; // Workaround for https://reviews.llvm.org/D55580 __device__ void ncclWorkaroundClangD55580() {}