Make ncclFuncs static
This is necessary to constant propagate the function pointers to eliminate the indirect function call.
Esse commit está contido em:
@@ -17,7 +17,74 @@
|
||||
#include <type_traits>
|
||||
|
||||
typedef void(*ncclKern_t)(struct CollectiveArgs* args);
|
||||
extern __device__ const ncclKern_t ncclFuncs[];
|
||||
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_COLL_NAME(coll, op, dtype), \
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype) \
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, u8), \
|
||||
NCCL_FUNC4(coll, op, i32), \
|
||||
NCCL_FUNC4(coll, op, u32), \
|
||||
NCCL_FUNC4(coll, op, i64), \
|
||||
NCCL_FUNC4(coll, op, u64), \
|
||||
NCCL_FUNC4(coll, op, f16), \
|
||||
NCCL_FUNC4(coll, op, f32), \
|
||||
NCCL_FUNC4(coll, op, f64)
|
||||
#define NCCL_FUNCS3B(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8)
|
||||
|
||||
// Must be consistent with ncclRedOp_t
|
||||
#define NCCL_FUNCS2A(coll) \
|
||||
NCCL_FUNCS3A(coll, sum ), \
|
||||
NCCL_FUNCS3A(coll, prod), \
|
||||
NCCL_FUNCS3A(coll, max ), \
|
||||
NCCL_FUNCS3A(coll, min )
|
||||
#define NCCL_FUNCS2B(coll) \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy)
|
||||
|
||||
// Must be consistent with ncclColl_t
|
||||
#define NCCL_FUNCS() { \
|
||||
NCCL_FUNCS2B(ncclBroadcast), \
|
||||
NCCL_FUNCS2A(ncclReduce), \
|
||||
NCCL_FUNCS2B(ncclAllGather), \
|
||||
NCCL_FUNCS2A(ncclReduceScatter), \
|
||||
NCCL_FUNCS2A(ncclAllReduce) }
|
||||
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
using ncclKern_t = void (*)(struct CollectiveArgs*);
|
||||
|
||||
static const __device__ constexpr ncclKern_t ncclFuncs[]{
|
||||
#if defined(__HIP_DEVICE_COMPILE__)
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2A(ncclAllReduce)
|
||||
#endif
|
||||
// Don't try to initialize the host shadow copy of this device-side global
|
||||
// variable. There is no host pointer to a device-side function, which
|
||||
// confuses clang. This will be fixed in the next clang release.
|
||||
#if __CUDA_ARCH__
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2A(ncclAllReduce)
|
||||
#endif
|
||||
};
|
||||
|
||||
template<unsigned short f, unsigned short l>
|
||||
struct Caller {
|
||||
|
||||
@@ -9,73 +9,7 @@
|
||||
#include "collectives.h"
|
||||
#include "common.h"
|
||||
|
||||
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
NCCL_COLL_NAME(coll, op, dtype), \
|
||||
NCCL_COLL_NAME(coll##LL, op, dtype) \
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, u8), \
|
||||
NCCL_FUNC4(coll, op, i32), \
|
||||
NCCL_FUNC4(coll, op, u32), \
|
||||
NCCL_FUNC4(coll, op, i64), \
|
||||
NCCL_FUNC4(coll, op, u64), \
|
||||
NCCL_FUNC4(coll, op, f16), \
|
||||
NCCL_FUNC4(coll, op, f32), \
|
||||
NCCL_FUNC4(coll, op, f64)
|
||||
#define NCCL_FUNCS3B(coll, op) \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8)
|
||||
|
||||
// Must be consistent with ncclRedOp_t
|
||||
#define NCCL_FUNCS2A(coll) \
|
||||
NCCL_FUNCS3A(coll, sum ), \
|
||||
NCCL_FUNCS3A(coll, prod), \
|
||||
NCCL_FUNCS3A(coll, max ), \
|
||||
NCCL_FUNCS3A(coll, min )
|
||||
#define NCCL_FUNCS2B(coll) \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy)
|
||||
|
||||
// Must be consistent with ncclColl_t
|
||||
#define NCCL_FUNCS() { \
|
||||
NCCL_FUNCS2B(ncclBroadcast), \
|
||||
NCCL_FUNCS2A(ncclReduce), \
|
||||
NCCL_FUNCS2B(ncclAllGather), \
|
||||
NCCL_FUNCS2A(ncclReduceScatter), \
|
||||
NCCL_FUNCS2A(ncclAllReduce) }
|
||||
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
using ncclKern_t = void (*)(struct CollectiveArgs*);
|
||||
__device__ constexpr ncclKern_t ncclFuncs[]{
|
||||
#if defined(__HIP_DEVICE_COMPILE__)
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2A(ncclAllReduce)
|
||||
#endif
|
||||
// Don't try to initialize the host shadow copy of this device-side global
|
||||
// variable. There is no host pointer to a device-side function, which
|
||||
// confuses clang. This will be fixed in the next clang release.
|
||||
#if __CUDA_ARCH__
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
NCCL_FUNCS2A(ncclReduceScatter),
|
||||
NCCL_FUNCS2A(ncclAllReduce)
|
||||
#endif
|
||||
};
|
||||
|
||||
// Workaround for https://reviews.llvm.org/D55580
|
||||
__device__ void ncclWorkaroundClangD55580() {}
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário