From 501a149d575fa62b7d8bea5b2bd20304dba55025 Mon Sep 17 00:00:00 2001 From: David Addison Date: Fri, 18 Apr 2025 19:20:59 -0700 Subject: [PATCH 1/9] Add support for FP8 datatypes Added new datatypes: f8e4m3, f8e5m2 Only supported on H100+ architectures and NCCL versions >= 2.24.0 --- src/common.cu | 71 +++++- src/common.h | 20 +- verifiable/verifiable.cu | 473 +++++++++++++++++++++++++++------------ verifiable/verifiable.h | 7 +- 4 files changed, 415 insertions(+), 156 deletions(-) diff --git a/src/common.cu b/src/common.cu index 0d4dfc1944..f83cdf009a 100644 --- a/src/common.cu +++ b/src/common.cu @@ -21,15 +21,21 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = { ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #if HAVE_BF16 , ncclBfloat16 #endif + #if HAVE_FP8 + , ncclFloat8e4m3, ncclFloat8e5m2 + #endif }; const char *test_typenames[ncclNumTypes] = { "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" - #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #if HAVE_BF16 , "bfloat16" #endif + #if HAVE_FP8 + , "f8e4m3", "f8e5m2" + #endif }; int test_typenum = -1; @@ -86,6 +92,7 @@ static int average = 1; #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) static int local_register = 0; #endif +static int minCudaArch = 1<<30; #define NUM_BLOCKS 32 @@ -126,18 +133,18 @@ static double parsesize(const char *value) { } testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) { - ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault); + CUDACHECK(ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault)); CUDACHECK(cudaDeviceSynchronize()); return testSuccess; } testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) { - ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault); + CUDACHECK(ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault)); return testSuccess; } testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) { - ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault); + CUDACHECK(ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault)); return testSuccess; } @@ -358,9 +365,12 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t union { int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; half f16; float f32; double f64; - #if defined(__CUDA_BF16_TYPES_EXIST__) + #if HAVE_BF16 __nv_bfloat16 bf16; #endif + #if HAVE_FP8 + __nv_fp8_e4m3 f8e4m3; __nv_fp8_e5m2 f8e5m2; + #endif }; switch(type) { case ncclInt8: i8 = ncclVerifiablePremulScalar(rank); break; @@ -372,9 +382,14 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t case ncclFloat16: f16 = ncclVerifiablePremulScalar(rank); break; case ncclFloat32: f32 = ncclVerifiablePremulScalar(rank); break; case ncclFloat64: f64 = ncclVerifiablePremulScalar(rank); break; - #if defined(__CUDA_BF16_TYPES_EXIST__) + #if HAVE_BF16 case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break; #endif + #if HAVE_FP8 + case ncclFloat8e4m3: f8e4m3 = ncclVerifiablePremulScalar<__nv_fp8_e4m3>(rank); break; + case ncclFloat8e5m2: f8e5m2 = ncclVerifiablePremulScalar<__nv_fp8_e5m2>(rank); break; + #endif + default: break; // Just to silence clang } NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); } @@ -702,13 +717,20 @@ int main(int argc, char* argv[]) { test_typenum = 9; if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { test_opnum++; // ncclAvg - #if defined(__CUDA_BF16_TYPES_EXIST__) - test_typenum++; // bfloat16 - #endif } if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { test_opnum++; // PreMulSum } + #if defined(__CUDA_BF16_TYPES_EXIST__) + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_typenum++; // bfloat16 + } + #endif + #if defined(__CUDA_FP8_TYPES_EXIST__) + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && test_ncclVersion >= NCCL_VERSION(2,24,0)) { + test_typenum += 2; // fp8 e4m3,e5m2 + } + #endif #endif // Parse args @@ -1033,12 +1055,37 @@ testResult_t run() { gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i; CUDACHECK(cudaSetDevice(gpus[i])); TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes)); - if (streamnull) + if (streamnull) { streams[i] = NULL; - else + } + else { CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking)); + } + int archMajor, archMinor; + CUDACHECK(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, gpus[i])); + CUDACHECK(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, gpus[i])); + minCudaArch = std::min(minCudaArch, 100*archMajor + 10*archMinor); } +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &minCudaArch, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); +#endif +#if defined(__CUDA_FP8_TYPES_EXIST__) + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && test_ncclVersion >= NCCL_VERSION(2,24,0)) { + if (minCudaArch < 900) { // Filter out fp8 on pre-Hopper hardware + int n = 0; + for (int i=0; i < test_typenum; i++) { + if (!(test_types[i] == ncclFloat8e4m3 || test_types[i] == ncclFloat8e5m2)) { + test_types[n] = test_types[i]; + test_typenames[n] = test_typenames[i]; + n += 1; + } + } + test_typenum = n; + } + } +#endif + //if parallel init is not selected, use main thread to initialize NCCL ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus); #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) diff --git a/src/common.h b/src/common.h index 478d7fb1c0..ff834f699d 100644 --- a/src/common.h +++ b/src/common.h @@ -213,16 +213,34 @@ static uint64_t getHostHash(const char* hostname) { return getHash(hostHash, strlen(hostHash)); } +#define HAVE_BF16 0 +#define HAVE_FP8 0 + +#if NCCL_MAJOR >= 2 + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #undef HAVE_BF16 + #define HAVE_BF16 1 + #if defined(__CUDA_FP8_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) + #undef HAVE_FP8 + #define HAVE_FP8 1 + #endif + #endif +#endif + static size_t wordSize(ncclDataType_t type) { switch(type) { case ncclChar: #if NCCL_MAJOR >= 2 //case ncclInt8: case ncclUint8: +#endif +#if HAVE_FP8 + case ncclFloat8e4m3: + case ncclFloat8e5m2: #endif return 1; case ncclHalf: -#if defined(__CUDA_BF16_TYPES_EXIST__) +#if HAVE_BF16 case ncclBfloat16: #endif //case ncclFloat16: diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu index 5f617ee188..dcd6e6c939 100644 --- a/verifiable/verifiable.cu +++ b/verifiable/verifiable.cu @@ -8,6 +8,15 @@ #if CUDART_VERSION >= 11000 #include #endif +#if CUDART_VERSION >= 11080 +#include +#endif + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && defined(__CUDA_FP8_TYPES_EXIST__) + #define HAVE_ncclFloat8 1 +#else + #define HAVE_ncclFloat8 0 +#endif #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && defined(__CUDA_BF16_TYPES_EXIST__) #define HAVE_ncclBfloat16 1 @@ -84,10 +93,16 @@ template struct IsIntegral: std::is_integral {}; template<> struct IsIntegral: std::false_type {}; -#ifdef __CUDA_BF16_TYPES_EXIST__ +#if HAVE_ncclBfloat16 template<> struct IsIntegral<__nv_bfloat16>: std::false_type {}; #endif +#if HAVE_ncclFloat8 +template<> +struct IsIntegral<__nv_fp8_e4m3>: std::false_type {}; +template<> +struct IsIntegral<__nv_fp8_e5m2>: std::false_type {}; +#endif } //////////////////////////////////////////////////////////////////////////////// @@ -107,23 +122,72 @@ __host__ __device__ T inhibit(T x) { //////////////////////////////////////////////////////////////////////////////// namespace { - template - __host__ __device__ Y castTo(X x) { + template + __host__ __device__ Y castTo(uint64_t x) { return Y(x); } template __host__ __device__ Y castTo(float x) { return Y(x); } + template + __host__ __device__ Y castTo(double x) { + return Y(x); + } + template<> __host__ __device__ half castTo(float x) { return __float2half(x); } - #ifdef __CUDA_BF16_TYPES_EXIST__ + template<> + __host__ __device__ half castTo(double x) { + return __double2half(x); + } + template<> + __host__ __device__ half castTo(uint64_t x) { + return __ull2half_rn(x); + } + + #if HAVE_ncclBfloat16 template<> __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) { return __float2bfloat16(x); } + template<> + __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(double x) { + return __double2bfloat16(x); + } + template<> + __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(uint64_t x) { + return __double2bfloat16((double)x); + } + #endif + + #if HAVE_ncclFloat8 + template<> + __host__ __device__ __nv_fp8_e4m3 castTo<__nv_fp8_e4m3>(float x) { + return __nv_fp8_e4m3(x); + } + template<> + __host__ __device__ __nv_fp8_e4m3 castTo<__nv_fp8_e4m3>(double x) { + return __nv_fp8_e4m3(x); + } + template<> + __host__ __device__ __nv_fp8_e4m3 castTo<__nv_fp8_e4m3>(uint64_t x) { + return __nv_fp8_e4m3((double)x); + } + template<> + __host__ __device__ __nv_fp8_e5m2 castTo<__nv_fp8_e5m2>(float x) { + return __nv_fp8_e5m2(x); + } + template<> + __host__ __device__ __nv_fp8_e5m2 castTo<__nv_fp8_e5m2>(double x) { + return __nv_fp8_e5m2(x); + } + template<> + __host__ __device__ __nv_fp8_e5m2 castTo<__nv_fp8_e5m2>(uint64_t x) { + return __nv_fp8_e5m2((double)x); + } #endif } @@ -151,7 +215,7 @@ struct ReduceSum { return __float2half(__half2float(a) + __half2float(b)); #endif } - #ifdef __CUDA_BF16_TYPES_EXIST__ + #if HAVE_ncclBfloat16 __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { #if __CUDA_ARCH__ >= 800 return __hadd(a, b); @@ -160,6 +224,22 @@ struct ReduceSum { #endif } #endif + #if HAVE_ncclFloat8 + __host__ __device__ __nv_fp8_e4m3 operator()(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) const { + #if __CUDA_ARCH__ >= 800 + return __nv_fp8_e4m3(__hadd(__half(a), __half(b))); + #else + return __nv_fp8_e4m3(float(a) + float(b)); + #endif + } + __host__ __device__ __nv_fp8_e5m2 operator()(__nv_fp8_e5m2 a, __nv_fp8_e5m2 b) const { + #if __CUDA_ARCH__ >= 800 + return __nv_fp8_e5m2(__hadd(__half(a), __half(b))); + #else + return __nv_fp8_e5m2(float(a) + float(b)); + #endif + } + #endif template __host__ __device__ T postOp(T x) const { return x; } }; @@ -175,7 +255,7 @@ struct ReduceProd { return __float2half(__half2float(a) * __half2float(b)); #endif } - #ifdef __CUDA_BF16_TYPES_EXIST__ + #if HAVE_ncclBfloat16 __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { #if __CUDA_ARCH__ >= 800 return __hmul(a, b); @@ -184,6 +264,22 @@ struct ReduceProd { #endif } #endif + #if HAVE_ncclFloat8 + __host__ __device__ __nv_fp8_e4m3 operator()(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) const { + #if __CUDA_ARCH__ >= 800 + return __nv_fp8_e4m3(__hmul(__half(a), __half(b))); + #else + return __nv_fp8_e4m3(float(a) * float(b)); + #endif + } + __host__ __device__ __nv_fp8_e5m2 operator()(__nv_fp8_e5m2 a, __nv_fp8_e5m2 b) const { + #if __CUDA_ARCH__ >= 800 + return __nv_fp8_e5m2(__hmul(__half(a), __half(b))); + #else + return __nv_fp8_e5m2(float(a) * float(b)); + #endif + } + #endif template __host__ __device__ T postOp(T x) const { return x; } }; @@ -201,7 +297,7 @@ struct ReduceMin { return __half2float(a) < __half2float(b) ? a : b; #endif } - #ifdef __CUDA_BF16_TYPES_EXIST__ + #if HAVE_ncclBfloat16 __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { #if __CUDA_ARCH__ >= 800 return __hmin(a, b); @@ -212,6 +308,22 @@ struct ReduceMin { #endif } #endif + #if HAVE_ncclFloat8 + __host__ __device__ __nv_fp8_e4m3 operator()(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) const { + #if __CUDA_ARCH__ >= 800 + return __nv_fp8_e4m3(__hmin(__half(a), __half(b))); + #else + return __nv_fp8_e4m3(float(a) < float(b) ? a : b); + #endif + } + __host__ __device__ __nv_fp8_e5m2 operator()(__nv_fp8_e5m2 a, __nv_fp8_e5m2 b) const { + #if __CUDA_ARCH__ >= 800 + return __nv_fp8_e5m2(__hmin(__half(a), __half(b))); + #else + return __nv_fp8_e5m2(float(a) < float(b) ? a : b); + #endif + } + #endif template __host__ __device__ T postOp(T x) const { return x; } }; @@ -229,7 +341,7 @@ struct ReduceMax { return __half2float(a) > __half2float(b) ? a : b; #endif } - #ifdef __CUDA_BF16_TYPES_EXIST__ + #if HAVE_ncclBfloat16 __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { #if __CUDA_ARCH__ >= 800 return __hmax(a, b); @@ -240,6 +352,22 @@ struct ReduceMax { #endif } #endif + #if HAVE_ncclFloat8 + __host__ __device__ __nv_fp8_e4m3 operator()(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) const { + #if __CUDA_ARCH__ >= 800 + return __nv_fp8_e4m3(__hmax(__half(a), __half(b))); + #else + return __nv_fp8_e4m3(float(a) > float(b) ? a : b); + #endif + } + __host__ __device__ __nv_fp8_e5m2 operator()(__nv_fp8_e5m2 a, __nv_fp8_e5m2 b) const { + #if __CUDA_ARCH__ >= 800 + return __nv_fp8_e5m2(__hmax(__half(a), __half(b))); + #else + return __nv_fp8_e5m2(float(a) > float(b) ? a : b); + #endif + } + #endif template __host__ __device__ T postOp(T x) const { return x; } }; @@ -297,29 +425,47 @@ struct ReduceAvg { namespace { template -struct FloatLayout; +struct FloatLayout { static constexpr bool is_floating_point = false; }; template<> struct FloatLayout { + static constexpr bool is_floating_point = true; static constexpr int exponent_bits = 8, mantissa_bits = 23; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; template<> struct FloatLayout { + static constexpr bool is_floating_point = true; static constexpr int exponent_bits = 11, mantissa_bits = 52; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; template<> struct FloatLayout { + static constexpr bool is_floating_point = true; static constexpr int exponent_bits = 5, mantissa_bits = 10; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; -#ifdef __CUDA_BF16_TYPES_EXIST__ +#if HAVE_ncclBfloat16 template<> struct FloatLayout<__nv_bfloat16> { + static constexpr bool is_floating_point = true; static constexpr int exponent_bits = 8, mantissa_bits = 7; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; #endif +#if HAVE_ncclFloat8 +template<> +struct FloatLayout<__nv_fp8_e4m3> { + static constexpr bool is_floating_point = true; + static constexpr int exponent_bits = 4, mantissa_bits = 3; + static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; +}; +template<> +struct FloatLayout<__nv_fp8_e5m2> { + static constexpr bool is_floating_point = true; + static constexpr int exponent_bits = 5, mantissa_bits = 2; + static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; +}; +#endif template __host__ __device__ T makeFloat(int sign, int exp, uint64_t mant) { @@ -632,11 +778,12 @@ __host__ __device__ void genOutput( //////////////////////////////////////////////////////////////////////////////// // Nil reduction (byte copy functions). Optimized to assume rank_n=1 +// genInput specialization for integer ReduceNil. namespace { -template +template __host__ __device__ void genInput( T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index, - std::integral_constant + std::true_type /*integral*/ ) { (void)rank_n, (void)rank_me; // silence unused warnings union { uint64_t bits; T tmp; }; @@ -646,6 +793,24 @@ __host__ __device__ void genInput( ans = tmp; } +// genInput specialization for floating point ReduceNil. +template +__host__ __device__ void genInput( + T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index, + std::false_type /*integral*/ + ) { + (void)rank_n; // silence unused warnings + constexpr uint64_t mant_mask = (uint64_t(1) << FloatLayout::mantissa_bits)-1; + uint64_t rng = hashOf(index ^ index<<16 ^ rank_me, seed); + int sign = rng & 1; + rng ^= rng>>1; + int exp = rng & ((1<<(FloatLayout::exponent_bits-1))-1); + exp += 1<<(FloatLayout::exponent_bits-2); + rng ^= rng >> FloatLayout::exponent_bits; + uint64_t mant = rng & mant_mask; + ans = makeFloat(sign, exp, mant); +} + template __host__ __device__ void genOutput( T &ans, ReduceNil op, int rank_n, uint64_t seed, intptr_t index, @@ -734,20 +899,34 @@ __host__ __device__ void genOutput( namespace { template __host__ __device__ void genInput( - T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t seed, intptr_t index, + T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t rng, intptr_t index, std::false_type /*integral*/ ) { - ans = genInOutFloatSum(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/true); + // We can't control the nranks divisor in avareages so to control error we + // limit to two ranks contributing non-zero values. This way there is no ambiguity + // of summation. + int r = shuffleRank(rank_n, rank_me, rng); + uint64_t m = (rng*(r ? 0xbeef : 1)) & ((1ul<::mantissa_bits)-1); + ans = r < 2 ? castTo(1+m) : castTo((uint64_t)0); } template __host__ __device__ void genOutput( - T &ans, ReduceAvg, int rank_n, uint64_t seed, intptr_t index, + T &ans, ReduceAvg, int rank_n, uint64_t rng, intptr_t index, std::false_type /*integral*/ ) { - ans = genInOutFloatSum(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true); - using T1 = typename std::conditional<(sizeof(T)::type; - ans = ReduceProd()(ans, T1(1)/T1(rank_n)); + shuffleRank(rank_n, -1, rng); + uint64_t m0 = (rng*(0 ? 0xbeef : 1)) & ((1ul<::mantissa_bits)-1); + uint64_t m1 = (rng*(1 ? 0xbeef : 1)) & ((1ul<::mantissa_bits)-1); + if (rank_n == 1) { + ans = castTo(1+m0); + } else { + // NCCL varies which datatype it does the muls with depending on __CUDA_ARCH__. + // We account for this by using a tolerance of 2 ulps during the verification. + using TMul = typename std::conditional<(sizeof(T) < sizeof(double)), float, double>::type; + ans = ReduceSum()((T)(TMul(1+m0)*TMul(1.0/rank_n)), + (T)(TMul(1+m1)*TMul(1.0/rank_n))); + } } } @@ -809,10 +988,9 @@ __host__ __device__ T genOutput( //////////////////////////////////////////////////////////////////////////////// -#if !SELF_TEST namespace { template -__global__ void prepareInput2( +__global__ void __launch_bounds__(512, 1) prepareInput2( T *elts, intptr_t elt_n, ReduceFn op, int rank_n, int rank_me, uint64_t seed, intptr_t elt_ix0 ) { @@ -833,40 +1011,49 @@ __global__ void prepareInput2( } template -void prepareInput1( +cudaError_t prepareInput1( void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ) { - int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); - #define CASE_TY(T) prepareInput2<<>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break; + void const *fn = nullptr; switch(elt_ty) { - case ncclInt8: CASE_TY(int8_t) - case ncclUint8: CASE_TY(uint8_t) - case ncclInt32: CASE_TY(int32_t) - case ncclUint32: CASE_TY(uint32_t) - case ncclInt64: CASE_TY(int64_t) - case ncclUint64: CASE_TY(uint64_t) - case ncclFloat16: CASE_TY(half) + case ncclInt8: fn = (void const*)&prepareInput2; break; + case ncclUint8: fn = (void const*)&prepareInput2; break; + case ncclInt32: fn = (void const*)&prepareInput2; break; + case ncclUint32: fn = (void const*)&prepareInput2; break; + case ncclInt64: fn = (void const*)&prepareInput2; break; + case ncclUint64: fn = (void const*)&prepareInput2; break; + case ncclFloat16: fn = (void const*)&prepareInput2; break; #if HAVE_ncclBfloat16 - case ncclBfloat16: CASE_TY(__nv_bfloat16) + case ncclBfloat16: fn = (void const*)&prepareInput2<__nv_bfloat16, ReduceOp>; break; #endif - case ncclFloat32: CASE_TY(float) - case ncclFloat64: CASE_TY(double) - default: assert(0); + #if HAVE_ncclFloat8 + case ncclFloat8e4m3: fn = (void const*)&prepareInput2<__nv_fp8_e4m3, ReduceOp>; break; + case ncclFloat8e5m2: fn = (void const*)&prepareInput2<__nv_fp8_e5m2, ReduceOp>; break; + #endif + case ncclFloat32: fn = (void const*)&prepareInput2; break; + case ncclFloat64: fn = (void const*)&prepareInput2; break; + default: assert(0); return cudaErrorInvalidValue; } #undef CASE_TY + dim3 grid = {1, 1, 1}; + grid.x = (unsigned int)std::min(32, (elt_n + 4*512-1)/(4*512)); + dim3 block = {512, 1, 1}; + void *args[7] = {&elts, &elt_n, &op, &rank_n, &rank_me, &seed, &elt_ix0}; + if (grid.x == 0) return cudaSuccess; + return cudaLaunchKernel(fn, grid, block, args, 0, stream); } } -void ncclVerifiablePrepareInput( +cudaError_t ncclVerifiablePrepareInput( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ) { #define CASE_OP(op) \ if(rank_n == 1) \ - prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \ + return prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \ else \ - prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \ + return prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \ break; switch(red_op) { case ncclSum: CASE_OP(ReduceSum()) @@ -882,14 +1069,12 @@ void ncclVerifiablePrepareInput( } #undef CASE_OP } -#endif //////////////////////////////////////////////////////////////////////////////// -#if !SELF_TEST namespace { template -__global__ void prepareExpected2( +__global__ void __launch_bounds__(512, 1) prepareExpected2( T *elts, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed, intptr_t elt_ix0 ) { @@ -909,40 +1094,49 @@ __global__ void prepareExpected2( } template -void prepareExpected1( +cudaError_t prepareExpected1( void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ) { - int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); - #define CASE_TY(T) prepareExpected2<<>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break; + void const *fn = nullptr; switch(elt_ty) { - case ncclInt8: CASE_TY(int8_t) - case ncclUint8: CASE_TY(uint8_t) - case ncclInt32: CASE_TY(int32_t) - case ncclUint32: CASE_TY(uint32_t) - case ncclInt64: CASE_TY(int64_t) - case ncclUint64: CASE_TY(uint64_t) - case ncclFloat16: CASE_TY(half) + case ncclInt8: fn = (void const*)&prepareExpected2; break; + case ncclUint8: fn = (void const*)&prepareExpected2; break; + case ncclInt32: fn = (void const*)&prepareExpected2; break; + case ncclUint32: fn = (void const*)&prepareExpected2; break; + case ncclInt64: fn = (void const*)&prepareExpected2; break; + case ncclUint64: fn = (void const*)&prepareExpected2; break; + case ncclFloat16: fn = (void const*)&prepareExpected2; break; #if HAVE_ncclBfloat16 - case ncclBfloat16: CASE_TY(__nv_bfloat16) + case ncclBfloat16: fn = (void const*)&prepareExpected2<__nv_bfloat16, ReduceOp>; break; #endif - case ncclFloat32: CASE_TY(float) - case ncclFloat64: CASE_TY(double) - default: assert(0); + #if HAVE_ncclFloat8 + case ncclFloat8e4m3: fn = (void const*)&prepareExpected2<__nv_fp8_e4m3, ReduceOp>; break; + case ncclFloat8e5m2: fn = (void const*)&prepareExpected2<__nv_fp8_e5m2, ReduceOp>; break; + #endif + case ncclFloat32: fn = (void const*)&prepareExpected2; break; + case ncclFloat64: fn = (void const*)&prepareExpected2; break; + default: assert(0); return cudaErrorInvalidValue; } #undef CASE_TY + dim3 grid = {1, 1, 1}; + grid.x = (unsigned int)std::min(32, (elt_n + 4*512-1)/(4*512)); + dim3 block = {512, 1, 1}; + void *args[6] = {&elts, &elt_n, &op, &rank_n, &seed, &elt_ix0}; + if (grid.x == 0) return cudaSuccess; + return cudaLaunchKernel(fn, grid, block, args, 0, stream); } } -void ncclVerifiablePrepareExpected( +cudaError_t ncclVerifiablePrepareExpected( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ) { #define CASE_OP(op) \ if(rank_n == 1) \ - prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \ + return prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \ else \ - prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \ + return prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \ break; switch(red_op) { case ncclSum: CASE_OP(ReduceSum()) @@ -958,52 +1152,10 @@ void ncclVerifiablePrepareExpected( } #undef CASE_OP } -#endif //////////////////////////////////////////////////////////////////////////////// namespace { -/* How we compare floating point values when exactness is impossible is interesting. - * First, we take note that simply reinterpreting integer bits as floating point - * gives us a monotonic mapping which exponentially spaces out floats. Thus - * consecutive integers encode consecutive floats. In general, using integer - * subraction on the bitpatterns of two floats gives us an integer which is the - * logarithm of their relative difference. But, if the floats always have similar - * exponents, than the integer difference is actually proportional to the - * relative error (this is because we are counting hops in the mantissa bits only, - * not the exponent bits). So a cheap way to compare if two floats are relatively - * close is: abs(intBits(a), intBits(b)) < tolerance. The following formula - * calculates such a tolerance for a summation of n floats. This formula - * was derived by inspecting the maximum observed integer difference over many - * random runs of summation. The parameter values were computed by the - * companion program "inexact_regress.cu". - */ -__host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) { - float power, coef; - switch(elt_ty) { - case ncclFloat32: - case ncclFloat64: - power = .51f; - coef = 1.25f; - break; - case ncclFloat16: - power = .91f; - coef = .75f; - break; - #if HAVE_ncclBfloat16 - case ncclBfloat16: - power = .91f; - coef = .66f; - break; - #endif - } - #if __CUDA_ARCH__ - return 1 + unsigned(coef*powf(float(rank_n), power)); - #else - return 1 + unsigned(coef*std::pow(float(rank_n), power)); - #endif -} - template __host__ __device__ uint64_t calcDelta(T a, T b) { union { T t; uint8_t i1; uint16_t i2; uint32_t i4; uint64_t i8; } x, y; @@ -1020,10 +1172,9 @@ __host__ __device__ uint64_t calcDelta(T a, T b) { //////////////////////////////////////////////////////////////////////////////// -#if !SELF_TEST namespace { template -__global__ void verifyPrepared( +__global__ void __launch_bounds__(512, 1) verifyPrepared( T const *results, T const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n ) { intptr_t i0 = blockIdx.x*(elt_n/gridDim.x); @@ -1039,16 +1190,34 @@ __global__ void verifyPrepared( bad += tolerance < delta ? 1 : 0; #if 0 if(tolerance < delta) { - printf("verifyPrepared ix=%lld got=%g exp=%g\n", (long long)i, (float)results[i], (float)expected[i]); + printf("verifyPrepared ix=%lld got=%g exp=%g tol=%d\n", (long long)i, (float)results[i], (float)expected[i], tolerance); } #endif i += blockDim.x; } - asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad)); + asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad) : "memory"); +} + +cudaError_t verifyPrepared1(int bytePerElt, + void const *results, void const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n + ) { + void const *fn = nullptr; + switch(bytePerElt) { + case 1: fn = (void const*)&verifyPrepared; break; + case 2: fn = (void const*)&verifyPrepared; break; + case 4: fn = (void const*)&verifyPrepared; break; + case 8: fn = (void const*)&verifyPrepared; break; + default: assert(0); return cudaErrorInvalidValue; + } + dim3 grid = {(unsigned int)block_n, 1, 1}; + dim3 block = {512, 1, 1}; + void *args[5] = {&results, &expected, &elt_n, &tolerance, &bad_elt_n}; + if (grid.x == 0) return cudaSuccess; + return cudaLaunchKernel(fn, grid, block, args, 0, stream); } template -__global__ void verifyInline2( +__global__ void __launch_bounds__(512, 1) verifyInline2( T const *results, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed, intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n ) { @@ -1077,39 +1246,52 @@ __global__ void verifyInline2( #endif i += blockDim.x; } - asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad)); + asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad) : "memory"); } template -void verifyInline1( +cudaError_t verifyInline1( T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n ) { + void const *fn = nullptr; + ReduceNil opnil; + ReduceSum opsum; + ReduceMin opmin; + ReduceMax opmax; + ReduceProd opprod; + ReduceAvg opavg{rank_n}; + ReducePreMulSum oppremulsum; + void *args[8] = {&results, &elt_n, nullptr, &rank_n, &seed, &elt_ix0, &tolerance, &bad_elt_n}; #define CASE_OP(op) \ - if(rank_n == 1) \ - verifyInline2<<>> \ - ((T const*)results, elt_n, ReduceNil(), rank_n, seed, elt_ix0, tolerance, bad_elt_n); \ - else \ - verifyInline2<<>> \ - ((T const*)results, elt_n, op, rank_n, seed, elt_ix0, tolerance, bad_elt_n); \ - break; + if(rank_n == 1) { \ + fn = (void const*)&verifyInline2; \ + args[2] = &opnil; \ + } else { \ + fn = (void const*)&verifyInline2; \ + args[2] = &op; \ + } break; switch(red_op) { - case ncclSum: CASE_OP(ReduceSum()) - case ncclMin: CASE_OP(ReduceMin()) - case ncclMax: CASE_OP(ReduceMax()) - case ncclProd: CASE_OP(ReduceProd()) + case ncclSum: CASE_OP(opsum) + case ncclMin: CASE_OP(opmin) + case ncclMax: CASE_OP(opmax) + case ncclProd: CASE_OP(opprod) #if HAVE_ncclAvg - case ncclAvg: CASE_OP(ReduceAvg{rank_n}) + case ncclAvg: CASE_OP(opavg) #endif #if HAVE_ncclPreMulSum - default: CASE_OP(ReducePreMulSum()) + default: CASE_OP(oppremulsum) #endif } #undef CASE_OP + dim3 grid = {(unsigned int)block_n, 1, 1}; + dim3 block = {512, 1, 1}; + if (grid.x == 0) return cudaSuccess; + return cudaLaunchKernel(fn, grid, block, args, 0, stream); } } -void ncclVerifiableVerify( +cudaError_t ncclVerifiableVerify( void const *results, void const *expected, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, int64_t *bad_elt_n, cudaStream_t stream @@ -1118,11 +1300,21 @@ void ncclVerifiableVerify( #if HAVE_ncclBfloat16 floating |= elt_ty == ncclBfloat16; #endif - + #if HAVE_ncclFloat8 + floating |= elt_ty == ncclFloat8e4m3; + floating |= elt_ty == ncclFloat8e5m2; + #endif + unsigned tolerance = 0; #if HAVE_ncclAvg - if (floating && red_op == ncclAvg) - tolerance = calcSumFloatTolerance(rank_n, elt_ty); + if (floating && red_op == ncclAvg) { + // Average does it's pre-multiplies in an unspecified floating point format + // (could be the actual type T or float or half). That means the premultiply + // verify does could generate a discrepancy in the least mantissa digit. After + // adding those two (since avg only has two non-zero contributions) we could + // be off by a distance of 2 units. + tolerance = 2; + } #endif int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); @@ -1130,9 +1322,9 @@ void ncclVerifiableVerify( *bad_elt_n = 0; #define CASE_TY(T, Uint) { \ if(expected != nullptr) { \ - verifyPrepared<<>>((Uint const*)results, (Uint const*)expected, elt_n, tolerance, bad_elt_n); \ + return verifyPrepared1(sizeof(T), results, expected, elt_n, tolerance, bad_elt_n, stream, block_n); \ } else { \ - verifyInline1((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \ + return verifyInline1((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \ } \ } break; switch(elt_ty) { @@ -1143,29 +1335,30 @@ void ncclVerifiableVerify( case ncclInt64: CASE_TY(int64_t, uint64_t) case ncclUint64: CASE_TY(uint64_t, uint64_t) case ncclFloat16: CASE_TY(half, uint16_t) + #if HAVE_ncclFloat8 + case ncclFloat8e4m3: CASE_TY(__nv_fp8_e4m3, uint8_t) + case ncclFloat8e5m2: CASE_TY(__nv_fp8_e5m2, uint8_t) + #endif #if HAVE_ncclBfloat16 case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t) #endif case ncclFloat32: CASE_TY(float, uint32_t) case ncclFloat64: CASE_TY(double, uint64_t) - default: assert(0); + default: assert(0); return cudaErrorInvalidValue; } #undef CASE_TY } -#endif //////////////////////////////////////////////////////////////////////////////// -#if SELF_TEST -#include - +namespace { template __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, int rank_n) { //if(!std::is_same::value) return; //if(!std::is_same::value) return; //if(rank_n!=3) return; - unsigned tolerance = !IsIntegral::value && std::is_same::value ? calcSumFloatTolerance(rank_n, ty) : 0; + unsigned tolerance = !IsIntegral::value && std::is_same::value ? 2 : 0; uint64_t seed = 0xc8e2bed69766d533; for(int ix=threadIdx.x; ix < 10000; ix+=blockDim.x) { @@ -1202,7 +1395,7 @@ __device__ void sweep1(int ty, char const *tyname) { } } -__global__ void sweep() { +__global__ void __launch_bounds__(512, 1) sweep() { sweep1(ncclInt8, "int8"); sweep1(ncclUint8, "uint8"); sweep1(ncclInt32, "int32"); @@ -1210,18 +1403,18 @@ __global__ void sweep() { sweep1(ncclInt64, "int64"); sweep1(ncclUint64, "uint64"); sweep1(ncclFloat16, "half"); + #if HAVE_ncclFloat8 + sweep1<__nv_fp8_e4m3>(ncclBfloat16, "float8e4m3"); + sweep1<__nv_fp8_e5m2>(ncclBfloat16, "float8e5m2"); + #endif #if HAVE_ncclBfloat16 sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16"); #endif sweep1(ncclFloat32, "float"); sweep1(ncclFloat64, "double"); } - -int main(int arg_n, char **args) { - std::cerr<<"You are hoping to see no output beyond this line."<>>(); - cudaDeviceSynchronize(); - return 0; } -#endif + +void ncclVerifiableLaunchSelfTest() { + sweep<<<1,512>>>(); +} diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h index aca0565a6b..71d5ef6649 100644 --- a/verifiable/verifiable.h +++ b/verifiable/verifiable.h @@ -34,13 +34,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) { } // Enqueue kernel to generate data which is to be reduced. -void ncclVerifiablePrepareInput( +cudaError_t ncclVerifiablePrepareInput( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ); // Enqueue kernel to generate expected results of reduction. -void ncclVerifiablePrepareExpected( +cudaError_t ncclVerifiablePrepareExpected( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ); @@ -51,9 +51,10 @@ void ncclVerifiablePrepareExpected( // which can be costly. Thus if you plan to run the same reduction multiple // times it is advantageous to precompute the expected values with // ncclVerifiablePrepareExpected and pass them as `expected` here. -void ncclVerifiableVerify( +cudaError_t ncclVerifiableVerify( void const *results, void const *expected, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, int64_t *bad_elt_n, cudaStream_t stream ); + #endif From 1021260ca94ea73dcedc8a15ffc6dbfb12504b65 Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 21 Apr 2025 11:26:35 -0700 Subject: [PATCH 2/9] Make verifiable a DSO and add NAME_SUFFIX support Build option DSO=1 generates libverifiable.so which can be used to reduce the combined binary size. Build option NAME_SUFFIX can be used to a add suffix to all generated binaries. e.g. NAME_SUFFIX=_mpi Added new make target: clean_intermediates --- README.md | 32 +++++++++----- src/Makefile | 93 +++++++++++----------------------------- src/common.mk | 69 +++++++++++++++++++++++++++++ verifiable/Makefile | 17 +++++--- verifiable/main.cu | 14 ++++++ verifiable/verifiable.h | 4 ++ verifiable/verifiable.mk | 15 +++++-- 7 files changed, 156 insertions(+), 88 deletions(-) create mode 100644 src/common.mk create mode 100644 verifiable/main.cu diff --git a/README.md b/README.md index 957f6afb90..bdafbe5a16 100644 --- a/README.md +++ b/README.md @@ -4,33 +4,43 @@ These tests check both the performance and the correctness of [NCCL](http://gith ## Build -To build the tests, just type `make`. +To build the tests, just type `make` or `make -j` -If CUDA is not installed in /usr/local/cuda, you may specify CUDA\_HOME. Similarly, if NCCL is not installed in /usr, you may specify NCCL\_HOME. +If CUDA is not installed in `/usr/local/cuda`, you may specify `CUDA_HOME`. Similarly, if NCCL is not installed in `/usr`, you may specify `NCCL_HOME`. ```shell $ make CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl ``` -NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed. +NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set `MPI=1` and set `MPI_HOME` to the path where MPI is installed. ```shell $ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl ``` +You can also add a suffix to the name of the generated binaries with `NAME_SUFFIX`. For example when compiling with the MPI versions you could use: + +```shell +$ make MPI=1 NAME_SUFFIX=_mpi MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl +``` + +This will generate test binaries with names such as `all_reduce_perf_mpi`. + ## Usage -NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread). +NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to `(number of processes)*(number of threads)*(number of GPUs per thread)`. ### Quick examples Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : + ```shell $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 ``` Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes : (NB: The nccl-tests binaries must be compiled with `MPI=1` for this case) + ```shell $ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 ``` @@ -73,7 +83,7 @@ All tests support the same set of arguments : ### Running multiple operations in parallel -NCCL tests allow to partition the set of GPUs into smaller sets, each executing the same operation in parallel. +NCCL tests allow to partition the set of GPUs into smaller sets, each executing the same operation in parallel. To split the GPUs, NCCL will compute a "color" for each rank, based on the `NCCL_TESTS_SPLIT` environment variable, then all ranks with the same color will end up in the same group. The resulting group is printed next to each GPU at the beginning of the test. @@ -82,13 +92,15 @@ with the same color will end up in the same group. The resulting group is printe `NCCL_TESTS_SPLIT_MASK=""` is equivalent to `NCCL_TESTS_SPLIT="&"`. Here are a few examples: - - `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating on the network) - - `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node. - - `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank. + + - `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8"`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating over the inter-node network) + +- `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node. + +- `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank. Note that the reported bandwidth is per group, hence to get the total bandwidth used by all groups, one must multiply by the number of groups. ## Copyright -NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. - +NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. diff --git a/src/Makefile b/src/Makefile index 5737092a86..612395f645 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,73 +1,13 @@ # -# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # +include common.mk -CUDA_HOME ?= /usr/local/cuda -PREFIX ?= /usr/local -VERBOSE ?= 0 -DEBUG ?= 0 - -CUDA_LIB ?= $(CUDA_HOME)/lib64 -CUDA_INC ?= $(CUDA_HOME)/include -NVCC ?= $(CUDA_HOME)/bin/nvcc -CUDARTLIB ?= cudart - -CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) -CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) -CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) - -# Better define NVCC_GENCODE in your environment to the minimal set -# of archs to reduce compile time. -ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) -# Include Blackwell support if we're using CUDA12.8 or above -NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \ - -gencode=arch=compute_90,code=sm_90 \ - -gencode=arch=compute_100,code=sm_100 \ - -gencode=arch=compute_120,code=sm_120 \ - -gencode=arch=compute_120,code=compute_120 -else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0) -NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_80,code=sm_80 \ - -gencode=arch=compute_90,code=sm_90 \ - -gencode=arch=compute_90,code=compute_90 -else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) -NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_80,code=sm_80 \ - -gencode=arch=compute_80,code=compute_80 -else -NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ - -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_70,code=compute_70 -endif - -NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 -CXXFLAGS := -std=c++11 - -LDFLAGS := -L${CUDA_LIB} -lcudart -lrt -NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt - -ifeq ($(DEBUG), 0) -NVCUFLAGS += -O3 -g -CXXFLAGS += -O3 -g -else -NVCUFLAGS += -O0 -G -g -CXXFLAGS += -O0 -g -ggdb3 -endif - -ifneq ($(VERBOSE), 0) -NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -else -.SILENT: -endif +MPI ?= 0 # Set to 1 to enable MPI support (multi-process/multi-node) +NAME_SUFFIX ?= # e.g. _mpi when using MPI=1 +DSO ?= 0 # Set to 1 to create and use libverifiable.so to reduce binary size .PHONY: build clean @@ -92,7 +32,7 @@ DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube -BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX}) build: ${BIN_FILES} @@ -103,18 +43,35 @@ TEST_VERIFIABLE_SRCDIR := ../verifiable TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable include ../verifiable/verifiable.mk +.PRECIOUS: ${DST_DIR}/%.o + ${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS) @printf "Compiling %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} $(NVCC) -o $@ $(NVCUFLAGS) -c $< +${DST_DIR}/%$(NAME_SUFFIX).o: %.cu common.h $(TEST_VERIFIABLE_HDRS) + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + ${DST_DIR}/timer.o: timer.cc timer.h @printf "Compiling %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} - $(CXX) $(CXXFLAGS) -o $@ -c timer.cc + $(CXX) $(CXXFLAGS) -o $@ -c $< -${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) +ifeq ($(DSO), 1) +${DST_DIR}/%_perf$(NAME_SUFFIX): ${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_LIBS) + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable ${NVLDFLAGS} -Xlinker "--enable-new-dtags" -Xlinker "-rpath,\$$ORIGIN:\$$ORIGIN/verifiable" +else +${DST_DIR}/%_perf$(NAME_SUFFIX):${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) @printf "Linking %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} +endif + +clean_intermediates: + rm -f ${DST_DIR}/*.o $(TEST_VERIFIABLE_OBJS) diff --git a/src/common.mk b/src/common.mk new file mode 100644 index 0000000000..2bc7e358a0 --- /dev/null +++ b/src/common.mk @@ -0,0 +1,69 @@ +# +# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG ?= 0 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC ?= $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart + +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) +CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. +ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) +# Include Blackwell support if we're using CUDA12.8 or above +NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \ + -gencode=arch=compute_90,code=sm_90 \ + -gencode=arch=compute_100,code=sm_100 \ + -gencode=arch=compute_120,code=sm_120 \ + -gencode=arch=compute_120,code=compute_120 +else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0) +NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_61,code=sm_61 \ + -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_80,code=sm_80 \ + -gencode=arch=compute_90,code=sm_90 \ + -gencode=arch=compute_90,code=compute_90 +else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_61,code=sm_61 \ + -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_80,code=sm_80 \ + -gencode=arch=compute_80,code=compute_80 +else +NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ + -gencode=arch=compute_50,code=sm_50 \ + -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_61,code=sm_61 \ + -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_70,code=compute_70 +endif + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 +CXXFLAGS := -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt + +ifeq ($(DEBUG), 0) +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else +.SILENT: +endif diff --git a/verifiable/Makefile b/verifiable/Makefile index b141a2a7c5..bb90001e1e 100644 --- a/verifiable/Makefile +++ b/verifiable/Makefile @@ -1,13 +1,18 @@ -include ../../makefiles/common.mk +# +# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# +include ../src/common.mk .PHONY: all clean -BUILDDIR := $(abspath ../../build) +BUILDDIR := $(abspath ../build) NCCLDIR := $(BUILDDIR) NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include -DST_DIR := $(BUILDDIR)/test/verifiable +DST_DIR := $(BUILDDIR)/verifiable -all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o +all: $(DST_DIR)/self_test clean: rm -rf $(DST_DIR) @@ -18,7 +23,7 @@ include verifiable.mk self_test: $(DST_DIR)/self_test -$(DST_DIR)/self_test: verifiable.cu verifiable.h +$(DST_DIR)/self_test: main.cu $(TEST_VERIFIABLE_LIBS) @printf "Linking %s\n" $@ @mkdir -p $(DST_DIR) - $(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS) + $(NVCC) -o $@ $(NVCUFLAGS) $< -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable $(NVLDFLAGS) -Xlinker "-rpath=\$$ORIGIN" diff --git a/verifiable/main.cu b/verifiable/main.cu new file mode 100644 index 0000000000..4e4aef6713 --- /dev/null +++ b/verifiable/main.cu @@ -0,0 +1,14 @@ + +#include +#include + +#define NCCL_VERIFIABLE_SELF_TEST 1 +#include "verifiable.h" + +int main(int arg_n, char **args) { + std::cerr<<"You are hoping to see no output beyond this line."< -# TEST_VERIFIABLE_BUILDDIR = +# TEST_VERIFIABLE_BUILDDIR = TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o +TEST_VERIFIABLE_LIBS = $(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so -$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS) +$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFIABLE_HDRS) @printf "Compiling %s\n" $@ @mkdir -p $(TEST_VERIFIABLE_BUILDDIR) - $(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu + $(NVCC) -Xcompiler "-fPIC" -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu + +$(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so: $(TEST_VERIFIABLE_OBJS) + @printf "Creating DSO %s\n" $@ + @mkdir -p $(TEST_VERIFIABLE_BUILDDIR) + $(CC) -shared -o $@.0 $^ -Wl,-soname,$(notdir $@).0 + ln -sf $(notdir $@).0 $@ From e041d901e6d3dabb67a22905cba77d9ba2689898 Mon Sep 17 00:00:00 2001 From: David Addison Date: Wed, 7 May 2025 10:30:59 -0700 Subject: [PATCH 3/9] Re-add sm_70 support for CUDA 12.8+ and 13.0 builds --- src/common.mk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/common.mk b/src/common.mk index 2bc7e358a0..5fd9418860 100644 --- a/src/common.mk +++ b/src/common.mk @@ -21,7 +21,8 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) # of archs to reduce compile time. ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) # Include Blackwell support if we're using CUDA12.8 or above -NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \ +NVCC_GENCODE ?= -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_80,code=sm_80 \ -gencode=arch=compute_90,code=sm_90 \ -gencode=arch=compute_100,code=sm_100 \ -gencode=arch=compute_120,code=sm_120 \ From a5c539e68bb7263304997012498b0cd0667b99e8 Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 19 May 2025 18:20:22 -0700 Subject: [PATCH 4/9] Add support for Symmetric Memory Registration From NCCL 2.27.x we can now use the Symmetric Memory APIs (-R 2) --- README.md | 2 +- src/common.cu | 58 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index bdafbe5a16..22687d4771 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ All tests support the same set of arguments : * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. * `-G,--cudagraph ` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0. * `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0. - * `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0. + * `-R,--local_register <0/1/2> enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0. * `-T,--timeout