diff --git a/src/all_reduce.cu b/src/all_reduce.cu index cdf781cd7a..d64371bbb2 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -65,7 +65,6 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t ncclRedOp_t *run_ops; const char **run_typenames, **run_opnames; int type_count, op_count; - if ((int)type != -1) { type_count = 1; run_types = &type; @@ -89,8 +88,8 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t for (int i=0; i= 2 ncclDataType_t test_types[ncclNumTypes] = { ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble @@ -38,7 +45,7 @@ size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB , ncclBfloat16 #endif #if RCCL_FLOAT8 == 1 - , ncclFp8E4M3, ncclFp8E5M2 + , ncclFloat8e4m3, ncclFloat8e5m2 #endif }; const char *test_typenames[ncclNumTypes] = { @@ -196,6 +203,7 @@ void Reporter::addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size } bool Reporter::isMainThread() { return is_main_thread == 1; } +static int minCudaArch = 1<<30; #define NUM_BLOCKS 32 @@ -304,18 +312,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch) } testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) { - ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault); + CUDACHECK(ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault)); CUDACHECK(cudaDeviceSynchronize()); return testSuccess; } testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) { - ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault); + CUDACHECK(ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault)); return testSuccess; } testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) { - ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault); + CUDACHECK(ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault)); return testSuccess; } @@ -563,8 +571,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t case ncclBfloat16: bf16 = ncclVerifiablePremulScalar(rank); break; #endif #if defined(RCCL_FLOAT8) - case ncclFp8E4M3: fp8_e4m3 = ncclVerifiablePremulScalar(rank); break; - case ncclFp8E5M2: fp8_e5m2 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat8e4m3: fp8_e4m3 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat8e5m2 : fp8_e5m2 = ncclVerifiablePremulScalar(rank); break; #endif case ncclNumTypes: break; } @@ -1330,6 +1338,13 @@ testResult_t run() { char hostname[1024]; getHostName(hostname, 1024); + hipDeviceProp_t devProp; + CUDACHECK(hipGetDeviceProperties(&devProp, 0)); + if (IsArchMatch(devProp.gcnArchName, "gfx942")) { + PRINT("On gfx942 architecture, using FNUZ FP8 types"); + rccl_float8_useFnuz = true; + } + #ifdef MPI_SUPPORT MPI_Comm_size(MPI_COMM_WORLD, &totalProcs); MPI_Comm_rank(MPI_COMM_WORLD, &proc); @@ -1456,12 +1471,21 @@ testResult_t run() { gpus[i] = ((gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i)%numDevices; CUDACHECK(cudaSetDevice(gpus[i])); TESTCHECK(AllocateBuffs(sendbuffs.data()+i, sendBytes, recvbuffs.data()+i, recvBytes, expected.data()+i, (size_t)maxBytes)); - if (streamnull) + if (streamnull) { streams[i] = NULL; - else + } + else { CUDACHECK(cudaStreamCreateWithFlags(streams.data()+i, cudaStreamNonBlocking)); + } + int archMajor, archMinor; + CUDACHECK(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, gpus[i])); + CUDACHECK(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, gpus[i])); + minCudaArch = std::min(minCudaArch, 100*archMajor + 10*archMinor); } +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &minCudaArch, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); +#endif //if parallel init is not selected, use main thread to initialize NCCL ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus); #if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0) diff --git a/src/common.h b/src/common.h index 1b368cd28a..5a3623cddb 100644 --- a/src/common.h +++ b/src/common.h @@ -258,8 +258,8 @@ static size_t wordSize(ncclDataType_t type) { //case ncclInt8: case ncclUint8: #if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1 - case ncclFp8E4M3: - case ncclFp8E5M2: + case ncclFloat8e4m3: + case ncclFloat8e5m2: #endif #endif return 1; diff --git a/src/rccl_float8.h b/src/rccl_float8.h index 01cab41f71..76bd4f35a1 100644 --- a/src/rccl_float8.h +++ b/src/rccl_float8.h @@ -24,8 +24,9 @@ #define ROCBLAS_FLOAT8_H #include +#include -#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) +#if __cplusplus < 201103L || (!defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)) /*! \brief Struct to represent a 8 bit floating-point number. */ typedef struct @@ -38,7 +39,60 @@ typedef struct uint8_t data; } rccl_bfloat8; -#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) +// __cplusplus < 201103L || (!defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)) +#elif HIP_VERSION >= 60200000 + +#include + +#if __HIP_DEVICE_COMPILE__ && (defined(__gfx950__) || defined(__gfx1200__) || defined(__gfx1201__) || (defined(__gfx1100__) || defined(__gfx1101__)))//HIP_FP8_TYPE_OCP is enabled. +typedef __hip_fp8_e4m3 rccl_float8; +typedef __hip_fp8_e5m2 rccl_bfloat8; +#elif __HIP_DEVICE_COMPILE__ && (defined(__gfx942__)) +typedef __hip_fp8_e4m3_fnuz rccl_float8; +typedef __hip_fp8_e5m2_fnuz rccl_bfloat8; +#else +typedef __hip_fp8_e4m3 rccl_float8; +typedef __hip_fp8_e5m2 rccl_bfloat8; +#endif + +#if __HIP_DEVICE_COMPILE__ +inline std::ostream& operator<<(std::ostream& os, const rccl_float8& f8) +{ + return os << float(f8); +} + +inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat8& bf8) +{ + return os << float(bf8); +} + +#else +inline std::ostream& operator<<(std::ostream& os, const __hip_fp8_e4m3& f8) +{ + return os << float(f8); +} + +inline std::ostream& operator<<(std::ostream& os, const __hip_fp8_e5m2& bf8) +{ + return os << float(bf8); +} + +//adding support for those operators on the host side +inline std::ostream& operator<<(std::ostream& os, const __hip_fp8_e4m3_fnuz& f8) +{ + return os << float(f8); +} + +inline std::ostream& operator<<(std::ostream& os, const __hip_fp8_e5m2_fnuz& bf8) +{ + return os << float(bf8); +} +#endif + +extern bool rccl_float8_useFnuz; +// For older versions of ROCm that do not include hip_fp8.h, +// we provide a local version of the header file as a fallback. +#else #define HIP_HOST_DEVICE __host__ __device__ #define HIP_HOST __host__ @@ -344,7 +398,7 @@ struct rccl_float8 // default constructor HIP_HOST_DEVICE rccl_float8() = default; -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) // device specific optimized F8 down-conversion code template @@ -381,10 +435,10 @@ struct rccl_float8 return i8data; } -#endif // __gfx940__ +#endif // __gfx942__ // constructor from float -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) // NOTE: ON-DEVICE... always optimal bias explicit HIP_DEVICE rccl_float8(float v, @@ -402,7 +456,7 @@ struct rccl_float8 // Host only implementation using s/w simulation explicit HIP_HOST #else - // both Host and DEVICE for non-gfx940 using s/w simulation + // both Host and DEVICE for non-gfx942 using s/w simulation explicit HIP_HOST_DEVICE #endif rccl_float8(float v, @@ -446,7 +500,7 @@ struct rccl_float8 } // convert to float -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) // upcast using device specific intrinsic explicit inline HIP_DEVICE operator float() const { @@ -460,7 +514,7 @@ struct rccl_float8 } explicit inline HIP_HOST operator float() const -#else // non gfx940 +#else // non gfx942 explicit inline HIP_HOST_DEVICE operator float() const #endif { @@ -511,7 +565,7 @@ struct rccl_bfloat8 // default constructor HIP_HOST_DEVICE rccl_bfloat8() = default; -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) // device specific optimized F8 down-conversion code template @@ -548,10 +602,10 @@ struct rccl_bfloat8 return i8data; } -#endif // __gfx940__ +#endif // __gfx942__ // constructor from float -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) // NOTE: ON-DEVICE... always optimal bias explicit HIP_DEVICE rccl_bfloat8(float v, @@ -569,7 +623,7 @@ struct rccl_bfloat8 // Host only implementation using s/w simulation explicit HIP_HOST #else - // both Host and DEVICE for non-gfx940 using s/w simulation + // both Host and DEVICE for non-gfx942 using s/w simulation explicit HIP_HOST_DEVICE #endif rccl_bfloat8(float v, @@ -613,7 +667,7 @@ struct rccl_bfloat8 } // convert to float -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) // upcast using device specific intrinsic explicit inline HIP_DEVICE operator float() const { @@ -627,7 +681,7 @@ struct rccl_bfloat8 } explicit inline HIP_HOST operator float() const -#else // non gfx940 +#else // non gfx942 explicit inline HIP_HOST_DEVICE operator float() const #endif { @@ -969,7 +1023,7 @@ inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng = 0) return a; } -// Use h/w intrinsic and optimized version when __gfx940__ +// Use h/w intrinsic and optimized version when __gfx942__ template < typename T, typename Ta, @@ -980,7 +1034,7 @@ template < = 0> inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng) { -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx942__) || defined(__gfx950__) // NOTE: we are directly calling cast_to_f8_from_f32 instead of constructor to optimize away one runtime branch T val; if(std::is_same::value) @@ -988,12 +1042,12 @@ inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng) else val.data = rccl_bfloat8::cast_to_bf8_from_f32(float(a), rng); return val; -#else // non gfx940 +#else // non gfx942 return T(float(a), stochastic_rounding ? T::rocblas_hip_f8_rounding_mode::stochastic : T::rocblas_hip_f8_rounding_mode::standard, rng); -#endif // __gfx940__ +#endif // __gfx942__ } // NOTE NOTE: The above code is good if we don't consider HIP-GEMM code and only consider the quantization @@ -1016,6 +1070,6 @@ inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng) // ================================================================================================= -#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) +#endif #endif // ROCBLAS_FLOAT8_H diff --git a/src/reduce.cu b/src/reduce.cu index c8ee2f84a6..f8c059e140 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -96,8 +96,8 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ for (int i=0; i= NCCL_VERSION(2,10,0) && RCCL_FLOAT8 == 1 - #define HAVE_ncclfp8 1 - // Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,3) - #define ncclFp8E4M3 ncclFloat8e4m3 - #define ncclFp8E5M2 ncclFloat8e5m2 - #endif +#if __HIP_DEVICE_COMPILE__ + #define HAVE_ncclfp8_DEVICE 1 +#else + #define HAVE_ncclfp8_HOST 1 +#endif +// Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,24,3) + #define ncclFp8E4M3 ncclFloat8e4m3 + #define ncclFp8E5M2 ncclFloat8e5m2 +#endif #else #define HAVE_ncclfp8 0 #endif @@ -130,23 +134,39 @@ __host__ __device__ T inhibit(T x) { //////////////////////////////////////////////////////////////////////////////// namespace { - template - __host__ __device__ Y castTo(X x) { + template + __host__ __device__ Y castTo(uint64_t x) { return Y(x); } template __host__ __device__ Y castTo(float x) { return Y(x); } + template + __host__ __device__ Y castTo(double x) { + return Y(x); + } template<> __host__ __device__ __half castTo<__half>(float x) { return __float2half(x); } + template<> + __host__ __device__ half castTo<__half>(uint64_t x) { + return __ull2half_rn(x); + } #if RCCL_BFLOAT16 == 1 template<> __host__ __device__ hip_bfloat16 castTo(float x) { return hip_bfloat16(x); } + template<> + __host__ __device__ hip_bfloat16 castTo(double x) { + return hip_bfloat16(x); + } + template<> + __host__ __device__ hip_bfloat16 castTo(uint64_t x) { + return hip_bfloat16((double)x); + } #endif #if RCCL_FLOAT8 == 1 template<> @@ -157,6 +177,22 @@ namespace { __host__ __device__ rccl_bfloat8 castTo(float x) { return static_cast(x); } + template<> + __host__ __device__ rccl_float8 castTo(double x) { + return static_cast(x); + } + template<> + __host__ __device__ rccl_float8 castTo(uint64_t x) { + return static_cast((double)x); + } + template<> + __host__ __device__ rccl_bfloat8 castTo(double x) { + return static_cast(x); + } + template<> + __host__ __device__ rccl_bfloat8 castTo(uint64_t x) { + return static_cast((double)x); + } #endif } @@ -211,16 +247,16 @@ struct ReduceProd { #endif #if RCCL_FLOAT8 == 1 __host__ __device__ rccl_float8 operator()(rccl_float8 a, rccl_float8 b) const { - return static_cast(a * b); + return static_cast(float(a) * float(b)); } __host__ __device__ rccl_float8 operator()(rccl_float8 a, float b) const { - return static_cast(a * b); + return static_cast(float(a) * float(b)); } __host__ __device__ rccl_bfloat8 operator()(rccl_bfloat8 a, rccl_bfloat8 b) const { - return static_cast(a * b); + return static_cast(float(a) * float(b)); } __host__ __device__ rccl_bfloat8 operator()(rccl_bfloat8 a, float b) const { - return static_cast(a * b); + return static_cast(float(a) * float(b)); } #endif template @@ -328,40 +364,72 @@ struct ReduceAvg { namespace { template -struct FloatLayout; +struct FloatLayout { static constexpr bool is_floating_point = false; }; template<> struct FloatLayout { + static constexpr bool is_floating_point = true; static constexpr int exponent_bits = 8, mantissa_bits = 23; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; template<> struct FloatLayout { + static constexpr bool is_floating_point = true; static constexpr int exponent_bits = 11, mantissa_bits = 52; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; template<> struct FloatLayout<__half> { + static constexpr bool is_floating_point = true; static constexpr int exponent_bits = 5, mantissa_bits = 10; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; #if RCCL_BFLOAT16 == 1 template<> struct FloatLayout { + static constexpr bool is_floating_point = true; static constexpr int exponent_bits = 8, mantissa_bits = 7; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; #endif #if RCCL_FLOAT8 == 1 +#if __HIP_DEVICE_COMPILE__ template<> struct FloatLayout { + static constexpr bool is_floating_point = true; static constexpr int exponent_bits = 4, mantissa_bits = 3; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; template<> struct FloatLayout { + static constexpr bool is_floating_point = true; static constexpr int exponent_bits = 5, mantissa_bits = 2; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; +#else +template<> +struct FloatLayout<__hip_fp8_e4m3> { + static constexpr bool is_floating_point = true; + static constexpr int exponent_bits = 4, mantissa_bits = 3; + static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; +}; +template<> +struct FloatLayout<__hip_fp8_e5m2> { + static constexpr bool is_floating_point = true; + static constexpr int exponent_bits = 5, mantissa_bits = 2; + static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; +}; + +template<> +struct FloatLayout<__hip_fp8_e4m3_fnuz> { + static constexpr int exponent_bits = 4, mantissa_bits = 3; + static constexpr int exponent_bias = (1<<(exponent_bits-1)); +}; +template<> +struct FloatLayout<__hip_fp8_e5m2_fnuz> { + static constexpr int exponent_bits = 5, mantissa_bits = 2; + static constexpr int exponent_bias = (1<<(exponent_bits-1)); +}; +#endif #endif template @@ -675,11 +743,12 @@ __host__ __device__ void genOutput( //////////////////////////////////////////////////////////////////////////////// // Nil reduction (byte copy functions). Optimized to assume rank_n=1 +// genInput specialization for integer ReduceNil. namespace { -template +template __host__ __device__ void genInput( T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index, - std::integral_constant + std::true_type /*integral*/ ) { (void)rank_n, (void)rank_me; // silence unused warnings union { uint64_t bits; T tmp; }; @@ -689,6 +758,24 @@ __host__ __device__ void genInput( ans = tmp; } +// genInput specialization for floating point ReduceNil. +template +__host__ __device__ void genInput( + T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index, + std::false_type /*integral*/ + ) { + (void)rank_n; // silence unused warnings + constexpr uint64_t mant_mask = (uint64_t(1) << FloatLayout::mantissa_bits)-1; + uint64_t rng = hashOf(index ^ index<<16 ^ rank_me, seed); + int sign = rng & 1; + rng ^= rng>>1; + int exp = rng & ((1<<(FloatLayout::exponent_bits-1))-1); + exp += 1<<(FloatLayout::exponent_bits-2); + rng ^= rng >> FloatLayout::exponent_bits; + uint64_t mant = rng & mant_mask; + ans = makeFloat(sign, exp, mant); +} + template __host__ __device__ void genOutput( T &ans, ReduceNil op, int rank_n, uint64_t seed, intptr_t index, @@ -777,22 +864,35 @@ __host__ __device__ void genOutput( namespace { template __host__ __device__ void genInput( - T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t seed, intptr_t index, + T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t rng, intptr_t index, std::false_type /*integral*/ ) { - ans = genInOutFloatSum(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/true); + // We can't control the nranks divisor in avareages so to control error we + // limit to two ranks contributing non-zero values. This way there is no ambiguity + // of summation. + int r = shuffleRank(rank_n, rank_me, rng); + uint64_t m = (rng*(r ? 0xbeef : 1)) & ((1ul<::mantissa_bits)-1); + ans = r < 2 ? castTo(1+m) : castTo((uint64_t)0); } template __host__ __device__ void genOutput( - T &ans, ReduceAvg, int rank_n, uint64_t seed, intptr_t index, + T &ans, ReduceAvg, int rank_n, uint64_t rng, intptr_t index, std::false_type /*integral*/ ) { - ans = genInOutFloatSum(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true); - using T1 = typename std::conditional<(sizeof(T)::type; - //ans = ReduceProd()(ans, T1(1)/T1(rank_n)); - ans = ReduceProd()(ans, inhibit(castTo(T1(1)/T1(rank_n)))); - } + shuffleRank(rank_n, -1, rng); + uint64_t m0 = (rng*(0 ? 0xbeef : 1)) & ((1ul<::mantissa_bits)-1); + uint64_t m1 = (rng*(1 ? 0xbeef : 1)) & ((1ul<::mantissa_bits)-1); + if (rank_n == 1) { + ans = castTo(1+m0); + } else { + // NCCL varies which datatype it does the muls with depending on __CUDA_ARCH__. + // We account for this by using a tolerance of 2 ulps during the verification. + using TMul = typename std::conditional<(sizeof(T) < sizeof(double)), float, double>::type; + ans = ReduceSum()((T)(TMul(1+m0)*TMul(1.0/rank_n)), + (T)(TMul(1+m1)*TMul(1.0/rank_n))); + } +} } ///////////////////////////////////////////////////////////////////////////////// @@ -856,7 +956,7 @@ __host__ __device__ T genOutput( #if !SELF_TEST namespace { template -__global__ void prepareInput2( +__global__ void __launch_bounds__(512, 1) prepareInput2( T *elts, intptr_t elt_n, ReduceFn op, int rank_n, int rank_me, uint64_t seed, intptr_t elt_ix0 ) { @@ -877,44 +977,55 @@ __global__ void prepareInput2( } template -void prepareInput1( +cudaError_t prepareInput1( void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ) { - int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); - #define CASE_TY(T) prepareInput2<<>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break; + void const *fn = nullptr; switch(elt_ty) { - case ncclInt8: CASE_TY(int8_t) - case ncclUint8: CASE_TY(uint8_t) - case ncclInt32: CASE_TY(int32_t) - case ncclUint32: CASE_TY(uint32_t) - case ncclInt64: CASE_TY(int64_t) - case ncclUint64: CASE_TY(uint64_t) - case ncclFloat16: CASE_TY(__half) + case ncclInt8: fn = (void const*)&prepareInput2; break; + case ncclUint8: fn = (void const*)&prepareInput2; break; + case ncclInt32: fn = (void const*)&prepareInput2; break; + case ncclUint32: fn = (void const*)&prepareInput2; break; + case ncclInt64: fn = (void const*)&prepareInput2; break; + case ncclUint64: fn = (void const*)&prepareInput2; break; + case ncclFloat16: fn = (void const*)&prepareInput2<__half, ReduceOp>; break; #if HAVE_ncclBfloat16 - case ncclBfloat16: CASE_TY(hip_bfloat16) + case ncclBfloat16: fn = (void const*)&prepareInput2; break; #endif - #if HAVE_ncclfp8 - case ncclFp8E4M3: CASE_TY(rccl_float8) - case ncclFp8E5M2: CASE_TY(rccl_bfloat8) + #if HAVE_ncclfp8_DEVICE + case ncclFloat8e4m3: fn = (void const*)&prepareInput2; break; + case ncclFloat8e5m2: fn = (void const*)&prepareInput2; break; #endif - case ncclFloat32: CASE_TY(float) - case ncclFloat64: CASE_TY(double) - default: assert(0); + #if HAVE_ncclfp8_HOST + case ncclFloat8e4m3: if (rccl_float8_useFnuz) { fn = (void const*)&prepareInput2<__hip_fp8_e4m3_fnuz, ReduceOp>; break;} + else { fn = (void const*)&prepareInput2<__hip_fp8_e4m3, ReduceOp>; break;} + case ncclFloat8e5m2: if (rccl_float8_useFnuz) { fn = (void const*)&prepareInput2<__hip_fp8_e5m2_fnuz, ReduceOp>; break;} + else { fn = (void const*)&prepareInput2<__hip_fp8_e5m2, ReduceOp>; break;} + #endif + case ncclFloat32: fn = (void const*)&prepareInput2; break; + case ncclFloat64: fn = (void const*)&prepareInput2; break; + default: assert(0); return cudaErrorInvalidValue; } #undef CASE_TY + dim3 grid = {1, 1, 1}; + grid.x = (unsigned int)std::min(32, (elt_n + 4*512-1)/(4*512)); + dim3 block = {512, 1, 1}; + void *args[7] = {&elts, &elt_n, &op, &rank_n, &rank_me, &seed, &elt_ix0}; + if (grid.x == 0) return cudaSuccess; + return cudaLaunchKernel(fn, grid, block, args, 0, stream); } } -void ncclVerifiablePrepareInput( +hipError_t ncclVerifiablePrepareInput( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ) { #define CASE_OP(op) \ if(rank_n == 1) \ - prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \ + return prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \ else \ - prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \ + return prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \ break; switch(red_op) { case ncclSum: CASE_OP(ReduceSum()) @@ -937,7 +1048,7 @@ void ncclVerifiablePrepareInput( #if !SELF_TEST namespace { template -__global__ void prepareExpected2( +__global__ void __launch_bounds__(512, 1) prepareExpected2( T *elts, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed, intptr_t elt_ix0 ) { @@ -957,44 +1068,55 @@ __global__ void prepareExpected2( } template -void prepareExpected1( +cudaError_t prepareExpected1( void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ) { - int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); - #define CASE_TY(T) prepareExpected2<<>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break; + void const *fn = nullptr; switch(elt_ty) { - case ncclInt8: CASE_TY(int8_t) - case ncclUint8: CASE_TY(uint8_t) - case ncclInt32: CASE_TY(int32_t) - case ncclUint32: CASE_TY(uint32_t) - case ncclInt64: CASE_TY(int64_t) - case ncclUint64: CASE_TY(uint64_t) - case ncclFloat16: CASE_TY(__half) + case ncclInt8: fn = (void const*)&prepareExpected2; break; + case ncclUint8: fn = (void const*)&prepareExpected2; break; + case ncclInt32: fn = (void const*)&prepareExpected2; break; + case ncclUint32: fn = (void const*)&prepareExpected2; break; + case ncclInt64: fn = (void const*)&prepareExpected2; break; + case ncclUint64: fn = (void const*)&prepareExpected2; break; + case ncclFloat16: fn = (void const*)&prepareExpected2<__half, ReduceOp>; break; #if HAVE_ncclBfloat16 - case ncclBfloat16: CASE_TY(hip_bfloat16) + case ncclBfloat16: fn = (void const*)&prepareExpected2; break; #endif - #if HAVE_ncclfp8 - case ncclFp8E4M3: CASE_TY(rccl_float8) - case ncclFp8E5M2: CASE_TY(rccl_bfloat8) + #if HAVE_ncclfp8_DEVICE + case ncclFloat8e4m3: fn = (void const*)&prepareExpected2; break; + case ncclFloat8e5m2: fn = (void const*)&prepareExpected2; break; #endif - case ncclFloat32: CASE_TY(float) - case ncclFloat64: CASE_TY(double) - default: assert(0); + #if HAVE_ncclfp8_HOST + case ncclFloat8e4m3: if (rccl_float8_useFnuz) { fn = (void const*)&prepareExpected2<__hip_fp8_e4m3_fnuz, ReduceOp>; break; } + else { fn = (void const*)&prepareExpected2<__hip_fp8_e4m3, ReduceOp>; break; } + case ncclFloat8e5m2: if (rccl_float8_useFnuz) { fn = (void const*)&prepareExpected2<__hip_fp8_e5m2_fnuz, ReduceOp>; break; } + else { fn = (void const*)&prepareExpected2<__hip_fp8_e5m2, ReduceOp>; break; } + #endif + case ncclFloat32: { fn = (void const*)&prepareExpected2; break; } + case ncclFloat64: { fn = (void const*)&prepareExpected2; break; } + default: assert(0); return cudaErrorInvalidValue; } - #undef CASE_TY + + dim3 grid = {1, 1, 1}; + grid.x = (unsigned int)std::min(32, (elt_n + 4*512-1)/(4*512)); + dim3 block = {512, 1, 1}; + void *args[6] = {&elts, &elt_n, &op, &rank_n, &seed, &elt_ix0}; + if (grid.x == 0) return cudaSuccess; + return cudaLaunchKernel(fn, grid, block, args, 0, stream); } } -void ncclVerifiablePrepareExpected( +hipError_t ncclVerifiablePrepareExpected( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ) { #define CASE_OP(op) \ if(rank_n == 1) \ - prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \ + return prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \ else \ - prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \ + return prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \ break; switch(red_op) { case ncclSum: CASE_OP(ReduceSum()) @@ -1015,54 +1137,6 @@ void ncclVerifiablePrepareExpected( //////////////////////////////////////////////////////////////////////////////// namespace { -/* How we compare floating point values when exactness is impossible is interesting. - * First, we take note that simply reinterpreting integer bits as floating point - * gives us a monotonic mapping which exponentially spaces out floats. Thus - * consecutive integers encode consecutive floats. In general, using integer - * subraction on the bitpatterns of two floats gives us an integer which is the - * logarithm of their relative difference. But, if the floats always have similar - * exponents, than the integer difference is actually proportional to the - * relative error (this is because we are counting hops in the mantissa bits only, - * not the exponent bits). So a cheap way to compare if two floats are relatively - * close is: abs(intBits(a), intBits(b)) < tolerance. The following formula - * calculates such a tolerance for a summation of n floats. This formula - * was derived by inspecting the maximum observed integer difference over many - * random runs of summation. The parameter values were computed by the - * companion program "inexact_regress.cu". - */ -__host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) { - float power, coef; - switch(elt_ty) { - case ncclFloat32: - case ncclFloat64: - power = .51f; - coef = 1.25f; - break; - case ncclFloat16: - power = .91f; - coef = .75f; - break; - #if HAVE_ncclBfloat16 - case ncclBfloat16: - power = .91f; - coef = .66f; - break; - #endif - #if HAVE_ncclfp8 - case ncclFp8E4M3: - case ncclFp8E5M2: - power = .91f; - coef = .66f; - break; - #endif - } - #if __CUDA_ARCH__ - return 1 + unsigned(coef*powf(float(rank_n), power)); - #else - return 1 + unsigned(coef*std::pow(float(rank_n), power)); - #endif -} - template __host__ __device__ uint64_t calcDelta(T a, T b) { union { T t; uint8_t i1; uint16_t i2; uint32_t i4; uint64_t i8; } x, y; @@ -1082,7 +1156,7 @@ __host__ __device__ uint64_t calcDelta(T a, T b) { #if !SELF_TEST namespace { template -__global__ void verifyPrepared( +__global__ void __launch_bounds__(512, 1) verifyPrepared( T const *results, T const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n ) { intptr_t i0 = blockIdx.x*(elt_n/gridDim.x); @@ -1098,17 +1172,35 @@ __global__ void verifyPrepared( bad += tolerance < delta ? 1 : 0; #if 0 if(tolerance < delta) { - printf("verifyPrepared ix=%lld got=%g exp=%g\n", (long long)i, (float)results[i], (float)expected[i]); + printf("verifyPrepared ix=%lld got=%g exp=%g tol=%d\n", (long long)i, (float)results[i], (float)expected[i], tolerance); } #endif i += blockDim.x; } - //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad)); + //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad) : "memory"); atomicAdd((unsigned long *)bad_elt_n, (unsigned long)bad); } +hipError_t verifyPrepared1(int bytePerElt, + void const *results, void const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n +) { + void const *fn = nullptr; + switch(bytePerElt) { + case 1: fn = (void const*)&verifyPrepared; break; + case 2: fn = (void const*)&verifyPrepared; break; + case 4: fn = (void const*)&verifyPrepared; break; + case 8: fn = (void const*)&verifyPrepared; break; + default: assert(0); return cudaErrorInvalidValue; + } + dim3 grid = {(unsigned int)block_n, 1, 1}; + dim3 block = {512, 1, 1}; + void *args[5] = {&results, &expected, &elt_n, &tolerance, &bad_elt_n}; + if (grid.x == 0) return cudaSuccess; + return cudaLaunchKernel(fn, grid, block, args, 0, stream); +} + template -__global__ void verifyInline2( +__global__ void __launch_bounds__(512, 1) verifyInline2( T const *results, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed, intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n ) { @@ -1142,35 +1234,49 @@ __global__ void verifyInline2( } template -void verifyInline1( +hipError_t verifyInline1( T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n ) { + void const *fn = nullptr; + ReduceNil opnil; + ReduceSum opsum; + ReduceMin opmin; + ReduceMax opmax; + ReduceProd opprod; + ReduceAvg opavg{rank_n}; + ReducePreMulSum oppremulsum; + void *args[8] = {&results, &elt_n, nullptr, &rank_n, &seed, &elt_ix0, &tolerance, &bad_elt_n}; + #define CASE_OP(op) \ - if(rank_n == 1) \ - verifyInline2<<>> \ - ((T const*)results, elt_n, ReduceNil(), rank_n, seed, elt_ix0, tolerance, bad_elt_n); \ - else \ - verifyInline2<<>> \ - ((T const*)results, elt_n, op, rank_n, seed, elt_ix0, tolerance, bad_elt_n); \ - break; + if(rank_n == 1) { \ + fn = (void const*)&verifyInline2; \ + args[2] = &opnil; \ + } else { \ + fn = (void const*)&verifyInline2; \ + args[2] = &op; \ + } break; switch(red_op) { - case ncclSum: CASE_OP(ReduceSum()) - case ncclMin: CASE_OP(ReduceMin()) - case ncclMax: CASE_OP(ReduceMax()) - case ncclProd: CASE_OP(ReduceProd()) + case ncclSum: CASE_OP(opsum) + case ncclMin: CASE_OP(opmin) + case ncclMax: CASE_OP(opmax) + case ncclProd: CASE_OP(opprod) #if HAVE_ncclAvg - case ncclAvg: CASE_OP(ReduceAvg{rank_n}) + case ncclAvg: CASE_OP(opavg) #endif #if HAVE_ncclPreMulSum - default: CASE_OP(ReducePreMulSum()) + default: CASE_OP(oppremulsum) #endif } #undef CASE_OP + dim3 grid = {(unsigned int)block_n, 1, 1}; + dim3 block = {512, 1, 1}; + if (grid.x == 0) return cudaSuccess; + return cudaLaunchKernel(fn, grid, block, args, 0, stream); } } -void ncclVerifiableVerify( +hipError_t ncclVerifiableVerify( void const *results, void const *expected, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, int64_t *bad_elt_n, cudaStream_t stream @@ -1179,15 +1285,21 @@ void ncclVerifiableVerify( #if HAVE_ncclBfloat16 floating |= elt_ty == ncclBfloat16; #endif - #if HAVE_ncclfp8 - floating |= elt_ty == ncclFp8E4M3; - floating |= elt_ty == ncclFp8E5M2; + #if HAVE_ncclfp8_DEVICE || HAVE_ncclfp8_HOST + floating |= elt_ty == ncclFloat8e4m3; + floating |= elt_ty == ncclFloat8e5m2; #endif unsigned tolerance = 0; #if HAVE_ncclAvg - if (floating && red_op == ncclAvg) - tolerance = calcSumFloatTolerance(rank_n, elt_ty); + if (floating && red_op == ncclAvg) { + // Average does it's pre-multiplies in an unspecified floating point format + // (could be the actual type T or float or half). That means the premultiply + // verify does could generate a discrepancy in the least mantissa digit. After + // adding those two (since avg only has two non-zero contributions) we could + // be off by a distance of 2 units. + tolerance = 2; + } #endif int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); @@ -1195,9 +1307,9 @@ void ncclVerifiableVerify( *bad_elt_n = 0; #define CASE_TY(T, Uint) { \ if(expected != nullptr) { \ - verifyPrepared<<>>((Uint const*)results, (Uint const*)expected, elt_n, tolerance, bad_elt_n); \ + return verifyPrepared1(sizeof(T), results, expected, elt_n, tolerance, bad_elt_n, stream, block_n); \ } else { \ - verifyInline1((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \ + return verifyInline1((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \ } \ } break; switch(elt_ty) { @@ -1211,13 +1323,19 @@ void ncclVerifiableVerify( #if HAVE_ncclBfloat16 case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t) #endif - #if HAVE_ncclfp8 - case ncclFp8E4M3: CASE_TY(rccl_float8, uint8_t) - case ncclFp8E5M2: CASE_TY(rccl_bfloat8, uint8_t) + #if HAVE_ncclfp8_DEVICE + case ncclFloat8e4m3: CASE_TY(rccl_float8, uint8_t) + case ncclFloat8e5m2: CASE_TY(rccl_bfloat8, uint8_t) + #endif + #if HAVE_ncclfp8_HOST + case ncclFloat8e4m3: if (rccl_float8_useFnuz) { CASE_TY(__hip_fp8_e4m3_fnuz, uint8_t);} + else { CASE_TY(__hip_fp8_e4m3, uint8_t);} + case ncclFloat8e5m2: if (rccl_float8_useFnuz) { CASE_TY(__hip_fp8_e5m2_fnuz, uint8_t);} + else { CASE_TY(__hip_fp8_e5m2, uint8_t);} #endif case ncclFloat32: CASE_TY(float, uint32_t) case ncclFloat64: CASE_TY(double, uint64_t) - default: assert(0); + default: assert(0); return cudaErrorInvalidValue; } #undef CASE_TY } @@ -1234,7 +1352,7 @@ __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, in //if(!std::is_same::value) return; //if(rank_n!=3) return; - unsigned tolerance = !IsIntegral::value && std::is_same::value ? calcSumFloatTolerance(rank_n, ty) : 0; + unsigned tolerance = !IsIntegral::value && std::is_same::value ? 2 : 0; uint64_t seed = 0xc8e2bed69766d533; for(int ix=threadIdx.x; ix < 10000; ix+=blockDim.x) { @@ -1271,7 +1389,7 @@ __device__ void sweep1(int ty, char const *tyname) { } } -__global__ void sweep() { +__global__ void __launch_bounds__(512, 1) sweep() { sweep1(ncclInt8, "int8"); sweep1(ncclUint8, "uint8"); sweep1(ncclInt32, "int32"); @@ -1282,19 +1400,16 @@ __global__ void sweep() { #if HAVE_ncclBfloat16 sweep1(ncclBfloat16, "bfloat16"); #endif - #if HAVE_ncclfp8 - sweep1(ncclFp8E4M3, "fp8_e4m3"); - sweep1(ncclFp8E5M2, "fp8_e5m2"); + #if HAVE_ncclfp8 && __HIP_DEVICE_COMPILE__ + sweep1(ncclFloat8e4m3, "fp8_e4m3"); + sweep1(ncclFloat8e5m2, "fp8_e5m2"); #endif sweep1(ncclFloat32, "float"); sweep1(ncclFloat64, "double"); } -int main(int arg_n, char **args) { - std::cerr<<"You are hoping to see no output beyond this line."<>>(); sweep<<<1,512>>>(); - cudaDeviceSynchronize(); - return 0; } #endif diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h index da54778a6f..64b4e22514 100644 --- a/verifiable/verifiable.h +++ b/verifiable/verifiable.h @@ -41,13 +41,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) { } // Enqueue kernel to generate data which is to be reduced. -void ncclVerifiablePrepareInput( +hipError_t ncclVerifiablePrepareInput( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ); // Enqueue kernel to generate expected results of reduction. -void ncclVerifiablePrepareExpected( +hipError_t ncclVerifiablePrepareExpected( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ); @@ -58,7 +58,7 @@ void ncclVerifiablePrepareExpected( // which can be costly. Thus if you plan to run the same reduction multiple // times it is advantageous to precompute the expected values with // ncclVerifiablePrepareExpected and pass them as `expected` here. -void ncclVerifiableVerify( +hipError_t ncclVerifiableVerify( void const *results, void const *expected, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, int64_t *bad_elt_n, cudaStream_t stream