/************************************************************************* * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_BITOPS_H_ #define NCCL_BITOPS_H_ #include #include #if !__NVCC__ #ifndef __host__ #define __host__ #endif #ifndef __device__ #define __device__ #endif #endif template constexpr static __host__ __device__ Int minval(Int a) { return a; } template constexpr static __host__ __device__ Int minval(Int a, Int b, More ...more) { #if __CUDA_ARCH__ return minval(min(a, b), more...); #else return minval(a < b ? a : b, more...); #endif } template constexpr static __host__ __device__ Int maxval(Int a) { return a; } template constexpr static __host__ __device__ Int maxval(Int a, Int b, More ...more) { #if __CUDA_ARCH__ return maxval(max(a, b), more...); #else return maxval(a > b ? a : b, more...); #endif } #define DIVUP(x, y) \ (((x)+(y)-1)/(y)) #define ROUNDUP(x, y) \ (DIVUP((x), (y))*(y)) #define ALIGN_POWER(x, y) \ ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x)))) #define ALIGN_SIZE(size, align) \ size = ((size + (align) - 1) / (align)) * (align); template static __host__ __device__ constexpr Z divUp(X x, Y y) { return (x+y-1)/y; } template static __host__ __device__ constexpr Z roundUp(X x, Y y) { return (x+y-1) - (x+y-1)%y; } template static __host__ __device__ constexpr Z roundDown(X x, Y y) { return x - x%y; } // assumes second argument is a power of 2 template static __host__ __device__ constexpr Z alignUp(X x, int a) { return (x + a-1) & Z(-a); } // assumes second argument is a power of 2 template static __host__ __device__ constexpr Z alignDown(X x, int a) { return x & Z(-a); } template constexpr __host__ __device__ bool isPow2(Int x) { return (x & (x-1)) == 0; } template static __host__ __device__ T add4G(T base, int delta4G) { union { T tmp; uint32_t u32[2]; }; tmp = base; u32[1] += delta4G; return tmp; } template static __host__ __device__ T incWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) { union { T tmp; uint32_t u32[2]; }; tmp = ptr; u32[1] += delta4G; if (u32[1] >= hi4G) u32[1] -= hi4G-lo4G; return tmp; } template static __host__ __device__ T decWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) { union { T tmp; uint32_t u32[2]; }; tmp = ptr; u32[1] -= delta4G; if (u32[1] < lo4G) u32[1] += hi4G-lo4G; return tmp; } // Produce the reciprocal of x for use in idivByRcp constexpr __host__ __device__ uint32_t idivRcp32(uint32_t x) { return uint32_t(uint64_t(0x100000000)/x); } constexpr __host__ __device__ uint64_t idivRcp64(uint64_t x) { return uint64_t(-1)/x + isPow2(x); } static __host__ __device__ uint32_t mul32hi(uint32_t a, uint32_t b) { #if __CUDA_ARCH__ return __umulhi(a, b); #else return uint64_t(a)*b >> 32; #endif } static __host__ __device__ uint64_t mul64hi(uint64_t a, uint64_t b) { #if __CUDA_ARCH__ return __umul64hi(a, b); #else return (uint64_t)(((unsigned __int128)a)*b >> 64); #endif } // Produce the reciprocal of x*y given their respective reciprocals. This incurs // no integer division on device. static __host__ __device__ uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) { if (xrcp == 0) return yrcp; if (yrcp == 0) return xrcp; uint32_t rcp = mul32hi(xrcp, yrcp); uint32_t rem = -x*y*rcp; if (x*y <= rem) rcp += 1; return rcp; } static __host__ __device__ uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) { if (xrcp == 0) return yrcp; if (yrcp == 0) return xrcp; uint64_t rcp = mul64hi(xrcp, yrcp); uint64_t rem = -x*y*rcp; if (x*y <= rem) rcp += 1; return rcp; } // Fast integer division where divisor has precomputed reciprocal. // idivFast(x, y, idivRcp(y)) == x/y static __host__ __device__ void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y, uint32_t yrcp) { uint32_t q = x, r = 0; if (yrcp != 0) { q = mul32hi(x, yrcp); r = x - y*q; if (r >= y) { q += 1; r -= y; } } *quo = q; *rem = r; } static __host__ __device__ void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y, uint64_t yrcp) { uint64_t q = x, r = 0; if (yrcp != 0) { q = mul64hi(x, yrcp); r = x - y*q; if (r >= y) { q += 1; r -= y; } } *quo = q; *rem = r; } static __host__ __device__ uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) { uint32_t q, r; idivmodFast32(&q, &r, x, y, yrcp); return q; } static __host__ __device__ uint32_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) { uint64_t q, r; idivmodFast64(&q, &r, x, y, yrcp); return q; } static __host__ __device__ uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) { uint32_t q, r; idivmodFast32(&q, &r, x, y, yrcp); return r; } static __host__ __device__ uint32_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) { uint64_t q, r; idivmodFast64(&q, &r, x, y, yrcp); return r; } template static __host__ __device__ int countOneBits(Int x) { #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(unsigned int)) { return __popc((unsigned int)x); } else if (sizeof(Int) <= sizeof(unsigned long long)) { return __popcll((unsigned long long)x); } else { static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); return -1; } #else if (sizeof(Int) <= sizeof(unsigned int)) { return __builtin_popcount((unsigned int)x); } else if (sizeof(Int) <= sizeof(unsigned long)) { return __builtin_popcountl((unsigned long)x); } else if (sizeof(Int) <= sizeof(unsigned long long)) { return __builtin_popcountll((unsigned long long)x); } else { static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); return -1; } #endif } // Returns index of first one bit or returns -1 if mask is zero. template static __host__ __device__ int firstOneBit(Int mask) { int i; #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(int)) { i = __ffs((int)mask); } else if (sizeof(Int) <= sizeof(long long)) { i = __ffsll((long long)mask); } else { static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); } #else if (sizeof(Int) <= sizeof(int)) { i = __builtin_ffs((int)mask); } else if (sizeof(Int) <= sizeof(long)) { i = __builtin_ffsl((long)mask); } else if (sizeof(Int) <= sizeof(long long)) { i = __builtin_ffsll((long long)mask); } else { static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); } #endif return i-1; } template static __host__ __device__ int popFirstOneBit(Int* mask) { Int tmp = *mask; *mask &= *mask-1; return firstOneBit(tmp); } template static __host__ __device__ int log2Down(Int x) { int w, n; #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(int)) { w = 8*sizeof(int); n = __clz((int)x); } else if (sizeof(Int) <= sizeof(long long)) { w = 8*sizeof(long long); n = __clzll((long long)x); } else { static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); } #else if (x == 0) { return -1; } else if (sizeof(Int) <= sizeof(unsigned int)) { w = 8*sizeof(unsigned int); n = __builtin_clz((unsigned int)x); } else if (sizeof(Int) <= sizeof(unsigned long)) { w = 8*sizeof(unsigned long); n = __builtin_clzl((unsigned long)x); } else if (sizeof(Int) <= sizeof(unsigned long long)) { w = 8*sizeof(unsigned long long); n = __builtin_clzll((unsigned long long)x); } else { static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); } #endif return (w-1)-n; } template static __host__ __device__ int log2Up(Int x) { int w, n; if (x != 0) x -= 1; #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(int)) { w = 8*sizeof(int); n = __clz((int)x); } else if (sizeof(Int) <= sizeof(long long)) { w = 8*sizeof(long long); n = __clzll((long long)x); } else { static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size."); } #else if (x == 0) { return 0; } else if (sizeof(Int) <= sizeof(unsigned int)) { w = 8*sizeof(unsigned int); n = __builtin_clz((unsigned int)x); } else if (sizeof(Int) <= sizeof(unsigned long)) { w = 8*sizeof(unsigned long); n = __builtin_clzl((unsigned long)x); } else if (sizeof(Int) <= sizeof(unsigned long long)) { w = 8*sizeof(unsigned long long); n = __builtin_clzll((unsigned long long)x); } else { static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size."); } #endif return w-n; } template static __host__ __device__ Int pow2Up(Int x) { return Int(1)< static __host__ __device__ Int pow2Down(Int x) { // True, log2Down can return -1, but we don't normally pass 0 as an argument... // coverity[negative_shift] return Int(1)< static __host__ __device__ UInt reverseSubBits(UInt x) { if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) { switch (8*sizeof(UInt)) { case 16: x = __builtin_bswap16(x); break; case 32: x = __builtin_bswap32(x); break; case 64: x = __builtin_bswap64(x); break; default: static_assert(8*sizeof(UInt) <= 64, "Unsupported integer type."); } return reverseSubBits(x); } else if (nSubBits == 1) { return x; } else { UInt m = UInt(-1)/((UInt(1)<<(nSubBits/2))+1); x = (x & m)<<(nSubBits/2) | (x & ~m)>>(nSubBits/2); return reverseSubBits(x); } } template struct ncclToUnsigned; template<> struct ncclToUnsigned { using type = unsigned char; }; template<> struct ncclToUnsigned { using type = unsigned char; }; template<> struct ncclToUnsigned { using type = unsigned char; }; template<> struct ncclToUnsigned { using type = unsigned short; }; template<> struct ncclToUnsigned { using type = unsigned short; }; template<> struct ncclToUnsigned { using type = unsigned int; }; template<> struct ncclToUnsigned { using type = unsigned int; }; template<> struct ncclToUnsigned { using type = unsigned long; }; template<> struct ncclToUnsigned { using type = unsigned long; }; template<> struct ncclToUnsigned { using type = unsigned long long; }; template<> struct ncclToUnsigned { using type = unsigned long long; }; // Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's. template static __host__ __device__ Int reverseBits(Int x, int nBits) { using UInt = typename ncclToUnsigned::type; union { UInt ux; Int sx; }; sx = x; #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(unsigned int)) { ux = __brev(ux); } else if (sizeof(Int) <= sizeof(unsigned long long)) { ux = __brevll(ux); } else { static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer type."); } #else ux = reverseSubBits(ux); #endif ux = nBits==0 ? 0 : ux>>(8*sizeof(UInt)-nBits); return sx; } //////////////////////////////////////////////////////////////////////////////// // Custom 8 bit floating point format for approximating 32 bit uints. This format // has nearly the full range of uint32_t except it only keeps the top 3 bits // beneath the leading 1 bit and thus has a max value of 0xf0000000. static __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) { int log2x; #if __CUDA_ARCH__ log2x = 31-__clz(x|1); #else log2x = 31-__builtin_clz(x|1); #endif uint32_t mantissa = x>>(log2x >= bitsPerPow2 ? log2x-bitsPerPow2 : 0) & ((1u<= bitsPerPow2 ? log2x-(bitsPerPow2-1) : 0; return exponent<>bitsPerPow2; uint32_t mantissa = (x & ((1u<> 31; acc[0] *= 0x9de62bbc8cef3ce3; acc[1] ^= acc[1] >> 32; acc[1] *= 0x485cd6311b599e79; // Read in a chunk of input. size_t chunkSize = size < sizeof(uint64_t) ? size : sizeof(uint64_t); uint64_t x = 0; memcpy(&x, ptr, chunkSize); ptr += chunkSize; size -= chunkSize; // Add to accumulator. acc[0] += x; } } template static __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) { eatHash(acc, (const void*)bytes, sizeof(T)); } static __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) { uint64_t h = acc[0]; h ^= h >> 31; h *= 0xbac3bd562846de6b; h += acc[1]; h ^= h >> 32; h *= 0x995a187a14e7b445; return h; } static __host__ __device__ uint64_t getHash(const void* bytes, size_t size) { uint64_t acc[2] = {1, 1}; eatHash(acc, bytes, size); return digestHash(acc); } template static __host__ __device__ uint64_t getHash(const T* bytes) { return getHash((const void*)bytes, sizeof(T)); } #endif