Merge remote-tracking branch 'nccl/master' into develop
This commit is contained in:
+163
-23
@@ -19,6 +19,28 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// minval(a, b, ...): smallest of one or more values of the same type Int.
// Base case: a single value is its own minimum.
template<typename Int>
constexpr static __host__ __device__ Int minval(Int a) { return a; }

// Recursive case: fold the first two arguments into their minimum, then
// recurse on the remaining pack. Host code uses a plain comparison; device
// code uses min().
template<typename Int, typename ...Rest>
constexpr static __host__ __device__ Int minval(Int a, Int b, Rest ...rest) {
#if !__CUDA_ARCH__
  return minval(a < b ? a : b, rest...);
#else
  return minval(min(a, b), rest...);
#endif
}
|
||||
|
||||
// maxval(a, b, ...): largest of one or more values of the same type Int.
// Base case: a single value is its own maximum.
template<typename Int>
constexpr static __host__ __device__ Int maxval(Int a) { return a; }

// Recursive case: fold the first two arguments into their maximum, then
// recurse on the remaining pack. Host code uses a plain comparison; device
// code uses max().
template<typename Int, typename ...Rest>
constexpr static __host__ __device__ Int maxval(Int a, Int b, Rest ...rest) {
#if !__CUDA_ARCH__
  return maxval(a > b ? a : b, rest...);
#else
  return maxval(max(a, b), rest...);
#endif
}
|
||||
|
||||
// Ceiling division: x/y rounded up (for positive operands). Note that both
// arguments are evaluated more than once.
#define DIVUP(x, y) (((x)+(y)-1)/(y))
|
||||
|
||||
@@ -32,32 +54,150 @@
|
||||
size = ((size + (align) - 1) / (align)) * (align);
|
||||
|
||||
// Ceiling division: returns (x+y-1)/y, computed in the type Z that results
// from adding an X to a Y.
template<typename X, typename Y, typename Z = decltype(X()+Y())>
static __host__ __device__ constexpr Z divUp(X x, Y y)
{
  return (x+y-1)/y;
}
|
||||
|
||||
// Rounds x up to the nearest multiple of y: takes x+y-1 and strips its
// remainder modulo y.
template<typename X, typename Y, typename Z = decltype(X()+Y())>
static __host__ __device__ constexpr Z roundUp(X x, Y y)
{
  return (x+y-1) - (x+y-1)%y;
}
|
||||
// Rounds x down to the nearest multiple of y by removing x's remainder
// modulo y.
template<typename X, typename Y, typename Z = decltype(X()+Y())>
static __host__ __device__ constexpr Z roundDown(X x, Y y)
{
  return x - x%y;
}
|
||||
|
||||
// assumes second argument is a power of 2
|
||||
// Rounds x up to a multiple of a by adding a-1 and masking with -a.
// The mask trick is only valid when a is a power of 2.
template<typename X, typename Z = decltype(X()+int())>
static __host__ __device__ constexpr Z alignUp(X x, int a)
{
  return (x + a-1) & Z(-a);
}
|
||||
// assumes second argument is a power of 2
|
||||
// Rounds x down to a multiple of a by masking with -a.
// The mask trick is only valid when a is a power of 2.
template<typename X, typename Z = decltype(X()+int())>
static __host__ __device__ constexpr Z alignDown(X x, int a)
{
  return x & Z(-a);
}
|
||||
|
||||
// True when x has at most one bit set. Note that this also reports true
// for x == 0.
template<typename Int>
constexpr __host__ __device__ bool isPow2(Int x)
{
  return 0 == (x & (x-1));
}
|
||||
|
||||
// Adds delta4G to the second 32-bit word of base (u32[1] of its object
// representation) and leaves the first word untouched.
// NOTE(review): this presumably expects T to be 8 bytes wide on a
// little-endian target, so that word 1 is the upper half - confirm.
template<typename T>
static __host__ __device__ T add4G(T base, int delta4G) {
  union { T value; uint32_t word[2]; } pun;
  pun.value = base;
  pun.word[1] += delta4G;
  return pun.value;
}
|
||||
|
||||
// Advances the second 32-bit word of ptr by delta4G. If the result reaches
// or passes hi4G, it is wrapped back by the window size (hi4G - lo4G).
// The first 32-bit word is preserved.
// NOTE(review): presumably T is 8 bytes wide on a little-endian target - confirm.
template<typename T>
static __host__ __device__ T incWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) {
  union { T value; uint32_t word[2]; } pun;
  pun.value = ptr;
  pun.word[1] += delta4G;
  if (pun.word[1] >= hi4G) {
    pun.word[1] -= hi4G-lo4G;
  }
  return pun.value;
}
|
||||
|
||||
// Moves the second 32-bit word of ptr back by delta4G. If the result falls
// below lo4G, it is wrapped forward by the window size (hi4G - lo4G).
// The first 32-bit word is preserved.
// NOTE(review): presumably T is 8 bytes wide on a little-endian target - confirm.
template<typename T>
static __host__ __device__ T decWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) {
  union { T value; uint32_t word[2]; } pun;
  pun.value = ptr;
  pun.word[1] -= delta4G;
  if (pun.word[1] < lo4G) {
    pun.word[1] += hi4G-lo4G;
  }
  return pun.value;
}
|
||||
|
||||
// Produce the reciprocal of x for use in idivByRcp
|
||||
// 32-bit reciprocal: 2^32 / x truncated to 32 bits. For x == 1 the quotient
// 2^32 does not fit and the result is 0. x must be non-zero.
constexpr __host__ __device__ uint32_t idivRcp32(uint32_t x)
{
  return uint32_t((uint64_t(1) << 32) / x);
}
|
||||
// 64-bit reciprocal: (2^64 - 1) / x, plus one when isPow2(x) holds.
// For x == 1 the sum wraps around to 0. x must be non-zero.
constexpr __host__ __device__ uint64_t idivRcp64(uint64_t x)
{
  return ~uint64_t(0)/x + isPow2(x);
}
|
||||
|
||||
// Upper 32 bits of the 64-bit product a*b. Device code uses the __umulhi
// intrinsic; host code widens to 64 bits and shifts.
static __host__ __device__ uint32_t mul32hi(uint32_t a, uint32_t b) {
#if __CUDA_ARCH__
  return __umulhi(a, b);
#else
  const uint64_t product = uint64_t(a) * b;
  return uint32_t(product >> 32);
#endif
}
|
||||
// Upper 64 bits of the 128-bit product a*b. Device code uses the
// __umul64hi intrinsic; host code widens to unsigned __int128 and shifts.
static __host__ __device__ uint64_t mul64hi(uint64_t a, uint64_t b) {
#if __CUDA_ARCH__
  return __umul64hi(a, b);
#else
  const unsigned __int128 product = ((unsigned __int128)a) * b;
  return (uint64_t)(product >> 64);
#endif
}
|
||||
|
||||
// Produce the reciprocal of x*y given their respective reciprocals. This incurs
|
||||
// no integer division on device.
|
||||
// Combines the reciprocals of x and y into the reciprocal of x*y.
// A zero reciprocal marks a factor that contributes nothing, so the other
// factor's reciprocal is returned unchanged. Otherwise the estimate
// mul32hi(xrcp, yrcp) is bumped by one when the wrapped residual
// -(x*y*rcp) is at least x*y.
static __host__ __device__ uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) {
  if (xrcp == 0 || yrcp == 0) {
    return xrcp == 0 ? yrcp : xrcp;
  }
  const uint32_t xy = x*y;
  uint32_t rcp = mul32hi(xrcp, yrcp);
  // All arithmetic is modulo 2^32, so 0 - xy*rcp equals -x*y*rcp.
  const uint32_t residual = 0u - xy*rcp;
  if (residual >= xy) {
    rcp += 1;
  }
  return rcp;
}
|
||||
// Combines the reciprocals of x and y into the reciprocal of x*y.
// A zero reciprocal marks a factor that contributes nothing, so the other
// factor's reciprocal is returned unchanged. Otherwise the estimate
// mul64hi(xrcp, yrcp) is bumped by one when the wrapped residual
// -(x*y*rcp) is at least x*y.
static __host__ __device__ uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) {
  if (xrcp == 0 || yrcp == 0) {
    return xrcp == 0 ? yrcp : xrcp;
  }
  const uint64_t xy = x*y;
  uint64_t rcp = mul64hi(xrcp, yrcp);
  // All arithmetic is modulo 2^64, so 0 - xy*rcp equals -x*y*rcp.
  const uint64_t residual = uint64_t(0) - xy*rcp;
  if (residual >= xy) {
    rcp += 1;
  }
  return rcp;
}
|
||||
|
||||
// Fast integer division where divisor has precomputed reciprocal.
|
||||
// idivFast32(x, y, idivRcp32(y)) == x/y (and likewise idivFast64 with idivRcp64)
|
||||
// Computes *quo = x / y and *rem = x % y from the precomputed reciprocal
// yrcp of y, without a hardware division.
// A zero reciprocal is treated as "no division needed": the quotient is x
// and the remainder is 0.
static __host__ __device__ void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y, uint32_t yrcp) {
  if (yrcp == 0) {
    *quo = x;
    *rem = 0;
    return;
  }
  uint32_t quotient = mul32hi(x, yrcp);
  uint32_t remainder = x - y*quotient;
  // A single correction step is applied when the remainder is still >= y.
  if (remainder >= y) {
    quotient += 1;
    remainder -= y;
  }
  *quo = quotient;
  *rem = remainder;
}
|
||||
// Computes *quo = x / y and *rem = x % y from the precomputed reciprocal
// yrcp of y, without a hardware division.
// A zero reciprocal is treated as "no division needed": the quotient is x
// and the remainder is 0.
static __host__ __device__ void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y, uint64_t yrcp) {
  if (yrcp == 0) {
    *quo = x;
    *rem = 0;
    return;
  }
  uint64_t quotient = mul64hi(x, yrcp);
  uint64_t remainder = x - y*quotient;
  // A single correction step is applied when the remainder is still >= y.
  if (remainder >= y) {
    quotient += 1;
    remainder -= y;
  }
  *quo = quotient;
  *rem = remainder;
}
|
||||
|
||||
// Quotient-only wrapper around idivmodFast32: returns x / y given the
// precomputed reciprocal yrcp of y. The remainder is discarded.
static __host__ __device__ uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
  uint32_t quotient, unusedRemainder;
  idivmodFast32(&quotient, &unusedRemainder, x, y, yrcp);
  return quotient;
}
|
||||
// Quotient-only wrapper around idivmodFast64: returns x / y given the
// precomputed reciprocal yrcp of y. The remainder is discarded.
// The return type is uint64_t so that quotients of 2^32 or more (for
// example any large x with a small y) are returned in full rather than
// being narrowed to 32 bits.
static __host__ __device__ uint64_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
  uint64_t q, r;
  idivmodFast64(&q, &r, x, y, yrcp);
  return q;
}
|
||||
|
||||
// Remainder-only wrapper around idivmodFast32: returns x % y given the
// precomputed reciprocal yrcp of y. The quotient is discarded.
static __host__ __device__ uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
  uint32_t unusedQuotient, remainder;
  idivmodFast32(&unusedQuotient, &remainder, x, y, yrcp);
  return remainder;
}
|
||||
// Remainder-only wrapper around idivmodFast64: returns x % y given the
// precomputed reciprocal yrcp of y. The quotient is discarded.
// The return type is uint64_t because the remainder can be as large as
// y-1, which does not fit in 32 bits once y exceeds 2^32.
static __host__ __device__ uint64_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
  uint64_t q, r;
  idivmodFast64(&q, &r, x, y, yrcp);
  return r;
}
|
||||
|
||||
template<typename Int>
|
||||
static __host__ __device__ int countOneBits(Int x) {
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(unsigned int)) {
|
||||
return __popc((unsigned int)x);
|
||||
@@ -83,7 +223,7 @@ inline __host__ __device__ int countOneBits(Int x) {
|
||||
|
||||
// Returns index of first one bit or returns -1 if mask is zero.
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int firstOneBit(Int mask) {
|
||||
static __host__ __device__ int firstOneBit(Int mask) {
|
||||
int i;
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(int)) {
|
||||
@@ -108,14 +248,14 @@ inline __host__ __device__ int firstOneBit(Int mask) {
|
||||
}
|
||||
|
||||
// Clears the lowest set bit of *mask and returns firstOneBit() of the
// mask's value before it was modified.
template<typename Int>
static __host__ __device__ int popFirstOneBit(Int* mask) {
  const Int before = *mask;
  // x & (x-1) drops the lowest set bit of x.
  *mask = before & (before-1);
  return firstOneBit(before);
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int log2Down(Int x) {
|
||||
static __host__ __device__ int log2Down(Int x) {
|
||||
int w, n;
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(int)) {
|
||||
@@ -147,7 +287,7 @@ inline __host__ __device__ int log2Down(Int x) {
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int log2Up(Int x) {
|
||||
static __host__ __device__ int log2Up(Int x) {
|
||||
int w, n;
|
||||
if (x != 0) x -= 1;
|
||||
#if __CUDA_ARCH__
|
||||
@@ -180,19 +320,19 @@ inline __host__ __device__ int log2Up(Int x) {
|
||||
}
|
||||
|
||||
// Returns 1 shifted left by log2Up(x).
template<typename Int>
static __host__ __device__ Int pow2Up(Int x) {
  const int shift = log2Up(x);
  return Int(1)<<shift;
}
|
||||
|
||||
// Returns 1 shifted left by log2Down(x).
template<typename Int>
static __host__ __device__ Int pow2Down(Int x) {
  const int shift = log2Down(x);
  // log2Down can return -1, which would make the shift below invalid, but
  // 0 is not normally passed as an argument.
  // coverity[negative_shift]
  return Int(1)<<shift;
}
|
||||
|
||||
template<typename UInt, int nSubBits>
|
||||
inline __host__ __device__ UInt reverseSubBits(UInt x) {
|
||||
static __host__ __device__ UInt reverseSubBits(UInt x) {
|
||||
if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) {
|
||||
switch (8*sizeof(UInt)) {
|
||||
case 16: x = __builtin_bswap16(x); break;
|
||||
@@ -225,7 +365,7 @@ template<> struct ncclToUnsigned<unsigned long long> { using type = unsigned lon
|
||||
|
||||
// Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's.
|
||||
template<typename Int>
|
||||
inline __host__ __device__ Int reverseBits(Int x, int nBits) {
|
||||
static __host__ __device__ Int reverseBits(Int x, int nBits) {
|
||||
using UInt = typename ncclToUnsigned<Int>::type;
|
||||
union { UInt ux; Int sx; };
|
||||
sx = x;
|
||||
@@ -249,7 +389,7 @@ inline __host__ __device__ Int reverseBits(Int x, int nBits) {
|
||||
// has nearly the full range of uint32_t except it only keeps the top 3 bits
|
||||
// beneath the leading 1 bit and thus has a max value of 0xf0000000.
|
||||
|
||||
inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
|
||||
static __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
|
||||
int log2x;
|
||||
#if __CUDA_ARCH__
|
||||
log2x = 31-__clz(x|1);
|
||||
@@ -261,7 +401,7 @@ inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
|
||||
return exponent<<bitsPerPow2 | mantissa;
|
||||
}
|
||||
|
||||
inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
|
||||
static __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
|
||||
uint32_t exponent = x>>bitsPerPow2;
|
||||
uint32_t mantissa = (x & ((1u<<bitsPerPow2)-1)) | (exponent!=0 ? 0x8 : 0);
|
||||
if (exponent != 0) exponent -= 1;
|
||||
@@ -270,16 +410,16 @@ inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
|
||||
|
||||
// Largest value of the 8-bit u32fp encoding (0xf0000000).
constexpr uint32_t u32fp8MaxValue() {
  return 0xf0000000;
}
|
||||
|
||||
// Encodes x via u32fpEncode with bitsPerPow2 = 3; the result is returned
// as a uint8_t.
static __host__ __device__ uint8_t u32fp8Encode(uint32_t x) {
  const uint32_t encoded = u32fpEncode(x, 3);
  return encoded;
}
|
||||
// Decodes an 8-bit value via u32fpDecode with bitsPerPow2 = 3.
static __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
  const uint32_t decoded = u32fpDecode(x, 3);
  return decoded;
}
|
||||
|
||||
// The hash isn't just a function of the bytes but also where the bytes are split
|
||||
// into different calls to eatHash().
|
||||
inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
|
||||
static __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
|
||||
char const* ptr = (char const*)bytes;
|
||||
acc[0] ^= size;
|
||||
while (size != 0) {
|
||||
@@ -302,11 +442,11 @@ inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size
|
||||
}
|
||||
|
||||
// Typed convenience overload: feeds the sizeof(T) bytes of the object that
// `bytes` points to into the hash accumulator `acc`.
template<typename T>
static __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
  const void* raw = (const void*)bytes;
  eatHash(acc, raw, sizeof(T));
}
|
||||
|
||||
inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
|
||||
static __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
|
||||
uint64_t h = acc[0];
|
||||
h ^= h >> 31;
|
||||
h *= 0xbac3bd562846de6b;
|
||||
@@ -316,13 +456,13 @@ inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
|
||||
return h;
|
||||
}
|
||||
|
||||
// One-shot hash of `size` bytes starting at `bytes`: seeds an accumulator
// with {1, 1}, consumes the buffer in a single eatHash() call and returns
// the digest.
static __host__ __device__ uint64_t getHash(const void* bytes, size_t size) {
  uint64_t state[2];
  state[0] = 1;
  state[1] = 1;
  eatHash(state, bytes, size);
  return digestHash(state);
}
|
||||
// Typed convenience overload: hashes the sizeof(T) bytes of the object
// that `bytes` points to.
template<typename T>
static __host__ __device__ uint64_t getHash(const T* bytes) {
  const void* raw = (const void*)bytes;
  return getHash(raw, sizeof(T));
}
|
||||
|
||||
|
||||
Referens i nytt ärende
Block a user