static inline in a header, just like excess sugar in a diet, causes bloat (#1692)
Этот коммит содержится в:
коммит произвёл
Maneesh Gupta
родитель
9c91632194
Коммит
be70b9f7e7
@@ -32,23 +32,23 @@ THE SOFTWARE.
|
||||
HIP_PUBLIC_API
|
||||
hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);
|
||||
|
||||
static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
|
||||
inline hipChannelFormatDesc hipCreateChannelDescHalf() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
|
||||
inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
|
||||
inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
|
||||
int e = (int)sizeof(unsigned short) * 8;
|
||||
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline hipChannelFormatDesc hipCreateChannelDesc() {
|
||||
inline hipChannelFormatDesc hipCreateChannelDesc() {
|
||||
return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
|
||||
}
|
||||
|
||||
|
||||
@@ -38,69 +38,69 @@ Integer Intrinsics
|
||||
*/
|
||||
|
||||
// integer intrinsic function __poc __clz __ffs __brev
|
||||
__device__ static inline unsigned int __popc(unsigned int input) {
|
||||
__device__ inline unsigned int __popc(unsigned int input) {
|
||||
return __builtin_popcount(input);
|
||||
}
|
||||
__device__ static inline unsigned int __popcll(unsigned long long int input) {
|
||||
__device__ inline unsigned int __popcll(unsigned long long int input) {
|
||||
return __builtin_popcountll(input);
|
||||
}
|
||||
|
||||
__device__ static inline int __clz(int input) {
|
||||
__device__ inline int __clz(int input) {
|
||||
return __ockl_clz_u32((uint)input);
|
||||
}
|
||||
|
||||
__device__ static inline int __clzll(long long int input) {
|
||||
__device__ inline int __clzll(long long int input) {
|
||||
return __ockl_clz_u64((ullong)input);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __ffs(unsigned int input) {
|
||||
__device__ inline unsigned int __ffs(unsigned int input) {
|
||||
return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __ffsll(unsigned long long int input) {
|
||||
__device__ inline unsigned int __ffsll(unsigned long long int input) {
|
||||
return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __ffs(int input) {
|
||||
__device__ inline unsigned int __ffs(int input) {
|
||||
return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __ffsll(long long int input) {
|
||||
__device__ inline unsigned int __ffsll(long long int input) {
|
||||
return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __brev(unsigned int input) {
|
||||
__device__ inline unsigned int __brev(unsigned int input) {
|
||||
return __llvm_bitrev_b32(input);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
|
||||
__device__ inline unsigned long long int __brevll(unsigned long long int input) {
|
||||
return __llvm_bitrev_b64(input);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
|
||||
__device__ inline unsigned int __lastbit_u32_u64(uint64_t input) {
|
||||
return input == 0 ? -1 : __builtin_ctzl(input);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
|
||||
__device__ inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
|
||||
uint32_t offset = src1 & 31;
|
||||
uint32_t width = src2 & 31;
|
||||
return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
|
||||
}
|
||||
|
||||
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
|
||||
__device__ inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
|
||||
uint64_t offset = src1 & 63;
|
||||
uint64_t width = src2 & 63;
|
||||
return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
|
||||
__device__ inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
|
||||
uint32_t offset = src2 & 31;
|
||||
uint32_t width = src3 & 31;
|
||||
uint32_t mask = (1 << width) - 1;
|
||||
return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
|
||||
}
|
||||
|
||||
__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
|
||||
__device__ inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
|
||||
uint64_t offset = src2 & 63;
|
||||
uint64_t width = src3 & 63;
|
||||
uint64_t mask = (1ULL << width) - 1;
|
||||
@@ -136,7 +136,7 @@ struct uchar2Holder {
|
||||
} __attribute__((aligned(8)));
|
||||
|
||||
__device__
|
||||
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
|
||||
inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
|
||||
struct uchar2Holder cHoldVal;
|
||||
struct ucharHolder cHoldKey;
|
||||
struct ucharHolder cHoldOut;
|
||||
@@ -150,18 +150,18 @@ static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned
|
||||
return cHoldOut.ui;
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __hadd(int x, int y) {
|
||||
__device__ inline unsigned int __hadd(int x, int y) {
|
||||
int z = x + y;
|
||||
int sign = z & 0x8000000;
|
||||
int value = z & 0x7FFFFFFF;
|
||||
return ((value) >> 1 || sign);
|
||||
}
|
||||
|
||||
__device__ static inline int __mul24(int x, int y) {
|
||||
__device__ inline int __mul24(int x, int y) {
|
||||
return __ockl_mul24_i32(x, y);
|
||||
}
|
||||
|
||||
__device__ static inline long long __mul64hi(long long int x, long long int y) {
|
||||
__device__ inline long long __mul64hi(long long int x, long long int y) {
|
||||
ulong x0 = (ulong)x & 0xffffffffUL;
|
||||
long x1 = x >> 32;
|
||||
ulong y0 = (ulong)y & 0xffffffffUL;
|
||||
@@ -174,28 +174,28 @@ __device__ static inline long long __mul64hi(long long int x, long long int y) {
|
||||
return x1*y1 + z2 + (z1 >> 32);
|
||||
}
|
||||
|
||||
__device__ static inline int __mulhi(int x, int y) {
|
||||
__device__ inline int __mulhi(int x, int y) {
|
||||
return __ockl_mul_hi_i32(x, y);
|
||||
}
|
||||
|
||||
__device__ static inline int __rhadd(int x, int y) {
|
||||
__device__ inline int __rhadd(int x, int y) {
|
||||
int z = x + y + 1;
|
||||
int sign = z & 0x8000000;
|
||||
int value = z & 0x7FFFFFFF;
|
||||
return ((value) >> 1 || sign);
|
||||
}
|
||||
__device__ static inline unsigned int __sad(int x, int y, int z) {
|
||||
__device__ inline unsigned int __sad(int x, int y, int z) {
|
||||
return x > y ? x - y + z : y - x + z;
|
||||
}
|
||||
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
|
||||
__device__ inline unsigned int __uhadd(unsigned int x, unsigned int y) {
|
||||
return (x + y) >> 1;
|
||||
}
|
||||
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
|
||||
__device__ inline int __umul24(unsigned int x, unsigned int y) {
|
||||
return __ockl_mul24_u32(x, y);
|
||||
}
|
||||
|
||||
__device__
|
||||
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
|
||||
inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
|
||||
ulong x0 = x & 0xffffffffUL;
|
||||
ulong x1 = x >> 32;
|
||||
ulong y0 = y & 0xffffffffUL;
|
||||
@@ -208,41 +208,41 @@ static inline unsigned long long __umul64hi(unsigned long long int x, unsigned l
|
||||
return x1*y1 + z2 + (z1 >> 32);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
|
||||
__device__ inline unsigned int __umulhi(unsigned int x, unsigned int y) {
|
||||
return __ockl_mul_hi_u32(x, y);
|
||||
}
|
||||
__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
|
||||
__device__ inline unsigned int __urhadd(unsigned int x, unsigned int y) {
|
||||
return (x + y + 1) >> 1;
|
||||
}
|
||||
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
|
||||
__device__ inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
|
||||
return __ockl_sad_u32(x, y, z);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
|
||||
__device__ inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
|
||||
|
||||
/*
|
||||
HIP specific device functions
|
||||
*/
|
||||
|
||||
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
|
||||
__device__ inline unsigned __hip_ds_bpermute(int index, unsigned src) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.u = src;
|
||||
tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
|
||||
return tmp.u;
|
||||
}
|
||||
|
||||
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
|
||||
__device__ inline float __hip_ds_bpermutef(int index, float src) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.f = src;
|
||||
tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
|
||||
__device__ inline unsigned __hip_ds_permute(int index, unsigned src) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.u = src;
|
||||
tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
|
||||
return tmp.u;
|
||||
}
|
||||
|
||||
__device__ static inline float __hip_ds_permutef(int index, float src) {
|
||||
__device__ inline float __hip_ds_permutef(int index, float src) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.u = src;
|
||||
tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
|
||||
return tmp.u;
|
||||
@@ -252,7 +252,7 @@ __device__ static inline float __hip_ds_permutef(int index, float src) {
|
||||
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
|
||||
|
||||
template <int pattern>
|
||||
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
|
||||
__device__ inline unsigned __hip_ds_swizzle_N(unsigned int src) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.u = src;
|
||||
#if defined(__HCC__)
|
||||
tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
|
||||
@@ -263,7 +263,7 @@ __device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
|
||||
}
|
||||
|
||||
template <int pattern>
|
||||
__device__ static inline float __hip_ds_swizzlef_N(float src) {
|
||||
__device__ inline float __hip_ds_swizzlef_N(float src) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.f = src;
|
||||
#if defined(__HCC__)
|
||||
tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
|
||||
@@ -277,7 +277,7 @@ __device__ static inline float __hip_ds_swizzlef_N(float src) {
|
||||
__hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
|
||||
|
||||
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
|
||||
__device__ static inline int __hip_move_dpp_N(int src) {
|
||||
__device__ inline int __hip_move_dpp_N(int src) {
|
||||
return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask,
|
||||
bound_ctrl);
|
||||
}
|
||||
@@ -434,7 +434,7 @@ double __shfl_xor(double var, int lane_mask, int width = warpSize) {
|
||||
#define MASK1 0x00ff00ff
|
||||
#define MASK2 0xff00ff00
|
||||
|
||||
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
|
||||
__device__ inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
|
||||
char4 out;
|
||||
unsigned one1 = in1.w & MASK1;
|
||||
unsigned one2 = in2.w & MASK1;
|
||||
@@ -445,7 +445,7 @@ __device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
|
||||
return out;
|
||||
}
|
||||
|
||||
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
|
||||
__device__ inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
|
||||
char4 out;
|
||||
unsigned one1 = in1.w & MASK1;
|
||||
unsigned one2 = in2.w & MASK1;
|
||||
@@ -456,7 +456,7 @@ __device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
|
||||
return out;
|
||||
}
|
||||
|
||||
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
|
||||
__device__ inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
|
||||
char4 out;
|
||||
unsigned one1 = in1.w & MASK1;
|
||||
unsigned one2 = in2.w & MASK1;
|
||||
@@ -472,12 +472,12 @@ __device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
|
||||
* TODO: Conversion functions are not correct, need to fix when BE is ready
|
||||
*/
|
||||
|
||||
__device__ static inline float __double2float_rd(double x) { return (double)x; }
|
||||
__device__ static inline float __double2float_rn(double x) { return (double)x; }
|
||||
__device__ static inline float __double2float_ru(double x) { return (double)x; }
|
||||
__device__ static inline float __double2float_rz(double x) { return (double)x; }
|
||||
__device__ inline float __double2float_rd(double x) { return (double)x; }
|
||||
__device__ inline float __double2float_rn(double x) { return (double)x; }
|
||||
__device__ inline float __double2float_ru(double x) { return (double)x; }
|
||||
__device__ inline float __double2float_rz(double x) { return (double)x; }
|
||||
|
||||
__device__ static inline int __double2hiint(double x) {
|
||||
__device__ inline int __double2hiint(double x) {
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
|
||||
int tmp[2];
|
||||
@@ -485,7 +485,7 @@ __device__ static inline int __double2hiint(double x) {
|
||||
|
||||
return tmp[1];
|
||||
}
|
||||
__device__ static inline int __double2loint(double x) {
|
||||
__device__ inline int __double2loint(double x) {
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
|
||||
int tmp[2];
|
||||
@@ -494,35 +494,35 @@ __device__ static inline int __double2loint(double x) {
|
||||
return tmp[0];
|
||||
}
|
||||
|
||||
__device__ static inline int __double2int_rd(double x) { return (int)x; }
|
||||
__device__ static inline int __double2int_rn(double x) { return (int)x; }
|
||||
__device__ static inline int __double2int_ru(double x) { return (int)x; }
|
||||
__device__ static inline int __double2int_rz(double x) { return (int)x; }
|
||||
__device__ inline int __double2int_rd(double x) { return (int)x; }
|
||||
__device__ inline int __double2int_rn(double x) { return (int)x; }
|
||||
__device__ inline int __double2int_ru(double x) { return (int)x; }
|
||||
__device__ inline int __double2int_rz(double x) { return (int)x; }
|
||||
|
||||
__device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
|
||||
__device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
|
||||
__device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
|
||||
__device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }
|
||||
__device__ inline long long int __double2ll_rd(double x) { return (long long int)x; }
|
||||
__device__ inline long long int __double2ll_rn(double x) { return (long long int)x; }
|
||||
__device__ inline long long int __double2ll_ru(double x) { return (long long int)x; }
|
||||
__device__ inline long long int __double2ll_rz(double x) { return (long long int)x; }
|
||||
|
||||
__device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
|
||||
__device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
|
||||
__device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
|
||||
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
|
||||
__device__ inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
|
||||
__device__ inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
|
||||
__device__ inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
|
||||
__device__ inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
|
||||
|
||||
__device__ static inline unsigned long long int __double2ull_rd(double x) {
|
||||
__device__ inline unsigned long long int __double2ull_rd(double x) {
|
||||
return (unsigned long long int)x;
|
||||
}
|
||||
__device__ static inline unsigned long long int __double2ull_rn(double x) {
|
||||
__device__ inline unsigned long long int __double2ull_rn(double x) {
|
||||
return (unsigned long long int)x;
|
||||
}
|
||||
__device__ static inline unsigned long long int __double2ull_ru(double x) {
|
||||
__device__ inline unsigned long long int __double2ull_ru(double x) {
|
||||
return (unsigned long long int)x;
|
||||
}
|
||||
__device__ static inline unsigned long long int __double2ull_rz(double x) {
|
||||
__device__ inline unsigned long long int __double2ull_rz(double x) {
|
||||
return (unsigned long long int)x;
|
||||
}
|
||||
|
||||
__device__ static inline long long int __double_as_longlong(double x) {
|
||||
__device__ inline long long int __double_as_longlong(double x) {
|
||||
static_assert(sizeof(long long) == sizeof(double), "");
|
||||
|
||||
long long tmp;
|
||||
@@ -545,35 +545,35 @@ CUDA implements half as unsigned short whereas, HIP doesn't.
|
||||
|
||||
*/
|
||||
|
||||
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
|
||||
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
|
||||
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
|
||||
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
|
||||
__device__ inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
|
||||
__device__ inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
|
||||
__device__ inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
|
||||
__device__ inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
|
||||
|
||||
__device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
|
||||
__device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
|
||||
__device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
|
||||
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
|
||||
__device__ inline long long int __float2ll_rd(float x) { return (long long int)x; }
|
||||
__device__ inline long long int __float2ll_rn(float x) { return (long long int)x; }
|
||||
__device__ inline long long int __float2ll_ru(float x) { return (long long int)x; }
|
||||
__device__ inline long long int __float2ll_rz(float x) { return (long long int)x; }
|
||||
|
||||
__device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
|
||||
__device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
|
||||
__device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
|
||||
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
|
||||
__device__ inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
|
||||
__device__ inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
|
||||
__device__ inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
|
||||
__device__ inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
|
||||
|
||||
__device__ static inline unsigned long long int __float2ull_rd(float x) {
|
||||
__device__ inline unsigned long long int __float2ull_rd(float x) {
|
||||
return (unsigned long long int)x;
|
||||
}
|
||||
__device__ static inline unsigned long long int __float2ull_rn(float x) {
|
||||
__device__ inline unsigned long long int __float2ull_rn(float x) {
|
||||
return (unsigned long long int)x;
|
||||
}
|
||||
__device__ static inline unsigned long long int __float2ull_ru(float x) {
|
||||
__device__ inline unsigned long long int __float2ull_ru(float x) {
|
||||
return (unsigned long long int)x;
|
||||
}
|
||||
__device__ static inline unsigned long long int __float2ull_rz(float x) {
|
||||
__device__ inline unsigned long long int __float2ull_rz(float x) {
|
||||
return (unsigned long long int)x;
|
||||
}
|
||||
|
||||
__device__ static inline int __float_as_int(float x) {
|
||||
__device__ inline int __float_as_int(float x) {
|
||||
static_assert(sizeof(int) == sizeof(float), "");
|
||||
|
||||
int tmp;
|
||||
@@ -582,7 +582,7 @@ __device__ static inline int __float_as_int(float x) {
|
||||
return tmp;
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __float_as_uint(float x) {
|
||||
__device__ inline unsigned int __float_as_uint(float x) {
|
||||
static_assert(sizeof(unsigned int) == sizeof(float), "");
|
||||
|
||||
unsigned int tmp;
|
||||
@@ -591,7 +591,7 @@ __device__ static inline unsigned int __float_as_uint(float x) {
|
||||
return tmp;
|
||||
}
|
||||
|
||||
__device__ static inline double __hiloint2double(int hi, int lo) {
|
||||
__device__ inline double __hiloint2double(int hi, int lo) {
|
||||
static_assert(sizeof(double) == sizeof(uint64_t), "");
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
|
||||
@@ -601,14 +601,14 @@ __device__ static inline double __hiloint2double(int hi, int lo) {
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__ static inline double __int2double_rn(int x) { return (double)x; }
|
||||
__device__ inline double __int2double_rn(int x) { return (double)x; }
|
||||
|
||||
__device__ static inline float __int2float_rd(int x) { return (float)x; }
|
||||
__device__ static inline float __int2float_rn(int x) { return (float)x; }
|
||||
__device__ static inline float __int2float_ru(int x) { return (float)x; }
|
||||
__device__ static inline float __int2float_rz(int x) { return (float)x; }
|
||||
__device__ inline float __int2float_rd(int x) { return (float)x; }
|
||||
__device__ inline float __int2float_rn(int x) { return (float)x; }
|
||||
__device__ inline float __int2float_ru(int x) { return (float)x; }
|
||||
__device__ inline float __int2float_rz(int x) { return (float)x; }
|
||||
|
||||
__device__ static inline float __int_as_float(int x) {
|
||||
__device__ inline float __int_as_float(int x) {
|
||||
static_assert(sizeof(float) == sizeof(int), "");
|
||||
|
||||
float tmp;
|
||||
@@ -617,17 +617,17 @@ __device__ static inline float __int_as_float(int x) {
|
||||
return tmp;
|
||||
}
|
||||
|
||||
__device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
|
||||
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
|
||||
__device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
|
||||
__device__ static inline double __ll2double_rz(long long int x) { return (double)x; }
|
||||
__device__ inline double __ll2double_rd(long long int x) { return (double)x; }
|
||||
__device__ inline double __ll2double_rn(long long int x) { return (double)x; }
|
||||
__device__ inline double __ll2double_ru(long long int x) { return (double)x; }
|
||||
__device__ inline double __ll2double_rz(long long int x) { return (double)x; }
|
||||
|
||||
__device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
|
||||
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
|
||||
__device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
|
||||
__device__ static inline float __ll2float_rz(long long int x) { return (float)x; }
|
||||
__device__ inline float __ll2float_rd(long long int x) { return (float)x; }
|
||||
__device__ inline float __ll2float_rn(long long int x) { return (float)x; }
|
||||
__device__ inline float __ll2float_ru(long long int x) { return (float)x; }
|
||||
__device__ inline float __ll2float_rz(long long int x) { return (float)x; }
|
||||
|
||||
__device__ static inline double __longlong_as_double(long long int x) {
|
||||
__device__ inline double __longlong_as_double(long long int x) {
|
||||
static_assert(sizeof(double) == sizeof(long long), "");
|
||||
|
||||
double tmp;
|
||||
@@ -636,14 +636,14 @@ __device__ static inline double __longlong_as_double(long long int x) {
|
||||
return tmp;
|
||||
}
|
||||
|
||||
__device__ static inline double __uint2double_rn(int x) { return (double)x; }
|
||||
__device__ inline double __uint2double_rn(int x) { return (double)x; }
|
||||
|
||||
__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
|
||||
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
|
||||
__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
|
||||
__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }
|
||||
__device__ inline float __uint2float_rd(unsigned int x) { return (float)x; }
|
||||
__device__ inline float __uint2float_rn(unsigned int x) { return (float)x; }
|
||||
__device__ inline float __uint2float_ru(unsigned int x) { return (float)x; }
|
||||
__device__ inline float __uint2float_rz(unsigned int x) { return (float)x; }
|
||||
|
||||
__device__ static inline float __uint_as_float(unsigned int x) {
|
||||
__device__ inline float __uint_as_float(unsigned int x) {
|
||||
static_assert(sizeof(float) == sizeof(unsigned int), "");
|
||||
|
||||
float tmp;
|
||||
@@ -652,15 +652,15 @@ __device__ static inline float __uint_as_float(unsigned int x) {
|
||||
return tmp;
|
||||
}
|
||||
|
||||
__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
|
||||
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
|
||||
__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
|
||||
__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }
|
||||
__device__ inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
|
||||
__device__ inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
|
||||
__device__ inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
|
||||
__device__ inline double __ull2double_rz(unsigned long long int x) { return (double)x; }
|
||||
|
||||
__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
|
||||
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
|
||||
__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
|
||||
__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
|
||||
__device__ inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
|
||||
__device__ inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
|
||||
__device__ inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
|
||||
__device__ inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
|
||||
|
||||
#if defined(__HCC__)
|
||||
#define __HCC_OR_HIP_CLANG__ 1
|
||||
@@ -819,7 +819,7 @@ typedef enum __memory_order
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void
|
||||
void
|
||||
__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
|
||||
{
|
||||
// We're tying global-happens-before and local-happens-before together as does HSA
|
||||
@@ -871,21 +871,21 @@ __atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __mem
|
||||
// Memory Fence Functions
|
||||
__device__
|
||||
inline
|
||||
static void __threadfence()
|
||||
void __threadfence()
|
||||
{
|
||||
__atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void __threadfence_block()
|
||||
void __threadfence_block()
|
||||
{
|
||||
__atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void __threadfence_system()
|
||||
void __threadfence_system()
|
||||
{
|
||||
__atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
|
||||
}
|
||||
@@ -945,7 +945,7 @@ void __assertfail(const char * __assertion,
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
|
||||
void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
|
||||
{
|
||||
if (flags) {
|
||||
__atomic_work_item_fence(flags, __memory_order_release, scope);
|
||||
@@ -958,7 +958,7 @@ static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scop
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void __barrier(int n)
|
||||
void __barrier(int n)
|
||||
{
|
||||
__work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
|
||||
}
|
||||
@@ -1037,7 +1037,7 @@ unsigned __smid(void)
|
||||
|
||||
|
||||
// loop unrolling
|
||||
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
|
||||
inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
|
||||
auto dstPtr = static_cast<unsigned char*>(dst);
|
||||
auto srcPtr = static_cast<const unsigned char*>(src);
|
||||
|
||||
@@ -1063,7 +1063,7 @@ static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_
|
||||
return dst;
|
||||
}
|
||||
|
||||
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
|
||||
inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
|
||||
auto dstPtr = static_cast<unsigned char*>(dst);
|
||||
|
||||
while (size >= 4u) {
|
||||
@@ -1086,11 +1086,11 @@ static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, siz
|
||||
|
||||
return dst;
|
||||
}
|
||||
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
|
||||
inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
|
||||
return __hip_hc_memcpy(dst, src, size);
|
||||
}
|
||||
|
||||
static inline __device__ void* memset(void* ptr, int val, size_t size) {
|
||||
inline __device__ void* memset(void* ptr, int val, size_t size) {
|
||||
unsigned char val8 = static_cast<unsigned char>(val);
|
||||
return __hip_hc_memset(ptr, val8, size);
|
||||
}
|
||||
|
||||
@@ -287,8 +287,8 @@ typedef struct hipMemcpy3DParms {
|
||||
size_t srcZ;
|
||||
}hipMemcpy3DParms;
|
||||
|
||||
static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
|
||||
size_t ysz) {
|
||||
inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
|
||||
size_t ysz) {
|
||||
struct hipPitchedPtr s;
|
||||
|
||||
s.ptr = d;
|
||||
@@ -299,7 +299,7 @@ static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t
|
||||
return s;
|
||||
}
|
||||
|
||||
static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
|
||||
inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
|
||||
struct hipPos p;
|
||||
|
||||
p.x = x;
|
||||
@@ -309,7 +309,7 @@ static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) {
|
||||
inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) {
|
||||
struct hipExtent e;
|
||||
|
||||
e.width = w;
|
||||
|
||||
@@ -36,7 +36,7 @@ THE SOFTWARE.
|
||||
|
||||
#if __cplusplus
|
||||
#define COMPLEX_NEG_OP_OVERLOAD(type) \
|
||||
__device__ __host__ static inline type operator-(const type& op) { \
|
||||
__device__ __host__ inline type operator-(const type& op) { \
|
||||
type ret; \
|
||||
ret.x = -op.x; \
|
||||
ret.y = -op.y; \
|
||||
@@ -44,17 +44,17 @@ THE SOFTWARE.
|
||||
}
|
||||
|
||||
#define COMPLEX_EQ_OP_OVERLOAD(type) \
|
||||
__device__ __host__ static inline bool operator==(const type& lhs, const type& rhs) { \
|
||||
__device__ __host__ inline bool operator==(const type& lhs, const type& rhs) { \
|
||||
return lhs.x == rhs.x && lhs.y == rhs.y; \
|
||||
}
|
||||
|
||||
#define COMPLEX_NE_OP_OVERLOAD(type) \
|
||||
__device__ __host__ static inline bool operator!=(const type& lhs, const type& rhs) { \
|
||||
__device__ __host__ inline bool operator!=(const type& lhs, const type& rhs) { \
|
||||
return !(lhs == rhs); \
|
||||
}
|
||||
|
||||
#define COMPLEX_ADD_OP_OVERLOAD(type) \
|
||||
__device__ __host__ static inline type operator+(const type& lhs, const type& rhs) { \
|
||||
__device__ __host__ inline type operator+(const type& lhs, const type& rhs) { \
|
||||
type ret; \
|
||||
ret.x = lhs.x + rhs.x; \
|
||||
ret.y = lhs.y + rhs.y; \
|
||||
@@ -62,7 +62,7 @@ THE SOFTWARE.
|
||||
}
|
||||
|
||||
#define COMPLEX_SUB_OP_OVERLOAD(type) \
|
||||
__device__ __host__ static inline type operator-(const type& lhs, const type& rhs) { \
|
||||
__device__ __host__ inline type operator-(const type& lhs, const type& rhs) { \
|
||||
type ret; \
|
||||
ret.x = lhs.x - rhs.x; \
|
||||
ret.y = lhs.y - rhs.y; \
|
||||
@@ -70,7 +70,7 @@ THE SOFTWARE.
|
||||
}
|
||||
|
||||
#define COMPLEX_MUL_OP_OVERLOAD(type) \
|
||||
__device__ __host__ static inline type operator*(const type& lhs, const type& rhs) { \
|
||||
__device__ __host__ inline type operator*(const type& lhs, const type& rhs) { \
|
||||
type ret; \
|
||||
ret.x = lhs.x * rhs.x - lhs.y * rhs.y; \
|
||||
ret.y = lhs.x * rhs.y + lhs.y * rhs.x; \
|
||||
@@ -78,7 +78,7 @@ THE SOFTWARE.
|
||||
}
|
||||
|
||||
#define COMPLEX_DIV_OP_OVERLOAD(type) \
|
||||
__device__ __host__ static inline type operator/(const type& lhs, const type& rhs) { \
|
||||
__device__ __host__ inline type operator/(const type& lhs, const type& rhs) { \
|
||||
type ret; \
|
||||
ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); \
|
||||
ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); \
|
||||
@@ -88,33 +88,33 @@ THE SOFTWARE.
|
||||
}
|
||||
|
||||
#define COMPLEX_ADD_PREOP_OVERLOAD(type) \
|
||||
__device__ __host__ static inline type& operator+=(type& lhs, const type& rhs) { \
|
||||
__device__ __host__ inline type& operator+=(type& lhs, const type& rhs) { \
|
||||
lhs.x += rhs.x; \
|
||||
lhs.y += rhs.y; \
|
||||
return lhs; \
|
||||
}
|
||||
|
||||
#define COMPLEX_SUB_PREOP_OVERLOAD(type) \
|
||||
__device__ __host__ static inline type& operator-=(type& lhs, const type& rhs) { \
|
||||
__device__ __host__ inline type& operator-=(type& lhs, const type& rhs) { \
|
||||
lhs.x -= rhs.x; \
|
||||
lhs.y -= rhs.y; \
|
||||
return lhs; \
|
||||
}
|
||||
|
||||
#define COMPLEX_MUL_PREOP_OVERLOAD(type) \
|
||||
__device__ __host__ static inline type& operator*=(type& lhs, const type& rhs) { \
|
||||
__device__ __host__ inline type& operator*=(type& lhs, const type& rhs) { \
|
||||
lhs = lhs * rhs; \
|
||||
return lhs; \
|
||||
}
|
||||
|
||||
#define COMPLEX_DIV_PREOP_OVERLOAD(type) \
|
||||
__device__ __host__ static inline type& operator/=(type& lhs, const type& rhs) { \
|
||||
__device__ __host__ inline type& operator/=(type& lhs, const type& rhs) { \
|
||||
lhs = lhs / rhs; \
|
||||
return lhs; \
|
||||
}
|
||||
|
||||
#define COMPLEX_SCALAR_PRODUCT(type, type1) \
|
||||
__device__ __host__ static inline type operator*(const type& lhs, type1 rhs) { \
|
||||
__device__ __host__ inline type operator*(const type& lhs, type1 rhs) { \
|
||||
type ret; \
|
||||
ret.x = lhs.x * rhs; \
|
||||
ret.y = lhs.y * rhs; \
|
||||
@@ -125,41 +125,41 @@ THE SOFTWARE.
|
||||
|
||||
typedef float2 hipFloatComplex;
|
||||
|
||||
__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
|
||||
__device__ __host__ inline float hipCrealf(hipFloatComplex z) { return z.x; }
|
||||
|
||||
__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
|
||||
__device__ __host__ inline float hipCimagf(hipFloatComplex z) { return z.y; }
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
|
||||
__device__ __host__ inline hipFloatComplex make_hipFloatComplex(float a, float b) {
|
||||
hipFloatComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
|
||||
__device__ __host__ inline hipFloatComplex hipConjf(hipFloatComplex z) {
|
||||
hipFloatComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = -z.y;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
|
||||
__device__ __host__ inline float hipCsqabsf(hipFloatComplex z) {
|
||||
return z.x * z.x + z.y * z.y;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
|
||||
__device__ __host__ inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
|
||||
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
|
||||
__device__ __host__ inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
|
||||
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
|
||||
__device__ __host__ inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
|
||||
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
|
||||
__device__ __host__ inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
|
||||
float sqabs = hipCsqabsf(q);
|
||||
hipFloatComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
||||
@@ -167,46 +167,46 @@ __device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hi
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
|
||||
__device__ __host__ inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
|
||||
|
||||
|
||||
typedef double2 hipDoubleComplex;
|
||||
|
||||
__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
|
||||
__device__ __host__ inline double hipCreal(hipDoubleComplex z) { return z.x; }
|
||||
|
||||
__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
|
||||
__device__ __host__ inline double hipCimag(hipDoubleComplex z) { return z.y; }
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
|
||||
__device__ __host__ inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
|
||||
hipDoubleComplex z;
|
||||
z.x = a;
|
||||
z.y = b;
|
||||
return z;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
|
||||
__device__ __host__ inline hipDoubleComplex hipConj(hipDoubleComplex z) {
|
||||
hipDoubleComplex ret;
|
||||
ret.x = z.x;
|
||||
ret.y = -z.y;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
|
||||
__device__ __host__ inline double hipCsqabs(hipDoubleComplex z) {
|
||||
return z.x * z.x + z.y * z.y;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
__device__ __host__ inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
__device__ __host__ inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
__device__ __host__ inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
__device__ __host__ inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
|
||||
double sqabs = hipCsqabs(q);
|
||||
hipDoubleComplex ret;
|
||||
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
||||
@@ -214,7 +214,7 @@ __device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, h
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); }
|
||||
__device__ __host__ inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); }
|
||||
|
||||
|
||||
#if __cplusplus
|
||||
@@ -268,19 +268,19 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
|
||||
|
||||
typedef hipFloatComplex hipComplex;
|
||||
|
||||
__device__ __host__ static inline hipComplex make_hipComplex(float x, float y) {
|
||||
__device__ __host__ inline hipComplex make_hipComplex(float x, float y) {
|
||||
return make_hipFloatComplex(x, y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
|
||||
__device__ __host__ inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
|
||||
return make_hipFloatComplex((float)z.x, (float)z.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
|
||||
__device__ __host__ inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
|
||||
return make_hipDoubleComplex((double)z.x, (double)z.y);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
|
||||
__device__ __host__ inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
|
||||
float real = (p.x * q.x) + r.x;
|
||||
float imag = (q.x * p.y) + r.y;
|
||||
|
||||
@@ -290,7 +290,7 @@ __device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q
|
||||
return make_hipComplex(real, imag);
|
||||
}
|
||||
|
||||
__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
|
||||
__device__ __host__ inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
|
||||
hipDoubleComplex r) {
|
||||
double real = (p.x * q.x) + r.x;
|
||||
double imag = (q.x * p.y) + r.y;
|
||||
|
||||
+1081
-1085
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -95,160 +95,157 @@ struct __half2_raw {
|
||||
};
|
||||
// END STRUCT __HALF2
|
||||
|
||||
namespace
|
||||
inline
|
||||
unsigned short __internal_float2half(
|
||||
float flt, unsigned int& sgn, unsigned int& rem)
|
||||
{
|
||||
inline
|
||||
unsigned short __internal_float2half(
|
||||
float flt, unsigned int& sgn, unsigned int& rem)
|
||||
{
|
||||
unsigned int x{};
|
||||
std::memcpy(&x, &flt, sizeof(flt));
|
||||
unsigned int x{};
|
||||
std::memcpy(&x, &flt, sizeof(flt));
|
||||
|
||||
unsigned int u = (x & 0x7fffffffU);
|
||||
sgn = ((x >> 16) & 0x8000U);
|
||||
unsigned int u = (x & 0x7fffffffU);
|
||||
sgn = ((x >> 16) & 0x8000U);
|
||||
|
||||
// NaN/+Inf/-Inf
|
||||
if (u >= 0x7f800000U) {
|
||||
rem = 0;
|
||||
return static_cast<unsigned short>(
|
||||
(u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU);
|
||||
// NaN/+Inf/-Inf
|
||||
if (u >= 0x7f800000U) {
|
||||
rem = 0;
|
||||
return static_cast<unsigned short>(
|
||||
(u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU);
|
||||
}
|
||||
// Overflows
|
||||
if (u > 0x477fefffU) {
|
||||
rem = 0x80000000U;
|
||||
return static_cast<unsigned short>(sgn | 0x7bffU);
|
||||
}
|
||||
// Normal numbers
|
||||
if (u >= 0x38800000U) {
|
||||
rem = u << 19;
|
||||
u -= 0x38000000U;
|
||||
return static_cast<unsigned short>(sgn | (u >> 13));
|
||||
}
|
||||
// +0/-0
|
||||
if (u < 0x33000001U) {
|
||||
rem = u;
|
||||
return static_cast<unsigned short>(sgn);
|
||||
}
|
||||
// Denormal numbers
|
||||
unsigned int exponent = u >> 23;
|
||||
unsigned int mantissa = (u & 0x7fffffU);
|
||||
unsigned int shift = 0x7eU - exponent;
|
||||
mantissa |= 0x800000U;
|
||||
rem = mantissa << (32 - shift);
|
||||
return static_cast<unsigned short>(sgn | (mantissa >> shift));
|
||||
}
|
||||
|
||||
inline
|
||||
__half __float2half(float x)
|
||||
{
|
||||
__half_raw r;
|
||||
unsigned int sgn{};
|
||||
unsigned int rem{};
|
||||
r.x = __internal_float2half(x, sgn, rem);
|
||||
if (rem > 0x80000000U || (rem == 0x80000000U && (r.x & 0x1))) ++r.x;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__half __float2half_rn(float x) { return __float2half(x); }
|
||||
|
||||
inline
|
||||
__half __float2half_rz(float x)
|
||||
{
|
||||
__half_raw r;
|
||||
unsigned int sgn{};
|
||||
unsigned int rem{};
|
||||
r.x = __internal_float2half(x, sgn, rem);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__half __float2half_rd(float x)
|
||||
{
|
||||
__half_raw r;
|
||||
unsigned int sgn{};
|
||||
unsigned int rem{};
|
||||
r.x = __internal_float2half(x, sgn, rem);
|
||||
if (rem && sgn) ++r.x;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__half __float2half_ru(float x)
|
||||
{
|
||||
__half_raw r;
|
||||
unsigned int sgn{};
|
||||
unsigned int rem{};
|
||||
r.x = __internal_float2half(x, sgn, rem);
|
||||
if (rem && !sgn) ++r.x;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__half2 __float2half2_rn(float x)
|
||||
{
|
||||
return __half2{__float2half_rn(x), __float2half_rn(x)};
|
||||
}
|
||||
|
||||
inline
|
||||
__half2 __floats2half2_rn(float x, float y)
|
||||
{
|
||||
return __half2{__float2half_rn(x), __float2half_rn(y)};
|
||||
}
|
||||
|
||||
inline
|
||||
float __internal_half2float(unsigned short x)
|
||||
{
|
||||
unsigned int sign = ((x >> 15) & 1);
|
||||
unsigned int exponent = ((x >> 10) & 0x1f);
|
||||
unsigned int mantissa = ((x & 0x3ff) << 13);
|
||||
|
||||
if (exponent == 0x1fU) { /* NaN or Inf */
|
||||
mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0);
|
||||
exponent = 0xffU;
|
||||
} else if (!exponent) { /* Denorm or Zero */
|
||||
if (mantissa) {
|
||||
unsigned int msb;
|
||||
exponent = 0x71U;
|
||||
do {
|
||||
msb = (mantissa & 0x400000U);
|
||||
mantissa <<= 1; /* normalize */
|
||||
--exponent;
|
||||
} while (!msb);
|
||||
mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
|
||||
}
|
||||
// Overflows
|
||||
if (u > 0x477fefffU) {
|
||||
rem = 0x80000000U;
|
||||
return static_cast<unsigned short>(sgn | 0x7bffU);
|
||||
}
|
||||
// Normal numbers
|
||||
if (u >= 0x38800000U) {
|
||||
rem = u << 19;
|
||||
u -= 0x38000000U;
|
||||
return static_cast<unsigned short>(sgn | (u >> 13));
|
||||
}
|
||||
// +0/-0
|
||||
if (u < 0x33000001U) {
|
||||
rem = u;
|
||||
return static_cast<unsigned short>(sgn);
|
||||
}
|
||||
// Denormal numbers
|
||||
unsigned int exponent = u >> 23;
|
||||
unsigned int mantissa = (u & 0x7fffffU);
|
||||
unsigned int shift = 0x7eU - exponent;
|
||||
mantissa |= 0x800000U;
|
||||
rem = mantissa << (32 - shift);
|
||||
return static_cast<unsigned short>(sgn | (mantissa >> shift));
|
||||
} else {
|
||||
exponent += 0x70U;
|
||||
}
|
||||
unsigned int u = ((sign << 31) | (exponent << 23) | mantissa);
|
||||
float f;
|
||||
std::memcpy(&f, &u, sizeof(u));
|
||||
|
||||
inline
|
||||
__half __float2half(float x)
|
||||
{
|
||||
__half_raw r;
|
||||
unsigned int sgn{};
|
||||
unsigned int rem{};
|
||||
r.x = __internal_float2half(x, sgn, rem);
|
||||
if (rem > 0x80000000U || (rem == 0x80000000U && (r.x & 0x1))) ++r.x;
|
||||
return f;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
inline
|
||||
float __half2float(__half x)
|
||||
{
|
||||
return __internal_half2float(static_cast<__half_raw>(x).x);
|
||||
}
|
||||
|
||||
inline
|
||||
__half __float2half_rn(float x) { return __float2half(x); }
|
||||
inline
|
||||
float __low2float(__half2 x)
|
||||
{
|
||||
return __internal_half2float(static_cast<__half2_raw>(x).x);
|
||||
}
|
||||
|
||||
inline
|
||||
__half __float2half_rz(float x)
|
||||
{
|
||||
__half_raw r;
|
||||
unsigned int sgn{};
|
||||
unsigned int rem{};
|
||||
r.x = __internal_float2half(x, sgn, rem);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__half __float2half_rd(float x)
|
||||
{
|
||||
__half_raw r;
|
||||
unsigned int sgn{};
|
||||
unsigned int rem{};
|
||||
r.x = __internal_float2half(x, sgn, rem);
|
||||
if (rem && sgn) ++r.x;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__half __float2half_ru(float x)
|
||||
{
|
||||
__half_raw r;
|
||||
unsigned int sgn{};
|
||||
unsigned int rem{};
|
||||
r.x = __internal_float2half(x, sgn, rem);
|
||||
if (rem && !sgn) ++r.x;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
inline
|
||||
__half2 __float2half2_rn(float x)
|
||||
{
|
||||
return __half2{__float2half_rn(x), __float2half_rn(x)};
|
||||
}
|
||||
|
||||
inline
|
||||
__half2 __floats2half2_rn(float x, float y)
|
||||
{
|
||||
return __half2{__float2half_rn(x), __float2half_rn(y)};
|
||||
}
|
||||
|
||||
inline
|
||||
float __internal_half2float(unsigned short x)
|
||||
{
|
||||
unsigned int sign = ((x >> 15) & 1);
|
||||
unsigned int exponent = ((x >> 10) & 0x1f);
|
||||
unsigned int mantissa = ((x & 0x3ff) << 13);
|
||||
|
||||
if (exponent == 0x1fU) { /* NaN or Inf */
|
||||
mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0);
|
||||
exponent = 0xffU;
|
||||
} else if (!exponent) { /* Denorm or Zero */
|
||||
if (mantissa) {
|
||||
unsigned int msb;
|
||||
exponent = 0x71U;
|
||||
do {
|
||||
msb = (mantissa & 0x400000U);
|
||||
mantissa <<= 1; /* normalize */
|
||||
--exponent;
|
||||
} while (!msb);
|
||||
mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
|
||||
}
|
||||
} else {
|
||||
exponent += 0x70U;
|
||||
}
|
||||
unsigned int u = ((sign << 31) | (exponent << 23) | mantissa);
|
||||
float f;
|
||||
memcpy(&f, &u, sizeof(u));
|
||||
|
||||
return f;
|
||||
}
|
||||
|
||||
inline
|
||||
float __half2float(__half x)
|
||||
{
|
||||
return __internal_half2float(static_cast<__half_raw>(x).x);
|
||||
}
|
||||
|
||||
inline
|
||||
float __low2float(__half2 x)
|
||||
{
|
||||
return __internal_half2float(static_cast<__half2_raw>(x).x);
|
||||
}
|
||||
|
||||
inline
|
||||
float __high2float(__half2 x)
|
||||
{
|
||||
return __internal_half2float(static_cast<__half2_raw>(x).y);
|
||||
}
|
||||
} // Anonymous namespace.
|
||||
inline
|
||||
float __high2float(__half2 x)
|
||||
{
|
||||
return __internal_half2float(static_cast<__half2_raw>(x).y);
|
||||
}
|
||||
|
||||
#if !defined(HIP_NO_HALF)
|
||||
using half = __half;
|
||||
|
||||
@@ -308,17 +308,17 @@ static constexpr Coordinates<hip_impl::WorkitemId> threadIdx{};
|
||||
extern "C" __device__ void* __hip_malloc(size_t);
|
||||
extern "C" __device__ void* __hip_free(void* ptr);
|
||||
|
||||
static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
|
||||
static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
|
||||
inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
|
||||
inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
|
||||
|
||||
#if defined(__HCC_ACCELERATOR__) && defined(HC_FEATURE_PRINTF)
|
||||
template <typename... All>
|
||||
static inline __device__ void printf(const char* format, All... all) {
|
||||
inline __device__ void printf(const char* format, All... all) {
|
||||
hc::printf(format, all...);
|
||||
}
|
||||
#elif defined(__HCC_ACCELERATOR__) || __HIP__
|
||||
template <typename... All>
|
||||
static inline __device__ void printf(const char* format, All... all) {}
|
||||
inline __device__ void printf(const char* format, All... all) {}
|
||||
#endif
|
||||
|
||||
#endif //__HCC_OR_HIP_CLANG__
|
||||
|
||||
@@ -1107,38 +1107,38 @@ __MAKE_VECTOR_TYPE__(double, double);
|
||||
|
||||
#ifdef __cplusplus
|
||||
#define DECLOP_MAKE_ONE_COMPONENT(comp, type) \
|
||||
static inline __device__ __host__ \
|
||||
inline __device__ __host__ \
|
||||
type make_##type(comp x) { type r{x}; return r; }
|
||||
|
||||
#define DECLOP_MAKE_TWO_COMPONENT(comp, type) \
|
||||
static inline __device__ __host__ \
|
||||
inline __device__ __host__ \
|
||||
type make_##type(comp x, comp y) { type r{x, y}; return r; }
|
||||
|
||||
#define DECLOP_MAKE_THREE_COMPONENT(comp, type) \
|
||||
static inline __device__ __host__ \
|
||||
inline __device__ __host__ \
|
||||
type make_##type(comp x, comp y, comp z) { type r{x, y, z}; return r; }
|
||||
|
||||
#define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \
|
||||
static inline __device__ __host__ \
|
||||
inline __device__ __host__ \
|
||||
type make_##type(comp x, comp y, comp z, comp w) { \
|
||||
type r{x, y, z, w}; \
|
||||
return r; \
|
||||
}
|
||||
#else
|
||||
#define DECLOP_MAKE_ONE_COMPONENT(comp, type) \
|
||||
static inline __device__ __host__ \
|
||||
inline __device__ __host__ \
|
||||
type make_##type(comp x) { type r; r.x =x; return r; }
|
||||
|
||||
#define DECLOP_MAKE_TWO_COMPONENT(comp, type) \
|
||||
static inline __device__ __host__ \
|
||||
inline __device__ __host__ \
|
||||
type make_##type(comp x, comp y) { type r; r.x=x; r.y=y; return r; }
|
||||
|
||||
#define DECLOP_MAKE_THREE_COMPONENT(comp, type) \
|
||||
static inline __device__ __host__ \
|
||||
inline __device__ __host__ \
|
||||
type make_##type(comp x, comp y, comp z) { type r; r.x=x; r.y=y; r.z=z; return r; }
|
||||
|
||||
#define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \
|
||||
static inline __device__ __host__ \
|
||||
inline __device__ __host__ \
|
||||
type make_##type(comp x, comp y, comp z, comp w) { \
|
||||
type r; r.x=x; r.y=y; r.z=z; r.w=w; \
|
||||
return r; \
|
||||
|
||||
@@ -46,7 +46,7 @@ union TData {
|
||||
__hip_uint4_vector_value_type u;
|
||||
};
|
||||
|
||||
#define __TEXTURE_FUNCTIONS_DECL__ static inline __device__
|
||||
#define __TEXTURE_FUNCTIONS_DECL__ inline __device__
|
||||
|
||||
|
||||
#if (__hcc_workweek__ >= 18114) || __clang__
|
||||
|
||||
@@ -358,21 +358,21 @@ enum hipComputeMode {
|
||||
*/
|
||||
#if defined(__cplusplus) && !defined(__HIP_DISABLE_CPP_FUNCTIONS__)
|
||||
template <class T>
|
||||
static inline hipError_t hipMalloc(T** devPtr, size_t size) {
|
||||
inline hipError_t hipMalloc(T** devPtr, size_t size) {
|
||||
return hipMalloc((void**)devPtr, size);
|
||||
}
|
||||
|
||||
// Provide an override to automatically typecast the pointer type from void**, and also provide a
|
||||
// default for the flags.
|
||||
template <class T>
|
||||
static inline hipError_t hipHostMalloc(T** ptr, size_t size,
|
||||
unsigned int flags = hipHostMallocDefault) {
|
||||
inline hipError_t hipHostMalloc(T** ptr, size_t size,
|
||||
unsigned int flags = hipHostMallocDefault) {
|
||||
return hipHostMalloc((void**)ptr, size, flags);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
static inline hipError_t hipMallocManaged(T** devPtr, size_t size,
|
||||
unsigned int flags = hipMemAttachGlobal) {
|
||||
inline hipError_t hipMallocManaged(T** devPtr, size_t size,
|
||||
unsigned int flags = hipMemAttachGlobal) {
|
||||
return hipMallocManaged((void**)devPtr, size, flags);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -97,13 +97,13 @@ class TidInfo {
|
||||
TidInfo();
|
||||
|
||||
int tid() const { return _shortTid; };
|
||||
pid_t pid() const { return _pid; };
|
||||
pid_t pid() const { return _pid; };
|
||||
uint64_t incApiSeqNum() { return ++_apiSeqNum; };
|
||||
uint64_t apiSeqNum() const { return _apiSeqNum; };
|
||||
|
||||
private:
|
||||
int _shortTid;
|
||||
pid_t _pid;
|
||||
pid_t _pid;
|
||||
|
||||
// monotonically increasing API sequence number for this threa.
|
||||
uint64_t _apiSeqNum;
|
||||
@@ -280,7 +280,7 @@ static const DbName dbName[] = {
|
||||
#endif
|
||||
|
||||
|
||||
static inline uint64_t getTicks() { return hc::get_system_ticks(); }
|
||||
inline uint64_t getTicks() { return hc::get_system_ticks(); }
|
||||
|
||||
//---
|
||||
extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr);
|
||||
@@ -798,7 +798,7 @@ class ihipDevice_t {
|
||||
|
||||
// TODO - report this through device properties, base on HCC API call.
|
||||
int _isLargeBar;
|
||||
|
||||
|
||||
// Node id reported by kfd for this device
|
||||
uint32_t _driver_node_id;
|
||||
|
||||
@@ -1047,7 +1047,7 @@ struct mg_info {
|
||||
// setDevice first.
|
||||
// - hipDeviceReset destroys the primary context for device?
|
||||
// - Then context is created again for next usage.
|
||||
static inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) {
|
||||
inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) {
|
||||
// Per-thread initialization of the TLS:
|
||||
if ((tls->defaultCtx == nullptr) && (g_deviceCnt > 0)) {
|
||||
tls->defaultCtx = ihipGetPrimaryCtx(0);
|
||||
|
||||
Ссылка в новой задаче
Block a user