diff --git a/include/hip/hcc_detail/channel_descriptor.h b/include/hip/hcc_detail/channel_descriptor.h index 38acff9951..db81fc76b6 100644 --- a/include/hip/hcc_detail/channel_descriptor.h +++ b/include/hip/hcc_detail/channel_descriptor.h @@ -32,23 +32,23 @@ THE SOFTWARE. HIP_PUBLIC_API hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f); -static inline hipChannelFormatDesc hipCreateChannelDescHalf() { +inline hipChannelFormatDesc hipCreateChannelDescHalf() { int e = (int)sizeof(unsigned short) * 8; return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat); } -static inline hipChannelFormatDesc hipCreateChannelDescHalf1() { +inline hipChannelFormatDesc hipCreateChannelDescHalf1() { int e = (int)sizeof(unsigned short) * 8; return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat); } -static inline hipChannelFormatDesc hipCreateChannelDescHalf2() { +inline hipChannelFormatDesc hipCreateChannelDescHalf2() { int e = (int)sizeof(unsigned short) * 8; return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat); } template -static inline hipChannelFormatDesc hipCreateChannelDesc() { +inline hipChannelFormatDesc hipCreateChannelDesc() { return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone); } diff --git a/include/hip/hcc_detail/device_functions.h b/include/hip/hcc_detail/device_functions.h index 68e3277270..58f837aa76 100644 --- a/include/hip/hcc_detail/device_functions.h +++ b/include/hip/hcc_detail/device_functions.h @@ -38,69 +38,69 @@ Integer Intrinsics */ // integer intrinsic function __poc __clz __ffs __brev -__device__ static inline unsigned int __popc(unsigned int input) { +__device__ inline unsigned int __popc(unsigned int input) { return __builtin_popcount(input); } -__device__ static inline unsigned int __popcll(unsigned long long int input) { +__device__ inline unsigned int __popcll(unsigned long long int input) { return __builtin_popcountll(input); } -__device__ static inline int __clz(int input) { +__device__ inline int __clz(int input) { return __ockl_clz_u32((uint)input); } -__device__ static inline int __clzll(long long int input) { +__device__ inline int __clzll(long long int input) { return __ockl_clz_u64((ullong)input); } -__device__ static inline unsigned int __ffs(unsigned int input) { +__device__ inline unsigned int __ffs(unsigned int input) { return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1; } -__device__ static inline unsigned int __ffsll(unsigned long long int input) { +__device__ inline unsigned int __ffsll(unsigned long long int input) { return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1; } -__device__ static inline unsigned int __ffs(int input) { +__device__ inline unsigned int __ffs(int input) { return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1; } -__device__ static inline unsigned int __ffsll(long long int input) { +__device__ inline unsigned int __ffsll(long long int input) { return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1; } -__device__ static inline unsigned int __brev(unsigned int input) { +__device__ inline unsigned int __brev(unsigned int input) { return __llvm_bitrev_b32(input); } -__device__ static inline unsigned long long int __brevll(unsigned long long int input) { +__device__ inline unsigned long long int __brevll(unsigned long long int input) { return __llvm_bitrev_b64(input); } -__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) { +__device__ inline unsigned int __lastbit_u32_u64(uint64_t input) { return input == 0 ? -1 : __builtin_ctzl(input); } -__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) { +__device__ inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) { uint32_t offset = src1 & 31; uint32_t width = src2 & 31; return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width); } -__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) { +__device__ inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) { uint64_t offset = src1 & 63; uint64_t width = src2 & 63; return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width); } -__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) { +__device__ inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) { uint32_t offset = src2 & 31; uint32_t width = src3 & 31; uint32_t mask = (1 << width) - 1; return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset)); } -__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) { +__device__ inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) { uint64_t offset = src2 & 63; uint64_t width = src3 & 63; uint64_t mask = (1ULL << width) - 1; @@ -136,7 +136,7 @@ struct uchar2Holder { } __attribute__((aligned(8))); __device__ -static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) { +inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) { struct uchar2Holder cHoldVal; struct ucharHolder cHoldKey; struct ucharHolder cHoldOut; @@ -150,18 +150,18 @@ static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned return cHoldOut.ui; } -__device__ static inline unsigned int __hadd(int x, int y) { +__device__ inline unsigned int __hadd(int x, int y) { int z = x + y; int sign = z & 0x8000000; int value = z & 0x7FFFFFFF; return ((value) >> 1 || sign); } -__device__ static inline int __mul24(int x, int y) { +__device__ inline int __mul24(int x, int y) { return __ockl_mul24_i32(x, y); } -__device__ static inline long long __mul64hi(long long int x, long long int y) { +__device__ inline long long __mul64hi(long long int x, long long int y) { ulong x0 = (ulong)x & 0xffffffffUL; long x1 = x >> 32; ulong y0 = (ulong)y & 0xffffffffUL; @@ -174,28 +174,28 @@ __device__ static inline long long __mul64hi(long long int x, long long int y) { return x1*y1 + z2 + (z1 >> 32); } -__device__ static inline int __mulhi(int x, int y) { +__device__ inline int __mulhi(int x, int y) { return __ockl_mul_hi_i32(x, y); } -__device__ static inline int __rhadd(int x, int y) { +__device__ inline int __rhadd(int x, int y) { int z = x + y + 1; int sign = z & 0x8000000; int value = z & 0x7FFFFFFF; return ((value) >> 1 || sign); } -__device__ static inline unsigned int __sad(int x, int y, int z) { +__device__ inline unsigned int __sad(int x, int y, int z) { return x > y ? x - y + z : y - x + z; } -__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) { +__device__ inline unsigned int __uhadd(unsigned int x, unsigned int y) { return (x + y) >> 1; } -__device__ static inline int __umul24(unsigned int x, unsigned int y) { +__device__ inline int __umul24(unsigned int x, unsigned int y) { return __ockl_mul24_u32(x, y); } __device__ -static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) { +inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) { ulong x0 = x & 0xffffffffUL; ulong x1 = x >> 32; ulong y0 = y & 0xffffffffUL; @@ -208,41 +208,41 @@ static inline unsigned long long __umul64hi(unsigned long long int x, unsigned l return x1*y1 + z2 + (z1 >> 32); } -__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) { +__device__ inline unsigned int __umulhi(unsigned int x, unsigned int y) { return __ockl_mul_hi_u32(x, y); } -__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) { +__device__ inline unsigned int __urhadd(unsigned int x, unsigned int y) { return (x + y + 1) >> 1; } -__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) { +__device__ inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) { return __ockl_sad_u32(x, y, z); } -__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); } +__device__ inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); } /* HIP specific device functions */ -__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) { +__device__ inline unsigned __hip_ds_bpermute(int index, unsigned src) { union { int i; unsigned u; float f; } tmp; tmp.u = src; tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i); return tmp.u; } -__device__ static inline float __hip_ds_bpermutef(int index, float src) { +__device__ inline float __hip_ds_bpermutef(int index, float src) { union { int i; unsigned u; float f; } tmp; tmp.f = src; tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i); return tmp.f; } -__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) { +__device__ inline unsigned __hip_ds_permute(int index, unsigned src) { union { int i; unsigned u; float f; } tmp; tmp.u = src; tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i); return tmp.u; } -__device__ static inline float __hip_ds_permutef(int index, float src) { +__device__ inline float __hip_ds_permutef(int index, float src) { union { int i; unsigned u; float f; } tmp; tmp.u = src; tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i); return tmp.u; @@ -252,7 +252,7 @@ __device__ static inline float __hip_ds_permutef(int index, float src) { #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src)) template -__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) { +__device__ inline unsigned __hip_ds_swizzle_N(unsigned int src) { union { int i; unsigned u; float f; } tmp; tmp.u = src; #if defined(__HCC__) tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern); @@ -263,7 +263,7 @@ __device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) { } template -__device__ static inline float __hip_ds_swizzlef_N(float src) { +__device__ inline float __hip_ds_swizzlef_N(float src) { union { int i; unsigned u; float f; } tmp; tmp.f = src; #if defined(__HCC__) tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern); @@ -277,7 +277,7 @@ __device__ static inline float __hip_ds_swizzlef_N(float src) { __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src)) template -__device__ static inline int __hip_move_dpp_N(int src) { +__device__ inline int __hip_move_dpp_N(int src) { return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl); } @@ -434,7 +434,7 @@ double __shfl_xor(double var, int lane_mask, int width = warpSize) { #define MASK1 0x00ff00ff #define MASK2 0xff00ff00 -__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) { +__device__ inline char4 __hip_hc_add8pk(char4 in1, char4 in2) { char4 out; unsigned one1 = in1.w & MASK1; unsigned one2 = in2.w & MASK1; @@ -445,7 +445,7 @@ __device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) { return out; } -__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) { +__device__ inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) { char4 out; unsigned one1 = in1.w & MASK1; unsigned one2 = in2.w & MASK1; @@ -456,7 +456,7 @@ __device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) { return out; } -__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) { +__device__ inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) { char4 out; unsigned one1 = in1.w & MASK1; unsigned one2 = in2.w & MASK1; @@ -472,12 +472,12 @@ __device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) { * TODO: Conversion functions are not correct, need to fix when BE is ready */ -__device__ static inline float __double2float_rd(double x) { return (double)x; } -__device__ static inline float __double2float_rn(double x) { return (double)x; } -__device__ static inline float __double2float_ru(double x) { return (double)x; } -__device__ static inline float __double2float_rz(double x) { return (double)x; } +__device__ inline float __double2float_rd(double x) { return (double)x; } +__device__ inline float __double2float_rn(double x) { return (double)x; } +__device__ inline float __double2float_ru(double x) { return (double)x; } +__device__ inline float __double2float_rz(double x) { return (double)x; } -__device__ static inline int __double2hiint(double x) { +__device__ inline int __double2hiint(double x) { static_assert(sizeof(double) == 2 * sizeof(int), ""); int tmp[2]; @@ -485,7 +485,7 @@ __device__ static inline int __double2hiint(double x) { return tmp[1]; } -__device__ static inline int __double2loint(double x) { +__device__ inline int __double2loint(double x) { static_assert(sizeof(double) == 2 * sizeof(int), ""); int tmp[2]; @@ -494,35 +494,35 @@ __device__ static inline int __double2loint(double x) { return tmp[0]; } -__device__ static inline int __double2int_rd(double x) { return (int)x; } -__device__ static inline int __double2int_rn(double x) { return (int)x; } -__device__ static inline int __double2int_ru(double x) { return (int)x; } -__device__ static inline int __double2int_rz(double x) { return (int)x; } +__device__ inline int __double2int_rd(double x) { return (int)x; } +__device__ inline int __double2int_rn(double x) { return (int)x; } +__device__ inline int __double2int_ru(double x) { return (int)x; } +__device__ inline int __double2int_rz(double x) { return (int)x; } -__device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; } -__device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; } -__device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; } -__device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; } +__device__ inline long long int __double2ll_rd(double x) { return (long long int)x; } +__device__ inline long long int __double2ll_rn(double x) { return (long long int)x; } +__device__ inline long long int __double2ll_ru(double x) { return (long long int)x; } +__device__ inline long long int __double2ll_rz(double x) { return (long long int)x; } -__device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; } -__device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; } -__device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; } -__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; } +__device__ inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; } +__device__ inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; } +__device__ inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; } +__device__ inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; } -__device__ static inline unsigned long long int __double2ull_rd(double x) { +__device__ inline unsigned long long int __double2ull_rd(double x) { return (unsigned long long int)x; } -__device__ static inline unsigned long long int __double2ull_rn(double x) { +__device__ inline unsigned long long int __double2ull_rn(double x) { return (unsigned long long int)x; } -__device__ static inline unsigned long long int __double2ull_ru(double x) { +__device__ inline unsigned long long int __double2ull_ru(double x) { return (unsigned long long int)x; } -__device__ static inline unsigned long long int __double2ull_rz(double x) { +__device__ inline unsigned long long int __double2ull_rz(double x) { return (unsigned long long int)x; } -__device__ static inline long long int __double_as_longlong(double x) { +__device__ inline long long int __double_as_longlong(double x) { static_assert(sizeof(long long) == sizeof(double), ""); long long tmp; @@ -545,35 +545,35 @@ CUDA implements half as unsigned short whereas, HIP doesn't. */ -__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); } -__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); } -__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); } -__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); } +__device__ inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); } +__device__ inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); } +__device__ inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); } +__device__ inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); } -__device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; } -__device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; } -__device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; } -__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; } +__device__ inline long long int __float2ll_rd(float x) { return (long long int)x; } +__device__ inline long long int __float2ll_rn(float x) { return (long long int)x; } +__device__ inline long long int __float2ll_ru(float x) { return (long long int)x; } +__device__ inline long long int __float2ll_rz(float x) { return (long long int)x; } -__device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; } -__device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; } -__device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; } -__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; } +__device__ inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; } +__device__ inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; } +__device__ inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; } +__device__ inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; } -__device__ static inline unsigned long long int __float2ull_rd(float x) { +__device__ inline unsigned long long int __float2ull_rd(float x) { return (unsigned long long int)x; } -__device__ static inline unsigned long long int __float2ull_rn(float x) { +__device__ inline unsigned long long int __float2ull_rn(float x) { return (unsigned long long int)x; } -__device__ static inline unsigned long long int __float2ull_ru(float x) { +__device__ inline unsigned long long int __float2ull_ru(float x) { return (unsigned long long int)x; } -__device__ static inline unsigned long long int __float2ull_rz(float x) { +__device__ inline unsigned long long int __float2ull_rz(float x) { return (unsigned long long int)x; } -__device__ static inline int __float_as_int(float x) { +__device__ inline int __float_as_int(float x) { static_assert(sizeof(int) == sizeof(float), ""); int tmp; @@ -582,7 +582,7 @@ __device__ static inline int __float_as_int(float x) { return tmp; } -__device__ static inline unsigned int __float_as_uint(float x) { +__device__ inline unsigned int __float_as_uint(float x) { static_assert(sizeof(unsigned int) == sizeof(float), ""); unsigned int tmp; @@ -591,7 +591,7 @@ __device__ static inline unsigned int __float_as_uint(float x) { return tmp; } -__device__ static inline double __hiloint2double(int hi, int lo) { +__device__ inline double __hiloint2double(int hi, int lo) { static_assert(sizeof(double) == sizeof(uint64_t), ""); uint64_t tmp0 = (static_cast(hi) << 32ull) | static_cast(lo); @@ -601,14 +601,14 @@ __device__ static inline double __hiloint2double(int hi, int lo) { return tmp1; } -__device__ static inline double __int2double_rn(int x) { return (double)x; } +__device__ inline double __int2double_rn(int x) { return (double)x; } -__device__ static inline float __int2float_rd(int x) { return (float)x; } -__device__ static inline float __int2float_rn(int x) { return (float)x; } -__device__ static inline float __int2float_ru(int x) { return (float)x; } -__device__ static inline float __int2float_rz(int x) { return (float)x; } +__device__ inline float __int2float_rd(int x) { return (float)x; } +__device__ inline float __int2float_rn(int x) { return (float)x; } +__device__ inline float __int2float_ru(int x) { return (float)x; } +__device__ inline float __int2float_rz(int x) { return (float)x; } -__device__ static inline float __int_as_float(int x) { +__device__ inline float __int_as_float(int x) { static_assert(sizeof(float) == sizeof(int), ""); float tmp; @@ -617,17 +617,17 @@ __device__ static inline float __int_as_float(int x) { return tmp; } -__device__ static inline double __ll2double_rd(long long int x) { return (double)x; } -__device__ static inline double __ll2double_rn(long long int x) { return (double)x; } -__device__ static inline double __ll2double_ru(long long int x) { return (double)x; } -__device__ static inline double __ll2double_rz(long long int x) { return (double)x; } +__device__ inline double __ll2double_rd(long long int x) { return (double)x; } +__device__ inline double __ll2double_rn(long long int x) { return (double)x; } +__device__ inline double __ll2double_ru(long long int x) { return (double)x; } +__device__ inline double __ll2double_rz(long long int x) { return (double)x; } -__device__ static inline float __ll2float_rd(long long int x) { return (float)x; } -__device__ static inline float __ll2float_rn(long long int x) { return (float)x; } -__device__ static inline float __ll2float_ru(long long int x) { return (float)x; } -__device__ static inline float __ll2float_rz(long long int x) { return (float)x; } +__device__ inline float __ll2float_rd(long long int x) { return (float)x; } +__device__ inline float __ll2float_rn(long long int x) { return (float)x; } +__device__ inline float __ll2float_ru(long long int x) { return (float)x; } +__device__ inline float __ll2float_rz(long long int x) { return (float)x; } -__device__ static inline double __longlong_as_double(long long int x) { +__device__ inline double __longlong_as_double(long long int x) { static_assert(sizeof(double) == sizeof(long long), ""); double tmp; @@ -636,14 +636,14 @@ __device__ static inline double __longlong_as_double(long long int x) { return tmp; } -__device__ static inline double __uint2double_rn(int x) { return (double)x; } +__device__ inline double __uint2double_rn(int x) { return (double)x; } -__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; } -__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; } -__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; } -__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; } +__device__ inline float __uint2float_rd(unsigned int x) { return (float)x; } +__device__ inline float __uint2float_rn(unsigned int x) { return (float)x; } +__device__ inline float __uint2float_ru(unsigned int x) { return (float)x; } +__device__ inline float __uint2float_rz(unsigned int x) { return (float)x; } -__device__ static inline float __uint_as_float(unsigned int x) { +__device__ inline float __uint_as_float(unsigned int x) { static_assert(sizeof(float) == sizeof(unsigned int), ""); float tmp; @@ -652,15 +652,15 @@ __device__ static inline float __uint_as_float(unsigned int x) { return tmp; } -__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; } -__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; } -__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; } -__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; } +__device__ inline double __ull2double_rd(unsigned long long int x) { return (double)x; } +__device__ inline double __ull2double_rn(unsigned long long int x) { return (double)x; } +__device__ inline double __ull2double_ru(unsigned long long int x) { return (double)x; } +__device__ inline double __ull2double_rz(unsigned long long int x) { return (double)x; } -__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; } -__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; } -__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; } -__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; } +__device__ inline float __ull2float_rd(unsigned long long int x) { return (float)x; } +__device__ inline float __ull2float_rn(unsigned long long int x) { return (float)x; } +__device__ inline float __ull2float_ru(unsigned long long int x) { return (float)x; } +__device__ inline float __ull2float_rz(unsigned long long int x) { return (float)x; } #if defined(__HCC__) #define __HCC_OR_HIP_CLANG__ 1 @@ -819,7 +819,7 @@ typedef enum __memory_order __device__ inline -static void +void __atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope) { // We're tying global-happens-before and local-happens-before together as does HSA @@ -871,21 +871,21 @@ __atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __mem // Memory Fence Functions __device__ inline -static void __threadfence() +void __threadfence() { __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device); } __device__ inline -static void __threadfence_block() +void __threadfence_block() { __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group); } __device__ inline -static void __threadfence_system() +void __threadfence_system() { __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices); } @@ -945,7 +945,7 @@ void __assertfail(const char * __assertion, __device__ inline -static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope) +void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope) { if (flags) { __atomic_work_item_fence(flags, __memory_order_release, scope); @@ -958,7 +958,7 @@ static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scop __device__ inline -static void __barrier(int n) +void __barrier(int n) { __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group); } @@ -1037,7 +1037,7 @@ unsigned __smid(void) // loop unrolling -static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) { +inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) { auto dstPtr = static_cast(dst); auto srcPtr = static_cast(src); @@ -1063,7 +1063,7 @@ static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_ return dst; } -static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) { +inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) { auto dstPtr = static_cast(dst); while (size >= 4u) { @@ -1086,11 +1086,11 @@ static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, siz return dst; } -static inline __device__ void* memcpy(void* dst, const void* src, size_t size) { +inline __device__ void* memcpy(void* dst, const void* src, size_t size) { return __hip_hc_memcpy(dst, src, size); } -static inline __device__ void* memset(void* ptr, int val, size_t size) { +inline __device__ void* memset(void* ptr, int val, size_t size) { unsigned char val8 = static_cast(val); return __hip_hc_memset(ptr, val8, size); } diff --git a/include/hip/hcc_detail/driver_types.h b/include/hip/hcc_detail/driver_types.h index 510d3d058e..4fb24e092b 100644 --- a/include/hip/hcc_detail/driver_types.h +++ b/include/hip/hcc_detail/driver_types.h @@ -287,8 +287,8 @@ typedef struct hipMemcpy3DParms { size_t srcZ; }hipMemcpy3DParms; -static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz, - size_t ysz) { +inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz, + size_t ysz) { struct hipPitchedPtr s; s.ptr = d; @@ -299,7 +299,7 @@ static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t return s; } -static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) { +inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) { struct hipPos p; p.x = x; @@ -309,7 +309,7 @@ static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) { return p; } -static inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) { +inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) { struct hipExtent e; e.width = w; diff --git a/include/hip/hcc_detail/hip_complex.h b/include/hip/hcc_detail/hip_complex.h index 11648ce123..18f0a9961e 100644 --- a/include/hip/hcc_detail/hip_complex.h +++ b/include/hip/hcc_detail/hip_complex.h @@ -36,7 +36,7 @@ THE SOFTWARE. #if __cplusplus #define COMPLEX_NEG_OP_OVERLOAD(type) \ - __device__ __host__ static inline type operator-(const type& op) { \ + __device__ __host__ inline type operator-(const type& op) { \ type ret; \ ret.x = -op.x; \ ret.y = -op.y; \ @@ -44,17 +44,17 @@ THE SOFTWARE. } #define COMPLEX_EQ_OP_OVERLOAD(type) \ - __device__ __host__ static inline bool operator==(const type& lhs, const type& rhs) { \ + __device__ __host__ inline bool operator==(const type& lhs, const type& rhs) { \ return lhs.x == rhs.x && lhs.y == rhs.y; \ } #define COMPLEX_NE_OP_OVERLOAD(type) \ - __device__ __host__ static inline bool operator!=(const type& lhs, const type& rhs) { \ + __device__ __host__ inline bool operator!=(const type& lhs, const type& rhs) { \ return !(lhs == rhs); \ } #define COMPLEX_ADD_OP_OVERLOAD(type) \ - __device__ __host__ static inline type operator+(const type& lhs, const type& rhs) { \ + __device__ __host__ inline type operator+(const type& lhs, const type& rhs) { \ type ret; \ ret.x = lhs.x + rhs.x; \ ret.y = lhs.y + rhs.y; \ @@ -62,7 +62,7 @@ THE SOFTWARE. } #define COMPLEX_SUB_OP_OVERLOAD(type) \ - __device__ __host__ static inline type operator-(const type& lhs, const type& rhs) { \ + __device__ __host__ inline type operator-(const type& lhs, const type& rhs) { \ type ret; \ ret.x = lhs.x - rhs.x; \ ret.y = lhs.y - rhs.y; \ @@ -70,7 +70,7 @@ THE SOFTWARE. } #define COMPLEX_MUL_OP_OVERLOAD(type) \ - __device__ __host__ static inline type operator*(const type& lhs, const type& rhs) { \ + __device__ __host__ inline type operator*(const type& lhs, const type& rhs) { \ type ret; \ ret.x = lhs.x * rhs.x - lhs.y * rhs.y; \ ret.y = lhs.x * rhs.y + lhs.y * rhs.x; \ @@ -78,7 +78,7 @@ THE SOFTWARE. } #define COMPLEX_DIV_OP_OVERLOAD(type) \ - __device__ __host__ static inline type operator/(const type& lhs, const type& rhs) { \ + __device__ __host__ inline type operator/(const type& lhs, const type& rhs) { \ type ret; \ ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); \ ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); \ @@ -88,33 +88,33 @@ THE SOFTWARE. } #define COMPLEX_ADD_PREOP_OVERLOAD(type) \ - __device__ __host__ static inline type& operator+=(type& lhs, const type& rhs) { \ + __device__ __host__ inline type& operator+=(type& lhs, const type& rhs) { \ lhs.x += rhs.x; \ lhs.y += rhs.y; \ return lhs; \ } #define COMPLEX_SUB_PREOP_OVERLOAD(type) \ - __device__ __host__ static inline type& operator-=(type& lhs, const type& rhs) { \ + __device__ __host__ inline type& operator-=(type& lhs, const type& rhs) { \ lhs.x -= rhs.x; \ lhs.y -= rhs.y; \ return lhs; \ } #define COMPLEX_MUL_PREOP_OVERLOAD(type) \ - __device__ __host__ static inline type& operator*=(type& lhs, const type& rhs) { \ + __device__ __host__ inline type& operator*=(type& lhs, const type& rhs) { \ lhs = lhs * rhs; \ return lhs; \ } #define COMPLEX_DIV_PREOP_OVERLOAD(type) \ - __device__ __host__ static inline type& operator/=(type& lhs, const type& rhs) { \ + __device__ __host__ inline type& operator/=(type& lhs, const type& rhs) { \ lhs = lhs / rhs; \ return lhs; \ } #define COMPLEX_SCALAR_PRODUCT(type, type1) \ - __device__ __host__ static inline type operator*(const type& lhs, type1 rhs) { \ + __device__ __host__ inline type operator*(const type& lhs, type1 rhs) { \ type ret; \ ret.x = lhs.x * rhs; \ ret.y = lhs.y * rhs; \ @@ -125,41 +125,41 @@ THE SOFTWARE. typedef float2 hipFloatComplex; -__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; } +__device__ __host__ inline float hipCrealf(hipFloatComplex z) { return z.x; } -__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; } +__device__ __host__ inline float hipCimagf(hipFloatComplex z) { return z.y; } -__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) { +__device__ __host__ inline hipFloatComplex make_hipFloatComplex(float a, float b) { hipFloatComplex z; z.x = a; z.y = b; return z; } -__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) { +__device__ __host__ inline hipFloatComplex hipConjf(hipFloatComplex z) { hipFloatComplex ret; ret.x = z.x; ret.y = -z.y; return ret; } -__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) { +__device__ __host__ inline float hipCsqabsf(hipFloatComplex z) { return z.x * z.x + z.y * z.y; } -__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) { +__device__ __host__ inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) { return make_hipFloatComplex(p.x + q.x, p.y + q.y); } -__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) { +__device__ __host__ inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) { return make_hipFloatComplex(p.x - q.x, p.y - q.y); } -__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) { +__device__ __host__ inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) { return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); } -__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) { +__device__ __host__ inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) { float sqabs = hipCsqabsf(q); hipFloatComplex ret; ret.x = (p.x * q.x + p.y * q.y) / sqabs; @@ -167,46 +167,46 @@ __device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hi return ret; } -__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); } +__device__ __host__ inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); } typedef double2 hipDoubleComplex; -__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; } +__device__ __host__ inline double hipCreal(hipDoubleComplex z) { return z.x; } -__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; } +__device__ __host__ inline double hipCimag(hipDoubleComplex z) { return z.y; } -__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) { +__device__ __host__ inline hipDoubleComplex make_hipDoubleComplex(double a, double b) { hipDoubleComplex z; z.x = a; z.y = b; return z; } -__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) { +__device__ __host__ inline hipDoubleComplex hipConj(hipDoubleComplex z) { hipDoubleComplex ret; ret.x = z.x; ret.y = -z.y; return ret; } -__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) { +__device__ __host__ inline double hipCsqabs(hipDoubleComplex z) { return z.x * z.x + z.y * z.y; } -__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) { +__device__ __host__ inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) { return make_hipDoubleComplex(p.x + q.x, p.y + q.y); } -__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) { +__device__ __host__ inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) { return make_hipDoubleComplex(p.x - q.x, p.y - q.y); } -__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) { +__device__ __host__ inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) { return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); } -__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) { +__device__ __host__ inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) { double sqabs = hipCsqabs(q); hipDoubleComplex ret; ret.x = (p.x * q.x + p.y * q.y) / sqabs; @@ -214,7 +214,7 @@ __device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, h return ret; } -__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); } +__device__ __host__ inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); } #if __cplusplus @@ -268,19 +268,19 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long) typedef hipFloatComplex hipComplex; -__device__ __host__ static inline hipComplex make_hipComplex(float x, float y) { +__device__ __host__ inline hipComplex make_hipComplex(float x, float y) { return make_hipFloatComplex(x, y); } -__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) { +__device__ __host__ inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) { return make_hipFloatComplex((float)z.x, (float)z.y); } -__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) { +__device__ __host__ inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) { return make_hipDoubleComplex((double)z.x, (double)z.y); } -__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) { +__device__ __host__ inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) { float real = (p.x * q.x) + r.x; float imag = (q.x * p.y) + r.y; @@ -290,7 +290,7 @@ __device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q return make_hipComplex(real, imag); } -__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, +__device__ __host__ inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r) { double real = (p.x * q.x) + r.x; double imag = (q.x * p.y) + r.y; diff --git a/include/hip/hcc_detail/hip_fp16.h b/include/hip/hcc_detail/hip_fp16.h index 52abc1a004..0017d91e3f 100644 --- a/include/hip/hcc_detail/hip_fp16.h +++ b/include/hip/hcc_detail/hip_fp16.h @@ -375,7 +375,7 @@ THE SOFTWARE. data = x.data; return *this; } - + // MANIPULATORS - DEVICE ONLY #if !defined(__HIP_NO_HALF_OPERATORS__) __device__ @@ -519,1120 +519,1116 @@ THE SOFTWARE. }; // END STRUCT __HALF2 - namespace + inline + __host__ __device__ + __half2 make_half2(__half x, __half y) { - inline - __host__ __device__ - __half2 make_half2(__half x, __half y) - { - return __half2{x, y}; - } + return __half2{x, y}; + } - inline - __device__ - __half __low2half(__half2 x) - { - return __half{__half_raw{static_cast<__half2_raw>(x).data.x}}; - } + inline + __device__ + __half __low2half(__half2 x) + { + return __half{__half_raw{static_cast<__half2_raw>(x).data.x}}; + } - inline - __device__ - __half __high2half(__half2 x) - { - return __half{__half_raw{static_cast<__half2_raw>(x).data.y}}; - } + inline + __device__ + __half __high2half(__half2 x) + { + return __half{__half_raw{static_cast<__half2_raw>(x).data.y}}; + } - inline - __device__ - __half2 __half2half2(__half x) - { - return __half2{x, x}; - } + inline + __device__ + __half2 __half2half2(__half x) + { + return __half2{x, x}; + } - inline - __device__ - __half2 __halves2half2(__half x, __half y) - { - return __half2{x, y}; - } + inline + __device__ + __half2 __halves2half2(__half x, __half y) + { + return __half2{x, y}; + } - inline - __device__ - __half2 __low2half2(__half2 x) - { - return __half2{ - _Float16_2{ - static_cast<__half2_raw>(x).data.x, - static_cast<__half2_raw>(x).data.x}}; - } - - inline - __device__ - __half2 __high2half2(__half2 x) - { - return __half2_raw{ - _Float16_2{ - static_cast<__half2_raw>(x).data.y, - static_cast<__half2_raw>(x).data.y}}; - } - - inline - __device__ - __half2 __lows2half2(__half2 x, __half2 y) - { - return __half2_raw{ - _Float16_2{ - static_cast<__half2_raw>(x).data.x, - static_cast<__half2_raw>(y).data.x}}; - } - - inline - __device__ - __half2 __highs2half2(__half2 x, __half2 y) - { - return __half2_raw{ - _Float16_2{ - static_cast<__half2_raw>(x).data.y, - static_cast<__half2_raw>(y).data.y}}; - } - - inline - __device__ - __half2 __lowhigh2highlow(__half2 x) - { - return __half2_raw{ - _Float16_2{ - static_cast<__half2_raw>(x).data.y, - static_cast<__half2_raw>(x).data.x}}; - } - - // Bitcasts - inline - __device__ - short __half_as_short(__half x) - { - return static_cast<__half_raw>(x).x; - } - - inline - __device__ - unsigned short __half_as_ushort(__half x) - { - return static_cast<__half_raw>(x).x; - } - - inline - __device__ - __half __short_as_half(short x) - { - __half_raw r; r.x = x; - return r; - } - - inline - __device__ - __half __ushort_as_half(unsigned short x) - { - __half_raw r; r.x = x; - return r; - } - - // TODO: rounding behaviour is not correct. - // float -> half | half2 - inline - __device__ __host__ - __half __float2half(float x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ __host__ - __half __float2half_rn(float x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ __host__ - __half __float2half_rz(float x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ __host__ - __half __float2half_rd(float x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ __host__ - __half __float2half_ru(float x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ __host__ - __half2 __float2half2_rn(float x) - { - return __half2_raw{ - _Float16_2{ - static_cast<_Float16>(x), static_cast<_Float16>(x)}}; - } - inline - __device__ __host__ - __half2 __floats2half2_rn(float x, float y) - { - return __half2_raw{_Float16_2{ - static_cast<_Float16>(x), static_cast<_Float16>(y)}}; - } - inline - __device__ __host__ - __half2 __float22half2_rn(float2 x) - { - return __floats2half2_rn(x.x, x.y); - } - - // half | half2 -> float - inline - __device__ __host__ - float __half2float(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ __host__ - float __low2float(__half2 x) - { - return static_cast<__half2_raw>(x).data.x; - } - inline - __device__ __host__ - float __high2float(__half2 x) - { - return static_cast<__half2_raw>(x).data.y; - } - inline - __device__ __host__ - float2 __half22float2(__half2 x) - { - return make_float2( + inline + __device__ + __half2 __low2half2(__half2 x) + { + return __half2{ + _Float16_2{ static_cast<__half2_raw>(x).data.x, - static_cast<__half2_raw>(x).data.y); - } + static_cast<__half2_raw>(x).data.x}}; + } - // half -> int - inline - __device__ - int __half2int_rn(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - int __half2int_rz(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - int __half2int_rd(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - int __half2int_ru(__half x) - { - return static_cast<__half_raw>(x).data; - } + inline + __device__ + __half2 __high2half2(__half2 x) + { + return __half2_raw{ + _Float16_2{ + static_cast<__half2_raw>(x).data.y, + static_cast<__half2_raw>(x).data.y}}; + } - // int -> half - inline - __device__ - __half __int2half_rn(int x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __int2half_rz(int x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __int2half_rd(int x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __int2half_ru(int x) - { - return __half_raw{static_cast<_Float16>(x)}; - } + inline + __device__ + __half2 __lows2half2(__half2 x, __half2 y) + { + return __half2_raw{ + _Float16_2{ + static_cast<__half2_raw>(x).data.x, + static_cast<__half2_raw>(y).data.x}}; + } - // half -> short - inline - __device__ - short __half2short_rn(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - short __half2short_rz(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - short __half2short_rd(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - short __half2short_ru(__half x) - { - return static_cast<__half_raw>(x).data; - } + inline + __device__ + __half2 __highs2half2(__half2 x, __half2 y) + { + return __half2_raw{ + _Float16_2{ + static_cast<__half2_raw>(x).data.y, + static_cast<__half2_raw>(y).data.y}}; + } - // short -> half - inline - __device__ - __half __short2half_rn(short x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __short2half_rz(short x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __short2half_rd(short x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __short2half_ru(short x) - { - return __half_raw{static_cast<_Float16>(x)}; - } + inline + __device__ + __half2 __lowhigh2highlow(__half2 x) + { + return __half2_raw{ + _Float16_2{ + static_cast<__half2_raw>(x).data.y, + static_cast<__half2_raw>(x).data.x}}; + } - // half -> long long - inline - __device__ - long long __half2ll_rn(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - long long __half2ll_rz(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - long long __half2ll_rd(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - long long __half2ll_ru(__half x) - { - return static_cast<__half_raw>(x).data; - } + // Bitcasts + inline + __device__ + short __half_as_short(__half x) + { + return static_cast<__half_raw>(x).x; + } - // long long -> half - inline - __device__ - __half __ll2half_rn(long long x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __ll2half_rz(long long x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __ll2half_rd(long long x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __ll2half_ru(long long x) - { - return __half_raw{static_cast<_Float16>(x)}; - } + inline + __device__ + unsigned short __half_as_ushort(__half x) + { + return static_cast<__half_raw>(x).x; + } - // half -> unsigned int - inline - __device__ - unsigned int __half2uint_rn(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - unsigned int __half2uint_rz(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - unsigned int __half2uint_rd(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - unsigned int __half2uint_ru(__half x) - { - return static_cast<__half_raw>(x).data; - } + inline + __device__ + __half __short_as_half(short x) + { + __half_raw r; r.x = x; + return r; + } - // unsigned int -> half - inline - __device__ - __half __uint2half_rn(unsigned int x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __uint2half_rz(unsigned int x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __uint2half_rd(unsigned int x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __uint2half_ru(unsigned int x) - { - return __half_raw{static_cast<_Float16>(x)}; - } + inline + __device__ + __half __ushort_as_half(unsigned short x) + { + __half_raw r; r.x = x; + return r; + } - // half -> unsigned short - inline - __device__ - unsigned short __half2ushort_rn(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - unsigned short __half2ushort_rz(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - unsigned short __half2ushort_rd(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - unsigned short __half2ushort_ru(__half x) - { - return static_cast<__half_raw>(x).data; - } + // TODO: rounding behaviour is not correct. + // float -> half | half2 + inline + __device__ __host__ + __half __float2half(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ __host__ + __half __float2half_rn(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ __host__ + __half __float2half_rz(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ __host__ + __half __float2half_rd(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ __host__ + __half __float2half_ru(float x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ __host__ + __half2 __float2half2_rn(float x) + { + return __half2_raw{ + _Float16_2{static_cast<_Float16>(x), static_cast<_Float16>(x)}}; + } + inline + __device__ __host__ + __half2 __floats2half2_rn(float x, float y) + { + return __half2_raw{_Float16_2{ + static_cast<_Float16>(x), static_cast<_Float16>(y)}}; + } + inline + __device__ __host__ + __half2 __float22half2_rn(float2 x) + { + return __floats2half2_rn(x.x, x.y); + } - // unsigned short -> half - inline - __device__ - __half __ushort2half_rn(unsigned short x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __ushort2half_rz(unsigned short x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __ushort2half_rd(unsigned short x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __ushort2half_ru(unsigned short x) - { - return __half_raw{static_cast<_Float16>(x)}; - } + // half | half2 -> float + inline + __device__ __host__ + float __half2float(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ __host__ + float __low2float(__half2 x) + { + return static_cast<__half2_raw>(x).data.x; + } + inline + __device__ __host__ + float __high2float(__half2 x) + { + return static_cast<__half2_raw>(x).data.y; + } + inline + __device__ __host__ + float2 __half22float2(__half2 x) + { + return make_float2( + static_cast<__half2_raw>(x).data.x, + static_cast<__half2_raw>(x).data.y); + } - // half -> unsigned long long - inline - __device__ - unsigned long long __half2ull_rn(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - unsigned long long __half2ull_rz(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - unsigned long long __half2ull_rd(__half x) - { - return static_cast<__half_raw>(x).data; - } - inline - __device__ - unsigned long long __half2ull_ru(__half x) - { - return static_cast<__half_raw>(x).data; - } + // half -> int + inline + __device__ + int __half2int_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + int __half2int_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + int __half2int_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + int __half2int_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } - // unsigned long long -> half - inline - __device__ - __half __ull2half_rn(unsigned long long x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __ull2half_rz(unsigned long long x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __ull2half_rd(unsigned long long x) - { - return __half_raw{static_cast<_Float16>(x)}; - } - inline - __device__ - __half __ull2half_ru(unsigned long long x) - { - return __half_raw{static_cast<_Float16>(x)}; - } + // int -> half + inline + __device__ + __half __int2half_rn(int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __int2half_rz(int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __int2half_rd(int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __int2half_ru(int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } - // Load primitives - inline - __device__ - __half __ldg(const __half* ptr) { return *ptr; } - inline - __device__ - __half __ldcg(const __half* ptr) { return *ptr; } - inline - __device__ - __half __ldca(const __half* ptr) { return *ptr; } - inline - __device__ - __half __ldcs(const __half* ptr) { return *ptr; } + // half -> short + inline + __device__ + short __half2short_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + short __half2short_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + short __half2short_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + short __half2short_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } - inline - __device__ - __half2 __ldg(const __half2* ptr) { return *ptr; } - inline - __device__ - __half2 __ldcg(const __half2* ptr) { return *ptr; } - inline - __device__ - __half2 __ldca(const __half2* ptr) { return *ptr; } - inline - __device__ - __half2 __ldcs(const __half2* ptr) { return *ptr; } + // short -> half + inline + __device__ + __half __short2half_rn(short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __short2half_rz(short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __short2half_rd(short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __short2half_ru(short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } - // Relations - inline - __device__ - bool __heq(__half x, __half y) - { - return static_cast<__half_raw>(x).data == - static_cast<__half_raw>(y).data; - } - inline - __device__ - bool __hne(__half x, __half y) - { - return static_cast<__half_raw>(x).data != - static_cast<__half_raw>(y).data; - } - inline - __device__ - bool __hle(__half x, __half y) - { - return static_cast<__half_raw>(x).data <= - static_cast<__half_raw>(y).data; - } - inline - __device__ - bool __hge(__half x, __half y) - { - return static_cast<__half_raw>(x).data >= - static_cast<__half_raw>(y).data; - } - inline - __device__ - bool __hlt(__half x, __half y) - { - return static_cast<__half_raw>(x).data < - static_cast<__half_raw>(y).data; - } - inline - __device__ - bool __hgt(__half x, __half y) - { - return static_cast<__half_raw>(x).data > - static_cast<__half_raw>(y).data; - } - inline - __device__ - bool __hequ(__half x, __half y) { return __heq(x, y); } - inline - __device__ - bool __hneu(__half x, __half y) { return __hne(x, y); } - inline - __device__ - bool __hleu(__half x, __half y) { return __hle(x, y); } - inline - __device__ - bool __hgeu(__half x, __half y) { return __hge(x, y); } - inline - __device__ - bool __hltu(__half x, __half y) { return __hlt(x, y); } - inline - __device__ - bool __hgtu(__half x, __half y) { return __hgt(x, y); } + // half -> long long + inline + __device__ + long long __half2ll_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + long long __half2ll_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + long long __half2ll_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + long long __half2ll_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } - inline - __device__ - __half2 __heq2(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(x).data == - static_cast<__half2_raw>(y).data; - return __builtin_convertvector(-r, _Float16_2); - } - inline - __device__ - __half2 __hne2(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(x).data != - static_cast<__half2_raw>(y).data; - return __builtin_convertvector(-r, _Float16_2); - } - inline - __device__ - __half2 __hle2(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(x).data <= - static_cast<__half2_raw>(y).data; - return __builtin_convertvector(-r, _Float16_2); - } - inline - __device__ - __half2 __hge2(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(x).data >= - static_cast<__half2_raw>(y).data; - return __builtin_convertvector(-r, _Float16_2); - } - inline - __device__ - __half2 __hlt2(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(x).data < - static_cast<__half2_raw>(y).data; - return __builtin_convertvector(-r, _Float16_2); - } - inline - __device__ - __half2 __hgt2(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(x).data > - static_cast<__half2_raw>(y).data; - return __builtin_convertvector(-r, _Float16_2); - } - inline - __device__ - __half2 __hequ2(__half2 x, __half2 y) { return __heq2(x, y); } - inline - __device__ - __half2 __hneu2(__half2 x, __half2 y) { return __hne2(x, y); } - inline - __device__ - __half2 __hleu2(__half2 x, __half2 y) { return __hle2(x, y); } - inline - __device__ - __half2 __hgeu2(__half2 x, __half2 y) { return __hge2(x, y); } - inline - __device__ - __half2 __hltu2(__half2 x, __half2 y) { return __hlt2(x, y); } - inline - __device__ - __half2 __hgtu2(__half2 x, __half2 y) { return __hgt2(x, y); } + // long long -> half + inline + __device__ + __half __ll2half_rn(long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ll2half_rz(long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ll2half_rd(long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ll2half_ru(long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } - inline - __device__ - bool __hbeq2(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(__heq2(x, y)); - return r.data.x != 0 && r.data.y != 0; - } - inline - __device__ - bool __hbne2(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(__hne2(x, y)); - return r.data.x != 0 && r.data.y != 0; - } - inline - __device__ - bool __hble2(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(__hle2(x, y)); - return r.data.x != 0 && r.data.y != 0; - } - inline - __device__ - bool __hbge2(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(__hge2(x, y)); - return r.data.x != 0 && r.data.y != 0; - } - inline - __device__ - bool __hblt2(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(__hlt2(x, y)); - return r.data.x != 0 && r.data.y != 0; - } - inline - __device__ - bool __hbgt2(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(__hgt2(x, y)); - return r.data.x != 0 && r.data.y != 0; - } - inline - __device__ - bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); } - inline - __device__ - bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); } - inline - __device__ - bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); } - inline - __device__ - bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); } - inline - __device__ - bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); } - inline - __device__ - bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); } + // half -> unsigned int + inline + __device__ + unsigned int __half2uint_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned int __half2uint_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned int __half2uint_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned int __half2uint_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } - // Arithmetic - inline - __device__ - __half __clamp_01(__half x) - { - auto r = static_cast<__half_raw>(x); + // unsigned int -> half + inline + __device__ + __half __uint2half_rn(unsigned int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __uint2half_rz(unsigned int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __uint2half_rd(unsigned int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __uint2half_ru(unsigned int x) + { + return __half_raw{static_cast<_Float16>(x)}; + } - if (__hlt(x, __half_raw{0})) return __half_raw{0}; - if (__hlt(__half_raw{1}, x)) return __half_raw{1}; - return r; - } + // half -> unsigned short + inline + __device__ + unsigned short __half2ushort_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned short __half2ushort_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned short __half2ushort_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned short __half2ushort_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } - inline - __device__ - __half __hadd(__half x, __half y) - { - return __half_raw{ - static_cast<__half_raw>(x).data + - static_cast<__half_raw>(y).data}; - } - inline - __device__ - __half __hsub(__half x, __half y) - { - return __half_raw{ - static_cast<__half_raw>(x).data - - static_cast<__half_raw>(y).data}; - } - inline - __device__ - __half __hmul(__half x, __half y) - { - return __half_raw{ - static_cast<__half_raw>(x).data * - static_cast<__half_raw>(y).data}; - } - inline - __device__ - __half __hadd_sat(__half x, __half y) - { - return __clamp_01(__hadd(x, y)); - } - inline - __device__ - __half __hsub_sat(__half x, __half y) - { - return __clamp_01(__hsub(x, y)); - } - inline - __device__ - __half __hmul_sat(__half x, __half y) - { - return __clamp_01(__hmul(x, y)); - } - inline - __device__ - __half __hfma(__half x, __half y, __half z) - { - return __half_raw{__ocml_fma_f16( - static_cast<__half_raw>(x).data, - static_cast<__half_raw>(y).data, - static_cast<__half_raw>(z).data)}; - } - inline - __device__ - __half __hfma_sat(__half x, __half y, __half z) - { - return __clamp_01(__hfma(x, y, z)); - } - inline - __device__ - __half __hdiv(__half x, __half y) - { - return __half_raw{ - static_cast<__half_raw>(x).data / - static_cast<__half_raw>(y).data}; - } + // unsigned short -> half + inline + __device__ + __half __ushort2half_rn(unsigned short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ushort2half_rz(unsigned short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ushort2half_rd(unsigned short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ushort2half_ru(unsigned short x) + { + return __half_raw{static_cast<_Float16>(x)}; + } - inline - __device__ - __half2 __hadd2(__half2 x, __half2 y) - { - return __half2_raw{ - static_cast<__half2_raw>(x).data + - static_cast<__half2_raw>(y).data}; - } - inline - __device__ - __half2 __hsub2(__half2 x, __half2 y) - { - return __half2_raw{ - static_cast<__half2_raw>(x).data - - static_cast<__half2_raw>(y).data}; - } - inline - __device__ - __half2 __hmul2(__half2 x, __half2 y) - { - return __half2_raw{ - static_cast<__half2_raw>(x).data * - static_cast<__half2_raw>(y).data}; - } - inline - __device__ - __half2 __hadd2_sat(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(__hadd2(x, y)); - return __half2{ - __clamp_01(__half_raw{r.data.x}), - __clamp_01(__half_raw{r.data.y})}; - } - inline - __device__ - __half2 __hsub2_sat(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(__hsub2(x, y)); - return __half2{ - __clamp_01(__half_raw{r.data.x}), - __clamp_01(__half_raw{r.data.y})}; - } - inline - __device__ - __half2 __hmul2_sat(__half2 x, __half2 y) - { - auto r = static_cast<__half2_raw>(__hmul2(x, y)); - return __half2{ - __clamp_01(__half_raw{r.data.x}), - __clamp_01(__half_raw{r.data.y})}; - } - inline - __device__ - __half2 __hfma2(__half2 x, __half2 y, __half2 z) - { - return __half2_raw{__ocml_fma_2f16(x, y, z)}; - } - inline - __device__ - __half2 __hfma2_sat(__half2 x, __half2 y, __half2 z) - { - auto r = static_cast<__half2_raw>(__hfma2(x, y, z)); - return __half2{ - __clamp_01(__half_raw{r.data.x}), - __clamp_01(__half_raw{r.data.y})}; - } - inline - __device__ - __half2 __h2div(__half2 x, __half2 y) - { - return __half2_raw{ - static_cast<__half2_raw>(x).data / - static_cast<__half2_raw>(y).data}; - } + // half -> unsigned long long + inline + __device__ + unsigned long long __half2ull_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned long long __half2ull_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned long long __half2ull_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned long long __half2ull_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } - // Math functions - #if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__ - inline - __device__ - float amd_mixed_dot(__half2 a, __half2 b, float c, bool saturate) { - return __ockl_fdot2(static_cast<__half2_raw>(a).data, - static_cast<__half2_raw>(b).data, - c, saturate); - } - #endif - inline - __device__ - __half htrunc(__half x) - { - return __half_raw{ - __ocml_trunc_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hceil(__half x) - { - return __half_raw{ - __ocml_ceil_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hfloor(__half x) - { - return __half_raw{ - __ocml_floor_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hrint(__half x) - { - return __half_raw{ - __ocml_rint_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hsin(__half x) - { - return __half_raw{ - __ocml_sin_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hcos(__half x) - { - return __half_raw{ - __ocml_cos_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hexp(__half x) - { - return __half_raw{ - __ocml_exp_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hexp2(__half x) - { - return __half_raw{ - __ocml_exp2_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hexp10(__half x) - { - return __half_raw{ - __ocml_exp10_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hlog2(__half x) - { - return __half_raw{ - __ocml_log2_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hlog(__half x) - { - return __half_raw{ - __ocml_log_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hlog10(__half x) - { - return __half_raw{ - __ocml_log10_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hrcp(__half x) - { - return __half_raw{ - __llvm_amdgcn_rcp_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hrsqrt(__half x) - { - return __half_raw{ - __ocml_rsqrt_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - __half hsqrt(__half x) - { - return __half_raw{ - __ocml_sqrt_f16(static_cast<__half_raw>(x).data)}; - } - inline - __device__ - bool __hisinf(__half x) - { - return __ocml_isinf_f16(static_cast<__half_raw>(x).data); - } - inline - __device__ - bool __hisnan(__half x) - { - return __ocml_isnan_f16(static_cast<__half_raw>(x).data); - } - inline - __device__ - __half __hneg(__half x) - { - return __half_raw{-static_cast<__half_raw>(x).data}; - } + // unsigned long long -> half + inline + __device__ + __half __ull2half_rn(unsigned long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ull2half_rz(unsigned long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ull2half_rd(unsigned long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } + inline + __device__ + __half __ull2half_ru(unsigned long long x) + { + return __half_raw{static_cast<_Float16>(x)}; + } - inline - __device__ - __half2 h2trunc(__half2 x) - { - return __half2_raw{__ocml_trunc_2f16(x)}; - } - inline - __device__ - __half2 h2ceil(__half2 x) - { - return __half2_raw{__ocml_ceil_2f16(x)}; - } - inline - __device__ - __half2 h2floor(__half2 x) - { - return __half2_raw{__ocml_floor_2f16(x)}; - } - inline - __device__ - __half2 h2rint(__half2 x) - { - return __half2_raw{__ocml_rint_2f16(x)}; - } - inline - __device__ - __half2 h2sin(__half2 x) - { - return __half2_raw{__ocml_sin_2f16(x)}; - } - inline - __device__ - __half2 h2cos(__half2 x) - { - return __half2_raw{__ocml_cos_2f16(x)}; - } - inline - __device__ - __half2 h2exp(__half2 x) - { - return __half2_raw{__ocml_exp_2f16(x)}; - } - inline - __device__ - __half2 h2exp2(__half2 x) - { - return __half2_raw{__ocml_exp2_2f16(x)}; - } - inline - __device__ - __half2 h2exp10(__half2 x) - { - return __half2_raw{__ocml_exp10_2f16(x)}; - } - inline - __device__ - __half2 h2log2(__half2 x) - { - return __half2_raw{__ocml_log2_2f16(x)}; - } - inline - __device__ - __half2 h2log(__half2 x) { return __ocml_log_2f16(x); } - inline - __device__ - __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); } - inline - __device__ - __half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); } - inline - __device__ - __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); } - inline - __device__ - __half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); } - inline - __device__ - __half2 __hisinf2(__half2 x) - { - auto r = __ocml_isinf_2f16(x); - return __half2_raw{_Float16_2{ - static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}}; - } - inline - __device__ - __half2 __hisnan2(__half2 x) - { - auto r = __ocml_isnan_2f16(x); - return __half2_raw{_Float16_2{ - static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}}; - } - inline - __device__ - __half2 __hneg2(__half2 x) - { - return __half2_raw{-static_cast<__half2_raw>(x).data}; - } - } // Anonymous namespace. + // Load primitives + inline + __device__ + __half __ldg(const __half* ptr) { return *ptr; } + inline + __device__ + __half __ldcg(const __half* ptr) { return *ptr; } + inline + __device__ + __half __ldca(const __half* ptr) { return *ptr; } + inline + __device__ + __half __ldcs(const __half* ptr) { return *ptr; } + + inline + __device__ + __half2 __ldg(const __half2* ptr) { return *ptr; } + inline + __device__ + __half2 __ldcg(const __half2* ptr) { return *ptr; } + inline + __device__ + __half2 __ldca(const __half2* ptr) { return *ptr; } + inline + __device__ + __half2 __ldcs(const __half2* ptr) { return *ptr; } + + // Relations + inline + __device__ + bool __heq(__half x, __half y) + { + return static_cast<__half_raw>(x).data == + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hne(__half x, __half y) + { + return static_cast<__half_raw>(x).data != + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hle(__half x, __half y) + { + return static_cast<__half_raw>(x).data <= + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hge(__half x, __half y) + { + return static_cast<__half_raw>(x).data >= + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hlt(__half x, __half y) + { + return static_cast<__half_raw>(x).data < + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hgt(__half x, __half y) + { + return static_cast<__half_raw>(x).data > + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hequ(__half x, __half y) { return __heq(x, y); } + inline + __device__ + bool __hneu(__half x, __half y) { return __hne(x, y); } + inline + __device__ + bool __hleu(__half x, __half y) { return __hle(x, y); } + inline + __device__ + bool __hgeu(__half x, __half y) { return __hge(x, y); } + inline + __device__ + bool __hltu(__half x, __half y) { return __hlt(x, y); } + inline + __device__ + bool __hgtu(__half x, __half y) { return __hgt(x, y); } + + inline + __device__ + __half2 __heq2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data == + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __device__ + __half2 __hne2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data != + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __device__ + __half2 __hle2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data <= + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __device__ + __half2 __hge2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data >= + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __device__ + __half2 __hlt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data < + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __device__ + __half2 __hgt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data > + static_cast<__half2_raw>(y).data; + return __builtin_convertvector(-r, _Float16_2); + } + inline + __device__ + __half2 __hequ2(__half2 x, __half2 y) { return __heq2(x, y); } + inline + __device__ + __half2 __hneu2(__half2 x, __half2 y) { return __hne2(x, y); } + inline + __device__ + __half2 __hleu2(__half2 x, __half2 y) { return __hle2(x, y); } + inline + __device__ + __half2 __hgeu2(__half2 x, __half2 y) { return __hge2(x, y); } + inline + __device__ + __half2 __hltu2(__half2 x, __half2 y) { return __hlt2(x, y); } + inline + __device__ + __half2 __hgtu2(__half2 x, __half2 y) { return __hgt2(x, y); } + + inline + __device__ + bool __hbeq2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__heq2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __device__ + bool __hbne2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hne2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __device__ + bool __hble2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hle2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __device__ + bool __hbge2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hge2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __device__ + bool __hblt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hlt2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __device__ + bool __hbgt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hgt2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __device__ + bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); } + inline + __device__ + bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); } + inline + __device__ + bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); } + inline + __device__ + bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); } + inline + __device__ + bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); } + inline + __device__ + bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); } + + // Arithmetic + inline + __device__ + __half __clamp_01(__half x) + { + auto r = static_cast<__half_raw>(x); + + if (__hlt(x, __half_raw{0})) return __half_raw{0}; + if (__hlt(__half_raw{1}, x)) return __half_raw{1}; + return r; + } + + inline + __device__ + __half __hadd(__half x, __half y) + { + return __half_raw{ + static_cast<__half_raw>(x).data + + static_cast<__half_raw>(y).data}; + } + inline + __device__ + __half __hsub(__half x, __half y) + { + return __half_raw{ + static_cast<__half_raw>(x).data - + static_cast<__half_raw>(y).data}; + } + inline + __device__ + __half __hmul(__half x, __half y) + { + return __half_raw{ + static_cast<__half_raw>(x).data * + static_cast<__half_raw>(y).data}; + } + inline + __device__ + __half __hadd_sat(__half x, __half y) + { + return __clamp_01(__hadd(x, y)); + } + inline + __device__ + __half __hsub_sat(__half x, __half y) + { + return __clamp_01(__hsub(x, y)); + } + inline + __device__ + __half __hmul_sat(__half x, __half y) + { + return __clamp_01(__hmul(x, y)); + } + inline + __device__ + __half __hfma(__half x, __half y, __half z) + { + return __half_raw{__ocml_fma_f16( + static_cast<__half_raw>(x).data, + static_cast<__half_raw>(y).data, + static_cast<__half_raw>(z).data)}; + } + inline + __device__ + __half __hfma_sat(__half x, __half y, __half z) + { + return __clamp_01(__hfma(x, y, z)); + } + inline + __device__ + __half __hdiv(__half x, __half y) + { + return __half_raw{ + static_cast<__half_raw>(x).data / + static_cast<__half_raw>(y).data}; + } + + inline + __device__ + __half2 __hadd2(__half2 x, __half2 y) + { + return __half2_raw{ + static_cast<__half2_raw>(x).data + + static_cast<__half2_raw>(y).data}; + } + inline + __device__ + __half2 __hsub2(__half2 x, __half2 y) + { + return __half2_raw{ + static_cast<__half2_raw>(x).data - + static_cast<__half2_raw>(y).data}; + } + inline + __device__ + __half2 __hmul2(__half2 x, __half2 y) + { + return __half2_raw{ + static_cast<__half2_raw>(x).data * + static_cast<__half2_raw>(y).data}; + } + inline + __device__ + __half2 __hadd2_sat(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hadd2(x, y)); + return __half2{ + __clamp_01(__half_raw{r.data.x}), + __clamp_01(__half_raw{r.data.y})}; + } + inline + __device__ + __half2 __hsub2_sat(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hsub2(x, y)); + return __half2{ + __clamp_01(__half_raw{r.data.x}), + __clamp_01(__half_raw{r.data.y})}; + } + inline + __device__ + __half2 __hmul2_sat(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hmul2(x, y)); + return __half2{ + __clamp_01(__half_raw{r.data.x}), + __clamp_01(__half_raw{r.data.y})}; + } + inline + __device__ + __half2 __hfma2(__half2 x, __half2 y, __half2 z) + { + return __half2_raw{__ocml_fma_2f16(x, y, z)}; + } + inline + __device__ + __half2 __hfma2_sat(__half2 x, __half2 y, __half2 z) + { + auto r = static_cast<__half2_raw>(__hfma2(x, y, z)); + return __half2{ + __clamp_01(__half_raw{r.data.x}), + __clamp_01(__half_raw{r.data.y})}; + } + inline + __device__ + __half2 __h2div(__half2 x, __half2 y) + { + return __half2_raw{ + static_cast<__half2_raw>(x).data / + static_cast<__half2_raw>(y).data}; + } + + // Math functions + #if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__ + inline + __device__ + float amd_mixed_dot(__half2 a, __half2 b, float c, bool saturate) { + return __ockl_fdot2(static_cast<__half2_raw>(a).data, + static_cast<__half2_raw>(b).data, + c, saturate); + } + #endif + inline + __device__ + __half htrunc(__half x) + { + return __half_raw{ + __ocml_trunc_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hceil(__half x) + { + return __half_raw{ + __ocml_ceil_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hfloor(__half x) + { + return __half_raw{ + __ocml_floor_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hrint(__half x) + { + return __half_raw{ + __ocml_rint_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hsin(__half x) + { + return __half_raw{ + __ocml_sin_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hcos(__half x) + { + return __half_raw{ + __ocml_cos_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hexp(__half x) + { + return __half_raw{ + __ocml_exp_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hexp2(__half x) + { + return __half_raw{ + __ocml_exp2_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hexp10(__half x) + { + return __half_raw{ + __ocml_exp10_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hlog2(__half x) + { + return __half_raw{ + __ocml_log2_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hlog(__half x) + { + return __half_raw{ + __ocml_log_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hlog10(__half x) + { + return __half_raw{ + __ocml_log10_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hrcp(__half x) + { + return __half_raw{ + __llvm_amdgcn_rcp_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hrsqrt(__half x) + { + return __half_raw{ + __ocml_rsqrt_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + __half hsqrt(__half x) + { + return __half_raw{ + __ocml_sqrt_f16(static_cast<__half_raw>(x).data)}; + } + inline + __device__ + bool __hisinf(__half x) + { + return __ocml_isinf_f16(static_cast<__half_raw>(x).data); + } + inline + __device__ + bool __hisnan(__half x) + { + return __ocml_isnan_f16(static_cast<__half_raw>(x).data); + } + inline + __device__ + __half __hneg(__half x) + { + return __half_raw{-static_cast<__half_raw>(x).data}; + } + + inline + __device__ + __half2 h2trunc(__half2 x) + { + return __half2_raw{__ocml_trunc_2f16(x)}; + } + inline + __device__ + __half2 h2ceil(__half2 x) + { + return __half2_raw{__ocml_ceil_2f16(x)}; + } + inline + __device__ + __half2 h2floor(__half2 x) + { + return __half2_raw{__ocml_floor_2f16(x)}; + } + inline + __device__ + __half2 h2rint(__half2 x) + { + return __half2_raw{__ocml_rint_2f16(x)}; + } + inline + __device__ + __half2 h2sin(__half2 x) + { + return __half2_raw{__ocml_sin_2f16(x)}; + } + inline + __device__ + __half2 h2cos(__half2 x) + { + return __half2_raw{__ocml_cos_2f16(x)}; + } + inline + __device__ + __half2 h2exp(__half2 x) + { + return __half2_raw{__ocml_exp_2f16(x)}; + } + inline + __device__ + __half2 h2exp2(__half2 x) + { + return __half2_raw{__ocml_exp2_2f16(x)}; + } + inline + __device__ + __half2 h2exp10(__half2 x) + { + return __half2_raw{__ocml_exp10_2f16(x)}; + } + inline + __device__ + __half2 h2log2(__half2 x) + { + return __half2_raw{__ocml_log2_2f16(x)}; + } + inline + __device__ + __half2 h2log(__half2 x) { return __ocml_log_2f16(x); } + inline + __device__ + __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); } + inline + __device__ + __half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); } + inline + __device__ + __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); } + inline + __device__ + __half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); } + inline + __device__ + __half2 __hisinf2(__half2 x) + { + auto r = __ocml_isinf_2f16(x); + return __half2_raw{_Float16_2{ + static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}}; + } + inline + __device__ + __half2 __hisnan2(__half2 x) + { + auto r = __ocml_isnan_2f16(x); + return __half2_raw{_Float16_2{ + static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}}; + } + inline + __device__ + __half2 __hneg2(__half2 x) + { + return __half2_raw{-static_cast<__half2_raw>(x).data}; + } #if !defined(HIP_NO_HALF) using half = __half; diff --git a/include/hip/hcc_detail/hip_fp16_gcc.h b/include/hip/hcc_detail/hip_fp16_gcc.h index 9b31f9e3ce..3d8752e619 100644 --- a/include/hip/hcc_detail/hip_fp16_gcc.h +++ b/include/hip/hcc_detail/hip_fp16_gcc.h @@ -95,160 +95,157 @@ struct __half2_raw { }; // END STRUCT __HALF2 - namespace + inline + unsigned short __internal_float2half( + float flt, unsigned int& sgn, unsigned int& rem) { - inline - unsigned short __internal_float2half( - float flt, unsigned int& sgn, unsigned int& rem) - { - unsigned int x{}; - std::memcpy(&x, &flt, sizeof(flt)); + unsigned int x{}; + std::memcpy(&x, &flt, sizeof(flt)); - unsigned int u = (x & 0x7fffffffU); - sgn = ((x >> 16) & 0x8000U); + unsigned int u = (x & 0x7fffffffU); + sgn = ((x >> 16) & 0x8000U); - // NaN/+Inf/-Inf - if (u >= 0x7f800000U) { - rem = 0; - return static_cast( - (u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU); + // NaN/+Inf/-Inf + if (u >= 0x7f800000U) { + rem = 0; + return static_cast( + (u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU); + } + // Overflows + if (u > 0x477fefffU) { + rem = 0x80000000U; + return static_cast(sgn | 0x7bffU); + } + // Normal numbers + if (u >= 0x38800000U) { + rem = u << 19; + u -= 0x38000000U; + return static_cast(sgn | (u >> 13)); + } + // +0/-0 + if (u < 0x33000001U) { + rem = u; + return static_cast(sgn); + } + // Denormal numbers + unsigned int exponent = u >> 23; + unsigned int mantissa = (u & 0x7fffffU); + unsigned int shift = 0x7eU - exponent; + mantissa |= 0x800000U; + rem = mantissa << (32 - shift); + return static_cast(sgn | (mantissa >> shift)); + } + + inline + __half __float2half(float x) + { + __half_raw r; + unsigned int sgn{}; + unsigned int rem{}; + r.x = __internal_float2half(x, sgn, rem); + if (rem > 0x80000000U || (rem == 0x80000000U && (r.x & 0x1))) ++r.x; + + return r; + } + + inline + __half __float2half_rn(float x) { return __float2half(x); } + + inline + __half __float2half_rz(float x) + { + __half_raw r; + unsigned int sgn{}; + unsigned int rem{}; + r.x = __internal_float2half(x, sgn, rem); + + return r; + } + + inline + __half __float2half_rd(float x) + { + __half_raw r; + unsigned int sgn{}; + unsigned int rem{}; + r.x = __internal_float2half(x, sgn, rem); + if (rem && sgn) ++r.x; + + return r; + } + + inline + __half __float2half_ru(float x) + { + __half_raw r; + unsigned int sgn{}; + unsigned int rem{}; + r.x = __internal_float2half(x, sgn, rem); + if (rem && !sgn) ++r.x; + + return r; + } + + inline + __half2 __float2half2_rn(float x) + { + return __half2{__float2half_rn(x), __float2half_rn(x)}; + } + + inline + __half2 __floats2half2_rn(float x, float y) + { + return __half2{__float2half_rn(x), __float2half_rn(y)}; + } + + inline + float __internal_half2float(unsigned short x) + { + unsigned int sign = ((x >> 15) & 1); + unsigned int exponent = ((x >> 10) & 0x1f); + unsigned int mantissa = ((x & 0x3ff) << 13); + + if (exponent == 0x1fU) { /* NaN or Inf */ + mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0); + exponent = 0xffU; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71U; + do { + msb = (mantissa & 0x400000U); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffffU; /* 1.mantissa is implicit */ } - // Overflows - if (u > 0x477fefffU) { - rem = 0x80000000U; - return static_cast(sgn | 0x7bffU); - } - // Normal numbers - if (u >= 0x38800000U) { - rem = u << 19; - u -= 0x38000000U; - return static_cast(sgn | (u >> 13)); - } - // +0/-0 - if (u < 0x33000001U) { - rem = u; - return static_cast(sgn); - } - // Denormal numbers - unsigned int exponent = u >> 23; - unsigned int mantissa = (u & 0x7fffffU); - unsigned int shift = 0x7eU - exponent; - mantissa |= 0x800000U; - rem = mantissa << (32 - shift); - return static_cast(sgn | (mantissa >> shift)); + } else { + exponent += 0x70U; } + unsigned int u = ((sign << 31) | (exponent << 23) | mantissa); + float f; + std::memcpy(&f, &u, sizeof(u)); - inline - __half __float2half(float x) - { - __half_raw r; - unsigned int sgn{}; - unsigned int rem{}; - r.x = __internal_float2half(x, sgn, rem); - if (rem > 0x80000000U || (rem == 0x80000000U && (r.x & 0x1))) ++r.x; + return f; + } - return r; - } + inline + float __half2float(__half x) + { + return __internal_half2float(static_cast<__half_raw>(x).x); + } - inline - __half __float2half_rn(float x) { return __float2half(x); } + inline + float __low2float(__half2 x) + { + return __internal_half2float(static_cast<__half2_raw>(x).x); + } - inline - __half __float2half_rz(float x) - { - __half_raw r; - unsigned int sgn{}; - unsigned int rem{}; - r.x = __internal_float2half(x, sgn, rem); - - return r; - } - - inline - __half __float2half_rd(float x) - { - __half_raw r; - unsigned int sgn{}; - unsigned int rem{}; - r.x = __internal_float2half(x, sgn, rem); - if (rem && sgn) ++r.x; - - return r; - } - - inline - __half __float2half_ru(float x) - { - __half_raw r; - unsigned int sgn{}; - unsigned int rem{}; - r.x = __internal_float2half(x, sgn, rem); - if (rem && !sgn) ++r.x; - - return r; - } - - inline - __half2 __float2half2_rn(float x) - { - return __half2{__float2half_rn(x), __float2half_rn(x)}; - } - - inline - __half2 __floats2half2_rn(float x, float y) - { - return __half2{__float2half_rn(x), __float2half_rn(y)}; - } - - inline - float __internal_half2float(unsigned short x) - { - unsigned int sign = ((x >> 15) & 1); - unsigned int exponent = ((x >> 10) & 0x1f); - unsigned int mantissa = ((x & 0x3ff) << 13); - - if (exponent == 0x1fU) { /* NaN or Inf */ - mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0); - exponent = 0xffU; - } else if (!exponent) { /* Denorm or Zero */ - if (mantissa) { - unsigned int msb; - exponent = 0x71U; - do { - msb = (mantissa & 0x400000U); - mantissa <<= 1; /* normalize */ - --exponent; - } while (!msb); - mantissa &= 0x7fffffU; /* 1.mantissa is implicit */ - } - } else { - exponent += 0x70U; - } - unsigned int u = ((sign << 31) | (exponent << 23) | mantissa); - float f; - memcpy(&f, &u, sizeof(u)); - - return f; - } - - inline - float __half2float(__half x) - { - return __internal_half2float(static_cast<__half_raw>(x).x); - } - - inline - float __low2float(__half2 x) - { - return __internal_half2float(static_cast<__half2_raw>(x).x); - } - - inline - float __high2float(__half2 x) - { - return __internal_half2float(static_cast<__half2_raw>(x).y); - } - } // Anonymous namespace. + inline + float __high2float(__half2 x) + { + return __internal_half2float(static_cast<__half2_raw>(x).y); + } #if !defined(HIP_NO_HALF) using half = __half; diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index fbf96d3d0b..fbba1419fa 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -308,17 +308,17 @@ static constexpr Coordinates threadIdx{}; extern "C" __device__ void* __hip_malloc(size_t); extern "C" __device__ void* __hip_free(void* ptr); -static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); } -static inline __device__ void* free(void* ptr) { return __hip_free(ptr); } +inline __device__ void* malloc(size_t size) { return __hip_malloc(size); } +inline __device__ void* free(void* ptr) { return __hip_free(ptr); } #if defined(__HCC_ACCELERATOR__) && defined(HC_FEATURE_PRINTF) template -static inline __device__ void printf(const char* format, All... all) { +inline __device__ void printf(const char* format, All... all) { hc::printf(format, all...); } #elif defined(__HCC_ACCELERATOR__) || __HIP__ template -static inline __device__ void printf(const char* format, All... all) {} +inline __device__ void printf(const char* format, All... all) {} #endif #endif //__HCC_OR_HIP_CLANG__ diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h index 2079bb7e44..e986e5ab3c 100644 --- a/include/hip/hcc_detail/hip_vector_types.h +++ b/include/hip/hcc_detail/hip_vector_types.h @@ -1107,38 +1107,38 @@ __MAKE_VECTOR_TYPE__(double, double); #ifdef __cplusplus #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \ - static inline __device__ __host__ \ + inline __device__ __host__ \ type make_##type(comp x) { type r{x}; return r; } #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \ - static inline __device__ __host__ \ + inline __device__ __host__ \ type make_##type(comp x, comp y) { type r{x, y}; return r; } #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \ - static inline __device__ __host__ \ + inline __device__ __host__ \ type make_##type(comp x, comp y, comp z) { type r{x, y, z}; return r; } #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \ - static inline __device__ __host__ \ + inline __device__ __host__ \ type make_##type(comp x, comp y, comp z, comp w) { \ type r{x, y, z, w}; \ return r; \ } #else #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \ - static inline __device__ __host__ \ + inline __device__ __host__ \ type make_##type(comp x) { type r; r.x =x; return r; } #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \ - static inline __device__ __host__ \ + inline __device__ __host__ \ type make_##type(comp x, comp y) { type r; r.x=x; r.y=y; return r; } #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \ - static inline __device__ __host__ \ + inline __device__ __host__ \ type make_##type(comp x, comp y, comp z) { type r; r.x=x; r.y=y; r.z=z; return r; } #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \ - static inline __device__ __host__ \ + inline __device__ __host__ \ type make_##type(comp x, comp y, comp z, comp w) { \ type r; r.x=x; r.y=y; r.z=z; r.w=w; \ return r; \ diff --git a/include/hip/hcc_detail/texture_functions.h b/include/hip/hcc_detail/texture_functions.h index bb0a9e7223..d2ef5d66ef 100644 --- a/include/hip/hcc_detail/texture_functions.h +++ b/include/hip/hcc_detail/texture_functions.h @@ -46,7 +46,7 @@ union TData { __hip_uint4_vector_value_type u; }; -#define __TEXTURE_FUNCTIONS_DECL__ static inline __device__ +#define __TEXTURE_FUNCTIONS_DECL__ inline __device__ #if (__hcc_workweek__ >= 18114) || __clang__ diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index b85051d64c..135b06efc2 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -358,21 +358,21 @@ enum hipComputeMode { */ #if defined(__cplusplus) && !defined(__HIP_DISABLE_CPP_FUNCTIONS__) template -static inline hipError_t hipMalloc(T** devPtr, size_t size) { +inline hipError_t hipMalloc(T** devPtr, size_t size) { return hipMalloc((void**)devPtr, size); } // Provide an override to automatically typecast the pointer type from void**, and also provide a // default for the flags. template -static inline hipError_t hipHostMalloc(T** ptr, size_t size, - unsigned int flags = hipHostMallocDefault) { +inline hipError_t hipHostMalloc(T** ptr, size_t size, + unsigned int flags = hipHostMallocDefault) { return hipHostMalloc((void**)ptr, size, flags); } template -static inline hipError_t hipMallocManaged(T** devPtr, size_t size, - unsigned int flags = hipMemAttachGlobal) { +inline hipError_t hipMallocManaged(T** devPtr, size_t size, + unsigned int flags = hipMemAttachGlobal) { return hipMallocManaged((void**)devPtr, size, flags); } #endif diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 8ee47eba4a..08b30196b8 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -97,13 +97,13 @@ class TidInfo { TidInfo(); int tid() const { return _shortTid; }; - pid_t pid() const { return _pid; }; + pid_t pid() const { return _pid; }; uint64_t incApiSeqNum() { return ++_apiSeqNum; }; uint64_t apiSeqNum() const { return _apiSeqNum; }; private: int _shortTid; - pid_t _pid; + pid_t _pid; // monotonically increasing API sequence number for this threa. uint64_t _apiSeqNum; @@ -280,7 +280,7 @@ static const DbName dbName[] = { #endif -static inline uint64_t getTicks() { return hc::get_system_ticks(); } +inline uint64_t getTicks() { return hc::get_system_ticks(); } //--- extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr); @@ -798,7 +798,7 @@ class ihipDevice_t { // TODO - report this through device properties, base on HCC API call. int _isLargeBar; - + // Node id reported by kfd for this device uint32_t _driver_node_id; @@ -1047,7 +1047,7 @@ struct mg_info { // setDevice first. // - hipDeviceReset destroys the primary context for device? // - Then context is created again for next usage. -static inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) { +inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) { // Per-thread initialization of the TLS: if ((tls->defaultCtx == nullptr) && (g_deviceCnt > 0)) { tls->defaultCtx = ihipGetPrimaryCtx(0);