static inline in a header, just like excess sugar in a diet, causes bloat (#1692)

2019-12-23 15:39:38 +02:00
@@ -32,23 +32,23 @@ THE SOFTWARE.
 HIP_PUBLIC_API
 hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);

-static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
+inline hipChannelFormatDesc hipCreateChannelDescHalf() {
    int e = (int)sizeof(unsigned short) * 8;
    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }

-static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
+inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
    int e = (int)sizeof(unsigned short) * 8;
    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }

-static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
+inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
    int e = (int)sizeof(unsigned short) * 8;
    return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }

 template <typename T>
-static inline hipChannelFormatDesc hipCreateChannelDesc() {
+inline hipChannelFormatDesc hipCreateChannelDesc() {
    return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
 }

@@ -38,69 +38,69 @@ Integer Intrinsics
 */

 // integer intrinsic function __poc __clz __ffs __brev
-__device__ static inline unsigned int __popc(unsigned int input) {
+__device__ inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
 }
-__device__ static inline unsigned int __popcll(unsigned long long int input) {
+__device__ inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
 }

-__device__ static inline int __clz(int input) {
+__device__ inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
 }

-__device__ static inline int __clzll(long long int input) {
+__device__ inline int __clzll(long long int input) {
    return __ockl_clz_u64((ullong)input);
 }

-__device__ static inline unsigned int __ffs(unsigned int input) {
+__device__ inline unsigned int __ffs(unsigned int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
 }

-__device__ static inline unsigned int __ffsll(unsigned long long int input) {
+__device__ inline unsigned int __ffsll(unsigned long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
 }

-__device__ static inline unsigned int __ffs(int input) {
+__device__ inline unsigned int __ffs(int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
 }

-__device__ static inline unsigned int __ffsll(long long int input) {
+__device__ inline unsigned int __ffsll(long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
 }

-__device__ static inline unsigned int __brev(unsigned int input) {
+__device__ inline unsigned int __brev(unsigned int input) {
    return __llvm_bitrev_b32(input);
 }

-__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
+__device__ inline unsigned long long int __brevll(unsigned long long int input) {
    return __llvm_bitrev_b64(input);
 }

-__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
+__device__ inline unsigned int __lastbit_u32_u64(uint64_t input) {
    return input == 0 ? -1 : __builtin_ctzl(input);
 }

-__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
+__device__ inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
 }

-__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
+__device__ inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
 }

-__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
+__device__ inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    uint32_t offset = src2 & 31;
    uint32_t width = src3 & 31;
    uint32_t mask = (1 << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
 }

-__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
+__device__ inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
@@ -136,7 +136,7 @@ struct uchar2Holder {
 } __attribute__((aligned(8)));

 __device__
-static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
+inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;
    struct ucharHolder cHoldKey;
    struct ucharHolder cHoldOut;
@@ -150,18 +150,18 @@ static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned
    return cHoldOut.ui;
 }

-__device__ static inline unsigned int __hadd(int x, int y) {
+__device__ inline unsigned int __hadd(int x, int y) {
    int z = x + y;
    int sign = z & 0x8000000;
    int value = z & 0x7FFFFFFF;
    return ((value) >> 1 || sign);
 }

-__device__ static inline int __mul24(int x, int y) {
+__device__ inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
 }

-__device__ static inline long long __mul64hi(long long int x, long long int y) {
+__device__ inline long long __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;
    long x1 = x >> 32;
    ulong y0 = (ulong)y & 0xffffffffUL;
@@ -174,28 +174,28 @@ __device__ static inline long long __mul64hi(long long int x, long long int y) {
    return x1*y1 + z2 + (z1 >> 32);
 }

-__device__ static inline int __mulhi(int x, int y) {
+__device__ inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
 }

-__device__ static inline int __rhadd(int x, int y) {
+__device__ inline int __rhadd(int x, int y) {
    int z = x + y + 1;
    int sign = z & 0x8000000;
    int value = z & 0x7FFFFFFF;
    return ((value) >> 1 || sign);
 }
-__device__ static inline unsigned int __sad(int x, int y, int z) {
+__device__ inline unsigned int __sad(int x, int y, int z) {
    return x > y ? x - y + z : y - x + z;
 }
-__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
+__device__ inline unsigned int __uhadd(unsigned int x, unsigned int y) {
    return (x + y) >> 1;
 }
-__device__ static inline int __umul24(unsigned int x, unsigned int y) {
+__device__ inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
 }

 __device__
-static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
+inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    ulong x0 = x & 0xffffffffUL;
    ulong x1 = x >> 32;
    ulong y0 = y & 0xffffffffUL;
@@ -208,41 +208,41 @@ static inline unsigned long long __umul64hi(unsigned long long int x, unsigned l
    return x1*y1 + z2 + (z1 >> 32);
 }

-__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
+__device__ inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
 }
-__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
+__device__ inline unsigned int __urhadd(unsigned int x, unsigned int y) {
    return (x + y + 1) >> 1;
 }
-__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
+__device__ inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sad_u32(x, y, z);
 }

-__device__ static inline unsigned int __lane_id() { return  __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
+__device__ inline unsigned int __lane_id() { return  __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }

 /*
 HIP specific device functions
 */

-__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
+__device__ inline unsigned __hip_ds_bpermute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.u;
 }

-__device__ static inline float __hip_ds_bpermutef(int index, float src) {
+__device__ inline float __hip_ds_bpermutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
    tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
    return tmp.f;
 }

-__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
+__device__ inline unsigned __hip_ds_permute(int index, unsigned src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
    return tmp.u;
 }

-__device__ static inline float __hip_ds_permutef(int index, float src) {
+__device__ inline float __hip_ds_permutef(int index, float src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
    tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
    return tmp.u;
@@ -252,7 +252,7 @@ __device__ static inline float __hip_ds_permutef(int index, float src) {
 #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))

 template <int pattern>
-__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
+__device__ inline unsigned __hip_ds_swizzle_N(unsigned int src) {
    union { int i; unsigned u; float f; } tmp; tmp.u = src;
 #if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
@@ -263,7 +263,7 @@ __device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
 }

 template <int pattern>
-__device__ static inline float __hip_ds_swizzlef_N(float src) {
+__device__ inline float __hip_ds_swizzlef_N(float src) {
    union { int i; unsigned u; float f; } tmp; tmp.f = src;
 #if defined(__HCC__)
    tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
@@ -277,7 +277,7 @@ __device__ static inline float __hip_ds_swizzlef_N(float src) {
  __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))

 template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
-__device__ static inline int __hip_move_dpp_N(int src) {
+__device__ inline int __hip_move_dpp_N(int src) {
    return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                  bound_ctrl);
 }
@@ -434,7 +434,7 @@ double __shfl_xor(double var, int lane_mask, int width = warpSize) {
 #define MASK1 0x00ff00ff
 #define MASK2 0xff00ff00

-__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
+__device__ inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
@@ -445,7 +445,7 @@ __device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    return out;
 }

-__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
+__device__ inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
@@ -456,7 +456,7 @@ __device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    return out;
 }

-__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
+__device__ inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
@@ -472,12 +472,12 @@ __device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
 * TODO: Conversion functions are not correct, need to fix when BE is ready
 */

-__device__ static inline float __double2float_rd(double x) { return (double)x; }
-__device__ static inline float __double2float_rn(double x) { return (double)x; }
-__device__ static inline float __double2float_ru(double x) { return (double)x; }
-__device__ static inline float __double2float_rz(double x) { return (double)x; }
+__device__ inline float __double2float_rd(double x) { return (double)x; }
+__device__ inline float __double2float_rn(double x) { return (double)x; }
+__device__ inline float __double2float_ru(double x) { return (double)x; }
+__device__ inline float __double2float_rz(double x) { return (double)x; }

-__device__ static inline int __double2hiint(double x) {
+__device__ inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
@@ -485,7 +485,7 @@ __device__ static inline int __double2hiint(double x) {

    return tmp[1];
 }
-__device__ static inline int __double2loint(double x) {
+__device__ inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
@@ -494,35 +494,35 @@ __device__ static inline int __double2loint(double x) {
    return tmp[0];
 }

-__device__ static inline int __double2int_rd(double x) { return (int)x; }
-__device__ static inline int __double2int_rn(double x) { return (int)x; }
-__device__ static inline int __double2int_ru(double x) { return (int)x; }
-__device__ static inline int __double2int_rz(double x) { return (int)x; }
+__device__ inline int __double2int_rd(double x) { return (int)x; }
+__device__ inline int __double2int_rn(double x) { return (int)x; }
+__device__ inline int __double2int_ru(double x) { return (int)x; }
+__device__ inline int __double2int_rz(double x) { return (int)x; }

-__device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
-__device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
-__device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
-__device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }
+__device__ inline long long int __double2ll_rd(double x) { return (long long int)x; }
+__device__ inline long long int __double2ll_rn(double x) { return (long long int)x; }
+__device__ inline long long int __double2ll_ru(double x) { return (long long int)x; }
+__device__ inline long long int __double2ll_rz(double x) { return (long long int)x; }

-__device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
-__device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
-__device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
-__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
+__device__ inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
+__device__ inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
+__device__ inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
+__device__ inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

-__device__ static inline unsigned long long int __double2ull_rd(double x) {
+__device__ inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)x;
 }
-__device__ static inline unsigned long long int __double2ull_rn(double x) {
+__device__ inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)x;
 }
-__device__ static inline unsigned long long int __double2ull_ru(double x) {
+__device__ inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)x;
 }
-__device__ static inline unsigned long long int __double2ull_rz(double x) {
+__device__ inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
 }

-__device__ static inline long long int __double_as_longlong(double x) {
+__device__ inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
@@ -545,35 +545,35 @@ CUDA implements half as unsigned short whereas, HIP doesn't.

 */

-__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
-__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
-__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
-__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
+__device__ inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
+__device__ inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
+__device__ inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
+__device__ inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

-__device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
-__device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
-__device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
-__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
+__device__ inline long long int __float2ll_rd(float x) { return (long long int)x; }
+__device__ inline long long int __float2ll_rn(float x) { return (long long int)x; }
+__device__ inline long long int __float2ll_ru(float x) { return (long long int)x; }
+__device__ inline long long int __float2ll_rz(float x) { return (long long int)x; }

-__device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
-__device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
-__device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
-__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
+__device__ inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
+__device__ inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
+__device__ inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
+__device__ inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

-__device__ static inline unsigned long long int __float2ull_rd(float x) {
+__device__ inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)x;
 }
-__device__ static inline unsigned long long int __float2ull_rn(float x) {
+__device__ inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)x;
 }
-__device__ static inline unsigned long long int __float2ull_ru(float x) {
+__device__ inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)x;
 }
-__device__ static inline unsigned long long int __float2ull_rz(float x) {
+__device__ inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
 }

-__device__ static inline int __float_as_int(float x) {
+__device__ inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
@@ -582,7 +582,7 @@ __device__ static inline int __float_as_int(float x) {
    return tmp;
 }

-__device__ static inline unsigned int __float_as_uint(float x) {
+__device__ inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
@@ -591,7 +591,7 @@ __device__ static inline unsigned int __float_as_uint(float x) {
    return tmp;
 }

-__device__ static inline double __hiloint2double(int hi, int lo) {
+__device__ inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
@@ -601,14 +601,14 @@ __device__ static inline double __hiloint2double(int hi, int lo) {
    return tmp1;
 }

-__device__ static inline double __int2double_rn(int x) { return (double)x; }
+__device__ inline double __int2double_rn(int x) { return (double)x; }

-__device__ static inline float __int2float_rd(int x) { return (float)x; }
-__device__ static inline float __int2float_rn(int x) { return (float)x; }
-__device__ static inline float __int2float_ru(int x) { return (float)x; }
-__device__ static inline float __int2float_rz(int x) { return (float)x; }
+__device__ inline float __int2float_rd(int x) { return (float)x; }
+__device__ inline float __int2float_rn(int x) { return (float)x; }
+__device__ inline float __int2float_ru(int x) { return (float)x; }
+__device__ inline float __int2float_rz(int x) { return (float)x; }

-__device__ static inline float __int_as_float(int x) {
+__device__ inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
@@ -617,17 +617,17 @@ __device__ static inline float __int_as_float(int x) {
    return tmp;
 }

-__device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
-__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
-__device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
-__device__ static inline double __ll2double_rz(long long int x) { return (double)x; }
+__device__ inline double __ll2double_rd(long long int x) { return (double)x; }
+__device__ inline double __ll2double_rn(long long int x) { return (double)x; }
+__device__ inline double __ll2double_ru(long long int x) { return (double)x; }
+__device__ inline double __ll2double_rz(long long int x) { return (double)x; }

-__device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
-__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
-__device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
-__device__ static inline float __ll2float_rz(long long int x) { return (float)x; }
+__device__ inline float __ll2float_rd(long long int x) { return (float)x; }
+__device__ inline float __ll2float_rn(long long int x) { return (float)x; }
+__device__ inline float __ll2float_ru(long long int x) { return (float)x; }
+__device__ inline float __ll2float_rz(long long int x) { return (float)x; }

-__device__ static inline double __longlong_as_double(long long int x) {
+__device__ inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
@@ -636,14 +636,14 @@ __device__ static inline double __longlong_as_double(long long int x) {
    return tmp;
 }

-__device__ static inline double __uint2double_rn(int x) { return (double)x; }
+__device__ inline double __uint2double_rn(int x) { return (double)x; }

-__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
-__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
-__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
-__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }
+__device__ inline float __uint2float_rd(unsigned int x) { return (float)x; }
+__device__ inline float __uint2float_rn(unsigned int x) { return (float)x; }
+__device__ inline float __uint2float_ru(unsigned int x) { return (float)x; }
+__device__ inline float __uint2float_rz(unsigned int x) { return (float)x; }

-__device__ static inline float __uint_as_float(unsigned int x) {
+__device__ inline float __uint_as_float(unsigned int x) {
   static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
@@ -652,15 +652,15 @@ __device__ static inline float __uint_as_float(unsigned int x) {
    return tmp;
 }

-__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
-__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
-__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
-__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }
+__device__ inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
+__device__ inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
+__device__ inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
+__device__ inline double __ull2double_rz(unsigned long long int x) { return (double)x; }

-__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
-__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
-__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
-__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
+__device__ inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
+__device__ inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
+__device__ inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
+__device__ inline float __ull2float_rz(unsigned long long int x) { return (float)x; }

 #if defined(__HCC__)
 #define __HCC_OR_HIP_CLANG__ 1
@@ -819,7 +819,7 @@ typedef enum __memory_order

 __device__
 inline
-static void
+void
 __atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
 {
    // We're tying global-happens-before and local-happens-before together as does HSA
@@ -871,21 +871,21 @@ __atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __mem
 // Memory Fence Functions
 __device__
 inline
-static void __threadfence()
+void __threadfence()
 {
  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
 }

 __device__
 inline
-static void __threadfence_block()
+void __threadfence_block()
 {
  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
 }

 __device__
 inline
-static void __threadfence_system()
+void __threadfence_system()
 {
  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
 }
@@ -945,7 +945,7 @@ void __assertfail(const char * __assertion,

 __device__
 inline
-static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
+void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
 {
    if (flags) {
        __atomic_work_item_fence(flags, __memory_order_release, scope);
@@ -958,7 +958,7 @@ static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scop

 __device__
 inline
-static void __barrier(int n)
+void __barrier(int n)
 {
  __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
 }
@@ -1037,7 +1037,7 @@ unsigned __smid(void)


 // loop unrolling
-static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
+inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

@@ -1063,7 +1063,7 @@ static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_
    return dst;
 }

-static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
+inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);

    while (size >= 4u) {
@@ -1086,11 +1086,11 @@ static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, siz

    return dst;
 }
-static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
+inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
 }

-static inline __device__ void* memset(void* ptr, int val, size_t size) {
+inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
 }
@@ -287,8 +287,8 @@ typedef struct hipMemcpy3DParms {
    size_t srcZ;
 }hipMemcpy3DParms;

-static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
-                                                          size_t ysz) {
+inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
+                                               size_t ysz) {
    struct hipPitchedPtr s;

    s.ptr = d;
@@ -299,7 +299,7 @@ static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t
    return s;
 }

-static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
+inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
    struct hipPos p;

    p.x = x;
@@ -309,7 +309,7 @@ static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
    return p;
 }

-static inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) {
+inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) {
    struct hipExtent e;

    e.width = w;
@@ -36,7 +36,7 @@ THE SOFTWARE.

 #if __cplusplus
 #define COMPLEX_NEG_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator-(const type& op) {                             \
+    __device__ __host__ inline type operator-(const type& op) {                             \
        type ret;                                                                                  \
        ret.x = -op.x;                                                                             \
        ret.y = -op.y;                                                                             \
@@ -44,17 +44,17 @@ THE SOFTWARE.
    }

 #define COMPLEX_EQ_OP_OVERLOAD(type)                                                               \
-    __device__ __host__ static inline bool operator==(const type& lhs, const type& rhs) {          \
+    __device__ __host__ inline bool operator==(const type& lhs, const type& rhs) {          \
        return lhs.x == rhs.x && lhs.y == rhs.y;                                                   \
    }

 #define COMPLEX_NE_OP_OVERLOAD(type)                                                               \
-    __device__ __host__ static inline bool operator!=(const type& lhs, const type& rhs) {          \
+    __device__ __host__ inline bool operator!=(const type& lhs, const type& rhs) {          \
        return !(lhs == rhs);                                                                      \
    }

 #define COMPLEX_ADD_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator+(const type& lhs, const type& rhs) {           \
+    __device__ __host__ inline type operator+(const type& lhs, const type& rhs) {           \
        type ret;                                                                                  \
        ret.x = lhs.x + rhs.x;                                                                     \
        ret.y = lhs.y + rhs.y;                                                                     \
@@ -62,7 +62,7 @@ THE SOFTWARE.
    }

 #define COMPLEX_SUB_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator-(const type& lhs, const type& rhs) {           \
+    __device__ __host__ inline type operator-(const type& lhs, const type& rhs) {           \
        type ret;                                                                                  \
        ret.x = lhs.x - rhs.x;                                                                     \
        ret.y = lhs.y - rhs.y;                                                                     \
@@ -70,7 +70,7 @@ THE SOFTWARE.
    }

 #define COMPLEX_MUL_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator*(const type& lhs, const type& rhs) {           \
+    __device__ __host__ inline type operator*(const type& lhs, const type& rhs) {           \
        type ret;                                                                                  \
        ret.x = lhs.x * rhs.x - lhs.y * rhs.y;                                                     \
        ret.y = lhs.x * rhs.y + lhs.y * rhs.x;                                                     \
@@ -78,7 +78,7 @@ THE SOFTWARE.
    }

 #define COMPLEX_DIV_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator/(const type& lhs, const type& rhs) {           \
+    __device__ __host__ inline type operator/(const type& lhs, const type& rhs) {           \
        type ret;                                                                                  \
        ret.x = (lhs.x * rhs.x + lhs.y * rhs.y);                                                   \
        ret.y = (rhs.x * lhs.y - lhs.x * rhs.y);                                                   \
@@ -88,33 +88,33 @@ THE SOFTWARE.
    }

 #define COMPLEX_ADD_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator+=(type& lhs, const type& rhs) {               \
+    __device__ __host__ inline type& operator+=(type& lhs, const type& rhs) {               \
        lhs.x += rhs.x;                                                                            \
        lhs.y += rhs.y;                                                                            \
        return lhs;                                                                                \
    }

 #define COMPLEX_SUB_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator-=(type& lhs, const type& rhs) {               \
+    __device__ __host__ inline type& operator-=(type& lhs, const type& rhs) {               \
        lhs.x -= rhs.x;                                                                            \
        lhs.y -= rhs.y;                                                                            \
        return lhs;                                                                                \
    }

 #define COMPLEX_MUL_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator*=(type& lhs, const type& rhs) {               \
+    __device__ __host__ inline type& operator*=(type& lhs, const type& rhs) {               \
        lhs = lhs * rhs;                                                                           \
        return lhs;                                                                                \
    }

 #define COMPLEX_DIV_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator/=(type& lhs, const type& rhs) {               \
+    __device__ __host__ inline type& operator/=(type& lhs, const type& rhs) {               \
        lhs = lhs / rhs;                                                                           \
        return lhs;                                                                                \
    }

 #define COMPLEX_SCALAR_PRODUCT(type, type1)                                                        \
-    __device__ __host__ static inline type operator*(const type& lhs, type1 rhs) {                 \
+    __device__ __host__ inline type operator*(const type& lhs, type1 rhs) {                 \
        type ret;                                                                                  \
        ret.x = lhs.x * rhs;                                                                       \
        ret.y = lhs.y * rhs;                                                                       \
@@ -125,41 +125,41 @@ THE SOFTWARE.

 typedef float2 hipFloatComplex;

-__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
+__device__ __host__ inline float hipCrealf(hipFloatComplex z) { return z.x; }

-__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
+__device__ __host__ inline float hipCimagf(hipFloatComplex z) { return z.y; }

-__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
+__device__ __host__ inline hipFloatComplex make_hipFloatComplex(float a, float b) {
    hipFloatComplex z;
    z.x = a;
    z.y = b;
    return z;
 }

-__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
+__device__ __host__ inline hipFloatComplex hipConjf(hipFloatComplex z) {
    hipFloatComplex ret;
    ret.x = z.x;
    ret.y = -z.y;
    return ret;
 }

-__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
+__device__ __host__ inline float hipCsqabsf(hipFloatComplex z) {
    return z.x * z.x + z.y * z.y;
 }

-__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
+__device__ __host__ inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
    return make_hipFloatComplex(p.x + q.x, p.y + q.y);
 }

-__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
+__device__ __host__ inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
    return make_hipFloatComplex(p.x - q.x, p.y - q.y);
 }

-__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
+__device__ __host__ inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
    return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }

-__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
+__device__ __host__ inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
    float sqabs = hipCsqabsf(q);
    hipFloatComplex ret;
    ret.x = (p.x * q.x + p.y * q.y) / sqabs;
@@ -167,46 +167,46 @@ __device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hi
    return ret;
 }

-__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
+__device__ __host__ inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }


 typedef double2 hipDoubleComplex;

-__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
+__device__ __host__ inline double hipCreal(hipDoubleComplex z) { return z.x; }

-__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
+__device__ __host__ inline double hipCimag(hipDoubleComplex z) { return z.y; }

-__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
+__device__ __host__ inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
    hipDoubleComplex z;
    z.x = a;
    z.y = b;
    return z;
 }

-__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
+__device__ __host__ inline hipDoubleComplex hipConj(hipDoubleComplex z) {
    hipDoubleComplex ret;
    ret.x = z.x;
    ret.y = -z.y;
    return ret;
 }

-__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
+__device__ __host__ inline double hipCsqabs(hipDoubleComplex z) {
    return z.x * z.x + z.y * z.y;
 }

-__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
+__device__ __host__ inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
    return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
 }

-__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
+__device__ __host__ inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
    return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
 }

-__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
+__device__ __host__ inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
    return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }

-__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
+__device__ __host__ inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
    double sqabs = hipCsqabs(q);
    hipDoubleComplex ret;
    ret.x = (p.x * q.x + p.y * q.y) / sqabs;
@@ -214,7 +214,7 @@ __device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, h
    return ret;
 }

-__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); }
+__device__ __host__ inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); }


 #if __cplusplus
@@ -268,19 +268,19 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)

 typedef hipFloatComplex hipComplex;

-__device__ __host__ static inline hipComplex make_hipComplex(float x, float y) {
+__device__ __host__ inline hipComplex make_hipComplex(float x, float y) {
    return make_hipFloatComplex(x, y);
 }

-__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
+__device__ __host__ inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
    return make_hipFloatComplex((float)z.x, (float)z.y);
 }

-__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
+__device__ __host__ inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
    return make_hipDoubleComplex((double)z.x, (double)z.y);
 }

-__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
+__device__ __host__ inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
    float real = (p.x * q.x) + r.x;
    float imag = (q.x * p.y) + r.y;

@@ -290,7 +290,7 @@ __device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q
    return make_hipComplex(real, imag);
 }

-__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
+__device__ __host__ inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
                                                           hipDoubleComplex r) {
    double real = (p.x * q.x) + r.x;
    double imag = (q.x * p.y) + r.y;
@@ -95,160 +95,157 @@ struct __half2_raw {
    };
    // END STRUCT __HALF2

-    namespace
+    inline
+    unsigned short __internal_float2half(
+        float flt, unsigned int& sgn, unsigned int& rem)
    {
-        inline
-        unsigned short __internal_float2half(
-            float flt, unsigned int& sgn, unsigned int& rem)
-        {
-            unsigned int x{};
-            std::memcpy(&x, &flt, sizeof(flt));
+        unsigned int x{};
+        std::memcpy(&x, &flt, sizeof(flt));

-            unsigned int u = (x & 0x7fffffffU);
-            sgn = ((x >> 16) & 0x8000U);
+        unsigned int u = (x & 0x7fffffffU);
+        sgn = ((x >> 16) & 0x8000U);

-            // NaN/+Inf/-Inf
-            if (u >= 0x7f800000U) {
-                rem = 0;
-                return static_cast<unsigned short>(
-                    (u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU);
+        // NaN/+Inf/-Inf
+        if (u >= 0x7f800000U) {
+            rem = 0;
+            return static_cast<unsigned short>(
+                (u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU);
+        }
+        // Overflows
+        if (u > 0x477fefffU) {
+            rem = 0x80000000U;
+            return static_cast<unsigned short>(sgn | 0x7bffU);
+        }
+        // Normal numbers
+        if (u >= 0x38800000U) {
+            rem = u << 19;
+            u -= 0x38000000U;
+            return static_cast<unsigned short>(sgn | (u >> 13));
+        }
+        // +0/-0
+        if (u < 0x33000001U) {
+            rem = u;
+            return static_cast<unsigned short>(sgn);
+        }
+        // Denormal numbers
+        unsigned int exponent = u >> 23;
+        unsigned int mantissa = (u & 0x7fffffU);
+        unsigned int shift = 0x7eU - exponent;
+        mantissa |= 0x800000U;
+        rem = mantissa << (32 - shift);
+        return static_cast<unsigned short>(sgn | (mantissa >> shift));
+    }
+
+    inline
+    __half __float2half(float x)
+    {
+        __half_raw r;
+        unsigned int sgn{};
+        unsigned int rem{};
+        r.x = __internal_float2half(x, sgn, rem);
+        if (rem > 0x80000000U || (rem == 0x80000000U && (r.x & 0x1))) ++r.x;
+
+        return r;
+    }
+
+    inline
+    __half __float2half_rn(float x) { return __float2half(x); }
+
+    inline
+    __half __float2half_rz(float x)
+    {
+        __half_raw r;
+        unsigned int sgn{};
+        unsigned int rem{};
+        r.x = __internal_float2half(x, sgn, rem);
+
+        return r;
+    }
+
+    inline
+    __half __float2half_rd(float x)
+    {
+        __half_raw r;
+        unsigned int sgn{};
+        unsigned int rem{};
+        r.x = __internal_float2half(x, sgn, rem);
+        if (rem && sgn) ++r.x;
+
+        return r;
+    }
+
+    inline
+    __half __float2half_ru(float x)
+    {
+        __half_raw r;
+        unsigned int sgn{};
+        unsigned int rem{};
+        r.x = __internal_float2half(x, sgn, rem);
+        if (rem && !sgn) ++r.x;
+
+        return r;
+    }
+
+    inline
+    __half2 __float2half2_rn(float x)
+    {
+        return __half2{__float2half_rn(x), __float2half_rn(x)};
+    }
+
+    inline
+    __half2 __floats2half2_rn(float x, float y)
+    {
+        return __half2{__float2half_rn(x), __float2half_rn(y)};
+    }
+
+    inline
+    float __internal_half2float(unsigned short x)
+    {
+        unsigned int sign = ((x >> 15) & 1);
+        unsigned int exponent = ((x >> 10) & 0x1f);
+        unsigned int mantissa = ((x & 0x3ff) << 13);
+
+        if (exponent == 0x1fU) { /* NaN or Inf */
+            mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0);
+            exponent = 0xffU;
+        } else if (!exponent) { /* Denorm or Zero */
+            if (mantissa) {
+                unsigned int msb;
+                exponent = 0x71U;
+                do {
+                    msb = (mantissa & 0x400000U);
+                    mantissa <<= 1; /* normalize */
+                    --exponent;
+                } while (!msb);
+                mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
            }
-            // Overflows
-            if (u > 0x477fefffU) {
-                rem = 0x80000000U;
-                return static_cast<unsigned short>(sgn | 0x7bffU);
-            }
-            // Normal numbers
-            if (u >= 0x38800000U) {
-                rem = u << 19;
-                u -= 0x38000000U;
-                return static_cast<unsigned short>(sgn | (u >> 13));
-            }
-            // +0/-0
-            if (u < 0x33000001U) {
-                rem = u;
-                return static_cast<unsigned short>(sgn);
-            }
-            // Denormal numbers
-            unsigned int exponent = u >> 23;
-            unsigned int mantissa = (u & 0x7fffffU);
-            unsigned int shift = 0x7eU - exponent;
-            mantissa |= 0x800000U;
-            rem = mantissa << (32 - shift);
-            return static_cast<unsigned short>(sgn | (mantissa >> shift));
+        } else {
+            exponent += 0x70U;
        }
+        unsigned int u = ((sign << 31) | (exponent << 23) | mantissa);
+        float f;
+        std::memcpy(&f, &u, sizeof(u));

-        inline
-        __half __float2half(float x)
-        {
-            __half_raw r;
-            unsigned int sgn{};
-            unsigned int rem{};
-            r.x = __internal_float2half(x, sgn, rem);
-            if (rem > 0x80000000U || (rem == 0x80000000U && (r.x & 0x1))) ++r.x;
+        return f;
+    }

-            return r;
-        }
+    inline
+    float __half2float(__half x)
+    {
+        return __internal_half2float(static_cast<__half_raw>(x).x);
+    }

-        inline
-        __half __float2half_rn(float x) { return __float2half(x); }
+    inline
+    float __low2float(__half2 x)
+    {
+        return __internal_half2float(static_cast<__half2_raw>(x).x);
+    }

-        inline
-        __half __float2half_rz(float x)
-        {
-            __half_raw r;
-            unsigned int sgn{};
-            unsigned int rem{};
-            r.x = __internal_float2half(x, sgn, rem);
-
-            return r;
-        }
-
-        inline
-        __half __float2half_rd(float x)
-        {
-            __half_raw r;
-            unsigned int sgn{};
-            unsigned int rem{};
-            r.x = __internal_float2half(x, sgn, rem);
-            if (rem && sgn) ++r.x;
-
-            return r;
-        }
-
-        inline
-        __half __float2half_ru(float x)
-        {
-            __half_raw r;
-            unsigned int sgn{};
-            unsigned int rem{};
-            r.x = __internal_float2half(x, sgn, rem);
-            if (rem && !sgn) ++r.x;
-
-            return r;
-        }
-
-        inline
-        __half2 __float2half2_rn(float x)
-        {
-            return __half2{__float2half_rn(x), __float2half_rn(x)};
-        }
-
-        inline
-        __half2 __floats2half2_rn(float x, float y)
-        {
-            return __half2{__float2half_rn(x), __float2half_rn(y)};
-        }
-
-        inline
-        float __internal_half2float(unsigned short x)
-        {
-            unsigned int sign = ((x >> 15) & 1);
-            unsigned int exponent = ((x >> 10) & 0x1f);
-            unsigned int mantissa = ((x & 0x3ff) << 13);
-
-            if (exponent == 0x1fU) { /* NaN or Inf */
-                mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0);
-                exponent = 0xffU;
-            } else if (!exponent) { /* Denorm or Zero */
-                if (mantissa) {
-                    unsigned int msb;
-                    exponent = 0x71U;
-                    do {
-                        msb = (mantissa & 0x400000U);
-                        mantissa <<= 1; /* normalize */
-                        --exponent;
-                    } while (!msb);
-                    mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
-                }
-            } else {
-                exponent += 0x70U;
-            }
-            unsigned int u = ((sign << 31) | (exponent << 23) | mantissa);
-            float f;
-            memcpy(&f, &u, sizeof(u));
-
-            return f;
-        }
-
-        inline
-        float __half2float(__half x)
-        {
-            return __internal_half2float(static_cast<__half_raw>(x).x);
-        }
-
-        inline
-        float __low2float(__half2 x)
-        {
-            return __internal_half2float(static_cast<__half2_raw>(x).x);
-        }
-
-        inline
-        float __high2float(__half2 x)
-        {
-            return __internal_half2float(static_cast<__half2_raw>(x).y);
-        }
-    } // Anonymous namespace.
+    inline
+    float __high2float(__half2 x)
+    {
+        return __internal_half2float(static_cast<__half2_raw>(x).y);
+    }

    #if !defined(HIP_NO_HALF)
        using half = __half;
@@ -308,17 +308,17 @@ static constexpr Coordinates<hip_impl::WorkitemId> threadIdx{};
 extern "C" __device__ void* __hip_malloc(size_t);
 extern "C" __device__ void* __hip_free(void* ptr);

-static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
-static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
+inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
+inline __device__ void* free(void* ptr) { return __hip_free(ptr); }

 #if defined(__HCC_ACCELERATOR__) && defined(HC_FEATURE_PRINTF)
 template <typename... All>
-static inline __device__ void printf(const char* format, All... all) {
+inline __device__ void printf(const char* format, All... all) {
    hc::printf(format, all...);
 }
 #elif defined(__HCC_ACCELERATOR__) || __HIP__
 template <typename... All>
-static inline __device__ void printf(const char* format, All... all) {}
+inline __device__ void printf(const char* format, All... all) {}
 #endif

 #endif //__HCC_OR_HIP_CLANG__
@@ -1107,38 +1107,38 @@ __MAKE_VECTOR_TYPE__(double, double);

 #ifdef __cplusplus
 #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \
-    static inline __device__ __host__ \
+    inline __device__ __host__ \
    type make_##type(comp x) { type r{x}; return r; }

 #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \
-    static inline __device__ __host__ \
+    inline __device__ __host__ \
    type make_##type(comp x, comp y) { type r{x, y}; return r; }

 #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \
-    static inline __device__ __host__ \
+    inline __device__ __host__ \
    type make_##type(comp x, comp y, comp z) { type r{x, y, z}; return r; }

 #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \
-    static inline __device__ __host__ \
+    inline __device__ __host__ \
    type make_##type(comp x, comp y, comp z, comp w) { \
        type r{x, y, z, w}; \
        return r; \
    }
 #else
 #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \
-     static inline __device__ __host__ \
+     inline __device__ __host__ \
     type make_##type(comp x) { type r; r.x =x; return r; }

 #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \
-     static inline __device__ __host__ \
+     inline __device__ __host__ \
     type make_##type(comp x, comp y) { type r; r.x=x; r.y=y; return r; }

 #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \
-     static inline __device__ __host__ \
+     inline __device__ __host__ \
     type make_##type(comp x, comp y, comp z) { type r; r.x=x; r.y=y; r.z=z; return r; }

 #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \
-     static inline __device__ __host__ \
+     inline __device__ __host__ \
     type make_##type(comp x, comp y, comp z, comp w) { \
         type r; r.x=x; r.y=y; r.z=z; r.w=w; \
         return r; \
@@ -46,7 +46,7 @@ union TData {
    __hip_uint4_vector_value_type u;
 };

-#define __TEXTURE_FUNCTIONS_DECL__ static inline __device__
+#define __TEXTURE_FUNCTIONS_DECL__ inline __device__


 #if (__hcc_workweek__ >= 18114) || __clang__
@@ -358,21 +358,21 @@ enum hipComputeMode {
 */
 #if defined(__cplusplus) && !defined(__HIP_DISABLE_CPP_FUNCTIONS__)
 template <class T>
-static inline hipError_t hipMalloc(T** devPtr, size_t size) {
+inline hipError_t hipMalloc(T** devPtr, size_t size) {
    return hipMalloc((void**)devPtr, size);
 }

 // Provide an override to automatically typecast the pointer type from void**, and also provide a
 // default for the flags.
 template <class T>
-static inline hipError_t hipHostMalloc(T** ptr, size_t size,
-                                       unsigned int flags = hipHostMallocDefault) {
+inline hipError_t hipHostMalloc(T** ptr, size_t size,
+                                unsigned int flags = hipHostMallocDefault) {
    return hipHostMalloc((void**)ptr, size, flags);
 }

 template <class T>
-static inline hipError_t hipMallocManaged(T** devPtr, size_t size,
-                                       unsigned int flags = hipMemAttachGlobal) {
+inline hipError_t hipMallocManaged(T** devPtr, size_t size,
+                                   unsigned int flags = hipMemAttachGlobal) {
    return hipMallocManaged((void**)devPtr, size, flags);
 }
 #endif
@@ -97,13 +97,13 @@ class TidInfo {
    TidInfo();

    int tid() const { return _shortTid; };
-    pid_t pid() const { return _pid; }; 
+    pid_t pid() const { return _pid; };
    uint64_t incApiSeqNum() { return ++_apiSeqNum; };
    uint64_t apiSeqNum() const { return _apiSeqNum; };

   private:
    int _shortTid;
-    pid_t _pid; 
+    pid_t _pid;

    // monotonically increasing API sequence number for this threa.
    uint64_t _apiSeqNum;
@@ -280,7 +280,7 @@ static const DbName dbName[] = {
 #endif


-static inline uint64_t getTicks() { return hc::get_system_ticks(); }
+inline uint64_t getTicks() { return hc::get_system_ticks(); }

 //---
 extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr);
@@ -798,7 +798,7 @@ class ihipDevice_t {

    // TODO - report this through device properties, base on HCC API call.
    int _isLargeBar;
-   
+
    // Node id reported by kfd for this device
    uint32_t _driver_node_id;

@@ -1047,7 +1047,7 @@ struct mg_info {
 //  setDevice first.
 //  - hipDeviceReset destroys the primary context for device?
 //  - Then context is created again for next usage.
-static inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) {
+inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) {
    // Per-thread initialization of the TLS:
    if ((tls->defaultCtx == nullptr) && (g_deviceCnt > 0)) {
        tls->defaultCtx = ihipGetPrimaryCtx(0);