diff --git a/include/hip/hcc_detail/channel_descriptor.h b/include/hip/hcc_detail/channel_descriptor.h
index 38acff9951..db81fc76b6 100644
--- a/include/hip/hcc_detail/channel_descriptor.h
+++ b/include/hip/hcc_detail/channel_descriptor.h
@@ -32,23 +32,23 @@ THE SOFTWARE.
 HIP_PUBLIC_API
 hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);
 
-static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
+inline hipChannelFormatDesc hipCreateChannelDescHalf() {
     int e = (int)sizeof(unsigned short) * 8;
     return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }
 
-static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
+inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
     int e = (int)sizeof(unsigned short) * 8;
     return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }
 
-static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
+inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
     int e = (int)sizeof(unsigned short) * 8;
     return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }
 
 template <typename T>
-static inline hipChannelFormatDesc hipCreateChannelDesc() {
+inline hipChannelFormatDesc hipCreateChannelDesc() {
     return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
 }
 
diff --git a/include/hip/hcc_detail/device_functions.h b/include/hip/hcc_detail/device_functions.h
index 68e3277270..58f837aa76 100644
--- a/include/hip/hcc_detail/device_functions.h
+++ b/include/hip/hcc_detail/device_functions.h
@@ -38,69 +38,69 @@ Integer Intrinsics
 */
 
 // integer intrinsic function __poc __clz __ffs __brev
-__device__ static inline unsigned int __popc(unsigned int input) {
+__device__ inline unsigned int __popc(unsigned int input) {
     return __builtin_popcount(input);
 }
-__device__ static inline unsigned int __popcll(unsigned long long int input) {
+__device__ inline unsigned int __popcll(unsigned long long int input) {
     return __builtin_popcountll(input);
 }
 
-__device__ static inline int __clz(int input) {
+__device__ inline int __clz(int input) {
     return __ockl_clz_u32((uint)input);
 }
 
-__device__ static inline int __clzll(long long int input) {
+__device__ inline int __clzll(long long int input) {
     return __ockl_clz_u64((ullong)input);
 }
 
-__device__ static inline unsigned int __ffs(unsigned int input) {
+__device__ inline unsigned int __ffs(unsigned int input) {
     return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
 }
 
-__device__ static inline unsigned int __ffsll(unsigned long long int input) {
+__device__ inline unsigned int __ffsll(unsigned long long int input) {
     return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
 }
 
-__device__ static inline unsigned int __ffs(int input) {
+__device__ inline unsigned int __ffs(int input) {
     return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
 }
 
-__device__ static inline unsigned int __ffsll(long long int input) {
+__device__ inline unsigned int __ffsll(long long int input) {
     return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
 }
 
-__device__ static inline unsigned int __brev(unsigned int input) {
+__device__ inline unsigned int __brev(unsigned int input) {
     return __llvm_bitrev_b32(input);
 }
 
-__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
+__device__ inline unsigned long long int __brevll(unsigned long long int input) {
     return __llvm_bitrev_b64(input);
 }
 
-__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
+__device__ inline unsigned int __lastbit_u32_u64(uint64_t input) {
     return input == 0 ? -1 : __builtin_ctzl(input);
 }
 
-__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
+__device__ inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
     uint32_t offset = src1 & 31;
     uint32_t width = src2 & 31;
     return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
 }
 
-__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
+__device__ inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
     uint64_t offset = src1 & 63;
     uint64_t width = src2 & 63;
     return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
 }
 
-__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
+__device__ inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
     uint32_t offset = src2 & 31;
     uint32_t width = src3 & 31;
     uint32_t mask = (1 << width) - 1;
     return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
 }
 
-__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
+__device__ inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
     uint64_t offset = src2 & 63;
     uint64_t width = src3 & 63;
     uint64_t mask = (1ULL << width) - 1;
@@ -136,7 +136,7 @@ struct uchar2Holder {
 } __attribute__((aligned(8)));
 
 __device__
-static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
+inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
     struct uchar2Holder cHoldVal;
     struct ucharHolder cHoldKey;
     struct ucharHolder cHoldOut;
@@ -150,18 +150,18 @@ static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned
     return cHoldOut.ui;
 }
 
-__device__ static inline unsigned int __hadd(int x, int y) {
+__device__ inline unsigned int __hadd(int x, int y) {
     int z = x + y;
     int sign = z & 0x8000000;
     int value = z & 0x7FFFFFFF;
     return ((value) >> 1 || sign);
 }
 
-__device__ static inline int __mul24(int x, int y) {
+__device__ inline int __mul24(int x, int y) {
     return __ockl_mul24_i32(x, y);
 }
 
-__device__ static inline long long __mul64hi(long long int x, long long int y) {
+__device__ inline long long __mul64hi(long long int x, long long int y) {
     ulong x0 = (ulong)x & 0xffffffffUL;
     long x1 = x >> 32;
     ulong y0 = (ulong)y & 0xffffffffUL;
@@ -174,28 +174,28 @@ __device__ static inline long long __mul64hi(long long int x, long long int y) {
     return x1*y1 + z2 + (z1 >> 32);
 }
 
-__device__ static inline int __mulhi(int x, int y) {
+__device__ inline int __mulhi(int x, int y) {
     return __ockl_mul_hi_i32(x, y);
 }
 
-__device__ static inline int __rhadd(int x, int y) {
+__device__ inline int __rhadd(int x, int y) {
     int z = x + y + 1;
     int sign = z & 0x8000000;
     int value = z & 0x7FFFFFFF;
     return ((value) >> 1 || sign);
 }
-__device__ static inline unsigned int __sad(int x, int y, int z) {
+__device__ inline unsigned int __sad(int x, int y, int z) {
     return x > y ? x - y + z : y - x + z;
 }
-__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
+__device__ inline unsigned int __uhadd(unsigned int x, unsigned int y) {
     return (x + y) >> 1;
 }
-__device__ static inline int __umul24(unsigned int x, unsigned int y) {
+__device__ inline int __umul24(unsigned int x, unsigned int y) {
     return __ockl_mul24_u32(x, y);
 }
 
 __device__
-static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
+inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
     ulong x0 = x & 0xffffffffUL;
     ulong x1 = x >> 32;
     ulong y0 = y & 0xffffffffUL;
@@ -208,41 +208,41 @@ static inline unsigned long long __umul64hi(unsigned long long int x, unsigned l
     return x1*y1 + z2 + (z1 >> 32);
 }
 
-__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
+__device__ inline unsigned int __umulhi(unsigned int x, unsigned int y) {
     return __ockl_mul_hi_u32(x, y);
 }
-__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
+__device__ inline unsigned int __urhadd(unsigned int x, unsigned int y) {
     return (x + y + 1) >> 1;
 }
-__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
+__device__ inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
     return __ockl_sad_u32(x, y, z);
 }
 
-__device__ static inline unsigned int __lane_id() { return  __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
+__device__ inline unsigned int __lane_id() { return  __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
 
 /*
 HIP specific device functions
 */
 
-__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
+__device__ inline unsigned __hip_ds_bpermute(int index, unsigned src) {
     union { int i; unsigned u; float f; } tmp; tmp.u = src;
     tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
     return tmp.u;
 }
 
-__device__ static inline float __hip_ds_bpermutef(int index, float src) {
+__device__ inline float __hip_ds_bpermutef(int index, float src) {
     union { int i; unsigned u; float f; } tmp; tmp.f = src;
     tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
     return tmp.f;
 }
 
-__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
+__device__ inline unsigned __hip_ds_permute(int index, unsigned src) {
     union { int i; unsigned u; float f; } tmp; tmp.u = src;
     tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
     return tmp.u;
 }
 
-__device__ static inline float __hip_ds_permutef(int index, float src) {
+__device__ inline float __hip_ds_permutef(int index, float src) {
     union { int i; unsigned u; float f; } tmp; tmp.u = src;
     tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
     return tmp.u;
@@ -252,7 +252,7 @@ __device__ static inline float __hip_ds_permutef(int index, float src) {
 #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
 
 template <int pattern>
-__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
+__device__ inline unsigned __hip_ds_swizzle_N(unsigned int src) {
     union { int i; unsigned u; float f; } tmp; tmp.u = src;
 #if defined(__HCC__)
     tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
@@ -263,7 +263,7 @@ __device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
 }
 
 template <int pattern>
-__device__ static inline float __hip_ds_swizzlef_N(float src) {
+__device__ inline float __hip_ds_swizzlef_N(float src) {
     union { int i; unsigned u; float f; } tmp; tmp.f = src;
 #if defined(__HCC__)
     tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
@@ -277,7 +277,7 @@ __device__ static inline float __hip_ds_swizzlef_N(float src) {
   __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
 
 template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
-__device__ static inline int __hip_move_dpp_N(int src) {
+__device__ inline int __hip_move_dpp_N(int src) {
     return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask,
                                   bound_ctrl);
 }
@@ -434,7 +434,7 @@ double __shfl_xor(double var, int lane_mask, int width = warpSize) {
 #define MASK1 0x00ff00ff
 #define MASK2 0xff00ff00
 
-__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
+__device__ inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
     char4 out;
     unsigned one1 = in1.w & MASK1;
     unsigned one2 = in2.w & MASK1;
@@ -445,7 +445,7 @@ __device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
     return out;
 }
 
-__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
+__device__ inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
     char4 out;
     unsigned one1 = in1.w & MASK1;
     unsigned one2 = in2.w & MASK1;
@@ -456,7 +456,7 @@ __device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
     return out;
 }
 
-__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
+__device__ inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
     char4 out;
     unsigned one1 = in1.w & MASK1;
     unsigned one2 = in2.w & MASK1;
@@ -472,12 +472,12 @@ __device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
  * TODO: Conversion functions are not correct, need to fix when BE is ready
 */
 
-__device__ static inline float __double2float_rd(double x) { return (double)x; }
-__device__ static inline float __double2float_rn(double x) { return (double)x; }
-__device__ static inline float __double2float_ru(double x) { return (double)x; }
-__device__ static inline float __double2float_rz(double x) { return (double)x; }
+__device__ inline float __double2float_rd(double x) { return (double)x; }
+__device__ inline float __double2float_rn(double x) { return (double)x; }
+__device__ inline float __double2float_ru(double x) { return (double)x; }
+__device__ inline float __double2float_rz(double x) { return (double)x; }
 
-__device__ static inline int __double2hiint(double x) {
+__device__ inline int __double2hiint(double x) {
     static_assert(sizeof(double) == 2 * sizeof(int), "");
 
     int tmp[2];
@@ -485,7 +485,7 @@ __device__ static inline int __double2hiint(double x) {
 
     return tmp[1];
 }
-__device__ static inline int __double2loint(double x) {
+__device__ inline int __double2loint(double x) {
     static_assert(sizeof(double) == 2 * sizeof(int), "");
 
     int tmp[2];
@@ -494,35 +494,35 @@ __device__ static inline int __double2loint(double x) {
     return tmp[0];
 }
 
-__device__ static inline int __double2int_rd(double x) { return (int)x; }
-__device__ static inline int __double2int_rn(double x) { return (int)x; }
-__device__ static inline int __double2int_ru(double x) { return (int)x; }
-__device__ static inline int __double2int_rz(double x) { return (int)x; }
+__device__ inline int __double2int_rd(double x) { return (int)x; }
+__device__ inline int __double2int_rn(double x) { return (int)x; }
+__device__ inline int __double2int_ru(double x) { return (int)x; }
+__device__ inline int __double2int_rz(double x) { return (int)x; }
 
-__device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
-__device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
-__device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
-__device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }
+__device__ inline long long int __double2ll_rd(double x) { return (long long int)x; }
+__device__ inline long long int __double2ll_rn(double x) { return (long long int)x; }
+__device__ inline long long int __double2ll_ru(double x) { return (long long int)x; }
+__device__ inline long long int __double2ll_rz(double x) { return (long long int)x; }
 
-__device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
-__device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
-__device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
-__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
+__device__ inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
+__device__ inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
+__device__ inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
+__device__ inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
 
-__device__ static inline unsigned long long int __double2ull_rd(double x) {
+__device__ inline unsigned long long int __double2ull_rd(double x) {
     return (unsigned long long int)x;
 }
-__device__ static inline unsigned long long int __double2ull_rn(double x) {
+__device__ inline unsigned long long int __double2ull_rn(double x) {
     return (unsigned long long int)x;
 }
-__device__ static inline unsigned long long int __double2ull_ru(double x) {
+__device__ inline unsigned long long int __double2ull_ru(double x) {
     return (unsigned long long int)x;
 }
-__device__ static inline unsigned long long int __double2ull_rz(double x) {
+__device__ inline unsigned long long int __double2ull_rz(double x) {
     return (unsigned long long int)x;
 }
 
-__device__ static inline long long int __double_as_longlong(double x) {
+__device__ inline long long int __double_as_longlong(double x) {
     static_assert(sizeof(long long) == sizeof(double), "");
 
     long long tmp;
@@ -545,35 +545,35 @@ CUDA implements half as unsigned short whereas, HIP doesn't.
 
 */
 
-__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
-__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
-__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
-__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
+__device__ inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
+__device__ inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
+__device__ inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
+__device__ inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
 
-__device__ static inline long long int __float2ll_rd(float x) { return (long long int)x; }
-__device__ static inline long long int __float2ll_rn(float x) { return (long long int)x; }
-__device__ static inline long long int __float2ll_ru(float x) { return (long long int)x; }
-__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
+__device__ inline long long int __float2ll_rd(float x) { return (long long int)x; }
+__device__ inline long long int __float2ll_rn(float x) { return (long long int)x; }
+__device__ inline long long int __float2ll_ru(float x) { return (long long int)x; }
+__device__ inline long long int __float2ll_rz(float x) { return (long long int)x; }
 
-__device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
-__device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
-__device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
-__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
+__device__ inline unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
+__device__ inline unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
+__device__ inline unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
+__device__ inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
 
-__device__ static inline unsigned long long int __float2ull_rd(float x) {
+__device__ inline unsigned long long int __float2ull_rd(float x) {
     return (unsigned long long int)x;
 }
-__device__ static inline unsigned long long int __float2ull_rn(float x) {
+__device__ inline unsigned long long int __float2ull_rn(float x) {
     return (unsigned long long int)x;
 }
-__device__ static inline unsigned long long int __float2ull_ru(float x) {
+__device__ inline unsigned long long int __float2ull_ru(float x) {
     return (unsigned long long int)x;
 }
-__device__ static inline unsigned long long int __float2ull_rz(float x) {
+__device__ inline unsigned long long int __float2ull_rz(float x) {
     return (unsigned long long int)x;
 }
 
-__device__ static inline int __float_as_int(float x) {
+__device__ inline int __float_as_int(float x) {
     static_assert(sizeof(int) == sizeof(float), "");
 
     int tmp;
@@ -582,7 +582,7 @@ __device__ static inline int __float_as_int(float x) {
     return tmp;
 }
 
-__device__ static inline unsigned int __float_as_uint(float x) {
+__device__ inline unsigned int __float_as_uint(float x) {
     static_assert(sizeof(unsigned int) == sizeof(float), "");
 
     unsigned int tmp;
@@ -591,7 +591,7 @@ __device__ static inline unsigned int __float_as_uint(float x) {
     return tmp;
 }
 
-__device__ static inline double __hiloint2double(int hi, int lo) {
+__device__ inline double __hiloint2double(int hi, int lo) {
     static_assert(sizeof(double) == sizeof(uint64_t), "");
 
     uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
@@ -601,14 +601,14 @@ __device__ static inline double __hiloint2double(int hi, int lo) {
     return tmp1;
 }
 
-__device__ static inline double __int2double_rn(int x) { return (double)x; }
+__device__ inline double __int2double_rn(int x) { return (double)x; }
 
-__device__ static inline float __int2float_rd(int x) { return (float)x; }
-__device__ static inline float __int2float_rn(int x) { return (float)x; }
-__device__ static inline float __int2float_ru(int x) { return (float)x; }
-__device__ static inline float __int2float_rz(int x) { return (float)x; }
+__device__ inline float __int2float_rd(int x) { return (float)x; }
+__device__ inline float __int2float_rn(int x) { return (float)x; }
+__device__ inline float __int2float_ru(int x) { return (float)x; }
+__device__ inline float __int2float_rz(int x) { return (float)x; }
 
-__device__ static inline float __int_as_float(int x) {
+__device__ inline float __int_as_float(int x) {
     static_assert(sizeof(float) == sizeof(int), "");
 
     float tmp;
@@ -617,17 +617,17 @@ __device__ static inline float __int_as_float(int x) {
     return tmp;
 }
 
-__device__ static inline double __ll2double_rd(long long int x) { return (double)x; }
-__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
-__device__ static inline double __ll2double_ru(long long int x) { return (double)x; }
-__device__ static inline double __ll2double_rz(long long int x) { return (double)x; }
+__device__ inline double __ll2double_rd(long long int x) { return (double)x; }
+__device__ inline double __ll2double_rn(long long int x) { return (double)x; }
+__device__ inline double __ll2double_ru(long long int x) { return (double)x; }
+__device__ inline double __ll2double_rz(long long int x) { return (double)x; }
 
-__device__ static inline float __ll2float_rd(long long int x) { return (float)x; }
-__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
-__device__ static inline float __ll2float_ru(long long int x) { return (float)x; }
-__device__ static inline float __ll2float_rz(long long int x) { return (float)x; }
+__device__ inline float __ll2float_rd(long long int x) { return (float)x; }
+__device__ inline float __ll2float_rn(long long int x) { return (float)x; }
+__device__ inline float __ll2float_ru(long long int x) { return (float)x; }
+__device__ inline float __ll2float_rz(long long int x) { return (float)x; }
 
-__device__ static inline double __longlong_as_double(long long int x) {
+__device__ inline double __longlong_as_double(long long int x) {
     static_assert(sizeof(double) == sizeof(long long), "");
 
     double tmp;
@@ -636,14 +636,14 @@ __device__ static inline double __longlong_as_double(long long int x) {
     return tmp;
 }
 
-__device__ static inline double __uint2double_rn(int x) { return (double)x; }
+__device__ inline double __uint2double_rn(int x) { return (double)x; }
 
-__device__ static inline float __uint2float_rd(unsigned int x) { return (float)x; }
-__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
-__device__ static inline float __uint2float_ru(unsigned int x) { return (float)x; }
-__device__ static inline float __uint2float_rz(unsigned int x) { return (float)x; }
+__device__ inline float __uint2float_rd(unsigned int x) { return (float)x; }
+__device__ inline float __uint2float_rn(unsigned int x) { return (float)x; }
+__device__ inline float __uint2float_ru(unsigned int x) { return (float)x; }
+__device__ inline float __uint2float_rz(unsigned int x) { return (float)x; }
 
-__device__ static inline float __uint_as_float(unsigned int x) {
+__device__ inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");
 
     float tmp;
@@ -652,15 +652,15 @@ __device__ static inline float __uint_as_float(unsigned int x) {
     return tmp;
 }
 
-__device__ static inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
-__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
-__device__ static inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
-__device__ static inline double __ull2double_rz(unsigned long long int x) { return (double)x; }
+__device__ inline double __ull2double_rd(unsigned long long int x) { return (double)x; }
+__device__ inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
+__device__ inline double __ull2double_ru(unsigned long long int x) { return (double)x; }
+__device__ inline double __ull2double_rz(unsigned long long int x) { return (double)x; }
 
-__device__ static inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
-__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
-__device__ static inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
-__device__ static inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
+__device__ inline float __ull2float_rd(unsigned long long int x) { return (float)x; }
+__device__ inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
+__device__ inline float __ull2float_ru(unsigned long long int x) { return (float)x; }
+__device__ inline float __ull2float_rz(unsigned long long int x) { return (float)x; }
 
 #if defined(__HCC__)
 #define __HCC_OR_HIP_CLANG__ 1
@@ -819,7 +819,7 @@ typedef enum __memory_order
 
 __device__
 inline
-static void
+void
 __atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
 {
     // We're tying global-happens-before and local-happens-before together as does HSA
@@ -871,21 +871,21 @@ __atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __mem
 // Memory Fence Functions
 __device__
 inline
-static void __threadfence()
+void __threadfence()
 {
   __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
 }
 
 __device__
 inline
-static void __threadfence_block()
+void __threadfence_block()
 {
   __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
 }
 
 __device__
 inline
-static void __threadfence_system()
+void __threadfence_system()
 {
   __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
 }
@@ -945,7 +945,7 @@ void __assertfail(const char * __assertion,
 
 __device__
 inline
-static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
+void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
 {
     if (flags) {
         __atomic_work_item_fence(flags, __memory_order_release, scope);
@@ -958,7 +958,7 @@ static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scop
 
 __device__
 inline
-static void __barrier(int n)
+void __barrier(int n)
 {
   __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
 }
@@ -1037,7 +1037,7 @@ unsigned __smid(void)
 
 
 // loop unrolling
-static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
+inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
     auto dstPtr = static_cast<unsigned char*>(dst);
     auto srcPtr = static_cast<const unsigned char*>(src);
 
@@ -1063,7 +1063,7 @@ static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_
     return dst;
 }
 
-static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
+inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
     auto dstPtr = static_cast<unsigned char*>(dst);
 
     while (size >= 4u) {
@@ -1086,11 +1086,11 @@ static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, siz
 
     return dst;
 }
-static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
+inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
     return __hip_hc_memcpy(dst, src, size);
 }
 
-static inline __device__ void* memset(void* ptr, int val, size_t size) {
+inline __device__ void* memset(void* ptr, int val, size_t size) {
     unsigned char val8 = static_cast<unsigned char>(val);
     return __hip_hc_memset(ptr, val8, size);
 }
diff --git a/include/hip/hcc_detail/driver_types.h b/include/hip/hcc_detail/driver_types.h
index 510d3d058e..4fb24e092b 100644
--- a/include/hip/hcc_detail/driver_types.h
+++ b/include/hip/hcc_detail/driver_types.h
@@ -287,8 +287,8 @@ typedef struct hipMemcpy3DParms {
     size_t srcZ;
 }hipMemcpy3DParms;
 
-static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
-                                                          size_t ysz) {
+inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
+                                               size_t ysz) {
     struct hipPitchedPtr s;
 
     s.ptr = d;
@@ -299,7 +299,7 @@ static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t
     return s;
 }
 
-static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
+inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
     struct hipPos p;
 
     p.x = x;
@@ -309,7 +309,7 @@ static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
     return p;
 }
 
-static inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) {
+inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) {
     struct hipExtent e;
 
     e.width = w;
diff --git a/include/hip/hcc_detail/hip_complex.h b/include/hip/hcc_detail/hip_complex.h
index 11648ce123..18f0a9961e 100644
--- a/include/hip/hcc_detail/hip_complex.h
+++ b/include/hip/hcc_detail/hip_complex.h
@@ -36,7 +36,7 @@ THE SOFTWARE.
 
 #if __cplusplus
 #define COMPLEX_NEG_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator-(const type& op) {                             \
+    __device__ __host__ inline type operator-(const type& op) {                             \
         type ret;                                                                                  \
         ret.x = -op.x;                                                                             \
         ret.y = -op.y;                                                                             \
@@ -44,17 +44,17 @@ THE SOFTWARE.
     }
 
 #define COMPLEX_EQ_OP_OVERLOAD(type)                                                               \
-    __device__ __host__ static inline bool operator==(const type& lhs, const type& rhs) {          \
+    __device__ __host__ inline bool operator==(const type& lhs, const type& rhs) {          \
         return lhs.x == rhs.x && lhs.y == rhs.y;                                                   \
     }
 
 #define COMPLEX_NE_OP_OVERLOAD(type)                                                               \
-    __device__ __host__ static inline bool operator!=(const type& lhs, const type& rhs) {          \
+    __device__ __host__ inline bool operator!=(const type& lhs, const type& rhs) {          \
         return !(lhs == rhs);                                                                      \
     }
 
 #define COMPLEX_ADD_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator+(const type& lhs, const type& rhs) {           \
+    __device__ __host__ inline type operator+(const type& lhs, const type& rhs) {           \
         type ret;                                                                                  \
         ret.x = lhs.x + rhs.x;                                                                     \
         ret.y = lhs.y + rhs.y;                                                                     \
@@ -62,7 +62,7 @@ THE SOFTWARE.
     }
 
 #define COMPLEX_SUB_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator-(const type& lhs, const type& rhs) {           \
+    __device__ __host__ inline type operator-(const type& lhs, const type& rhs) {           \
         type ret;                                                                                  \
         ret.x = lhs.x - rhs.x;                                                                     \
         ret.y = lhs.y - rhs.y;                                                                     \
@@ -70,7 +70,7 @@ THE SOFTWARE.
     }
 
 #define COMPLEX_MUL_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator*(const type& lhs, const type& rhs) {           \
+    __device__ __host__ inline type operator*(const type& lhs, const type& rhs) {           \
         type ret;                                                                                  \
         ret.x = lhs.x * rhs.x - lhs.y * rhs.y;                                                     \
         ret.y = lhs.x * rhs.y + lhs.y * rhs.x;                                                     \
@@ -78,7 +78,7 @@ THE SOFTWARE.
     }
 
 #define COMPLEX_DIV_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator/(const type& lhs, const type& rhs) {           \
+    __device__ __host__ inline type operator/(const type& lhs, const type& rhs) {           \
         type ret;                                                                                  \
         ret.x = (lhs.x * rhs.x + lhs.y * rhs.y);                                                   \
         ret.y = (rhs.x * lhs.y - lhs.x * rhs.y);                                                   \
@@ -88,33 +88,33 @@ THE SOFTWARE.
     }
 
 #define COMPLEX_ADD_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator+=(type& lhs, const type& rhs) {               \
+    __device__ __host__ inline type& operator+=(type& lhs, const type& rhs) {               \
         lhs.x += rhs.x;                                                                            \
         lhs.y += rhs.y;                                                                            \
         return lhs;                                                                                \
     }
 
 #define COMPLEX_SUB_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator-=(type& lhs, const type& rhs) {               \
+    __device__ __host__ inline type& operator-=(type& lhs, const type& rhs) {               \
         lhs.x -= rhs.x;                                                                            \
         lhs.y -= rhs.y;                                                                            \
         return lhs;                                                                                \
     }
 
 #define COMPLEX_MUL_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator*=(type& lhs, const type& rhs) {               \
+    __device__ __host__ inline type& operator*=(type& lhs, const type& rhs) {               \
         lhs = lhs * rhs;                                                                           \
         return lhs;                                                                                \
     }
 
 #define COMPLEX_DIV_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator/=(type& lhs, const type& rhs) {               \
+    __device__ __host__ inline type& operator/=(type& lhs, const type& rhs) {               \
         lhs = lhs / rhs;                                                                           \
         return lhs;                                                                                \
     }
 
 #define COMPLEX_SCALAR_PRODUCT(type, type1)                                                        \
-    __device__ __host__ static inline type operator*(const type& lhs, type1 rhs) {                 \
+    __device__ __host__ inline type operator*(const type& lhs, type1 rhs) {                 \
         type ret;                                                                                  \
         ret.x = lhs.x * rhs;                                                                       \
         ret.y = lhs.y * rhs;                                                                       \
@@ -125,41 +125,41 @@ THE SOFTWARE.
 
 typedef float2 hipFloatComplex;
 
-__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
+__device__ __host__ inline float hipCrealf(hipFloatComplex z) { return z.x; }
 
-__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
+__device__ __host__ inline float hipCimagf(hipFloatComplex z) { return z.y; }
 
-__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
+__device__ __host__ inline hipFloatComplex make_hipFloatComplex(float a, float b) {
     hipFloatComplex z;
     z.x = a;
     z.y = b;
     return z;
 }
 
-__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
+__device__ __host__ inline hipFloatComplex hipConjf(hipFloatComplex z) {
     hipFloatComplex ret;
     ret.x = z.x;
     ret.y = -z.y;
     return ret;
 }
 
-__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
+__device__ __host__ inline float hipCsqabsf(hipFloatComplex z) {
     return z.x * z.x + z.y * z.y;
 }
 
-__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
+__device__ __host__ inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
     return make_hipFloatComplex(p.x + q.x, p.y + q.y);
 }
 
-__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
+__device__ __host__ inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
     return make_hipFloatComplex(p.x - q.x, p.y - q.y);
 }
 
-__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
+__device__ __host__ inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
     return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }
 
-__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
+__device__ __host__ inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
     float sqabs = hipCsqabsf(q);
     hipFloatComplex ret;
     ret.x = (p.x * q.x + p.y * q.y) / sqabs;
@@ -167,46 +167,46 @@ __device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hi
     return ret;
 }
 
-__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
+__device__ __host__ inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
 
 
 typedef double2 hipDoubleComplex;
 
-__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
+__device__ __host__ inline double hipCreal(hipDoubleComplex z) { return z.x; }
 
-__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
+__device__ __host__ inline double hipCimag(hipDoubleComplex z) { return z.y; }
 
-__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
+__device__ __host__ inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
     hipDoubleComplex z;
     z.x = a;
     z.y = b;
     return z;
 }
 
-__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
+__device__ __host__ inline hipDoubleComplex hipConj(hipDoubleComplex z) {
     hipDoubleComplex ret;
     ret.x = z.x;
     ret.y = -z.y;
     return ret;
 }
 
-__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
+__device__ __host__ inline double hipCsqabs(hipDoubleComplex z) {
     return z.x * z.x + z.y * z.y;
 }
 
-__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
+__device__ __host__ inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
     return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
 }
 
-__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
+__device__ __host__ inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
     return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
 }
 
-__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
+__device__ __host__ inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
     return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }
 
-__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
+__device__ __host__ inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
     double sqabs = hipCsqabs(q);
     hipDoubleComplex ret;
     ret.x = (p.x * q.x + p.y * q.y) / sqabs;
@@ -214,7 +214,7 @@ __device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, h
     return ret;
 }
 
-__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); }
+__device__ __host__ inline double hipCabs(hipDoubleComplex z) { return sqrtf(hipCsqabs(z)); }
 
 
 #if __cplusplus
@@ -268,19 +268,19 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
 
 typedef hipFloatComplex hipComplex;
 
-__device__ __host__ static inline hipComplex make_hipComplex(float x, float y) {
+__device__ __host__ inline hipComplex make_hipComplex(float x, float y) {
     return make_hipFloatComplex(x, y);
 }
 
-__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
+__device__ __host__ inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
     return make_hipFloatComplex((float)z.x, (float)z.y);
 }
 
-__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
+__device__ __host__ inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
     return make_hipDoubleComplex((double)z.x, (double)z.y);
 }
 
-__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
+__device__ __host__ inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
     float real = (p.x * q.x) + r.x;
     float imag = (q.x * p.y) + r.y;
 
@@ -290,7 +290,7 @@ __device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q
     return make_hipComplex(real, imag);
 }
 
-__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
+__device__ __host__ inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
                                                            hipDoubleComplex r) {
     double real = (p.x * q.x) + r.x;
     double imag = (q.x * p.y) + r.y;
diff --git a/include/hip/hcc_detail/hip_fp16.h b/include/hip/hcc_detail/hip_fp16.h
index 52abc1a004..0017d91e3f 100644
--- a/include/hip/hcc_detail/hip_fp16.h
+++ b/include/hip/hcc_detail/hip_fp16.h
@@ -375,7 +375,7 @@ THE SOFTWARE.
                 data = x.data;
                 return *this;
             }
- 
+
             // MANIPULATORS - DEVICE ONLY
             #if !defined(__HIP_NO_HALF_OPERATORS__)
                 __device__
@@ -519,1120 +519,1116 @@ THE SOFTWARE.
         };
         // END STRUCT __HALF2
 
-        namespace
+        inline
+        __host__ __device__
+        __half2 make_half2(__half x, __half y)
         {
-            inline
-            __host__ __device__
-            __half2 make_half2(__half x, __half y)
-            {
-                return __half2{x, y};
-            }
+            return __half2{x, y};
+        }
 
-            inline
-            __device__
-            __half __low2half(__half2 x)
-            {
-                return __half{__half_raw{static_cast<__half2_raw>(x).data.x}};
-            }
+        inline
+        __device__
+        __half __low2half(__half2 x)
+        {
+            return __half{__half_raw{static_cast<__half2_raw>(x).data.x}};
+        }
 
-            inline
-            __device__
-            __half __high2half(__half2 x)
-            {
-                return __half{__half_raw{static_cast<__half2_raw>(x).data.y}};
-            }
+        inline
+        __device__
+        __half __high2half(__half2 x)
+        {
+            return __half{__half_raw{static_cast<__half2_raw>(x).data.y}};
+        }
 
-            inline
-            __device__
-            __half2 __half2half2(__half x)
-            {
-                return __half2{x, x};
-            }
+        inline
+        __device__
+        __half2 __half2half2(__half x)
+        {
+            return __half2{x, x};
+        }
 
-            inline
-            __device__
-            __half2 __halves2half2(__half x, __half y)
-            {
-                return __half2{x, y};
-            }
+        inline
+        __device__
+        __half2 __halves2half2(__half x, __half y)
+        {
+            return __half2{x, y};
+        }
 
-            inline
-            __device__
-            __half2 __low2half2(__half2 x)
-            {
-                return __half2{
-                    _Float16_2{
-                        static_cast<__half2_raw>(x).data.x,
-                        static_cast<__half2_raw>(x).data.x}};
-            }
-
-            inline
-            __device__
-            __half2 __high2half2(__half2 x)
-            {
-                return __half2_raw{
-                    _Float16_2{
-                        static_cast<__half2_raw>(x).data.y,
-                        static_cast<__half2_raw>(x).data.y}};
-            }
-
-            inline
-            __device__
-            __half2 __lows2half2(__half2 x, __half2 y)
-            {
-                return __half2_raw{
-                    _Float16_2{
-                        static_cast<__half2_raw>(x).data.x,
-                        static_cast<__half2_raw>(y).data.x}};
-            }
-
-            inline
-            __device__
-            __half2 __highs2half2(__half2 x, __half2 y)
-            {
-                return __half2_raw{
-                    _Float16_2{
-                        static_cast<__half2_raw>(x).data.y,
-                        static_cast<__half2_raw>(y).data.y}};
-            }
-
-            inline
-            __device__
-            __half2 __lowhigh2highlow(__half2 x)
-            {
-                return __half2_raw{
-                    _Float16_2{
-                        static_cast<__half2_raw>(x).data.y,
-                        static_cast<__half2_raw>(x).data.x}};
-            }
-
-            // Bitcasts
-            inline
-            __device__
-            short __half_as_short(__half x)
-            {
-                return static_cast<__half_raw>(x).x;
-            }
-
-            inline
-            __device__
-            unsigned short __half_as_ushort(__half x)
-            {
-                return static_cast<__half_raw>(x).x;
-            }
-
-            inline
-            __device__
-            __half __short_as_half(short x)
-            {
-                __half_raw r; r.x = x;
-                return r;
-            }
-
-            inline
-            __device__
-            __half __ushort_as_half(unsigned short x)
-            {
-                __half_raw r; r.x = x;
-                return r;
-            }
-
-            // TODO: rounding behaviour is not correct.
-            // float -> half | half2
-            inline
-            __device__ __host__
-            __half __float2half(float x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__ __host__
-            __half __float2half_rn(float x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__ __host__
-            __half __float2half_rz(float x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__ __host__
-            __half __float2half_rd(float x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__ __host__
-            __half __float2half_ru(float x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__ __host__
-            __half2 __float2half2_rn(float x)
-            {
-                return __half2_raw{
-                    _Float16_2{
-                        static_cast<_Float16>(x), static_cast<_Float16>(x)}};
-            }
-            inline
-            __device__ __host__
-            __half2 __floats2half2_rn(float x, float y)
-            {
-                return __half2_raw{_Float16_2{
-                    static_cast<_Float16>(x), static_cast<_Float16>(y)}};
-            }
-            inline
-            __device__ __host__
-            __half2 __float22half2_rn(float2 x)
-            {
-                return __floats2half2_rn(x.x, x.y);
-            }
-
-            // half | half2 -> float
-            inline
-            __device__ __host__
-            float __half2float(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__ __host__
-            float __low2float(__half2 x)
-            {
-                return static_cast<__half2_raw>(x).data.x;
-            }
-            inline
-            __device__ __host__
-            float __high2float(__half2 x)
-            {
-                return static_cast<__half2_raw>(x).data.y;
-            }
-            inline
-            __device__ __host__
-            float2 __half22float2(__half2 x)
-            {
-                return make_float2(
+        inline
+        __device__
+        __half2 __low2half2(__half2 x)
+        {
+            return __half2{
+                _Float16_2{
                     static_cast<__half2_raw>(x).data.x,
-                    static_cast<__half2_raw>(x).data.y);
-            }
+                    static_cast<__half2_raw>(x).data.x}};
+        }
 
-            // half -> int
-            inline
-            __device__
-            int __half2int_rn(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            int __half2int_rz(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            int __half2int_rd(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            int __half2int_ru(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
+        inline
+        __device__
+        __half2 __high2half2(__half2 x)
+        {
+            return __half2_raw{
+                _Float16_2{
+                    static_cast<__half2_raw>(x).data.y,
+                    static_cast<__half2_raw>(x).data.y}};
+        }
 
-            // int -> half
-            inline
-            __device__
-            __half __int2half_rn(int x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __int2half_rz(int x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __int2half_rd(int x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __int2half_ru(int x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
+        inline
+        __device__
+        __half2 __lows2half2(__half2 x, __half2 y)
+        {
+            return __half2_raw{
+                _Float16_2{
+                    static_cast<__half2_raw>(x).data.x,
+                    static_cast<__half2_raw>(y).data.x}};
+        }
 
-            // half -> short
-            inline
-            __device__
-            short __half2short_rn(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            short __half2short_rz(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            short __half2short_rd(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            short __half2short_ru(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
+        inline
+        __device__
+        __half2 __highs2half2(__half2 x, __half2 y)
+        {
+            return __half2_raw{
+                _Float16_2{
+                    static_cast<__half2_raw>(x).data.y,
+                    static_cast<__half2_raw>(y).data.y}};
+        }
 
-            // short -> half
-            inline
-            __device__
-            __half __short2half_rn(short x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __short2half_rz(short x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __short2half_rd(short x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __short2half_ru(short x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
+        inline
+        __device__
+        __half2 __lowhigh2highlow(__half2 x)
+        {
+            return __half2_raw{
+                _Float16_2{
+                    static_cast<__half2_raw>(x).data.y,
+                    static_cast<__half2_raw>(x).data.x}};
+        }
 
-            // half -> long long
-            inline
-            __device__
-            long long __half2ll_rn(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            long long __half2ll_rz(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            long long __half2ll_rd(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            long long __half2ll_ru(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
+        // Bitcasts
+        inline
+        __device__
+        short __half_as_short(__half x)
+        {
+            return static_cast<__half_raw>(x).x;
+        }
 
-            // long long -> half
-            inline
-            __device__
-            __half __ll2half_rn(long long x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __ll2half_rz(long long x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __ll2half_rd(long long x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __ll2half_ru(long long x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
+        inline
+        __device__
+        unsigned short __half_as_ushort(__half x)
+        {
+            return static_cast<__half_raw>(x).x;
+        }
 
-            // half -> unsigned int
-            inline
-            __device__
-            unsigned int __half2uint_rn(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            unsigned int __half2uint_rz(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            unsigned int __half2uint_rd(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            unsigned int __half2uint_ru(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
+        inline
+        __device__
+        __half __short_as_half(short x)
+        {
+            __half_raw r; r.x = x;
+            return r;
+        }
 
-            // unsigned int -> half
-            inline
-            __device__
-            __half __uint2half_rn(unsigned int x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __uint2half_rz(unsigned int x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __uint2half_rd(unsigned int x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __uint2half_ru(unsigned int x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
+        inline
+        __device__
+        __half __ushort_as_half(unsigned short x)
+        {
+            __half_raw r; r.x = x;
+            return r;
+        }
 
-            // half -> unsigned short
-            inline
-            __device__
-            unsigned short __half2ushort_rn(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            unsigned short __half2ushort_rz(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            unsigned short __half2ushort_rd(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            unsigned short __half2ushort_ru(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
+        // TODO: rounding behaviour is not correct.
+        // float -> half | half2
+        inline
+        __device__ __host__
+        __half __float2half(float x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__ __host__
+        __half __float2half_rn(float x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__ __host__
+        __half __float2half_rz(float x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__ __host__
+        __half __float2half_rd(float x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__ __host__
+        __half __float2half_ru(float x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__ __host__
+        __half2 __float2half2_rn(float x)
+        {
+            return __half2_raw{
+                _Float16_2{static_cast<_Float16>(x), static_cast<_Float16>(x)}};
+        }
+        inline
+        __device__ __host__
+        __half2 __floats2half2_rn(float x, float y)
+        {
+            return __half2_raw{_Float16_2{
+                static_cast<_Float16>(x), static_cast<_Float16>(y)}};
+        }
+        inline
+        __device__ __host__
+        __half2 __float22half2_rn(float2 x)
+        {
+            return __floats2half2_rn(x.x, x.y);
+        }
 
-            // unsigned short -> half
-            inline
-            __device__
-            __half __ushort2half_rn(unsigned short x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __ushort2half_rz(unsigned short x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __ushort2half_rd(unsigned short x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __ushort2half_ru(unsigned short x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
+        // half | half2 -> float
+        inline
+        __device__ __host__
+        float __half2float(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__ __host__
+        float __low2float(__half2 x)
+        {
+            return static_cast<__half2_raw>(x).data.x;
+        }
+        inline
+        __device__ __host__
+        float __high2float(__half2 x)
+        {
+            return static_cast<__half2_raw>(x).data.y;
+        }
+        inline
+        __device__ __host__
+        float2 __half22float2(__half2 x)
+        {
+            return make_float2(
+                static_cast<__half2_raw>(x).data.x,
+                static_cast<__half2_raw>(x).data.y);
+        }
 
-            // half -> unsigned long long
-            inline
-            __device__
-            unsigned long long __half2ull_rn(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            unsigned long long __half2ull_rz(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            unsigned long long __half2ull_rd(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
-            inline
-            __device__
-            unsigned long long __half2ull_ru(__half x)
-            {
-                return static_cast<__half_raw>(x).data;
-            }
+        // half -> int
+        inline
+        __device__
+        int __half2int_rn(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        int __half2int_rz(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        int __half2int_rd(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        int __half2int_ru(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
 
-            // unsigned long long -> half
-            inline
-            __device__
-            __half __ull2half_rn(unsigned long long x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __ull2half_rz(unsigned long long x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __ull2half_rd(unsigned long long x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
-            inline
-            __device__
-            __half __ull2half_ru(unsigned long long x)
-            {
-                return __half_raw{static_cast<_Float16>(x)};
-            }
+        // int -> half
+        inline
+        __device__
+        __half __int2half_rn(int x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __int2half_rz(int x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __int2half_rd(int x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __int2half_ru(int x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
 
-            // Load primitives
-            inline
-            __device__
-            __half __ldg(const __half* ptr) { return *ptr; }
-            inline
-            __device__
-            __half __ldcg(const __half* ptr) { return *ptr; }
-            inline
-            __device__
-            __half __ldca(const __half* ptr) { return *ptr; }
-            inline
-            __device__
-            __half __ldcs(const __half* ptr) { return *ptr; }
+        // half -> short
+        inline
+        __device__
+        short __half2short_rn(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        short __half2short_rz(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        short __half2short_rd(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        short __half2short_ru(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
 
-            inline
-            __device__
-            __half2 __ldg(const __half2* ptr) { return *ptr; }
-            inline
-            __device__
-            __half2 __ldcg(const __half2* ptr) { return *ptr; }
-            inline
-            __device__
-            __half2 __ldca(const __half2* ptr) { return *ptr; }
-            inline
-            __device__
-            __half2 __ldcs(const __half2* ptr) { return *ptr; }
+        // short -> half
+        inline
+        __device__
+        __half __short2half_rn(short x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __short2half_rz(short x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __short2half_rd(short x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __short2half_ru(short x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
 
-            // Relations
-            inline
-            __device__
-            bool __heq(__half x, __half y)
-            {
-                return static_cast<__half_raw>(x).data ==
-                    static_cast<__half_raw>(y).data;
-            }
-            inline
-            __device__
-            bool __hne(__half x, __half y)
-            {
-                return static_cast<__half_raw>(x).data !=
-                    static_cast<__half_raw>(y).data;
-            }
-            inline
-            __device__
-            bool __hle(__half x, __half y)
-            {
-                return static_cast<__half_raw>(x).data <=
-                    static_cast<__half_raw>(y).data;
-            }
-            inline
-            __device__
-            bool __hge(__half x, __half y)
-            {
-                return static_cast<__half_raw>(x).data >=
-                    static_cast<__half_raw>(y).data;
-            }
-            inline
-            __device__
-            bool __hlt(__half x, __half y)
-            {
-                return static_cast<__half_raw>(x).data <
-                    static_cast<__half_raw>(y).data;
-            }
-            inline
-            __device__
-            bool __hgt(__half x, __half y)
-            {
-                return static_cast<__half_raw>(x).data >
-                    static_cast<__half_raw>(y).data;
-            }
-            inline
-            __device__
-            bool __hequ(__half x, __half y) { return __heq(x, y); }
-            inline
-            __device__
-            bool __hneu(__half x, __half y) { return __hne(x, y); }
-            inline
-            __device__
-            bool __hleu(__half x, __half y) { return __hle(x, y); }
-            inline
-            __device__
-            bool __hgeu(__half x, __half y) { return __hge(x, y); }
-            inline
-            __device__
-            bool __hltu(__half x, __half y) { return __hlt(x, y); }
-            inline
-            __device__
-            bool __hgtu(__half x, __half y) { return __hgt(x, y); }
+        // half -> long long
+        inline
+        __device__
+        long long __half2ll_rn(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        long long __half2ll_rz(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        long long __half2ll_rd(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        long long __half2ll_ru(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
 
-            inline
-            __device__
-            __half2 __heq2(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(x).data ==
-                    static_cast<__half2_raw>(y).data;
-                return __builtin_convertvector(-r, _Float16_2);
-            }
-            inline
-            __device__
-            __half2 __hne2(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(x).data !=
-                    static_cast<__half2_raw>(y).data;
-                return __builtin_convertvector(-r, _Float16_2);
-            }
-            inline
-            __device__
-            __half2 __hle2(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(x).data <=
-                    static_cast<__half2_raw>(y).data;
-                return __builtin_convertvector(-r, _Float16_2);
-            }
-            inline
-            __device__
-            __half2 __hge2(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(x).data >=
-                    static_cast<__half2_raw>(y).data;
-                return __builtin_convertvector(-r, _Float16_2);
-            }
-            inline
-            __device__
-            __half2 __hlt2(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(x).data <
-                    static_cast<__half2_raw>(y).data;
-                return __builtin_convertvector(-r, _Float16_2);
-            }
-            inline
-            __device__
-            __half2 __hgt2(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(x).data >
-                    static_cast<__half2_raw>(y).data;
-                return __builtin_convertvector(-r, _Float16_2);
-            }
-            inline
-            __device__
-            __half2 __hequ2(__half2 x, __half2 y) { return __heq2(x, y); }
-            inline
-            __device__
-            __half2 __hneu2(__half2 x, __half2 y) { return __hne2(x, y); }
-            inline
-            __device__
-            __half2 __hleu2(__half2 x, __half2 y) { return __hle2(x, y); }
-            inline
-            __device__
-            __half2 __hgeu2(__half2 x, __half2 y) { return __hge2(x, y); }
-            inline
-            __device__
-            __half2 __hltu2(__half2 x, __half2 y) { return __hlt2(x, y); }
-            inline
-            __device__
-            __half2 __hgtu2(__half2 x, __half2 y) { return __hgt2(x, y); }
+        // long long -> half
+        inline
+        __device__
+        __half __ll2half_rn(long long x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __ll2half_rz(long long x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __ll2half_rd(long long x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __ll2half_ru(long long x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
 
-            inline
-            __device__
-            bool __hbeq2(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(__heq2(x, y));
-                return r.data.x != 0 && r.data.y != 0;
-            }
-            inline
-            __device__
-            bool __hbne2(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(__hne2(x, y));
-                return r.data.x != 0 && r.data.y != 0;
-            }
-            inline
-            __device__
-            bool __hble2(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(__hle2(x, y));
-                return r.data.x != 0 && r.data.y != 0;
-            }
-            inline
-            __device__
-            bool __hbge2(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(__hge2(x, y));
-                return r.data.x != 0 && r.data.y != 0;
-            }
-            inline
-            __device__
-            bool __hblt2(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(__hlt2(x, y));
-                return r.data.x != 0 && r.data.y != 0;
-            }
-            inline
-            __device__
-            bool __hbgt2(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(__hgt2(x, y));
-                return r.data.x != 0 && r.data.y != 0;
-            }
-            inline
-            __device__
-            bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); }
-            inline
-            __device__
-            bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); }
-            inline
-            __device__
-            bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); }
-            inline
-            __device__
-            bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); }
-            inline
-            __device__
-            bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); }
-            inline
-            __device__
-            bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); }
+        // half -> unsigned int
+        inline
+        __device__
+        unsigned int __half2uint_rn(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        unsigned int __half2uint_rz(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        unsigned int __half2uint_rd(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        unsigned int __half2uint_ru(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
 
-            // Arithmetic
-            inline
-            __device__
-            __half __clamp_01(__half x)
-            {
-                auto r = static_cast<__half_raw>(x);
+        // unsigned int -> half
+        inline
+        __device__
+        __half __uint2half_rn(unsigned int x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __uint2half_rz(unsigned int x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __uint2half_rd(unsigned int x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __uint2half_ru(unsigned int x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
 
-                if (__hlt(x, __half_raw{0})) return __half_raw{0};
-                if (__hlt(__half_raw{1}, x)) return __half_raw{1};
-                return r;
-            }
+        // half -> unsigned short
+        inline
+        __device__
+        unsigned short __half2ushort_rn(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        unsigned short __half2ushort_rz(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        unsigned short __half2ushort_rd(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        unsigned short __half2ushort_ru(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
 
-            inline
-            __device__
-            __half __hadd(__half x, __half y)
-            {
-                return __half_raw{
-                    static_cast<__half_raw>(x).data +
-                    static_cast<__half_raw>(y).data};
-            }
-            inline
-            __device__
-            __half __hsub(__half x, __half y)
-            {
-                return __half_raw{
-                    static_cast<__half_raw>(x).data -
-                    static_cast<__half_raw>(y).data};
-            }
-            inline
-            __device__
-            __half __hmul(__half x, __half y)
-            {
-                return __half_raw{
-                    static_cast<__half_raw>(x).data *
-                    static_cast<__half_raw>(y).data};
-            }
-            inline
-            __device__
-            __half __hadd_sat(__half x, __half y)
-            {
-                return __clamp_01(__hadd(x, y));
-            }
-            inline
-            __device__
-            __half __hsub_sat(__half x, __half y)
-            {
-                return __clamp_01(__hsub(x, y));
-            }
-            inline
-            __device__
-            __half __hmul_sat(__half x, __half y)
-            {
-                return __clamp_01(__hmul(x, y));
-            }
-            inline
-            __device__
-            __half __hfma(__half x, __half y, __half z)
-            {
-                return __half_raw{__ocml_fma_f16(
-                    static_cast<__half_raw>(x).data,
-                    static_cast<__half_raw>(y).data,
-                    static_cast<__half_raw>(z).data)};
-            }
-            inline
-            __device__
-            __half __hfma_sat(__half x, __half y, __half z)
-            {
-                return __clamp_01(__hfma(x, y, z));
-            }
-            inline
-            __device__
-            __half __hdiv(__half x, __half y)
-            {
-                return __half_raw{
-                    static_cast<__half_raw>(x).data /
-                    static_cast<__half_raw>(y).data};
-            }
+        // unsigned short -> half
+        inline
+        __device__
+        __half __ushort2half_rn(unsigned short x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __ushort2half_rz(unsigned short x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __ushort2half_rd(unsigned short x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __ushort2half_ru(unsigned short x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
 
-            inline
-            __device__
-            __half2 __hadd2(__half2 x, __half2 y)
-            {
-                return __half2_raw{
-                    static_cast<__half2_raw>(x).data +
-                    static_cast<__half2_raw>(y).data};
-            }
-            inline
-            __device__
-            __half2 __hsub2(__half2 x, __half2 y)
-            {
-                return __half2_raw{
-                    static_cast<__half2_raw>(x).data -
-                    static_cast<__half2_raw>(y).data};
-            }
-            inline
-            __device__
-            __half2 __hmul2(__half2 x, __half2 y)
-            {
-                return __half2_raw{
-                    static_cast<__half2_raw>(x).data *
-                    static_cast<__half2_raw>(y).data};
-            }
-            inline
-            __device__
-            __half2 __hadd2_sat(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(__hadd2(x, y));
-                return __half2{
-                    __clamp_01(__half_raw{r.data.x}),
-                    __clamp_01(__half_raw{r.data.y})};
-            }
-            inline
-            __device__
-            __half2 __hsub2_sat(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(__hsub2(x, y));
-                return __half2{
-                    __clamp_01(__half_raw{r.data.x}),
-                    __clamp_01(__half_raw{r.data.y})};
-            }
-            inline
-            __device__
-            __half2 __hmul2_sat(__half2 x, __half2 y)
-            {
-                auto r = static_cast<__half2_raw>(__hmul2(x, y));
-                return __half2{
-                    __clamp_01(__half_raw{r.data.x}),
-                    __clamp_01(__half_raw{r.data.y})};
-            }
-            inline
-            __device__
-            __half2 __hfma2(__half2 x, __half2 y, __half2 z)
-            {
-                return __half2_raw{__ocml_fma_2f16(x, y, z)};
-            }
-            inline
-            __device__
-            __half2 __hfma2_sat(__half2 x, __half2 y, __half2 z)
-            {
-                auto r = static_cast<__half2_raw>(__hfma2(x, y, z));
-                return __half2{
-                    __clamp_01(__half_raw{r.data.x}),
-                    __clamp_01(__half_raw{r.data.y})};
-            }
-            inline
-            __device__
-            __half2 __h2div(__half2 x, __half2 y)
-            {
-                return __half2_raw{
-                    static_cast<__half2_raw>(x).data /
-                    static_cast<__half2_raw>(y).data};
-            }
+        // half -> unsigned long long
+        inline
+        __device__
+        unsigned long long __half2ull_rn(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        unsigned long long __half2ull_rz(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        unsigned long long __half2ull_rd(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
+        inline
+        __device__
+        unsigned long long __half2ull_ru(__half x)
+        {
+            return static_cast<__half_raw>(x).data;
+        }
 
-            // Math functions
-            #if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__
-            inline
-            __device__
-            float amd_mixed_dot(__half2 a, __half2 b, float c, bool saturate) {
-                return __ockl_fdot2(static_cast<__half2_raw>(a).data,
-                                    static_cast<__half2_raw>(b).data,
-                                    c, saturate);
-            }
-            #endif
-            inline
-            __device__
-            __half htrunc(__half x)
-            {
-                return __half_raw{
-                    __ocml_trunc_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hceil(__half x)
-            {
-                return __half_raw{
-                    __ocml_ceil_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hfloor(__half x)
-            {
-                return __half_raw{
-                   __ocml_floor_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hrint(__half x)
-            {
-                return __half_raw{
-                    __ocml_rint_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hsin(__half x)
-            {
-                return __half_raw{
-                    __ocml_sin_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hcos(__half x)
-            {
-                return __half_raw{
-                    __ocml_cos_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hexp(__half x)
-            {
-                return __half_raw{
-                    __ocml_exp_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hexp2(__half x)
-            {
-                return __half_raw{
-                    __ocml_exp2_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hexp10(__half x)
-            {
-                return __half_raw{
-                    __ocml_exp10_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hlog2(__half x)
-            {
-                return __half_raw{
-                    __ocml_log2_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hlog(__half x)
-            {
-                return __half_raw{
-                    __ocml_log_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hlog10(__half x)
-            {
-                return __half_raw{
-                    __ocml_log10_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hrcp(__half x)
-            {
-                return __half_raw{
-                    __llvm_amdgcn_rcp_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hrsqrt(__half x)
-            {
-                return __half_raw{
-                    __ocml_rsqrt_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            __half hsqrt(__half x)
-            {
-                return __half_raw{
-                    __ocml_sqrt_f16(static_cast<__half_raw>(x).data)};
-            }
-            inline
-            __device__
-            bool __hisinf(__half x)
-            {
-                return __ocml_isinf_f16(static_cast<__half_raw>(x).data);
-            }
-            inline
-            __device__
-            bool __hisnan(__half x)
-            {
-                return __ocml_isnan_f16(static_cast<__half_raw>(x).data);
-            }
-            inline
-            __device__
-            __half __hneg(__half x)
-            {
-                return __half_raw{-static_cast<__half_raw>(x).data};
-            }
+        // unsigned long long -> half
+        inline
+        __device__
+        __half __ull2half_rn(unsigned long long x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __ull2half_rz(unsigned long long x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __ull2half_rd(unsigned long long x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
+        inline
+        __device__
+        __half __ull2half_ru(unsigned long long x)
+        {
+            return __half_raw{static_cast<_Float16>(x)};
+        }
 
-            inline
-            __device__
-            __half2 h2trunc(__half2 x)
-            {
-                return __half2_raw{__ocml_trunc_2f16(x)};
-            }
-            inline
-            __device__
-            __half2 h2ceil(__half2 x)
-            {
-                return __half2_raw{__ocml_ceil_2f16(x)};
-            }
-            inline
-            __device__
-            __half2 h2floor(__half2 x)
-            {
-                return __half2_raw{__ocml_floor_2f16(x)};
-            }
-            inline
-            __device__
-            __half2 h2rint(__half2 x)
-            {
-                return __half2_raw{__ocml_rint_2f16(x)};
-            }
-            inline
-            __device__
-            __half2 h2sin(__half2 x)
-            {
-                return __half2_raw{__ocml_sin_2f16(x)};
-            }
-            inline
-            __device__
-            __half2 h2cos(__half2 x)
-            {
-                return __half2_raw{__ocml_cos_2f16(x)};
-            }
-            inline
-            __device__
-            __half2 h2exp(__half2 x)
-            {
-                return __half2_raw{__ocml_exp_2f16(x)};
-            }
-            inline
-            __device__
-            __half2 h2exp2(__half2 x)
-            {
-                return __half2_raw{__ocml_exp2_2f16(x)};
-            }
-            inline
-            __device__
-            __half2 h2exp10(__half2 x)
-            {
-                return __half2_raw{__ocml_exp10_2f16(x)};
-            }
-            inline
-            __device__
-            __half2 h2log2(__half2 x)
-            {
-                return __half2_raw{__ocml_log2_2f16(x)};
-            }
-            inline
-            __device__
-            __half2 h2log(__half2 x) { return __ocml_log_2f16(x); }
-            inline
-            __device__
-            __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); }
-            inline
-            __device__
-            __half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); }
-            inline
-            __device__
-            __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); }
-            inline
-            __device__
-            __half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); }
-            inline
-            __device__
-            __half2 __hisinf2(__half2 x)
-            {
-                auto r = __ocml_isinf_2f16(x);
-                return __half2_raw{_Float16_2{
-                    static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
-            }
-            inline
-            __device__
-            __half2 __hisnan2(__half2 x)
-            {
-                auto r = __ocml_isnan_2f16(x);
-                return __half2_raw{_Float16_2{
-                    static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
-            }
-            inline
-            __device__
-            __half2 __hneg2(__half2 x)
-            {
-                return __half2_raw{-static_cast<__half2_raw>(x).data};
-            }
-        } // Anonymous namespace.
+        // Load primitives
+        inline
+        __device__
+        __half __ldg(const __half* ptr) { return *ptr; }
+        inline
+        __device__
+        __half __ldcg(const __half* ptr) { return *ptr; }
+        inline
+        __device__
+        __half __ldca(const __half* ptr) { return *ptr; }
+        inline
+        __device__
+        __half __ldcs(const __half* ptr) { return *ptr; }
+
+        inline
+        __device__
+        __half2 __ldg(const __half2* ptr) { return *ptr; }
+        inline
+        __device__
+        __half2 __ldcg(const __half2* ptr) { return *ptr; }
+        inline
+        __device__
+        __half2 __ldca(const __half2* ptr) { return *ptr; }
+        inline
+        __device__
+        __half2 __ldcs(const __half2* ptr) { return *ptr; }
+
+        // Relations
+        inline
+        __device__
+        bool __heq(__half x, __half y)
+        {
+            return static_cast<__half_raw>(x).data ==
+                static_cast<__half_raw>(y).data;
+        }
+        inline
+        __device__
+        bool __hne(__half x, __half y)
+        {
+            return static_cast<__half_raw>(x).data !=
+                static_cast<__half_raw>(y).data;
+        }
+        inline
+        __device__
+        bool __hle(__half x, __half y)
+        {
+            return static_cast<__half_raw>(x).data <=
+                static_cast<__half_raw>(y).data;
+        }
+        inline
+        __device__
+        bool __hge(__half x, __half y)
+        {
+            return static_cast<__half_raw>(x).data >=
+                static_cast<__half_raw>(y).data;
+        }
+        inline
+        __device__
+        bool __hlt(__half x, __half y)
+        {
+            return static_cast<__half_raw>(x).data <
+                static_cast<__half_raw>(y).data;
+        }
+        inline
+        __device__
+        bool __hgt(__half x, __half y)
+        {
+            return static_cast<__half_raw>(x).data >
+                static_cast<__half_raw>(y).data;
+        }
+        inline
+        __device__
+        bool __hequ(__half x, __half y) { return __heq(x, y); }
+        inline
+        __device__
+        bool __hneu(__half x, __half y) { return __hne(x, y); }
+        inline
+        __device__
+        bool __hleu(__half x, __half y) { return __hle(x, y); }
+        inline
+        __device__
+        bool __hgeu(__half x, __half y) { return __hge(x, y); }
+        inline
+        __device__
+        bool __hltu(__half x, __half y) { return __hlt(x, y); }
+        inline
+        __device__
+        bool __hgtu(__half x, __half y) { return __hgt(x, y); }
+
+        inline
+        __device__
+        __half2 __heq2(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(x).data ==
+                static_cast<__half2_raw>(y).data;
+            return __builtin_convertvector(-r, _Float16_2);
+        }
+        inline
+        __device__
+        __half2 __hne2(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(x).data !=
+                static_cast<__half2_raw>(y).data;
+            return __builtin_convertvector(-r, _Float16_2);
+        }
+        inline
+        __device__
+        __half2 __hle2(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(x).data <=
+                static_cast<__half2_raw>(y).data;
+            return __builtin_convertvector(-r, _Float16_2);
+        }
+        inline
+        __device__
+        __half2 __hge2(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(x).data >=
+                static_cast<__half2_raw>(y).data;
+            return __builtin_convertvector(-r, _Float16_2);
+        }
+        inline
+        __device__
+        __half2 __hlt2(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(x).data <
+                static_cast<__half2_raw>(y).data;
+            return __builtin_convertvector(-r, _Float16_2);
+        }
+        inline
+        __device__
+        __half2 __hgt2(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(x).data >
+                static_cast<__half2_raw>(y).data;
+            return __builtin_convertvector(-r, _Float16_2);
+        }
+        inline
+        __device__
+        __half2 __hequ2(__half2 x, __half2 y) { return __heq2(x, y); }
+        inline
+        __device__
+        __half2 __hneu2(__half2 x, __half2 y) { return __hne2(x, y); }
+        inline
+        __device__
+        __half2 __hleu2(__half2 x, __half2 y) { return __hle2(x, y); }
+        inline
+        __device__
+        __half2 __hgeu2(__half2 x, __half2 y) { return __hge2(x, y); }
+        inline
+        __device__
+        __half2 __hltu2(__half2 x, __half2 y) { return __hlt2(x, y); }
+        inline
+        __device__
+        __half2 __hgtu2(__half2 x, __half2 y) { return __hgt2(x, y); }
+
+        inline
+        __device__
+        bool __hbeq2(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(__heq2(x, y));
+            return r.data.x != 0 && r.data.y != 0;
+        }
+        inline
+        __device__
+        bool __hbne2(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(__hne2(x, y));
+            return r.data.x != 0 && r.data.y != 0;
+        }
+        inline
+        __device__
+        bool __hble2(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(__hle2(x, y));
+            return r.data.x != 0 && r.data.y != 0;
+        }
+        inline
+        __device__
+        bool __hbge2(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(__hge2(x, y));
+            return r.data.x != 0 && r.data.y != 0;
+        }
+        inline
+        __device__
+        bool __hblt2(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(__hlt2(x, y));
+            return r.data.x != 0 && r.data.y != 0;
+        }
+        inline
+        __device__
+        bool __hbgt2(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(__hgt2(x, y));
+            return r.data.x != 0 && r.data.y != 0;
+        }
+        inline
+        __device__
+        bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); }
+        inline
+        __device__
+        bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); }
+        inline
+        __device__
+        bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); }
+        inline
+        __device__
+        bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); }
+        inline
+        __device__
+        bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); }
+        inline
+        __device__
+        bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); }
+
+        // Arithmetic
+        inline
+        __device__
+        __half __clamp_01(__half x)
+        {
+            auto r = static_cast<__half_raw>(x);
+
+            if (__hlt(x, __half_raw{0})) return __half_raw{0};
+            if (__hlt(__half_raw{1}, x)) return __half_raw{1};
+            return r;
+        }
+
+        inline
+        __device__
+        __half __hadd(__half x, __half y)
+        {
+            return __half_raw{
+                static_cast<__half_raw>(x).data +
+                static_cast<__half_raw>(y).data};
+        }
+        inline
+        __device__
+        __half __hsub(__half x, __half y)
+        {
+            return __half_raw{
+                static_cast<__half_raw>(x).data -
+                static_cast<__half_raw>(y).data};
+        }
+        inline
+        __device__
+        __half __hmul(__half x, __half y)
+        {
+            return __half_raw{
+                static_cast<__half_raw>(x).data *
+                static_cast<__half_raw>(y).data};
+        }
+        inline
+        __device__
+        __half __hadd_sat(__half x, __half y)
+        {
+            return __clamp_01(__hadd(x, y));
+        }
+        inline
+        __device__
+        __half __hsub_sat(__half x, __half y)
+        {
+            return __clamp_01(__hsub(x, y));
+        }
+        inline
+        __device__
+        __half __hmul_sat(__half x, __half y)
+        {
+            return __clamp_01(__hmul(x, y));
+        }
+        inline
+        __device__
+        __half __hfma(__half x, __half y, __half z)
+        {
+            return __half_raw{__ocml_fma_f16(
+                static_cast<__half_raw>(x).data,
+                static_cast<__half_raw>(y).data,
+                static_cast<__half_raw>(z).data)};
+        }
+        inline
+        __device__
+        __half __hfma_sat(__half x, __half y, __half z)
+        {
+            return __clamp_01(__hfma(x, y, z));
+        }
+        inline
+        __device__
+        __half __hdiv(__half x, __half y)
+        {
+            return __half_raw{
+                static_cast<__half_raw>(x).data /
+                static_cast<__half_raw>(y).data};
+        }
+
+        inline
+        __device__
+        __half2 __hadd2(__half2 x, __half2 y)
+        {
+            return __half2_raw{
+                static_cast<__half2_raw>(x).data +
+                static_cast<__half2_raw>(y).data};
+        }
+        inline
+        __device__
+        __half2 __hsub2(__half2 x, __half2 y)
+        {
+            return __half2_raw{
+                static_cast<__half2_raw>(x).data -
+                static_cast<__half2_raw>(y).data};
+        }
+        inline
+        __device__
+        __half2 __hmul2(__half2 x, __half2 y)
+        {
+            return __half2_raw{
+                static_cast<__half2_raw>(x).data *
+                static_cast<__half2_raw>(y).data};
+        }
+        inline
+        __device__
+        __half2 __hadd2_sat(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(__hadd2(x, y));
+            return __half2{
+                __clamp_01(__half_raw{r.data.x}),
+                __clamp_01(__half_raw{r.data.y})};
+        }
+        inline
+        __device__
+        __half2 __hsub2_sat(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(__hsub2(x, y));
+            return __half2{
+                __clamp_01(__half_raw{r.data.x}),
+                __clamp_01(__half_raw{r.data.y})};
+        }
+        inline
+        __device__
+        __half2 __hmul2_sat(__half2 x, __half2 y)
+        {
+            auto r = static_cast<__half2_raw>(__hmul2(x, y));
+            return __half2{
+                __clamp_01(__half_raw{r.data.x}),
+                __clamp_01(__half_raw{r.data.y})};
+        }
+        inline
+        __device__
+        __half2 __hfma2(__half2 x, __half2 y, __half2 z)
+        {
+            return __half2_raw{__ocml_fma_2f16(x, y, z)};
+        }
+        inline
+        __device__
+        __half2 __hfma2_sat(__half2 x, __half2 y, __half2 z)
+        {
+            auto r = static_cast<__half2_raw>(__hfma2(x, y, z));
+            return __half2{
+                __clamp_01(__half_raw{r.data.x}),
+                __clamp_01(__half_raw{r.data.y})};
+        }
+        inline
+        __device__
+        __half2 __h2div(__half2 x, __half2 y)
+        {
+            return __half2_raw{
+                static_cast<__half2_raw>(x).data /
+                static_cast<__half2_raw>(y).data};
+        }
+
+        // Math functions
+        #if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__
+        inline
+        __device__
+        float amd_mixed_dot(__half2 a, __half2 b, float c, bool saturate) {
+            return __ockl_fdot2(static_cast<__half2_raw>(a).data,
+                                static_cast<__half2_raw>(b).data,
+                                c, saturate);
+        }
+        #endif
+        inline
+        __device__
+        __half htrunc(__half x)
+        {
+            return __half_raw{
+                __ocml_trunc_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hceil(__half x)
+        {
+            return __half_raw{
+                __ocml_ceil_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hfloor(__half x)
+        {
+            return __half_raw{
+                __ocml_floor_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hrint(__half x)
+        {
+            return __half_raw{
+                __ocml_rint_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hsin(__half x)
+        {
+            return __half_raw{
+                __ocml_sin_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hcos(__half x)
+        {
+            return __half_raw{
+                __ocml_cos_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hexp(__half x)
+        {
+            return __half_raw{
+                __ocml_exp_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hexp2(__half x)
+        {
+            return __half_raw{
+                __ocml_exp2_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hexp10(__half x)
+        {
+            return __half_raw{
+                __ocml_exp10_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hlog2(__half x)
+        {
+            return __half_raw{
+                __ocml_log2_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hlog(__half x)
+        {
+            return __half_raw{
+                __ocml_log_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hlog10(__half x)
+        {
+            return __half_raw{
+                __ocml_log10_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hrcp(__half x)
+        {
+            return __half_raw{
+                __llvm_amdgcn_rcp_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hrsqrt(__half x)
+        {
+            return __half_raw{
+                __ocml_rsqrt_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        __half hsqrt(__half x)
+        {
+            return __half_raw{
+                __ocml_sqrt_f16(static_cast<__half_raw>(x).data)};
+        }
+        inline
+        __device__
+        bool __hisinf(__half x)
+        {
+            return __ocml_isinf_f16(static_cast<__half_raw>(x).data);
+        }
+        inline
+        __device__
+        bool __hisnan(__half x)
+        {
+            return __ocml_isnan_f16(static_cast<__half_raw>(x).data);
+        }
+        inline
+        __device__
+        __half __hneg(__half x)
+        {
+            return __half_raw{-static_cast<__half_raw>(x).data};
+        }
+
+        inline
+        __device__
+        __half2 h2trunc(__half2 x)
+        {
+            return __half2_raw{__ocml_trunc_2f16(x)};
+        }
+        inline
+        __device__
+        __half2 h2ceil(__half2 x)
+        {
+            return __half2_raw{__ocml_ceil_2f16(x)};
+        }
+        inline
+        __device__
+        __half2 h2floor(__half2 x)
+        {
+            return __half2_raw{__ocml_floor_2f16(x)};
+        }
+        inline
+        __device__
+        __half2 h2rint(__half2 x)
+        {
+            return __half2_raw{__ocml_rint_2f16(x)};
+        }
+        inline
+        __device__
+        __half2 h2sin(__half2 x)
+        {
+            return __half2_raw{__ocml_sin_2f16(x)};
+        }
+        inline
+        __device__
+        __half2 h2cos(__half2 x)
+        {
+            return __half2_raw{__ocml_cos_2f16(x)};
+        }
+        inline
+        __device__
+        __half2 h2exp(__half2 x)
+        {
+            return __half2_raw{__ocml_exp_2f16(x)};
+        }
+        inline
+        __device__
+        __half2 h2exp2(__half2 x)
+        {
+            return __half2_raw{__ocml_exp2_2f16(x)};
+        }
+        inline
+        __device__
+        __half2 h2exp10(__half2 x)
+        {
+            return __half2_raw{__ocml_exp10_2f16(x)};
+        }
+        inline
+        __device__
+        __half2 h2log2(__half2 x)
+        {
+            return __half2_raw{__ocml_log2_2f16(x)};
+        }
+        inline
+        __device__
+        __half2 h2log(__half2 x) { return __ocml_log_2f16(x); }
+        inline
+        __device__
+        __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); }
+        inline
+        __device__
+        __half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); }
+        inline
+        __device__
+        __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); }
+        inline
+        __device__
+        __half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); }
+        inline
+        __device__
+        __half2 __hisinf2(__half2 x)
+        {
+            auto r = __ocml_isinf_2f16(x);
+            return __half2_raw{_Float16_2{
+                static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
+        }
+        inline
+        __device__
+        __half2 __hisnan2(__half2 x)
+        {
+            auto r = __ocml_isnan_2f16(x);
+            return __half2_raw{_Float16_2{
+                static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
+        }
+        inline
+        __device__
+        __half2 __hneg2(__half2 x)
+        {
+            return __half2_raw{-static_cast<__half2_raw>(x).data};
+        }
 
         #if !defined(HIP_NO_HALF)
             using half = __half;
diff --git a/include/hip/hcc_detail/hip_fp16_gcc.h b/include/hip/hcc_detail/hip_fp16_gcc.h
index 9b31f9e3ce..3d8752e619 100644
--- a/include/hip/hcc_detail/hip_fp16_gcc.h
+++ b/include/hip/hcc_detail/hip_fp16_gcc.h
@@ -95,160 +95,157 @@ struct __half2_raw {
     };
     // END STRUCT __HALF2
 
-    namespace
+    inline
+    unsigned short __internal_float2half(
+        float flt, unsigned int& sgn, unsigned int& rem)
     {
-        inline
-        unsigned short __internal_float2half(
-            float flt, unsigned int& sgn, unsigned int& rem)
-        {
-            unsigned int x{};
-            std::memcpy(&x, &flt, sizeof(flt));
+        unsigned int x{};
+        std::memcpy(&x, &flt, sizeof(flt));
 
-            unsigned int u = (x & 0x7fffffffU);
-            sgn = ((x >> 16) & 0x8000U);
+        unsigned int u = (x & 0x7fffffffU);
+        sgn = ((x >> 16) & 0x8000U);
 
-            // NaN/+Inf/-Inf
-            if (u >= 0x7f800000U) {
-                rem = 0;
-                return static_cast<unsigned short>(
-                    (u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU);
+        // NaN/+Inf/-Inf
+        if (u >= 0x7f800000U) {
+            rem = 0;
+            return static_cast<unsigned short>(
+                (u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU);
+        }
+        // Overflows
+        if (u > 0x477fefffU) {
+            rem = 0x80000000U;
+            return static_cast<unsigned short>(sgn | 0x7bffU);
+        }
+        // Normal numbers
+        if (u >= 0x38800000U) {
+            rem = u << 19;
+            u -= 0x38000000U;
+            return static_cast<unsigned short>(sgn | (u >> 13));
+        }
+        // +0/-0
+        if (u < 0x33000001U) {
+            rem = u;
+            return static_cast<unsigned short>(sgn);
+        }
+        // Denormal numbers
+        unsigned int exponent = u >> 23;
+        unsigned int mantissa = (u & 0x7fffffU);
+        unsigned int shift = 0x7eU - exponent;
+        mantissa |= 0x800000U;
+        rem = mantissa << (32 - shift);
+        return static_cast<unsigned short>(sgn | (mantissa >> shift));
+    }
+
+    inline
+    __half __float2half(float x)
+    {
+        __half_raw r;
+        unsigned int sgn{};
+        unsigned int rem{};
+        r.x = __internal_float2half(x, sgn, rem);
+        if (rem > 0x80000000U || (rem == 0x80000000U && (r.x & 0x1))) ++r.x;
+
+        return r;
+    }
+
+    inline
+    __half __float2half_rn(float x) { return __float2half(x); }
+
+    inline
+    __half __float2half_rz(float x)
+    {
+        __half_raw r;
+        unsigned int sgn{};
+        unsigned int rem{};
+        r.x = __internal_float2half(x, sgn, rem);
+
+        return r;
+    }
+
+    inline
+    __half __float2half_rd(float x)
+    {
+        __half_raw r;
+        unsigned int sgn{};
+        unsigned int rem{};
+        r.x = __internal_float2half(x, sgn, rem);
+        if (rem && sgn) ++r.x;
+
+        return r;
+    }
+
+    inline
+    __half __float2half_ru(float x)
+    {
+        __half_raw r;
+        unsigned int sgn{};
+        unsigned int rem{};
+        r.x = __internal_float2half(x, sgn, rem);
+        if (rem && !sgn) ++r.x;
+
+        return r;
+    }
+
+    inline
+    __half2 __float2half2_rn(float x)
+    {
+        return __half2{__float2half_rn(x), __float2half_rn(x)};
+    }
+
+    inline
+    __half2 __floats2half2_rn(float x, float y)
+    {
+        return __half2{__float2half_rn(x), __float2half_rn(y)};
+    }
+
+    inline
+    float __internal_half2float(unsigned short x)
+    {
+        unsigned int sign = ((x >> 15) & 1);
+        unsigned int exponent = ((x >> 10) & 0x1f);
+        unsigned int mantissa = ((x & 0x3ff) << 13);
+
+        if (exponent == 0x1fU) { /* NaN or Inf */
+            mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0);
+            exponent = 0xffU;
+        } else if (!exponent) { /* Denorm or Zero */
+            if (mantissa) {
+                unsigned int msb;
+                exponent = 0x71U;
+                do {
+                    msb = (mantissa & 0x400000U);
+                    mantissa <<= 1; /* normalize */
+                    --exponent;
+                } while (!msb);
+                mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
             }
-            // Overflows
-            if (u > 0x477fefffU) {
-                rem = 0x80000000U;
-                return static_cast<unsigned short>(sgn | 0x7bffU);
-            }
-            // Normal numbers
-            if (u >= 0x38800000U) {
-                rem = u << 19;
-                u -= 0x38000000U;
-                return static_cast<unsigned short>(sgn | (u >> 13));
-            }
-            // +0/-0
-            if (u < 0x33000001U) {
-                rem = u;
-                return static_cast<unsigned short>(sgn);
-            }
-            // Denormal numbers
-            unsigned int exponent = u >> 23;
-            unsigned int mantissa = (u & 0x7fffffU);
-            unsigned int shift = 0x7eU - exponent;
-            mantissa |= 0x800000U;
-            rem = mantissa << (32 - shift);
-            return static_cast<unsigned short>(sgn | (mantissa >> shift));
+        } else {
+            exponent += 0x70U;
         }
+        unsigned int u = ((sign << 31) | (exponent << 23) | mantissa);
+        float f;
+        std::memcpy(&f, &u, sizeof(u));
 
-        inline
-        __half __float2half(float x)
-        {
-            __half_raw r;
-            unsigned int sgn{};
-            unsigned int rem{};
-            r.x = __internal_float2half(x, sgn, rem);
-            if (rem > 0x80000000U || (rem == 0x80000000U && (r.x & 0x1))) ++r.x;
+        return f;
+    }
 
-            return r;
-        }
+    inline
+    float __half2float(__half x)
+    {
+        return __internal_half2float(static_cast<__half_raw>(x).x);
+    }
 
-        inline
-        __half __float2half_rn(float x) { return __float2half(x); }
+    inline
+    float __low2float(__half2 x)
+    {
+        return __internal_half2float(static_cast<__half2_raw>(x).x);
+    }
 
-        inline
-        __half __float2half_rz(float x)
-        {
-            __half_raw r;
-            unsigned int sgn{};
-            unsigned int rem{};
-            r.x = __internal_float2half(x, sgn, rem);
-
-            return r;
-        }
-
-        inline
-        __half __float2half_rd(float x)
-        {
-            __half_raw r;
-            unsigned int sgn{};
-            unsigned int rem{};
-            r.x = __internal_float2half(x, sgn, rem);
-            if (rem && sgn) ++r.x;
-
-            return r;
-        }
-
-        inline
-        __half __float2half_ru(float x)
-        {
-            __half_raw r;
-            unsigned int sgn{};
-            unsigned int rem{};
-            r.x = __internal_float2half(x, sgn, rem);
-            if (rem && !sgn) ++r.x;
-
-            return r;
-        }
-
-        inline
-        __half2 __float2half2_rn(float x)
-        {
-            return __half2{__float2half_rn(x), __float2half_rn(x)};
-        }
-
-        inline
-        __half2 __floats2half2_rn(float x, float y)
-        {
-            return __half2{__float2half_rn(x), __float2half_rn(y)};
-        }
-
-        inline
-        float __internal_half2float(unsigned short x)
-        {
-            unsigned int sign = ((x >> 15) & 1);
-            unsigned int exponent = ((x >> 10) & 0x1f);
-            unsigned int mantissa = ((x & 0x3ff) << 13);
-
-            if (exponent == 0x1fU) { /* NaN or Inf */
-                mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0);
-                exponent = 0xffU;
-            } else if (!exponent) { /* Denorm or Zero */
-                if (mantissa) {
-                    unsigned int msb;
-                    exponent = 0x71U;
-                    do {
-                        msb = (mantissa & 0x400000U);
-                        mantissa <<= 1; /* normalize */
-                        --exponent;
-                    } while (!msb);
-                    mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
-                }
-            } else {
-                exponent += 0x70U;
-            }
-            unsigned int u = ((sign << 31) | (exponent << 23) | mantissa);
-            float f;
-            memcpy(&f, &u, sizeof(u));
-
-            return f;
-        }
-
-        inline
-        float __half2float(__half x)
-        {
-            return __internal_half2float(static_cast<__half_raw>(x).x);
-        }
-
-        inline
-        float __low2float(__half2 x)
-        {
-            return __internal_half2float(static_cast<__half2_raw>(x).x);
-        }
-
-        inline
-        float __high2float(__half2 x)
-        {
-            return __internal_half2float(static_cast<__half2_raw>(x).y);
-        }
-    } // Anonymous namespace.
+    inline
+    float __high2float(__half2 x)
+    {
+        return __internal_half2float(static_cast<__half2_raw>(x).y);
+    }
 
     #if !defined(HIP_NO_HALF)
         using half = __half;
diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h
index fbf96d3d0b..fbba1419fa 100644
--- a/include/hip/hcc_detail/hip_runtime.h
+++ b/include/hip/hcc_detail/hip_runtime.h
@@ -308,17 +308,17 @@ static constexpr Coordinates<hip_impl::WorkitemId> threadIdx{};
 extern "C" __device__ void* __hip_malloc(size_t);
 extern "C" __device__ void* __hip_free(void* ptr);
 
-static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
-static inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
+inline __device__ void* malloc(size_t size) { return __hip_malloc(size); }
+inline __device__ void* free(void* ptr) { return __hip_free(ptr); }
 
 #if defined(__HCC_ACCELERATOR__) && defined(HC_FEATURE_PRINTF)
 template <typename... All>
-static inline __device__ void printf(const char* format, All... all) {
+inline __device__ void printf(const char* format, All... all) {
     hc::printf(format, all...);
 }
 #elif defined(__HCC_ACCELERATOR__) || __HIP__
 template <typename... All>
-static inline __device__ void printf(const char* format, All... all) {}
+inline __device__ void printf(const char* format, All... all) {}
 #endif
 
 #endif //__HCC_OR_HIP_CLANG__
diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h
index 2079bb7e44..e986e5ab3c 100644
--- a/include/hip/hcc_detail/hip_vector_types.h
+++ b/include/hip/hcc_detail/hip_vector_types.h
@@ -1107,38 +1107,38 @@ __MAKE_VECTOR_TYPE__(double, double);
 
 #ifdef __cplusplus
 #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \
-    static inline __device__ __host__ \
+    inline __device__ __host__ \
     type make_##type(comp x) { type r{x}; return r; }
 
 #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \
-    static inline __device__ __host__ \
+    inline __device__ __host__ \
     type make_##type(comp x, comp y) { type r{x, y}; return r; }
 
 #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \
-    static inline __device__ __host__ \
+    inline __device__ __host__ \
     type make_##type(comp x, comp y, comp z) { type r{x, y, z}; return r; }
 
 #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \
-    static inline __device__ __host__ \
+    inline __device__ __host__ \
     type make_##type(comp x, comp y, comp z, comp w) { \
         type r{x, y, z, w}; \
         return r; \
     }
 #else
  #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \
-     static inline __device__ __host__ \
+     inline __device__ __host__ \
      type make_##type(comp x) { type r; r.x =x; return r; }
 
  #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \
-     static inline __device__ __host__ \
+     inline __device__ __host__ \
      type make_##type(comp x, comp y) { type r; r.x=x; r.y=y; return r; }
 
  #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \
-     static inline __device__ __host__ \
+     inline __device__ __host__ \
      type make_##type(comp x, comp y, comp z) { type r; r.x=x; r.y=y; r.z=z; return r; }
 
  #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \
-     static inline __device__ __host__ \
+     inline __device__ __host__ \
      type make_##type(comp x, comp y, comp z, comp w) { \
          type r; r.x=x; r.y=y; r.z=z; r.w=w; \
          return r; \
diff --git a/include/hip/hcc_detail/texture_functions.h b/include/hip/hcc_detail/texture_functions.h
index bb0a9e7223..d2ef5d66ef 100644
--- a/include/hip/hcc_detail/texture_functions.h
+++ b/include/hip/hcc_detail/texture_functions.h
@@ -46,7 +46,7 @@ union TData {
     __hip_uint4_vector_value_type u;
 };
 
-#define __TEXTURE_FUNCTIONS_DECL__ static inline __device__
+#define __TEXTURE_FUNCTIONS_DECL__ inline __device__
 
 
 #if (__hcc_workweek__ >= 18114) || __clang__
diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
index b85051d64c..135b06efc2 100644
--- a/include/hip/hip_runtime_api.h
+++ b/include/hip/hip_runtime_api.h
@@ -358,21 +358,21 @@ enum hipComputeMode {
  */
 #if defined(__cplusplus) && !defined(__HIP_DISABLE_CPP_FUNCTIONS__)
 template <class T>
-static inline hipError_t hipMalloc(T** devPtr, size_t size) {
+inline hipError_t hipMalloc(T** devPtr, size_t size) {
     return hipMalloc((void**)devPtr, size);
 }
 
 // Provide an override to automatically typecast the pointer type from void**, and also provide a
 // default for the flags.
 template <class T>
-static inline hipError_t hipHostMalloc(T** ptr, size_t size,
-                                       unsigned int flags = hipHostMallocDefault) {
+inline hipError_t hipHostMalloc(T** ptr, size_t size,
+                                unsigned int flags = hipHostMallocDefault) {
     return hipHostMalloc((void**)ptr, size, flags);
 }
 
 template <class T>
-static inline hipError_t hipMallocManaged(T** devPtr, size_t size,
-                                       unsigned int flags = hipMemAttachGlobal) {
+inline hipError_t hipMallocManaged(T** devPtr, size_t size,
+                                   unsigned int flags = hipMemAttachGlobal) {
     return hipMallocManaged((void**)devPtr, size, flags);
 }
 #endif
diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h
index 8ee47eba4a..08b30196b8 100644
--- a/src/hip_hcc_internal.h
+++ b/src/hip_hcc_internal.h
@@ -97,13 +97,13 @@ class TidInfo {
     TidInfo();
 
     int tid() const { return _shortTid; };
-    pid_t pid() const { return _pid; }; 
+    pid_t pid() const { return _pid; };
     uint64_t incApiSeqNum() { return ++_apiSeqNum; };
     uint64_t apiSeqNum() const { return _apiSeqNum; };
 
    private:
     int _shortTid;
-    pid_t _pid; 
+    pid_t _pid;
 
     // monotonically increasing API sequence number for this threa.
     uint64_t _apiSeqNum;
@@ -280,7 +280,7 @@ static const DbName dbName[] = {
 #endif
 
 
-static inline uint64_t getTicks() { return hc::get_system_ticks(); }
+inline uint64_t getTicks() { return hc::get_system_ticks(); }
 
 //---
 extern uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr);
@@ -798,7 +798,7 @@ class ihipDevice_t {
 
     // TODO - report this through device properties, base on HCC API call.
     int _isLargeBar;
-   
+
     // Node id reported by kfd for this device
     uint32_t _driver_node_id;
 
@@ -1047,7 +1047,7 @@ struct mg_info {
 //  setDevice first.
 //  - hipDeviceReset destroys the primary context for device?
 //  - Then context is created again for next usage.
-static inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) {
+inline ihipCtx_t* iihipGetTlsDefaultCtx(TlsData* tls) {
     // Per-thread initialization of the TLS:
     if ((tls->defaultCtx == nullptr) && (g_deviceCnt > 0)) {
         tls->defaultCtx = ihipGetPrimaryCtx(0);