From 811bc363c817cfd8bb07beebfd185bf4b9d27c18 Mon Sep 17 00:00:00 2001
From: Aaron En Ye Shi <enye.shi@gmail.com>
Date: Wed, 5 May 2021 21:26:21 +0000
Subject: [PATCH] SWDEV-283266 - __HIPCC_RTC__ macro added to HIP headers

For hipRTC on Windows, add macro __HIPCC_RTC__ to allow
online compilation of with device functions excluding standard
C/C++ headers, system headers, and host HIP APIs.

Change-Id: I1d91f042baf1359856ec83ab7030dc58785e0334
---
 include/hip/amd_detail/device_functions.h     |   3 +-
 include/hip/amd_detail/driver_types.h         |   8 +-
 include/hip/amd_detail/hip_atomic.h           |   4 +
 include/hip/amd_detail/hip_complex.h          |  79 ++---
 include/hip/amd_detail/hip_fp16.h             | 226 ++++++-------
 include/hip/amd_detail/hip_runtime.h          |  25 +-
 include/hip/amd_detail/hip_texture_types.h    |   3 +-
 include/hip/amd_detail/hip_vector_types.h     | 312 +++++++++---------
 include/hip/amd_detail/math_functions.h       |   6 +-
 .../hip/amd_detail/texture_fetch_functions.h  |   4 +-
 .../amd_detail/texture_indirect_functions.h   |   2 +
 include/hip/hip_runtime.h                     |   6 +-
 12 files changed, 368 insertions(+), 310 deletions(-)

diff --git a/include/hip/amd_detail/device_functions.h b/include/hip/amd_detail/device_functions.h
index 703e72b5f1..0bc9cd33e8 100644
--- a/include/hip/amd_detail/device_functions.h
+++ b/include/hip/amd_detail/device_functions.h
@@ -26,9 +26,10 @@ THE SOFTWARE.
 #include "host_defines.h"
 #include "math_fwd.h"
 
+#if !defined(__HIPCC_RTC__)
 #include <hip/hip_runtime_api.h>
 #include <stddef.h>
-
+#endif // !defined(__HIPCC_RTC__)
 
 #include <hip/hip_vector_types.h>
 #include <hip/amd_detail/device_library_decls.h>
diff --git a/include/hip/amd_detail/driver_types.h b/include/hip/amd_detail/driver_types.h
index 4de4e3ee7e..fe29d1f144 100644
--- a/include/hip/amd_detail/driver_types.h
+++ b/include/hip/amd_detail/driver_types.h
@@ -27,9 +27,11 @@ THE SOFTWARE.
 // It's defined here for workarround of rocThrust building failure.
 #define HIP_INCLUDE_HIP_HCC_DETAIL_DRIVER_TYPES_H
 
+#if !defined(__HIPCC_RTC__)
 #ifndef __cplusplus
 #include <stdbool.h>
 #endif
+#endif // !defined(__HIPCC_RTC__)
 
 typedef void* hipDeviceptr_t;
 typedef enum hipChannelFormatKind {
@@ -92,6 +94,7 @@ typedef struct hipArray {
     unsigned int textureType;
 }hipArray;
 
+#if !defined(__HIPCC_RTC__)
 typedef struct hip_Memcpy2D {
     size_t srcXInBytes;
     size_t srcY;
@@ -110,7 +113,7 @@ typedef struct hip_Memcpy2D {
     size_t WidthInBytes;
     size_t Height;
 } hip_Memcpy2D;
-
+#endif // !defined(__HIPCC_RTC__)
 
 typedef struct hipArray* hipArray_t;
 typedef hipArray_t hiparray;
@@ -359,6 +362,7 @@ typedef struct HIP_RESOURCE_VIEW_DESC_st
  * Memory copy types
  *
  */
+#if !defined(__HIPCC_RTC__)
 typedef enum hipMemcpyKind {
     hipMemcpyHostToHost = 0,      ///< Host-to-Host Copy
     hipMemcpyHostToDevice = 1,    ///< Host-to-Device Copy
@@ -470,5 +474,5 @@ typedef enum hipFunction_attribute {
     HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT,
     HIP_FUNC_ATTRIBUTE_MAX
 }hipFunction_attribute;
-
+#endif // !defined(__HIPCC_RTC__)
 #endif
diff --git a/include/hip/amd_detail/hip_atomic.h b/include/hip/amd_detail/hip_atomic.h
index fdbf5be463..0c4bc80cf6 100644
--- a/include/hip/amd_detail/hip_atomic.h
+++ b/include/hip/amd_detail/hip_atomic.h
@@ -111,7 +111,9 @@ float atomicAdd_system(float* address, float val) {
   return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }
 
+#if !defined(__HIPCC_RTC__)
 DEPRECATED("use atomicAdd instead")
+#endif // !defined(__HIPCC_RTC__)
 __device__
 inline
 void atomicAddNoRet(float* address, float val)
@@ -476,7 +478,9 @@ float atomicAdd(float* address, float val)
     return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
 }
 
+#if !defined(__HIPCC_RTC__)
 DEPRECATED("use atomicAdd instead")
+#endif // !defined(__HIPCC_RTC__)
 __device__
 inline
 void atomicAddNoRet(float* address, float val)
diff --git a/include/hip/amd_detail/hip_complex.h b/include/hip/amd_detail/hip_complex.h
index 4a59b52a54..db312780c1 100644
--- a/include/hip/amd_detail/hip_complex.h
+++ b/include/hip/amd_detail/hip_complex.h
@@ -25,6 +25,10 @@ THE SOFTWARE.
 
 #include "hip/amd_detail/hip_vector_types.h"
 
+#if defined(__HIPCC_RTC__)
+#define __HOST_DEVICE__ __device__
+#else
+#define __HOST_DEVICE__ __host__ __device__
 // TODO: Clang has a bug which allows device functions to call std functions
 // when std functions are introduced into default namespace by using statement.
 // math.h may be included after this bug is fixed.
@@ -33,10 +37,11 @@ THE SOFTWARE.
 #else
 #include "math.h"
 #endif
+#endif // !defined(__HIPCC_RTC__)
 
 #if __cplusplus
 #define COMPLEX_NEG_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator-(const type& op) {                             \
+    __HOST_DEVICE__ static inline type operator-(const type& op) {                             \
         type ret;                                                                                  \
         ret.x = -op.x;                                                                             \
         ret.y = -op.y;                                                                             \
@@ -44,17 +49,17 @@ THE SOFTWARE.
     }
 
 #define COMPLEX_EQ_OP_OVERLOAD(type)                                                               \
-    __device__ __host__ static inline bool operator==(const type& lhs, const type& rhs) {          \
+    __HOST_DEVICE__ static inline bool operator==(const type& lhs, const type& rhs) {          \
         return lhs.x == rhs.x && lhs.y == rhs.y;                                                   \
     }
 
 #define COMPLEX_NE_OP_OVERLOAD(type)                                                               \
-    __device__ __host__ static inline bool operator!=(const type& lhs, const type& rhs) {          \
+    __HOST_DEVICE__ static inline bool operator!=(const type& lhs, const type& rhs) {          \
         return !(lhs == rhs);                                                                      \
     }
 
 #define COMPLEX_ADD_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator+(const type& lhs, const type& rhs) {           \
+    __HOST_DEVICE__ static inline type operator+(const type& lhs, const type& rhs) {           \
         type ret;                                                                                  \
         ret.x = lhs.x + rhs.x;                                                                     \
         ret.y = lhs.y + rhs.y;                                                                     \
@@ -62,7 +67,7 @@ THE SOFTWARE.
     }
 
 #define COMPLEX_SUB_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator-(const type& lhs, const type& rhs) {           \
+    __HOST_DEVICE__ static inline type operator-(const type& lhs, const type& rhs) {           \
         type ret;                                                                                  \
         ret.x = lhs.x - rhs.x;                                                                     \
         ret.y = lhs.y - rhs.y;                                                                     \
@@ -70,7 +75,7 @@ THE SOFTWARE.
     }
 
 #define COMPLEX_MUL_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator*(const type& lhs, const type& rhs) {           \
+    __HOST_DEVICE__ static inline type operator*(const type& lhs, const type& rhs) {           \
         type ret;                                                                                  \
         ret.x = lhs.x * rhs.x - lhs.y * rhs.y;                                                     \
         ret.y = lhs.x * rhs.y + lhs.y * rhs.x;                                                     \
@@ -78,7 +83,7 @@ THE SOFTWARE.
     }
 
 #define COMPLEX_DIV_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator/(const type& lhs, const type& rhs) {           \
+    __HOST_DEVICE__ static inline type operator/(const type& lhs, const type& rhs) {           \
         type ret;                                                                                  \
         ret.x = (lhs.x * rhs.x + lhs.y * rhs.y);                                                   \
         ret.y = (rhs.x * lhs.y - lhs.x * rhs.y);                                                   \
@@ -88,33 +93,33 @@ THE SOFTWARE.
     }
 
 #define COMPLEX_ADD_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator+=(type& lhs, const type& rhs) {               \
+    __HOST_DEVICE__ static inline type& operator+=(type& lhs, const type& rhs) {               \
         lhs.x += rhs.x;                                                                            \
         lhs.y += rhs.y;                                                                            \
         return lhs;                                                                                \
     }
 
 #define COMPLEX_SUB_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator-=(type& lhs, const type& rhs) {               \
+    __HOST_DEVICE__ static inline type& operator-=(type& lhs, const type& rhs) {               \
         lhs.x -= rhs.x;                                                                            \
         lhs.y -= rhs.y;                                                                            \
         return lhs;                                                                                \
     }
 
 #define COMPLEX_MUL_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator*=(type& lhs, const type& rhs) {               \
+    __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) {               \
         lhs = lhs * rhs;                                                                           \
         return lhs;                                                                                \
     }
 
 #define COMPLEX_DIV_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator/=(type& lhs, const type& rhs) {               \
+    __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) {               \
         lhs = lhs / rhs;                                                                           \
         return lhs;                                                                                \
     }
 
 #define COMPLEX_SCALAR_PRODUCT(type, type1)                                                        \
-    __device__ __host__ static inline type operator*(const type& lhs, type1 rhs) {                 \
+    __HOST_DEVICE__ static inline type operator*(const type& lhs, type1 rhs) {                 \
         type ret;                                                                                  \
         ret.x = lhs.x * rhs;                                                                       \
         ret.y = lhs.y * rhs;                                                                       \
@@ -125,41 +130,41 @@ THE SOFTWARE.
 
 typedef float2 hipFloatComplex;
 
-__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
+__HOST_DEVICE__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
 
-__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
+__HOST_DEVICE__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
 
-__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
+__HOST_DEVICE__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
     hipFloatComplex z;
     z.x = a;
     z.y = b;
     return z;
 }
 
-__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
+__HOST_DEVICE__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
     hipFloatComplex ret;
     ret.x = z.x;
     ret.y = -z.y;
     return ret;
 }
 
-__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
+__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) {
     return z.x * z.x + z.y * z.y;
 }
 
-__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
+__HOST_DEVICE__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
     return make_hipFloatComplex(p.x + q.x, p.y + q.y);
 }
 
-__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
+__HOST_DEVICE__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
     return make_hipFloatComplex(p.x - q.x, p.y - q.y);
 }
 
-__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
+__HOST_DEVICE__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
     return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }
 
-__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
+__HOST_DEVICE__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
     float sqabs = hipCsqabsf(q);
     hipFloatComplex ret;
     ret.x = (p.x * q.x + p.y * q.y) / sqabs;
@@ -167,46 +172,46 @@ __device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hi
     return ret;
 }
 
-__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
+__HOST_DEVICE__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
 
 
 typedef double2 hipDoubleComplex;
 
-__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
+__HOST_DEVICE__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
 
-__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
+__HOST_DEVICE__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
 
-__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
+__HOST_DEVICE__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
     hipDoubleComplex z;
     z.x = a;
     z.y = b;
     return z;
 }
 
-__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
+__HOST_DEVICE__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
     hipDoubleComplex ret;
     ret.x = z.x;
     ret.y = -z.y;
     return ret;
 }
 
-__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
+__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) {
     return z.x * z.x + z.y * z.y;
 }
 
-__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
+__HOST_DEVICE__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
     return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
 }
 
-__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
+__HOST_DEVICE__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
     return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
 }
 
-__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
+__HOST_DEVICE__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
     return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }
 
-__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
+__HOST_DEVICE__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
     double sqabs = hipCsqabs(q);
     hipDoubleComplex ret;
     ret.x = (p.x * q.x + p.y * q.y) / sqabs;
@@ -214,7 +219,7 @@ __device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, h
     return ret;
 }
 
-__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); }
+__HOST_DEVICE__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); }
 
 
 #if __cplusplus
@@ -268,19 +273,19 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
 
 typedef hipFloatComplex hipComplex;
 
-__device__ __host__ static inline hipComplex make_hipComplex(float x, float y) {
+__HOST_DEVICE__ static inline hipComplex make_hipComplex(float x, float y) {
     return make_hipFloatComplex(x, y);
 }
 
-__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
+__HOST_DEVICE__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
     return make_hipFloatComplex((float)z.x, (float)z.y);
 }
 
-__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
+__HOST_DEVICE__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
     return make_hipDoubleComplex((double)z.x, (double)z.y);
 }
 
-__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
+__HOST_DEVICE__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
     float real = (p.x * q.x) + r.x;
     float imag = (q.x * p.y) + r.y;
 
@@ -290,7 +295,7 @@ __device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q
     return make_hipComplex(real, imag);
 }
 
-__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
+__HOST_DEVICE__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
                                                            hipDoubleComplex r) {
     double real = (p.x * q.x) + r.x;
     double imag = (q.x * p.y) + r.y;
diff --git a/include/hip/amd_detail/hip_fp16.h b/include/hip/amd_detail/hip_fp16.h
index 859c3efd9e..fb344aa7d5 100644
--- a/include/hip/amd_detail/hip_fp16.h
+++ b/include/hip/amd_detail/hip_fp16.h
@@ -25,14 +25,18 @@ THE SOFTWARE.
 #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
 
 #include <hip/amd_detail/hip_common.h>
-
 #include "hip/amd_detail/host_defines.h"
-#include <assert.h>
-#if defined(__cplusplus)
+#if defined(__HIPCC_RTC__)
+  #define __HOST_DEVICE__ __device__
+#else
+  #define __HOST_DEVICE__ __host__ __device__
+  #include <assert.h>
+  #if defined(__cplusplus)
     #include <algorithm>
     #include <type_traits>
     #include <utility>
 #endif
+#endif // !defined(__HIPCC_RTC__)
 
 #if __HIP_CLANG_ONLY__
     typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));
@@ -82,46 +86,46 @@ THE SOFTWARE.
             };
         public:
             // CREATORS
-            __host__ __device__
+            __HOST_DEVICE__
             __half() = default;
-            __host__ __device__
+            __HOST_DEVICE__
             __half(const __half_raw& x) : data{x.data} {}
             #if !defined(__HIP_NO_HALF_CONVERSIONS__)
-                __host__ __device__
+                __HOST_DEVICE__
                 __half(decltype(data) x) : data{x} {}
                 template<
                     typename T,
                     Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
-                __host__ __device__
+                __HOST_DEVICE__
                 __half(T x) : data{static_cast<_Float16>(x)} {}
             #endif
-            __host__ __device__
+            __HOST_DEVICE__
             __half(const __half&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
             __half(__half&&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
             ~__half() = default;
 
             // CREATORS - DEVICE ONLY
             #if !defined(__HIP_NO_HALF_CONVERSIONS__)
                 template<
                     typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
-                __host__ __device__
+                __HOST_DEVICE__
                 __half(T x) : data{static_cast<_Float16>(x)} {}
             #endif
 
             // MANIPULATORS
-            __host__ __device__
+            __HOST_DEVICE__
             __half& operator=(const __half&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
             __half& operator=(__half&&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
             __half& operator=(const __half_raw& x)
             {
                 data = x.data;
                 return *this;
             }
-            __host__ __device__
+            __HOST_DEVICE__
             volatile __half& operator=(const __half_raw& x) volatile
             {
                 data = x.data;
@@ -151,7 +155,7 @@ THE SOFTWARE.
                 template<
                     typename T,
                     Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
-                __host__ __device__
+                __HOST_DEVICE__
                 __half& operator=(T x)
                 {
                     data = static_cast<_Float16>(x);
@@ -221,12 +225,12 @@ THE SOFTWARE.
                 template<
                     typename T,
                     Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
-                __host__ __device__
+                __HOST_DEVICE__
                 operator T() const { return data; }
             #endif
-            __host__ __device__
+            __HOST_DEVICE__
             operator __half_raw() const { return __half_raw{data}; }
-            __host__ __device__
+            __HOST_DEVICE__
             operator __half_raw() const volatile
             {
                 return __half_raw{data};
@@ -235,7 +239,7 @@ THE SOFTWARE.
             #if !defined(__HIP_NO_HALF_CONVERSIONS__)
                 template<
                     typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
-                __host__ __device__
+                __HOST_DEVICE__
                 operator T() const { return data; }
             #endif
 
@@ -342,38 +346,38 @@ THE SOFTWARE.
             };
 
             // CREATORS
-            __host__ __device__
+            __HOST_DEVICE__
             __half2() = default;
-            __host__ __device__
+            __HOST_DEVICE__
             __half2(const __half2_raw& x) : data{x.data} {}
-            __host__ __device__
+            __HOST_DEVICE__
             __half2(decltype(data) x) : data{x} {}
-            __host__ __device__
+            __HOST_DEVICE__
             __half2(const __half& x, const __half& y)
                 :
                 data{
                     static_cast<__half_raw>(x).data,
                     static_cast<__half_raw>(y).data}
             {}
-            __host__ __device__
+            __HOST_DEVICE__
             __half2(const __half2&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
             __half2(__half2&&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
             ~__half2() = default;
 
             // MANIPULATORS
-            __host__ __device__
+            __HOST_DEVICE__
             __half2& operator=(const __half2&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
             __half2& operator=(__half2&&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
             __half2& operator=(const __half2_raw& x)
             {
                 data = x.data;
                 return *this;
             }
- 
+
             // MANIPULATORS - DEVICE ONLY
             #if !defined(__HIP_NO_HALF_OPERATORS__)
                 __device__
@@ -421,9 +425,9 @@ THE SOFTWARE.
             #endif
 
             // ACCESSORS
-            __host__ __device__
+            __HOST_DEVICE__
             operator decltype(data)() const { return data; }
-            __host__ __device__
+            __HOST_DEVICE__
             operator __half2_raw() const { return __half2_raw{data}; }
 
             // ACCESSORS - DEVICE ONLY
@@ -520,42 +524,42 @@ THE SOFTWARE.
         namespace
         {
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 make_half2(__half x, __half y)
             {
                 return __half2{x, y};
             }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half __low2half(__half2 x)
             {
                 return __half{__half_raw{static_cast<__half2_raw>(x).data.x}};
             }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half __high2half(__half2 x)
             {
                 return __half{__half_raw{static_cast<__half2_raw>(x).data.y}};
             }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __half2half2(__half x)
             {
                 return __half2{x, x};
             }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __halves2half2(__half x, __half y)
             {
                 return __half2{x, y};
             }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __low2half2(__half2 x)
             {
                 return __half2{
@@ -565,7 +569,7 @@ THE SOFTWARE.
             }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __high2half2(__half2 x)
             {
                 return __half2_raw{
@@ -575,7 +579,7 @@ THE SOFTWARE.
             }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __lows2half2(__half2 x, __half2 y)
             {
                 return __half2_raw{
@@ -585,7 +589,7 @@ THE SOFTWARE.
             }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __highs2half2(__half2 x, __half2 y)
             {
                 return __half2_raw{
@@ -595,7 +599,7 @@ THE SOFTWARE.
             }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __lowhigh2highlow(__half2 x)
             {
                 return __half2_raw{
@@ -638,37 +642,37 @@ THE SOFTWARE.
             // TODO: rounding behaviour is not correct.
             // float -> half | half2
             inline
-            __device__ __host__
+            __HOST_DEVICE__
             __half __float2half(float x)
             {
                 return __half_raw{static_cast<_Float16>(x)};
             }
             inline
-            __device__ __host__
+            __HOST_DEVICE__
             __half __float2half_rn(float x)
             {
                 return __half_raw{static_cast<_Float16>(x)};
             }
             inline
-            __device__ __host__
+            __HOST_DEVICE__
             __half __float2half_rz(float x)
             {
                 return __half_raw{static_cast<_Float16>(x)};
             }
             inline
-            __device__ __host__
+            __HOST_DEVICE__
             __half __float2half_rd(float x)
             {
                 return __half_raw{static_cast<_Float16>(x)};
             }
             inline
-            __device__ __host__
+            __HOST_DEVICE__
             __half __float2half_ru(float x)
             {
                 return __half_raw{static_cast<_Float16>(x)};
             }
             inline
-            __device__ __host__
+            __HOST_DEVICE__
             __half2 __float2half2_rn(float x)
             {
                 return __half2_raw{
@@ -676,14 +680,14 @@ THE SOFTWARE.
                         static_cast<_Float16>(x), static_cast<_Float16>(x)}};
             }
             inline
-            __device__ __host__
+            __HOST_DEVICE__
             __half2 __floats2half2_rn(float x, float y)
             {
                 return __half2_raw{_Float16_2{
                     static_cast<_Float16>(x), static_cast<_Float16>(y)}};
             }
             inline
-            __device__ __host__
+            __HOST_DEVICE__
             __half2 __float22half2_rn(float2 x)
             {
                 return __floats2half2_rn(x.x, x.y);
@@ -691,25 +695,25 @@ THE SOFTWARE.
 
             // half | half2 -> float
             inline
-            __device__ __host__
+            __HOST_DEVICE__
             float __half2float(__half x)
             {
                 return static_cast<__half_raw>(x).data;
             }
             inline
-            __device__ __host__
+            __HOST_DEVICE__
             float __low2float(__half2 x)
             {
                 return static_cast<__half2_raw>(x).data.x;
             }
             inline
-            __device__ __host__
+            __HOST_DEVICE__
             float __high2float(__half2 x)
             {
                 return static_cast<__half2_raw>(x).data.y;
             }
             inline
-            __device__ __host__
+            __HOST_DEVICE__
             float2 __half22float2(__half2 x)
             {
                 return make_float2(
@@ -1044,16 +1048,16 @@ THE SOFTWARE.
             __half __ldcs(const __half* ptr) { return *ptr; }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __ldg(const __half2* ptr) { return *ptr; }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __ldcg(const __half2* ptr) { return *ptr; }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __ldca(const __half2* ptr) { return *ptr; }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __ldcs(const __half2* ptr) { return *ptr; }
 
             // Relations
@@ -1119,7 +1123,7 @@ THE SOFTWARE.
             bool __hgtu(__half x, __half y) { return __hgt(x, y); }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __heq2(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(x).data ==
@@ -1127,7 +1131,7 @@ THE SOFTWARE.
                 return __builtin_convertvector(-r, _Float16_2);
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hne2(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(x).data !=
@@ -1135,7 +1139,7 @@ THE SOFTWARE.
                 return __builtin_convertvector(-r, _Float16_2);
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hle2(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(x).data <=
@@ -1143,7 +1147,7 @@ THE SOFTWARE.
                 return __builtin_convertvector(-r, _Float16_2);
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hge2(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(x).data >=
@@ -1151,7 +1155,7 @@ THE SOFTWARE.
                 return __builtin_convertvector(-r, _Float16_2);
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hlt2(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(x).data <
@@ -1159,7 +1163,7 @@ THE SOFTWARE.
                 return __builtin_convertvector(-r, _Float16_2);
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hgt2(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(x).data >
@@ -1167,83 +1171,83 @@ THE SOFTWARE.
                 return __builtin_convertvector(-r, _Float16_2);
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hequ2(__half2 x, __half2 y) { return __heq2(x, y); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hneu2(__half2 x, __half2 y) { return __hne2(x, y); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hleu2(__half2 x, __half2 y) { return __hle2(x, y); }
             inline
-            __host__  __device__
+            __HOST_DEVICE__
             __half2 __hgeu2(__half2 x, __half2 y) { return __hge2(x, y); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hltu2(__half2 x, __half2 y) { return __hlt2(x, y); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hgtu2(__half2 x, __half2 y) { return __hgt2(x, y); }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             bool __hbeq2(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(__heq2(x, y));
                 return r.data.x != 0 && r.data.y != 0;
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             bool __hbne2(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(__hne2(x, y));
                 return r.data.x != 0 && r.data.y != 0;
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             bool __hble2(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(__hle2(x, y));
                 return r.data.x != 0 && r.data.y != 0;
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             bool __hbge2(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(__hge2(x, y));
                 return r.data.x != 0 && r.data.y != 0;
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             bool __hblt2(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(__hlt2(x, y));
                 return r.data.x != 0 && r.data.y != 0;
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             bool __hbgt2(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(__hgt2(x, y));
                 return r.data.x != 0 && r.data.y != 0;
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); }
 
             // Arithmetic
@@ -1332,7 +1336,7 @@ THE SOFTWARE.
             }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hadd2(__half2 x, __half2 y)
             {
                 return __half2_raw{
@@ -1340,14 +1344,14 @@ THE SOFTWARE.
                     static_cast<__half2_raw>(y).data};
             }
 	    inline
-	    __host__ __device__
+	    __HOST_DEVICE__
 	    __half2 __habs2(__half2 x)
 	    {
 	        return __half2_raw{
 		    __ocml_fabs_2f16(static_cast<__half2_raw>(x).data)};
 	    }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hsub2(__half2 x, __half2 y)
             {
                 return __half2_raw{
@@ -1355,7 +1359,7 @@ THE SOFTWARE.
                     static_cast<__half2_raw>(y).data};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hmul2(__half2 x, __half2 y)
             {
                 return __half2_raw{
@@ -1363,7 +1367,7 @@ THE SOFTWARE.
                     static_cast<__half2_raw>(y).data};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hadd2_sat(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(__hadd2(x, y));
@@ -1372,7 +1376,7 @@ THE SOFTWARE.
                     __clamp_01(__half_raw{r.data.y})};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hsub2_sat(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(__hsub2(x, y));
@@ -1381,7 +1385,7 @@ THE SOFTWARE.
                     __clamp_01(__half_raw{r.data.y})};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hmul2_sat(__half2 x, __half2 y)
             {
                 auto r = static_cast<__half2_raw>(__hmul2(x, y));
@@ -1390,13 +1394,13 @@ THE SOFTWARE.
                     __clamp_01(__half_raw{r.data.y})};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hfma2(__half2 x, __half2 y, __half2 z)
             {
                 return __half2_raw{__ocml_fma_2f16(x, y, z)};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hfma2_sat(__half2 x, __half2 y, __half2 z)
             {
                 auto r = static_cast<__half2_raw>(__hfma2(x, y, z));
@@ -1405,7 +1409,7 @@ THE SOFTWARE.
                     __clamp_01(__half_raw{r.data.y})};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __h2div(__half2 x, __half2 y)
             {
                 return __half2_raw{
@@ -1548,82 +1552,82 @@ THE SOFTWARE.
             }
 
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2trunc(__half2 x)
             {
                 return __half2_raw{__ocml_trunc_2f16(x)};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2ceil(__half2 x)
             {
                 return __half2_raw{__ocml_ceil_2f16(x)};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2floor(__half2 x)
             {
                 return __half2_raw{__ocml_floor_2f16(x)};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2rint(__half2 x)
             {
                 return __half2_raw{__ocml_rint_2f16(x)};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2sin(__half2 x)
             {
                 return __half2_raw{__ocml_sin_2f16(x)};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2cos(__half2 x)
             {
                 return __half2_raw{__ocml_cos_2f16(x)};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2exp(__half2 x)
             {
                 return __half2_raw{__ocml_exp_2f16(x)};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2exp2(__half2 x)
             {
                 return __half2_raw{__ocml_exp2_2f16(x)};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2exp10(__half2 x)
             {
                 return __half2_raw{__ocml_exp10_2f16(x)};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2log2(__half2 x)
             {
                 return __half2_raw{__ocml_log2_2f16(x)};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2log(__half2 x) { return __ocml_log_2f16(x); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hisinf2(__half2 x)
             {
                 auto r = __ocml_isinf_2f16(x);
@@ -1631,7 +1635,7 @@ THE SOFTWARE.
                     static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
             }
             inline
-            __host__  __device__
+            __HOST_DEVICE__
             __half2 __hisnan2(__half2 x)
             {
                 auto r = __ocml_isnan_2f16(x);
@@ -1639,7 +1643,7 @@ THE SOFTWARE.
                     static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
             }
             inline
-            __host__ __device__
+            __HOST_DEVICE__
             __half2 __hneg2(__half2 x)
             {
                 return __half2_raw{-static_cast<__half2_raw>(x).data};
diff --git a/include/hip/amd_detail/hip_runtime.h b/include/hip/amd_detail/hip_runtime.h
index ca88fda2f0..a3db57ffe3 100644
--- a/include/hip/amd_detail/hip_runtime.h
+++ b/include/hip/amd_detail/hip_runtime.h
@@ -34,6 +34,7 @@ THE SOFTWARE.
 //---
 // Top part of file can be compiled with any compiler
 
+#if !defined(__HIPCC_RTC__)
 //#include <cstring>
 #if __cplusplus
 #include <cmath>
@@ -42,7 +43,8 @@ THE SOFTWARE.
 #include <math.h>
 #include <string.h>
 #include <stddef.h>
-#endif  //__cplusplus
+#endif // __cplusplus
+#endif // !defined(__HIPCC_RTC__)
 
 // __hip_malloc is not working. Disable it by default.
 #ifndef __HIP_ENABLE_DEVICE_MALLOC__
@@ -57,9 +59,10 @@ THE SOFTWARE.
 
 #define CUDA_SUCCESS hipSuccess
 
+#if !defined(__HIPCC_RTC__)
 #include <hip/hip_runtime_api.h>
-
 extern int HIP_TRACE_API;
+#endif // !defined(__HIPCC_RTC__)
 
 #ifdef __cplusplus
 #include <hip/amd_detail/hip_ldg.h>
@@ -121,7 +124,9 @@ extern int HIP_TRACE_API;
 #define __launch_bounds__(...)                                                                     \
     select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
 
+#if !defined(__HIPCC_RTC__)
 __host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
+#endif // !defined(__HIPCC_RTC__)
 
 #if __HIP_ARCH_GFX701__ == 0
 
@@ -162,6 +167,7 @@ static inline __device__ void* free(void* ptr) { __builtin_trap(); return nullpt
 //
 // hip-clang functions
 //
+#if !defined(__HIPCC_RTC__)
 #define HIP_KERNEL_NAME(...) __VA_ARGS__
 #define HIP_SYMBOL(X) X
 
@@ -218,6 +224,8 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
 #endif
 
 #include <hip/hip_runtime_api.h>
+#endif // !defined(__HIPCC_RTC__)
+
 extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
 extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
 extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
@@ -245,6 +253,17 @@ struct __HIP_ThreadIdx {
   }
 };
 
+#if defined(__HIPCC_RTC__)
+typedef struct dim3 {
+    uint32_t x;  ///< x
+    uint32_t y;  ///< y
+    uint32_t z;  ///< z
+#ifdef __cplusplus
+    constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
+#endif
+} dim3;
+#endif // !defined(__HIPCC_RTC__)
+
 template <typename F>
 struct __HIP_Coordinates {
   using R = decltype(F{}(0));
@@ -371,6 +390,7 @@ hc_get_workitem_absolute_id(int dim)
 #endif
 
 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#if !defined(__HIPCC_RTC__)
 // Support std::complex.
 #if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
 #pragma push_macro("__CUDA__")
@@ -388,6 +408,7 @@ hc_get_workitem_absolute_id(int dim)
 #undef __CUDA__
 #pragma pop_macro("__CUDA__")
 #endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#endif // !defined(__HIPCC_RTC__)
 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
 #endif // __HIP_CLANG_ONLY__
 
diff --git a/include/hip/amd_detail/hip_texture_types.h b/include/hip/amd_detail/hip_texture_types.h
index 22016e25ad..0dc40ec0ed 100644
--- a/include/hip/amd_detail/hip_texture_types.h
+++ b/include/hip/amd_detail/hip_texture_types.h
@@ -33,9 +33,10 @@ THE SOFTWARE.
  *                                                                              *
  *                                                                              *
  *******************************************************************************/
+#if !defined(__HIPCC_RTC__)
 #include <limits.h>
-//#include <hip/amd_detail/driver_types.h>
 #include <hip/amd_detail/channel_descriptor.h>
+#endif // !defined(__HIPCC_RTC__)
 #include <hip/amd_detail/texture_types.h>
 
 #if __cplusplus
diff --git a/include/hip/amd_detail/hip_vector_types.h b/include/hip/amd_detail/hip_vector_types.h
index 88635befc8..72a828aaf2 100644
--- a/include/hip/amd_detail/hip_vector_types.h
+++ b/include/hip/amd_detail/hip_vector_types.h
@@ -30,6 +30,12 @@ THE SOFTWARE.
 
 #include "hip/amd_detail/host_defines.h"
 
+#if defined(__HIPCC_RTC__)
+    #define __HOST_DEVICE__ __device__
+#else
+    #define __HOST_DEVICE__ __host__ __device__
+#endif
+
 #if defined(__has_attribute)
     #if __has_attribute(ext_vector_type)
         #define __NATIVE_VECTOR__(n, T) T __attribute__((ext_vector_type(n)))
@@ -38,9 +44,11 @@ THE SOFTWARE.
     #endif
 
 #if defined(__cplusplus)
+#if !defined(__HIPCC_RTC__)
     #include <array>
     #include <iosfwd>
     #include <type_traits>
+#endif // !defined(__HIPCC_RTC__)
 
     namespace hip_impl {
         template<typename, typename, unsigned int> struct Scalar_accessor;
@@ -61,20 +69,20 @@ THE SOFTWARE.
             struct Address {
                 const Scalar_accessor* p;
 
-                __host__ __device__
+                __HOST_DEVICE__
                 operator const T*() const noexcept {
                     return &reinterpret_cast<const T*>(p)[idx];
                 }
-                __host__ __device__
+                __HOST_DEVICE__
                 operator const T*() const volatile noexcept {
                     return &reinterpret_cast<const T*>(p)[idx];
                 }
-                __host__ __device__
+                __HOST_DEVICE__
                 operator T*() noexcept {
                     return &reinterpret_cast<T*>(
                         const_cast<Scalar_accessor*>(p))[idx];
                 }
-                __host__ __device__
+                __HOST_DEVICE__
                 operator T*() volatile noexcept {
                     return &reinterpret_cast<T*>(
                         const_cast<Scalar_accessor*>(p))[idx];
@@ -101,9 +109,9 @@ THE SOFTWARE.
             // Idea from https://t0rakka.silvrback.com/simd-scalar-accessor
             Vector data;
 
-            __host__ __device__
+            __HOST_DEVICE__
             operator T() const noexcept { return data[idx]; }
-            __host__ __device__
+            __HOST_DEVICE__
             operator T() const volatile noexcept { return data[idx]; }
 
 #ifdef __HIP_ENABLE_VECTOR_SCALAR_ACCESSORY_ENUM_CONVERSION__
@@ -116,7 +124,7 @@ THE SOFTWARE.
                     std::is_enum<U>{} &&
                     std::is_convertible<
                         T, typename std::enable_if<std::is_enum<U>::value, std::underlying_type<U>>::type::type>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             operator U() const noexcept { return static_cast<U>(data[idx]); }
             template<
                 typename U,
@@ -125,60 +133,60 @@ THE SOFTWARE.
                     std::is_enum<U>{} &&
                     std::is_convertible<
                         T, typename std::enable_if<std::is_enum<U>::value, std::underlying_type<U>>::type::type>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             operator U() const volatile noexcept { return static_cast<U>(data[idx]); }
 #endif
 
-            __host__ __device__
+            __HOST_DEVICE__
             operator T&() noexcept {
                 return reinterpret_cast<
                     T (&)[sizeof(Vector) / sizeof(T)]>(data)[idx];
             }
-            __host__ __device__
+            __HOST_DEVICE__
             operator volatile T&() volatile noexcept {
                 return reinterpret_cast<
                     volatile T (&)[sizeof(Vector) / sizeof(T)]>(data)[idx];
             }
 
-            __host__ __device__
+            __HOST_DEVICE__
             Address operator&() const noexcept { return Address{this}; }
 
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator=(const Scalar_accessor& x) noexcept {
                 data[idx] = x.data[idx];
 
                 return *this;
             }
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator=(T x) noexcept {
                 data[idx] = x;
 
                 return *this;
             }
-            __host__ __device__
+            __HOST_DEVICE__
             volatile Scalar_accessor& operator=(T x) volatile noexcept {
                 data[idx] = x;
 
                 return *this;
             }
 
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator++() noexcept {
                 ++data[idx];
                 return *this;
             }
-            __host__ __device__
+            __HOST_DEVICE__
             T operator++(int) noexcept {
                 auto r{data[idx]};
                 ++data[idx];
                 return *this;
             }
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator--() noexcept {
                 --data[idx];
                 return *this;
             }
-            __host__ __device__
+            __HOST_DEVICE__
             T operator--(int) noexcept {
                 auto r{data[idx]};
                 --data[idx];
@@ -191,7 +199,7 @@ THE SOFTWARE.
                 typename U,
                 typename std::enable_if<
                     std::is_convertible<U, T>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator+=(U x) noexcept {
                 data[idx] += x;
                 return *this;
@@ -200,7 +208,7 @@ THE SOFTWARE.
                 typename U,
                 typename std::enable_if<
                     std::is_convertible<U, T>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator-=(U x) noexcept {
                 data[idx] -= x;
                 return *this;
@@ -210,7 +218,7 @@ THE SOFTWARE.
                 typename U,
                 typename std::enable_if<
                     std::is_convertible<U, T>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator*=(U x) noexcept {
                 data[idx] *= x;
                 return *this;
@@ -219,7 +227,7 @@ THE SOFTWARE.
                 typename U,
                 typename std::enable_if<
                     std::is_convertible<U, T>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator/=(U x) noexcept {
                 data[idx] /= x;
                 return *this;
@@ -228,7 +236,7 @@ THE SOFTWARE.
                 typename U = T,
                 typename std::enable_if<std::is_convertible<U, T>{} &&
                                         std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator%=(U x) noexcept {
                 data[idx] %= x;
                 return *this;
@@ -238,7 +246,7 @@ THE SOFTWARE.
                 typename U = T,
                 typename std::enable_if<std::is_convertible<U, T>{} &&
                                         std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator>>=(U x) noexcept {
                 data[idx] >>= x;
                 return *this;
@@ -247,7 +255,7 @@ THE SOFTWARE.
                 typename U = T,
                 typename std::enable_if<std::is_convertible<U, T>{} &&
                                         std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator<<=(U x) noexcept {
                 data[idx] <<= x;
                 return *this;
@@ -256,7 +264,7 @@ THE SOFTWARE.
                 typename U = T,
                 typename std::enable_if<std::is_convertible<U, T>{} &&
                                         std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator&=(U x) noexcept {
                 data[idx] &= x;
                 return *this;
@@ -265,7 +273,7 @@ THE SOFTWARE.
                 typename U = T,
                 typename std::enable_if<std::is_convertible<U, T>{} &&
                                         std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator|=(U x) noexcept {
                 data[idx] |= x;
                 return *this;
@@ -274,7 +282,7 @@ THE SOFTWARE.
                 typename U = T,
                 typename std::enable_if<std::is_convertible<U, T>{} &&
                                         std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Scalar_accessor& operator^=(U x) noexcept {
                 data[idx] ^= x;
                 return *this;
@@ -308,22 +316,22 @@ THE SOFTWARE.
 
         using value_type = T;
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_base() = default;
-        __host__ __device__
+        __HOST_DEVICE__
         explicit
         constexpr
         HIP_vector_base(T x_) noexcept : data{x_} {}
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_base(const HIP_vector_base&) = default;
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_base(HIP_vector_base&&) = default;
-        __host__ __device__
+        __HOST_DEVICE__
         ~HIP_vector_base() = default;
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_base& operator=(const HIP_vector_base& x_) noexcept {
             #if __has_attribute(ext_vector_type)
                 data = x_.data;
@@ -358,25 +366,25 @@ THE SOFTWARE.
 
         using value_type = T;
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_base() = default;
-        __host__ __device__
+        __HOST_DEVICE__
         explicit
         constexpr
         HIP_vector_base(T x_) noexcept : data{x_, x_} {}
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_base(T x_, T y_) noexcept : data{x_, y_} {}
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_base(const HIP_vector_base&) = default;
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_base(HIP_vector_base&&) = default;
-        __host__ __device__
+        __HOST_DEVICE__
         ~HIP_vector_base() = default;
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_base& operator=(const HIP_vector_base& x_) noexcept {
             #if __has_attribute(ext_vector_type)
                 data = x_.data;
@@ -394,55 +402,55 @@ THE SOFTWARE.
         struct Native_vec_ {
             T d[3];
 
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_() = default;
 
-            __host__ __device__
+            __HOST_DEVICE__
             explicit
             constexpr
             Native_vec_(T x_) noexcept : d{x_, x_, x_} {}
-            __host__ __device__
+            __HOST_DEVICE__
             constexpr
             Native_vec_(T x_, T y_, T z_) noexcept : d{x_, y_, z_} {}
-            __host__ __device__
+            __HOST_DEVICE__
             constexpr
             Native_vec_(const Native_vec_&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
             constexpr
             Native_vec_(Native_vec_&&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
             ~Native_vec_() = default;
 
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_& operator=(const Native_vec_&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_& operator=(Native_vec_&&) = default;
 
-            __host__ __device__
+            __HOST_DEVICE__
             T& operator[](unsigned int idx) noexcept { return d[idx]; }
-            __host__ __device__
+            __HOST_DEVICE__
             T operator[](unsigned int idx) const noexcept { return d[idx]; }
 
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_& operator+=(const Native_vec_& x_) noexcept
             {
                 for (auto i = 0u; i != 3u; ++i) d[i] += x_.d[i];
                 return *this;
             }
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_& operator-=(const Native_vec_& x_) noexcept
             {
                 for (auto i = 0u; i != 3u; ++i) d[i] -= x_.d[i];
                 return *this;
             }
 
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_& operator*=(const Native_vec_& x_) noexcept
             {
                 for (auto i = 0u; i != 3u; ++i) d[i] *= x_.d[i];
                 return *this;
             }
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_& operator/=(const Native_vec_& x_) noexcept
             {
                 for (auto i = 0u; i != 3u; ++i) d[i] /= x_.d[i];
@@ -452,7 +460,7 @@ THE SOFTWARE.
             template<
                 typename U = T,
                 typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_ operator-() const noexcept
             {
                 auto r{*this};
@@ -463,7 +471,7 @@ THE SOFTWARE.
             template<
                 typename U = T,
                 typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_ operator~() const noexcept
             {
                 auto r{*this};
@@ -473,7 +481,7 @@ THE SOFTWARE.
             template<
                 typename U = T,
                 typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_& operator%=(const Native_vec_& x_) noexcept
             {
                 for (auto i = 0u; i != 3u; ++i) d[i] %= x_.d[i];
@@ -482,7 +490,7 @@ THE SOFTWARE.
             template<
                 typename U = T,
                 typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_& operator^=(const Native_vec_& x_) noexcept
             {
                 for (auto i = 0u; i != 3u; ++i) d[i] ^= x_.d[i];
@@ -491,7 +499,7 @@ THE SOFTWARE.
             template<
                 typename U = T,
                 typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_& operator|=(const Native_vec_& x_) noexcept
             {
                 for (auto i = 0u; i != 3u; ++i) d[i] |= x_.d[i];
@@ -500,7 +508,7 @@ THE SOFTWARE.
             template<
                 typename U = T,
                 typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_& operator&=(const Native_vec_& x_) noexcept
             {
                 for (auto i = 0u; i != 3u; ++i) d[i] &= x_.d[i];
@@ -509,7 +517,7 @@ THE SOFTWARE.
             template<
                 typename U = T,
                 typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_& operator>>=(const Native_vec_& x_) noexcept
             {
                 for (auto i = 0u; i != 3u; ++i) d[i] >>= x_.d[i];
@@ -518,7 +526,7 @@ THE SOFTWARE.
             template<
                 typename U = T,
                 typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-            __host__ __device__
+            __HOST_DEVICE__
             Native_vec_& operator<<=(const Native_vec_& x_) noexcept
             {
                 for (auto i = 0u; i != 3u; ++i) d[i] <<= x_.d[i];
@@ -526,7 +534,7 @@ THE SOFTWARE.
             }
 
             using Vec3_cmp = int __attribute__((vector_size(4 * sizeof(int))));
-            __host__ __device__
+            __HOST_DEVICE__
             Vec3_cmp operator==(const Native_vec_& x_) const noexcept
             {
                 return Vec3_cmp{d[0] == x_.d[0], d[1] == x_.d[1], d[2] == x_.d[2]};
@@ -544,27 +552,27 @@ THE SOFTWARE.
 
         using value_type = T;
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_base() = default;
-        __host__ __device__
+        __HOST_DEVICE__
         explicit
         constexpr
         HIP_vector_base(T x_) noexcept : data{x_, x_, x_} {}
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_base(T x_, T y_, T z_) noexcept : data{x_, y_, z_} {}
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_base(const HIP_vector_base&) = default;
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_base(HIP_vector_base&&) = default;
-        __host__ __device__
+        __HOST_DEVICE__
         ~HIP_vector_base() = default;
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_base& operator=(const HIP_vector_base&) = default;
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_base& operator=(HIP_vector_base&&) = default;
     };
 
@@ -595,25 +603,25 @@ THE SOFTWARE.
 
         using value_type = T;
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_base() = default;
-        __host__ __device__
+        __HOST_DEVICE__
         explicit
         constexpr
         HIP_vector_base(T x_) noexcept : data{x_, x_, x_, x_} {}
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_base(T x_, T y_, T z_, T w_) noexcept : data{x_, y_, z_, w_} {}
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_base(const HIP_vector_base&) = default;
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_base(HIP_vector_base&&) = default;
-        __host__ __device__
+        __HOST_DEVICE__
         ~HIP_vector_base() = default;
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_base& operator=(const HIP_vector_base& x_) noexcept {
             #if __has_attribute(ext_vector_type)
                 data = x_.data;
@@ -633,13 +641,13 @@ THE SOFTWARE.
         using HIP_vector_base<T, rank>::data;
         using typename HIP_vector_base<T, rank>::Native_vec_;
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type() = default;
         template<
             typename U,
             typename std::enable_if<
                 std::is_convertible<U, T>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         explicit
         constexpr
         HIP_vector_type(U x_) noexcept
@@ -649,32 +657,32 @@ THE SOFTWARE.
             typename... Us,
             typename std::enable_if<
                 (rank > 1) && sizeof...(Us) == rank>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_type(Us... xs) noexcept
             : HIP_vector_base<T, rank>{static_cast<T>(xs)...}
         {}
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_type(const HIP_vector_type&) = default;
-        __host__ __device__
+        __HOST_DEVICE__
         constexpr
         HIP_vector_type(HIP_vector_type&&) = default;
-        __host__ __device__
+        __HOST_DEVICE__
         ~HIP_vector_type() = default;
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator=(const HIP_vector_type&) = default;
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator=(HIP_vector_type&&) = default;
 
         // Operators
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator++() noexcept
         {
             return *this += HIP_vector_type{1};
         }
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type operator++(int) noexcept
         {
             auto tmp(*this);
@@ -682,12 +690,12 @@ THE SOFTWARE.
             return tmp;
         }
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator--() noexcept
         {
             return *this -= HIP_vector_type{1};
         }
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type operator--(int) noexcept
         {
             auto tmp(*this);
@@ -695,7 +703,7 @@ THE SOFTWARE.
             return tmp;
         }
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator+=(const HIP_vector_type& x) noexcept
         {
             data += x.data;
@@ -705,13 +713,13 @@ THE SOFTWARE.
             typename U,
             typename std::enable_if<
                 std::is_convertible<U, T>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator+=(U x) noexcept
         {
             return *this += HIP_vector_type{x};
         }
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator-=(const HIP_vector_type& x) noexcept
         {
             data -= x.data;
@@ -721,13 +729,13 @@ THE SOFTWARE.
             typename U,
             typename std::enable_if<
                 std::is_convertible<U, T>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator-=(U x) noexcept
         {
             return *this -= HIP_vector_type{x};
         }
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator*=(const HIP_vector_type& x) noexcept
         {
             data *= x.data;
@@ -737,13 +745,13 @@ THE SOFTWARE.
             typename U,
             typename std::enable_if<
                 std::is_convertible<U, T>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator*=(U x) noexcept
         {
             return *this *= HIP_vector_type{x};
         }
 
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator/=(const HIP_vector_type& x) noexcept
         {
             data /= x.data;
@@ -753,7 +761,7 @@ THE SOFTWARE.
             typename U,
             typename std::enable_if<
                 std::is_convertible<U, T>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator/=(U x) noexcept
         {
             return *this /= HIP_vector_type{x};
@@ -762,7 +770,7 @@ THE SOFTWARE.
         template<
             typename U = T,
             typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type operator-() const noexcept
         {
             auto tmp(*this);
@@ -773,7 +781,7 @@ THE SOFTWARE.
         template<
             typename U = T,
             typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type operator~() const noexcept
         {
             HIP_vector_type r{*this};
@@ -784,7 +792,7 @@ THE SOFTWARE.
         template<
             typename U = T,
             typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator%=(const HIP_vector_type& x) noexcept
         {
             data %= x.data;
@@ -794,7 +802,7 @@ THE SOFTWARE.
         template<
             typename U = T,
             typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator^=(const HIP_vector_type& x) noexcept
         {
             data ^= x.data;
@@ -804,7 +812,7 @@ THE SOFTWARE.
         template<
             typename U = T,
             typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator|=(const HIP_vector_type& x) noexcept
         {
             data |= x.data;
@@ -814,7 +822,7 @@ THE SOFTWARE.
         template<
             typename U = T,
             typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator&=(const HIP_vector_type& x) noexcept
         {
             data &= x.data;
@@ -824,7 +832,7 @@ THE SOFTWARE.
         template<
             typename U = T,
             typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator>>=(const HIP_vector_type& x) noexcept
         {
             data >>= x.data;
@@ -834,7 +842,7 @@ THE SOFTWARE.
         template<
             typename U = T,
             typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
-        __host__ __device__
+        __HOST_DEVICE__
         HIP_vector_type& operator<<=(const HIP_vector_type& x) noexcept
         {
             data <<= x.data;
@@ -843,7 +851,7 @@ THE SOFTWARE.
     };
 
     template<typename T, unsigned int n>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator+(
@@ -852,7 +860,7 @@ THE SOFTWARE.
         return HIP_vector_type<T, n>{x} += y;
     }
     template<typename T, unsigned int n, typename U>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator+(
@@ -861,7 +869,7 @@ THE SOFTWARE.
         return HIP_vector_type<T, n>{x} += HIP_vector_type<T, n>{y};
     }
     template<typename T, unsigned int n, typename U>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator+(
@@ -871,7 +879,7 @@ THE SOFTWARE.
     }
 
     template<typename T, unsigned int n>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator-(
@@ -880,7 +888,7 @@ THE SOFTWARE.
         return HIP_vector_type<T, n>{x} -= y;
     }
     template<typename T, unsigned int n, typename U>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator-(
@@ -889,7 +897,7 @@ THE SOFTWARE.
         return HIP_vector_type<T, n>{x} -= HIP_vector_type<T, n>{y};
     }
     template<typename T, unsigned int n, typename U>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator-(
@@ -899,7 +907,7 @@ THE SOFTWARE.
     }
 
     template<typename T, unsigned int n>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator*(
@@ -908,7 +916,7 @@ THE SOFTWARE.
         return HIP_vector_type<T, n>{x} *= y;
     }
     template<typename T, unsigned int n, typename U>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator*(
@@ -917,7 +925,7 @@ THE SOFTWARE.
         return HIP_vector_type<T, n>{x} *= HIP_vector_type<T, n>{y};
     }
     template<typename T, unsigned int n, typename U>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator*(
@@ -927,7 +935,7 @@ THE SOFTWARE.
     }
 
     template<typename T, unsigned int n>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator/(
@@ -936,7 +944,7 @@ THE SOFTWARE.
         return HIP_vector_type<T, n>{x} /= y;
     }
     template<typename T, unsigned int n, typename U>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator/(
@@ -945,7 +953,7 @@ THE SOFTWARE.
         return HIP_vector_type<T, n>{x} /= HIP_vector_type<T, n>{y};
     }
     template<typename T, unsigned int n, typename U>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator/(
@@ -955,7 +963,7 @@ THE SOFTWARE.
     }
 
     template<typename V>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     bool _hip_any_zero(const V& x, int n) noexcept
@@ -965,7 +973,7 @@ THE SOFTWARE.
     }
 
     template<typename T, unsigned int n>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     bool operator==(
@@ -974,7 +982,7 @@ THE SOFTWARE.
         return _hip_any_zero(x.data == y.data, n - 1);
     }
     template<typename T, unsigned int n, typename U>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     bool operator==(const HIP_vector_type<T, n>& x, U y) noexcept
@@ -982,7 +990,7 @@ THE SOFTWARE.
         return x == HIP_vector_type<T, n>{y};
     }
     template<typename T, unsigned int n, typename U>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     bool operator==(U x, const HIP_vector_type<T, n>& y) noexcept
@@ -991,7 +999,7 @@ THE SOFTWARE.
     }
 
     template<typename T, unsigned int n>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     bool operator!=(
@@ -1000,7 +1008,7 @@ THE SOFTWARE.
         return !(x == y);
     }
     template<typename T, unsigned int n, typename U>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     bool operator!=(const HIP_vector_type<T, n>& x, U y) noexcept
@@ -1008,7 +1016,7 @@ THE SOFTWARE.
         return !(x == y);
     }
     template<typename T, unsigned int n, typename U>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     bool operator!=(U x, const HIP_vector_type<T, n>& y) noexcept
@@ -1020,7 +1028,7 @@ THE SOFTWARE.
         typename T,
         unsigned int n,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator%(
@@ -1033,7 +1041,7 @@ THE SOFTWARE.
         unsigned int n,
         typename U,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator%(
@@ -1046,7 +1054,7 @@ THE SOFTWARE.
         unsigned int n,
         typename U,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator%(
@@ -1059,7 +1067,7 @@ THE SOFTWARE.
         typename T,
         unsigned int n,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator^(
@@ -1072,7 +1080,7 @@ THE SOFTWARE.
         unsigned int n,
         typename U,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator^(
@@ -1085,7 +1093,7 @@ THE SOFTWARE.
         unsigned int n,
         typename U,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator^(
@@ -1098,7 +1106,7 @@ THE SOFTWARE.
         typename T,
         unsigned int n,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator|(
@@ -1111,7 +1119,7 @@ THE SOFTWARE.
         unsigned int n,
         typename U,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator|(
@@ -1124,7 +1132,7 @@ THE SOFTWARE.
         unsigned int n,
         typename U,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator|(
@@ -1137,7 +1145,7 @@ THE SOFTWARE.
         typename T,
         unsigned int n,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator&(
@@ -1150,7 +1158,7 @@ THE SOFTWARE.
         unsigned int n,
         typename U,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator&(
@@ -1163,7 +1171,7 @@ THE SOFTWARE.
         unsigned int n,
         typename U,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator&(
@@ -1176,7 +1184,7 @@ THE SOFTWARE.
         typename T,
         unsigned int n,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator>>(
@@ -1189,7 +1197,7 @@ THE SOFTWARE.
         unsigned int n,
         typename U,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator>>(
@@ -1202,7 +1210,7 @@ THE SOFTWARE.
         unsigned int n,
         typename U,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator>>(
@@ -1215,7 +1223,7 @@ THE SOFTWARE.
         typename T,
         unsigned int n,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator<<(
@@ -1228,7 +1236,7 @@ THE SOFTWARE.
         unsigned int n,
         typename U,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator<<(
@@ -1242,7 +1250,7 @@ THE SOFTWARE.
         typename U,
         typename std::enable_if<std::is_arithmetic<U>::value>::type,
         typename std::enable_if<std::is_integral<T>{}>* = nullptr>
-    __host__ __device__
+    __HOST_DEVICE__
     inline
     constexpr
     HIP_vector_type<T, n> operator<<(
@@ -1293,38 +1301,38 @@ __MAKE_VECTOR_TYPE__(double, double);
 
 #ifdef __cplusplus
 #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \
-    static inline __device__ __host__ \
+    static inline __HOST_DEVICE__ \
     type make_##type(comp x) { type r{x}; return r; }
 
 #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \
-    static inline __device__ __host__ \
+    static inline __HOST_DEVICE__ \
     type make_##type(comp x, comp y) { type r{x, y}; return r; }
 
 #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \
-    static inline __device__ __host__ \
+    static inline __HOST_DEVICE__ \
     type make_##type(comp x, comp y, comp z) { type r{x, y, z}; return r; }
 
 #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \
-    static inline __device__ __host__ \
+    static inline __HOST_DEVICE__ \
     type make_##type(comp x, comp y, comp z, comp w) { \
         type r{x, y, z, w}; \
         return r; \
     }
 #else
  #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \
-     static inline __device__ __host__ \
+     static inline __HOST_DEVICE__ \
      type make_##type(comp x) { type r; r.x =x; return r; }
 
  #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \
-     static inline __device__ __host__ \
+     static inline __HOST_DEVICE__ \
      type make_##type(comp x, comp y) { type r; r.x=x; r.y=y; return r; }
 
  #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \
-     static inline __device__ __host__ \
+     static inline __HOST_DEVICE__ \
      type make_##type(comp x, comp y, comp z) { type r; r.x=x; r.y=y; r.z=z; return r; }
 
  #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \
-     static inline __device__ __host__ \
+     static inline __HOST_DEVICE__ \
      type make_##type(comp x, comp y, comp z, comp w) { \
          type r; r.x=x; r.y=y; r.z=z; r.w=w; \
          return r; \
diff --git a/include/hip/amd_detail/math_functions.h b/include/hip/amd_detail/math_functions.h
index c2dfffb77a..2cbee4829a 100644
--- a/include/hip/amd_detail/math_functions.h
+++ b/include/hip/amd_detail/math_functions.h
@@ -28,18 +28,18 @@ THE SOFTWARE.
 
 #include <hip/amd_detail/host_defines.h>
 
+#if !defined(__HIPCC_RTC__)
 #include <algorithm>
-
 // assert.h is only for the host version of assert.
 // The device version of assert is implemented in hip/amd_detail/hip_runtime.h.
 // Users should include hip_runtime.h for the device version of assert.
 #if !__HIP_DEVICE_COMPILE__
 #include <assert.h>
 #endif
-
 #include <limits.h>
 #include <limits>
 #include <stdint.h>
+#endif // !defined(__HIPCC_RTC__)
 
 #if _LIBCPP_VERSION && __HIP__
 namespace std {
@@ -1460,6 +1460,7 @@ double min(double x, double y) {
 __HIP_OVERLOAD2(double, max)
 __HIP_OVERLOAD2(double, min)
 
+#if !defined(__HIPCC_RTC__)
 __host__ inline static int min(int arg1, int arg2) {
   return std::min(arg1, arg2);
 }
@@ -1467,6 +1468,7 @@ __host__ inline static int min(int arg1, int arg2) {
 __host__ inline static int max(int arg1, int arg2) {
   return std::max(arg1, arg2);
 }
+#endif // !defined(__HIPCC_RTC__)
 
 __DEVICE__
 inline float pow(float base, int iexp) {
diff --git a/include/hip/amd_detail/texture_fetch_functions.h b/include/hip/amd_detail/texture_fetch_functions.h
index 004b021b80..399e4fecf7 100644
--- a/include/hip/amd_detail/texture_fetch_functions.h
+++ b/include/hip/amd_detail/texture_fetch_functions.h
@@ -25,10 +25,12 @@ THE SOFTWARE.
 #if defined(__cplusplus)
 
 #include <hip/hip_vector_types.h>
-#include <hip/texture_types.h>
+#include <hip/hip_texture_types.h>
 #include <hip/amd_detail/ockl_image.h>
 
+#if !defined(__HIPCC_RTC__)
 #include <type_traits>
+#endif // !defined(__HIPCC_RTC__)
 
 #define TEXTURE_PARAMETERS_INIT                                                                     \
     unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)t.textureObject; \
diff --git a/include/hip/amd_detail/texture_indirect_functions.h b/include/hip/amd_detail/texture_indirect_functions.h
index d55c9a50a4..87279da8c0 100644
--- a/include/hip/amd_detail/texture_indirect_functions.h
+++ b/include/hip/amd_detail/texture_indirect_functions.h
@@ -28,7 +28,9 @@ THE SOFTWARE.
 #include <hip/hip_texture_types.h>
 #include <hip/amd_detail/ockl_image.h>
 
+#if !defined(__HIPCC_RTC__)
 #include <type_traits>
+#endif // !defined(__HIPCC_RTC__)
 
 #define TEXTURE_OBJECT_PARAMETERS_INIT                                                            \
     unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject; \
diff --git a/include/hip/hip_runtime.h b/include/hip/hip_runtime.h
index e989eaa28e..73dd87226f 100644
--- a/include/hip/hip_runtime.h
+++ b/include/hip/hip_runtime.h
@@ -41,6 +41,7 @@ THE SOFTWARE.
 #error HIP is not supported on GFX10 with wavefront size 64
 #endif
 
+#if !defined(__HIPCC_RTC__)
 // Some standard header files, these are included by hc.hpp and so want to make them avail on both
 // paths to provide a consistent include env and avoid "missing symbol" errors that only appears
 // on NVCC path:
@@ -52,6 +53,7 @@ THE SOFTWARE.
 #if __cplusplus > 199711L
 #include <thread>
 #endif
+#endif // !defined(__HIPCC_RTC__)
 
 #include <hip/hip_version.h>
 #include <hip/hip_common.h>
@@ -107,8 +109,10 @@ THE SOFTWARE.
 #endif // defined(__clang__)
 #endif
 
+#if !defined(__HIPCC_RTC__)
 #include <hip/hip_runtime_api.h>
-#include <hip/hip_vector_types.h>
 #include <hip/library_types.h>
+#endif // !defined(__HIPCC_RTC__)
+#include <hip/hip_vector_types.h>
 
 #endif