SWDEV-283266 - __HIPCC_RTC__ macro added to HIP headers

For hipRTC on Windows, add macro __HIPCC_RTC__ to allow online compilation of with device functions excluding standard C/C++ headers, system headers, and host HIP APIs. Change-Id: I1d91f042baf1359856ec83ab7030dc58785e0334
2021-05-05 21:26:21 +00:00
parent 73964037c1
commit 811bc363c8
@@ -26,9 +26,10 @@ THE SOFTWARE.
 #include "host_defines.h"
 #include "math_fwd.h"

+#if !defined(__HIPCC_RTC__)
 #include <hip/hip_runtime_api.h>
 #include <stddef.h>
-
+#endif // !defined(__HIPCC_RTC__)

 #include <hip/hip_vector_types.h>
 #include <hip/amd_detail/device_library_decls.h>
@@ -27,9 +27,11 @@ THE SOFTWARE.
 // It's defined here for workarround of rocThrust building failure.
 #define HIP_INCLUDE_HIP_HCC_DETAIL_DRIVER_TYPES_H

+#if !defined(__HIPCC_RTC__)
 #ifndef __cplusplus
 #include <stdbool.h>
 #endif
+#endif // !defined(__HIPCC_RTC__)

 typedef void* hipDeviceptr_t;
 typedef enum hipChannelFormatKind {
@@ -92,6 +94,7 @@ typedef struct hipArray {
    unsigned int textureType;
 }hipArray;

+#if !defined(__HIPCC_RTC__)
 typedef struct hip_Memcpy2D {
    size_t srcXInBytes;
    size_t srcY;
@@ -110,7 +113,7 @@ typedef struct hip_Memcpy2D {
    size_t WidthInBytes;
    size_t Height;
 } hip_Memcpy2D;
-
+#endif // !defined(__HIPCC_RTC__)

 typedef struct hipArray* hipArray_t;
 typedef hipArray_t hiparray;
@@ -359,6 +362,7 @@ typedef struct HIP_RESOURCE_VIEW_DESC_st
 * Memory copy types
 *
 */
+#if !defined(__HIPCC_RTC__)
 typedef enum hipMemcpyKind {
    hipMemcpyHostToHost = 0,      ///< Host-to-Host Copy
    hipMemcpyHostToDevice = 1,    ///< Host-to-Device Copy
@@ -470,5 +474,5 @@ typedef enum hipFunction_attribute {
    HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT,
    HIP_FUNC_ATTRIBUTE_MAX
 }hipFunction_attribute;
-
+#endif // !defined(__HIPCC_RTC__)
 #endif
@@ -111,7 +111,9 @@ float atomicAdd_system(float* address, float val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

+#if !defined(__HIPCC_RTC__)
 DEPRECATED("use atomicAdd instead")
+#endif // !defined(__HIPCC_RTC__)
 __device__
 inline
 void atomicAddNoRet(float* address, float val)
@@ -476,7 +478,9 @@ float atomicAdd(float* address, float val)
    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
 }

+#if !defined(__HIPCC_RTC__)
 DEPRECATED("use atomicAdd instead")
+#endif // !defined(__HIPCC_RTC__)
 __device__
 inline
 void atomicAddNoRet(float* address, float val)
@@ -25,6 +25,10 @@ THE SOFTWARE.

 #include "hip/amd_detail/hip_vector_types.h"

+#if defined(__HIPCC_RTC__)
+#define __HOST_DEVICE__ __device__
+#else
+#define __HOST_DEVICE__ __host__ __device__
 // TODO: Clang has a bug which allows device functions to call std functions
 // when std functions are introduced into default namespace by using statement.
 // math.h may be included after this bug is fixed.
@@ -33,10 +37,11 @@ THE SOFTWARE.
 #else
 #include "math.h"
 #endif
+#endif // !defined(__HIPCC_RTC__)

 #if __cplusplus
 #define COMPLEX_NEG_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator-(const type& op) {                             \
+    __HOST_DEVICE__ static inline type operator-(const type& op) {                             \
        type ret;                                                                                  \
        ret.x = -op.x;                                                                             \
        ret.y = -op.y;                                                                             \
@@ -44,17 +49,17 @@ THE SOFTWARE.
    }

 #define COMPLEX_EQ_OP_OVERLOAD(type)                                                               \
-    __device__ __host__ static inline bool operator==(const type& lhs, const type& rhs) {          \
+    __HOST_DEVICE__ static inline bool operator==(const type& lhs, const type& rhs) {          \
        return lhs.x == rhs.x && lhs.y == rhs.y;                                                   \
    }

 #define COMPLEX_NE_OP_OVERLOAD(type)                                                               \
-    __device__ __host__ static inline bool operator!=(const type& lhs, const type& rhs) {          \
+    __HOST_DEVICE__ static inline bool operator!=(const type& lhs, const type& rhs) {          \
        return !(lhs == rhs);                                                                      \
    }

 #define COMPLEX_ADD_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator+(const type& lhs, const type& rhs) {           \
+    __HOST_DEVICE__ static inline type operator+(const type& lhs, const type& rhs) {           \
        type ret;                                                                                  \
        ret.x = lhs.x + rhs.x;                                                                     \
        ret.y = lhs.y + rhs.y;                                                                     \
@@ -62,7 +67,7 @@ THE SOFTWARE.
    }

 #define COMPLEX_SUB_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator-(const type& lhs, const type& rhs) {           \
+    __HOST_DEVICE__ static inline type operator-(const type& lhs, const type& rhs) {           \
        type ret;                                                                                  \
        ret.x = lhs.x - rhs.x;                                                                     \
        ret.y = lhs.y - rhs.y;                                                                     \
@@ -70,7 +75,7 @@ THE SOFTWARE.
    }

 #define COMPLEX_MUL_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator*(const type& lhs, const type& rhs) {           \
+    __HOST_DEVICE__ static inline type operator*(const type& lhs, const type& rhs) {           \
        type ret;                                                                                  \
        ret.x = lhs.x * rhs.x - lhs.y * rhs.y;                                                     \
        ret.y = lhs.x * rhs.y + lhs.y * rhs.x;                                                     \
@@ -78,7 +83,7 @@ THE SOFTWARE.
    }

 #define COMPLEX_DIV_OP_OVERLOAD(type)                                                              \
-    __device__ __host__ static inline type operator/(const type& lhs, const type& rhs) {           \
+    __HOST_DEVICE__ static inline type operator/(const type& lhs, const type& rhs) {           \
        type ret;                                                                                  \
        ret.x = (lhs.x * rhs.x + lhs.y * rhs.y);                                                   \
        ret.y = (rhs.x * lhs.y - lhs.x * rhs.y);                                                   \
@@ -88,33 +93,33 @@ THE SOFTWARE.
    }

 #define COMPLEX_ADD_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator+=(type& lhs, const type& rhs) {               \
+    __HOST_DEVICE__ static inline type& operator+=(type& lhs, const type& rhs) {               \
        lhs.x += rhs.x;                                                                            \
        lhs.y += rhs.y;                                                                            \
        return lhs;                                                                                \
    }

 #define COMPLEX_SUB_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator-=(type& lhs, const type& rhs) {               \
+    __HOST_DEVICE__ static inline type& operator-=(type& lhs, const type& rhs) {               \
        lhs.x -= rhs.x;                                                                            \
        lhs.y -= rhs.y;                                                                            \
        return lhs;                                                                                \
    }

 #define COMPLEX_MUL_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator*=(type& lhs, const type& rhs) {               \
+    __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) {               \
        lhs = lhs * rhs;                                                                           \
        return lhs;                                                                                \
    }

 #define COMPLEX_DIV_PREOP_OVERLOAD(type)                                                           \
-    __device__ __host__ static inline type& operator/=(type& lhs, const type& rhs) {               \
+    __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) {               \
        lhs = lhs / rhs;                                                                           \
        return lhs;                                                                                \
    }

 #define COMPLEX_SCALAR_PRODUCT(type, type1)                                                        \
-    __device__ __host__ static inline type operator*(const type& lhs, type1 rhs) {                 \
+    __HOST_DEVICE__ static inline type operator*(const type& lhs, type1 rhs) {                 \
        type ret;                                                                                  \
        ret.x = lhs.x * rhs;                                                                       \
        ret.y = lhs.y * rhs;                                                                       \
@@ -125,41 +130,41 @@ THE SOFTWARE.

 typedef float2 hipFloatComplex;

-__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
+__HOST_DEVICE__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }

-__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
+__HOST_DEVICE__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }

-__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
+__HOST_DEVICE__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
    hipFloatComplex z;
    z.x = a;
    z.y = b;
    return z;
 }

-__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
+__HOST_DEVICE__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
    hipFloatComplex ret;
    ret.x = z.x;
    ret.y = -z.y;
    return ret;
 }

-__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
+__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) {
    return z.x * z.x + z.y * z.y;
 }

-__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
+__HOST_DEVICE__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
    return make_hipFloatComplex(p.x + q.x, p.y + q.y);
 }

-__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
+__HOST_DEVICE__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
    return make_hipFloatComplex(p.x - q.x, p.y - q.y);
 }

-__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
+__HOST_DEVICE__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
    return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }

-__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
+__HOST_DEVICE__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
    float sqabs = hipCsqabsf(q);
    hipFloatComplex ret;
    ret.x = (p.x * q.x + p.y * q.y) / sqabs;
@@ -167,46 +172,46 @@ __device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hi
    return ret;
 }

-__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
+__HOST_DEVICE__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }


 typedef double2 hipDoubleComplex;

-__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
+__HOST_DEVICE__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }

-__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
+__HOST_DEVICE__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }

-__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
+__HOST_DEVICE__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
    hipDoubleComplex z;
    z.x = a;
    z.y = b;
    return z;
 }

-__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
+__HOST_DEVICE__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
    hipDoubleComplex ret;
    ret.x = z.x;
    ret.y = -z.y;
    return ret;
 }

-__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
+__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) {
    return z.x * z.x + z.y * z.y;
 }

-__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
+__HOST_DEVICE__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
    return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
 }

-__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
+__HOST_DEVICE__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
    return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
 }

-__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
+__HOST_DEVICE__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
    return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
 }

-__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
+__HOST_DEVICE__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
    double sqabs = hipCsqabs(q);
    hipDoubleComplex ret;
    ret.x = (p.x * q.x + p.y * q.y) / sqabs;
@@ -214,7 +219,7 @@ __device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, h
    return ret;
 }

-__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); }
+__HOST_DEVICE__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); }


 #if __cplusplus
@@ -268,19 +273,19 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)

 typedef hipFloatComplex hipComplex;

-__device__ __host__ static inline hipComplex make_hipComplex(float x, float y) {
+__HOST_DEVICE__ static inline hipComplex make_hipComplex(float x, float y) {
    return make_hipFloatComplex(x, y);
 }

-__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
+__HOST_DEVICE__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
    return make_hipFloatComplex((float)z.x, (float)z.y);
 }

-__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
+__HOST_DEVICE__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
    return make_hipDoubleComplex((double)z.x, (double)z.y);
 }

-__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
+__HOST_DEVICE__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
    float real = (p.x * q.x) + r.x;
    float imag = (q.x * p.y) + r.y;

@@ -290,7 +295,7 @@ __device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q
    return make_hipComplex(real, imag);
 }

-__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
+__HOST_DEVICE__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
                                                           hipDoubleComplex r) {
    double real = (p.x * q.x) + r.x;
    double imag = (q.x * p.y) + r.y;
@@ -25,14 +25,18 @@ THE SOFTWARE.
 #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H

 #include <hip/amd_detail/hip_common.h>
-
 #include "hip/amd_detail/host_defines.h"
-#include <assert.h>
-#if defined(__cplusplus)
+#if defined(__HIPCC_RTC__)
+  #define __HOST_DEVICE__ __device__
+#else
+  #define __HOST_DEVICE__ __host__ __device__
+  #include <assert.h>
+  #if defined(__cplusplus)
    #include <algorithm>
    #include <type_traits>
    #include <utility>
 #endif
+#endif // !defined(__HIPCC_RTC__)

 #if __HIP_CLANG_ONLY__
    typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));
@@ -82,46 +86,46 @@ THE SOFTWARE.
            };
        public:
            // CREATORS
-            __host__ __device__
+            __HOST_DEVICE__
            __half() = default;
-            __host__ __device__
+            __HOST_DEVICE__
            __half(const __half_raw& x) : data{x.data} {}
            #if !defined(__HIP_NO_HALF_CONVERSIONS__)
-                __host__ __device__
+                __HOST_DEVICE__
                __half(decltype(data) x) : data{x} {}
                template<
                    typename T,
                    Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
-                __host__ __device__
+                __HOST_DEVICE__
                __half(T x) : data{static_cast<_Float16>(x)} {}
            #endif
-            __host__ __device__
+            __HOST_DEVICE__
            __half(const __half&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
            __half(__half&&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
            ~__half() = default;

            // CREATORS - DEVICE ONLY
            #if !defined(__HIP_NO_HALF_CONVERSIONS__)
                template<
                    typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
-                __host__ __device__
+                __HOST_DEVICE__
                __half(T x) : data{static_cast<_Float16>(x)} {}
            #endif

            // MANIPULATORS
-            __host__ __device__
+            __HOST_DEVICE__
            __half& operator=(const __half&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
            __half& operator=(__half&&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
            __half& operator=(const __half_raw& x)
            {
                data = x.data;
                return *this;
            }
-            __host__ __device__
+            __HOST_DEVICE__
            volatile __half& operator=(const __half_raw& x) volatile
            {
                data = x.data;
@@ -151,7 +155,7 @@ THE SOFTWARE.
                template<
                    typename T,
                    Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
-                __host__ __device__
+                __HOST_DEVICE__
                __half& operator=(T x)
                {
                    data = static_cast<_Float16>(x);
@@ -221,12 +225,12 @@ THE SOFTWARE.
                template<
                    typename T,
                    Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
-                __host__ __device__
+                __HOST_DEVICE__
                operator T() const { return data; }
            #endif
-            __host__ __device__
+            __HOST_DEVICE__
            operator __half_raw() const { return __half_raw{data}; }
-            __host__ __device__
+            __HOST_DEVICE__
            operator __half_raw() const volatile
            {
                return __half_raw{data};
@@ -235,7 +239,7 @@ THE SOFTWARE.
            #if !defined(__HIP_NO_HALF_CONVERSIONS__)
                template<
                    typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
-                __host__ __device__
+                __HOST_DEVICE__
                operator T() const { return data; }
            #endif

@@ -342,38 +346,38 @@ THE SOFTWARE.
            };

            // CREATORS
-            __host__ __device__
+            __HOST_DEVICE__
            __half2() = default;
-            __host__ __device__
+            __HOST_DEVICE__
            __half2(const __half2_raw& x) : data{x.data} {}
-            __host__ __device__
+            __HOST_DEVICE__
            __half2(decltype(data) x) : data{x} {}
-            __host__ __device__
+            __HOST_DEVICE__
            __half2(const __half& x, const __half& y)
                :
                data{
                    static_cast<__half_raw>(x).data,
                    static_cast<__half_raw>(y).data}
            {}
-            __host__ __device__
+            __HOST_DEVICE__
            __half2(const __half2&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
            __half2(__half2&&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
            ~__half2() = default;

            // MANIPULATORS
-            __host__ __device__
+            __HOST_DEVICE__
            __half2& operator=(const __half2&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
            __half2& operator=(__half2&&) = default;
-            __host__ __device__
+            __HOST_DEVICE__
            __half2& operator=(const __half2_raw& x)
            {
                data = x.data;
                return *this;
            }
- 
+
            // MANIPULATORS - DEVICE ONLY
            #if !defined(__HIP_NO_HALF_OPERATORS__)
                __device__
@@ -421,9 +425,9 @@ THE SOFTWARE.
            #endif

            // ACCESSORS
-            __host__ __device__
+            __HOST_DEVICE__
            operator decltype(data)() const { return data; }
-            __host__ __device__
+            __HOST_DEVICE__
            operator __half2_raw() const { return __half2_raw{data}; }

            // ACCESSORS - DEVICE ONLY
@@ -520,42 +524,42 @@ THE SOFTWARE.
        namespace
        {
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 make_half2(__half x, __half y)
            {
                return __half2{x, y};
            }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half __low2half(__half2 x)
            {
                return __half{__half_raw{static_cast<__half2_raw>(x).data.x}};
            }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half __high2half(__half2 x)
            {
                return __half{__half_raw{static_cast<__half2_raw>(x).data.y}};
            }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __half2half2(__half x)
            {
                return __half2{x, x};
            }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __halves2half2(__half x, __half y)
            {
                return __half2{x, y};
            }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __low2half2(__half2 x)
            {
                return __half2{
@@ -565,7 +569,7 @@ THE SOFTWARE.
            }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __high2half2(__half2 x)
            {
                return __half2_raw{
@@ -575,7 +579,7 @@ THE SOFTWARE.
            }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __lows2half2(__half2 x, __half2 y)
            {
                return __half2_raw{
@@ -585,7 +589,7 @@ THE SOFTWARE.
            }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __highs2half2(__half2 x, __half2 y)
            {
                return __half2_raw{
@@ -595,7 +599,7 @@ THE SOFTWARE.
            }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __lowhigh2highlow(__half2 x)
            {
                return __half2_raw{
@@ -638,37 +642,37 @@ THE SOFTWARE.
            // TODO: rounding behaviour is not correct.
            // float -> half | half2
            inline
-            __device__ __host__
+            __HOST_DEVICE__
            __half __float2half(float x)
            {
                return __half_raw{static_cast<_Float16>(x)};
            }
            inline
-            __device__ __host__
+            __HOST_DEVICE__
            __half __float2half_rn(float x)
            {
                return __half_raw{static_cast<_Float16>(x)};
            }
            inline
-            __device__ __host__
+            __HOST_DEVICE__
            __half __float2half_rz(float x)
            {
                return __half_raw{static_cast<_Float16>(x)};
            }
            inline
-            __device__ __host__
+            __HOST_DEVICE__
            __half __float2half_rd(float x)
            {
                return __half_raw{static_cast<_Float16>(x)};
            }
            inline
-            __device__ __host__
+            __HOST_DEVICE__
            __half __float2half_ru(float x)
            {
                return __half_raw{static_cast<_Float16>(x)};
            }
            inline
-            __device__ __host__
+            __HOST_DEVICE__
            __half2 __float2half2_rn(float x)
            {
                return __half2_raw{
@@ -676,14 +680,14 @@ THE SOFTWARE.
                        static_cast<_Float16>(x), static_cast<_Float16>(x)}};
            }
            inline
-            __device__ __host__
+            __HOST_DEVICE__
            __half2 __floats2half2_rn(float x, float y)
            {
                return __half2_raw{_Float16_2{
                    static_cast<_Float16>(x), static_cast<_Float16>(y)}};
            }
            inline
-            __device__ __host__
+            __HOST_DEVICE__
            __half2 __float22half2_rn(float2 x)
            {
                return __floats2half2_rn(x.x, x.y);
@@ -691,25 +695,25 @@ THE SOFTWARE.

            // half | half2 -> float
            inline
-            __device__ __host__
+            __HOST_DEVICE__
            float __half2float(__half x)
            {
                return static_cast<__half_raw>(x).data;
            }
            inline
-            __device__ __host__
+            __HOST_DEVICE__
            float __low2float(__half2 x)
            {
                return static_cast<__half2_raw>(x).data.x;
            }
            inline
-            __device__ __host__
+            __HOST_DEVICE__
            float __high2float(__half2 x)
            {
                return static_cast<__half2_raw>(x).data.y;
            }
            inline
-            __device__ __host__
+            __HOST_DEVICE__
            float2 __half22float2(__half2 x)
            {
                return make_float2(
@@ -1044,16 +1048,16 @@ THE SOFTWARE.
            __half __ldcs(const __half* ptr) { return *ptr; }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __ldg(const __half2* ptr) { return *ptr; }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __ldcg(const __half2* ptr) { return *ptr; }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __ldca(const __half2* ptr) { return *ptr; }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __ldcs(const __half2* ptr) { return *ptr; }

            // Relations
@@ -1119,7 +1123,7 @@ THE SOFTWARE.
            bool __hgtu(__half x, __half y) { return __hgt(x, y); }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __heq2(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(x).data ==
@@ -1127,7 +1131,7 @@ THE SOFTWARE.
                return __builtin_convertvector(-r, _Float16_2);
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hne2(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(x).data !=
@@ -1135,7 +1139,7 @@ THE SOFTWARE.
                return __builtin_convertvector(-r, _Float16_2);
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hle2(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(x).data <=
@@ -1143,7 +1147,7 @@ THE SOFTWARE.
                return __builtin_convertvector(-r, _Float16_2);
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hge2(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(x).data >=
@@ -1151,7 +1155,7 @@ THE SOFTWARE.
                return __builtin_convertvector(-r, _Float16_2);
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hlt2(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(x).data <
@@ -1159,7 +1163,7 @@ THE SOFTWARE.
                return __builtin_convertvector(-r, _Float16_2);
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hgt2(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(x).data >
@@ -1167,83 +1171,83 @@ THE SOFTWARE.
                return __builtin_convertvector(-r, _Float16_2);
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hequ2(__half2 x, __half2 y) { return __heq2(x, y); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hneu2(__half2 x, __half2 y) { return __hne2(x, y); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hleu2(__half2 x, __half2 y) { return __hle2(x, y); }
            inline
-            __host__  __device__
+            __HOST_DEVICE__
            __half2 __hgeu2(__half2 x, __half2 y) { return __hge2(x, y); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hltu2(__half2 x, __half2 y) { return __hlt2(x, y); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hgtu2(__half2 x, __half2 y) { return __hgt2(x, y); }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            bool __hbeq2(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(__heq2(x, y));
                return r.data.x != 0 && r.data.y != 0;
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            bool __hbne2(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(__hne2(x, y));
                return r.data.x != 0 && r.data.y != 0;
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            bool __hble2(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(__hle2(x, y));
                return r.data.x != 0 && r.data.y != 0;
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            bool __hbge2(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(__hge2(x, y));
                return r.data.x != 0 && r.data.y != 0;
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            bool __hblt2(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(__hlt2(x, y));
                return r.data.x != 0 && r.data.y != 0;
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            bool __hbgt2(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(__hgt2(x, y));
                return r.data.x != 0 && r.data.y != 0;
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); }

            // Arithmetic
@@ -1332,7 +1336,7 @@ THE SOFTWARE.
            }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hadd2(__half2 x, __half2 y)
            {
                return __half2_raw{
@@ -1340,14 +1344,14 @@ THE SOFTWARE.
                    static_cast<__half2_raw>(y).data};
            }
 	    inline
-	    __host__ __device__
+	    __HOST_DEVICE__
 	    __half2 __habs2(__half2 x)
 	    {
 	        return __half2_raw{
 		    __ocml_fabs_2f16(static_cast<__half2_raw>(x).data)};
 	    }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hsub2(__half2 x, __half2 y)
            {
                return __half2_raw{
@@ -1355,7 +1359,7 @@ THE SOFTWARE.
                    static_cast<__half2_raw>(y).data};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hmul2(__half2 x, __half2 y)
            {
                return __half2_raw{
@@ -1363,7 +1367,7 @@ THE SOFTWARE.
                    static_cast<__half2_raw>(y).data};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hadd2_sat(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(__hadd2(x, y));
@@ -1372,7 +1376,7 @@ THE SOFTWARE.
                    __clamp_01(__half_raw{r.data.y})};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hsub2_sat(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(__hsub2(x, y));
@@ -1381,7 +1385,7 @@ THE SOFTWARE.
                    __clamp_01(__half_raw{r.data.y})};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hmul2_sat(__half2 x, __half2 y)
            {
                auto r = static_cast<__half2_raw>(__hmul2(x, y));
@@ -1390,13 +1394,13 @@ THE SOFTWARE.
                    __clamp_01(__half_raw{r.data.y})};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hfma2(__half2 x, __half2 y, __half2 z)
            {
                return __half2_raw{__ocml_fma_2f16(x, y, z)};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hfma2_sat(__half2 x, __half2 y, __half2 z)
            {
                auto r = static_cast<__half2_raw>(__hfma2(x, y, z));
@@ -1405,7 +1409,7 @@ THE SOFTWARE.
                    __clamp_01(__half_raw{r.data.y})};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __h2div(__half2 x, __half2 y)
            {
                return __half2_raw{
@@ -1548,82 +1552,82 @@ THE SOFTWARE.
            }

            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2trunc(__half2 x)
            {
                return __half2_raw{__ocml_trunc_2f16(x)};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2ceil(__half2 x)
            {
                return __half2_raw{__ocml_ceil_2f16(x)};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2floor(__half2 x)
            {
                return __half2_raw{__ocml_floor_2f16(x)};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2rint(__half2 x)
            {
                return __half2_raw{__ocml_rint_2f16(x)};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2sin(__half2 x)
            {
                return __half2_raw{__ocml_sin_2f16(x)};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2cos(__half2 x)
            {
                return __half2_raw{__ocml_cos_2f16(x)};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2exp(__half2 x)
            {
                return __half2_raw{__ocml_exp_2f16(x)};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2exp2(__half2 x)
            {
                return __half2_raw{__ocml_exp2_2f16(x)};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2exp10(__half2 x)
            {
                return __half2_raw{__ocml_exp10_2f16(x)};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2log2(__half2 x)
            {
                return __half2_raw{__ocml_log2_2f16(x)};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2log(__half2 x) { return __ocml_log_2f16(x); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hisinf2(__half2 x)
            {
                auto r = __ocml_isinf_2f16(x);
@@ -1631,7 +1635,7 @@ THE SOFTWARE.
                    static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
            }
            inline
-            __host__  __device__
+            __HOST_DEVICE__
            __half2 __hisnan2(__half2 x)
            {
                auto r = __ocml_isnan_2f16(x);
@@ -1639,7 +1643,7 @@ THE SOFTWARE.
                    static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
            }
            inline
-            __host__ __device__
+            __HOST_DEVICE__
            __half2 __hneg2(__half2 x)
            {
                return __half2_raw{-static_cast<__half2_raw>(x).data};
@@ -34,6 +34,7 @@ THE SOFTWARE.
 //---
 // Top part of file can be compiled with any compiler

+#if !defined(__HIPCC_RTC__)
 //#include <cstring>
 #if __cplusplus
 #include <cmath>
@@ -42,7 +43,8 @@ THE SOFTWARE.
 #include <math.h>
 #include <string.h>
 #include <stddef.h>
-#endif  //__cplusplus
+#endif // __cplusplus
+#endif // !defined(__HIPCC_RTC__)

 // __hip_malloc is not working. Disable it by default.
 #ifndef __HIP_ENABLE_DEVICE_MALLOC__
@@ -57,9 +59,10 @@ THE SOFTWARE.

 #define CUDA_SUCCESS hipSuccess

+#if !defined(__HIPCC_RTC__)
 #include <hip/hip_runtime_api.h>
-
 extern int HIP_TRACE_API;
+#endif // !defined(__HIPCC_RTC__)

 #ifdef __cplusplus
 #include <hip/amd_detail/hip_ldg.h>
@@ -121,7 +124,9 @@ extern int HIP_TRACE_API;
 #define __launch_bounds__(...)                                                                     \
    select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)

+#if !defined(__HIPCC_RTC__)
 __host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
+#endif // !defined(__HIPCC_RTC__)

 #if __HIP_ARCH_GFX701__ == 0

@@ -162,6 +167,7 @@ static inline __device__ void* free(void* ptr) { __builtin_trap(); return nullpt
 //
 // hip-clang functions
 //
+#if !defined(__HIPCC_RTC__)
 #define HIP_KERNEL_NAME(...) __VA_ARGS__
 #define HIP_SYMBOL(X) X

@@ -218,6 +224,8 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
 #endif

 #include <hip/hip_runtime_api.h>
+#endif // !defined(__HIPCC_RTC__)
+
 extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
 extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
 extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
@@ -245,6 +253,17 @@ struct __HIP_ThreadIdx {
  }
 };

+#if defined(__HIPCC_RTC__)
+typedef struct dim3 {
+    uint32_t x;  ///< x
+    uint32_t y;  ///< y
+    uint32_t z;  ///< z
+#ifdef __cplusplus
+    constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
+#endif
+} dim3;
+#endif // !defined(__HIPCC_RTC__)
+
 template <typename F>
 struct __HIP_Coordinates {
  using R = decltype(F{}(0));
@@ -371,6 +390,7 @@ hc_get_workitem_absolute_id(int dim)
 #endif

 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#if !defined(__HIPCC_RTC__)
 // Support std::complex.
 #if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
 #pragma push_macro("__CUDA__")
@@ -388,6 +408,7 @@ hc_get_workitem_absolute_id(int dim)
 #undef __CUDA__
 #pragma pop_macro("__CUDA__")
 #endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#endif // !defined(__HIPCC_RTC__)
 #endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
 #endif // __HIP_CLANG_ONLY__

@@ -33,9 +33,10 @@ THE SOFTWARE.
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
+#if !defined(__HIPCC_RTC__)
 #include <limits.h>
-//#include <hip/amd_detail/driver_types.h>
 #include <hip/amd_detail/channel_descriptor.h>
+#endif // !defined(__HIPCC_RTC__)
 #include <hip/amd_detail/texture_types.h>

 #if __cplusplus
@@ -28,18 +28,18 @@ THE SOFTWARE.

 #include <hip/amd_detail/host_defines.h>

+#if !defined(__HIPCC_RTC__)
 #include <algorithm>
-
 // assert.h is only for the host version of assert.
 // The device version of assert is implemented in hip/amd_detail/hip_runtime.h.
 // Users should include hip_runtime.h for the device version of assert.
 #if !__HIP_DEVICE_COMPILE__
 #include <assert.h>
 #endif
-
 #include <limits.h>
 #include <limits>
 #include <stdint.h>
+#endif // !defined(__HIPCC_RTC__)

 #if _LIBCPP_VERSION && __HIP__
 namespace std {
@@ -1460,6 +1460,7 @@ double min(double x, double y) {
 __HIP_OVERLOAD2(double, max)
 __HIP_OVERLOAD2(double, min)

+#if !defined(__HIPCC_RTC__)
 __host__ inline static int min(int arg1, int arg2) {
  return std::min(arg1, arg2);
 }
@@ -1467,6 +1468,7 @@ __host__ inline static int min(int arg1, int arg2) {
 __host__ inline static int max(int arg1, int arg2) {
  return std::max(arg1, arg2);
 }
+#endif // !defined(__HIPCC_RTC__)

 __DEVICE__
 inline float pow(float base, int iexp) {
@@ -25,10 +25,12 @@ THE SOFTWARE.
 #if defined(__cplusplus)

 #include <hip/hip_vector_types.h>
-#include <hip/texture_types.h>
+#include <hip/hip_texture_types.h>
 #include <hip/amd_detail/ockl_image.h>

+#if !defined(__HIPCC_RTC__)
 #include <type_traits>
+#endif // !defined(__HIPCC_RTC__)

 #define TEXTURE_PARAMETERS_INIT                                                                     \
    unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)t.textureObject; \
@@ -28,7 +28,9 @@ THE SOFTWARE.
 #include <hip/hip_texture_types.h>
 #include <hip/amd_detail/ockl_image.h>

+#if !defined(__HIPCC_RTC__)
 #include <type_traits>
+#endif // !defined(__HIPCC_RTC__)

 #define TEXTURE_OBJECT_PARAMETERS_INIT                                                            \
    unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject; \
@@ -41,6 +41,7 @@ THE SOFTWARE.
 #error HIP is not supported on GFX10 with wavefront size 64
 #endif

+#if !defined(__HIPCC_RTC__)
 // Some standard header files, these are included by hc.hpp and so want to make them avail on both
 // paths to provide a consistent include env and avoid "missing symbol" errors that only appears
 // on NVCC path:
@@ -52,6 +53,7 @@ THE SOFTWARE.
 #if __cplusplus > 199711L
 #include <thread>
 #endif
+#endif // !defined(__HIPCC_RTC__)

 #include <hip/hip_version.h>
 #include <hip/hip_common.h>
@@ -107,8 +109,10 @@ THE SOFTWARE.
 #endif // defined(__clang__)
 #endif

+#if !defined(__HIPCC_RTC__)
 #include <hip/hip_runtime_api.h>
-#include <hip/hip_vector_types.h>
 #include <hip/library_types.h>
+#endif // !defined(__HIPCC_RTC__)
+#include <hip/hip_vector_types.h>

 #endif