diff --git a/hipamd/include/hip/amd_detail/amd_device_functions.h b/hipamd/include/hip/amd_detail/amd_device_functions.h
index 263834877a..38731a9548 100644
--- a/hipamd/include/hip/amd_detail/amd_device_functions.h
+++ b/hipamd/include/hip/amd_detail/amd_device_functions.h
@@ -60,7 +60,7 @@ __device__ static inline int __clz(int input) {
 }
 
 __device__ static inline int __clzll(long long int input) {
-    return __ockl_clz_u64((uint64_t)input);
+    return __ockl_clz_u64((__hip_uint64_t)input);
 }
 
 __device__ static inline int __ffs(unsigned int input) {
@@ -82,9 +82,9 @@ __device__ static inline int __ffsll(long long int input) {
 // Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
 // find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
 // If not found, return -1.
-__device__  static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
-  uint64_t temp_mask = mask;
-  int32_t temp_offset = offset;
+__device__  static __hip_int32_t __fns64(__hip_uint64_t mask, __hip_uint32_t base, __hip_int32_t offset) {
+  __hip_uint64_t temp_mask = mask;
+  __hip_int32_t temp_offset = offset;
 
   if (offset == 0) {
     temp_mask &= (1 << base);
@@ -99,10 +99,10 @@ __device__  static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset)
   temp_mask = temp_mask & ((~0ULL) << base);
   if (__builtin_popcountll(temp_mask) < temp_offset)
     return -1;
-  int32_t total = 0;
+  __hip_int32_t total = 0;
   for (int i = 0x20; i > 0; i >>= 1) {
-    uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
-    int32_t pcnt = __builtin_popcountll(temp_mask_lo);
+    __hip_uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
+    __hip_int32_t pcnt = __builtin_popcountll(temp_mask_lo);
     if (pcnt < temp_offset) {
       temp_mask = temp_mask >> i;
       temp_offset -= pcnt;
@@ -118,9 +118,9 @@ __device__  static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset)
     return total;
 }
 
-__device__ static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset) {
-  uint32_t temp_mask = mask;
-  int32_t temp_offset = offset;
+__device__ static __hip_int32_t __fns32(__hip_uint64_t mask, __hip_uint32_t base, __hip_int32_t offset) {
+  __hip_uint32_t temp_mask = mask;
+  __hip_int32_t temp_offset = offset;
   if (offset == 0) {
     temp_mask &= (1 << base);
     temp_offset = 1;
@@ -133,10 +133,10 @@ __device__ static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset)
   temp_mask = temp_mask & ((~0U) << base);
   if (__builtin_popcount(temp_mask) < temp_offset)
     return -1;
-  int32_t total = 0;
+  __hip_int32_t total = 0;
   for (int i = 0x10; i > 0; i >>= 1) {
-    uint32_t temp_mask_lo = temp_mask & ((1U << i) - 1);
-    int32_t pcnt = __builtin_popcount(temp_mask_lo);
+    __hip_uint32_t temp_mask_lo = temp_mask & ((1U << i) - 1);
+    __hip_int32_t pcnt = __builtin_popcount(temp_mask_lo);
     if (pcnt < temp_offset) {
       temp_mask = temp_mask >> i;
       temp_offset -= pcnt;
@@ -153,7 +153,7 @@ __device__ static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset)
 }
 
 // Wrapper around __fns32() to make porting from CUDA easier
-__device__ static int32_t __fns(unsigned int mask, unsigned int base, int offset) {
+__device__ static __hip_int32_t __fns(unsigned int mask, unsigned int base, int offset) {
     return __fns32(mask, base, offset);
 }
 
@@ -165,45 +165,45 @@ __device__ static inline unsigned long long int __brevll(unsigned long long int
     return __builtin_bitreverse64(input);
 }
 
-__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
+__device__ static inline unsigned int __lastbit_u32_u64(__hip_uint64_t input) {
     return input == 0 ? -1 : __builtin_ctzl(input);
 }
 
 __device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
-    uint32_t offset = src1 & 31;
-    uint32_t width = src2 & 31;
+    __hip_uint32_t offset = src1 & 31;
+    __hip_uint32_t width = src2 & 31;
     return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
 }
 
-__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
-    uint64_t offset = src1 & 63;
-    uint64_t width = src2 & 63;
+__device__ static inline __hip_uint64_t __bitextract_u64(__hip_uint64_t src0, unsigned int src1, unsigned int src2) {
+    __hip_uint64_t offset = src1 & 63;
+    __hip_uint64_t width = src2 & 63;
     return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
 }
 
 __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
-    uint32_t offset = src2 & 31;
-    uint32_t width = src3 & 31;
-    uint32_t mask = (1 << width) - 1;
+    __hip_uint32_t offset = src2 & 31;
+    __hip_uint32_t width = src3 & 31;
+    __hip_uint32_t mask = (1 << width) - 1;
     return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
 }
 
-__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
-    uint64_t offset = src2 & 63;
-    uint64_t width = src3 & 63;
-    uint64_t mask = (1ULL << width) - 1;
+__device__ static inline __hip_uint64_t __bitinsert_u64(__hip_uint64_t src0, __hip_uint64_t src1, unsigned int src2, unsigned int src3) {
+    __hip_uint64_t offset = src2 & 63;
+    __hip_uint64_t width = src3 & 63;
+    __hip_uint64_t mask = (1ULL << width) - 1;
     return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
 }
 
 __device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
 {
-    uint32_t mask_shift = shift & 31;
+    __hip_uint32_t mask_shift = shift & 31;
     return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
 }
 
 __device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
 {
-    uint32_t min_shift = shift >= 32 ? 32 : shift;
+    __hip_uint32_t min_shift = shift >= 32 ? 32 : shift;
     return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
 }
 
@@ -531,9 +531,9 @@ __device__ static inline unsigned int __float_as_uint(float x) {
 }
 
 __device__ static inline double __hiloint2double(int hi, int lo) {
-    static_assert(sizeof(double) == sizeof(uint64_t), "");
+    static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
 
-    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
     double tmp1;
     __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
 
@@ -700,32 +700,32 @@ void __named_sync() { __builtin_amdgcn_s_barrier(); }
 // hip.amdgcn.bc - lanemask
 __device__
 inline
-uint64_t  __lanemask_gt()
+__hip_uint64_t  __lanemask_gt()
 {
-    uint32_t lane = __ockl_lane_u32();
+    __hip_uint32_t lane = __ockl_lane_u32();
     if (lane == 63)
       return 0;
-    uint64_t ballot = __ballot64(1);
-    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
+    __hip_uint64_t ballot = __ballot64(1);
+    __hip_uint64_t mask = (~((__hip_uint64_t)0)) << (lane + 1);
     return mask & ballot;
 }
 
 __device__
 inline
-uint64_t __lanemask_lt()
+__hip_uint64_t __lanemask_lt()
 {
-    uint32_t lane = __ockl_lane_u32();
-    int64_t ballot = __ballot64(1);
-    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
+    __hip_uint32_t lane = __ockl_lane_u32();
+    __hip_int64_t ballot = __ballot64(1);
+    __hip_uint64_t mask = ((__hip_uint64_t)1 << lane) - (__hip_uint64_t)1;
     return mask & ballot;
 }
 
 __device__
 inline
-uint64_t  __lanemask_eq()
+__hip_uint64_t  __lanemask_eq()
 {
-    uint32_t lane = __ockl_lane_u32();
-    int64_t mask = ((uint64_t)1 << lane);
+    __hip_uint32_t lane = __ockl_lane_u32();
+    __hip_int64_t mask = ((__hip_uint64_t)1 << lane);
     return mask;
 }
 
diff --git a/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h b/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h
index deb3bfb7e2..937486f149 100644
--- a/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h
+++ b/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h
@@ -81,9 +81,9 @@ struct hip_bfloat16
     {
         union
         {
-            uint32_t int32;
+            __hip_uint32_t int32;
             float    fp32;
-        } u = {uint32_t(data) << 16};
+        } u = {__hip_uint32_t(data) << 16};
         return u.fp32;
     }
 
@@ -113,7 +113,7 @@ private:
         union
         {
             float    fp32;
-            uint32_t int32;
+            __hip_uint32_t int32;
         } u = {f};
         if(~u.int32 & 0x7f800000)
         {
@@ -156,7 +156,7 @@ private:
         union
         {
             float    fp32;
-            uint32_t int32;
+            __hip_uint32_t int32;
         } u = {f};
         return __hip_uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
     }
diff --git a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h
index 1025e543d5..4a154f62e1 100644
--- a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h
+++ b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h
@@ -49,18 +49,19 @@ namespace cooperative_groups {
  */
 class thread_group {
  protected:
-  uint32_t _type;  //! Type of the thread_group.
-  uint32_t _size;  //! Total number of threads in the tread_group.
-  uint64_t _mask;  //! Lanemask for coalesced and tiled partitioned group types,
-                   //! LSB represents lane 0, and MSB represents lane 63
+  __hip_uint32_t _type;  //! Type of the thread_group.
+  __hip_uint32_t _size;  //! Total number of threads in the thread_group.
+  __hip_uint64_t _mask;  //! Lanemask for coalesced and tiled partitioned group types,
+                         //! LSB represents lane 0, and MSB represents lane 63
 
   //! Construct a thread group, and set thread group type and other essential
   //! thread group properties. This generic thread group is directly constructed
-  //! only when the group is supposed to contain only the calling the thread
+  //! only when the group is supposed to contain only the calling thread
   //! (through the API - `this_thread()`), and in all other cases, this thread
   //! group object is a sub-object of some other derived thread group object.
-  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint64_t>(0),
-                                uint64_t mask = static_cast<uint64_t>(0)) {
+  __CG_QUALIFIER__ thread_group(internal::group_type type,
+                                __hip_uint32_t size = static_cast<__hip_uint64_t>(0),
+                                __hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) {
     _type = type;
     _size = size;
     _mask = mask;
@@ -88,11 +89,11 @@ class thread_group {
   //! Total number of threads in the thread_group, and this serves the purpose
   //! for all derived cooperative group types because their `size` is directly
   //! saved during the construction.
-  __CG_QUALIFIER__ uint32_t size() const { return _size; }
+  __CG_QUALIFIER__ __hip_uint32_t size() const { return _size; }
   //! Returns the type of the group.
   __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
   //! Rank of the calling thread within [0, \link size() size() \endlink).
-  __CG_QUALIFIER__ uint32_t thread_rank() const;
+  __CG_QUALIFIER__ __hip_uint32_t thread_rank() const;
   //! Returns true if the group has not violated any API constraints.
   __CG_QUALIFIER__ bool is_valid() const;
 
@@ -138,21 +139,21 @@ class multi_grid_group : public thread_group {
   friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
 
  protected:
-  //! Construct mutli-grid thread group (through the API this_multi_grid())
-  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
+  //! Construct multi-grid thread group (through the API this_multi_grid())
+  explicit __CG_QUALIFIER__ multi_grid_group(__hip_uint32_t size)
       : thread_group(internal::cg_multi_grid, size) {}
 
  public:
 
   //! Number of invocations participating in this multi-grid group. In other
   //! words, the number of GPUs.
-  __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
+  __CG_QUALIFIER__ __hip_uint32_t num_grids() { return internal::multi_grid::num_grids(); }
 
   //! Rank of this invocation. In other words, an ID number within the range
   //! [0, num_grids()) of the GPU that kernel is running on.
-  __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
+  __CG_QUALIFIER__ __hip_uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
   //! @copydoc thread_group::thread_rank
-  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
+  __CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
   //! @copydoc thread_group::is_valid
   __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
   //! @copydoc thread_group::sync
@@ -193,11 +194,11 @@ class grid_group : public thread_group {
 
  protected:
   //! Construct grid thread group (through the API this_grid())
-  explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}
+  explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size) : thread_group(internal::cg_grid, size) {}
 
  public:
   //! @copydoc thread_group::thread_rank
-  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
+  __CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::grid::thread_rank(); }
   //! @copydoc thread_group::is_valid
   __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
   //! @copydoc thread_group::sync
@@ -236,7 +237,7 @@ class thread_block : public thread_group {
                                                        unsigned int tile_size);
  protected:
   // Construct a workgroup thread group (through the API this_thread_block())
-  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
+  explicit __CG_QUALIFIER__ thread_block(__hip_uint32_t size)
       : thread_group(internal::cg_workgroup, size) {}
 
   __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
@@ -266,9 +267,9 @@ class thread_block : public thread_group {
   //! Returns 3-dimensional thread index within the block.
   __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
   //! @copydoc thread_group::thread_rank
-  __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
+  __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
   //! @copydoc thread_group::size
-  __CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); }
+  __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return internal::workgroup::size(); }
   //! @copydoc thread_group::is_valid
   __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
   //! @copydoc thread_group::sync
@@ -639,7 +640,7 @@ __CG_QUALIFIER__ coalesced_group coalesced_threads() {
  *  \note  This function is implemented on Linux and is under development
  *  on Microsoft Windows.
  */
-__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
+__CG_QUALIFIER__ __hip_uint32_t thread_group::thread_rank() const {
   switch (this->_type) {
     case internal::cg_multi_grid: {
       return (static_cast<const multi_grid_group*>(this)->thread_rank());
@@ -744,7 +745,7 @@ __CG_QUALIFIER__ void thread_group::sync() const {
  *           cooperative group type APIs. This function is implemented on Linux
  *           and is under development on Microsoft Windows.
  */
-template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }
+template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.size(); }
 
 /** \brief   Returns the rank of thread of the group.
  *
@@ -757,7 +758,7 @@ template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { retu
  *           cooperative group type APIs. This function is implemented on Linux
  *           and is under development on Microsoft Windows.
  */
-template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
+template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t thread_rank(CGTy const& g) {
   return g.thread_rank();
 }
 
diff --git a/hipamd/include/hip/amd_detail/amd_hip_runtime.h b/hipamd/include/hip/amd_detail/amd_hip_runtime.h
index ed529f0b26..c3860153a9 100644
--- a/hipamd/include/hip/amd_detail/amd_hip_runtime.h
+++ b/hipamd/include/hip/amd_detail/amd_hip_runtime.h
@@ -87,19 +87,6 @@ size_t amd_dbgapi_get_build_id();
 #include <math.h>
 #include <stdint.h>
 #endif // __cplusplus
-#else
-#if !__HIP_NO_STD_DEFS__
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-typedef signed int int32_t;
-typedef signed long long int64_t;
-namespace std {
-using ::uint32_t;
-using ::uint64_t;
-using ::int32_t;
-using ::int64_t;
-}
-#endif // __HIP_NO_STD_DEFS__
 #endif // !defined(__HIPCC_RTC__)
 
 #if __HIP_CLANG_ONLY__
@@ -249,11 +236,12 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
 
 #if defined(__HIPCC_RTC__)
 typedef struct dim3 {
-    uint32_t x;  ///< x
-    uint32_t y;  ///< y
-    uint32_t z;  ///< z
+    __hip_uint32_t x;  ///< x
+    __hip_uint32_t y;  ///< y
+    __hip_uint32_t z;  ///< z
 #ifdef __cplusplus
-    constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
+    constexpr __device__ dim3(__hip_uint32_t _x = 1, __hip_uint32_t _y = 1, __hip_uint32_t _z = 1)
+            : x(_x), y(_y), z(_z){};
 #endif
 } dim3;
 #endif // !defined(__HIPCC_RTC__)
diff --git a/hipamd/include/hip/amd_detail/amd_warp_functions.h b/hipamd/include/hip/amd_detail/amd_warp_functions.h
index d8d22ea0be..0de0939eb5 100644
--- a/hipamd/include/hip/amd_detail/amd_warp_functions.h
+++ b/hipamd/include/hip/amd_detail/amd_warp_functions.h
@@ -163,13 +163,13 @@ __device__
 inline
 double __shfl(MAYBE_UNDEF double var, int src_lane, int width = warpSize) {
     static_assert(sizeof(double) == 2 * sizeof(int), "");
-    static_assert(sizeof(double) == sizeof(uint64_t), "");
+    static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
 
     int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl(tmp[0], src_lane, width);
     tmp[1] = __shfl(tmp[1], src_lane, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
@@ -179,13 +179,13 @@ long __shfl(MAYBE_UNDEF long var, int src_lane, int width = warpSize)
 {
     #ifndef _MSC_VER
     static_assert(sizeof(long) == 2 * sizeof(int), "");
-    static_assert(sizeof(long) == sizeof(uint64_t), "");
+    static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
 
     int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl(tmp[0], src_lane, width);
     tmp[1] = __shfl(tmp[1], src_lane, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
     #else
@@ -198,13 +198,13 @@ inline
 unsigned long __shfl(MAYBE_UNDEF unsigned long var, int src_lane, int width = warpSize) {
     #ifndef _MSC_VER
     static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
 
     unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl(tmp[0], src_lane, width);
     tmp[1] = __shfl(tmp[1], src_lane, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
     #else
@@ -217,13 +217,13 @@ inline
 long long __shfl(MAYBE_UNDEF long long var, int src_lane, int width = warpSize)
 {
     static_assert(sizeof(long long) == 2 * sizeof(int), "");
-    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
 
     int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl(tmp[0], src_lane, width);
     tmp[1] = __shfl(tmp[1], src_lane, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
@@ -231,13 +231,13 @@ __device__
 inline
 unsigned long long __shfl(MAYBE_UNDEF unsigned long long var, int src_lane, int width = warpSize) {
     static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
 
     unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl(tmp[0], src_lane, width);
     tmp[1] = __shfl(tmp[1], src_lane, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
@@ -268,13 +268,13 @@ __device__
 inline
 double __shfl_up(MAYBE_UNDEF double var, unsigned int lane_delta, int width = warpSize) {
     static_assert(sizeof(double) == 2 * sizeof(int), "");
-    static_assert(sizeof(double) == sizeof(uint64_t), "");
+    static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
 
     int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_up(tmp[0], lane_delta, width);
     tmp[1] = __shfl_up(tmp[1], lane_delta, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
@@ -284,13 +284,13 @@ long __shfl_up(MAYBE_UNDEF long var, unsigned int lane_delta, int width = warpSi
 {
     #ifndef _MSC_VER
     static_assert(sizeof(long) == 2 * sizeof(int), "");
-    static_assert(sizeof(long) == sizeof(uint64_t), "");
+    static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
 
     int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_up(tmp[0], lane_delta, width);
     tmp[1] = __shfl_up(tmp[1], lane_delta, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
     #else
@@ -305,13 +305,13 @@ unsigned long __shfl_up(MAYBE_UNDEF unsigned long var, unsigned int lane_delta,
 {
     #ifndef _MSC_VER
     static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
 
     unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_up(tmp[0], lane_delta, width);
     tmp[1] = __shfl_up(tmp[1], lane_delta, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
     #else
@@ -325,11 +325,11 @@ inline
 long long __shfl_up(MAYBE_UNDEF long long var, unsigned int lane_delta, int width = warpSize)
 {
     static_assert(sizeof(long long) == 2 * sizeof(int), "");
-    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
     int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_up(tmp[0], lane_delta, width);
     tmp[1] = __shfl_up(tmp[1], lane_delta, width);
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
@@ -339,11 +339,11 @@ inline
 unsigned long long __shfl_up(MAYBE_UNDEF unsigned long long var, unsigned int lane_delta, int width = warpSize)
 {
     static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
     unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_up(tmp[0], lane_delta, width);
     tmp[1] = __shfl_up(tmp[1], lane_delta, width);
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
@@ -374,13 +374,13 @@ __device__
 inline
 double __shfl_down(MAYBE_UNDEF double var, unsigned int lane_delta, int width = warpSize) {
     static_assert(sizeof(double) == 2 * sizeof(int), "");
-    static_assert(sizeof(double) == sizeof(uint64_t), "");
+    static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
 
     int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_down(tmp[0], lane_delta, width);
     tmp[1] = __shfl_down(tmp[1], lane_delta, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
@@ -390,13 +390,13 @@ long __shfl_down(MAYBE_UNDEF long var, unsigned int lane_delta, int width = warp
 {
     #ifndef _MSC_VER
     static_assert(sizeof(long) == 2 * sizeof(int), "");
-    static_assert(sizeof(long) == sizeof(uint64_t), "");
+    static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
 
     int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_down(tmp[0], lane_delta, width);
     tmp[1] = __shfl_down(tmp[1], lane_delta, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
     #else
@@ -410,13 +410,13 @@ unsigned long __shfl_down(MAYBE_UNDEF unsigned long var, unsigned int lane_delta
 {
     #ifndef _MSC_VER
     static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
 
     unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_down(tmp[0], lane_delta, width);
     tmp[1] = __shfl_down(tmp[1], lane_delta, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
     #else
@@ -429,11 +429,11 @@ inline
 long long __shfl_down(MAYBE_UNDEF long long var, unsigned int lane_delta, int width = warpSize)
 {
     static_assert(sizeof(long long) == 2 * sizeof(int), "");
-    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
     int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_down(tmp[0], lane_delta, width);
     tmp[1] = __shfl_down(tmp[1], lane_delta, width);
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
@@ -442,11 +442,11 @@ inline
 unsigned long long __shfl_down(MAYBE_UNDEF unsigned long long var, unsigned int lane_delta, int width = warpSize)
 {
     static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
     unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_down(tmp[0], lane_delta, width);
     tmp[1] = __shfl_down(tmp[1], lane_delta, width);
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
@@ -477,13 +477,13 @@ __device__
 inline
 double __shfl_xor(MAYBE_UNDEF double var, int lane_mask, int width = warpSize) {
     static_assert(sizeof(double) == 2 * sizeof(int), "");
-    static_assert(sizeof(double) == sizeof(uint64_t), "");
+    static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
 
     int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
     tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
@@ -493,13 +493,13 @@ long __shfl_xor(MAYBE_UNDEF long var, int lane_mask, int width = warpSize)
 {
     #ifndef _MSC_VER
     static_assert(sizeof(long) == 2 * sizeof(int), "");
-    static_assert(sizeof(long) == sizeof(uint64_t), "");
+    static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
 
     int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
     tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
     #else
@@ -513,13 +513,13 @@ unsigned long __shfl_xor(MAYBE_UNDEF unsigned long var, int lane_mask, int width
 {
     #ifndef _MSC_VER
     static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
 
     unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
     tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
 
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
     #else
@@ -532,11 +532,11 @@ inline
 long long __shfl_xor(MAYBE_UNDEF long long var, int lane_mask, int width = warpSize)
 {
     static_assert(sizeof(long long) == 2 * sizeof(int), "");
-    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
     int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
     tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
@@ -545,11 +545,11 @@ inline
 unsigned long long __shfl_xor(MAYBE_UNDEF unsigned long long var, int lane_mask, int width = warpSize)
 {
     static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
-    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
     unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
     tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
     tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
-    uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
     unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
     return tmp1;
 }
diff --git a/hipamd/include/hip/amd_detail/device_library_decls.h b/hipamd/include/hip/amd_detail/device_library_decls.h
index 48abee3653..0395e6c779 100644
--- a/hipamd/include/hip/amd_detail/device_library_decls.h
+++ b/hipamd/include/hip/amd_detail/device_library_decls.h
@@ -23,7 +23,7 @@ THE SOFTWARE.
 /**
  *  @file  amd_detail/device_library_decls.h
  *  @brief Contains declarations for types and functions in device library.
- *         Uses int64_t and uint64_t instead of long, long long, unsigned
+ *         Uses __hip_int64_t and __hip_uint64_t instead of long, long long, unsigned
  *         long and unsigned long long types for device library API
  *         declarations.
  */
@@ -59,7 +59,7 @@ extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, ui
 extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar);
 extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort);
 extern "C" __device__ __attribute__((const)) uint __ockl_clz_u32(uint);
-extern "C" __device__ __attribute__((const)) uint64_t __ockl_clz_u64(uint64_t);
+extern "C" __device__ __attribute__((const)) __hip_uint64_t __ockl_clz_u64(__hip_uint64_t);
 
 extern "C" __device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
 extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
@@ -75,26 +75,26 @@ extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(floa
 extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s32(int);
 extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s32(int);
 extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s32(int);
-extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(uint32_t);
-extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(uint32_t);
-extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(uint32_t);
-extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(int64_t);
-extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(int64_t);
-extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(int64_t);
-extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(uint64_t);
-extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(uint64_t);
-extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(uint64_t);
-extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(int64_t);
-extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(int64_t);
-extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(int64_t);
-extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(uint64_t);
-extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(uint64_t);
-extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(uint64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(__hip_uint32_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(__hip_uint32_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(__hip_uint32_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(__hip_int64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(__hip_int64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(__hip_int64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(__hip_uint64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(__hip_uint64_t);
+extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(__hip_uint64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(__hip_int64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(__hip_int64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(__hip_int64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(__hip_uint64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(__hip_uint64_t);
+extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(__hip_uint64_t);
 
 extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid);
 extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid);
 
-extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32();
+extern "C" __device__ __attribute__((const)) __hip_uint32_t __ockl_lane_u32();
 extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void);
 extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void);
 extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void);
@@ -110,14 +110,14 @@ extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a
 extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a);
 extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);
 
-extern "C" __device__ uint64_t __ockl_fprintf_stderr_begin();
-extern "C" __device__ uint64_t __ockl_fprintf_append_args(uint64_t msg_desc, uint32_t num_args,
-                                                          uint64_t value0, uint64_t value1,
-                                                          uint64_t value2, uint64_t value3,
-                                                          uint64_t value4, uint64_t value5,
-                                                          uint64_t value6, uint32_t is_last);
-extern "C" __device__ uint64_t __ockl_fprintf_append_string_n(uint64_t msg_desc, const char* data,
-                                                              uint64_t length, uint32_t is_last);
+extern "C" __device__ __hip_uint64_t __ockl_fprintf_stderr_begin();
+extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_args(__hip_uint64_t msg_desc, __hip_uint32_t num_args,
+                                                          __hip_uint64_t value0, __hip_uint64_t value1,
+                                                          __hip_uint64_t value2, __hip_uint64_t value3,
+                                                          __hip_uint64_t value4, __hip_uint64_t value5,
+                                                          __hip_uint64_t value6, __hip_uint32_t is_last);
+extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_string_n(__hip_uint64_t msg_desc, const char* data,
+                                                              __hip_uint64_t length, __hip_uint32_t is_last);
 
 // Introduce local address space
 #define __local __attribute__((address_space(3)))
diff --git a/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h b/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h
index 0afae29da9..f5d22b782d 100644
--- a/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h
+++ b/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h
@@ -141,16 +141,16 @@ __CG_STATIC_QUALIFIER__ unsigned long long adjust_mask(
  */
 namespace multi_grid {
 
-__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
-  return static_cast<uint32_t>(__ockl_multi_grid_num_grids()); }
+__CG_STATIC_QUALIFIER__ __hip_uint32_t num_grids() {
+  return static_cast<__hip_uint32_t>(__ockl_multi_grid_num_grids()); }
 
-__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
-  return static_cast<uint32_t>(__ockl_multi_grid_grid_rank()); }
+__CG_STATIC_QUALIFIER__ __hip_uint32_t grid_rank() {
+  return static_cast<__hip_uint32_t>(__ockl_multi_grid_grid_rank()); }
 
-__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }
+__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); }
 
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  return static_cast<uint32_t>(__ockl_multi_grid_thread_rank()); }
+__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
+  return static_cast<__hip_uint32_t>(__ockl_multi_grid_thread_rank()); }
 
 __CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }
 
@@ -164,23 +164,23 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
  */
 namespace grid {
 
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
+__CG_STATIC_QUALIFIER__ __hip_uint32_t size() {
+  return static_cast<__hip_uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
                     (blockDim.x * gridDim.x));
 }
 
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
   // Compute global id of the workgroup to which the current thread belongs to
-  uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
+  __hip_uint32_t blkIdx = static_cast<__hip_uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
                                (blockIdx.y * gridDim.x) + (blockIdx.x));
 
   // Compute total number of threads being passed to reach current workgroup
   // within grid
-  uint32_t num_threads_till_current_workgroup =
-      static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
+  __hip_uint32_t num_threads_till_current_workgroup =
+      static_cast<__hip_uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
 
   // Compute thread local rank within current workgroup
-  uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
+  __hip_uint32_t local_thread_rank = static_cast<__hip_uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                                           (threadIdx.y * blockDim.x) + (threadIdx.x));
 
   return (num_threads_till_current_workgroup + local_thread_rank);
@@ -200,21 +200,21 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
 namespace workgroup {
 
 __CG_STATIC_QUALIFIER__ dim3 group_index() {
-  return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
-               static_cast<uint32_t>(blockIdx.z)));
+  return (dim3(static_cast<__hip_uint32_t>(blockIdx.x), static_cast<__hip_uint32_t>(blockIdx.y),
+               static_cast<__hip_uint32_t>(blockIdx.z)));
 }
 
 __CG_STATIC_QUALIFIER__ dim3 thread_index() {
-  return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
-               static_cast<uint32_t>(threadIdx.z)));
+  return (dim3(static_cast<__hip_uint32_t>(threadIdx.x), static_cast<__hip_uint32_t>(threadIdx.y),
+               static_cast<__hip_uint32_t>(threadIdx.z)));
 }
 
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
+__CG_STATIC_QUALIFIER__ __hip_uint32_t size() {
+  return (static_cast<__hip_uint32_t>(blockDim.x * blockDim.y * blockDim.z));
 }
 
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
+__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
+  return (static_cast<__hip_uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                      (threadIdx.y * blockDim.x) + (threadIdx.x)));
 }
 
@@ -225,8 +225,8 @@ __CG_STATIC_QUALIFIER__ bool is_valid() {
 __CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
 
 __CG_STATIC_QUALIFIER__ dim3 block_dim() {
-  return (dim3(static_cast<uint32_t>(blockDim.x), static_cast<uint32_t>(blockDim.y),
-          static_cast<uint32_t>(blockDim.z)));
+  return (dim3(static_cast<__hip_uint32_t>(blockDim.x), static_cast<__hip_uint32_t>(blockDim.y),
+          static_cast<__hip_uint32_t>(blockDim.z)));
 }
 
 }  // namespace workgroup