diff --git a/hipamd/include/hip/amd_detail/amd_device_functions.h b/hipamd/include/hip/amd_detail/amd_device_functions.h index 263834877a..38731a9548 100644 --- a/hipamd/include/hip/amd_detail/amd_device_functions.h +++ b/hipamd/include/hip/amd_detail/amd_device_functions.h @@ -60,7 +60,7 @@ __device__ static inline int __clz(int input) { } __device__ static inline int __clzll(long long int input) { - return __ockl_clz_u64((uint64_t)input); + return __ockl_clz_u64((__hip_uint64_t)input); } __device__ static inline int __ffs(unsigned int input) { @@ -82,9 +82,9 @@ __device__ static inline int __ffsll(long long int input) { // Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE), // find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position. // If not found, return -1. -__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) { - uint64_t temp_mask = mask; - int32_t temp_offset = offset; +__device__ static __hip_int32_t __fns64(__hip_uint64_t mask, __hip_uint32_t base, __hip_int32_t offset) { + __hip_uint64_t temp_mask = mask; + __hip_int32_t temp_offset = offset; if (offset == 0) { temp_mask &= (1 << base); @@ -99,10 +99,10 @@ __device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) temp_mask = temp_mask & ((~0ULL) << base); if (__builtin_popcountll(temp_mask) < temp_offset) return -1; - int32_t total = 0; + __hip_int32_t total = 0; for (int i = 0x20; i > 0; i >>= 1) { - uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1); - int32_t pcnt = __builtin_popcountll(temp_mask_lo); + __hip_uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1); + __hip_int32_t pcnt = __builtin_popcountll(temp_mask_lo); if (pcnt < temp_offset) { temp_mask = temp_mask >> i; temp_offset -= pcnt; @@ -118,9 +118,9 @@ __device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) return total; } -__device__ static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset) { - uint32_t temp_mask = mask; - int32_t temp_offset = offset; +__device__ static __hip_int32_t __fns32(__hip_uint64_t mask, __hip_uint32_t base, __hip_int32_t offset) { + __hip_uint32_t temp_mask = mask; + __hip_int32_t temp_offset = offset; if (offset == 0) { temp_mask &= (1 << base); temp_offset = 1; @@ -133,10 +133,10 @@ __device__ static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset) temp_mask = temp_mask & ((~0U) << base); if (__builtin_popcount(temp_mask) < temp_offset) return -1; - int32_t total = 0; + __hip_int32_t total = 0; for (int i = 0x10; i > 0; i >>= 1) { - uint32_t temp_mask_lo = temp_mask & ((1U << i) - 1); - int32_t pcnt = __builtin_popcount(temp_mask_lo); + __hip_uint32_t temp_mask_lo = temp_mask & ((1U << i) - 1); + __hip_int32_t pcnt = __builtin_popcount(temp_mask_lo); if (pcnt < temp_offset) { temp_mask = temp_mask >> i; temp_offset -= pcnt; @@ -153,7 +153,7 @@ __device__ static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset) } // Wrapper around __fns32() to make porting from CUDA easier -__device__ static int32_t __fns(unsigned int mask, unsigned int base, int offset) { +__device__ static __hip_int32_t __fns(unsigned int mask, unsigned int base, int offset) { return __fns32(mask, base, offset); } @@ -165,45 +165,45 @@ __device__ static inline unsigned long long int __brevll(unsigned long long int return __builtin_bitreverse64(input); } -__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) { +__device__ static inline unsigned int __lastbit_u32_u64(__hip_uint64_t input) { return input == 0 ? -1 : __builtin_ctzl(input); } __device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) { - uint32_t offset = src1 & 31; - uint32_t width = src2 & 31; + __hip_uint32_t offset = src1 & 31; + __hip_uint32_t width = src2 & 31; return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width); } -__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) { - uint64_t offset = src1 & 63; - uint64_t width = src2 & 63; +__device__ static inline __hip_uint64_t __bitextract_u64(__hip_uint64_t src0, unsigned int src1, unsigned int src2) { + __hip_uint64_t offset = src1 & 63; + __hip_uint64_t width = src2 & 63; return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width); } __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) { - uint32_t offset = src2 & 31; - uint32_t width = src3 & 31; - uint32_t mask = (1 << width) - 1; + __hip_uint32_t offset = src2 & 31; + __hip_uint32_t width = src3 & 31; + __hip_uint32_t mask = (1 << width) - 1; return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset)); } -__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) { - uint64_t offset = src2 & 63; - uint64_t width = src3 & 63; - uint64_t mask = (1ULL << width) - 1; +__device__ static inline __hip_uint64_t __bitinsert_u64(__hip_uint64_t src0, __hip_uint64_t src1, unsigned int src2, unsigned int src3) { + __hip_uint64_t offset = src2 & 63; + __hip_uint64_t width = src3 & 63; + __hip_uint64_t mask = (1ULL << width) - 1; return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset)); } __device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift) { - uint32_t mask_shift = shift & 31; + __hip_uint32_t mask_shift = shift & 31; return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift); } __device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift) { - uint32_t min_shift = shift >= 32 ? 32 : shift; + __hip_uint32_t min_shift = shift >= 32 ? 32 : shift; return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift); } @@ -531,9 +531,9 @@ __device__ static inline unsigned int __float_as_uint(float x) { } __device__ static inline double __hiloint2double(int hi, int lo) { - static_assert(sizeof(double) == sizeof(uint64_t), ""); + static_assert(sizeof(double) == sizeof(__hip_uint64_t), ""); - uint64_t tmp0 = (static_cast(hi) << 32ull) | static_cast(lo); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo); double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); @@ -700,32 +700,32 @@ void __named_sync() { __builtin_amdgcn_s_barrier(); } // hip.amdgcn.bc - lanemask __device__ inline -uint64_t __lanemask_gt() +__hip_uint64_t __lanemask_gt() { - uint32_t lane = __ockl_lane_u32(); + __hip_uint32_t lane = __ockl_lane_u32(); if (lane == 63) return 0; - uint64_t ballot = __ballot64(1); - uint64_t mask = (~((uint64_t)0)) << (lane + 1); + __hip_uint64_t ballot = __ballot64(1); + __hip_uint64_t mask = (~((__hip_uint64_t)0)) << (lane + 1); return mask & ballot; } __device__ inline -uint64_t __lanemask_lt() +__hip_uint64_t __lanemask_lt() { - uint32_t lane = __ockl_lane_u32(); - int64_t ballot = __ballot64(1); - uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1; + __hip_uint32_t lane = __ockl_lane_u32(); + __hip_int64_t ballot = __ballot64(1); + __hip_uint64_t mask = ((__hip_uint64_t)1 << lane) - (__hip_uint64_t)1; return mask & ballot; } __device__ inline -uint64_t __lanemask_eq() +__hip_uint64_t __lanemask_eq() { - uint32_t lane = __ockl_lane_u32(); - int64_t mask = ((uint64_t)1 << lane); + __hip_uint32_t lane = __ockl_lane_u32(); + __hip_int64_t mask = ((__hip_uint64_t)1 << lane); return mask; } diff --git a/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h b/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h index deb3bfb7e2..937486f149 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h +++ b/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h @@ -81,9 +81,9 @@ struct hip_bfloat16 { union { - uint32_t int32; + __hip_uint32_t int32; float fp32; - } u = {uint32_t(data) << 16}; + } u = {__hip_uint32_t(data) << 16}; return u.fp32; } @@ -113,7 +113,7 @@ private: union { float fp32; - uint32_t int32; + __hip_uint32_t int32; } u = {f}; if(~u.int32 & 0x7f800000) { @@ -156,7 +156,7 @@ private: union { float fp32; - uint32_t int32; + __hip_uint32_t int32; } u = {f}; return __hip_uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff)); } diff --git a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h index 1025e543d5..4a154f62e1 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -49,18 +49,19 @@ namespace cooperative_groups { */ class thread_group { protected: - uint32_t _type; //! Type of the thread_group. - uint32_t _size; //! Total number of threads in the tread_group. - uint64_t _mask; //! Lanemask for coalesced and tiled partitioned group types, - //! LSB represents lane 0, and MSB represents lane 63 + __hip_uint32_t _type; //! Type of the thread_group. + __hip_uint32_t _size; //! Total number of threads in the thread_group. + __hip_uint64_t _mask; //! Lanemask for coalesced and tiled partitioned group types, + //! LSB represents lane 0, and MSB represents lane 63 //! Construct a thread group, and set thread group type and other essential //! thread group properties. This generic thread group is directly constructed - //! only when the group is supposed to contain only the calling the thread + //! only when the group is supposed to contain only the calling thread //! (through the API - `this_thread()`), and in all other cases, this thread //! group object is a sub-object of some other derived thread group object. - __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast(0), - uint64_t mask = static_cast(0)) { + __CG_QUALIFIER__ thread_group(internal::group_type type, + __hip_uint32_t size = static_cast<__hip_uint64_t>(0), + __hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) { _type = type; _size = size; _mask = mask; @@ -88,11 +89,11 @@ class thread_group { //! Total number of threads in the thread_group, and this serves the purpose //! for all derived cooperative group types because their `size` is directly //! saved during the construction. - __CG_QUALIFIER__ uint32_t size() const { return _size; } + __CG_QUALIFIER__ __hip_uint32_t size() const { return _size; } //! Returns the type of the group. __CG_QUALIFIER__ unsigned int cg_type() const { return _type; } //! Rank of the calling thread within [0, \link size() size() \endlink). - __CG_QUALIFIER__ uint32_t thread_rank() const; + __CG_QUALIFIER__ __hip_uint32_t thread_rank() const; //! Returns true if the group has not violated any API constraints. __CG_QUALIFIER__ bool is_valid() const; @@ -138,21 +139,21 @@ class multi_grid_group : public thread_group { friend __CG_QUALIFIER__ multi_grid_group this_multi_grid(); protected: - //! Construct mutli-grid thread group (through the API this_multi_grid()) - explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size) + //! Construct multi-grid thread group (through the API this_multi_grid()) + explicit __CG_QUALIFIER__ multi_grid_group(__hip_uint32_t size) : thread_group(internal::cg_multi_grid, size) {} public: //! Number of invocations participating in this multi-grid group. In other //! words, the number of GPUs. - __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); } + __CG_QUALIFIER__ __hip_uint32_t num_grids() { return internal::multi_grid::num_grids(); } //! Rank of this invocation. In other words, an ID number within the range //! [0, num_grids()) of the GPU that kernel is running on. - __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); } + __CG_QUALIFIER__ __hip_uint32_t grid_rank() { return internal::multi_grid::grid_rank(); } //! @copydoc thread_group::thread_rank - __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); } + __CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); } //! @copydoc thread_group::is_valid __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); } //! @copydoc thread_group::sync @@ -193,11 +194,11 @@ class grid_group : public thread_group { protected: //! Construct grid thread group (through the API this_grid()) - explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {} + explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size) : thread_group(internal::cg_grid, size) {} public: //! @copydoc thread_group::thread_rank - __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); } + __CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::grid::thread_rank(); } //! @copydoc thread_group::is_valid __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); } //! @copydoc thread_group::sync @@ -236,7 +237,7 @@ class thread_block : public thread_group { unsigned int tile_size); protected: // Construct a workgroup thread group (through the API this_thread_block()) - explicit __CG_QUALIFIER__ thread_block(uint32_t size) + explicit __CG_QUALIFIER__ thread_block(__hip_uint32_t size) : thread_group(internal::cg_workgroup, size) {} __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const { @@ -266,9 +267,9 @@ class thread_block : public thread_group { //! Returns 3-dimensional thread index within the block. __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); } //! @copydoc thread_group::thread_rank - __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); } + __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { return internal::workgroup::thread_rank(); } //! @copydoc thread_group::size - __CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); } + __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return internal::workgroup::size(); } //! @copydoc thread_group::is_valid __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); } //! @copydoc thread_group::sync @@ -639,7 +640,7 @@ __CG_QUALIFIER__ coalesced_group coalesced_threads() { * \note This function is implemented on Linux and is under development * on Microsoft Windows. */ -__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const { +__CG_QUALIFIER__ __hip_uint32_t thread_group::thread_rank() const { switch (this->_type) { case internal::cg_multi_grid: { return (static_cast(this)->thread_rank()); @@ -744,7 +745,7 @@ __CG_QUALIFIER__ void thread_group::sync() const { * cooperative group type APIs. This function is implemented on Linux * and is under development on Microsoft Windows. */ -template __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); } +template __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.size(); } /** \brief Returns the rank of thread of the group. * @@ -757,7 +758,7 @@ template __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { retu * cooperative group type APIs. This function is implemented on Linux * and is under development on Microsoft Windows. */ -template __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) { +template __CG_QUALIFIER__ __hip_uint32_t thread_rank(CGTy const& g) { return g.thread_rank(); } diff --git a/hipamd/include/hip/amd_detail/amd_hip_runtime.h b/hipamd/include/hip/amd_detail/amd_hip_runtime.h index ed529f0b26..c3860153a9 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_runtime.h +++ b/hipamd/include/hip/amd_detail/amd_hip_runtime.h @@ -87,19 +87,6 @@ size_t amd_dbgapi_get_build_id(); #include #include #endif // __cplusplus -#else -#if !__HIP_NO_STD_DEFS__ -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; -typedef signed int int32_t; -typedef signed long long int64_t; -namespace std { -using ::uint32_t; -using ::uint64_t; -using ::int32_t; -using ::int64_t; -} -#endif // __HIP_NO_STD_DEFS__ #endif // !defined(__HIPCC_RTC__) #if __HIP_CLANG_ONLY__ @@ -249,11 +236,12 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks, #if defined(__HIPCC_RTC__) typedef struct dim3 { - uint32_t x; ///< x - uint32_t y; ///< y - uint32_t z; ///< z + __hip_uint32_t x; ///< x + __hip_uint32_t y; ///< y + __hip_uint32_t z; ///< z #ifdef __cplusplus - constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){}; + constexpr __device__ dim3(__hip_uint32_t _x = 1, __hip_uint32_t _y = 1, __hip_uint32_t _z = 1) + : x(_x), y(_y), z(_z){}; #endif } dim3; #endif // !defined(__HIPCC_RTC__) diff --git a/hipamd/include/hip/amd_detail/amd_warp_functions.h b/hipamd/include/hip/amd_detail/amd_warp_functions.h index d8d22ea0be..0de0939eb5 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_functions.h @@ -163,13 +163,13 @@ __device__ inline double __shfl(MAYBE_UNDEF double var, int src_lane, int width = warpSize) { static_assert(sizeof(double) == 2 * sizeof(int), ""); - static_assert(sizeof(double) == sizeof(uint64_t), ""); + static_assert(sizeof(double) == sizeof(__hip_uint64_t), ""); int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl(tmp[0], src_lane, width); tmp[1] = __shfl(tmp[1], src_lane, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } @@ -179,13 +179,13 @@ long __shfl(MAYBE_UNDEF long var, int src_lane, int width = warpSize) { #ifndef _MSC_VER static_assert(sizeof(long) == 2 * sizeof(int), ""); - static_assert(sizeof(long) == sizeof(uint64_t), ""); + static_assert(sizeof(long) == sizeof(__hip_uint64_t), ""); int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl(tmp[0], src_lane, width); tmp[1] = __shfl(tmp[1], src_lane, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; #else @@ -198,13 +198,13 @@ inline unsigned long __shfl(MAYBE_UNDEF unsigned long var, int src_lane, int width = warpSize) { #ifndef _MSC_VER static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); + static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), ""); unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl(tmp[0], src_lane, width); tmp[1] = __shfl(tmp[1], src_lane, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; #else @@ -217,13 +217,13 @@ inline long long __shfl(MAYBE_UNDEF long long var, int src_lane, int width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); - static_assert(sizeof(long long) == sizeof(uint64_t), ""); + static_assert(sizeof(long long) == sizeof(__hip_uint64_t), ""); int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl(tmp[0], src_lane, width); tmp[1] = __shfl(tmp[1], src_lane, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } @@ -231,13 +231,13 @@ __device__ inline unsigned long long __shfl(MAYBE_UNDEF unsigned long long var, int src_lane, int width = warpSize) { static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); + static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), ""); unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl(tmp[0], src_lane, width); tmp[1] = __shfl(tmp[1], src_lane, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } @@ -268,13 +268,13 @@ __device__ inline double __shfl_up(MAYBE_UNDEF double var, unsigned int lane_delta, int width = warpSize) { static_assert(sizeof(double) == 2 * sizeof(int), ""); - static_assert(sizeof(double) == sizeof(uint64_t), ""); + static_assert(sizeof(double) == sizeof(__hip_uint64_t), ""); int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_up(tmp[0], lane_delta, width); tmp[1] = __shfl_up(tmp[1], lane_delta, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } @@ -284,13 +284,13 @@ long __shfl_up(MAYBE_UNDEF long var, unsigned int lane_delta, int width = warpSi { #ifndef _MSC_VER static_assert(sizeof(long) == 2 * sizeof(int), ""); - static_assert(sizeof(long) == sizeof(uint64_t), ""); + static_assert(sizeof(long) == sizeof(__hip_uint64_t), ""); int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_up(tmp[0], lane_delta, width); tmp[1] = __shfl_up(tmp[1], lane_delta, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; #else @@ -305,13 +305,13 @@ unsigned long __shfl_up(MAYBE_UNDEF unsigned long var, unsigned int lane_delta, { #ifndef _MSC_VER static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); + static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), ""); unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_up(tmp[0], lane_delta, width); tmp[1] = __shfl_up(tmp[1], lane_delta, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; #else @@ -325,11 +325,11 @@ inline long long __shfl_up(MAYBE_UNDEF long long var, unsigned int lane_delta, int width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); - static_assert(sizeof(long long) == sizeof(uint64_t), ""); + static_assert(sizeof(long long) == sizeof(__hip_uint64_t), ""); int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_up(tmp[0], lane_delta, width); tmp[1] = __shfl_up(tmp[1], lane_delta, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } @@ -339,11 +339,11 @@ inline unsigned long long __shfl_up(MAYBE_UNDEF unsigned long long var, unsigned int lane_delta, int width = warpSize) { static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); + static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), ""); unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_up(tmp[0], lane_delta, width); tmp[1] = __shfl_up(tmp[1], lane_delta, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } @@ -374,13 +374,13 @@ __device__ inline double __shfl_down(MAYBE_UNDEF double var, unsigned int lane_delta, int width = warpSize) { static_assert(sizeof(double) == 2 * sizeof(int), ""); - static_assert(sizeof(double) == sizeof(uint64_t), ""); + static_assert(sizeof(double) == sizeof(__hip_uint64_t), ""); int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_down(tmp[0], lane_delta, width); tmp[1] = __shfl_down(tmp[1], lane_delta, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } @@ -390,13 +390,13 @@ long __shfl_down(MAYBE_UNDEF long var, unsigned int lane_delta, int width = warp { #ifndef _MSC_VER static_assert(sizeof(long) == 2 * sizeof(int), ""); - static_assert(sizeof(long) == sizeof(uint64_t), ""); + static_assert(sizeof(long) == sizeof(__hip_uint64_t), ""); int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_down(tmp[0], lane_delta, width); tmp[1] = __shfl_down(tmp[1], lane_delta, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; #else @@ -410,13 +410,13 @@ unsigned long __shfl_down(MAYBE_UNDEF unsigned long var, unsigned int lane_delta { #ifndef _MSC_VER static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); + static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), ""); unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_down(tmp[0], lane_delta, width); tmp[1] = __shfl_down(tmp[1], lane_delta, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; #else @@ -429,11 +429,11 @@ inline long long __shfl_down(MAYBE_UNDEF long long var, unsigned int lane_delta, int width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); - static_assert(sizeof(long long) == sizeof(uint64_t), ""); + static_assert(sizeof(long long) == sizeof(__hip_uint64_t), ""); int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_down(tmp[0], lane_delta, width); tmp[1] = __shfl_down(tmp[1], lane_delta, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } @@ -442,11 +442,11 @@ inline unsigned long long __shfl_down(MAYBE_UNDEF unsigned long long var, unsigned int lane_delta, int width = warpSize) { static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); + static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), ""); unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_down(tmp[0], lane_delta, width); tmp[1] = __shfl_down(tmp[1], lane_delta, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } @@ -477,13 +477,13 @@ __device__ inline double __shfl_xor(MAYBE_UNDEF double var, int lane_mask, int width = warpSize) { static_assert(sizeof(double) == 2 * sizeof(int), ""); - static_assert(sizeof(double) == sizeof(uint64_t), ""); + static_assert(sizeof(double) == sizeof(__hip_uint64_t), ""); int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_xor(tmp[0], lane_mask, width); tmp[1] = __shfl_xor(tmp[1], lane_mask, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } @@ -493,13 +493,13 @@ long __shfl_xor(MAYBE_UNDEF long var, int lane_mask, int width = warpSize) { #ifndef _MSC_VER static_assert(sizeof(long) == 2 * sizeof(int), ""); - static_assert(sizeof(long) == sizeof(uint64_t), ""); + static_assert(sizeof(long) == sizeof(__hip_uint64_t), ""); int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_xor(tmp[0], lane_mask, width); tmp[1] = __shfl_xor(tmp[1], lane_mask, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; #else @@ -513,13 +513,13 @@ unsigned long __shfl_xor(MAYBE_UNDEF unsigned long var, int lane_mask, int width { #ifndef _MSC_VER static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); + static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), ""); unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_xor(tmp[0], lane_mask, width); tmp[1] = __shfl_xor(tmp[1], lane_mask, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; #else @@ -532,11 +532,11 @@ inline long long __shfl_xor(MAYBE_UNDEF long long var, int lane_mask, int width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); - static_assert(sizeof(long long) == sizeof(uint64_t), ""); + static_assert(sizeof(long long) == sizeof(__hip_uint64_t), ""); int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_xor(tmp[0], lane_mask, width); tmp[1] = __shfl_xor(tmp[1], lane_mask, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } @@ -545,11 +545,11 @@ inline unsigned long long __shfl_xor(MAYBE_UNDEF unsigned long long var, int lane_mask, int width = warpSize) { static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); - static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); + static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), ""); unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); tmp[0] = __shfl_xor(tmp[0], lane_mask, width); tmp[1] = __shfl_xor(tmp[1], lane_mask, width); - uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]); unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } diff --git a/hipamd/include/hip/amd_detail/device_library_decls.h b/hipamd/include/hip/amd_detail/device_library_decls.h index 48abee3653..0395e6c779 100644 --- a/hipamd/include/hip/amd_detail/device_library_decls.h +++ b/hipamd/include/hip/amd_detail/device_library_decls.h @@ -23,7 +23,7 @@ THE SOFTWARE. /** * @file amd_detail/device_library_decls.h * @brief Contains declarations for types and functions in device library. - * Uses int64_t and uint64_t instead of long, long long, unsigned + * Uses __hip_int64_t and __hip_uint64_t instead of long, long long, unsigned * long and unsigned long long types for device library API * declarations. */ @@ -59,7 +59,7 @@ extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, ui extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar); extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort); extern "C" __device__ __attribute__((const)) uint __ockl_clz_u32(uint); -extern "C" __device__ __attribute__((const)) uint64_t __ockl_clz_u64(uint64_t); +extern "C" __device__ __attribute__((const)) __hip_uint64_t __ockl_clz_u64(__hip_uint64_t); extern "C" __device__ __attribute__((const)) float __ocml_fmin_f32(float, float); extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float); @@ -75,26 +75,26 @@ extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(floa extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s32(int); extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s32(int); extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s32(int); -extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(uint32_t); -extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(uint32_t); -extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(uint32_t); -extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(int64_t); -extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(int64_t); -extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(int64_t); -extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(uint64_t); -extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(uint64_t); -extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(uint64_t); -extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(int64_t); -extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(int64_t); -extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(int64_t); -extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(uint64_t); -extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(uint64_t); -extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(uint64_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(__hip_uint32_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(__hip_uint32_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(__hip_uint32_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(__hip_int64_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(__hip_int64_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(__hip_int64_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(__hip_uint64_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(__hip_uint64_t); +extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(__hip_uint64_t); +extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(__hip_int64_t); +extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(__hip_int64_t); +extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(__hip_int64_t); +extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(__hip_uint64_t); +extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(__hip_uint64_t); +extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(__hip_uint64_t); extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid); extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid); -extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32(); +extern "C" __device__ __attribute__((const)) __hip_uint32_t __ockl_lane_u32(); extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void); extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void); extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void); @@ -110,14 +110,14 @@ extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a); extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a); -extern "C" __device__ uint64_t __ockl_fprintf_stderr_begin(); -extern "C" __device__ uint64_t __ockl_fprintf_append_args(uint64_t msg_desc, uint32_t num_args, - uint64_t value0, uint64_t value1, - uint64_t value2, uint64_t value3, - uint64_t value4, uint64_t value5, - uint64_t value6, uint32_t is_last); -extern "C" __device__ uint64_t __ockl_fprintf_append_string_n(uint64_t msg_desc, const char* data, - uint64_t length, uint32_t is_last); +extern "C" __device__ __hip_uint64_t __ockl_fprintf_stderr_begin(); +extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_args(__hip_uint64_t msg_desc, __hip_uint32_t num_args, + __hip_uint64_t value0, __hip_uint64_t value1, + __hip_uint64_t value2, __hip_uint64_t value3, + __hip_uint64_t value4, __hip_uint64_t value5, + __hip_uint64_t value6, __hip_uint32_t is_last); +extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_string_n(__hip_uint64_t msg_desc, const char* data, + __hip_uint64_t length, __hip_uint32_t is_last); // Introduce local address space #define __local __attribute__((address_space(3))) diff --git a/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h b/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h index 0afae29da9..f5d22b782d 100644 --- a/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h +++ b/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h @@ -141,16 +141,16 @@ __CG_STATIC_QUALIFIER__ unsigned long long adjust_mask( */ namespace multi_grid { -__CG_STATIC_QUALIFIER__ uint32_t num_grids() { - return static_cast(__ockl_multi_grid_num_grids()); } +__CG_STATIC_QUALIFIER__ __hip_uint32_t num_grids() { + return static_cast<__hip_uint32_t>(__ockl_multi_grid_num_grids()); } -__CG_STATIC_QUALIFIER__ uint32_t grid_rank() { - return static_cast(__ockl_multi_grid_grid_rank()); } +__CG_STATIC_QUALIFIER__ __hip_uint32_t grid_rank() { + return static_cast<__hip_uint32_t>(__ockl_multi_grid_grid_rank()); } -__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast(__ockl_multi_grid_size()); } +__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); } -__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { - return static_cast(__ockl_multi_grid_thread_rank()); } +__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { + return static_cast<__hip_uint32_t>(__ockl_multi_grid_thread_rank()); } __CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast(__ockl_multi_grid_is_valid()); } @@ -164,23 +164,23 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); } */ namespace grid { -__CG_STATIC_QUALIFIER__ uint32_t size() { - return static_cast((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) * +__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { + return static_cast<__hip_uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) * (blockDim.x * gridDim.x)); } -__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { +__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { // Compute global id of the workgroup to which the current thread belongs to - uint32_t blkIdx = static_cast((blockIdx.z * gridDim.y * gridDim.x) + + __hip_uint32_t blkIdx = static_cast<__hip_uint32_t>((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x) + (blockIdx.x)); // Compute total number of threads being passed to reach current workgroup // within grid - uint32_t num_threads_till_current_workgroup = - static_cast(blkIdx * (blockDim.x * blockDim.y * blockDim.z)); + __hip_uint32_t num_threads_till_current_workgroup = + static_cast<__hip_uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z)); // Compute thread local rank within current workgroup - uint32_t local_thread_rank = static_cast((threadIdx.z * blockDim.y * blockDim.x) + + __hip_uint32_t local_thread_rank = static_cast<__hip_uint32_t>((threadIdx.z * blockDim.y * blockDim.x) + (threadIdx.y * blockDim.x) + (threadIdx.x)); return (num_threads_till_current_workgroup + local_thread_rank); @@ -200,21 +200,21 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); } namespace workgroup { __CG_STATIC_QUALIFIER__ dim3 group_index() { - return (dim3(static_cast(blockIdx.x), static_cast(blockIdx.y), - static_cast(blockIdx.z))); + return (dim3(static_cast<__hip_uint32_t>(blockIdx.x), static_cast<__hip_uint32_t>(blockIdx.y), + static_cast<__hip_uint32_t>(blockIdx.z))); } __CG_STATIC_QUALIFIER__ dim3 thread_index() { - return (dim3(static_cast(threadIdx.x), static_cast(threadIdx.y), - static_cast(threadIdx.z))); + return (dim3(static_cast<__hip_uint32_t>(threadIdx.x), static_cast<__hip_uint32_t>(threadIdx.y), + static_cast<__hip_uint32_t>(threadIdx.z))); } -__CG_STATIC_QUALIFIER__ uint32_t size() { - return (static_cast(blockDim.x * blockDim.y * blockDim.z)); +__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { + return (static_cast<__hip_uint32_t>(blockDim.x * blockDim.y * blockDim.z)); } -__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { - return (static_cast((threadIdx.z * blockDim.y * blockDim.x) + +__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { + return (static_cast<__hip_uint32_t>((threadIdx.z * blockDim.y * blockDim.x) + (threadIdx.y * blockDim.x) + (threadIdx.x))); } @@ -225,8 +225,8 @@ __CG_STATIC_QUALIFIER__ bool is_valid() { __CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); } __CG_STATIC_QUALIFIER__ dim3 block_dim() { - return (dim3(static_cast(blockDim.x), static_cast(blockDim.y), - static_cast(blockDim.z))); + return (dim3(static_cast<__hip_uint32_t>(blockDim.x), static_cast<__hip_uint32_t>(blockDim.y), + static_cast<__hip_uint32_t>(blockDim.z))); } } // namespace workgroup