SWDEV-513341 - Remove datatypes from HIPRTC (#377)
Defining int64_t, uint64_t, int32_t, uint32_t in HIPRTC seem to result in conflicts with some apps as they use their own definitions for these types. NVRTC also doesn't define these. Hence remove them and replace their usage with hip internal datatypes such as __hip_int64_t Change-Id: Id314730bbd61c05f126f915c546052f956550a3f
このコミットが含まれているのは:
@@ -60,7 +60,7 @@ __device__ static inline int __clz(int input) {
|
||||
}
|
||||
|
||||
__device__ static inline int __clzll(long long int input) {
|
||||
return __ockl_clz_u64((uint64_t)input);
|
||||
return __ockl_clz_u64((__hip_uint64_t)input);
|
||||
}
|
||||
|
||||
__device__ static inline int __ffs(unsigned int input) {
|
||||
@@ -82,9 +82,9 @@ __device__ static inline int __ffsll(long long int input) {
|
||||
// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
|
||||
// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
|
||||
// If not found, return -1.
|
||||
__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
|
||||
uint64_t temp_mask = mask;
|
||||
int32_t temp_offset = offset;
|
||||
__device__ static __hip_int32_t __fns64(__hip_uint64_t mask, __hip_uint32_t base, __hip_int32_t offset) {
|
||||
__hip_uint64_t temp_mask = mask;
|
||||
__hip_int32_t temp_offset = offset;
|
||||
|
||||
if (offset == 0) {
|
||||
temp_mask &= (1 << base);
|
||||
@@ -99,10 +99,10 @@ __device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset)
|
||||
temp_mask = temp_mask & ((~0ULL) << base);
|
||||
if (__builtin_popcountll(temp_mask) < temp_offset)
|
||||
return -1;
|
||||
int32_t total = 0;
|
||||
__hip_int32_t total = 0;
|
||||
for (int i = 0x20; i > 0; i >>= 1) {
|
||||
uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
|
||||
int32_t pcnt = __builtin_popcountll(temp_mask_lo);
|
||||
__hip_uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
|
||||
__hip_int32_t pcnt = __builtin_popcountll(temp_mask_lo);
|
||||
if (pcnt < temp_offset) {
|
||||
temp_mask = temp_mask >> i;
|
||||
temp_offset -= pcnt;
|
||||
@@ -118,9 +118,9 @@ __device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset)
|
||||
return total;
|
||||
}
|
||||
|
||||
__device__ static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset) {
|
||||
uint32_t temp_mask = mask;
|
||||
int32_t temp_offset = offset;
|
||||
__device__ static __hip_int32_t __fns32(__hip_uint64_t mask, __hip_uint32_t base, __hip_int32_t offset) {
|
||||
__hip_uint32_t temp_mask = mask;
|
||||
__hip_int32_t temp_offset = offset;
|
||||
if (offset == 0) {
|
||||
temp_mask &= (1 << base);
|
||||
temp_offset = 1;
|
||||
@@ -133,10 +133,10 @@ __device__ static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset)
|
||||
temp_mask = temp_mask & ((~0U) << base);
|
||||
if (__builtin_popcount(temp_mask) < temp_offset)
|
||||
return -1;
|
||||
int32_t total = 0;
|
||||
__hip_int32_t total = 0;
|
||||
for (int i = 0x10; i > 0; i >>= 1) {
|
||||
uint32_t temp_mask_lo = temp_mask & ((1U << i) - 1);
|
||||
int32_t pcnt = __builtin_popcount(temp_mask_lo);
|
||||
__hip_uint32_t temp_mask_lo = temp_mask & ((1U << i) - 1);
|
||||
__hip_int32_t pcnt = __builtin_popcount(temp_mask_lo);
|
||||
if (pcnt < temp_offset) {
|
||||
temp_mask = temp_mask >> i;
|
||||
temp_offset -= pcnt;
|
||||
@@ -153,7 +153,7 @@ __device__ static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset)
|
||||
}
|
||||
|
||||
// Wrapper around __fns32() to make porting from CUDA easier
|
||||
__device__ static int32_t __fns(unsigned int mask, unsigned int base, int offset) {
|
||||
__device__ static __hip_int32_t __fns(unsigned int mask, unsigned int base, int offset) {
|
||||
return __fns32(mask, base, offset);
|
||||
}
|
||||
|
||||
@@ -165,45 +165,45 @@ __device__ static inline unsigned long long int __brevll(unsigned long long int
|
||||
return __builtin_bitreverse64(input);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
|
||||
__device__ static inline unsigned int __lastbit_u32_u64(__hip_uint64_t input) {
|
||||
return input == 0 ? -1 : __builtin_ctzl(input);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
|
||||
uint32_t offset = src1 & 31;
|
||||
uint32_t width = src2 & 31;
|
||||
__hip_uint32_t offset = src1 & 31;
|
||||
__hip_uint32_t width = src2 & 31;
|
||||
return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
|
||||
}
|
||||
|
||||
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
|
||||
uint64_t offset = src1 & 63;
|
||||
uint64_t width = src2 & 63;
|
||||
__device__ static inline __hip_uint64_t __bitextract_u64(__hip_uint64_t src0, unsigned int src1, unsigned int src2) {
|
||||
__hip_uint64_t offset = src1 & 63;
|
||||
__hip_uint64_t width = src2 & 63;
|
||||
return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
|
||||
uint32_t offset = src2 & 31;
|
||||
uint32_t width = src3 & 31;
|
||||
uint32_t mask = (1 << width) - 1;
|
||||
__hip_uint32_t offset = src2 & 31;
|
||||
__hip_uint32_t width = src3 & 31;
|
||||
__hip_uint32_t mask = (1 << width) - 1;
|
||||
return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
|
||||
}
|
||||
|
||||
__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
|
||||
uint64_t offset = src2 & 63;
|
||||
uint64_t width = src3 & 63;
|
||||
uint64_t mask = (1ULL << width) - 1;
|
||||
__device__ static inline __hip_uint64_t __bitinsert_u64(__hip_uint64_t src0, __hip_uint64_t src1, unsigned int src2, unsigned int src3) {
|
||||
__hip_uint64_t offset = src2 & 63;
|
||||
__hip_uint64_t width = src3 & 63;
|
||||
__hip_uint64_t mask = (1ULL << width) - 1;
|
||||
return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
|
||||
}
|
||||
|
||||
__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
|
||||
{
|
||||
uint32_t mask_shift = shift & 31;
|
||||
__hip_uint32_t mask_shift = shift & 31;
|
||||
return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
|
||||
}
|
||||
|
||||
__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
|
||||
{
|
||||
uint32_t min_shift = shift >= 32 ? 32 : shift;
|
||||
__hip_uint32_t min_shift = shift >= 32 ? 32 : shift;
|
||||
return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
|
||||
}
|
||||
|
||||
@@ -531,9 +531,9 @@ __device__ static inline unsigned int __float_as_uint(float x) {
|
||||
}
|
||||
|
||||
__device__ static inline double __hiloint2double(int hi, int lo) {
|
||||
static_assert(sizeof(double) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
|
||||
double tmp1;
|
||||
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
|
||||
@@ -700,32 +700,32 @@ void __named_sync() { __builtin_amdgcn_s_barrier(); }
|
||||
// hip.amdgcn.bc - lanemask
|
||||
__device__
|
||||
inline
|
||||
uint64_t __lanemask_gt()
|
||||
__hip_uint64_t __lanemask_gt()
|
||||
{
|
||||
uint32_t lane = __ockl_lane_u32();
|
||||
__hip_uint32_t lane = __ockl_lane_u32();
|
||||
if (lane == 63)
|
||||
return 0;
|
||||
uint64_t ballot = __ballot64(1);
|
||||
uint64_t mask = (~((uint64_t)0)) << (lane + 1);
|
||||
__hip_uint64_t ballot = __ballot64(1);
|
||||
__hip_uint64_t mask = (~((__hip_uint64_t)0)) << (lane + 1);
|
||||
return mask & ballot;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
uint64_t __lanemask_lt()
|
||||
__hip_uint64_t __lanemask_lt()
|
||||
{
|
||||
uint32_t lane = __ockl_lane_u32();
|
||||
int64_t ballot = __ballot64(1);
|
||||
uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
|
||||
__hip_uint32_t lane = __ockl_lane_u32();
|
||||
__hip_int64_t ballot = __ballot64(1);
|
||||
__hip_uint64_t mask = ((__hip_uint64_t)1 << lane) - (__hip_uint64_t)1;
|
||||
return mask & ballot;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
uint64_t __lanemask_eq()
|
||||
__hip_uint64_t __lanemask_eq()
|
||||
{
|
||||
uint32_t lane = __ockl_lane_u32();
|
||||
int64_t mask = ((uint64_t)1 << lane);
|
||||
__hip_uint32_t lane = __ockl_lane_u32();
|
||||
__hip_int64_t mask = ((__hip_uint64_t)1 << lane);
|
||||
return mask;
|
||||
}
|
||||
|
||||
|
||||
@@ -81,9 +81,9 @@ struct hip_bfloat16
|
||||
{
|
||||
union
|
||||
{
|
||||
uint32_t int32;
|
||||
__hip_uint32_t int32;
|
||||
float fp32;
|
||||
} u = {uint32_t(data) << 16};
|
||||
} u = {__hip_uint32_t(data) << 16};
|
||||
return u.fp32;
|
||||
}
|
||||
|
||||
@@ -113,7 +113,7 @@ private:
|
||||
union
|
||||
{
|
||||
float fp32;
|
||||
uint32_t int32;
|
||||
__hip_uint32_t int32;
|
||||
} u = {f};
|
||||
if(~u.int32 & 0x7f800000)
|
||||
{
|
||||
@@ -156,7 +156,7 @@ private:
|
||||
union
|
||||
{
|
||||
float fp32;
|
||||
uint32_t int32;
|
||||
__hip_uint32_t int32;
|
||||
} u = {f};
|
||||
return __hip_uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
|
||||
}
|
||||
|
||||
@@ -49,18 +49,19 @@ namespace cooperative_groups {
|
||||
*/
|
||||
class thread_group {
|
||||
protected:
|
||||
uint32_t _type; //! Type of the thread_group.
|
||||
uint32_t _size; //! Total number of threads in the tread_group.
|
||||
uint64_t _mask; //! Lanemask for coalesced and tiled partitioned group types,
|
||||
//! LSB represents lane 0, and MSB represents lane 63
|
||||
__hip_uint32_t _type; //! Type of the thread_group.
|
||||
__hip_uint32_t _size; //! Total number of threads in the thread_group.
|
||||
__hip_uint64_t _mask; //! Lanemask for coalesced and tiled partitioned group types,
|
||||
//! LSB represents lane 0, and MSB represents lane 63
|
||||
|
||||
//! Construct a thread group, and set thread group type and other essential
|
||||
//! thread group properties. This generic thread group is directly constructed
|
||||
//! only when the group is supposed to contain only the calling the thread
|
||||
//! only when the group is supposed to contain only the calling thread
|
||||
//! (through the API - `this_thread()`), and in all other cases, this thread
|
||||
//! group object is a sub-object of some other derived thread group object.
|
||||
__CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint64_t>(0),
|
||||
uint64_t mask = static_cast<uint64_t>(0)) {
|
||||
__CG_QUALIFIER__ thread_group(internal::group_type type,
|
||||
__hip_uint32_t size = static_cast<__hip_uint64_t>(0),
|
||||
__hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) {
|
||||
_type = type;
|
||||
_size = size;
|
||||
_mask = mask;
|
||||
@@ -88,11 +89,11 @@ class thread_group {
|
||||
//! Total number of threads in the thread_group, and this serves the purpose
|
||||
//! for all derived cooperative group types because their `size` is directly
|
||||
//! saved during the construction.
|
||||
__CG_QUALIFIER__ uint32_t size() const { return _size; }
|
||||
__CG_QUALIFIER__ __hip_uint32_t size() const { return _size; }
|
||||
//! Returns the type of the group.
|
||||
__CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
|
||||
//! Rank of the calling thread within [0, \link size() size() \endlink).
|
||||
__CG_QUALIFIER__ uint32_t thread_rank() const;
|
||||
__CG_QUALIFIER__ __hip_uint32_t thread_rank() const;
|
||||
//! Returns true if the group has not violated any API constraints.
|
||||
__CG_QUALIFIER__ bool is_valid() const;
|
||||
|
||||
@@ -138,21 +139,21 @@ class multi_grid_group : public thread_group {
|
||||
friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
|
||||
|
||||
protected:
|
||||
//! Construct mutli-grid thread group (through the API this_multi_grid())
|
||||
explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
|
||||
//! Construct multi-grid thread group (through the API this_multi_grid())
|
||||
explicit __CG_QUALIFIER__ multi_grid_group(__hip_uint32_t size)
|
||||
: thread_group(internal::cg_multi_grid, size) {}
|
||||
|
||||
public:
|
||||
|
||||
//! Number of invocations participating in this multi-grid group. In other
|
||||
//! words, the number of GPUs.
|
||||
__CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
|
||||
__CG_QUALIFIER__ __hip_uint32_t num_grids() { return internal::multi_grid::num_grids(); }
|
||||
|
||||
//! Rank of this invocation. In other words, an ID number within the range
|
||||
//! [0, num_grids()) of the GPU that kernel is running on.
|
||||
__CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
|
||||
__CG_QUALIFIER__ __hip_uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
|
||||
//! @copydoc thread_group::thread_rank
|
||||
__CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
|
||||
__CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
|
||||
//! @copydoc thread_group::is_valid
|
||||
__CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
|
||||
//! @copydoc thread_group::sync
|
||||
@@ -193,11 +194,11 @@ class grid_group : public thread_group {
|
||||
|
||||
protected:
|
||||
//! Construct grid thread group (through the API this_grid())
|
||||
explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}
|
||||
explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size) : thread_group(internal::cg_grid, size) {}
|
||||
|
||||
public:
|
||||
//! @copydoc thread_group::thread_rank
|
||||
__CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
|
||||
__CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::grid::thread_rank(); }
|
||||
//! @copydoc thread_group::is_valid
|
||||
__CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
|
||||
//! @copydoc thread_group::sync
|
||||
@@ -236,7 +237,7 @@ class thread_block : public thread_group {
|
||||
unsigned int tile_size);
|
||||
protected:
|
||||
// Construct a workgroup thread group (through the API this_thread_block())
|
||||
explicit __CG_QUALIFIER__ thread_block(uint32_t size)
|
||||
explicit __CG_QUALIFIER__ thread_block(__hip_uint32_t size)
|
||||
: thread_group(internal::cg_workgroup, size) {}
|
||||
|
||||
__CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
|
||||
@@ -266,9 +267,9 @@ class thread_block : public thread_group {
|
||||
//! Returns 3-dimensional thread index within the block.
|
||||
__CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
|
||||
//! @copydoc thread_group::thread_rank
|
||||
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
|
||||
//! @copydoc thread_group::size
|
||||
__CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); }
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return internal::workgroup::size(); }
|
||||
//! @copydoc thread_group::is_valid
|
||||
__CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
|
||||
//! @copydoc thread_group::sync
|
||||
@@ -639,7 +640,7 @@ __CG_QUALIFIER__ coalesced_group coalesced_threads() {
|
||||
* \note This function is implemented on Linux and is under development
|
||||
* on Microsoft Windows.
|
||||
*/
|
||||
__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
|
||||
__CG_QUALIFIER__ __hip_uint32_t thread_group::thread_rank() const {
|
||||
switch (this->_type) {
|
||||
case internal::cg_multi_grid: {
|
||||
return (static_cast<const multi_grid_group*>(this)->thread_rank());
|
||||
@@ -744,7 +745,7 @@ __CG_QUALIFIER__ void thread_group::sync() const {
|
||||
* cooperative group type APIs. This function is implemented on Linux
|
||||
* and is under development on Microsoft Windows.
|
||||
*/
|
||||
template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }
|
||||
template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.size(); }
|
||||
|
||||
/** \brief Returns the rank of thread of the group.
|
||||
*
|
||||
@@ -757,7 +758,7 @@ template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { retu
|
||||
* cooperative group type APIs. This function is implemented on Linux
|
||||
* and is under development on Microsoft Windows.
|
||||
*/
|
||||
template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
|
||||
template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t thread_rank(CGTy const& g) {
|
||||
return g.thread_rank();
|
||||
}
|
||||
|
||||
|
||||
@@ -87,19 +87,6 @@ size_t amd_dbgapi_get_build_id();
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
#endif // __cplusplus
|
||||
#else
|
||||
#if !__HIP_NO_STD_DEFS__
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
typedef signed int int32_t;
|
||||
typedef signed long long int64_t;
|
||||
namespace std {
|
||||
using ::uint32_t;
|
||||
using ::uint64_t;
|
||||
using ::int32_t;
|
||||
using ::int64_t;
|
||||
}
|
||||
#endif // __HIP_NO_STD_DEFS__
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
#if __HIP_CLANG_ONLY__
|
||||
@@ -249,11 +236,12 @@ void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
|
||||
|
||||
#if defined(__HIPCC_RTC__)
|
||||
typedef struct dim3 {
|
||||
uint32_t x; ///< x
|
||||
uint32_t y; ///< y
|
||||
uint32_t z; ///< z
|
||||
__hip_uint32_t x; ///< x
|
||||
__hip_uint32_t y; ///< y
|
||||
__hip_uint32_t z; ///< z
|
||||
#ifdef __cplusplus
|
||||
constexpr __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
|
||||
constexpr __device__ dim3(__hip_uint32_t _x = 1, __hip_uint32_t _y = 1, __hip_uint32_t _z = 1)
|
||||
: x(_x), y(_y), z(_z){};
|
||||
#endif
|
||||
} dim3;
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
@@ -163,13 +163,13 @@ __device__
|
||||
inline
|
||||
double __shfl(MAYBE_UNDEF double var, int src_lane, int width = warpSize) {
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(double) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl(tmp[0], src_lane, width);
|
||||
tmp[1] = __shfl(tmp[1], src_lane, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
@@ -179,13 +179,13 @@ long __shfl(MAYBE_UNDEF long var, int src_lane, int width = warpSize)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl(tmp[0], src_lane, width);
|
||||
tmp[1] = __shfl(tmp[1], src_lane, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
@@ -198,13 +198,13 @@ inline
|
||||
unsigned long __shfl(MAYBE_UNDEF unsigned long var, int src_lane, int width = warpSize) {
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl(tmp[0], src_lane, width);
|
||||
tmp[1] = __shfl(tmp[1], src_lane, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
@@ -217,13 +217,13 @@ inline
|
||||
long long __shfl(MAYBE_UNDEF long long var, int src_lane, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl(tmp[0], src_lane, width);
|
||||
tmp[1] = __shfl(tmp[1], src_lane, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
@@ -231,13 +231,13 @@ __device__
|
||||
inline
|
||||
unsigned long long __shfl(MAYBE_UNDEF unsigned long long var, int src_lane, int width = warpSize) {
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl(tmp[0], src_lane, width);
|
||||
tmp[1] = __shfl(tmp[1], src_lane, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
@@ -268,13 +268,13 @@ __device__
|
||||
inline
|
||||
double __shfl_up(MAYBE_UNDEF double var, unsigned int lane_delta, int width = warpSize) {
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(double) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
@@ -284,13 +284,13 @@ long __shfl_up(MAYBE_UNDEF long var, unsigned int lane_delta, int width = warpSi
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
@@ -305,13 +305,13 @@ unsigned long __shfl_up(MAYBE_UNDEF unsigned long var, unsigned int lane_delta,
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
@@ -325,11 +325,11 @@ inline
|
||||
long long __shfl_up(MAYBE_UNDEF long long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
@@ -339,11 +339,11 @@ inline
|
||||
unsigned long long __shfl_up(MAYBE_UNDEF unsigned long long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
@@ -374,13 +374,13 @@ __device__
|
||||
inline
|
||||
double __shfl_down(MAYBE_UNDEF double var, unsigned int lane_delta, int width = warpSize) {
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(double) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
@@ -390,13 +390,13 @@ long __shfl_down(MAYBE_UNDEF long var, unsigned int lane_delta, int width = warp
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
@@ -410,13 +410,13 @@ unsigned long __shfl_down(MAYBE_UNDEF unsigned long var, unsigned int lane_delta
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
@@ -429,11 +429,11 @@ inline
|
||||
long long __shfl_down(MAYBE_UNDEF long long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
@@ -442,11 +442,11 @@ inline
|
||||
unsigned long long __shfl_down(MAYBE_UNDEF unsigned long long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
@@ -477,13 +477,13 @@ __device__
|
||||
inline
|
||||
double __shfl_xor(MAYBE_UNDEF double var, int lane_mask, int width = warpSize) {
|
||||
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(double) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
@@ -493,13 +493,13 @@ long __shfl_xor(MAYBE_UNDEF long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
@@ -513,13 +513,13 @@ unsigned long __shfl_xor(MAYBE_UNDEF unsigned long var, int lane_mask, int width
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
@@ -532,11 +532,11 @@ inline
|
||||
long long __shfl_xor(MAYBE_UNDEF long long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
static_assert(sizeof(long long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
|
||||
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
@@ -545,11 +545,11 @@ inline
|
||||
unsigned long long __shfl_xor(MAYBE_UNDEF unsigned long long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
__hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ THE SOFTWARE.
|
||||
/**
|
||||
* @file amd_detail/device_library_decls.h
|
||||
* @brief Contains declarations for types and functions in device library.
|
||||
* Uses int64_t and uint64_t instead of long, long long, unsigned
|
||||
* Uses __hip_int64_t and __hip_uint64_t instead of long, long long, unsigned
|
||||
* long and unsigned long long types for device library API
|
||||
* declarations.
|
||||
*/
|
||||
@@ -59,7 +59,7 @@ extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, ui
|
||||
extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar);
|
||||
extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_clz_u32(uint);
|
||||
extern "C" __device__ __attribute__((const)) uint64_t __ockl_clz_u64(uint64_t);
|
||||
extern "C" __device__ __attribute__((const)) __hip_uint64_t __ockl_clz_u64(__hip_uint64_t);
|
||||
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
|
||||
@@ -75,26 +75,26 @@ extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(floa
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s32(int);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s32(int);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s32(int);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(uint32_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(uint32_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(uint32_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(int64_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(int64_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(int64_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(uint64_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(uint64_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(uint64_t);
|
||||
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(int64_t);
|
||||
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(int64_t);
|
||||
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(int64_t);
|
||||
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(uint64_t);
|
||||
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(uint64_t);
|
||||
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(uint64_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(__hip_uint32_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(__hip_uint32_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(__hip_uint32_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(__hip_int64_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(__hip_int64_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(__hip_int64_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(__hip_uint64_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(__hip_uint64_t);
|
||||
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(__hip_uint64_t);
|
||||
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(__hip_int64_t);
|
||||
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(__hip_int64_t);
|
||||
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(__hip_int64_t);
|
||||
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(__hip_uint64_t);
|
||||
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(__hip_uint64_t);
|
||||
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(__hip_uint64_t);
|
||||
|
||||
extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid);
|
||||
extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid);
|
||||
|
||||
extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32();
|
||||
extern "C" __device__ __attribute__((const)) __hip_uint32_t __ockl_lane_u32();
|
||||
extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void);
|
||||
extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void);
|
||||
@@ -110,14 +110,14 @@ extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a
|
||||
extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a);
|
||||
extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);
|
||||
|
||||
extern "C" __device__ uint64_t __ockl_fprintf_stderr_begin();
|
||||
extern "C" __device__ uint64_t __ockl_fprintf_append_args(uint64_t msg_desc, uint32_t num_args,
|
||||
uint64_t value0, uint64_t value1,
|
||||
uint64_t value2, uint64_t value3,
|
||||
uint64_t value4, uint64_t value5,
|
||||
uint64_t value6, uint32_t is_last);
|
||||
extern "C" __device__ uint64_t __ockl_fprintf_append_string_n(uint64_t msg_desc, const char* data,
|
||||
uint64_t length, uint32_t is_last);
|
||||
extern "C" __device__ __hip_uint64_t __ockl_fprintf_stderr_begin();
|
||||
extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_args(__hip_uint64_t msg_desc, __hip_uint32_t num_args,
|
||||
__hip_uint64_t value0, __hip_uint64_t value1,
|
||||
__hip_uint64_t value2, __hip_uint64_t value3,
|
||||
__hip_uint64_t value4, __hip_uint64_t value5,
|
||||
__hip_uint64_t value6, __hip_uint32_t is_last);
|
||||
extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_string_n(__hip_uint64_t msg_desc, const char* data,
|
||||
__hip_uint64_t length, __hip_uint32_t is_last);
|
||||
|
||||
// Introduce local address space
|
||||
#define __local __attribute__((address_space(3)))
|
||||
|
||||
@@ -141,16 +141,16 @@ __CG_STATIC_QUALIFIER__ unsigned long long adjust_mask(
|
||||
*/
|
||||
namespace multi_grid {
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
|
||||
return static_cast<uint32_t>(__ockl_multi_grid_num_grids()); }
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_grids() {
|
||||
return static_cast<__hip_uint32_t>(__ockl_multi_grid_num_grids()); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
|
||||
return static_cast<uint32_t>(__ockl_multi_grid_grid_rank()); }
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t grid_rank() {
|
||||
return static_cast<__hip_uint32_t>(__ockl_multi_grid_grid_rank()); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
|
||||
return static_cast<uint32_t>(__ockl_multi_grid_thread_rank()); }
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
|
||||
return static_cast<__hip_uint32_t>(__ockl_multi_grid_thread_rank()); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }
|
||||
|
||||
@@ -164,23 +164,23 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
|
||||
*/
|
||||
namespace grid {
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t size() {
|
||||
return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() {
|
||||
return static_cast<__hip_uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
|
||||
(blockDim.x * gridDim.x));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
|
||||
// Compute global id of the workgroup to which the current thread belongs to
|
||||
uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
|
||||
__hip_uint32_t blkIdx = static_cast<__hip_uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
|
||||
(blockIdx.y * gridDim.x) + (blockIdx.x));
|
||||
|
||||
// Compute total number of threads being passed to reach current workgroup
|
||||
// within grid
|
||||
uint32_t num_threads_till_current_workgroup =
|
||||
static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
|
||||
__hip_uint32_t num_threads_till_current_workgroup =
|
||||
static_cast<__hip_uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
|
||||
|
||||
// Compute thread local rank within current workgroup
|
||||
uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
|
||||
__hip_uint32_t local_thread_rank = static_cast<__hip_uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
|
||||
(threadIdx.y * blockDim.x) + (threadIdx.x));
|
||||
|
||||
return (num_threads_till_current_workgroup + local_thread_rank);
|
||||
@@ -200,21 +200,21 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
|
||||
namespace workgroup {
|
||||
|
||||
__CG_STATIC_QUALIFIER__ dim3 group_index() {
|
||||
return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
|
||||
static_cast<uint32_t>(blockIdx.z)));
|
||||
return (dim3(static_cast<__hip_uint32_t>(blockIdx.x), static_cast<__hip_uint32_t>(blockIdx.y),
|
||||
static_cast<__hip_uint32_t>(blockIdx.z)));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ dim3 thread_index() {
|
||||
return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
|
||||
static_cast<uint32_t>(threadIdx.z)));
|
||||
return (dim3(static_cast<__hip_uint32_t>(threadIdx.x), static_cast<__hip_uint32_t>(threadIdx.y),
|
||||
static_cast<__hip_uint32_t>(threadIdx.z)));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t size() {
|
||||
return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() {
|
||||
return (static_cast<__hip_uint32_t>(blockDim.x * blockDim.y * blockDim.z));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
|
||||
return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
|
||||
return (static_cast<__hip_uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
|
||||
(threadIdx.y * blockDim.x) + (threadIdx.x)));
|
||||
}
|
||||
|
||||
@@ -225,8 +225,8 @@ __CG_STATIC_QUALIFIER__ bool is_valid() {
|
||||
__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ dim3 block_dim() {
|
||||
return (dim3(static_cast<uint32_t>(blockDim.x), static_cast<uint32_t>(blockDim.y),
|
||||
static_cast<uint32_t>(blockDim.z)));
|
||||
return (dim3(static_cast<__hip_uint32_t>(blockDim.x), static_cast<__hip_uint32_t>(blockDim.y),
|
||||
static_cast<__hip_uint32_t>(blockDim.z)));
|
||||
}
|
||||
|
||||
} // namespace workgroup
|
||||
|
||||
新しいイシューから参照
ユーザーをブロックする