SWDEV-491314 - Re-enable cross-lane sync builtins (#94)
* Enables warp sync builtins by default * Removes HIP_ENABLE_WARP_SYNC_BUILTINS; that macro will no longer have an effect. Instead, we will now be able to disable the builtins with the macro: HIP_DISABLE_WARP_SYNC_BUILTINS
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
29df3ae6e9
Коммит
5606debd8e
@@ -16,6 +16,8 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
|
||||
- HIP Extensions APIs for microscaling formats, which are supported on AMD GPUs.
|
||||
* New `wptr` and `rptr` values in `ClPrint`, for better logging in dispatch barrier methods.
|
||||
* New debug mask, to print precise code object information for logging.
|
||||
* The `_sync()` version of crosslane builtins such as `shfl_sync()` and `__reduce_add_sync` are enabled by default. These can be
|
||||
disabled by setting the preprocessor macro `HIP_DISABLE_WARP_SYNC_BUILTINS`.
|
||||
|
||||
### Changed
|
||||
|
||||
|
||||
@@ -663,7 +663,7 @@ __hip_bfloat16 __shfl_xor(MAYBE_UNDEF __hip_bfloat16 var, int lane_mask, int wid
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
|
||||
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_MOVE
|
||||
* \brief shfl down warp intrinsic for bfloat16
|
||||
@@ -771,7 +771,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_xor_sync(const unsigned long long
|
||||
u.ui = __shfl_xor_sync<unsigned long long, unsigned int>(mask, u.ui, delta, width);
|
||||
return u.bf162;
|
||||
}
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
|
||||
|
||||
@@ -551,7 +551,7 @@ class coalesced_group : public thread_group {
|
||||
|
||||
return __shfl(var, lane, warpSize);
|
||||
}
|
||||
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
|
||||
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
|
||||
|
||||
/** \brief Ballot function on group level.
|
||||
*
|
||||
@@ -617,7 +617,7 @@ class coalesced_group : public thread_group {
|
||||
__match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
|
||||
&pred));
|
||||
}
|
||||
#endif
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
};
|
||||
|
||||
/** \ingroup CooperativeGConstruct
|
||||
@@ -819,14 +819,14 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
|
||||
friend __CG_QUALIFIER__ coalesced_group
|
||||
binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);
|
||||
|
||||
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
|
||||
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
|
||||
__CG_QUALIFIER__ unsigned long long build_mask() const {
|
||||
unsigned long long mask = ~0ull >> (64 - numThreads);
|
||||
// thread_rank() gives thread id from 0..thread launch size.
|
||||
return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) *
|
||||
numThreads);
|
||||
}
|
||||
#endif
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
|
||||
public:
|
||||
|
||||
@@ -850,7 +850,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
|
||||
return (__shfl_xor(var, laneMask, numThreads));
|
||||
}
|
||||
|
||||
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
|
||||
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
|
||||
__CG_QUALIFIER__ unsigned long long ballot(int pred) const {
|
||||
const auto mask = build_mask();
|
||||
return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
|
||||
@@ -869,7 +869,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
|
||||
const auto mask = build_mask();
|
||||
return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
|
||||
}
|
||||
#endif
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
};
|
||||
|
||||
/** \brief User exposed API that captures the state of the parent group pre-partition
|
||||
@@ -1197,7 +1197,7 @@ __CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const Paren
|
||||
return impl::tiled_partition_internal<size, ParentCGTy>(g);
|
||||
}
|
||||
|
||||
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
|
||||
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
|
||||
|
||||
/** \ingroup CooperativeGConstruct
|
||||
* \brief Binary partition.
|
||||
|
||||
@@ -122,7 +122,7 @@ unsigned long long int __ballot64(int predicate) {
|
||||
}
|
||||
|
||||
// See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
|
||||
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
|
||||
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
|
||||
// Since threads in a wave do not make independent progress, __activemask()
|
||||
// always returns the exact active mask, i.e, all active threads in the wave.
|
||||
__device__
|
||||
@@ -130,7 +130,7 @@ inline
|
||||
unsigned long long __activemask() {
|
||||
return __ballot(true);
|
||||
}
|
||||
#endif // HIP_ENABLE_WARP_SYNC_BUILTINS
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
|
||||
__device__ static inline unsigned int __lane_id() {
|
||||
if (warpSize == 32) return __builtin_amdgcn_mbcnt_lo(-1, 0);
|
||||
|
||||
@@ -24,10 +24,11 @@ THE SOFTWARE.
|
||||
|
||||
// Warp sync builtins (with explicit mask argument) introduced in ROCm 6.2 as a
|
||||
// preview to allow end-users to adapt to the new interface involving 64-bit
|
||||
// masks. These are disabled by default, and can be enabled by setting the macro
|
||||
// "HIP_ENABLE_WARP_SYNC_BUILTINS". This arrangement also applies to the
|
||||
// masks. These are enabled by default, and can be disabled by setting the macro
|
||||
// "HIP_DISABLE_WARP_SYNC_BUILTINS". This arrangement also applies to the
|
||||
// __activemask() builtin defined in amd_warp_functions.h.
|
||||
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
|
||||
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
|
||||
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
#include "amd_warp_functions.h"
|
||||
#include "amd_device_functions.h"
|
||||
@@ -722,5 +723,9 @@ __device__ inline unsigned long long __reduce_xor_sync(MaskT mask, unsigned long
|
||||
return __reduce_op_sync(mask, val, op, wfReduce);
|
||||
}
|
||||
|
||||
#undef __hip_do_sync
|
||||
#undef __hip_check_mask
|
||||
#undef __hip_adjust_mask_for_wave32
|
||||
|
||||
#endif // HIP_ENABLE_EXTRA_WARP_SYNC_TYPES
|
||||
#endif // HIP_ENABLE_WARP_SYNC_BUILTINS
|
||||
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
|
||||
|
||||
Ссылка в новой задаче
Block a user