diff --git a/CHANGELOG.md b/CHANGELOG.md index b78532feb0..ff41df8637 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs - HIP Extensions APIs for microscaling formats, which are supported on AMD GPUs. * New `wptr` and `rptr` values in `ClPrint`, for better logging in dispatch barrier methods. * New debug mask, to print precise code object information for logging. +* The `_sync()` version of crosslane builtins such as `shfl_sync()` and `__reduce_add_sync` are enabled by default. These can be +disabled by setting the preprocessor macro `HIP_DISABLE_WARP_SYNC_BUILTINS`. ### Changed diff --git a/hipamd/include/hip/amd_detail/amd_hip_bf16.h b/hipamd/include/hip/amd_detail/amd_hip_bf16.h index 652bf6db94..a34b75d1a6 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_bf16.h +++ b/hipamd/include/hip/amd_detail/amd_hip_bf16.h @@ -663,7 +663,7 @@ __hip_bfloat16 __shfl_xor(MAYBE_UNDEF __hip_bfloat16 var, int lane_mask, int wid return tmp.f; } -#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS +#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS) /** * \ingroup HIP_INTRINSIC_BFLOAT16_MOVE * \brief shfl down warp intrinsic for bfloat16 @@ -771,7 +771,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_xor_sync(const unsigned long long u.ui = __shfl_xor_sync(mask, u.ui, delta, width); return u.bf162; } -#endif // HIP_DISABLE_WARP_SYNC_BUILTINS +#endif // HIP_DISABLE_WARP_SYNC_BUILTINS /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH diff --git a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h index 72b605ce80..1025e543d5 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -551,7 +551,7 @@ class coalesced_group : public thread_group { return __shfl(var, lane, warpSize); } -#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS +#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS) /** \brief Ballot function on group level. * @@ -617,7 +617,7 @@ class coalesced_group : public thread_group { __match_all_sync(static_cast(coalesced_info.member_mask), value, &pred)); } -#endif +#endif // HIP_DISABLE_WARP_SYNC_BUILTINS }; /** \ingroup CooperativeGConstruct @@ -819,14 +819,14 @@ template class thread_block_tile_base : public tile_base& tgrp, bool pred); -#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS +#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS) __CG_QUALIFIER__ unsigned long long build_mask() const { unsigned long long mask = ~0ull >> (64 - numThreads); // thread_rank() gives thread id from 0..thread launch size. return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) * numThreads); } -#endif +#endif // HIP_DISABLE_WARP_SYNC_BUILTINS public: @@ -850,7 +850,7 @@ template class thread_block_tile_base : public tile_base class thread_block_tile_base : public tile_base tiled_partition(const Paren return impl::tiled_partition_internal(g); } -#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS +#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS) /** \ingroup CooperativeGConstruct * \brief Binary partition. diff --git a/hipamd/include/hip/amd_detail/amd_warp_functions.h b/hipamd/include/hip/amd_detail/amd_warp_functions.h index 463fc06c4b..d8d22ea0be 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_functions.h @@ -122,7 +122,7 @@ unsigned long long int __ballot64(int predicate) { } // See amd_warp_sync_functions.h for an explanation of this preprocessor flag. -#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS +#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS) // Since threads in a wave do not make independent progress, __activemask() // always returns the exact active mask, i.e, all active threads in the wave. __device__ @@ -130,7 +130,7 @@ inline unsigned long long __activemask() { return __ballot(true); } -#endif // HIP_ENABLE_WARP_SYNC_BUILTINS +#endif // HIP_DISABLE_WARP_SYNC_BUILTINS __device__ static inline unsigned int __lane_id() { if (warpSize == 32) return __builtin_amdgcn_mbcnt_lo(-1, 0); diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index a47ad1ea9d..c4885868e8 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -24,10 +24,11 @@ THE SOFTWARE. // Warp sync builtins (with explicit mask argument) introduced in ROCm 6.2 as a // preview to allow end-users to adapt to the new interface involving 64-bit -// masks. These are disabled by default, and can be enabled by setting the macro -// "HIP_ENABLE_WARP_SYNC_BUILTINS". This arrangement also applies to the +// masks. These are enabled by default, and can be disabled by setting the macro +// "HIP_DISABLE_WARP_SYNC_BUILTINS". This arrangement also applies to the // __activemask() builtin defined in amd_warp_functions.h. -#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS +#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS) + #if !defined(__HIPCC_RTC__) #include "amd_warp_functions.h" #include "amd_device_functions.h" @@ -722,5 +723,9 @@ __device__ inline unsigned long long __reduce_xor_sync(MaskT mask, unsigned long return __reduce_op_sync(mask, val, op, wfReduce); } +#undef __hip_do_sync +#undef __hip_check_mask +#undef __hip_adjust_mask_for_wave32 + #endif // HIP_ENABLE_EXTRA_WARP_SYNC_TYPES -#endif // HIP_ENABLE_WARP_SYNC_BUILTINS +#endif // HIP_DISABLE_WARP_SYNC_BUILTINS