From 4e2fd192eb2d417224ebe83be928fc672336286d Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Tue, 12 Nov 2024 11:45:28 +0530 Subject: [PATCH] SWDEV-491314 - enable _sync() functions with 64-bit mask argument Change-Id: Ieb13a9e1b2fc49ff225a05a51056d1212d95ae57 --- CHANGELOG.md | 4 ++++ hipamd/include/hip/amd_detail/amd_hip_bf16.h | 4 ++-- .../hip/amd_detail/amd_hip_cooperative_groups.h | 16 ++++++++-------- .../include/hip/amd_detail/amd_warp_functions.h | 4 ++-- .../hip/amd_detail/amd_warp_sync_functions.h | 11 +++++------ 5 files changed, 21 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ed339c64c..ccef6baa82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs ### Changed * Added new environment variable - `DEBUG_HIP_7_PREVIEW` This is used for enabling the backward incompatible changes before the next major ROCm release 7.0. By default this is set to 0. Users can set this variable to 0x1, to match the behavior of hipGetLastError with its corresponding CUDA API. +* New HIP APIs + - The `_sync()` version of crosslane builtins such as `shfl_sync()`, + `__all_sync()` and `__any_sync()`, are enabled by default. These can be + disabled by setting the preprocessor macro `HIP_DISABLE_WARP_SYNC_BUILTINS`. ## HIP 6.3 for ROCm 6.3 diff --git a/hipamd/include/hip/amd_detail/amd_hip_bf16.h b/hipamd/include/hip/amd_detail/amd_hip_bf16.h index 0a5d1497fd..d89a4bb0ce 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_bf16.h +++ b/hipamd/include/hip/amd_detail/amd_hip_bf16.h @@ -681,7 +681,7 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned s return u.bf16; } -#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS +#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS) /** * \ingroup HIP_INTRINSIC_BFLOAT16_MOVE * \brief shfl down warp intrinsic for bfloat16 @@ -789,7 +789,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_xor_sync(const unsigned long long u.ui = __shfl_xor_sync(mask, u.ui, delta, width); return u.bf162; } -#endif +#endif // HIP_DISABLE_WARP_SYNC_BUILTINS /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH diff --git a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h index 418f96949d..9b23e2d2d9 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -473,7 +473,7 @@ class coalesced_group : public thread_group { return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE); } -#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS +#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS) __CG_QUALIFIER__ unsigned long long ballot(int pred) const { return internal::helper::adjust_mask( coalesced_info.member_mask, @@ -500,7 +500,7 @@ class coalesced_group : public thread_group { __match_all_sync(static_cast(coalesced_info.member_mask), value, &pred)); } -#endif +#endif // HIP_DISABLE_WARP_SYNC_BUILTINS }; /** \brief User exposed API to create coalesced groups. @@ -665,12 +665,12 @@ template class thread_block_tile_base : public tile_base& tgrp, bool pred); -#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS +#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS) __CG_QUALIFIER__ unsigned long long build_mask() const { unsigned long long mask = ~0ull >> (64 - numThreads); return mask << ((internal::workgroup::thread_rank() / numThreads) * numThreads); } -#endif +#endif // HIP_DISABLE_WARP_SYNC_BUILTINS public: __CG_STATIC_QUALIFIER__ void sync() { @@ -697,7 +697,7 @@ template class thread_block_tile_base : public tile_base class thread_block_tile_base : public tile_base tiled_partition(const Paren return impl::tiled_partition_internal(g); } -#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS +#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS) /** \brief Binary partition * * \details This splits the input thread group into two partitions determined by predicate @@ -927,7 +927,7 @@ __CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile