SWDEV-491314 - enable _sync() functions with 64-bit mask argument

Change-Id: Ieb13a9e1b2fc49ff225a05a51056d1212d95ae57
2024-11-12 11:45:28 +05:30
commit 4e2fd192eb
@@ -7,6 +7,10 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
 ### Changed
 * Added new environment variable
    - `DEBUG_HIP_7_PREVIEW` This is used for enabling the backward incompatible changes before the next major ROCm release 7.0. By default this is set to 0. Users can set this variable to 0x1, to match the behavior of hipGetLastError with its corresponding CUDA API.
+* New HIP APIs
+  - The `_sync()` version of crosslane builtins such as `shfl_sync()`,
+    `__all_sync()` and `__any_sync()`, are enabled by default. These can be
+    disabled by setting the preprocessor macro `HIP_DISABLE_WARP_SYNC_BUILTINS`.

 ## HIP 6.3 for ROCm 6.3

@@ -681,7 +681,7 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned s
  return u.bf16;
 }

-#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
+#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
 /**
 * \ingroup HIP_INTRINSIC_BFLOAT16_MOVE
 * \brief shfl down warp intrinsic for bfloat16
@@ -789,7 +789,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_xor_sync(const unsigned long long
  u.ui = __shfl_xor_sync(mask, u.ui, delta, width);
  return u.bf162;
 }
-#endif
+#endif // HIP_DISABLE_WARP_SYNC_BUILTINS

 /**
 * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
@@ -473,7 +473,7 @@ class coalesced_group : public thread_group {

    return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
  }
-#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
+#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
   __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
     return internal::helper::adjust_mask(
         coalesced_info.member_mask,
@@ -500,7 +500,7 @@ class coalesced_group : public thread_group {
         __match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
                          &pred));
   }
-#endif
+#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
 };

 /** \brief   User exposed API to create coalesced groups.
@@ -665,12 +665,12 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
  friend __CG_QUALIFIER__ coalesced_group
  binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);

-#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
+#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
  __CG_QUALIFIER__ unsigned long long build_mask() const {
    unsigned long long mask = ~0ull >> (64 - numThreads);
    return mask << ((internal::workgroup::thread_rank() / numThreads) * numThreads);
  }
-#endif
+#endif // HIP_DISABLE_WARP_SYNC_BUILTINS

 public:
  __CG_STATIC_QUALIFIER__ void sync() {
@@ -697,7 +697,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
    return (__shfl_xor(var, laneMask, numThreads));
  }

-#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
+#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
  __CG_QUALIFIER__ unsigned long long ballot(int pred) const {
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
@@ -716,7 +716,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
    const auto mask = build_mask();
    return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
  }
-#endif
+#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
 };

 /** \brief   User exposed API that captures the state of the parent group pre-partition
@@ -901,7 +901,7 @@ __CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const Paren
  return impl::tiled_partition_internal<size, ParentCGTy>(g);
 }

-#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
+#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
 /** \brief Binary partition
 *
 *  \details This splits the input thread group into two partitions determined by predicate
@@ -927,7 +927,7 @@ __CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile<size,
    return coalesced_group(tgrp.build_mask() ^ mask);
  }
 }
-#endif
+#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
 }  // namespace cooperative_groups

 #endif  // __cplusplus
@@ -112,7 +112,7 @@ unsigned long long int __ballot64(int predicate) {
 }

 // See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
-#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
+#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
 // Since threads in a wave do not make independent progress, __activemask()
 // always returns the exact active mask, i.e, all active threads in the wave.
 __device__
@@ -120,7 +120,7 @@ inline
 unsigned long long __activemask() {
  return __ballot(true);
 }
-#endif // HIP_ENABLE_WARP_SYNC_BUILTINS
+#endif // HIP_DISABLE_WARP_SYNC_BUILTINS

 __device__ static inline unsigned int __lane_id() {
    return  __builtin_amdgcn_mbcnt_hi(
@@ -23,11 +23,10 @@ THE SOFTWARE.
 #pragma once

 // Warp sync builtins (with explicit mask argument) introduced in ROCm 6.2 as a
-// preview to allow end-users to adapt to the new interface involving 64-bit
-// masks. These are disabled by default, and can be enabled by setting the macro
-// "HIP_ENABLE_WARP_SYNC_BUILTINS". This arrangement also applies to the
-// __activemask() builtin defined in amd_warp_functions.h.
-#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
+// preview and enabled by default in ROCm 6.4. These can be disabled, by setting
+// the macro "HIP_DISABLE_WARP_SYNC_BUILTINS". This arrangement also applies to
+// the __activemask() builtin defined in amd_warp_functions.h.
+#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)

 #if !defined(__HIPCC_RTC__)
 #include "amd_warp_functions.h"
@@ -283,4 +282,4 @@ T __shfl_xor_sync(MaskT mask, T var, int laneMask,
 #undef __hip_check_mask
 #undef __hip_adjust_mask_for_wave32

-#endif // HIP_ENABLE_WARP_SYNC_BUILTINS
+#endif // HIP_DISABLE_WARP_SYNC_BUILTINS