SWDEV-493792 - add split barriers for grid_group (#508)

* SWDEV-493792 - add split barriers for grid_group * add tests * Update change log * Add Navi4 split barrier * Update docs * Use new Catch2 Approx macro * Update split_barrier.cc to check for coop groups --------- Co-authored-by: Jatin Chaudhary <jatchaud@amd.com> Co-authored-by: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com>
2026-01-19 09:17:00 -08:00
@@ -8,6 +8,7 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs

 * New HIP APIs
    - `hipKernelGetParamInfo`   returns the offset and size of a kernel parameter
+* Support for `barrier_arrive` and `barrier_wait` for `grid_group` and `thread_block`.

 * New HIP supports
    - `grid_group::block_rank()` returns the rank of the block in the calling thread
@@ -212,6 +212,19 @@ class grid_group : public thread_group {
  //! @copydoc thread_group::sync
  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
  __CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
+  struct arrival_token {
+    unsigned int signal;
+  };
+  //! Arrive at a barrier
+  __CG_QUALIFIER__ arrival_token barrier_arrive() const {
+    arrival_token t;
+    t.signal = internal::grid::barrier_signal();
+    return t;
+  }
+  //! Arrive at a barrier
+  __CG_QUALIFIER__ void barrier_wait(arrival_token&& t) const {
+    internal::grid::barrier_wait(t.signal);
+  }
 };

 /** \ingroup CooperativeGConstruct
@@ -295,6 +308,14 @@ class thread_block : public thread_group {
  __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
  //! Returns the group dimensions.
  __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
+  struct arrival_token {};
+  //! Arrive at a barrier
+  __CG_QUALIFIER__ arrival_token barrier_arrive() const {
+    internal::workgroup::barrier_arrive();
+    return arrival_token{};
+  }
+  //! Arrive at a barrier
+  __CG_QUALIFIER__ void barrier_wait(arrival_token&&) const { internal::workgroup::barrier_wait(); }
 };

 /** \ingroup CooperativeGConstruct
@@ -97,6 +97,8 @@ extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
 extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
 extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
 extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_grid_bar_arrive(void);
+extern "C" __device__ __attribute__((convergent)) void __ockl_grid_bar_wait(uint);

 extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);

@@ -196,6 +196,9 @@ __CG_STATIC_QUALIFIER__ dim3 grid_dim() {
               static_cast<__hip_uint32_t>(gridDim.z)));
 }

+__CG_STATIC_QUALIFIER__ unsigned int barrier_signal() { return __ockl_grid_bar_arrive(); }
+
+__CG_STATIC_QUALIFIER__ void barrier_wait(unsigned int s) { __ockl_grid_bar_wait(s); }
 }  // namespace grid

 /**
@@ -238,6 +241,23 @@ __CG_STATIC_QUALIFIER__ dim3 block_dim() {
               static_cast<__hip_uint32_t>(blockDim.z)));
 }

+__CG_STATIC_QUALIFIER__ void barrier_arrive() {
+  __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
+#if __has_builtin(__builtin_amdgcn_s_barrier_signal) &&                                            \
+    __has_builtin(__builtin_amdgcn_s_barrier_wait)
+  __builtin_amdgcn_s_barrier_signal(-1);
+#endif  // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait
+}
+
+__CG_STATIC_QUALIFIER__ void barrier_wait() {
+#if __has_builtin(__builtin_amdgcn_s_barrier_signal) &&                                            \
+    __has_builtin(__builtin_amdgcn_s_barrier_wait)
+  __builtin_amdgcn_s_barrier_wait(-1);
+#else
+  __builtin_amdgcn_s_barrier();
+#endif  // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+}
 }  // namespace workgroup

 namespace tiled_group {
@@ -20,6 +20,7 @@ set(TEST_SRC
  binary_partition.cc
  cg_ballot.cc
  cg_any_all.cc
+  split_barrier.cc
 )

 if(HIP_PLATFORM STREQUAL "nvidia")
@@ -0,0 +1,123 @@
+/*
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip/hip_cooperative_groups.h>
+#include <hip_test_common.hh>
+
+static __global__ void wg_split_barrier(float *out, float *in) {
+  namespace cg = cooperative_groups;
+
+  __shared__ float mid[32];
+  size_t i = threadIdx.x;
+  auto tb = cg::this_thread_block();
+
+  out[i] = in[i] * 2.0f;
+
+  auto tok = tb.barrier_arrive();
+
+  // use tid 0 to populate shared mem
+  if (i == 0) {
+    for (size_t j = 0; j < 32; j++) {
+      mid[j] = in[j];
+    }
+  }
+
+  tb.barrier_wait(std::move(tok));
+
+  out[i] += mid[i];
+}
+
+TEST_CASE("Unit_coop_thread_block_split_barrier") {
+  constexpr size_t size = 32;
+  float *d_out, *d_in;
+
+  HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size));
+  HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));
+
+  std::vector<float> in(size, 0.0f), out = in;
+  for (size_t i = 0; i < size; i++) {
+    in[i] = i + 1;
+  }
+
+  HIP_CHECK(hipMemset(d_out, 0, sizeof(float) * size));
+  HIP_CHECK(
+      hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice));
+  wg_split_barrier<<<1, size>>>(d_out, d_in);
+  HIP_CHECK(hipMemcpy(out.data(), d_out, sizeof(float) * size,
+                      hipMemcpyDeviceToHost));
+
+  HIP_CHECK(hipFree(d_out));
+  HIP_CHECK(hipFree(d_in));
+
+  for (size_t i = 0; i < size; i++) {
+    INFO("Index: " << i << " in: " << in[i] << " out: " << out[i]);
+    REQUIRE((in[i] * 3.0f) == Catch::Approx(out[i]));
+  }
+}
+
+static __global__ void grid_split_barrier(int *data, int *result, int N) {
+  namespace cg = cooperative_groups;
+  cg::grid_group grid = cg::this_grid();
+
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  auto tok = grid.barrier_arrive();
+  if (gid < N) {
+    data[gid] = gid + 1;
+  }
+
+  grid.barrier_wait(std::move(tok));
+
+  if (grid.thread_rank() == 0) {
+    int sum = 0;
+    for (int i = 0; i < N; i++)
+      sum += data[i];
+    *result = sum;
+  }
+}
+
+TEST_CASE("Unit_coop_grids_split_barrier") {
+  hipDeviceProp_t prop;
+  HIP_CHECK(hipGetDeviceProperties(&prop, 0));
+
+  if (prop.cooperativeLaunch != 0) {
+    int N = 1024;
+    const int threads = 128;
+    const int blocks = (N + threads - 1) / threads;
+
+    int *d_in, *d_out;
+    HIP_CHECK(hipMalloc(&d_in, N * sizeof(int)));
+    HIP_CHECK(hipMalloc(&d_out, sizeof(int)));
+
+    void *args[] = {&d_in, &d_out, &N};
+
+    dim3 grid(blocks);
+    dim3 block(threads);
+
+    HIP_CHECK(hipLaunchCooperativeKernel((void *)grid_split_barrier, grid,
+                                         block, args, 0, 0));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    int out = 0;
+    HIP_CHECK(hipMemcpy(&out, d_out, sizeof(int), hipMemcpyDeviceToHost));
+
+    HIP_CHECK(hipFree(d_in));
+    HIP_CHECK(hipFree(d_out));
+    REQUIRE(out == ((N * (N + 1)) / 2));
+  }
+}
@@ -494,7 +494,6 @@ HIP doesn't support the following CUDA functions/operators in ``cooperative_grou
 * ``synchronize``
 * ``memcpy_async``
 * ``wait`` and ``wait_prior``
-* ``barrier_arrive`` and ``barrier_wait``
 * ``invoke_one`` and ``invoke_one_broadcast``
 * ``reduce``
 * ``reduce_update_async`` and ``reduce_store_async``