diff --git a/projects/clr/CHANGELOG.md b/projects/clr/CHANGELOG.md index 292125904d..80756af55e 100644 --- a/projects/clr/CHANGELOG.md +++ b/projects/clr/CHANGELOG.md @@ -8,6 +8,7 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs * New HIP APIs - `hipKernelGetParamInfo` returns the offset and size of a kernel parameter +* Support for `barrier_arrive` and `barrier_wait` for `grid_group` and `thread_block`. * New HIP supports - `grid_group::block_rank()` returns the rank of the block in the calling thread diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h index 62cb66fb90..80b7f71c23 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -212,6 +212,19 @@ class grid_group : public thread_group { //! @copydoc thread_group::sync __CG_QUALIFIER__ void sync() const { internal::grid::sync(); } __CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); } + struct arrival_token { + unsigned int signal; + }; + //! Arrive at a barrier + __CG_QUALIFIER__ arrival_token barrier_arrive() const { + arrival_token t; + t.signal = internal::grid::barrier_signal(); + return t; + } + //! Arrive at a barrier + __CG_QUALIFIER__ void barrier_wait(arrival_token&& t) const { + internal::grid::barrier_wait(t.signal); + } }; /** \ingroup CooperativeGConstruct @@ -295,6 +308,14 @@ class thread_block : public thread_group { __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); } //! Returns the group dimensions. __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); } + struct arrival_token {}; + //! Arrive at a barrier + __CG_QUALIFIER__ arrival_token barrier_arrive() const { + internal::workgroup::barrier_arrive(); + return arrival_token{}; + } + //! Arrive at a barrier + __CG_QUALIFIER__ void barrier_wait(arrival_token&&) const { internal::workgroup::barrier_wait(); } }; /** \ingroup CooperativeGConstruct diff --git a/projects/clr/hipamd/include/hip/amd_detail/device_library_decls.h b/projects/clr/hipamd/include/hip/amd_detail/device_library_decls.h index 39c2e59686..791d2e7137 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/device_library_decls.h +++ b/projects/clr/hipamd/include/hip/amd_detail/device_library_decls.h @@ -97,6 +97,8 @@ extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void); extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void); extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void); extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void); +extern "C" __device__ __attribute__((const)) uint __ockl_grid_bar_arrive(void); +extern "C" __device__ __attribute__((convergent)) void __ockl_grid_bar_wait(uint); extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float); diff --git a/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h b/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h index d9671eb342..771e8419ff 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h +++ b/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h @@ -196,6 +196,9 @@ __CG_STATIC_QUALIFIER__ dim3 grid_dim() { static_cast<__hip_uint32_t>(gridDim.z))); } +__CG_STATIC_QUALIFIER__ unsigned int barrier_signal() { return __ockl_grid_bar_arrive(); } + +__CG_STATIC_QUALIFIER__ void barrier_wait(unsigned int s) { __ockl_grid_bar_wait(s); } } // namespace grid /** @@ -238,6 +241,23 @@ __CG_STATIC_QUALIFIER__ dim3 block_dim() { static_cast<__hip_uint32_t>(blockDim.z))); } +__CG_STATIC_QUALIFIER__ void barrier_arrive() { + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); +#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \ + __has_builtin(__builtin_amdgcn_s_barrier_wait) + __builtin_amdgcn_s_barrier_signal(-1); +#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait +} + +__CG_STATIC_QUALIFIER__ void barrier_wait() { +#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \ + __has_builtin(__builtin_amdgcn_s_barrier_wait) + __builtin_amdgcn_s_barrier_wait(-1); +#else + __builtin_amdgcn_s_barrier(); +#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); +} } // namespace workgroup namespace tiled_group { diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt b/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt index f22d40f944..c0d2a00e3c 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt @@ -20,6 +20,7 @@ set(TEST_SRC binary_partition.cc cg_ballot.cc cg_any_all.cc + split_barrier.cc ) if(HIP_PLATFORM STREQUAL "nvidia") diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/split_barrier.cc b/projects/hip-tests/catch/unit/cooperativeGrps/split_barrier.cc new file mode 100644 index 0000000000..f7376b6605 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/split_barrier.cc @@ -0,0 +1,123 @@ +/* +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +static __global__ void wg_split_barrier(float *out, float *in) { + namespace cg = cooperative_groups; + + __shared__ float mid[32]; + size_t i = threadIdx.x; + auto tb = cg::this_thread_block(); + + out[i] = in[i] * 2.0f; + + auto tok = tb.barrier_arrive(); + + // use tid 0 to populate shared mem + if (i == 0) { + for (size_t j = 0; j < 32; j++) { + mid[j] = in[j]; + } + } + + tb.barrier_wait(std::move(tok)); + + out[i] += mid[i]; +} + +TEST_CASE("Unit_coop_thread_block_split_barrier") { + constexpr size_t size = 32; + float *d_out, *d_in; + + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size)); + + std::vector in(size, 0.0f), out = in; + for (size_t i = 0; i < size; i++) { + in[i] = i + 1; + } + + HIP_CHECK(hipMemset(d_out, 0, sizeof(float) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice)); + wg_split_barrier<<<1, size>>>(d_out, d_in); + HIP_CHECK(hipMemcpy(out.data(), d_out, sizeof(float) * size, + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipFree(d_out)); + HIP_CHECK(hipFree(d_in)); + + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << " in: " << in[i] << " out: " << out[i]); + REQUIRE((in[i] * 3.0f) == Catch::Approx(out[i])); + } +} + +static __global__ void grid_split_barrier(int *data, int *result, int N) { + namespace cg = cooperative_groups; + cg::grid_group grid = cg::this_grid(); + + int gid = blockIdx.x * blockDim.x + threadIdx.x; + auto tok = grid.barrier_arrive(); + if (gid < N) { + data[gid] = gid + 1; + } + + grid.barrier_wait(std::move(tok)); + + if (grid.thread_rank() == 0) { + int sum = 0; + for (int i = 0; i < N; i++) + sum += data[i]; + *result = sum; + } +} + +TEST_CASE("Unit_coop_grids_split_barrier") { + hipDeviceProp_t prop; + HIP_CHECK(hipGetDeviceProperties(&prop, 0)); + + if (prop.cooperativeLaunch != 0) { + int N = 1024; + const int threads = 128; + const int blocks = (N + threads - 1) / threads; + + int *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, N * sizeof(int))); + HIP_CHECK(hipMalloc(&d_out, sizeof(int))); + + void *args[] = {&d_in, &d_out, &N}; + + dim3 grid(blocks); + dim3 block(threads); + + HIP_CHECK(hipLaunchCooperativeKernel((void *)grid_split_barrier, grid, + block, args, 0, 0)); + HIP_CHECK(hipDeviceSynchronize()); + + int out = 0; + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(int), hipMemcpyDeviceToHost)); + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + REQUIRE(out == ((N * (N + 1)) / 2)); + } +} diff --git a/projects/hip/docs/how-to/hip_runtime_api/cooperative_groups.rst b/projects/hip/docs/how-to/hip_runtime_api/cooperative_groups.rst index a3e32cd294..bf34ab7583 100644 --- a/projects/hip/docs/how-to/hip_runtime_api/cooperative_groups.rst +++ b/projects/hip/docs/how-to/hip_runtime_api/cooperative_groups.rst @@ -494,7 +494,6 @@ HIP doesn't support the following CUDA functions/operators in ``cooperative_grou * ``synchronize`` * ``memcpy_async`` * ``wait`` and ``wait_prior`` -* ``barrier_arrive`` and ``barrier_wait`` * ``invoke_one`` and ``invoke_one_broadcast`` * ``reduce`` * ``reduce_update_async`` and ``reduce_store_async``