SWDEV-493792 - add split barriers for grid_group (#508)

* SWDEV-493792 - add split barriers for grid_group

* add tests

* Update change log

* Add Navi4 split barrier

* Update docs

* Use new Catch2 Approx macro

* Update split_barrier.cc to check for coop groups

---------

Co-authored-by: Jatin Chaudhary <jatchaud@amd.com>
Co-authored-by: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com>
이 커밋은 다음에 포함됨:
systems-assistant[bot]
2026-01-19 09:17:00 -08:00
커밋한 사람 GitHub
부모 e49b501e9a
커밋 88f07baa92
7개의 변경된 파일168개의 추가작업 그리고 1개의 파일을 삭제
+1
파일 보기
@@ -8,6 +8,7 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
* New HIP APIs
- `hipKernelGetParamInfo` returns the offset and size of a kernel parameter
* Support for `barrier_arrive` and `barrier_wait` for `grid_group` and `thread_block`.
* New HIP supports
- `grid_group::block_rank()` returns the rank of the calling thread's block within the grid
+21
파일 보기
@@ -212,6 +212,19 @@ class grid_group : public thread_group {
//! @copydoc thread_group::sync
__CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
__CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
//! Token returned by barrier_arrive() and consumed by barrier_wait().
struct arrival_token {
  unsigned int signal;  //!< Value produced by internal::grid::barrier_signal().
};
//! Signal arrival at the grid barrier; the returned token must be handed to barrier_wait().
__CG_QUALIFIER__ arrival_token barrier_arrive() const {
  return arrival_token{internal::grid::barrier_signal()};
}
//! Wait at a barrier: consumes the token obtained from barrier_arrive() and
//! forwards its signal value to internal::grid::barrier_wait().
__CG_QUALIFIER__ void barrier_wait(arrival_token&& t) const {
internal::grid::barrier_wait(t.signal);
}
};
/** \ingroup CooperativeGConstruct
@@ -295,6 +308,14 @@ class thread_block : public thread_group {
__CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
//! Returns the group dimensions.
__CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
//! Stateless token linking a barrier_arrive() call to the matching barrier_wait().
struct arrival_token {};
//! Signal arrival at the workgroup barrier; the returned token must be handed to barrier_wait().
__CG_QUALIFIER__ arrival_token barrier_arrive() const {
  internal::workgroup::barrier_arrive();
  return {};
}
//! Wait at a barrier: consumes the (stateless) token from barrier_arrive() and
//! calls internal::workgroup::barrier_wait().
__CG_QUALIFIER__ void barrier_wait(arrival_token&&) const { internal::workgroup::barrier_wait(); }
};
/** \ingroup CooperativeGConstruct
+2
파일 보기
@@ -97,6 +97,8 @@ extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
// Split grid barrier entry points. Arriving at a barrier is a synchronization
// operation with side effects, so it must not be declared `const`: that
// attribute would let the compiler merge, drop or reorder the call relative to
// surrounding memory accesses. It is marked `convergent`, like the wait/sync calls.
extern "C" __device__ __attribute__((convergent)) uint __ockl_grid_bar_arrive(void);
extern "C" __device__ __attribute__((convergent)) void __ockl_grid_bar_wait(uint);
extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
+20
파일 보기
@@ -196,6 +196,9 @@ __CG_STATIC_QUALIFIER__ dim3 grid_dim() {
static_cast<__hip_uint32_t>(gridDim.z)));
}
//! Signals arrival at the grid barrier via __ockl_grid_bar_arrive() and returns its result.
__CG_STATIC_QUALIFIER__ unsigned int barrier_signal() {
  const unsigned int token = __ockl_grid_bar_arrive();
  return token;
}
//! Forwards the value `s` (as returned by barrier_signal()) to __ockl_grid_bar_wait().
__CG_STATIC_QUALIFIER__ void barrier_wait(unsigned int s) {
  __ockl_grid_bar_wait(s);
}
} // namespace grid
/**
@@ -238,6 +241,23 @@ __CG_STATIC_QUALIFIER__ dim3 block_dim() {
static_cast<__hip_uint32_t>(blockDim.z)));
}
//! Arrival half of the workgroup split barrier.
//! Issues a workgroup-scope release fence and then, when the compiler provides
//! the split-barrier builtins, signals the barrier.
__CG_STATIC_QUALIFIER__ void barrier_arrive() {
// Release fence first: accesses issued before this call are ordered ahead of the signal.
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \
__has_builtin(__builtin_amdgcn_s_barrier_wait)
// NOTE(review): -1 presumably selects the workgroup barrier — confirm against the AMDGPU builtin docs.
__builtin_amdgcn_s_barrier_signal(-1);
#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait
// Without the builtins nothing is signalled here; only the fence above is emitted.
}
//! Wait half of the workgroup split barrier.
//! Waits on the barrier (or runs a full s_barrier when the split-barrier
//! builtins are unavailable) and then issues a workgroup-scope acquire fence.
__CG_STATIC_QUALIFIER__ void barrier_wait() {
#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \
__has_builtin(__builtin_amdgcn_s_barrier_wait)
// NOTE(review): -1 presumably selects the workgroup barrier — confirm against the AMDGPU builtin docs.
__builtin_amdgcn_s_barrier_wait(-1);
#else
// Fallback: no separate signal was issued by barrier_arrive(), so use the combined barrier here.
__builtin_amdgcn_s_barrier();
#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait
// Acquire fence last: accesses issued after this call are ordered behind the barrier.
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
}
} // namespace workgroup
namespace tiled_group {
+1
파일 보기
@@ -20,6 +20,7 @@ set(TEST_SRC
binary_partition.cc
cg_ballot.cc
cg_any_all.cc
split_barrier.cc
)
if(HIP_PLATFORM STREQUAL "nvidia")
+123
파일 보기
@@ -0,0 +1,123 @@
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip/hip_cooperative_groups.h>
#include <hip_test_common.hh>
/**
 * Exercises thread_block::barrier_arrive() / barrier_wait().
 *
 * Launch layout: a single block of 32 threads; `in` and `out` each hold 32 floats.
 * Result: out[i] == 3 * in[i].
 *
 * Thread 0 fills the shared buffer BEFORE arriving, so the store is covered by
 * the barrier. Stores issued between arrive and wait are not guaranteed to be
 * visible to threads that pass barrier_wait(), so that window holds only work
 * private to each thread.
 */
static __global__ void wg_split_barrier(float *out, float *in) {
  namespace cg = cooperative_groups;
  __shared__ float mid[32];
  size_t i = threadIdx.x;
  auto tb = cg::this_thread_block();
  // use tid 0 to populate shared mem; must happen before barrier_arrive()
  if (i == 0) {
    for (size_t j = 0; j < 32; j++) {
      mid[j] = in[j];
    }
  }
  auto tok = tb.barrier_arrive();
  // Independent work overlapping the barrier: touches only this thread's own slot.
  out[i] = in[i] * 2.0f;
  tb.barrier_wait(std::move(tok));
  // All threads have arrived, so thread 0's writes to mid[] are visible here.
  out[i] += mid[i];
}
/**
 * Launches wg_split_barrier on one block of 32 threads with in[i] = i + 1 and
 * verifies that every output element equals 3 * in[i].
 */
TEST_CASE("Unit_coop_thread_block_split_barrier") {
  constexpr size_t size = 32;
  float *d_out, *d_in;
  HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size));
  HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));
  std::vector<float> in(size, 0.0f), out = in;
  for (size_t i = 0; i < size; i++) {
    in[i] = i + 1;
  }
  HIP_CHECK(hipMemset(d_out, 0, sizeof(float) * size));
  HIP_CHECK(
      hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice));
  wg_split_barrier<<<1, size>>>(d_out, d_in);
  // A <<<>>> launch returns no status; fetch launch errors explicitly so a
  // failed launch is reported here rather than as a value mismatch below.
  HIP_CHECK(hipGetLastError());
  // The blocking copy also waits for the kernel to finish.
  HIP_CHECK(hipMemcpy(out.data(), d_out, sizeof(float) * size,
                      hipMemcpyDeviceToHost));
  HIP_CHECK(hipFree(d_out));
  HIP_CHECK(hipFree(d_in));
  for (size_t i = 0; i < size; i++) {
    INFO("Index: " << i << " in: " << in[i] << " out: " << out[i]);
    REQUIRE((in[i] * 3.0f) == Catch::Approx(out[i]));
  }
}
/**
 * Exercises grid_group::barrier_arrive() / barrier_wait().
 *
 * Must be started with a cooperative launch. Each thread with gid < N stores
 * gid + 1 into data[gid]; after the grid barrier, the thread with grid rank 0
 * sums data[0..N) into *result.
 *
 * The store is issued BEFORE barrier_arrive(): a thread that has arrived may
 * still be executing the code between arrive and wait, so a store placed in
 * that window could be missed by the summing thread once it passes barrier_wait().
 */
static __global__ void grid_split_barrier(int *data, int *result, int N) {
  namespace cg = cooperative_groups;
  cg::grid_group grid = cg::this_grid();
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid < N) {
    data[gid] = gid + 1;
  }
  auto tok = grid.barrier_arrive();
  // Independent work overlapping the barrier: no shared data is touched here.
  const bool is_leader = (grid.thread_rank() == 0);
  grid.barrier_wait(std::move(tok));
  if (is_leader) {
    int sum = 0;
    for (int i = 0; i < N; i++)
      sum += data[i];
    *result = sum;
  }
}
/**
 * Cooperatively launches grid_split_barrier over N = 1024 elements and checks
 * that the reported sum equals N * (N + 1) / 2. Does nothing on devices that
 * do not report cooperative launch support.
 */
TEST_CASE("Unit_coop_grids_split_barrier") {
  hipDeviceProp_t prop;
  HIP_CHECK(hipGetDeviceProperties(&prop, 0));
  // Nothing to verify when the device cannot do cooperative launches.
  if (prop.cooperativeLaunch == 0) {
    return;
  }

  int N = 1024;
  const int threads = 128;
  const int blocks = (N + threads - 1) / threads;  // ceil-div

  int *d_data, *d_result;
  HIP_CHECK(hipMalloc(&d_data, N * sizeof(int)));
  HIP_CHECK(hipMalloc(&d_result, sizeof(int)));

  // Kernel arguments are passed by address, in parameter order.
  void *args[] = {&d_data, &d_result, &N};
  dim3 grid_dims(blocks);
  dim3 block_dims(threads);
  HIP_CHECK(hipLaunchCooperativeKernel((void *)grid_split_barrier, grid_dims,
                                       block_dims, args, 0, 0));
  HIP_CHECK(hipDeviceSynchronize());

  int sum = 0;
  HIP_CHECK(hipMemcpy(&sum, d_result, sizeof(int), hipMemcpyDeviceToHost));
  HIP_CHECK(hipFree(d_data));
  HIP_CHECK(hipFree(d_result));
  REQUIRE(sum == ((N * (N + 1)) / 2));
}
-1
파일 보기
@@ -494,7 +494,6 @@ HIP doesn't support the following CUDA functions/operators in ``cooperative_grou
* ``synchronize``
* ``memcpy_async``
* ``wait`` and ``wait_prior``
* ``barrier_arrive`` and ``barrier_wait``
* ``invoke_one`` and ``invoke_one_broadcast``
* ``reduce``
* ``reduce_update_async`` and ``reduce_store_async``