SWDEV-493792 - add split barriers for grid_group (#508)
* SWDEV-493792 - add split barriers for grid_group * add tests * Update change log * Add Navi4 split barrier * Update docs * Use new Catch2 Approx macro * Update split_barrier.cc to check for coop groups --------- Co-authored-by: Jatin Chaudhary <jatchaud@amd.com> Co-authored-by: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com>
이 커밋은 다음에 포함됨:
@@ -8,6 +8,7 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
|
||||
|
||||
* New HIP APIs
|
||||
- `hipKernelGetParamInfo` returns the offset and size of a kernel parameter
|
||||
* Support for `barrier_arrive` and `barrier_wait` for `grid_group` and `thread_block`.
|
||||
|
||||
* New HIP supports
|
||||
- `grid_group::block_rank()` returns the rank of the block in the calling thread
|
||||
|
||||
@@ -212,6 +212,19 @@ class grid_group : public thread_group {
|
||||
//! @copydoc thread_group::sync
|
||||
__CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
|
||||
__CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
|
||||
struct arrival_token {
|
||||
unsigned int signal;
|
||||
};
|
||||
//! Arrive at a barrier
|
||||
__CG_QUALIFIER__ arrival_token barrier_arrive() const {
|
||||
arrival_token t;
|
||||
t.signal = internal::grid::barrier_signal();
|
||||
return t;
|
||||
}
|
||||
//! Arrive at a barrier
|
||||
__CG_QUALIFIER__ void barrier_wait(arrival_token&& t) const {
|
||||
internal::grid::barrier_wait(t.signal);
|
||||
}
|
||||
};
|
||||
|
||||
/** \ingroup CooperativeGConstruct
|
||||
@@ -295,6 +308,14 @@ class thread_block : public thread_group {
|
||||
__CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
|
||||
//! Returns the group dimensions.
|
||||
__CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
|
||||
struct arrival_token {};
|
||||
//! Arrive at a barrier
|
||||
__CG_QUALIFIER__ arrival_token barrier_arrive() const {
|
||||
internal::workgroup::barrier_arrive();
|
||||
return arrival_token{};
|
||||
}
|
||||
//! Arrive at a barrier
|
||||
__CG_QUALIFIER__ void barrier_wait(arrival_token&&) const { internal::workgroup::barrier_wait(); }
|
||||
};
|
||||
|
||||
/** \ingroup CooperativeGConstruct
|
||||
|
||||
@@ -97,6 +97,8 @@ extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
|
||||
extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
|
||||
extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_grid_bar_arrive(void);
|
||||
extern "C" __device__ __attribute__((convergent)) void __ockl_grid_bar_wait(uint);
|
||||
|
||||
extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
|
||||
|
||||
|
||||
@@ -196,6 +196,9 @@ __CG_STATIC_QUALIFIER__ dim3 grid_dim() {
|
||||
static_cast<__hip_uint32_t>(gridDim.z)));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ unsigned int barrier_signal() { return __ockl_grid_bar_arrive(); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void barrier_wait(unsigned int s) { __ockl_grid_bar_wait(s); }
|
||||
} // namespace grid
|
||||
|
||||
/**
|
||||
@@ -238,6 +241,23 @@ __CG_STATIC_QUALIFIER__ dim3 block_dim() {
|
||||
static_cast<__hip_uint32_t>(blockDim.z)));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void barrier_arrive() {
|
||||
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
|
||||
#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \
|
||||
__has_builtin(__builtin_amdgcn_s_barrier_wait)
|
||||
__builtin_amdgcn_s_barrier_signal(-1);
|
||||
#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void barrier_wait() {
|
||||
#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \
|
||||
__has_builtin(__builtin_amdgcn_s_barrier_wait)
|
||||
__builtin_amdgcn_s_barrier_wait(-1);
|
||||
#else
|
||||
__builtin_amdgcn_s_barrier();
|
||||
#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait
|
||||
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
|
||||
}
|
||||
} // namespace workgroup
|
||||
|
||||
namespace tiled_group {
|
||||
|
||||
@@ -20,6 +20,7 @@ set(TEST_SRC
|
||||
binary_partition.cc
|
||||
cg_ballot.cc
|
||||
cg_any_all.cc
|
||||
split_barrier.cc
|
||||
)
|
||||
|
||||
if(HIP_PLATFORM STREQUAL "nvidia")
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
static __global__ void wg_split_barrier(float *out, float *in) {
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
__shared__ float mid[32];
|
||||
size_t i = threadIdx.x;
|
||||
auto tb = cg::this_thread_block();
|
||||
|
||||
out[i] = in[i] * 2.0f;
|
||||
|
||||
auto tok = tb.barrier_arrive();
|
||||
|
||||
// use tid 0 to populate shared mem
|
||||
if (i == 0) {
|
||||
for (size_t j = 0; j < 32; j++) {
|
||||
mid[j] = in[j];
|
||||
}
|
||||
}
|
||||
|
||||
tb.barrier_wait(std::move(tok));
|
||||
|
||||
out[i] += mid[i];
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_coop_thread_block_split_barrier") {
|
||||
constexpr size_t size = 32;
|
||||
float *d_out, *d_in;
|
||||
|
||||
HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size));
|
||||
HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));
|
||||
|
||||
std::vector<float> in(size, 0.0f), out = in;
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
in[i] = i + 1;
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemset(d_out, 0, sizeof(float) * size));
|
||||
HIP_CHECK(
|
||||
hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice));
|
||||
wg_split_barrier<<<1, size>>>(d_out, d_in);
|
||||
HIP_CHECK(hipMemcpy(out.data(), d_out, sizeof(float) * size,
|
||||
hipMemcpyDeviceToHost));
|
||||
|
||||
HIP_CHECK(hipFree(d_out));
|
||||
HIP_CHECK(hipFree(d_in));
|
||||
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
INFO("Index: " << i << " in: " << in[i] << " out: " << out[i]);
|
||||
REQUIRE((in[i] * 3.0f) == Catch::Approx(out[i]));
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void grid_split_barrier(int *data, int *result, int N) {
|
||||
namespace cg = cooperative_groups;
|
||||
cg::grid_group grid = cg::this_grid();
|
||||
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
auto tok = grid.barrier_arrive();
|
||||
if (gid < N) {
|
||||
data[gid] = gid + 1;
|
||||
}
|
||||
|
||||
grid.barrier_wait(std::move(tok));
|
||||
|
||||
if (grid.thread_rank() == 0) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < N; i++)
|
||||
sum += data[i];
|
||||
*result = sum;
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_coop_grids_split_barrier") {
|
||||
hipDeviceProp_t prop;
|
||||
HIP_CHECK(hipGetDeviceProperties(&prop, 0));
|
||||
|
||||
if (prop.cooperativeLaunch != 0) {
|
||||
int N = 1024;
|
||||
const int threads = 128;
|
||||
const int blocks = (N + threads - 1) / threads;
|
||||
|
||||
int *d_in, *d_out;
|
||||
HIP_CHECK(hipMalloc(&d_in, N * sizeof(int)));
|
||||
HIP_CHECK(hipMalloc(&d_out, sizeof(int)));
|
||||
|
||||
void *args[] = {&d_in, &d_out, &N};
|
||||
|
||||
dim3 grid(blocks);
|
||||
dim3 block(threads);
|
||||
|
||||
HIP_CHECK(hipLaunchCooperativeKernel((void *)grid_split_barrier, grid,
|
||||
block, args, 0, 0));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
int out = 0;
|
||||
HIP_CHECK(hipMemcpy(&out, d_out, sizeof(int), hipMemcpyDeviceToHost));
|
||||
|
||||
HIP_CHECK(hipFree(d_in));
|
||||
HIP_CHECK(hipFree(d_out));
|
||||
REQUIRE(out == ((N * (N + 1)) / 2));
|
||||
}
|
||||
}
|
||||
@@ -494,7 +494,6 @@ HIP doesn't support the following CUDA functions/operators in ``cooperative_grou
|
||||
* ``synchronize``
|
||||
* ``memcpy_async``
|
||||
* ``wait`` and ``wait_prior``
|
||||
* ``barrier_arrive`` and ``barrier_wait``
|
||||
* ``invoke_one`` and ``invoke_one_broadcast``
|
||||
* ``reduce``
|
||||
* ``reduce_update_async`` and ``reduce_store_async``
|
||||
|
||||
새 이슈에서 참조
사용자 차단