Fix grid_group::group_dim to return grid_dim and not block_dim (#823)

* Fix grid_group::group_dim to return grid_dim and not block_dim

* Add unit test for grid_group.group_dim()

* Fix unit test errors

* Skip group_dim() assertions for base_type test
This commit is contained in:
harkgill-amd
2025-09-15 09:42:55 -04:00
committed by GitHub
parent 2f7e9591be
commit d1b2b5ed44
3 changed files with 39 additions and 7 deletions
@@ -207,7 +207,7 @@ class grid_group : public thread_group {
__CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
//! @copydoc thread_group::sync
__CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
__CG_QUALIFIER__ dim3 group_dim() const { return internal::workgroup::block_dim(); }
__CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
};
/** \ingroup CooperativeGConstruct
@@ -186,6 +186,11 @@ __CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_grid_i
__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
__CG_STATIC_QUALIFIER__ dim3 grid_dim() {
return (dim3(static_cast<__hip_uint32_t>(gridDim.x), static_cast<__hip_uint32_t>(gridDim.y),
static_cast<__hip_uint32_t>(gridDim.z)));
}
} // namespace grid
/**
@@ -26,10 +26,13 @@ THE SOFTWARE.
namespace cg = cooperative_groups;
enum class GridTypeTests { gridGroupType, baseType, publicApi };
static __device__ int gm[2];
static __global__ void kernel_cg_grid_group_type(int* size_dev, int* thd_rank_dev,
int* is_valid_dev, int* sync_dev) {
int* is_valid_dev, int* sync_dev,
dim3* group_dim_dev) {
cg::grid_group gg = cg::this_grid();
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -49,6 +52,9 @@ static __global__ void kernel_cg_grid_group_type(int* size_dev, int* thd_rank_de
gm[1] = 20;
gg.sync();
sync_dev[gIdx] = gm[1] * gm[0];
// Test group_dim aka number of thread blocks in a grid
group_dim_dev[gIdx] = gg.group_dim();
}
static __global__ void kernel_cg_grid_group_type_via_base_type(int* size_dev, int* thd_rank_dev,
@@ -80,7 +86,8 @@ static __global__ void kernel_cg_grid_group_type_via_base_type(int* size_dev, in
}
static __global__ void kernel_cg_grid_group_type_via_public_api(int* size_dev, int* thd_rank_dev,
int* is_valid_dev, int* sync_dev) {
int* is_valid_dev, int* sync_dev,
dim3* group_dim_dev) {
cg::grid_group gg = cg::this_grid();
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -100,6 +107,9 @@ static __global__ void kernel_cg_grid_group_type_via_public_api(int* size_dev, i
gm[1] = 20;
cg::sync(gg);
sync_dev[gIdx] = gm[1] * gm[0];
// Test group_dim aka number of thread blocks in a grid
group_dim_dev[gIdx] = gg.group_dim();
}
static __global__ void coop_kernel(unsigned int* first_array, unsigned int* second_array,
@@ -246,31 +256,36 @@ static void verify_barrier_buffer(unsigned int loops, unsigned int warps,
}
}
template <typename F> static void test_cg_grid_group_type(F kernel_func, int block_size) {
template <typename F> static void test_cg_grid_group_type(F kernel_func, int block_size, GridTypeTests kernel_type) {
int num_bytes = sizeof(int) * 2 * block_size;
int num_dim3_bytes = sizeof(dim3) * 2 * block_size;
int *size_dev, *size_host;
int *thd_rank_dev, *thd_rank_host;
int *is_valid_dev, *is_valid_host;
int *sync_dev, *sync_host;
dim3 *group_dim_dev, *group_dim_host;
// Allocate device memory
HIP_CHECK(hipMalloc(&size_dev, num_bytes));
HIP_CHECK(hipMalloc(&thd_rank_dev, num_bytes));
HIP_CHECK(hipMalloc(&is_valid_dev, num_bytes));
HIP_CHECK(hipMalloc(&sync_dev, num_bytes));
HIP_CHECK(hipMalloc(&group_dim_dev, num_dim3_bytes));
// Allocate host memory
HIP_CHECK(hipHostMalloc(&size_host, num_bytes));
HIP_CHECK(hipHostMalloc(&thd_rank_host, num_bytes));
HIP_CHECK(hipHostMalloc(&is_valid_host, num_bytes));
HIP_CHECK(hipHostMalloc(&sync_host, num_bytes));
HIP_CHECK(hipHostMalloc(&group_dim_host, num_dim3_bytes));
// Launch Kernel
void* params[4];
void* params[5];
params[0] = &size_dev;
params[1] = &thd_rank_dev;
params[2] = &is_valid_dev;
params[3] = &sync_dev;
params[4] = &group_dim_dev;
HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, 2, block_size, params, 0, 0));
// Copy result from device to host
@@ -278,6 +293,7 @@ template <typename F> static void test_cg_grid_group_type(F kernel_func, int blo
HIP_CHECK(hipMemcpy(thd_rank_host, thd_rank_dev, num_bytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(is_valid_host, is_valid_dev, num_bytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(sync_host, sync_dev, num_bytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(group_dim_host, group_dim_dev, num_dim3_bytes, hipMemcpyDeviceToHost));
// Validate results for both blocks together
for (int i = 0; i < 2 * block_size; ++i) {
@@ -285,6 +301,11 @@ template <typename F> static void test_cg_grid_group_type(F kernel_func, int blo
ASSERT_EQUAL(thd_rank_host[i], i);
ASSERT_EQUAL(is_valid_host[i], 1);
ASSERT_EQUAL(sync_host[i], 200);
if(kernel_type != GridTypeTests::baseType){
ASSERT_EQUAL(group_dim_host[i].x, 2);
ASSERT_EQUAL(group_dim_host[i].y, 1);
ASSERT_EQUAL(group_dim_host[i].z, 1);
}
}
// Free device memory
@@ -292,12 +313,14 @@ template <typename F> static void test_cg_grid_group_type(F kernel_func, int blo
HIP_CHECK(hipFree(thd_rank_dev));
HIP_CHECK(hipFree(is_valid_dev));
HIP_CHECK(hipFree(sync_dev));
HIP_CHECK(hipFree(group_dim_dev));
// Free host memory
HIP_CHECK(hipHostFree(size_host));
HIP_CHECK(hipHostFree(thd_rank_host));
HIP_CHECK(hipHostFree(is_valid_host));
HIP_CHECK(hipHostFree(sync_host));
HIP_CHECK(hipHostFree(group_dim_host));
}
TEST_CASE("Unit_hipCGGridGroupType_Basic") {
@@ -313,31 +336,35 @@ TEST_CASE("Unit_hipCGGridGroupType_Basic") {
}
void* kernel_func;
GridTypeTests kernel_type = GridTypeTests::gridGroupType;
SECTION("Default grid group API test") {
kernel_func = reinterpret_cast<void*>(kernel_cg_grid_group_type);
kernel_type = GridTypeTests::gridGroupType;
}
#if HT_AMD
SECTION("Base type grid group API test") {
kernel_func = reinterpret_cast<void*>(kernel_cg_grid_group_type_via_base_type);
kernel_type = GridTypeTests::baseType;
}
#endif
SECTION("Public API grid group test") {
kernel_func = reinterpret_cast<void*>(kernel_cg_grid_group_type_via_public_api);
kernel_type = GridTypeTests::publicApi;
}
// Test for block_size in powers of 2
int max_threads_per_blk = device_properties.maxThreadsPerBlock;
for (int block_size = 2; block_size <= max_threads_per_blk; block_size = block_size * 2) {
test_cg_grid_group_type(kernel_func, block_size);
test_cg_grid_group_type(kernel_func, block_size, kernel_type);
}
// Test for random blockSizes, but the sequence is the same every execution
srand(0);
for (int i = 0; i < 10; i++) {
// Test fails for only 1 thread per block
test_cg_grid_group_type(kernel_func, max(2, rand() % max_threads_per_blk));
test_cg_grid_group_type(kernel_func, max(2, rand() % max_threads_per_blk), kernel_type);
}
}