diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h index e0b40d64dd..d499a93b47 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -207,7 +207,7 @@ class grid_group : public thread_group { __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); } //! @copydoc thread_group::sync __CG_QUALIFIER__ void sync() const { internal::grid::sync(); } - __CG_QUALIFIER__ dim3 group_dim() const { return internal::workgroup::block_dim(); } + __CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); } }; /** \ingroup CooperativeGConstruct diff --git a/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h b/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h index eb695e3a70..1417ee6686 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h +++ b/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h @@ -186,6 +186,11 @@ __CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast(__ockl_grid_i __CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); } +__CG_STATIC_QUALIFIER__ dim3 grid_dim() { + return (dim3(static_cast<__hip_uint32_t>(gridDim.x), static_cast<__hip_uint32_t>(gridDim.y), + static_cast<__hip_uint32_t>(gridDim.z))); +} + } // namespace grid /** diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGGridGroupType_old.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGGridGroupType_old.cc index 2b70b6719f..026b087a3a 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGGridGroupType_old.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGGridGroupType_old.cc @@ -26,10 +26,13 @@ THE SOFTWARE. namespace cg = cooperative_groups; +enum class GridTypeTests { gridGroupType, baseType, publicApi }; + static __device__ int gm[2]; static __global__ void kernel_cg_grid_group_type(int* size_dev, int* thd_rank_dev, - int* is_valid_dev, int* sync_dev) { + int* is_valid_dev, int* sync_dev, + dim3* group_dim_dev) { cg::grid_group gg = cg::this_grid(); int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -49,6 +52,9 @@ static __global__ void kernel_cg_grid_group_type(int* size_dev, int* thd_rank_de gm[1] = 20; gg.sync(); sync_dev[gIdx] = gm[1] * gm[0]; + + // Test group_dim aka number of thread blocks in a grid + group_dim_dev[gIdx] = gg.group_dim(); } static __global__ void kernel_cg_grid_group_type_via_base_type(int* size_dev, int* thd_rank_dev, @@ -80,7 +86,8 @@ static __global__ void kernel_cg_grid_group_type_via_base_type(int* size_dev, in } static __global__ void kernel_cg_grid_group_type_via_public_api(int* size_dev, int* thd_rank_dev, - int* is_valid_dev, int* sync_dev) { + int* is_valid_dev, int* sync_dev, + dim3* group_dim_dev) { cg::grid_group gg = cg::this_grid(); int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -100,6 +107,9 @@ static __global__ void kernel_cg_grid_group_type_via_public_api(int* size_dev, i gm[1] = 20; cg::sync(gg); sync_dev[gIdx] = gm[1] * gm[0]; + + // Test group_dim aka number of thread blocks in a grid + group_dim_dev[gIdx] = gg.group_dim(); } static __global__ void coop_kernel(unsigned int* first_array, unsigned int* second_array, @@ -246,31 +256,36 @@ static void verify_barrier_buffer(unsigned int loops, unsigned int warps, } } -template static void test_cg_grid_group_type(F kernel_func, int block_size) { +template static void test_cg_grid_group_type(F kernel_func, int block_size, GridTypeTests kernel_type) { int num_bytes = sizeof(int) * 2 * block_size; + int num_dim3_bytes = sizeof(dim3) * 2 * block_size; int *size_dev, *size_host; int *thd_rank_dev, *thd_rank_host; int *is_valid_dev, *is_valid_host; int *sync_dev, *sync_host; + dim3 *group_dim_dev, *group_dim_host; // Allocate device memory HIP_CHECK(hipMalloc(&size_dev, num_bytes)); HIP_CHECK(hipMalloc(&thd_rank_dev, num_bytes)); HIP_CHECK(hipMalloc(&is_valid_dev, num_bytes)); HIP_CHECK(hipMalloc(&sync_dev, num_bytes)); + HIP_CHECK(hipMalloc(&group_dim_dev, num_dim3_bytes)); // Allocate host memory HIP_CHECK(hipHostMalloc(&size_host, num_bytes)); HIP_CHECK(hipHostMalloc(&thd_rank_host, num_bytes)); HIP_CHECK(hipHostMalloc(&is_valid_host, num_bytes)); HIP_CHECK(hipHostMalloc(&sync_host, num_bytes)); + HIP_CHECK(hipHostMalloc(&group_dim_host, num_dim3_bytes)); // Launch Kernel - void* params[4]; + void* params[5]; params[0] = &size_dev; params[1] = &thd_rank_dev; params[2] = &is_valid_dev; params[3] = &sync_dev; + params[4] = &group_dim_dev; HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, 2, block_size, params, 0, 0)); // Copy result from device to host @@ -278,6 +293,7 @@ template static void test_cg_grid_group_type(F kernel_func, int blo HIP_CHECK(hipMemcpy(thd_rank_host, thd_rank_dev, num_bytes, hipMemcpyDeviceToHost)); HIP_CHECK(hipMemcpy(is_valid_host, is_valid_dev, num_bytes, hipMemcpyDeviceToHost)); HIP_CHECK(hipMemcpy(sync_host, sync_dev, num_bytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(group_dim_host, group_dim_dev, num_dim3_bytes, hipMemcpyDeviceToHost)); // Validate results for both blocks together for (int i = 0; i < 2 * block_size; ++i) { @@ -285,6 +301,11 @@ template static void test_cg_grid_group_type(F kernel_func, int blo ASSERT_EQUAL(thd_rank_host[i], i); ASSERT_EQUAL(is_valid_host[i], 1); ASSERT_EQUAL(sync_host[i], 200); + if(kernel_type != GridTypeTests::baseType){ + ASSERT_EQUAL(group_dim_host[i].x, 2); + ASSERT_EQUAL(group_dim_host[i].y, 1); + ASSERT_EQUAL(group_dim_host[i].z, 1); + } } // Free device memory @@ -292,12 +313,14 @@ template static void test_cg_grid_group_type(F kernel_func, int blo HIP_CHECK(hipFree(thd_rank_dev)); HIP_CHECK(hipFree(is_valid_dev)); HIP_CHECK(hipFree(sync_dev)); + HIP_CHECK(hipFree(group_dim_dev)); // Free host memory HIP_CHECK(hipHostFree(size_host)); HIP_CHECK(hipHostFree(thd_rank_host)); HIP_CHECK(hipHostFree(is_valid_host)); HIP_CHECK(hipHostFree(sync_host)); + HIP_CHECK(hipHostFree(group_dim_host)); } TEST_CASE("Unit_hipCGGridGroupType_Basic") { @@ -313,31 +336,35 @@ TEST_CASE("Unit_hipCGGridGroupType_Basic") { } void* kernel_func; + GridTypeTests kernel_type = GridTypeTests::gridGroupType; SECTION("Default grid group API test") { kernel_func = reinterpret_cast(kernel_cg_grid_group_type); + kernel_type = GridTypeTests::gridGroupType; } #if HT_AMD SECTION("Base type grid group API test") { kernel_func = reinterpret_cast(kernel_cg_grid_group_type_via_base_type); + kernel_type = GridTypeTests::baseType; } #endif SECTION("Public API grid group test") { kernel_func = reinterpret_cast(kernel_cg_grid_group_type_via_public_api); + kernel_type = GridTypeTests::publicApi; } // Test for block_size in powers of 2 int max_threads_per_blk = device_properties.maxThreadsPerBlock; for (int block_size = 2; block_size <= max_threads_per_blk; block_size = block_size * 2) { - test_cg_grid_group_type(kernel_func, block_size); + test_cg_grid_group_type(kernel_func, block_size, kernel_type); } // Test for random blockSizes, but the sequence is the same every execution srand(0); for (int i = 0; i < 10; i++) { // Test fails for only 1 thread per block - test_cg_grid_group_type(kernel_func, max(2, rand() % max_threads_per_blk)); + test_cg_grid_group_type(kernel_func, max(2, rand() % max_threads_per_blk), kernel_type); } }