diff --git a/projects/clr/CHANGELOG.md b/projects/clr/CHANGELOG.md index 7a44af016c..292125904d 100644 --- a/projects/clr/CHANGELOG.md +++ b/projects/clr/CHANGELOG.md @@ -9,6 +9,9 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs * New HIP APIs - `hipKernelGetParamInfo` returns the offset and size of a kernel parameter +* New HIP supports + - `grid_group::block_rank()` returns the rank of the block in the calling thread + ## HIP 7.2 for ROCm 7.2 ### Added diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h index 322cab668b..62cb66fb90 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -96,6 +96,8 @@ class thread_group { __CG_QUALIFIER__ unsigned int cg_type() const { return _type; } //! Rank of the calling thread within [0, \link num_threads() num_threads() \endlink). __CG_QUALIFIER__ __hip_uint32_t thread_rank() const; + //! Rank of the block in calling thread within [0, \link num_threads() num_threads() \endlink). + __CG_QUALIFIER__ __hip_uint32_t block_rank() const; //! Returns true if the group has not violated any API constraints. __CG_QUALIFIER__ bool is_valid() const; @@ -203,6 +205,8 @@ class grid_group : public thread_group { public: //! @copydoc thread_group::thread_rank __CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::grid::thread_rank(); } + //! @copydoc thread_group::block_rank + __CG_QUALIFIER__ __hip_uint32_t block_rank() const { return internal::grid::block_rank(); } //! @copydoc thread_group::is_valid __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); } //! @copydoc thread_group::sync @@ -275,6 +279,10 @@ class thread_block : public thread_group { __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { return internal::workgroup::thread_rank(); } + //! @copydoc thread_group::block_rank + __CG_STATIC_QUALIFIER__ __hip_uint32_t block_rank() { + return internal::workgroup::block_rank(); + } //! @copydoc thread_group::num_threads __CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return internal::workgroup::num_threads(); @@ -353,7 +361,6 @@ class tiled_group : public thread_group { __CG_QUALIFIER__ unsigned int thread_rank() const { return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.num_threads - 1)); } - //! @copydoc thread_group::sync __CG_QUALIFIER__ void sync() const { internal::tiled_group::sync(); } }; diff --git a/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h b/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h index 1417ee6686..d9671eb342 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h +++ b/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h @@ -182,6 +182,11 @@ __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { return (num_threads_till_current_workgroup + local_thread_rank); } +__CG_STATIC_QUALIFIER__ __hip_uint32_t block_rank() { + return static_cast<__hip_uint32_t>((blockIdx.z * gridDim.y * gridDim.x) + + (blockIdx.y * gridDim.x) + (blockIdx.x)); +} + __CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast(__ockl_grid_is_valid()); } __CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); } @@ -219,6 +224,11 @@ __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { (threadIdx.y * blockDim.x) + (threadIdx.x))); } +__CG_STATIC_QUALIFIER__ __hip_uint32_t block_rank() { + return (static_cast<__hip_uint32_t>((blockIdx.z * gridDim.x * gridDim.y) + + (blockIdx.y * gridDim.x) + (blockIdx.x))); +} + __CG_STATIC_QUALIFIER__ bool is_valid() { return true; } __CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); } diff --git a/projects/hip-tests/catch/include/cpu_grid.h b/projects/hip-tests/catch/include/cpu_grid.h index 1c48ef921c..8f2ba6f05b 100644 --- a/projects/hip-tests/catch/include/cpu_grid.h +++ b/projects/hip-tests/catch/include/cpu_grid.h @@ -43,6 +43,15 @@ struct CPUGrid { return thread_rank_in_grid % threads_in_block_count_; } + inline std::optional block_rank_in_grid( + const unsigned int thread_rank_in_grid) const { + if (thread_rank_in_grid > thread_count_) { + return std::nullopt; + } + + return thread_rank_in_grid / threads_in_block_count_; + } + inline std::optional block_idx(const unsigned int thread_rank_in_grid) const { if (thread_rank_in_grid > thread_count_) { return std::nullopt; diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/grid_group.cc b/projects/hip-tests/catch/unit/cooperativeGrps/grid_group.cc index 67930c0082..589a728143 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/grid_group.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/grid_group.cc @@ -39,6 +39,10 @@ static __global__ void grid_group_thread_rank_getter(unsigned int* thread_ranks) thread_ranks[thread_rank_in_grid()] = cg::this_grid().thread_rank(); } +static __global__ void grid_group_block_rank_getter(unsigned int* block_ranks) { + block_ranks[thread_rank_in_grid()] = cg::this_grid().block_rank(); +} + static __global__ void grid_group_is_valid_getter(unsigned int* is_valid_flags) { is_valid_flags[thread_rank_in_grid()] = cg::this_grid().is_valid(); } @@ -160,9 +164,18 @@ TEST_CASE("Unit_Grid_Group_Getters_Positive_Basic") { HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipLaunchCooperativeKernel(grid_group_block_rank_getter, blocks, threads, params, 0, 0)); // Verify grid_group.is_valid() values ArrayAllOf(uint_arr.ptr(), grid.thread_count_, [](uint32_t) { return 1; }); + + HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), + grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + + // Verify grid_group.block_rank() values + ArrayAllOf(uint_arr.ptr(), grid.thread_count_, [threads](uint32_t i) { + return i/(threads.x * threads.y * threads.z); }); } /** diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/thread_block.cc b/projects/hip-tests/catch/unit/cooperativeGrps/thread_block.cc index 500473f7b6..93233060ae 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/thread_block.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/thread_block.cc @@ -49,6 +49,12 @@ static __global__ void thread_block_thread_rank_getter(unsigned int* thread_rank thread_ranks[thread_rank_in_grid()] = group.thread_rank(); } +template +static __global__ void thread_block_block_rank_getter(unsigned int* block_ranks) { + const BaseType group = cg::this_thread_block(); + block_ranks[thread_rank_in_grid()] = group.block_rank(); +} + static __global__ void thread_block_group_indices_getter(dim3* group_indices) { group_indices[thread_rank_in_grid()] = cg::this_thread_block().group_index(); } @@ -111,10 +117,20 @@ TEST_CASE("Unit_Thread_Block_Getters_Positive_Basic") { HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); + thread_block_block_rank_getter<<>>(uint_arr_dev.ptr()); + HIP_CHECK(hipGetLastError()); // Validate thread_block.thread_rank() values ArrayAllOf(uint_arr.ptr(), grid.thread_count_, [&grid](uint32_t i) { return grid.thread_rank_in_block(i).value(); }); + + HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), + grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + + // Validate thread_block.block_rank() values + ArrayAllOf(uint_arr.ptr(), grid.thread_count_, + [&grid](uint32_t i) { return grid.block_rank_in_grid(i).value(); }); } {