diff --git a/CHANGELOG.md b/CHANGELOG.md index 03e2dcb19b..5db4cbad94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs - `hipLaunchKernelExC` launches a HIP kernel using a generic function pointer and the specified configuration. - `hipDrvLaunchKernelEx` dispatches the device kernel represented by a HIP function object. - `hipMemGetHandleForAddressRange` gets a handle for the address range requested. + - `num_threads` Total number of threads in the group. The legacy API size is alias. * New support for Open Compute Project (OCP) floating-point `FP4`/`FP6`/`FP8` as the following. For details, see [Low precision floating point document](https://rocm.docs.amd.com/projects/HIP/en/latest/reference/low_fp_types.html). - Data types for `FP4`/`FP6`/`FP8`. - HIP APIs for `FP4`/`FP6`/`FP8`, which are compatible with corresponding CUDA APIs. diff --git a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h index 382c3acd0f..41cfcbb861 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -50,7 +50,7 @@ namespace cooperative_groups { class thread_group { protected: __hip_uint32_t _type; //! Type of the thread_group. - __hip_uint32_t _size; //! Total number of threads in the thread_group. + __hip_uint32_t _num_threads; //! Total number of threads in the thread_group. __hip_uint64_t _mask; //! Lanemask for coalesced and tiled partitioned group types, //! LSB represents lane 0, and MSB represents lane 63 @@ -60,23 +60,23 @@ class thread_group { //! (through the API - `this_thread()`), and in all other cases, this thread //! group object is a sub-object of some other derived thread group object. __CG_QUALIFIER__ thread_group(internal::group_type type, - __hip_uint32_t size = static_cast<__hip_uint64_t>(0), + __hip_uint32_t num_threads = static_cast<__hip_uint64_t>(0), __hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) { _type = type; - _size = size; + _num_threads = num_threads; _mask = mask; } struct _tiled_info { bool is_tiled; - unsigned int size; + unsigned int num_threads; unsigned int meta_group_rank; unsigned int meta_group_size; }; struct _coalesced_info { lane_mask member_mask; - unsigned int size; + unsigned int num_threads; struct _tiled_info tiled_info; } coalesced_info; @@ -87,12 +87,14 @@ class thread_group { public: //! Total number of threads in the thread_group, and this serves the purpose - //! for all derived cooperative group types because their `size` is directly + //! for all derived cooperative group types because their `num_threads` is directly //! saved during the construction. - __CG_QUALIFIER__ __hip_uint32_t size() const { return _size; } + __CG_QUALIFIER__ __hip_uint32_t num_threads() const { return _num_threads; } + //! Total number of threads in the group (alias of num_threads()) + __CG_QUALIFIER__ __hip_uint32_t size() const { return num_threads(); } //! Returns the type of the group. __CG_QUALIFIER__ unsigned int cg_type() const { return _type; } - //! Rank of the calling thread within [0, \link size() size() \endlink). + //! Rank of the calling thread within [0, \link num_threads() num_threads() \endlink). __CG_QUALIFIER__ __hip_uint32_t thread_rank() const; //! Returns true if the group has not violated any API constraints. __CG_QUALIFIER__ bool is_valid() const; @@ -174,7 +176,7 @@ class multi_grid_group : public thread_group { * development on Microsoft Windows. */ __CG_QUALIFIER__ multi_grid_group this_multi_grid() { - return multi_grid_group(internal::multi_grid::size()); + return multi_grid_group(internal::multi_grid::num_threads()); } // Doxygen end group CooperativeGConstruct /** @} */ @@ -216,7 +218,7 @@ class grid_group : public thread_group { * \note This function is implemented on Linux and is under development * on Microsoft Windows. */ -__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); } +__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::num_threads()); } /** \brief The workgroup (thread-block in CUDA terminology) cooperative group * type. @@ -247,14 +249,14 @@ class thread_block : public thread_group { __hip_assert(false && "invalid tile size"); } - auto block_size = size(); + auto block_size = num_threads(); auto rank = thread_rank(); auto partitions = (block_size + tile_size - 1) / tile_size; auto tail = (partitions * tile_size) - block_size; auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size); thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size); - tiledGroup.coalesced_info.tiled_info.size = tile_size; + tiledGroup.coalesced_info.tiled_info.num_threads = tile_size; tiledGroup.coalesced_info.tiled_info.is_tiled = true; tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size; tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions; @@ -268,8 +270,10 @@ class thread_block : public thread_group { __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); } //! @copydoc thread_group::thread_rank __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { return internal::workgroup::thread_rank(); } + //! @copydoc thread_group::num_threads + __CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return internal::workgroup::num_threads(); } //! @copydoc thread_group::size - __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return internal::workgroup::size(); } + __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return num_threads(); } //! @copydoc thread_group::is_valid __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); } //! @copydoc thread_group::sync @@ -289,7 +293,7 @@ class thread_block : public thread_group { * on Microsoft Windows. */ __CG_QUALIFIER__ thread_block this_thread_block() { - return thread_block(internal::workgroup::size()); + return thread_block(internal::workgroup::num_threads()); } /** \brief The tiled_group cooperative group type @@ -313,7 +317,7 @@ class tiled_group : public thread_group { __hip_assert(false && "invalid tile size"); } - if (size() <= tile_size) { + if (num_threads() <= tile_size) { return *this; } @@ -325,17 +329,20 @@ class tiled_group : public thread_group { protected: explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize) : thread_group(internal::cg_tiled_group, tileSize) { - coalesced_info.tiled_info.size = tileSize; + coalesced_info.tiled_info.num_threads = tileSize; coalesced_info.tiled_info.is_tiled = true; } public: + //! @copydoc thread_group::num_threads + __CG_QUALIFIER__ unsigned int num_threads() const { return (coalesced_info.tiled_info.num_threads); } + //! @copydoc thread_group::size - __CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); } + __CG_QUALIFIER__ unsigned int size() const { return num_threads(); } //! @copydoc thread_group::thread_rank __CG_QUALIFIER__ unsigned int thread_rank() const { - return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1)); + return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.num_threads - 1)); } //! @copydoc thread_group::sync @@ -374,7 +381,7 @@ class coalesced_group : public thread_group { // prepare a mask for further partitioning it so that it stays coalesced. if (coalesced_info.tiled_info.is_tiled) { unsigned int base_offset = (thread_rank() & (~(tile_size - 1))); - unsigned int masklength = min(static_cast(size()) - base_offset, tile_size); + unsigned int masklength = min(static_cast(num_threads()) - base_offset, tile_size); lane_mask full_mask = (static_cast(warpSize) == 32) ? static_cast((1u << 32) - 1) : static_cast(-1ull); lane_mask member_mask = full_mask >> (warpSize - masklength); @@ -383,7 +390,7 @@ class coalesced_group : public thread_group { coalesced_group coalesced_tile = coalesced_group(member_mask); coalesced_tile.coalesced_info.tiled_info.is_tiled = true; coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size; - coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size; + coalesced_tile.coalesced_info.tiled_info.meta_group_size = num_threads() / tile_size; return coalesced_tile; } // Here the parent coalesced_group is not partitioned. @@ -407,7 +414,7 @@ class coalesced_group : public thread_group { coalesced_group coalesced_tile = coalesced_group(member_mask); coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size; coalesced_tile.coalesced_info.tiled_info.meta_group_size = - (size() + tile_size - 1) / tile_size; + (num_threads() + tile_size - 1) / tile_size; return coalesced_tile; } return coalesced_group(0); @@ -418,16 +425,21 @@ class coalesced_group : public thread_group { explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask) : thread_group(internal::cg_coalesced_group) { coalesced_info.member_mask = member_mask; // Which threads are active - coalesced_info.size = __popcll(coalesced_info.member_mask); // How many threads are active + coalesced_info.num_threads = __popcll(coalesced_info.member_mask); // How many threads are active coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group coalesced_info.tiled_info.meta_group_rank = 0; coalesced_info.tiled_info.meta_group_size = 1; } public: + //! @copydoc thread_group::num_threads + __CG_QUALIFIER__ unsigned int num_threads() const { + return coalesced_info.num_threads; + } + //! @copydoc thread_group::size __CG_QUALIFIER__ unsigned int size() const { - return coalesced_info.size; + return num_threads(); } //! @copydoc thread_group::thread_rank @@ -466,9 +478,9 @@ class coalesced_group : public thread_group { template __CG_QUALIFIER__ T shfl(T var, int srcRank) const { - srcRank = srcRank % static_cast(size()); + srcRank = srcRank % static_cast(num_threads()); - int lane = (size() == warpSize) ? srcRank + int lane = (num_threads() == warpSize) ? srcRank : (static_cast(warpSize) == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1)) : __fns32(coalesced_info.member_mask, 0, (srcRank + 1)); @@ -496,7 +508,7 @@ class coalesced_group : public thread_group { // and WARP_SIZE as the shift value rather than lane_delta itself. // This is not described in the documentation and is not done here. - if (size() == warpSize) { + if (num_threads() == warpSize) { return __shfl_down(var, lane_delta, warpSize); } @@ -536,7 +548,7 @@ class coalesced_group : public thread_group { // and WARP_SIZE as the shift value rather than lane_delta itself. // This is not described in the documentation and is not done here. - if (size() == warpSize) { + if (num_threads() == warpSize) { return __shfl_up(var, lane_delta, warpSize); } @@ -747,11 +759,11 @@ __CG_QUALIFIER__ void thread_group::sync() const { * cooperative group type APIs. This function is implemented on Linux * and is under development on Microsoft Windows. */ -template __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.size(); } +template __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.num_threads(); } /** \brief Returns the rank of thread of the group. * - * \details Rank of the calling thread within [0, \link size() size() \endlink). + * \details Rank of the calling thread within [0, \link num_threads() num_threads() \endlink). * * \tparam CGTy The cooperative group class template parameter. * \param g [in] The cooperative group for rank returns. @@ -805,7 +817,11 @@ template class tile_base { } //! Number of threads within this tile - __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; } + __CG_STATIC_QUALIFIER__ unsigned int num_threads() { return numThreads; } + + //! Legacy member functions + //! Number of threads within this tile (alias of num_threads()) + __CG_STATIC_QUALIFIER__ unsigned int size() { return num_threads(); } }; /** @@ -888,7 +904,7 @@ public: //! Returns the number of groups created when the parent group was partitioned. __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() { - return (ParentCGTy::size() + tileSize - 1) / tileSize; + return (ParentCGTy::num_threads() + tileSize - 1) / tileSize; } }; @@ -906,10 +922,11 @@ class thread_block_tile_type : public thread_block_tile_base, typedef thread_block_tile_base tbtBase; protected: __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) { - coalesced_info.tiled_info.size = numThreads; + coalesced_info.tiled_info.num_threads = numThreads; coalesced_info.tiled_info.is_tiled = true; } public: + using tbtBase::num_threads; using tbtBase::size; using tbtBase::sync; using tbtBase::thread_rank; @@ -928,13 +945,14 @@ class thread_block_tile_type : public thread_block_tile_base(__ockl_multi_grid_grid_rank()); } -__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); } +__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); } __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_thread_rank()); } @@ -155,7 +155,7 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); } */ namespace grid { -__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { +__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return static_cast<__hip_uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) * (blockDim.x * gridDim.x)); } @@ -200,7 +200,7 @@ __CG_STATIC_QUALIFIER__ dim3 thread_index() { static_cast<__hip_uint32_t>(threadIdx.z))); } -__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { +__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return (static_cast<__hip_uint32_t>(blockDim.x * blockDim.y * blockDim.z)); }