SWDEV-533232 Add num_threads API in cooperative_groups (#437)

* Add num_threads API in cooperative_groups
* Reimplementsize API as its alias.
* Update changelog
Этот коммит содержится в:
Sicarov, Dragoslav
2025-06-26 11:35:17 +02:00
коммит произвёл GitHub
родитель 349b83c1f6
Коммит a4631b5700
3 изменённых файлов: 55 добавлений и 36 удалений
+1
Просмотреть файл
@@ -11,6 +11,7 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
- `hipLaunchKernelExC` launches a HIP kernel using a generic function pointer and the specified configuration.
- `hipDrvLaunchKernelEx` dispatches the device kernel represented by a HIP function object.
- `hipMemGetHandleForAddressRange` gets a handle for the address range requested.
- `num_threads` Total number of threads in the group. The legacy API size is alias.
* New support for Open Compute Project (OCP) floating-point `FP4`/`FP6`/`FP8` as the following. For details, see [Low precision floating point document](https://rocm.docs.amd.com/projects/HIP/en/latest/reference/low_fp_types.html).
- Data types for `FP4`/`FP6`/`FP8`.
- HIP APIs for `FP4`/`FP6`/`FP8`, which are compatible with corresponding CUDA APIs.
+51 -33
Просмотреть файл
@@ -50,7 +50,7 @@ namespace cooperative_groups {
class thread_group {
protected:
__hip_uint32_t _type; //! Type of the thread_group.
__hip_uint32_t _size; //! Total number of threads in the thread_group.
__hip_uint32_t _num_threads; //! Total number of threads in the thread_group.
__hip_uint64_t _mask; //! Lanemask for coalesced and tiled partitioned group types,
//! LSB represents lane 0, and MSB represents lane 63
@@ -60,23 +60,23 @@ class thread_group {
//! (through the API - `this_thread()`), and in all other cases, this thread
//! group object is a sub-object of some other derived thread group object.
__CG_QUALIFIER__ thread_group(internal::group_type type,
__hip_uint32_t size = static_cast<__hip_uint64_t>(0),
__hip_uint32_t num_threads = static_cast<__hip_uint64_t>(0),
__hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) {
_type = type;
_size = size;
_num_threads = num_threads;
_mask = mask;
}
struct _tiled_info {
bool is_tiled;
unsigned int size;
unsigned int num_threads;
unsigned int meta_group_rank;
unsigned int meta_group_size;
};
struct _coalesced_info {
lane_mask member_mask;
unsigned int size;
unsigned int num_threads;
struct _tiled_info tiled_info;
} coalesced_info;
@@ -87,12 +87,14 @@ class thread_group {
public:
//! Total number of threads in the thread_group, and this serves the purpose
//! for all derived cooperative group types because their `size` is directly
//! for all derived cooperative group types because their `num_threads` is directly
//! saved during the construction.
__CG_QUALIFIER__ __hip_uint32_t size() const { return _size; }
__CG_QUALIFIER__ __hip_uint32_t num_threads() const { return _num_threads; }
//! Total number of threads in the group (alias of num_threads())
__CG_QUALIFIER__ __hip_uint32_t size() const { return num_threads(); }
//! Returns the type of the group.
__CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
//! Rank of the calling thread within [0, \link size() size() \endlink).
//! Rank of the calling thread within [0, \link num_threads() num_threads() \endlink).
__CG_QUALIFIER__ __hip_uint32_t thread_rank() const;
//! Returns true if the group has not violated any API constraints.
__CG_QUALIFIER__ bool is_valid() const;
@@ -174,7 +176,7 @@ class multi_grid_group : public thread_group {
* development on Microsoft Windows.
*/
__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
return multi_grid_group(internal::multi_grid::size());
return multi_grid_group(internal::multi_grid::num_threads());
}
// Doxygen end group CooperativeGConstruct
/** @} */
@@ -216,7 +218,7 @@ class grid_group : public thread_group {
* \note This function is implemented on Linux and is under development
* on Microsoft Windows.
*/
__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::num_threads()); }
/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
* type.
@@ -247,14 +249,14 @@ class thread_block : public thread_group {
__hip_assert(false && "invalid tile size");
}
auto block_size = size();
auto block_size = num_threads();
auto rank = thread_rank();
auto partitions = (block_size + tile_size - 1) / tile_size;
auto tail = (partitions * tile_size) - block_size;
auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);
tiledGroup.coalesced_info.tiled_info.size = tile_size;
tiledGroup.coalesced_info.tiled_info.num_threads = tile_size;
tiledGroup.coalesced_info.tiled_info.is_tiled = true;
tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
@@ -268,8 +270,10 @@ class thread_block : public thread_group {
__CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
//! @copydoc thread_group::thread_rank
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
//! @copydoc thread_group::num_threads
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return internal::workgroup::num_threads(); }
//! @copydoc thread_group::size
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return internal::workgroup::size(); }
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return num_threads(); }
//! @copydoc thread_group::is_valid
__CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
//! @copydoc thread_group::sync
@@ -289,7 +293,7 @@ class thread_block : public thread_group {
* on Microsoft Windows.
*/
__CG_QUALIFIER__ thread_block this_thread_block() {
return thread_block(internal::workgroup::size());
return thread_block(internal::workgroup::num_threads());
}
/** \brief The tiled_group cooperative group type
@@ -313,7 +317,7 @@ class tiled_group : public thread_group {
__hip_assert(false && "invalid tile size");
}
if (size() <= tile_size) {
if (num_threads() <= tile_size) {
return *this;
}
@@ -325,17 +329,20 @@ class tiled_group : public thread_group {
protected:
explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
: thread_group(internal::cg_tiled_group, tileSize) {
coalesced_info.tiled_info.size = tileSize;
coalesced_info.tiled_info.num_threads = tileSize;
coalesced_info.tiled_info.is_tiled = true;
}
public:
//! @copydoc thread_group::num_threads
__CG_QUALIFIER__ unsigned int num_threads() const { return (coalesced_info.tiled_info.num_threads); }
//! @copydoc thread_group::size
__CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }
__CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
//! @copydoc thread_group::thread_rank
__CG_QUALIFIER__ unsigned int thread_rank() const {
return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.num_threads - 1));
}
//! @copydoc thread_group::sync
@@ -374,7 +381,7 @@ class coalesced_group : public thread_group {
// prepare a mask for further partitioning it so that it stays coalesced.
if (coalesced_info.tiled_info.is_tiled) {
unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
unsigned int masklength = min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
lane_mask full_mask = (static_cast<int>(warpSize) == 32) ? static_cast<lane_mask>((1u << 32) - 1)
: static_cast<lane_mask>(-1ull);
lane_mask member_mask = full_mask >> (warpSize - masklength);
@@ -383,7 +390,7 @@ class coalesced_group : public thread_group {
coalesced_group coalesced_tile = coalesced_group(member_mask);
coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
coalesced_tile.coalesced_info.tiled_info.meta_group_size = num_threads() / tile_size;
return coalesced_tile;
}
// Here the parent coalesced_group is not partitioned.
@@ -407,7 +414,7 @@ class coalesced_group : public thread_group {
coalesced_group coalesced_tile = coalesced_group(member_mask);
coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
coalesced_tile.coalesced_info.tiled_info.meta_group_size =
(size() + tile_size - 1) / tile_size;
(num_threads() + tile_size - 1) / tile_size;
return coalesced_tile;
}
return coalesced_group(0);
@@ -418,16 +425,21 @@ class coalesced_group : public thread_group {
explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
: thread_group(internal::cg_coalesced_group) {
coalesced_info.member_mask = member_mask; // Which threads are active
coalesced_info.size = __popcll(coalesced_info.member_mask); // How many threads are active
coalesced_info.num_threads = __popcll(coalesced_info.member_mask); // How many threads are active
coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
coalesced_info.tiled_info.meta_group_rank = 0;
coalesced_info.tiled_info.meta_group_size = 1;
}
public:
//! @copydoc thread_group::num_threads
__CG_QUALIFIER__ unsigned int num_threads() const {
return coalesced_info.num_threads;
}
//! @copydoc thread_group::size
__CG_QUALIFIER__ unsigned int size() const {
return coalesced_info.size;
return num_threads();
}
//! @copydoc thread_group::thread_rank
@@ -466,9 +478,9 @@ class coalesced_group : public thread_group {
template <class T>
__CG_QUALIFIER__ T shfl(T var, int srcRank) const {
srcRank = srcRank % static_cast<int>(size());
srcRank = srcRank % static_cast<int>(num_threads());
int lane = (size() == warpSize) ? srcRank
int lane = (num_threads() == warpSize) ? srcRank
: (static_cast<int>(warpSize) == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
: __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
@@ -496,7 +508,7 @@ class coalesced_group : public thread_group {
// and WARP_SIZE as the shift value rather than lane_delta itself.
// This is not described in the documentation and is not done here.
if (size() == warpSize) {
if (num_threads() == warpSize) {
return __shfl_down(var, lane_delta, warpSize);
}
@@ -536,7 +548,7 @@ class coalesced_group : public thread_group {
// and WARP_SIZE as the shift value rather than lane_delta itself.
// This is not described in the documentation and is not done here.
if (size() == warpSize) {
if (num_threads() == warpSize) {
return __shfl_up(var, lane_delta, warpSize);
}
@@ -747,11 +759,11 @@ __CG_QUALIFIER__ void thread_group::sync() const {
* cooperative group type APIs. This function is implemented on Linux
* and is under development on Microsoft Windows.
*/
template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.size(); }
template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.num_threads(); }
/** \brief Returns the rank of thread of the group.
*
* \details Rank of the calling thread within [0, \link size() size() \endlink).
* \details Rank of the calling thread within [0, \link num_threads() num_threads() \endlink).
*
* \tparam CGTy The cooperative group class template parameter.
* \param g [in] The cooperative group for rank returns.
@@ -805,7 +817,11 @@ template <unsigned int tileSize> class tile_base {
}
//! Number of threads within this tile
__CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
__CG_STATIC_QUALIFIER__ unsigned int num_threads() { return numThreads; }
//! Legacy member functions
//! Number of threads within this tile (alias of num_threads())
__CG_STATIC_QUALIFIER__ unsigned int size() { return num_threads(); }
};
/**
@@ -888,7 +904,7 @@ public:
//! Returns the number of groups created when the parent group was partitioned.
__CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
return (ParentCGTy::size() + tileSize - 1) / tileSize;
return (ParentCGTy::num_threads() + tileSize - 1) / tileSize;
}
};
@@ -906,10 +922,11 @@ class thread_block_tile_type : public thread_block_tile_base<tileSize>,
typedef thread_block_tile_base<numThreads> tbtBase;
protected:
__CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
coalesced_info.tiled_info.size = numThreads;
coalesced_info.tiled_info.num_threads = numThreads;
coalesced_info.tiled_info.is_tiled = true;
}
public:
using tbtBase::num_threads;
using tbtBase::size;
using tbtBase::sync;
using tbtBase::thread_rank;
@@ -928,13 +945,14 @@ class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<til
__CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
: tiled_group(numThreads) {
coalesced_info.tiled_info.size = numThreads;
coalesced_info.tiled_info.num_threads = numThreads;
coalesced_info.tiled_info.is_tiled = true;
coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
coalesced_info.tiled_info.meta_group_size = meta_group_size;
}
public:
using tbtBase::num_threads;
using tbtBase::size;
using tbtBase::sync;
using tbtBase::thread_rank;
+3 -3
Просмотреть файл
@@ -138,7 +138,7 @@ __CG_STATIC_QUALIFIER__ __hip_uint32_t num_grids() {
__CG_STATIC_QUALIFIER__ __hip_uint32_t grid_rank() {
return static_cast<__hip_uint32_t>(__ockl_multi_grid_grid_rank()); }
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); }
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); }
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
return static_cast<__hip_uint32_t>(__ockl_multi_grid_thread_rank()); }
@@ -155,7 +155,7 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
*/
namespace grid {
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() {
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
return static_cast<__hip_uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
(blockDim.x * gridDim.x));
}
@@ -200,7 +200,7 @@ __CG_STATIC_QUALIFIER__ dim3 thread_index() {
static_cast<__hip_uint32_t>(threadIdx.z)));
}
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() {
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
return (static_cast<__hip_uint32_t>(blockDim.x * blockDim.y * blockDim.z));
}