SWDEV-533232 Add num_threads API in cooperative_groups (#437)
* Add num_threads API in cooperative_groups
* Reimplementsize API as its alias.
* Update changelog
[ROCm/clr commit: a4631b5700]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
c530e72a7c
Коммит
dbfd869936
@@ -11,6 +11,7 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
|
||||
- `hipLaunchKernelExC` launches a HIP kernel using a generic function pointer and the specified configuration.
|
||||
- `hipDrvLaunchKernelEx` dispatches the device kernel represented by a HIP function object.
|
||||
- `hipMemGetHandleForAddressRange` gets a handle for the address range requested.
|
||||
- `num_threads` Total number of threads in the group. The legacy API size is alias.
|
||||
* New support for Open Compute Project (OCP) floating-point `FP4`/`FP6`/`FP8` as the following. For details, see [Low precision floating point document](https://rocm.docs.amd.com/projects/HIP/en/latest/reference/low_fp_types.html).
|
||||
- Data types for `FP4`/`FP6`/`FP8`.
|
||||
- HIP APIs for `FP4`/`FP6`/`FP8`, which are compatible with corresponding CUDA APIs.
|
||||
|
||||
@@ -50,7 +50,7 @@ namespace cooperative_groups {
|
||||
class thread_group {
|
||||
protected:
|
||||
__hip_uint32_t _type; //! Type of the thread_group.
|
||||
__hip_uint32_t _size; //! Total number of threads in the thread_group.
|
||||
__hip_uint32_t _num_threads; //! Total number of threads in the thread_group.
|
||||
__hip_uint64_t _mask; //! Lanemask for coalesced and tiled partitioned group types,
|
||||
//! LSB represents lane 0, and MSB represents lane 63
|
||||
|
||||
@@ -60,23 +60,23 @@ class thread_group {
|
||||
//! (through the API - `this_thread()`), and in all other cases, this thread
|
||||
//! group object is a sub-object of some other derived thread group object.
|
||||
__CG_QUALIFIER__ thread_group(internal::group_type type,
|
||||
__hip_uint32_t size = static_cast<__hip_uint64_t>(0),
|
||||
__hip_uint32_t num_threads = static_cast<__hip_uint64_t>(0),
|
||||
__hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) {
|
||||
_type = type;
|
||||
_size = size;
|
||||
_num_threads = num_threads;
|
||||
_mask = mask;
|
||||
}
|
||||
|
||||
struct _tiled_info {
|
||||
bool is_tiled;
|
||||
unsigned int size;
|
||||
unsigned int num_threads;
|
||||
unsigned int meta_group_rank;
|
||||
unsigned int meta_group_size;
|
||||
};
|
||||
|
||||
struct _coalesced_info {
|
||||
lane_mask member_mask;
|
||||
unsigned int size;
|
||||
unsigned int num_threads;
|
||||
struct _tiled_info tiled_info;
|
||||
} coalesced_info;
|
||||
|
||||
@@ -87,12 +87,14 @@ class thread_group {
|
||||
|
||||
public:
|
||||
//! Total number of threads in the thread_group, and this serves the purpose
|
||||
//! for all derived cooperative group types because their `size` is directly
|
||||
//! for all derived cooperative group types because their `num_threads` is directly
|
||||
//! saved during the construction.
|
||||
__CG_QUALIFIER__ __hip_uint32_t size() const { return _size; }
|
||||
__CG_QUALIFIER__ __hip_uint32_t num_threads() const { return _num_threads; }
|
||||
//! Total number of threads in the group (alias of num_threads())
|
||||
__CG_QUALIFIER__ __hip_uint32_t size() const { return num_threads(); }
|
||||
//! Returns the type of the group.
|
||||
__CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
|
||||
//! Rank of the calling thread within [0, \link size() size() \endlink).
|
||||
//! Rank of the calling thread within [0, \link num_threads() num_threads() \endlink).
|
||||
__CG_QUALIFIER__ __hip_uint32_t thread_rank() const;
|
||||
//! Returns true if the group has not violated any API constraints.
|
||||
__CG_QUALIFIER__ bool is_valid() const;
|
||||
@@ -174,7 +176,7 @@ class multi_grid_group : public thread_group {
|
||||
* development on Microsoft Windows.
|
||||
*/
|
||||
__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
|
||||
return multi_grid_group(internal::multi_grid::size());
|
||||
return multi_grid_group(internal::multi_grid::num_threads());
|
||||
}
|
||||
// Doxygen end group CooperativeGConstruct
|
||||
/** @} */
|
||||
@@ -216,7 +218,7 @@ class grid_group : public thread_group {
|
||||
* \note This function is implemented on Linux and is under development
|
||||
* on Microsoft Windows.
|
||||
*/
|
||||
__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
|
||||
__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::num_threads()); }
|
||||
|
||||
/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
|
||||
* type.
|
||||
@@ -247,14 +249,14 @@ class thread_block : public thread_group {
|
||||
__hip_assert(false && "invalid tile size");
|
||||
}
|
||||
|
||||
auto block_size = size();
|
||||
auto block_size = num_threads();
|
||||
auto rank = thread_rank();
|
||||
auto partitions = (block_size + tile_size - 1) / tile_size;
|
||||
auto tail = (partitions * tile_size) - block_size;
|
||||
auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
|
||||
thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);
|
||||
|
||||
tiledGroup.coalesced_info.tiled_info.size = tile_size;
|
||||
tiledGroup.coalesced_info.tiled_info.num_threads = tile_size;
|
||||
tiledGroup.coalesced_info.tiled_info.is_tiled = true;
|
||||
tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
|
||||
tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
|
||||
@@ -268,8 +270,10 @@ class thread_block : public thread_group {
|
||||
__CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
|
||||
//! @copydoc thread_group::thread_rank
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
|
||||
//! @copydoc thread_group::num_threads
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return internal::workgroup::num_threads(); }
|
||||
//! @copydoc thread_group::size
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return internal::workgroup::size(); }
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return num_threads(); }
|
||||
//! @copydoc thread_group::is_valid
|
||||
__CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
|
||||
//! @copydoc thread_group::sync
|
||||
@@ -289,7 +293,7 @@ class thread_block : public thread_group {
|
||||
* on Microsoft Windows.
|
||||
*/
|
||||
__CG_QUALIFIER__ thread_block this_thread_block() {
|
||||
return thread_block(internal::workgroup::size());
|
||||
return thread_block(internal::workgroup::num_threads());
|
||||
}
|
||||
|
||||
/** \brief The tiled_group cooperative group type
|
||||
@@ -313,7 +317,7 @@ class tiled_group : public thread_group {
|
||||
__hip_assert(false && "invalid tile size");
|
||||
}
|
||||
|
||||
if (size() <= tile_size) {
|
||||
if (num_threads() <= tile_size) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
@@ -325,17 +329,20 @@ class tiled_group : public thread_group {
|
||||
protected:
|
||||
explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
|
||||
: thread_group(internal::cg_tiled_group, tileSize) {
|
||||
coalesced_info.tiled_info.size = tileSize;
|
||||
coalesced_info.tiled_info.num_threads = tileSize;
|
||||
coalesced_info.tiled_info.is_tiled = true;
|
||||
}
|
||||
|
||||
public:
|
||||
//! @copydoc thread_group::num_threads
|
||||
__CG_QUALIFIER__ unsigned int num_threads() const { return (coalesced_info.tiled_info.num_threads); }
|
||||
|
||||
//! @copydoc thread_group::size
|
||||
__CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }
|
||||
__CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
|
||||
|
||||
//! @copydoc thread_group::thread_rank
|
||||
__CG_QUALIFIER__ unsigned int thread_rank() const {
|
||||
return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
|
||||
return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.num_threads - 1));
|
||||
}
|
||||
|
||||
//! @copydoc thread_group::sync
|
||||
@@ -374,7 +381,7 @@ class coalesced_group : public thread_group {
|
||||
// prepare a mask for further partitioning it so that it stays coalesced.
|
||||
if (coalesced_info.tiled_info.is_tiled) {
|
||||
unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
|
||||
unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
|
||||
unsigned int masklength = min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
|
||||
lane_mask full_mask = (static_cast<int>(warpSize) == 32) ? static_cast<lane_mask>((1u << 32) - 1)
|
||||
: static_cast<lane_mask>(-1ull);
|
||||
lane_mask member_mask = full_mask >> (warpSize - masklength);
|
||||
@@ -383,7 +390,7 @@ class coalesced_group : public thread_group {
|
||||
coalesced_group coalesced_tile = coalesced_group(member_mask);
|
||||
coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
|
||||
coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
|
||||
coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
|
||||
coalesced_tile.coalesced_info.tiled_info.meta_group_size = num_threads() / tile_size;
|
||||
return coalesced_tile;
|
||||
}
|
||||
// Here the parent coalesced_group is not partitioned.
|
||||
@@ -407,7 +414,7 @@ class coalesced_group : public thread_group {
|
||||
coalesced_group coalesced_tile = coalesced_group(member_mask);
|
||||
coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
|
||||
coalesced_tile.coalesced_info.tiled_info.meta_group_size =
|
||||
(size() + tile_size - 1) / tile_size;
|
||||
(num_threads() + tile_size - 1) / tile_size;
|
||||
return coalesced_tile;
|
||||
}
|
||||
return coalesced_group(0);
|
||||
@@ -418,16 +425,21 @@ class coalesced_group : public thread_group {
|
||||
explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
|
||||
: thread_group(internal::cg_coalesced_group) {
|
||||
coalesced_info.member_mask = member_mask; // Which threads are active
|
||||
coalesced_info.size = __popcll(coalesced_info.member_mask); // How many threads are active
|
||||
coalesced_info.num_threads = __popcll(coalesced_info.member_mask); // How many threads are active
|
||||
coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
|
||||
coalesced_info.tiled_info.meta_group_rank = 0;
|
||||
coalesced_info.tiled_info.meta_group_size = 1;
|
||||
}
|
||||
|
||||
public:
|
||||
//! @copydoc thread_group::num_threads
|
||||
__CG_QUALIFIER__ unsigned int num_threads() const {
|
||||
return coalesced_info.num_threads;
|
||||
}
|
||||
|
||||
//! @copydoc thread_group::size
|
||||
__CG_QUALIFIER__ unsigned int size() const {
|
||||
return coalesced_info.size;
|
||||
return num_threads();
|
||||
}
|
||||
|
||||
//! @copydoc thread_group::thread_rank
|
||||
@@ -466,9 +478,9 @@ class coalesced_group : public thread_group {
|
||||
template <class T>
|
||||
__CG_QUALIFIER__ T shfl(T var, int srcRank) const {
|
||||
|
||||
srcRank = srcRank % static_cast<int>(size());
|
||||
srcRank = srcRank % static_cast<int>(num_threads());
|
||||
|
||||
int lane = (size() == warpSize) ? srcRank
|
||||
int lane = (num_threads() == warpSize) ? srcRank
|
||||
: (static_cast<int>(warpSize) == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
|
||||
: __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
|
||||
|
||||
@@ -496,7 +508,7 @@ class coalesced_group : public thread_group {
|
||||
// and WARP_SIZE as the shift value rather than lane_delta itself.
|
||||
// This is not described in the documentation and is not done here.
|
||||
|
||||
if (size() == warpSize) {
|
||||
if (num_threads() == warpSize) {
|
||||
return __shfl_down(var, lane_delta, warpSize);
|
||||
}
|
||||
|
||||
@@ -536,7 +548,7 @@ class coalesced_group : public thread_group {
|
||||
// and WARP_SIZE as the shift value rather than lane_delta itself.
|
||||
// This is not described in the documentation and is not done here.
|
||||
|
||||
if (size() == warpSize) {
|
||||
if (num_threads() == warpSize) {
|
||||
return __shfl_up(var, lane_delta, warpSize);
|
||||
}
|
||||
|
||||
@@ -747,11 +759,11 @@ __CG_QUALIFIER__ void thread_group::sync() const {
|
||||
* cooperative group type APIs. This function is implemented on Linux
|
||||
* and is under development on Microsoft Windows.
|
||||
*/
|
||||
template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.size(); }
|
||||
template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.num_threads(); }
|
||||
|
||||
/** \brief Returns the rank of thread of the group.
|
||||
*
|
||||
* \details Rank of the calling thread within [0, \link size() size() \endlink).
|
||||
* \details Rank of the calling thread within [0, \link num_threads() num_threads() \endlink).
|
||||
*
|
||||
* \tparam CGTy The cooperative group class template parameter.
|
||||
* \param g [in] The cooperative group for rank returns.
|
||||
@@ -805,7 +817,11 @@ template <unsigned int tileSize> class tile_base {
|
||||
}
|
||||
|
||||
//! Number of threads within this tile
|
||||
__CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
|
||||
__CG_STATIC_QUALIFIER__ unsigned int num_threads() { return numThreads; }
|
||||
|
||||
//! Legacy member functions
|
||||
//! Number of threads within this tile (alias of num_threads())
|
||||
__CG_STATIC_QUALIFIER__ unsigned int size() { return num_threads(); }
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -888,7 +904,7 @@ public:
|
||||
|
||||
//! Returns the number of groups created when the parent group was partitioned.
|
||||
__CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
|
||||
return (ParentCGTy::size() + tileSize - 1) / tileSize;
|
||||
return (ParentCGTy::num_threads() + tileSize - 1) / tileSize;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -906,10 +922,11 @@ class thread_block_tile_type : public thread_block_tile_base<tileSize>,
|
||||
typedef thread_block_tile_base<numThreads> tbtBase;
|
||||
protected:
|
||||
__CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
|
||||
coalesced_info.tiled_info.size = numThreads;
|
||||
coalesced_info.tiled_info.num_threads = numThreads;
|
||||
coalesced_info.tiled_info.is_tiled = true;
|
||||
}
|
||||
public:
|
||||
using tbtBase::num_threads;
|
||||
using tbtBase::size;
|
||||
using tbtBase::sync;
|
||||
using tbtBase::thread_rank;
|
||||
@@ -928,13 +945,14 @@ class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<til
|
||||
|
||||
__CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
|
||||
: tiled_group(numThreads) {
|
||||
coalesced_info.tiled_info.size = numThreads;
|
||||
coalesced_info.tiled_info.num_threads = numThreads;
|
||||
coalesced_info.tiled_info.is_tiled = true;
|
||||
coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
|
||||
coalesced_info.tiled_info.meta_group_size = meta_group_size;
|
||||
}
|
||||
|
||||
public:
|
||||
using tbtBase::num_threads;
|
||||
using tbtBase::size;
|
||||
using tbtBase::sync;
|
||||
using tbtBase::thread_rank;
|
||||
|
||||
@@ -138,7 +138,7 @@ __CG_STATIC_QUALIFIER__ __hip_uint32_t num_grids() {
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t grid_rank() {
|
||||
return static_cast<__hip_uint32_t>(__ockl_multi_grid_grid_rank()); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); }
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
|
||||
return static_cast<__hip_uint32_t>(__ockl_multi_grid_thread_rank()); }
|
||||
@@ -155,7 +155,7 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
|
||||
*/
|
||||
namespace grid {
|
||||
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() {
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
|
||||
return static_cast<__hip_uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
|
||||
(blockDim.x * gridDim.x));
|
||||
}
|
||||
@@ -200,7 +200,7 @@ __CG_STATIC_QUALIFIER__ dim3 thread_index() {
|
||||
static_cast<__hip_uint32_t>(threadIdx.z)));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t size() {
|
||||
__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
|
||||
return (static_cast<__hip_uint32_t>(blockDim.x * blockDim.y * blockDim.z));
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user