SWDEV-533232 Add num_threads API in cooperative_groups (#437)

* Add num_threads API in cooperative_groups * Reimplementsize API as its alias. * Update changelog [ROCm/clr commit: a4631b5700]
2025-06-26 11:35:17 +02:00
@@ -11,6 +11,7 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
    - `hipLaunchKernelExC`  launches a HIP kernel using a generic function pointer and the specified configuration.
    - `hipDrvLaunchKernelEx`  dispatches the device kernel represented by a HIP function object.
    - `hipMemGetHandleForAddressRange`  gets a handle for the address range requested.
+    - `num_threads`  Total number of threads in the group. The legacy API size is alias.
 * New support for Open Compute Project (OCP) floating-point `FP4`/`FP6`/`FP8` as the following. For details, see [Low precision floating point document](https://rocm.docs.amd.com/projects/HIP/en/latest/reference/low_fp_types.html).
    - Data types for `FP4`/`FP6`/`FP8`.
    - HIP APIs for `FP4`/`FP6`/`FP8`, which are compatible with corresponding CUDA APIs.
@@ -50,7 +50,7 @@ namespace cooperative_groups {
 class thread_group {
 protected:
  __hip_uint32_t _type;  //! Type of the thread_group.
-  __hip_uint32_t _size;  //! Total number of threads in the thread_group.
+  __hip_uint32_t _num_threads;  //! Total number of threads in the thread_group.
  __hip_uint64_t _mask;  //! Lanemask for coalesced and tiled partitioned group types,
                         //! LSB represents lane 0, and MSB represents lane 63

@@ -60,23 +60,23 @@ class thread_group {
  //! (through the API - `this_thread()`), and in all other cases, this thread
  //! group object is a sub-object of some other derived thread group object.
  __CG_QUALIFIER__ thread_group(internal::group_type type,
-                                __hip_uint32_t size = static_cast<__hip_uint64_t>(0),
+                                __hip_uint32_t num_threads = static_cast<__hip_uint64_t>(0),
                                __hip_uint64_t mask = static_cast<__hip_uint64_t>(0)) {
    _type = type;
-    _size = size;
+    _num_threads = num_threads;
    _mask = mask;
  }

  struct _tiled_info {
    bool is_tiled;
-    unsigned int size;
+    unsigned int num_threads;
    unsigned int meta_group_rank;
    unsigned int meta_group_size;
  };

  struct _coalesced_info {
    lane_mask member_mask;
-    unsigned int size;
+    unsigned int num_threads;
    struct _tiled_info tiled_info;
  } coalesced_info;

@@ -87,12 +87,14 @@ class thread_group {

 public:
  //! Total number of threads in the thread_group, and this serves the purpose
-  //! for all derived cooperative group types because their `size` is directly
+  //! for all derived cooperative group types because their `num_threads` is directly
  //! saved during the construction.
-  __CG_QUALIFIER__ __hip_uint32_t size() const { return _size; }
+  __CG_QUALIFIER__ __hip_uint32_t num_threads() const { return _num_threads; }
+  //! Total number of threads in the group (alias of num_threads())
+  __CG_QUALIFIER__ __hip_uint32_t size() const { return num_threads(); }
  //! Returns the type of the group.
  __CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
-  //! Rank of the calling thread within [0, \link size() size() \endlink).
+  //! Rank of the calling thread within [0, \link num_threads() num_threads() \endlink).
  __CG_QUALIFIER__ __hip_uint32_t thread_rank() const;
  //! Returns true if the group has not violated any API constraints.
  __CG_QUALIFIER__ bool is_valid() const;
@@ -174,7 +176,7 @@ class multi_grid_group : public thread_group {
 *           development on Microsoft Windows.
 */
 __CG_QUALIFIER__ multi_grid_group this_multi_grid() {
-  return multi_grid_group(internal::multi_grid::size());
+  return multi_grid_group(internal::multi_grid::num_threads());
 }
 // Doxygen end group CooperativeGConstruct
 /** @} */
@@ -216,7 +218,7 @@ class grid_group : public thread_group {
 *  \note    This function is implemented on Linux and is under development
 *           on Microsoft Windows.
 */
-__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
+__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::num_threads()); }

 /** \brief   The workgroup (thread-block in CUDA terminology) cooperative group
 *           type.
@@ -247,14 +249,14 @@ class thread_block : public thread_group {
      __hip_assert(false && "invalid tile size");
    }

-    auto block_size = size();
+    auto block_size = num_threads();
    auto rank = thread_rank();
    auto partitions = (block_size + tile_size - 1) / tile_size;
    auto tail = (partitions * tile_size) - block_size;
    auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size);
    thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size);

-    tiledGroup.coalesced_info.tiled_info.size = tile_size;
+    tiledGroup.coalesced_info.tiled_info.num_threads = tile_size;
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size;
    tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions;
@@ -268,8 +270,10 @@ class thread_block : public thread_group {
  __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
  //! @copydoc thread_group::thread_rank
  __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
+  //! @copydoc thread_group::num_threads
+  __CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return internal::workgroup::num_threads(); }
  //! @copydoc thread_group::size
-  __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return internal::workgroup::size(); }
+  __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return num_threads(); }
  //! @copydoc thread_group::is_valid
  __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
  //! @copydoc thread_group::sync
@@ -289,7 +293,7 @@ class thread_block : public thread_group {
 *           on Microsoft Windows.
 */
 __CG_QUALIFIER__ thread_block this_thread_block() {
-  return thread_block(internal::workgroup::size());
+  return thread_block(internal::workgroup::num_threads());
 }

 /** \brief   The tiled_group cooperative group type
@@ -313,7 +317,7 @@ class tiled_group : public thread_group {
      __hip_assert(false && "invalid tile size");
    }

-    if (size() <= tile_size) {
+    if (num_threads() <= tile_size) {
      return *this;
    }

@@ -325,17 +329,20 @@ class tiled_group : public thread_group {
 protected:
  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
      : thread_group(internal::cg_tiled_group, tileSize) {
-    coalesced_info.tiled_info.size = tileSize;
+    coalesced_info.tiled_info.num_threads = tileSize;
    coalesced_info.tiled_info.is_tiled = true;
  }

 public:
+  //! @copydoc thread_group::num_threads
+  __CG_QUALIFIER__ unsigned int num_threads() const { return (coalesced_info.tiled_info.num_threads); }
+
  //! @copydoc thread_group::size
-  __CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }
+  __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }

  //! @copydoc thread_group::thread_rank
  __CG_QUALIFIER__ unsigned int thread_rank() const {
-    return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
+    return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.num_threads - 1));
  }

  //! @copydoc thread_group::sync
@@ -374,7 +381,7 @@ class coalesced_group : public thread_group {
    // prepare a mask for further partitioning it so that it stays coalesced.
    if (coalesced_info.tiled_info.is_tiled) {
      unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
-      unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
+      unsigned int masklength = min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
      lane_mask full_mask = (static_cast<int>(warpSize) == 32) ? static_cast<lane_mask>((1u << 32) - 1)
                                             : static_cast<lane_mask>(-1ull);
      lane_mask member_mask = full_mask >> (warpSize - masklength);
@@ -383,7 +390,7 @@ class coalesced_group : public thread_group {
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
-      coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
+      coalesced_tile.coalesced_info.tiled_info.meta_group_size = num_threads() / tile_size;
      return coalesced_tile;
    }
    // Here the parent coalesced_group is not partitioned.
@@ -407,7 +414,7 @@ class coalesced_group : public thread_group {
      coalesced_group coalesced_tile = coalesced_group(member_mask);
      coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
      coalesced_tile.coalesced_info.tiled_info.meta_group_size =
-                                                      (size() + tile_size - 1) / tile_size;
+                                                      (num_threads() + tile_size - 1) / tile_size;
      return coalesced_tile;
    }
     return coalesced_group(0);
@@ -418,16 +425,21 @@ class coalesced_group : public thread_group {
  explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
      : thread_group(internal::cg_coalesced_group) {
    coalesced_info.member_mask = member_mask; // Which threads are active
-    coalesced_info.size = __popcll(coalesced_info.member_mask); // How many threads are active
+    coalesced_info.num_threads = __popcll(coalesced_info.member_mask); // How many threads are active
    coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
    coalesced_info.tiled_info.meta_group_rank = 0;
    coalesced_info.tiled_info.meta_group_size = 1;
  }

 public:
+   //! @copydoc thread_group::num_threads
+   __CG_QUALIFIER__ unsigned int num_threads() const {
+     return coalesced_info.num_threads;
+   }
+
   //! @copydoc thread_group::size
   __CG_QUALIFIER__ unsigned int size() const {
-     return coalesced_info.size;
+     return num_threads();
   }

   //! @copydoc thread_group::thread_rank
@@ -466,9 +478,9 @@ class coalesced_group : public thread_group {
  template <class T>
  __CG_QUALIFIER__ T shfl(T var, int srcRank) const {

-    srcRank = srcRank % static_cast<int>(size());
+    srcRank = srcRank % static_cast<int>(num_threads());

-    int lane = (size() == warpSize) ? srcRank
+    int lane = (num_threads() == warpSize) ? srcRank
             : (static_cast<int>(warpSize) == 64)     ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
                                    : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));

@@ -496,7 +508,7 @@ class coalesced_group : public thread_group {
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

-    if (size() == warpSize) {
+    if (num_threads() == warpSize) {
      return __shfl_down(var, lane_delta, warpSize);
    }

@@ -536,7 +548,7 @@ class coalesced_group : public thread_group {
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.

-    if (size() == warpSize) {
+    if (num_threads() == warpSize) {
      return __shfl_up(var, lane_delta, warpSize);
    }

@@ -747,11 +759,11 @@ __CG_QUALIFIER__ void thread_group::sync() const {
 *           cooperative group type APIs. This function is implemented on Linux
 *           and is under development on Microsoft Windows.
 */
-template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.size(); }
+template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.num_threads(); }

 /** \brief   Returns the rank of thread of the group.
 *
- *  \details Rank of the calling thread within [0, \link size() size() \endlink).
+ *  \details Rank of the calling thread within [0, \link num_threads() num_threads() \endlink).
 * 
 *  \tparam CGTy  The cooperative group class template parameter.
 *  \param g [in] The cooperative group for rank returns.
@@ -805,7 +817,11 @@ template <unsigned int tileSize> class tile_base {
  }

  //! Number of threads within this tile
-  __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
+  __CG_STATIC_QUALIFIER__ unsigned int num_threads() { return numThreads; }
+
+  //! Legacy member functions
+  //! Number of threads within this tile (alias of num_threads())
+  __CG_STATIC_QUALIFIER__ unsigned int size() { return num_threads(); }
 };

 /**
@@ -888,7 +904,7 @@ public:

  //! Returns the number of groups created when the parent group was partitioned.
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
-    return (ParentCGTy::size() + tileSize - 1) / tileSize;
+    return (ParentCGTy::num_threads() + tileSize - 1) / tileSize;
  }
 };

@@ -906,10 +922,11 @@ class thread_block_tile_type : public thread_block_tile_base<tileSize>,
  typedef thread_block_tile_base<numThreads> tbtBase;
  protected:
    __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
-      coalesced_info.tiled_info.size = numThreads;
+      coalesced_info.tiled_info.num_threads = numThreads;
      coalesced_info.tiled_info.is_tiled = true;
    }
  public:
+    using tbtBase::num_threads;
    using tbtBase::size;
    using tbtBase::sync;
    using tbtBase::thread_rank;
@@ -928,13 +945,14 @@ class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<til

    __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
        : tiled_group(numThreads) {
-    coalesced_info.tiled_info.size = numThreads;
+    coalesced_info.tiled_info.num_threads = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
    coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
    coalesced_info.tiled_info.meta_group_size = meta_group_size;
  }

 public:
+  using tbtBase::num_threads;
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;
@@ -138,7 +138,7 @@ __CG_STATIC_QUALIFIER__ __hip_uint32_t num_grids() {
 __CG_STATIC_QUALIFIER__ __hip_uint32_t grid_rank() {
  return static_cast<__hip_uint32_t>(__ockl_multi_grid_grid_rank()); }

-__CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); }
+__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); }

 __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
  return static_cast<__hip_uint32_t>(__ockl_multi_grid_thread_rank()); }
@@ -155,7 +155,7 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
 */
 namespace grid {

-__CG_STATIC_QUALIFIER__ __hip_uint32_t size() {
+__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
  return static_cast<__hip_uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
                    (blockDim.x * gridDim.x));
 }
@@ -200,7 +200,7 @@ __CG_STATIC_QUALIFIER__ dim3 thread_index() {
               static_cast<__hip_uint32_t>(threadIdx.z)));
 }

-__CG_STATIC_QUALIFIER__ __hip_uint32_t size() {
+__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
  return (static_cast<__hip_uint32_t>(blockDim.x * blockDim.y * blockDim.z));
 }