Device side support for Cooperative Group feature (#1202)

* first cut of the header implementation of cooperative group feature * add diclarations for device library functions * fixed various compile time issues in the CG headers * enabled copy construction and copy assignment * fixed a minor bug related to conditional compilation macro * fixed few more CG constructor issues and added a unit testcase * fixed typo * extended unit testcase * compute size of partitioned CG from mask * bit of code refactoring * removed boilerplate code * fixed few of the review comments by Brian * Changes to the sigantures of few grid and multi-grid related OCKL functions * changes to declarations of OCKL functions related to CG feature * removed all the block level support as it is not planned for 2.9 * Have taken care of review comments by Brian * Have taken care of review comments by Brian * removed unused functions which were initially intended to use in block level cg support
2019-08-29 06:31:25 +05:30
commit d75dc4eb29
@@ -748,28 +748,36 @@ unsigned long long int __ballot64(int predicate) {
 // hip.amdgcn.bc - lanemask
 __device__
 inline
-int64_t  __lanemask_gt()
+uint64_t  __lanemask_gt()
 {
-    int32_t activelane = __ockl_activelane_u32();
-    int64_t ballot = __ballot64(1);
-    if (activelane != 63) {
-        int64_t tmp = (~0ULL) << (activelane + 1);
-        return tmp & ballot;
-    }
-    return 0;
+    uint32_t lane = __ockl_lane_u32();
+    if (lane == 63)
+      return 0;
+    uint64_t ballot = __ballot64(1);
+    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
+    return mask & ballot;
 }

 __device__
 inline
-int64_t __lanemask_lt()
+uint64_t __lanemask_lt()
 {
-    int32_t activelane = __ockl_activelane_u32();
+    uint32_t lane = __ockl_lane_u32();
    int64_t ballot = __ballot64(1);
-    if (activelane == 0)
-        return 0;
-    return ballot;
+    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
+    return mask & ballot;
 }

+__device__
+inline
+uint64_t  __lanemask_eq()
+{
+    uint32_t lane = __ockl_lane_u32();
+    int64_t mask = ((uint64_t)1 << lane);
+    return mask;
+}
+
+
 __device__ inline void* __local_to_generic(void* p) { return p; }

 #ifdef __HIP_DEVICE_COMPILE__
@@ -62,6 +62,16 @@ extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float)
 extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid);
 extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid);

+extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32();
+extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void);
+extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_grid_rank(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
+extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
+extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
+

 // Introduce local address space
 #define __local __attribute__((address_space(3)))
@@ -0,0 +1,216 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  hcc_detail/hip_cooperative_groups.h
+ *
+ *  @brief Device side implementation of `Cooperative Group` feature.
+ *
+ *  Defines new types and device API wrappers related to `Cooperative Group`
+ *  feature, which the programmer can directly use in his kernel(s) in order to
+ *  make use of this feature.
+ */
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+#if __cplusplus
+#include <hip/hcc_detail/hip_cooperative_groups_helper.h>
+
+namespace cooperative_groups {
+
+/** \brief The base type of all cooperative group types
+ *
+ *  \details Holds the key properties of a constructed cooperative group type
+ *           object, like the group type, its size, etc
+ */
+class thread_group {
+ protected:
+  uint32_t _type; // thread_group type
+  uint32_t _size; // total number of threads in the tread_group
+  uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
+                  // LSB represents lane 0, and MSB represents lane 63
+
+  // Construct a thread group, and set thread group type and other essential
+  // thread group properties. This generic thread group is directly constructed
+  // only when the group is supposed to contain only the calling the thread
+  // (throurh the API - `this_thread()`), and in all other cases, this thread
+  // group object is a sub-object of some other derived thread group object
+  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
+                                uint64_t mask = (uint64_t)0) {
+    _type = type;
+    _size = size;
+    _mask = mask;
+  }
+
+ public:
+  // Total number of threads in the thread group, and this serves the purpose
+  // for all derived cooperative group types since their `size` is directly
+  // saved during the construction
+  __CG_QUALIFIER__ uint32_t size() const {
+    return _size;
+  }
+  // Rank of the calling thread within [0, size())
+  __CG_QUALIFIER__ uint32_t thread_rank() const;
+  // Is this cooperative group type valid?
+  __CG_QUALIFIER__ bool is_valid() const;
+  // synchronize the threads in the thread group
+  __CG_QUALIFIER__ void sync() const;
+};
+
+/** \brief The multi-grid cooperative group type
+ *
+ *  \details Represents an inter-device cooperative group type where the
+ *           participating threads within the group spans across multple
+ *           devices, running the (same) kernel on these devices
+ */
+class multi_grid_group : public thread_group {
+  // Only these friend functions are allowed to construct an object of this class
+  // and access its resources
+  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
+
+ protected:
+  // Construct mutli-grid thread group (through the API this_multi_grid())
+  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
+      : thread_group(internal::cg_multi_grid, size) { }
+
+ public:
+  // Number of invocations participating in this multi-grid group. In other
+  // words, the number of GPUs
+	__CG_QUALIFIER__ uint32_t num_grids() {
+    return internal::multi_grid::num_grids();
+  }
+  // Rank of this invocation. In other words, an ID number within the range
+  // [0, num_grids()) of the GPU, this kernel is running on
+	__CG_QUALIFIER__ uint32_t grid_rank() {
+    return internal::multi_grid::grid_rank();
+  }
+  __CG_QUALIFIER__ uint32_t thread_rank() const {
+    return internal::multi_grid::thread_rank();
+  }
+  __CG_QUALIFIER__ bool is_valid() const {
+    return internal::multi_grid::is_valid();
+  }
+  __CG_QUALIFIER__ void sync() const {
+    internal::multi_grid::sync();
+  }
+};
+
+/** \brief User exposed API interface to construct multi-grid cooperative
+ *         group type object - `multi_grid_group`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `multi_grid_group`. Instead, he should construct it through this
+ *           API function
+ */
+__CG_QUALIFIER__ multi_grid_group
+this_multi_grid() {
+  return multi_grid_group(internal::multi_grid::size());
+}
+
+/** \brief The grid cooperative group type
+ *
+ *  \details Represents an inter-workgroup cooperative group type where the
+ *           participating threads within the group spans across multiple
+ *           workgroups running the (same) kernel on the same device
+ */
+class grid_group : public thread_group {
+  // Only these friend functions are allowed to construct an object of this class
+  // and access its resources
+  friend __CG_QUALIFIER__ grid_group this_grid();
+
+ protected:
+  // Construct grid thread group (through the API this_grid())
+  explicit __CG_QUALIFIER__ grid_group(uint32_t size)
+      : thread_group(internal::cg_grid, size) { }
+
+ public:
+  __CG_QUALIFIER__ uint32_t thread_rank() const {
+    return internal::grid::thread_rank();
+  }
+  __CG_QUALIFIER__ bool is_valid() const {
+    return internal::grid::is_valid();
+  }
+  __CG_QUALIFIER__ void sync() const {
+    internal::grid::sync();
+  }
+};
+
+/** \brief User exposed API interface to construct grid cooperative group type
+ *         object - `grid_group`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `multi_grid_group`. Instead, he should construct it through this
+ *           API function
+ */
+__CG_QUALIFIER__ grid_group
+this_grid() {
+  return grid_group(internal::grid::size());
+}
+
+/**
+ *  Implemenation of all publicly exposed base class APIs
+ */
+__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      return (static_cast<const multi_grid_group*>(this)->thread_rank());
+    }
+    case internal::cg_grid: {
+      return (static_cast<const grid_group*>(this)->thread_rank());
+    }
+    default: {
+      return 0; //TODO(mahesha)
+    }
+  }
+}
+
+__CG_QUALIFIER__ bool thread_group::is_valid() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      return (static_cast<const multi_grid_group*>(this)->is_valid());
+    }
+    case internal::cg_grid: {
+      return (static_cast<const grid_group*>(this)->is_valid());
+    }
+    default: {
+      return false;
+    }
+  }
+}
+
+__CG_QUALIFIER__ void thread_group::sync() const {
+  switch (this->_type) {
+    case internal::cg_multi_grid: {
+      static_cast<const multi_grid_group*>(this)->sync();
+      break;
+    }
+    case internal::cg_grid: {
+      static_cast<const grid_group*>(this)->sync();
+      break;
+    }
+  }
+}
+
+} // namespace cooperative_groups
+
+#endif // __cplusplus
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
@@ -0,0 +1,144 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  hcc_detail/hip_cooperative_groups_helper.h
+ *
+ *  @brief Device side implementation of cooperative group feature.
+ *
+ *  Defines helper constructs and APIs which aid the types and device API
+ *  wrappers defined within `hcc_detail/hip_cooperative_groups.h`.
+ */
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+
+#if __cplusplus
+#include <hip/hcc_detail/hip_runtime_api.h>
+#include <hip/hcc_detail/device_functions.h>
+
+#if !defined(__align__)
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+
+#if !defined(__CG_QUALIFIER__)
+#define __CG_QUALIFIER__ __device__ __forceinline__
+#endif
+
+#if !defined(__CG_STATIC_QUALIFIER__)
+#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
+#endif
+
+#if !defined(WAVEFRONT_SIZE)
+#define WAVEFRONT_SIZE 64
+#endif
+
+namespace cooperative_groups {
+
+namespace internal {
+
+/** \brief Enums representing different cooperative group types
+ */
+typedef enum {
+  cg_invalid,
+  cg_multi_grid,
+  cg_grid
+} group_type;
+
+/**
+ *  Functionalities related to multi-grid cooperative group type
+ */
+namespace multi_grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
+  return (uint32_t)__ockl_multi_grid_num_grids();
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
+  return (uint32_t)__ockl_multi_grid_grid_rank();
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return (uint32_t)__ockl_multi_grid_size();
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  return (uint32_t)__ockl_multi_grid_thread_rank();
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+  return (bool)__ockl_multi_grid_is_valid();
+}
+
+__CG_STATIC_QUALIFIER__ void sync() {
+  __ockl_multi_grid_sync();
+}
+
+} // namespace multi_grid
+
+/**
+ *  Functionalities related to grid cooperative group type
+ */
+namespace grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+  return (uint32_t)((hipBlockDim_z * hipGridDim_z) *
+                    (hipBlockDim_y * hipGridDim_y) *
+                    (hipBlockDim_x * hipGridDim_x));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  // Compute global id of the workgroup to which the current threads belongs to
+  uint32_t blkIdx =
+           (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
+                      (hipBlockIdx_y * hipGridDim_x) +
+                      (hipBlockIdx_x));
+
+  // Compute total number of threads being passed to reach current workgroup
+  // within grid
+  uint32_t num_threads_till_current_workgroup =
+           (uint32_t)(blkIdx * (hipBlockIdx_x * hipBlockIdx_y * hipBlockIdx_z));
+
+  // Compute thread local rank within current workgroup
+  uint32_t local_thread_rank =
+           (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
+                      (hipThreadIdx_y * hipBlockDim_x) +
+                      (hipThreadIdx_x));
+
+  return (num_threads_till_current_workgroup + local_thread_rank);
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+  return (bool)__ockl_grid_is_valid();
+}
+
+__CG_STATIC_QUALIFIER__ void sync() {
+  __ockl_grid_sync();
+}
+
+} // namespace grid
+
+} // namespace internal
+
+} // namespace cooperative_groups
+
+#endif // __cplusplus
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
@@ -0,0 +1,43 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  hip_cooperative_groups.h
+ *
+ *  @brief Defines new types and device API wrappers for `Cooperative Group`
+ *  feature.
+ */
+
+#ifndef  HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
+#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#if __cplusplus
+#include <hip/hcc_detail/hip_cooperative_groups.h>
+#endif
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include <cooperative_groups.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+#endif // HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H