From d75dc4eb29220434fd8a2cc03f7410abfab18602 Mon Sep 17 00:00:00 2001 From: mshivama <47909405+mshivama@users.noreply.github.com> Date: Thu, 29 Aug 2019 06:31:25 +0530 Subject: [PATCH] Device side support for Cooperative Group feature (#1202) * first cut of the header implementation of cooperative group feature * add diclarations for device library functions * fixed various compile time issues in the CG headers * enabled copy construction and copy assignment * fixed a minor bug related to conditional compilation macro * fixed few more CG constructor issues and added a unit testcase * fixed typo * extended unit testcase * compute size of partitioned CG from mask * bit of code refactoring * removed boilerplate code * fixed few of the review comments by Brian * Changes to the sigantures of few grid and multi-grid related OCKL functions * changes to declarations of OCKL functions related to CG feature * removed all the block level support as it is not planned for 2.9 * Have taken care of review comments by Brian * Have taken care of review comments by Brian * removed unused functions which were initially intended to use in block level cg support --- include/hip/hcc_detail/device_functions.h | 34 +-- include/hip/hcc_detail/device_library_decls.h | 10 + .../hip/hcc_detail/hip_cooperative_groups.h | 216 ++++++++++++++++++ .../hip_cooperative_groups_helper.h | 144 ++++++++++++ include/hip/hip_cooperative_groups.h | 43 ++++ 5 files changed, 434 insertions(+), 13 deletions(-) create mode 100644 include/hip/hcc_detail/hip_cooperative_groups.h create mode 100644 include/hip/hcc_detail/hip_cooperative_groups_helper.h create mode 100644 include/hip/hip_cooperative_groups.h diff --git a/include/hip/hcc_detail/device_functions.h b/include/hip/hcc_detail/device_functions.h index 684449aa6b..5ad1ced41c 100644 --- a/include/hip/hcc_detail/device_functions.h +++ b/include/hip/hcc_detail/device_functions.h @@ -748,28 +748,36 @@ unsigned long long int __ballot64(int predicate) { // hip.amdgcn.bc - lanemask __device__ inline -int64_t __lanemask_gt() +uint64_t __lanemask_gt() { - int32_t activelane = __ockl_activelane_u32(); - int64_t ballot = __ballot64(1); - if (activelane != 63) { - int64_t tmp = (~0ULL) << (activelane + 1); - return tmp & ballot; - } - return 0; + uint32_t lane = __ockl_lane_u32(); + if (lane == 63) + return 0; + uint64_t ballot = __ballot64(1); + uint64_t mask = (~((uint64_t)0)) << (lane + 1); + return mask & ballot; } __device__ inline -int64_t __lanemask_lt() +uint64_t __lanemask_lt() { - int32_t activelane = __ockl_activelane_u32(); + uint32_t lane = __ockl_lane_u32(); int64_t ballot = __ballot64(1); - if (activelane == 0) - return 0; - return ballot; + uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1; + return mask & ballot; } +__device__ +inline +uint64_t __lanemask_eq() +{ + uint32_t lane = __ockl_lane_u32(); + int64_t mask = ((uint64_t)1 << lane); + return mask; +} + + __device__ inline void* __local_to_generic(void* p) { return p; } #ifdef __HIP_DEVICE_COMPILE__ diff --git a/include/hip/hcc_detail/device_library_decls.h b/include/hip/hcc_detail/device_library_decls.h index 9a7636fc4a..8cfb020caa 100644 --- a/include/hip/hcc_detail/device_library_decls.h +++ b/include/hip/hcc_detail/device_library_decls.h @@ -62,6 +62,16 @@ extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float) extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid); extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid); +extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32(); +extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void); +extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void); +extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void); +extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_grid_rank(void); +extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void); +extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void); +extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void); +extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void); + // Introduce local address space #define __local __attribute__((address_space(3))) diff --git a/include/hip/hcc_detail/hip_cooperative_groups.h b/include/hip/hcc_detail/hip_cooperative_groups.h new file mode 100644 index 0000000000..27ce887723 --- /dev/null +++ b/include/hip/hcc_detail/hip_cooperative_groups.h @@ -0,0 +1,216 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @file hcc_detail/hip_cooperative_groups.h + * + * @brief Device side implementation of `Cooperative Group` feature. + * + * Defines new types and device API wrappers related to `Cooperative Group` + * feature, which the programmer can directly use in his kernel(s) in order to + * make use of this feature. + */ +#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H +#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H + +#if __cplusplus +#include + +namespace cooperative_groups { + +/** \brief The base type of all cooperative group types + * + * \details Holds the key properties of a constructed cooperative group type + * object, like the group type, its size, etc + */ +class thread_group { + protected: + uint32_t _type; // thread_group type + uint32_t _size; // total number of threads in the tread_group + uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types, + // LSB represents lane 0, and MSB represents lane 63 + + // Construct a thread group, and set thread group type and other essential + // thread group properties. This generic thread group is directly constructed + // only when the group is supposed to contain only the calling the thread + // (throurh the API - `this_thread()`), and in all other cases, this thread + // group object is a sub-object of some other derived thread group object + __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size, + uint64_t mask = (uint64_t)0) { + _type = type; + _size = size; + _mask = mask; + } + + public: + // Total number of threads in the thread group, and this serves the purpose + // for all derived cooperative group types since their `size` is directly + // saved during the construction + __CG_QUALIFIER__ uint32_t size() const { + return _size; + } + // Rank of the calling thread within [0, size()) + __CG_QUALIFIER__ uint32_t thread_rank() const; + // Is this cooperative group type valid? + __CG_QUALIFIER__ bool is_valid() const; + // synchronize the threads in the thread group + __CG_QUALIFIER__ void sync() const; +}; + +/** \brief The multi-grid cooperative group type + * + * \details Represents an inter-device cooperative group type where the + * participating threads within the group spans across multple + * devices, running the (same) kernel on these devices + */ +class multi_grid_group : public thread_group { + // Only these friend functions are allowed to construct an object of this class + // and access its resources + friend __CG_QUALIFIER__ multi_grid_group this_multi_grid(); + + protected: + // Construct mutli-grid thread group (through the API this_multi_grid()) + explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size) + : thread_group(internal::cg_multi_grid, size) { } + + public: + // Number of invocations participating in this multi-grid group. In other + // words, the number of GPUs + __CG_QUALIFIER__ uint32_t num_grids() { + return internal::multi_grid::num_grids(); + } + // Rank of this invocation. In other words, an ID number within the range + // [0, num_grids()) of the GPU, this kernel is running on + __CG_QUALIFIER__ uint32_t grid_rank() { + return internal::multi_grid::grid_rank(); + } + __CG_QUALIFIER__ uint32_t thread_rank() const { + return internal::multi_grid::thread_rank(); + } + __CG_QUALIFIER__ bool is_valid() const { + return internal::multi_grid::is_valid(); + } + __CG_QUALIFIER__ void sync() const { + internal::multi_grid::sync(); + } +}; + +/** \brief User exposed API interface to construct multi-grid cooperative + * group type object - `multi_grid_group` + * + * \details User is not allowed to directly construct an object of type + * `multi_grid_group`. Instead, he should construct it through this + * API function + */ +__CG_QUALIFIER__ multi_grid_group +this_multi_grid() { + return multi_grid_group(internal::multi_grid::size()); +} + +/** \brief The grid cooperative group type + * + * \details Represents an inter-workgroup cooperative group type where the + * participating threads within the group spans across multiple + * workgroups running the (same) kernel on the same device + */ +class grid_group : public thread_group { + // Only these friend functions are allowed to construct an object of this class + // and access its resources + friend __CG_QUALIFIER__ grid_group this_grid(); + + protected: + // Construct grid thread group (through the API this_grid()) + explicit __CG_QUALIFIER__ grid_group(uint32_t size) + : thread_group(internal::cg_grid, size) { } + + public: + __CG_QUALIFIER__ uint32_t thread_rank() const { + return internal::grid::thread_rank(); + } + __CG_QUALIFIER__ bool is_valid() const { + return internal::grid::is_valid(); + } + __CG_QUALIFIER__ void sync() const { + internal::grid::sync(); + } +}; + +/** \brief User exposed API interface to construct grid cooperative group type + * object - `grid_group` + * + * \details User is not allowed to directly construct an object of type + * `multi_grid_group`. Instead, he should construct it through this + * API function + */ +__CG_QUALIFIER__ grid_group +this_grid() { + return grid_group(internal::grid::size()); +} + +/** + * Implemenation of all publicly exposed base class APIs + */ +__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const { + switch (this->_type) { + case internal::cg_multi_grid: { + return (static_cast(this)->thread_rank()); + } + case internal::cg_grid: { + return (static_cast(this)->thread_rank()); + } + default: { + return 0; //TODO(mahesha) + } + } +} + +__CG_QUALIFIER__ bool thread_group::is_valid() const { + switch (this->_type) { + case internal::cg_multi_grid: { + return (static_cast(this)->is_valid()); + } + case internal::cg_grid: { + return (static_cast(this)->is_valid()); + } + default: { + return false; + } + } +} + +__CG_QUALIFIER__ void thread_group::sync() const { + switch (this->_type) { + case internal::cg_multi_grid: { + static_cast(this)->sync(); + break; + } + case internal::cg_grid: { + static_cast(this)->sync(); + break; + } + } +} + +} // namespace cooperative_groups + +#endif // __cplusplus +#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H diff --git a/include/hip/hcc_detail/hip_cooperative_groups_helper.h b/include/hip/hcc_detail/hip_cooperative_groups_helper.h new file mode 100644 index 0000000000..b74d16d23b --- /dev/null +++ b/include/hip/hcc_detail/hip_cooperative_groups_helper.h @@ -0,0 +1,144 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @file hcc_detail/hip_cooperative_groups_helper.h + * + * @brief Device side implementation of cooperative group feature. + * + * Defines helper constructs and APIs which aid the types and device API + * wrappers defined within `hcc_detail/hip_cooperative_groups.h`. + */ +#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H +#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H + +#if __cplusplus +#include +#include + +#if !defined(__align__) +#define __align__(x) __attribute__((aligned(x))) +#endif + +#if !defined(__CG_QUALIFIER__) +#define __CG_QUALIFIER__ __device__ __forceinline__ +#endif + +#if !defined(__CG_STATIC_QUALIFIER__) +#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__ +#endif + +#if !defined(WAVEFRONT_SIZE) +#define WAVEFRONT_SIZE 64 +#endif + +namespace cooperative_groups { + +namespace internal { + +/** \brief Enums representing different cooperative group types + */ +typedef enum { + cg_invalid, + cg_multi_grid, + cg_grid +} group_type; + +/** + * Functionalities related to multi-grid cooperative group type + */ +namespace multi_grid { + +__CG_STATIC_QUALIFIER__ uint32_t num_grids() { + return (uint32_t)__ockl_multi_grid_num_grids(); +} + +__CG_STATIC_QUALIFIER__ uint32_t grid_rank() { + return (uint32_t)__ockl_multi_grid_grid_rank(); +} + +__CG_STATIC_QUALIFIER__ uint32_t size() { + return (uint32_t)__ockl_multi_grid_size(); +} + +__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { + return (uint32_t)__ockl_multi_grid_thread_rank(); +} + +__CG_STATIC_QUALIFIER__ bool is_valid() { + return (bool)__ockl_multi_grid_is_valid(); +} + +__CG_STATIC_QUALIFIER__ void sync() { + __ockl_multi_grid_sync(); +} + +} // namespace multi_grid + +/** + * Functionalities related to grid cooperative group type + */ +namespace grid { + +__CG_STATIC_QUALIFIER__ uint32_t size() { + return (uint32_t)((hipBlockDim_z * hipGridDim_z) * + (hipBlockDim_y * hipGridDim_y) * + (hipBlockDim_x * hipGridDim_x)); +} + +__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { + // Compute global id of the workgroup to which the current threads belongs to + uint32_t blkIdx = + (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) + + (hipBlockIdx_y * hipGridDim_x) + + (hipBlockIdx_x)); + + // Compute total number of threads being passed to reach current workgroup + // within grid + uint32_t num_threads_till_current_workgroup = + (uint32_t)(blkIdx * (hipBlockIdx_x * hipBlockIdx_y * hipBlockIdx_z)); + + // Compute thread local rank within current workgroup + uint32_t local_thread_rank = + (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) + + (hipThreadIdx_y * hipBlockDim_x) + + (hipThreadIdx_x)); + + return (num_threads_till_current_workgroup + local_thread_rank); +} + +__CG_STATIC_QUALIFIER__ bool is_valid() { + return (bool)__ockl_grid_is_valid(); +} + +__CG_STATIC_QUALIFIER__ void sync() { + __ockl_grid_sync(); +} + +} // namespace grid + +} // namespace internal + +} // namespace cooperative_groups + +#endif // __cplusplus +#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H diff --git a/include/hip/hip_cooperative_groups.h b/include/hip/hip_cooperative_groups.h new file mode 100644 index 0000000000..d919e83c7f --- /dev/null +++ b/include/hip/hip_cooperative_groups.h @@ -0,0 +1,43 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** + * @file hip_cooperative_groups.h + * + * @brief Defines new types and device API wrappers for `Cooperative Group` + * feature. + */ + +#ifndef HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H +#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H + +#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__) +#if __cplusplus +#include +#endif +#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__) +#include +#else +#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__"); +#endif + +#endif // HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H