Device side support for Cooperative Group feature (#1202)
* first cut of the header implementation of cooperative group feature * add diclarations for device library functions * fixed various compile time issues in the CG headers * enabled copy construction and copy assignment * fixed a minor bug related to conditional compilation macro * fixed few more CG constructor issues and added a unit testcase * fixed typo * extended unit testcase * compute size of partitioned CG from mask * bit of code refactoring * removed boilerplate code * fixed few of the review comments by Brian * Changes to the sigantures of few grid and multi-grid related OCKL functions * changes to declarations of OCKL functions related to CG feature * removed all the block level support as it is not planned for 2.9 * Have taken care of review comments by Brian * Have taken care of review comments by Brian * removed unused functions which were initially intended to use in block level cg support
This commit is contained in:
committed by
Maneesh Gupta
vanhempi
1ab81d9ce6
commit
d75dc4eb29
@@ -748,28 +748,36 @@ unsigned long long int __ballot64(int predicate) {
|
||||
// hip.amdgcn.bc - lanemask
|
||||
__device__
|
||||
inline
|
||||
int64_t __lanemask_gt()
|
||||
uint64_t __lanemask_gt()
|
||||
{
|
||||
int32_t activelane = __ockl_activelane_u32();
|
||||
int64_t ballot = __ballot64(1);
|
||||
if (activelane != 63) {
|
||||
int64_t tmp = (~0ULL) << (activelane + 1);
|
||||
return tmp & ballot;
|
||||
}
|
||||
return 0;
|
||||
uint32_t lane = __ockl_lane_u32();
|
||||
if (lane == 63)
|
||||
return 0;
|
||||
uint64_t ballot = __ballot64(1);
|
||||
uint64_t mask = (~((uint64_t)0)) << (lane + 1);
|
||||
return mask & ballot;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int64_t __lanemask_lt()
|
||||
uint64_t __lanemask_lt()
|
||||
{
|
||||
int32_t activelane = __ockl_activelane_u32();
|
||||
uint32_t lane = __ockl_lane_u32();
|
||||
int64_t ballot = __ballot64(1);
|
||||
if (activelane == 0)
|
||||
return 0;
|
||||
return ballot;
|
||||
uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
|
||||
return mask & ballot;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
uint64_t __lanemask_eq()
|
||||
{
|
||||
uint32_t lane = __ockl_lane_u32();
|
||||
int64_t mask = ((uint64_t)1 << lane);
|
||||
return mask;
|
||||
}
|
||||
|
||||
|
||||
__device__ inline void* __local_to_generic(void* p) { return p; }
|
||||
|
||||
#ifdef __HIP_DEVICE_COMPILE__
|
||||
|
||||
@@ -62,6 +62,16 @@ extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float)
|
||||
extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid);
|
||||
extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid);
|
||||
|
||||
extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32();
|
||||
extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void);
|
||||
extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_grid_rank(void);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
|
||||
extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
|
||||
extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
|
||||
|
||||
|
||||
// Introduce local address space
|
||||
#define __local __attribute__((address_space(3)))
|
||||
|
||||
@@ -0,0 +1,216 @@
|
||||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file hcc_detail/hip_cooperative_groups.h
|
||||
*
|
||||
* @brief Device side implementation of `Cooperative Group` feature.
|
||||
*
|
||||
* Defines new types and device API wrappers related to `Cooperative Group`
|
||||
* feature, which the programmer can directly use in his kernel(s) in order to
|
||||
* make use of this feature.
|
||||
*/
|
||||
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
|
||||
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
|
||||
|
||||
#if __cplusplus
|
||||
#include <hip/hcc_detail/hip_cooperative_groups_helper.h>
|
||||
|
||||
namespace cooperative_groups {
|
||||
|
||||
/** \brief The base type of all cooperative group types
|
||||
*
|
||||
* \details Holds the key properties of a constructed cooperative group type
|
||||
* object, like the group type, its size, etc
|
||||
*/
|
||||
class thread_group {
|
||||
protected:
|
||||
uint32_t _type; // thread_group type
|
||||
uint32_t _size; // total number of threads in the tread_group
|
||||
uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
|
||||
// LSB represents lane 0, and MSB represents lane 63
|
||||
|
||||
// Construct a thread group, and set thread group type and other essential
|
||||
// thread group properties. This generic thread group is directly constructed
|
||||
// only when the group is supposed to contain only the calling the thread
|
||||
// (throurh the API - `this_thread()`), and in all other cases, this thread
|
||||
// group object is a sub-object of some other derived thread group object
|
||||
__CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
|
||||
uint64_t mask = (uint64_t)0) {
|
||||
_type = type;
|
||||
_size = size;
|
||||
_mask = mask;
|
||||
}
|
||||
|
||||
public:
|
||||
// Total number of threads in the thread group, and this serves the purpose
|
||||
// for all derived cooperative group types since their `size` is directly
|
||||
// saved during the construction
|
||||
__CG_QUALIFIER__ uint32_t size() const {
|
||||
return _size;
|
||||
}
|
||||
// Rank of the calling thread within [0, size())
|
||||
__CG_QUALIFIER__ uint32_t thread_rank() const;
|
||||
// Is this cooperative group type valid?
|
||||
__CG_QUALIFIER__ bool is_valid() const;
|
||||
// synchronize the threads in the thread group
|
||||
__CG_QUALIFIER__ void sync() const;
|
||||
};
|
||||
|
||||
/** \brief The multi-grid cooperative group type
|
||||
*
|
||||
* \details Represents an inter-device cooperative group type where the
|
||||
* participating threads within the group spans across multple
|
||||
* devices, running the (same) kernel on these devices
|
||||
*/
|
||||
class multi_grid_group : public thread_group {
|
||||
// Only these friend functions are allowed to construct an object of this class
|
||||
// and access its resources
|
||||
friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
|
||||
|
||||
protected:
|
||||
// Construct mutli-grid thread group (through the API this_multi_grid())
|
||||
explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
|
||||
: thread_group(internal::cg_multi_grid, size) { }
|
||||
|
||||
public:
|
||||
// Number of invocations participating in this multi-grid group. In other
|
||||
// words, the number of GPUs
|
||||
__CG_QUALIFIER__ uint32_t num_grids() {
|
||||
return internal::multi_grid::num_grids();
|
||||
}
|
||||
// Rank of this invocation. In other words, an ID number within the range
|
||||
// [0, num_grids()) of the GPU, this kernel is running on
|
||||
__CG_QUALIFIER__ uint32_t grid_rank() {
|
||||
return internal::multi_grid::grid_rank();
|
||||
}
|
||||
__CG_QUALIFIER__ uint32_t thread_rank() const {
|
||||
return internal::multi_grid::thread_rank();
|
||||
}
|
||||
__CG_QUALIFIER__ bool is_valid() const {
|
||||
return internal::multi_grid::is_valid();
|
||||
}
|
||||
__CG_QUALIFIER__ void sync() const {
|
||||
internal::multi_grid::sync();
|
||||
}
|
||||
};
|
||||
|
||||
/** \brief User exposed API interface to construct multi-grid cooperative
|
||||
* group type object - `multi_grid_group`
|
||||
*
|
||||
* \details User is not allowed to directly construct an object of type
|
||||
* `multi_grid_group`. Instead, he should construct it through this
|
||||
* API function
|
||||
*/
|
||||
__CG_QUALIFIER__ multi_grid_group
|
||||
this_multi_grid() {
|
||||
return multi_grid_group(internal::multi_grid::size());
|
||||
}
|
||||
|
||||
/** \brief The grid cooperative group type
|
||||
*
|
||||
* \details Represents an inter-workgroup cooperative group type where the
|
||||
* participating threads within the group spans across multiple
|
||||
* workgroups running the (same) kernel on the same device
|
||||
*/
|
||||
class grid_group : public thread_group {
|
||||
// Only these friend functions are allowed to construct an object of this class
|
||||
// and access its resources
|
||||
friend __CG_QUALIFIER__ grid_group this_grid();
|
||||
|
||||
protected:
|
||||
// Construct grid thread group (through the API this_grid())
|
||||
explicit __CG_QUALIFIER__ grid_group(uint32_t size)
|
||||
: thread_group(internal::cg_grid, size) { }
|
||||
|
||||
public:
|
||||
__CG_QUALIFIER__ uint32_t thread_rank() const {
|
||||
return internal::grid::thread_rank();
|
||||
}
|
||||
__CG_QUALIFIER__ bool is_valid() const {
|
||||
return internal::grid::is_valid();
|
||||
}
|
||||
__CG_QUALIFIER__ void sync() const {
|
||||
internal::grid::sync();
|
||||
}
|
||||
};
|
||||
|
||||
/** \brief User exposed API interface to construct grid cooperative group type
|
||||
* object - `grid_group`
|
||||
*
|
||||
* \details User is not allowed to directly construct an object of type
|
||||
* `multi_grid_group`. Instead, he should construct it through this
|
||||
* API function
|
||||
*/
|
||||
__CG_QUALIFIER__ grid_group
|
||||
this_grid() {
|
||||
return grid_group(internal::grid::size());
|
||||
}
|
||||
|
||||
/**
|
||||
* Implemenation of all publicly exposed base class APIs
|
||||
*/
|
||||
__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
|
||||
switch (this->_type) {
|
||||
case internal::cg_multi_grid: {
|
||||
return (static_cast<const multi_grid_group*>(this)->thread_rank());
|
||||
}
|
||||
case internal::cg_grid: {
|
||||
return (static_cast<const grid_group*>(this)->thread_rank());
|
||||
}
|
||||
default: {
|
||||
return 0; //TODO(mahesha)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__CG_QUALIFIER__ bool thread_group::is_valid() const {
|
||||
switch (this->_type) {
|
||||
case internal::cg_multi_grid: {
|
||||
return (static_cast<const multi_grid_group*>(this)->is_valid());
|
||||
}
|
||||
case internal::cg_grid: {
|
||||
return (static_cast<const grid_group*>(this)->is_valid());
|
||||
}
|
||||
default: {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__CG_QUALIFIER__ void thread_group::sync() const {
|
||||
switch (this->_type) {
|
||||
case internal::cg_multi_grid: {
|
||||
static_cast<const multi_grid_group*>(this)->sync();
|
||||
break;
|
||||
}
|
||||
case internal::cg_grid: {
|
||||
static_cast<const grid_group*>(this)->sync();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace cooperative_groups
|
||||
|
||||
#endif // __cplusplus
|
||||
#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
|
||||
@@ -0,0 +1,144 @@
|
||||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file hcc_detail/hip_cooperative_groups_helper.h
|
||||
*
|
||||
* @brief Device side implementation of cooperative group feature.
|
||||
*
|
||||
* Defines helper constructs and APIs which aid the types and device API
|
||||
* wrappers defined within `hcc_detail/hip_cooperative_groups.h`.
|
||||
*/
|
||||
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
|
||||
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
|
||||
|
||||
#if __cplusplus
|
||||
#include <hip/hcc_detail/hip_runtime_api.h>
|
||||
#include <hip/hcc_detail/device_functions.h>
|
||||
|
||||
#if !defined(__align__)
|
||||
#define __align__(x) __attribute__((aligned(x)))
|
||||
#endif
|
||||
|
||||
#if !defined(__CG_QUALIFIER__)
|
||||
#define __CG_QUALIFIER__ __device__ __forceinline__
|
||||
#endif
|
||||
|
||||
#if !defined(__CG_STATIC_QUALIFIER__)
|
||||
#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
|
||||
#endif
|
||||
|
||||
#if !defined(WAVEFRONT_SIZE)
|
||||
#define WAVEFRONT_SIZE 64
|
||||
#endif
|
||||
|
||||
namespace cooperative_groups {
|
||||
|
||||
namespace internal {
|
||||
|
||||
/** \brief Enums representing different cooperative group types
|
||||
*/
|
||||
typedef enum {
|
||||
cg_invalid,
|
||||
cg_multi_grid,
|
||||
cg_grid
|
||||
} group_type;
|
||||
|
||||
/**
|
||||
* Functionalities related to multi-grid cooperative group type
|
||||
*/
|
||||
namespace multi_grid {
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
|
||||
return (uint32_t)__ockl_multi_grid_num_grids();
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
|
||||
return (uint32_t)__ockl_multi_grid_grid_rank();
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t size() {
|
||||
return (uint32_t)__ockl_multi_grid_size();
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
|
||||
return (uint32_t)__ockl_multi_grid_thread_rank();
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ bool is_valid() {
|
||||
return (bool)__ockl_multi_grid_is_valid();
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void sync() {
|
||||
__ockl_multi_grid_sync();
|
||||
}
|
||||
|
||||
} // namespace multi_grid
|
||||
|
||||
/**
|
||||
* Functionalities related to grid cooperative group type
|
||||
*/
|
||||
namespace grid {
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t size() {
|
||||
return (uint32_t)((hipBlockDim_z * hipGridDim_z) *
|
||||
(hipBlockDim_y * hipGridDim_y) *
|
||||
(hipBlockDim_x * hipGridDim_x));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
|
||||
// Compute global id of the workgroup to which the current threads belongs to
|
||||
uint32_t blkIdx =
|
||||
(uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
|
||||
(hipBlockIdx_y * hipGridDim_x) +
|
||||
(hipBlockIdx_x));
|
||||
|
||||
// Compute total number of threads being passed to reach current workgroup
|
||||
// within grid
|
||||
uint32_t num_threads_till_current_workgroup =
|
||||
(uint32_t)(blkIdx * (hipBlockIdx_x * hipBlockIdx_y * hipBlockIdx_z));
|
||||
|
||||
// Compute thread local rank within current workgroup
|
||||
uint32_t local_thread_rank =
|
||||
(uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
|
||||
(hipThreadIdx_y * hipBlockDim_x) +
|
||||
(hipThreadIdx_x));
|
||||
|
||||
return (num_threads_till_current_workgroup + local_thread_rank);
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ bool is_valid() {
|
||||
return (bool)__ockl_grid_is_valid();
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void sync() {
|
||||
__ockl_grid_sync();
|
||||
}
|
||||
|
||||
} // namespace grid
|
||||
|
||||
} // namespace internal
|
||||
|
||||
} // namespace cooperative_groups
|
||||
|
||||
#endif // __cplusplus
|
||||
#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
|
||||
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file hip_cooperative_groups.h
|
||||
*
|
||||
* @brief Defines new types and device API wrappers for `Cooperative Group`
|
||||
* feature.
|
||||
*/
|
||||
|
||||
#ifndef HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
|
||||
#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
|
||||
#if __cplusplus
|
||||
#include <hip/hcc_detail/hip_cooperative_groups.h>
|
||||
#endif
|
||||
#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
|
||||
#include <cooperative_groups.h>
|
||||
#else
|
||||
#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
|
||||
#endif
|
||||
|
||||
#endif // HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
|
||||
Viittaa uudesa ongelmassa
Block a user