d37af80d7e
* add gfx1100 support
Add support for Radeon 7900 GPUs (RX and PRO), and 7800 PRO.
I was contemplating to add gfx1101 and gfx1102 GPUs as well, but those are the lower end models that are more unlikely to be used for compute intensive jobs. In addition, I do not have access to them to test the support.
* update WF_SIZe for different options
Radeon systems use a WarpSize of 32, unlike current Instinct systems,
which use a warp size of 64. For the device side, a gfx specific ifdef
is sufficient. For the host side, we need to query the device
properties.
* adjust functional tests to wf_size of 32
* update unit tests to handle wf_size of 32
* address reviewer comments
[ROCm/rocshmem commit: d0c2845031]
352 líneas
12 KiB
C++
352 líneas
12 KiB
C++
/******************************************************************************
|
|
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
|
*
|
|
* SPDX-License-Identifier: MIT
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
* of this software and associated documentation files (the "Software"), to
|
|
* deal in the Software without restriction, including without limitation the
|
|
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
|
* sell copies of the Software, and to permit persons to whom the Software is
|
|
* furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
* IN THE SOFTWARE.
|
|
*****************************************************************************/
|
|
|
|
#ifndef ROCSHMEM_BITWISE_GTEST_HPP
|
|
#define ROCSHMEM_BITWISE_GTEST_HPP
|
|
|
|
#define HIP_ENABLE_PRINTF
|
|
|
|
#include "gtest/gtest.h"
|
|
#include "wf_size.hpp"
|
|
|
|
#include "../src/memory/hip_allocator.hpp"
|
|
#include "containers/matrix.hpp"
|
|
#include "containers/share_strategy.hpp"
|
|
#include "containers/strategies.hpp"
|
|
|
|
#include <hip/hip_runtime.h>
|
|
#include <cassert>
|
|
|
|
namespace rocshmem {
|
|
|
|
/*****************************************************************************
|
|
************************ WarpMatrix Type Helpers ****************************
|
|
*****************************************************************************/
|
|
typedef Matrix<uint64_t> WarpMatrix;
|
|
|
|
/*****************************************************************************
|
|
***************************** Device Methods ********************************
|
|
*****************************************************************************/
|
|
class BitwiseDeviceMethods
|
|
{
|
|
public:
|
|
/*************************************************************************
|
|
************************* Block Strategy Methods ************************
|
|
*************************************************************************/
|
|
__device__
|
|
void
|
|
lowest_active_lane(WarpMatrix *warp_matrix,
|
|
size_t lanes_bitfield)
|
|
{
|
|
Block block {};
|
|
if (activate_lane_helper(lanes_bitfield)) {
|
|
auto low_lane = block.lowest_active_lane();
|
|
size_t warp_index = hipThreadIdx_x / WF_SIZE;
|
|
size_t block_index = hipBlockIdx_x;
|
|
auto *elem = warp_matrix->access(warp_index, block_index);
|
|
*elem = low_lane;
|
|
}
|
|
}
|
|
|
|
__device__
|
|
void
|
|
is_lowest_active_lane(WarpMatrix *warp_matrix,
|
|
size_t lanes_bitfield)
|
|
{
|
|
Block block {};
|
|
if (activate_lane_helper(lanes_bitfield)) {
|
|
if (block.is_lowest_active_lane()) {
|
|
size_t warp_index = hipThreadIdx_x / WF_SIZE;
|
|
size_t block_index = hipBlockIdx_x;
|
|
auto *elem = warp_matrix->access(warp_index, block_index);
|
|
*elem = block.lane_id();
|
|
}
|
|
}
|
|
}
|
|
|
|
__device__
|
|
void
|
|
active_logical_lane_id_2(WarpMatrix *warp_matrix,
|
|
size_t lanes_bitfield)
|
|
{
|
|
Block block {};
|
|
if (activate_lane_helper(lanes_bitfield)) {
|
|
if (block.active_logical_lane_id() == 2) {
|
|
size_t warp_index = hipThreadIdx_x / WF_SIZE;
|
|
size_t block_index = hipBlockIdx_x;
|
|
auto *elem = warp_matrix->access(warp_index, block_index);
|
|
*elem = block.lane_id();
|
|
}
|
|
}
|
|
}
|
|
|
|
__device__
|
|
void
|
|
lane_id(WarpMatrix *warp_matrix,
|
|
size_t lanes_bitfield)
|
|
{
|
|
Block block {};
|
|
if (activate_lane_helper(lanes_bitfield)) {
|
|
auto lane_id = block.lane_id();
|
|
size_t warp_index = hipThreadIdx_x / WF_SIZE;
|
|
size_t block_index = hipBlockIdx_x;
|
|
auto *elem = warp_matrix->access(warp_index, block_index);
|
|
*elem = lane_id;
|
|
}
|
|
}
|
|
|
|
__device__
|
|
void
|
|
number_active_lanes(WarpMatrix *warp_matrix,
|
|
size_t lanes_bitfield)
|
|
{
|
|
Block block {};
|
|
if (activate_lane_helper(lanes_bitfield)) {
|
|
auto number_active_lanes = block.number_active_lanes();
|
|
size_t warp_index = hipThreadIdx_x / WF_SIZE;
|
|
size_t block_index = hipBlockIdx_x;
|
|
auto *elem = warp_matrix->access(warp_index, block_index);
|
|
*elem = number_active_lanes;
|
|
}
|
|
}
|
|
|
|
__device__
|
|
void
|
|
broadcast_up_value_42(WarpMatrix *warp_matrix,
|
|
size_t lanes_bitfield)
|
|
{
|
|
Block block {};
|
|
if (activate_lane_helper(lanes_bitfield)) {
|
|
uint64_t value = 1;
|
|
if (block.is_lowest_active_lane()) {
|
|
value = 42;
|
|
}
|
|
value = block.broadcast_up(value);
|
|
size_t warp_index = hipThreadIdx_x / WF_SIZE;
|
|
size_t block_index = hipBlockIdx_x;
|
|
auto *elem = warp_matrix->access(warp_index, block_index);
|
|
*elem = value;
|
|
}
|
|
}
|
|
|
|
__device__
|
|
void
|
|
fetch_incr_lowest_active_lane(WarpMatrix *warp_matrix,
|
|
size_t lanes_bitfield)
|
|
{
|
|
Block block {};
|
|
if (activate_lane_helper(lanes_bitfield)) {
|
|
auto orig = block.fetch_incr(_fetch_value);
|
|
if (block.is_lowest_active_lane()) {
|
|
size_t warp_index = hipThreadIdx_x / WF_SIZE;
|
|
size_t block_index = hipBlockIdx_x;
|
|
auto *elem = warp_matrix->access(warp_index, block_index);
|
|
*elem = orig;
|
|
}
|
|
}
|
|
}
|
|
|
|
__device__
|
|
void
|
|
fetch_incr_active_logical_lane_1(WarpMatrix *warp_matrix,
|
|
size_t lanes_bitfield)
|
|
{
|
|
Block block {};
|
|
if (activate_lane_helper(lanes_bitfield)) {
|
|
auto orig = block.fetch_incr(_fetch_value);
|
|
if (block.active_logical_lane_id() == 1) {
|
|
size_t warp_index = hipThreadIdx_x / WF_SIZE;
|
|
size_t block_index = hipBlockIdx_x;
|
|
auto *elem = warp_matrix->access(warp_index, block_index);
|
|
*elem = orig;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*************************************************************************
|
|
************************* Helper Methods ********************************
|
|
*************************************************************************/
|
|
__device__
|
|
bool
|
|
activate_lane_helper(uint64_t lanes_bitfield)
|
|
{
|
|
/*
|
|
* In the following example, assume the following values:
|
|
* hipThreadIdx_x := 66
|
|
* _warp_size := 64.
|
|
*
|
|
* index (tens): 0 0 0 0 0 0 0 ... 6 6 . .
|
|
* (ones): 0 1 2 3 4 5 6 ... 2 3 . .
|
|
* lanes_bitfield: [1 0 1 0 1 0 1 ... 1 0 . .]
|
|
*
|
|
* Example:
|
|
* warp_bit_id := hipThreadIdx_x % _warp_size;
|
|
* warp_bit_id := 66 % 64
|
|
* warp_bit_id := 2
|
|
*/
|
|
uint64_t warp_bit_id = hipThreadIdx_x % WF_SIZE;
|
|
|
|
/*
|
|
* Example (continued):
|
|
* warp_bitmask := 1 << 2
|
|
* index (tens): 0 0 0 0 0 0 0 ... 6 6 . .
|
|
* (ones): 0 1 2 3 4 5 6 ... 2 3 . .
|
|
* warp_bitmask: [0 0 1 0 0 0 0 ... 0 0 . .]
|
|
*/
|
|
uint64_t my_warp_bitmask_id = 1UL << warp_bit_id;
|
|
|
|
/*
|
|
* Example (continued):
|
|
* index (tens): 0 0 0 0 0 0 0 ... 6 6 . .
|
|
* (ones): 0 1 2 3 4 5 6 ... 2 3 . .
|
|
* lanes_bitfield: [1 0 1 0 1 0 1 ... 1 0 . .]
|
|
* warp_bitmask: [0 0 1 0 0 0 0 ... 0 0 . .]
|
|
*/
|
|
bool is_an_active_lane = lanes_bitfield & my_warp_bitmask_id;
|
|
|
|
return is_an_active_lane;
|
|
}
|
|
|
|
long long unsigned *_fetch_value = nullptr;
|
|
};
|
|
|
|
/*****************************************************************************
|
|
***************************** Test Fixture **********************************
|
|
*****************************************************************************/
|
|
class BitwiseTestFixture : public ::testing::Test
|
|
{
|
|
public:
|
|
BitwiseTestFixture() = default;
|
|
|
|
~BitwiseTestFixture()
|
|
{
|
|
if (_device_methods) {
|
|
if (_device_methods->_fetch_value) {
|
|
_hip_allocator.deallocate(_device_methods->_fetch_value);
|
|
}
|
|
_hip_allocator.deallocate(_device_methods);
|
|
}
|
|
if (_warp_matrix) {
|
|
_hip_allocator.deallocate(_warp_matrix);
|
|
}
|
|
}
|
|
|
|
/*************************************************************************
|
|
**************************** Setup Methods ******************************
|
|
*************************************************************************/
|
|
void
|
|
setup_fixture(dim3 block_dim, dim3 grid_dim)
|
|
{
|
|
_hip_block_dim = block_dim;
|
|
_hip_grid_dim = grid_dim;
|
|
_wf_size = get_wf_size();
|
|
|
|
assert(_device_methods == nullptr);
|
|
_hip_allocator.allocate(reinterpret_cast<void**>(&_device_methods),
|
|
sizeof(BitwiseDeviceMethods));
|
|
|
|
assert(_device_methods);
|
|
|
|
_hip_allocator.allocate(reinterpret_cast<void**>(&_device_methods->_fetch_value),
|
|
sizeof(long long unsigned));
|
|
|
|
assert(_device_methods->_fetch_value);
|
|
|
|
*_device_methods->_fetch_value = 0;
|
|
|
|
assert(_warp_matrix == nullptr);
|
|
_hip_allocator.allocate(reinterpret_cast<void**>(&_warp_matrix),
|
|
sizeof(WarpMatrix));
|
|
|
|
size_t warps_per_block = ceil(float(_hip_block_dim.x) / WF_SIZE);
|
|
|
|
const ObjectStrategy *default_object_strategy =
|
|
DefaultObjectStrategy::instance()->get();
|
|
|
|
assert(_warp_matrix);
|
|
new (_warp_matrix) WarpMatrix(warps_per_block,
|
|
_hip_grid_dim.x,
|
|
_hip_allocator,
|
|
*default_object_strategy);
|
|
}
|
|
|
|
void
|
|
zero_warp_matrix()
|
|
{
|
|
for (size_t row = 0; row < _warp_matrix->rows(); row++) {
|
|
for (size_t col = 0; col < _warp_matrix->columns(); col++) {
|
|
auto *entry = _warp_matrix->access(row, col);
|
|
*entry = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
verify_zeroed_warp_matrix()
|
|
{
|
|
for (size_t row = 0; row < _warp_matrix->rows(); row++) {
|
|
for (size_t col = 0; col < _warp_matrix->columns(); col++) {
|
|
auto *entry = _warp_matrix->access(row, col);
|
|
ASSERT_EQ(*entry, 0);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*************************************************************************
|
|
*********************** Kernel Launch Methods ***************************
|
|
*************************************************************************/
|
|
void
|
|
host_run_device_kernel(void(*fn)(BitwiseDeviceMethods*,
|
|
WarpMatrix*,
|
|
size_t),
|
|
size_t activate_lanes_bitfield)
|
|
{
|
|
hipLaunchKernelGGL(fn,
|
|
_hip_grid_dim,
|
|
_hip_block_dim,
|
|
0,
|
|
nullptr,
|
|
_device_methods,
|
|
_warp_matrix,
|
|
activate_lanes_bitfield);
|
|
|
|
CHECK_HIP(hipStreamSynchronize(nullptr));
|
|
}
|
|
|
|
protected:
|
|
/*************************************************************************
|
|
********************** Implementation Variables *************************
|
|
*************************************************************************/
|
|
dim3 _hip_block_dim {};
|
|
dim3 _hip_grid_dim {};
|
|
HIPAllocator _hip_allocator {};
|
|
WarpMatrix *_warp_matrix = nullptr;
|
|
BitwiseDeviceMethods *_device_methods = nullptr;
|
|
int _wf_size;
|
|
};
|
|
|
|
} // namespace rocshmem
|
|
|
|
#endif // ROCSHMEM_BITWISE_GTEST_HPP
|