cac67a0f32
* Reworked Unit_hipLaunchCooperativeKernel_Basic and Unit_hipLaunchCooperativeKernelMultiDevice_Basic * Introduce reduction_factor for coop groups tests. Fix Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic * Fix always false requirement by adding a cast * Change data type to unsigned long long to align with cuda * Change literal type to double to ensure proper type casting * Remove formatting comments
328 sor
14 KiB
C++
328 sor
14 KiB
C++
/*
|
|
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
#include "cooperative_groups_common.hh"
|
|
|
|
#include <cpu_grid.h>
|
|
#include <resource_guards.hh>
|
|
#include <utils.hh>
|
|
|
|
/**
|
|
* @addtogroup grid_group grid_group
|
|
* @{
|
|
* @ingroup DeviceLanguageTest
|
|
* Contains unit tests for all grid_group APIs
|
|
*/
|
|
|
|
namespace cg = cooperative_groups;
|
|
|
|
static __global__ void grid_group_size_getter(unsigned int* sizes) {
|
|
sizes[thread_rank_in_grid()] = cg::this_grid().size();
|
|
}
|
|
|
|
static __global__ void grid_group_thread_rank_getter(unsigned int* thread_ranks) {
|
|
thread_ranks[thread_rank_in_grid()] = cg::this_grid().thread_rank();
|
|
}
|
|
|
|
static __global__ void grid_group_block_rank_getter(unsigned int* block_ranks) {
|
|
block_ranks[thread_rank_in_grid()] = cg::this_grid().block_rank();
|
|
}
|
|
|
|
static __global__ void grid_group_is_valid_getter(unsigned int* is_valid_flags) {
|
|
is_valid_flags[thread_rank_in_grid()] = cg::this_grid().is_valid();
|
|
}
|
|
|
|
static __global__ void grid_group_non_member_size_getter(unsigned int* sizes) {
|
|
sizes[thread_rank_in_grid()] = cg::group_size(cg::this_grid());
|
|
}
|
|
|
|
static __global__ void grid_group_non_member_thread_rank_getter(unsigned int* thread_ranks) {
|
|
thread_ranks[thread_rank_in_grid()] = cg::thread_rank(cg::this_grid());
|
|
}
|
|
|
|
static __global__ void sync_kernel(unsigned int* atomic_val, unsigned int* per_loop_atomic,
|
|
unsigned int* array, unsigned int loops) {
|
|
cg::grid_group grid = cg::this_grid();
|
|
unsigned rank = grid.thread_rank();
|
|
|
|
int blocks_seen = 0;
|
|
int grid_blocks = gridDim.x * gridDim.y * gridDim.z;
|
|
|
|
for (int iter = 0; iter < loops; iter++) {
|
|
// Make the last thread run way behind everyone else.
|
|
// If the sync below fails, then the other threads may hit the
|
|
// atomicInc instruction many times before the last thread ever gets to it.
|
|
// If the sync works, then it will likely contain "total number of blocks"*iter
|
|
if (rank == (grid.size() - 1)) {
|
|
// The last wavefront should spin on this loop's atomic value
|
|
// until all of the other wavefronts have incremented the
|
|
// per-loop atomic and hit the grid.sync()
|
|
#if HT_AMD
|
|
while (__hip_atomic_load(&per_loop_atomic[iter], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT) <
|
|
(grid_blocks - 1)) {
|
|
__builtin_amdgcn_s_sleep(127);
|
|
}
|
|
|
|
// Give the other waves time to maybe go around the loop again
|
|
// if the barrier has failed
|
|
__builtin_amdgcn_s_sleep(127);
|
|
#else // CUDA does not seem to need an ordered atomic load
|
|
while (per_loop_atomic[iter] < (grid_blocks - 1)) {
|
|
}
|
|
#endif
|
|
}
|
|
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1 &&
|
|
threadIdx.z == blockDim.z - 1) {
|
|
atomicInc(&per_loop_atomic[iter], UINT_MAX);
|
|
array[((blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) + blocks_seen] =
|
|
atomicInc(&atomic_val[0], UINT_MAX);
|
|
}
|
|
grid.sync();
|
|
blocks_seen += grid_blocks;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Launches kernels that write the return values of size, thread_rank and is_valid member
|
|
* functions to an output array that is validated on the host side. The kernels are run
|
|
* sequentially, reusing the output array, to avoid running out of device memory for large kernel
|
|
* launches.
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/cooperativeGrps/grid_group.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 5.2
|
|
* - Device supports cooperative launch
|
|
*/
|
|
TEST_CASE("Unit_Grid_Group_Getters_Positive_Basic") {
|
|
int device;
|
|
hipDeviceProp_t device_properties;
|
|
HIP_CHECK(hipGetDevice(&device));
|
|
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
|
|
|
|
if (!device_properties.cooperativeLaunch) {
|
|
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
|
|
return;
|
|
}
|
|
|
|
const auto blocks = GenerateBlockDimensions();
|
|
const auto threads = GenerateThreadDimensions();
|
|
if (!CheckDimensions(device, grid_group_size_getter, blocks, threads)) return;
|
|
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
|
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
|
|
|
const CPUGrid grid(blocks, threads);
|
|
|
|
LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
|
|
grid.thread_count_ * sizeof(unsigned int));
|
|
LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
|
|
grid.thread_count_ * sizeof(unsigned int));
|
|
|
|
// Launch Kernel
|
|
unsigned int* uint_arr_dev_ptr = uint_arr_dev.ptr();
|
|
void* params[1];
|
|
params[0] = &uint_arr_dev_ptr;
|
|
|
|
HIP_CHECK(hipLaunchCooperativeKernel(grid_group_size_getter, blocks, threads, params, 0, 0));
|
|
|
|
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
|
|
grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipDeviceSynchronize());
|
|
HIP_CHECK(
|
|
hipLaunchCooperativeKernel(grid_group_thread_rank_getter, blocks, threads, params, 0, 0));
|
|
|
|
// Verify grid_group.size() values
|
|
ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
|
|
[size = grid.thread_count_](uint32_t) { return size; });
|
|
|
|
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
|
|
grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipDeviceSynchronize());
|
|
HIP_CHECK(hipLaunchCooperativeKernel(grid_group_is_valid_getter, blocks, threads, params, 0, 0));
|
|
|
|
// Verify grid_group.thread_rank() values
|
|
ArrayAllOf(uint_arr.ptr(), grid.thread_count_, [](uint32_t i) { return i; });
|
|
|
|
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
|
|
grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipDeviceSynchronize());
|
|
HIP_CHECK(hipLaunchCooperativeKernel(grid_group_block_rank_getter, blocks, threads, params, 0, 0));
|
|
|
|
// Verify grid_group.is_valid() values
|
|
ArrayAllOf(uint_arr.ptr(), grid.thread_count_, [](uint32_t) { return 1; });
|
|
|
|
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
|
|
grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipDeviceSynchronize());
|
|
|
|
// Verify grid_group.block_rank() values
|
|
ArrayAllOf(uint_arr.ptr(), grid.thread_count_, [threads](uint32_t i) {
|
|
return i/(threads.x * threads.y * threads.z); });
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Launches kernels that write the return values of size and thread_rank non-member functions
|
|
* to an output array that is validated on the host side. The kernels are run sequentially, reusing
|
|
* the output array, to avoid running out of device memory for large kernel launches.
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/cooperativeGrps/grid_group.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 5.2
|
|
* - Device supports cooperative launch
|
|
*/
|
|
TEST_CASE("Unit_Grid_Group_Getters_Via_Non_Member_Functions_Positive_Basic") {
|
|
int device;
|
|
hipDeviceProp_t device_properties;
|
|
HIP_CHECK(hipGetDevice(&device));
|
|
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
|
|
|
|
if (!device_properties.cooperativeLaunch) {
|
|
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
|
|
return;
|
|
}
|
|
|
|
const auto blocks = GenerateBlockDimensions();
|
|
const auto threads = GenerateThreadDimensions();
|
|
if (!CheckDimensions(device, grid_group_non_member_size_getter, blocks, threads)) return;
|
|
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
|
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
|
|
|
const CPUGrid grid(blocks, threads);
|
|
|
|
LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
|
|
grid.thread_count_ * sizeof(unsigned int));
|
|
LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
|
|
grid.thread_count_ * sizeof(unsigned int));
|
|
|
|
// Launch Kernel
|
|
unsigned int* uint_arr_dev_ptr = uint_arr_dev.ptr();
|
|
void* params[1];
|
|
params[0] = &uint_arr_dev_ptr;
|
|
|
|
HIP_CHECK(
|
|
hipLaunchCooperativeKernel(grid_group_non_member_size_getter, blocks, threads, params, 0, 0));
|
|
|
|
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
|
|
grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipDeviceSynchronize());
|
|
HIP_CHECK(hipLaunchCooperativeKernel(grid_group_non_member_thread_rank_getter, blocks, threads,
|
|
params, 0, 0));
|
|
|
|
// Verify grid_group.size() values
|
|
ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
|
|
[size = grid.thread_count_](uint32_t) { return size; });
|
|
|
|
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
|
|
grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipDeviceSynchronize());
|
|
|
|
// Verify grid_group.thread_rank() values
|
|
ArrayAllOf(uint_arr.ptr(), grid.thread_count_, [](uint32_t i) { return i; });
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Launches a kernel where the last thread in a block atomically increments a global variable
|
|
* within a work loop. The value returned from this atomic increment entirely depends on the order
|
|
* the threads arrive at the atomic instruction. Each thread then stores the result in the global
|
|
* array based on its block id. A wait loop is inserted into the last thread so that it runs behind
|
|
* all other threads. If the sync doesn't work, the other threads will increment the atomic variable
|
|
* many times before the last thread gets to it and it will read a very large value. If the sync
|
|
* works, each thread will increment the variable once per loop iteration and the last thread will
|
|
* contain total number of blocks * loop iteration.
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/cooperativeGrps/grid_group.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 5.2
|
|
* - Device supports cooperative launch
|
|
*/
|
|
TEST_CASE("Unit_Grid_Group_Sync_Positive_Basic") {
|
|
int device;
|
|
hipDeviceProp_t device_properties;
|
|
HIP_CHECK(hipGetDevice(&device));
|
|
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
|
|
|
|
if (!device_properties.cooperativeLaunch) {
|
|
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
|
|
return;
|
|
}
|
|
|
|
auto loops = GENERATE(2, 4, 8, 16);
|
|
const auto blocks = GenerateBlockDimensions();
|
|
const auto threads = GenerateThreadDimensions();
|
|
if (!CheckDimensions(device, sync_kernel, blocks, threads)) return;
|
|
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
|
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
|
|
|
const CPUGrid grid(blocks, threads);
|
|
unsigned int array_len = grid.block_count_ * loops;
|
|
|
|
LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
|
|
array_len * sizeof(unsigned int));
|
|
LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
|
|
array_len * sizeof(unsigned int));
|
|
LinearAllocGuard<unsigned int> atomic_val(LinearAllocs::hipMalloc, sizeof(unsigned int));
|
|
LinearAllocGuard<unsigned int> per_loop_atomic_val(LinearAllocs::hipMalloc,
|
|
loops * sizeof(unsigned int));
|
|
HIP_CHECK(hipMemset(atomic_val.ptr(), 0, sizeof(unsigned int)));
|
|
HIP_CHECK(hipMemset(per_loop_atomic_val.ptr(), 0, loops * sizeof(unsigned int)));
|
|
|
|
// Launch Kernel
|
|
unsigned int* uint_arr_dev_ptr = uint_arr_dev.ptr();
|
|
unsigned int* atomic_val_ptr = atomic_val.ptr();
|
|
unsigned int* per_loop_atomic_val_ptr = per_loop_atomic_val.ptr();
|
|
void* params[4];
|
|
params[0] = reinterpret_cast<void*>(&atomic_val_ptr);
|
|
params[1] = reinterpret_cast<void*>(&per_loop_atomic_val_ptr);
|
|
params[2] = reinterpret_cast<void*>(&uint_arr_dev_ptr);
|
|
params[3] = reinterpret_cast<void*>(&loops);
|
|
|
|
HIP_CHECK(hipLaunchCooperativeKernel(sync_kernel, blocks, threads, params, 0, 0));
|
|
|
|
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), array_len * sizeof(*uint_arr.ptr()),
|
|
hipMemcpyDeviceToHost));
|
|
|
|
HIP_CHECK(hipDeviceSynchronize());
|
|
|
|
// Verify host buffer values
|
|
unsigned int max_in_this_loop = 0;
|
|
for (unsigned int i = 0; i < loops; i++) {
|
|
max_in_this_loop += grid.block_count_;
|
|
unsigned int j = 0;
|
|
for (j = 0; j < grid.block_count_ - 1; j++) {
|
|
REQUIRE(uint_arr.ptr()[i * grid.block_count_ + j] < max_in_this_loop);
|
|
}
|
|
REQUIRE(uint_arr.ptr()[i * grid.block_count_ + j] == max_in_this_loop - 1);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* End doxygen group DeviceLanguageTest.
|
|
* @}
|
|
*/
|