EXSWHTEC-259 - Implement new and extend existing tests for thread_block #151

Change-Id: I4d85af8e5f805a1c8ff7b7efd3b3400f38343ea9


[ROCm/hip-tests commit: 4fa177dd2b]
This commit is contained in:
Nives Vukovic
2023-12-29 16:02:50 +00:00
committed by Rakesh Roy
orang tua 20ee07a6d7
melakukan 48c932afad
6 mengubah file dengan 362 tambahan dan 26 penghapusan
@@ -30,6 +30,9 @@ int main(int argc, char** argv) {
| Opt(cmd_options.progress)
["-P"]["--progress"]
("Show progress bar when running performance tests")
| Opt(cmd_options.cg_iterations, "cg_iterations")
["-E"]["--cg-iterations"]
("Number of iterations used for cooperative groups sync tests (default: 5)")
| Opt(cmd_options.cg_extended_run, "cg_extened_run")
["-E"]["--cg-extended-run"]
("TODO: Description goes here")
@@ -144,21 +144,6 @@ THE SOFTWARE.
* @}
*/
/**
* @defgroup StreamOTest Ordered Memory Allocator
* @{
* This section describes the tests for Stream Ordered Memory Allocator functions of HIP runtime
* API.
* @}
*/
/**
* @defgroup StreamOTest Ordered Memory Allocator
* @{
* This section describes the tests for Stream Ordered Memory Allocator functions of HIP runtime
* API.
*/
/**
* @defgroup StreamOTest Ordered Memory Allocator
* @{
@@ -173,13 +158,6 @@ THE SOFTWARE.
* @}
*/
/**
* @defgroup PerformanceTest Performance tests
* @{
* This section describes performance tests for the target API groups and use-cases.
* @}
*/
/**
* @defgroup TextureTest Texture Management
* @{
@@ -209,7 +187,6 @@ THE SOFTWARE.
*/
/**
* @defgroup ComplexTest Complex type
* @{
* This section describes tests for the Complex type functions.
* @}
@@ -19,7 +19,6 @@
# THE SOFTWARE.
add_subdirectory(memset)
add_subdirectory(memcpy)
add_subdirectory(kernelLaunch)
add_subdirectory(stream)
add_subdirectory(event)
@@ -47,7 +47,6 @@ add_subdirectory(atomics)
add_subdirectory(complex)
add_subdirectory(p2p)
add_subdirectory(gcc)
add_subdirectory(vector_types)
if(HIP_PLATFORM STREQUAL "amd")
add_subdirectory(callback)
@@ -31,6 +31,14 @@ constexpr size_t kWarpSize = 64;
constexpr int kMaxGPUs = 8;
} // namespace
constexpr int MaxGPUs = 8;
inline bool operator==(const dim3& l, const dim3& r) {
return l.x == r.x && l.y == r.y && l.z == r.z;
}
inline bool operator!=(const dim3& l, const dim3& r) { return !(l == r); }
__device__ inline unsigned int thread_rank_in_grid() {
const auto block_size = blockDim.x * blockDim.y * blockDim.z;
const auto block_rank_in_grid = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
@@ -67,4 +75,4 @@ template <class T> bool CheckDimensions(unsigned int device, T kernel, dim3 bloc
}
return true;
}
}
@@ -0,0 +1,350 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "cooperative_groups_common.hh"
#include <cpu_grid.h>
#include <optional>
#include <resource_guards.hh>
#include <utils.hh>
#include <cmd_options.hh>
/**
* @addtogroup thread_block thread_block
* @{
* @ingroup DeviceLanguageTest
* Contains unit tests for all thread_block APIs
*/
namespace cg = cooperative_groups;
template <typename BaseType = cg::thread_block>
static __global__ void thread_block_size_getter(unsigned int* sizes) {
const BaseType group = cg::this_thread_block();
sizes[thread_rank_in_grid()] = group.size();
}
template <typename BaseType = cg::thread_block>
static __global__ void thread_block_thread_rank_getter(unsigned int* thread_ranks) {
const BaseType group = cg::this_thread_block();
thread_ranks[thread_rank_in_grid()] = group.thread_rank();
}
static __global__ void thread_block_group_indices_getter(dim3* group_indices) {
group_indices[thread_rank_in_grid()] = cg::this_thread_block().group_index();
}
static __global__ void thread_block_thread_indices_getter(dim3* thread_indices) {
thread_indices[thread_rank_in_grid()] = cg::this_thread_block().thread_index();
}
static __global__ void thread_block_non_member_size_getter(unsigned int* sizes) {
sizes[thread_rank_in_grid()] = cg::group_size(cg::this_thread_block());
}
static __global__ void thread_block_non_member_thread_rank_getter(unsigned int* thread_ranks) {
thread_ranks[thread_rank_in_grid()] = cg::thread_rank(cg::this_thread_block());
}
/**
* Test Description
* ------------------------
* - Launches kernels that write the return values of size, thread_rank, group_index, and
* thread_index member functions to an output array that is validated on the host side. The kernels
* are run sequentially, reusing the output array, to avoid running out of device memory for large
* kernel launches.
* Test source
* ------------------------
* - unit/cooperativeGrps/thread_block.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_Thread_Block_Getters_Positive_Basic") {
const auto blocks = GenerateBlockDimensions();
const auto threads = GenerateThreadDimensions();
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
const CPUGrid grid(blocks, threads);
{
LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
grid.thread_count_ * sizeof(unsigned int));
LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
grid.thread_count_ * sizeof(unsigned int));
thread_block_size_getter<<<blocks, threads>>>(uint_arr_dev.ptr());
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
thread_block_thread_rank_getter<<<blocks, threads>>>(uint_arr_dev.ptr());
HIP_CHECK(hipGetLastError());
// Validate thread_block.size() values
ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
[size = grid.threads_in_block_count_](uint32_t) { return size; });
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
// Validate thread_block.thread_rank() values
ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
[&grid](uint32_t i) { return grid.thread_rank_in_block(i).value(); });
}
{
LinearAllocGuard<dim3> dim3_arr_dev(LinearAllocs::hipMalloc, grid.thread_count_ * sizeof(dim3));
LinearAllocGuard<dim3> dim3_arr(LinearAllocs::hipHostMalloc, grid.thread_count_ * sizeof(dim3));
thread_block_group_indices_getter<<<blocks, threads>>>(dim3_arr_dev.ptr());
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipMemcpy(dim3_arr.ptr(), dim3_arr_dev.ptr(),
grid.thread_count_ * sizeof(*dim3_arr.ptr()), hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
thread_block_thread_indices_getter<<<blocks, threads>>>(dim3_arr_dev.ptr());
HIP_CHECK(hipGetLastError());
// Validate thread_block.group_index() values
ArrayAllOf(dim3_arr.ptr(), grid.thread_count_,
[&grid](uint32_t i) { return grid.block_idx(i).value(); });
HIP_CHECK(hipMemcpy(dim3_arr.ptr(), dim3_arr_dev.ptr(),
grid.thread_count_ * sizeof(*dim3_arr.ptr()), hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
// Validate thread_block.thread_index() values
ArrayAllOf(dim3_arr.ptr(), grid.thread_count_,
[&grid](uint32_t i) { return grid.thread_idx(i).value(); });
}
}
/**
* Test Description
* ------------------------
* - Launches kernels that write the return values of size and thread_rank member functions to an
* output array that is validated on the host side, while treating the thread block as a thread
* group. The kernels are run sequentially, reusing the output array, to avoid running out of device
* memory for large kernel launches.
* Test source
* ------------------------
* - unit/cooperativeGrps/thread_block.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_Thread_Block_Getters_Via_Base_Type_Positive_Basic") {
const auto blocks = GenerateBlockDimensions();
const auto threads = GenerateThreadDimensions();
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
const CPUGrid grid(blocks, threads);
LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
grid.thread_count_ * sizeof(unsigned int));
LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
grid.thread_count_ * sizeof(unsigned int));
thread_block_size_getter<cg::thread_group><<<blocks, threads>>>(uint_arr_dev.ptr());
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
thread_block_thread_rank_getter<cg::thread_group><<<blocks, threads>>>(uint_arr_dev.ptr());
HIP_CHECK(hipGetLastError());
// Validate thread_block.size() values
ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
[size = grid.threads_in_block_count_](uint32_t) { return size; });
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
// Validate thread_block.thread_rank() values
ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
[&grid](uint32_t i) { return grid.thread_rank_in_block(i).value(); });
}
/**
* Test Description
* ------------------------
* - Launches kernels that write the return values of size and thread_rank non-member functions
* to an output array that is validated on the host side. The kernels are run sequentially, reusing
* the output array, to avoid running out of device memory for large kernel launches.
* Test source
* ------------------------
* - unit/cooperativeGrps/thread_block.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_Thread_Block_Getters_Via_Non_Member_Functions_Positive_Basic") {
const auto blocks = GenerateBlockDimensions();
const auto threads = GenerateThreadDimensions();
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
const CPUGrid grid(blocks, threads);
LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
grid.thread_count_ * sizeof(unsigned int));
LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
grid.thread_count_ * sizeof(unsigned int));
thread_block_non_member_size_getter<<<blocks, threads>>>(uint_arr_dev.ptr());
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
thread_block_non_member_thread_rank_getter<<<blocks, threads>>>(uint_arr_dev.ptr());
HIP_CHECK(hipGetLastError());
// Validate thread_block.size() values
ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
[size = grid.threads_in_block_count_](uint32_t) { return size; });
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
// Validate thread_block.thread_rank() values
ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
[&grid](uint32_t i) { return grid.thread_rank_in_block(i).value(); });
}
template <bool use_global, typename T>
__global__ void thread_block_sync_check(T* global_data, unsigned int* wait_modifiers,
unsigned int* read_offsets) {
extern __shared__ uint8_t shared_data[];
T* const data = use_global ? global_data : reinterpret_cast<T*>(shared_data);
const auto block = cg::this_thread_block();
constexpr T divisor = 255;
const auto tid = block.thread_rank();
const auto wait_modifier = wait_modifiers[tid];
const auto read_offset = read_offsets[tid];
busy_wait(wait_modifier);
data[tid] = tid % divisor;
block.sync();
bool valid = true;
for (auto i = 0; i < block.size(); ++i) {
const auto offset = block.size() + read_offset;
const auto expected = (tid + offset + i) % block.size();
if (!(valid &= (data[expected] == expected % divisor))) {
break;
}
}
block.sync();
data[tid] = valid;
if constexpr (!use_global) {
global_data[tid] = data[tid];
}
}
static inline std::mt19937& GetRandomGenerator() {
// With a static seed the tests will remain consistent between runs, yet it relieves the problem
// of predetermining a set of modifiers by hand. The sets of modifiers could actually be
// determined at compile time if std::random objects could operate in a constexpr context.
static std::mt19937 mt(17);
return mt;
}
template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
std::uniform_int_distribution<T> dist(min, max);
return dist(GetRandomGenerator());
}
template <bool global_memory, typename T> void ThreadBlockSyncTest() {
const auto randomized_run_count = GENERATE(range(0, cmd_options.cg_iterations));
INFO("Run number: " << randomized_run_count + 1);
const auto blocks = dim3(1, 1, 1);
const auto threads = GenerateThreadDimensions();
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
CPUGrid grid(blocks, threads);
const auto alloc_size = grid.thread_count_ * sizeof(T);
int max_shared_mem_per_block = 0;
HIP_CHECK(hipDeviceGetAttribute(&max_shared_mem_per_block,
hipDeviceAttributeMaxSharedMemoryPerBlock, 0));
if (!global_memory && max_shared_mem_per_block < alloc_size) {
return;
}
LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
LinearAllocGuard<unsigned int> wait_modifiers_dev(LinearAllocs::hipMalloc,
grid.thread_count_ * sizeof(unsigned int));
LinearAllocGuard<unsigned int> wait_modifiers(LinearAllocs::hipHostMalloc,
grid.thread_count_ * sizeof(unsigned int));
std::generate(wait_modifiers.ptr(), wait_modifiers.ptr() + grid.thread_count_,
[&] { return GenerateRandomInteger(0u, 1500u); });
LinearAllocGuard<unsigned int> read_offsets_dev(LinearAllocs::hipMalloc,
grid.thread_count_ * sizeof(unsigned int));
std::vector<unsigned int> read_offsets(grid.thread_count_, 0u);
if (randomized_run_count != 0) {
std::generate(read_offsets.begin(), read_offsets.end(),
[&] { return GenerateRandomInteger(0u, grid.thread_count_); });
}
const auto shared_memory_size = global_memory ? 0u : alloc_size;
HIP_CHECK(hipMemcpy(wait_modifiers_dev.ptr(), wait_modifiers.ptr(),
grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(read_offsets_dev.ptr(), read_offsets.data(),
grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice));
thread_block_sync_check<global_memory><<<blocks, threads, shared_memory_size>>>(
arr_dev.ptr(), wait_modifiers_dev.ptr(), read_offsets_dev.ptr());
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
REQUIRE(std::all_of(arr.ptr(), arr.ptr() + grid.thread_count_, [](unsigned int e) { return e; }));
}
/**
* Test Description
* ------------------------
* - Launches a kernel wherein every thread writes its grid-wide linear index into an array. The
* array is either in global or dynamic shared memory based on a compile time switch, and the test
* is run for arrays of 1, 2, and 4 byte elements. Before the write each thread executes a busy wait
* loop for a random amount of clock cycles, the amount being read from an input array. After the
* write a block-wide sync is performed and each thread validates that it can read the expected
* values that other threads have written to their respective array slots. Each thread begins the
* validation from a given offset from its own index. For the first run of the test, all the offsets
* are zero, so memory reads should be coalesced as adjacent threads read from adjacent memory
* locations. On subsequent runs the offsets are randomized for each thread, leading to
* non-coalesced reads and cache thrashing.
* Test source
* ------------------------
* - unit/cooperativeGrps/thread_block.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEMPLATE_TEST_CASE("Unit_Thread_Block_Sync_Positive_Basic", "", uint8_t, uint16_t, uint32_t) {
SECTION("Global memory") { ThreadBlockSyncTest<true, TestType>(); }
SECTION("Shared memory") { ThreadBlockSyncTest<false, TestType>(); }
}