/* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "cooperative_groups_common.hh" #include "cg_common_kernels.hh" #include #include #include #include #include #include #include #include /** * @addtogroup thread_block_tile thread_block_tile * @{ * @ingroup DeviceLanguageTest * Contains unit tests for all thread_block_tile APIs and dynamic block partitioning */ namespace cg = cooperative_groups; template __global__ void thread_block_partition_size_getter(unsigned int* sizes) { const auto group = cg::this_thread_block(); if constexpr (dynamic) { sizes[thread_rank_in_grid()] = cg::tiled_partition(group, tile_size).size(); } else { cg::thread_block_tile tiled_partition = cg::tiled_partition(group); sizes[thread_rank_in_grid()] = tiled_partition.size(); } } template __global__ void thread_block_partition_thread_rank_getter(unsigned int* thread_ranks) { const auto group = cg::this_thread_block(); if constexpr (dynamic) { thread_ranks[thread_rank_in_grid()] = cg::tiled_partition(group, tile_size).thread_rank(); } else { cg::thread_block_tile tiled_partition = cg::tiled_partition(group); thread_ranks[thread_rank_in_grid()] = tiled_partition.thread_rank(); } } template void BlockPartitionGettersBasicTestImpl() { DYNAMIC_SECTION("Tile size: " << tile_size) { auto blocks = GenerateBlockDimensions(); auto threads = GenerateThreadDimensions(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(unsigned int); LinearAllocGuard uint_arr_dev(LinearAllocs::hipMalloc, alloc_size); LinearAllocGuard uint_arr(LinearAllocs::hipHostMalloc, alloc_size); thread_block_partition_size_getter<<>>(uint_arr_dev.ptr()); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); thread_block_partition_thread_rank_getter <<>>(uint_arr_dev.ptr()); HIP_CHECK(hipGetLastError()); ArrayAllOf(uint_arr.ptr(), grid.thread_count_, [&grid](unsigned int i) { if constexpr (!dynamic) { return tile_size; } const auto partitions_in_block = (grid.threads_in_block_count_ + tile_size - 1) / tile_size; const auto rank_in_block = grid.thread_rank_in_block(i).value(); const auto tail = partitions_in_block * tile_size - grid.threads_in_block_count_; return tile_size - tail * (rank_in_block >= (partitions_in_block - 1) * tile_size); }); HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); ArrayAllOf(uint_arr.ptr(), grid.thread_count_, [&grid](unsigned int i) { return grid.thread_rank_in_block(i).value() % tile_size; }); } } template void BlockPartitionGettersBasicTest() { static_cast((BlockPartitionGettersBasicTestImpl(), ...)); } /** * Test Description * ------------------------ * - Creates tiled partitions for each of the valid sizes{2, 4, 8, 16, 32, 64(if AMD)} and writes * the return values of size and thread_rank member functions to an output array that is validated * on the host side. * Test source * ------------------------ * - unit/cooperativeGrps/thread_block_tile.cc * Test requirements * ------------------------ * - HIP_VERSION >= 5.2 */ TEST_CASE("Unit_Thread_Block_Tile_Getters_Positive_Basic") { BlockPartitionGettersBasicTest(); } /** * Test Description * ------------------------ * - Creates tiled partitions for each of the valid sizes{2, 4, 8, 16, 32, 64(if AMD)} via the * dynamic tiled partition api and writes the return values of size and thread_rank member functions * to an output array that is validated on host. * Test source * ------------------------ * - unit/cooperativeGrps/thread_block_tile.cc * Test requirements * ------------------------ * - HIP_VERSION >= 5.2 */ TEST_CASE("Unit_Thread_Block_Tile_Dynamic_Getters_Positive_Basic") { BlockPartitionGettersBasicTest(); } template __global__ void block_tile_shfl_up(T* const out, const unsigned int delta) { const cg::thread_block_tile partition = cg::tiled_partition(cg::this_thread_block()); T var = static_cast(partition.thread_rank()); out[thread_rank_in_grid()] = partition.shfl_up(var, delta); } template void BlockTileShflUpTestImpl() { DYNAMIC_SECTION("Tile size: " << tile_size) { const auto inv_reduction_factor = 1.0 / GetTestReductionFactor(); auto blocks = GenerateBlockDimensionsForShuffle(); auto threads = GenerateThreadDimensionsForShuffle(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); std::vector deltas; for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) { deltas.emplace_back(static_cast(std::floor(i))); } deltas.emplace_back(tile_size - 1); const auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end())); INFO("Delta: " << delta); CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(T); LinearAllocGuard arr_dev(LinearAllocs::hipMalloc, alloc_size); LinearAllocGuard arr(LinearAllocs::hipHostMalloc, alloc_size); block_tile_shfl_up<<>>(arr_dev.ptr(), delta); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); ArrayAllOf(arr.ptr(), grid.thread_count_, [delta, &grid](unsigned int i) -> std::optional { const int rank_in_partition = grid.thread_rank_in_block(i).value() % tile_size; const int target = rank_in_partition - delta; return target < 0 ? rank_in_partition : target; }); } } template void BlockTileShflUpTest() { static_cast((BlockTileShflUpTestImpl(), ...)); } /** * Test Description * ------------------------ * - Validates the shuffle up behavior of thread block tiles of all valid sizes{2, 4, 8, 16, 32, * 64(if AMD)} for delta values of [0, tile size). The test is run for all overloads of shfl_up. * Test source * ------------------------ * - unit/cooperativeGrps/thread_block_tile.cc * Test requirements * ------------------------ * - HIP_VERSION >= 5.2 */ TEMPLATE_TEST_CASE("Unit_Thread_Block_Tile_Shfl_Up_Positive_Basic", "", int, unsigned int, long, unsigned long, long long, unsigned long long, float, double) { BlockTileShflUpTest(); } template __global__ void block_tile_shfl_down(T* const out, const unsigned int delta) { const cg::thread_block_tile partition = cg::tiled_partition(cg::this_thread_block()); T var = static_cast(partition.thread_rank()); out[thread_rank_in_grid()] = partition.shfl_down(var, delta); } template void BlockTileShflDownTestImpl() { DYNAMIC_SECTION("Tile size: " << tile_size) { const auto inv_reduction_factor = 1.0 / GetTestReductionFactor(); auto blocks = GenerateBlockDimensionsForShuffle(); auto threads = GenerateThreadDimensionsForShuffle(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); std::vector deltas; for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) { deltas.emplace_back(static_cast(std::floor(i))); } deltas.emplace_back(tile_size - 1); const auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end())); INFO("Delta: " << delta); CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(T); LinearAllocGuard arr_dev(LinearAllocs::hipMalloc, alloc_size); LinearAllocGuard arr(LinearAllocs::hipHostMalloc, alloc_size); block_tile_shfl_down<<>>(arr_dev.ptr(), delta); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); ArrayAllOf(arr.ptr(), grid.thread_count_, [delta, &grid](unsigned int i) -> std::optional { const auto partitions_in_block = (grid.threads_in_block_count_ + tile_size - 1) / tile_size; const auto rank_in_block = grid.thread_rank_in_block(i).value(); const auto rank_in_group = rank_in_block % tile_size; const auto target = rank_in_group + delta; if (rank_in_block < (partitions_in_block - 1) * tile_size) { return target < tile_size ? target : rank_in_group; } else { // If the number of threads in a block is not an integer multiple of tile_size, the // final(tail end) tile will contain inactive threads. // Shuffling from an inactive thread returns an undefined value, accordingly threads that // shuffle from one must be skipped const auto tail = partitions_in_block * tile_size - grid.threads_in_block_count_; return target < tile_size - tail ? std::optional(target) : std::nullopt; } }); } } template void BlockTileShflDownTest() { static_cast((BlockTileShflDownTestImpl(), ...)); } /** * Test Description * ------------------------ * - Validates the shuffle down behavior of thread block tiles of all valid sizes{2, 16, * 32, 64(if AMD)} for delta values of [0, tile size). The test is run for all overloads of * shfl_down. * Test source * ------------------------ * - unit/cooperativeGrps/thread_block_tile.cc * Test requirements * ------------------------ * - HIP_VERSION >= 5.2 */ TEMPLATE_TEST_CASE("Unit_Thread_Block_Tile_Shfl_Down_Positive_Basic", "", int, unsigned int, long, unsigned long, long long, unsigned long long, float, double) { BlockTileShflDownTest(); } template __global__ void block_tile_shfl_xor(T* const out, const unsigned mask) { const cg::thread_block_tile partition = cg::tiled_partition(cg::this_thread_block()); T var = static_cast(partition.thread_rank()); out[thread_rank_in_grid()] = partition.shfl_xor(var, mask); } template void BlockTileShflXORTestImpl() { DYNAMIC_SECTION("Tile size: " << tile_size) { const auto inv_reduction_factor = 1.0 / GetTestReductionFactor(); auto blocks = GenerateBlockDimensionsForShuffle(); auto threads = GenerateThreadDimensionsForShuffle(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); std::vector masks; for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) { masks.emplace_back(static_cast(std::floor(i))); } masks.emplace_back(tile_size - 1); const auto mask = GENERATE_COPY(from_range(masks.begin(), masks.end())); INFO("Mask: 0x" << std::hex << mask); CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(T); LinearAllocGuard arr_dev(LinearAllocs::hipMalloc, alloc_size); LinearAllocGuard arr(LinearAllocs::hipHostMalloc, alloc_size); block_tile_shfl_xor<<>>(arr_dev.ptr(), mask); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); const auto f = [mask, &grid](unsigned int i) -> std::optional { const auto partitions_in_block = (grid.threads_in_block_count_ + tile_size - 1) / tile_size; const auto rank_in_block = grid.thread_rank_in_block(i).value(); const int rank_in_partition = rank_in_block % tile_size; const auto target = rank_in_partition ^ mask; if (rank_in_block < (partitions_in_block - 1) * tile_size) { return target; } const auto tail = partitions_in_block * tile_size - grid.threads_in_block_count_; return target < tile_size - tail ? std::optional(target) : std::nullopt; }; ArrayAllOf(arr.ptr(), grid.thread_count_, f); } } template void BlockTileShflXORTest() { static_cast((BlockTileShflXORTestImpl(), ...)); } /** * Test Description * ------------------------ * - Validates the shuffle xor behavior of thread block tiles of all valid sizes{2, 16, 32, * 64(if AMD)} for mask values of [0, tile size). The test is run for all overloads of shfl_xor. * Test source * ------------------------ * - unit/cooperativeGrps/thread_block_tile.cc * Test requirements * ------------------------ * - HIP_VERSION >= 5.2 */ TEMPLATE_TEST_CASE("Unit_Thread_Block_Tile_Shfl_XOR_Positive_Basic", "", int, unsigned int, long, unsigned long, long long, unsigned long long, float, double) { BlockTileShflXORTest(); } template __global__ void block_tile_shfl(T* const out, uint8_t* target_lanes) { const cg::thread_block_tile partition = cg::tiled_partition(cg::this_thread_block()); T var = static_cast(partition.thread_rank()); out[thread_rank_in_grid()] = partition.shfl(var, target_lanes[partition.thread_rank()]); } static inline std::mt19937& GetRandomGenerator() { static std::mt19937 mt(11); return mt; } template static inline T GenerateRandomInteger(const T min, const T max) { std::uniform_int_distribution dist(min, max); return dist(GetRandomGenerator()); } template void BlockTileShflTestImpl() { DYNAMIC_SECTION("Tile size: " << tile_size) { auto blocks = GenerateBlockDimensionsForShuffle(); auto threads = GenerateThreadDimensionsForShuffle(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(T); LinearAllocGuard arr_dev(LinearAllocs::hipMalloc, alloc_size); LinearAllocGuard arr(LinearAllocs::hipHostMalloc, alloc_size); LinearAllocGuard target_lanes_dev(LinearAllocs::hipMalloc, tile_size * sizeof(uint8_t)); LinearAllocGuard target_lanes(LinearAllocs::hipHostMalloc, tile_size * sizeof(uint8_t)); std::generate(target_lanes.ptr(), target_lanes.ptr() + tile_size, [] { return GenerateRandomInteger(0, static_cast(2 * tile_size)); }); HIP_CHECK(hipMemcpy(target_lanes_dev.ptr(), target_lanes.ptr(), tile_size * sizeof(uint8_t), hipMemcpyHostToDevice)); block_tile_shfl<<>>(arr_dev.ptr(), target_lanes_dev.ptr()); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); const auto f = [&target_lanes, &grid](unsigned int i) -> std::optional { const auto partitions_in_block = (grid.threads_in_block_count_ + tile_size - 1) / tile_size; const auto rank_in_block = grid.thread_rank_in_block(i).value(); const int rank_in_partition = rank_in_block % tile_size; const auto target = target_lanes.ptr()[rank_in_partition] % tile_size; if (rank_in_block < (partitions_in_block - 1) * tile_size) { return target; } const auto tail = partitions_in_block * tile_size - grid.threads_in_block_count_; return target < tile_size - tail ? std::optional(target) : std::nullopt; }; ArrayAllOf(arr.ptr(), grid.thread_count_, f); } } template void BlockTileShflTest() { static_cast((BlockTileShflTestImpl(), ...)); } /** * Test Description * ------------------------ * - Validates the shuffle behavior of thread block tiles of all valid sizes{2, 16, 32, * 64(if AMD)} for generated shuffle target lanes. The test is run for all overloads of shfl. Test * source * ------------------------ * - unit/cooperativeGrps/thread_block_tile.cc * Test requirements * ------------------------ * - HIP_VERSION >= 5.2 */ TEMPLATE_TEST_CASE("Unit_Thread_Block_Tile_Shfl_Positive_Basic", "", int, unsigned int, long, unsigned long, long long, unsigned long long, float, double) { BlockTileShflTest(); } template __global__ void block_tile_sync_check(T* global_data, unsigned int* wait_modifiers) { extern __shared__ uint8_t shared_data[]; T* const data = use_global ? global_data : reinterpret_cast(shared_data); const auto tid = cg::this_grid().thread_rank(); const auto block = cg::this_thread_block(); const cg::thread_block_tile partition = cg::tiled_partition(cg::this_thread_block()); const auto data_idx = [&block](unsigned int i) { return use_global ? i : (i % block.size()); }; const auto partitions_in_block = (block.size() + partition.size() - 1) / partition.size(); const auto partition_rank = block.thread_rank() / partition.size(); const auto tail = partitions_in_block * partition.size() - block.size(); const auto window_size = partition.size() - tail * (partition_rank == partitions_in_block - 1); const auto block_base_idx = tid / block.size() * block.size(); const auto tile_base_idx = block_base_idx + partition_rank * partition.size(); const auto wait_modifier = wait_modifiers[tid]; busy_wait(wait_modifier); data[data_idx(tid)] = partition.thread_rank(); partition.sync(); bool valid = true; for (auto i = 0; i < window_size; ++i) { const auto expected = (partition.thread_rank() + i) % window_size; if (!(valid &= (data[data_idx(tile_base_idx + expected)] == expected))) { break; } } partition.sync(); data[data_idx(tid)] = valid; if constexpr (!use_global) { global_data[tid] = data[data_idx(tid)]; } } template void BlockTileSyncTestImpl() { DYNAMIC_SECTION("Tile size: " << tile_size) { const auto randomized_run_count = GENERATE(range(0, cmd_options.cg_iterations)); INFO("Run number: " << randomized_run_count + 1); auto blocks = GenerateBlockDimensions(); auto threads = GenerateThreadDimensions(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(T); const auto alloc_size_per_block = alloc_size / grid.block_count_; int max_shared_mem_per_block = 0; HIP_CHECK(hipDeviceGetAttribute(&max_shared_mem_per_block, hipDeviceAttributeMaxSharedMemoryPerBlock, 0)); if (!global_memory && (max_shared_mem_per_block < alloc_size_per_block)) { return; } LinearAllocGuard arr_dev(LinearAllocs::hipMalloc, alloc_size); LinearAllocGuard arr(LinearAllocs::hipHostMalloc, alloc_size); LinearAllocGuard wait_modifiers_dev(LinearAllocs::hipMalloc, grid.thread_count_ * sizeof(unsigned int)); LinearAllocGuard wait_modifiers(LinearAllocs::hipHostMalloc, grid.thread_count_ * sizeof(unsigned int)); if (randomized_run_count != 0) { std::generate(wait_modifiers.ptr(), wait_modifiers.ptr() + grid.thread_count_, [] { return GenerateRandomInteger(0u, 1500u); }); } else { std::fill_n(wait_modifiers.ptr(), grid.thread_count_, 0u); } const auto shared_memory_size = global_memory ? 0u : alloc_size_per_block; HIP_CHECK(hipMemcpy(wait_modifiers_dev.ptr(), wait_modifiers.ptr(), grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice)); block_tile_sync_check <<>>(arr_dev.ptr(), wait_modifiers_dev.ptr()); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); REQUIRE( std::all_of(arr.ptr(), arr.ptr() + grid.thread_count_, [](unsigned int e) { return e; })); } } template void BlockTileSyncTest() { static_cast((BlockTileSyncTestImpl(), ...)); } /** * Test Description * ------------------------ * - Launches a kernel wherein blocks are divided into tiled partitions(size of 2, 4, 8, 16, 32, * 64 if AMD) and every thread writes its intra-tile rank into an array slot determined by its * grid-wide linear index. The array is either in global or dynamic shared memory based on a compile * time switch, and the test is run for arrays of 1, 2, and 4 byte elements. Before the write each * thread executes a busy wait loop for a random amount of clock cycles, the amount being read from * an input array. After the write a tile-wide sync is performed and each thread validates that it * can read the expected values that other threads within the same tile have written to their * respective array slots. * Test source * ------------------------ * - unit/cooperativeGrps/thread_block_tile.cc * Test requirements * ------------------------ * - HIP_VERSION >= 5.2 */ TEMPLATE_TEST_CASE("Unit_Thread_Block_Tile_Sync_Positive_Basic", "", uint8_t, uint16_t, uint32_t) { SECTION("Global memory") { BlockTileSyncTest(); } SECTION("Shared memory") { BlockTileSyncTest(); } } /** * End doxygen group DeviceLanguageTest. * @} */