diff --git a/catch/hipTestMain/main.cc b/catch/hipTestMain/main.cc index 74096c4af4..bfd52e8d96 100644 --- a/catch/hipTestMain/main.cc +++ b/catch/hipTestMain/main.cc @@ -30,8 +30,11 @@ int main(int argc, char** argv) { | Opt(cmd_options.progress) ["-P"]["--progress"] ("Show progress bar when running performance tests") - | Opt(cmd_options.cg_extended_run, "cg_extened_run") - ["-E"]["--cg-extended-run"] + | Opt(cmd_options.extended_run) + ["-E"]["--extended-run"] + ("TODO: Description goes here") + | Opt(cmd_options.cg_iterations, "cg_iterations") + ["-C"]["--cg-iterations"] ("Number of iterations used for cooperative groups sync tests (default: 5)") ; // clang-format on diff --git a/catch/include/cmd_options.hh b/catch/include/cmd_options.hh index 99e565491b..6caf7a0f48 100644 --- a/catch/include/cmd_options.hh +++ b/catch/include/cmd_options.hh @@ -26,8 +26,9 @@ struct CmdOptions { int iterations = 10; int warmups = 100; int cg_extended_run = 5; + int cg_iterations = 5; bool no_display = false; bool progress = false; }; -extern CmdOptions cmd_options; \ No newline at end of file +extern CmdOptions cmd_options; diff --git a/catch/unit/cooperativeGrps/CMakeLists.txt b/catch/unit/cooperativeGrps/CMakeLists.txt index 84d7417796..8bf1ce439b 100644 --- a/catch/unit/cooperativeGrps/CMakeLists.txt +++ b/catch/unit/cooperativeGrps/CMakeLists.txt @@ -9,9 +9,11 @@ set(TEST_SRC hipLaunchCooperativeKernel_old.cc hipLaunchCooperativeKernelMultiDevice_old.cc multi_grid_group.cc + coalesced_groups_shfl_down_old.cc + coalesced_groups_shfl_up_old.cc + hipCGCoalescedGroups_old.cc + coalesced_group.cc grid_group.cc - coalesced_groups_shfl_down.cc - coalesced_groups_shfl_up.cc coalesced_tiled_groups_metagrp.cc ) if(HIP_PLATFORM STREQUAL "nvidia") diff --git a/catch/unit/cooperativeGrps/coalesced_group.cc b/catch/unit/cooperativeGrps/coalesced_group.cc new file mode 100644 index 0000000000..3c5f3c9c62 --- /dev/null +++ b/catch/unit/cooperativeGrps/coalesced_group.cc @@ -0,0 +1,760 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "cooperative_groups_common.hh" + +#include +#include +#include + +/** + * @addtogroup coalesced_group coalesced_group + * @{ + * @ingroup DeviceLanguageTest + * Contains unit tests for all coalesced_group basic APIs + */ + +namespace cg = cooperative_groups; + +template +static __global__ void coalesced_group_size_getter(unsigned int* sizes, uint64_t active_mask) { + const cg::thread_block_tile tile = + cg::tiled_partition(cg::this_thread_block()); + if (active_mask & (static_cast(1) << tile.thread_rank())) { + BaseType active = cg::coalesced_threads(); + sizes[thread_rank_in_grid()] = active.size(); + } +} + +template +static __global__ void coalesced_group_thread_rank_getter(unsigned int* thread_ranks, + uint64_t active_mask) { + const cg::thread_block_tile tile = + cg::tiled_partition(cg::this_thread_block()); + if (active_mask & (static_cast(1) << tile.thread_rank())) { + BaseType active = cg::coalesced_threads(); + thread_ranks[thread_rank_in_grid()] = active.thread_rank(); + } +} + +template +static __global__ void coalesced_group_non_member_size_getter(unsigned int* sizes, + uint64_t active_mask) { + const cg::thread_block_tile tile = + cg::tiled_partition(cg::this_thread_block()); + if (active_mask & (static_cast(1) << tile.thread_rank())) { + cg::coalesced_group active = cg::coalesced_threads(); + sizes[thread_rank_in_grid()] = cg::group_size(active); + } +} + +template +static __global__ void coalesced_group_non_member_thread_rank_getter(unsigned int* thread_ranks, + uint64_t active_mask) { + const cg::thread_block_tile tile = + cg::tiled_partition(cg::this_thread_block()); + if (active_mask & (static_cast(1) << tile.thread_rank())) { + cg::coalesced_group active = cg::coalesced_threads(); + thread_ranks[thread_rank_in_grid()] = cg::thread_rank(active); + } +} + +static unsigned int get_active_thread_count(uint64_t active_mask, unsigned int partition_size) { + unsigned int active_thread_count = 0; + for (int i = 0; i < partition_size; i++) { + if (active_mask & (static_cast(1) << i)) active_thread_count++; + } + return active_thread_count; +} + +static uint64_t get_active_mask(unsigned int test_case) { + uint64_t active_mask = 0; + switch (test_case) { + case 0: // 1st thread + active_mask = 1; + break; + case 1: // last thread + active_mask = static_cast(1) << (kWarpSize - 1); + break; + case 2: // all threads + active_mask = 0xFFFFFFFFFFFFFFFF; + break; + case 3: // every second thread + active_mask = 0xAAAAAAAAAAAAAAAA; + break; + default: // random + static std::mt19937_64 mt(test_case); + std::uniform_int_distribution dist(0, std::numeric_limits::max()); + active_mask = dist(mt); + } + return active_mask; +} + +/** + * Test Description + * ------------------------ + * - Launches kernels that write the return values of size and thread_rank member + * functions of coalesced groups, created according to the generated mask, to an output array that + * is validated on the host side. The kernels are run sequentially, reusing the output array, to + * avoid running out of device memory for large kernel launches + * Test source + * ------------------------ + * - unit/cooperativeGrps/coalesced_group.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_Coalesced_Group_Getters_Positive_Basic") { + int device; + hipDeviceProp_t device_properties; + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + + if (!device_properties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + const auto blocks = GenerateBlockDimensionsForShuffle(); + const auto threads = GenerateThreadDimensionsForShuffle(); + auto test_case = GENERATE(range(0, 4)); + uint64_t active_mask = get_active_mask(test_case); + INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); + INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); + INFO("Coalesced group mask: " << active_mask); + const CPUGrid grid(blocks, threads); + + LinearAllocGuard uint_arr_dev(LinearAllocs::hipMalloc, + grid.thread_count_ * sizeof(unsigned int)); + LinearAllocGuard uint_arr(LinearAllocs::hipHostMalloc, + grid.thread_count_ * sizeof(unsigned int)); + HIP_CHECK(hipMemset(uint_arr_dev.ptr(), 0, grid.thread_count_ * sizeof(unsigned int))); + + // Launch Kernel + coalesced_group_size_getter<<>>(uint_arr_dev.ptr(), active_mask); + + HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), + grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemset(uint_arr_dev.ptr(), 0, grid.thread_count_ * sizeof(unsigned int))); + HIP_CHECK(hipDeviceSynchronize()); + coalesced_group_thread_rank_getter + <<>>(uint_arr_dev.ptr(), active_mask); + + // Verify coalesced_group.size() values + unsigned int coalesced_size = 0; + const auto partitions_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize; + for (int i = 0; i < grid.thread_count_; i++) { + const auto rank_in_block = grid.thread_rank_in_block(i).value(); + const int rank_in_partition = rank_in_block % kWarpSize; + // If the number of threads in a block is not a multiple of warp size, the + // last warp will have inactive threads and coalesced group size must be recalculated + if (rank_in_block == (partitions_in_block - 1) * kWarpSize) { + unsigned int partition_size = + grid.threads_in_block_count_ - (partitions_in_block - 1) * kWarpSize; + coalesced_size = get_active_thread_count(active_mask, partition_size); + } else if (rank_in_block == 0) { + coalesced_size = get_active_thread_count(active_mask, kWarpSize); + } + if (active_mask & (static_cast(1) << rank_in_partition)) { + if (uint_arr.ptr()[i] != coalesced_size) { + REQUIRE(uint_arr.ptr()[i] == coalesced_size); + } + } + } + + HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), + grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + + // Verify coalesced_group.thread_rank() values + unsigned int coalesced_rank = 0; + for (int i = 0; i < grid.thread_count_; i++) { + const auto rank_in_block = grid.thread_rank_in_block(i).value(); + const int rank_in_partition = rank_in_block % kWarpSize; + if (rank_in_partition == 0) coalesced_rank = 0; + + if (active_mask & (static_cast(1) << rank_in_partition)) { + if (uint_arr.ptr()[i] != coalesced_rank) { + REQUIRE(uint_arr.ptr()[i] == coalesced_rank); + } + coalesced_rank++; + } + } +} + +/** + * Test Description + * ------------------------ + * - Launches kernels that write the return values of size and thread_rank member functions to an + * output array that is validated on the host side, while treating the coalesced group, created + * according to the generated mask, as a thread group. The kernels are run sequentially, reusing the + * output array, to avoid running out of device memory for large kernel launches + * Test source + * ------------------------ + * - unit/cooperativeGrps/coalesced_group.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_Coalesced_Group_Getters_Via_Base_Type_Positive_Basic") { + int device; + hipDeviceProp_t device_properties; + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + + if (!device_properties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + const auto blocks = GenerateBlockDimensionsForShuffle(); + const auto threads = GenerateThreadDimensionsForShuffle(); + auto test_case = GENERATE(range(0, 4)); + uint64_t active_mask = get_active_mask(test_case); + INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); + INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); + INFO("Coalesced group mask: " << active_mask); + + const CPUGrid grid(blocks, threads); + + LinearAllocGuard uint_arr_dev(LinearAllocs::hipMalloc, + grid.thread_count_ * sizeof(unsigned int)); + LinearAllocGuard uint_arr(LinearAllocs::hipHostMalloc, + grid.thread_count_ * sizeof(unsigned int)); + HIP_CHECK(hipMemset(uint_arr_dev.ptr(), 0, grid.thread_count_ * sizeof(unsigned int))); + + // Launch Kernel + coalesced_group_size_getter + <<>>(uint_arr_dev.ptr(), active_mask); + + HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), + grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemset(uint_arr_dev.ptr(), 0, grid.thread_count_ * sizeof(unsigned int))); + HIP_CHECK(hipDeviceSynchronize()); + coalesced_group_thread_rank_getter + <<>>(uint_arr_dev.ptr(), active_mask); + + // Verify coalesced_group.size() values + unsigned int coalesced_size = 0; + const auto partitions_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize; + for (int i = 0; i < grid.thread_count_; i++) { + const auto rank_in_block = grid.thread_rank_in_block(i).value(); + const int rank_in_partition = rank_in_block % kWarpSize; + // If the number of threads in a block is not a multiple of warp size, the + // last warp will have inactive threads and coalesced group size must be recalculated + if (rank_in_block == (partitions_in_block - 1) * kWarpSize) { + unsigned int partition_size = + grid.threads_in_block_count_ - (partitions_in_block - 1) * kWarpSize; + coalesced_size = get_active_thread_count(active_mask, partition_size); + } else if (rank_in_block == 0) { + coalesced_size = get_active_thread_count(active_mask, kWarpSize); + } + if (active_mask & (static_cast(1) << rank_in_partition)) { + if (uint_arr.ptr()[i] != coalesced_size) { + REQUIRE(uint_arr.ptr()[i] == coalesced_size); + } + } + } + + HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), + grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + + // Verify coalesced_group.thread_rank() values + unsigned int coalesced_rank = 0; + for (int i = 0; i < grid.thread_count_; i++) { + const auto rank_in_block = grid.thread_rank_in_block(i).value(); + const int rank_in_partition = rank_in_block % kWarpSize; + if (rank_in_partition == 0) coalesced_rank = 0; + + if (active_mask & (static_cast(1) << rank_in_partition)) { + if (uint_arr.ptr()[i] != coalesced_rank) { + REQUIRE(uint_arr.ptr()[i] == coalesced_rank); + } + coalesced_rank++; + } + } +} + +/** + * Test Description + * ------------------------ + * - Launches kernels that write the return values of size and thread_rank non-member functions + * of coalesced groups, created according to the generated mask, to an output array that is + * validated on the host side. The kernels are run sequentially, reusing the output array, to avoid + * running out of device memory for large kernel launches. + * Test source + * ------------------------ + * - unit/cooperativeGrps/coalesced_group.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_Coalesced_Group_Getters_Via_Non_Member_Functions_Positive_Basic") { + int device; + hipDeviceProp_t device_properties; + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + + if (!device_properties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + const auto blocks = GenerateBlockDimensionsForShuffle(); + const auto threads = GenerateThreadDimensionsForShuffle(); + auto test_case = GENERATE(range(0, 4)); + uint64_t active_mask = get_active_mask(test_case); + INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); + INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); + INFO("Coalesced group mask: " << active_mask); + + const CPUGrid grid(blocks, threads); + + LinearAllocGuard uint_arr_dev(LinearAllocs::hipMalloc, + grid.thread_count_ * sizeof(unsigned int)); + LinearAllocGuard uint_arr(LinearAllocs::hipHostMalloc, + grid.thread_count_ * sizeof(unsigned int)); + HIP_CHECK(hipMemset(uint_arr_dev.ptr(), 0, grid.thread_count_ * sizeof(unsigned int))); + + // Launch Kernel + coalesced_group_non_member_size_getter + <<>>(uint_arr_dev.ptr(), active_mask); + + HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), + grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemset(uint_arr_dev.ptr(), 0, grid.thread_count_ * sizeof(unsigned int))); + HIP_CHECK(hipDeviceSynchronize()); + coalesced_group_non_member_thread_rank_getter + <<>>(uint_arr_dev.ptr(), active_mask); + + // Verify coalesced_group.size() values + unsigned int coalesced_size = 0; + const auto partitions_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize; + for (int i = 0; i < grid.thread_count_; i++) { + const auto rank_in_block = grid.thread_rank_in_block(i).value(); + const int rank_in_partition = rank_in_block % kWarpSize; + // If the number of threads in a block is not a multiple of warp size, the + // last warp will have inactive threads and coalesced group size must be recalculated + if (rank_in_block == (partitions_in_block - 1) * kWarpSize) { + unsigned int partition_size = + grid.threads_in_block_count_ - (partitions_in_block - 1) * kWarpSize; + coalesced_size = get_active_thread_count(active_mask, partition_size); + } else if (rank_in_block == 0) { + coalesced_size = get_active_thread_count(active_mask, kWarpSize); + } + if (active_mask & (static_cast(1) << rank_in_partition)) { + if (uint_arr.ptr()[i] != coalesced_size) { + REQUIRE(uint_arr.ptr()[i] == coalesced_size); + } + } + } + + HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), + grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + + // Verify coalesced_group.thread_rank() values + unsigned int coalesced_rank = 0; + for (int i = 0; i < grid.thread_count_; i++) { + const auto rank_in_block = grid.thread_rank_in_block(i).value(); + const int rank_in_partition = rank_in_block % kWarpSize; + if (rank_in_partition == 0) coalesced_rank = 0; + + if (active_mask & (static_cast(1) << rank_in_partition)) { + if (uint_arr.ptr()[i] != coalesced_rank) { + REQUIRE(uint_arr.ptr()[i] == coalesced_rank); + } + coalesced_rank++; + } + } +} + +template +__global__ void coalesced_group_shfl_up(T* const out, const unsigned int delta, + const uint64_t active_mask) { + const cg::thread_block_tile tile = + cg::tiled_partition(cg::this_thread_block()); + if (active_mask & (static_cast(1) << tile.thread_rank())) { + cg::coalesced_group active = cg::coalesced_threads(); + T var = static_cast(active.thread_rank()); + out[thread_rank_in_grid()] = active.shfl_up(var, delta); + } +} + +template void CoalescedGroupShflUpTestImpl() { + const auto blocks = GenerateBlockDimensionsForShuffle(); + const auto threads = GenerateThreadDimensionsForShuffle(); + auto test_case = GENERATE(range(0, 4)); + uint64_t active_mask = get_active_mask(test_case); + INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); + INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); + INFO("Coalesced group mask: " << active_mask); + unsigned int active_thread_count = get_active_thread_count(active_mask, kWarpSize); + + auto delta = GENERATE(range(static_cast(0), kWarpSize)); + delta = delta % active_thread_count; + INFO("Delta: " << delta); + CPUGrid grid(blocks, threads); + + const auto alloc_size = grid.thread_count_ * sizeof(T); + LinearAllocGuard arr_dev(LinearAllocs::hipMalloc, alloc_size); + LinearAllocGuard arr(LinearAllocs::hipHostMalloc, alloc_size); + + coalesced_group_shfl_up<<>>(arr_dev.ptr(), delta, active_mask); + HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + + unsigned int coalesced_rank = 0; + for (int i = 0; i < grid.thread_count_; i++) { + const auto rank_in_block = grid.thread_rank_in_block(i).value(); + const int rank_in_partition = rank_in_block % kWarpSize; + if (rank_in_partition == 0) coalesced_rank = 0; + if (active_mask & (static_cast(1) << rank_in_partition)) { + int target = coalesced_rank - delta; + target = target < 0 ? coalesced_rank : target; + if (arr.ptr()[i] != target) { + REQUIRE(arr.ptr()[i] == target); + } + coalesced_rank++; + } + } +} + + +/** + * Test Description + * ------------------------ + * - Validates the shuffle up behavior of coalesced group, created according to the generated + * mask, for various delta values + * Test source + * ------------------------ + * - unit/cooperativeGrps/coalesced_group.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Shfl_Up_Positive_Basic", "", int, unsigned int, long, + unsigned long, long long, unsigned long long, float, double) { + CoalescedGroupShflUpTestImpl(); +} + +template +__global__ void coalesced_group_shfl_down(T* const out, const unsigned int delta, + const uint64_t active_mask) { + const cg::thread_block_tile tile = + cg::tiled_partition(cg::this_thread_block()); + if (active_mask & (static_cast(1) << tile.thread_rank())) { + cg::coalesced_group active = cg::coalesced_threads(); + T var = static_cast(active.thread_rank()); + out[thread_rank_in_grid()] = active.shfl_down(var, delta); + } +} + +template void CoalescedGroupShflDownTest() { + const auto blocks = GenerateBlockDimensionsForShuffle(); + const auto threads = GenerateThreadDimensionsForShuffle(); + auto test_case = GENERATE(range(0, 4)); + uint64_t active_mask = get_active_mask(test_case); + INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); + INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); + INFO("Coalesced group mask: " << active_mask); + unsigned int active_thread_count = get_active_thread_count(active_mask, kWarpSize); + + auto delta = GENERATE(range(static_cast(0), kWarpSize)); + delta = delta % active_thread_count; + INFO("Delta: " << delta); + CPUGrid grid(blocks, threads); + + const auto alloc_size = grid.thread_count_ * sizeof(T); + LinearAllocGuard arr_dev(LinearAllocs::hipMalloc, alloc_size); + LinearAllocGuard arr(LinearAllocs::hipHostMalloc, alloc_size); + + coalesced_group_shfl_down<<>>(arr_dev.ptr(), delta, active_mask); + HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + + unsigned int coalesced_rank = 0; + unsigned int coalesced_size = 0; + const auto partitions_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize; + for (int i = 0; i < grid.thread_count_; i++) { + const auto rank_in_block = grid.thread_rank_in_block(i).value(); + const int rank_in_partition = rank_in_block % kWarpSize; + if (rank_in_partition == 0) coalesced_rank = 0; + // If the number of threads in a block is not a multiple of warp size, the + // last warp will have inactive threads and coalesced group size must be recalculated + if (rank_in_block == (partitions_in_block - 1) * kWarpSize) { + unsigned int partition_size = + grid.threads_in_block_count_ - (partitions_in_block - 1) * kWarpSize; + coalesced_size = get_active_thread_count(active_mask, partition_size); + } else if (rank_in_block == 0) { + coalesced_size = get_active_thread_count(active_mask, kWarpSize); + } + if (active_mask & (static_cast(1) << rank_in_partition)) { + int target = coalesced_rank + delta; + target = target < coalesced_size ? target : coalesced_rank; + if (arr.ptr()[i] != target) { + REQUIRE(arr.ptr()[i] == target); + } + coalesced_rank++; + } + } +} + +/** + * Test Description + * ------------------------ + * - Validates the shuffle down behavior of coalesced group, created according to the generated + * mask, for various delta values + * Test source + * ------------------------ + * - unit/cooperativeGrps/coalesced_group.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Shfl_Down_Positive_Basic", "", int, unsigned int, long, + unsigned long, long long, unsigned long long, float, double) { + CoalescedGroupShflDownTest(); +} + +template +__global__ void coalesced_group_shfl(T* const out, uint8_t* target_lanes, + const uint64_t active_mask) { + const cg::thread_block_tile tile = + cg::tiled_partition(cg::this_thread_block()); + if (active_mask & (static_cast(1) << tile.thread_rank())) { + cg::coalesced_group active = cg::coalesced_threads(); + T var = static_cast(active.thread_rank()); + out[thread_rank_in_grid()] = active.shfl(var, target_lanes[active.thread_rank()]); + ; + } +} + +template void CoalescedGroupShflTest() { + const auto blocks = GenerateBlockDimensionsForShuffle(); + const auto threads = GenerateThreadDimensionsForShuffle(); + auto test_case = GENERATE(range(0, 4)); + uint64_t active_mask = get_active_mask(test_case); + INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); + INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); + INFO("Coalesced group mask: " << active_mask); + unsigned int active_thread_count = get_active_thread_count(active_mask, kWarpSize); + CPUGrid grid(blocks, threads); + + const auto alloc_size = grid.thread_count_ * sizeof(T); + LinearAllocGuard arr_dev(LinearAllocs::hipMalloc, alloc_size); + LinearAllocGuard arr(LinearAllocs::hipHostMalloc, alloc_size); + + LinearAllocGuard target_lanes_dev(LinearAllocs::hipMalloc, + active_thread_count * sizeof(uint8_t)); + LinearAllocGuard target_lanes(LinearAllocs::hipHostMalloc, + active_thread_count * sizeof(uint8_t)); + // Generate a couple different combinations for target lanes + for (auto i = 0u; i < active_thread_count; ++i) { + target_lanes.ptr()[i] = active_thread_count - 1 - i; + } + + HIP_CHECK(hipMemcpy(target_lanes_dev.ptr(), target_lanes.ptr(), + active_thread_count * sizeof(uint8_t), hipMemcpyHostToDevice)); + coalesced_group_shfl + <<>>(arr_dev.ptr(), target_lanes_dev.ptr(), active_mask); + HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + + + unsigned int coalesced_rank = 0; + unsigned int coalesced_size = 0; + const auto partitions_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize; + for (int i = 0; i < grid.thread_count_; i++) { + const auto rank_in_block = grid.thread_rank_in_block(i).value(); + const int rank_in_partition = rank_in_block % kWarpSize; + if (rank_in_partition == 0) coalesced_rank = 0; + // If the number of threads in a block is not a multiple of warp size, the + // last warp will have inactive threads and coalesced group size must be recalculated + if (rank_in_block == (partitions_in_block - 1) * kWarpSize) { + unsigned int partition_size = + grid.threads_in_block_count_ - (partitions_in_block - 1) * kWarpSize; + coalesced_size = get_active_thread_count(active_mask, partition_size); + } else if (rank_in_block == 0) { + coalesced_size = get_active_thread_count(active_mask, kWarpSize); + } + if (active_mask & (static_cast(1) << rank_in_partition)) { + auto target = target_lanes.ptr()[coalesced_rank]; + if (target >= coalesced_size) target = 0; + if (arr.ptr()[i] != target) { + REQUIRE(arr.ptr()[i] == target); + } + coalesced_rank++; + } + } +} + +/** + * Test Description + * ------------------------ + * - Validates the shuffle behavior of of coalesced group, created according to the generated + * mask, for generated shuffle target lanes + * Test source + * ------------------------ + * - unit/cooperativeGrps/coalesced_group.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Shfl_Positive_Basic", "", int, unsigned int, long, + unsigned long, long long, unsigned long long, float, double) { + CoalescedGroupShflTest(); +} + +static inline std::mt19937& GetRandomGenerator() { + static std::mt19937 mt(11); + return mt; +} + +template static inline T GenerateRandomInteger(const T min, const T max) { + std::uniform_int_distribution dist(min, max); + return dist(GetRandomGenerator()); +} + +template +__global__ void coalesced_group_sync_check(T* global_data, unsigned int* wait_modifiers, + const uint64_t active_mask) { + extern __shared__ uint8_t shared_data[]; + T* const data = use_global ? global_data : reinterpret_cast(shared_data); + const auto tid = cg::this_grid().thread_rank(); + const auto block = cg::this_thread_block(); + const cg::thread_block_tile partition = cg::tiled_partition(block); + + const auto data_idx = [&block](unsigned int i) { return use_global ? i : (i % block.size()); }; + + const auto partition_rank = block.thread_rank() / partition.size(); + + const auto block_base_idx = tid / block.size() * block.size(); + const auto tile_base_idx = block_base_idx + partition_rank * partition.size(); + const auto wait_modifier = wait_modifiers[tid]; + + if (active_mask & (static_cast(1) << partition.thread_rank())) { + cg::coalesced_group active = cg::coalesced_threads(); + busy_wait(wait_modifier); + data[data_idx(tid)] = active.thread_rank(); + active.sync(); + bool valid = true; + for (auto i = 0; i < active.size(); ++i) { + const auto expected = (active.thread_rank() + i) % active.size(); + unsigned int active_count = 0; + int offset = -1; + while (active_count <= expected) { + offset++; + if (active_mask & (static_cast(1) << offset)) active_count++; + } + + if (!(valid &= (data[data_idx(tile_base_idx + offset)] == expected))) { + break; + } + } + active.sync(); + data[data_idx(tid)] = valid; + + if constexpr (!use_global) { + global_data[tid] = data[data_idx(tid)]; + } + } +} + +template void CoalescedGroupSyncTest() { + const auto randomized_run_count = GENERATE(range(0, cmd_options.cg_iterations)); + const auto blocks = GenerateBlockDimensionsForShuffle(); + const auto threads = GenerateThreadDimensionsForShuffle(); + auto test_case = GENERATE(range(0, 4)); + uint64_t active_mask = get_active_mask(test_case); + INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); + INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); + INFO("Coalesced group mask: " << active_mask); + CPUGrid grid(blocks, threads); + + const auto alloc_size = grid.thread_count_ * sizeof(T); + const auto alloc_size_per_block = alloc_size / grid.block_count_; + + int max_shared_mem_per_block = 0; + HIP_CHECK(hipDeviceGetAttribute(&max_shared_mem_per_block, + hipDeviceAttributeMaxSharedMemoryPerBlock, 0)); + if (!global_memory && (max_shared_mem_per_block < alloc_size_per_block)) { + return; + } + + LinearAllocGuard arr_dev(LinearAllocs::hipMalloc, alloc_size); + LinearAllocGuard arr(LinearAllocs::hipHostMalloc, alloc_size); + LinearAllocGuard wait_modifiers_dev(LinearAllocs::hipMalloc, + grid.thread_count_ * sizeof(unsigned int)); + LinearAllocGuard wait_modifiers(LinearAllocs::hipHostMalloc, + grid.thread_count_ * sizeof(unsigned int)); + if (randomized_run_count != 0) { + std::generate(wait_modifiers.ptr(), wait_modifiers.ptr() + grid.thread_count_, + [] { return GenerateRandomInteger(0u, 1500u); }); + } else { + std::fill_n(wait_modifiers.ptr(), grid.thread_count_, 0u); + } + + const auto shared_memory_size = global_memory ? 0u : alloc_size_per_block; + HIP_CHECK(hipMemcpy(wait_modifiers_dev.ptr(), wait_modifiers.ptr(), + grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice)); + + coalesced_group_sync_check<<>>( + arr_dev.ptr(), wait_modifiers_dev.ptr(), active_mask); + HIP_CHECK(hipGetLastError()); + + HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + + for (int i = 0; i < grid.thread_count_; i++) { + const auto rank_in_block = grid.thread_rank_in_block(i).value(); + const int rank_in_partition = rank_in_block % kWarpSize; + if (active_mask & (static_cast(1) << rank_in_partition)) { + if (arr.ptr()[i] != 1) { + REQUIRE(arr.ptr()[i] == 1); + } + } + } +} + +/** + * Test Description + * ------------------------ + * - Launches a kernel where blocks are devided into coalesced groups and every thread writes its + * intra-tile rank into an array slot determined by its grid-wide linear index. The array is either + * in global or dynamic shared memory based on a compile time switch, and the test is run for arrays + * of 1, 2, and 4 byte elements. Before the write each thread executes a busy wait loop for a random + * amount of clock cycles, the amount being read from an input array. After the write a sync for + * active threads is performed and each thread validates that it can read the expected values that + * other active threads within the same coalesced group have written to their respective array + * slots. Each thread begins the validation from a given offset from its own index. For the first + * run of the test, all the offsets are zero, so memory reads should be coalesced as adjacent + * threads read from adjacent memory locations. On subsequent runs the offsets are randomized for + * each thread, leading to non-coalesced reads and cache thrashing. + * Test source + * ------------------------ + * - unit/cooperativeGrps/coalesced_group.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Sync_Positive_Basic", "", uint8_t, uint16_t, uint32_t) { + SECTION("Global memory") { CoalescedGroupSyncTest(); } + SECTION("Shared memory") { CoalescedGroupSyncTest(); } +} \ No newline at end of file diff --git a/catch/unit/cooperativeGrps/coalesced_groups_shfl_down.cc b/catch/unit/cooperativeGrps/coalesced_groups_shfl_down_old.cc similarity index 100% rename from catch/unit/cooperativeGrps/coalesced_groups_shfl_down.cc rename to catch/unit/cooperativeGrps/coalesced_groups_shfl_down_old.cc diff --git a/catch/unit/cooperativeGrps/coalesced_groups_shfl_up.cc b/catch/unit/cooperativeGrps/coalesced_groups_shfl_up_old.cc similarity index 100% rename from catch/unit/cooperativeGrps/coalesced_groups_shfl_up.cc rename to catch/unit/cooperativeGrps/coalesced_groups_shfl_up_old.cc