diff --git a/projects/hip-tests/catch/hipTestMain/config/config_amd_linux b/projects/hip-tests/catch/hipTestMain/config/config_amd_linux index 61369b364b..6d5a3d372a 100644 --- a/projects/hip-tests/catch/hipTestMain/config/config_amd_linux +++ b/projects/hip-tests/catch/hipTestMain/config/config_amd_linux @@ -252,32 +252,6 @@ "Unit_hipGraphUpload_Functional_multidevice_test", "=== Below tests fail in external CI for PR https://github.com/ROCm-Developer-Tools/hip-tests/pull/210 ===", "Unit_Assert_Positive_Basic_KernelFail", - "SWDEV-442805 : Below tests failed in stress test on 19/01/24 ===", - "Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - int", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned int", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - long long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned long long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - float", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - double", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - int", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned int", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - long long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned long long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - float", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - double", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - int", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned int", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - long long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned long long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - float", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - double", "Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint8_t", "Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint16_t", "Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint32_t", @@ -1250,26 +1224,6 @@ "Unit_safeAtomicMin_Positive_SameAddress - float", "=== SWDEV-454220 : Below test hanged in stress test on 22/03/24 ===", "Unit_hipExtLaunchKernel_Positive_Basic", - "=== SWDEV-454220 Below test fail in stress test 03/29/24 ===", - "Unit_Coalesced_Group_Shfl_Positive_Basic - int", - "Unit_Coalesced_Group_Shfl_Positive_Basic - unsigned int", - "Unit_Coalesced_Group_Shfl_Positive_Basic - long", - "Unit_Coalesced_Group_Shfl_Positive_Basic - unsigned long", - "Unit_Coalesced_Group_Shfl_Positive_Basic - long long", - "Unit_Coalesced_Group_Shfl_Positive_Basic - unsigned long long", - "Unit_Coalesced_Group_Shfl_Positive_Basic - float", - "Unit_Coalesced_Group_Shfl_Positive_Basic - double", - "Unit_Coalesced_Group_Sync_Positive_Basic - uint8_t", - "Unit_Coalesced_Group_Sync_Positive_Basic - uint16_t", - "Unit_Coalesced_Group_Sync_Positive_Basic - uint32_t", - "Unit_Coalesced_Group_Shfl_Up_Positive_Basic - int", - "Unit_Coalesced_Group_Shfl_Up_Positive_Basic - unsigned int", - "Unit_Coalesced_Group_Shfl_Up_Positive_Basic - long", - "Unit_Coalesced_Group_Shfl_Up_Positive_Basic - unsigned long", - "Unit_Coalesced_Group_Shfl_Up_Positive_Basic - long long", - "Unit_Coalesced_Group_Shfl_Up_Positive_Basic - unsigned long long", - "Unit_Coalesced_Group_Shfl_Up_Positive_Basic - float", - "Unit_Coalesced_Group_Shfl_Up_Positive_Basic - double", #endif #if defined gfx1030 "=== SWDEV-445961: These tests hang in PSDB stress test on 09/02/2024 ===", diff --git a/projects/hip-tests/catch/hipTestMain/config/config_amd_windows b/projects/hip-tests/catch/hipTestMain/config/config_amd_windows index 190470c5ef..035385cc79 100644 --- a/projects/hip-tests/catch/hipTestMain/config/config_amd_windows +++ b/projects/hip-tests/catch/hipTestMain/config/config_amd_windows @@ -746,32 +746,6 @@ "Unit_Device___uhadd_Sanity_Positive", "Unit_Device___rhadd_Sanity_Positive", "Unit_Device___urhadd_Sanity_Positive", - "SWDEV-435667 : Below tests failed in stress test on 19/01/24 ===", - "Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - int", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned int", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - long long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned long long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - float", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - double", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - int", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned int", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - long long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned long long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - float", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - double", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - int", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned int", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - long long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned long long", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - float", - "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - double", "Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint8_t", "Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint16_t", "Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint32_t", diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group.cc b/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group.cc index b24d0b0aa4..c418cebd67 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group.cc @@ -50,7 +50,7 @@ static __global__ void coalesced_group_size_getter(unsigned int* sizes, uint64_t template static __global__ void coalesced_group_thread_rank_getter(unsigned int* thread_ranks, uint64_t active_mask) { - #if (__GFX8__ || __GFX9__) + #if (__GFX8__ || __GFX9__) constexpr unsigned int ksize = 64; #else constexpr unsigned int ksize = 32; @@ -65,11 +65,11 @@ static __global__ void coalesced_group_thread_rank_getter(unsigned int* thread_r static __global__ void coalesced_group_non_member_size_getter(unsigned int* sizes, uint64_t active_mask) { - #if (__GFX8__ || __GFX9__) + #if (__GFX8__ || __GFX9__) constexpr unsigned int ksize = 64; #else constexpr unsigned int ksize = 32; - #endif + #endif const cg::thread_block_tile tile = cg::tiled_partition(cg::this_thread_block()); if (active_mask & (static_cast(1) << tile.thread_rank())) { @@ -80,7 +80,7 @@ static __global__ void coalesced_group_non_member_size_getter(unsigned int* size static __global__ void coalesced_group_non_member_thread_rank_getter(unsigned int* thread_ranks, uint64_t active_mask) { - #if (__GFX8__ || _GFX9__) + #if (__GFX8__ || _GFX9__) constexpr unsigned int ksize = 64; #else constexpr unsigned int ksize = 32; @@ -427,7 +427,7 @@ template void CoalescedGroupShflUpTestImpl() { INFO("Coalesced group mask: " << active_mask); unsigned int active_thread_count = get_active_thread_count(active_mask, warp_size); - auto delta = GENERATE(range(static_cast(0), kWarpSize)); + auto delta = GENERATE(range(static_cast(0), static_cast(getWarpSize()))); delta = delta % active_thread_count; INFO("Delta: " << delta); CPUGrid grid(blocks, threads); @@ -481,7 +481,7 @@ __global__ void coalesced_group_shfl_down(T* const out, const unsigned int delta constexpr unsigned int ksize = 64; #else constexpr unsigned int ksize = 32; - #endif + #endif const cg::thread_block_tile tile = cg::tiled_partition(cg::this_thread_block()); if (active_mask & (static_cast(1) << tile.thread_rank())) { @@ -509,7 +509,7 @@ template void CoalescedGroupShflDownTest() { INFO("Coalesced group mask: " << active_mask); unsigned int active_thread_count = get_active_thread_count(active_mask, warp_size); - auto delta = GENERATE(range(static_cast(0), kWarpSize)); + auto delta = GENERATE(range(static_cast(0), static_cast(getWarpSize()))); delta = delta % active_thread_count; INFO("Delta: " << delta); CPUGrid grid(blocks, threads); @@ -573,7 +573,7 @@ __global__ void coalesced_group_shfl(T* const out, uint8_t* target_lanes, constexpr unsigned int ksize = 64; #else constexpr unsigned int ksize = 32; - #endif + #endif const cg::thread_block_tile tile = cg::tiled_partition(cg::this_thread_block()); if (active_mask & (static_cast(1) << tile.thread_rank())) { diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group_tiled_partition.cc b/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group_tiled_partition.cc index 2353d20470..11f362b2b2 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group_tiled_partition.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group_tiled_partition.cc @@ -67,47 +67,43 @@ template static inline T GenerateRandomInteger(const T min, const T return dist(GetRandomGenerator()); } -template static auto coalesce_threads(const uint64_t mask) { - std::tuple, unsigned int> res; +static auto coalesce_threads(const uint64_t mask, unsigned int warp_size) { + std::tuple, unsigned int> res; auto& [threads, count] = res; - count = 0u; for (auto i = 0u; i < warp_size; ++i) { if (mask & (1u << i)) { - threads[count++] = i; + threads.push_back(i); } } return res; } -template __device__ bool deactivate_thread(uint64_t* active_masks) { - const cg::thread_block_tile warp = - cg::tiled_partition(cg::this_thread_block()); +__device__ bool deactivate_thread(uint64_t* active_masks, unsigned int warp_size) { + const auto warp = cg::tiled_partition(cg::this_thread_block(), warp_size); const auto block = cg::this_thread_block(); const auto warps_per_block = (block.size() + warp_size - 1) / warp_size; const auto block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x; - const auto idx = block_rank * warps_per_block + block.thread_rank() / warp.size(); - + const auto idx = block_rank * warps_per_block + block.thread_rank() / warp_size; return !(active_masks[idx] & (1u << warp.thread_rank())); } - -template __global__ void coalesced_group_tiled_partition_size_getter(uint64_t* active_masks, unsigned int tile_size, - unsigned int* sizes) { - if (deactivate_thread(active_masks)) { + unsigned int* sizes, + unsigned int warp_size) { + if (deactivate_thread(active_masks, warp_size)) { return; } sizes[thread_rank_in_grid()] = cg::tiled_partition(cg::coalesced_threads(), tile_size).size(); } -template __global__ void coalesced_group_tiled_partition_thread_rank_getter(uint64_t* active_masks, unsigned int tile_size, - unsigned int* sizes) { - if (deactivate_thread(active_masks)) { + unsigned int* sizes, + unsigned int warp_size) { + if (deactivate_thread(active_masks, warp_size)) { return; } @@ -133,6 +129,7 @@ TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") { INFO("Tile size: " << tile_size); auto blocks = GenerateBlockDimensions(); auto threads = GenerateThreadDimensions(); + int warp_size = getWarpSize(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); CPUGrid grid(blocks, threads); @@ -141,38 +138,38 @@ TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") { LinearAllocGuard uint_arr_dev(LinearAllocs::hipMalloc, alloc_size); LinearAllocGuard uint_arr(LinearAllocs::hipHostMalloc, alloc_size); - const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize; + const auto warps_in_block = (grid.threads_in_block_count_ + warp_size - 1) / warp_size; const auto warps_in_grid = warps_in_block * grid.block_count_; LinearAllocGuard active_masks_dev(LinearAllocs::hipMalloc, warps_in_grid * sizeof(uint64_t)); LinearAllocGuard active_masks(LinearAllocs::hipHostMalloc, warps_in_grid * sizeof(uint64_t)); - std::generate(active_masks.ptr(), active_masks.ptr() + warps_in_grid, - [] { return GenerateRandomInteger(0u, std::numeric_limits().max()); }); + std::generate(active_masks.ptr(), active_masks.ptr() + warps_in_grid, [] { + return GenerateRandomInteger((uint64_t)0, std::numeric_limits().max()); + }); HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks.ptr(), warps_in_grid * sizeof(uint64_t), hipMemcpyHostToDevice)); HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size)); - coalesced_group_tiled_partition_size_getter<32> - <<>>(active_masks_dev.ptr(), tile_size, uint_arr_dev.ptr()); + coalesced_group_tiled_partition_size_getter<<>>( + active_masks_dev.ptr(), tile_size, uint_arr_dev.ptr(), warp_size); HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size)); - coalesced_group_tiled_partition_thread_rank_getter<32> - <<>>(active_masks_dev.ptr(), tile_size, uint_arr_dev.ptr()); + coalesced_group_tiled_partition_thread_rank_getter<<>>( + active_masks_dev.ptr(), tile_size, uint_arr_dev.ptr(), warp_size); - const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_; + const auto tail = warps_in_block * warp_size - grid.threads_in_block_count_; // validate size for (auto i = 0u; i < warps_in_grid; ++i) { - auto current_warp_mask = active_masks.ptr()[i]; - const auto shift_amount = - (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block); + uint64_t current_warp_mask = active_masks.ptr()[i]; + const auto shift_amount = (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block); current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount; const auto [active_threads, active_thread_count] = - coalesce_threads(current_warp_mask); + coalesce_threads(current_warp_mask, warp_size); const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block); const auto num_tiles = (active_thread_count + tile_size - 1) / tile_size; @@ -183,7 +180,7 @@ TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") { const auto window_end = t + tile_size; // Iterate through window for (auto k = window_start; k < window_end && k < active_thread_count; ++k) { - const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails; + const auto global_thread_idx = i * warp_size + active_threads[k] - tails; const auto expected_val = tile_size - tile_tail * (t + tile_size >= active_thread_count); const auto actual_val = uint_arr.ptr()[global_thread_idx]; INFO("global index: " << global_thread_idx); @@ -199,13 +196,12 @@ TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") { // validate rank for (auto i = 0u; i < warps_in_grid; ++i) { - auto current_warp_mask = active_masks.ptr()[i]; - const auto shift_amount = - (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block); + uint64_t current_warp_mask = active_masks.ptr()[i]; + const auto shift_amount = (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block); current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount; const auto [active_threads, active_thread_count] = - coalesce_threads(current_warp_mask); + coalesce_threads(current_warp_mask, warp_size); const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block); // Step tile-sized window over active threads @@ -214,7 +210,7 @@ TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") { const auto window_end = t + tile_size; // Iterate through window for (auto k = window_start; k < window_end && k < active_thread_count; ++k) { - const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails; + const auto global_thread_idx = i * warp_size + active_threads[k] - tails; const auto expected_val = k % tile_size; const auto actual_val = uint_arr.ptr()[global_thread_idx]; INFO("global index: " << global_thread_idx); @@ -227,15 +223,15 @@ TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") { } -template +template __global__ void coalesced_group_tiled_partition_shfl_up(uint64_t* active_masks, T* const out, const unsigned int tile_size, - const unsigned int delta) { - if (deactivate_thread(active_masks)) { + const unsigned int delta, + unsigned int warp_size) { + if (deactivate_thread(active_masks, warp_size)) { return; } - const cg::thread_block_tile warp = - cg::tiled_partition(cg::this_thread_block()); + const auto warp = cg::tiled_partition(cg::this_thread_block(), warp_size); T var = static_cast(warp.thread_rank()); const auto tile = cg::tiled_partition(cg::coalesced_threads(), tile_size); @@ -248,6 +244,7 @@ template static void CoalescedGroupTiledPartitonShflUpTestImpl() { INFO("Tile size: " << tile_size); auto blocks = GenerateBlockDimensionsForShuffle(); auto threads = GenerateThreadDimensionsForShuffle(); + auto warp_size = getWarpSize(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); const auto delta = GENERATE_COPY(range(0u, tile_size)); @@ -258,7 +255,7 @@ template static void CoalescedGroupTiledPartitonShflUpTestImpl() { LinearAllocGuard uint_arr_dev(LinearAllocs::hipMalloc, alloc_size); LinearAllocGuard uint_arr(LinearAllocs::hipHostMalloc, alloc_size); - const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize; + const auto warps_in_block = (grid.threads_in_block_count_ + warp_size - 1) / warp_size; const auto warps_in_grid = warps_in_block * grid.block_count_; LinearAllocGuard active_masks_dev(LinearAllocs::hipMalloc, warps_in_grid * sizeof(uint64_t)); @@ -270,22 +267,21 @@ template static void CoalescedGroupTiledPartitonShflUpTestImpl() { HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks.ptr(), warps_in_grid * sizeof(uint64_t), hipMemcpyHostToDevice)); HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size)); - coalesced_group_tiled_partition_shfl_up - <<>>(active_masks_dev.ptr(), uint_arr_dev.ptr(), tile_size, delta); + coalesced_group_tiled_partition_shfl_up<<>>( + active_masks_dev.ptr(), uint_arr_dev.ptr(), tile_size, delta, warp_size); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); - const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_; + const auto tail = warps_in_block * warp_size - grid.threads_in_block_count_; for (auto i = 0u; i < warps_in_grid; ++i) { auto current_warp_mask = active_masks.ptr()[i]; - const auto shift_amount = - (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block); + const auto shift_amount = (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block); current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount; const auto [active_threads, active_thread_count] = - coalesce_threads(current_warp_mask); + coalesce_threads(current_warp_mask, warp_size); const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block); // Step tile-sized window over active threads @@ -294,7 +290,7 @@ template static void CoalescedGroupTiledPartitonShflUpTestImpl() { const auto window_end = t + tile_size; // Iterate through window for (auto k = window_start; k < window_end && k < active_thread_count; ++k) { - const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails; + const auto global_thread_idx = i * warp_size + active_threads[k] - tails; const auto expected_val = active_threads[k - delta]; const auto actual_val = uint_arr.ptr()[global_thread_idx]; INFO("global index: " << global_thread_idx); @@ -327,15 +323,15 @@ TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic" } -template +template __global__ void coalesced_group_tiled_partition_shfl_down(uint64_t* active_masks, T* const out, const unsigned int tile_size, - const unsigned int delta) { - if (deactivate_thread(active_masks)) { + const unsigned int delta, + unsigned int warp_size) { + if (deactivate_thread(active_masks, warp_size)) { return; } - const cg::thread_block_tile warp = - cg::tiled_partition(cg::this_thread_block()); + const auto warp = cg::tiled_partition(cg::this_thread_block(), warp_size); T var = static_cast(warp.thread_rank()); const auto tile = cg::tiled_partition(cg::coalesced_threads(), tile_size); @@ -348,6 +344,7 @@ template static void CoalescedGroupTiledPartitonShflDownTestImpl() INFO("Tile size: " << tile_size); auto blocks = GenerateBlockDimensionsForShuffle(); auto threads = GenerateThreadDimensionsForShuffle(); + auto warp_size = getWarpSize(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); const auto delta = GENERATE_COPY(range(0u, tile_size)); @@ -358,7 +355,7 @@ template static void CoalescedGroupTiledPartitonShflDownTestImpl() LinearAllocGuard uint_arr_dev(LinearAllocs::hipMalloc, alloc_size); LinearAllocGuard uint_arr(LinearAllocs::hipHostMalloc, alloc_size); - const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize; + const auto warps_in_block = (grid.threads_in_block_count_ + warp_size - 1) / warp_size; const auto warps_in_grid = warps_in_block * grid.block_count_; LinearAllocGuard active_masks_dev(LinearAllocs::hipMalloc, warps_in_grid * sizeof(uint64_t)); @@ -370,22 +367,21 @@ template static void CoalescedGroupTiledPartitonShflDownTestImpl() HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks.ptr(), warps_in_grid * sizeof(uint64_t), hipMemcpyHostToDevice)); HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size)); - coalesced_group_tiled_partition_shfl_down - <<>>(active_masks_dev.ptr(), uint_arr_dev.ptr(), tile_size, delta); + coalesced_group_tiled_partition_shfl_down<<>>( + active_masks_dev.ptr(), uint_arr_dev.ptr(), tile_size, delta, warp_size); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); - const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_; + const auto tail = warps_in_block * warp_size - grid.threads_in_block_count_; for (auto i = 0u; i < warps_in_grid; ++i) { auto current_warp_mask = active_masks.ptr()[i]; - const auto shift_amount = - (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block); + const auto shift_amount = (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block); current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount; const auto [active_threads, active_thread_count] = - coalesce_threads(current_warp_mask); + coalesce_threads(current_warp_mask, warp_size); if (delta >= active_thread_count) { continue; @@ -398,7 +394,7 @@ template static void CoalescedGroupTiledPartitonShflDownTestImpl() const auto window_end = t + tile_size - delta; // Iterate through window for (auto k = window_start; k < window_end && k < active_thread_count - delta; ++k) { - const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails; + const auto global_thread_idx = i * warp_size + active_threads[k] - tails; const auto expected_val = active_threads[k + delta]; const auto actual_val = uint_arr.ptr()[global_thread_idx]; INFO("global index: " << global_thread_idx); @@ -431,14 +427,14 @@ TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basi } -template +template __global__ void coalesced_group_tiled_partition_shfl(uint64_t* active_masks, uint8_t* target_lanes, - T* const out, const unsigned int tile_size) { - if (deactivate_thread(active_masks)) { + T* const out, const unsigned int tile_size, + unsigned int warp_size) { + if (deactivate_thread(active_masks, warp_size)) { return; } - const cg::thread_block_tile warp = - cg::tiled_partition(cg::this_thread_block()); + const auto warp = cg::tiled_partition(cg::this_thread_block(), warp_size); T var = static_cast(warp.thread_rank()); const auto tile = cg::tiled_partition(cg::coalesced_threads(), tile_size); @@ -450,6 +446,7 @@ template static void CoalescedGroupTiledPartitonShflTestImpl() { INFO("Tile size: " << tile_size); auto blocks = GenerateBlockDimensionsForShuffle(); auto threads = GenerateThreadDimensionsForShuffle(); + auto warp_size = getWarpSize(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); CPUGrid grid(blocks, threads); @@ -458,7 +455,7 @@ template static void CoalescedGroupTiledPartitonShflTestImpl() { LinearAllocGuard uint_arr_dev(LinearAllocs::hipMalloc, alloc_size); LinearAllocGuard uint_arr(LinearAllocs::hipHostMalloc, alloc_size); - const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize; + const auto warps_in_block = (grid.threads_in_block_count_ + warp_size - 1) / warp_size; const auto warps_in_grid = warps_in_block * grid.block_count_; LinearAllocGuard active_masks_dev(LinearAllocs::hipMalloc, warps_in_grid * sizeof(uint64_t)); @@ -476,22 +473,21 @@ template static void CoalescedGroupTiledPartitonShflTestImpl() { HIP_CHECK(hipMemcpy(target_lanes_dev.ptr(), target_lanes.ptr(), tile_size * sizeof(uint8_t), hipMemcpyHostToDevice)); HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size)); - coalesced_group_tiled_partition_shfl<<>>( - active_masks_dev.ptr(), target_lanes_dev.ptr(), uint_arr_dev.ptr(), tile_size); + coalesced_group_tiled_partition_shfl<<>>( + active_masks_dev.ptr(), target_lanes_dev.ptr(), uint_arr_dev.ptr(), tile_size, warp_size); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); - const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_; + const auto tail = warps_in_block * warp_size - grid.threads_in_block_count_; for (auto i = 0u; i < warps_in_grid; ++i) { auto current_warp_mask = active_masks.ptr()[i]; - const auto shift_amount = - (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block); + const auto shift_amount = (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block); current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount; const auto [active_threads, active_thread_count] = - coalesce_threads(current_warp_mask); + coalesce_threads(current_warp_mask, warp_size); const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block); // Step tile-sized window over active threads @@ -500,7 +496,7 @@ template static void CoalescedGroupTiledPartitonShflTestImpl() { const auto window_end = t + tile_size; // Iterate through window for (auto k = window_start; k < window_end && k < active_thread_count; ++k) { - const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails; + const auto global_thread_idx = i * warp_size + active_threads[k] - tails; const auto target_lane = target_lanes.ptr()[k % tile_size]; if (target_lane >= tile_size || target_lane >= active_thread_count - t) { continue; @@ -537,11 +533,12 @@ TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic", " } -template +template __global__ void coalesced_group_tiled_partition_sync_check(uint64_t* active_masks, T* global_data, unsigned int* wait_modifiers, - size_t tile_size) { - if (deactivate_thread(active_masks)) { + size_t tile_size, + unsigned int warp_size) { + if (deactivate_thread(active_masks, warp_size)) { return; } @@ -591,6 +588,7 @@ template void CoalescedGroupTiledPartitionSyncT INFO("Tile size: " << tile_size); auto blocks = GenerateBlockDimensionsForShuffle(); auto threads = GenerateThreadDimensionsForShuffle(); + auto warp_size = getWarpSize(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); CPUGrid grid(blocks, threads); @@ -610,7 +608,7 @@ template void CoalescedGroupTiledPartitionSyncT grid.thread_count_ * sizeof(unsigned int)); LinearAllocGuard wait_modifiers(LinearAllocs::hipHostMalloc, grid.thread_count_ * sizeof(unsigned int)); - const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize; + const auto warps_in_block = (grid.threads_in_block_count_ + warp_size - 1) / warp_size; const auto warps_in_grid = warps_in_block * grid.block_count_; LinearAllocGuard active_masks_dev(LinearAllocs::hipMalloc, warps_in_grid * sizeof(uint64_t)); @@ -631,24 +629,23 @@ template void CoalescedGroupTiledPartitionSyncT grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice)); const auto shared_memory_size = global_memory ? 0u : alloc_size_per_block; - coalesced_group_tiled_partition_sync_check + coalesced_group_tiled_partition_sync_check <<>>(active_masks_dev.ptr(), arr_dev.ptr(), - wait_modifiers_dev.ptr(), tile_size); + wait_modifiers_dev.ptr(), tile_size, warp_size); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); HIP_CHECK(hipDeviceSynchronize()); - const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_; + const auto tail = warps_in_block * warp_size - grid.threads_in_block_count_; for (int i = 0u; i < grid.block_count_; ++i) { for (int j = 0u; j < warps_in_block; ++j) { const auto warp_idx = i * warps_in_block + j; auto mask = active_masks.ptr()[warp_idx]; - const auto shift_amount = - (tail + 32 * TestContext::get().isNvidia()) * !((warp_idx + 1) % warps_in_block); + const auto shift_amount = (tail + 32 * TestContext::get().isNvidia()) * !((warp_idx + 1) % warps_in_block); mask = (mask << shift_amount) >> shift_amount; const auto active_count = std::bitset(mask).count(); - const auto start_offset = i * grid.threads_in_block_count_ + j * kWarpSize; + const auto start_offset = i * grid.threads_in_block_count_ + j * warp_size; const auto end_offset = start_offset + active_count; const auto valid = std::all_of(arr.ptr() + start_offset, arr.ptr() + end_offset, [](T e) { return e; }); @@ -686,6 +683,6 @@ TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic", " } /** -* End doxygen group DeviceLanguageTest. -* @} -*/ + * End doxygen group DeviceLanguageTest. + * @} + */ diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh b/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh index 2eb1314d4f..dbaa10dec5 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh +++ b/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh @@ -23,7 +23,6 @@ THE SOFTWARE. #include namespace { -constexpr size_t kWarpSize = 32; constexpr int kMaxGPUs = 8; } // namespace @@ -61,4 +60,4 @@ template bool CheckDimensions(unsigned int device, T kernel, dim3 bloc } return true; -} \ No newline at end of file +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/multi_grid_group.cc b/projects/hip-tests/catch/unit/cooperativeGrps/multi_grid_group.cc index 0a4a767a4e..9a1ac00527 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/multi_grid_group.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/multi_grid_group.cc @@ -115,6 +115,7 @@ static void get_multi_grid_dims(dim3& grid_dim, dim3& block_dim, unsigned int de HIP_CHECK(hipSetDevice(device)) HIP_CHECK(hipGetDeviceProperties(&props, 0)); int sm = props.multiProcessorCount; + auto warp_size = getWarpSize(); std::vector block_dim_values = {dim3(1, 1, 1), dim3(props.maxThreadsDim[0], 1, 1), dim3(1, props.maxThreadsDim[1], 1), @@ -123,8 +124,8 @@ static void get_multi_grid_dims(dim3& grid_dim, dim3& block_dim, unsigned int de dim3(32, 32, 1), dim3(64, 8, 2), dim3(16, 16, 3), - dim3(kWarpSize - 1, 3, 3), - dim3(kWarpSize + 1, 3, 3)}; + dim3(warp_size - 1, 3, 3), + dim3(warp_size + 1, 3, 3)}; std::vector grid_dim_values = {dim3(1, 1, 1), dim3(static_cast(0.5 * sm), 1, 3), dim3(4, static_cast(0.5 * sm), 1),