From cac67a0f328c9ae414495ca4804e2c0bec374f36 Mon Sep 17 00:00:00 2001 From: amilanov-amd Date: Tue, 27 Jan 2026 11:51:08 +0100 Subject: [PATCH] SWDEV-521760 - Fix and enable disabled HIP tests from cooperative groups group (#2027) * Reworked Unit_hipLaunchCooperativeKernel_Basic and Unit_hipLaunchCooperativeKernelMultiDevice_Basic * Introduce reduction_factor for coop groups tests. Fix Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic * Fix always false requirement by adding a cast * Change data type to unsigned long long to align with cuda * Change literal type to double to ensure proper type casting * Remove formatting comments --- .../catch/hipTestMain/config/config_amd_linux | 6 - projects/hip-tests/catch/hipTestMain/main.cc | 3 + .../hip-tests/catch/include/cmd_options.hh | 4 +- projects/hip-tests/catch/include/cpu_grid.h | 82 ++++------- .../unit/cooperativeGrps/coalesced_group.cc | 73 ++++------ .../coalesced_group_tiled_partition.cc | 26 +++- .../cooperative_groups_common.hh | 4 + .../catch/unit/cooperativeGrps/grid_group.cc | 8 +- ...pLaunchCooperativeKernelMultiDevice_old.cc | 129 +++++++++--------- .../hipLaunchCooperativeKernel_old.cc | 71 +++++----- .../unit/cooperativeGrps/thread_block_tile.cc | 38 +++++- 11 files changed, 226 insertions(+), 218 deletions(-) diff --git a/projects/hip-tests/catch/hipTestMain/config/config_amd_linux b/projects/hip-tests/catch/hipTestMain/config/config_amd_linux index b4da228100..474729107a 100644 --- a/projects/hip-tests/catch/hipTestMain/config/config_amd_linux +++ b/projects/hip-tests/catch/hipTestMain/config/config_amd_linux @@ -46,10 +46,6 @@ "Unit_hipGraphicsUnmapResources_Negative_Parameters", "Unit_hipGraphicsUnregisterResource_Negative_Parameters", "SWDEV-446588 - Disable graph multi gpu testcases until graph has support for it", - "=== Below tests fail in external CI for PR https://github.com/ROCm-Developer-Tools/hip-tests/pull/210 ===", - "Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint8_t", - "Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint16_t", - "Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint32_t", "SWDEV-447384, SWDEV-447932: These tests fail in gfx1100, gfx1101 & gfx1102", "SWDEV-445928: These tests fail in PSDB stress test on 09/02/2024", "Unit_hipGraphInstantiateWithFlags_StreamCaptureDeviceContextChg", @@ -143,8 +139,6 @@ "Unit_hipStreamPerThread_MultiThread", #endif #if defined gfx90a || defined gfx942 || defined gfx950 - "=== SWDEV-443630 : Below test failed in stress test on 19/01/24 ===", - "Unit_Multi_Grid_Group_Positive_Sync", "Unit_Warp_Shfl_Up_Positive_Basic - int", "Unit_Warp_Shfl_Up_Positive_Basic - unsigned int", "Unit_Warp_Shfl_Up_Positive_Basic - long", diff --git a/projects/hip-tests/catch/hipTestMain/main.cc b/projects/hip-tests/catch/hipTestMain/main.cc index f7d5b282a2..d1042404dc 100755 --- a/projects/hip-tests/catch/hipTestMain/main.cc +++ b/projects/hip-tests/catch/hipTestMain/main.cc @@ -54,6 +54,9 @@ int main(int argc, char** argv) { | Opt(cmd_options.cg_iterations, "cg_iterations") ["-C"]["--cg-iterations"] ("Number of iterations used for cooperative groups sync tests (default: 5)") + | Opt(cmd_options.cg_reduction_factor, "cg_reduction_factor") + ["-C"]["--cg-reduction-factor"] + ("Percentage of warp sizes for shuffle tests to be actually tested (default: 10)") // TODO | Opt(cmd_options.accuracy_iterations, "accuracy_iterations") ["-A"]["--accuracy-iterations"] ("Number of iterations used for math accuracy tests with randomly generated inputs (default: 2^32)") diff --git a/projects/hip-tests/catch/include/cmd_options.hh b/projects/hip-tests/catch/include/cmd_options.hh index e736c63472..0c17e12619 100644 --- a/projects/hip-tests/catch/include/cmd_options.hh +++ b/projects/hip-tests/catch/include/cmd_options.hh @@ -28,8 +28,8 @@ THE SOFTWARE. struct CmdOptions { int iterations = 5; int warmups = 5; - int cg_extended_run = 5; - int cg_iterations = 2; + int cg_iterations = 1; + double cg_reduction_factor = 6.25; bool no_display = false; bool progress = false; uint64_t accuracy_iterations = std::numeric_limits::max() + 1ull; diff --git a/projects/hip-tests/catch/include/cpu_grid.h b/projects/hip-tests/catch/include/cpu_grid.h index 8f2ba6f05b..858a4db5b4 100644 --- a/projects/hip-tests/catch/include/cpu_grid.h +++ b/projects/hip-tests/catch/include/cpu_grid.h @@ -114,11 +114,9 @@ struct CPUMultiGrid { }; /* Generate dimensions for 1D, 2D and 3D blocks of threads */ -inline dim3 GenerateThreadDimensions() { +inline dim3 GenerateThreadDimensionsImpl(const std::initializer_list& multipliers) { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, 0)); - const auto multipliers = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, - 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5}; return GENERATE_COPY( dim3(1, 1, 1), dim3(props.maxThreadsDim[0], 1, 1), dim3(1, props.maxThreadsDim[1], 1), dim3(1, 1, props.maxThreadsDim[2]), @@ -135,59 +133,39 @@ inline dim3 GenerateThreadDimensions() { dim3(props.warpSize - 1, 3, 3), dim3(props.warpSize + 1, 3, 3)); } +inline dim3 GenerateThreadDimensions() { + const auto multipliers = {0.1, 0.5, 1.0, 1.5, 2.0}; + return GenerateThreadDimensionsImpl(multipliers); +} + +inline dim3 GenerateThreadDimensionsForShuffle() { + const auto multipliers = {0.5, 1.0, 2.0}; + return GenerateThreadDimensionsImpl(multipliers); +} + /* Generate dimensions for 1D, 2D and 3D grids of blocks */ +inline dim3 GenerateBlockDimensionsImpl(const std::initializer_list& multipliers) { + hipDeviceProp_t props; + HIP_CHECK(hipGetDeviceProperties(&props, 0)); + return GENERATE_COPY(dim3(1, 1, 1), + map([sm = props.multiProcessorCount]( + double i) { return dim3(static_cast(i * sm), 1, 1); }, + values(multipliers)), + map([sm = props.multiProcessorCount]( + double i) { return dim3(1, static_cast(i * sm), 1); }, + values(multipliers)), + map([sm = props.multiProcessorCount]( + double i) { return dim3(1, 1, static_cast(i * sm)); }, + values(multipliers)), + dim3(5, 5, 5)); +} + inline dim3 GenerateBlockDimensions() { - hipDeviceProp_t props; - HIP_CHECK(hipGetDeviceProperties(&props, 0)); - const auto multipliers = {0.5, 0.9, 1.0, 1.1, 1.5, 1.9, 2.0, 3.0, 4.0}; - return GENERATE_COPY(dim3(1, 1, 1), - map([sm = props.multiProcessorCount]( - double i) { return dim3(static_cast(i * sm), 1, 1); }, - values(multipliers)), - map([sm = props.multiProcessorCount]( - double i) { return dim3(1, static_cast(i * sm), 1); }, - values(multipliers)), - map([sm = props.multiProcessorCount]( - double i) { return dim3(1, 1, static_cast(i * sm)); }, - values(multipliers)), - dim3(5, 5, 5)); + const auto multipliers = {0.5, 1.0, 1.5, 2.0}; + return GenerateBlockDimensionsImpl(multipliers); } -/* Generate dimensions for 1D, 2D and 3D blocks of threads - reduced set */ -inline dim3 GenerateThreadDimensionsForShuffle() { - hipDeviceProp_t props; - HIP_CHECK(hipGetDeviceProperties(&props, 0)); - const auto multipliers = {0.5, 0.9, 1.0, 1.5, 2.0}; - return GENERATE_COPY( - dim3(1, 1, 1), dim3(props.maxThreadsDim[0], 1, 1), dim3(1, props.maxThreadsDim[1], 1), - dim3(1, 1, props.maxThreadsDim[2]), - map([max = props.maxThreadsDim[0], warp_size = props.warpSize]( - double i) { return dim3(std::min(static_cast(i * warp_size), max), 1, 1); }, - values(multipliers)), - map([max = props.maxThreadsDim[1], warp_size = props.warpSize]( - double i) { return dim3(1, std::min(static_cast(i * warp_size), max), 1); }, - values(multipliers)), - map([max = props.maxThreadsDim[2], warp_size = props.warpSize]( - double i) { return dim3(1, 1, std::min(static_cast(i * warp_size), max)); }, - values(multipliers)), - dim3(16, 8, 8), dim3(32, 32, 1), dim3(64, 8, 2), dim3(16, 16, 3), - dim3(props.warpSize - 1, 3, 3), dim3(props.warpSize + 1, 3, 3)); -} - -/* Generate dimensions for 1D, 2D and 3D grids of blocks - reduced set */ inline dim3 GenerateBlockDimensionsForShuffle() { - hipDeviceProp_t props; - HIP_CHECK(hipGetDeviceProperties(&props, 0)); const auto multipliers = {0.5, 1.0}; - return GENERATE_COPY(dim3(1, 1, 1), - map([sm = props.multiProcessorCount]( - double i) { return dim3(static_cast(i * sm), 1, 1); }, - values(multipliers)), - map([sm = props.multiProcessorCount]( - double i) { return dim3(1, static_cast(i * sm), 1); }, - values(multipliers)), - map([sm = props.multiProcessorCount]( - double i) { return dim3(1, 1, static_cast(i * sm)); }, - values(multipliers)), - dim3(5, 5, 5)); + return GenerateBlockDimensionsImpl(multipliers); } \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group.cc b/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group.cc index 069f717f48..cd605e55fc 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group.cc @@ -137,12 +137,7 @@ static uint64_t get_active_mask(unsigned int test_case, size_t warp_size) { * - HIP_VERSION >= 5.2 */ TEST_CASE("Unit_Coalesced_Group_Getters_Positive_Basic") { - int device; - hipDeviceProp_t device_properties; - HIP_CHECK(hipGetDevice(&device)); - HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); - - size_t warp_size = static_cast(device_properties.warpSize); + const int warp_size = getWarpSize(); const auto blocks = GenerateBlockDimensionsForShuffle(); const auto threads = GenerateThreadDimensionsForShuffle(); @@ -225,13 +220,7 @@ TEST_CASE("Unit_Coalesced_Group_Getters_Positive_Basic") { * - HIP_VERSION >= 5.2 */ TEST_CASE("Unit_Coalesced_Group_Getters_Via_Base_Type_Positive_Basic") { - int device; - hipDeviceProp_t device_properties; - HIP_CHECK(hipGetDevice(&device)); - HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); - - - size_t warp_size = static_cast(device_properties.warpSize); + const int warp_size = getWarpSize(); const auto blocks = GenerateBlockDimensionsForShuffle(); const auto threads = GenerateThreadDimensionsForShuffle(); @@ -315,12 +304,7 @@ TEST_CASE("Unit_Coalesced_Group_Getters_Via_Base_Type_Positive_Basic") { * - HIP_VERSION >= 5.2 */ TEST_CASE("Unit_Coalesced_Group_Getters_Via_Non_Member_Functions_Positive_Basic") { - int device; - hipDeviceProp_t device_properties; - HIP_CHECK(hipGetDevice(&device)); - HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); - - size_t warp_size = static_cast(device_properties.warpSize); + const int warp_size = getWarpSize(); const auto blocks = GenerateBlockDimensionsForShuffle(); const auto threads = GenerateThreadDimensionsForShuffle(); @@ -407,12 +391,9 @@ template __global__ void coalesced_group_shfl_up(T* const out, } template void CoalescedGroupShflUpTestImpl() { - int device; - hipDeviceProp_t device_properties; - HIP_CHECK(hipGetDevice(&device)); - HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + const auto inv_reduction_factor = 1.0 / GetTestReductionFactor(); - size_t warp_size = static_cast(device_properties.warpSize); + const int warp_size = getWarpSize(); const auto blocks = GenerateBlockDimensionsForShuffle(); const auto threads = GenerateThreadDimensionsForShuffle(); @@ -423,11 +404,16 @@ template void CoalescedGroupShflUpTestImpl() { INFO("Coalesced group mask: " << active_mask); unsigned int active_thread_count = get_active_thread_count(active_mask, warp_size); - // Tests edge cases (0, 1, max-1) and middle values - auto delta = GENERATE(values({0, 1, static_cast(getWarpSize()/2), - static_cast(getWarpSize()-1)})); + std::vector deltas; + for (double i = 0; i < warp_size - 1; i += inv_reduction_factor) { + deltas.emplace_back(static_cast(std::floor(i))); + } + deltas.emplace_back(warp_size - 1); + + auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end())); delta = delta % active_thread_count; INFO("Delta: " << delta); + CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(T); @@ -489,12 +475,9 @@ template __global__ void coalesced_group_shfl_down(T* const out, } template void CoalescedGroupShflDownTest() { - int device; - hipDeviceProp_t device_properties; - HIP_CHECK(hipGetDevice(&device)); - HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + const auto inv_reduction_factor = 1.0 / GetTestReductionFactor(); - size_t warp_size = static_cast(device_properties.warpSize); + const int warp_size = getWarpSize(); const auto blocks = GenerateBlockDimensionsForShuffle(); const auto threads = GenerateThreadDimensionsForShuffle(); @@ -505,11 +488,16 @@ template void CoalescedGroupShflDownTest() { INFO("Coalesced group mask: " << active_mask); unsigned int active_thread_count = get_active_thread_count(active_mask, warp_size); - // Tests edge cases (0, 1, max-1) and middle values - auto delta = GENERATE(values({0, 1, static_cast(getWarpSize()/2), - static_cast(getWarpSize()-1)})); + std::vector deltas; + for (double i = 0; i < warp_size - 1; i += inv_reduction_factor) { + deltas.emplace_back(static_cast(std::floor(i))); + } + deltas.emplace_back(warp_size - 1); + + auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end())); delta = delta % active_thread_count; INFO("Delta: " << delta); + CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(T); @@ -581,12 +569,7 @@ template __global__ void coalesced_group_shfl(T* const out, uint8_t } template void CoalescedGroupShflTest() { - int device; - hipDeviceProp_t device_properties; - HIP_CHECK(hipGetDevice(&device)); - HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); - - size_t warp_size = static_cast(device_properties.warpSize); + const int warp_size = getWarpSize(); const auto blocks = GenerateBlockDimensionsForShuffle(); const auto threads = GenerateThreadDimensionsForShuffle(); @@ -730,13 +713,7 @@ __global__ void coalesced_group_sync_check(T* global_data, unsigned int* wait_mo } template void CoalescedGroupSyncTest() { - int device; - hipDeviceProp_t device_properties; - - HIP_CHECK(hipGetDevice(&device)); - HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); - - size_t warp_size = static_cast(device_properties.warpSize); + const int warp_size = getWarpSize(); const auto randomized_run_count = GENERATE(range(0, cmd_options.cg_iterations)); const auto blocks = GenerateBlockDimensionsForShuffle(); diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group_tiled_partition.cc b/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group_tiled_partition.cc index 2cb2a043ea..9f21902708 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group_tiled_partition.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/coalesced_group_tiled_partition.cc @@ -87,7 +87,7 @@ __device__ bool deactivate_thread(uint64_t* active_masks, unsigned int warp_size const auto warps_per_block = (block.size() + warp_size - 1) / warp_size; const auto block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x; const auto idx = block_rank * warps_per_block + block.thread_rank() / warp_size; - return !(active_masks[idx] & (1u << warp.thread_rank())); + return !(active_masks[idx] & (1ull << warp.thread_rank())); } __global__ void coalesced_group_tiled_partition_size_getter(uint64_t* active_masks, @@ -243,6 +243,8 @@ __global__ void coalesced_group_tiled_partition_shfl_up(uint64_t* active_masks, template static void CoalescedGroupTiledPartitonShflUpTestImpl() { + const auto inv_reduction_factor = 1.0 / GetTestReductionFactor(); + const auto tile_size = GenerateTileSizes(); INFO("Tile size: " << tile_size); auto blocks = GenerateBlockDimensionsForShuffle(); @@ -250,8 +252,16 @@ template static void CoalescedGroupTiledPartitonShflUpTestImpl() { auto warp_size = getWarpSize(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); - const auto delta = GENERATE_COPY(range(0u, tile_size)); + + std::vector deltas; + for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) { + deltas.emplace_back(static_cast(std::floor(i))); + } + deltas.emplace_back(tile_size - 1); + + const auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end())); INFO("Delta: " << delta); + CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(T); @@ -344,6 +354,8 @@ __global__ void coalesced_group_tiled_partition_shfl_down(uint64_t* active_masks template static void CoalescedGroupTiledPartitonShflDownTestImpl() { + const auto inv_reduction_factor = 1.0 / GetTestReductionFactor(); + const auto tile_size = GenerateTileSizes(); INFO("Tile size: " << tile_size); auto blocks = GenerateBlockDimensionsForShuffle(); @@ -351,8 +363,16 @@ template static void CoalescedGroupTiledPartitonShflDownTestImpl() auto warp_size = getWarpSize(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); - const auto delta = GENERATE_COPY(range(0u, tile_size)); + + std::vector deltas; + for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) { + deltas.emplace_back(static_cast(std::floor(i))); + } + deltas.emplace_back(tile_size - 1); + + const auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end())); INFO("Delta: " << delta); + CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(T); diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh b/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh index dbaa10dec5..17ab44f840 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh +++ b/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh @@ -22,6 +22,8 @@ THE SOFTWARE. #include #include +#include + namespace { constexpr int kMaxGPUs = 8; } // namespace @@ -61,3 +63,5 @@ template bool CheckDimensions(unsigned int device, T kernel, dim3 bloc return true; } + +inline double GetTestReductionFactor() { return cmd_options.cg_reduction_factor * 0.01; } diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/grid_group.cc b/projects/hip-tests/catch/unit/cooperativeGrps/grid_group.cc index 589a728143..da7457eb9a 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/grid_group.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/grid_group.cc @@ -273,12 +273,8 @@ TEST_CASE("Unit_Grid_Group_Sync_Positive_Basic") { } auto loops = GENERATE(2, 4, 8, 16); - // Launch params for this test are hardcoded as a workaround for an issue reported - // SWDEV-429791. When fixed, please enable calls to GenerateBlock/ThreadDimensions() - const auto blocks = - GENERATE_COPY(dim3(5, 5, 5), dim3(330, 1, 1), dim3(1, 330, 1), dim3(1, 1, 330)); - const auto threads = - GENERATE_COPY(dim3(16, 8, 8), dim3(32, 32, 1), dim3(64, 8, 2), dim3(16, 16, 3)); + const auto blocks = GenerateBlockDimensions(); + const auto threads = GenerateThreadDimensions(); if (!CheckDimensions(device, sync_kernel, blocks, threads)) return; INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernelMultiDevice_old.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernelMultiDevice_old.cc index 6be0cab225..bac936ae00 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernelMultiDevice_old.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernelMultiDevice_old.cc @@ -86,45 +86,48 @@ namespace cg = cooperative_groups; static constexpr size_t kBufferLen = 1024 * 1024; -__global__ void test_gws(uint* buf, uint buf_size, long* tmp_buf, long* result) { - extern __shared__ long tmp[]; - uint groups = gridDim.x; - uint group_id = blockIdx.x; - uint local_id = threadIdx.x; - uint chunk = gridDim.x * blockDim.x; +__global__ void test_gws(uint* buf, uint buf_size, unsigned long long* tmp_buf, + unsigned long long* result) { + extern __shared__ unsigned long long tmp[]; - uint i = group_id * blockDim.x + local_id; - long sum = 0; - while (i < buf_size) { + cg::thread_block tb = cg::this_thread_block(); + cg::grid_group gg = cg::this_grid(); + cg::multi_grid_group mgg = cg::this_multi_grid(); + + const auto tid = gg.thread_rank(); + const auto stride = gg.size(); + const auto local_tid = tb.thread_rank(); + const auto wid = blockIdx.x; + const auto workgroup_size = tb.size(); + const auto gid = mgg.grid_rank(); + const auto grid_size = gridDim.x; + const auto num_grids = mgg.num_grids(); + + unsigned long long sum = 0; + for (size_t i = tid; i < buf_size; i += stride) { sum += buf[i]; - i += chunk; } - tmp[local_id] = sum; - __syncthreads(); - i = 0; - if (local_id == 0) { - sum = 0; - while (i < blockDim.x) { - sum += tmp[i]; - i++; - } - tmp_buf[group_id] = sum; - } - // wait - cg::this_grid().sync(); + tmp[local_tid] = sum; + tb.sync(); - if (((blockIdx.x * blockDim.x) + threadIdx.x) == 0) { - for (uint i = 1; i < groups; ++i) { - sum += tmp_buf[i]; - } - //*result = sum; - result[1 + cg::this_multi_grid().grid_rank()] = sum; - } - cg::this_multi_grid().sync(); - if (cg::this_multi_grid().grid_rank() == 0) { + if (local_tid == 0) { sum = 0; - for (uint i = 1; i <= cg::this_multi_grid().num_grids(); ++i) { - sum += result[i]; + for (size_t i = 0; i < workgroup_size; i++) { + sum += tmp[i]; + } + tmp_buf[wid] = sum; + } + gg.sync(); + + if (tid < grid_size) { + atomicAdd(&result[gid + 1], tmp_buf[tid]); + } + mgg.sync(); + + if (gid == 0) { + sum = 0; + for (size_t i = 0; i < num_grids; i++) { + sum += result[i + 1]; } *result = sum; } @@ -136,20 +139,7 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic", "[multigpu]") { int device_num = 0; HIP_CHECK(hipGetDeviceCount(&device_num)); - size_t buffer_size = kBufferLen * sizeof(int); - - int* A_h = reinterpret_cast(malloc(buffer_size * device_num)); - for (uint32_t i = 0; i < kBufferLen * device_num; ++i) { - A_h[i] = static_cast(i); - } - - std::vector A_d(device_num); - std::vector B_d(device_num); - long* C_d; - std::vector stream(device_num); - std::vector device_properties(device_num); - for (int i = 0; i < device_num; i++) { HIP_CHECK(hipSetDevice(i)); @@ -159,28 +149,39 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic", "[multigpu]") { HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); return; } + } + + size_t buffer_size = kBufferLen * sizeof(int); + + int* A_h = nullptr; + std::vector A_d(device_num); + std::vector B_d(device_num); + unsigned long long* C_d; + std::vector stream(device_num); + + A_h = reinterpret_cast(malloc(buffer_size * device_num)); + for (uint32_t i = 0; i < kBufferLen * device_num; i++) { + A_h[i] = static_cast(i); + } + + for (int i = 0; i < device_num; i++) { + HIP_CHECK(hipSetDevice(i)); HIP_CHECK(hipMalloc(&A_d[i], buffer_size)); HIP_CHECK(hipMemcpy(A_d[i], &A_h[i * kBufferLen], buffer_size, hipMemcpyHostToDevice)); - if (i == 0) { - HIP_CHECK(hipHostMalloc(&C_d, (device_num + 1) * sizeof(long))); - } HIP_CHECK(hipStreamCreate(&stream[i])); + HIP_CHECK(hipDeviceSynchronize()); } - dim3 dimBlock; - dim3 dimGrid; - dimGrid.x = 1; - dimGrid.y = 1; - dimGrid.z = 1; - dimBlock.x = 64; - dimBlock.y = 1; - dimBlock.z = 1; + HIP_CHECK(hipHostMalloc(&C_d, (device_num + 1) * sizeof(unsigned long long))); + uint workgroup = GENERATE(32, 64, 128, 256); + + dim3 dimBlock = dim3(workgroup); + dim3 dimGrid = dim3(1); int num_blocks = 0; - uint workgroup = GENERATE(64, 128, 256); hipLaunchParams* launch_params_list = new hipLaunchParams[device_num]; std::vector args(device_num * num_kernel_args); @@ -188,16 +189,16 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic", "[multigpu]") { for (int i = 0; i < device_num; i++) { HIP_CHECK(hipSetDevice(i)); - dimBlock.x = workgroup; HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &num_blocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long))); + &num_blocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, + dimBlock.x * sizeof(unsigned long long))); INFO("GPU" << i << " has block size = " << dimBlock.x << " and num blocks per CU " << num_blocks << "\n"); dimGrid.x = device_properties[i].multiProcessorCount * std::min(num_blocks, 32); - HIP_CHECK(hipMalloc(&B_d[i], dimGrid.x * sizeof(long))); + HIP_CHECK(hipMalloc(&B_d[i], dimGrid.x * sizeof(unsigned long long))); args[i * num_kernel_args] = (void*)&A_d[i]; args[i * num_kernel_args + 1] = (void*)&kBufferLen; @@ -207,7 +208,7 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic", "[multigpu]") { launch_params_list[i].func = reinterpret_cast(test_gws); launch_params_list[i].gridDim = dimGrid; launch_params_list[i].blockDim = dimBlock; - launch_params_list[i].sharedMem = dimBlock.x * sizeof(long); + launch_params_list[i].sharedMem = dimBlock.x * sizeof(unsigned long long); launch_params_list[i].stream = stream[i]; launch_params_list[i].args = &args[i * num_kernel_args]; } @@ -218,7 +219,7 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic", "[multigpu]") { } size_t processed_Dwords = kBufferLen * device_num; - REQUIRE(*C_d == (((long)(processed_Dwords) * (processed_Dwords - 1)) / 2)); + REQUIRE(*C_d == (((unsigned long long)(processed_Dwords) * (processed_Dwords - 1)) / 2)); delete[] launch_params_list; diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernel_old.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernel_old.cc index a8f41de8c9..1a2de5a2b7 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernel_old.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernel_old.cc @@ -27,46 +27,47 @@ namespace cg = cooperative_groups; static constexpr size_t kBufferLen = 1024 * 1024; -__global__ void test_gws(int* buf, size_t buf_size, long* tmp_buf, long* result) { - extern __shared__ long tmp[]; - uint offset = blockIdx.x * blockDim.x + threadIdx.x; - uint stride = gridDim.x * blockDim.x; +__global__ void test_gws(int* buf, size_t buf_size, unsigned long long* tmp_buf, + unsigned long long* result) { + extern __shared__ unsigned long long tmp[]; + + cg::thread_block tb = cg::this_thread_block(); cg::grid_group gg = cg::this_grid(); - long sum = 0; - for (uint i = offset; i < buf_size; i += stride) { + const auto tid = gg.thread_rank(); + const auto stride = gg.size(); + const auto local_tid = tb.thread_rank(); + const auto wid = blockIdx.x; + const auto workgroup_size = tb.size(); + const auto grid_size = gridDim.x; + + unsigned long long sum = 0; + for (size_t i = tid; i < buf_size; i += stride) { sum += buf[i]; } - tmp[threadIdx.x] = sum; + tmp[local_tid] = sum; + tb.sync(); - __syncthreads(); - - if (threadIdx.x == 0) { + if (local_tid == 0) { sum = 0; - for (uint i = 0; i < blockDim.x; i++) { + for (size_t i = 0; i < workgroup_size; i++) { sum += tmp[i]; } - tmp_buf[blockIdx.x] = sum; + tmp_buf[wid] = sum; } - gg.sync(); - if (offset == 0) { - for (uint i = 1; i < gridDim.x; ++i) { - sum += tmp_buf[i]; - } - *result = sum; + if (tid < grid_size) { + atomicAdd(result, tmp_buf[tid]); } } TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") { // Use default device for validating the test int device; - int *A_h, *A_d; - long* B_d; - long* C_d; - hipDeviceProp_t device_properties; HIP_CHECK(hipGetDevice(&device)); + + hipDeviceProp_t device_properties; HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); if (!device_properties.cooperativeLaunch) { @@ -76,6 +77,12 @@ TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") { size_t buffer_size = kBufferLen * sizeof(int); + int* A_h = nullptr; + int* A_d = nullptr; + unsigned long long* B_d = nullptr; + unsigned long long* C_d = nullptr; + hipStream_t stream; + A_h = reinterpret_cast(malloc(buffer_size)); for (uint32_t i = 0; i < kBufferLen; ++i) { A_h[i] = static_cast(i); @@ -83,25 +90,23 @@ TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") { HIP_CHECK(hipMalloc(&A_d, buffer_size)); HIP_CHECK(hipMemcpy(A_d, A_h, buffer_size, hipMemcpyHostToDevice)); - HIP_CHECK(hipHostMalloc(&C_d, sizeof(long))); + HIP_CHECK(hipHostMalloc(&C_d, sizeof(unsigned long long))); - hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); - dim3 dimBlock = dim3(1); - dim3 dimGrid = dim3(1); - int numBlocks = 0; - uint32_t workgroup = GENERATE(32, 64, 128, 256); - dimBlock.x = workgroup; + dim3 dimBlock = dim3(workgroup); + dim3 dimGrid = dim3(1); + int numBlocks = 0; // Calculate the device occupancy to know how many blocks can be run concurrently - HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long))); + HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, test_gws, + dimBlock.x * dimBlock.y * dimBlock.z, + dimBlock.x * sizeof(unsigned long long))); dimGrid.x = device_properties.multiProcessorCount * std::min(numBlocks, 32); - HIP_CHECK(hipMalloc(&B_d, dimGrid.x * sizeof(long))); + HIP_CHECK(hipMalloc(&B_d, dimGrid.x * sizeof(unsigned long long))); void* params[4]; params[0] = (void*)&A_d; @@ -111,7 +116,7 @@ TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") { INFO("Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n"); HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_gws), dimGrid, dimBlock, params, - dimBlock.x * sizeof(long), stream)); + dimBlock.x * sizeof(unsigned long long), stream)); HIP_CHECK(hipStreamSynchronize(stream)); diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/thread_block_tile.cc b/projects/hip-tests/catch/unit/cooperativeGrps/thread_block_tile.cc index 4ab630d5e4..1eb4d290b4 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/thread_block_tile.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/thread_block_tile.cc @@ -152,12 +152,22 @@ __global__ void block_tile_shfl_up(T* const out, const unsigned int delta) { template void BlockTileShflUpTestImpl() { DYNAMIC_SECTION("Tile size: " << tile_size) { + const auto inv_reduction_factor = 1.0 / GetTestReductionFactor(); + auto blocks = GenerateBlockDimensionsForShuffle(); auto threads = GenerateThreadDimensionsForShuffle(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); - auto delta = GENERATE(range(static_cast(0), tile_size)); + + std::vector deltas; + for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) { + deltas.emplace_back(static_cast(std::floor(i))); + } + deltas.emplace_back(tile_size - 1); + + const auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end())); INFO("Delta: " << delta); + CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(T); @@ -209,12 +219,22 @@ __global__ void block_tile_shfl_down(T* const out, const unsigned int delta) { template void BlockTileShflDownTestImpl() { DYNAMIC_SECTION("Tile size: " << tile_size) { + const auto inv_reduction_factor = 1.0 / GetTestReductionFactor(); + auto blocks = GenerateBlockDimensionsForShuffle(); auto threads = GenerateThreadDimensionsForShuffle(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); - auto delta = GENERATE(range(static_cast(0), tile_size)); + + std::vector deltas; + for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) { + deltas.emplace_back(static_cast(std::floor(i))); + } + deltas.emplace_back(tile_size - 1); + + const auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end())); INFO("Delta: " << delta); + CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(T); @@ -277,13 +297,23 @@ __global__ void block_tile_shfl_xor(T* const out, const unsigned mask) { } template void BlockTileShflXORTestImpl() { - DYNAMIC_SECTION("Tile size: " << tile_size) { + DYNAMIC_SECTION("Tile size: " << tile_size) { + const auto inv_reduction_factor = 1.0 / GetTestReductionFactor(); + auto blocks = GenerateBlockDimensionsForShuffle(); auto threads = GenerateThreadDimensionsForShuffle(); INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); - const auto mask = GENERATE(range(static_cast(0), tile_size)); + + std::vector masks; + for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) { + masks.emplace_back(static_cast(std::floor(i))); + } + masks.emplace_back(tile_size - 1); + + const auto mask = GENERATE_COPY(from_range(masks.begin(), masks.end())); INFO("Mask: 0x" << std::hex << mask); + CPUGrid grid(blocks, threads); const auto alloc_size = grid.thread_count_ * sizeof(T);