SWDEV-465196 - Fix coalesced_group tests

Change-Id: I4537e2351273d0c71ff62d34b5002fcb2ecd15e6
Этот коммит содержится в:
Marko Arandjelovic
2024-07-18 12:08:04 +00:00
родитель 5dc048b559
Коммит 850511cc28
6 изменённых файлов: 95 добавлений и 170 удалений
-46
Просмотреть файл
@@ -252,32 +252,6 @@
"Unit_hipGraphUpload_Functional_multidevice_test",
"=== Below tests fail in external CI for PR https://github.com/ROCm-Developer-Tools/hip-tests/pull/210 ===",
"Unit_Assert_Positive_Basic_KernelFail",
"SWDEV-442805 : Below tests failed in stress test on 19/01/24 ===",
"Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - int",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned int",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - long long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned long long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - float",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - double",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - int",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned int",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - long long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned long long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - float",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - double",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - int",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned int",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - long long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned long long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - float",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - double",
"Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint8_t",
"Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint16_t",
"Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint32_t",
@@ -1250,26 +1224,6 @@
"Unit_safeAtomicMin_Positive_SameAddress - float",
"=== SWDEV-454220 : Below test hanged in stress test on 22/03/24 ===",
"Unit_hipExtLaunchKernel_Positive_Basic",
"=== SWDEV-454220 Below test fail in stress test 03/29/24 ===",
"Unit_Coalesced_Group_Shfl_Positive_Basic - int",
"Unit_Coalesced_Group_Shfl_Positive_Basic - unsigned int",
"Unit_Coalesced_Group_Shfl_Positive_Basic - long",
"Unit_Coalesced_Group_Shfl_Positive_Basic - unsigned long",
"Unit_Coalesced_Group_Shfl_Positive_Basic - long long",
"Unit_Coalesced_Group_Shfl_Positive_Basic - unsigned long long",
"Unit_Coalesced_Group_Shfl_Positive_Basic - float",
"Unit_Coalesced_Group_Shfl_Positive_Basic - double",
"Unit_Coalesced_Group_Sync_Positive_Basic - uint8_t",
"Unit_Coalesced_Group_Sync_Positive_Basic - uint16_t",
"Unit_Coalesced_Group_Sync_Positive_Basic - uint32_t",
"Unit_Coalesced_Group_Shfl_Up_Positive_Basic - int",
"Unit_Coalesced_Group_Shfl_Up_Positive_Basic - unsigned int",
"Unit_Coalesced_Group_Shfl_Up_Positive_Basic - long",
"Unit_Coalesced_Group_Shfl_Up_Positive_Basic - unsigned long",
"Unit_Coalesced_Group_Shfl_Up_Positive_Basic - long long",
"Unit_Coalesced_Group_Shfl_Up_Positive_Basic - unsigned long long",
"Unit_Coalesced_Group_Shfl_Up_Positive_Basic - float",
"Unit_Coalesced_Group_Shfl_Up_Positive_Basic - double",
#endif
#if defined gfx1030
"=== SWDEV-445961: These tests hang in PSDB stress test on 09/02/2024 ===",
-26
Просмотреть файл
@@ -746,32 +746,6 @@
"Unit_Device___uhadd_Sanity_Positive",
"Unit_Device___rhadd_Sanity_Positive",
"Unit_Device___urhadd_Sanity_Positive",
"SWDEV-435667 : Below tests failed in stress test on 19/01/24 ===",
"Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - int",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned int",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - long long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned long long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - float",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - double",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - int",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned int",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - long long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned long long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - float",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - double",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - int",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned int",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - long long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned long long",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - float",
"Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - double",
"Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint8_t",
"Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint16_t",
"Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint32_t",
+8 -8
Просмотреть файл
@@ -50,7 +50,7 @@ static __global__ void coalesced_group_size_getter(unsigned int* sizes, uint64_t
template <typename BaseType = cg::coalesced_group>
static __global__ void coalesced_group_thread_rank_getter(unsigned int* thread_ranks,
uint64_t active_mask) {
#if (__GFX8__ || __GFX9__)
#if (__GFX8__ || __GFX9__)
constexpr unsigned int ksize = 64;
#else
constexpr unsigned int ksize = 32;
@@ -65,11 +65,11 @@ static __global__ void coalesced_group_thread_rank_getter(unsigned int* thread_r
static __global__ void coalesced_group_non_member_size_getter(unsigned int* sizes,
uint64_t active_mask) {
#if (__GFX8__ || __GFX9__)
#if (__GFX8__ || __GFX9__)
constexpr unsigned int ksize = 64;
#else
constexpr unsigned int ksize = 32;
#endif
#endif
const cg::thread_block_tile<ksize> tile =
cg::tiled_partition<ksize>(cg::this_thread_block());
if (active_mask & (static_cast<uint64_t>(1) << tile.thread_rank())) {
@@ -80,7 +80,7 @@ static __global__ void coalesced_group_non_member_size_getter(unsigned int* size
static __global__ void coalesced_group_non_member_thread_rank_getter(unsigned int* thread_ranks,
uint64_t active_mask) {
#if (__GFX8__ || _GFX9__)
#if (__GFX8__ || _GFX9__)
constexpr unsigned int ksize = 64;
#else
constexpr unsigned int ksize = 32;
@@ -427,7 +427,7 @@ template <typename T> void CoalescedGroupShflUpTestImpl() {
INFO("Coalesced group mask: " << active_mask);
unsigned int active_thread_count = get_active_thread_count(active_mask, warp_size);
auto delta = GENERATE(range(static_cast<size_t>(0), kWarpSize));
auto delta = GENERATE(range(static_cast<size_t>(0), static_cast<size_t>(getWarpSize())));
delta = delta % active_thread_count;
INFO("Delta: " << delta);
CPUGrid grid(blocks, threads);
@@ -481,7 +481,7 @@ __global__ void coalesced_group_shfl_down(T* const out, const unsigned int delta
constexpr unsigned int ksize = 64;
#else
constexpr unsigned int ksize = 32;
#endif
#endif
const cg::thread_block_tile<ksize> tile =
cg::tiled_partition<ksize>(cg::this_thread_block());
if (active_mask & (static_cast<uint64_t>(1) << tile.thread_rank())) {
@@ -509,7 +509,7 @@ template <typename T> void CoalescedGroupShflDownTest() {
INFO("Coalesced group mask: " << active_mask);
unsigned int active_thread_count = get_active_thread_count(active_mask, warp_size);
auto delta = GENERATE(range(static_cast<size_t>(0), kWarpSize));
auto delta = GENERATE(range(static_cast<size_t>(0), static_cast<size_t>(getWarpSize())));
delta = delta % active_thread_count;
INFO("Delta: " << delta);
CPUGrid grid(blocks, threads);
@@ -573,7 +573,7 @@ __global__ void coalesced_group_shfl(T* const out, uint8_t* target_lanes,
constexpr unsigned int ksize = 64;
#else
constexpr unsigned int ksize = 32;
#endif
#endif
const cg::thread_block_tile<ksize> tile =
cg::tiled_partition<ksize>(cg::this_thread_block());
if (active_mask & (static_cast<uint64_t>(1) << tile.thread_rank())) {
+83 -86
Просмотреть файл
@@ -67,47 +67,43 @@ template <typename T> static inline T GenerateRandomInteger(const T min, const T
return dist(GetRandomGenerator());
}
template <size_t warp_size> static auto coalesce_threads(const uint64_t mask) {
std::tuple<std::array<unsigned int, warp_size>, unsigned int> res;
static auto coalesce_threads(const uint64_t mask, unsigned int warp_size) {
std::tuple<std::vector<unsigned int>, unsigned int> res;
auto& [threads, count] = res;
count = 0u;
for (auto i = 0u; i < warp_size; ++i) {
if (mask & (1u << i)) {
threads[count++] = i;
threads.push_back(i);
}
}
return res;
}
template <size_t warp_size> __device__ bool deactivate_thread(uint64_t* active_masks) {
const cg::thread_block_tile<warp_size> warp =
cg::tiled_partition<warp_size>(cg::this_thread_block());
__device__ bool deactivate_thread(uint64_t* active_masks, unsigned int warp_size) {
const auto warp = cg::tiled_partition(cg::this_thread_block(), warp_size);
const auto block = cg::this_thread_block();
const auto warps_per_block = (block.size() + warp_size - 1) / warp_size;
const auto block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
const auto idx = block_rank * warps_per_block + block.thread_rank() / warp.size();
const auto idx = block_rank * warps_per_block + block.thread_rank() / warp_size;
return !(active_masks[idx] & (1u << warp.thread_rank()));
}
template <size_t warp_size>
__global__ void coalesced_group_tiled_partition_size_getter(uint64_t* active_masks,
unsigned int tile_size,
unsigned int* sizes) {
if (deactivate_thread<warp_size>(active_masks)) {
unsigned int* sizes,
unsigned int warp_size) {
if (deactivate_thread(active_masks, warp_size)) {
return;
}
sizes[thread_rank_in_grid()] = cg::tiled_partition(cg::coalesced_threads(), tile_size).size();
}
template <size_t warp_size>
__global__ void coalesced_group_tiled_partition_thread_rank_getter(uint64_t* active_masks,
unsigned int tile_size,
unsigned int* sizes) {
if (deactivate_thread<warp_size>(active_masks)) {
unsigned int* sizes,
unsigned int warp_size) {
if (deactivate_thread(active_masks, warp_size)) {
return;
}
@@ -133,6 +129,7 @@ TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") {
INFO("Tile size: " << tile_size);
auto blocks = GenerateBlockDimensions();
auto threads = GenerateThreadDimensions();
int warp_size = getWarpSize();
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
CPUGrid grid(blocks, threads);
@@ -141,38 +138,38 @@ TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") {
LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc, alloc_size);
LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc, alloc_size);
const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
const auto warps_in_block = (grid.threads_in_block_count_ + warp_size - 1) / warp_size;
const auto warps_in_grid = warps_in_block * grid.block_count_;
LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
warps_in_grid * sizeof(uint64_t));
LinearAllocGuard<uint64_t> active_masks(LinearAllocs::hipHostMalloc,
warps_in_grid * sizeof(uint64_t));
std::generate(active_masks.ptr(), active_masks.ptr() + warps_in_grid,
[] { return GenerateRandomInteger(0u, std::numeric_limits<uint32_t>().max()); });
std::generate(active_masks.ptr(), active_masks.ptr() + warps_in_grid, [] {
return GenerateRandomInteger((uint64_t)0, std::numeric_limits<uint64_t>().max());
});
HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks.ptr(), warps_in_grid * sizeof(uint64_t),
hipMemcpyHostToDevice));
HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size));
coalesced_group_tiled_partition_size_getter<32>
<<<blocks, threads>>>(active_masks_dev.ptr(), tile_size, uint_arr_dev.ptr());
coalesced_group_tiled_partition_size_getter<<<blocks, threads>>>(
active_masks_dev.ptr(), tile_size, uint_arr_dev.ptr(), warp_size);
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size));
coalesced_group_tiled_partition_thread_rank_getter<32>
<<<blocks, threads>>>(active_masks_dev.ptr(), tile_size, uint_arr_dev.ptr());
coalesced_group_tiled_partition_thread_rank_getter<<<blocks, threads>>>(
active_masks_dev.ptr(), tile_size, uint_arr_dev.ptr(), warp_size);
const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_;
const auto tail = warps_in_block * warp_size - grid.threads_in_block_count_;
// validate size
for (auto i = 0u; i < warps_in_grid; ++i) {
auto current_warp_mask = active_masks.ptr()[i];
const auto shift_amount =
(tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
uint64_t current_warp_mask = active_masks.ptr()[i];
const auto shift_amount = (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount;
const auto [active_threads, active_thread_count] =
coalesce_threads<kWarpSize>(current_warp_mask);
coalesce_threads(current_warp_mask, warp_size);
const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block);
const auto num_tiles = (active_thread_count + tile_size - 1) / tile_size;
@@ -183,7 +180,7 @@ TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") {
const auto window_end = t + tile_size;
// Iterate through window
for (auto k = window_start; k < window_end && k < active_thread_count; ++k) {
const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails;
const auto global_thread_idx = i * warp_size + active_threads[k] - tails;
const auto expected_val = tile_size - tile_tail * (t + tile_size >= active_thread_count);
const auto actual_val = uint_arr.ptr()[global_thread_idx];
INFO("global index: " << global_thread_idx);
@@ -199,13 +196,12 @@ TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") {
// validate rank
for (auto i = 0u; i < warps_in_grid; ++i) {
auto current_warp_mask = active_masks.ptr()[i];
const auto shift_amount =
(tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
uint64_t current_warp_mask = active_masks.ptr()[i];
const auto shift_amount = (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount;
const auto [active_threads, active_thread_count] =
coalesce_threads<kWarpSize>(current_warp_mask);
coalesce_threads(current_warp_mask, warp_size);
const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block);
// Step tile-sized window over active threads
@@ -214,7 +210,7 @@ TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") {
const auto window_end = t + tile_size;
// Iterate through window
for (auto k = window_start; k < window_end && k < active_thread_count; ++k) {
const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails;
const auto global_thread_idx = i * warp_size + active_threads[k] - tails;
const auto expected_val = k % tile_size;
const auto actual_val = uint_arr.ptr()[global_thread_idx];
INFO("global index: " << global_thread_idx);
@@ -227,15 +223,15 @@ TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") {
}
template <typename T, size_t warp_size>
template <typename T>
__global__ void coalesced_group_tiled_partition_shfl_up(uint64_t* active_masks, T* const out,
const unsigned int tile_size,
const unsigned int delta) {
if (deactivate_thread<warp_size>(active_masks)) {
const unsigned int delta,
unsigned int warp_size) {
if (deactivate_thread(active_masks, warp_size)) {
return;
}
const cg::thread_block_tile<warp_size> warp =
cg::tiled_partition<warp_size>(cg::this_thread_block());
const auto warp = cg::tiled_partition(cg::this_thread_block(), warp_size);
T var = static_cast<T>(warp.thread_rank());
const auto tile = cg::tiled_partition(cg::coalesced_threads(), tile_size);
@@ -248,6 +244,7 @@ template <typename T> static void CoalescedGroupTiledPartitonShflUpTestImpl() {
INFO("Tile size: " << tile_size);
auto blocks = GenerateBlockDimensionsForShuffle();
auto threads = GenerateThreadDimensionsForShuffle();
auto warp_size = getWarpSize();
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
const auto delta = GENERATE_COPY(range(0u, tile_size));
@@ -258,7 +255,7 @@ template <typename T> static void CoalescedGroupTiledPartitonShflUpTestImpl() {
LinearAllocGuard<T> uint_arr_dev(LinearAllocs::hipMalloc, alloc_size);
LinearAllocGuard<T> uint_arr(LinearAllocs::hipHostMalloc, alloc_size);
const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
const auto warps_in_block = (grid.threads_in_block_count_ + warp_size - 1) / warp_size;
const auto warps_in_grid = warps_in_block * grid.block_count_;
LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
warps_in_grid * sizeof(uint64_t));
@@ -270,22 +267,21 @@ template <typename T> static void CoalescedGroupTiledPartitonShflUpTestImpl() {
HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks.ptr(), warps_in_grid * sizeof(uint64_t),
hipMemcpyHostToDevice));
HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size));
coalesced_group_tiled_partition_shfl_up<T, kWarpSize>
<<<blocks, threads>>>(active_masks_dev.ptr(), uint_arr_dev.ptr(), tile_size, delta);
coalesced_group_tiled_partition_shfl_up<T><<<blocks, threads>>>(
active_masks_dev.ptr(), uint_arr_dev.ptr(), tile_size, delta, warp_size);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_;
const auto tail = warps_in_block * warp_size - grid.threads_in_block_count_;
for (auto i = 0u; i < warps_in_grid; ++i) {
auto current_warp_mask = active_masks.ptr()[i];
const auto shift_amount =
(tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
const auto shift_amount = (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount;
const auto [active_threads, active_thread_count] =
coalesce_threads<kWarpSize>(current_warp_mask);
coalesce_threads(current_warp_mask, warp_size);
const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block);
// Step tile-sized window over active threads
@@ -294,7 +290,7 @@ template <typename T> static void CoalescedGroupTiledPartitonShflUpTestImpl() {
const auto window_end = t + tile_size;
// Iterate through window
for (auto k = window_start; k < window_end && k < active_thread_count; ++k) {
const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails;
const auto global_thread_idx = i * warp_size + active_threads[k] - tails;
const auto expected_val = active_threads[k - delta];
const auto actual_val = uint_arr.ptr()[global_thread_idx];
INFO("global index: " << global_thread_idx);
@@ -327,15 +323,15 @@ TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic"
}
template <typename T, size_t warp_size>
template <typename T>
__global__ void coalesced_group_tiled_partition_shfl_down(uint64_t* active_masks, T* const out,
const unsigned int tile_size,
const unsigned int delta) {
if (deactivate_thread<warp_size>(active_masks)) {
const unsigned int delta,
unsigned int warp_size) {
if (deactivate_thread(active_masks, warp_size)) {
return;
}
const cg::thread_block_tile<warp_size> warp =
cg::tiled_partition<warp_size>(cg::this_thread_block());
const auto warp = cg::tiled_partition(cg::this_thread_block(), warp_size);
T var = static_cast<T>(warp.thread_rank());
const auto tile = cg::tiled_partition(cg::coalesced_threads(), tile_size);
@@ -348,6 +344,7 @@ template <typename T> static void CoalescedGroupTiledPartitonShflDownTestImpl()
INFO("Tile size: " << tile_size);
auto blocks = GenerateBlockDimensionsForShuffle();
auto threads = GenerateThreadDimensionsForShuffle();
auto warp_size = getWarpSize();
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
const auto delta = GENERATE_COPY(range(0u, tile_size));
@@ -358,7 +355,7 @@ template <typename T> static void CoalescedGroupTiledPartitonShflDownTestImpl()
LinearAllocGuard<T> uint_arr_dev(LinearAllocs::hipMalloc, alloc_size);
LinearAllocGuard<T> uint_arr(LinearAllocs::hipHostMalloc, alloc_size);
const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
const auto warps_in_block = (grid.threads_in_block_count_ + warp_size - 1) / warp_size;
const auto warps_in_grid = warps_in_block * grid.block_count_;
LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
warps_in_grid * sizeof(uint64_t));
@@ -370,22 +367,21 @@ template <typename T> static void CoalescedGroupTiledPartitonShflDownTestImpl()
HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks.ptr(), warps_in_grid * sizeof(uint64_t),
hipMemcpyHostToDevice));
HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size));
coalesced_group_tiled_partition_shfl_down<T, kWarpSize>
<<<blocks, threads>>>(active_masks_dev.ptr(), uint_arr_dev.ptr(), tile_size, delta);
coalesced_group_tiled_partition_shfl_down<T><<<blocks, threads>>>(
active_masks_dev.ptr(), uint_arr_dev.ptr(), tile_size, delta, warp_size);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_;
const auto tail = warps_in_block * warp_size - grid.threads_in_block_count_;
for (auto i = 0u; i < warps_in_grid; ++i) {
auto current_warp_mask = active_masks.ptr()[i];
const auto shift_amount =
(tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
const auto shift_amount = (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount;
const auto [active_threads, active_thread_count] =
coalesce_threads<kWarpSize>(current_warp_mask);
coalesce_threads(current_warp_mask, warp_size);
if (delta >= active_thread_count) {
continue;
@@ -398,7 +394,7 @@ template <typename T> static void CoalescedGroupTiledPartitonShflDownTestImpl()
const auto window_end = t + tile_size - delta;
// Iterate through window
for (auto k = window_start; k < window_end && k < active_thread_count - delta; ++k) {
const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails;
const auto global_thread_idx = i * warp_size + active_threads[k] - tails;
const auto expected_val = active_threads[k + delta];
const auto actual_val = uint_arr.ptr()[global_thread_idx];
INFO("global index: " << global_thread_idx);
@@ -431,14 +427,14 @@ TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basi
}
template <typename T, size_t warp_size>
template <typename T>
__global__ void coalesced_group_tiled_partition_shfl(uint64_t* active_masks, uint8_t* target_lanes,
T* const out, const unsigned int tile_size) {
if (deactivate_thread<warp_size>(active_masks)) {
T* const out, const unsigned int tile_size,
unsigned int warp_size) {
if (deactivate_thread(active_masks, warp_size)) {
return;
}
const cg::thread_block_tile<warp_size> warp =
cg::tiled_partition<warp_size>(cg::this_thread_block());
const auto warp = cg::tiled_partition(cg::this_thread_block(), warp_size);
T var = static_cast<T>(warp.thread_rank());
const auto tile = cg::tiled_partition(cg::coalesced_threads(), tile_size);
@@ -450,6 +446,7 @@ template <typename T> static void CoalescedGroupTiledPartitonShflTestImpl() {
INFO("Tile size: " << tile_size);
auto blocks = GenerateBlockDimensionsForShuffle();
auto threads = GenerateThreadDimensionsForShuffle();
auto warp_size = getWarpSize();
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
CPUGrid grid(blocks, threads);
@@ -458,7 +455,7 @@ template <typename T> static void CoalescedGroupTiledPartitonShflTestImpl() {
LinearAllocGuard<T> uint_arr_dev(LinearAllocs::hipMalloc, alloc_size);
LinearAllocGuard<T> uint_arr(LinearAllocs::hipHostMalloc, alloc_size);
const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
const auto warps_in_block = (grid.threads_in_block_count_ + warp_size - 1) / warp_size;
const auto warps_in_grid = warps_in_block * grid.block_count_;
LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
warps_in_grid * sizeof(uint64_t));
@@ -476,22 +473,21 @@ template <typename T> static void CoalescedGroupTiledPartitonShflTestImpl() {
HIP_CHECK(hipMemcpy(target_lanes_dev.ptr(), target_lanes.ptr(), tile_size * sizeof(uint8_t),
hipMemcpyHostToDevice));
HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size));
coalesced_group_tiled_partition_shfl<T, kWarpSize><<<blocks, threads>>>(
active_masks_dev.ptr(), target_lanes_dev.ptr(), uint_arr_dev.ptr(), tile_size);
coalesced_group_tiled_partition_shfl<T><<<blocks, threads>>>(
active_masks_dev.ptr(), target_lanes_dev.ptr(), uint_arr_dev.ptr(), tile_size, warp_size);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_;
const auto tail = warps_in_block * warp_size - grid.threads_in_block_count_;
for (auto i = 0u; i < warps_in_grid; ++i) {
auto current_warp_mask = active_masks.ptr()[i];
const auto shift_amount =
(tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
const auto shift_amount = (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount;
const auto [active_threads, active_thread_count] =
coalesce_threads<kWarpSize>(current_warp_mask);
coalesce_threads(current_warp_mask, warp_size);
const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block);
// Step tile-sized window over active threads
@@ -500,7 +496,7 @@ template <typename T> static void CoalescedGroupTiledPartitonShflTestImpl() {
const auto window_end = t + tile_size;
// Iterate through window
for (auto k = window_start; k < window_end && k < active_thread_count; ++k) {
const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails;
const auto global_thread_idx = i * warp_size + active_threads[k] - tails;
const auto target_lane = target_lanes.ptr()[k % tile_size];
if (target_lane >= tile_size || target_lane >= active_thread_count - t) {
continue;
@@ -537,11 +533,12 @@ TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic", "
}
template <bool use_global, size_t warp_size, typename T>
template <bool use_global, typename T>
__global__ void coalesced_group_tiled_partition_sync_check(uint64_t* active_masks, T* global_data,
unsigned int* wait_modifiers,
size_t tile_size) {
if (deactivate_thread<warp_size>(active_masks)) {
size_t tile_size,
unsigned int warp_size) {
if (deactivate_thread(active_masks, warp_size)) {
return;
}
@@ -591,6 +588,7 @@ template <bool global_memory, typename T> void CoalescedGroupTiledPartitionSyncT
INFO("Tile size: " << tile_size);
auto blocks = GenerateBlockDimensionsForShuffle();
auto threads = GenerateThreadDimensionsForShuffle();
auto warp_size = getWarpSize();
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
CPUGrid grid(blocks, threads);
@@ -610,7 +608,7 @@ template <bool global_memory, typename T> void CoalescedGroupTiledPartitionSyncT
grid.thread_count_ * sizeof(unsigned int));
LinearAllocGuard<unsigned int> wait_modifiers(LinearAllocs::hipHostMalloc,
grid.thread_count_ * sizeof(unsigned int));
const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
const auto warps_in_block = (grid.threads_in_block_count_ + warp_size - 1) / warp_size;
const auto warps_in_grid = warps_in_block * grid.block_count_;
LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
warps_in_grid * sizeof(uint64_t));
@@ -631,24 +629,23 @@ template <bool global_memory, typename T> void CoalescedGroupTiledPartitionSyncT
grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice));
const auto shared_memory_size = global_memory ? 0u : alloc_size_per_block;
coalesced_group_tiled_partition_sync_check<global_memory, kWarpSize>
coalesced_group_tiled_partition_sync_check<global_memory>
<<<blocks, threads, shared_memory_size>>>(active_masks_dev.ptr(), arr_dev.ptr(),
wait_modifiers_dev.ptr(), tile_size);
wait_modifiers_dev.ptr(), tile_size, warp_size);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_;
const auto tail = warps_in_block * warp_size - grid.threads_in_block_count_;
for (int i = 0u; i < grid.block_count_; ++i) {
for (int j = 0u; j < warps_in_block; ++j) {
const auto warp_idx = i * warps_in_block + j;
auto mask = active_masks.ptr()[warp_idx];
const auto shift_amount =
(tail + 32 * TestContext::get().isNvidia()) * !((warp_idx + 1) % warps_in_block);
const auto shift_amount = (tail + 32 * TestContext::get().isNvidia()) * !((warp_idx + 1) % warps_in_block);
mask = (mask << shift_amount) >> shift_amount;
const auto active_count = std::bitset<sizeof(mask) * 8>(mask).count();
const auto start_offset = i * grid.threads_in_block_count_ + j * kWarpSize;
const auto start_offset = i * grid.threads_in_block_count_ + j * warp_size;
const auto end_offset = start_offset + active_count;
const auto valid =
std::all_of(arr.ptr() + start_offset, arr.ptr() + end_offset, [](T e) { return e; });
@@ -686,6 +683,6 @@ TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic", "
}
/**
* End doxygen group DeviceLanguageTest.
* @}
*/
* End doxygen group DeviceLanguageTest.
* @}
*/
+1 -2
Просмотреть файл
@@ -23,7 +23,6 @@ THE SOFTWARE.
#include <hip/hip_cooperative_groups.h>
namespace {
constexpr size_t kWarpSize = 32;
constexpr int kMaxGPUs = 8;
} // namespace
@@ -61,4 +60,4 @@ template <class T> bool CheckDimensions(unsigned int device, T kernel, dim3 bloc
}
return true;
}
}
+3 -2
Просмотреть файл
@@ -115,6 +115,7 @@ static void get_multi_grid_dims(dim3& grid_dim, dim3& block_dim, unsigned int de
HIP_CHECK(hipSetDevice(device))
HIP_CHECK(hipGetDeviceProperties(&props, 0));
int sm = props.multiProcessorCount;
auto warp_size = getWarpSize();
std::vector<dim3> block_dim_values = {dim3(1, 1, 1),
dim3(props.maxThreadsDim[0], 1, 1),
dim3(1, props.maxThreadsDim[1], 1),
@@ -123,8 +124,8 @@ static void get_multi_grid_dims(dim3& grid_dim, dim3& block_dim, unsigned int de
dim3(32, 32, 1),
dim3(64, 8, 2),
dim3(16, 16, 3),
dim3(kWarpSize - 1, 3, 3),
dim3(kWarpSize + 1, 3, 3)};
dim3(warp_size - 1, 3, 3),
dim3(warp_size + 1, 3, 3)};
std::vector<dim3> grid_dim_values = {dim3(1, 1, 1),
dim3(static_cast<int>(0.5 * sm), 1, 3),
dim3(4, static_cast<int>(0.5 * sm), 1),