SWDEV-521760 - Fix and enable disabled HIP tests from cooperative groups group (#2027)
* Reworked Unit_hipLaunchCooperativeKernel_Basic and Unit_hipLaunchCooperativeKernelMultiDevice_Basic * Introduce reduction_factor for coop groups tests. Fix Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic * Fix always false requirement by adding a cast * Change data type to unsigned long long to align with cuda * Change literal type to double to ensure proper type casting * Remove formatting comments
Dieser Commit ist enthalten in:
@@ -46,10 +46,6 @@
|
|||||||
"Unit_hipGraphicsUnmapResources_Negative_Parameters",
|
"Unit_hipGraphicsUnmapResources_Negative_Parameters",
|
||||||
"Unit_hipGraphicsUnregisterResource_Negative_Parameters",
|
"Unit_hipGraphicsUnregisterResource_Negative_Parameters",
|
||||||
"SWDEV-446588 - Disable graph multi gpu testcases until graph has support for it",
|
"SWDEV-446588 - Disable graph multi gpu testcases until graph has support for it",
|
||||||
"=== Below tests fail in external CI for PR https://github.com/ROCm-Developer-Tools/hip-tests/pull/210 ===",
|
|
||||||
"Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint8_t",
|
|
||||||
"Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint16_t",
|
|
||||||
"Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint32_t",
|
|
||||||
"SWDEV-447384, SWDEV-447932: These tests fail in gfx1100, gfx1101 & gfx1102",
|
"SWDEV-447384, SWDEV-447932: These tests fail in gfx1100, gfx1101 & gfx1102",
|
||||||
"SWDEV-445928: These tests fail in PSDB stress test on 09/02/2024",
|
"SWDEV-445928: These tests fail in PSDB stress test on 09/02/2024",
|
||||||
"Unit_hipGraphInstantiateWithFlags_StreamCaptureDeviceContextChg",
|
"Unit_hipGraphInstantiateWithFlags_StreamCaptureDeviceContextChg",
|
||||||
@@ -143,8 +139,6 @@
|
|||||||
"Unit_hipStreamPerThread_MultiThread",
|
"Unit_hipStreamPerThread_MultiThread",
|
||||||
#endif
|
#endif
|
||||||
#if defined gfx90a || defined gfx942 || defined gfx950
|
#if defined gfx90a || defined gfx942 || defined gfx950
|
||||||
"=== SWDEV-443630 : Below test failed in stress test on 19/01/24 ===",
|
|
||||||
"Unit_Multi_Grid_Group_Positive_Sync",
|
|
||||||
"Unit_Warp_Shfl_Up_Positive_Basic - int",
|
"Unit_Warp_Shfl_Up_Positive_Basic - int",
|
||||||
"Unit_Warp_Shfl_Up_Positive_Basic - unsigned int",
|
"Unit_Warp_Shfl_Up_Positive_Basic - unsigned int",
|
||||||
"Unit_Warp_Shfl_Up_Positive_Basic - long",
|
"Unit_Warp_Shfl_Up_Positive_Basic - long",
|
||||||
|
|||||||
@@ -54,6 +54,9 @@ int main(int argc, char** argv) {
|
|||||||
| Opt(cmd_options.cg_iterations, "cg_iterations")
|
| Opt(cmd_options.cg_iterations, "cg_iterations")
|
||||||
["-C"]["--cg-iterations"]
|
["-C"]["--cg-iterations"]
|
||||||
("Number of iterations used for cooperative groups sync tests (default: 5)")
|
("Number of iterations used for cooperative groups sync tests (default: 5)")
|
||||||
|
| Opt(cmd_options.cg_reduction_factor, "cg_reduction_factor")
|
||||||
|
["-C"]["--cg-reduction-factor"]
|
||||||
|
("Percentage of warp sizes for shuffle tests to be actually tested (default: 10)") // TODO
|
||||||
| Opt(cmd_options.accuracy_iterations, "accuracy_iterations")
|
| Opt(cmd_options.accuracy_iterations, "accuracy_iterations")
|
||||||
["-A"]["--accuracy-iterations"]
|
["-A"]["--accuracy-iterations"]
|
||||||
("Number of iterations used for math accuracy tests with randomly generated inputs (default: 2^32)")
|
("Number of iterations used for math accuracy tests with randomly generated inputs (default: 2^32)")
|
||||||
|
|||||||
@@ -28,8 +28,8 @@ THE SOFTWARE.
|
|||||||
struct CmdOptions {
|
struct CmdOptions {
|
||||||
int iterations = 5;
|
int iterations = 5;
|
||||||
int warmups = 5;
|
int warmups = 5;
|
||||||
int cg_extended_run = 5;
|
int cg_iterations = 1;
|
||||||
int cg_iterations = 2;
|
double cg_reduction_factor = 6.25;
|
||||||
bool no_display = false;
|
bool no_display = false;
|
||||||
bool progress = false;
|
bool progress = false;
|
||||||
uint64_t accuracy_iterations = std::numeric_limits<uint32_t>::max() + 1ull;
|
uint64_t accuracy_iterations = std::numeric_limits<uint32_t>::max() + 1ull;
|
||||||
|
|||||||
@@ -114,11 +114,9 @@ struct CPUMultiGrid {
|
|||||||
};
|
};
|
||||||
|
|
||||||
/* Generate dimensions for 1D, 2D and 3D blocks of threads */
|
/* Generate dimensions for 1D, 2D and 3D blocks of threads */
|
||||||
inline dim3 GenerateThreadDimensions() {
|
inline dim3 GenerateThreadDimensionsImpl(const std::initializer_list<double>& multipliers) {
|
||||||
hipDeviceProp_t props;
|
hipDeviceProp_t props;
|
||||||
HIP_CHECK(hipGetDeviceProperties(&props, 0));
|
HIP_CHECK(hipGetDeviceProperties(&props, 0));
|
||||||
const auto multipliers = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3,
|
|
||||||
1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5};
|
|
||||||
return GENERATE_COPY(
|
return GENERATE_COPY(
|
||||||
dim3(1, 1, 1), dim3(props.maxThreadsDim[0], 1, 1), dim3(1, props.maxThreadsDim[1], 1),
|
dim3(1, 1, 1), dim3(props.maxThreadsDim[0], 1, 1), dim3(1, props.maxThreadsDim[1], 1),
|
||||||
dim3(1, 1, props.maxThreadsDim[2]),
|
dim3(1, 1, props.maxThreadsDim[2]),
|
||||||
@@ -135,59 +133,39 @@ inline dim3 GenerateThreadDimensions() {
|
|||||||
dim3(props.warpSize - 1, 3, 3), dim3(props.warpSize + 1, 3, 3));
|
dim3(props.warpSize - 1, 3, 3), dim3(props.warpSize + 1, 3, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline dim3 GenerateThreadDimensions() {
|
||||||
|
const auto multipliers = {0.1, 0.5, 1.0, 1.5, 2.0};
|
||||||
|
return GenerateThreadDimensionsImpl(multipliers);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline dim3 GenerateThreadDimensionsForShuffle() {
|
||||||
|
const auto multipliers = {0.5, 1.0, 2.0};
|
||||||
|
return GenerateThreadDimensionsImpl(multipliers);
|
||||||
|
}
|
||||||
|
|
||||||
/* Generate dimensions for 1D, 2D and 3D grids of blocks */
|
/* Generate dimensions for 1D, 2D and 3D grids of blocks */
|
||||||
|
inline dim3 GenerateBlockDimensionsImpl(const std::initializer_list<double>& multipliers) {
|
||||||
|
hipDeviceProp_t props;
|
||||||
|
HIP_CHECK(hipGetDeviceProperties(&props, 0));
|
||||||
|
return GENERATE_COPY(dim3(1, 1, 1),
|
||||||
|
map([sm = props.multiProcessorCount](
|
||||||
|
double i) { return dim3(static_cast<int>(i * sm), 1, 1); },
|
||||||
|
values(multipliers)),
|
||||||
|
map([sm = props.multiProcessorCount](
|
||||||
|
double i) { return dim3(1, static_cast<int>(i * sm), 1); },
|
||||||
|
values(multipliers)),
|
||||||
|
map([sm = props.multiProcessorCount](
|
||||||
|
double i) { return dim3(1, 1, static_cast<int>(i * sm)); },
|
||||||
|
values(multipliers)),
|
||||||
|
dim3(5, 5, 5));
|
||||||
|
}
|
||||||
|
|
||||||
inline dim3 GenerateBlockDimensions() {
|
inline dim3 GenerateBlockDimensions() {
|
||||||
hipDeviceProp_t props;
|
const auto multipliers = {0.5, 1.0, 1.5, 2.0};
|
||||||
HIP_CHECK(hipGetDeviceProperties(&props, 0));
|
return GenerateBlockDimensionsImpl(multipliers);
|
||||||
const auto multipliers = {0.5, 0.9, 1.0, 1.1, 1.5, 1.9, 2.0, 3.0, 4.0};
|
|
||||||
return GENERATE_COPY(dim3(1, 1, 1),
|
|
||||||
map([sm = props.multiProcessorCount](
|
|
||||||
double i) { return dim3(static_cast<int>(i * sm), 1, 1); },
|
|
||||||
values(multipliers)),
|
|
||||||
map([sm = props.multiProcessorCount](
|
|
||||||
double i) { return dim3(1, static_cast<int>(i * sm), 1); },
|
|
||||||
values(multipliers)),
|
|
||||||
map([sm = props.multiProcessorCount](
|
|
||||||
double i) { return dim3(1, 1, static_cast<int>(i * sm)); },
|
|
||||||
values(multipliers)),
|
|
||||||
dim3(5, 5, 5));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Generate dimensions for 1D, 2D and 3D blocks of threads - reduced set */
|
|
||||||
inline dim3 GenerateThreadDimensionsForShuffle() {
|
|
||||||
hipDeviceProp_t props;
|
|
||||||
HIP_CHECK(hipGetDeviceProperties(&props, 0));
|
|
||||||
const auto multipliers = {0.5, 0.9, 1.0, 1.5, 2.0};
|
|
||||||
return GENERATE_COPY(
|
|
||||||
dim3(1, 1, 1), dim3(props.maxThreadsDim[0], 1, 1), dim3(1, props.maxThreadsDim[1], 1),
|
|
||||||
dim3(1, 1, props.maxThreadsDim[2]),
|
|
||||||
map([max = props.maxThreadsDim[0], warp_size = props.warpSize](
|
|
||||||
double i) { return dim3(std::min(static_cast<int>(i * warp_size), max), 1, 1); },
|
|
||||||
values(multipliers)),
|
|
||||||
map([max = props.maxThreadsDim[1], warp_size = props.warpSize](
|
|
||||||
double i) { return dim3(1, std::min(static_cast<int>(i * warp_size), max), 1); },
|
|
||||||
values(multipliers)),
|
|
||||||
map([max = props.maxThreadsDim[2], warp_size = props.warpSize](
|
|
||||||
double i) { return dim3(1, 1, std::min(static_cast<int>(i * warp_size), max)); },
|
|
||||||
values(multipliers)),
|
|
||||||
dim3(16, 8, 8), dim3(32, 32, 1), dim3(64, 8, 2), dim3(16, 16, 3),
|
|
||||||
dim3(props.warpSize - 1, 3, 3), dim3(props.warpSize + 1, 3, 3));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Generate dimensions for 1D, 2D and 3D grids of blocks - reduced set */
|
|
||||||
inline dim3 GenerateBlockDimensionsForShuffle() {
|
inline dim3 GenerateBlockDimensionsForShuffle() {
|
||||||
hipDeviceProp_t props;
|
|
||||||
HIP_CHECK(hipGetDeviceProperties(&props, 0));
|
|
||||||
const auto multipliers = {0.5, 1.0};
|
const auto multipliers = {0.5, 1.0};
|
||||||
return GENERATE_COPY(dim3(1, 1, 1),
|
return GenerateBlockDimensionsImpl(multipliers);
|
||||||
map([sm = props.multiProcessorCount](
|
|
||||||
double i) { return dim3(static_cast<int>(i * sm), 1, 1); },
|
|
||||||
values(multipliers)),
|
|
||||||
map([sm = props.multiProcessorCount](
|
|
||||||
double i) { return dim3(1, static_cast<int>(i * sm), 1); },
|
|
||||||
values(multipliers)),
|
|
||||||
map([sm = props.multiProcessorCount](
|
|
||||||
double i) { return dim3(1, 1, static_cast<int>(i * sm)); },
|
|
||||||
values(multipliers)),
|
|
||||||
dim3(5, 5, 5));
|
|
||||||
}
|
}
|
||||||
@@ -137,12 +137,7 @@ static uint64_t get_active_mask(unsigned int test_case, size_t warp_size) {
|
|||||||
* - HIP_VERSION >= 5.2
|
* - HIP_VERSION >= 5.2
|
||||||
*/
|
*/
|
||||||
TEST_CASE("Unit_Coalesced_Group_Getters_Positive_Basic") {
|
TEST_CASE("Unit_Coalesced_Group_Getters_Positive_Basic") {
|
||||||
int device;
|
const int warp_size = getWarpSize();
|
||||||
hipDeviceProp_t device_properties;
|
|
||||||
HIP_CHECK(hipGetDevice(&device));
|
|
||||||
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
|
|
||||||
|
|
||||||
size_t warp_size = static_cast<size_t>(device_properties.warpSize);
|
|
||||||
|
|
||||||
const auto blocks = GenerateBlockDimensionsForShuffle();
|
const auto blocks = GenerateBlockDimensionsForShuffle();
|
||||||
const auto threads = GenerateThreadDimensionsForShuffle();
|
const auto threads = GenerateThreadDimensionsForShuffle();
|
||||||
@@ -225,13 +220,7 @@ TEST_CASE("Unit_Coalesced_Group_Getters_Positive_Basic") {
|
|||||||
* - HIP_VERSION >= 5.2
|
* - HIP_VERSION >= 5.2
|
||||||
*/
|
*/
|
||||||
TEST_CASE("Unit_Coalesced_Group_Getters_Via_Base_Type_Positive_Basic") {
|
TEST_CASE("Unit_Coalesced_Group_Getters_Via_Base_Type_Positive_Basic") {
|
||||||
int device;
|
const int warp_size = getWarpSize();
|
||||||
hipDeviceProp_t device_properties;
|
|
||||||
HIP_CHECK(hipGetDevice(&device));
|
|
||||||
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
|
|
||||||
|
|
||||||
|
|
||||||
size_t warp_size = static_cast<size_t>(device_properties.warpSize);
|
|
||||||
|
|
||||||
const auto blocks = GenerateBlockDimensionsForShuffle();
|
const auto blocks = GenerateBlockDimensionsForShuffle();
|
||||||
const auto threads = GenerateThreadDimensionsForShuffle();
|
const auto threads = GenerateThreadDimensionsForShuffle();
|
||||||
@@ -315,12 +304,7 @@ TEST_CASE("Unit_Coalesced_Group_Getters_Via_Base_Type_Positive_Basic") {
|
|||||||
* - HIP_VERSION >= 5.2
|
* - HIP_VERSION >= 5.2
|
||||||
*/
|
*/
|
||||||
TEST_CASE("Unit_Coalesced_Group_Getters_Via_Non_Member_Functions_Positive_Basic") {
|
TEST_CASE("Unit_Coalesced_Group_Getters_Via_Non_Member_Functions_Positive_Basic") {
|
||||||
int device;
|
const int warp_size = getWarpSize();
|
||||||
hipDeviceProp_t device_properties;
|
|
||||||
HIP_CHECK(hipGetDevice(&device));
|
|
||||||
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
|
|
||||||
|
|
||||||
size_t warp_size = static_cast<size_t>(device_properties.warpSize);
|
|
||||||
|
|
||||||
const auto blocks = GenerateBlockDimensionsForShuffle();
|
const auto blocks = GenerateBlockDimensionsForShuffle();
|
||||||
const auto threads = GenerateThreadDimensionsForShuffle();
|
const auto threads = GenerateThreadDimensionsForShuffle();
|
||||||
@@ -407,12 +391,9 @@ template <typename T> __global__ void coalesced_group_shfl_up(T* const out,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename T> void CoalescedGroupShflUpTestImpl() {
|
template <typename T> void CoalescedGroupShflUpTestImpl() {
|
||||||
int device;
|
const auto inv_reduction_factor = 1.0 / GetTestReductionFactor();
|
||||||
hipDeviceProp_t device_properties;
|
|
||||||
HIP_CHECK(hipGetDevice(&device));
|
|
||||||
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
|
|
||||||
|
|
||||||
size_t warp_size = static_cast<size_t>(device_properties.warpSize);
|
const int warp_size = getWarpSize();
|
||||||
|
|
||||||
const auto blocks = GenerateBlockDimensionsForShuffle();
|
const auto blocks = GenerateBlockDimensionsForShuffle();
|
||||||
const auto threads = GenerateThreadDimensionsForShuffle();
|
const auto threads = GenerateThreadDimensionsForShuffle();
|
||||||
@@ -423,11 +404,16 @@ template <typename T> void CoalescedGroupShflUpTestImpl() {
|
|||||||
INFO("Coalesced group mask: " << active_mask);
|
INFO("Coalesced group mask: " << active_mask);
|
||||||
unsigned int active_thread_count = get_active_thread_count(active_mask, warp_size);
|
unsigned int active_thread_count = get_active_thread_count(active_mask, warp_size);
|
||||||
|
|
||||||
// Tests edge cases (0, 1, max-1) and middle values
|
std::vector<int> deltas;
|
||||||
auto delta = GENERATE(values<size_t>({0, 1, static_cast<size_t>(getWarpSize()/2),
|
for (double i = 0; i < warp_size - 1; i += inv_reduction_factor) {
|
||||||
static_cast<size_t>(getWarpSize()-1)}));
|
deltas.emplace_back(static_cast<int>(std::floor(i)));
|
||||||
|
}
|
||||||
|
deltas.emplace_back(warp_size - 1);
|
||||||
|
|
||||||
|
auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end()));
|
||||||
delta = delta % active_thread_count;
|
delta = delta % active_thread_count;
|
||||||
INFO("Delta: " << delta);
|
INFO("Delta: " << delta);
|
||||||
|
|
||||||
CPUGrid grid(blocks, threads);
|
CPUGrid grid(blocks, threads);
|
||||||
|
|
||||||
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
||||||
@@ -489,12 +475,9 @@ template <typename T> __global__ void coalesced_group_shfl_down(T* const out,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename T> void CoalescedGroupShflDownTest() {
|
template <typename T> void CoalescedGroupShflDownTest() {
|
||||||
int device;
|
const auto inv_reduction_factor = 1.0 / GetTestReductionFactor();
|
||||||
hipDeviceProp_t device_properties;
|
|
||||||
HIP_CHECK(hipGetDevice(&device));
|
|
||||||
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
|
|
||||||
|
|
||||||
size_t warp_size = static_cast<size_t>(device_properties.warpSize);
|
const int warp_size = getWarpSize();
|
||||||
|
|
||||||
const auto blocks = GenerateBlockDimensionsForShuffle();
|
const auto blocks = GenerateBlockDimensionsForShuffle();
|
||||||
const auto threads = GenerateThreadDimensionsForShuffle();
|
const auto threads = GenerateThreadDimensionsForShuffle();
|
||||||
@@ -505,11 +488,16 @@ template <typename T> void CoalescedGroupShflDownTest() {
|
|||||||
INFO("Coalesced group mask: " << active_mask);
|
INFO("Coalesced group mask: " << active_mask);
|
||||||
unsigned int active_thread_count = get_active_thread_count(active_mask, warp_size);
|
unsigned int active_thread_count = get_active_thread_count(active_mask, warp_size);
|
||||||
|
|
||||||
// Tests edge cases (0, 1, max-1) and middle values
|
std::vector<int> deltas;
|
||||||
auto delta = GENERATE(values<size_t>({0, 1, static_cast<size_t>(getWarpSize()/2),
|
for (double i = 0; i < warp_size - 1; i += inv_reduction_factor) {
|
||||||
static_cast<size_t>(getWarpSize()-1)}));
|
deltas.emplace_back(static_cast<int>(std::floor(i)));
|
||||||
|
}
|
||||||
|
deltas.emplace_back(warp_size - 1);
|
||||||
|
|
||||||
|
auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end()));
|
||||||
delta = delta % active_thread_count;
|
delta = delta % active_thread_count;
|
||||||
INFO("Delta: " << delta);
|
INFO("Delta: " << delta);
|
||||||
|
|
||||||
CPUGrid grid(blocks, threads);
|
CPUGrid grid(blocks, threads);
|
||||||
|
|
||||||
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
||||||
@@ -581,12 +569,7 @@ template <typename T> __global__ void coalesced_group_shfl(T* const out, uint8_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename T> void CoalescedGroupShflTest() {
|
template <typename T> void CoalescedGroupShflTest() {
|
||||||
int device;
|
const int warp_size = getWarpSize();
|
||||||
hipDeviceProp_t device_properties;
|
|
||||||
HIP_CHECK(hipGetDevice(&device));
|
|
||||||
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
|
|
||||||
|
|
||||||
size_t warp_size = static_cast<size_t>(device_properties.warpSize);
|
|
||||||
|
|
||||||
const auto blocks = GenerateBlockDimensionsForShuffle();
|
const auto blocks = GenerateBlockDimensionsForShuffle();
|
||||||
const auto threads = GenerateThreadDimensionsForShuffle();
|
const auto threads = GenerateThreadDimensionsForShuffle();
|
||||||
@@ -730,13 +713,7 @@ __global__ void coalesced_group_sync_check(T* global_data, unsigned int* wait_mo
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <bool global_memory, typename T> void CoalescedGroupSyncTest() {
|
template <bool global_memory, typename T> void CoalescedGroupSyncTest() {
|
||||||
int device;
|
const int warp_size = getWarpSize();
|
||||||
hipDeviceProp_t device_properties;
|
|
||||||
|
|
||||||
HIP_CHECK(hipGetDevice(&device));
|
|
||||||
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
|
|
||||||
|
|
||||||
size_t warp_size = static_cast<size_t>(device_properties.warpSize);
|
|
||||||
|
|
||||||
const auto randomized_run_count = GENERATE(range(0, cmd_options.cg_iterations));
|
const auto randomized_run_count = GENERATE(range(0, cmd_options.cg_iterations));
|
||||||
const auto blocks = GenerateBlockDimensionsForShuffle();
|
const auto blocks = GenerateBlockDimensionsForShuffle();
|
||||||
|
|||||||
+23
-3
@@ -87,7 +87,7 @@ __device__ bool deactivate_thread(uint64_t* active_masks, unsigned int warp_size
|
|||||||
const auto warps_per_block = (block.size() + warp_size - 1) / warp_size;
|
const auto warps_per_block = (block.size() + warp_size - 1) / warp_size;
|
||||||
const auto block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
|
const auto block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
|
||||||
const auto idx = block_rank * warps_per_block + block.thread_rank() / warp_size;
|
const auto idx = block_rank * warps_per_block + block.thread_rank() / warp_size;
|
||||||
return !(active_masks[idx] & (1u << warp.thread_rank()));
|
return !(active_masks[idx] & (1ull << warp.thread_rank()));
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void coalesced_group_tiled_partition_size_getter(uint64_t* active_masks,
|
__global__ void coalesced_group_tiled_partition_size_getter(uint64_t* active_masks,
|
||||||
@@ -243,6 +243,8 @@ __global__ void coalesced_group_tiled_partition_shfl_up(uint64_t* active_masks,
|
|||||||
|
|
||||||
|
|
||||||
template <typename T> static void CoalescedGroupTiledPartitonShflUpTestImpl() {
|
template <typename T> static void CoalescedGroupTiledPartitonShflUpTestImpl() {
|
||||||
|
const auto inv_reduction_factor = 1.0 / GetTestReductionFactor();
|
||||||
|
|
||||||
const auto tile_size = GenerateTileSizes();
|
const auto tile_size = GenerateTileSizes();
|
||||||
INFO("Tile size: " << tile_size);
|
INFO("Tile size: " << tile_size);
|
||||||
auto blocks = GenerateBlockDimensionsForShuffle();
|
auto blocks = GenerateBlockDimensionsForShuffle();
|
||||||
@@ -250,8 +252,16 @@ template <typename T> static void CoalescedGroupTiledPartitonShflUpTestImpl() {
|
|||||||
auto warp_size = getWarpSize();
|
auto warp_size = getWarpSize();
|
||||||
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
||||||
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
||||||
const auto delta = GENERATE_COPY(range(0u, tile_size));
|
|
||||||
|
std::vector<unsigned int> deltas;
|
||||||
|
for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) {
|
||||||
|
deltas.emplace_back(static_cast<unsigned int>(std::floor(i)));
|
||||||
|
}
|
||||||
|
deltas.emplace_back(tile_size - 1);
|
||||||
|
|
||||||
|
const auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end()));
|
||||||
INFO("Delta: " << delta);
|
INFO("Delta: " << delta);
|
||||||
|
|
||||||
CPUGrid grid(blocks, threads);
|
CPUGrid grid(blocks, threads);
|
||||||
|
|
||||||
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
||||||
@@ -344,6 +354,8 @@ __global__ void coalesced_group_tiled_partition_shfl_down(uint64_t* active_masks
|
|||||||
|
|
||||||
|
|
||||||
template <typename T> static void CoalescedGroupTiledPartitonShflDownTestImpl() {
|
template <typename T> static void CoalescedGroupTiledPartitonShflDownTestImpl() {
|
||||||
|
const auto inv_reduction_factor = 1.0 / GetTestReductionFactor();
|
||||||
|
|
||||||
const auto tile_size = GenerateTileSizes();
|
const auto tile_size = GenerateTileSizes();
|
||||||
INFO("Tile size: " << tile_size);
|
INFO("Tile size: " << tile_size);
|
||||||
auto blocks = GenerateBlockDimensionsForShuffle();
|
auto blocks = GenerateBlockDimensionsForShuffle();
|
||||||
@@ -351,8 +363,16 @@ template <typename T> static void CoalescedGroupTiledPartitonShflDownTestImpl()
|
|||||||
auto warp_size = getWarpSize();
|
auto warp_size = getWarpSize();
|
||||||
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
||||||
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
||||||
const auto delta = GENERATE_COPY(range(0u, tile_size));
|
|
||||||
|
std::vector<unsigned int> deltas;
|
||||||
|
for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) {
|
||||||
|
deltas.emplace_back(static_cast<unsigned int>(std::floor(i)));
|
||||||
|
}
|
||||||
|
deltas.emplace_back(tile_size - 1);
|
||||||
|
|
||||||
|
const auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end()));
|
||||||
INFO("Delta: " << delta);
|
INFO("Delta: " << delta);
|
||||||
|
|
||||||
CPUGrid grid(blocks, threads);
|
CPUGrid grid(blocks, threads);
|
||||||
|
|
||||||
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
||||||
|
|||||||
@@ -22,6 +22,8 @@ THE SOFTWARE.
|
|||||||
#include <hip_test_common.hh>
|
#include <hip_test_common.hh>
|
||||||
#include <hip/hip_cooperative_groups.h>
|
#include <hip/hip_cooperative_groups.h>
|
||||||
|
|
||||||
|
#include <cmd_options.hh>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
constexpr int kMaxGPUs = 8;
|
constexpr int kMaxGPUs = 8;
|
||||||
} // namespace
|
} // namespace
|
||||||
@@ -61,3 +63,5 @@ template <class T> bool CheckDimensions(unsigned int device, T kernel, dim3 bloc
|
|||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline double GetTestReductionFactor() { return cmd_options.cg_reduction_factor * 0.01; }
|
||||||
|
|||||||
@@ -273,12 +273,8 @@ TEST_CASE("Unit_Grid_Group_Sync_Positive_Basic") {
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto loops = GENERATE(2, 4, 8, 16);
|
auto loops = GENERATE(2, 4, 8, 16);
|
||||||
// Launch params for this test are hardcoded as a workaround for an issue reported
|
const auto blocks = GenerateBlockDimensions();
|
||||||
// SWDEV-429791. When fixed, please enable calls to GenerateBlock/ThreadDimensions()
|
const auto threads = GenerateThreadDimensions();
|
||||||
const auto blocks =
|
|
||||||
GENERATE_COPY(dim3(5, 5, 5), dim3(330, 1, 1), dim3(1, 330, 1), dim3(1, 1, 330));
|
|
||||||
const auto threads =
|
|
||||||
GENERATE_COPY(dim3(16, 8, 8), dim3(32, 32, 1), dim3(64, 8, 2), dim3(16, 16, 3));
|
|
||||||
if (!CheckDimensions(device, sync_kernel, blocks, threads)) return;
|
if (!CheckDimensions(device, sync_kernel, blocks, threads)) return;
|
||||||
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
||||||
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
||||||
|
|||||||
+65
-64
@@ -86,45 +86,48 @@ namespace cg = cooperative_groups;
|
|||||||
|
|
||||||
static constexpr size_t kBufferLen = 1024 * 1024;
|
static constexpr size_t kBufferLen = 1024 * 1024;
|
||||||
|
|
||||||
__global__ void test_gws(uint* buf, uint buf_size, long* tmp_buf, long* result) {
|
__global__ void test_gws(uint* buf, uint buf_size, unsigned long long* tmp_buf,
|
||||||
extern __shared__ long tmp[];
|
unsigned long long* result) {
|
||||||
uint groups = gridDim.x;
|
extern __shared__ unsigned long long tmp[];
|
||||||
uint group_id = blockIdx.x;
|
|
||||||
uint local_id = threadIdx.x;
|
|
||||||
uint chunk = gridDim.x * blockDim.x;
|
|
||||||
|
|
||||||
uint i = group_id * blockDim.x + local_id;
|
cg::thread_block tb = cg::this_thread_block();
|
||||||
long sum = 0;
|
cg::grid_group gg = cg::this_grid();
|
||||||
while (i < buf_size) {
|
cg::multi_grid_group mgg = cg::this_multi_grid();
|
||||||
|
|
||||||
|
const auto tid = gg.thread_rank();
|
||||||
|
const auto stride = gg.size();
|
||||||
|
const auto local_tid = tb.thread_rank();
|
||||||
|
const auto wid = blockIdx.x;
|
||||||
|
const auto workgroup_size = tb.size();
|
||||||
|
const auto gid = mgg.grid_rank();
|
||||||
|
const auto grid_size = gridDim.x;
|
||||||
|
const auto num_grids = mgg.num_grids();
|
||||||
|
|
||||||
|
unsigned long long sum = 0;
|
||||||
|
for (size_t i = tid; i < buf_size; i += stride) {
|
||||||
sum += buf[i];
|
sum += buf[i];
|
||||||
i += chunk;
|
|
||||||
}
|
}
|
||||||
tmp[local_id] = sum;
|
tmp[local_tid] = sum;
|
||||||
__syncthreads();
|
tb.sync();
|
||||||
i = 0;
|
|
||||||
if (local_id == 0) {
|
|
||||||
sum = 0;
|
|
||||||
while (i < blockDim.x) {
|
|
||||||
sum += tmp[i];
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
tmp_buf[group_id] = sum;
|
|
||||||
}
|
|
||||||
// wait
|
|
||||||
cg::this_grid().sync();
|
|
||||||
|
|
||||||
if (((blockIdx.x * blockDim.x) + threadIdx.x) == 0) {
|
if (local_tid == 0) {
|
||||||
for (uint i = 1; i < groups; ++i) {
|
|
||||||
sum += tmp_buf[i];
|
|
||||||
}
|
|
||||||
//*result = sum;
|
|
||||||
result[1 + cg::this_multi_grid().grid_rank()] = sum;
|
|
||||||
}
|
|
||||||
cg::this_multi_grid().sync();
|
|
||||||
if (cg::this_multi_grid().grid_rank() == 0) {
|
|
||||||
sum = 0;
|
sum = 0;
|
||||||
for (uint i = 1; i <= cg::this_multi_grid().num_grids(); ++i) {
|
for (size_t i = 0; i < workgroup_size; i++) {
|
||||||
sum += result[i];
|
sum += tmp[i];
|
||||||
|
}
|
||||||
|
tmp_buf[wid] = sum;
|
||||||
|
}
|
||||||
|
gg.sync();
|
||||||
|
|
||||||
|
if (tid < grid_size) {
|
||||||
|
atomicAdd(&result[gid + 1], tmp_buf[tid]);
|
||||||
|
}
|
||||||
|
mgg.sync();
|
||||||
|
|
||||||
|
if (gid == 0) {
|
||||||
|
sum = 0;
|
||||||
|
for (size_t i = 0; i < num_grids; i++) {
|
||||||
|
sum += result[i + 1];
|
||||||
}
|
}
|
||||||
*result = sum;
|
*result = sum;
|
||||||
}
|
}
|
||||||
@@ -136,20 +139,7 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic", "[multigpu]") {
|
|||||||
int device_num = 0;
|
int device_num = 0;
|
||||||
HIP_CHECK(hipGetDeviceCount(&device_num));
|
HIP_CHECK(hipGetDeviceCount(&device_num));
|
||||||
|
|
||||||
size_t buffer_size = kBufferLen * sizeof(int);
|
|
||||||
|
|
||||||
int* A_h = reinterpret_cast<int*>(malloc(buffer_size * device_num));
|
|
||||||
for (uint32_t i = 0; i < kBufferLen * device_num; ++i) {
|
|
||||||
A_h[i] = static_cast<int>(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<int*> A_d(device_num);
|
|
||||||
std::vector<long*> B_d(device_num);
|
|
||||||
long* C_d;
|
|
||||||
std::vector<hipStream_t> stream(device_num);
|
|
||||||
|
|
||||||
std::vector<hipDeviceProp_t> device_properties(device_num);
|
std::vector<hipDeviceProp_t> device_properties(device_num);
|
||||||
|
|
||||||
for (int i = 0; i < device_num; i++) {
|
for (int i = 0; i < device_num; i++) {
|
||||||
HIP_CHECK(hipSetDevice(i));
|
HIP_CHECK(hipSetDevice(i));
|
||||||
|
|
||||||
@@ -159,28 +149,39 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic", "[multigpu]") {
|
|||||||
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
|
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t buffer_size = kBufferLen * sizeof(int);
|
||||||
|
|
||||||
|
int* A_h = nullptr;
|
||||||
|
std::vector<int*> A_d(device_num);
|
||||||
|
std::vector<unsigned long long*> B_d(device_num);
|
||||||
|
unsigned long long* C_d;
|
||||||
|
std::vector<hipStream_t> stream(device_num);
|
||||||
|
|
||||||
|
A_h = reinterpret_cast<int*>(malloc(buffer_size * device_num));
|
||||||
|
for (uint32_t i = 0; i < kBufferLen * device_num; i++) {
|
||||||
|
A_h[i] = static_cast<int>(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < device_num; i++) {
|
||||||
|
HIP_CHECK(hipSetDevice(i));
|
||||||
|
|
||||||
HIP_CHECK(hipMalloc(&A_d[i], buffer_size));
|
HIP_CHECK(hipMalloc(&A_d[i], buffer_size));
|
||||||
HIP_CHECK(hipMemcpy(A_d[i], &A_h[i * kBufferLen], buffer_size, hipMemcpyHostToDevice));
|
HIP_CHECK(hipMemcpy(A_d[i], &A_h[i * kBufferLen], buffer_size, hipMemcpyHostToDevice));
|
||||||
if (i == 0) {
|
|
||||||
HIP_CHECK(hipHostMalloc(&C_d, (device_num + 1) * sizeof(long)));
|
|
||||||
}
|
|
||||||
|
|
||||||
HIP_CHECK(hipStreamCreate(&stream[i]));
|
HIP_CHECK(hipStreamCreate(&stream[i]));
|
||||||
|
|
||||||
HIP_CHECK(hipDeviceSynchronize());
|
HIP_CHECK(hipDeviceSynchronize());
|
||||||
}
|
}
|
||||||
|
|
||||||
dim3 dimBlock;
|
HIP_CHECK(hipHostMalloc(&C_d, (device_num + 1) * sizeof(unsigned long long)));
|
||||||
dim3 dimGrid;
|
|
||||||
dimGrid.x = 1;
|
|
||||||
dimGrid.y = 1;
|
|
||||||
dimGrid.z = 1;
|
|
||||||
dimBlock.x = 64;
|
|
||||||
dimBlock.y = 1;
|
|
||||||
dimBlock.z = 1;
|
|
||||||
|
|
||||||
|
uint workgroup = GENERATE(32, 64, 128, 256);
|
||||||
|
|
||||||
|
dim3 dimBlock = dim3(workgroup);
|
||||||
|
dim3 dimGrid = dim3(1);
|
||||||
int num_blocks = 0;
|
int num_blocks = 0;
|
||||||
uint workgroup = GENERATE(64, 128, 256);
|
|
||||||
|
|
||||||
hipLaunchParams* launch_params_list = new hipLaunchParams[device_num];
|
hipLaunchParams* launch_params_list = new hipLaunchParams[device_num];
|
||||||
std::vector<void*> args(device_num * num_kernel_args);
|
std::vector<void*> args(device_num * num_kernel_args);
|
||||||
@@ -188,16 +189,16 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic", "[multigpu]") {
|
|||||||
for (int i = 0; i < device_num; i++) {
|
for (int i = 0; i < device_num; i++) {
|
||||||
HIP_CHECK(hipSetDevice(i));
|
HIP_CHECK(hipSetDevice(i));
|
||||||
|
|
||||||
dimBlock.x = workgroup;
|
|
||||||
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||||
&num_blocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)));
|
&num_blocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z,
|
||||||
|
dimBlock.x * sizeof(unsigned long long)));
|
||||||
|
|
||||||
INFO("GPU" << i << " has block size = " << dimBlock.x << " and num blocks per CU " << num_blocks
|
INFO("GPU" << i << " has block size = " << dimBlock.x << " and num blocks per CU " << num_blocks
|
||||||
<< "\n");
|
<< "\n");
|
||||||
|
|
||||||
dimGrid.x = device_properties[i].multiProcessorCount * std::min(num_blocks, 32);
|
dimGrid.x = device_properties[i].multiProcessorCount * std::min(num_blocks, 32);
|
||||||
|
|
||||||
HIP_CHECK(hipMalloc(&B_d[i], dimGrid.x * sizeof(long)));
|
HIP_CHECK(hipMalloc(&B_d[i], dimGrid.x * sizeof(unsigned long long)));
|
||||||
|
|
||||||
args[i * num_kernel_args] = (void*)&A_d[i];
|
args[i * num_kernel_args] = (void*)&A_d[i];
|
||||||
args[i * num_kernel_args + 1] = (void*)&kBufferLen;
|
args[i * num_kernel_args + 1] = (void*)&kBufferLen;
|
||||||
@@ -207,7 +208,7 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic", "[multigpu]") {
|
|||||||
launch_params_list[i].func = reinterpret_cast<void*>(test_gws);
|
launch_params_list[i].func = reinterpret_cast<void*>(test_gws);
|
||||||
launch_params_list[i].gridDim = dimGrid;
|
launch_params_list[i].gridDim = dimGrid;
|
||||||
launch_params_list[i].blockDim = dimBlock;
|
launch_params_list[i].blockDim = dimBlock;
|
||||||
launch_params_list[i].sharedMem = dimBlock.x * sizeof(long);
|
launch_params_list[i].sharedMem = dimBlock.x * sizeof(unsigned long long);
|
||||||
launch_params_list[i].stream = stream[i];
|
launch_params_list[i].stream = stream[i];
|
||||||
launch_params_list[i].args = &args[i * num_kernel_args];
|
launch_params_list[i].args = &args[i * num_kernel_args];
|
||||||
}
|
}
|
||||||
@@ -218,7 +219,7 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic", "[multigpu]") {
|
|||||||
}
|
}
|
||||||
|
|
||||||
size_t processed_Dwords = kBufferLen * device_num;
|
size_t processed_Dwords = kBufferLen * device_num;
|
||||||
REQUIRE(*C_d == (((long)(processed_Dwords) * (processed_Dwords - 1)) / 2));
|
REQUIRE(*C_d == (((unsigned long long)(processed_Dwords) * (processed_Dwords - 1)) / 2));
|
||||||
|
|
||||||
delete[] launch_params_list;
|
delete[] launch_params_list;
|
||||||
|
|
||||||
|
|||||||
+38
-33
@@ -27,46 +27,47 @@ namespace cg = cooperative_groups;
|
|||||||
|
|
||||||
static constexpr size_t kBufferLen = 1024 * 1024;
|
static constexpr size_t kBufferLen = 1024 * 1024;
|
||||||
|
|
||||||
__global__ void test_gws(int* buf, size_t buf_size, long* tmp_buf, long* result) {
|
__global__ void test_gws(int* buf, size_t buf_size, unsigned long long* tmp_buf,
|
||||||
extern __shared__ long tmp[];
|
unsigned long long* result) {
|
||||||
uint offset = blockIdx.x * blockDim.x + threadIdx.x;
|
extern __shared__ unsigned long long tmp[];
|
||||||
uint stride = gridDim.x * blockDim.x;
|
|
||||||
|
cg::thread_block tb = cg::this_thread_block();
|
||||||
cg::grid_group gg = cg::this_grid();
|
cg::grid_group gg = cg::this_grid();
|
||||||
|
|
||||||
long sum = 0;
|
const auto tid = gg.thread_rank();
|
||||||
for (uint i = offset; i < buf_size; i += stride) {
|
const auto stride = gg.size();
|
||||||
|
const auto local_tid = tb.thread_rank();
|
||||||
|
const auto wid = blockIdx.x;
|
||||||
|
const auto workgroup_size = tb.size();
|
||||||
|
const auto grid_size = gridDim.x;
|
||||||
|
|
||||||
|
unsigned long long sum = 0;
|
||||||
|
for (size_t i = tid; i < buf_size; i += stride) {
|
||||||
sum += buf[i];
|
sum += buf[i];
|
||||||
}
|
}
|
||||||
tmp[threadIdx.x] = sum;
|
tmp[local_tid] = sum;
|
||||||
|
tb.sync();
|
||||||
|
|
||||||
__syncthreads();
|
if (local_tid == 0) {
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
|
||||||
sum = 0;
|
sum = 0;
|
||||||
for (uint i = 0; i < blockDim.x; i++) {
|
for (size_t i = 0; i < workgroup_size; i++) {
|
||||||
sum += tmp[i];
|
sum += tmp[i];
|
||||||
}
|
}
|
||||||
tmp_buf[blockIdx.x] = sum;
|
tmp_buf[wid] = sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
gg.sync();
|
gg.sync();
|
||||||
|
|
||||||
if (offset == 0) {
|
if (tid < grid_size) {
|
||||||
for (uint i = 1; i < gridDim.x; ++i) {
|
atomicAdd(result, tmp_buf[tid]);
|
||||||
sum += tmp_buf[i];
|
|
||||||
}
|
|
||||||
*result = sum;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
|
TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
|
||||||
// Use default device for validating the test
|
// Use default device for validating the test
|
||||||
int device;
|
int device;
|
||||||
int *A_h, *A_d;
|
|
||||||
long* B_d;
|
|
||||||
long* C_d;
|
|
||||||
hipDeviceProp_t device_properties;
|
|
||||||
HIP_CHECK(hipGetDevice(&device));
|
HIP_CHECK(hipGetDevice(&device));
|
||||||
|
|
||||||
|
hipDeviceProp_t device_properties;
|
||||||
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
|
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
|
||||||
|
|
||||||
if (!device_properties.cooperativeLaunch) {
|
if (!device_properties.cooperativeLaunch) {
|
||||||
@@ -76,6 +77,12 @@ TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
|
|||||||
|
|
||||||
size_t buffer_size = kBufferLen * sizeof(int);
|
size_t buffer_size = kBufferLen * sizeof(int);
|
||||||
|
|
||||||
|
int* A_h = nullptr;
|
||||||
|
int* A_d = nullptr;
|
||||||
|
unsigned long long* B_d = nullptr;
|
||||||
|
unsigned long long* C_d = nullptr;
|
||||||
|
hipStream_t stream;
|
||||||
|
|
||||||
A_h = reinterpret_cast<int*>(malloc(buffer_size));
|
A_h = reinterpret_cast<int*>(malloc(buffer_size));
|
||||||
for (uint32_t i = 0; i < kBufferLen; ++i) {
|
for (uint32_t i = 0; i < kBufferLen; ++i) {
|
||||||
A_h[i] = static_cast<int>(i);
|
A_h[i] = static_cast<int>(i);
|
||||||
@@ -83,25 +90,23 @@ TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
|
|||||||
|
|
||||||
HIP_CHECK(hipMalloc(&A_d, buffer_size));
|
HIP_CHECK(hipMalloc(&A_d, buffer_size));
|
||||||
HIP_CHECK(hipMemcpy(A_d, A_h, buffer_size, hipMemcpyHostToDevice));
|
HIP_CHECK(hipMemcpy(A_d, A_h, buffer_size, hipMemcpyHostToDevice));
|
||||||
HIP_CHECK(hipHostMalloc(&C_d, sizeof(long)));
|
HIP_CHECK(hipHostMalloc(&C_d, sizeof(unsigned long long)));
|
||||||
|
|
||||||
hipStream_t stream;
|
|
||||||
HIPCHECK(hipStreamCreate(&stream));
|
HIPCHECK(hipStreamCreate(&stream));
|
||||||
|
|
||||||
dim3 dimBlock = dim3(1);
|
|
||||||
dim3 dimGrid = dim3(1);
|
|
||||||
int numBlocks = 0;
|
|
||||||
|
|
||||||
uint32_t workgroup = GENERATE(32, 64, 128, 256);
|
uint32_t workgroup = GENERATE(32, 64, 128, 256);
|
||||||
|
|
||||||
dimBlock.x = workgroup;
|
dim3 dimBlock = dim3(workgroup);
|
||||||
|
dim3 dimGrid = dim3(1);
|
||||||
|
|
||||||
|
int numBlocks = 0;
|
||||||
// Calculate the device occupancy to know how many blocks can be run concurrently
|
// Calculate the device occupancy to know how many blocks can be run concurrently
|
||||||
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, test_gws,
|
||||||
&numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)));
|
dimBlock.x * dimBlock.y * dimBlock.z,
|
||||||
|
dimBlock.x * sizeof(unsigned long long)));
|
||||||
|
|
||||||
dimGrid.x = device_properties.multiProcessorCount * std::min(numBlocks, 32);
|
dimGrid.x = device_properties.multiProcessorCount * std::min(numBlocks, 32);
|
||||||
HIP_CHECK(hipMalloc(&B_d, dimGrid.x * sizeof(long)));
|
HIP_CHECK(hipMalloc(&B_d, dimGrid.x * sizeof(unsigned long long)));
|
||||||
|
|
||||||
void* params[4];
|
void* params[4];
|
||||||
params[0] = (void*)&A_d;
|
params[0] = (void*)&A_d;
|
||||||
@@ -111,7 +116,7 @@ TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
|
|||||||
|
|
||||||
INFO("Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n");
|
INFO("Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n");
|
||||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_gws), dimGrid, dimBlock, params,
|
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_gws), dimGrid, dimBlock, params,
|
||||||
dimBlock.x * sizeof(long), stream));
|
dimBlock.x * sizeof(unsigned long long), stream));
|
||||||
|
|
||||||
HIP_CHECK(hipStreamSynchronize(stream));
|
HIP_CHECK(hipStreamSynchronize(stream));
|
||||||
|
|
||||||
|
|||||||
@@ -152,12 +152,22 @@ __global__ void block_tile_shfl_up(T* const out, const unsigned int delta) {
|
|||||||
|
|
||||||
template <typename T, size_t tile_size> void BlockTileShflUpTestImpl() {
|
template <typename T, size_t tile_size> void BlockTileShflUpTestImpl() {
|
||||||
DYNAMIC_SECTION("Tile size: " << tile_size) {
|
DYNAMIC_SECTION("Tile size: " << tile_size) {
|
||||||
|
const auto inv_reduction_factor = 1.0 / GetTestReductionFactor();
|
||||||
|
|
||||||
auto blocks = GenerateBlockDimensionsForShuffle();
|
auto blocks = GenerateBlockDimensionsForShuffle();
|
||||||
auto threads = GenerateThreadDimensionsForShuffle();
|
auto threads = GenerateThreadDimensionsForShuffle();
|
||||||
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
||||||
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
||||||
auto delta = GENERATE(range(static_cast<size_t>(0), tile_size));
|
|
||||||
|
std::vector<size_t> deltas;
|
||||||
|
for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) {
|
||||||
|
deltas.emplace_back(static_cast<size_t>(std::floor(i)));
|
||||||
|
}
|
||||||
|
deltas.emplace_back(tile_size - 1);
|
||||||
|
|
||||||
|
const auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end()));
|
||||||
INFO("Delta: " << delta);
|
INFO("Delta: " << delta);
|
||||||
|
|
||||||
CPUGrid grid(blocks, threads);
|
CPUGrid grid(blocks, threads);
|
||||||
|
|
||||||
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
||||||
@@ -209,12 +219,22 @@ __global__ void block_tile_shfl_down(T* const out, const unsigned int delta) {
|
|||||||
|
|
||||||
template <typename T, size_t tile_size> void BlockTileShflDownTestImpl() {
|
template <typename T, size_t tile_size> void BlockTileShflDownTestImpl() {
|
||||||
DYNAMIC_SECTION("Tile size: " << tile_size) {
|
DYNAMIC_SECTION("Tile size: " << tile_size) {
|
||||||
|
const auto inv_reduction_factor = 1.0 / GetTestReductionFactor();
|
||||||
|
|
||||||
auto blocks = GenerateBlockDimensionsForShuffle();
|
auto blocks = GenerateBlockDimensionsForShuffle();
|
||||||
auto threads = GenerateThreadDimensionsForShuffle();
|
auto threads = GenerateThreadDimensionsForShuffle();
|
||||||
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
||||||
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
||||||
auto delta = GENERATE(range(static_cast<size_t>(0), tile_size));
|
|
||||||
|
std::vector<size_t> deltas;
|
||||||
|
for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) {
|
||||||
|
deltas.emplace_back(static_cast<size_t>(std::floor(i)));
|
||||||
|
}
|
||||||
|
deltas.emplace_back(tile_size - 1);
|
||||||
|
|
||||||
|
const auto delta = GENERATE_COPY(from_range(deltas.begin(), deltas.end()));
|
||||||
INFO("Delta: " << delta);
|
INFO("Delta: " << delta);
|
||||||
|
|
||||||
CPUGrid grid(blocks, threads);
|
CPUGrid grid(blocks, threads);
|
||||||
|
|
||||||
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
||||||
@@ -278,12 +298,22 @@ __global__ void block_tile_shfl_xor(T* const out, const unsigned mask) {
|
|||||||
|
|
||||||
template <typename T, size_t tile_size> void BlockTileShflXORTestImpl() {
|
template <typename T, size_t tile_size> void BlockTileShflXORTestImpl() {
|
||||||
DYNAMIC_SECTION("Tile size: " << tile_size) {
|
DYNAMIC_SECTION("Tile size: " << tile_size) {
|
||||||
|
const auto inv_reduction_factor = 1.0 / GetTestReductionFactor();
|
||||||
|
|
||||||
auto blocks = GenerateBlockDimensionsForShuffle();
|
auto blocks = GenerateBlockDimensionsForShuffle();
|
||||||
auto threads = GenerateThreadDimensionsForShuffle();
|
auto threads = GenerateThreadDimensionsForShuffle();
|
||||||
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
||||||
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
||||||
const auto mask = GENERATE(range(static_cast<size_t>(0), tile_size));
|
|
||||||
|
std::vector<size_t> masks;
|
||||||
|
for (double i = 0; i < tile_size - 1; i += inv_reduction_factor) {
|
||||||
|
masks.emplace_back(static_cast<size_t>(std::floor(i)));
|
||||||
|
}
|
||||||
|
masks.emplace_back(tile_size - 1);
|
||||||
|
|
||||||
|
const auto mask = GENERATE_COPY(from_range(masks.begin(), masks.end()));
|
||||||
INFO("Mask: 0x" << std::hex << mask);
|
INFO("Mask: 0x" << std::hex << mask);
|
||||||
|
|
||||||
CPUGrid grid(blocks, threads);
|
CPUGrid grid(blocks, threads);
|
||||||
|
|
||||||
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
const auto alloc_size = grid.thread_count_ * sizeof(T);
|
||||||
|
|||||||
In neuem Issue referenzieren
Einen Benutzer sperren