SWDEV-440095 - remove the test Unit_hipLaunchCooperativeKernel_Streams

The test checks the pattern of thing "A" should be always faster than
thing "B". Which might not be the case for several reasons and will result
in unreliable, randomly failing test.

Change-Id: I6fa9e2c110f9ef48ef63a720e7a64c54e7f2a72f


[ROCm/hip-tests commit: 5e766685c8]
This commit is contained in:
Jatin Chaudhary
2024-01-19 17:36:22 +00:00
zatwierdzone przez Maneesh Gupta
rodzic a4e764fb69
commit 3e5aa93c49
2 zmienionych plików z 2 dodań i 241 usunięć
@@ -234,7 +234,6 @@
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - float",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - double",
"=== SWDEV-439004: Below tests failing randomly in CQE staging ===",
"Unit_hipLaunchCooperativeKernel_Streams",
"Unit_hipGLGetDevices_Positive_Basic",
"Unit_hipGLGetDevices_Positive_Parameters",
"Unit_hipGLGetDevices_Negative_Parameters",
@@ -59,238 +59,6 @@ __global__ void test_gws(int* buf, size_t buf_size, long* tmp_buf, long* result)
}
}
__global__ void test_kernel(uint32_t loops, unsigned long long* array, long long totalTicks) {
cg::thread_block tb = cg::this_thread_block();
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = 0; i < loops; i++) {
long long time_diff = 0;
long long last_clock = clock64();
do {
long long cur_clock = clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while (time_diff < totalTicks);
tb.sync();
array[rank] += clock64();
}
}
__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array, long long totalTicks) {
#if HT_AMD
cg::thread_block tb = cg::this_thread_block();
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = 0; i < loops; i++) {
long long time_diff = 0;
long long last_clock = wall_clock64();
do {
long long cur_clock = wall_clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while (time_diff < totalTicks);
tb.sync();
array[rank] += wall_clock64();
}
#endif
}
template <typename T>
static void verifyLeastCapacity(T& single_kernel_time, T& double_kernel_time,
T& triple_kernel_time) {
#if HT_AMD
// hipLaunchCooperativeKernel() follows serialization policy on AMD devices
// Test that the two cooperative kernels took roughly twice as long as the one
REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count());
REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count());
#else
// hipLaunchCooperativeKernel() doesn't follow serialization policy on NV devices
// Test that the two cooperative kernels took roughly as long as the one
REQUIRE(double_kernel_time.count() >= 0.8 * single_kernel_time.count());
REQUIRE(double_kernel_time.count() <= 1.2 * single_kernel_time.count());
#endif
// Test that the three kernels together took roughly as long as the two
// cooperative kernels.
REQUIRE(triple_kernel_time.count() <= 1.1 * double_kernel_time.count());
}
template <typename T>
static void verifyHalfCapacity(T& single_kernel_time, T& double_kernel_time,
T& triple_kernel_time) {
// Test that the two cooperative kernels took roughly twice as long as the one
REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count());
REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count());
// Test that the three kernels together took roughly as long as the two
// cooperative kernels.
REQUIRE(triple_kernel_time.count() <= 1.1 * double_kernel_time.count());
}
template <typename T>
static void verifyFullCapacity(T& single_kernel_time, T& double_kernel_time,
T& triple_kernel_time) {
// Test that the two cooperative kernels took roughly twice as long as the one
REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count());
REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count());
// Test that the three kernels together took roughly 1.6 times as long as the two
// cooperative kernels. If the first 2 kernels run very fast, the third
// won't share much time with the second kernel.
REQUIRE(triple_kernel_time.count() <= 1.7 * double_kernel_time.count());
}
template <typename T>
static void verify(int tests, T& single_kernel_time, T& double_kernel_time, T& triple_kernel_time) {
switch (tests) {
case 0:
verifyLeastCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
break;
case 1:
verifyHalfCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
break;
case 2:
verifyFullCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
break;
default:
break;
}
}
static void test_cooperative_streams(int dev, int p_tests) {
hipStream_t streams[3];
unsigned long long* dev_array[3];
int loops = 1000;
HIP_CHECK(hipSetDevice(dev));
hipDeviceProp_t device_properties;
HIP_CHECK(hipGetDeviceProperties(&device_properties, dev));
// Test whether target device supports cooperative groups
if (device_properties.cooperativeLaunch == 0) {
std::cout << "Cooperative group support not available in device " << dev << std::endl;
return;
}
// We will launch enough waves to fill up all of the GPU
int warp_size = device_properties.warpSize;
int num_sms = device_properties.multiProcessorCount;
long long totalTicks = device_properties.clockRate;
int max_blocks_per_sm = 0;
// Calculate the device occupancy to know how many blocks can be run.
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used,
warp_size, 0));
int max_active_blocks = max_blocks_per_sm * num_sms;
int coop_blocks = 0;
int reg_blocks = 0;
switch (p_tests) {
case 0:
// 1 block
coop_blocks = 1;
reg_blocks = 1;
break;
case 1:
// Half capacity
// To make sure the second kernel launched by hipLaunchCooperativeKernel
// is invoked after the first kernel finished
coop_blocks = max_active_blocks / 2 + 1;
// To make sure the third kernel launched by hipLaunchKernelGGL is invoked
// concurrently with the second kernel
reg_blocks = max_active_blocks - coop_blocks;
break;
case 2:
// Full capacity
coop_blocks = max_active_blocks;
reg_blocks = max_active_blocks;
break;
default:
break;
}
for (int i = 0; i < 3; i++) {
HIP_CHECK(hipStreamCreate(&streams[i]));
}
// Set up data to pass into the kernel
for (int i = 0; i < 3; i++) {
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), warp_size * sizeof(long long)));
HIP_CHECK(hipMemsetAsync(dev_array[i], 0, warp_size * sizeof(long long), streams[i]));
}
HIP_CHECK(hipDeviceSynchronize());
// Launch the kernels
void* coop_params[3][3];
for (int i = 0; i < 3; i++) {
coop_params[i][0] = reinterpret_cast<void*>(&loops);
coop_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
coop_params[i][2] = reinterpret_cast<void*>(&totalTicks);
}
// We need exclude the the initial launching as it will need time to load code obj.
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), max_active_blocks,
warp_size, coop_params[0], 0, streams[0]));
HIP_CHECK(hipDeviceSynchronize());
// Launching a single cooperative kernel
auto single_start = std::chrono::system_clock::now();
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), max_active_blocks,
warp_size, coop_params[0], 0, streams[0]));
HIP_CHECK(hipDeviceSynchronize());
auto single_end = std::chrono::system_clock::now();
std::chrono::duration<double> single_kernel_time = (single_end - single_start);
// Launching 2 cooperative kernels to different streams
auto double_start = std::chrono::system_clock::now();
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
warp_size, coop_params[0], 0, streams[0]));
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
warp_size, coop_params[1], 0, streams[1]));
HIP_CHECK(hipDeviceSynchronize());
auto double_end = std::chrono::system_clock::now();
// Launching 2 cooperative kernels and 1 normal kernel
std::chrono::duration<double> double_kernel_time = (double_end - double_start);
auto triple_start = std::chrono::system_clock::now();
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
warp_size, coop_params[0], 0, streams[0]));
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
warp_size, coop_params[1], 0, streams[1]));
hipLaunchKernelGGL(test_kernel_used, dim3(reg_blocks), dim3(warp_size), 0, streams[2], loops,
dev_array[2], totalTicks);
HIP_CHECK(hipDeviceSynchronize());
auto triple_end = std::chrono::system_clock::now();
std::chrono::duration<double> triple_kernel_time = (triple_end - triple_start);
for (int k = 0; k < 3; ++k) {
HIP_CHECK(hipFree(dev_array[k]));
HIP_CHECK(hipStreamDestroy(streams[k]));
}
INFO("A single kernel took : " << single_kernel_time.count() << " seconds");
INFO("Two cooperative kernels took: " << double_kernel_time.count() << " seconds");
INFO("Two coop kernels and a third regular kernel took: " << triple_kernel_time.count()
<< " seconds");
verify(p_tests, single_kernel_time, double_kernel_time, triple_kernel_time);
}
TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
// Use default device for validating the test
int device;
@@ -347,7 +115,8 @@ TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
HIP_CHECK(hipStreamSynchronize(stream));
REQUIRE(((unsigned long long)*C_d) == (((unsigned long long)(kBufferLen) * (kBufferLen - 1)) / 2));
REQUIRE(((unsigned long long)*C_d) ==
(((unsigned long long)(kBufferLen) * (kBufferLen - 1)) / 2));
HIP_CHECK(hipStreamDestroy(stream));
HIP_CHECK(hipHostFree(C_d));
@@ -355,10 +124,3 @@ TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
HIP_CHECK(hipFree(A_d));
free(A_h);
}
TEST_CASE("Unit_hipLaunchCooperativeKernel_Streams") {
const auto device = GENERATE(range(0, HipTest::getDeviceCount()));
int p_tests = GENERATE(0, 1, 2);
test_cooperative_streams(device, p_tests);
}