SWDEV-440095 - remove the test Unit_hipLaunchCooperativeKernel_Streams
The test checks the pattern of thing "A" should be always faster than
thing "B". Which might not be the case for several reasons and will result
in unreliable, randomly failing test.
Change-Id: I6fa9e2c110f9ef48ef63a720e7a64c54e7f2a72f
[ROCm/hip-tests commit: 5e766685c8]
This commit is contained in:
zatwierdzone przez
Maneesh Gupta
rodzic
a4e764fb69
commit
3e5aa93c49
@@ -234,7 +234,6 @@
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - float",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - double",
|
||||
"=== SWDEV-439004: Below tests failing randomly in CQE staging ===",
|
||||
"Unit_hipLaunchCooperativeKernel_Streams",
|
||||
"Unit_hipGLGetDevices_Positive_Basic",
|
||||
"Unit_hipGLGetDevices_Positive_Parameters",
|
||||
"Unit_hipGLGetDevices_Negative_Parameters",
|
||||
|
||||
+2
-240
@@ -59,238 +59,6 @@ __global__ void test_gws(int* buf, size_t buf_size, long* tmp_buf, long* result)
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void test_kernel(uint32_t loops, unsigned long long* array, long long totalTicks) {
|
||||
cg::thread_block tb = cg::this_thread_block();
|
||||
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
for (int i = 0; i < loops; i++) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = clock64();
|
||||
do {
|
||||
long long cur_clock = clock64();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while (time_diff < totalTicks);
|
||||
tb.sync();
|
||||
array[rank] += clock64();
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array, long long totalTicks) {
|
||||
#if HT_AMD
|
||||
cg::thread_block tb = cg::this_thread_block();
|
||||
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
for (int i = 0; i < loops; i++) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = wall_clock64();
|
||||
do {
|
||||
long long cur_clock = wall_clock64();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while (time_diff < totalTicks);
|
||||
tb.sync();
|
||||
array[rank] += wall_clock64();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void verifyLeastCapacity(T& single_kernel_time, T& double_kernel_time,
|
||||
T& triple_kernel_time) {
|
||||
#if HT_AMD
|
||||
// hipLaunchCooperativeKernel() follows serialization policy on AMD devices
|
||||
// Test that the two cooperative kernels took roughly twice as long as the one
|
||||
REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count());
|
||||
REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count());
|
||||
#else
|
||||
// hipLaunchCooperativeKernel() doesn't follow serialization policy on NV devices
|
||||
// Test that the two cooperative kernels took roughly as long as the one
|
||||
REQUIRE(double_kernel_time.count() >= 0.8 * single_kernel_time.count());
|
||||
REQUIRE(double_kernel_time.count() <= 1.2 * single_kernel_time.count());
|
||||
#endif
|
||||
|
||||
// Test that the three kernels together took roughly as long as the two
|
||||
// cooperative kernels.
|
||||
REQUIRE(triple_kernel_time.count() <= 1.1 * double_kernel_time.count());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void verifyHalfCapacity(T& single_kernel_time, T& double_kernel_time,
|
||||
T& triple_kernel_time) {
|
||||
// Test that the two cooperative kernels took roughly twice as long as the one
|
||||
REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count());
|
||||
REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count());
|
||||
|
||||
// Test that the three kernels together took roughly as long as the two
|
||||
// cooperative kernels.
|
||||
REQUIRE(triple_kernel_time.count() <= 1.1 * double_kernel_time.count());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void verifyFullCapacity(T& single_kernel_time, T& double_kernel_time,
|
||||
T& triple_kernel_time) {
|
||||
// Test that the two cooperative kernels took roughly twice as long as the one
|
||||
REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count());
|
||||
REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count());
|
||||
|
||||
// Test that the three kernels together took roughly 1.6 times as long as the two
|
||||
// cooperative kernels. If the first 2 kernels run very fast, the third
|
||||
// won't share much time with the second kernel.
|
||||
REQUIRE(triple_kernel_time.count() <= 1.7 * double_kernel_time.count());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void verify(int tests, T& single_kernel_time, T& double_kernel_time, T& triple_kernel_time) {
|
||||
switch (tests) {
|
||||
case 0:
|
||||
verifyLeastCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
|
||||
break;
|
||||
case 1:
|
||||
verifyHalfCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
|
||||
break;
|
||||
case 2:
|
||||
verifyFullCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void test_cooperative_streams(int dev, int p_tests) {
|
||||
hipStream_t streams[3];
|
||||
unsigned long long* dev_array[3];
|
||||
int loops = 1000;
|
||||
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
hipDeviceProp_t device_properties;
|
||||
HIP_CHECK(hipGetDeviceProperties(&device_properties, dev));
|
||||
|
||||
// Test whether target device supports cooperative groups
|
||||
if (device_properties.cooperativeLaunch == 0) {
|
||||
std::cout << "Cooperative group support not available in device " << dev << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// We will launch enough waves to fill up all of the GPU
|
||||
int warp_size = device_properties.warpSize;
|
||||
int num_sms = device_properties.multiProcessorCount;
|
||||
long long totalTicks = device_properties.clockRate;
|
||||
int max_blocks_per_sm = 0;
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used,
|
||||
warp_size, 0));
|
||||
int max_active_blocks = max_blocks_per_sm * num_sms;
|
||||
int coop_blocks = 0;
|
||||
int reg_blocks = 0;
|
||||
|
||||
switch (p_tests) {
|
||||
case 0:
|
||||
// 1 block
|
||||
coop_blocks = 1;
|
||||
reg_blocks = 1;
|
||||
break;
|
||||
case 1:
|
||||
// Half capacity
|
||||
// To make sure the second kernel launched by hipLaunchCooperativeKernel
|
||||
// is invoked after the first kernel finished
|
||||
coop_blocks = max_active_blocks / 2 + 1;
|
||||
// To make sure the third kernel launched by hipLaunchKernelGGL is invoked
|
||||
// concurrently with the second kernel
|
||||
reg_blocks = max_active_blocks - coop_blocks;
|
||||
break;
|
||||
case 2:
|
||||
// Full capacity
|
||||
coop_blocks = max_active_blocks;
|
||||
reg_blocks = max_active_blocks;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
HIP_CHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
// Set up data to pass into the kernel
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), warp_size * sizeof(long long)));
|
||||
HIP_CHECK(hipMemsetAsync(dev_array[i], 0, warp_size * sizeof(long long), streams[i]));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
// Launch the kernels
|
||||
void* coop_params[3][3];
|
||||
for (int i = 0; i < 3; i++) {
|
||||
coop_params[i][0] = reinterpret_cast<void*>(&loops);
|
||||
coop_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
|
||||
coop_params[i][2] = reinterpret_cast<void*>(&totalTicks);
|
||||
}
|
||||
|
||||
// We need exclude the the initial launching as it will need time to load code obj.
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), max_active_blocks,
|
||||
warp_size, coop_params[0], 0, streams[0]));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
// Launching a single cooperative kernel
|
||||
auto single_start = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), max_active_blocks,
|
||||
warp_size, coop_params[0], 0, streams[0]));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
auto single_end = std::chrono::system_clock::now();
|
||||
|
||||
std::chrono::duration<double> single_kernel_time = (single_end - single_start);
|
||||
|
||||
// Launching 2 cooperative kernels to different streams
|
||||
auto double_start = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
|
||||
warp_size, coop_params[0], 0, streams[0]));
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
|
||||
warp_size, coop_params[1], 0, streams[1]));
|
||||
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
auto double_end = std::chrono::system_clock::now();
|
||||
|
||||
// Launching 2 cooperative kernels and 1 normal kernel
|
||||
std::chrono::duration<double> double_kernel_time = (double_end - double_start);
|
||||
|
||||
auto triple_start = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
|
||||
warp_size, coop_params[0], 0, streams[0]));
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
|
||||
warp_size, coop_params[1], 0, streams[1]));
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(reg_blocks), dim3(warp_size), 0, streams[2], loops,
|
||||
dev_array[2], totalTicks);
|
||||
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
auto triple_end = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> triple_kernel_time = (triple_end - triple_start);
|
||||
|
||||
for (int k = 0; k < 3; ++k) {
|
||||
HIP_CHECK(hipFree(dev_array[k]));
|
||||
HIP_CHECK(hipStreamDestroy(streams[k]));
|
||||
}
|
||||
|
||||
|
||||
INFO("A single kernel took : " << single_kernel_time.count() << " seconds");
|
||||
INFO("Two cooperative kernels took: " << double_kernel_time.count() << " seconds");
|
||||
INFO("Two coop kernels and a third regular kernel took: " << triple_kernel_time.count()
|
||||
<< " seconds");
|
||||
|
||||
verify(p_tests, single_kernel_time, double_kernel_time, triple_kernel_time);
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
|
||||
// Use default device for validating the test
|
||||
int device;
|
||||
@@ -347,7 +115,8 @@ TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
|
||||
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
REQUIRE(((unsigned long long)*C_d) == (((unsigned long long)(kBufferLen) * (kBufferLen - 1)) / 2));
|
||||
REQUIRE(((unsigned long long)*C_d) ==
|
||||
(((unsigned long long)(kBufferLen) * (kBufferLen - 1)) / 2));
|
||||
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
HIP_CHECK(hipHostFree(C_d));
|
||||
@@ -355,10 +124,3 @@ TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
|
||||
HIP_CHECK(hipFree(A_d));
|
||||
free(A_h);
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipLaunchCooperativeKernel_Streams") {
|
||||
const auto device = GENERATE(range(0, HipTest::getDeviceCount()));
|
||||
int p_tests = GENERATE(0, 1, 2);
|
||||
|
||||
test_cooperative_streams(device, p_tests);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user