diff --git a/projects/hip-tests/catch/hipTestMain/config/config_amd_linux b/projects/hip-tests/catch/hipTestMain/config/config_amd_linux index 4d065782ce..731cf0549c 100644 --- a/projects/hip-tests/catch/hipTestMain/config/config_amd_linux +++ b/projects/hip-tests/catch/hipTestMain/config/config_amd_linux @@ -234,7 +234,6 @@ "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - float", "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - double", "=== SWDEV-439004: Below tests failing randomly in CQE staging ===", - "Unit_hipLaunchCooperativeKernel_Streams", "Unit_hipGLGetDevices_Positive_Basic", "Unit_hipGLGetDevices_Positive_Parameters", "Unit_hipGLGetDevices_Negative_Parameters", diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernel_old.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernel_old.cc index 3c4be35662..a8f41de8c9 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernel_old.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernel_old.cc @@ -59,238 +59,6 @@ __global__ void test_gws(int* buf, size_t buf_size, long* tmp_buf, long* result) } } -__global__ void test_kernel(uint32_t loops, unsigned long long* array, long long totalTicks) { - cg::thread_block tb = cg::this_thread_block(); - unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; - - for (int i = 0; i < loops; i++) { - long long time_diff = 0; - long long last_clock = clock64(); - do { - long long cur_clock = clock64(); - if (cur_clock > last_clock) { - time_diff += (cur_clock - last_clock); - } - // If it rolls over, we don't know how much to add to catch up. - // So just ignore those slipped cycles. - last_clock = cur_clock; - } while (time_diff < totalTicks); - tb.sync(); - array[rank] += clock64(); - } -} - -__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array, long long totalTicks) { -#if HT_AMD - cg::thread_block tb = cg::this_thread_block(); - unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; - - for (int i = 0; i < loops; i++) { - long long time_diff = 0; - long long last_clock = wall_clock64(); - do { - long long cur_clock = wall_clock64(); - if (cur_clock > last_clock) { - time_diff += (cur_clock - last_clock); - } - // If it rolls over, we don't know how much to add to catch up. - // So just ignore those slipped cycles. - last_clock = cur_clock; - } while (time_diff < totalTicks); - tb.sync(); - array[rank] += wall_clock64(); - } -#endif -} - -template -static void verifyLeastCapacity(T& single_kernel_time, T& double_kernel_time, - T& triple_kernel_time) { -#if HT_AMD - // hipLaunchCooperativeKernel() follows serialization policy on AMD devices - // Test that the two cooperative kernels took roughly twice as long as the one - REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count()); - REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count()); -#else - // hipLaunchCooperativeKernel() doesn't follow serialization policy on NV devices - // Test that the two cooperative kernels took roughly as long as the one - REQUIRE(double_kernel_time.count() >= 0.8 * single_kernel_time.count()); - REQUIRE(double_kernel_time.count() <= 1.2 * single_kernel_time.count()); -#endif - - // Test that the three kernels together took roughly as long as the two - // cooperative kernels. - REQUIRE(triple_kernel_time.count() <= 1.1 * double_kernel_time.count()); -} - -template -static void verifyHalfCapacity(T& single_kernel_time, T& double_kernel_time, - T& triple_kernel_time) { - // Test that the two cooperative kernels took roughly twice as long as the one - REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count()); - REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count()); - - // Test that the three kernels together took roughly as long as the two - // cooperative kernels. - REQUIRE(triple_kernel_time.count() <= 1.1 * double_kernel_time.count()); -} - -template -static void verifyFullCapacity(T& single_kernel_time, T& double_kernel_time, - T& triple_kernel_time) { - // Test that the two cooperative kernels took roughly twice as long as the one - REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count()); - REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count()); - - // Test that the three kernels together took roughly 1.6 times as long as the two - // cooperative kernels. If the first 2 kernels run very fast, the third - // won't share much time with the second kernel. - REQUIRE(triple_kernel_time.count() <= 1.7 * double_kernel_time.count()); -} - -template -static void verify(int tests, T& single_kernel_time, T& double_kernel_time, T& triple_kernel_time) { - switch (tests) { - case 0: - verifyLeastCapacity(single_kernel_time, double_kernel_time, triple_kernel_time); - break; - case 1: - verifyHalfCapacity(single_kernel_time, double_kernel_time, triple_kernel_time); - break; - case 2: - verifyFullCapacity(single_kernel_time, double_kernel_time, triple_kernel_time); - break; - default: - break; - } -} - -static void test_cooperative_streams(int dev, int p_tests) { - hipStream_t streams[3]; - unsigned long long* dev_array[3]; - int loops = 1000; - - HIP_CHECK(hipSetDevice(dev)); - hipDeviceProp_t device_properties; - HIP_CHECK(hipGetDeviceProperties(&device_properties, dev)); - - // Test whether target device supports cooperative groups - if (device_properties.cooperativeLaunch == 0) { - std::cout << "Cooperative group support not available in device " << dev << std::endl; - return; - } - - // We will launch enough waves to fill up all of the GPU - int warp_size = device_properties.warpSize; - int num_sms = device_properties.multiProcessorCount; - long long totalTicks = device_properties.clockRate; - int max_blocks_per_sm = 0; - // Calculate the device occupancy to know how many blocks can be run. - auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; - HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used, - warp_size, 0)); - int max_active_blocks = max_blocks_per_sm * num_sms; - int coop_blocks = 0; - int reg_blocks = 0; - - switch (p_tests) { - case 0: - // 1 block - coop_blocks = 1; - reg_blocks = 1; - break; - case 1: - // Half capacity - // To make sure the second kernel launched by hipLaunchCooperativeKernel - // is invoked after the first kernel finished - coop_blocks = max_active_blocks / 2 + 1; - // To make sure the third kernel launched by hipLaunchKernelGGL is invoked - // concurrently with the second kernel - reg_blocks = max_active_blocks - coop_blocks; - break; - case 2: - // Full capacity - coop_blocks = max_active_blocks; - reg_blocks = max_active_blocks; - break; - default: - break; - } - - for (int i = 0; i < 3; i++) { - HIP_CHECK(hipStreamCreate(&streams[i])); - } - - // Set up data to pass into the kernel - - for (int i = 0; i < 3; i++) { - HIP_CHECK(hipMalloc(reinterpret_cast(&dev_array[i]), warp_size * sizeof(long long))); - HIP_CHECK(hipMemsetAsync(dev_array[i], 0, warp_size * sizeof(long long), streams[i])); - } - - HIP_CHECK(hipDeviceSynchronize()); - - // Launch the kernels - void* coop_params[3][3]; - for (int i = 0; i < 3; i++) { - coop_params[i][0] = reinterpret_cast(&loops); - coop_params[i][1] = reinterpret_cast(&dev_array[i]); - coop_params[i][2] = reinterpret_cast(&totalTicks); - } - - // We need exclude the the initial launching as it will need time to load code obj. - HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), max_active_blocks, - warp_size, coop_params[0], 0, streams[0])); - HIP_CHECK(hipDeviceSynchronize()); - - // Launching a single cooperative kernel - auto single_start = std::chrono::system_clock::now(); - HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), max_active_blocks, - warp_size, coop_params[0], 0, streams[0])); - HIP_CHECK(hipDeviceSynchronize()); - auto single_end = std::chrono::system_clock::now(); - - std::chrono::duration single_kernel_time = (single_end - single_start); - - // Launching 2 cooperative kernels to different streams - auto double_start = std::chrono::system_clock::now(); - HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), coop_blocks, - warp_size, coop_params[0], 0, streams[0])); - HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), coop_blocks, - warp_size, coop_params[1], 0, streams[1])); - - HIP_CHECK(hipDeviceSynchronize()); - auto double_end = std::chrono::system_clock::now(); - - // Launching 2 cooperative kernels and 1 normal kernel - std::chrono::duration double_kernel_time = (double_end - double_start); - - auto triple_start = std::chrono::system_clock::now(); - HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), coop_blocks, - warp_size, coop_params[0], 0, streams[0])); - HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), coop_blocks, - warp_size, coop_params[1], 0, streams[1])); - hipLaunchKernelGGL(test_kernel_used, dim3(reg_blocks), dim3(warp_size), 0, streams[2], loops, - dev_array[2], totalTicks); - - HIP_CHECK(hipDeviceSynchronize()); - auto triple_end = std::chrono::system_clock::now(); - std::chrono::duration triple_kernel_time = (triple_end - triple_start); - - for (int k = 0; k < 3; ++k) { - HIP_CHECK(hipFree(dev_array[k])); - HIP_CHECK(hipStreamDestroy(streams[k])); - } - - - INFO("A single kernel took : " << single_kernel_time.count() << " seconds"); - INFO("Two cooperative kernels took: " << double_kernel_time.count() << " seconds"); - INFO("Two coop kernels and a third regular kernel took: " << triple_kernel_time.count() - << " seconds"); - - verify(p_tests, single_kernel_time, double_kernel_time, triple_kernel_time); -} - TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") { // Use default device for validating the test int device; @@ -347,7 +115,8 @@ TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") { HIP_CHECK(hipStreamSynchronize(stream)); - REQUIRE(((unsigned long long)*C_d) == (((unsigned long long)(kBufferLen) * (kBufferLen - 1)) / 2)); + REQUIRE(((unsigned long long)*C_d) == + (((unsigned long long)(kBufferLen) * (kBufferLen - 1)) / 2)); HIP_CHECK(hipStreamDestroy(stream)); HIP_CHECK(hipHostFree(C_d)); @@ -355,10 +124,3 @@ TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") { HIP_CHECK(hipFree(A_d)); free(A_h); } - -TEST_CASE("Unit_hipLaunchCooperativeKernel_Streams") { - const auto device = GENERATE(range(0, HipTest::getDeviceCount())); - int p_tests = GENERATE(0, 1, 2); - - test_cooperative_streams(device, p_tests); -}