diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernelMultiDevice_old.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernelMultiDevice_old.cc index afc63c9de6..6a6e4d41b4 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernelMultiDevice_old.cc +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernelMultiDevice_old.cc @@ -130,356 +130,6 @@ __global__ void test_gws(uint* buf, uint buf_size, long* tmp_buf, long* result) } } -__global__ void test_coop_kernel(unsigned int loops, long long* array, int fast_gpu) { - cg::multi_grid_group mgrid = cg::this_multi_grid(); - unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; - - if (mgrid.grid_rank() == fast_gpu) { - return; - } - - for (int i = 0; i < loops; i++) { - long long time_diff = 0; - long long last_clock = clock64(); - do { - long long cur_clock = clock64(); - if (cur_clock > last_clock) { - time_diff += (cur_clock - last_clock); - } - // If it rolls over, we don't know how much to add to catch up. - // So just ignore those slipped cycles. - last_clock = cur_clock; - } while (time_diff < 1000000); - array[rank] += clock64(); - } -} - -__global__ void test_coop_kernel_gfx11(unsigned int loops, long long* array, int fast_gpu) { -#if HT_AMD - cg::multi_grid_group mgrid = cg::this_multi_grid(); - unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; - - if (mgrid.grid_rank() == fast_gpu) { - return; - } - - for (int i = 0; i < loops; i++) { - long long time_diff = 0; - long long last_clock = clock_function(); - do { - long long cur_clock = clock_function(); - if (cur_clock > last_clock) { - time_diff += (cur_clock - last_clock); - } - // If it rolls over, we don't know how much to add to catch up. - // So just ignore those slipped cycles. - last_clock = cur_clock; - } while (time_diff < 1000000); - array[rank] += clock_function(); - } -#endif -} - -__global__ void test_kernel(uint32_t loops, unsigned long long* array) { - unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; - - for (int i = 0; i < loops; i++) { - long long time_diff = 0; - long long last_clock = clock64(); - do { - long long cur_clock = clock64(); - if (cur_clock > last_clock) { - time_diff += (cur_clock - last_clock); - } - // If it rolls over, we don't know how much to add to catch up. - // So just ignore those slipped cycles. - last_clock = cur_clock; - } while (time_diff < 1000000); - array[rank] += clock64(); - } -} - -__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array) { -#if HT_AMD - unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; - - for (int i = 0; i < loops; i++) { - long long time_diff = 0; - long long last_clock = clock_function(); - do { - long long cur_clock = clock_function(); - if (cur_clock > last_clock) { - time_diff += (cur_clock - last_clock); - } - // If it rolls over, we don't know how much to add to catch up. - // So just ignore those slipped cycles. - last_clock = cur_clock; - } while (time_diff < 1000000); - array[rank] += clock_function(); - } -#endif -} - -static void verify_time(double single_kernel_time, double multi_kernel_time, float low_bound, - float high_bound) { - // Test that multiple kernel times are inside expected boundaries - REQUIRE(multi_kernel_time >= low_bound * single_kernel_time); - REQUIRE(multi_kernel_time <= high_bound * single_kernel_time); -} - -void test_multigrid_streams(int device_num) { - uint32_t loops = 2000; - int32_t fast_gpu = -1; - - // We will launch enough waves to fill up all of the GPU - int warp_sizes[2]; - int num_sms[2]; - hipDeviceProp_t device_properties[2]; - int warp_size = INT_MAX; - int num_sm = INT_MAX; - for (int dev = 0; dev < (device_num - 1); ++dev) { - for (int i = 0; i < 2; i++) { - HIP_CHECK(hipGetDeviceProperties(&device_properties[i], (dev + i))); - warp_sizes[i] = device_properties[i].warpSize; - if (warp_sizes[i] < warp_size) { - warp_size = warp_sizes[i]; - } - num_sms[i] = device_properties[i].multiProcessorCount; - if (num_sms[i] < num_sm) { - num_sm = num_sms[i]; - } - } - - // Calculate the device occupancy to know how many blocks can be run. - int max_blocks_per_sm_arr[2]; - int max_blocks_per_sm = INT_MAX; - for (int i = 0; i < 2; i++) { - HIP_CHECK(hipSetDevice(dev + i)); - auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; - HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm_arr[i], - test_kernel_used, warp_size, 0)); - if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) { - max_blocks_per_sm = max_blocks_per_sm_arr[i]; - } - } - int desired_blocks = 1; - - if (desired_blocks > max_blocks_per_sm * num_sm) { - INFO("The requested number of blocks will not fit on the GPU"); - REQUIRE(desired_blocks < max_blocks_per_sm * num_sm); - return; - } - - // Create the streams we will use in this test - hipStream_t streams[2]; - for (int i = 0; i < 2; i++) { - HIP_CHECK(hipSetDevice(dev + i)); - HIP_CHECK(hipStreamCreate(&streams[i])); - } - - // Set up data to pass into the kernel - // Alocate the host input buffer, and two device-focused buffers that we - // will use for our test. - unsigned long long* dev_array[2]; - for (int i = 0; i < 2; i++) { - int good_size = desired_blocks * warp_size * sizeof(long long); - HIP_CHECK(hipSetDevice(dev + i)); - HIP_CHECK(hipMalloc(reinterpret_cast(&dev_array[i]), good_size)); - HIP_CHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i])); - } - for (int i = 0; i < 2; i++) { - HIP_CHECK(hipSetDevice(dev + i)); - HIP_CHECK(hipDeviceSynchronize()); - } - - /* Launch the kernels ****************************************************/ - void* dev_params[2][3]; - hipLaunchParams md_params[2]; - std::chrono::time_point start_time[2]; - std::chrono::time_point end_time[2]; - - // Test 0: Launching a multi-GPU cooperative kernel - // Both GPUs launch a long cooperative kernel - INFO("GPU " << dev << ": Long Coop Kernel"); - INFO("GPU " << (dev + 1) << ": Long Coop Kernel"); - - auto test_coop_kernel_used = IsGfx11() ? test_coop_kernel_gfx11 : test_coop_kernel; - for (int i = 0; i < 2; i++) { - dev_params[i][0] = reinterpret_cast(&loops); - dev_params[i][1] = reinterpret_cast(&dev_array[i]); - dev_params[i][2] = reinterpret_cast(&fast_gpu); - md_params[i].func = reinterpret_cast(test_coop_kernel_used); - md_params[i].gridDim = desired_blocks; - md_params[i].blockDim = warp_size; - md_params[i].sharedMem = 0; - md_params[i].stream = streams[i]; - md_params[i].args = dev_params[i]; - } - - start_time[0] = std::chrono::system_clock::now(); - HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); - for (int i = 0; i < 2; i++) { - HIP_CHECK(hipSetDevice(dev + i)); - HIP_CHECK(hipDeviceSynchronize()); - } - end_time[0] = std::chrono::system_clock::now(); - - std::chrono::duration single_kernel_time = (end_time[0] - start_time[0]); - INFO("A single kernel on both GPUs took: " << single_kernel_time.count() << " seconds"); - - SECTION("GPU1 - Standard/ Long Coop, GPU2 - Coop/Standard") { - INFO("GPU " << dev << ": Standard/Long Coop"); - INFO("GPU " << (dev + 1) << ": Coop/Standard"); - fast_gpu = 1; - start_time[1] = std::chrono::system_clock::now(); - HIP_CHECK(hipSetDevice(dev)); - auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; - hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], - loops, dev_array[0]); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); - HIP_CHECK(hipSetDevice(dev + 1)); - test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; - hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], - loops, dev_array[1]); - HIP_CHECK(hipGetLastError()); - for (int i = 0; i < 2; i++) { - HIP_CHECK(hipSetDevice(dev + i)); - HIP_CHECK(hipDeviceSynchronize()); - } - end_time[1] = std::chrono::system_clock::now(); - std::chrono::duration serialized_gpu0_time = (end_time[1] - start_time[1]); - INFO("Serialized set of three kernels with GPU0 being long took: " - << serialized_gpu0_time.count() << " seconds"); - - verify_time(single_kernel_time.count(), serialized_gpu0_time.count(), 2.7f, 3.3f); - } - - SECTION("GPU1 - Standard/Coop, GPU2 - Long Coop/Standard") { - INFO("GPU " << dev << ": Standard/Coop"); - INFO("GPU " << (dev + 1) << ": Long Coop/Standard"); - fast_gpu = 0; - start_time[1] = std::chrono::system_clock::now(); - HIP_CHECK(hipSetDevice(dev)); - auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; - hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], - loops, dev_array[0]); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); - HIP_CHECK(hipSetDevice(dev + 1)); - test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; - hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], - loops, dev_array[1]); - HIP_CHECK(hipGetLastError()); - for (int i = 0; i < 2; i++) { - HIP_CHECK(hipSetDevice(dev + i)); - HIP_CHECK(hipDeviceSynchronize()); - } - end_time[1] = std::chrono::system_clock::now(); - std::chrono::duration serialized_gpu1_time = (end_time[1] - start_time[1]); - INFO("Serialized set of three kernels with GPU1 being long took: " - << serialized_gpu1_time.count() << " seconds"); - - verify_time(single_kernel_time.count(), serialized_gpu1_time.count(), 2.7f, 3.3f); - } - - SECTION( - "GPU1 - Standard/Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap at " - "beginning") { - INFO("GPU " << dev << ": Standard/Coop with multi device no pre sync"); - INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre sync"); - fast_gpu = 0; - start_time[1] = std::chrono::system_clock::now(); - HIP_CHECK(hipSetDevice(dev)); - auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; - hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], - loops, dev_array[0]); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, - hipCooperativeLaunchMultiDeviceNoPreSync)); - HIP_CHECK(hipSetDevice(dev + 1)); - test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; - hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], - loops, dev_array[1]); - HIP_CHECK(hipGetLastError()); - for (int i = 0; i < 2; i++) { - HIP_CHECK(hipSetDevice(dev + i)); - HIP_CHECK(hipDeviceSynchronize()); - } - end_time[1] = std::chrono::system_clock::now(); - std::chrono::duration pre_overlapped_time = (end_time[1] - start_time[1]); - INFO("Multiple kernels with pre-overlap allowed took: " << pre_overlapped_time.count() - << " seconds"); - - verify_time(single_kernel_time.count(), pre_overlapped_time.count(), 1.7f, 2.3f); - } - - SECTION( - "GPU1 - Standard/Long Coop, GPU2 - Coop/Standard - regular and coop kernel overlap at " - "end") { - INFO("GPU " << dev << ": Standard/Long Coop with multi device no post sync"); - INFO("GPU " << (dev + 1) << ": Coop/Standard with multi device no post sync"); - fast_gpu = 1; - start_time[1] = std::chrono::system_clock::now(); - HIP_CHECK(hipSetDevice(dev)); - auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; - hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], - loops, dev_array[0]); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, - hipCooperativeLaunchMultiDeviceNoPostSync)); - HIP_CHECK(hipSetDevice(dev + 1)); - test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; - hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], - loops, dev_array[1]); - for (int i = 0; i < 2; i++) { - HIP_CHECK(hipSetDevice(dev + i)); - HIP_CHECK(hipDeviceSynchronize()); - } - end_time[1] = std::chrono::system_clock::now(); - std::chrono::duration post_overlapped_time = (end_time[1] - start_time[1]); - INFO("Multiple kernels with post-overlap allowed took: " << post_overlapped_time.count() - << " seconds"); - - verify_time(single_kernel_time.count(), post_overlapped_time.count(), 1.7f, 2.3f); - } - - SECTION( - "GPU1 - Standard/Long Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap") { - INFO("GPU " << dev << ": Standard/Long Coop with multi device no pre or post sync"); - INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre or post sync"); - start_time[1] = std::chrono::system_clock::now(); - HIP_CHECK(hipSetDevice(dev)); - auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; - hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], - loops, dev_array[0]); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipLaunchCooperativeKernelMultiDevice( - md_params, 2, - hipCooperativeLaunchMultiDeviceNoPreSync | hipCooperativeLaunchMultiDeviceNoPostSync)); - HIP_CHECK(hipSetDevice(dev + 1)); - test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; - hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], - loops, dev_array[1]); - HIP_CHECK(hipGetLastError()); - for (int i = 0; i < 2; i++) { - HIP_CHECK(hipSetDevice(dev + i)); - HIP_CHECK(hipDeviceSynchronize()); - } - end_time[1] = std::chrono::system_clock::now(); - std::chrono::duration overlapped_time = (end_time[1] - start_time[1]); - INFO("Multiple kernels with overlap allowed took: " << overlapped_time.count() << " seconds"); - - verify_time(single_kernel_time.count(), overlapped_time.count(), 1.8f, 2.2f); - } - - for (int k = 0; k < 2; ++k) { - HIP_CHECK(hipFree(dev_array[k])); - HIP_CHECK(hipStreamDestroy(streams[k])); - } - } -} - TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic") { constexpr uint num_kernel_args = 4; @@ -583,24 +233,3 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic") { free(A_h); } - -TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Streams") { - int device_num = 0; - HIP_CHECK(hipGetDeviceCount(&device_num)); - - if (device_num < 2) { - HipTest::HIP_SKIP_TEST("Skipping because devices < 2"); - return; - } - - hipDeviceProp_t device_properties; - for (int i = 0; i < device_num; i++) { - HIP_CHECK(hipGetDeviceProperties(&device_properties, i)); - if (!device_properties.cooperativeMultiDeviceLaunch) { - HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); - return; - } - } - - test_multigrid_streams(device_num); -}