SWDEV-534689 - Remove Unit_hipLaunchCooperativeKernelMultiDevice_Streams (#209)

[ROCm/hip-tests commit: 90d7eea054]
Этот коммит содержится в:
Brzak, Branislav
2025-06-04 16:22:16 +02:00
коммит произвёл GitHub
родитель f62fdb7380
Коммит d0aa27a704
-371
Просмотреть файл
@@ -130,356 +130,6 @@ __global__ void test_gws(uint* buf, uint buf_size, long* tmp_buf, long* result)
}
}
__global__ void test_coop_kernel(unsigned int loops, long long* array, int fast_gpu) {
cg::multi_grid_group mgrid = cg::this_multi_grid();
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
if (mgrid.grid_rank() == fast_gpu) {
return;
}
for (int i = 0; i < loops; i++) {
long long time_diff = 0;
long long last_clock = clock64();
do {
long long cur_clock = clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while (time_diff < 1000000);
array[rank] += clock64();
}
}
__global__ void test_coop_kernel_gfx11(unsigned int loops, long long* array, int fast_gpu) {
#if HT_AMD
cg::multi_grid_group mgrid = cg::this_multi_grid();
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
if (mgrid.grid_rank() == fast_gpu) {
return;
}
for (int i = 0; i < loops; i++) {
long long time_diff = 0;
long long last_clock = clock_function();
do {
long long cur_clock = clock_function();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while (time_diff < 1000000);
array[rank] += clock_function();
}
#endif
}
__global__ void test_kernel(uint32_t loops, unsigned long long* array) {
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = 0; i < loops; i++) {
long long time_diff = 0;
long long last_clock = clock64();
do {
long long cur_clock = clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while (time_diff < 1000000);
array[rank] += clock64();
}
}
__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array) {
#if HT_AMD
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = 0; i < loops; i++) {
long long time_diff = 0;
long long last_clock = clock_function();
do {
long long cur_clock = clock_function();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while (time_diff < 1000000);
array[rank] += clock_function();
}
#endif
}
static void verify_time(double single_kernel_time, double multi_kernel_time, float low_bound,
float high_bound) {
// Test that multiple kernel times are inside expected boundaries
REQUIRE(multi_kernel_time >= low_bound * single_kernel_time);
REQUIRE(multi_kernel_time <= high_bound * single_kernel_time);
}
void test_multigrid_streams(int device_num) {
uint32_t loops = 2000;
int32_t fast_gpu = -1;
// We will launch enough waves to fill up all of the GPU
int warp_sizes[2];
int num_sms[2];
hipDeviceProp_t device_properties[2];
int warp_size = INT_MAX;
int num_sm = INT_MAX;
for (int dev = 0; dev < (device_num - 1); ++dev) {
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
warp_sizes[i] = device_properties[i].warpSize;
if (warp_sizes[i] < warp_size) {
warp_size = warp_sizes[i];
}
num_sms[i] = device_properties[i].multiProcessorCount;
if (num_sms[i] < num_sm) {
num_sm = num_sms[i];
}
}
// Calculate the device occupancy to know how many blocks can be run.
int max_blocks_per_sm_arr[2];
int max_blocks_per_sm = INT_MAX;
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm_arr[i],
test_kernel_used, warp_size, 0));
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
max_blocks_per_sm = max_blocks_per_sm_arr[i];
}
}
int desired_blocks = 1;
if (desired_blocks > max_blocks_per_sm * num_sm) {
INFO("The requested number of blocks will not fit on the GPU");
REQUIRE(desired_blocks < max_blocks_per_sm * num_sm);
return;
}
// Create the streams we will use in this test
hipStream_t streams[2];
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipStreamCreate(&streams[i]));
}
// Set up data to pass into the kernel
// Alocate the host input buffer, and two device-focused buffers that we
// will use for our test.
unsigned long long* dev_array[2];
for (int i = 0; i < 2; i++) {
int good_size = desired_blocks * warp_size * sizeof(long long);
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), good_size));
HIP_CHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i]));
}
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
/* Launch the kernels ****************************************************/
void* dev_params[2][3];
hipLaunchParams md_params[2];
std::chrono::time_point<std::chrono::system_clock> start_time[2];
std::chrono::time_point<std::chrono::system_clock> end_time[2];
// Test 0: Launching a multi-GPU cooperative kernel
// Both GPUs launch a long cooperative kernel
INFO("GPU " << dev << ": Long Coop Kernel");
INFO("GPU " << (dev + 1) << ": Long Coop Kernel");
auto test_coop_kernel_used = IsGfx11() ? test_coop_kernel_gfx11 : test_coop_kernel;
for (int i = 0; i < 2; i++) {
dev_params[i][0] = reinterpret_cast<void*>(&loops);
dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
md_params[i].func = reinterpret_cast<void*>(test_coop_kernel_used);
md_params[i].gridDim = desired_blocks;
md_params[i].blockDim = warp_size;
md_params[i].sharedMem = 0;
md_params[i].stream = streams[i];
md_params[i].args = dev_params[i];
}
start_time[0] = std::chrono::system_clock::now();
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
end_time[0] = std::chrono::system_clock::now();
std::chrono::duration<double> single_kernel_time = (end_time[0] - start_time[0]);
INFO("A single kernel on both GPUs took: " << single_kernel_time.count() << " seconds");
SECTION("GPU1 - Standard/ Long Coop, GPU2 - Coop/Standard") {
INFO("GPU " << dev << ": Standard/Long Coop");
INFO("GPU " << (dev + 1) << ": Coop/Standard");
fast_gpu = 1;
start_time[1] = std::chrono::system_clock::now();
HIP_CHECK(hipSetDevice(dev));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
loops, dev_array[0]);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
HIP_CHECK(hipSetDevice(dev + 1));
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
loops, dev_array[1]);
HIP_CHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
end_time[1] = std::chrono::system_clock::now();
std::chrono::duration<double> serialized_gpu0_time = (end_time[1] - start_time[1]);
INFO("Serialized set of three kernels with GPU0 being long took: "
<< serialized_gpu0_time.count() << " seconds");
verify_time(single_kernel_time.count(), serialized_gpu0_time.count(), 2.7f, 3.3f);
}
SECTION("GPU1 - Standard/Coop, GPU2 - Long Coop/Standard") {
INFO("GPU " << dev << ": Standard/Coop");
INFO("GPU " << (dev + 1) << ": Long Coop/Standard");
fast_gpu = 0;
start_time[1] = std::chrono::system_clock::now();
HIP_CHECK(hipSetDevice(dev));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
loops, dev_array[0]);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
HIP_CHECK(hipSetDevice(dev + 1));
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
loops, dev_array[1]);
HIP_CHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
end_time[1] = std::chrono::system_clock::now();
std::chrono::duration<double> serialized_gpu1_time = (end_time[1] - start_time[1]);
INFO("Serialized set of three kernels with GPU1 being long took: "
<< serialized_gpu1_time.count() << " seconds");
verify_time(single_kernel_time.count(), serialized_gpu1_time.count(), 2.7f, 3.3f);
}
SECTION(
"GPU1 - Standard/Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap at "
"beginning") {
INFO("GPU " << dev << ": Standard/Coop with multi device no pre sync");
INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre sync");
fast_gpu = 0;
start_time[1] = std::chrono::system_clock::now();
HIP_CHECK(hipSetDevice(dev));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
loops, dev_array[0]);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
hipCooperativeLaunchMultiDeviceNoPreSync));
HIP_CHECK(hipSetDevice(dev + 1));
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
loops, dev_array[1]);
HIP_CHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
end_time[1] = std::chrono::system_clock::now();
std::chrono::duration<double> pre_overlapped_time = (end_time[1] - start_time[1]);
INFO("Multiple kernels with pre-overlap allowed took: " << pre_overlapped_time.count()
<< " seconds");
verify_time(single_kernel_time.count(), pre_overlapped_time.count(), 1.7f, 2.3f);
}
SECTION(
"GPU1 - Standard/Long Coop, GPU2 - Coop/Standard - regular and coop kernel overlap at "
"end") {
INFO("GPU " << dev << ": Standard/Long Coop with multi device no post sync");
INFO("GPU " << (dev + 1) << ": Coop/Standard with multi device no post sync");
fast_gpu = 1;
start_time[1] = std::chrono::system_clock::now();
HIP_CHECK(hipSetDevice(dev));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
loops, dev_array[0]);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
hipCooperativeLaunchMultiDeviceNoPostSync));
HIP_CHECK(hipSetDevice(dev + 1));
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
loops, dev_array[1]);
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
end_time[1] = std::chrono::system_clock::now();
std::chrono::duration<double> post_overlapped_time = (end_time[1] - start_time[1]);
INFO("Multiple kernels with post-overlap allowed took: " << post_overlapped_time.count()
<< " seconds");
verify_time(single_kernel_time.count(), post_overlapped_time.count(), 1.7f, 2.3f);
}
SECTION(
"GPU1 - Standard/Long Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap") {
INFO("GPU " << dev << ": Standard/Long Coop with multi device no pre or post sync");
INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre or post sync");
start_time[1] = std::chrono::system_clock::now();
HIP_CHECK(hipSetDevice(dev));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
loops, dev_array[0]);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(
md_params, 2,
hipCooperativeLaunchMultiDeviceNoPreSync | hipCooperativeLaunchMultiDeviceNoPostSync));
HIP_CHECK(hipSetDevice(dev + 1));
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
loops, dev_array[1]);
HIP_CHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
end_time[1] = std::chrono::system_clock::now();
std::chrono::duration<double> overlapped_time = (end_time[1] - start_time[1]);
INFO("Multiple kernels with overlap allowed took: " << overlapped_time.count() << " seconds");
verify_time(single_kernel_time.count(), overlapped_time.count(), 1.8f, 2.2f);
}
for (int k = 0; k < 2; ++k) {
HIP_CHECK(hipFree(dev_array[k]));
HIP_CHECK(hipStreamDestroy(streams[k]));
}
}
}
TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic") {
constexpr uint num_kernel_args = 4;
@@ -583,24 +233,3 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic") {
free(A_h);
}
TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Streams") {
int device_num = 0;
HIP_CHECK(hipGetDeviceCount(&device_num));
if (device_num < 2) {
HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
return;
}
hipDeviceProp_t device_properties;
for (int i = 0; i < device_num; i++) {
HIP_CHECK(hipGetDeviceProperties(&device_properties, i));
if (!device_properties.cooperativeMultiDeviceLaunch) {
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
return;
}
}
test_multigrid_streams(device_num);
}