SWDEV-534689 - Remove Unit_hipLaunchCooperativeKernelMultiDevice_Streams (#209)
[ROCm/hip-tests commit: 90d7eea054]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
f62fdb7380
Коммит
d0aa27a704
-371
@@ -130,356 +130,6 @@ __global__ void test_gws(uint* buf, uint buf_size, long* tmp_buf, long* result)
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void test_coop_kernel(unsigned int loops, long long* array, int fast_gpu) {
|
||||
cg::multi_grid_group mgrid = cg::this_multi_grid();
|
||||
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (mgrid.grid_rank() == fast_gpu) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < loops; i++) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = clock64();
|
||||
do {
|
||||
long long cur_clock = clock64();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while (time_diff < 1000000);
|
||||
array[rank] += clock64();
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void test_coop_kernel_gfx11(unsigned int loops, long long* array, int fast_gpu) {
|
||||
#if HT_AMD
|
||||
cg::multi_grid_group mgrid = cg::this_multi_grid();
|
||||
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (mgrid.grid_rank() == fast_gpu) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < loops; i++) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = clock_function();
|
||||
do {
|
||||
long long cur_clock = clock_function();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while (time_diff < 1000000);
|
||||
array[rank] += clock_function();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
__global__ void test_kernel(uint32_t loops, unsigned long long* array) {
|
||||
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
for (int i = 0; i < loops; i++) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = clock64();
|
||||
do {
|
||||
long long cur_clock = clock64();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while (time_diff < 1000000);
|
||||
array[rank] += clock64();
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array) {
|
||||
#if HT_AMD
|
||||
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
for (int i = 0; i < loops; i++) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = clock_function();
|
||||
do {
|
||||
long long cur_clock = clock_function();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while (time_diff < 1000000);
|
||||
array[rank] += clock_function();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void verify_time(double single_kernel_time, double multi_kernel_time, float low_bound,
|
||||
float high_bound) {
|
||||
// Test that multiple kernel times are inside expected boundaries
|
||||
REQUIRE(multi_kernel_time >= low_bound * single_kernel_time);
|
||||
REQUIRE(multi_kernel_time <= high_bound * single_kernel_time);
|
||||
}
|
||||
|
||||
void test_multigrid_streams(int device_num) {
|
||||
uint32_t loops = 2000;
|
||||
int32_t fast_gpu = -1;
|
||||
|
||||
// We will launch enough waves to fill up all of the GPU
|
||||
int warp_sizes[2];
|
||||
int num_sms[2];
|
||||
hipDeviceProp_t device_properties[2];
|
||||
int warp_size = INT_MAX;
|
||||
int num_sm = INT_MAX;
|
||||
for (int dev = 0; dev < (device_num - 1); ++dev) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
|
||||
warp_sizes[i] = device_properties[i].warpSize;
|
||||
if (warp_sizes[i] < warp_size) {
|
||||
warp_size = warp_sizes[i];
|
||||
}
|
||||
num_sms[i] = device_properties[i].multiProcessorCount;
|
||||
if (num_sms[i] < num_sm) {
|
||||
num_sm = num_sms[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
int max_blocks_per_sm_arr[2];
|
||||
int max_blocks_per_sm = INT_MAX;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm_arr[i],
|
||||
test_kernel_used, warp_size, 0));
|
||||
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
|
||||
max_blocks_per_sm = max_blocks_per_sm_arr[i];
|
||||
}
|
||||
}
|
||||
int desired_blocks = 1;
|
||||
|
||||
if (desired_blocks > max_blocks_per_sm * num_sm) {
|
||||
INFO("The requested number of blocks will not fit on the GPU");
|
||||
REQUIRE(desired_blocks < max_blocks_per_sm * num_sm);
|
||||
return;
|
||||
}
|
||||
|
||||
// Create the streams we will use in this test
|
||||
hipStream_t streams[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
// Set up data to pass into the kernel
|
||||
// Alocate the host input buffer, and two device-focused buffers that we
|
||||
// will use for our test.
|
||||
unsigned long long* dev_array[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
int good_size = desired_blocks * warp_size * sizeof(long long);
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), good_size));
|
||||
HIP_CHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i]));
|
||||
}
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
/* Launch the kernels ****************************************************/
|
||||
void* dev_params[2][3];
|
||||
hipLaunchParams md_params[2];
|
||||
std::chrono::time_point<std::chrono::system_clock> start_time[2];
|
||||
std::chrono::time_point<std::chrono::system_clock> end_time[2];
|
||||
|
||||
// Test 0: Launching a multi-GPU cooperative kernel
|
||||
// Both GPUs launch a long cooperative kernel
|
||||
INFO("GPU " << dev << ": Long Coop Kernel");
|
||||
INFO("GPU " << (dev + 1) << ": Long Coop Kernel");
|
||||
|
||||
auto test_coop_kernel_used = IsGfx11() ? test_coop_kernel_gfx11 : test_coop_kernel;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
dev_params[i][0] = reinterpret_cast<void*>(&loops);
|
||||
dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
|
||||
dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
|
||||
md_params[i].func = reinterpret_cast<void*>(test_coop_kernel_used);
|
||||
md_params[i].gridDim = desired_blocks;
|
||||
md_params[i].blockDim = warp_size;
|
||||
md_params[i].sharedMem = 0;
|
||||
md_params[i].stream = streams[i];
|
||||
md_params[i].args = dev_params[i];
|
||||
}
|
||||
|
||||
start_time[0] = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[0] = std::chrono::system_clock::now();
|
||||
|
||||
std::chrono::duration<double> single_kernel_time = (end_time[0] - start_time[0]);
|
||||
INFO("A single kernel on both GPUs took: " << single_kernel_time.count() << " seconds");
|
||||
|
||||
SECTION("GPU1 - Standard/ Long Coop, GPU2 - Coop/Standard") {
|
||||
INFO("GPU " << dev << ": Standard/Long Coop");
|
||||
INFO("GPU " << (dev + 1) << ": Coop/Standard");
|
||||
fast_gpu = 1;
|
||||
start_time[1] = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
|
||||
loops, dev_array[0]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
HIP_CHECK(hipSetDevice(dev + 1));
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
|
||||
loops, dev_array[1]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[1] = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> serialized_gpu0_time = (end_time[1] - start_time[1]);
|
||||
INFO("Serialized set of three kernels with GPU0 being long took: "
|
||||
<< serialized_gpu0_time.count() << " seconds");
|
||||
|
||||
verify_time(single_kernel_time.count(), serialized_gpu0_time.count(), 2.7f, 3.3f);
|
||||
}
|
||||
|
||||
SECTION("GPU1 - Standard/Coop, GPU2 - Long Coop/Standard") {
|
||||
INFO("GPU " << dev << ": Standard/Coop");
|
||||
INFO("GPU " << (dev + 1) << ": Long Coop/Standard");
|
||||
fast_gpu = 0;
|
||||
start_time[1] = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
|
||||
loops, dev_array[0]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
HIP_CHECK(hipSetDevice(dev + 1));
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
|
||||
loops, dev_array[1]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[1] = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> serialized_gpu1_time = (end_time[1] - start_time[1]);
|
||||
INFO("Serialized set of three kernels with GPU1 being long took: "
|
||||
<< serialized_gpu1_time.count() << " seconds");
|
||||
|
||||
verify_time(single_kernel_time.count(), serialized_gpu1_time.count(), 2.7f, 3.3f);
|
||||
}
|
||||
|
||||
SECTION(
|
||||
"GPU1 - Standard/Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap at "
|
||||
"beginning") {
|
||||
INFO("GPU " << dev << ": Standard/Coop with multi device no pre sync");
|
||||
INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre sync");
|
||||
fast_gpu = 0;
|
||||
start_time[1] = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
|
||||
loops, dev_array[0]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync));
|
||||
HIP_CHECK(hipSetDevice(dev + 1));
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
|
||||
loops, dev_array[1]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[1] = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> pre_overlapped_time = (end_time[1] - start_time[1]);
|
||||
INFO("Multiple kernels with pre-overlap allowed took: " << pre_overlapped_time.count()
|
||||
<< " seconds");
|
||||
|
||||
verify_time(single_kernel_time.count(), pre_overlapped_time.count(), 1.7f, 2.3f);
|
||||
}
|
||||
|
||||
SECTION(
|
||||
"GPU1 - Standard/Long Coop, GPU2 - Coop/Standard - regular and coop kernel overlap at "
|
||||
"end") {
|
||||
INFO("GPU " << dev << ": Standard/Long Coop with multi device no post sync");
|
||||
INFO("GPU " << (dev + 1) << ": Coop/Standard with multi device no post sync");
|
||||
fast_gpu = 1;
|
||||
start_time[1] = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
|
||||
loops, dev_array[0]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
|
||||
hipCooperativeLaunchMultiDeviceNoPostSync));
|
||||
HIP_CHECK(hipSetDevice(dev + 1));
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
|
||||
loops, dev_array[1]);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[1] = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> post_overlapped_time = (end_time[1] - start_time[1]);
|
||||
INFO("Multiple kernels with post-overlap allowed took: " << post_overlapped_time.count()
|
||||
<< " seconds");
|
||||
|
||||
verify_time(single_kernel_time.count(), post_overlapped_time.count(), 1.7f, 2.3f);
|
||||
}
|
||||
|
||||
SECTION(
|
||||
"GPU1 - Standard/Long Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap") {
|
||||
INFO("GPU " << dev << ": Standard/Long Coop with multi device no pre or post sync");
|
||||
INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre or post sync");
|
||||
start_time[1] = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
|
||||
loops, dev_array[0]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(
|
||||
md_params, 2,
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync | hipCooperativeLaunchMultiDeviceNoPostSync));
|
||||
HIP_CHECK(hipSetDevice(dev + 1));
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
|
||||
loops, dev_array[1]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[1] = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> overlapped_time = (end_time[1] - start_time[1]);
|
||||
INFO("Multiple kernels with overlap allowed took: " << overlapped_time.count() << " seconds");
|
||||
|
||||
verify_time(single_kernel_time.count(), overlapped_time.count(), 1.8f, 2.2f);
|
||||
}
|
||||
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
HIP_CHECK(hipFree(dev_array[k]));
|
||||
HIP_CHECK(hipStreamDestroy(streams[k]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic") {
|
||||
constexpr uint num_kernel_args = 4;
|
||||
|
||||
@@ -583,24 +233,3 @@ TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic") {
|
||||
|
||||
free(A_h);
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Streams") {
|
||||
int device_num = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&device_num));
|
||||
|
||||
if (device_num < 2) {
|
||||
HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
|
||||
return;
|
||||
}
|
||||
|
||||
hipDeviceProp_t device_properties;
|
||||
for (int i = 0; i < device_num; i++) {
|
||||
HIP_CHECK(hipGetDeviceProperties(&device_properties, i));
|
||||
if (!device_properties.cooperativeMultiDeviceLaunch) {
|
||||
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
test_multigrid_streams(device_num);
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user