diff --git a/tests/src/deviceLib/hipTestClock.cpp b/tests/src/deviceLib/hipTestClock.cpp index bf6a572cfc..7e70d32b4d 100644 --- a/tests/src/deviceLib/hipTestClock.cpp +++ b/tests/src/deviceLib/hipTestClock.cpp @@ -43,6 +43,20 @@ THE SOFTWARE. Ad[tid] = clock() + clock64() + __clock() + __clock64() - Ad[tid]; } + static __global__ void kernel1_gfx11(long long* Ad) { +#ifdef __HIP_PLATFORM_AMD__ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + Ad[tid] = clock() + wall_clock64() + __clock() + __clock64(); +#endif + } + + static __global__ void kernel2_gfx11(long long* Ad) { +#ifdef __HIP_PLATFORM_AMD__ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + Ad[tid] = clock() + wall_clock64() + __clock() + __clock64() - Ad[tid]; +#endif + } + void run() { long long *A, *Ad; A = new long long[LEN]; @@ -50,9 +64,15 @@ THE SOFTWARE. A[i] = 0; } + auto kernel1_used = IsGfx11() ? kernel1_gfx11 : kernel1; + auto kernel2_used = IsGfx11() ? kernel2_gfx11 : kernel2; + HIP_ASSERT(hipMalloc((void**)&Ad, SIZE)); - hipLaunchKernelGGL(kernel1, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad); - hipLaunchKernelGGL(kernel2, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad); + + hipLaunchKernelGGL(kernel1_used, dim3(1, 1, 1), + dim3(LEN, 1, 1), 0, 0, Ad); + hipLaunchKernelGGL(kernel2_used, dim3(1, 1, 1), + dim3(LEN, 1, 1), 0, 0, Ad); HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost)); for (unsigned i = 0; i < LEN; i++) { diff --git a/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp b/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp index 544b9ee775..91f47359b5 100644 --- a/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp +++ b/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp @@ -124,6 +124,13 @@ __global__ void test_kernel(long long *array) { array[rank] += clock64(); } +__global__ void test_kernel_gfx11(long long *array) { +#ifdef __HIP_PLATFORM_AMD__ + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + array[rank] += wall_clock64(); +#endif +} + int main(int argc, char** argv) { hipError_t err; int device_num, FailFlag = 0; @@ -155,6 +162,7 @@ int main(int argc, char** argv) { int num_sms = device_properties.multiProcessorCount; int max_num_threads = device_properties.maxThreadsPerBlock; + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; // Check single-thread block, all numbers of warps, then too-large block for (int block_size = 0; block_size <= (max_num_threads + warp_size); block_size += warp_size) { @@ -163,9 +171,8 @@ int main(int argc, char** argv) { } int max_blocks_per_sm; // Calculate the device occupancy to know how many blocks can be run. - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - &max_blocks_per_sm, test_kernel, block_size, 0, - hipOccupancyDefault)); + HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&max_blocks_per_sm, + test_kernel_used, block_size, 0, hipOccupancyDefault)); if ((block_size > max_num_threads) && (max_blocks_per_sm != 0)) { std::cerr << "ERROR! Occupancy API indicated that we can have >0 "; @@ -212,7 +219,7 @@ int main(int argc, char** argv) { coop_params[i][0] = reinterpret_cast(&dev_array[i]); } - err = hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), + err = hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), 2 * desired_blocks, block_size, coop_params[0], 0, streams[0]); @@ -235,9 +242,8 @@ int main(int argc, char** argv) { } HIPCHECK(hipDeviceSynchronize()); - err = hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), - desired_blocks, block_size, - coop_params[1], 0, streams[1]); + err = hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), desired_blocks, + block_size, coop_params[1], 0, streams[1]); if (expect_fail) { expect_to_see = hipErrorInvalidConfiguration; diff --git a/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp b/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp index 07a4f7711c..70ce8b6a80 100644 --- a/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp +++ b/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp @@ -106,6 +106,29 @@ __global__ void test_kernel(uint32_t loops, unsigned long long *array, long long } } +__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long *array, long long totalTicks) { +#ifdef __HIP_PLATFORM_AMD__ + cooperative_groups::thread_block tb = cooperative_groups::this_thread_block(); + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + + for (int i = 0; i < loops; i++) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while(time_diff < totalTicks); + tb.sync(); + array[rank] += wall_clock64(); + } +#endif +} + template bool verifyLeastCapacity(T& single_kernel_time, T& double_kernel_time, T& triple_kernel_time) { @@ -256,8 +279,8 @@ int main(int argc, char** argv) { long long totalTicks = device_properties.clockRate ; int max_blocks_per_sm = 0; // Calculate the device occupancy to know how many blocks can be run. - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, - test_kernel, + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used, warp_size, 0)); int max_active_blocks = max_blocks_per_sm * num_sms; int coop_blocks = 0; @@ -325,9 +348,10 @@ int main(int argc, char** argv) { } // Verify over capacity - HIPCHECK_API(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), + HIPCHECK_API(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), max_active_blocks + 1, warp_size, - coop_params[0], 0, streams[0]), hipErrorCooperativeLaunchTooLarge); + coop_params[0], 0, streams[0]), + hipErrorCooperativeLaunchTooLarge); std::cout << "Launching an initial single cooperative kernel..." << std::endl; // We need exclude the the initial launching as it will need time to load code obj. diff --git a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp b/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp index 2957b0420c..40d20a48de 100644 --- a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp +++ b/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp @@ -192,6 +192,20 @@ __global__ void second_test_kernel(long long *array) { array[rank] += clock64(); } +__global__ void test_kernel_gfx11(long long *array) { +#ifdef __HIP_PLATFORM_AMD__ + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + array[rank] += wall_clock64(); +#endif +} + +__global__ void second_test_kernel_gfx11(long long *array) { +#ifdef __HIP_PLATFORM_AMD__ + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + array[rank] += wall_clock64(); +#endif +} + int main(int argc, char** argv) { hipError_t err; /*************************************************************************/ @@ -245,8 +259,9 @@ int main(int argc, char** argv) { int max_blocks_per_sm = INT_MAX; for (int i = 0; i < 2; i++) { HIPCHECK(hipSetDevice((dev + i))); + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_blocks_per_sm_arr[i], test_kernel, warp_size, 0)); + &max_blocks_per_sm_arr[i], test_kernel_used, warp_size, 0)); if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) { max_blocks_per_sm = max_blocks_per_sm_arr[i]; } @@ -293,7 +308,8 @@ int main(int argc, char** argv) { for (int i = 0; i < 2; i++) { dev_params[i][0] = reinterpret_cast(&bad_dev_array[i]); - md_params[i].func = reinterpret_cast(test_kernel); + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + md_params[i].func = reinterpret_cast(test_kernel_used); md_params[i].gridDim = 2 * desired_blocks; md_params[i].blockDim = warp_size; md_params[i].sharedMem = 0; @@ -370,7 +386,8 @@ int main(int argc, char** argv) { supports_sep_kernels = false; } } - md_params[1].func = reinterpret_cast(second_test_kernel); + auto second_test_kernel_used = IsGfx11() ? second_test_kernel_gfx11 : second_test_kernel; + md_params[1].func = reinterpret_cast(second_test_kernel_used); err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0); if ((supports_sep_kernels && err != hipSuccess) || (!supports_sep_kernels && err != hipErrorInvalidValue)) { @@ -405,7 +422,8 @@ int main(int argc, char** argv) { std::cout << "different grid sizes." << std::endl; bool supports_sep_sizes = true; for (int i = 0; i < 2; i++) { - md_params[i].func = reinterpret_cast(test_kernel); + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + md_params[i].func = reinterpret_cast(test_kernel_used); md_params[i].gridDim = i+1; if (!support_for_separate_grid_sizes((dev + i))) { supports_sep_sizes = false; diff --git a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp b/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp index 120e32353f..28bba39785 100644 --- a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp +++ b/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp @@ -195,6 +195,55 @@ __global__ void test_kernel(uint32_t loops, unsigned long long *array) { } } +__global__ void test_coop_kernel_gfx11(unsigned int loops, long long *array, + int fast_gpu) { +#ifdef __HIP_PLATFORM_AMD__ + cooperative_groups::multi_grid_group mgrid = + cooperative_groups::this_multi_grid(); + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + + if (mgrid.grid_rank() == fast_gpu) { + return; + } + + for (int i = 0; i < loops; i++) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while(time_diff < 1000000); + array[rank] += wall_clock64(); + } +#endif +} + +__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long *array) { +#ifdef __HIP_PLATFORM_AMD__ + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + + for (int i = 0; i < loops; i++) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while(time_diff < 1000000); + array[rank] += wall_clock64(); + } +#endif +} + int main(int argc, char** argv) { hipError_t err; int device_num, FailFlag = 0; @@ -249,8 +298,9 @@ int main(int argc, char** argv) { int max_blocks_per_sm = INT_MAX; for (int i = 0; i < 2; i++) { HIPCHECK(hipSetDevice(dev + i)); + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_blocks_per_sm_arr[i], test_kernel, warp_size, 0)); + &max_blocks_per_sm_arr[i], test_kernel_used, warp_size, 0)); if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) { max_blocks_per_sm = max_blocks_per_sm_arr[i]; } @@ -302,11 +352,12 @@ int main(int argc, char** argv) { std::cout << "GPU " << dev << ": Long Coop Kernel" << std::endl; std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel" << std::endl; + auto test_coop_kernel_used = IsGfx11() ? test_coop_kernel_gfx11 : test_coop_kernel; for (int i = 0; i < 2; i++) { dev_params[i][0] = reinterpret_cast(&loops); dev_params[i][1] = reinterpret_cast(&dev_array[i]); dev_params[i][2] = reinterpret_cast(&fast_gpu); - md_params[i].func = reinterpret_cast(test_coop_kernel); + md_params[i].func = reinterpret_cast(test_coop_kernel_used); md_params[i].gridDim = desired_blocks; md_params[i].blockDim = warp_size; md_params[i].sharedMem = 0; @@ -331,12 +382,14 @@ int main(int argc, char** argv) { fast_gpu = 1; start_time[1] = std::chrono::system_clock::now(); HIPCHECK(hipSetDevice(dev)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], loops, dev_array[0]); HIPCHECK(hipGetLastError()); HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); HIPCHECK(hipSetDevice(dev + 1)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], loops, dev_array[1]); HIPCHECK(hipGetLastError()); for (int i = 0; i < 2; i++) { @@ -356,12 +409,14 @@ int main(int argc, char** argv) { fast_gpu = 0; start_time[2] = std::chrono::system_clock::now(); HIPCHECK(hipSetDevice(dev)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], loops, dev_array[0]); HIPCHECK(hipGetLastError()); HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); HIPCHECK(hipSetDevice(dev + 1)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], loops, dev_array[1]); HIPCHECK(hipGetLastError()); for (int i = 0; i < 2; i++) { @@ -382,13 +437,15 @@ int main(int argc, char** argv) { fast_gpu = 0; start_time[3] = std::chrono::system_clock::now(); HIPCHECK(hipSetDevice(dev)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], loops, dev_array[0]); HIPCHECK(hipGetLastError()); HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, hipCooperativeLaunchMultiDeviceNoPreSync)); HIPCHECK(hipSetDevice(dev + 1)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], loops, dev_array[1]); HIPCHECK(hipGetLastError()); for (int i = 0; i < 2; i++) { @@ -409,13 +466,15 @@ int main(int argc, char** argv) { fast_gpu = 1; start_time[4] = std::chrono::system_clock::now(); HIPCHECK(hipSetDevice(dev)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], loops, dev_array[0]); HIPCHECK(hipGetLastError()); HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, hipCooperativeLaunchMultiDeviceNoPostSync)); HIPCHECK(hipSetDevice(dev + 1)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], loops, dev_array[1]); for (int i = 0; i < 2; i++) { HIPCHECK(hipSetDevice(dev + i)); @@ -434,14 +493,16 @@ int main(int argc, char** argv) { std::cout << " Kernel\n"; start_time[5] = std::chrono::system_clock::now(); HIPCHECK(hipSetDevice(dev)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], loops, dev_array[0]); HIPCHECK(hipGetLastError()); HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, hipCooperativeLaunchMultiDeviceNoPreSync | hipCooperativeLaunchMultiDeviceNoPostSync)); HIPCHECK(hipSetDevice(dev + 1)); - hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0, + test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], loops, dev_array[1]); HIPCHECK(hipGetLastError()); for (int i = 0; i < 2; i++) { diff --git a/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp b/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp index 2623e5bc16..6958130e5c 100644 --- a/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp +++ b/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp @@ -204,6 +204,81 @@ test_kernel(unsigned int *atomic_val, unsigned int *global_array, } } +__global__ void +test_kernel_gfx11(unsigned int *atomic_val, unsigned int *global_array, + unsigned int *array, uint32_t loops) { +#ifdef __HIP_PLATFORM_AMD__ + cooperative_groups::grid_group grid = cooperative_groups::this_grid(); + cooperative_groups::multi_grid_group mgrid = + cooperative_groups::this_multi_grid(); + unsigned rank = grid.thread_rank(); + unsigned global_rank = mgrid.thread_rank(); + + int offset = blockIdx.x; + for (int i = 0; i < loops; i++) { + // Make the last thread run way behind everyone else. + // If the grid barrier below fails, then the other threads may hit the + // atomicInc instruction many times before the last thread ever gets + // to it. + // As such, without the barrier, the last array entry will eventually + // contain a very large value, defined by however many times the other + // wavefronts make it through this loop. + // If the barrier works, then it will likely contain some number + // near "total number of blocks". It will be the last wavefront to + // reach the atomicInc, but everyone will have only hit the atomic once. + if (rank == (grid.size() - 1)) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while(time_diff < 1000000); + } + if (threadIdx.x == 0) { + array[offset] = atomicInc(atomic_val, UINT_MAX); + } + grid.sync(); + + // Make the last thread in the entire multi-grid run way behind + // everyone else. + // If the mgrid barrier below fails, then the two global_array entries + // will end up being out of sync, because the intermingling of adds + // and multiplies will not be aligned between to the two GPUs. + if (global_rank == (mgrid.size() - 1)) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while(time_diff < 1000000); + } + // During even iterations, add into your own array entry + // During odd iterations, add into your partner's array entry + unsigned grid_rank = mgrid.grid_rank(); + unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids(); + if (rank == (grid.size() - 1)) { + if (i % mgrid.num_grids() == 0) { + global_array[grid_rank] += 2; + } else { + global_array[inter_gpu_offset] *= 2; + } + } + mgrid.sync(); + offset += gridDim.x; + } +#endif +} + int main(int argc, char** argv) { hipError_t err; int num_devices = 0; @@ -265,8 +340,9 @@ int main(int argc, char** argv) { int max_blocks_per_sm = INT_MAX; for (int i = 0; i < num_devices; i++) { HIPCHECK(hipSetDevice(device_num[i])); + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, 0)); + &max_blocks_per_sm_arr[i], test_kernel_used, num_threads_in_block, 0)); if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) { max_blocks_per_sm = max_blocks_per_sm_arr[i]; } @@ -320,12 +396,13 @@ int main(int argc, char** argv) { void *dev_params[num_devices][4]; hipLaunchParams md_params[num_devices]; + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; for (int i = 0; i < num_devices; i++) { dev_params[i][0] = reinterpret_cast(&kernel_atomic[i]); dev_params[i][1] = reinterpret_cast(&global_array); dev_params[i][2] = reinterpret_cast(&kernel_buffer[i]); dev_params[i][3] = reinterpret_cast(&loops); - md_params[i].func = reinterpret_cast(test_kernel); + md_params[i].func = reinterpret_cast(test_kernel_used); md_params[i].gridDim = requested_blocks; md_params[i].blockDim = num_threads_in_block; md_params[i].sharedMem = 0; diff --git a/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp b/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp index 2256369b04..1e0d15529e 100644 --- a/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp +++ b/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp @@ -131,6 +131,48 @@ test_kernel(unsigned int *atomic_val, unsigned int *array, } } +__global__ void +test_kernel_gfx11(unsigned int *atomic_val, unsigned int *array, + unsigned int loops) { +#ifdef __HIP_PLATFORM_AMD__ + cooperative_groups::grid_group grid = cooperative_groups::this_grid(); + unsigned rank = grid.thread_rank(); + + int offset = blockIdx.x; + for (int i = 0; i < loops; i++) { + // Make the last thread run way behind everyone else. + // If the barrier below fails, then the other threads may hit the + // atomicInc instruction many times before the last thread ever gets + // to it. + // As such, without the barrier, the last array entry will eventually + // contain a very large value, defined by however many times the other + // wavefronts make it through this loop. + // If the barrier works, then it will likely contain some number + // near "total number of blocks". It will be the last wavefront to + // reach the atomicInc, but everyone will have only hit the atomic once. + if (rank == (grid.size() - 1)) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while(time_diff < 1000000); + } + + if (threadIdx.x == 0) { + array[offset] = atomicInc(&atomic_val[0], UINT_MAX); + } + grid.sync(); + offset += gridDim.x; + } +#endif +} + int main(int argc, char** argv) { hipError_t err; int device_num; @@ -167,9 +209,10 @@ int main(int argc, char** argv) { int num_threads_in_block = block_size * warp_size; + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; // Calculate the device occupancy to know how many blocks can be run. HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, - test_kernel, num_threads_in_block, 0)); + test_kernel_used, num_threads_in_block, 0)); int requested_blocks = warps / block_size; if (requested_blocks > max_blocks_per_sm * num_sms) { @@ -211,7 +254,8 @@ int main(int argc, char** argv) { params[0] = reinterpret_cast(&kernel_atomic); params[1] = reinterpret_cast(&kernel_buffer); params[2] = reinterpret_cast(&loops); - HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel), + test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), requested_blocks, num_threads_in_block, params, 0, NULL)); diff --git a/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp b/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp index cd7aa1ce82..73e28026a0 100644 --- a/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp +++ b/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp @@ -204,6 +204,81 @@ test_kernel(unsigned int *atomic_val, unsigned int *global_array, } } +__global__ void +test_kernel_gfx11(unsigned int *atomic_val, unsigned int *global_array, + unsigned int *array, uint32_t loops) { +#ifdef __HIP_PLATFORM_AMD__ + cooperative_groups::grid_group grid = cooperative_groups::this_grid(); + cooperative_groups::multi_grid_group mgrid = + cooperative_groups::this_multi_grid(); + unsigned rank = grid.thread_rank(); + unsigned global_rank = mgrid.thread_rank(); + + int offset = blockIdx.x; + for (int i = 0; i < loops; i++) { + // Make the last thread run way behind everyone else. + // If the grid barrier below fails, then the other threads may hit the + // atomicInc instruction many times before the last thread ever gets + // to it. + // As such, without the barrier, the last array entry will eventually + // contain a very large value, defined by however many times the other + // wavefronts make it through this loop. + // If the barrier works, then it will likely contain some number + // near "total number of blocks". It will be the last wavefront to + // reach the atomicInc, but everyone will have only hit the atomic once. + if (rank == (grid.size() - 1)) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while(time_diff < 1000000); + } + if (threadIdx.x == 0) { + array[offset] = atomicInc(atomic_val, UINT_MAX); + } + grid.sync(); + + // Make the last thread in the entire multi-grid run way behind + // everyone else. + // If the mgrid barrier below fails, then the two global_array entries + // will end up being out of sync, because the intermingling of adds + // and multiplies will not be aligned between to the two GPUs. + if (global_rank == (mgrid.size() - 1)) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while(time_diff < 1000000); + } + // During even iterations, add into your own array entry + // During odd iterations, add into your partner's array entry + unsigned grid_rank = mgrid.grid_rank(); + unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids(); + if (rank == (grid.size() - 1)) { + if (i % mgrid.num_grids() == 0) { + global_array[grid_rank] += 2; + } else { + global_array[inter_gpu_offset] *= 2; + } + } + mgrid.sync(); + offset += gridDim.x; + } +#endif +} + int main(int argc, char** argv) { hipError_t err; int device_num = 0, flag = 0; @@ -263,8 +338,9 @@ int main(int argc, char** argv) { int max_blocks_per_sm = INT_MAX; for (int i = 0; i < 2; i++) { HIPCHECK(hipSetDevice((d + i))); + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, + &max_blocks_per_sm_arr[i], test_kernel_used, num_threads_in_block, 0)); if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) { max_blocks_per_sm = max_blocks_per_sm_arr[i]; @@ -319,11 +395,12 @@ int main(int argc, char** argv) { void *dev_params[2][4]; hipLaunchParams md_params[2]; for (int i = 0; i < 2; i++) { + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; dev_params[i][0] = reinterpret_cast(&kernel_atomic[i]); dev_params[i][1] = reinterpret_cast(&global_array); dev_params[i][2] = reinterpret_cast(&kernel_buffer[i]); dev_params[i][3] = reinterpret_cast(&loops); - md_params[i].func = reinterpret_cast(test_kernel); + md_params[i].func = reinterpret_cast(test_kernel_used); md_params[i].gridDim = requested_blocks; md_params[i].blockDim = num_threads_in_block; md_params[i].sharedMem = 0; diff --git a/tests/src/runtimeApi/event/hipEventMultiThreaded.cpp b/tests/src/runtimeApi/event/hipEventMultiThreaded.cpp index 2bf5a78969..b6e0e1c3fd 100644 --- a/tests/src/runtimeApi/event/hipEventMultiThreaded.cpp +++ b/tests/src/runtimeApi/event/hipEventMultiThreaded.cpp @@ -44,10 +44,20 @@ extern "C" __global__ void WaitKernel(int *Ad, int clockrate) { *Ad = 1; } +extern "C" __global__ void WaitKernel_gfx11(int *Ad, int clockrate) { +#ifdef __HIP_PLATFORM_AMD__ + uint64_t wait_t = 500, + start = wall_clock64()/clockrate, cycles; + do { cycles = wall_clock64()/clockrate-start;} while (cycles < wait_t); + *Ad = 1; +#endif +} + void t1(hipEvent_t start, hipStream_t stream1, int clkRate, int *A, int *Ad) { *A = 0; - hipLaunchKernelGGL(HIP_KERNEL_NAME(WaitKernel), dim3(1), dim3(1), 0, stream1, Ad, clkRate); + auto WaitKernel_used = IsGfx11() ? WaitKernel_gfx11 : WaitKernel; + hipLaunchKernelGGL(HIP_KERNEL_NAME(WaitKernel_used), dim3(1), dim3(1), 0, stream1, Ad, clkRate); HIPCHECK(hipEventRecord(start, stream1)); @@ -69,7 +79,11 @@ int main(int argc, char* argv[]) { HIPCHECK(hipHostGetDevicePointer((void**)&Ad[i], A[i], 0)); } - HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + if (IsGfx11()) { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + } else { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + } hipStream_t stream1; hipStreamCreate(&stream1); hipEvent_t start; diff --git a/tests/src/runtimeApi/module/hipExtLaunchKernelGGL.cpp b/tests/src/runtimeApi/module/hipExtLaunchKernelGGL.cpp index 4df6bd8600..d0ddb04cb6 100644 --- a/tests/src/runtimeApi/module/hipExtLaunchKernelGGL.cpp +++ b/tests/src/runtimeApi/module/hipExtLaunchKernelGGL.cpp @@ -62,6 +62,34 @@ __global__ void FourSecKernel(int clockrate) { } } +__global__ void TwoSecKernel_gfx11(int clockrate) { +#ifdef __HIP_PLATFORM_AMD__ + if (globalvar == 0x2222) { + globalvar = 0x3333; + } + uint64_t wait_t = 2000, + start = wall_clock64()/clockrate, cur; + do { cur = (wall_clock64()/clockrate)-start;}while (cur < wait_t); + if (globalvar != 0x3333) { + globalvar = 0x5555; + } +#endif +} + +__global__ void FourSecKernel_gfx11(int clockrate) { +#ifdef __HIP_PLATFORM_AMD__ + if (globalvar == 1) { + globalvar = 0x2222; + } + uint64_t wait_t = 4000, + start = wall_clock64()/clockrate, cur; + do { cur = (wall_clock64()/clockrate)-start;}while (cur < wait_t); + if (globalvar == 0x2222) { + globalvar = 0x4444; + } +#endif +} + /* * In this Scenario, we create events by disabling the timing flag * We then Launch the kernel using hipExtModuleLaunchKernel by passing @@ -76,13 +104,20 @@ bool DisableTimeFlag() { float time_4sec, time_2sec; hipEvent_t start_event1, end_event1; int clkRate = 0; - HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + + if (IsGfx11()) { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + } else { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + } + HIPCHECK(hipEventCreateWithFlags(&start_event1, hipEventDisableTiming)); HIPCHECK(hipEventCreateWithFlags(&end_event1, hipEventDisableTiming)); HIPCHECK(hipStreamCreate(&stream1)); - hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0, + auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel; + hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0, stream1, start_event1, end_event1, 0, clkRate); HIPCHECK(hipStreamSynchronize(stream1)); e = hipEventElapsedTime(&time_2sec, start_event1, end_event1); @@ -113,11 +148,20 @@ bool ConcurencyCheck_GlobalVar(int conc_flag) { int deviceGlobal_h = 0; HIPCHECK(hipSetDevice(0)); int clkRate = 0; - HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + + if (IsGfx11()) { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + } else { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + } + HIPCHECK(hipStreamCreate(&stream1)); - hipExtLaunchKernelGGL((FourSecKernel), dim3(1), dim3(1), 0, + auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel; + auto FourSecKernel_used = IsGfx11() ? FourSecKernel_gfx11 : FourSecKernel; + + hipExtLaunchKernelGGL((FourSecKernel_used), dim3(1), dim3(1), 0, stream1, nullptr, nullptr, conc_flag, clkRate); - hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0, + hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0, stream1, nullptr, nullptr, conc_flag, clkRate); HIPCHECK(hipStreamSynchronize(stream1)); HIPCHECK(hipMemcpyFromSymbol(&deviceGlobal_h, globalvar, @@ -148,16 +192,24 @@ bool KernelTimeExecution() { hipEvent_t start_event1, end_event1, start_event2, end_event2; float time_4sec, time_2sec; int clkRate = 0; - HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + + if (IsGfx11()) { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + } else { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + } + + auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel; + auto FourSecKernel_used = IsGfx11() ? FourSecKernel_gfx11 : FourSecKernel; HIPCHECK(hipEventCreate(&start_event1)); HIPCHECK(hipEventCreate(&end_event1)); HIPCHECK(hipEventCreate(&start_event2)); HIPCHECK(hipEventCreate(&end_event2)); HIPCHECK(hipStreamCreate(&stream1)); - hipExtLaunchKernelGGL((FourSecKernel), dim3(1), dim3(1), 0, + hipExtLaunchKernelGGL((FourSecKernel_used), dim3(1), dim3(1), 0, stream1, start_event1, end_event1, 0, clkRate); - hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0, + hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0, stream1, start_event2, end_event2, 0, clkRate); HIPCHECK(hipStreamSynchronize(stream1)); e = hipEventElapsedTime(&time_4sec, start_event1, end_event1); diff --git a/tests/src/runtimeApi/module/hipExtModuleLaunchKernel.cpp b/tests/src/runtimeApi/module/hipExtModuleLaunchKernel.cpp index 866332893b..65be909591 100644 --- a/tests/src/runtimeApi/module/hipExtModuleLaunchKernel.cpp +++ b/tests/src/runtimeApi/module/hipExtModuleLaunchKernel.cpp @@ -49,10 +49,7 @@ #define fileName "matmul.code" #define matmulK "matmulK" -#define SixteenSec "SixteenSecKernel" #define KernelandExtra "KernelandExtraParams" -#define FourSec "FourSecKernel" -#define TwoSec "TwoSecKernel" #define globalDevVar "deviceGlobal" #define dummyKernel "dummyKernel" #define FOURSEC_KERNEL 4999 @@ -124,7 +121,12 @@ void ModuleLaunchKernel::AllocateMemory() { HIPCHECK(hipMemcpy(Ad, A, SIZE*sizeof(int), hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(Bd, B, SIZE*sizeof(int), hipMemcpyHostToDevice)); int clkRate = 0; - HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + if (IsGfx11()) { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + } else { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + } + args1._Ad = Ad; args1._Bd = Bd; args1._Cd = C; @@ -149,13 +151,21 @@ void ModuleLaunchKernel::AllocateMemory() { } void ModuleLaunchKernel::ModuleLoad() { + + std::string TwoSecStr = IsGfx11() ? std::string("TwoSecKernel_gfx11") + : std::string("TwoSecKernel"); + std::string FourSecStr = IsGfx11() ? std::string("FourSecKernel_gfx11") + : std::string("FourSecKernel"); + std::string SixteenSecStr = IsGfx11() ? std::string("SixteenSecKernel_gfx11") + : std::string("SixteenSecKernel"); + HIPCHECK(hipModuleLoad(&Module, fileName)); HIPCHECK(hipModuleGetFunction(&MultKernel, Module, matmulK)); - HIPCHECK(hipModuleGetFunction(&SixteenSecKernel, Module, SixteenSec)); + HIPCHECK(hipModuleGetFunction(&SixteenSecKernel, Module, SixteenSecStr.c_str())); HIPCHECK(hipModuleGetFunction(&KernelandExtraParamKernel, Module, KernelandExtra)); - HIPCHECK(hipModuleGetFunction(&FourSecKernel, Module, FourSec)); - HIPCHECK(hipModuleGetFunction(&TwoSecKernel, Module, TwoSec)); + HIPCHECK(hipModuleGetFunction(&FourSecKernel, Module, FourSecStr.c_str())); + HIPCHECK(hipModuleGetFunction(&TwoSecKernel, Module, TwoSecStr.c_str())); HIPCHECK(hipModuleGetFunction(&DummyKernel, Module, dummyKernel)); HIPCHECK(hipModuleGetGlobal(&deviceGlobal, &deviceGlobalSize, Module, globalDevVar)); diff --git a/tests/src/runtimeApi/module/matmul.cpp b/tests/src/runtimeApi/module/matmul.cpp index 08a5a11018..de7c935f53 100644 --- a/tests/src/runtimeApi/module/matmul.cpp +++ b/tests/src/runtimeApi/module/matmul.cpp @@ -79,5 +79,41 @@ extern "C" __global__ void FourSecKernel(int clockrate) { } } +extern "C" __global__ void SixteenSecKernel_gfx11(int clockrate) { +#ifdef __HIP_PLATFORM_AMD__ + uint64_t wait_t = 16000, + start = wall_clock64()/clockrate, cur; + do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t); +#endif +} + +extern "C" __global__ void TwoSecKernel_gfx11(int clockrate) { +#ifdef __HIP_PLATFORM_AMD__ + if (deviceGlobal == 0x2222) { + deviceGlobal = 0x3333; + } + uint64_t wait_t = 2000, + start = wall_clock64()/clockrate, cur; + do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t); + if (deviceGlobal != 0x3333) { + deviceGlobal = 0x5555; + } +#endif +} + +extern "C" __global__ void FourSecKernel_gfx11(int clockrate) { +#ifdef __HIP_PLATFORM_AMD__ + if (deviceGlobal == 1) { + deviceGlobal = 0x2222; + } + uint64_t wait_t = 4000, + start = wall_clock64()/clockrate, cur; + do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t); + if (deviceGlobal == 0x2222) { + deviceGlobal = 0x4444; + } +#endif +} + extern "C" __global__ void dummyKernel() { } diff --git a/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp b/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp index 3188d305bc..694375925b 100644 --- a/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp +++ b/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp @@ -64,6 +64,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) { } } +__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) { +#ifdef __HIP_PLATFORM_AMD__ + size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (size_t i = gputhread; i < Num; i += stride) { + C_d[i] = A_d[i] * A_d[i]; + } + + // Delay thread 1 only in the GPU + if (gputhread == 1) { + unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur; + do { + cur = wall_clock64() - start; + } while (cur < wait_t); + } +#endif +} + float *A_h, *C_h, *A_h1, *C_h1; static void HIPRT_CB Callback_Stream1(hipStream_t stream, hipError_t status, @@ -129,8 +148,9 @@ int main(int argc, char* argv[]) { int *ptr = NULL; int *ptr1 = NULL; // Queing jobs in both mystream1/2 followed by hipStreamAddCallback + auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square; for (int i = 1; i < 5; ++i) { - hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), + hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0, mystream1, C_d, A_d, Num); HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream1)); @@ -139,7 +159,7 @@ int main(int argc, char* argv[]) { HIPCHECK(hipStreamAddCallback(mystream1, Callback_Stream1, reinterpret_cast(ptr), 0)); - hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), + hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0, mystream2, C_d, A_d, Num); HIPCHECK(hipMemcpyAsync(C_h1, C_d, Nbytes, hipMemcpyDeviceToHost, mystream2)); diff --git a/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp b/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp index 15e149a3e6..86dd214361 100644 --- a/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp +++ b/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp @@ -64,6 +64,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) { } } +__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) { +#ifdef __HIP_PLATFORM_AMD__ + size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (size_t i = gputhread; i < Num; i += stride) { + C_d[i] = A_d[i] * A_d[i]; + } + + // Delay thread 1 only in the GPU + if (gputhread == 1) { + unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur; + do { + cur = wall_clock64() - start; + } while (cur < wait_t); + } +#endif +} + static void HIPRT_CB Stream_Callback(hipStream_t stream, hipError_t status, void* userData) { for (size_t i = 0; i < Num; i++) { @@ -100,7 +119,8 @@ int main(int argc, char* argv[]) { const unsigned threadsPerBlock = 256; const unsigned blocks = (Num+255)/threadsPerBlock; - hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0, + auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square; + hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0, mystream, C_d, A_d, Num); HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream)); diff --git a/tests/src/runtimeApi/stream/hipStreamACb_MultiThread.cpp b/tests/src/runtimeApi/stream/hipStreamACb_MultiThread.cpp index 704621761e..e6b702fa3b 100644 --- a/tests/src/runtimeApi/stream/hipStreamACb_MultiThread.cpp +++ b/tests/src/runtimeApi/stream/hipStreamACb_MultiThread.cpp @@ -64,6 +64,24 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) { } } +__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) { +#ifdef __HIP_PLATFORM_AMD__ + size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (size_t i = gputhread; i < Num; i += stride) { + C_d[i] = A_d[i] * A_d[i]; + } + + // Delay thread 1 only in the GPU + if (gputhread == 1) { + unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur; + do { + cur = wall_clock64() - start; + } while (cur < wait_t); + } +#endif +} static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status, void* userData) { @@ -124,7 +142,8 @@ int main(int argc, char* argv[]) { const unsigned threadsPerBlock = 256; const unsigned blocks = (Num+255)/threadsPerBlock; - hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0, + auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square; + hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0, mystream, C_d, A_d, Num); HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream)); diff --git a/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp b/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp index 14d9eab597..d079e6eb38 100644 --- a/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp +++ b/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp @@ -61,6 +61,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t N_elmts) { } } +__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t N_elmts) { +#ifdef __HIP_PLATFORM_AMD__ + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (size_t i = offset; i < N_elmts; i += stride) { + C_d[i] = A_d[i] * A_d[i]; + } + + // Delay the thread 1 + if (offset == 1) { + unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur; + do { + cur = wall_clock64() - start; + } while (cur < wait_t); + } +#endif +} + float *A_h, *C_h; static void HIPRT_CB Callback1(hipStream_t stream, hipError_t status, @@ -102,7 +121,8 @@ int main(int argc, char* argv[]) { const unsigned threadsPerBlock = 256; const unsigned blocks = (N_elmts + 255)/threadsPerBlock; - hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0, + auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square; + hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0, mystream, C_d, A_d, N_elmts); HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream)); HIPCHECK(hipStreamAddCallback(mystream, Callback1, NULL, 0)); diff --git a/tests/src/test_common.cpp b/tests/src/test_common.cpp index 03464fcba5..ec76df0427 100644 --- a/tests/src/test_common.cpp +++ b/tests/src/test_common.cpp @@ -92,6 +92,32 @@ size_t getHostThreadCount(const size_t memPerThread, const size_t maxThreads) { return thread_count; } +// Function to determine if the device is of gfx11 architecture +bool IsGfx11() { +#if defined(__HIP_PLATFORM_NVIDIA__) + return false; +#elif defined(__HIP_PLATFORM_AMD__) + int device = -1; + hipDeviceProp_t props{}; + HIPCHECK(hipGetDevice(&device)); + HIPCHECK(hipGetDeviceProperties(&props, device)); + + // Get GCN Arch Name and compare to check if it is gfx11 + std::string arch = std::string(props.gcnArchName); + auto pos = arch.find(":"); + if (pos != std::string::npos) + arch = arch.substr(0, pos); + + if(arch.size() >= 5) + arch = arch.substr(0,5); + + return (arch == std::string("gfx11")) ? true : false; +#else + std::cout<<"Have to be either Nvidia or AMD platform, asserting"<