From 04080c2e2eb88d0eb4e0c484d7b54961f658fca1 Mon Sep 17 00:00:00 2001 From: ROCm CI Service Account <66695075+rocm-ci@users.noreply.github.com> Date: Thu, 20 Jul 2023 10:16:56 +0530 Subject: [PATCH] SWDEV-408958 - Use LaunchDelayKernel and modify it to use same kernel based on real time clock for gfx10 and gfx11. (#370) Change-Id: Iea8a48e8cbfa1745c7d5535dc5820133a4104087 --- catch/include/hip_test_common.hh | 81 ------------------- catch/include/utils.hh | 26 +++--- catch/unit/event/Unit_hipEventElapsedTime.cc | 8 +- catch/unit/event/Unit_hipEventQuery.cc | 31 +++++-- catch/unit/event/hipEventDestroy.cc | 4 +- catch/unit/memory/hipFree.cc | 12 +-- catch/unit/memory/hipMemcpySync.cc | 4 +- catch/unit/memory/hipMemsetSync.cc | 3 +- catch/unit/stream/hipStreamACb_MultiThread.cc | 32 +------- catch/unit/stream/hipStreamCreateWithFlags.cc | 5 +- catch/unit/stream/hipStreamDestroy.cc | 3 +- catch/unit/stream/hipStreamQuery.cc | 9 +-- catch/unit/stream/hipStreamSynchronize.cc | 18 ++--- catch/unit/stream/hipStreamWaitEvent.cc | 52 +----------- 14 files changed, 79 insertions(+), 209 deletions(-) diff --git a/catch/include/hip_test_common.hh b/catch/include/hip_test_common.hh index d4c44a813f..7b4b08ee38 100644 --- a/catch/include/hip_test_common.hh +++ b/catch/include/hip_test_common.hh @@ -350,87 +350,6 @@ template <> struct MemTraits { } }; - -namespace { -static __global__ void waitKernel(size_t offset) { - auto start = clock(); - while ((clock() - start) < offset) { - } -} - -static __global__ void waitKernel_gfx11(size_t offset) { -#if HT_AMD - auto start = wall_clock64(); - while ((wall_clock64() - start) < offset) { - } -#endif -} - -// helper function used to set the device frequency variable -// estimates the number of clock ticks in 1 second -static size_t findTicksPerSecond() { - // first read the reported clockRate as a starting point - hipDeviceProp_t prop; - int device; - HIP_CHECK(hipGetDevice(&device)); - HIP_CHECK(hipGetDeviceProperties(&prop, device)); - size_t devFreq = static_cast(prop.clockRate); // in kHz - size_t clockTicksPerSecond = devFreq * 1000; - - // init - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel; - // Warmup - hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipDeviceSynchronize()); - - // try 10 times to find device frequency - // after 10 attempts the result is likely good enough so just accept it - for (int attempts = 10; attempts > 0; --attempts) { - HIP_CHECK(hipEventRecord(start)); - hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond); - HIP_CHECK(hipEventRecord(stop)); - HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipEventSynchronize(stop)); - - float executionTimeMs = 0; - HIP_CHECK(hipEventElapsedTime(&executionTimeMs, start, stop)); - - constexpr float tolerance = 20; - if (fabs(executionTimeMs - 1000) <= tolerance) { - // Timing is within accepted tolerance, break here - break; - } else { - clockTicksPerSecond = (clockTicksPerSecond * 1000) / executionTimeMs; - --attempts; - } - } - - // deinit - HIP_CHECK(hipEventDestroy(start)); - HIP_CHECK(hipEventDestroy(stop)); - return clockTicksPerSecond; -} -} // namespace - -// Launches a kernel which runs for specified amount of time -// Note: The current implementation uses HIP_CHECK which is not thread safe! -// Note: the function assumes execution on a single device and caches the number of clock ticks per -// second -static inline void runKernelForDuration(std::chrono::milliseconds duration, - hipStream_t stream = nullptr) { - // number of clocks the device is running at (device frequency) - // each translation unit will have a copy of ticksPerSecond but this function isn't designed for - // precision so that's acceptable. - static size_t ticksPerSecond = findTicksPerSecond(); - const auto millis = duration.count(); - auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel; - hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000); -} - class BlockingContext { std::atomic_bool blocked{true}; hipStream_t stream; diff --git a/catch/include/utils.hh b/catch/include/utils.hh index 6dd8070b9c..5efd6a5125 100644 --- a/catch/include/utils.hh +++ b/catch/include/utils.hh @@ -122,9 +122,17 @@ template __global__ void VectorSet(T* const vec, const T value, siz // Will execute for atleast interval milliseconds static __global__ void Delay(uint32_t interval, const uint32_t ticks_per_ms) { while (interval--) { - uint64_t start = clock(); - while (clock() - start < ticks_per_ms) { + #if HT_AMD + uint64_t start = wall_clock64(); + while (wall_clock64() - start < ticks_per_ms) { + __builtin_amdgcn_s_sleep(10); } + #endif + #if HT_NVIDIA + uint64_t start = clock64(); + while (clock64() - start < ticks_per_ms) { + } + #endif } } @@ -140,14 +148,14 @@ __global__ void Iota(T* const out, size_t pitch, size_t w, size_t h, size_t d) { } } -inline void LaunchDelayKernel(const std::chrono::milliseconds interval, const hipStream_t stream) { +inline void LaunchDelayKernel(const std::chrono::milliseconds interval, const hipStream_t stream = nullptr) { int ticks_per_ms = 0; - // Clock rate is in kHz => number of clock ticks in a millisecond - if (IsGfx11()) { - HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeWallClockRate, 0)); - } else { - HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0)); - } + #if HT_AMD + HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeWallClockRate, 0)); + #endif + #if HT_NVIDIA + HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0)); + #endif Delay<<<1, 1, 0, stream>>>(interval.count(), ticks_per_ms); } diff --git a/catch/unit/event/Unit_hipEventElapsedTime.cc b/catch/unit/event/Unit_hipEventElapsedTime.cc index 93deb7f842..cf8d6fb919 100644 --- a/catch/unit/event/Unit_hipEventElapsedTime.cc +++ b/catch/unit/event/Unit_hipEventElapsedTime.cc @@ -23,7 +23,7 @@ THE SOFTWARE. #include #include #include - +#include /** * @addtogroup hipEventElapsedTime hipEventElapsedTime * @{ @@ -158,10 +158,7 @@ TEST_CASE("Unit_hipEventElapsedTime_NotReady_Negative") { // Record start event HIP_CHECK(hipEventRecord(start, nullptr)); - HipTest::BlockingContext b_context{nullptr}; - b_context.block_stream(); // blocked stream - REQUIRE(b_context.is_blocked()); - + LaunchDelayKernel(std::chrono::milliseconds(1000)); // Record stop event HIP_CHECK(hipEventRecord(stop, nullptr)); @@ -169,7 +166,6 @@ TEST_CASE("Unit_hipEventElapsedTime_NotReady_Negative") { float tElapsed = 1.0f; HIP_CHECK_ERROR(hipEventQuery(stop), hipErrorNotReady); HIP_ASSERT(hipEventElapsedTime(&tElapsed, start, stop) == hipErrorNotReady); - b_context.unblock_stream(); HIP_CHECK(hipStreamSynchronize(nullptr)); HIP_CHECK(hipEventDestroy(start)); diff --git a/catch/unit/event/Unit_hipEventQuery.cc b/catch/unit/event/Unit_hipEventQuery.cc index b9606fccf4..29f7b4d50c 100644 --- a/catch/unit/event/Unit_hipEventQuery.cc +++ b/catch/unit/event/Unit_hipEventQuery.cc @@ -18,7 +18,29 @@ THE SOFTWARE. */ #include +#include +/** + * @addtogroup hipEventQuery hipEventQuery + * @{ + * @ingroup EventTest + * `hipEventQuery(hipEvent_t event)` - + * Query the status of the specified event. + * ________________________ + * Test cases from other modules: + * - @ref Unit_hipEventIpc + */ +/** + * Test Description + * ------------------------ + * - Query events with a single and with multiple devices. + * Test source + * ------------------------ + * - unit/event/Unit_hipEventQuery.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ TEST_CASE("Unit_hipEventQuery_DifferentDevice") { hipEvent_t event1{}, event2{}; HIP_CHECK(hipEventCreate(&event1)); @@ -30,15 +52,10 @@ TEST_CASE("Unit_hipEventQuery_DifferentDevice") { HIP_CHECK(hipStreamCreate(&stream)); REQUIRE(stream != nullptr); - HipTest::BlockingContext b_context1{stream}; // og context - // Block stream { HIP_CHECK(hipSetDevice(0)); HIP_CHECK(hipEventRecord(event1, stream)); - - b_context1.block_stream(); // blocked stream - REQUIRE(b_context1.is_blocked()); - + LaunchDelayKernel(std::chrono::milliseconds(3000), stream); HIP_CHECK(hipEventRecord(event2, stream)); HIP_CHECK(hipEventSynchronize(event1)); @@ -58,8 +75,6 @@ TEST_CASE("Unit_hipEventQuery_DifferentDevice") { HIP_CHECK(hipEventQuery(event1)); HIP_CHECK_ERROR(hipEventQuery(event2), hipErrorNotReady); - b_context1.unblock_stream(); - HIP_CHECK(hipEventSynchronize(event2)); // Query, should be done now diff --git a/catch/unit/event/hipEventDestroy.cc b/catch/unit/event/hipEventDestroy.cc index d62921f01b..8836e3db7a 100644 --- a/catch/unit/event/hipEventDestroy.cc +++ b/catch/unit/event/hipEventDestroy.cc @@ -25,7 +25,7 @@ THE SOFTWARE. #include #include #include "hip/hip_runtime_api.h" - +#include /** * @addtogroup hipEventDestroy hipEventDestroy * @{ @@ -53,7 +53,7 @@ static inline void launchVectorAdd(float*& A_h, float*& B_h, float*& C_h, HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&A_d), A_h, 0)); HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&B_d), B_h, 0)); HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&C_d), C_h, 0)); - HipTest::runKernelForDuration(delay, stream); + LaunchDelayKernel(delay, stream); HipTest::vectorADD<<<1, 1, 0, stream>>>(A_d, B_d, C_d, vectorSize); } diff --git a/catch/unit/memory/hipFree.cc b/catch/unit/memory/hipFree.cc index 1db0ec9502..bc06a4c7a6 100644 --- a/catch/unit/memory/hipFree.cc +++ b/catch/unit/memory/hipFree.cc @@ -25,7 +25,7 @@ THE SOFTWARE. #include #include "hipArrayCommon.hh" #include "DriverContext.hh" - +#include /* * This testcase verifies [ hipFree || hipFreeArray || hipFreeType::ArrayDestroy || * hipFreeType::HostFree with hipHostMalloc ] @@ -54,7 +54,7 @@ TEST_CASE("Unit_hipFreeImplicitSyncDev") { size_t size_mult = GENERATE(1, 32, 64, 128, 256); HIP_CHECK(hipMalloc(&devPtr, sizeof(*devPtr) * size_mult)); - HipTest::runKernelForDuration(delay); + LaunchDelayKernel(delay); // make sure device is busy HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady); HIP_CHECK(hipFree(devPtr)); @@ -67,7 +67,7 @@ TEST_CASE("Unit_hipFreeImplicitSyncHost") { HIP_CHECK(hipHostMalloc(&hostPtr, sizeof(*hostPtr) * size_mult)); - HipTest::runKernelForDuration(delay); + LaunchDelayKernel(delay); // make sure device is busy HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady); HIP_CHECK(hipHostFree(hostPtr)); @@ -88,7 +88,7 @@ TEMPLATE_TEST_CASE("Unit_hipFreeImplicitSyncArray", "", char, float, float2, flo hipChannelFormatDesc desc = hipCreateChannelDesc(); HIP_CHECK(hipMallocArray(&arrayPtr, &desc, width, height, hipArrayDefault)); - HipTest::runKernelForDuration(delay); + LaunchDelayKernel(delay); // make sure device is busy HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady); HIP_CHECK(hipFreeArray(arrayPtr)); @@ -103,7 +103,7 @@ TEMPLATE_TEST_CASE("Unit_hipFreeImplicitSyncArray", "", char, float, float2, flo cuDesc.Format = vec_info::format; cuDesc.NumChannels = vec_info::size; HIP_CHECK(hipArrayCreate(&cuArrayPtr, &cuDesc)); - HipTest::runKernelForDuration(delay); + LaunchDelayKernel(delay); // make sure device is busy HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady); HIP_CHECK(hipArrayDestroy(cuArrayPtr)); @@ -120,7 +120,7 @@ TEMPLATE_TEST_CASE("Unit_hipFreeImplicitSyncArray", "", char, float, float2, flo hipChannelFormatDesc desc = hipCreateChannelDesc(); HIP_CHECK(hipMallocArray(&arrayPtr, &desc, extent.width, extent.height, hipArrayDefault)); - HipTest::runKernelForDuration(delay); + LaunchDelayKernel(delay); // make sure device is busy HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady); // Second free segfaults diff --git a/catch/unit/memory/hipMemcpySync.cc b/catch/unit/memory/hipMemcpySync.cc index fbb165d892..bef3c953f5 100644 --- a/catch/unit/memory/hipMemcpySync.cc +++ b/catch/unit/memory/hipMemcpySync.cc @@ -21,7 +21,7 @@ THE SOFTWARE. */ #include "MemUtils.hh" - +#include /* * These testcases verify that synchronization behaviour for memcpy functions with respect to * the host. @@ -156,7 +156,7 @@ static void runMemcpyTests(hipStream_t stream, bool async, allocType type, memTy using namespace std::chrono_literals; const std::chrono::duration delay = 100ms; - HipTest::runKernelForDuration(delay, stream); + LaunchDelayKernel(delay, stream); memcpyCheck(type, memType, aPtr.first, data, fillerData, async, stream, fromHost); checkForSync(stream, async, type, fromHost); diff --git a/catch/unit/memory/hipMemsetSync.cc b/catch/unit/memory/hipMemsetSync.cc index f3de97e02f..973c7dc6e7 100644 --- a/catch/unit/memory/hipMemsetSync.cc +++ b/catch/unit/memory/hipMemsetSync.cc @@ -21,6 +21,7 @@ THE SOFTWARE. */ #include +#include /* * These testcases verify that synchronous memset functions are asynchronous with respect to the * host except when the target is pinned host memory or a Unified Memory region @@ -406,7 +407,7 @@ void runTests(allocType type, memSetType memsetType, MultiDData data, hipStream_ std::pair aPtr = initMemory(type, memsetType, data); using namespace std::chrono_literals; const std::chrono::duration delay = 100ms; - HipTest::runKernelForDuration(delay, stream); + LaunchDelayKernel(delay, stream); memsetCheck(aPtr.first, testValue, memsetType, data, async, stream); if (async || type == allocType::deviceMalloc) { diff --git a/catch/unit/stream/hipStreamACb_MultiThread.cc b/catch/unit/stream/hipStreamACb_MultiThread.cc index 2825914cba..9d1a780deb 100644 --- a/catch/unit/stream/hipStreamACb_MultiThread.cc +++ b/catch/unit/stream/hipStreamACb_MultiThread.cc @@ -25,6 +25,7 @@ multiple Threads. #include #include +#include static constexpr size_t N = 4096; static constexpr int numThreads = 1000; @@ -43,33 +44,6 @@ static __global__ void device_function(float* C_d, float* A_d, size_t Num) { for (size_t i = gputhread; i < Num; i += stride) { C_d[i] = A_d[i] * A_d[i]; } - - // Delay thread 1 only in the GPU - if (gputhread == 1) { - uint64_t wait_t = 3200000000, start = clock64(), cur; - do { - cur = clock64() - start; - } while (cur < wait_t); - } -} - -static __global__ void device_function_gfx11(float* C_d, float* A_d, size_t Num) { -#if HT_AMD - size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x); - size_t stride = blockDim.x * gridDim.x; - - for (size_t i = gputhread; i < Num; i += stride) { - C_d[i] = A_d[i] * A_d[i]; - } - - // Delay thread 1 only in the GPU - if (gputhread == 1) { - uint64_t wait_t = 3200000000, start = wall_clock64(), cur; - do { - cur = wall_clock64() - start; - } while (cur < wait_t); - } -#endif } static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status, @@ -146,10 +120,10 @@ TEST_CASE("Unit_hipStreamAddCallback_MultipleThreads") { constexpr unsigned threadsPerBlock = 256; constexpr unsigned blocks = (N + 255)/threadsPerBlock; - auto device_function_used = IsGfx11() ? device_function_gfx11 : device_function; - hipLaunchKernelGGL((device_function_used), dim3(blocks), + hipLaunchKernelGGL((device_function), dim3(blocks), dim3(threadsPerBlock), 0, mystream, C_d, A_d, N); + LaunchDelayKernel(std::chrono::milliseconds(2000), mystream); HIP_CHECK(hipGetLastError()); HIP_CHECK( hipMemcpyAsync(C1_h, C_d, Nbytes, diff --git a/catch/unit/stream/hipStreamCreateWithFlags.cc b/catch/unit/stream/hipStreamCreateWithFlags.cc index ef38755028..5038037452 100644 --- a/catch/unit/stream/hipStreamCreateWithFlags.cc +++ b/catch/unit/stream/hipStreamCreateWithFlags.cc @@ -19,6 +19,7 @@ THE SOFTWARE. #include #include +#include namespace hipStreamCreateWithFlagsTests { @@ -69,11 +70,11 @@ TEST_CASE("Unit_hipStreamCreateWithFlags_DefaultStreamInteraction") { constexpr auto delay = std::chrono::milliseconds(500); SECTION("default stream waiting for created stream") { - HipTest::runKernelForDuration(delay, stream); + LaunchDelayKernel(delay, stream); REQUIRE(hipStreamQuery(defaultStream) == expectedError); } SECTION("created stream waiting for default stream") { - HipTest::runKernelForDuration(delay, defaultStream); + LaunchDelayKernel(delay, defaultStream); REQUIRE(hipStreamQuery(stream) == expectedError); } diff --git a/catch/unit/stream/hipStreamDestroy.cc b/catch/unit/stream/hipStreamDestroy.cc index 7c7658e64b..a2c0f287ec 100644 --- a/catch/unit/stream/hipStreamDestroy.cc +++ b/catch/unit/stream/hipStreamDestroy.cc @@ -18,6 +18,7 @@ THE SOFTWARE. */ #include #include +#include namespace hipStreamDestroyTests { @@ -80,7 +81,7 @@ TEST_CASE("Unit_hipStreamDestroy_WithPendingWork") { HIP_CHECK(hipMalloc(&deviceData, sizeof(int) * numDataPoints)); HIP_CHECK(hipMemset(deviceData, 0, sizeof(int) * numDataPoints)); - HipTest::runKernelForDuration(std::chrono::milliseconds(500), stream); + LaunchDelayKernel(std::chrono::milliseconds(500), stream); setToOne<<<1, numDataPoints, 0, stream>>>(deviceData, numDataPoints); HIP_CHECK_ERROR(hipStreamQuery(stream), hipErrorNotReady); HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady); diff --git a/catch/unit/stream/hipStreamQuery.cc b/catch/unit/stream/hipStreamQuery.cc index 8b99dc712c..6a1f306c6c 100644 --- a/catch/unit/stream/hipStreamQuery.cc +++ b/catch/unit/stream/hipStreamQuery.cc @@ -19,7 +19,7 @@ THE SOFTWARE. #include #include "streamCommon.hh" - +#include /** * @brief Check that querying a stream with no work returns hipSuccess * @@ -101,7 +101,7 @@ TEST_CASE("Unit_hipStreamQuery_SubmitWorkOnStreamAndQueryNullStream") { HIP_CHECK(hipStreamCreate(&stream)); HIP_CHECK(hipStreamQuery(hip::nullStream)); - HipTest::runKernelForDuration(std::chrono::milliseconds(500), stream); + LaunchDelayKernel(std::chrono::milliseconds(500), stream); HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady); HIP_CHECK(hipDeviceSynchronize()); @@ -116,7 +116,7 @@ TEST_CASE("Unit_hipStreamQuery_SubmitWorkOnStreamAndQueryNullStream") { */ TEST_CASE("Unit_hipStreamQuery_NullStreamQuery") { HIP_CHECK(hipStreamQuery(hip::nullStream)); - HipTest::runKernelForDuration(std::chrono::milliseconds(500), hip::nullStream); + LaunchDelayKernel(std::chrono::milliseconds(500), hip::nullStream); HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady); HIP_CHECK(hipStreamSynchronize(hip::nullStream)); @@ -130,8 +130,7 @@ TEST_CASE("Unit_hipStreamQuery_WithPendingWork") { hipStream_t waitingStream{nullptr}; HIP_CHECK(hipStreamCreate(&waitingStream)); - HipTest::runKernelForDuration(std::chrono::milliseconds(500), waitingStream); - + LaunchDelayKernel(std::chrono::milliseconds(500), waitingStream); HIP_CHECK_ERROR(hipStreamQuery(waitingStream), hipErrorNotReady); HIP_CHECK(hipStreamSynchronize(waitingStream)); HIP_CHECK(hipStreamQuery(waitingStream)); diff --git a/catch/unit/stream/hipStreamSynchronize.cc b/catch/unit/stream/hipStreamSynchronize.cc index a31546a52a..d1faaa4f04 100644 --- a/catch/unit/stream/hipStreamSynchronize.cc +++ b/catch/unit/stream/hipStreamSynchronize.cc @@ -19,7 +19,7 @@ THE SOFTWARE. #include #include "streamCommon.hh" - +#include namespace hipStreamSynchronizeTest { /** @@ -62,7 +62,7 @@ TEST_CASE("Unit_hipStreamSynchronize_FinishWork") { HIP_CHECK(hipStreamCreate(&stream)); } - HipTest::runKernelForDuration(std::chrono::milliseconds(500), stream); + LaunchDelayKernel(std::chrono::milliseconds(500), stream); HIP_CHECK(hipStreamSynchronize(stream)); HIP_CHECK(hipStreamQuery(stream)); @@ -86,15 +86,15 @@ TEST_CASE("Unit_hipStreamSynchronize_NullStreamSynchronization") { } for (int i = 0; i < totalStreams; ++i) { - HipTest::runKernelForDuration(std::chrono::milliseconds(1000), streams[i]); + LaunchDelayKernel(std::chrono::milliseconds(1000), streams[i]); } + HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady); + for (int i = 0; i < totalStreams; ++i) { HIP_CHECK_ERROR(hipStreamQuery(streams[i]), hipErrorNotReady); } - HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady); - HIP_CHECK(hipStreamSynchronize(hip::nullStream)); HIP_CHECK(hipStreamQuery(hip::nullStream)); @@ -123,8 +123,8 @@ TEST_CASE("Unit_hipStreamSynchronize_SynchronizeStreamAndQueryNullStream") { HIP_CHECK(hipStreamCreate(&stream1)); HIP_CHECK(hipStreamCreate(&stream2)); - HipTest::runKernelForDuration(std::chrono::milliseconds(500), stream1); - HipTest::runKernelForDuration(std::chrono::milliseconds(2000), stream2); + LaunchDelayKernel(std::chrono::milliseconds(500), stream1); + LaunchDelayKernel(std::chrono::milliseconds(2000), stream2); SECTION("Do not use NullStream") {} SECTION("Submit Kernel to NullStream") { @@ -157,10 +157,10 @@ TEST_CASE("Unit_hipStreamSynchronize_SynchronizeStreamAndQueryNullStream") { * */ TEST_CASE("Unit_hipStreamSynchronize_NullStreamAndStreamPerThread") { - HipTest::runKernelForDuration(std::chrono::milliseconds(500), hip::streamPerThread); + LaunchDelayKernel(std::chrono::milliseconds(500), hip::streamPerThread); HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady); HIP_CHECK_ERROR(hipStreamQuery(hip::streamPerThread), hipErrorNotReady); - HipTest::runKernelForDuration(std::chrono::milliseconds(500), hip::nullStream); + LaunchDelayKernel(std::chrono::milliseconds(500), hip::nullStream); HIP_CHECK(hipStreamSynchronize(hip::nullStream)) HIP_CHECK_ERROR(hipStreamQuery(hip::streamPerThread), hipSuccess); HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipSuccess); diff --git a/catch/unit/stream/hipStreamWaitEvent.cc b/catch/unit/stream/hipStreamWaitEvent.cc index 7965bc19ef..9877d548c1 100644 --- a/catch/unit/stream/hipStreamWaitEvent.cc +++ b/catch/unit/stream/hipStreamWaitEvent.cc @@ -25,7 +25,7 @@ Unit_hipStreamWaitEvent_DifferentStreams - Test waiting for an event on a differ */ #include - +#include TEST_CASE("Unit_hipStreamWaitEvent_Negative") { enum class StreamTestType { NullStream = 0, StreamPerThread, CreatedStream }; @@ -79,35 +79,6 @@ TEST_CASE("Unit_hipStreamWaitEvent_UninitializedStream_Negative") { } #endif -// Since we can not use atomic*_system on every gpu, we will use wait based on clock rate. -// This wont be accurate since clock rate of a GPU varies depending on many variables including -// thermals, load, utilization -__global__ void waitKernel(int clockRate, int seconds) { - auto start = clock(); - auto ms = seconds * 1000; - long long waitTill = clockRate * (long long)ms; - while (1) { - auto end = clock(); - if ((end - start) > waitTill) { - return; - } - } -} - -__global__ void waitKernel_gfx11(int clockRate, int seconds) { -#if HT_AMD - auto start = wall_clock64(); - auto ms = seconds * 1000; - long long waitTill = clockRate * (long long)ms; - while (1) { - auto end = wall_clock64(); - if ((end - start) > waitTill) { - return; - } - } -#endif -} - TEST_CASE("Unit_hipStreamWaitEvent_Default") { hipStream_t stream{nullptr}; hipEvent_t waitEvent{nullptr}; @@ -118,15 +89,7 @@ TEST_CASE("Unit_hipStreamWaitEvent_Default") { REQUIRE(stream != nullptr); REQUIRE(waitEvent != nullptr); - int deviceId {}; - HIP_CHECK(hipGetDevice(&deviceId)); - - hipDeviceProp_t prop{}; - HIP_CHECK(hipGetDeviceProperties(&prop, deviceId)); - auto clockRate = prop.clockRate; - - auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel; - waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 2); // Wait for 2 seconds + LaunchDelayKernel(std::chrono::milliseconds(2000), stream); HIP_CHECK(hipEventRecord(waitEvent, stream)); @@ -154,15 +117,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") { REQUIRE(streamBlockedOnStreamA != nullptr); REQUIRE(waitEvent != nullptr); - int deviceId {}; - HIP_CHECK(hipGetDevice(&deviceId)); + LaunchDelayKernel(std::chrono::milliseconds(3000), blockedStreamA); - hipDeviceProp_t prop{}; - HIP_CHECK(hipGetDeviceProperties(&prop, deviceId)); - auto clockRate = prop.clockRate; - auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel; - waitKernel_used<<<1, 1, 0, blockedStreamA>>>(clockRate, - 3); // wait for 3 seconds HIP_CHECK(hipEventRecord(waitEvent, blockedStreamA)); // Make sure stream is waiting for data to be set @@ -170,7 +126,7 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") { HIP_CHECK(hipStreamWaitEvent(streamBlockedOnStreamA, waitEvent, 0)); - waitKernel_used<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2); // Wait for 2 seconds + LaunchDelayKernel(std::chrono::milliseconds(2000), streamBlockedOnStreamA); HIP_CHECK(hipStreamSynchronize(unblockingStream));