SWDEV-408958 - Use LaunchDelayKernel and modify it to use same kernel based on real time clock for gfx10 and gfx11. (#370)

Change-Id: Iea8a48e8cbfa1745c7d5535dc5820133a4104087
This commit is contained in:
ROCm CI Service Account
2023-07-20 10:16:56 +05:30
zatwierdzone przez GitHub
rodzic faa2dd7cfb
commit 04080c2e2e
14 zmienionych plików z 79 dodań i 209 usunięć
-81
Wyświetl plik
@@ -350,87 +350,6 @@ template <> struct MemTraits<MemcpyAsync> {
}
};
namespace {
static __global__ void waitKernel(size_t offset) {
auto start = clock();
while ((clock() - start) < offset) {
}
}
static __global__ void waitKernel_gfx11(size_t offset) {
#if HT_AMD
auto start = wall_clock64();
while ((wall_clock64() - start) < offset) {
}
#endif
}
// helper function used to set the device frequency variable
// estimates the number of clock ticks in 1 second
static size_t findTicksPerSecond() {
// first read the reported clockRate as a starting point
hipDeviceProp_t prop;
int device;
HIP_CHECK(hipGetDevice(&device));
HIP_CHECK(hipGetDeviceProperties(&prop, device));
size_t devFreq = static_cast<size_t>(prop.clockRate); // in kHz
size_t clockTicksPerSecond = devFreq * 1000;
// init
hipEvent_t start, stop;
HIP_CHECK(hipEventCreate(&start));
HIP_CHECK(hipEventCreate(&stop));
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
// Warmup
hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipDeviceSynchronize());
// try 10 times to find device frequency
// after 10 attempts the result is likely good enough so just accept it
for (int attempts = 10; attempts > 0; --attempts) {
HIP_CHECK(hipEventRecord(start));
hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
HIP_CHECK(hipEventRecord(stop));
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipEventSynchronize(stop));
float executionTimeMs = 0;
HIP_CHECK(hipEventElapsedTime(&executionTimeMs, start, stop));
constexpr float tolerance = 20;
if (fabs(executionTimeMs - 1000) <= tolerance) {
// Timing is within accepted tolerance, break here
break;
} else {
clockTicksPerSecond = (clockTicksPerSecond * 1000) / executionTimeMs;
--attempts;
}
}
// deinit
HIP_CHECK(hipEventDestroy(start));
HIP_CHECK(hipEventDestroy(stop));
return clockTicksPerSecond;
}
} // namespace
// Launches a kernel which runs for specified amount of time
// Note: The current implementation uses HIP_CHECK which is not thread safe!
// Note: the function assumes execution on a single device and caches the number of clock ticks per
// second
static inline void runKernelForDuration(std::chrono::milliseconds duration,
hipStream_t stream = nullptr) {
// number of clocks the device is running at (device frequency)
// each translation unit will have a copy of ticksPerSecond but this function isn't designed for
// precision so that's acceptable.
static size_t ticksPerSecond = findTicksPerSecond();
const auto millis = duration.count();
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000);
}
class BlockingContext {
std::atomic_bool blocked{true};
hipStream_t stream;
+17 -9
Wyświetl plik
@@ -122,9 +122,17 @@ template <typename T> __global__ void VectorSet(T* const vec, const T value, siz
// Will execute for atleast interval milliseconds
static __global__ void Delay(uint32_t interval, const uint32_t ticks_per_ms) {
while (interval--) {
uint64_t start = clock();
while (clock() - start < ticks_per_ms) {
#if HT_AMD
uint64_t start = wall_clock64();
while (wall_clock64() - start < ticks_per_ms) {
__builtin_amdgcn_s_sleep(10);
}
#endif
#if HT_NVIDIA
uint64_t start = clock64();
while (clock64() - start < ticks_per_ms) {
}
#endif
}
}
@@ -140,14 +148,14 @@ __global__ void Iota(T* const out, size_t pitch, size_t w, size_t h, size_t d) {
}
}
inline void LaunchDelayKernel(const std::chrono::milliseconds interval, const hipStream_t stream) {
inline void LaunchDelayKernel(const std::chrono::milliseconds interval, const hipStream_t stream = nullptr) {
int ticks_per_ms = 0;
// Clock rate is in kHz => number of clock ticks in a millisecond
if (IsGfx11()) {
HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeWallClockRate, 0));
} else {
HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0));
}
#if HT_AMD
HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeWallClockRate, 0));
#endif
#if HT_NVIDIA
HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0));
#endif
Delay<<<1, 1, 0, stream>>>(interval.count(), ticks_per_ms);
}
@@ -23,7 +23,7 @@ THE SOFTWARE.
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <iostream>
#include <utils.hh>
/**
* @addtogroup hipEventElapsedTime hipEventElapsedTime
* @{
@@ -158,10 +158,7 @@ TEST_CASE("Unit_hipEventElapsedTime_NotReady_Negative") {
// Record start event
HIP_CHECK(hipEventRecord(start, nullptr));
HipTest::BlockingContext b_context{nullptr};
b_context.block_stream(); // blocked stream
REQUIRE(b_context.is_blocked());
LaunchDelayKernel(std::chrono::milliseconds(1000));
// Record stop event
HIP_CHECK(hipEventRecord(stop, nullptr));
@@ -169,7 +166,6 @@ TEST_CASE("Unit_hipEventElapsedTime_NotReady_Negative") {
float tElapsed = 1.0f;
HIP_CHECK_ERROR(hipEventQuery(stop), hipErrorNotReady);
HIP_ASSERT(hipEventElapsedTime(&tElapsed, start, stop) == hipErrorNotReady);
b_context.unblock_stream();
HIP_CHECK(hipStreamSynchronize(nullptr));
HIP_CHECK(hipEventDestroy(start));
+23 -8
Wyświetl plik
@@ -18,7 +18,29 @@ THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <utils.hh>
/**
* @addtogroup hipEventQuery hipEventQuery
* @{
* @ingroup EventTest
* `hipEventQuery(hipEvent_t event)` -
* Query the status of the specified event.
* ________________________
* Test cases from other modules:
* - @ref Unit_hipEventIpc
*/
/**
* Test Description
* ------------------------
* - Query events with a single and with multiple devices.
* Test source
* ------------------------
* - unit/event/Unit_hipEventQuery.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
hipEvent_t event1{}, event2{};
HIP_CHECK(hipEventCreate(&event1));
@@ -30,15 +52,10 @@ TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
HIP_CHECK(hipStreamCreate(&stream));
REQUIRE(stream != nullptr);
HipTest::BlockingContext b_context1{stream}; // og context
// Block stream
{
HIP_CHECK(hipSetDevice(0));
HIP_CHECK(hipEventRecord(event1, stream));
b_context1.block_stream(); // blocked stream
REQUIRE(b_context1.is_blocked());
LaunchDelayKernel(std::chrono::milliseconds(3000), stream);
HIP_CHECK(hipEventRecord(event2, stream));
HIP_CHECK(hipEventSynchronize(event1));
@@ -58,8 +75,6 @@ TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
HIP_CHECK(hipEventQuery(event1));
HIP_CHECK_ERROR(hipEventQuery(event2), hipErrorNotReady);
b_context1.unblock_stream();
HIP_CHECK(hipEventSynchronize(event2));
// Query, should be done now
+2 -2
Wyświetl plik
@@ -25,7 +25,7 @@ THE SOFTWARE.
#include <hip_test_kernels.hh>
#include <hip_test_common.hh>
#include "hip/hip_runtime_api.h"
#include <utils.hh>
/**
* @addtogroup hipEventDestroy hipEventDestroy
* @{
@@ -53,7 +53,7 @@ static inline void launchVectorAdd(float*& A_h, float*& B_h, float*& C_h,
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&A_d), A_h, 0));
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&B_d), B_h, 0));
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&C_d), C_h, 0));
HipTest::runKernelForDuration(delay, stream);
LaunchDelayKernel(delay, stream);
HipTest::vectorADD<<<1, 1, 0, stream>>>(A_d, B_d, C_d, vectorSize);
}
+6 -6
Wyświetl plik
@@ -25,7 +25,7 @@ THE SOFTWARE.
#include <hip_array_common.hh>
#include "hipArrayCommon.hh"
#include "DriverContext.hh"
#include <utils.hh>
/*
* This testcase verifies [ hipFree || hipFreeArray || hipFreeType::ArrayDestroy ||
* hipFreeType::HostFree with hipHostMalloc ]
@@ -54,7 +54,7 @@ TEST_CASE("Unit_hipFreeImplicitSyncDev") {
size_t size_mult = GENERATE(1, 32, 64, 128, 256);
HIP_CHECK(hipMalloc(&devPtr, sizeof(*devPtr) * size_mult));
HipTest::runKernelForDuration(delay);
LaunchDelayKernel(delay);
// make sure device is busy
HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady);
HIP_CHECK(hipFree(devPtr));
@@ -67,7 +67,7 @@ TEST_CASE("Unit_hipFreeImplicitSyncHost") {
HIP_CHECK(hipHostMalloc(&hostPtr, sizeof(*hostPtr) * size_mult));
HipTest::runKernelForDuration(delay);
LaunchDelayKernel(delay);
// make sure device is busy
HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady);
HIP_CHECK(hipHostFree(hostPtr));
@@ -88,7 +88,7 @@ TEMPLATE_TEST_CASE("Unit_hipFreeImplicitSyncArray", "", char, float, float2, flo
hipChannelFormatDesc desc = hipCreateChannelDesc<TestType>();
HIP_CHECK(hipMallocArray(&arrayPtr, &desc, width, height, hipArrayDefault));
HipTest::runKernelForDuration(delay);
LaunchDelayKernel(delay);
// make sure device is busy
HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady);
HIP_CHECK(hipFreeArray(arrayPtr));
@@ -103,7 +103,7 @@ TEMPLATE_TEST_CASE("Unit_hipFreeImplicitSyncArray", "", char, float, float2, flo
cuDesc.Format = vec_info::format;
cuDesc.NumChannels = vec_info::size;
HIP_CHECK(hipArrayCreate(&cuArrayPtr, &cuDesc));
HipTest::runKernelForDuration(delay);
LaunchDelayKernel(delay);
// make sure device is busy
HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady);
HIP_CHECK(hipArrayDestroy(cuArrayPtr));
@@ -120,7 +120,7 @@ TEMPLATE_TEST_CASE("Unit_hipFreeImplicitSyncArray", "", char, float, float2, flo
hipChannelFormatDesc desc = hipCreateChannelDesc<TestType>();
HIP_CHECK(hipMallocArray(&arrayPtr, &desc, extent.width, extent.height, hipArrayDefault));
HipTest::runKernelForDuration(delay);
LaunchDelayKernel(delay);
// make sure device is busy
HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady);
// Second free segfaults
+2 -2
Wyświetl plik
@@ -21,7 +21,7 @@ THE SOFTWARE.
*/
#include "MemUtils.hh"
#include <utils.hh>
/*
* These testcases verify that synchronization behaviour for memcpy functions with respect to
* the host.
@@ -156,7 +156,7 @@ static void runMemcpyTests(hipStream_t stream, bool async, allocType type, memTy
using namespace std::chrono_literals;
const std::chrono::duration<uint64_t, std::milli> delay = 100ms;
HipTest::runKernelForDuration(delay, stream);
LaunchDelayKernel(delay, stream);
memcpyCheck(type, memType, aPtr.first, data, fillerData, async, stream, fromHost);
checkForSync(stream, async, type, fromHost);
+2 -1
Wyświetl plik
@@ -21,6 +21,7 @@ THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <utils.hh>
/*
* These testcases verify that synchronous memset functions are asynchronous with respect to the
* host except when the target is pinned host memory or a Unified Memory region
@@ -406,7 +407,7 @@ void runTests(allocType type, memSetType memsetType, MultiDData data, hipStream_
std::pair<T*, T*> aPtr = initMemory<T>(type, memsetType, data);
using namespace std::chrono_literals;
const std::chrono::duration<uint64_t, std::milli> delay = 100ms;
HipTest::runKernelForDuration(delay, stream);
LaunchDelayKernel(delay, stream);
memsetCheck(aPtr.first, testValue, memsetType, data, async, stream);
if (async || type == allocType::deviceMalloc) {
@@ -25,6 +25,7 @@ multiple Threads.
#include <hip_test_common.hh>
#include <atomic>
#include <utils.hh>
static constexpr size_t N = 4096;
static constexpr int numThreads = 1000;
@@ -43,33 +44,6 @@ static __global__ void device_function(float* C_d, float* A_d, size_t Num) {
for (size_t i = gputhread; i < Num; i += stride) {
C_d[i] = A_d[i] * A_d[i];
}
// Delay thread 1 only in the GPU
if (gputhread == 1) {
uint64_t wait_t = 3200000000, start = clock64(), cur;
do {
cur = clock64() - start;
} while (cur < wait_t);
}
}
static __global__ void device_function_gfx11(float* C_d, float* A_d, size_t Num) {
#if HT_AMD
size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x;
for (size_t i = gputhread; i < Num; i += stride) {
C_d[i] = A_d[i] * A_d[i];
}
// Delay thread 1 only in the GPU
if (gputhread == 1) {
uint64_t wait_t = 3200000000, start = wall_clock64(), cur;
do {
cur = wall_clock64() - start;
} while (cur < wait_t);
}
#endif
}
static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status,
@@ -146,10 +120,10 @@ TEST_CASE("Unit_hipStreamAddCallback_MultipleThreads") {
constexpr unsigned threadsPerBlock = 256;
constexpr unsigned blocks = (N + 255)/threadsPerBlock;
auto device_function_used = IsGfx11() ? device_function_gfx11 : device_function;
hipLaunchKernelGGL((device_function_used), dim3(blocks),
hipLaunchKernelGGL((device_function), dim3(blocks),
dim3(threadsPerBlock), 0,
mystream, C_d, A_d, N);
LaunchDelayKernel(std::chrono::milliseconds(2000), mystream);
HIP_CHECK(hipGetLastError());
HIP_CHECK(
hipMemcpyAsync(C1_h, C_d, Nbytes,
@@ -19,6 +19,7 @@ THE SOFTWARE.
#include <chrono>
#include <hip_test_common.hh>
#include <utils.hh>
namespace hipStreamCreateWithFlagsTests {
@@ -69,11 +70,11 @@ TEST_CASE("Unit_hipStreamCreateWithFlags_DefaultStreamInteraction") {
constexpr auto delay = std::chrono::milliseconds(500);
SECTION("default stream waiting for created stream") {
HipTest::runKernelForDuration(delay, stream);
LaunchDelayKernel(delay, stream);
REQUIRE(hipStreamQuery(defaultStream) == expectedError);
}
SECTION("created stream waiting for default stream") {
HipTest::runKernelForDuration(delay, defaultStream);
LaunchDelayKernel(delay, defaultStream);
REQUIRE(hipStreamQuery(stream) == expectedError);
}
+2 -1
Wyświetl plik
@@ -18,6 +18,7 @@ THE SOFTWARE.
*/
#include <chrono>
#include <hip_test_common.hh>
#include <utils.hh>
namespace hipStreamDestroyTests {
@@ -80,7 +81,7 @@ TEST_CASE("Unit_hipStreamDestroy_WithPendingWork") {
HIP_CHECK(hipMalloc(&deviceData, sizeof(int) * numDataPoints));
HIP_CHECK(hipMemset(deviceData, 0, sizeof(int) * numDataPoints));
HipTest::runKernelForDuration(std::chrono::milliseconds(500), stream);
LaunchDelayKernel(std::chrono::milliseconds(500), stream);
setToOne<<<1, numDataPoints, 0, stream>>>(deviceData, numDataPoints);
HIP_CHECK_ERROR(hipStreamQuery(stream), hipErrorNotReady);
HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady);
+4 -5
Wyświetl plik
@@ -19,7 +19,7 @@ THE SOFTWARE.
#include <hip_test_common.hh>
#include "streamCommon.hh"
#include <utils.hh>
/**
* @brief Check that querying a stream with no work returns hipSuccess
*
@@ -101,7 +101,7 @@ TEST_CASE("Unit_hipStreamQuery_SubmitWorkOnStreamAndQueryNullStream") {
HIP_CHECK(hipStreamCreate(&stream));
HIP_CHECK(hipStreamQuery(hip::nullStream));
HipTest::runKernelForDuration(std::chrono::milliseconds(500), stream);
LaunchDelayKernel(std::chrono::milliseconds(500), stream);
HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady);
HIP_CHECK(hipDeviceSynchronize());
@@ -116,7 +116,7 @@ TEST_CASE("Unit_hipStreamQuery_SubmitWorkOnStreamAndQueryNullStream") {
*/
TEST_CASE("Unit_hipStreamQuery_NullStreamQuery") {
HIP_CHECK(hipStreamQuery(hip::nullStream));
HipTest::runKernelForDuration(std::chrono::milliseconds(500), hip::nullStream);
LaunchDelayKernel(std::chrono::milliseconds(500), hip::nullStream);
HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady);
HIP_CHECK(hipStreamSynchronize(hip::nullStream));
@@ -130,8 +130,7 @@ TEST_CASE("Unit_hipStreamQuery_WithPendingWork") {
hipStream_t waitingStream{nullptr};
HIP_CHECK(hipStreamCreate(&waitingStream));
HipTest::runKernelForDuration(std::chrono::milliseconds(500), waitingStream);
LaunchDelayKernel(std::chrono::milliseconds(500), waitingStream);
HIP_CHECK_ERROR(hipStreamQuery(waitingStream), hipErrorNotReady);
HIP_CHECK(hipStreamSynchronize(waitingStream));
HIP_CHECK(hipStreamQuery(waitingStream));
@@ -19,7 +19,7 @@ THE SOFTWARE.
#include <hip_test_common.hh>
#include "streamCommon.hh"
#include <utils.hh>
namespace hipStreamSynchronizeTest {
/**
@@ -62,7 +62,7 @@ TEST_CASE("Unit_hipStreamSynchronize_FinishWork") {
HIP_CHECK(hipStreamCreate(&stream));
}
HipTest::runKernelForDuration(std::chrono::milliseconds(500), stream);
LaunchDelayKernel(std::chrono::milliseconds(500), stream);
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipStreamQuery(stream));
@@ -86,15 +86,15 @@ TEST_CASE("Unit_hipStreamSynchronize_NullStreamSynchronization") {
}
for (int i = 0; i < totalStreams; ++i) {
HipTest::runKernelForDuration(std::chrono::milliseconds(1000), streams[i]);
LaunchDelayKernel(std::chrono::milliseconds(1000), streams[i]);
}
HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady);
for (int i = 0; i < totalStreams; ++i) {
HIP_CHECK_ERROR(hipStreamQuery(streams[i]), hipErrorNotReady);
}
HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady);
HIP_CHECK(hipStreamSynchronize(hip::nullStream));
HIP_CHECK(hipStreamQuery(hip::nullStream));
@@ -123,8 +123,8 @@ TEST_CASE("Unit_hipStreamSynchronize_SynchronizeStreamAndQueryNullStream") {
HIP_CHECK(hipStreamCreate(&stream1));
HIP_CHECK(hipStreamCreate(&stream2));
HipTest::runKernelForDuration(std::chrono::milliseconds(500), stream1);
HipTest::runKernelForDuration(std::chrono::milliseconds(2000), stream2);
LaunchDelayKernel(std::chrono::milliseconds(500), stream1);
LaunchDelayKernel(std::chrono::milliseconds(2000), stream2);
SECTION("Do not use NullStream") {}
SECTION("Submit Kernel to NullStream") {
@@ -157,10 +157,10 @@ TEST_CASE("Unit_hipStreamSynchronize_SynchronizeStreamAndQueryNullStream") {
*
*/
TEST_CASE("Unit_hipStreamSynchronize_NullStreamAndStreamPerThread") {
HipTest::runKernelForDuration(std::chrono::milliseconds(500), hip::streamPerThread);
LaunchDelayKernel(std::chrono::milliseconds(500), hip::streamPerThread);
HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady);
HIP_CHECK_ERROR(hipStreamQuery(hip::streamPerThread), hipErrorNotReady);
HipTest::runKernelForDuration(std::chrono::milliseconds(500), hip::nullStream);
LaunchDelayKernel(std::chrono::milliseconds(500), hip::nullStream);
HIP_CHECK(hipStreamSynchronize(hip::nullStream))
HIP_CHECK_ERROR(hipStreamQuery(hip::streamPerThread), hipSuccess);
HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipSuccess);
+4 -48
Wyświetl plik
@@ -25,7 +25,7 @@ Unit_hipStreamWaitEvent_DifferentStreams - Test waiting for an event on a differ
*/
#include <hip_test_common.hh>
#include <utils.hh>
TEST_CASE("Unit_hipStreamWaitEvent_Negative") {
enum class StreamTestType { NullStream = 0, StreamPerThread, CreatedStream };
@@ -79,35 +79,6 @@ TEST_CASE("Unit_hipStreamWaitEvent_UninitializedStream_Negative") {
}
#endif
// Since we can not use atomic*_system on every gpu, we will use wait based on clock rate.
// This wont be accurate since clock rate of a GPU varies depending on many variables including
// thermals, load, utilization
__global__ void waitKernel(int clockRate, int seconds) {
auto start = clock();
auto ms = seconds * 1000;
long long waitTill = clockRate * (long long)ms;
while (1) {
auto end = clock();
if ((end - start) > waitTill) {
return;
}
}
}
__global__ void waitKernel_gfx11(int clockRate, int seconds) {
#if HT_AMD
auto start = wall_clock64();
auto ms = seconds * 1000;
long long waitTill = clockRate * (long long)ms;
while (1) {
auto end = wall_clock64();
if ((end - start) > waitTill) {
return;
}
}
#endif
}
TEST_CASE("Unit_hipStreamWaitEvent_Default") {
hipStream_t stream{nullptr};
hipEvent_t waitEvent{nullptr};
@@ -118,15 +89,7 @@ TEST_CASE("Unit_hipStreamWaitEvent_Default") {
REQUIRE(stream != nullptr);
REQUIRE(waitEvent != nullptr);
int deviceId {};
HIP_CHECK(hipGetDevice(&deviceId));
hipDeviceProp_t prop{};
HIP_CHECK(hipGetDeviceProperties(&prop, deviceId));
auto clockRate = prop.clockRate;
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 2); // Wait for 2 seconds
LaunchDelayKernel(std::chrono::milliseconds(2000), stream);
HIP_CHECK(hipEventRecord(waitEvent, stream));
@@ -154,15 +117,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") {
REQUIRE(streamBlockedOnStreamA != nullptr);
REQUIRE(waitEvent != nullptr);
int deviceId {};
HIP_CHECK(hipGetDevice(&deviceId));
LaunchDelayKernel(std::chrono::milliseconds(3000), blockedStreamA);
hipDeviceProp_t prop{};
HIP_CHECK(hipGetDeviceProperties(&prop, deviceId));
auto clockRate = prop.clockRate;
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
waitKernel_used<<<1, 1, 0, blockedStreamA>>>(clockRate,
3); // wait for 3 seconds
HIP_CHECK(hipEventRecord(waitEvent, blockedStreamA));
// Make sure stream is waiting for data to be set
@@ -170,7 +126,7 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") {
HIP_CHECK(hipStreamWaitEvent(streamBlockedOnStreamA, waitEvent, 0));
waitKernel_used<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2); // Wait for 2 seconds
LaunchDelayKernel(std::chrono::milliseconds(2000), streamBlockedOnStreamA);
HIP_CHECK(hipStreamSynchronize(unblockingStream));