SWDEV-408958 - Use LaunchDelayKernel and modify it to use same kernel based on real time clock for gfx10 and gfx11. (#370)
Change-Id: Iea8a48e8cbfa1745c7d5535dc5820133a4104087
This commit is contained in:
zatwierdzone przez
GitHub
rodzic
faa2dd7cfb
commit
04080c2e2e
@@ -350,87 +350,6 @@ template <> struct MemTraits<MemcpyAsync> {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
namespace {
|
||||
static __global__ void waitKernel(size_t offset) {
|
||||
auto start = clock();
|
||||
while ((clock() - start) < offset) {
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void waitKernel_gfx11(size_t offset) {
|
||||
#if HT_AMD
|
||||
auto start = wall_clock64();
|
||||
while ((wall_clock64() - start) < offset) {
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// helper function used to set the device frequency variable
|
||||
// estimates the number of clock ticks in 1 second
|
||||
static size_t findTicksPerSecond() {
|
||||
// first read the reported clockRate as a starting point
|
||||
hipDeviceProp_t prop;
|
||||
int device;
|
||||
HIP_CHECK(hipGetDevice(&device));
|
||||
HIP_CHECK(hipGetDeviceProperties(&prop, device));
|
||||
size_t devFreq = static_cast<size_t>(prop.clockRate); // in kHz
|
||||
size_t clockTicksPerSecond = devFreq * 1000;
|
||||
|
||||
// init
|
||||
hipEvent_t start, stop;
|
||||
HIP_CHECK(hipEventCreate(&start));
|
||||
HIP_CHECK(hipEventCreate(&stop));
|
||||
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
|
||||
// Warmup
|
||||
hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
// try 10 times to find device frequency
|
||||
// after 10 attempts the result is likely good enough so just accept it
|
||||
for (int attempts = 10; attempts > 0; --attempts) {
|
||||
HIP_CHECK(hipEventRecord(start));
|
||||
hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
|
||||
HIP_CHECK(hipEventRecord(stop));
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipEventSynchronize(stop));
|
||||
|
||||
float executionTimeMs = 0;
|
||||
HIP_CHECK(hipEventElapsedTime(&executionTimeMs, start, stop));
|
||||
|
||||
constexpr float tolerance = 20;
|
||||
if (fabs(executionTimeMs - 1000) <= tolerance) {
|
||||
// Timing is within accepted tolerance, break here
|
||||
break;
|
||||
} else {
|
||||
clockTicksPerSecond = (clockTicksPerSecond * 1000) / executionTimeMs;
|
||||
--attempts;
|
||||
}
|
||||
}
|
||||
|
||||
// deinit
|
||||
HIP_CHECK(hipEventDestroy(start));
|
||||
HIP_CHECK(hipEventDestroy(stop));
|
||||
return clockTicksPerSecond;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Launches a kernel which runs for specified amount of time
|
||||
// Note: The current implementation uses HIP_CHECK which is not thread safe!
|
||||
// Note: the function assumes execution on a single device and caches the number of clock ticks per
|
||||
// second
|
||||
static inline void runKernelForDuration(std::chrono::milliseconds duration,
|
||||
hipStream_t stream = nullptr) {
|
||||
// number of clocks the device is running at (device frequency)
|
||||
// each translation unit will have a copy of ticksPerSecond but this function isn't designed for
|
||||
// precision so that's acceptable.
|
||||
static size_t ticksPerSecond = findTicksPerSecond();
|
||||
const auto millis = duration.count();
|
||||
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
|
||||
hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000);
|
||||
}
|
||||
|
||||
class BlockingContext {
|
||||
std::atomic_bool blocked{true};
|
||||
hipStream_t stream;
|
||||
|
||||
@@ -122,9 +122,17 @@ template <typename T> __global__ void VectorSet(T* const vec, const T value, siz
|
||||
// Will execute for atleast interval milliseconds
|
||||
static __global__ void Delay(uint32_t interval, const uint32_t ticks_per_ms) {
|
||||
while (interval--) {
|
||||
uint64_t start = clock();
|
||||
while (clock() - start < ticks_per_ms) {
|
||||
#if HT_AMD
|
||||
uint64_t start = wall_clock64();
|
||||
while (wall_clock64() - start < ticks_per_ms) {
|
||||
__builtin_amdgcn_s_sleep(10);
|
||||
}
|
||||
#endif
|
||||
#if HT_NVIDIA
|
||||
uint64_t start = clock64();
|
||||
while (clock64() - start < ticks_per_ms) {
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,14 +148,14 @@ __global__ void Iota(T* const out, size_t pitch, size_t w, size_t h, size_t d) {
|
||||
}
|
||||
}
|
||||
|
||||
inline void LaunchDelayKernel(const std::chrono::milliseconds interval, const hipStream_t stream) {
|
||||
inline void LaunchDelayKernel(const std::chrono::milliseconds interval, const hipStream_t stream = nullptr) {
|
||||
int ticks_per_ms = 0;
|
||||
// Clock rate is in kHz => number of clock ticks in a millisecond
|
||||
if (IsGfx11()) {
|
||||
HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeWallClockRate, 0));
|
||||
} else {
|
||||
HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0));
|
||||
}
|
||||
#if HT_AMD
|
||||
HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeWallClockRate, 0));
|
||||
#endif
|
||||
#if HT_NVIDIA
|
||||
HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0));
|
||||
#endif
|
||||
Delay<<<1, 1, 0, stream>>>(interval.count(), ticks_per_ms);
|
||||
}
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ THE SOFTWARE.
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <iostream>
|
||||
|
||||
#include <utils.hh>
|
||||
/**
|
||||
* @addtogroup hipEventElapsedTime hipEventElapsedTime
|
||||
* @{
|
||||
@@ -158,10 +158,7 @@ TEST_CASE("Unit_hipEventElapsedTime_NotReady_Negative") {
|
||||
// Record start event
|
||||
HIP_CHECK(hipEventRecord(start, nullptr));
|
||||
|
||||
HipTest::BlockingContext b_context{nullptr};
|
||||
b_context.block_stream(); // blocked stream
|
||||
REQUIRE(b_context.is_blocked());
|
||||
|
||||
LaunchDelayKernel(std::chrono::milliseconds(1000));
|
||||
// Record stop event
|
||||
HIP_CHECK(hipEventRecord(stop, nullptr));
|
||||
|
||||
@@ -169,7 +166,6 @@ TEST_CASE("Unit_hipEventElapsedTime_NotReady_Negative") {
|
||||
float tElapsed = 1.0f;
|
||||
HIP_CHECK_ERROR(hipEventQuery(stop), hipErrorNotReady);
|
||||
HIP_ASSERT(hipEventElapsedTime(&tElapsed, start, stop) == hipErrorNotReady);
|
||||
b_context.unblock_stream();
|
||||
|
||||
HIP_CHECK(hipStreamSynchronize(nullptr));
|
||||
HIP_CHECK(hipEventDestroy(start));
|
||||
|
||||
@@ -18,7 +18,29 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <utils.hh>
|
||||
/**
|
||||
* @addtogroup hipEventQuery hipEventQuery
|
||||
* @{
|
||||
* @ingroup EventTest
|
||||
* `hipEventQuery(hipEvent_t event)` -
|
||||
* Query the status of the specified event.
|
||||
* ________________________
|
||||
* Test cases from other modules:
|
||||
* - @ref Unit_hipEventIpc
|
||||
*/
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Query events with a single and with multiple devices.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/event/Unit_hipEventQuery.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
|
||||
hipEvent_t event1{}, event2{};
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
@@ -30,15 +52,10 @@ TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
REQUIRE(stream != nullptr);
|
||||
|
||||
HipTest::BlockingContext b_context1{stream}; // og context
|
||||
// Block stream
|
||||
{
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HIP_CHECK(hipEventRecord(event1, stream));
|
||||
|
||||
b_context1.block_stream(); // blocked stream
|
||||
REQUIRE(b_context1.is_blocked());
|
||||
|
||||
LaunchDelayKernel(std::chrono::milliseconds(3000), stream);
|
||||
HIP_CHECK(hipEventRecord(event2, stream));
|
||||
|
||||
HIP_CHECK(hipEventSynchronize(event1));
|
||||
@@ -58,8 +75,6 @@ TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
|
||||
HIP_CHECK(hipEventQuery(event1));
|
||||
HIP_CHECK_ERROR(hipEventQuery(event2), hipErrorNotReady);
|
||||
|
||||
b_context1.unblock_stream();
|
||||
|
||||
HIP_CHECK(hipEventSynchronize(event2));
|
||||
|
||||
// Query, should be done now
|
||||
|
||||
@@ -25,7 +25,7 @@ THE SOFTWARE.
|
||||
#include <hip_test_kernels.hh>
|
||||
#include <hip_test_common.hh>
|
||||
#include "hip/hip_runtime_api.h"
|
||||
|
||||
#include <utils.hh>
|
||||
/**
|
||||
* @addtogroup hipEventDestroy hipEventDestroy
|
||||
* @{
|
||||
@@ -53,7 +53,7 @@ static inline void launchVectorAdd(float*& A_h, float*& B_h, float*& C_h,
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&A_d), A_h, 0));
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&B_d), B_h, 0));
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&C_d), C_h, 0));
|
||||
HipTest::runKernelForDuration(delay, stream);
|
||||
LaunchDelayKernel(delay, stream);
|
||||
HipTest::vectorADD<<<1, 1, 0, stream>>>(A_d, B_d, C_d, vectorSize);
|
||||
}
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ THE SOFTWARE.
|
||||
#include <hip_array_common.hh>
|
||||
#include "hipArrayCommon.hh"
|
||||
#include "DriverContext.hh"
|
||||
|
||||
#include <utils.hh>
|
||||
/*
|
||||
* This testcase verifies [ hipFree || hipFreeArray || hipFreeType::ArrayDestroy ||
|
||||
* hipFreeType::HostFree with hipHostMalloc ]
|
||||
@@ -54,7 +54,7 @@ TEST_CASE("Unit_hipFreeImplicitSyncDev") {
|
||||
size_t size_mult = GENERATE(1, 32, 64, 128, 256);
|
||||
HIP_CHECK(hipMalloc(&devPtr, sizeof(*devPtr) * size_mult));
|
||||
|
||||
HipTest::runKernelForDuration(delay);
|
||||
LaunchDelayKernel(delay);
|
||||
// make sure device is busy
|
||||
HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady);
|
||||
HIP_CHECK(hipFree(devPtr));
|
||||
@@ -67,7 +67,7 @@ TEST_CASE("Unit_hipFreeImplicitSyncHost") {
|
||||
|
||||
HIP_CHECK(hipHostMalloc(&hostPtr, sizeof(*hostPtr) * size_mult));
|
||||
|
||||
HipTest::runKernelForDuration(delay);
|
||||
LaunchDelayKernel(delay);
|
||||
// make sure device is busy
|
||||
HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady);
|
||||
HIP_CHECK(hipHostFree(hostPtr));
|
||||
@@ -88,7 +88,7 @@ TEMPLATE_TEST_CASE("Unit_hipFreeImplicitSyncArray", "", char, float, float2, flo
|
||||
hipChannelFormatDesc desc = hipCreateChannelDesc<TestType>();
|
||||
|
||||
HIP_CHECK(hipMallocArray(&arrayPtr, &desc, width, height, hipArrayDefault));
|
||||
HipTest::runKernelForDuration(delay);
|
||||
LaunchDelayKernel(delay);
|
||||
// make sure device is busy
|
||||
HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady);
|
||||
HIP_CHECK(hipFreeArray(arrayPtr));
|
||||
@@ -103,7 +103,7 @@ TEMPLATE_TEST_CASE("Unit_hipFreeImplicitSyncArray", "", char, float, float2, flo
|
||||
cuDesc.Format = vec_info::format;
|
||||
cuDesc.NumChannels = vec_info::size;
|
||||
HIP_CHECK(hipArrayCreate(&cuArrayPtr, &cuDesc));
|
||||
HipTest::runKernelForDuration(delay);
|
||||
LaunchDelayKernel(delay);
|
||||
// make sure device is busy
|
||||
HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady);
|
||||
HIP_CHECK(hipArrayDestroy(cuArrayPtr));
|
||||
@@ -120,7 +120,7 @@ TEMPLATE_TEST_CASE("Unit_hipFreeImplicitSyncArray", "", char, float, float2, flo
|
||||
hipChannelFormatDesc desc = hipCreateChannelDesc<TestType>();
|
||||
|
||||
HIP_CHECK(hipMallocArray(&arrayPtr, &desc, extent.width, extent.height, hipArrayDefault));
|
||||
HipTest::runKernelForDuration(delay);
|
||||
LaunchDelayKernel(delay);
|
||||
// make sure device is busy
|
||||
HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady);
|
||||
// Second free segfaults
|
||||
|
||||
@@ -21,7 +21,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "MemUtils.hh"
|
||||
|
||||
#include <utils.hh>
|
||||
/*
|
||||
* These testcases verify that synchronization behaviour for memcpy functions with respect to
|
||||
* the host.
|
||||
@@ -156,7 +156,7 @@ static void runMemcpyTests(hipStream_t stream, bool async, allocType type, memTy
|
||||
|
||||
using namespace std::chrono_literals;
|
||||
const std::chrono::duration<uint64_t, std::milli> delay = 100ms;
|
||||
HipTest::runKernelForDuration(delay, stream);
|
||||
LaunchDelayKernel(delay, stream);
|
||||
|
||||
memcpyCheck(type, memType, aPtr.first, data, fillerData, async, stream, fromHost);
|
||||
checkForSync(stream, async, type, fromHost);
|
||||
|
||||
@@ -21,6 +21,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <utils.hh>
|
||||
/*
|
||||
* These testcases verify that synchronous memset functions are asynchronous with respect to the
|
||||
* host except when the target is pinned host memory or a Unified Memory region
|
||||
@@ -406,7 +407,7 @@ void runTests(allocType type, memSetType memsetType, MultiDData data, hipStream_
|
||||
std::pair<T*, T*> aPtr = initMemory<T>(type, memsetType, data);
|
||||
using namespace std::chrono_literals;
|
||||
const std::chrono::duration<uint64_t, std::milli> delay = 100ms;
|
||||
HipTest::runKernelForDuration(delay, stream);
|
||||
LaunchDelayKernel(delay, stream);
|
||||
memsetCheck(aPtr.first, testValue, memsetType, data, async, stream);
|
||||
|
||||
if (async || type == allocType::deviceMalloc) {
|
||||
|
||||
@@ -25,6 +25,7 @@ multiple Threads.
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <atomic>
|
||||
#include <utils.hh>
|
||||
|
||||
static constexpr size_t N = 4096;
|
||||
static constexpr int numThreads = 1000;
|
||||
@@ -43,33 +44,6 @@ static __global__ void device_function(float* C_d, float* A_d, size_t Num) {
|
||||
for (size_t i = gputhread; i < Num; i += stride) {
|
||||
C_d[i] = A_d[i] * A_d[i];
|
||||
}
|
||||
|
||||
// Delay thread 1 only in the GPU
|
||||
if (gputhread == 1) {
|
||||
uint64_t wait_t = 3200000000, start = clock64(), cur;
|
||||
do {
|
||||
cur = clock64() - start;
|
||||
} while (cur < wait_t);
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void device_function_gfx11(float* C_d, float* A_d, size_t Num) {
|
||||
#if HT_AMD
|
||||
size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (size_t i = gputhread; i < Num; i += stride) {
|
||||
C_d[i] = A_d[i] * A_d[i];
|
||||
}
|
||||
|
||||
// Delay thread 1 only in the GPU
|
||||
if (gputhread == 1) {
|
||||
uint64_t wait_t = 3200000000, start = wall_clock64(), cur;
|
||||
do {
|
||||
cur = wall_clock64() - start;
|
||||
} while (cur < wait_t);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status,
|
||||
@@ -146,10 +120,10 @@ TEST_CASE("Unit_hipStreamAddCallback_MultipleThreads") {
|
||||
constexpr unsigned threadsPerBlock = 256;
|
||||
constexpr unsigned blocks = (N + 255)/threadsPerBlock;
|
||||
|
||||
auto device_function_used = IsGfx11() ? device_function_gfx11 : device_function;
|
||||
hipLaunchKernelGGL((device_function_used), dim3(blocks),
|
||||
hipLaunchKernelGGL((device_function), dim3(blocks),
|
||||
dim3(threadsPerBlock), 0,
|
||||
mystream, C_d, A_d, N);
|
||||
LaunchDelayKernel(std::chrono::milliseconds(2000), mystream);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(
|
||||
hipMemcpyAsync(C1_h, C_d, Nbytes,
|
||||
|
||||
@@ -19,6 +19,7 @@ THE SOFTWARE.
|
||||
|
||||
#include <chrono>
|
||||
#include <hip_test_common.hh>
|
||||
#include <utils.hh>
|
||||
|
||||
namespace hipStreamCreateWithFlagsTests {
|
||||
|
||||
@@ -69,11 +70,11 @@ TEST_CASE("Unit_hipStreamCreateWithFlags_DefaultStreamInteraction") {
|
||||
constexpr auto delay = std::chrono::milliseconds(500);
|
||||
|
||||
SECTION("default stream waiting for created stream") {
|
||||
HipTest::runKernelForDuration(delay, stream);
|
||||
LaunchDelayKernel(delay, stream);
|
||||
REQUIRE(hipStreamQuery(defaultStream) == expectedError);
|
||||
}
|
||||
SECTION("created stream waiting for default stream") {
|
||||
HipTest::runKernelForDuration(delay, defaultStream);
|
||||
LaunchDelayKernel(delay, defaultStream);
|
||||
REQUIRE(hipStreamQuery(stream) == expectedError);
|
||||
}
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
#include <chrono>
|
||||
#include <hip_test_common.hh>
|
||||
#include <utils.hh>
|
||||
|
||||
namespace hipStreamDestroyTests {
|
||||
|
||||
@@ -80,7 +81,7 @@ TEST_CASE("Unit_hipStreamDestroy_WithPendingWork") {
|
||||
HIP_CHECK(hipMalloc(&deviceData, sizeof(int) * numDataPoints));
|
||||
HIP_CHECK(hipMemset(deviceData, 0, sizeof(int) * numDataPoints));
|
||||
|
||||
HipTest::runKernelForDuration(std::chrono::milliseconds(500), stream);
|
||||
LaunchDelayKernel(std::chrono::milliseconds(500), stream);
|
||||
setToOne<<<1, numDataPoints, 0, stream>>>(deviceData, numDataPoints);
|
||||
HIP_CHECK_ERROR(hipStreamQuery(stream), hipErrorNotReady);
|
||||
HIP_CHECK_ERROR(hipStreamQuery(nullptr), hipErrorNotReady);
|
||||
|
||||
@@ -19,7 +19,7 @@ THE SOFTWARE.
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include "streamCommon.hh"
|
||||
|
||||
#include <utils.hh>
|
||||
/**
|
||||
* @brief Check that querying a stream with no work returns hipSuccess
|
||||
*
|
||||
@@ -101,7 +101,7 @@ TEST_CASE("Unit_hipStreamQuery_SubmitWorkOnStreamAndQueryNullStream") {
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
HIP_CHECK(hipStreamQuery(hip::nullStream));
|
||||
HipTest::runKernelForDuration(std::chrono::milliseconds(500), stream);
|
||||
LaunchDelayKernel(std::chrono::milliseconds(500), stream);
|
||||
HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady);
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
@@ -116,7 +116,7 @@ TEST_CASE("Unit_hipStreamQuery_SubmitWorkOnStreamAndQueryNullStream") {
|
||||
*/
|
||||
TEST_CASE("Unit_hipStreamQuery_NullStreamQuery") {
|
||||
HIP_CHECK(hipStreamQuery(hip::nullStream));
|
||||
HipTest::runKernelForDuration(std::chrono::milliseconds(500), hip::nullStream);
|
||||
LaunchDelayKernel(std::chrono::milliseconds(500), hip::nullStream);
|
||||
HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady);
|
||||
|
||||
HIP_CHECK(hipStreamSynchronize(hip::nullStream));
|
||||
@@ -130,8 +130,7 @@ TEST_CASE("Unit_hipStreamQuery_WithPendingWork") {
|
||||
hipStream_t waitingStream{nullptr};
|
||||
HIP_CHECK(hipStreamCreate(&waitingStream));
|
||||
|
||||
HipTest::runKernelForDuration(std::chrono::milliseconds(500), waitingStream);
|
||||
|
||||
LaunchDelayKernel(std::chrono::milliseconds(500), waitingStream);
|
||||
HIP_CHECK_ERROR(hipStreamQuery(waitingStream), hipErrorNotReady);
|
||||
HIP_CHECK(hipStreamSynchronize(waitingStream));
|
||||
HIP_CHECK(hipStreamQuery(waitingStream));
|
||||
|
||||
@@ -19,7 +19,7 @@ THE SOFTWARE.
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include "streamCommon.hh"
|
||||
|
||||
#include <utils.hh>
|
||||
namespace hipStreamSynchronizeTest {
|
||||
|
||||
/**
|
||||
@@ -62,7 +62,7 @@ TEST_CASE("Unit_hipStreamSynchronize_FinishWork") {
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
}
|
||||
|
||||
HipTest::runKernelForDuration(std::chrono::milliseconds(500), stream);
|
||||
LaunchDelayKernel(std::chrono::milliseconds(500), stream);
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
HIP_CHECK(hipStreamQuery(stream));
|
||||
|
||||
@@ -86,15 +86,15 @@ TEST_CASE("Unit_hipStreamSynchronize_NullStreamSynchronization") {
|
||||
}
|
||||
|
||||
for (int i = 0; i < totalStreams; ++i) {
|
||||
HipTest::runKernelForDuration(std::chrono::milliseconds(1000), streams[i]);
|
||||
LaunchDelayKernel(std::chrono::milliseconds(1000), streams[i]);
|
||||
}
|
||||
|
||||
HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady);
|
||||
|
||||
for (int i = 0; i < totalStreams; ++i) {
|
||||
HIP_CHECK_ERROR(hipStreamQuery(streams[i]), hipErrorNotReady);
|
||||
}
|
||||
|
||||
HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady);
|
||||
|
||||
HIP_CHECK(hipStreamSynchronize(hip::nullStream));
|
||||
HIP_CHECK(hipStreamQuery(hip::nullStream));
|
||||
|
||||
@@ -123,8 +123,8 @@ TEST_CASE("Unit_hipStreamSynchronize_SynchronizeStreamAndQueryNullStream") {
|
||||
HIP_CHECK(hipStreamCreate(&stream1));
|
||||
HIP_CHECK(hipStreamCreate(&stream2));
|
||||
|
||||
HipTest::runKernelForDuration(std::chrono::milliseconds(500), stream1);
|
||||
HipTest::runKernelForDuration(std::chrono::milliseconds(2000), stream2);
|
||||
LaunchDelayKernel(std::chrono::milliseconds(500), stream1);
|
||||
LaunchDelayKernel(std::chrono::milliseconds(2000), stream2);
|
||||
|
||||
SECTION("Do not use NullStream") {}
|
||||
SECTION("Submit Kernel to NullStream") {
|
||||
@@ -157,10 +157,10 @@ TEST_CASE("Unit_hipStreamSynchronize_SynchronizeStreamAndQueryNullStream") {
|
||||
*
|
||||
*/
|
||||
TEST_CASE("Unit_hipStreamSynchronize_NullStreamAndStreamPerThread") {
|
||||
HipTest::runKernelForDuration(std::chrono::milliseconds(500), hip::streamPerThread);
|
||||
LaunchDelayKernel(std::chrono::milliseconds(500), hip::streamPerThread);
|
||||
HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipErrorNotReady);
|
||||
HIP_CHECK_ERROR(hipStreamQuery(hip::streamPerThread), hipErrorNotReady);
|
||||
HipTest::runKernelForDuration(std::chrono::milliseconds(500), hip::nullStream);
|
||||
LaunchDelayKernel(std::chrono::milliseconds(500), hip::nullStream);
|
||||
HIP_CHECK(hipStreamSynchronize(hip::nullStream))
|
||||
HIP_CHECK_ERROR(hipStreamQuery(hip::streamPerThread), hipSuccess);
|
||||
HIP_CHECK_ERROR(hipStreamQuery(hip::nullStream), hipSuccess);
|
||||
|
||||
@@ -25,7 +25,7 @@ Unit_hipStreamWaitEvent_DifferentStreams - Test waiting for an event on a differ
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
#include <utils.hh>
|
||||
TEST_CASE("Unit_hipStreamWaitEvent_Negative") {
|
||||
enum class StreamTestType { NullStream = 0, StreamPerThread, CreatedStream };
|
||||
|
||||
@@ -79,35 +79,6 @@ TEST_CASE("Unit_hipStreamWaitEvent_UninitializedStream_Negative") {
|
||||
}
|
||||
#endif
|
||||
|
||||
// Since we can not use atomic*_system on every gpu, we will use wait based on clock rate.
|
||||
// This wont be accurate since clock rate of a GPU varies depending on many variables including
|
||||
// thermals, load, utilization
|
||||
__global__ void waitKernel(int clockRate, int seconds) {
|
||||
auto start = clock();
|
||||
auto ms = seconds * 1000;
|
||||
long long waitTill = clockRate * (long long)ms;
|
||||
while (1) {
|
||||
auto end = clock();
|
||||
if ((end - start) > waitTill) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void waitKernel_gfx11(int clockRate, int seconds) {
|
||||
#if HT_AMD
|
||||
auto start = wall_clock64();
|
||||
auto ms = seconds * 1000;
|
||||
long long waitTill = clockRate * (long long)ms;
|
||||
while (1) {
|
||||
auto end = wall_clock64();
|
||||
if ((end - start) > waitTill) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipStreamWaitEvent_Default") {
|
||||
hipStream_t stream{nullptr};
|
||||
hipEvent_t waitEvent{nullptr};
|
||||
@@ -118,15 +89,7 @@ TEST_CASE("Unit_hipStreamWaitEvent_Default") {
|
||||
REQUIRE(stream != nullptr);
|
||||
REQUIRE(waitEvent != nullptr);
|
||||
|
||||
int deviceId {};
|
||||
HIP_CHECK(hipGetDevice(&deviceId));
|
||||
|
||||
hipDeviceProp_t prop{};
|
||||
HIP_CHECK(hipGetDeviceProperties(&prop, deviceId));
|
||||
auto clockRate = prop.clockRate;
|
||||
|
||||
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
|
||||
waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 2); // Wait for 2 seconds
|
||||
LaunchDelayKernel(std::chrono::milliseconds(2000), stream);
|
||||
|
||||
HIP_CHECK(hipEventRecord(waitEvent, stream));
|
||||
|
||||
@@ -154,15 +117,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") {
|
||||
REQUIRE(streamBlockedOnStreamA != nullptr);
|
||||
REQUIRE(waitEvent != nullptr);
|
||||
|
||||
int deviceId {};
|
||||
HIP_CHECK(hipGetDevice(&deviceId));
|
||||
LaunchDelayKernel(std::chrono::milliseconds(3000), blockedStreamA);
|
||||
|
||||
hipDeviceProp_t prop{};
|
||||
HIP_CHECK(hipGetDeviceProperties(&prop, deviceId));
|
||||
auto clockRate = prop.clockRate;
|
||||
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
|
||||
waitKernel_used<<<1, 1, 0, blockedStreamA>>>(clockRate,
|
||||
3); // wait for 3 seconds
|
||||
HIP_CHECK(hipEventRecord(waitEvent, blockedStreamA));
|
||||
|
||||
// Make sure stream is waiting for data to be set
|
||||
@@ -170,7 +126,7 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") {
|
||||
|
||||
HIP_CHECK(hipStreamWaitEvent(streamBlockedOnStreamA, waitEvent, 0));
|
||||
|
||||
waitKernel_used<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2); // Wait for 2 seconds
|
||||
LaunchDelayKernel(std::chrono::milliseconds(2000), streamBlockedOnStreamA);
|
||||
|
||||
HIP_CHECK(hipStreamSynchronize(unblockingStream));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user