Files
rocm-systems/catch/unit/graph/hipStreamBeginCapture.cc
T
Mirza Halilcevic dc78fafa49 EXSWHTEC-384 - Coverage Tool and Test Plan update #455
Change-Id: Idf52308186a73ae1c4d815eb59f20ea7da99c964
2024-02-27 22:26:25 +05:30

1481 строка
60 KiB
C++

/*
Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip_test_kernels.hh>
#include "stream_capture_common.hh" // NOLINT
#pragma clang diagnostic ignored "-Wunused-variable"
/**
* @addtogroup hipStreamBeginCapture hipStreamBeginCapture
* @{
* @ingroup GraphTest
* `hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode)` -
* begins graph capture on a stream
*/
static int gCbackIter = 0;
static __global__ void dummyKernel() { return; }
static __global__ void incrementKernel(int* data) {
atomicAdd(data, 1);
return;
}
static __global__ void myadd(int* A_d, int* B_d) {
int myId = threadIdx.x + blockDim.x * blockIdx.x;
A_d[myId] = A_d[myId] + B_d[myId];
}
static __global__ void mymul(int* devMem, int value) {
int myId = threadIdx.x + blockDim.x * blockIdx.x;
devMem[myId] = devMem[myId] * value;
}
static void hostNodeCallback(void* data) {
REQUIRE(data == nullptr);
gCbackIter++;
}
template <typename T, typename F>
void captureStreamAndLaunchGraph(F graphFunc, hipStreamCaptureMode mode, hipStream_t stream) {
constexpr size_t N = 1000000;
size_t Nbytes = N * sizeof(T);
hipGraph_t graph{nullptr};
hipGraphExec_t graphExec{nullptr};
// Host and Device allocation
LinearAllocGuard<T> A_h(LinearAllocs::malloc, Nbytes);
LinearAllocGuard<T> B_h(LinearAllocs::malloc, Nbytes);
LinearAllocGuard<T> A_d(LinearAllocs::hipMalloc, Nbytes);
LinearAllocGuard<T> B_d(LinearAllocs::hipMalloc, Nbytes);
// Capture stream sequence
HIP_CHECK(hipStreamBeginCapture(stream, mode));
graphFunc(A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), B_d.ptr(), N, stream);
captureSequenceCompute(A_d.ptr(), B_h.ptr(), B_d.ptr(), N, stream);
HIP_CHECK(hipStreamEndCapture(stream, &graph));
// Validate end capture is successful
REQUIRE(graph != nullptr);
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
REQUIRE(graphExec != nullptr);
// Replay the recorded sequence multiple times
for (size_t i = 0; i < kLaunchIters; i++) {
std::fill_n(A_h.host_ptr(), N, static_cast<float>(i));
HIP_CHECK(hipGraphLaunch(graphExec, stream));
HIP_CHECK(hipStreamSynchronize(stream));
ArrayFindIfNot(B_h.host_ptr(), static_cast<float>(i) * static_cast<float>(i), N);
}
HIP_CHECK(hipGraphExecDestroy(graphExec))
HIP_CHECK(hipGraphDestroy(graph));
}
/**
* Test Description
* ------------------------
* - Basic Functional Test for capturing created/hipStreamPerThread stream
* and replaying sequence. Test exercises the API on all available modes:
* -# Linear sequence capture - each graph node has only one dependency
* -# Branched sequence capture - some graph nodes have more than one
* dependency
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_Functional") {
const auto stream_type = GENERATE(Streams::perThread, Streams::created);
StreamGuard stream_guard(stream_type);
hipStream_t stream = stream_guard.stream();
const hipStreamCaptureMode captureMode = GENERATE(
hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed);
EventsGuard events_guard(3);
StreamsGuard streams_guard(2);
SECTION("Linear graph capture") {
captureStreamAndLaunchGraph<float>(
[](float* A_h, float* A_d, float* B_h, float* B_d, size_t N, hipStream_t stream) {
return captureSequenceLinear(A_h, A_d, B_h, B_d, N, stream);
},
captureMode, stream);
}
SECTION("Branched graph capture") {
captureStreamAndLaunchGraph<float>(
[&streams_guard, &events_guard](float* A_h, float* A_d, float* B_h, float* B_d, size_t N,
hipStream_t stream) {
captureSequenceBranched(A_h, A_d, B_h, B_d, N, stream, streams_guard.stream_list(),
events_guard.event_list());
},
captureMode, stream);
}
}
/**
* Test Description
* ------------------------
* - Test to verify API behavior with invalid arguments:
* -# Begin capture on legacy/null stream
* -# Begin capture on the already captured stream
* -# Begin capture with invalid mode
* -# Begin capture on uninitialized stream
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Negative_Parameters") {
const auto stream_type = GENERATE(Streams::created);
StreamGuard stream_guard(stream_type);
hipStream_t stream = stream_guard.stream();
SECTION("Stream capture on legacy/null stream returns error code.") {
HIP_CHECK_ERROR(hipStreamBeginCapture(nullptr, hipStreamCaptureModeGlobal),
hipErrorStreamCaptureUnsupported);
}
SECTION("Capturing hipStream status with same stream again") {
HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal));
HIP_CHECK_ERROR(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal),
hipErrorIllegalState);
}
SECTION("Creating hipStream with invalid mode") {
HIP_CHECK_ERROR(hipStreamBeginCapture(stream, hipStreamCaptureMode(-1)), hipErrorInvalidValue);
}
#if HT_NVIDIA // EXSWHTEC-216
SECTION("Stream capture on uninitialized stream returns error code.") {
constexpr auto InvalidStream = [] {
StreamGuard sg(Streams::created);
return sg.stream();
};
HIP_CHECK_ERROR(hipStreamBeginCapture(InvalidStream(), hipStreamCaptureModeGlobal),
hipErrorContextIsDestroyed);
}
#endif
}
/**
* Test Description
* ------------------------
* - Basic Test to verify basic API functionality with
* created/hipStreamPerThread stream for available modes
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_Basic") {
hipGraph_t graph{nullptr};
const auto stream_type = GENERATE(Streams::perThread, Streams::created);
StreamGuard stream_guard(stream_type);
hipStream_t s = stream_guard.stream();
const hipStreamCaptureMode captureMode = GENERATE(
hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed);
HIP_CHECK(hipStreamBeginCapture(s, captureMode));
HIP_CHECK(hipStreamEndCapture(s, &graph));
HIP_CHECK(hipGraphDestroy(graph));
}
/* Local function for inter stream event synchronization
*/
static void interStrmEventSyncCapture(const hipStream_t& stream1, const hipStream_t& stream2) {
hipGraph_t graph1{nullptr}, graph2{nullptr};
hipGraphExec_t graphExec1{nullptr}, graphExec2{nullptr};
EventsGuard events_guard(1);
hipEvent_t event = events_guard[0];
HIP_CHECK(hipEventCreate(&event));
HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal));
HIP_CHECK(hipEventRecord(event, stream1));
HIP_CHECK(hipStreamWaitEvent(stream2, event, 0));
dummyKernel<<<1, 1, 0, stream1>>>();
HIP_CHECK(hipStreamEndCapture(stream1, &graph1));
HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal));
dummyKernel<<<1, 1, 0, stream2>>>();
dummyKernel<<<1, 1, 0, stream2>>>();
HIP_CHECK(hipStreamEndCapture(stream2, &graph2));
size_t numNodes1 = 0, numNodes2 = 0;
HIP_CHECK(hipGraphGetNodes(graph1, nullptr, &numNodes1));
HIP_CHECK(hipGraphGetNodes(graph2, nullptr, &numNodes2));
REQUIRE(numNodes1 == 1);
REQUIRE(numNodes2 == 2);
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
REQUIRE(graphExec1 != nullptr);
HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0));
REQUIRE(graphExec2 != nullptr);
// Replay the recorded sequence multiple times
for (size_t i = 0; i < kLaunchIters; i++) {
// Execute the Graphs
HIP_CHECK(hipGraphLaunch(graphExec1, stream1));
HIP_CHECK(hipGraphLaunch(graphExec2, stream2));
HIP_CHECK(hipStreamSynchronize(stream1));
HIP_CHECK(hipStreamSynchronize(stream2));
}
// Free
HIP_CHECK(hipGraphExecDestroy(graphExec2));
HIP_CHECK(hipGraphExecDestroy(graphExec1));
HIP_CHECK(hipGraphDestroy(graph2));
HIP_CHECK(hipGraphDestroy(graph1));
}
/* Local function for colligated stream capture
*/
static void colligatedStrmCapture(const hipStream_t& stream1, const hipStream_t& stream2) {
hipGraph_t graph1{nullptr}, graph2{nullptr};
hipGraphExec_t graphExec1{nullptr}, graphExec2{nullptr};
EventsGuard events_guard(1);
hipEvent_t event = events_guard[0];
HIP_CHECK(hipEventCreate(&event));
HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal));
HIP_CHECK(hipEventRecord(event, stream1));
HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal));
HIP_CHECK(hipStreamWaitEvent(stream1, event, 0));
dummyKernel<<<1, 1, 0, stream1>>>();
HIP_CHECK(hipStreamEndCapture(stream1, &graph1));
dummyKernel<<<1, 1, 0, stream2>>>();
HIP_CHECK(hipStreamEndCapture(stream2, &graph2));
// Validate end capture is successful
REQUIRE(graph2 != nullptr);
REQUIRE(graph1 != nullptr);
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
REQUIRE(graphExec1 != nullptr);
HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0));
REQUIRE(graphExec2 != nullptr);
// Replay the recorded sequence multiple times
for (size_t i = 0; i < kLaunchIters; i++) {
// Execute the Graphs
HIP_CHECK(hipGraphLaunch(graphExec1, stream1));
HIP_CHECK(hipGraphLaunch(graphExec2, stream2));
HIP_CHECK(hipStreamSynchronize(stream1));
HIP_CHECK(hipStreamSynchronize(stream2));
}
// Free
HIP_CHECK(hipGraphExecDestroy(graphExec2));
HIP_CHECK(hipGraphExecDestroy(graphExec1));
HIP_CHECK(hipGraphDestroy(graph2));
HIP_CHECK(hipGraphDestroy(graph1));
}
/* Local function for colligated stream capture functionality
*/
static void colligatedStrmCaptureFunc(const hipStream_t& stream1, const hipStream_t& stream2) {
constexpr size_t N = 1000000;
size_t Nbytes = N * sizeof(int);
hipGraph_t graph1{nullptr}, graph2{nullptr};
hipGraphExec_t graphExec1{nullptr}, graphExec2{nullptr};
// Host and device allocation
LinearAllocGuard<int> A_h(LinearAllocs::malloc, Nbytes);
LinearAllocGuard<int> B_h(LinearAllocs::malloc, Nbytes);
LinearAllocGuard<int> A_d(LinearAllocs::hipMalloc, Nbytes);
LinearAllocGuard<int> B_d(LinearAllocs::hipMalloc, Nbytes);
LinearAllocGuard<int> C_h(LinearAllocs::malloc, Nbytes);
LinearAllocGuard<int> C_d(LinearAllocs::hipMalloc, Nbytes);
LinearAllocGuard<int> D_h(LinearAllocs::malloc, Nbytes);
LinearAllocGuard<int> D_d(LinearAllocs::hipMalloc, Nbytes);
// Capture 2 streams
HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal));
HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal));
captureSequenceLinear(A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), B_d.ptr(), N, stream1);
captureSequenceLinear(C_h.host_ptr(), C_d.ptr(), D_h.host_ptr(), D_d.ptr(), N, stream2);
captureSequenceCompute(A_d.ptr(), B_h.host_ptr(), B_d.ptr(), N, stream1);
captureSequenceCompute(C_d.ptr(), D_h.host_ptr(), D_d.ptr(), N, stream2);
HIP_CHECK(hipStreamEndCapture(stream1, &graph1));
HIP_CHECK(hipStreamEndCapture(stream2, &graph2));
// Validate end capture is successful
REQUIRE(graph2 != nullptr);
REQUIRE(graph1 != nullptr);
// Create Executable Graphs
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
REQUIRE(graphExec1 != nullptr);
HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0));
REQUIRE(graphExec2 != nullptr);
// Execute the Graphs
for (size_t iter = 0; iter < kLaunchIters; iter++) {
std::fill_n(A_h.host_ptr(), N, iter);
std::fill_n(C_h.host_ptr(), N, iter);
HIP_CHECK(hipGraphLaunch(graphExec1, stream1));
HIP_CHECK(hipGraphLaunch(graphExec2, stream2));
HIP_CHECK(hipStreamSynchronize(stream1));
HIP_CHECK(hipStreamSynchronize(stream2));
ArrayFindIfNot(B_h.host_ptr(), static_cast<int>(iter * iter), N);
ArrayFindIfNot(D_h.host_ptr(), static_cast<int>(iter * iter), N);
}
// Free
HIP_CHECK(hipGraphExecDestroy(graphExec2));
HIP_CHECK(hipGraphExecDestroy(graphExec1));
HIP_CHECK(hipGraphDestroy(graph2));
HIP_CHECK(hipGraphDestroy(graph1));
}
/* Stream Capture thread function
*/
static void threadStrmCaptureFunc(hipStream_t stream, int* A_h, int* A_d, int* B_h, int* B_d,
hipGraph_t* graph, size_t N, hipStreamCaptureMode mode) {
// Capture stream
HIP_CHECK(hipStreamBeginCapture(stream, mode));
captureSequenceLinear(A_h, A_d, B_h, B_d, N, stream);
captureSequenceCompute(A_d, B_h, B_d, N, stream);
HIP_CHECK(hipStreamEndCapture(stream, graph));
}
/* Local Function for multithreaded tests
*/
static void multithreadedTest(hipStreamCaptureMode mode) {
constexpr size_t N = 1000000;
size_t Nbytes = N * sizeof(int);
hipGraph_t graph1{nullptr}, graph2{nullptr};
hipGraphExec_t graphExec1{nullptr}, graphExec2{nullptr};
StreamGuard stream_guard1(Streams::created);
hipStream_t stream1 = stream_guard1.stream();
StreamGuard stream_guard2(Streams::created);
hipStream_t stream2 = stream_guard2.stream();
// Host and device allocation
LinearAllocGuard<int> A_h(LinearAllocs::malloc, Nbytes);
LinearAllocGuard<int> B_h(LinearAllocs::malloc, Nbytes);
LinearAllocGuard<int> A_d(LinearAllocs::hipMalloc, Nbytes);
LinearAllocGuard<int> B_d(LinearAllocs::hipMalloc, Nbytes);
LinearAllocGuard<int> C_h(LinearAllocs::malloc, Nbytes);
LinearAllocGuard<int> D_h(LinearAllocs::malloc, Nbytes);
LinearAllocGuard<int> C_d(LinearAllocs::hipMalloc, Nbytes);
LinearAllocGuard<int> D_d(LinearAllocs::hipMalloc, Nbytes);
// Launch 2 threads to capture the 2 streams into graphs
std::thread t1(threadStrmCaptureFunc, stream1, A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(),
B_d.ptr(), &graph1, N, mode);
std::thread t2(threadStrmCaptureFunc, stream2, C_h.host_ptr(), C_d.ptr(), D_h.host_ptr(),
D_d.ptr(), &graph2, N, mode);
t1.join();
t2.join();
// Create Executable Graphs
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
REQUIRE(graphExec1 != nullptr);
HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0));
REQUIRE(graphExec2 != nullptr);
// Execute the Graphs
for (size_t iter = 0; iter < kLaunchIters; iter++) {
std::fill_n(A_h.host_ptr(), N, iter);
std::fill_n(C_h.host_ptr(), N, iter);
HIP_CHECK(hipGraphLaunch(graphExec1, stream1));
HIP_CHECK(hipGraphLaunch(graphExec2, stream2));
HIP_CHECK(hipStreamSynchronize(stream1));
HIP_CHECK(hipStreamSynchronize(stream2));
ArrayFindIfNot(B_h.host_ptr(), static_cast<int>(iter * iter), N);
ArrayFindIfNot(D_h.host_ptr(), static_cast<int>(iter * iter), N);
}
// Free
HIP_CHECK(hipGraphExecDestroy(graphExec2));
HIP_CHECK(hipGraphExecDestroy(graphExec1));
HIP_CHECK(hipGraphDestroy(graph2));
HIP_CHECK(hipGraphDestroy(graph1));
}
/**
* Test Description
* ------------------------
* - Test to verify inter stream event synchronization- Waiting on an event
recorded on a captured stream. Initiate capture on stream1, record an event on
stream1, wait for the event on stream2, end the stream1 capture and initiate
stream capture on stream2
* -# Streams are created with hipStreamDefault/hipStreamNonBlocking flag
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_InterStrmEventSync_Flags") {
const auto stream_flags1 = GENERATE(hipStreamDefault, hipStreamNonBlocking);
const auto stream_flags2 = GENERATE(hipStreamDefault, hipStreamNonBlocking);
StreamGuard stream_guard1(Streams::withFlags, stream_flags1);
hipStream_t stream1 = stream_guard1.stream();
StreamGuard stream_guard2(Streams::withFlags, stream_flags2);
hipStream_t stream2 = stream_guard2.stream();
interStrmEventSyncCapture(stream1, stream2);
}
/**
* Test Description
* ------------------------
* - Test to verify inter stream event synchronization- Waiting on an event
* recorded on a captured stream. Initiate capture on stream1, record an event
* on stream1, wait for the event on stream2, end the stream1 capture and
* initiate stream capture on stream2
* -# Stream1 is created with minimal priority, stream 2 is created with
* maximal priority
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_InterStrmEventSync_Priority") {
int minPriority = 0, maxPriority = 0;
HIP_CHECK(hipDeviceGetStreamPriorityRange(&minPriority, &maxPriority));
StreamGuard stream_guard1(Streams::withPriority, hipStreamDefault, minPriority);
hipStream_t stream1 = stream_guard1.stream();
StreamGuard stream_guard2(Streams::withPriority, hipStreamDefault, maxPriority);
hipStream_t stream2 = stream_guard2.stream();
interStrmEventSyncCapture(stream1, stream2);
}
/**
* Test Description
* ------------------------
* - Test to verify colligated streams capture. Capture operation sequences
* queued in 2 streams by overlapping the 2 captures. Initiate capture on
* stream1, record an event on stream1, initiate capture on stream 2, end both
* stream captures
* -# Streams are created with hipStreamDefault/hipStreamNonBlocking flag
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_ColligatedStrmCapture_Flags") {
const auto stream_flags1 = GENERATE(hipStreamDefault, hipStreamNonBlocking);
const auto stream_flags2 = GENERATE(hipStreamDefault, hipStreamNonBlocking);
StreamGuard stream_guard1(Streams::withFlags, stream_flags1);
hipStream_t stream1 = stream_guard1.stream();
StreamGuard stream_guard2(Streams::withFlags, stream_flags2);
hipStream_t stream2 = stream_guard2.stream();
colligatedStrmCapture(stream1, stream2);
}
/**
* Test Description
* ------------------------
* - Test to verify colligated streams capture. Capture operation sequences
* queued in 2 streams by overlapping the 2 captures. Initiate capture on
* stream1, record an event on stream1, initiate capture on stream 2, end both
* stream captures
* -# Stream1 is created with minimal priority, stream 2 is created with
* maximal priority
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_ColligatedStrmCapture_Prio") {
int minPriority = 0, maxPriority = 0;
HIP_CHECK(hipDeviceGetStreamPriorityRange(&minPriority, &maxPriority));
StreamGuard stream_guard1(Streams::withPriority, hipStreamDefault, minPriority);
hipStream_t stream1 = stream_guard1.stream();
StreamGuard stream_guard2(Streams::withPriority, hipStreamDefault, maxPriority);
hipStream_t stream2 = stream_guard2.stream();
colligatedStrmCapture(stream1, stream2);
}
/**
* Test Description
* ------------------------
* - Create 2 streams. Start capturing both stream1 and stream2 at the same
* time. On stream1 queue memcpy, kernel and memcpy operations and on stream2
* queue memcpy, kernel and memcpy operations. Execute both the captured graphs
* and validate the results
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_ColligatedStrmCaptureFunc") {
StreamGuard stream_guard1(Streams::created);
hipStream_t stream1 = stream_guard1.stream();
StreamGuard stream_guard2(Streams::created);
hipStream_t stream2 = stream_guard2.stream();
colligatedStrmCaptureFunc(stream1, stream2);
}
/**
* Test Description
* ------------------------
* - Capture 2 streams in parallel using threads. Execute the graphs in
* sequence in main thread and validate the results for all available capture
* modes
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_Multithreaded") {
const hipStreamCaptureMode captureMode = GENERATE(
hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed);
multithreadedTest(captureMode);
}
/**
* Test Description
* ------------------------
* - Test to verify inter stream event synchronization- Waiting on an event
* recorded on a captured stream.
* -# Initiate capture on stream1, record an event on stream1, wait for
* the event on stream2, end the stream1 capture and initiate stream capture on
* stream2. Repeat the same sequence between stream2 and stream3
* -# Initiate capture on stream1, record an event on stream1, wait for
* the event on stream2 and stream3, end the stream1 capture and initiate stream
* capture on stream2 and stream3
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_Multiplestrms") {
StreamsGuard streams(3);
hipGraph_t graphs[3];
size_t numNodes1 = 0, numNodes2 = 0, numNodes3 = 0;
SECTION("Capture Multiple stream with interdependent events") {
EventsGuard events(2);
HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal));
HIP_CHECK(hipEventRecord(events[0], streams[0]));
HIP_CHECK(hipStreamWaitEvent(streams[1], events[0], 0));
dummyKernel<<<1, 1, 0, streams[0]>>>();
HIP_CHECK(hipStreamEndCapture(streams[0], &graphs[0]));
HIP_CHECK(hipStreamBeginCapture(streams[1], hipStreamCaptureModeGlobal));
HIP_CHECK(hipEventRecord(events[1], streams[1]));
HIP_CHECK(hipStreamWaitEvent(streams[2], events[1], 0));
dummyKernel<<<1, 1, 0, streams[1]>>>();
HIP_CHECK(hipStreamEndCapture(streams[1], &graphs[1]));
HIP_CHECK(hipStreamBeginCapture(streams[2], hipStreamCaptureModeGlobal));
dummyKernel<<<1, 1, 0, streams[2]>>>();
HIP_CHECK(hipStreamEndCapture(streams[2], &graphs[2]));
HIP_CHECK(hipGraphGetNodes(graphs[0], nullptr, &numNodes1));
HIP_CHECK(hipGraphGetNodes(graphs[1], nullptr, &numNodes2));
HIP_CHECK(hipGraphGetNodes(graphs[2], nullptr, &numNodes3));
REQUIRE(numNodes1 == 1);
REQUIRE(numNodes2 == 1);
REQUIRE(numNodes3 == 1);
}
SECTION("Capture Multiple stream with single event") {
EventsGuard events(1);
hipEvent_t event = events[0];
HIP_CHECK(hipEventCreate(&event));
HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal));
HIP_CHECK(hipEventRecord(event, streams[0]));
HIP_CHECK(hipStreamWaitEvent(streams[1], event, 0));
HIP_CHECK(hipStreamWaitEvent(streams[2], event, 0));
dummyKernel<<<1, 1, 0, streams[0]>>>();
HIP_CHECK(hipStreamEndCapture(streams[0], &graphs[0]));
HIP_CHECK(hipStreamBeginCapture(streams[1], hipStreamCaptureModeGlobal));
dummyKernel<<<1, 1, 0, streams[1]>>>();
HIP_CHECK(hipStreamEndCapture(streams[1], &graphs[1]));
HIP_CHECK(hipStreamBeginCapture(streams[2], hipStreamCaptureModeGlobal));
dummyKernel<<<1, 1, 0, streams[2]>>>();
HIP_CHECK(hipStreamEndCapture(streams[2], &graphs[2]));
HIP_CHECK(hipGraphGetNodes(graphs[0], nullptr, &numNodes1));
HIP_CHECK(hipGraphGetNodes(graphs[1], nullptr, &numNodes2));
HIP_CHECK(hipGraphGetNodes(graphs[2], nullptr, &numNodes3));
REQUIRE(numNodes1 == 1);
REQUIRE(numNodes2 == 1);
REQUIRE(numNodes3 == 1);
}
for (int i = 0; i < 3; i++) {
HIP_CHECK(hipGraphDestroy(graphs[i]));
}
}
/**
* Test Description
* ------------------------
* - Test to verify queue operations (increment kernels) in 3 streams. Start
* capturing the streams after some operations have been queued. This scenario
* validates that only operations queued after hipStreamBeginCapture are
* captured in the graph
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_CapturingFromWithinStrms") {
constexpr int INCREMENT_KERNEL_FINALEXP_VAL = 7;
hipGraph_t graph{nullptr};
hipGraphExec_t graphExec{nullptr};
StreamsGuard streams(3);
EventsGuard events(3);
// Create a device memory of size int and initialize it to 0
LinearAllocGuard<int> hostMem_g(LinearAllocs::malloc, sizeof(int));
LinearAllocGuard<int> devMem_g(LinearAllocs::hipMalloc, sizeof(int));
int* hostMem = hostMem_g.host_ptr();
int* devMem = devMem_g.ptr();
HIP_CHECK(hipMemset(devMem, 0, sizeof(int)));
HIP_CHECK(hipDeviceSynchronize());
// Start Capturing
incrementKernel<<<1, 1, 0, streams[0]>>>(devMem);
HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal));
HIP_CHECK(hipEventRecord(events[0], streams[0]));
incrementKernel<<<1, 1, 0, streams[1]>>>(devMem);
incrementKernel<<<1, 1, 0, streams[1]>>>(devMem);
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem);
HIP_CHECK(hipStreamWaitEvent(streams[1], events[0], 0));
HIP_CHECK(hipStreamWaitEvent(streams[2], events[0], 0));
incrementKernel<<<1, 1, 0, streams[0]>>>(devMem);
incrementKernel<<<1, 1, 0, streams[1]>>>(devMem);
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem);
incrementKernel<<<1, 1, 0, streams[0]>>>(devMem);
incrementKernel<<<1, 1, 0, streams[1]>>>(devMem);
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem);
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem);
HIP_CHECK(hipEventRecord(events[1], streams[1]));
HIP_CHECK(hipEventRecord(events[2], streams[2]));
HIP_CHECK(hipStreamWaitEvent(streams[0], events[1], 0));
HIP_CHECK(hipStreamWaitEvent(streams[0], events[2], 0));
HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), hipMemcpyDefault, streams[0]));
HIP_CHECK(hipStreamEndCapture(streams[0], &graph)); // End Capture
// Reset device memory
HIP_CHECK(hipMemset(devMem, 0, sizeof(int)));
HIP_CHECK(hipDeviceSynchronize());
// Create Executable Graphs
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
REQUIRE(graphExec != nullptr);
HIP_CHECK(hipGraphLaunch(graphExec, streams[0]));
HIP_CHECK(hipStreamSynchronize(streams[0]));
REQUIRE((*hostMem) == INCREMENT_KERNEL_FINALEXP_VAL);
HIP_CHECK(hipGraphExecDestroy(graphExec))
HIP_CHECK(hipGraphDestroy(graph));
}
/**
* Test Description
* ------------------------
* - Detecting invalid capture. Create 2 streams s1 and s2. Start capturing
* s1. Create event dependency between s1 and s2 using event record and event
* wait. Try capturing s2. hipStreamBeginCapture must return error
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Negative_DetectingInvalidCapture") {
StreamsGuard streams(2);
EventsGuard events(1);
hipEvent_t event = events[0];
HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal));
HIP_CHECK(hipEventRecord(event, streams[0]));
HIP_CHECK(hipStreamWaitEvent(streams[1], event, 0));
dummyKernel<<<1, 1, 0, streams[0]>>>();
// Since stream[1] is already in capture mode due to event wait
// hipStreamBeginCapture on stream[1] is expected to return error.
HIP_CHECK_ERROR(hipStreamBeginCapture(streams[1], hipStreamCaptureModeGlobal),
hipErrorIllegalState);
}
/**
* Test Description
* ------------------------
* - Test to verify wtream reuse. Capture multiple graphs from the same
* stream. Validate graphs are captured correctly
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_CapturingMultGraphsFrom1Strm") {
hipGraph_t graphs[3];
StreamGuard stream_guard(Streams::created);
hipStream_t stream1 = stream_guard.stream();
// Create a device memory of size int and initialize it to 0
LinearAllocGuard<int> hostMem_g(LinearAllocs::malloc, sizeof(int));
LinearAllocGuard<int> devMem_g(LinearAllocs::hipMalloc, sizeof(int));
int* hostMem = hostMem_g.host_ptr();
int* devMem = devMem_g.ptr();
HIP_CHECK(hipMemset(devMem, 0, sizeof(int)));
HIP_CHECK(hipDeviceSynchronize());
for (int i = 0; i < 3; i++) {
HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal));
for (int j = 0; j <= i; j++) incrementKernel<<<1, 1, 0, stream1>>>(devMem);
HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), hipMemcpyDefault, stream1));
HIP_CHECK(hipStreamEndCapture(stream1, &graphs[i]));
}
// Instantiate and execute all graphs
for (int i = 0; i < 3; i++) {
hipGraphExec_t graphExec{nullptr};
HIP_CHECK(hipMemset(devMem, 0, sizeof(int)));
HIP_CHECK(hipGraphInstantiate(&graphExec, graphs[i], nullptr, nullptr, 0));
HIP_CHECK(hipGraphLaunch(graphExec, stream1));
HIP_CHECK(hipStreamSynchronize(stream1));
REQUIRE((*hostMem) == (i + 1));
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipGraphDestroy(graphs[i]));
}
}
/**
* Test Description
* ------------------------
* - Test to verify synchronization during stream capture returns an error:
* -# Synchronize stream during capture
* -# Synchronize device during capture
* -# Synchronize event during capture
* -# Query stream during capture
* -# Query for an event during capture
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Negative_CheckingSyncDuringCapture") {
StreamGuard stream_guard(Streams::created);
hipStream_t stream = stream_guard.stream();
EventsGuard events_guard(1);
hipEvent_t e = events_guard[0];
const hipStreamCaptureMode captureMode = GENERATE(
hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed);
HIP_CHECK(hipStreamBeginCapture(stream, captureMode));
SECTION("Synchronize stream during capture") {
HIP_CHECK_ERROR(hipStreamSynchronize(stream), hipErrorStreamCaptureUnsupported);
}
SECTION("Query stream during capture") {
HIP_CHECK_ERROR(hipStreamQuery(stream), hipErrorStreamCaptureUnsupported);
}
#if HT_NVIDIA
SECTION("Synchronize device during capture") {
HIP_CHECK_ERROR(hipDeviceSynchronize(), hipErrorStreamCaptureUnsupported);
}
SECTION("Synchronize event during capture") {
HIP_CHECK(hipEventRecord(e, stream));
HIP_CHECK_ERROR(hipEventSynchronize(e), hipErrorCapturedEvent);
}
SECTION("Query for an event during capture") {
HIP_CHECK(hipEventRecord(e, stream));
HIP_CHECK_ERROR(hipEventQuery(e), hipErrorCapturedEvent);
}
#endif
}
#if HT_NVIDIA
/**
* Test Description
* ------------------------
* - Test to verify unsafe API calls during stream capture with initiated
* with hipStreamCaptureModeGlobal and hipStreamCaptureModeThreadLocal return an
* error:
* -# hipMalloc during capture
* -# hipMemcpy during capture
* -# hipMemset during capture
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Negative_UnsafeCallsDuringCapture") {
StreamGuard stream_guard(Streams::created);
hipStream_t stream = stream_guard.stream();
LinearAllocGuard<int> hostMem(LinearAllocs::malloc, sizeof(int));
LinearAllocGuard<int> devMem(LinearAllocs::hipMalloc, sizeof(int));
int* devMem2;
const hipStreamCaptureMode captureMode =
GENERATE(hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal);
HIP_CHECK(hipStreamBeginCapture(stream, captureMode));
SECTION("hipMalloc during capture") {
HIP_CHECK_ERROR(hipMalloc(&devMem2, sizeof(int)), hipErrorStreamCaptureUnsupported);
}
SECTION("hipMemcpy during capture") {
HIP_CHECK_ERROR(hipMemcpy(devMem.ptr(), hostMem.host_ptr(), sizeof(int), hipMemcpyHostToDevice),
hipErrorStreamCaptureImplicit);
}
SECTION("hipMemset during capture") {
HIP_CHECK_ERROR(hipMemset(devMem.ptr(), 0, sizeof(int)), hipErrorStreamCaptureImplicit);
}
}
#endif
/**
* Test Description
* ------------------------
* - Test to verify end stream capture when the stream capture is still in
* progress:
* -# Abruptly end stream capture when stream capture is in progress in
* forked stream. hipStreamEndCapture must return an error
* -# Abruptly end stream capture when operations in forked stream are
* still waiting to be captured. hipStreamEndCapture must return an error
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Negative_EndingCapwhenCapInProg") {
hipGraph_t graph{nullptr};
StreamsGuard streams_guard(2);
hipStream_t stream1 = streams_guard[0];
hipStream_t stream2 = streams_guard[1];
SECTION("Abruptly end strm capture when in progress in forked strm") {
EventsGuard events_guard(1);
hipEvent_t e = events_guard[0];
HIP_CHECK(hipEventCreate(&e));
HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal));
dummyKernel<<<1, 1, 0, stream1>>>();
HIP_CHECK(hipEventRecord(e, stream1));
HIP_CHECK(hipStreamWaitEvent(stream2, e, 0));
dummyKernel<<<1, 1, 0, stream2>>>();
HIP_CHECK_ERROR(hipStreamEndCapture(stream1, &graph), hipErrorStreamCaptureUnjoined);
}
SECTION("End strm capture when forked strm still has operations") {
EventsGuard events_guard(2);
hipEvent_t e1 = events_guard[0];
hipEvent_t e2 = events_guard[1];
HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal));
dummyKernel<<<1, 1, 0, stream1>>>();
HIP_CHECK(hipEventRecord(e1, stream1));
HIP_CHECK(hipStreamWaitEvent(stream2, e1, 0));
dummyKernel<<<1, 1, 0, stream2>>>();
HIP_CHECK(hipEventRecord(e2, stream2));
HIP_CHECK(hipStreamWaitEvent(stream1, e2, 0));
dummyKernel<<<1, 1, 0, stream2>>>();
HIP_CHECK_ERROR(hipStreamEndCapture(stream1, &graph), hipErrorStreamCaptureUnjoined);
}
}
/**
* Test Description
* ------------------------
* - Testing independent stream capture using multiple GPUs. Capture a stream
* in each device context and execute the captured graph in the context GPU
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_MultiGPU") {
int devcount = 0;
HIP_CHECK(hipGetDeviceCount(&devcount));
// If only single GPU is detected then return
if (devcount < 2) {
SUCCEED("skipping the testcases as numDevices < 2");
return;
}
hipStream_t* stream = reinterpret_cast<hipStream_t*>(malloc(devcount * sizeof(hipStream_t)));
REQUIRE(stream != nullptr);
hipGraph_t* graph = reinterpret_cast<hipGraph_t*>(malloc(devcount * sizeof(hipGraph_t)));
REQUIRE(graph != nullptr);
int **devMem{nullptr}, **hostMem{nullptr};
hostMem = reinterpret_cast<int**>(malloc(sizeof(int*) * devcount));
REQUIRE(hostMem != nullptr);
devMem = reinterpret_cast<int**>(malloc(sizeof(int*) * devcount));
REQUIRE(devMem != nullptr);
hipGraphExec_t* graphExec =
reinterpret_cast<hipGraphExec_t*>(malloc(devcount * sizeof(hipGraphExec_t)));
// Capture stream in each device
for (int dev = 0; dev < devcount; dev++) {
HIP_CHECK(hipSetDevice(dev));
HIP_CHECK(hipStreamCreate(&stream[dev]));
hostMem[dev] = reinterpret_cast<int*>(malloc(sizeof(int)));
HIP_CHECK(hipMalloc(&devMem[dev], sizeof(int)));
HIP_CHECK(hipStreamBeginCapture(stream[dev], hipStreamCaptureModeGlobal));
HIP_CHECK(hipMemsetAsync(devMem[dev], 0, sizeof(int), stream[dev]));
for (int i = 0; i < (dev + 1); i++) {
incrementKernel<<<1, 1, 0, stream[dev]>>>(devMem[dev]);
}
HIP_CHECK(
hipMemcpyAsync(hostMem[dev], devMem[dev], sizeof(int), hipMemcpyDefault, stream[dev]));
HIP_CHECK(hipStreamEndCapture(stream[dev], &graph[dev]));
}
// Launch the captured graphs in the respective device
for (int dev = 0; dev < devcount; dev++) {
HIP_CHECK(hipSetDevice(dev));
HIP_CHECK(hipGraphInstantiate(&graphExec[dev], graph[dev], nullptr, nullptr, 0));
HIP_CHECK(hipGraphLaunch(graphExec[dev], stream[dev]));
}
// Validate output
for (int dev = 0; dev < devcount; dev++) {
HIP_CHECK(hipSetDevice(dev));
HIP_CHECK(hipStreamSynchronize(stream[dev]));
REQUIRE((*hostMem[dev]) == (dev + 1));
}
// Destroy all device resources
for (int dev = 0; dev < devcount; dev++) {
HIP_CHECK(hipSetDevice(dev));
HIP_CHECK(hipGraphExecDestroy(graphExec[dev]));
HIP_CHECK(hipStreamDestroy(stream[dev]));
}
free(graphExec);
free(hostMem);
free(devMem);
free(stream);
free(graph);
}
/**
* Test Description
* ------------------------
* - Test Nested Stream Capture Functionality: Create 3 streams. Capture s1,
* record event e1 on s1, wait for event e1 on s2 and queue operations in s1.
* Record event e2 on s2 and wait for it on s3. Queue operations on both s2 and
* s3. Record event e4 on s3 and wait for it in s1. Record event e3 on s2 and
* wait for it in s1. End stream capture on s1. Execute the graph and verify the
* result.
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_nestedStreamCapture") {
constexpr int INCREMENT_KERNEL_FINALEXP_VAL = 7;
hipGraph_t graph{nullptr};
StreamsGuard streams(3);
EventsGuard events(4);
// Create a device memory of size int and initialize it to 0
LinearAllocGuard<int> hostMem_g(LinearAllocs::malloc, sizeof(int));
LinearAllocGuard<int> devMem_g(LinearAllocs::hipMalloc, sizeof(int));
HIP_CHECK(hipMemset(devMem_g.ptr(), 0, sizeof(int)));
HIP_CHECK(hipDeviceSynchronize());
// Start Capturing stream1
HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal));
HIP_CHECK(hipEventRecord(events[0], streams[0]));
HIP_CHECK(hipStreamWaitEvent(streams[1], events[0], 0));
HIP_CHECK(hipEventRecord(events[1], streams[1]));
HIP_CHECK(hipStreamWaitEvent(streams[2], events[1], 0));
incrementKernel<<<1, 1, 0, streams[0]>>>(devMem_g.ptr());
incrementKernel<<<1, 1, 0, streams[1]>>>(devMem_g.ptr());
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem_g.ptr());
incrementKernel<<<1, 1, 0, streams[0]>>>(devMem_g.ptr());
incrementKernel<<<1, 1, 0, streams[1]>>>(devMem_g.ptr());
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem_g.ptr());
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem_g.ptr());
HIP_CHECK(hipEventRecord(events[2], streams[1]));
HIP_CHECK(hipEventRecord(events[3], streams[2]));
HIP_CHECK(hipStreamWaitEvent(streams[0], events[3], 0));
HIP_CHECK(hipStreamWaitEvent(streams[0], events[2], 0));
HIP_CHECK(hipMemcpyAsync(hostMem_g.host_ptr(), devMem_g.ptr(), sizeof(int), hipMemcpyDefault,
streams[0]));
HIP_CHECK(hipStreamEndCapture(streams[0], &graph)); // End Capture
// Reset device memory
HIP_CHECK(hipMemset(devMem_g.ptr(), 0, sizeof(int)));
HIP_CHECK(hipDeviceSynchronize());
// Create Executable Graphs
hipGraphExec_t graphExec{nullptr};
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphLaunch(graphExec, streams[0]));
HIP_CHECK(hipStreamSynchronize(streams[0]));
REQUIRE((*hostMem_g.host_ptr()) == INCREMENT_KERNEL_FINALEXP_VAL);
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipGraphDestroy(graph));
}
/**
* Test Description
* ------------------------
* - Test Nested Stream Capture Functionality: Create 3 streams. Capture s1,
* record event e1 on s1, wait for event e1 on s2 and queue operations in s1.
* Record event e2 on s2 and wait for it on s3. Queue operations on both s2 and
* s3. Record event e4 on s3 and wait for it in s1. Record event e3 on s2 and
* wait for it in s1. End stream capture on s1. Queue operations on both s2 and
* s3, and capture their graphs. Execute the graphs and verify the result.
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_streamReuse") {
constexpr int increment_kernel_vals[3] = {7, 3, 5};
hipGraph_t graphs[3];
StreamsGuard streams(3);
EventsGuard events(4);
LinearAllocGuard<int> hostMem_g1 = LinearAllocGuard<int>(LinearAllocs::malloc, sizeof(int));
LinearAllocGuard<int> hostMem_g2 = LinearAllocGuard<int>(LinearAllocs::malloc, sizeof(int));
LinearAllocGuard<int> hostMem_g3 = LinearAllocGuard<int>(LinearAllocs::malloc, sizeof(int));
LinearAllocGuard<int> devMem_g1 = LinearAllocGuard<int>(LinearAllocs::hipMalloc, sizeof(int));
LinearAllocGuard<int> devMem_g2 = LinearAllocGuard<int>(LinearAllocs::hipMalloc, sizeof(int));
LinearAllocGuard<int> devMem_g3 = LinearAllocGuard<int>(LinearAllocs::hipMalloc, sizeof(int));
std::vector<int*> hostMem = {hostMem_g1.host_ptr(), hostMem_g2.host_ptr(), hostMem_g3.host_ptr()};
std::vector<int*> devMem = {devMem_g1.ptr(), devMem_g2.ptr(), devMem_g3.ptr()};
// Create a device memory of size int and initialize it to 0
for (int i = 0; i < 3; i++) {
memset(hostMem[i], 0, sizeof(int));
HIP_CHECK(hipMemset(devMem[i], 0, sizeof(int)));
}
HIP_CHECK(hipDeviceSynchronize());
// Start Capturing stream1
HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal));
HIP_CHECK(hipEventRecord(events[0], streams[0]));
HIP_CHECK(hipStreamWaitEvent(streams[1], events[0], 0));
HIP_CHECK(hipEventRecord(events[1], streams[1]));
HIP_CHECK(hipStreamWaitEvent(streams[2], events[1], 0));
incrementKernel<<<1, 1, 0, streams[0]>>>(devMem[0]);
incrementKernel<<<1, 1, 0, streams[1]>>>(devMem[0]);
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[0]);
incrementKernel<<<1, 1, 0, streams[0]>>>(devMem[0]);
incrementKernel<<<1, 1, 0, streams[1]>>>(devMem[0]);
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[0]);
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[0]);
HIP_CHECK(hipEventRecord(events[2], streams[1]));
HIP_CHECK(hipEventRecord(events[3], streams[2]));
HIP_CHECK(hipStreamWaitEvent(streams[0], events[3], 0));
HIP_CHECK(hipStreamWaitEvent(streams[0], events[2], 0));
HIP_CHECK(hipMemcpyAsync(hostMem[0], devMem[0], sizeof(int), hipMemcpyDefault, streams[0]));
HIP_CHECK(hipStreamEndCapture(streams[0], &graphs[0])); // End Capture
// Start capturing graph2 from stream 2
HIP_CHECK(hipStreamBeginCapture(streams[1], hipStreamCaptureModeGlobal));
incrementKernel<<<1, 1, 0, streams[1]>>>(devMem[1]);
incrementKernel<<<1, 1, 0, streams[1]>>>(devMem[1]);
incrementKernel<<<1, 1, 0, streams[1]>>>(devMem[1]);
HIP_CHECK(hipMemcpyAsync(hostMem[1], devMem[1], sizeof(int), hipMemcpyDefault, streams[1]));
HIP_CHECK(hipStreamEndCapture(streams[1], &graphs[1])); // End Capture
// Start capturing graph3 from stream 3
HIP_CHECK(hipStreamBeginCapture(streams[2], hipStreamCaptureModeGlobal));
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[2]);
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[2]);
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[2]);
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[2]);
incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[2]);
HIP_CHECK(hipMemcpyAsync(hostMem[2], devMem[2], sizeof(int), hipMemcpyDefault, streams[2]));
HIP_CHECK(hipStreamEndCapture(streams[2], &graphs[2])); // End Capture
// Reset device memory
HIP_CHECK(hipMemset(devMem[0], 0, sizeof(int)));
HIP_CHECK(hipMemset(devMem[1], 0, sizeof(int)));
HIP_CHECK(hipMemset(devMem[2], 0, sizeof(int)));
HIP_CHECK(hipDeviceSynchronize());
// Create Executable Graphs and verify graphs
for (int i = 0; i < 3; i++) {
hipGraphExec_t graphExec{nullptr};
HIP_CHECK(hipMemset(devMem[i], 0, sizeof(int)));
HIP_CHECK(hipGraphInstantiate(&graphExec, graphs[i], nullptr, nullptr, 0));
HIP_CHECK(hipGraphLaunch(graphExec, streams[i]));
HIP_CHECK(hipStreamSynchronize(streams[i]));
REQUIRE((*hostMem[i]) == increment_kernel_vals[i]);
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipGraphDestroy(graphs[i]));
}
}
/**
* Test Description
* ------------------------
* - Capture a complex graph containing multiple independent memcpy, kernel
* and host nodes. Launch the graph on random input data and validate the output
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_captureComplexGraph") {
constexpr int GRIDSIZE = 256;
constexpr int BLOCKSIZE = 256;
constexpr int CONST_KER1_VAL = 3;
constexpr int CONST_KER2_VAL = 2;
constexpr int CONST_KER3_VAL = 5;
hipGraph_t graph{nullptr};
StreamsGuard streams(5);
EventsGuard events(7);
// Allocate Device memory and Host memory
size_t N = GRIDSIZE * BLOCKSIZE;
LinearAllocGuard<int> Ah = LinearAllocGuard<int>(LinearAllocs::malloc, N * sizeof(int));
LinearAllocGuard<int> Bh = LinearAllocGuard<int>(LinearAllocs::malloc, N * sizeof(int));
LinearAllocGuard<int> Ch = LinearAllocGuard<int>(LinearAllocs::malloc, N * sizeof(int));
LinearAllocGuard<int> Ad = LinearAllocGuard<int>(LinearAllocs::hipMalloc, N * sizeof(int));
LinearAllocGuard<int> Bd = LinearAllocGuard<int>(LinearAllocs::hipMalloc, N * sizeof(int));
// Capture streams into graph
HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal));
HIP_CHECK(hipEventRecord(events[0], streams[0]));
HIP_CHECK(hipStreamWaitEvent(streams[3], events[0], 0));
HIP_CHECK(hipStreamWaitEvent(streams[4], events[0], 0));
HIP_CHECK(
hipMemcpyAsync(Ad.ptr(), Ah.host_ptr(), (N * sizeof(int)), hipMemcpyDefault, streams[0]));
HIP_CHECK(
hipMemcpyAsync(Bd.ptr(), Bh.host_ptr(), (N * sizeof(int)), hipMemcpyDefault, streams[4]));
hipHostFn_t fn = hostNodeCallback;
HIPCHECK(hipLaunchHostFunc(streams[3], fn, nullptr));
HIP_CHECK(hipEventRecord(events[1], streams[0]));
HIP_CHECK(hipStreamWaitEvent(streams[1], events[1], 0));
int* Ad_2nd_half = Ad.ptr() + N / 2;
int* Ad_1st_half = Ad.ptr();
mymul<<<GRIDSIZE / 2, BLOCKSIZE, 0, streams[0]>>>(Ad_2nd_half, CONST_KER2_VAL);
mymul<<<GRIDSIZE / 2, BLOCKSIZE, 0, streams[1]>>>(Ad_1st_half, CONST_KER1_VAL);
HIP_CHECK(hipEventRecord(events[2], streams[1]));
HIP_CHECK(hipStreamWaitEvent(streams[2], events[2], 0));
mymul<<<GRIDSIZE / 2, BLOCKSIZE, 0, streams[1]>>>(Ad_1st_half, CONST_KER3_VAL);
HIPCHECK(hipLaunchHostFunc(streams[2], fn, nullptr));
HIP_CHECK(hipEventRecord(events[6], streams[1]));
HIP_CHECK(hipStreamWaitEvent(streams[0], events[6], 0));
HIP_CHECK(hipEventRecord(events[5], streams[4]));
HIP_CHECK(hipStreamWaitEvent(streams[0], events[5], 0));
myadd<<<GRIDSIZE, BLOCKSIZE, 0, streams[0]>>>(Ad.ptr(), Bd.ptr());
HIP_CHECK(hipEventRecord(events[3], streams[2]));
HIP_CHECK(hipStreamWaitEvent(streams[0], events[3], 0));
HIP_CHECK(hipEventRecord(events[4], streams[3]));
HIP_CHECK(hipStreamWaitEvent(streams[0], events[4], 0));
HIP_CHECK(
hipMemcpyAsync(Ch.host_ptr(), Ad.ptr(), (N * sizeof(int)), hipMemcpyDefault, streams[0]));
HIP_CHECK(hipStreamEndCapture(streams[0], &graph)); // End Capture
// Execute and test the graph
hipGraphExec_t graphExec{nullptr};
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
// Verify graph
for (size_t iter = 0; iter < kLaunchIters; iter++) {
std::fill_n(Ah.host_ptr(), N, iter);
std::fill_n(Bh.host_ptr(), N, iter);
HIP_CHECK(hipGraphLaunch(graphExec, streams[0]));
HIP_CHECK(hipStreamSynchronize(streams[0]));
for (size_t i = 0; i < N; i++) {
if (i > (N / 2 - 1)) {
REQUIRE(Ch.host_ptr()[i] == (Bh.host_ptr()[i] + Ah.host_ptr()[i] * CONST_KER2_VAL));
} else {
REQUIRE(Ch.host_ptr()[i] ==
(Bh.host_ptr()[i] + Ah.host_ptr()[i] * CONST_KER1_VAL * CONST_KER3_VAL));
}
}
}
REQUIRE(gCbackIter == (2 * kLaunchIters));
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipGraphDestroy(graph));
}
/**
* Test Description
* ------------------------
* - Test to verify capturing empty streams (parent + forked streams) and
* validate the captured graph has no nodes
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipStreamBeginCapture_Positive_captureEmptyStreams") {
hipGraph_t graph{nullptr};
// Stream and event create
StreamsGuard streams(3);
EventsGuard events(3);
// Capture streams into graph
HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal));
HIP_CHECK(hipEventRecord(events[0], streams[0]));
HIP_CHECK(hipStreamWaitEvent(streams[1], events[0], 0));
HIP_CHECK(hipStreamWaitEvent(streams[2], events[0], 0));
HIP_CHECK(hipEventRecord(events[1], streams[1]));
HIP_CHECK(hipStreamWaitEvent(streams[0], events[1], 0));
HIP_CHECK(hipEventRecord(events[2], streams[2]));
HIP_CHECK(hipStreamWaitEvent(streams[0], events[2], 0));
HIP_CHECK(hipStreamEndCapture(streams[0], &graph)); // End Capture
size_t numNodes = 0;
HIP_CHECK(hipGraphGetNodes(graph, nullptr, &numNodes));
REQUIRE(numNodes == 0);
HIP_CHECK(hipGraphDestroy(graph));
}
/**
* Test Description
* ------------------------
* - Test to verify hipStreamSynchronize on a stream works when stream capture
* on another stream is ongoing.
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
TEST_CASE("Unit_hipStreamBeginCapture_StreamSync_OngoingCapture") {
hipStreamCaptureMode flag = hipStreamCaptureModeRelaxed;
constexpr int GRIDSIZE = 1;
constexpr int BLOCKSIZE = 512;
constexpr int VALUE1 = 7, VALUE2 = 11;
hipGraph_t graph{nullptr};
hipGraphExec_t graphExec{nullptr};
// Allocate device memory
LinearAllocGuard<int> Ah = LinearAllocGuard<int>(LinearAllocs::malloc, BLOCKSIZE * sizeof(int));
LinearAllocGuard<int> Ad =
LinearAllocGuard<int>(LinearAllocs::hipMalloc, BLOCKSIZE * sizeof(int));
LinearAllocGuard<int> Bh = LinearAllocGuard<int>(LinearAllocs::malloc, BLOCKSIZE * sizeof(int));
LinearAllocGuard<int> Bd =
LinearAllocGuard<int>(LinearAllocs::hipMalloc, BLOCKSIZE * sizeof(int));
// Fill input data
std::fill_n(Ah.host_ptr(), BLOCKSIZE, VALUE1);
std::fill_n(Bh.host_ptr(), BLOCKSIZE, VALUE2);
// Stream create
StreamsGuard stream0(1);
// Capture streams into graph
SECTION("Stream Creation Before Capture") {
StreamsGuard stream1(1);
HIP_CHECK(hipStreamBeginCapture(stream0[0], flag));
HIP_CHECK(hipMemcpyAsync(Ad.ptr(), Ah.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream1[0]));
HIP_CHECK(hipMemcpyAsync(Bd.ptr(), Bh.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream1[0]));
HIP_CHECK(hipStreamSynchronize(stream1[0]));
myadd<<<GRIDSIZE, BLOCKSIZE, 0, stream0[0]>>>(Ad.ptr(), Bd.ptr());
HIP_CHECK(hipStreamEndCapture(stream0[0], &graph)); // End Capture
}
SECTION("Synchronizing multiple streams during Capture") {
StreamsGuard stream1(1), stream2(1);
HIP_CHECK(hipStreamBeginCapture(stream0[0], flag));
HIP_CHECK(hipMemcpyAsync(Ad.ptr(), Ah.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream1[0]));
HIP_CHECK(hipMemcpyAsync(Bd.ptr(), Bh.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream2[0]));
HIP_CHECK(hipStreamSynchronize(stream1[0]));
HIP_CHECK(hipStreamSynchronize(stream2[0]));
myadd<<<GRIDSIZE, BLOCKSIZE, 0, stream0[0]>>>(Ad.ptr(), Bd.ptr());
HIP_CHECK(hipStreamEndCapture(stream0[0], &graph)); // End Capture
}
SECTION("Stream Creation After Capture") {
HIP_CHECK(hipStreamBeginCapture(stream0[0], flag));
StreamsGuard stream1(1);
HIP_CHECK(hipMemcpyAsync(Ad.ptr(), Ah.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream1[0]));
HIP_CHECK(hipMemcpyAsync(Bd.ptr(), Bh.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream1[0]));
HIP_CHECK(hipStreamSynchronize(stream1[0]));
myadd<<<GRIDSIZE, BLOCKSIZE, 0, stream0[0]>>>(Ad.ptr(), Bd.ptr());
HIP_CHECK(hipStreamEndCapture(stream0[0], &graph)); // End Capture
}
SECTION("Stream Synchronize Before Capture") {
StreamsGuard stream1(1);
HIP_CHECK(hipMemcpyAsync(Ad.ptr(), Ah.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream1[0]));
HIP_CHECK(hipMemcpyAsync(Bd.ptr(), Bh.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream1[0]));
HIP_CHECK(hipStreamSynchronize(stream1[0]));
HIP_CHECK(hipStreamBeginCapture(stream0[0], flag));
myadd<<<GRIDSIZE, BLOCKSIZE, 0, stream0[0]>>>(Ad.ptr(), Bd.ptr());
HIP_CHECK(hipStreamEndCapture(stream0[0], &graph)); // End Capture
}
SECTION("Stream Synchronize After Capture") {
HIP_CHECK(hipStreamBeginCapture(stream0[0], flag));
myadd<<<GRIDSIZE, BLOCKSIZE, 0, stream0[0]>>>(Ad.ptr(), Bd.ptr());
HIP_CHECK(hipStreamEndCapture(stream0[0], &graph)); // End Capture
StreamsGuard stream1(1);
HIP_CHECK(hipMemcpyAsync(Ad.ptr(), Ah.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream1[0]));
HIP_CHECK(hipMemcpyAsync(Bd.ptr(), Bh.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream1[0]));
HIP_CHECK(hipStreamSynchronize(stream1[0]));
}
// Execute and test the graph
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphLaunch(graphExec, stream0[0]));
HIP_CHECK(hipStreamSynchronize(stream0[0]));
// Check output
HIP_CHECK(hipMemcpy(Ah.host_ptr(), Ad.ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDeviceToHost));
for (int idx = 0; idx < BLOCKSIZE; idx++) {
REQUIRE(Ah.host_ptr()[idx] == (VALUE1 + VALUE2));
}
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipGraphDestroy(graph));
}
/**
* Test Description
* ------------------------
* - Test to verify hipStreamSynchronize on a stream behavior when stream capture
* on another stream is ongoing in another thread.
* Test source
* ------------------------
* - catch\unit\graph\hipStreamBeginCapture.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
// Local function executed as thread
static void strmSyncThread(int* Ah, int* Ad, int* Bh, int* Bd, int BLOCKSIZE, hipError_t* error) {
StreamsGuard stream(1);
HIP_CHECK(hipMemcpyAsync(Ad, Ah, BLOCKSIZE * sizeof(int), hipMemcpyDefault, stream[0]));
HIP_CHECK(hipMemcpyAsync(Bd, Bh, BLOCKSIZE * sizeof(int), hipMemcpyDefault, stream[0]));
*error = hipStreamSynchronize(stream[0]);
}
// Local function executed as thread
static void captureStrmThread(hipGraph_t* graph, int* Ah, int* Ad, int* Bh, int* Bd, int BLOCKSIZE,
int GRIDSIZE, hipStreamCaptureMode flag, hipError_t* error) {
StreamsGuard stream(1);
// Capture streams into graph
HIP_CHECK(hipStreamBeginCapture(stream[0], flag));
std::thread t1(strmSyncThread, Ah, Ad, Bh, Bd, BLOCKSIZE, error);
t1.join();
myadd<<<GRIDSIZE, BLOCKSIZE, 0, stream[0]>>>(Ad, Bd);
HIP_CHECK(hipStreamEndCapture(stream[0], graph)); // End Capture
}
TEST_CASE("Unit_hipStreamBeginCapture_StreamSync_OngoingCapture_MThread") {
constexpr int GRIDSIZE = 1;
constexpr int BLOCKSIZE = 512;
constexpr int VALUE1 = 7, VALUE2 = 11;
hipGraph_t graph{nullptr};
// Allocate device memory
LinearAllocGuard<int> Ah = LinearAllocGuard<int>(LinearAllocs::malloc, BLOCKSIZE * sizeof(int));
LinearAllocGuard<int> Ad =
LinearAllocGuard<int>(LinearAllocs::hipMalloc, BLOCKSIZE * sizeof(int));
LinearAllocGuard<int> Bh = LinearAllocGuard<int>(LinearAllocs::malloc, BLOCKSIZE * sizeof(int));
LinearAllocGuard<int> Bd =
LinearAllocGuard<int>(LinearAllocs::hipMalloc, BLOCKSIZE * sizeof(int));
// Fill input data
std::fill_n(Ah.host_ptr(), BLOCKSIZE, VALUE1);
std::fill_n(Bh.host_ptr(), BLOCKSIZE, VALUE2);
// Stream create
hipError_t error = hipSuccess;
SECTION("Capture Flag = hipStreamCaptureModeGlobal Single Threaded") {
StreamsGuard stream(2);
// Capture streams into graph
HIP_CHECK(hipStreamBeginCapture(stream[0], hipStreamCaptureModeGlobal));
HIP_CHECK(hipMemcpyAsync(Ad.ptr(), Ah.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream[1]));
HIP_CHECK(hipMemcpyAsync(Bd.ptr(), Bh.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream[1]));
error = hipStreamSynchronize(stream[1]);
REQUIRE(error == hipErrorStreamCaptureUnsupported);
}
#if HT_NVIDIA
SECTION("Capture Flag = hipStreamCaptureModeThreadLocal Single Threaded") {
StreamsGuard stream(2);
// Capture streams into graph
HIP_CHECK(hipStreamBeginCapture(stream[0], hipStreamCaptureModeThreadLocal));
HIP_CHECK(hipMemcpyAsync(Ad.ptr(), Ah.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream[1]));
HIP_CHECK(hipMemcpyAsync(Bd.ptr(), Bh.host_ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDefault,
stream[1]));
error = hipStreamSynchronize(stream[1]);
REQUIRE(error == hipErrorStreamCaptureUnsupported);
}
#endif
#if HT_AMD
SECTION("Capture Flag = hipStreamCaptureModeGlobal Multithreaded") {
captureStrmThread(&graph, Ah.host_ptr(), Ad.ptr(), Bh.host_ptr(), Bd.ptr(), BLOCKSIZE, GRIDSIZE,
hipStreamCaptureModeGlobal, &error);
REQUIRE(error == hipErrorStreamCaptureUnsupported);
}
#endif
SECTION("Capture Flag = hipStreamCaptureModeThreadLocal Multithreaded") {
captureStrmThread(&graph, Ah.host_ptr(), Ad.ptr(), Bh.host_ptr(), Bd.ptr(), BLOCKSIZE, GRIDSIZE,
hipStreamCaptureModeThreadLocal, &error);
REQUIRE(error == hipSuccess);
}
SECTION("Capture Flag = hipStreamCaptureModeRelaxed Multithreaded") {
captureStrmThread(&graph, Ah.host_ptr(), Ad.ptr(), Bh.host_ptr(), Bd.ptr(), BLOCKSIZE, GRIDSIZE,
hipStreamCaptureModeRelaxed, &error);
REQUIRE(error == hipSuccess);
}
if (graph != nullptr) {
hipGraphExec_t graphExec{nullptr};
StreamsGuard stream(1);
// Execute and test the graph
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphLaunch(graphExec, stream[0]));
HIP_CHECK(hipStreamSynchronize(stream[0]));
// Check output
HIP_CHECK(hipMemcpy(Ah.host_ptr(), Ad.ptr(), BLOCKSIZE * sizeof(int), hipMemcpyDeviceToHost));
for (int idx = 0; idx < BLOCKSIZE; idx++) {
REQUIRE(Ah.host_ptr()[idx] == (VALUE1 + VALUE2));
}
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipGraphDestroy(graph));
}
}