diff --git a/catch/hipTestMain/config/config_amd_windows_common.json b/catch/hipTestMain/config/config_amd_windows_common.json index af9e8e7ca9..09eadeb673 100644 --- a/catch/hipTestMain/config/config_amd_windows_common.json +++ b/catch/hipTestMain/config/config_amd_windows_common.json @@ -114,6 +114,7 @@ "Unit_hipStreamValue_Wait64_Blocking_NoMask_Nor", "Unit_hipStreamQuery_WithFinishedWork", "Unit_hipLaunchHostFunc_Graph", - "Unit_hipLaunchHostFunc_KernelHost" + "Unit_hipLaunchHostFunc_KernelHost", + "Unit_hipStreamSetCaptureDependencies_Positive_Functional" ] } diff --git a/catch/unit/graph/CMakeLists.txt b/catch/unit/graph/CMakeLists.txt index 1e89b09630..9c06e091b0 100644 --- a/catch/unit/graph/CMakeLists.txt +++ b/catch/unit/graph/CMakeLists.txt @@ -63,11 +63,13 @@ set(TEST_SRC hipGraphEventWaitNodeGetEvent.cc hipGraphExecMemcpyNodeSetParams.cc hipStreamBeginCapture.cc + hipStreamBeginCapture_old.cc hipStreamIsCapturing.cc hipStreamIsCapturing_old.cc hipStreamGetCaptureInfo.cc hipStreamGetCaptureInfo_old.cc hipStreamEndCapture.cc + hipStreamEndCapture_old.cc hipGraphMemcpyNodeSetParamsFromSymbol_old.cc hipGraphMemcpyNodeSetParamsFromSymbol.cc hipGraphExecEventWaitNodeSetEvent.cc @@ -87,6 +89,9 @@ set(TEST_SRC hipGraphHostNodeGetParams.cc hipGraphExecChildGraphNodeSetParams.cc hipStreamGetCaptureInfo_v2.cc + hipStreamUpdateCaptureDependencies.cc + hipThreadExchangeStreamCaptureMode.cc + hipLaunchHostFunc.cc hipStreamGetCaptureInfo_v2_old.cc hipUserObjectCreate.cc hipGraphDebugDotPrint.cc diff --git a/catch/unit/graph/hipLaunchHostFunc.cc b/catch/unit/graph/hipLaunchHostFunc.cc new file mode 100644 index 0000000000..005bb6b736 --- /dev/null +++ b/catch/unit/graph/hipLaunchHostFunc.cc @@ -0,0 +1,183 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +#include +#include + +#include "stream_capture_common.hh" + +/** + * @addtogroup hipLaunchHostFunc hipLaunchHostFunc + * @{ + * @ingroup GraphTest + * `hipLaunchHostFunc(hipStream_t stream, hipHostFn_t fn, void *userData)` - + * enqueues a host function call in a stream + */ + +static void hostNodeCallbackDummy(void* data) { REQUIRE(data == nullptr); } + +static void hostNodeCallback(void* data) { + float** userData = static_cast(data); + + float input_data = *(userData[0]); + float output_data = *(userData[1]); + REQUIRE(input_data == output_data); +} + +/** + * Test Description + * ------------------------ + * - Test to verify API behavior with invalid arguments: + * -# Stream is legacy/nullptr stream + * -# Function is nullptr + * -# Stream is uninitialized + * Test source + * ------------------------ + * - catch\unit\graph\hipLaunchHostFunc.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.3 + */ +TEST_CASE("Unit_hipLaunchHostFunc_Negative_Parameters") { + StreamGuard stream_guard(Streams::created); + hipStream_t stream = stream_guard.stream(); + + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); +#if HT_NVIDIA // EXSWHTEC-228 + SECTION("Pass stream as nullptr") { + hipHostFn_t fn = hostNodeCallbackDummy; + HIP_CHECK_ERROR(hipLaunchHostFunc(nullptr, fn, nullptr), hipErrorStreamCaptureImplicit); + } +#endif + SECTION("Pass functions as nullptr") { + HIP_CHECK_ERROR(hipLaunchHostFunc(stream, nullptr, nullptr), hipErrorInvalidValue); + } + + SECTION("Pass uninitialized stream") { + hipHostFn_t fn = hostNodeCallbackDummy; + constexpr auto InvalidStream = [] { + StreamGuard sg(Streams::created); + return sg.stream(); + }; + HIP_CHECK_ERROR(hipLaunchHostFunc(InvalidStream(), fn, nullptr), hipErrorContextIsDestroyed); + } +} + +/** + * Test Description + * ------------------------ + * - Test to verify enquing a host function into a stream, which checks if + * the captured computation result is correct + * Test source + * ------------------------ + * - catch\unit\graph\hipLaunchHostFunc.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.3 + */ +TEST_CASE("Unit_hipLaunchHostFunc_Positive_Functional") { + LinearAllocGuard A_h(LinearAllocs::malloc, sizeof(float)); + LinearAllocGuard B_h(LinearAllocs::malloc, sizeof(float)); + LinearAllocGuard A_d(LinearAllocs::hipMalloc, sizeof(float)); + + hipGraph_t graph{nullptr}; + hipGraphExec_t graphExec{nullptr}; + StreamGuard stream_guard(Streams::created); + hipStream_t stream = stream_guard.stream(); + + const hipStreamCaptureMode captureMode = hipStreamCaptureModeGlobal; + + HIP_CHECK(hipStreamBeginCapture(stream, captureMode)); + captureSequenceSimple(A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), 1, stream); + + hipHostFn_t fn = hostNodeCallback; + float* data[2] = {A_h.host_ptr(), B_h.host_ptr()}; + HIP_CHECK(hipLaunchHostFunc(stream, fn, static_cast(data))); + + HIP_CHECK(hipStreamEndCapture(stream, &graph)); + // Validate end capture is successful + REQUIRE(graph != nullptr); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + + // Replay the recorded sequence multiple times + for (int i = 0; i < kLaunchIters; i++) { + std::fill_n(A_h.host_ptr(), 1, static_cast(i)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + ArrayFindIfNot(B_h.host_ptr(), static_cast(i), 1); + } + + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +static void thread_func_pos(hipStream_t* stream, hipHostFn_t fn, float** data){ + + HIP_CHECK(hipLaunchHostFunc(*stream, fn, static_cast(data)))} + +/** + * Test Description + * ------------------------ + * - Test to verify enquing a host function into a stream on a different + * thread, which checks if the captured computation result is correct + * Test source + * ------------------------ + * - catch\unit\graph\hipLaunchHostFunc.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.3 + */ +TEST_CASE("Unit_hipLaunchHostFunc_Positive_Thread") { + LinearAllocGuard A_h(LinearAllocs::malloc, sizeof(float)); + LinearAllocGuard B_h(LinearAllocs::malloc, sizeof(float)); + LinearAllocGuard A_d(LinearAllocs::hipMalloc, sizeof(float)); + + hipGraph_t graph{nullptr}; + hipGraphExec_t graphExec{nullptr}; + StreamGuard stream_guard(Streams::created); + hipStream_t stream = stream_guard.stream(); + + const hipStreamCaptureMode captureMode = hipStreamCaptureModeGlobal; + + HIP_CHECK(hipStreamBeginCapture(stream, captureMode)); + captureSequenceSimple(A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), 1, stream); + + hipHostFn_t fn = hostNodeCallback; + float* data[2] = {A_h.host_ptr(), B_h.host_ptr()}; + std::thread t(thread_func_pos, &stream, fn, data); + t.join(); + + HIP_CHECK(hipStreamEndCapture(stream, &graph)); + // Validate end capture is successful + REQUIRE(graph != nullptr); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + + // Replay the recorded sequence multiple times + for (int i = 0; i < kLaunchIters; i++) { + std::fill_n(A_h.host_ptr(), 1, static_cast(i)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + ArrayFindIfNot(B_h.host_ptr(), static_cast(i), 1); + } + + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} diff --git a/catch/unit/graph/hipStreamBeginCapture.cc b/catch/unit/graph/hipStreamBeginCapture.cc index 80be624e8a..e7e8b72761 100644 --- a/catch/unit/graph/hipStreamBeginCapture.cc +++ b/catch/unit/graph/hipStreamBeginCapture.cc @@ -17,103 +17,25 @@ OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -/** -Testcase Scenarios : Functional - 1) Initiate stream capture with different modes on custom stream. - Capture stream sequence and replay the sequence in multiple iterations. - 2) End capture and validate that API returns captured graph for - all possible modes on custom stream. - 3) Initiate stream capture with different modes on hipStreamPerThread. - Capture stream sequence and replay the sequence in multiple iterations. - 4) End capture and validate that API returns captured graph for - all possible modes on hipStreamPerThread. - 5) Waiting on an event recorded on a captured stream. Initiate capture - on stream1, record an event on stream1, wait for the event on stream2, - end the stream1 capture and Initiate stream capture on stream2 - 5.1) Both streams are created with default flags. - 5.2) Both streams are created with flag = hipStreamCaptureModeGlobal. - 5.3) Both streams are created with different flags. - 5.4) Both streams are created with different priorities. - 5.5) Validate the number of nodes in both the captured graphs. - 6) Colligated Streams capture. Capture operation sequences queued in - 2 streams by overlapping the 2 captures. - 6.1) Both streams are created with default flags. - 6.2) Both streams are created with flag = hipStreamCaptureModeGlobal. - 6.3) Both streams are created with different flags. - 6.4) Both streams are created with different priorities. - 7) Extend the scenario 5.1 for 3 streamsss. - 8) Create 2 streams. Start capturing both stream1 and stream2 at the same - time. On stream1 queue memcpy, kernel and memcpy operations and on stream2 - queue memcpy, kernel and memcpy operations. Execute both the captured - graphs and validate the results. - 9) Capture 2 streams in parallel using threads. Execute the graphs in - sequence in main thread and validate the results. - 9.1) mode = hipStreamCaptureModeGlobal - 9.2) mode = hipStreamCaptureModeThreadLocal - 9.3) mode = hipStreamCaptureModeRelaxed - 10) Queue operations (increment kernels) in 3 streams. Start capturing - the streams after some operations have been queued. This scenario validates - that only operations queued after hipStreamBeginCapture are captured in - the graph. - 11) Detecting invalid capture. Create 2 streams s1 and s2. Start capturing - s1. Create event dependency between s1 and s2 using event record and event - wait. Try capturing s2. hipStreamBeginCapture must return error. - 12) Stream reuse. Capture multiple graphs from the same stream. Validate - graphs are captured correctly. - 13) Test different synchronization during stream capture. - 13.1) Test hipStreamSynchronize. Must return - hipErrorStreamCaptureUnsupported. - 13.2) Test hipDeviceSynchronize. Must return - hipErrorStreamCaptureUnsupported. - 13.3) Test hipDeviceSynchronize. Must return - hipEventSynchronize. - 13.4) Test hipStreamWaitEvent. Must return - hipErrorStreamCaptureIsolation. - 14) End Stream Capture when the stream capture is still in progress. - 14.1) Abruptly end stream capture when stream capture is in progress in - forked stream. hipStreamEndCapture must return - hipErrorStreamCaptureUnjoined. - 14.2) Abruptly end stream capture when operations in forked stream - are still waiting to be captured. hipStreamEndCapture must return - hipErrorStreamCaptureUnjoined. - 15) Testing independent stream capture using multiple GPUs. Capture - a stream in each device context and execute the captured graph in the - context GPU. - 16) Test Nested Stream Capture Functionality: Create 3 streams s1, s2 & s3. - Capture s1, record event e1 on s1, wait for event e1 on s2 and queue - operations in s1. Record event e2 on s2 and wait for it on s3. Queue - operations on both s2 and s3. Record event e4 on s3 and wait for it in s1. - Record event e3 on s2 and wait for it in s1. End stream capture on s1. - Execute the graph and verify the result. - 17) Forked Stream Reuse: In scenario 16, after end capture on s1, queue - operations on both s2 and s3, and capture their graphs. Execute both the - graphs and validate the functionality. - 18) Capture a complex graph containing multiple independent memcpy, kernel - and host nodes. Launch the graph on random input data and validate the - output. - 19) Capture empty streams (parent + forked streams) and validate the - functionality. -*/ - -#include #include #include +#include + +#include "stream_capture_common.hh" + +/** + * @addtogroup hipStreamBeginCapture hipStreamBeginCapture + * @{ + * @ingroup GraphTest + * `hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode)` - + * begins graph capture on a stream + */ -#define INCREMENT_KERNEL_FINALEXP_VAL 7 -constexpr size_t N = 1000000; -constexpr int LAUNCH_ITERS = 50; static int gCbackIter = 0; -#define GRIDSIZE 256 -#define BLOCKSIZE 256 -#define CONST_KER1_VAL 3 -#define CONST_KER2_VAL 2 -#define CONST_KER3_VAL 5 -static __global__ void dummyKernel() { - return; -} +static __global__ void dummyKernel() { return; } -static __global__ void incrementKernel(int *data) { +static __global__ void incrementKernel(int* data) { atomicAdd(data, 1); return; } @@ -133,21 +55,25 @@ static void hostNodeCallback(void* data) { gCbackIter++; } -bool CaptureStreamAndLaunchGraph(float *A_d, float *C_d, float *A_h, - float *C_h, hipStreamCaptureMode mode, hipStream_t stream) { +template +void captureStreamAndLaunchGraph(F graphFunc, hipStreamCaptureMode mode, hipStream_t stream) { + constexpr size_t N = 1000000; + size_t Nbytes = N * sizeof(T); + hipGraph_t graph{nullptr}; hipGraphExec_t graphExec{nullptr}; - constexpr unsigned blocks = 512; - constexpr unsigned threadsPerBlock = 256; - size_t Nbytes = N * sizeof(float); + // Host and Device allocation + LinearAllocGuard A_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard B_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard A_d(LinearAllocs::hipMalloc, Nbytes); + LinearAllocGuard B_d(LinearAllocs::hipMalloc, Nbytes); + + // Capture stream sequence HIP_CHECK(hipStreamBeginCapture(stream, mode)); - HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream)); + graphFunc(A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), B_d.ptr(), N, stream); - HIP_CHECK(hipMemsetAsync(C_d, 0, Nbytes, stream)); - hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), - dim3(threadsPerBlock), 0, stream, A_d, C_d, N); - HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream)); + captureSequenceCompute(A_d.ptr(), B_h.ptr(), B_d.ptr(), N, stream); HIP_CHECK(hipStreamEndCapture(stream, &graph)); @@ -158,176 +84,142 @@ bool CaptureStreamAndLaunchGraph(float *A_d, float *C_d, float *A_h, REQUIRE(graphExec != nullptr); // Replay the recorded sequence multiple times - for (int i = 0; i < LAUNCH_ITERS; i++) { + for (int i = 0; i < kLaunchIters; i++) { + std::fill_n(A_h.host_ptr(), N, static_cast(i)); HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + ArrayFindIfNot(B_h.host_ptr(), static_cast(i) * static_cast(i), N); } - HIP_CHECK(hipStreamSynchronize(stream)); - - HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphExecDestroy(graphExec)) HIP_CHECK(hipGraphDestroy(graph)); - - // Validate the computation - for (size_t i = 0; i < N; i++) { - if (C_h[i] != A_h[i] * A_h[i]) { - UNSCOPED_INFO("A and C not matching at " << i); - return false; - } - } - return true; } /** - * Basic Functional Test for API capturing custom stream and replaying sequence. - * Test exercises the API on available/possible modes. - * Stream capture with different modes behave the same when supported/ - * safe apis are used in sequence. + * Test Description + * ------------------------ + * - Basic Functional Test for capturing created/hipStreamPerThread stream + * and replaying sequence. Test exercises the API on all available modes: + * -# Linear sequence capture - each graph node has only one dependency + * -# Branched sequence capture - some graph nodes have more than one + * dependency + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_BasicFunctional") { - float *A_d, *C_d; - float *A_h, *C_h; - size_t Nbytes = N * sizeof(float); - hipStream_t stream; - bool ret; +TEST_CASE("Unit_hipStreamBeginCapture_Positive_Functional") { + const auto stream_type = GENERATE(Streams::perThread, Streams::created); + StreamGuard stream_guard(stream_type); + hipStream_t stream = stream_guard.stream(); - A_h = reinterpret_cast(malloc(Nbytes)); - C_h = reinterpret_cast(malloc(Nbytes)); - REQUIRE(A_h != nullptr); - REQUIRE(C_h != nullptr); + const hipStreamCaptureMode captureMode = GENERATE( + hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed); - // Fill with Phi + i - for (size_t i = 0; i < N; i++) { - A_h[i] = 1.618f + i; + EventsGuard events_guard(3); + StreamsGuard streams_guard(2); + + SECTION("Linear graph capture") { + captureStreamAndLaunchGraph( + [](float* A_h, float* A_d, float* B_h, float* B_d, size_t N, hipStream_t stream) { + return captureSequenceLinear(A_h, A_d, B_h, B_d, N, stream); + }, + captureMode, stream); } - HIP_CHECK(hipStreamCreate(&stream)); - HIP_CHECK(hipMalloc(&A_d, Nbytes)); - HIP_CHECK(hipMalloc(&C_d, Nbytes)); - REQUIRE(A_d != nullptr); - REQUIRE(C_d != nullptr); - - SECTION("Capture stream and launch graph when mode is global") { - ret = CaptureStreamAndLaunchGraph(A_d, C_d, A_h, C_h, - hipStreamCaptureModeGlobal, stream); - REQUIRE(ret == true); + SECTION("Branched graph capture") { + captureStreamAndLaunchGraph( + [&streams_guard, &events_guard](float* A_h, float* A_d, float* B_h, float* B_d, size_t N, + hipStream_t stream) { + captureSequenceBranched(A_h, A_d, B_h, B_d, N, stream, streams_guard.stream_list(), + events_guard.event_list()); + }, + captureMode, stream); } - - SECTION("Capture stream and launch graph when mode is local") { - ret = CaptureStreamAndLaunchGraph(A_d, C_d, A_h, C_h, - hipStreamCaptureModeThreadLocal, stream); - REQUIRE(ret == true); - } - - SECTION("Capture stream and launch graph when mode is relaxed") { - ret = CaptureStreamAndLaunchGraph(A_d, C_d, A_h, C_h, - hipStreamCaptureModeRelaxed, stream); - REQUIRE(ret == true); - } - - HIP_CHECK(hipStreamDestroy(stream)); - free(A_h); - free(C_h); - HIP_CHECK(hipFree(A_d)); - HIP_CHECK(hipFree(C_d)); } /** - * Perform capture on hipStreamPerThread, launch the graph and verify results. + * Test Description + * ------------------------ + * - Test to verify API behavior with invalid arguments: + * -# Begin capture on legacy/null stream + * -# Begin capture on the already captured stream + * -# Begin capture with invalid mode + * -# Begin capture on uninitialized stream + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_hipStreamPerThread") { - float *A_d, *C_d; - float *A_h, *C_h; - size_t Nbytes = N * sizeof(float); - hipStream_t stream{hipStreamPerThread}; - bool ret; - - A_h = reinterpret_cast(malloc(Nbytes)); - C_h = reinterpret_cast(malloc(Nbytes)); - REQUIRE(A_h != nullptr); - REQUIRE(C_h != nullptr); - - // Fill with Phi + i - for (size_t i = 0; i < N; i++) { - A_h[i] = 1.618f + i; - } - - HIP_CHECK(hipMalloc(&A_d, Nbytes)); - HIP_CHECK(hipMalloc(&C_d, Nbytes)); - REQUIRE(A_d != nullptr); - REQUIRE(C_d != nullptr); - - SECTION("Capture hipStreamPerThread and launch graph when mode is global") { - ret = CaptureStreamAndLaunchGraph(A_d, C_d, A_h, C_h, - hipStreamCaptureModeGlobal, stream); - REQUIRE(ret == true); - } - - SECTION("Capture hipStreamPerThread and launch graph when mode is local") { - ret = CaptureStreamAndLaunchGraph(A_d, C_d, A_h, C_h, - hipStreamCaptureModeThreadLocal, stream); - REQUIRE(ret == true); - } - - SECTION("Capture hipStreamPerThread and launch graph when mode is relaxed") { - ret = CaptureStreamAndLaunchGraph(A_d, C_d, A_h, C_h, - hipStreamCaptureModeRelaxed, stream); - REQUIRE(ret == true); - } - - free(A_h); - free(C_h); - HIP_CHECK(hipFree(A_d)); - HIP_CHECK(hipFree(C_d)); -} - - -/* Test verifies hipStreamBeginCapture API Negative scenarios. - */ - -TEST_CASE("Unit_hipStreamBeginCapture_Negative") { - hipError_t ret; - hipStream_t stream{}; - HIP_CHECK(hipStreamCreate(&stream)); +TEST_CASE("Unit_hipStreamBeginCapture_Negative_Parameters") { + const auto stream_type = GENERATE(Streams::created); + StreamGuard stream_guard(stream_type); + hipStream_t stream = stream_guard.stream(); SECTION("Stream capture on legacy/null stream returns error code.") { - ret = hipStreamBeginCapture(nullptr, hipStreamCaptureModeGlobal); - REQUIRE(hipErrorStreamCaptureUnsupported == ret); + HIP_CHECK_ERROR(hipStreamBeginCapture(nullptr, hipStreamCaptureModeGlobal), + hipErrorStreamCaptureUnsupported); } SECTION("Capturing hipStream status with same stream again") { HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); - ret = hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal); - REQUIRE(hipErrorIllegalState == ret); + HIP_CHECK_ERROR(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal), + hipErrorIllegalState); } SECTION("Creating hipStream with invalid mode") { - ret = hipStreamBeginCapture(stream, hipStreamCaptureMode(-1)); - REQUIRE(hipErrorInvalidValue == ret); + HIP_CHECK_ERROR(hipStreamBeginCapture(stream, hipStreamCaptureMode(-1)), hipErrorInvalidValue); } - HIP_CHECK(hipStreamDestroy(stream)); +#if HT_NVIDIA // EXSWHTEC-216 + SECTION("Stream capture on uninitialized stream returns error code.") { + constexpr auto InvalidStream = [] { + StreamGuard sg(Streams::created); + return sg.stream(); + }; + HIP_CHECK_ERROR(hipStreamBeginCapture(InvalidStream(), hipStreamCaptureModeGlobal), + hipErrorContextIsDestroyed); + } +#endif } -TEST_CASE("Unit_hipStreamBeginCapture_Basic") { - hipStream_t s1, s2, s3; - - HIP_CHECK(hipStreamCreate(&s1)); - HIP_CHECK(hipStreamBeginCapture(s1, hipStreamCaptureModeGlobal)); - - HIP_CHECK(hipStreamCreate(&s2)); - HIP_CHECK(hipStreamBeginCapture(s2, hipStreamCaptureModeThreadLocal)); - - HIP_CHECK(hipStreamCreate(&s3)); - HIP_CHECK(hipStreamBeginCapture(s3, hipStreamCaptureModeRelaxed)); - - HIP_CHECK(hipStreamDestroy(s1)); - HIP_CHECK(hipStreamDestroy(s2)); - HIP_CHECK(hipStreamDestroy(s3)); -} -/* Local Function +/** + * Test Description + * ------------------------ + * - Basic Test to verify basic API functionality with + * created/hipStreamPerThread stream for available modes + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -static void interStrmEventSyncCapture(const hipStream_t &stream1, - const hipStream_t &stream2) { - hipGraph_t graph1, graph2; - hipEvent_t event; +TEST_CASE("Unit_hipStreamBeginCapture_Positive_Basic") { + hipGraph_t graph{nullptr}; + const auto stream_type = GENERATE(Streams::perThread, Streams::created); + StreamGuard stream_guard(stream_type); + hipStream_t s = stream_guard.stream(); + + const hipStreamCaptureMode captureMode = GENERATE( + hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed); + + HIP_CHECK(hipStreamBeginCapture(s, captureMode)); + + HIP_CHECK(hipStreamEndCapture(s, &graph)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +/* Local function for inter stream event synchronization + */ +static void interStrmEventSyncCapture(const hipStream_t& stream1, const hipStream_t& stream2) { + hipGraph_t graph1{nullptr}, graph2{nullptr}; hipGraphExec_t graphExec1{nullptr}, graphExec2{nullptr}; + + EventsGuard events_guard(1); + hipEvent_t event = events_guard[0]; + HIP_CHECK(hipEventCreate(&event)); HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); HIP_CHECK(hipEventRecord(event, stream1)); @@ -338,35 +230,43 @@ static void interStrmEventSyncCapture(const hipStream_t &stream1, dummyKernel<<<1, 1, 0, stream2>>>(); dummyKernel<<<1, 1, 0, stream2>>>(); HIP_CHECK(hipStreamEndCapture(stream2, &graph2)); - // Create Executable Graphs - HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); - REQUIRE(graphExec1 != nullptr); - HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0)); - REQUIRE(graphExec2 != nullptr); + size_t numNodes1 = 0, numNodes2 = 0; HIP_CHECK(hipGraphGetNodes(graph1, nullptr, &numNodes1)); HIP_CHECK(hipGraphGetNodes(graph2, nullptr, &numNodes2)); REQUIRE(numNodes1 == 1); REQUIRE(numNodes2 == 2); - // Execute the Graphs - HIP_CHECK(hipGraphLaunch(graphExec1, stream1)); - HIP_CHECK(hipGraphLaunch(graphExec2, stream2)); - HIP_CHECK(hipStreamSynchronize(stream1)); - HIP_CHECK(hipStreamSynchronize(stream2)); + + HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); + REQUIRE(graphExec1 != nullptr); + HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0)); + REQUIRE(graphExec2 != nullptr); + + // Replay the recorded sequence multiple times + for (int i = 0; i < kLaunchIters; i++) { + // Execute the Graphs + HIP_CHECK(hipGraphLaunch(graphExec1, stream1)); + HIP_CHECK(hipGraphLaunch(graphExec2, stream2)); + HIP_CHECK(hipStreamSynchronize(stream1)); + HIP_CHECK(hipStreamSynchronize(stream2)); + } + // Free HIP_CHECK(hipGraphExecDestroy(graphExec2)); HIP_CHECK(hipGraphExecDestroy(graphExec1)); HIP_CHECK(hipGraphDestroy(graph2)); HIP_CHECK(hipGraphDestroy(graph1)); - HIP_CHECK(hipEventDestroy(event)); } -/* Local Function + +/* Local function for colligated stream capture */ -static void colligatedStrmCapture(const hipStream_t &stream1, - const hipStream_t &stream2) { - hipGraph_t graph1, graph2; - hipEvent_t event; +static void colligatedStrmCapture(const hipStream_t& stream1, const hipStream_t& stream2) { + hipGraph_t graph1{nullptr}, graph2{nullptr}; hipGraphExec_t graphExec1{nullptr}, graphExec2{nullptr}; + + EventsGuard events_guard(1); + hipEvent_t event = events_guard[0]; + HIP_CHECK(hipEventCreate(&event)); HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); HIP_CHECK(hipEventRecord(event, stream1)); @@ -379,550 +279,638 @@ static void colligatedStrmCapture(const hipStream_t &stream1, // Validate end capture is successful REQUIRE(graph2 != nullptr); REQUIRE(graph1 != nullptr); - // Create Executable Graphs + HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); REQUIRE(graphExec1 != nullptr); HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0)); REQUIRE(graphExec2 != nullptr); - // Execute the Graphs - HIP_CHECK(hipGraphLaunch(graphExec1, stream1)); - HIP_CHECK(hipGraphLaunch(graphExec2, stream2)); - HIP_CHECK(hipStreamSynchronize(stream1)); - HIP_CHECK(hipStreamSynchronize(stream2)); + + // Replay the recorded sequence multiple times + for (int i = 0; i < kLaunchIters; i++) { + // Execute the Graphs + HIP_CHECK(hipGraphLaunch(graphExec1, stream1)); + HIP_CHECK(hipGraphLaunch(graphExec2, stream2)); + HIP_CHECK(hipStreamSynchronize(stream1)); + HIP_CHECK(hipStreamSynchronize(stream2)); + } + // Free HIP_CHECK(hipGraphExecDestroy(graphExec2)); HIP_CHECK(hipGraphExecDestroy(graphExec1)); HIP_CHECK(hipGraphDestroy(graph2)); HIP_CHECK(hipGraphDestroy(graph1)); - HIP_CHECK(hipEventDestroy(event)); } -/* Fill input Data + +/* Local function for colligated stream capture functionality */ -static void init_input(int* a, size_t size) { - unsigned int seed = time(nullptr); - for (size_t i = 0; i < size; i++) { - a[i] = (HipTest::RAND_R(&seed) & 0xFF); - } -} -/* Validate Output - */ -static void validate_output(int* a, int *b, size_t size) { - for (size_t i = 0; i < size; i++) { - REQUIRE(a[i] == (b[i]*b[i])); - } -} -/* Local Function - */ -static void colligatedStrmCaptureFunc(const hipStream_t &stream1, - const hipStream_t &stream2) { - constexpr size_t size = 1024; - constexpr auto blocksPerCU = 6; - constexpr auto threadsPerBlock = 256; - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, - threadsPerBlock, size); - hipGraph_t graph1, graph2; - int *inputVec_d1{nullptr}, *inputVec_h1{nullptr}, *outputVec_h1{nullptr}, - *outputVec_d1{nullptr}; - int *inputVec_d2{nullptr}, *inputVec_h2{nullptr}, *outputVec_h2{nullptr}, - *outputVec_d2{nullptr}; +static void colligatedStrmCaptureFunc(const hipStream_t& stream1, const hipStream_t& stream2) { + constexpr size_t N = 1000000; + size_t Nbytes = N * sizeof(int); + + hipGraph_t graph1{nullptr}, graph2{nullptr}; hipGraphExec_t graphExec1{nullptr}, graphExec2{nullptr}; - // host and device allocation - HipTest::initArrays(&inputVec_d1, &outputVec_d1, nullptr, - &inputVec_h1, &outputVec_h1, nullptr, size, false); - HipTest::initArrays(&inputVec_d2, &outputVec_d2, nullptr, - &inputVec_h2, &outputVec_h2, nullptr, size, false); + + // Host and device allocation + LinearAllocGuard A_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard B_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard A_d(LinearAllocs::hipMalloc, Nbytes); + LinearAllocGuard B_d(LinearAllocs::hipMalloc, Nbytes); + LinearAllocGuard C_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard C_d(LinearAllocs::hipMalloc, Nbytes); + LinearAllocGuard D_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard D_d(LinearAllocs::hipMalloc, Nbytes); + // Capture 2 streams HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipMemcpyAsync(inputVec_d1, inputVec_h1, sizeof(int) * size, - hipMemcpyDefault, stream1)); - HIP_CHECK(hipMemcpyAsync(inputVec_d2, inputVec_h2, sizeof(int) * size, - hipMemcpyDefault, stream2)); - HipTest::vector_square<<>>( - inputVec_d1, outputVec_d1, size); - HipTest::vector_square<<>>( - inputVec_d2, outputVec_d2, size); - HIP_CHECK(hipMemcpyAsync(outputVec_h1, outputVec_d1, sizeof(int) * size, - hipMemcpyDefault, stream1)); - HIP_CHECK(hipMemcpyAsync(outputVec_h2, outputVec_d2, sizeof(int) * size, - hipMemcpyDefault, stream2)); + captureSequenceLinear(A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), B_d.ptr(), N, stream1); + captureSequenceLinear(C_h.host_ptr(), C_d.ptr(), D_h.host_ptr(), D_d.ptr(), N, stream2); + captureSequenceCompute(A_d.ptr(), B_h.host_ptr(), B_d.ptr(), N, stream1); + captureSequenceCompute(C_d.ptr(), D_h.host_ptr(), D_d.ptr(), N, stream2); HIP_CHECK(hipStreamEndCapture(stream1, &graph1)); HIP_CHECK(hipStreamEndCapture(stream2, &graph2)); // Validate end capture is successful REQUIRE(graph2 != nullptr); REQUIRE(graph1 != nullptr); + // Create Executable Graphs HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); REQUIRE(graphExec1 != nullptr); HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0)); REQUIRE(graphExec2 != nullptr); + // Execute the Graphs - for (int iter = 0; iter < LAUNCH_ITERS; iter++) { - init_input(inputVec_h1, size); - init_input(inputVec_h2, size); + for (int iter = 0; iter < kLaunchIters; iter++) { + std::fill_n(A_h.host_ptr(), N, iter); + std::fill_n(C_h.host_ptr(), N, iter); HIP_CHECK(hipGraphLaunch(graphExec1, stream1)); HIP_CHECK(hipGraphLaunch(graphExec2, stream2)); HIP_CHECK(hipStreamSynchronize(stream1)); HIP_CHECK(hipStreamSynchronize(stream2)); - validate_output(outputVec_h1, inputVec_h1, size); - validate_output(outputVec_h2, inputVec_h2, size); + ArrayFindIfNot(B_h.host_ptr(), iter * iter, N); + ArrayFindIfNot(D_h.host_ptr(), iter * iter, N); } + // Free - HipTest::freeArrays(inputVec_d1, outputVec_d1, nullptr, - inputVec_h1, outputVec_h1, nullptr, false); - HipTest::freeArrays(inputVec_d2, outputVec_d2, nullptr, - inputVec_h2, outputVec_h2, nullptr, false); HIP_CHECK(hipGraphExecDestroy(graphExec2)); HIP_CHECK(hipGraphExecDestroy(graphExec1)); HIP_CHECK(hipGraphDestroy(graph2)); HIP_CHECK(hipGraphDestroy(graph1)); } + /* Stream Capture thread function */ -static void threadStrmCaptureFunc(hipStream_t stream, int *inputVec_d, -int *outputVec_d, int *inputVec_h, int *outputVec_h, hipGraph_t *graph, -size_t size, hipStreamCaptureMode mode) { - constexpr auto blocksPerCU = 6; - constexpr auto threadsPerBlock = 256; - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, - threadsPerBlock, size); +static void threadStrmCaptureFunc(hipStream_t stream, int* A_h, int* A_d, int* B_h, int* B_d, + hipGraph_t* graph, size_t N, hipStreamCaptureMode mode) { // Capture stream HIP_CHECK(hipStreamBeginCapture(stream, mode)); - HIP_CHECK(hipMemcpyAsync(inputVec_d, inputVec_h, sizeof(int) * size, - hipMemcpyDefault, stream)); - HipTest::vector_square<<>>( - inputVec_d, outputVec_d, size); - HIP_CHECK(hipMemcpyAsync(outputVec_h, outputVec_d, sizeof(int) * size, - hipMemcpyDefault, stream)); + captureSequenceLinear(A_h, A_d, B_h, B_d, N, stream); + captureSequenceCompute(A_d, B_h, B_d, N, stream); HIP_CHECK(hipStreamEndCapture(stream, graph)); } + /* Local Function for multithreaded tests */ static void multithreadedTest(hipStreamCaptureMode mode) { - hipStream_t stream1, stream2; - constexpr size_t size = 1024; - hipGraph_t graph1, graph2; - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); - int *inputVec_d1{nullptr}, *inputVec_h1{nullptr}, *outputVec_h1{nullptr}, - *outputVec_d1{nullptr}; - int *inputVec_d2{nullptr}, *inputVec_h2{nullptr}, *outputVec_h2{nullptr}, - *outputVec_d2{nullptr}; + constexpr size_t N = 1000000; + size_t Nbytes = N * sizeof(int); + + hipGraph_t graph1{nullptr}, graph2{nullptr}; hipGraphExec_t graphExec1{nullptr}, graphExec2{nullptr}; - // host and device allocation - HipTest::initArrays(&inputVec_d1, &outputVec_d1, nullptr, - &inputVec_h1, &outputVec_h1, nullptr, size, false); - HipTest::initArrays(&inputVec_d2, &outputVec_d2, nullptr, - &inputVec_h2, &outputVec_h2, nullptr, size, false); + StreamGuard stream_guard1(Streams::created); + hipStream_t stream1 = stream_guard1.stream(); + StreamGuard stream_guard2(Streams::created); + hipStream_t stream2 = stream_guard2.stream(); + + // Host and device allocation + LinearAllocGuard A_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard B_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard A_d(LinearAllocs::hipMalloc, Nbytes); + LinearAllocGuard B_d(LinearAllocs::hipMalloc, Nbytes); + LinearAllocGuard C_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard D_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard C_d(LinearAllocs::hipMalloc, Nbytes); + LinearAllocGuard D_d(LinearAllocs::hipMalloc, Nbytes); + // Launch 2 threads to capture the 2 streams into graphs - std::thread t1(threadStrmCaptureFunc, stream1, inputVec_d1, - outputVec_d1, inputVec_h1, outputVec_h1, &graph1, size, mode); - std::thread t2(threadStrmCaptureFunc, stream2, inputVec_d2, - outputVec_d2, inputVec_h2, outputVec_h2, &graph2, size, mode); + std::thread t1(threadStrmCaptureFunc, stream1, A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), + B_d.ptr(), &graph1, N, mode); + std::thread t2(threadStrmCaptureFunc, stream2, C_h.host_ptr(), C_d.ptr(), D_h.host_ptr(), + D_d.ptr(), &graph2, N, mode); t1.join(); t2.join(); + // Create Executable Graphs HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); + REQUIRE(graphExec1 != nullptr); HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0)); + REQUIRE(graphExec2 != nullptr); + // Execute the Graphs - for (int iter = 0; iter < LAUNCH_ITERS; iter++) { - init_input(inputVec_h1, size); - init_input(inputVec_h2, size); + for (int iter = 0; iter < kLaunchIters; iter++) { + std::fill_n(A_h.host_ptr(), N, iter); + std::fill_n(C_h.host_ptr(), N, iter); HIP_CHECK(hipGraphLaunch(graphExec1, stream1)); HIP_CHECK(hipGraphLaunch(graphExec2, stream2)); HIP_CHECK(hipStreamSynchronize(stream1)); HIP_CHECK(hipStreamSynchronize(stream2)); - validate_output(outputVec_h1, inputVec_h1, size); - validate_output(outputVec_h2, inputVec_h2, size); + ArrayFindIfNot(B_h.host_ptr(), iter * iter, N); + ArrayFindIfNot(D_h.host_ptr(), iter * iter, N); } + // Free - HipTest::freeArrays(inputVec_d1, outputVec_d1, nullptr, - inputVec_h1, outputVec_h1, nullptr, false); - HipTest::freeArrays(inputVec_d2, outputVec_d2, nullptr, - inputVec_h2, outputVec_h2, nullptr, false); HIP_CHECK(hipGraphExecDestroy(graphExec2)); HIP_CHECK(hipGraphExecDestroy(graphExec1)); - HIP_CHECK(hipGraphDestroy(graph1)); HIP_CHECK(hipGraphDestroy(graph2)); - HIP_CHECK(hipStreamDestroy(stream1)); - HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipGraphDestroy(graph1)); } -/* Test scenario 5.1 + +/** + * Test Description + * ------------------------ + * - Test to verify inter stream event synchronization- Waiting on an event + recorded on a captured stream. Initiate capture on stream1, record an event on + stream1, wait for the event on stream2, end the stream1 capture and initiate + stream capture on stream2 + * -# Streams are created with hipStreamDefault/hipStreamNonBlocking flag + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_InterStrmEventSync_defaultflag") { - hipStream_t stream1, stream2; - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); +TEST_CASE("Unit_hipStreamBeginCapture_Positive_InterStrmEventSync_Flags") { + const auto stream_flags1 = GENERATE(hipStreamDefault, hipStreamNonBlocking); + const auto stream_flags2 = GENERATE(hipStreamDefault, hipStreamNonBlocking); + StreamGuard stream_guard1(Streams::withFlags, stream_flags1); + hipStream_t stream1 = stream_guard1.stream(); + StreamGuard stream_guard2(Streams::withFlags, stream_flags2); + hipStream_t stream2 = stream_guard2.stream(); interStrmEventSyncCapture(stream1, stream2); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); } -/* Test scenario 5.2 + +/** + * Test Description + * ------------------------ + * - Test to verify inter stream event synchronization- Waiting on an event + * recorded on a captured stream. Initiate capture on stream1, record an event + * on stream1, wait for the event on stream2, end the stream1 capture and + * initiate stream capture on stream2 + * -# Stream1 is created with minimal priority, stream 2 is created with + * maximal priority + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_InterStrmEventSync_blockingflag") { - hipStream_t stream1, stream2; - HIP_CHECK(hipStreamCreateWithFlags(&stream1, hipStreamNonBlocking)); - HIP_CHECK(hipStreamCreateWithFlags(&stream2, hipStreamNonBlocking)); - interStrmEventSyncCapture(stream1, stream2); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); -} -/* Test scenario 5.3 - */ -TEST_CASE("Unit_hipStreamBeginCapture_InterStrmEventSync_diffflags") { - hipStream_t stream1, stream2; - HIP_CHECK(hipStreamCreateWithFlags(&stream1, hipStreamNonBlocking)); - HIP_CHECK(hipStreamCreateWithFlags(&stream2, hipStreamDefault)); - interStrmEventSyncCapture(stream1, stream2); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); -} -/* Test scenario 5.4 - */ -TEST_CASE("Unit_hipStreamBeginCapture_InterStrmEventSync_diffprio") { - hipStream_t stream1, stream2; +TEST_CASE("Unit_hipStreamBeginCapture_Positive_InterStrmEventSync_Priority") { int minPriority = 0, maxPriority = 0; HIP_CHECK(hipDeviceGetStreamPriorityRange(&minPriority, &maxPriority)); - HIP_CHECK(hipStreamCreateWithPriority(&stream1, hipStreamDefault, - minPriority)); - HIP_CHECK(hipStreamCreateWithPriority(&stream2, hipStreamDefault, - maxPriority)); + StreamGuard stream_guard1(Streams::withPriority, hipStreamDefault, minPriority); + hipStream_t stream1 = stream_guard1.stream(); + StreamGuard stream_guard2(Streams::withPriority, hipStreamDefault, maxPriority); + hipStream_t stream2 = stream_guard2.stream(); interStrmEventSyncCapture(stream1, stream2); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); } -/* Test scenario 6.1 + +/** + * Test Description + * ------------------------ + * - Test to verify colligated streams capture. Capture operation sequences + * queued in 2 streams by overlapping the 2 captures. Initiate capture on + * stream1, record an event on stream1, initiate capture on stream 2, end both + * stream captures + * -# Streams are created with hipStreamDefault/hipStreamNonBlocking flag + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_ColligatedStrmCapture_defaultflag") { - hipStream_t stream1, stream2; - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); +TEST_CASE("Unit_hipStreamBeginCapture_Positive_ColligatedStrmCapture_Flags") { + const auto stream_flags1 = GENERATE(hipStreamDefault, hipStreamNonBlocking); + const auto stream_flags2 = GENERATE(hipStreamDefault, hipStreamNonBlocking); + StreamGuard stream_guard1(Streams::withFlags, stream_flags1); + hipStream_t stream1 = stream_guard1.stream(); + StreamGuard stream_guard2(Streams::withFlags, stream_flags2); + hipStream_t stream2 = stream_guard2.stream(); colligatedStrmCapture(stream1, stream2); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); } -/* Test scenario 6.2 + +/** + * Test Description + * ------------------------ + * - Test to verify colligated streams capture. Capture operation sequences + * queued in 2 streams by overlapping the 2 captures. Initiate capture on + * stream1, record an event on stream1, initiate capture on stream 2, end both + * stream captures + * -# Stream1 is created with minimal priority, stream 2 is created with + * maximal priority + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_ColligatedStrmCapture_blockingflag") { - hipStream_t stream1, stream2; - HIP_CHECK(hipStreamCreateWithFlags(&stream1, hipStreamNonBlocking)); - HIP_CHECK(hipStreamCreateWithFlags(&stream2, hipStreamNonBlocking)); - colligatedStrmCapture(stream1, stream2); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); -} -/* Test scenario 6.3 - */ -TEST_CASE("Unit_hipStreamBeginCapture_ColligatedStrmCapture_diffflags") { - hipStream_t stream1, stream2; - HIP_CHECK(hipStreamCreateWithFlags(&stream1, hipStreamNonBlocking)); - HIP_CHECK(hipStreamCreateWithFlags(&stream2, hipStreamDefault)); - colligatedStrmCapture(stream1, stream2); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); -} -/* Test scenario 6.4 - */ -TEST_CASE("Unit_hipStreamBeginCapture_ColligatedStrmCapture_diffprio") { - hipStream_t stream1, stream2; +TEST_CASE("Unit_hipStreamBeginCapture_Positive_ColligatedStrmCapture_Priority") { int minPriority = 0, maxPriority = 0; HIP_CHECK(hipDeviceGetStreamPriorityRange(&minPriority, &maxPriority)); - HIP_CHECK(hipStreamCreateWithPriority(&stream1, hipStreamDefault, - minPriority)); - HIP_CHECK(hipStreamCreateWithPriority(&stream2, hipStreamDefault, - maxPriority)); + StreamGuard stream_guard1(Streams::withPriority, hipStreamDefault, minPriority); + hipStream_t stream1 = stream_guard1.stream(); + StreamGuard stream_guard2(Streams::withPriority, hipStreamDefault, maxPriority); + hipStream_t stream2 = stream_guard2.stream(); colligatedStrmCapture(stream1, stream2); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); } -/* Test scenario 7 + +/** + * Test Description + * ------------------------ + * - Create 2 streams. Start capturing both stream1 and stream2 at the same + * time. On stream1 queue memcpy, kernel and memcpy operations and on stream2 + * queue memcpy, kernel and memcpy operations. Execute both the captured graphs + * and validate the results + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_multiplestrms") { - hipStream_t stream1, stream2, stream3; - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); - HIP_CHECK(hipStreamCreate(&stream3)); - hipGraph_t graph1, graph2, graph3; +TEST_CASE("Unit_hipStreamBeginCapture_Positive_ColligatedStrmCaptureFunc") { + StreamGuard stream_guard1(Streams::created); + hipStream_t stream1 = stream_guard1.stream(); + StreamGuard stream_guard2(Streams::created); + hipStream_t stream2 = stream_guard2.stream(); + colligatedStrmCaptureFunc(stream1, stream2); +} + +/** + * Test Description + * ------------------------ + * - Capture 2 streams in parallel using threads. Execute the graphs in + * sequence in main thread and validate the results for all available capture + * modes + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_hipStreamBeginCapture_Positive_Multithreaded") { + const hipStreamCaptureMode captureMode = GENERATE( + hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed); + multithreadedTest(captureMode); +} + +/** + * Test Description + * ------------------------ + * - Test to verify inter stream event synchronization- Waiting on an event + * recorded on a captured stream. + * -# Initiate capture on stream1, record an event on stream1, wait for + * the event on stream2, end the stream1 capture and initiate stream capture on + * stream2. Repeat the same sequence between stream2 and stream3 + * -# Initiate capture on stream1, record an event on stream1, wait for + * the event on stream2 and stream3, end the stream1 capture and initiate stream + * capture on stream2 and stream3 + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_hipStreamBeginCapture_Positive_Multiplestrms") { + StreamsGuard streams(3); + hipGraph_t graphs[3]; + size_t numNodes1 = 0, numNodes2 = 0, numNodes3 = 0; SECTION("Capture Multiple stream with interdependent events") { - hipEvent_t event1, event2; - HIP_CHECK(hipEventCreate(&event1)); - HIP_CHECK(hipEventCreate(&event2)); - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipEventRecord(event1, stream1)); - HIP_CHECK(hipStreamWaitEvent(stream2, event1, 0)); - dummyKernel<<<1, 1, 0, stream1>>>(); - HIP_CHECK(hipStreamEndCapture(stream1, &graph1)); - HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipEventRecord(event2, stream2)); - HIP_CHECK(hipStreamWaitEvent(stream3, event2, 0)); - dummyKernel<<<1, 1, 0, stream2>>>(); - HIP_CHECK(hipStreamEndCapture(stream2, &graph2)); - HIP_CHECK(hipStreamBeginCapture(stream3, hipStreamCaptureModeGlobal)); - dummyKernel<<<1, 1, 0, stream3>>>(); - HIP_CHECK(hipStreamEndCapture(stream3, &graph3)); - HIP_CHECK(hipGraphGetNodes(graph1, nullptr, &numNodes1)); - HIP_CHECK(hipGraphGetNodes(graph2, nullptr, &numNodes2)); - HIP_CHECK(hipGraphGetNodes(graph3, nullptr, &numNodes3)); + EventsGuard events(2); + + HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(events[0], streams[0])); + HIP_CHECK(hipStreamWaitEvent(streams[1], events[0], 0)); + dummyKernel<<<1, 1, 0, streams[0]>>>(); + HIP_CHECK(hipStreamEndCapture(streams[0], &graphs[0])); + HIP_CHECK(hipStreamBeginCapture(streams[1], hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(events[1], streams[1])); + HIP_CHECK(hipStreamWaitEvent(streams[2], events[1], 0)); + dummyKernel<<<1, 1, 0, streams[1]>>>(); + HIP_CHECK(hipStreamEndCapture(streams[1], &graphs[1])); + HIP_CHECK(hipStreamBeginCapture(streams[2], hipStreamCaptureModeGlobal)); + dummyKernel<<<1, 1, 0, streams[2]>>>(); + HIP_CHECK(hipStreamEndCapture(streams[2], &graphs[2])); + HIP_CHECK(hipGraphGetNodes(graphs[0], nullptr, &numNodes1)); + HIP_CHECK(hipGraphGetNodes(graphs[1], nullptr, &numNodes2)); + HIP_CHECK(hipGraphGetNodes(graphs[2], nullptr, &numNodes3)); REQUIRE(numNodes1 == 1); REQUIRE(numNodes2 == 1); REQUIRE(numNodes3 == 1); - HIP_CHECK(hipEventDestroy(event2)); - HIP_CHECK(hipEventDestroy(event1)); } SECTION("Capture Multiple stream with single event") { - hipEvent_t event1; - HIP_CHECK(hipEventCreate(&event1)); - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipEventRecord(event1, stream1)); - HIP_CHECK(hipStreamWaitEvent(stream2, event1, 0)); - HIP_CHECK(hipStreamWaitEvent(stream3, event1, 0)); - dummyKernel<<<1, 1, 0, stream1>>>(); - HIP_CHECK(hipStreamEndCapture(stream1, &graph1)); - HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal)); - dummyKernel<<<1, 1, 0, stream2>>>(); - HIP_CHECK(hipStreamEndCapture(stream2, &graph2)); - HIP_CHECK(hipStreamBeginCapture(stream3, hipStreamCaptureModeGlobal)); - dummyKernel<<<1, 1, 0, stream3>>>(); - HIP_CHECK(hipStreamEndCapture(stream3, &graph3)); - HIP_CHECK(hipGraphGetNodes(graph1, nullptr, &numNodes1)); - HIP_CHECK(hipGraphGetNodes(graph2, nullptr, &numNodes2)); - HIP_CHECK(hipGraphGetNodes(graph3, nullptr, &numNodes3)); + EventsGuard events(1); + hipEvent_t event = events[0]; + + HIP_CHECK(hipEventCreate(&event)); + HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(event, streams[0])); + HIP_CHECK(hipStreamWaitEvent(streams[1], event, 0)); + HIP_CHECK(hipStreamWaitEvent(streams[2], event, 0)); + dummyKernel<<<1, 1, 0, streams[0]>>>(); + HIP_CHECK(hipStreamEndCapture(streams[0], &graphs[0])); + HIP_CHECK(hipStreamBeginCapture(streams[1], hipStreamCaptureModeGlobal)); + dummyKernel<<<1, 1, 0, streams[1]>>>(); + HIP_CHECK(hipStreamEndCapture(streams[1], &graphs[1])); + HIP_CHECK(hipStreamBeginCapture(streams[2], hipStreamCaptureModeGlobal)); + dummyKernel<<<1, 1, 0, streams[2]>>>(); + HIP_CHECK(hipStreamEndCapture(streams[2], &graphs[2])); + HIP_CHECK(hipGraphGetNodes(graphs[0], nullptr, &numNodes1)); + HIP_CHECK(hipGraphGetNodes(graphs[1], nullptr, &numNodes2)); + HIP_CHECK(hipGraphGetNodes(graphs[2], nullptr, &numNodes3)); REQUIRE(numNodes1 == 1); REQUIRE(numNodes2 == 1); REQUIRE(numNodes3 == 1); - HIP_CHECK(hipEventDestroy(event1)); } - HIP_CHECK(hipStreamDestroy(stream3)); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); + + for (int i = 0; i < 3; i++) { + HIP_CHECK(hipGraphDestroy(graphs[i])); + } } -/* Test scenario 8 + +/** + * Test Description + * ------------------------ + * - Test to verify queue operations (increment kernels) in 3 streams. Start + * capturing the streams after some operations have been queued. This scenario + * validates that only operations queued after hipStreamBeginCapture are + * captured in the graph + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_ColligatedStrmCapture_func") { - hipStream_t stream1, stream2; - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); - colligatedStrmCaptureFunc(stream1, stream2); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); -} -/* Test scenario 9.1 - */ -TEST_CASE("Unit_hipStreamBeginCapture_Multithreaded_Global") { - multithreadedTest(hipStreamCaptureModeGlobal); -} -/* Test scenario 9.2 - */ -TEST_CASE("Unit_hipStreamBeginCapture_Multithreaded_ThreadLocal") { - multithreadedTest(hipStreamCaptureModeThreadLocal); -} -/* Test scenario 9.3 - */ -TEST_CASE("Unit_hipStreamBeginCapture_Multithreaded_Relaxed") { - multithreadedTest(hipStreamCaptureModeRelaxed); -} -/* Test scenario 10 - */ -TEST_CASE("Unit_hipStreamBeginCapture_CapturingFromWithinStrms") { - hipGraph_t graph; - hipStream_t stream1, stream2, stream3; - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); - HIP_CHECK(hipStreamCreate(&stream3)); - hipEvent_t e1, e2, e3; - HIP_CHECK(hipEventCreate(&e1)); - HIP_CHECK(hipEventCreate(&e2)); - HIP_CHECK(hipEventCreate(&e3)); +TEST_CASE("Unit_hipStreamBeginCapture_Positive_CapturingFromWithinStrms") { + constexpr int INCREMENT_KERNEL_FINALEXP_VAL = 7; + + hipGraph_t graph{nullptr}; + hipGraphExec_t graphExec{nullptr}; + StreamsGuard streams(3); + EventsGuard events(3); + // Create a device memory of size int and initialize it to 0 - int *devMem{nullptr}, *hostMem{nullptr}; - hostMem = reinterpret_cast(malloc(sizeof(int))); - HIP_CHECK(hipMalloc(&devMem, sizeof(int))); + LinearAllocGuard hostMem_g(LinearAllocs::malloc, sizeof(int)); + LinearAllocGuard devMem_g(LinearAllocs::hipMalloc, sizeof(int)); + int* hostMem = hostMem_g.host_ptr(); + int* devMem = devMem_g.ptr(); HIP_CHECK(hipMemset(devMem, 0, sizeof(int))); HIP_CHECK(hipDeviceSynchronize()); - // Start Capturing stream1 - incrementKernel<<<1, 1, 0, stream1>>>(devMem); - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipEventRecord(e1, stream1)); - incrementKernel<<<1, 1, 0, stream2>>>(devMem); - incrementKernel<<<1, 1, 0, stream2>>>(devMem); - incrementKernel<<<1, 1, 0, stream3>>>(devMem); - HIP_CHECK(hipStreamWaitEvent(stream2, e1, 0)); - HIP_CHECK(hipStreamWaitEvent(stream3, e1, 0)); - incrementKernel<<<1, 1, 0, stream1>>>(devMem); - incrementKernel<<<1, 1, 0, stream2>>>(devMem); - incrementKernel<<<1, 1, 0, stream3>>>(devMem); - incrementKernel<<<1, 1, 0, stream1>>>(devMem); - incrementKernel<<<1, 1, 0, stream2>>>(devMem); - incrementKernel<<<1, 1, 0, stream3>>>(devMem); - incrementKernel<<<1, 1, 0, stream3>>>(devMem); - HIP_CHECK(hipEventRecord(e2, stream2)); - HIP_CHECK(hipEventRecord(e3, stream3)); - HIP_CHECK(hipStreamWaitEvent(stream1, e2, 0)); - HIP_CHECK(hipStreamWaitEvent(stream1, e3, 0)); - HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), - hipMemcpyDefault, stream1)); - HIP_CHECK(hipStreamEndCapture(stream1, &graph)); // End Capture + // Start Capturing + incrementKernel<<<1, 1, 0, streams[0]>>>(devMem); + HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(events[0], streams[0])); + incrementKernel<<<1, 1, 0, streams[1]>>>(devMem); + incrementKernel<<<1, 1, 0, streams[1]>>>(devMem); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem); + HIP_CHECK(hipStreamWaitEvent(streams[1], events[0], 0)); + HIP_CHECK(hipStreamWaitEvent(streams[2], events[0], 0)); + incrementKernel<<<1, 1, 0, streams[0]>>>(devMem); + incrementKernel<<<1, 1, 0, streams[1]>>>(devMem); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem); + incrementKernel<<<1, 1, 0, streams[0]>>>(devMem); + incrementKernel<<<1, 1, 0, streams[1]>>>(devMem); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem); + HIP_CHECK(hipEventRecord(events[1], streams[1])); + HIP_CHECK(hipEventRecord(events[2], streams[2])); + HIP_CHECK(hipStreamWaitEvent(streams[0], events[1], 0)); + HIP_CHECK(hipStreamWaitEvent(streams[0], events[2], 0)); + HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), hipMemcpyDefault, streams[0])); + HIP_CHECK(hipStreamEndCapture(streams[0], &graph)); // End Capture // Reset device memory HIP_CHECK(hipMemset(devMem, 0, sizeof(int))); HIP_CHECK(hipDeviceSynchronize()); + // Create Executable Graphs - hipGraphExec_t graphExec{nullptr}; HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphLaunch(graphExec, stream1)); - HIP_CHECK(hipStreamSynchronize(stream1)); - HIP_CHECK(hipGraphExecDestroy(graphExec)); + REQUIRE(graphExec != nullptr); + + HIP_CHECK(hipGraphLaunch(graphExec, streams[0])); + HIP_CHECK(hipStreamSynchronize(streams[0])); REQUIRE((*hostMem) == INCREMENT_KERNEL_FINALEXP_VAL); - HIP_CHECK(hipFree(devMem)); - free(hostMem); + + HIP_CHECK(hipGraphExecDestroy(graphExec)) HIP_CHECK(hipGraphDestroy(graph)); - HIP_CHECK(hipEventDestroy(e3)); - HIP_CHECK(hipEventDestroy(e2)); - HIP_CHECK(hipEventDestroy(e1)); - HIP_CHECK(hipStreamDestroy(stream3)); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); } -/* Test scenario 11 + +/** + * Test Description + * ------------------------ + * - Detecting invalid capture. Create 2 streams s1 and s2. Start capturing + * s1. Create event dependency between s1 and s2 using event record and event + * wait. Try capturing s2. hipStreamBeginCapture must return error + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_DetectingInvalidCapture") { - hipStream_t stream1, stream2; - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); - hipEvent_t event; - HIP_CHECK(hipEventCreate(&event)); - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipEventRecord(event, stream1)); - HIP_CHECK(hipStreamWaitEvent(stream2, event, 0)); - dummyKernel<<<1, 1, 0, stream1>>>(); - // Since stream2 is already in capture mode due to event wait - // hipStreamBeginCapture on stream2 is expected to return error. - REQUIRE(hipSuccess != hipStreamBeginCapture(stream2, - hipStreamCaptureModeGlobal)); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); +TEST_CASE("Unit_hipStreamBeginCapture_Negative_DetectingInvalidCapture") { + StreamsGuard streams(2); + EventsGuard events(1); + hipEvent_t event = events[0]; + + HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(event, streams[0])); + HIP_CHECK(hipStreamWaitEvent(streams[1], event, 0)); + dummyKernel<<<1, 1, 0, streams[0]>>>(); + // Since stream[1] is already in capture mode due to event wait + // hipStreamBeginCapture on stream[1] is expected to return error. + HIP_CHECK_ERROR(hipStreamBeginCapture(streams[1], hipStreamCaptureModeGlobal), + hipErrorIllegalState); } -/* Test scenario 12 + +/** + * Test Description + * ------------------------ + * - Test to verify wtream reuse. Capture multiple graphs from the same + * stream. Validate graphs are captured correctly + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_CapturingMultGraphsFrom1Strm") { - hipStream_t stream1; - HIP_CHECK(hipStreamCreate(&stream1)); - hipGraph_t graph[3]; +TEST_CASE("Unit_hipStreamBeginCapture_Positive_CapturingMultGraphsFrom1Strm") { + hipGraph_t graphs[3]; + + StreamGuard stream_guard(Streams::created); + hipStream_t stream1 = stream_guard.stream(); + // Create a device memory of size int and initialize it to 0 - int *devMem{nullptr}, *hostMem{nullptr}; - hostMem = reinterpret_cast(malloc(sizeof(int))); - HIP_CHECK(hipMalloc(&devMem, sizeof(int))); + LinearAllocGuard hostMem_g(LinearAllocs::malloc, sizeof(int)); + LinearAllocGuard devMem_g(LinearAllocs::hipMalloc, sizeof(int)); + int* hostMem = hostMem_g.host_ptr(); + int* devMem = devMem_g.ptr(); HIP_CHECK(hipMemset(devMem, 0, sizeof(int))); HIP_CHECK(hipDeviceSynchronize()); - // Capture Graph1 - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - incrementKernel<<<1, 1, 0, stream1>>>(devMem); - HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), - hipMemcpyDefault, stream1)); - HIP_CHECK(hipStreamEndCapture(stream1, &graph[0])); - // Capture Graph2 - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - incrementKernel<<<1, 1, 0, stream1>>>(devMem); - incrementKernel<<<1, 1, 0, stream1>>>(devMem); - HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), - hipMemcpyDefault, stream1)); - HIP_CHECK(hipStreamEndCapture(stream1, &graph[1])); - // Capture Graph3 - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - incrementKernel<<<1, 1, 0, stream1>>>(devMem); - incrementKernel<<<1, 1, 0, stream1>>>(devMem); - incrementKernel<<<1, 1, 0, stream1>>>(devMem); - HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), - hipMemcpyDefault, stream1)); - HIP_CHECK(hipStreamEndCapture(stream1, &graph[2])); + + for (int i = 0; i < 3; i++) { + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + for (int j = 0; j <= i; j++) incrementKernel<<<1, 1, 0, stream1>>>(devMem); + HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), hipMemcpyDefault, stream1)); + HIP_CHECK(hipStreamEndCapture(stream1, &graphs[i])); + } // Instantiate and execute all graphs for (int i = 0; i < 3; i++) { hipGraphExec_t graphExec{nullptr}; HIP_CHECK(hipMemset(devMem, 0, sizeof(int))); - HIP_CHECK(hipGraphInstantiate(&graphExec, graph[i], nullptr, - nullptr, 0)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graphs[i], nullptr, nullptr, 0)); HIP_CHECK(hipGraphLaunch(graphExec, stream1)); HIP_CHECK(hipStreamSynchronize(stream1)); - HIP_CHECK(hipGraphExecDestroy(graphExec)); REQUIRE((*hostMem) == (i + 1)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graphs[i])); } - HIP_CHECK(hipFree(devMem)); - free(hostMem); - for (int i = 0; i < 3; i++) { - HIP_CHECK(hipGraphDestroy(graph[i])); - } - HIP_CHECK(hipStreamDestroy(stream1)); } + #if HT_NVIDIA -/* Test scenario 13 +/** + * Test Description + * ------------------------ + * - Test to verify synchronization during stream capture returns an error: + * -# Synchronize stream during capture + * -# Synchronize device during capture + * -# Synchronize event during capture + * -# Query stream during capture + * -# Query for an event during capture + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_CheckingSyncDuringCapture") { - hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); +TEST_CASE("Unit_hipStreamBeginCapture_Negative_CheckingSyncDuringCapture") { + StreamGuard stream_guard(Streams::created); + hipStream_t stream = stream_guard.stream(); + + EventsGuard events_guard(1); + hipEvent_t e = events_guard[0]; + + const hipStreamCaptureMode captureMode = GENERATE( + hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed); + + HIP_CHECK(hipStreamBeginCapture(stream, captureMode)); SECTION("Synchronize stream during capture") { - HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); - REQUIRE(hipErrorStreamCaptureUnsupported == - hipStreamSynchronize(stream)); + HIP_CHECK_ERROR(hipStreamSynchronize(stream), hipErrorStreamCaptureUnsupported); } SECTION("Synchronize device during capture") { - HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); - REQUIRE(hipErrorStreamCaptureUnsupported == hipDeviceSynchronize()); + HIP_CHECK_ERROR(hipDeviceSynchronize(), hipErrorStreamCaptureUnsupported); } SECTION("Synchronize event during capture") { - hipEvent_t e; - HIP_CHECK(hipEventCreate(&e)); HIP_CHECK(hipEventRecord(e, stream)); - HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); - REQUIRE(hipErrorStreamCaptureUnsupported == hipEventSynchronize(e)); - HIP_CHECK(hipEventDestroy(e)); - } - SECTION("Wait for an event during capture") { - hipEvent_t e; - HIP_CHECK(hipEventCreate(&e)); - HIP_CHECK(hipEventRecord(e, stream)); - HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); - REQUIRE(hipErrorStreamCaptureIsolation == - hipStreamWaitEvent(stream, e, 0)); - HIP_CHECK(hipEventDestroy(e)); + HIP_CHECK_ERROR(hipEventSynchronize(e), hipErrorCapturedEvent); } SECTION("Query stream during capture") { - HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); - REQUIRE(hipErrorStreamCaptureUnsupported == hipStreamQuery(stream)); + HIP_CHECK_ERROR(hipStreamQuery(stream), hipErrorStreamCaptureUnsupported); } SECTION("Query for an event during capture") { - hipEvent_t e; - HIP_CHECK(hipEventCreate(&e)); HIP_CHECK(hipEventRecord(e, stream)); - HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); - REQUIRE(hipSuccess != hipEventQuery(e)); - HIP_CHECK(hipEventDestroy(e)); + HIP_CHECK_ERROR(hipEventQuery(e), hipErrorCapturedEvent); + } +} + +/** + * Test Description + * ------------------------ + * - Test to verify unsafe API calls during stream capture with initiated + * with hipStreamCaptureModeGlobal and hipStreamCaptureModeThreadLocal return an + * error: + * -# hipMalloc during capture + * -# hipMemcpy during capture + * -# hipMemset during capture + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_hipStreamBeginCapture_Negative_UnsafeCallsDuringCapture") { + StreamGuard stream_guard(Streams::created); + hipStream_t stream = stream_guard.stream(); + + LinearAllocGuard hostMem(LinearAllocs::malloc, sizeof(int)); + LinearAllocGuard devMem(LinearAllocs::hipMalloc, sizeof(int)); + + int* devMem2; + + const hipStreamCaptureMode captureMode = + GENERATE(hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal); + + HIP_CHECK(hipStreamBeginCapture(stream, captureMode)); + SECTION("hipMalloc during capture") { + HIP_CHECK_ERROR(hipMalloc(&devMem2, sizeof(int)), hipErrorStreamCaptureUnsupported); + } + SECTION("hipMemcpy during capture") { + HIP_CHECK_ERROR(hipMemcpy(devMem.ptr(), hostMem.host_ptr(), sizeof(int), hipMemcpyHostToDevice), + hipErrorStreamCaptureImplicit); + } + SECTION("hipMemset during capture") { + HIP_CHECK_ERROR(hipMemset(devMem.ptr(), 0, sizeof(int)), hipErrorStreamCaptureImplicit); } - HIP_CHECK(hipStreamDestroy(stream)); } #endif -/* Test scenario 14 + +/** + * Test Description + * ------------------------ + * - Test to verify end stream capture when the stream capture is still in + * progress: + * -# Abruptly end stream capture when stream capture is in progress in + * forked stream. hipStreamEndCapture must return an error + * -# Abruptly end stream capture when operations in forked stream are + * still waiting to be captured. hipStreamEndCapture must return an error + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_EndingCapturewhenCaptureInProgress") { - hipStream_t stream1, stream2; - hipGraph_t graph; - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); +TEST_CASE("Unit_hipStreamBeginCapture_Negative_EndingCapturewhenCaptureInProgress") { + hipGraph_t graph{nullptr}; + + StreamsGuard streams_guard(2); + hipStream_t stream1 = streams_guard[0]; + hipStream_t stream2 = streams_guard[1]; + SECTION("Abruptly end strm capture when in progress in forked strm") { - hipEvent_t e; + EventsGuard events_guard(1); + hipEvent_t e = events_guard[0]; HIP_CHECK(hipEventCreate(&e)); HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); dummyKernel<<<1, 1, 0, stream1>>>(); HIP_CHECK(hipEventRecord(e, stream1)); HIP_CHECK(hipStreamWaitEvent(stream2, e, 0)); dummyKernel<<<1, 1, 0, stream2>>>(); - REQUIRE(hipErrorStreamCaptureUnjoined == - hipStreamEndCapture(stream1, &graph)); - HIP_CHECK(hipEventDestroy(e)); + HIP_CHECK_ERROR(hipStreamEndCapture(stream1, &graph), hipErrorStreamCaptureUnjoined); } SECTION("End strm capture when forked strm still has operations") { - hipEvent_t e1, e2; - HIP_CHECK(hipEventCreate(&e1)); - HIP_CHECK(hipEventCreate(&e2)); + EventsGuard events_guard(2); + hipEvent_t e1 = events_guard[0]; + hipEvent_t e2 = events_guard[1]; HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); dummyKernel<<<1, 1, 0, stream1>>>(); HIP_CHECK(hipEventRecord(e1, stream1)); @@ -931,18 +919,22 @@ TEST_CASE("Unit_hipStreamBeginCapture_EndingCapturewhenCaptureInProgress") { HIP_CHECK(hipEventRecord(e2, stream2)); HIP_CHECK(hipStreamWaitEvent(stream1, e2, 0)); dummyKernel<<<1, 1, 0, stream2>>>(); - REQUIRE(hipErrorStreamCaptureUnjoined == - hipStreamEndCapture(stream1, &graph)); - HIP_CHECK(hipEventDestroy(e2)); - HIP_CHECK(hipEventDestroy(e1)); + HIP_CHECK_ERROR(hipStreamEndCapture(stream1, &graph), hipErrorStreamCaptureUnjoined); } - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); } - -/* Test scenario 15 +/** + * Test Description + * ------------------------ + * - Testing independent stream capture using multiple GPUs. Capture a stream + * in each device context and execute the captured graph in the context GPU + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_MultiGPU") { +TEST_CASE("Unit_hipStreamBeginCapture_Positive_MultiGPU") { int devcount = 0; HIP_CHECK(hipGetDeviceCount(&devcount)); // If only single GPU is detected then return @@ -950,40 +942,36 @@ TEST_CASE("Unit_hipStreamBeginCapture_MultiGPU") { SUCCEED("skipping the testcases as numDevices < 2"); return; } - hipStream_t* stream = reinterpret_cast(malloc( - devcount*sizeof(hipStream_t))); + hipStream_t* stream = reinterpret_cast(malloc(devcount * sizeof(hipStream_t))); REQUIRE(stream != nullptr); - hipGraph_t* graph = reinterpret_cast(malloc( - devcount*sizeof(hipGraph_t))); + hipGraph_t* graph = reinterpret_cast(malloc(devcount * sizeof(hipGraph_t))); REQUIRE(graph != nullptr); int **devMem{nullptr}, **hostMem{nullptr}; - hostMem = reinterpret_cast(malloc(sizeof(int*)*devcount)); + hostMem = reinterpret_cast(malloc(sizeof(int*) * devcount)); REQUIRE(hostMem != nullptr); - devMem = reinterpret_cast(malloc(sizeof(int*)*devcount)); + devMem = reinterpret_cast(malloc(sizeof(int*) * devcount)); REQUIRE(devMem != nullptr); - hipGraphExec_t* graphExec = reinterpret_cast(malloc( - devcount*sizeof(hipGraphExec_t))); + hipGraphExec_t* graphExec = + reinterpret_cast(malloc(devcount * sizeof(hipGraphExec_t))); // Capture stream in each device for (int dev = 0; dev < devcount; dev++) { HIP_CHECK(hipSetDevice(dev)); HIP_CHECK(hipStreamCreate(&stream[dev])); hostMem[dev] = reinterpret_cast(malloc(sizeof(int))); HIP_CHECK(hipMalloc(&devMem[dev], sizeof(int))); - HIP_CHECK(hipStreamBeginCapture(stream[dev], - hipStreamCaptureModeGlobal)); + HIP_CHECK(hipStreamBeginCapture(stream[dev], hipStreamCaptureModeGlobal)); HIP_CHECK(hipMemsetAsync(devMem[dev], 0, sizeof(int), stream[dev])); for (int i = 0; i < (dev + 1); i++) { incrementKernel<<<1, 1, 0, stream[dev]>>>(devMem[dev]); } - HIP_CHECK(hipMemcpyAsync(hostMem[dev], devMem[dev], sizeof(int), - hipMemcpyDefault, stream[dev])); + HIP_CHECK( + hipMemcpyAsync(hostMem[dev], devMem[dev], sizeof(int), hipMemcpyDefault, stream[dev])); HIP_CHECK(hipStreamEndCapture(stream[dev], &graph[dev])); } // Launch the captured graphs in the respective device for (int dev = 0; dev < devcount; dev++) { HIP_CHECK(hipSetDevice(dev)); - HIP_CHECK(hipGraphInstantiate(&graphExec[dev], graph[dev], nullptr, - nullptr, 0)); + HIP_CHECK(hipGraphInstantiate(&graphExec[dev], graph[dev], nullptr, nullptr, 0)); HIP_CHECK(hipGraphLaunch(graphExec[dev], stream[dev])); } // Validate output @@ -1004,291 +992,278 @@ TEST_CASE("Unit_hipStreamBeginCapture_MultiGPU") { free(stream); free(graph); } -/* Test scenario 16 + +/** + * Test Description + * ------------------------ + * - Test Nested Stream Capture Functionality: Create 3 streams. Capture s1, + * record event e1 on s1, wait for event e1 on s2 and queue operations in s1. + * Record event e2 on s2 and wait for it on s3. Queue operations on both s2 and + * s3. Record event e4 on s3 and wait for it in s1. Record event e3 on s2 and + * wait for it in s1. End stream capture on s1. Execute the graph and verify the + * result. + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_nestedStreamCapture") { - hipGraph_t graph; - hipStream_t stream1, stream2, stream3; - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); - HIP_CHECK(hipStreamCreate(&stream3)); - hipEvent_t e1, e2, e3, e4; - HIP_CHECK(hipEventCreate(&e1)); - HIP_CHECK(hipEventCreate(&e2)); - HIP_CHECK(hipEventCreate(&e3)); - HIP_CHECK(hipEventCreate(&e4)); +TEST_CASE("Unit_hipStreamBeginCapture_Positive_nestedStreamCapture") { + constexpr int INCREMENT_KERNEL_FINALEXP_VAL = 7; + + hipGraph_t graph{nullptr}; + StreamsGuard streams(3); + EventsGuard events(4); + // Create a device memory of size int and initialize it to 0 - int *devMem{nullptr}, *hostMem{nullptr}; - hostMem = reinterpret_cast(malloc(sizeof(int))); - REQUIRE(hostMem != nullptr); - HIP_CHECK(hipMalloc(&devMem, sizeof(int))); - HIP_CHECK(hipMemset(devMem, 0, sizeof(int))); + LinearAllocGuard hostMem_g(LinearAllocs::malloc, sizeof(int)); + LinearAllocGuard devMem_g(LinearAllocs::hipMalloc, sizeof(int)); + HIP_CHECK(hipMemset(devMem_g.ptr(), 0, sizeof(int))); HIP_CHECK(hipDeviceSynchronize()); // Start Capturing stream1 - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipEventRecord(e1, stream1)); - HIP_CHECK(hipStreamWaitEvent(stream2, e1, 0)); - HIP_CHECK(hipEventRecord(e2, stream2)); - HIP_CHECK(hipStreamWaitEvent(stream3, e2, 0)); - incrementKernel<<<1, 1, 0, stream1>>>(devMem); - incrementKernel<<<1, 1, 0, stream2>>>(devMem); - incrementKernel<<<1, 1, 0, stream3>>>(devMem); - incrementKernel<<<1, 1, 0, stream1>>>(devMem); - incrementKernel<<<1, 1, 0, stream2>>>(devMem); - incrementKernel<<<1, 1, 0, stream3>>>(devMem); - incrementKernel<<<1, 1, 0, stream3>>>(devMem); - HIP_CHECK(hipEventRecord(e3, stream2)); - HIP_CHECK(hipEventRecord(e4, stream3)); - HIP_CHECK(hipStreamWaitEvent(stream1, e4, 0)); - HIP_CHECK(hipStreamWaitEvent(stream1, e3, 0)); - HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), - hipMemcpyDefault, stream1)); - HIP_CHECK(hipStreamEndCapture(stream1, &graph)); // End Capture + HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(events[0], streams[0])); + HIP_CHECK(hipStreamWaitEvent(streams[1], events[0], 0)); + HIP_CHECK(hipEventRecord(events[1], streams[1])); + HIP_CHECK(hipStreamWaitEvent(streams[2], events[1], 0)); + incrementKernel<<<1, 1, 0, streams[0]>>>(devMem_g.ptr()); + incrementKernel<<<1, 1, 0, streams[1]>>>(devMem_g.ptr()); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem_g.ptr()); + incrementKernel<<<1, 1, 0, streams[0]>>>(devMem_g.ptr()); + incrementKernel<<<1, 1, 0, streams[1]>>>(devMem_g.ptr()); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem_g.ptr()); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem_g.ptr()); + HIP_CHECK(hipEventRecord(events[2], streams[1])); + HIP_CHECK(hipEventRecord(events[3], streams[2])); + HIP_CHECK(hipStreamWaitEvent(streams[0], events[3], 0)); + HIP_CHECK(hipStreamWaitEvent(streams[0], events[2], 0)); + HIP_CHECK(hipMemcpyAsync(hostMem_g.host_ptr(), devMem_g.ptr(), sizeof(int), hipMemcpyDefault, + streams[0])); + HIP_CHECK(hipStreamEndCapture(streams[0], &graph)); // End Capture // Reset device memory - HIP_CHECK(hipMemset(devMem, 0, sizeof(int))); + HIP_CHECK(hipMemset(devMem_g.ptr(), 0, sizeof(int))); HIP_CHECK(hipDeviceSynchronize()); // Create Executable Graphs hipGraphExec_t graphExec{nullptr}; HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphLaunch(graphExec, stream1)); - HIP_CHECK(hipStreamSynchronize(stream1)); + HIP_CHECK(hipGraphLaunch(graphExec, streams[0])); + HIP_CHECK(hipStreamSynchronize(streams[0])); + REQUIRE((*hostMem_g.host_ptr()) == INCREMENT_KERNEL_FINALEXP_VAL); + HIP_CHECK(hipGraphExecDestroy(graphExec)); - REQUIRE((*hostMem) == INCREMENT_KERNEL_FINALEXP_VAL); - HIP_CHECK(hipFree(devMem)); - free(hostMem); HIP_CHECK(hipGraphDestroy(graph)); - HIP_CHECK(hipEventDestroy(e4)); - HIP_CHECK(hipEventDestroy(e3)); - HIP_CHECK(hipEventDestroy(e2)); - HIP_CHECK(hipEventDestroy(e1)); - HIP_CHECK(hipStreamDestroy(stream3)); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); -} -/* Test scenario 17 - */ -TEST_CASE("Unit_hipStreamBeginCapture_streamReuse") { - hipGraph_t graph1, graph2, graph3; - hipStream_t stream1, stream2, stream3; - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); - HIP_CHECK(hipStreamCreate(&stream3)); - hipEvent_t e1, e2, e3, e4; - HIP_CHECK(hipEventCreate(&e1)); - HIP_CHECK(hipEventCreate(&e2)); - HIP_CHECK(hipEventCreate(&e3)); - HIP_CHECK(hipEventCreate(&e4)); - // Create a device memory of size int and initialize it to 0 - int *devMem1{nullptr}, *hostMem1{nullptr}, *devMem2{nullptr}, - *hostMem2{nullptr}, *devMem3{nullptr}, *hostMem3{nullptr}; - HipTest::initArrays(&devMem1, &devMem2, &devMem3, - &hostMem1, &hostMem2, &hostMem3, 1, false); - HIP_CHECK(hipMemset(devMem1, 0, sizeof(int))); - HIP_CHECK(hipMemset(devMem2, 0, sizeof(int))); - HIP_CHECK(hipMemset(devMem3, 0, sizeof(int))); - HIP_CHECK(hipDeviceSynchronize()); - // Start Capturing stream1 - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipEventRecord(e1, stream1)); - HIP_CHECK(hipStreamWaitEvent(stream2, e1, 0)); - HIP_CHECK(hipEventRecord(e2, stream2)); - HIP_CHECK(hipStreamWaitEvent(stream3, e2, 0)); - incrementKernel<<<1, 1, 0, stream1>>>(devMem1); - incrementKernel<<<1, 1, 0, stream2>>>(devMem1); - incrementKernel<<<1, 1, 0, stream3>>>(devMem1); - incrementKernel<<<1, 1, 0, stream1>>>(devMem1); - incrementKernel<<<1, 1, 0, stream2>>>(devMem1); - incrementKernel<<<1, 1, 0, stream3>>>(devMem1); - incrementKernel<<<1, 1, 0, stream3>>>(devMem1); - HIP_CHECK(hipEventRecord(e3, stream2)); - HIP_CHECK(hipEventRecord(e4, stream3)); - HIP_CHECK(hipStreamWaitEvent(stream1, e4, 0)); - HIP_CHECK(hipStreamWaitEvent(stream1, e3, 0)); - HIP_CHECK(hipMemcpyAsync(hostMem1, devMem1, sizeof(int), - hipMemcpyDefault, stream1)); - HIP_CHECK(hipStreamEndCapture(stream1, &graph1)); // End Capture - // Start capturing graph2 from stream 2 - HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal)); - incrementKernel<<<1, 1, 0, stream2>>>(devMem2); - incrementKernel<<<1, 1, 0, stream2>>>(devMem2); - incrementKernel<<<1, 1, 0, stream2>>>(devMem2); - HIP_CHECK(hipMemcpyAsync(hostMem2, devMem2, sizeof(int), - hipMemcpyDefault, stream2)); - HIP_CHECK(hipStreamEndCapture(stream2, &graph2)); // End Capture - // Start capturing graph3 from stream 3 - HIP_CHECK(hipStreamBeginCapture(stream3, hipStreamCaptureModeGlobal)); - incrementKernel<<<1, 1, 0, stream3>>>(devMem3); - incrementKernel<<<1, 1, 0, stream3>>>(devMem3); - incrementKernel<<<1, 1, 0, stream3>>>(devMem3); - incrementKernel<<<1, 1, 0, stream3>>>(devMem3); - incrementKernel<<<1, 1, 0, stream3>>>(devMem3); - HIP_CHECK(hipMemcpyAsync(hostMem3, devMem3, sizeof(int), - hipMemcpyDefault, stream3)); - HIP_CHECK(hipStreamEndCapture(stream3, &graph3)); // End Capture - // Reset device memory - HIP_CHECK(hipMemset(devMem1, 0, sizeof(int))); - HIP_CHECK(hipMemset(devMem2, 0, sizeof(int))); - HIP_CHECK(hipMemset(devMem3, 0, sizeof(int))); - HIP_CHECK(hipDeviceSynchronize()); - // Create Executable Graphs - hipGraphExec_t graphExec{nullptr}; - // Verify graph1 - HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphLaunch(graphExec, stream1)); - HIP_CHECK(hipStreamSynchronize(stream1)); - HIP_CHECK(hipGraphExecDestroy(graphExec)); - REQUIRE((*hostMem1) == INCREMENT_KERNEL_FINALEXP_VAL); - // Verify graph2 - HIP_CHECK(hipGraphInstantiate(&graphExec, graph2, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphLaunch(graphExec, stream2)); - HIP_CHECK(hipStreamSynchronize(stream2)); - HIP_CHECK(hipGraphExecDestroy(graphExec)); - REQUIRE((*hostMem2) == 3); - // Verify graph3 - HIP_CHECK(hipGraphInstantiate(&graphExec, graph3, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphLaunch(graphExec, stream3)); - HIP_CHECK(hipStreamSynchronize(stream3)); - HIP_CHECK(hipGraphExecDestroy(graphExec)); - REQUIRE((*hostMem3) == 5); - HipTest::freeArrays(devMem1, devMem2, devMem3, - hostMem1, hostMem2, hostMem3, false); - HIP_CHECK(hipGraphDestroy(graph1)); - HIP_CHECK(hipGraphDestroy(graph2)); - HIP_CHECK(hipGraphDestroy(graph3)); - HIP_CHECK(hipEventDestroy(e4)); - HIP_CHECK(hipEventDestroy(e3)); - HIP_CHECK(hipEventDestroy(e2)); - HIP_CHECK(hipEventDestroy(e1)); - HIP_CHECK(hipStreamDestroy(stream3)); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); } -/* Test scenario 18 +/** + * Test Description + * ------------------------ + * - Test Nested Stream Capture Functionality: Create 3 streams. Capture s1, + * record event e1 on s1, wait for event e1 on s2 and queue operations in s1. + * Record event e2 on s2 and wait for it on s3. Queue operations on both s2 and + * s3. Record event e4 on s3 and wait for it in s1. Record event e3 on s2 and + * wait for it in s1. End stream capture on s1. Queue operations on both s2 and + * s3, and capture their graphs. Execute the graphs and verify the result. + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_captureComplexGraph") { - hipGraph_t graph; - hipStream_t stream1, stream2, stream3, stream4, stream5; - // Stream and event create - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); - HIP_CHECK(hipStreamCreate(&stream3)); - HIP_CHECK(hipStreamCreate(&stream4)); - HIP_CHECK(hipStreamCreate(&stream5)); - hipEvent_t e0, e1, e2, e3, e4, e5, e6; - HIP_CHECK(hipEventCreate(&e0)); - HIP_CHECK(hipEventCreate(&e1)); - HIP_CHECK(hipEventCreate(&e2)); - HIP_CHECK(hipEventCreate(&e3)); - HIP_CHECK(hipEventCreate(&e4)); - HIP_CHECK(hipEventCreate(&e5)); - HIP_CHECK(hipEventCreate(&e6)); +TEST_CASE("Unit_hipStreamBeginCapture_Positive_streamReuse") { + constexpr int increment_kernel_vals[3] = {7, 3, 5}; + + hipGraph_t graphs[3]; + StreamsGuard streams(3); + EventsGuard events(4); + LinearAllocGuard hostMem_g1 = LinearAllocGuard(LinearAllocs::malloc, sizeof(int)); + LinearAllocGuard hostMem_g2 = LinearAllocGuard(LinearAllocs::malloc, sizeof(int)); + LinearAllocGuard hostMem_g3 = LinearAllocGuard(LinearAllocs::malloc, sizeof(int)); + LinearAllocGuard devMem_g1 = LinearAllocGuard(LinearAllocs::hipMalloc, sizeof(int)); + LinearAllocGuard devMem_g2 = LinearAllocGuard(LinearAllocs::hipMalloc, sizeof(int)); + LinearAllocGuard devMem_g3 = LinearAllocGuard(LinearAllocs::hipMalloc, sizeof(int)); + + std::vector hostMem = {hostMem_g1.host_ptr(), hostMem_g2.host_ptr(), hostMem_g3.host_ptr()}; + std::vector devMem = {devMem_g1.ptr(), devMem_g2.ptr(), devMem_g3.ptr()}; + // Create a device memory of size int and initialize it to 0 + for (int i = 0; i < 3; i++) { + memset(hostMem[i], 0, sizeof(int)); + HIP_CHECK(hipMemset(devMem[i], 0, sizeof(int))); + } + HIP_CHECK(hipDeviceSynchronize()); + // Start Capturing stream1 + HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(events[0], streams[0])); + HIP_CHECK(hipStreamWaitEvent(streams[1], events[0], 0)); + HIP_CHECK(hipEventRecord(events[1], streams[1])); + HIP_CHECK(hipStreamWaitEvent(streams[2], events[1], 0)); + incrementKernel<<<1, 1, 0, streams[0]>>>(devMem[0]); + incrementKernel<<<1, 1, 0, streams[1]>>>(devMem[0]); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[0]); + incrementKernel<<<1, 1, 0, streams[0]>>>(devMem[0]); + incrementKernel<<<1, 1, 0, streams[1]>>>(devMem[0]); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[0]); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[0]); + HIP_CHECK(hipEventRecord(events[2], streams[1])); + HIP_CHECK(hipEventRecord(events[3], streams[2])); + HIP_CHECK(hipStreamWaitEvent(streams[0], events[3], 0)); + HIP_CHECK(hipStreamWaitEvent(streams[0], events[2], 0)); + HIP_CHECK(hipMemcpyAsync(hostMem[0], devMem[0], sizeof(int), hipMemcpyDefault, streams[0])); + HIP_CHECK(hipStreamEndCapture(streams[0], &graphs[0])); // End Capture + // Start capturing graph2 from stream 2 + HIP_CHECK(hipStreamBeginCapture(streams[1], hipStreamCaptureModeGlobal)); + incrementKernel<<<1, 1, 0, streams[1]>>>(devMem[1]); + incrementKernel<<<1, 1, 0, streams[1]>>>(devMem[1]); + incrementKernel<<<1, 1, 0, streams[1]>>>(devMem[1]); + HIP_CHECK(hipMemcpyAsync(hostMem[1], devMem[1], sizeof(int), hipMemcpyDefault, streams[1])); + HIP_CHECK(hipStreamEndCapture(streams[1], &graphs[1])); // End Capture + // Start capturing graph3 from stream 3 + HIP_CHECK(hipStreamBeginCapture(streams[2], hipStreamCaptureModeGlobal)); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[2]); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[2]); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[2]); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[2]); + incrementKernel<<<1, 1, 0, streams[2]>>>(devMem[2]); + HIP_CHECK(hipMemcpyAsync(hostMem[2], devMem[2], sizeof(int), hipMemcpyDefault, streams[2])); + HIP_CHECK(hipStreamEndCapture(streams[2], &graphs[2])); // End Capture + // Reset device memory + HIP_CHECK(hipMemset(devMem[0], 0, sizeof(int))); + HIP_CHECK(hipMemset(devMem[1], 0, sizeof(int))); + HIP_CHECK(hipMemset(devMem[2], 0, sizeof(int))); + HIP_CHECK(hipDeviceSynchronize()); + // Create Executable Graphs and verify graphs + for (int i = 0; i < 3; i++) { + hipGraphExec_t graphExec{nullptr}; + HIP_CHECK(hipMemset(devMem[i], 0, sizeof(int))); + HIP_CHECK(hipGraphInstantiate(&graphExec, graphs[i], nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, streams[i])); + HIP_CHECK(hipStreamSynchronize(streams[i])); + REQUIRE((*hostMem[i]) == increment_kernel_vals[i]); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graphs[i])); + } +} + +/** + * Test Description + * ------------------------ + * - Capture a complex graph containing multiple independent memcpy, kernel + * and host nodes. Launch the graph on random input data and validate the output + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_hipStreamBeginCapture_Positive_captureComplexGraph") { + constexpr int GRIDSIZE = 256; + constexpr int BLOCKSIZE = 256; + constexpr int CONST_KER1_VAL = 3; + constexpr int CONST_KER2_VAL = 2; + constexpr int CONST_KER3_VAL = 5; + + hipGraph_t graph{nullptr}; + StreamsGuard streams(5); + EventsGuard events(7); // Allocate Device memory and Host memory - size_t N = GRIDSIZE*BLOCKSIZE; - int *Ah{nullptr}, *Bh{nullptr}, *Ch{nullptr}, *Ad{nullptr}, *Bd{nullptr}; - HipTest::initArrays(&Ad, &Bd, nullptr, &Ah, &Bh, &Ch, N, false); + size_t N = GRIDSIZE * BLOCKSIZE; + LinearAllocGuard Ah = LinearAllocGuard(LinearAllocs::malloc, N * sizeof(int)); + LinearAllocGuard Bh = LinearAllocGuard(LinearAllocs::malloc, N * sizeof(int)); + LinearAllocGuard Ch = LinearAllocGuard(LinearAllocs::malloc, N * sizeof(int)); + LinearAllocGuard Ad = LinearAllocGuard(LinearAllocs::hipMalloc, N * sizeof(int)); + LinearAllocGuard Bd = LinearAllocGuard(LinearAllocs::hipMalloc, N * sizeof(int)); + // Capture streams into graph - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipEventRecord(e0, stream1)); - HIP_CHECK(hipStreamWaitEvent(stream4, e0, 0)); - HIP_CHECK(hipStreamWaitEvent(stream5, e0, 0)); - HIP_CHECK(hipMemcpyAsync(Ad, Ah, (N*sizeof(int)), - hipMemcpyDefault, stream1)); - HIP_CHECK(hipMemcpyAsync(Bd, Bh, (N*sizeof(int)), - hipMemcpyDefault, stream5)); + HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(events[0], streams[0])); + HIP_CHECK(hipStreamWaitEvent(streams[3], events[0], 0)); + HIP_CHECK(hipStreamWaitEvent(streams[4], events[0], 0)); + HIP_CHECK( + hipMemcpyAsync(Ad.ptr(), Ah.host_ptr(), (N * sizeof(int)), hipMemcpyDefault, streams[0])); + HIP_CHECK( + hipMemcpyAsync(Bd.ptr(), Bh.host_ptr(), (N * sizeof(int)), hipMemcpyDefault, streams[4])); hipHostFn_t fn = hostNodeCallback; - HIPCHECK(hipLaunchHostFunc(stream4, fn, nullptr)); - HIP_CHECK(hipEventRecord(e1, stream1)); - HIP_CHECK(hipStreamWaitEvent(stream2, e1, 0)); - int *Ad_2nd_half = Ad + N/2; - int *Ad_1st_half = Ad; - mymul<<>>(Ad_2nd_half, CONST_KER2_VAL); - mymul<<>>(Ad_1st_half, CONST_KER1_VAL); - HIP_CHECK(hipEventRecord(e2, stream2)); - HIP_CHECK(hipStreamWaitEvent(stream3, e2, 0)); - mymul<<>>(Ad_1st_half, CONST_KER3_VAL); - HIPCHECK(hipLaunchHostFunc(stream3, fn, nullptr)); - HIP_CHECK(hipEventRecord(e6, stream2)); - HIP_CHECK(hipStreamWaitEvent(stream1, e6, 0)); - HIP_CHECK(hipEventRecord(e5, stream5)); - HIP_CHECK(hipStreamWaitEvent(stream1, e5, 0)); - myadd<<>>(Ad, Bd); - HIP_CHECK(hipEventRecord(e3, stream3)); - HIP_CHECK(hipStreamWaitEvent(stream1, e3, 0)); - HIP_CHECK(hipEventRecord(e4, stream4)); - HIP_CHECK(hipStreamWaitEvent(stream1, e4, 0)); - HIP_CHECK(hipMemcpyAsync(Ch, Ad, (N*sizeof(int)), - hipMemcpyDefault, stream1)); - HIP_CHECK(hipStreamEndCapture(stream1, &graph)); // End Capture + HIPCHECK(hipLaunchHostFunc(streams[3], fn, nullptr)); + HIP_CHECK(hipEventRecord(events[1], streams[0])); + HIP_CHECK(hipStreamWaitEvent(streams[1], events[1], 0)); + int* Ad_2nd_half = Ad.ptr() + N / 2; + int* Ad_1st_half = Ad.ptr(); + mymul<<>>(Ad_2nd_half, CONST_KER2_VAL); + mymul<<>>(Ad_1st_half, CONST_KER1_VAL); + HIP_CHECK(hipEventRecord(events[2], streams[1])); + HIP_CHECK(hipStreamWaitEvent(streams[2], events[2], 0)); + mymul<<>>(Ad_1st_half, CONST_KER3_VAL); + HIPCHECK(hipLaunchHostFunc(streams[2], fn, nullptr)); + HIP_CHECK(hipEventRecord(events[6], streams[1])); + HIP_CHECK(hipStreamWaitEvent(streams[0], events[6], 0)); + HIP_CHECK(hipEventRecord(events[5], streams[4])); + HIP_CHECK(hipStreamWaitEvent(streams[0], events[5], 0)); + myadd<<>>(Ad.ptr(), Bd.ptr()); + HIP_CHECK(hipEventRecord(events[3], streams[2])); + HIP_CHECK(hipStreamWaitEvent(streams[0], events[3], 0)); + HIP_CHECK(hipEventRecord(events[4], streams[3])); + HIP_CHECK(hipStreamWaitEvent(streams[0], events[4], 0)); + HIP_CHECK( + hipMemcpyAsync(Ch.host_ptr(), Ad.ptr(), (N * sizeof(int)), hipMemcpyDefault, streams[0])); + HIP_CHECK(hipStreamEndCapture(streams[0], &graph)); // End Capture // Execute and test the graph - // Create Executable Graphs hipGraphExec_t graphExec{nullptr}; - // Verify graph1 HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); - for (int iter = 0; iter < LAUNCH_ITERS; iter++) { - init_input(Ah, N); - init_input(Bh, N); - HIP_CHECK(hipGraphLaunch(graphExec, stream1)); - HIP_CHECK(hipStreamSynchronize(stream1)); + // Verify graph + for (int iter = 0; iter < kLaunchIters; iter++) { + std::fill_n(Ah.host_ptr(), N, iter); + std::fill_n(Bh.host_ptr(), N, iter); + HIP_CHECK(hipGraphLaunch(graphExec, streams[0])); + HIP_CHECK(hipStreamSynchronize(streams[0])); for (size_t i = 0; i < N; i++) { - if (i > (N/2 - 1)) { - REQUIRE(Ch[i] == (Bh[i] + Ah[i]*CONST_KER2_VAL)); + if (i > (N / 2 - 1)) { + REQUIRE(Ch.host_ptr()[i] == (Bh.host_ptr()[i] + Ah.host_ptr()[i] * CONST_KER2_VAL)); } else { - REQUIRE(Ch[i] == (Bh[i] + Ah[i]*CONST_KER1_VAL*CONST_KER3_VAL)); + REQUIRE(Ch.host_ptr()[i] == + (Bh.host_ptr()[i] + Ah.host_ptr()[i] * CONST_KER1_VAL * CONST_KER3_VAL)); } } } - REQUIRE(gCbackIter == (2*LAUNCH_ITERS)); + REQUIRE(gCbackIter == (2 * kLaunchIters)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); - // Free Device memory and Host memory - HipTest::freeArrays(Ad, Bd, nullptr, Ah, Bh, Ch, false); - // Destroy graph, events and streams HIP_CHECK(hipGraphDestroy(graph)); - HIP_CHECK(hipEventDestroy(e6)); - HIP_CHECK(hipEventDestroy(e5)); - HIP_CHECK(hipEventDestroy(e4)); - HIP_CHECK(hipEventDestroy(e3)); - HIP_CHECK(hipEventDestroy(e2)); - HIP_CHECK(hipEventDestroy(e1)); - HIP_CHECK(hipEventDestroy(e0)); - HIP_CHECK(hipStreamDestroy(stream5)); - HIP_CHECK(hipStreamDestroy(stream4)); - HIP_CHECK(hipStreamDestroy(stream3)); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); } -/* Test scenario 19 + +/** + * Test Description + * ------------------------ + * - Test to verify capturing empty streams (parent + forked streams) and + * validate the captured graph has no nodes + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamBeginCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 */ -TEST_CASE("Unit_hipStreamBeginCapture_captureEmptyStreams") { - hipGraph_t graph; - hipStream_t stream1, stream2, stream3; +TEST_CASE("Unit_hipStreamBeginCapture_Positive_captureEmptyStreams") { + hipGraph_t graph{nullptr}; + // Stream and event create - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); - HIP_CHECK(hipStreamCreate(&stream3)); - hipEvent_t e0, e1, e2; - HIP_CHECK(hipEventCreate(&e0)); - HIP_CHECK(hipEventCreate(&e1)); - HIP_CHECK(hipEventCreate(&e2)); + StreamsGuard streams(3); + EventsGuard events(3); + // Capture streams into graph - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipEventRecord(e0, stream1)); - HIP_CHECK(hipStreamWaitEvent(stream2, e0, 0)); - HIP_CHECK(hipStreamWaitEvent(stream3, e0, 0)); - HIP_CHECK(hipEventRecord(e1, stream2)); - HIP_CHECK(hipStreamWaitEvent(stream1, e1, 0)); - HIP_CHECK(hipEventRecord(e2, stream3)); - HIP_CHECK(hipStreamWaitEvent(stream1, e2, 0)); - HIP_CHECK(hipStreamEndCapture(stream1, &graph)); // End Capture + HIP_CHECK(hipStreamBeginCapture(streams[0], hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(events[0], streams[0])); + HIP_CHECK(hipStreamWaitEvent(streams[1], events[0], 0)); + HIP_CHECK(hipStreamWaitEvent(streams[2], events[0], 0)); + HIP_CHECK(hipEventRecord(events[1], streams[1])); + HIP_CHECK(hipStreamWaitEvent(streams[0], events[1], 0)); + HIP_CHECK(hipEventRecord(events[2], streams[2])); + HIP_CHECK(hipStreamWaitEvent(streams[0], events[2], 0)); + HIP_CHECK(hipStreamEndCapture(streams[0], &graph)); // End Capture size_t numNodes = 0; HIP_CHECK(hipGraphGetNodes(graph, nullptr, &numNodes)); REQUIRE(numNodes == 0); - // Destroy graph, events and streams + HIP_CHECK(hipGraphDestroy(graph)); - HIP_CHECK(hipEventDestroy(e2)); - HIP_CHECK(hipEventDestroy(e1)); - HIP_CHECK(hipEventDestroy(e0)); - HIP_CHECK(hipStreamDestroy(stream3)); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipStreamDestroy(stream1)); } diff --git a/catch/unit/graph/hipStreamBeginCapture_old.cc b/catch/unit/graph/hipStreamBeginCapture_old.cc new file mode 100644 index 0000000000..80be624e8a --- /dev/null +++ b/catch/unit/graph/hipStreamBeginCapture_old.cc @@ -0,0 +1,1294 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** +Testcase Scenarios : Functional + 1) Initiate stream capture with different modes on custom stream. + Capture stream sequence and replay the sequence in multiple iterations. + 2) End capture and validate that API returns captured graph for + all possible modes on custom stream. + 3) Initiate stream capture with different modes on hipStreamPerThread. + Capture stream sequence and replay the sequence in multiple iterations. + 4) End capture and validate that API returns captured graph for + all possible modes on hipStreamPerThread. + 5) Waiting on an event recorded on a captured stream. Initiate capture + on stream1, record an event on stream1, wait for the event on stream2, + end the stream1 capture and Initiate stream capture on stream2 + 5.1) Both streams are created with default flags. + 5.2) Both streams are created with flag = hipStreamCaptureModeGlobal. + 5.3) Both streams are created with different flags. + 5.4) Both streams are created with different priorities. + 5.5) Validate the number of nodes in both the captured graphs. + 6) Colligated Streams capture. Capture operation sequences queued in + 2 streams by overlapping the 2 captures. + 6.1) Both streams are created with default flags. + 6.2) Both streams are created with flag = hipStreamCaptureModeGlobal. + 6.3) Both streams are created with different flags. + 6.4) Both streams are created with different priorities. + 7) Extend the scenario 5.1 for 3 streamsss. + 8) Create 2 streams. Start capturing both stream1 and stream2 at the same + time. On stream1 queue memcpy, kernel and memcpy operations and on stream2 + queue memcpy, kernel and memcpy operations. Execute both the captured + graphs and validate the results. + 9) Capture 2 streams in parallel using threads. Execute the graphs in + sequence in main thread and validate the results. + 9.1) mode = hipStreamCaptureModeGlobal + 9.2) mode = hipStreamCaptureModeThreadLocal + 9.3) mode = hipStreamCaptureModeRelaxed + 10) Queue operations (increment kernels) in 3 streams. Start capturing + the streams after some operations have been queued. This scenario validates + that only operations queued after hipStreamBeginCapture are captured in + the graph. + 11) Detecting invalid capture. Create 2 streams s1 and s2. Start capturing + s1. Create event dependency between s1 and s2 using event record and event + wait. Try capturing s2. hipStreamBeginCapture must return error. + 12) Stream reuse. Capture multiple graphs from the same stream. Validate + graphs are captured correctly. + 13) Test different synchronization during stream capture. + 13.1) Test hipStreamSynchronize. Must return + hipErrorStreamCaptureUnsupported. + 13.2) Test hipDeviceSynchronize. Must return + hipErrorStreamCaptureUnsupported. + 13.3) Test hipDeviceSynchronize. Must return + hipEventSynchronize. + 13.4) Test hipStreamWaitEvent. Must return + hipErrorStreamCaptureIsolation. + 14) End Stream Capture when the stream capture is still in progress. + 14.1) Abruptly end stream capture when stream capture is in progress in + forked stream. hipStreamEndCapture must return + hipErrorStreamCaptureUnjoined. + 14.2) Abruptly end stream capture when operations in forked stream + are still waiting to be captured. hipStreamEndCapture must return + hipErrorStreamCaptureUnjoined. + 15) Testing independent stream capture using multiple GPUs. Capture + a stream in each device context and execute the captured graph in the + context GPU. + 16) Test Nested Stream Capture Functionality: Create 3 streams s1, s2 & s3. + Capture s1, record event e1 on s1, wait for event e1 on s2 and queue + operations in s1. Record event e2 on s2 and wait for it on s3. Queue + operations on both s2 and s3. Record event e4 on s3 and wait for it in s1. + Record event e3 on s2 and wait for it in s1. End stream capture on s1. + Execute the graph and verify the result. + 17) Forked Stream Reuse: In scenario 16, after end capture on s1, queue + operations on both s2 and s3, and capture their graphs. Execute both the + graphs and validate the functionality. + 18) Capture a complex graph containing multiple independent memcpy, kernel + and host nodes. Launch the graph on random input data and validate the + output. + 19) Capture empty streams (parent + forked streams) and validate the + functionality. +*/ + +#include +#include +#include + +#define INCREMENT_KERNEL_FINALEXP_VAL 7 +constexpr size_t N = 1000000; +constexpr int LAUNCH_ITERS = 50; +static int gCbackIter = 0; +#define GRIDSIZE 256 +#define BLOCKSIZE 256 +#define CONST_KER1_VAL 3 +#define CONST_KER2_VAL 2 +#define CONST_KER3_VAL 5 + +static __global__ void dummyKernel() { + return; +} + +static __global__ void incrementKernel(int *data) { + atomicAdd(data, 1); + return; +} + +static __global__ void myadd(int* A_d, int* B_d) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + A_d[myId] = A_d[myId] + B_d[myId]; +} + +static __global__ void mymul(int* devMem, int value) { + int myId = threadIdx.x + blockDim.x * blockIdx.x; + devMem[myId] = devMem[myId] * value; +} + +static void hostNodeCallback(void* data) { + REQUIRE(data == nullptr); + gCbackIter++; +} + +bool CaptureStreamAndLaunchGraph(float *A_d, float *C_d, float *A_h, + float *C_h, hipStreamCaptureMode mode, hipStream_t stream) { + hipGraph_t graph{nullptr}; + hipGraphExec_t graphExec{nullptr}; + constexpr unsigned blocks = 512; + constexpr unsigned threadsPerBlock = 256; + size_t Nbytes = N * sizeof(float); + + HIP_CHECK(hipStreamBeginCapture(stream, mode)); + HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream)); + + HIP_CHECK(hipMemsetAsync(C_d, 0, Nbytes, stream)); + hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), + dim3(threadsPerBlock), 0, stream, A_d, C_d, N); + HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream)); + + HIP_CHECK(hipStreamEndCapture(stream, &graph)); + + // Validate end capture is successful + REQUIRE(graph != nullptr); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + REQUIRE(graphExec != nullptr); + + // Replay the recorded sequence multiple times + for (int i = 0; i < LAUNCH_ITERS; i++) { + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + } + + HIP_CHECK(hipStreamSynchronize(stream)); + + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); + + // Validate the computation + for (size_t i = 0; i < N; i++) { + if (C_h[i] != A_h[i] * A_h[i]) { + UNSCOPED_INFO("A and C not matching at " << i); + return false; + } + } + return true; +} + +/** + * Basic Functional Test for API capturing custom stream and replaying sequence. + * Test exercises the API on available/possible modes. + * Stream capture with different modes behave the same when supported/ + * safe apis are used in sequence. + */ +TEST_CASE("Unit_hipStreamBeginCapture_BasicFunctional") { + float *A_d, *C_d; + float *A_h, *C_h; + size_t Nbytes = N * sizeof(float); + hipStream_t stream; + bool ret; + + A_h = reinterpret_cast(malloc(Nbytes)); + C_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(A_h != nullptr); + REQUIRE(C_h != nullptr); + + // Fill with Phi + i + for (size_t i = 0; i < N; i++) { + A_h[i] = 1.618f + i; + } + + HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipMalloc(&A_d, Nbytes)); + HIP_CHECK(hipMalloc(&C_d, Nbytes)); + REQUIRE(A_d != nullptr); + REQUIRE(C_d != nullptr); + + SECTION("Capture stream and launch graph when mode is global") { + ret = CaptureStreamAndLaunchGraph(A_d, C_d, A_h, C_h, + hipStreamCaptureModeGlobal, stream); + REQUIRE(ret == true); + } + + SECTION("Capture stream and launch graph when mode is local") { + ret = CaptureStreamAndLaunchGraph(A_d, C_d, A_h, C_h, + hipStreamCaptureModeThreadLocal, stream); + REQUIRE(ret == true); + } + + SECTION("Capture stream and launch graph when mode is relaxed") { + ret = CaptureStreamAndLaunchGraph(A_d, C_d, A_h, C_h, + hipStreamCaptureModeRelaxed, stream); + REQUIRE(ret == true); + } + + HIP_CHECK(hipStreamDestroy(stream)); + free(A_h); + free(C_h); + HIP_CHECK(hipFree(A_d)); + HIP_CHECK(hipFree(C_d)); +} + +/** + * Perform capture on hipStreamPerThread, launch the graph and verify results. + */ +TEST_CASE("Unit_hipStreamBeginCapture_hipStreamPerThread") { + float *A_d, *C_d; + float *A_h, *C_h; + size_t Nbytes = N * sizeof(float); + hipStream_t stream{hipStreamPerThread}; + bool ret; + + A_h = reinterpret_cast(malloc(Nbytes)); + C_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(A_h != nullptr); + REQUIRE(C_h != nullptr); + + // Fill with Phi + i + for (size_t i = 0; i < N; i++) { + A_h[i] = 1.618f + i; + } + + HIP_CHECK(hipMalloc(&A_d, Nbytes)); + HIP_CHECK(hipMalloc(&C_d, Nbytes)); + REQUIRE(A_d != nullptr); + REQUIRE(C_d != nullptr); + + SECTION("Capture hipStreamPerThread and launch graph when mode is global") { + ret = CaptureStreamAndLaunchGraph(A_d, C_d, A_h, C_h, + hipStreamCaptureModeGlobal, stream); + REQUIRE(ret == true); + } + + SECTION("Capture hipStreamPerThread and launch graph when mode is local") { + ret = CaptureStreamAndLaunchGraph(A_d, C_d, A_h, C_h, + hipStreamCaptureModeThreadLocal, stream); + REQUIRE(ret == true); + } + + SECTION("Capture hipStreamPerThread and launch graph when mode is relaxed") { + ret = CaptureStreamAndLaunchGraph(A_d, C_d, A_h, C_h, + hipStreamCaptureModeRelaxed, stream); + REQUIRE(ret == true); + } + + free(A_h); + free(C_h); + HIP_CHECK(hipFree(A_d)); + HIP_CHECK(hipFree(C_d)); +} + + +/* Test verifies hipStreamBeginCapture API Negative scenarios. + */ + +TEST_CASE("Unit_hipStreamBeginCapture_Negative") { + hipError_t ret; + hipStream_t stream{}; + HIP_CHECK(hipStreamCreate(&stream)); + + SECTION("Stream capture on legacy/null stream returns error code.") { + ret = hipStreamBeginCapture(nullptr, hipStreamCaptureModeGlobal); + REQUIRE(hipErrorStreamCaptureUnsupported == ret); + } + SECTION("Capturing hipStream status with same stream again") { + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + ret = hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal); + REQUIRE(hipErrorIllegalState == ret); + } + SECTION("Creating hipStream with invalid mode") { + ret = hipStreamBeginCapture(stream, hipStreamCaptureMode(-1)); + REQUIRE(hipErrorInvalidValue == ret); + } + HIP_CHECK(hipStreamDestroy(stream)); +} + +TEST_CASE("Unit_hipStreamBeginCapture_Basic") { + hipStream_t s1, s2, s3; + + HIP_CHECK(hipStreamCreate(&s1)); + HIP_CHECK(hipStreamBeginCapture(s1, hipStreamCaptureModeGlobal)); + + HIP_CHECK(hipStreamCreate(&s2)); + HIP_CHECK(hipStreamBeginCapture(s2, hipStreamCaptureModeThreadLocal)); + + HIP_CHECK(hipStreamCreate(&s3)); + HIP_CHECK(hipStreamBeginCapture(s3, hipStreamCaptureModeRelaxed)); + + HIP_CHECK(hipStreamDestroy(s1)); + HIP_CHECK(hipStreamDestroy(s2)); + HIP_CHECK(hipStreamDestroy(s3)); +} +/* Local Function + */ +static void interStrmEventSyncCapture(const hipStream_t &stream1, + const hipStream_t &stream2) { + hipGraph_t graph1, graph2; + hipEvent_t event; + hipGraphExec_t graphExec1{nullptr}, graphExec2{nullptr}; + HIP_CHECK(hipEventCreate(&event)); + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(event, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, event, 0)); + dummyKernel<<<1, 1, 0, stream1>>>(); + HIP_CHECK(hipStreamEndCapture(stream1, &graph1)); + HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal)); + dummyKernel<<<1, 1, 0, stream2>>>(); + dummyKernel<<<1, 1, 0, stream2>>>(); + HIP_CHECK(hipStreamEndCapture(stream2, &graph2)); + // Create Executable Graphs + HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); + REQUIRE(graphExec1 != nullptr); + HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0)); + REQUIRE(graphExec2 != nullptr); + size_t numNodes1 = 0, numNodes2 = 0; + HIP_CHECK(hipGraphGetNodes(graph1, nullptr, &numNodes1)); + HIP_CHECK(hipGraphGetNodes(graph2, nullptr, &numNodes2)); + REQUIRE(numNodes1 == 1); + REQUIRE(numNodes2 == 2); + // Execute the Graphs + HIP_CHECK(hipGraphLaunch(graphExec1, stream1)); + HIP_CHECK(hipGraphLaunch(graphExec2, stream2)); + HIP_CHECK(hipStreamSynchronize(stream1)); + HIP_CHECK(hipStreamSynchronize(stream2)); + // Free + HIP_CHECK(hipGraphExecDestroy(graphExec2)); + HIP_CHECK(hipGraphExecDestroy(graphExec1)); + HIP_CHECK(hipGraphDestroy(graph2)); + HIP_CHECK(hipGraphDestroy(graph1)); + HIP_CHECK(hipEventDestroy(event)); +} +/* Local Function + */ +static void colligatedStrmCapture(const hipStream_t &stream1, + const hipStream_t &stream2) { + hipGraph_t graph1, graph2; + hipEvent_t event; + hipGraphExec_t graphExec1{nullptr}, graphExec2{nullptr}; + HIP_CHECK(hipEventCreate(&event)); + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(event, stream1)); + HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipStreamWaitEvent(stream1, event, 0)); + dummyKernel<<<1, 1, 0, stream1>>>(); + HIP_CHECK(hipStreamEndCapture(stream1, &graph1)); + dummyKernel<<<1, 1, 0, stream2>>>(); + HIP_CHECK(hipStreamEndCapture(stream2, &graph2)); + // Validate end capture is successful + REQUIRE(graph2 != nullptr); + REQUIRE(graph1 != nullptr); + // Create Executable Graphs + HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); + REQUIRE(graphExec1 != nullptr); + HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0)); + REQUIRE(graphExec2 != nullptr); + // Execute the Graphs + HIP_CHECK(hipGraphLaunch(graphExec1, stream1)); + HIP_CHECK(hipGraphLaunch(graphExec2, stream2)); + HIP_CHECK(hipStreamSynchronize(stream1)); + HIP_CHECK(hipStreamSynchronize(stream2)); + // Free + HIP_CHECK(hipGraphExecDestroy(graphExec2)); + HIP_CHECK(hipGraphExecDestroy(graphExec1)); + HIP_CHECK(hipGraphDestroy(graph2)); + HIP_CHECK(hipGraphDestroy(graph1)); + HIP_CHECK(hipEventDestroy(event)); +} +/* Fill input Data + */ +static void init_input(int* a, size_t size) { + unsigned int seed = time(nullptr); + for (size_t i = 0; i < size; i++) { + a[i] = (HipTest::RAND_R(&seed) & 0xFF); + } +} +/* Validate Output + */ +static void validate_output(int* a, int *b, size_t size) { + for (size_t i = 0; i < size; i++) { + REQUIRE(a[i] == (b[i]*b[i])); + } +} +/* Local Function + */ +static void colligatedStrmCaptureFunc(const hipStream_t &stream1, + const hipStream_t &stream2) { + constexpr size_t size = 1024; + constexpr auto blocksPerCU = 6; + constexpr auto threadsPerBlock = 256; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, + threadsPerBlock, size); + hipGraph_t graph1, graph2; + int *inputVec_d1{nullptr}, *inputVec_h1{nullptr}, *outputVec_h1{nullptr}, + *outputVec_d1{nullptr}; + int *inputVec_d2{nullptr}, *inputVec_h2{nullptr}, *outputVec_h2{nullptr}, + *outputVec_d2{nullptr}; + hipGraphExec_t graphExec1{nullptr}, graphExec2{nullptr}; + // host and device allocation + HipTest::initArrays(&inputVec_d1, &outputVec_d1, nullptr, + &inputVec_h1, &outputVec_h1, nullptr, size, false); + HipTest::initArrays(&inputVec_d2, &outputVec_d2, nullptr, + &inputVec_h2, &outputVec_h2, nullptr, size, false); + // Capture 2 streams + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipMemcpyAsync(inputVec_d1, inputVec_h1, sizeof(int) * size, + hipMemcpyDefault, stream1)); + HIP_CHECK(hipMemcpyAsync(inputVec_d2, inputVec_h2, sizeof(int) * size, + hipMemcpyDefault, stream2)); + HipTest::vector_square<<>>( + inputVec_d1, outputVec_d1, size); + HipTest::vector_square<<>>( + inputVec_d2, outputVec_d2, size); + HIP_CHECK(hipMemcpyAsync(outputVec_h1, outputVec_d1, sizeof(int) * size, + hipMemcpyDefault, stream1)); + HIP_CHECK(hipMemcpyAsync(outputVec_h2, outputVec_d2, sizeof(int) * size, + hipMemcpyDefault, stream2)); + HIP_CHECK(hipStreamEndCapture(stream1, &graph1)); + HIP_CHECK(hipStreamEndCapture(stream2, &graph2)); + // Validate end capture is successful + REQUIRE(graph2 != nullptr); + REQUIRE(graph1 != nullptr); + // Create Executable Graphs + HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); + REQUIRE(graphExec1 != nullptr); + HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0)); + REQUIRE(graphExec2 != nullptr); + // Execute the Graphs + for (int iter = 0; iter < LAUNCH_ITERS; iter++) { + init_input(inputVec_h1, size); + init_input(inputVec_h2, size); + HIP_CHECK(hipGraphLaunch(graphExec1, stream1)); + HIP_CHECK(hipGraphLaunch(graphExec2, stream2)); + HIP_CHECK(hipStreamSynchronize(stream1)); + HIP_CHECK(hipStreamSynchronize(stream2)); + validate_output(outputVec_h1, inputVec_h1, size); + validate_output(outputVec_h2, inputVec_h2, size); + } + // Free + HipTest::freeArrays(inputVec_d1, outputVec_d1, nullptr, + inputVec_h1, outputVec_h1, nullptr, false); + HipTest::freeArrays(inputVec_d2, outputVec_d2, nullptr, + inputVec_h2, outputVec_h2, nullptr, false); + HIP_CHECK(hipGraphExecDestroy(graphExec2)); + HIP_CHECK(hipGraphExecDestroy(graphExec1)); + HIP_CHECK(hipGraphDestroy(graph2)); + HIP_CHECK(hipGraphDestroy(graph1)); +} +/* Stream Capture thread function + */ +static void threadStrmCaptureFunc(hipStream_t stream, int *inputVec_d, +int *outputVec_d, int *inputVec_h, int *outputVec_h, hipGraph_t *graph, +size_t size, hipStreamCaptureMode mode) { + constexpr auto blocksPerCU = 6; + constexpr auto threadsPerBlock = 256; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, + threadsPerBlock, size); + // Capture stream + HIP_CHECK(hipStreamBeginCapture(stream, mode)); + HIP_CHECK(hipMemcpyAsync(inputVec_d, inputVec_h, sizeof(int) * size, + hipMemcpyDefault, stream)); + HipTest::vector_square<<>>( + inputVec_d, outputVec_d, size); + HIP_CHECK(hipMemcpyAsync(outputVec_h, outputVec_d, sizeof(int) * size, + hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamEndCapture(stream, graph)); +} +/* Local Function for multithreaded tests + */ +static void multithreadedTest(hipStreamCaptureMode mode) { + hipStream_t stream1, stream2; + constexpr size_t size = 1024; + hipGraph_t graph1, graph2; + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + int *inputVec_d1{nullptr}, *inputVec_h1{nullptr}, *outputVec_h1{nullptr}, + *outputVec_d1{nullptr}; + int *inputVec_d2{nullptr}, *inputVec_h2{nullptr}, *outputVec_h2{nullptr}, + *outputVec_d2{nullptr}; + hipGraphExec_t graphExec1{nullptr}, graphExec2{nullptr}; + // host and device allocation + HipTest::initArrays(&inputVec_d1, &outputVec_d1, nullptr, + &inputVec_h1, &outputVec_h1, nullptr, size, false); + HipTest::initArrays(&inputVec_d2, &outputVec_d2, nullptr, + &inputVec_h2, &outputVec_h2, nullptr, size, false); + // Launch 2 threads to capture the 2 streams into graphs + std::thread t1(threadStrmCaptureFunc, stream1, inputVec_d1, + outputVec_d1, inputVec_h1, outputVec_h1, &graph1, size, mode); + std::thread t2(threadStrmCaptureFunc, stream2, inputVec_d2, + outputVec_d2, inputVec_h2, outputVec_h2, &graph2, size, mode); + t1.join(); + t2.join(); + // Create Executable Graphs + HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0)); + // Execute the Graphs + for (int iter = 0; iter < LAUNCH_ITERS; iter++) { + init_input(inputVec_h1, size); + init_input(inputVec_h2, size); + HIP_CHECK(hipGraphLaunch(graphExec1, stream1)); + HIP_CHECK(hipGraphLaunch(graphExec2, stream2)); + HIP_CHECK(hipStreamSynchronize(stream1)); + HIP_CHECK(hipStreamSynchronize(stream2)); + validate_output(outputVec_h1, inputVec_h1, size); + validate_output(outputVec_h2, inputVec_h2, size); + } + // Free + HipTest::freeArrays(inputVec_d1, outputVec_d1, nullptr, + inputVec_h1, outputVec_h1, nullptr, false); + HipTest::freeArrays(inputVec_d2, outputVec_d2, nullptr, + inputVec_h2, outputVec_h2, nullptr, false); + HIP_CHECK(hipGraphExecDestroy(graphExec2)); + HIP_CHECK(hipGraphExecDestroy(graphExec1)); + HIP_CHECK(hipGraphDestroy(graph1)); + HIP_CHECK(hipGraphDestroy(graph2)); + HIP_CHECK(hipStreamDestroy(stream1)); + HIP_CHECK(hipStreamDestroy(stream2)); +} +/* Test scenario 5.1 + */ +TEST_CASE("Unit_hipStreamBeginCapture_InterStrmEventSync_defaultflag") { + hipStream_t stream1, stream2; + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + interStrmEventSyncCapture(stream1, stream2); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 5.2 + */ +TEST_CASE("Unit_hipStreamBeginCapture_InterStrmEventSync_blockingflag") { + hipStream_t stream1, stream2; + HIP_CHECK(hipStreamCreateWithFlags(&stream1, hipStreamNonBlocking)); + HIP_CHECK(hipStreamCreateWithFlags(&stream2, hipStreamNonBlocking)); + interStrmEventSyncCapture(stream1, stream2); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 5.3 + */ +TEST_CASE("Unit_hipStreamBeginCapture_InterStrmEventSync_diffflags") { + hipStream_t stream1, stream2; + HIP_CHECK(hipStreamCreateWithFlags(&stream1, hipStreamNonBlocking)); + HIP_CHECK(hipStreamCreateWithFlags(&stream2, hipStreamDefault)); + interStrmEventSyncCapture(stream1, stream2); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 5.4 + */ +TEST_CASE("Unit_hipStreamBeginCapture_InterStrmEventSync_diffprio") { + hipStream_t stream1, stream2; + int minPriority = 0, maxPriority = 0; + HIP_CHECK(hipDeviceGetStreamPriorityRange(&minPriority, &maxPriority)); + HIP_CHECK(hipStreamCreateWithPriority(&stream1, hipStreamDefault, + minPriority)); + HIP_CHECK(hipStreamCreateWithPriority(&stream2, hipStreamDefault, + maxPriority)); + interStrmEventSyncCapture(stream1, stream2); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 6.1 + */ +TEST_CASE("Unit_hipStreamBeginCapture_ColligatedStrmCapture_defaultflag") { + hipStream_t stream1, stream2; + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + colligatedStrmCapture(stream1, stream2); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 6.2 + */ +TEST_CASE("Unit_hipStreamBeginCapture_ColligatedStrmCapture_blockingflag") { + hipStream_t stream1, stream2; + HIP_CHECK(hipStreamCreateWithFlags(&stream1, hipStreamNonBlocking)); + HIP_CHECK(hipStreamCreateWithFlags(&stream2, hipStreamNonBlocking)); + colligatedStrmCapture(stream1, stream2); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 6.3 + */ +TEST_CASE("Unit_hipStreamBeginCapture_ColligatedStrmCapture_diffflags") { + hipStream_t stream1, stream2; + HIP_CHECK(hipStreamCreateWithFlags(&stream1, hipStreamNonBlocking)); + HIP_CHECK(hipStreamCreateWithFlags(&stream2, hipStreamDefault)); + colligatedStrmCapture(stream1, stream2); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 6.4 + */ +TEST_CASE("Unit_hipStreamBeginCapture_ColligatedStrmCapture_diffprio") { + hipStream_t stream1, stream2; + int minPriority = 0, maxPriority = 0; + HIP_CHECK(hipDeviceGetStreamPriorityRange(&minPriority, &maxPriority)); + HIP_CHECK(hipStreamCreateWithPriority(&stream1, hipStreamDefault, + minPriority)); + HIP_CHECK(hipStreamCreateWithPriority(&stream2, hipStreamDefault, + maxPriority)); + colligatedStrmCapture(stream1, stream2); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 7 + */ +TEST_CASE("Unit_hipStreamBeginCapture_multiplestrms") { + hipStream_t stream1, stream2, stream3; + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + HIP_CHECK(hipStreamCreate(&stream3)); + hipGraph_t graph1, graph2, graph3; + size_t numNodes1 = 0, numNodes2 = 0, numNodes3 = 0; + SECTION("Capture Multiple stream with interdependent events") { + hipEvent_t event1, event2; + HIP_CHECK(hipEventCreate(&event1)); + HIP_CHECK(hipEventCreate(&event2)); + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(event1, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, event1, 0)); + dummyKernel<<<1, 1, 0, stream1>>>(); + HIP_CHECK(hipStreamEndCapture(stream1, &graph1)); + HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(event2, stream2)); + HIP_CHECK(hipStreamWaitEvent(stream3, event2, 0)); + dummyKernel<<<1, 1, 0, stream2>>>(); + HIP_CHECK(hipStreamEndCapture(stream2, &graph2)); + HIP_CHECK(hipStreamBeginCapture(stream3, hipStreamCaptureModeGlobal)); + dummyKernel<<<1, 1, 0, stream3>>>(); + HIP_CHECK(hipStreamEndCapture(stream3, &graph3)); + HIP_CHECK(hipGraphGetNodes(graph1, nullptr, &numNodes1)); + HIP_CHECK(hipGraphGetNodes(graph2, nullptr, &numNodes2)); + HIP_CHECK(hipGraphGetNodes(graph3, nullptr, &numNodes3)); + REQUIRE(numNodes1 == 1); + REQUIRE(numNodes2 == 1); + REQUIRE(numNodes3 == 1); + HIP_CHECK(hipEventDestroy(event2)); + HIP_CHECK(hipEventDestroy(event1)); + } + SECTION("Capture Multiple stream with single event") { + hipEvent_t event1; + HIP_CHECK(hipEventCreate(&event1)); + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(event1, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, event1, 0)); + HIP_CHECK(hipStreamWaitEvent(stream3, event1, 0)); + dummyKernel<<<1, 1, 0, stream1>>>(); + HIP_CHECK(hipStreamEndCapture(stream1, &graph1)); + HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal)); + dummyKernel<<<1, 1, 0, stream2>>>(); + HIP_CHECK(hipStreamEndCapture(stream2, &graph2)); + HIP_CHECK(hipStreamBeginCapture(stream3, hipStreamCaptureModeGlobal)); + dummyKernel<<<1, 1, 0, stream3>>>(); + HIP_CHECK(hipStreamEndCapture(stream3, &graph3)); + HIP_CHECK(hipGraphGetNodes(graph1, nullptr, &numNodes1)); + HIP_CHECK(hipGraphGetNodes(graph2, nullptr, &numNodes2)); + HIP_CHECK(hipGraphGetNodes(graph3, nullptr, &numNodes3)); + REQUIRE(numNodes1 == 1); + REQUIRE(numNodes2 == 1); + REQUIRE(numNodes3 == 1); + HIP_CHECK(hipEventDestroy(event1)); + } + HIP_CHECK(hipStreamDestroy(stream3)); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 8 + */ +TEST_CASE("Unit_hipStreamBeginCapture_ColligatedStrmCapture_func") { + hipStream_t stream1, stream2; + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + colligatedStrmCaptureFunc(stream1, stream2); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 9.1 + */ +TEST_CASE("Unit_hipStreamBeginCapture_Multithreaded_Global") { + multithreadedTest(hipStreamCaptureModeGlobal); +} +/* Test scenario 9.2 + */ +TEST_CASE("Unit_hipStreamBeginCapture_Multithreaded_ThreadLocal") { + multithreadedTest(hipStreamCaptureModeThreadLocal); +} +/* Test scenario 9.3 + */ +TEST_CASE("Unit_hipStreamBeginCapture_Multithreaded_Relaxed") { + multithreadedTest(hipStreamCaptureModeRelaxed); +} +/* Test scenario 10 + */ +TEST_CASE("Unit_hipStreamBeginCapture_CapturingFromWithinStrms") { + hipGraph_t graph; + hipStream_t stream1, stream2, stream3; + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + HIP_CHECK(hipStreamCreate(&stream3)); + hipEvent_t e1, e2, e3; + HIP_CHECK(hipEventCreate(&e1)); + HIP_CHECK(hipEventCreate(&e2)); + HIP_CHECK(hipEventCreate(&e3)); + // Create a device memory of size int and initialize it to 0 + int *devMem{nullptr}, *hostMem{nullptr}; + hostMem = reinterpret_cast(malloc(sizeof(int))); + HIP_CHECK(hipMalloc(&devMem, sizeof(int))); + HIP_CHECK(hipMemset(devMem, 0, sizeof(int))); + HIP_CHECK(hipDeviceSynchronize()); + // Start Capturing stream1 + incrementKernel<<<1, 1, 0, stream1>>>(devMem); + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(e1, stream1)); + incrementKernel<<<1, 1, 0, stream2>>>(devMem); + incrementKernel<<<1, 1, 0, stream2>>>(devMem); + incrementKernel<<<1, 1, 0, stream3>>>(devMem); + HIP_CHECK(hipStreamWaitEvent(stream2, e1, 0)); + HIP_CHECK(hipStreamWaitEvent(stream3, e1, 0)); + incrementKernel<<<1, 1, 0, stream1>>>(devMem); + incrementKernel<<<1, 1, 0, stream2>>>(devMem); + incrementKernel<<<1, 1, 0, stream3>>>(devMem); + incrementKernel<<<1, 1, 0, stream1>>>(devMem); + incrementKernel<<<1, 1, 0, stream2>>>(devMem); + incrementKernel<<<1, 1, 0, stream3>>>(devMem); + incrementKernel<<<1, 1, 0, stream3>>>(devMem); + HIP_CHECK(hipEventRecord(e2, stream2)); + HIP_CHECK(hipEventRecord(e3, stream3)); + HIP_CHECK(hipStreamWaitEvent(stream1, e2, 0)); + HIP_CHECK(hipStreamWaitEvent(stream1, e3, 0)); + HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), + hipMemcpyDefault, stream1)); + HIP_CHECK(hipStreamEndCapture(stream1, &graph)); // End Capture + // Reset device memory + HIP_CHECK(hipMemset(devMem, 0, sizeof(int))); + HIP_CHECK(hipDeviceSynchronize()); + // Create Executable Graphs + hipGraphExec_t graphExec{nullptr}; + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream1)); + HIP_CHECK(hipStreamSynchronize(stream1)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + REQUIRE((*hostMem) == INCREMENT_KERNEL_FINALEXP_VAL); + HIP_CHECK(hipFree(devMem)); + free(hostMem); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipEventDestroy(e3)); + HIP_CHECK(hipEventDestroy(e2)); + HIP_CHECK(hipEventDestroy(e1)); + HIP_CHECK(hipStreamDestroy(stream3)); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 11 + */ +TEST_CASE("Unit_hipStreamBeginCapture_DetectingInvalidCapture") { + hipStream_t stream1, stream2; + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + hipEvent_t event; + HIP_CHECK(hipEventCreate(&event)); + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(event, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, event, 0)); + dummyKernel<<<1, 1, 0, stream1>>>(); + // Since stream2 is already in capture mode due to event wait + // hipStreamBeginCapture on stream2 is expected to return error. + REQUIRE(hipSuccess != hipStreamBeginCapture(stream2, + hipStreamCaptureModeGlobal)); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 12 + */ +TEST_CASE("Unit_hipStreamBeginCapture_CapturingMultGraphsFrom1Strm") { + hipStream_t stream1; + HIP_CHECK(hipStreamCreate(&stream1)); + hipGraph_t graph[3]; + // Create a device memory of size int and initialize it to 0 + int *devMem{nullptr}, *hostMem{nullptr}; + hostMem = reinterpret_cast(malloc(sizeof(int))); + HIP_CHECK(hipMalloc(&devMem, sizeof(int))); + HIP_CHECK(hipMemset(devMem, 0, sizeof(int))); + HIP_CHECK(hipDeviceSynchronize()); + // Capture Graph1 + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + incrementKernel<<<1, 1, 0, stream1>>>(devMem); + HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), + hipMemcpyDefault, stream1)); + HIP_CHECK(hipStreamEndCapture(stream1, &graph[0])); + // Capture Graph2 + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + incrementKernel<<<1, 1, 0, stream1>>>(devMem); + incrementKernel<<<1, 1, 0, stream1>>>(devMem); + HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), + hipMemcpyDefault, stream1)); + HIP_CHECK(hipStreamEndCapture(stream1, &graph[1])); + // Capture Graph3 + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + incrementKernel<<<1, 1, 0, stream1>>>(devMem); + incrementKernel<<<1, 1, 0, stream1>>>(devMem); + incrementKernel<<<1, 1, 0, stream1>>>(devMem); + HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), + hipMemcpyDefault, stream1)); + HIP_CHECK(hipStreamEndCapture(stream1, &graph[2])); + // Instantiate and execute all graphs + for (int i = 0; i < 3; i++) { + hipGraphExec_t graphExec{nullptr}; + HIP_CHECK(hipMemset(devMem, 0, sizeof(int))); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph[i], nullptr, + nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream1)); + HIP_CHECK(hipStreamSynchronize(stream1)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + REQUIRE((*hostMem) == (i + 1)); + } + HIP_CHECK(hipFree(devMem)); + free(hostMem); + for (int i = 0; i < 3; i++) { + HIP_CHECK(hipGraphDestroy(graph[i])); + } + HIP_CHECK(hipStreamDestroy(stream1)); +} +#if HT_NVIDIA +/* Test scenario 13 + */ +TEST_CASE("Unit_hipStreamBeginCapture_CheckingSyncDuringCapture") { + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + SECTION("Synchronize stream during capture") { + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + REQUIRE(hipErrorStreamCaptureUnsupported == + hipStreamSynchronize(stream)); + } + SECTION("Synchronize device during capture") { + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + REQUIRE(hipErrorStreamCaptureUnsupported == hipDeviceSynchronize()); + } + SECTION("Synchronize event during capture") { + hipEvent_t e; + HIP_CHECK(hipEventCreate(&e)); + HIP_CHECK(hipEventRecord(e, stream)); + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + REQUIRE(hipErrorStreamCaptureUnsupported == hipEventSynchronize(e)); + HIP_CHECK(hipEventDestroy(e)); + } + SECTION("Wait for an event during capture") { + hipEvent_t e; + HIP_CHECK(hipEventCreate(&e)); + HIP_CHECK(hipEventRecord(e, stream)); + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + REQUIRE(hipErrorStreamCaptureIsolation == + hipStreamWaitEvent(stream, e, 0)); + HIP_CHECK(hipEventDestroy(e)); + } + SECTION("Query stream during capture") { + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + REQUIRE(hipErrorStreamCaptureUnsupported == hipStreamQuery(stream)); + } + SECTION("Query for an event during capture") { + hipEvent_t e; + HIP_CHECK(hipEventCreate(&e)); + HIP_CHECK(hipEventRecord(e, stream)); + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + REQUIRE(hipSuccess != hipEventQuery(e)); + HIP_CHECK(hipEventDestroy(e)); + } + HIP_CHECK(hipStreamDestroy(stream)); +} +#endif +/* Test scenario 14 + */ +TEST_CASE("Unit_hipStreamBeginCapture_EndingCapturewhenCaptureInProgress") { + hipStream_t stream1, stream2; + hipGraph_t graph; + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + SECTION("Abruptly end strm capture when in progress in forked strm") { + hipEvent_t e; + HIP_CHECK(hipEventCreate(&e)); + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + dummyKernel<<<1, 1, 0, stream1>>>(); + HIP_CHECK(hipEventRecord(e, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, e, 0)); + dummyKernel<<<1, 1, 0, stream2>>>(); + REQUIRE(hipErrorStreamCaptureUnjoined == + hipStreamEndCapture(stream1, &graph)); + HIP_CHECK(hipEventDestroy(e)); + } + SECTION("End strm capture when forked strm still has operations") { + hipEvent_t e1, e2; + HIP_CHECK(hipEventCreate(&e1)); + HIP_CHECK(hipEventCreate(&e2)); + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + dummyKernel<<<1, 1, 0, stream1>>>(); + HIP_CHECK(hipEventRecord(e1, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, e1, 0)); + dummyKernel<<<1, 1, 0, stream2>>>(); + HIP_CHECK(hipEventRecord(e2, stream2)); + HIP_CHECK(hipStreamWaitEvent(stream1, e2, 0)); + dummyKernel<<<1, 1, 0, stream2>>>(); + REQUIRE(hipErrorStreamCaptureUnjoined == + hipStreamEndCapture(stream1, &graph)); + HIP_CHECK(hipEventDestroy(e2)); + HIP_CHECK(hipEventDestroy(e1)); + } + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} + +/* Test scenario 15 + */ +TEST_CASE("Unit_hipStreamBeginCapture_MultiGPU") { + int devcount = 0; + HIP_CHECK(hipGetDeviceCount(&devcount)); + // If only single GPU is detected then return + if (devcount < 2) { + SUCCEED("skipping the testcases as numDevices < 2"); + return; + } + hipStream_t* stream = reinterpret_cast(malloc( + devcount*sizeof(hipStream_t))); + REQUIRE(stream != nullptr); + hipGraph_t* graph = reinterpret_cast(malloc( + devcount*sizeof(hipGraph_t))); + REQUIRE(graph != nullptr); + int **devMem{nullptr}, **hostMem{nullptr}; + hostMem = reinterpret_cast(malloc(sizeof(int*)*devcount)); + REQUIRE(hostMem != nullptr); + devMem = reinterpret_cast(malloc(sizeof(int*)*devcount)); + REQUIRE(devMem != nullptr); + hipGraphExec_t* graphExec = reinterpret_cast(malloc( + devcount*sizeof(hipGraphExec_t))); + // Capture stream in each device + for (int dev = 0; dev < devcount; dev++) { + HIP_CHECK(hipSetDevice(dev)); + HIP_CHECK(hipStreamCreate(&stream[dev])); + hostMem[dev] = reinterpret_cast(malloc(sizeof(int))); + HIP_CHECK(hipMalloc(&devMem[dev], sizeof(int))); + HIP_CHECK(hipStreamBeginCapture(stream[dev], + hipStreamCaptureModeGlobal)); + HIP_CHECK(hipMemsetAsync(devMem[dev], 0, sizeof(int), stream[dev])); + for (int i = 0; i < (dev + 1); i++) { + incrementKernel<<<1, 1, 0, stream[dev]>>>(devMem[dev]); + } + HIP_CHECK(hipMemcpyAsync(hostMem[dev], devMem[dev], sizeof(int), + hipMemcpyDefault, stream[dev])); + HIP_CHECK(hipStreamEndCapture(stream[dev], &graph[dev])); + } + // Launch the captured graphs in the respective device + for (int dev = 0; dev < devcount; dev++) { + HIP_CHECK(hipSetDevice(dev)); + HIP_CHECK(hipGraphInstantiate(&graphExec[dev], graph[dev], nullptr, + nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec[dev], stream[dev])); + } + // Validate output + for (int dev = 0; dev < devcount; dev++) { + HIP_CHECK(hipSetDevice(dev)); + HIP_CHECK(hipStreamSynchronize(stream[dev])); + REQUIRE((*hostMem[dev]) == (dev + 1)); + } + // Destroy all device resources + for (int dev = 0; dev < devcount; dev++) { + HIP_CHECK(hipSetDevice(dev)); + HIP_CHECK(hipGraphExecDestroy(graphExec[dev])); + HIP_CHECK(hipStreamDestroy(stream[dev])); + } + free(graphExec); + free(hostMem); + free(devMem); + free(stream); + free(graph); +} +/* Test scenario 16 + */ +TEST_CASE("Unit_hipStreamBeginCapture_nestedStreamCapture") { + hipGraph_t graph; + hipStream_t stream1, stream2, stream3; + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + HIP_CHECK(hipStreamCreate(&stream3)); + hipEvent_t e1, e2, e3, e4; + HIP_CHECK(hipEventCreate(&e1)); + HIP_CHECK(hipEventCreate(&e2)); + HIP_CHECK(hipEventCreate(&e3)); + HIP_CHECK(hipEventCreate(&e4)); + // Create a device memory of size int and initialize it to 0 + int *devMem{nullptr}, *hostMem{nullptr}; + hostMem = reinterpret_cast(malloc(sizeof(int))); + REQUIRE(hostMem != nullptr); + HIP_CHECK(hipMalloc(&devMem, sizeof(int))); + HIP_CHECK(hipMemset(devMem, 0, sizeof(int))); + HIP_CHECK(hipDeviceSynchronize()); + // Start Capturing stream1 + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(e1, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, e1, 0)); + HIP_CHECK(hipEventRecord(e2, stream2)); + HIP_CHECK(hipStreamWaitEvent(stream3, e2, 0)); + incrementKernel<<<1, 1, 0, stream1>>>(devMem); + incrementKernel<<<1, 1, 0, stream2>>>(devMem); + incrementKernel<<<1, 1, 0, stream3>>>(devMem); + incrementKernel<<<1, 1, 0, stream1>>>(devMem); + incrementKernel<<<1, 1, 0, stream2>>>(devMem); + incrementKernel<<<1, 1, 0, stream3>>>(devMem); + incrementKernel<<<1, 1, 0, stream3>>>(devMem); + HIP_CHECK(hipEventRecord(e3, stream2)); + HIP_CHECK(hipEventRecord(e4, stream3)); + HIP_CHECK(hipStreamWaitEvent(stream1, e4, 0)); + HIP_CHECK(hipStreamWaitEvent(stream1, e3, 0)); + HIP_CHECK(hipMemcpyAsync(hostMem, devMem, sizeof(int), + hipMemcpyDefault, stream1)); + HIP_CHECK(hipStreamEndCapture(stream1, &graph)); // End Capture + // Reset device memory + HIP_CHECK(hipMemset(devMem, 0, sizeof(int))); + HIP_CHECK(hipDeviceSynchronize()); + // Create Executable Graphs + hipGraphExec_t graphExec{nullptr}; + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream1)); + HIP_CHECK(hipStreamSynchronize(stream1)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + REQUIRE((*hostMem) == INCREMENT_KERNEL_FINALEXP_VAL); + HIP_CHECK(hipFree(devMem)); + free(hostMem); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipEventDestroy(e4)); + HIP_CHECK(hipEventDestroy(e3)); + HIP_CHECK(hipEventDestroy(e2)); + HIP_CHECK(hipEventDestroy(e1)); + HIP_CHECK(hipStreamDestroy(stream3)); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 17 + */ +TEST_CASE("Unit_hipStreamBeginCapture_streamReuse") { + hipGraph_t graph1, graph2, graph3; + hipStream_t stream1, stream2, stream3; + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + HIP_CHECK(hipStreamCreate(&stream3)); + hipEvent_t e1, e2, e3, e4; + HIP_CHECK(hipEventCreate(&e1)); + HIP_CHECK(hipEventCreate(&e2)); + HIP_CHECK(hipEventCreate(&e3)); + HIP_CHECK(hipEventCreate(&e4)); + // Create a device memory of size int and initialize it to 0 + int *devMem1{nullptr}, *hostMem1{nullptr}, *devMem2{nullptr}, + *hostMem2{nullptr}, *devMem3{nullptr}, *hostMem3{nullptr}; + HipTest::initArrays(&devMem1, &devMem2, &devMem3, + &hostMem1, &hostMem2, &hostMem3, 1, false); + HIP_CHECK(hipMemset(devMem1, 0, sizeof(int))); + HIP_CHECK(hipMemset(devMem2, 0, sizeof(int))); + HIP_CHECK(hipMemset(devMem3, 0, sizeof(int))); + HIP_CHECK(hipDeviceSynchronize()); + // Start Capturing stream1 + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(e1, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, e1, 0)); + HIP_CHECK(hipEventRecord(e2, stream2)); + HIP_CHECK(hipStreamWaitEvent(stream3, e2, 0)); + incrementKernel<<<1, 1, 0, stream1>>>(devMem1); + incrementKernel<<<1, 1, 0, stream2>>>(devMem1); + incrementKernel<<<1, 1, 0, stream3>>>(devMem1); + incrementKernel<<<1, 1, 0, stream1>>>(devMem1); + incrementKernel<<<1, 1, 0, stream2>>>(devMem1); + incrementKernel<<<1, 1, 0, stream3>>>(devMem1); + incrementKernel<<<1, 1, 0, stream3>>>(devMem1); + HIP_CHECK(hipEventRecord(e3, stream2)); + HIP_CHECK(hipEventRecord(e4, stream3)); + HIP_CHECK(hipStreamWaitEvent(stream1, e4, 0)); + HIP_CHECK(hipStreamWaitEvent(stream1, e3, 0)); + HIP_CHECK(hipMemcpyAsync(hostMem1, devMem1, sizeof(int), + hipMemcpyDefault, stream1)); + HIP_CHECK(hipStreamEndCapture(stream1, &graph1)); // End Capture + // Start capturing graph2 from stream 2 + HIP_CHECK(hipStreamBeginCapture(stream2, hipStreamCaptureModeGlobal)); + incrementKernel<<<1, 1, 0, stream2>>>(devMem2); + incrementKernel<<<1, 1, 0, stream2>>>(devMem2); + incrementKernel<<<1, 1, 0, stream2>>>(devMem2); + HIP_CHECK(hipMemcpyAsync(hostMem2, devMem2, sizeof(int), + hipMemcpyDefault, stream2)); + HIP_CHECK(hipStreamEndCapture(stream2, &graph2)); // End Capture + // Start capturing graph3 from stream 3 + HIP_CHECK(hipStreamBeginCapture(stream3, hipStreamCaptureModeGlobal)); + incrementKernel<<<1, 1, 0, stream3>>>(devMem3); + incrementKernel<<<1, 1, 0, stream3>>>(devMem3); + incrementKernel<<<1, 1, 0, stream3>>>(devMem3); + incrementKernel<<<1, 1, 0, stream3>>>(devMem3); + incrementKernel<<<1, 1, 0, stream3>>>(devMem3); + HIP_CHECK(hipMemcpyAsync(hostMem3, devMem3, sizeof(int), + hipMemcpyDefault, stream3)); + HIP_CHECK(hipStreamEndCapture(stream3, &graph3)); // End Capture + // Reset device memory + HIP_CHECK(hipMemset(devMem1, 0, sizeof(int))); + HIP_CHECK(hipMemset(devMem2, 0, sizeof(int))); + HIP_CHECK(hipMemset(devMem3, 0, sizeof(int))); + HIP_CHECK(hipDeviceSynchronize()); + // Create Executable Graphs + hipGraphExec_t graphExec{nullptr}; + // Verify graph1 + HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream1)); + HIP_CHECK(hipStreamSynchronize(stream1)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + REQUIRE((*hostMem1) == INCREMENT_KERNEL_FINALEXP_VAL); + // Verify graph2 + HIP_CHECK(hipGraphInstantiate(&graphExec, graph2, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream2)); + HIP_CHECK(hipStreamSynchronize(stream2)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + REQUIRE((*hostMem2) == 3); + // Verify graph3 + HIP_CHECK(hipGraphInstantiate(&graphExec, graph3, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream3)); + HIP_CHECK(hipStreamSynchronize(stream3)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + REQUIRE((*hostMem3) == 5); + HipTest::freeArrays(devMem1, devMem2, devMem3, + hostMem1, hostMem2, hostMem3, false); + HIP_CHECK(hipGraphDestroy(graph1)); + HIP_CHECK(hipGraphDestroy(graph2)); + HIP_CHECK(hipGraphDestroy(graph3)); + HIP_CHECK(hipEventDestroy(e4)); + HIP_CHECK(hipEventDestroy(e3)); + HIP_CHECK(hipEventDestroy(e2)); + HIP_CHECK(hipEventDestroy(e1)); + HIP_CHECK(hipStreamDestroy(stream3)); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} + +/* Test scenario 18 + */ +TEST_CASE("Unit_hipStreamBeginCapture_captureComplexGraph") { + hipGraph_t graph; + hipStream_t stream1, stream2, stream3, stream4, stream5; + // Stream and event create + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + HIP_CHECK(hipStreamCreate(&stream3)); + HIP_CHECK(hipStreamCreate(&stream4)); + HIP_CHECK(hipStreamCreate(&stream5)); + hipEvent_t e0, e1, e2, e3, e4, e5, e6; + HIP_CHECK(hipEventCreate(&e0)); + HIP_CHECK(hipEventCreate(&e1)); + HIP_CHECK(hipEventCreate(&e2)); + HIP_CHECK(hipEventCreate(&e3)); + HIP_CHECK(hipEventCreate(&e4)); + HIP_CHECK(hipEventCreate(&e5)); + HIP_CHECK(hipEventCreate(&e6)); + // Allocate Device memory and Host memory + size_t N = GRIDSIZE*BLOCKSIZE; + int *Ah{nullptr}, *Bh{nullptr}, *Ch{nullptr}, *Ad{nullptr}, *Bd{nullptr}; + HipTest::initArrays(&Ad, &Bd, nullptr, &Ah, &Bh, &Ch, N, false); + // Capture streams into graph + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(e0, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream4, e0, 0)); + HIP_CHECK(hipStreamWaitEvent(stream5, e0, 0)); + HIP_CHECK(hipMemcpyAsync(Ad, Ah, (N*sizeof(int)), + hipMemcpyDefault, stream1)); + HIP_CHECK(hipMemcpyAsync(Bd, Bh, (N*sizeof(int)), + hipMemcpyDefault, stream5)); + hipHostFn_t fn = hostNodeCallback; + HIPCHECK(hipLaunchHostFunc(stream4, fn, nullptr)); + HIP_CHECK(hipEventRecord(e1, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, e1, 0)); + int *Ad_2nd_half = Ad + N/2; + int *Ad_1st_half = Ad; + mymul<<>>(Ad_2nd_half, CONST_KER2_VAL); + mymul<<>>(Ad_1st_half, CONST_KER1_VAL); + HIP_CHECK(hipEventRecord(e2, stream2)); + HIP_CHECK(hipStreamWaitEvent(stream3, e2, 0)); + mymul<<>>(Ad_1st_half, CONST_KER3_VAL); + HIPCHECK(hipLaunchHostFunc(stream3, fn, nullptr)); + HIP_CHECK(hipEventRecord(e6, stream2)); + HIP_CHECK(hipStreamWaitEvent(stream1, e6, 0)); + HIP_CHECK(hipEventRecord(e5, stream5)); + HIP_CHECK(hipStreamWaitEvent(stream1, e5, 0)); + myadd<<>>(Ad, Bd); + HIP_CHECK(hipEventRecord(e3, stream3)); + HIP_CHECK(hipStreamWaitEvent(stream1, e3, 0)); + HIP_CHECK(hipEventRecord(e4, stream4)); + HIP_CHECK(hipStreamWaitEvent(stream1, e4, 0)); + HIP_CHECK(hipMemcpyAsync(Ch, Ad, (N*sizeof(int)), + hipMemcpyDefault, stream1)); + HIP_CHECK(hipStreamEndCapture(stream1, &graph)); // End Capture + // Execute and test the graph + // Create Executable Graphs + hipGraphExec_t graphExec{nullptr}; + // Verify graph1 + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + for (int iter = 0; iter < LAUNCH_ITERS; iter++) { + init_input(Ah, N); + init_input(Bh, N); + HIP_CHECK(hipGraphLaunch(graphExec, stream1)); + HIP_CHECK(hipStreamSynchronize(stream1)); + for (size_t i = 0; i < N; i++) { + if (i > (N/2 - 1)) { + REQUIRE(Ch[i] == (Bh[i] + Ah[i]*CONST_KER2_VAL)); + } else { + REQUIRE(Ch[i] == (Bh[i] + Ah[i]*CONST_KER1_VAL*CONST_KER3_VAL)); + } + } + } + REQUIRE(gCbackIter == (2*LAUNCH_ITERS)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + // Free Device memory and Host memory + HipTest::freeArrays(Ad, Bd, nullptr, Ah, Bh, Ch, false); + // Destroy graph, events and streams + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipEventDestroy(e6)); + HIP_CHECK(hipEventDestroy(e5)); + HIP_CHECK(hipEventDestroy(e4)); + HIP_CHECK(hipEventDestroy(e3)); + HIP_CHECK(hipEventDestroy(e2)); + HIP_CHECK(hipEventDestroy(e1)); + HIP_CHECK(hipEventDestroy(e0)); + HIP_CHECK(hipStreamDestroy(stream5)); + HIP_CHECK(hipStreamDestroy(stream4)); + HIP_CHECK(hipStreamDestroy(stream3)); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} +/* Test scenario 19 + */ +TEST_CASE("Unit_hipStreamBeginCapture_captureEmptyStreams") { + hipGraph_t graph; + hipStream_t stream1, stream2, stream3; + // Stream and event create + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + HIP_CHECK(hipStreamCreate(&stream3)); + hipEvent_t e0, e1, e2; + HIP_CHECK(hipEventCreate(&e0)); + HIP_CHECK(hipEventCreate(&e1)); + HIP_CHECK(hipEventCreate(&e2)); + // Capture streams into graph + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(e0, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, e0, 0)); + HIP_CHECK(hipStreamWaitEvent(stream3, e0, 0)); + HIP_CHECK(hipEventRecord(e1, stream2)); + HIP_CHECK(hipStreamWaitEvent(stream1, e1, 0)); + HIP_CHECK(hipEventRecord(e2, stream3)); + HIP_CHECK(hipStreamWaitEvent(stream1, e2, 0)); + HIP_CHECK(hipStreamEndCapture(stream1, &graph)); // End Capture + size_t numNodes = 0; + HIP_CHECK(hipGraphGetNodes(graph, nullptr, &numNodes)); + REQUIRE(numNodes == 0); + // Destroy graph, events and streams + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipEventDestroy(e2)); + HIP_CHECK(hipEventDestroy(e1)); + HIP_CHECK(hipEventDestroy(e0)); + HIP_CHECK(hipStreamDestroy(stream3)); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipStreamDestroy(stream1)); +} diff --git a/catch/unit/graph/hipStreamEndCapture.cc b/catch/unit/graph/hipStreamEndCapture.cc index 740764d4b2..22762268ab 100644 --- a/catch/unit/graph/hipStreamEndCapture.cc +++ b/catch/unit/graph/hipStreamEndCapture.cc @@ -17,421 +17,190 @@ OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -/** -Negative Testcase Scenarios : -1) Pass stream as nullptr and verify there is no crash, api returns error code. -2) Pass graph as nullptr and verify there is no crash, api returns error code. -3) Pass graph as nullptr and and stream as hipStreamPerThread verify there - is no crash, api returns error code. -4) End capture on stream where capture has not yet started and verify - error code is returned. -5) Destroy stream and try to end capture. -6) Destroy Graph and try to end capture. -7) Begin capture on a thread with mode other than hipStreamCaptureModeRelaxed - and try to end capture from different thread. Expect to return - hipErrorStreamCaptureWrongThread. -8) Start stream capture on stream1 using mode hipStreamCaptureModeRelaxed. - In stream1 queue a memcpy operation, queue a kernel square of a number operation. - Launch a thread. In the thread, queue a memcpy operation. End the capture on - stream1 and return the captured graph. Wait for the thread in main function. - Create an executable graph and launch the graph on input data and validate the - output. -9) Create 2 streams s1 and s2. Begin stream capture in s1, spawn a - captured fork stream on s2. Queue some operations - (like increment kernel) on both s1 and s2. End the stream capture - on s2 and verify the error returned by the End capture. -10)Create 2 streams s1 and s2. Begin stream capture in s1 and spawn a captured - fork stream s2. In main thread, queue a memcpy operation on s1. - Launch a thread, queue a memcpy operation on s2. Perform hipEventRecord on - s2 and wait Event on S1. Wait for the thread to complete. Queue operations - kernel addition(Cd = Ad + Bd) operation and memcpy(Ch <- Cd) in s1. End the - stream capture in s1. Create an executable graph and launch the graph on input - data and validate the output. -*/ - #include #include +#include + +#include "stream_capture_common.hh" + +/** + * @addtogroup hipStreamEndCapture hipStreamEndCapture + * @{ + * @ingroup GraphTest + * `hipStreamEndCapture(hipStream_t stream, hipGraph_t *pGraph)` - + * ends capture on a stream, returning the captured graph + */ + +/** + * Test Description + * ------------------------ + * - Test to verify API behavior with invalid arguments: + * -# End capture on legacy/null stream + * -# End capture when graph is nullptr + * -# End capture on stream where capture has not yet started + * -# Destroy stream and try to end capture + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamEndCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_hipStreamEndCapture_Negative_Parameters") { + hipGraph_t graph{nullptr}; + const auto stream_type = GENERATE(Streams::perThread, Streams::created); + StreamGuard stream_guard(stream_type); + hipStream_t stream = stream_guard.stream(); -TEST_CASE("Unit_hipStreamEndCapture_Negative") { - hipError_t ret; SECTION("Pass stream as nullptr") { - hipGraph_t graph; - ret = hipStreamEndCapture(nullptr, &graph); - REQUIRE(hipErrorIllegalState == ret); + HIP_CHECK_ERROR(hipStreamEndCapture(nullptr, &graph), hipErrorIllegalState); } #if HT_NVIDIA SECTION("Pass graph as nullptr") { - hipStream_t stream; - HIP_CHECK(hipStreamCreate(&stream)); - ret = hipStreamEndCapture(stream, nullptr); - REQUIRE(hipErrorInvalidValue == ret); - HIP_CHECK(hipStreamDestroy(stream)); - } - SECTION("Pass graph as nullptr and stream as hipStreamPerThread") { - ret = hipStreamEndCapture(hipStreamPerThread, nullptr); - REQUIRE(hipErrorInvalidValue == ret); + HIP_CHECK_ERROR(hipStreamEndCapture(stream, nullptr), hipErrorInvalidValue); } #endif SECTION("End capture on stream where capture has not yet started") { - hipStream_t stream; - hipGraph_t graph; - HIP_CHECK(hipStreamCreate(&stream)); - ret = hipStreamEndCapture(stream, &graph); - REQUIRE(hipErrorIllegalState == ret); - HIP_CHECK(hipStreamDestroy(stream)); + HIP_CHECK_ERROR(hipStreamEndCapture(stream, &graph), hipErrorIllegalState); } SECTION("Destroy stream and try to end capture") { - hipStream_t stream; - hipGraph_t graph; - HIP_CHECK(hipStreamCreate(&stream)); - HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipStreamDestroy(stream)); - ret = hipStreamEndCapture(stream, &graph); - REQUIRE(hipErrorContextIsDestroyed == ret); - } - SECTION("Destroy graph and try to end capture in between") { - hipStream_t stream{nullptr}; - hipGraph_t graph{nullptr}; - constexpr unsigned blocks = 512; - constexpr unsigned threadsPerBlock = 256; - constexpr size_t N = 100000; - size_t Nbytes = N * sizeof(float); - float *A_d, *C_d; - float *A_h, *C_h; - - A_h = reinterpret_cast(malloc(Nbytes)); - C_h = reinterpret_cast(malloc(Nbytes)); - REQUIRE(A_h != nullptr); - REQUIRE(C_h != nullptr); - - // Fill with Phi + i - for (size_t i = 0; i < N; i++) { - A_h[i] = 1.618f + i; - } - - HIP_CHECK(hipMalloc(&A_d, Nbytes)); - HIP_CHECK(hipMalloc(&C_d, Nbytes)); - REQUIRE(A_d != nullptr); - REQUIRE(C_d != nullptr); - - HIP_CHECK(hipStreamCreate(&stream)); - HIP_CHECK(hipGraphCreate(&graph, 0)); - HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream)); - - HIP_CHECK(hipMemsetAsync(C_d, 0, Nbytes, stream)); - hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), - dim3(threadsPerBlock), 0, stream, A_d, C_d, N); - HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream)); - - HIP_CHECK(hipGraphDestroy(graph)); - ret = hipStreamEndCapture(stream, &graph); - REQUIRE(hipSuccess == ret); - - free(A_h); - free(C_h); - HIP_CHECK(hipFree(A_d)); - HIP_CHECK(hipFree(C_d)); - HIP_CHECK(hipStreamDestroy(stream)); + hipStream_t destroyed_stream; + HIP_CHECK(hipStreamCreate(&destroyed_stream)); + HIP_CHECK(hipStreamBeginCapture(destroyed_stream, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipStreamDestroy(destroyed_stream)); + HIP_CHECK_ERROR(hipStreamEndCapture(destroyed_stream, &graph), hipErrorContextIsDestroyed); } } -static void thread_func(hipStream_t stream, hipGraph_t graph) { - HIP_ASSERT(hipErrorStreamCaptureWrongThread == - hipStreamEndCapture(stream, &graph)); -} -static void StreamEndCaptureThreadNegative(float* A_d, float* A_h, - float* C_d, float* C_h, hipStreamCaptureMode mode) { - hipStream_t stream{nullptr}; +/** + * Test Description + * ------------------------ + * - Test to verify no error occurs when graph is destroyed before capture + * ends + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamEndCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_hipStreamEndCapture_Positive_GraphDestroy") { hipGraph_t graph{nullptr}; - constexpr unsigned blocks = 512; - constexpr unsigned threadsPerBlock = 256; - constexpr size_t N = 100000; + constexpr size_t N = 1000000; size_t Nbytes = N * sizeof(float); - HIP_CHECK(hipStreamCreate(&stream)); + LinearAllocGuard A_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard B_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard A_d(LinearAllocs::hipMalloc, Nbytes); + + StreamGuard stream_guard(Streams::created); + hipStream_t stream = stream_guard.stream(); + + const hipStreamCaptureMode captureMode = hipStreamCaptureModeGlobal; HIP_CHECK(hipGraphCreate(&graph, 0)); - HIP_CHECK(hipStreamBeginCapture(stream, mode)); - HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream)); - HIP_CHECK(hipMemsetAsync(C_d, 0, Nbytes, stream)); - hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), - dim3(threadsPerBlock), 0, stream, A_d, C_d, N); - HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamBeginCapture(stream, captureMode)); + captureSequenceSimple(A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), N, stream); - std::thread t(thread_func, stream, graph); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipStreamEndCapture(stream, &graph)); +} + +static void thread_func_neg(hipStream_t stream, hipGraph_t graph) { + HIP_ASSERT(hipErrorStreamCaptureWrongThread == hipStreamEndCapture(stream, &graph)); +} + +/** + * Test Description + * ------------------------ + * - Test to verify that when capture is initiated on a thread with mode + * other than hipStreamCaptureModeRelaxed and try to end capture from different + * thread, it is expected to return hipErrorStreamCaptureWrongThread + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamEndCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_hipStreamEndCapture_Negative_Thread") { + constexpr size_t N = 1000000; + size_t Nbytes = N * sizeof(float); + + LinearAllocGuard A_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard B_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard A_d(LinearAllocs::hipMalloc, Nbytes); + + hipGraph_t graph{nullptr}; + StreamGuard stream_guard(Streams::created); + hipStream_t stream = stream_guard.stream(); + + const hipStreamCaptureMode captureMode = hipStreamCaptureModeGlobal; + HIP_CHECK(hipGraphCreate(&graph, 0)); + + HIP_CHECK(hipStreamBeginCapture(stream, captureMode)); + captureSequenceSimple(A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), N, stream); + + std::thread t(thread_func_neg, stream, graph); t.join(); #if HT_AMD HIP_CHECK(hipStreamEndCapture(stream, &graph)); #endif - HIP_CHECK(hipStreamDestroy(stream)); + HIP_CHECK(hipGraphDestroy(graph)); } -TEST_CASE("Unit_hipStreamEndCapture_Thread_Negative") { - constexpr size_t N = 100000; - size_t Nbytes = N * sizeof(float); - float *A_d, *C_d; - float *A_h, *C_h; - A_h = reinterpret_cast(malloc(Nbytes)); - C_h = reinterpret_cast(malloc(Nbytes)); - REQUIRE(A_h != nullptr); - REQUIRE(C_h != nullptr); - - // Fill with Phi + i - for (size_t i = 0; i < N; i++) { - A_h[i] = 1.618f + i; - } - - HIP_CHECK(hipMalloc(&A_d, Nbytes)); - HIP_CHECK(hipMalloc(&C_d, Nbytes)); - REQUIRE(A_d != nullptr); - REQUIRE(C_d != nullptr); - - SECTION("Capture Mode:hipStreamCaptureModeGlobal") { - StreamEndCaptureThreadNegative(A_d, A_h, C_d, C_h, - hipStreamCaptureModeGlobal); - } - SECTION("Capture Mode:hipStreamCaptureModeThreadLocal") { - StreamEndCaptureThreadNegative(A_d, A_h, C_d, C_h, - hipStreamCaptureModeThreadLocal); - } - free(A_h); - free(C_h); - HIP_CHECK(hipFree(A_d)); - HIP_CHECK(hipFree(C_d)); -} -// Thread function -static void thread_func1(hipStream_t stream, hipGraph_t *graph, - size_t Nbytes, float* A_d, float* B_h) { - HIP_CHECK(hipMemcpyAsync(B_h, A_d, Nbytes, hipMemcpyDeviceToHost, stream)); +static void thread_func_pos(hipStream_t stream, hipGraph_t* graph) { HIP_CHECK(hipStreamEndCapture(stream, graph)); } -/* - * Start stream capture on stream1 using mode hipStreamCaptureModeRelaxed. - * In stream1 queue a memcpy operation, queue a kernel square of a number operation. - * Launch a thread. In the thread, queue a memcpy operation. End the capture on - * stream1 and return the captured graph. Wait for the thread in main function. - * Create an executable graph and launch the graph on input data and validate the output. - * */ -TEST_CASE("Unit_hipStreamEndCapture_mode_hipStreamCaptureModeRelaxed") { - hipStream_t stream{nullptr}, streamForGraph{nullptr}; - hipGraph_t graph{nullptr}; - constexpr unsigned blocks = 512; - constexpr unsigned threadsPerBlock = 256; - constexpr size_t N = 10; + +/** + * Test Description + * ------------------------ + * - Test to verify that when capture is initiated on a thread with + * hipStreamCaptureModeRelaxed mode, end capture in a different thread is + * successful + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamEndCapture.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_hipStreamEndCapture_Positive_Thread") { + constexpr size_t N = 1000000; size_t Nbytes = N * sizeof(float); - // Device Pointers - float *A_d; - // Host Pointers - float *A_h, *B_h, *C_h; - // Memory allocation to Host pointers - A_h = reinterpret_cast(malloc(Nbytes)); - B_h = reinterpret_cast(malloc(Nbytes)); - C_h = reinterpret_cast(malloc(Nbytes)); - REQUIRE(A_h != nullptr); - REQUIRE(B_h != nullptr); - REQUIRE(C_h != nullptr); + LinearAllocGuard A_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard B_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard A_d(LinearAllocs::hipMalloc, Nbytes); - // Initialize the Host data - for (size_t i = 0; i < N; i++) { - A_h[i] = 1.0f + i; - C_h[i] = A_h[i]; - } - // Memory allocation to Device pointers - HIP_CHECK(hipMalloc(reinterpret_cast(&A_d), Nbytes)); - REQUIRE(A_d != nullptr); - - HIP_CHECK(hipStreamCreate(&stream)); - HIP_CHECK(hipStreamCreate(&streamForGraph)); - HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeRelaxed)); - // Copy data from Host to Device - HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream)); - - hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), - dim3(threadsPerBlock), 0, stream, A_d, A_d, N); - // Thread Launch - std::thread t(thread_func1, stream, &graph, Nbytes, A_d, B_h); - t.join(); - - // Launch the graph - hipGraphExec_t graphExec; - HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); - HIP_CHECK(hipStreamSynchronize(streamForGraph)); - - // Output verification - for (size_t i = 0; i < N; i++) { - C_h[i] = C_h[i] * C_h[i]; - REQUIRE(B_h[i] == C_h[i]); - } - - free(A_h); - free(B_h); - free(C_h); - HIP_CHECK(hipFree(A_d)); - HIP_CHECK(hipStreamDestroy(stream)); - HIP_CHECK(hipStreamDestroy(streamForGraph)); - HIP_CHECK(hipGraphDestroy(graph)); - HIP_CHECK(hipGraphExecDestroy(graphExec)); -} - -static __global__ void increment(int* A_d) { - atomicAdd(A_d, 1); -} -/* - * Create 2 streams s1 and s2. Begin stream capture in s1, spawn a - * captured fork stream on s2. Queue some operations - * (like increment kernel) on both s1 and s2. End the stream capture - * on s2 and verify the error returned by the End capture. -*/ -TEST_CASE("Unit_hipStreamEndCapture_chkError_on_wrongStream") { - int *A_d{nullptr}, *A_h{nullptr}; - hipStream_t stream1{nullptr}, stream2{nullptr}; - hipEvent_t forkStreamEvent{nullptr}; hipGraph_t graph{nullptr}; - hipError_t err; - constexpr unsigned blocks = 512; - constexpr unsigned threadsPerBlock = 256; - size_t Nbytes = sizeof(int); + hipGraphExec_t graphExec{nullptr}; + StreamGuard stream_guard(Streams::created); + hipStream_t stream = stream_guard.stream(); - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); - HIP_CHECK(hipEventCreate(&forkStreamEvent)); + const hipStreamCaptureMode captureMode = hipStreamCaptureModeRelaxed; - A_h = reinterpret_cast(malloc(Nbytes)); - REQUIRE(A_h != nullptr); - // Initialize the Host data - *A_h = 0; - HIP_CHECK(hipMalloc(reinterpret_cast(&A_d), Nbytes)); - REQUIRE(A_d != nullptr); + HIP_CHECK(hipStreamBeginCapture(stream, captureMode)); + captureSequenceSimple(A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), N, stream); - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - HIP_CHECK(hipEventRecord(forkStreamEvent, stream1)); - HIP_CHECK(hipStreamWaitEvent(stream2, forkStreamEvent, 0)); - - HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, - hipMemcpyHostToDevice, stream1)); - - hipLaunchKernelGGL(increment, dim3(blocks), - dim3(threadsPerBlock), 0, stream1, A_d); - hipLaunchKernelGGL(increment, dim3(blocks), - dim3(threadsPerBlock), 0, stream2, A_d); - - err = hipStreamEndCapture(stream2, &graph); - REQUIRE(err == hipErrorStreamCaptureUnmatched); - - HIP_CHECK(hipStreamDestroy(stream1)); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipEventDestroy(forkStreamEvent)); - free(A_h); - HIP_CHECK(hipFree(A_d)); -} -static void thread_func4(hipStream_t stream1, hipStream_t stream2, - hipEvent_t event, size_t Nbytes, int* B_d, int* B_h) { - HIP_CHECK(hipMemcpyAsync(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream2)); - HIP_CHECK(hipEventRecord(event, stream2)); - HIP_CHECK(hipStreamWaitEvent(stream1, event, 0)); -} -/* - * Create 2 streams s1 and s2. Begin stream capture in s1 and spawn a captured - * fork stream s2. In main thread, queue a memcpy operation on s1. - * Launch a thread, queue a memcpy operation on s2. Perform hipEventRecord on - * s2 and wait Event on S1. Wait for the thread to complete. Queue operations - * kernel addition(Cd = Ad + Bd) operation and memcpy(Ch <- Cd) in s1. End the - * stream capture in s1. Create an executable graph and launch the graph on input - * data and validate the output. - * */ -TEST_CASE("Unit_hipStreamEndCapture_streamMerge_in_thread") { - // Device Pointers - int *A_d, *B_d, *C_d; - // Host Pointers - int *A_h, *B_h, *C_h, *D_h; - hipStream_t stream1{nullptr}, stream2{nullptr}, streamForGraph{nullptr}; - hipEvent_t forkStreamEvent{nullptr}, event{nullptr}; - hipGraph_t graph{nullptr}; - - constexpr unsigned blocks = 512; - constexpr unsigned threadsPerBlock = 256; - constexpr size_t N = 5; - size_t Nbytes = N * sizeof(int); - - HIP_CHECK(hipStreamCreate(&stream1)); - HIP_CHECK(hipStreamCreate(&stream2)); - HIP_CHECK(hipStreamCreate(&streamForGraph)); - HIP_CHECK(hipEventCreate(&forkStreamEvent)); - HIP_CHECK(hipEventCreate(&event)); - // Memory allocation to Host Pointers - A_h = reinterpret_cast(malloc(Nbytes)); - B_h = reinterpret_cast(malloc(Nbytes)); - C_h = reinterpret_cast(malloc(Nbytes)); - D_h = reinterpret_cast(malloc(Nbytes)); - REQUIRE(A_h != nullptr); - REQUIRE(B_h != nullptr); - REQUIRE(C_h != nullptr); - REQUIRE(D_h != nullptr); - // Initialize the Host data - for (size_t i = 0; i < N; i++) { - A_h[i] = 1 + i; - B_h[i] = 2 + i; - C_h[i] = 0; - D_h[i] = 0; - } - // Memory allocation to Device Pointers - HIP_CHECK(hipMalloc(reinterpret_cast(&A_d), Nbytes)); - HIP_CHECK(hipMalloc(reinterpret_cast(&B_d), Nbytes)); - HIP_CHECK(hipMalloc(reinterpret_cast(&C_d), Nbytes)); - REQUIRE(A_d != nullptr); - REQUIRE(B_d != nullptr); - REQUIRE(C_d != nullptr); - - // Begin Capture - HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); - - HIP_CHECK(hipEventRecord(forkStreamEvent, stream1)); - HIP_CHECK(hipStreamWaitEvent(stream2, forkStreamEvent, 0)); - - HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, - hipMemcpyHostToDevice, stream1)); - // Thread Launch - std::thread t(thread_func4, stream1, stream2, event, Nbytes, B_d, B_h); + std::thread t(thread_func_pos, stream, &graph); t.join(); - // Launch kernal - hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), - dim3(threadsPerBlock), 0, stream1, A_d, - B_d, C_d, N); + // Validate end capture is successful + REQUIRE(graph != nullptr); - HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, - hipMemcpyDeviceToHost, stream1)); - HIP_CHECK(hipStreamEndCapture(stream1, &graph)); - - // Launch graph - hipGraphExec_t graphExec; HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); - HIP_CHECK(hipStreamSynchronize(streamForGraph)); - // Verify Output - for (size_t i = 0; i < N; i++) { - D_h[i] = A_h[i] + B_h[i]; - REQUIRE(C_h[i] == D_h[i]); + // Replay the recorded sequence multiple times + for (int i = 0; i < kLaunchIters; i++) { + std::fill_n(A_h.host_ptr(), N, static_cast(i)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + ArrayFindIfNot(B_h.host_ptr(), static_cast(i), N); } HIP_CHECK(hipGraphExecDestroy(graphExec)); HIP_CHECK(hipGraphDestroy(graph)); - HIP_CHECK(hipStreamDestroy(stream1)); - HIP_CHECK(hipStreamDestroy(stream2)); - HIP_CHECK(hipEventDestroy(forkStreamEvent)); - HIP_CHECK(hipStreamDestroy(streamForGraph)); - - // Release the memory - free(A_h); - free(B_h); - free(C_h); - free(D_h); - HIP_CHECK(hipFree(A_d)); - HIP_CHECK(hipFree(B_d)); - HIP_CHECK(hipFree(C_d)); } diff --git a/catch/unit/graph/hipStreamEndCapture_old.cc b/catch/unit/graph/hipStreamEndCapture_old.cc new file mode 100644 index 0000000000..740764d4b2 --- /dev/null +++ b/catch/unit/graph/hipStreamEndCapture_old.cc @@ -0,0 +1,437 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/** +Negative Testcase Scenarios : +1) Pass stream as nullptr and verify there is no crash, api returns error code. +2) Pass graph as nullptr and verify there is no crash, api returns error code. +3) Pass graph as nullptr and and stream as hipStreamPerThread verify there + is no crash, api returns error code. +4) End capture on stream where capture has not yet started and verify + error code is returned. +5) Destroy stream and try to end capture. +6) Destroy Graph and try to end capture. +7) Begin capture on a thread with mode other than hipStreamCaptureModeRelaxed + and try to end capture from different thread. Expect to return + hipErrorStreamCaptureWrongThread. +8) Start stream capture on stream1 using mode hipStreamCaptureModeRelaxed. + In stream1 queue a memcpy operation, queue a kernel square of a number operation. + Launch a thread. In the thread, queue a memcpy operation. End the capture on + stream1 and return the captured graph. Wait for the thread in main function. + Create an executable graph and launch the graph on input data and validate the + output. +9) Create 2 streams s1 and s2. Begin stream capture in s1, spawn a + captured fork stream on s2. Queue some operations + (like increment kernel) on both s1 and s2. End the stream capture + on s2 and verify the error returned by the End capture. +10)Create 2 streams s1 and s2. Begin stream capture in s1 and spawn a captured + fork stream s2. In main thread, queue a memcpy operation on s1. + Launch a thread, queue a memcpy operation on s2. Perform hipEventRecord on + s2 and wait Event on S1. Wait for the thread to complete. Queue operations + kernel addition(Cd = Ad + Bd) operation and memcpy(Ch <- Cd) in s1. End the + stream capture in s1. Create an executable graph and launch the graph on input + data and validate the output. +*/ + +#include +#include + +TEST_CASE("Unit_hipStreamEndCapture_Negative") { + hipError_t ret; + SECTION("Pass stream as nullptr") { + hipGraph_t graph; + ret = hipStreamEndCapture(nullptr, &graph); + REQUIRE(hipErrorIllegalState == ret); + } +#if HT_NVIDIA + SECTION("Pass graph as nullptr") { + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + ret = hipStreamEndCapture(stream, nullptr); + REQUIRE(hipErrorInvalidValue == ret); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass graph as nullptr and stream as hipStreamPerThread") { + ret = hipStreamEndCapture(hipStreamPerThread, nullptr); + REQUIRE(hipErrorInvalidValue == ret); + } +#endif + SECTION("End capture on stream where capture has not yet started") { + hipStream_t stream; + hipGraph_t graph; + HIP_CHECK(hipStreamCreate(&stream)); + ret = hipStreamEndCapture(stream, &graph); + REQUIRE(hipErrorIllegalState == ret); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Destroy stream and try to end capture") { + hipStream_t stream; + hipGraph_t graph; + HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipStreamDestroy(stream)); + ret = hipStreamEndCapture(stream, &graph); + REQUIRE(hipErrorContextIsDestroyed == ret); + } + SECTION("Destroy graph and try to end capture in between") { + hipStream_t stream{nullptr}; + hipGraph_t graph{nullptr}; + constexpr unsigned blocks = 512; + constexpr unsigned threadsPerBlock = 256; + constexpr size_t N = 100000; + size_t Nbytes = N * sizeof(float); + float *A_d, *C_d; + float *A_h, *C_h; + + A_h = reinterpret_cast(malloc(Nbytes)); + C_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(A_h != nullptr); + REQUIRE(C_h != nullptr); + + // Fill with Phi + i + for (size_t i = 0; i < N; i++) { + A_h[i] = 1.618f + i; + } + + HIP_CHECK(hipMalloc(&A_d, Nbytes)); + HIP_CHECK(hipMalloc(&C_d, Nbytes)); + REQUIRE(A_d != nullptr); + REQUIRE(C_d != nullptr); + + HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream)); + + HIP_CHECK(hipMemsetAsync(C_d, 0, Nbytes, stream)); + hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), + dim3(threadsPerBlock), 0, stream, A_d, C_d, N); + HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream)); + + HIP_CHECK(hipGraphDestroy(graph)); + ret = hipStreamEndCapture(stream, &graph); + REQUIRE(hipSuccess == ret); + + free(A_h); + free(C_h); + HIP_CHECK(hipFree(A_d)); + HIP_CHECK(hipFree(C_d)); + HIP_CHECK(hipStreamDestroy(stream)); + } +} + +static void thread_func(hipStream_t stream, hipGraph_t graph) { + HIP_ASSERT(hipErrorStreamCaptureWrongThread == + hipStreamEndCapture(stream, &graph)); +} +static void StreamEndCaptureThreadNegative(float* A_d, float* A_h, + float* C_d, float* C_h, hipStreamCaptureMode mode) { + hipStream_t stream{nullptr}; + hipGraph_t graph{nullptr}; + constexpr unsigned blocks = 512; + constexpr unsigned threadsPerBlock = 256; + constexpr size_t N = 100000; + size_t Nbytes = N * sizeof(float); + + HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipStreamBeginCapture(stream, mode)); + HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream)); + + HIP_CHECK(hipMemsetAsync(C_d, 0, Nbytes, stream)); + hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), + dim3(threadsPerBlock), 0, stream, A_d, C_d, N); + HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream)); + + std::thread t(thread_func, stream, graph); + t.join(); + +#if HT_AMD + HIP_CHECK(hipStreamEndCapture(stream, &graph)); +#endif + HIP_CHECK(hipStreamDestroy(stream)); + HIP_CHECK(hipGraphDestroy(graph)); +} +TEST_CASE("Unit_hipStreamEndCapture_Thread_Negative") { + constexpr size_t N = 100000; + size_t Nbytes = N * sizeof(float); + float *A_d, *C_d; + float *A_h, *C_h; + + A_h = reinterpret_cast(malloc(Nbytes)); + C_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(A_h != nullptr); + REQUIRE(C_h != nullptr); + + // Fill with Phi + i + for (size_t i = 0; i < N; i++) { + A_h[i] = 1.618f + i; + } + + HIP_CHECK(hipMalloc(&A_d, Nbytes)); + HIP_CHECK(hipMalloc(&C_d, Nbytes)); + REQUIRE(A_d != nullptr); + REQUIRE(C_d != nullptr); + + SECTION("Capture Mode:hipStreamCaptureModeGlobal") { + StreamEndCaptureThreadNegative(A_d, A_h, C_d, C_h, + hipStreamCaptureModeGlobal); + } + SECTION("Capture Mode:hipStreamCaptureModeThreadLocal") { + StreamEndCaptureThreadNegative(A_d, A_h, C_d, C_h, + hipStreamCaptureModeThreadLocal); + } + free(A_h); + free(C_h); + HIP_CHECK(hipFree(A_d)); + HIP_CHECK(hipFree(C_d)); +} +// Thread function +static void thread_func1(hipStream_t stream, hipGraph_t *graph, + size_t Nbytes, float* A_d, float* B_h) { + HIP_CHECK(hipMemcpyAsync(B_h, A_d, Nbytes, hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamEndCapture(stream, graph)); +} +/* + * Start stream capture on stream1 using mode hipStreamCaptureModeRelaxed. + * In stream1 queue a memcpy operation, queue a kernel square of a number operation. + * Launch a thread. In the thread, queue a memcpy operation. End the capture on + * stream1 and return the captured graph. Wait for the thread in main function. + * Create an executable graph and launch the graph on input data and validate the output. + * */ +TEST_CASE("Unit_hipStreamEndCapture_mode_hipStreamCaptureModeRelaxed") { + hipStream_t stream{nullptr}, streamForGraph{nullptr}; + hipGraph_t graph{nullptr}; + constexpr unsigned blocks = 512; + constexpr unsigned threadsPerBlock = 256; + constexpr size_t N = 10; + size_t Nbytes = N * sizeof(float); + // Device Pointers + float *A_d; + // Host Pointers + float *A_h, *B_h, *C_h; + + // Memory allocation to Host pointers + A_h = reinterpret_cast(malloc(Nbytes)); + B_h = reinterpret_cast(malloc(Nbytes)); + C_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(A_h != nullptr); + REQUIRE(B_h != nullptr); + REQUIRE(C_h != nullptr); + + // Initialize the Host data + for (size_t i = 0; i < N; i++) { + A_h[i] = 1.0f + i; + C_h[i] = A_h[i]; + } + // Memory allocation to Device pointers + HIP_CHECK(hipMalloc(reinterpret_cast(&A_d), Nbytes)); + REQUIRE(A_d != nullptr); + + HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipStreamCreate(&streamForGraph)); + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeRelaxed)); + // Copy data from Host to Device + HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream)); + + hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), + dim3(threadsPerBlock), 0, stream, A_d, A_d, N); + // Thread Launch + std::thread t(thread_func1, stream, &graph, Nbytes, A_d, B_h); + t.join(); + + // Launch the graph + hipGraphExec_t graphExec; + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); + HIP_CHECK(hipStreamSynchronize(streamForGraph)); + + // Output verification + for (size_t i = 0; i < N; i++) { + C_h[i] = C_h[i] * C_h[i]; + REQUIRE(B_h[i] == C_h[i]); + } + + free(A_h); + free(B_h); + free(C_h); + HIP_CHECK(hipFree(A_d)); + HIP_CHECK(hipStreamDestroy(stream)); + HIP_CHECK(hipStreamDestroy(streamForGraph)); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); +} + +static __global__ void increment(int* A_d) { + atomicAdd(A_d, 1); +} +/* + * Create 2 streams s1 and s2. Begin stream capture in s1, spawn a + * captured fork stream on s2. Queue some operations + * (like increment kernel) on both s1 and s2. End the stream capture + * on s2 and verify the error returned by the End capture. +*/ +TEST_CASE("Unit_hipStreamEndCapture_chkError_on_wrongStream") { + int *A_d{nullptr}, *A_h{nullptr}; + hipStream_t stream1{nullptr}, stream2{nullptr}; + hipEvent_t forkStreamEvent{nullptr}; + hipGraph_t graph{nullptr}; + hipError_t err; + constexpr unsigned blocks = 512; + constexpr unsigned threadsPerBlock = 256; + size_t Nbytes = sizeof(int); + + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + HIP_CHECK(hipEventCreate(&forkStreamEvent)); + + A_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(A_h != nullptr); + // Initialize the Host data + *A_h = 0; + HIP_CHECK(hipMalloc(reinterpret_cast(&A_d), Nbytes)); + REQUIRE(A_d != nullptr); + + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipEventRecord(forkStreamEvent, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, forkStreamEvent, 0)); + + HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, + hipMemcpyHostToDevice, stream1)); + + hipLaunchKernelGGL(increment, dim3(blocks), + dim3(threadsPerBlock), 0, stream1, A_d); + hipLaunchKernelGGL(increment, dim3(blocks), + dim3(threadsPerBlock), 0, stream2, A_d); + + err = hipStreamEndCapture(stream2, &graph); + REQUIRE(err == hipErrorStreamCaptureUnmatched); + + HIP_CHECK(hipStreamDestroy(stream1)); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipEventDestroy(forkStreamEvent)); + free(A_h); + HIP_CHECK(hipFree(A_d)); +} +static void thread_func4(hipStream_t stream1, hipStream_t stream2, + hipEvent_t event, size_t Nbytes, int* B_d, int* B_h) { + HIP_CHECK(hipMemcpyAsync(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream2)); + HIP_CHECK(hipEventRecord(event, stream2)); + HIP_CHECK(hipStreamWaitEvent(stream1, event, 0)); +} +/* + * Create 2 streams s1 and s2. Begin stream capture in s1 and spawn a captured + * fork stream s2. In main thread, queue a memcpy operation on s1. + * Launch a thread, queue a memcpy operation on s2. Perform hipEventRecord on + * s2 and wait Event on S1. Wait for the thread to complete. Queue operations + * kernel addition(Cd = Ad + Bd) operation and memcpy(Ch <- Cd) in s1. End the + * stream capture in s1. Create an executable graph and launch the graph on input + * data and validate the output. + * */ +TEST_CASE("Unit_hipStreamEndCapture_streamMerge_in_thread") { + // Device Pointers + int *A_d, *B_d, *C_d; + // Host Pointers + int *A_h, *B_h, *C_h, *D_h; + hipStream_t stream1{nullptr}, stream2{nullptr}, streamForGraph{nullptr}; + hipEvent_t forkStreamEvent{nullptr}, event{nullptr}; + hipGraph_t graph{nullptr}; + + constexpr unsigned blocks = 512; + constexpr unsigned threadsPerBlock = 256; + constexpr size_t N = 5; + size_t Nbytes = N * sizeof(int); + + HIP_CHECK(hipStreamCreate(&stream1)); + HIP_CHECK(hipStreamCreate(&stream2)); + HIP_CHECK(hipStreamCreate(&streamForGraph)); + HIP_CHECK(hipEventCreate(&forkStreamEvent)); + HIP_CHECK(hipEventCreate(&event)); + // Memory allocation to Host Pointers + A_h = reinterpret_cast(malloc(Nbytes)); + B_h = reinterpret_cast(malloc(Nbytes)); + C_h = reinterpret_cast(malloc(Nbytes)); + D_h = reinterpret_cast(malloc(Nbytes)); + REQUIRE(A_h != nullptr); + REQUIRE(B_h != nullptr); + REQUIRE(C_h != nullptr); + REQUIRE(D_h != nullptr); + // Initialize the Host data + for (size_t i = 0; i < N; i++) { + A_h[i] = 1 + i; + B_h[i] = 2 + i; + C_h[i] = 0; + D_h[i] = 0; + } + // Memory allocation to Device Pointers + HIP_CHECK(hipMalloc(reinterpret_cast(&A_d), Nbytes)); + HIP_CHECK(hipMalloc(reinterpret_cast(&B_d), Nbytes)); + HIP_CHECK(hipMalloc(reinterpret_cast(&C_d), Nbytes)); + REQUIRE(A_d != nullptr); + REQUIRE(B_d != nullptr); + REQUIRE(C_d != nullptr); + + // Begin Capture + HIP_CHECK(hipStreamBeginCapture(stream1, hipStreamCaptureModeGlobal)); + + HIP_CHECK(hipEventRecord(forkStreamEvent, stream1)); + HIP_CHECK(hipStreamWaitEvent(stream2, forkStreamEvent, 0)); + + HIP_CHECK(hipMemcpyAsync(A_d, A_h, Nbytes, + hipMemcpyHostToDevice, stream1)); + // Thread Launch + std::thread t(thread_func4, stream1, stream2, event, Nbytes, B_d, B_h); + t.join(); + // Launch kernal + hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), + dim3(threadsPerBlock), 0, stream1, A_d, + B_d, C_d, N); + + HIP_CHECK(hipMemcpyAsync(C_h, C_d, Nbytes, + hipMemcpyDeviceToHost, stream1)); + HIP_CHECK(hipStreamEndCapture(stream1, &graph)); + + // Launch graph + hipGraphExec_t graphExec; + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); + HIP_CHECK(hipStreamSynchronize(streamForGraph)); + + // Verify Output + for (size_t i = 0; i < N; i++) { + D_h[i] = A_h[i] + B_h[i]; + REQUIRE(C_h[i] == D_h[i]); + } + + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipStreamDestroy(stream1)); + HIP_CHECK(hipStreamDestroy(stream2)); + HIP_CHECK(hipEventDestroy(forkStreamEvent)); + HIP_CHECK(hipStreamDestroy(streamForGraph)); + + // Release the memory + free(A_h); + free(B_h); + free(C_h); + free(D_h); + HIP_CHECK(hipFree(A_d)); + HIP_CHECK(hipFree(B_d)); + HIP_CHECK(hipFree(C_d)); +} diff --git a/catch/unit/graph/hipStreamUpdateCaptureDependencies.cc b/catch/unit/graph/hipStreamUpdateCaptureDependencies.cc new file mode 100644 index 0000000000..871ede9ea4 --- /dev/null +++ b/catch/unit/graph/hipStreamUpdateCaptureDependencies.cc @@ -0,0 +1,472 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include + +#include "stream_capture_common.hh" + +/** + * @addtogroup hipStreamUpdateCaptureDependencies + * hipStreamUpdateCaptureDependencies + * @{ + * @ingroup GraphTest + * `hipStreamUpdateCaptureDependencies(hipStream_t stream, hipGraphNode_t + * *dependencies, size_t numDependencies, unsigned int flags __dparm(0)))` - + * update the set of dependencies in a capturing stream + */ + +static __global__ void vectorSet(const float* A_d, float* B_d, int64_t NELEM) { + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (size_t i = offset; i < NELEM; i += stride) { + B_d[i] = A_d[i]; + } +} + +static __global__ void vectorSum(const float* A_d, const float* B_d, float* C_d, size_t NELEM) { + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (size_t i = offset; i < NELEM; i += stride) { + C_d[i] = A_d[i] + B_d[i] + C_d[i]; + } +} + +/* Local Function for setting new dependency + */ +static void UpdateStreamCaptureDependenciesSet(hipStream_t stream, + hipStreamCaptureMode captureMode) { + constexpr size_t N = 1000000; + constexpr unsigned blocks = 512; + constexpr unsigned threadsPerBlock = 256; + size_t Nbytes = N * sizeof(float); + + hipStreamCaptureStatus captureStatus{hipStreamCaptureStatusNone}; + hipGraph_t capInfoGraph{nullptr}; + const hipGraphNode_t* nodelist{}; + size_t numDependencies; + + LinearAllocGuard A_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard B_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard C_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard A_d(LinearAllocs::hipMalloc, Nbytes); + LinearAllocGuard B_d(LinearAllocs::hipMalloc, Nbytes); + LinearAllocGuard C_d(LinearAllocs::hipMalloc, Nbytes); + + hipGraph_t graph{nullptr}; + hipGraphExec_t graphExec{nullptr}; + EventsGuard events_guard(3); + StreamsGuard streams_guard(2); + + HIP_CHECK(hipStreamBeginCapture(stream, captureMode)); + captureSequenceBranched(A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), B_d.ptr(), N, stream, + streams_guard.stream_list(), events_guard.event_list()); + + constexpr int numDepsCreated = 2; // Num of dependencies created + + HIP_CHECK(hipStreamGetCaptureInfo_v2(stream, &captureStatus, nullptr, &capInfoGraph, &nodelist, + &numDependencies)); + REQUIRE(captureStatus == hipStreamCaptureStatusActive); + REQUIRE(capInfoGraph != nullptr); + REQUIRE(numDependencies == numDepsCreated); + + SECTION("Set dependency to independent Memcpy node") { + // Create memcpy node and set it as a capture dependency in graph + hipMemcpy3DParms myparams{}; + hipGraphNode_t memcpyNodeC{}; + + memset(&myparams, 0x0, sizeof(hipMemcpy3DParms)); + myparams.srcPos = make_hipPos(0, 0, 0); + myparams.dstPos = make_hipPos(0, 0, 0); + myparams.extent = make_hipExtent(Nbytes, 1, 1); + myparams.srcPtr = make_hipPitchedPtr(C_h.host_ptr(), Nbytes, N, 1); + myparams.dstPtr = make_hipPitchedPtr(C_d.ptr(), Nbytes, N, 1); + myparams.kind = hipMemcpyHostToDevice; + + HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNodeC, capInfoGraph, nullptr, 0, &myparams)); + + // Replace capture dependency with new memcpy node created. + // Further nodes captured in stream will depend on the new memcpy node. + HIP_CHECK(hipStreamUpdateCaptureDependencies(stream, &memcpyNodeC, 1, + hipStreamSetCaptureDependencies)); + + HIP_CHECK(hipStreamGetCaptureInfo_v2(stream, &captureStatus, nullptr, &capInfoGraph, &nodelist, + &numDependencies)); + + // Verify updating dependency is taking effect. + REQUIRE(numDependencies == 1); + REQUIRE(nodelist[0] == memcpyNodeC); + + hipLaunchKernelGGL(HipTest::vector_square, dim3(blocks), dim3(threadsPerBlock), 0, stream, + C_d.ptr(), C_d.ptr(), N); + } + + SECTION("Set dependency to Kernel node depending on graph branch") { + hipGraphNode_t kernelNode{}; + hipKernelNodeParams kernelNodeParams{}; + + // Add node to modify vector sqr result and plug-in the nod + float* C_ptr = C_d.ptr(); + float* A_ptr = A_d.ptr(); + size_t NElem{N}; + void* kernelArgs[] = {&A_ptr, &C_ptr, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vector_square); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kernelNode, capInfoGraph, &nodelist[0], 1, &kernelNodeParams)); + + // Replace capture dependency with new kernel node created. + // Further nodes captured in stream will depend on the new kernel node. + HIP_CHECK(hipStreamUpdateCaptureDependencies(stream, &kernelNode, 1, + hipStreamSetCaptureDependencies)); + + HIP_CHECK(hipStreamGetCaptureInfo_v2(stream, &captureStatus, nullptr, &capInfoGraph, &nodelist, + &numDependencies)); + + // Verify updating dependency is taking effect. + REQUIRE(numDependencies == 1); + REQUIRE(nodelist[0] == kernelNode); + } + + HIP_CHECK(hipMemcpyAsync(B_h.ptr(), C_d.ptr(), Nbytes, hipMemcpyDeviceToHost, stream)); + + HIP_CHECK(hipStreamEndCapture(stream, &graph)); + REQUIRE(graph != nullptr); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + + // Replay the recorded sequence multiple times + for (int i = 0; i < kLaunchIters; i++) { + std::fill_n(A_h.host_ptr(), N, static_cast(i)); + std::fill_n(C_h.host_ptr(), N, static_cast(i)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + ArrayFindIfNot(B_h.host_ptr(), static_cast(i) * static_cast(i), N); + } + + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +/* Local Function for adding new dependency + */ +static void UpdateStreamCaptureDependenciesAdd(hipStream_t stream, + hipStreamCaptureMode captureMode) { + constexpr size_t N = 1000000; + constexpr unsigned blocks = 512; + constexpr unsigned threadsPerBlock = 256; + size_t Nbytes = N * sizeof(float); + + hipStreamCaptureStatus captureStatus{hipStreamCaptureStatusNone}; + hipGraph_t capInfoGraph{nullptr}; + const hipGraphNode_t* nodelist{}; + size_t numDependencies; + + LinearAllocGuard A_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard B_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard C_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard A_d(LinearAllocs::hipMalloc, Nbytes); + LinearAllocGuard B_d(LinearAllocs::hipMalloc, Nbytes); + LinearAllocGuard C_d(LinearAllocs::hipMalloc, Nbytes); + + hipGraph_t graph{nullptr}; + hipGraphExec_t graphExec{nullptr}; + EventsGuard events_guard(3); + StreamsGuard streams_guard(2); + + HIP_CHECK(hipStreamBeginCapture(stream, captureMode)); + captureSequenceBranched(A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), B_d.ptr(), N, stream, + streams_guard.stream_list(), events_guard.event_list()); + + constexpr int numDepsCreated = 2; // Num of dependencies created + + HIP_CHECK(hipStreamGetCaptureInfo_v2(stream, &captureStatus, nullptr, &capInfoGraph, &nodelist, + &numDependencies)); + REQUIRE(captureStatus == hipStreamCaptureStatusActive); + REQUIRE(capInfoGraph != nullptr); + REQUIRE(numDependencies == numDepsCreated); + + SECTION("Add Dependency to independant Memcpy node") { + // Create memcpy node and add it as additional dependency in graph + hipMemcpy3DParms myparams{}; + hipGraphNode_t memcpyNodeC{}; + + memset(&myparams, 0x0, sizeof(hipMemcpy3DParms)); + myparams.srcPos = make_hipPos(0, 0, 0); + myparams.dstPos = make_hipPos(0, 0, 0); + myparams.extent = make_hipExtent(Nbytes, 1, 1); + myparams.srcPtr = make_hipPitchedPtr(C_h.host_ptr(), Nbytes, N, 1); + myparams.dstPtr = make_hipPitchedPtr(C_d.ptr(), Nbytes, N, 1); + myparams.kind = hipMemcpyHostToDevice; + + HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNodeC, capInfoGraph, nullptr, 0, &myparams)); + + // Add/Append additional dependency MemcpyNodeC to the existing set. + // Further nodes captured in stream will depend on Memcpy nodes A, B and C. + HIP_CHECK(hipStreamUpdateCaptureDependencies(stream, &memcpyNodeC, 1, + hipStreamAddCaptureDependencies)); + HIP_CHECK(hipStreamGetCaptureInfo_v2(stream, &captureStatus, nullptr, &capInfoGraph, &nodelist, + &numDependencies)); + + REQUIRE(numDependencies == numDepsCreated + 1); + + hipLaunchKernelGGL(vectorSum, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d.ptr(), + C_d.ptr(), B_d.ptr(), N); + } + + SECTION("Add Dependency to Kernel node depending on graph branch") { + hipGraphNode_t kernelNode{}; + hipKernelNodeParams kernelNodeParams{}; + + // Add node to modify vector sqr result and plug-in the nod + float* C_ptr = C_d.ptr(); + float* A_ptr = A_d.ptr(); + size_t NElem{N}; + void* kernelArgs[] = {&A_ptr, &C_ptr, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(vectorSet); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kernelNode, capInfoGraph, &nodelist[0], 1, &kernelNodeParams)); + + // Add/Append additional dependency addNode to the existing set. + HIP_CHECK(hipStreamUpdateCaptureDependencies(stream, &kernelNode, 1, + hipStreamAddCaptureDependencies)); + + HIP_CHECK(hipStreamGetCaptureInfo_v2(stream, &captureStatus, nullptr, &capInfoGraph, &nodelist, + &numDependencies)); + + REQUIRE(numDependencies == numDepsCreated + 1); + + hipLaunchKernelGGL(vectorSum, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d.ptr(), + C_d.ptr(), B_d.ptr(), N); + } + + HIP_CHECK(hipMemcpyAsync(B_h.ptr(), B_d.ptr(), Nbytes, hipMemcpyDeviceToHost, stream)); + + HIP_CHECK(hipStreamEndCapture(stream, &graph)); + REQUIRE(graph != nullptr); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + + // Replay the recorded sequence multiple times + for (int i = 0; i < kLaunchIters; i++) { + std::fill_n(A_h.host_ptr(), N, static_cast(i)); + std::fill_n(C_h.host_ptr(), N, static_cast(i)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + ArrayFindIfNot(B_h.host_ptr(), static_cast(i) * 2, N); + } + + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +/** + * Test Description + * ------------------------ + * - Test to verify replacing existing dependency set with new nodes by + * calling the api with flag hipStreamSetCaptureDependencies for + * created/hipStreamPerThread for all capture modes. Verify updated dependency + * list is taking effect: + * -# Replace existing dependencies with a new memcpy node that has no + * dependencies + * -# Replace existing dependencies with a new kernel node which depends + * on a previously captured sequence + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamUpdateCaptureDependencies.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.3 + */ +TEST_CASE("Unit_hipStreamSetCaptureDependencies_Positive_Functional") { + const auto stream_type = GENERATE(Streams::perThread, Streams::created); + StreamGuard stream_guard(stream_type); + hipStream_t stream = stream_guard.stream(); + + const hipStreamCaptureMode captureMode = GENERATE( + hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed); + + UpdateStreamCaptureDependenciesSet(stream, captureMode); +} + +/** + * Test Description + * ------------------------ + * - Test to verify adding additional depencies in the flow by calling the + * api with flag hipStreamAddCaptureDependencies for created/hipStreamPerThread + * for all capture modes. Verify updated dependency list is taking effect: + * -# Add new memcpy node that has no parent to the existing dependecies + * -# Add new kernel node which depends on a previously captured sequence + * to the existing dependencies + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamUpdateCaptureDependencies.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.3 + */ +TEST_CASE("Unit_hipStreamAddCaptureDependencies_Positive_Functional") { + const auto stream_type = GENERATE(Streams::perThread, Streams::created); + StreamGuard stream_guard(stream_type); + hipStream_t stream = stream_guard.stream(); + + const hipStreamCaptureMode captureMode = GENERATE( + hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed); + + UpdateStreamCaptureDependenciesAdd(stream, captureMode); +} + +/** + * Test Description + * ------------------------ + * - Test to verify when dependencies are passed as nullptr and numDeps as 0, + * api returns success + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamUpdateCaptureDependencies.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.3 + */ +TEST_CASE("Unit_hipStreamUpdateCaptureDependencies_Positive_Parameters") { + hipGraph_t graph{nullptr}; + + const auto stream_type = GENERATE(Streams::perThread, Streams::created); + StreamGuard stream_guard(stream_type); + hipStream_t stream = stream_guard.stream(); + + const hipStreamCaptureMode captureMode = GENERATE( + hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed); + const hipStreamUpdateCaptureDependenciesFlags flag = + GENERATE(hipStreamAddCaptureDependencies, hipStreamSetCaptureDependencies); + + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + + HIP_CHECK(hipStreamUpdateCaptureDependencies(stream, nullptr, 0, flag)); + + HIP_CHECK(hipStreamEndCapture(stream, &graph)); + + HIP_CHECK(hipGraphDestroy(graph)); +} + +/** + * Test Description + * ------------------------ + * - Test to verify API behavior with invalid arguments: + * -# Pass Dependencies as nullptr and numDeps as nonzero + * -# numDeps exceeds actual number of nodes + * -# Invalid flag is passed + * -# Dependency node is a un-initialized/invalid parameter + * -# Stream is not capturing + * Test source + * ------------------------ + * - catch\unit\graph\hipStreamUpdateCaptureDependencies.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.3 + */ +TEST_CASE("Unit_hipStreamUpdateCaptureDependencies_Negative_Parameters") { + const int Nbytes = 100; + hipGraph_t capInfoGraph{nullptr}; + hipGraph_t graph{nullptr}; + + hipStreamCaptureStatus captureStatus; + size_t numDependencies; + const hipGraphNode_t* nodelist{}; + hipGraphNode_t memsetNode{}; + std::vector dependencies; + + LinearAllocGuard A_d(LinearAllocs::hipMalloc, Nbytes); + + StreamGuard stream_guard(Streams::created); + hipStream_t stream = stream_guard.stream(); + + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + HIP_CHECK(hipMemsetAsync(A_d.ptr(), 0, Nbytes, stream)); + + HIP_CHECK(hipStreamGetCaptureInfo_v2(stream, &captureStatus, nullptr, &capInfoGraph, &nodelist, + &numDependencies)); + + hipMemsetParams memsetParams{}; + memsetParams.dst = reinterpret_cast(A_d.ptr()); + memsetParams.value = 1; + memsetParams.pitch = 0; + memsetParams.elementSize = sizeof(char); + memsetParams.width = Nbytes; + memsetParams.height = 1; + HIP_CHECK( + hipGraphAddMemsetNode(&memsetNode, capInfoGraph, nodelist, numDependencies, &memsetParams)); + dependencies.push_back(memsetNode); + + SECTION("Dependencies as nullptr and numDeps as nonzero") { + HIP_CHECK_ERROR(hipStreamUpdateCaptureDependencies(stream, nullptr, dependencies.size(), + hipStreamAddCaptureDependencies), + hipErrorInvalidValue); + } + + SECTION("Invalid flag") { + constexpr int invalidFlag = 20; + HIP_CHECK_ERROR(hipStreamUpdateCaptureDependencies(stream, dependencies.data(), + dependencies.size(), invalidFlag), + hipErrorInvalidValue); + } + +#if HT_NVIDIA // EXSWHTEC-227 + SECTION("numDeps exceeding actual number of nodes") { + HIP_CHECK_ERROR( + hipStreamUpdateCaptureDependencies(stream, dependencies.data(), dependencies.size() + 1, + hipStreamAddCaptureDependencies), + hipErrorInvalidValue); + } + + SECTION("depnode as un-initialized/invalid parameter") { + hipGraphNode_t uninit_node{}; + HIP_CHECK_ERROR(hipStreamUpdateCaptureDependencies(stream, &uninit_node, 1, + hipStreamAddCaptureDependencies), + hipErrorInvalidValue); + } +#endif + +#if HT_AMD // EXSWHTEC-227 + HIP_CHECK(hipStreamUpdateCaptureDependencies(stream, dependencies.data(), dependencies.size(), + hipStreamAddCaptureDependencies)); +#endif + + HIP_CHECK(hipStreamEndCapture(stream, &graph)); + + SECTION("Stream is not capturing") { + HIP_CHECK_ERROR( + hipStreamUpdateCaptureDependencies(stream, dependencies.data(), dependencies.size(), + hipStreamAddCaptureDependencies), + hipErrorIllegalState); + } + + HIP_CHECK(hipGraphDestroy(graph)); +} diff --git a/catch/unit/graph/hipThreadExchangeStreamCaptureMode.cc b/catch/unit/graph/hipThreadExchangeStreamCaptureMode.cc new file mode 100644 index 0000000000..c35fc18900 --- /dev/null +++ b/catch/unit/graph/hipThreadExchangeStreamCaptureMode.cc @@ -0,0 +1,151 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include + +#include "stream_capture_common.hh" + + +/** + * @addtogroup hipThreadExchangeStreamCaptureMode + * hipThreadExchangeStreamCaptureMode + * @{ + * @ingroup GraphTest + * `hipThreadExchangeStreamCaptureMode(hipStreamCaptureMode *mode)` - + * swaps the stream capture mode of a thread + */ + +/* Local Function for swaping stream capture mode of a thread + */ +static void hipGraphLaunchWithMode(hipStream_t stream, hipStreamCaptureMode mode) { + constexpr size_t N = 1024; + size_t Nbytes = N * sizeof(float); + constexpr float fill_value = 5.0f; + + hipGraph_t graph{nullptr}; + hipGraphExec_t graphExec{nullptr}; + + LinearAllocGuard A_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard B_h(LinearAllocs::malloc, Nbytes); + LinearAllocGuard A_d(LinearAllocs::hipMalloc, Nbytes); + LinearAllocGuard B_d(LinearAllocs::hipMalloc, Nbytes); + float* C_d; + + HIP_CHECK(hipThreadExchangeStreamCaptureMode(&mode)); + + HIP_CHECK(hipStreamBeginCapture(stream, mode)); + + captureSequenceLinear(A_h.host_ptr(), A_d.ptr(), B_h.host_ptr(), B_d.ptr(), N, stream); + captureSequenceCompute(A_d.ptr(), B_h.host_ptr(), B_d.ptr(), N, stream); + + if (mode == hipStreamCaptureModeRelaxed) { + HIP_CHECK(hipMalloc(&C_d, Nbytes)); + } + + HIP_CHECK(hipStreamEndCapture(stream, &graph)); + + // Validate end capture is successful + REQUIRE(graph != nullptr); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + + std::fill_n(A_h.host_ptr(), N, fill_value); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Validate the computation + ArrayFindIfNot(B_h.host_ptr(), fill_value * fill_value, N); + if (mode == hipStreamCaptureModeRelaxed) { + HIP_CHECK(hipFree(C_d)); + } + + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +void threadFuncCaptureMode(hipStream_t stream, hipStreamCaptureMode mode) { + hipGraphLaunchWithMode(stream, mode); +} + +/** + * Test Description + * ------------------------ + * - Test to verify basic functionality for API that swaps the stream capture + * mode of a thread. All combinations for main and other thread capture modes + * are tested + * Test source + * ------------------------ + * - catch\unit\graph\hipThreadExchangeStreamCaptureMode.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.3 + */ +TEST_CASE("Unit_hipThreadExchangeStreamCaptureMode_Positive_Functional") { + StreamGuard stream_guard(Streams::created); + hipStream_t stream = stream_guard.stream(); + + const hipStreamCaptureMode captureModeMain = GENERATE( + hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed); + const hipStreamCaptureMode captureModeThread = GENERATE( + hipStreamCaptureModeGlobal, hipStreamCaptureModeThreadLocal, hipStreamCaptureModeRelaxed); + + hipGraphLaunchWithMode(stream, captureModeMain); + std::thread t(threadFuncCaptureMode, stream, captureModeThread); + t.join(); +} + +/** + * Test Description + * ------------------------ + * - Test to verify API behavior with invalid arguments: + * -# Mode as nullptr + * -# Mode as -1 + * -# Mode as INT_MAX + * -# Mode other than existing 3 modes (hipStreamCaptureModeRelaxed + 1) + * Test source + * ------------------------ + * - catch\unit\graph\hipThreadExchangeStreamCaptureMode.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.3 + */ +#if HT_AMD // getting error in Cuda Setup +TEST_CASE("Unit_hipThreadExchangeStreamCaptureMode_Negative_Parameters") { + hipStreamCaptureMode mode; + + SECTION("Pass Mode as nullptr") { + HIP_CHECK_ERROR(hipThreadExchangeStreamCaptureMode(nullptr), hipErrorInvalidValue); + } + SECTION("Pass Mode as -1") { + mode = hipStreamCaptureMode(-1); + HIP_CHECK_ERROR(hipThreadExchangeStreamCaptureMode(&mode), hipErrorInvalidValue); + } + SECTION("Pass Mode as INT_MAX") { + mode = hipStreamCaptureMode(INT_MAX); + HIP_CHECK_ERROR(hipThreadExchangeStreamCaptureMode(&mode), hipErrorInvalidValue); + } + SECTION("Pass Mode as hipStreamCaptureModeRelaxed + 1") { + mode = hipStreamCaptureMode(hipStreamCaptureModeRelaxed + 1); + HIP_CHECK_ERROR(hipThreadExchangeStreamCaptureMode(&mode), hipErrorInvalidValue); + } +} +#endif