From a291af23ba1fe2d6fbfeb18e808bb9d0858ea6b2 Mon Sep 17 00:00:00 2001 From: mbhiutra Date: Mon, 5 Feb 2024 17:06:55 +0530 Subject: [PATCH] SWDEV-431704 - [catch2][dtest] Added test case for HIPGraph Performance Change-Id: Ia09b50cab74f154ac5a0005f27386da1875fbe38 [ROCm/hip-tests commit: 5c1bc4a4c3ab4d87580e5d16252a9c3b27506414] --- .../catch/unit/graph/hipGraphPerf.cc | 2451 ++++++++++++++++- 1 file changed, 2425 insertions(+), 26 deletions(-) diff --git a/projects/hip-tests/catch/unit/graph/hipGraphPerf.cc b/projects/hip-tests/catch/unit/graph/hipGraphPerf.cc index 5f37495a25..d457c9b011 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphPerf.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphPerf.cc @@ -1,5 +1,5 @@ /* -Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights @@ -28,12 +28,15 @@ THE SOFTWARE. * Optimize HIPGraph Performance. */ +#ifdef __linux__ // windows machine build failing refer ticket SWDEV-440611 + #include #include #include +#include #ifdef _WIN64 -#define setenv(x,y,z) _putenv_s(x,y) +#define setenv(x, y, z) _putenv_s(x, y) #endif static constexpr int N = 1024; @@ -42,6 +45,11 @@ static size_t NElem{N}; static constexpr int blocksPerCU = 6; // to hide latency static constexpr int threadsPerBlock = 256; +__device__ int globalTo1[N]; +__device__ int globalTo2[N]; +__device__ int globalFrom1[N]; +__device__ int globalFrom2[N]; + static bool verifyVectorSquare(int *A_h, int* C_h, size_t size) { for (size_t i = 0; i < size; i++) { if (C_h[i] != A_h[i] * A_h[i]) { @@ -52,9 +60,9 @@ static bool verifyVectorSquare(int *A_h, int* C_h, size_t size) { return true; } -/* - Added 2 nodes of MemCpy, and multiple node if Kernel call in continous +/* - Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous sequence and copy back the result and verify. */ -static void checkGraphContinousKernelCall(const unsigned int kNumNode) { +static void checkGraphcontinuousKernelCall(const unsigned int kNumNode) { unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); hipGraphNode_t memCpy1, memCpy2, memCpy3; std::vector kNode(kNumNode); @@ -110,7 +118,7 @@ static void checkGraphContinousKernelCall(const unsigned int kNumNode) { /* - Added multiple nodes of MemCpy, Kernel node continuously for 2 block & copy back result in MemCpy. */ -static void checkGraphContinousKernelCallIn2Blocks( +static void checkGraphcontinuousKernelCallIn2Blocks( const unsigned int kNumNode1, const unsigned int kNumNode2) { unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4; @@ -313,48 +321,2439 @@ static void checkGraphMemcpyMemsetKernelMixCall(const unsigned int kNumIter) { HIP_CHECK(hipStreamDestroy(stream)); } +/* - Added a EventRecordNode at start and 2 nodes of MemCpy, and multiple + node of Kernel call in continuous sequence and copy back the result and + add EventRecordNode at the end and verify the result. */ +static void checkGraphEventcontinuousKernelCall(const unsigned int kNumNode) { + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3; + std::vector kNode(kNumNode); + hipGraph_t graph; + hipGraphExec_t graphExec; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipGraphCreate(&graph, 0)); + + hipGraphNode_t event_start, event_final; + hipEvent_t eventstart, eventend; + HIP_CHECK(hipEventCreate(&eventstart)); + HIP_CHECK(hipEventCreate(&eventend)); + + HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, + nullptr, 0, eventstart)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, + nullptr, 0, eventend)); + + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memCpy1, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, + Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_final, 1)); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipEventSynchronize(eventend)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipStreamDestroy(stream)); +} + +/* - Added a EventRecordNode at start and Added multiple nodes of MemCpy, Kernel + node continuously and added one more EventRecordNode in mid for synchronize + and do similar MemCpy, Kernel node continuously in 2nd block & copy back + result in MemCpy and add EventRecordNode at the end for synchronize. */ +static void checkGraphEventcontinuousKernelCallIn2Blocks( + const unsigned int kNumNode1, const unsigned int kNumNode2) { + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4; + std::vector kNode1(kNumNode1), kNode2(kNumNode2); + hipGraph_t graph; + hipGraphExec_t graphExec; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipGraphCreate(&graph, 0)); + + hipGraphNode_t event_start, event_mid, event_final; + hipEvent_t eventstart, eventmid, eventend; + HIP_CHECK(hipEventCreate(&eventstart)); + HIP_CHECK(hipEventCreate(&eventmid)); + HIP_CHECK(hipEventCreate(&eventend)); + + HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, + nullptr, 0, eventstart)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_mid, graph, + nullptr, 0, eventmid)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, + nullptr, 0, eventend)); + + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memCpy1, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode1; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode1[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode1[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode1[i-1], &kNode1[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, + Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode1[kNumNode1-1], &memCpy3, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_mid, 1)); + + for (int i=0; i < kNumNode2; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorSUB); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode2[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &kNode2[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode2[i-1], &kNode2[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, graph, nullptr, 0, C_h, C_d, + Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode2[kNumNode2-1], &memCpy4, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy4, &event_final, 1)); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipEventSynchronize(eventend)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorSUB(A_h, B_h, C_h, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipStreamDestroy(stream)); +} + /** * Test Description * ------------------------ * - Validate hipGraph performance with doorbell set. - * - DEBUG_CLR_GRAPH_ENABLE_BUFFERING and DEBUG_CLR_GRAPH_MAX_AQL_BUFFER_SIZE - * - Added multiple nodes of MemCpy, Kernel in sequence multiple times. - * - Added multiple nodes of MemCpy, MesSet, Kernel in sequence. - * - Added multiple nodes of MemCpy, Kernel node continuously & copy back result in MemCpy. - * - Added multiple nodes of MemCpy, Kernel node continuously for 2 block & copy back result in MemCpy. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Add multiple nodes of MemCpy, Kernel in sequence multiple times. + * 2) Add multiple nodes of MemCpy, MesSet, Kernel in sequence. + * 3) Add multiple nodes of MemCpy, Kernel node continuously & copy back result in MemCpy. + * 4) Add multiple nodes of MemCpy, Kernel node continuously for 2 block & copy back result in MemCpy. + * 5) Add a EventRecordNode at start and 2 nodes of MemCpy, and multiple + node of Kernel call in continuous sequence and copy back the result and + add EventRecordNode at the end and verify the result. + * 6) Add a EventRecordNode at start and Added multiple nodes of MemCpy, Kernel + node continuously and added one more EventRecordNode in mid for synchronize + and do similar MemCpy, Kernel node continuously in 2nd block & copy back + result in MemCpy and add EventRecordNode at the end for synchronize. * Test source * ------------------------ * - unit/graph/hipGraphPerf.cc * Test requirements * ------------------------ - * - HIP_VERSION >= 6.0 + * - HIP_VERSION >= 6.1 */ -TEST_CASE("Unit_hipGraph_Perf_Check_MemcpyKernelMixCall") { - if ((setenv("DEBUG_CLR_GRAPH_ENABLE_BUFFERING", "true", 1)) != 0) { +TEST_CASE("Unit_hipGraph_PerfCheck_MemcpyKernelMixCall") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { HipTest::HIP_SKIP_TEST("Unable to turn on " - "DEBUG_CLR_GRAPH_ENABLE_BUFFERING, hence exit!"); - return; - } - auto BufferSz = GENERATE("25", "35", "15"); - if ((setenv("DEBUG_CLR_GRAPH_MAX_AQL_BUFFER_SIZE", BufferSz, 1)) != 0) { - HipTest::HIP_SKIP_TEST("Unable to turn on " - "DEBUG_CLR_GRAPH_MAX_AQL_BUFFER_SIZE, hence exit!"); + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); return; } constexpr int kNumIter1 = 25; constexpr int kNumIter2 = 30; - constexpr int kNumKernelNode1 = 15; - constexpr int kNumKernelNode2 = 45; + constexpr int kNumKNode1 = 15; + constexpr int kNumKNode2 = 45; checkGraphMemcpyKernelMixCall(kNumIter1); checkGraphMemcpyMemsetKernelMixCall(kNumIter2); - checkGraphContinousKernelCall(kNumKernelNode1); - checkGraphContinousKernelCallIn2Blocks(kNumKernelNode1, kNumKernelNode2); + checkGraphcontinuousKernelCall(kNumKNode1); + checkGraphcontinuousKernelCallIn2Blocks(kNumKNode1, kNumKNode2); + checkGraphEventcontinuousKernelCall(kNumIter1); + checkGraphEventcontinuousKernelCallIn2Blocks(kNumKNode1, kNumKNode2); checkGraphMemcpyKernelMixCall(kNumIter2); checkGraphMemcpyMemsetKernelMixCall(kNumIter1); - checkGraphContinousKernelCall(kNumKernelNode2); - checkGraphContinousKernelCallIn2Blocks(kNumKernelNode2, kNumKernelNode1); + checkGraphcontinuousKernelCall(kNumKNode2); + checkGraphcontinuousKernelCallIn2Blocks(kNumKNode2, kNumKNode1); + checkGraphEventcontinuousKernelCall(kNumIter2); + checkGraphEventcontinuousKernelCallIn2Blocks(kNumKNode2, kNumKNode1); } + +static void hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams( + const hipStream_t& stream) { + constexpr size_t N = 1024; + constexpr size_t Nbytes = N * sizeof(int); + constexpr auto blocksPerCU = 6; // to hide latency + constexpr auto threadsPerBlock = 256; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraph_t graph; + hipGraphNode_t memcpyNode, kNode; + hipKernelNodeParams kNodeParams{}, kNodeParams1{}; + int *A_d, *B_d, *C_d; + int *A_h, *B_h, *C_h; + std::vector dependencies; + hipGraphExec_t graphExec; + size_t NElem{N}; + + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + HIP_CHECK(hipGraphCreate(&graph, 0)); + + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNode, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + dependencies.push_back(memcpyNode); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNode, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + dependencies.push_back(memcpyNode); + + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kNodeParams.gridDim = dim3(blocks); + kNodeParams.blockDim = dim3(threadsPerBlock); + kNodeParams.kernelParams = reinterpret_cast(kernelArgs); + HIP_CHECK(hipGraphAddKernelNode(&kNode, graph, dependencies.data(), + dependencies.size(), &kNodeParams)); + + dependencies.clear(); + dependencies.push_back(kNode); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNode, graph, dependencies.data(), + dependencies.size(), C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + // Instantiate and launch the graph + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + kNodeParams1.func = reinterpret_cast(HipTest::vectorSUB); + kNodeParams1.gridDim = dim3(blocks); + kNodeParams1.blockDim = dim3(threadsPerBlock); + kNodeParams1.kernelParams = reinterpret_cast(kernelArgs); + + // Instantiate again and launch the graph + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + HIP_CHECK(hipGraphExecKernelNodeSetParams(graphExec, kNode, &kNodeParams1)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorSUB(A_h, B_h, C_h, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy, and a node of Kernel call and + Instantiate graph and update kernelNodeParams for last kernel + and copy back the result and verify. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as used created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams(stream); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams(stream); + } + } + } +} + +#if HT_NVIDIA +static void hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop( + const hipStream_t& stream) { + constexpr int kNumNode = 35; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3; + std::vector kNode(kNumNode); + hipGraph_t graph; + hipGraphExec_t graphExec; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, + Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + hipKernelNodeParams kNodeParams1{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kNodeParams1.func = reinterpret_cast(HipTest::vectorSUB); + kNodeParams1.gridDim = dim3(blocks); + kNodeParams1.blockDim = dim3(threadsPerBlock); + kNodeParams1.sharedMemBytes = 0; + kNodeParams1.kernelParams = reinterpret_cast(kernelArgs); + kNodeParams1.extra = nullptr; + HIP_CHECK(hipGraphExecKernelNodeSetParams(graphExec, kNode[kNumNode-1], + &kNodeParams1)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorSUB(A_h, B_h, C_h, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy, & multiple node of Kernel call in continuous sequence + and Instantiate graph & update kernelNodeParams with hipGraphExecKernelNodeSetParams + for last kernel and copy back the result and verify. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as used created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop(stream); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop(stream); + } + } + } +} +#endif +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous + sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParams + for source memCopy3 node and copy back the result and verify. + i) Verifying with different memCopy node kind from D2H -> D2D + ii) Verifying with different memCopy node kind from D2H -> H2D + iii) Verifying with different memCopy node kind from D2H -> H2H + iv) Verifying with different memCopy node kind from D2H -> D2H + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ + +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams") { + constexpr int kNumNode = 1; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3; + std::vector kNode(kNumNode); + hipGraph_t graph; + hipGraphExec_t graphExec; + + int harray1D[N]{}; + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + int *A_d1, *B_d1, *C_d1, *A_h1, *B_h1, *C_h1; + HipTest::initArrays(&A_d1, &B_d1, &C_d1, &A_h1, &B_h1, &C_h1, N, false); + + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipGraphCreate(&graph, 0)); + + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[0], graph, nullptr, 0, + &kernelNodeParams)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[0], 1)); + + hipMemcpy3DParms myparams; + memset(&myparams, 0x0, sizeof(hipMemcpy3DParms)); + myparams.srcPos = make_hipPos(0, 0, 0); + myparams.dstPos = make_hipPos(0, 0, 0); + myparams.extent = make_hipExtent(Nbytes, 1, 1); + myparams.dstPtr = make_hipPitchedPtr(harray1D, Nbytes, Nbytes, 1); + myparams.srcPtr = make_hipPitchedPtr(C_h, Nbytes, Nbytes, 1); + myparams.kind = hipMemcpyHostToHost; + + HIP_CHECK(hipGraphAddMemcpyNode(&memCpy3, graph, nullptr, 0, &myparams)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + +#if HT_NVIDIA + SECTION("Verifying with different memCopy node kind from D2H -> H2H") { + memset(&myparams, 0x0, sizeof(hipMemcpy3DParms)); + myparams.srcPos = make_hipPos(0, 0, 0); + myparams.dstPos = make_hipPos(0, 0, 0); + myparams.extent = make_hipExtent(Nbytes, 1, 1); + myparams.dstPtr = make_hipPitchedPtr(A_h1, Nbytes, Nbytes, 1); + myparams.srcPtr = make_hipPitchedPtr(B_h1, Nbytes, Nbytes, 1); + myparams.kind = hipMemcpyHostToHost; + + HIP_CHECK(hipGraphExecMemcpyNodeSetParams(graphExec, memCpy3, &myparams)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + } +#else + SECTION("Verifying with different memCopy node kind from D2H -> D2D") { + memset(&myparams, 0x0, sizeof(hipMemcpy3DParms)); + myparams.srcPos = make_hipPos(0, 0, 0); + myparams.dstPos = make_hipPos(0, 0, 0); + myparams.extent = make_hipExtent(Nbytes, 1, 1); + myparams.dstPtr = make_hipPitchedPtr(A_d1, Nbytes, Nbytes, 1); + myparams.srcPtr = make_hipPitchedPtr(B_d1, Nbytes, Nbytes, 1); + myparams.kind = hipMemcpyDeviceToDevice; + + HIP_CHECK(hipGraphExecMemcpyNodeSetParams(graphExec, memCpy3, &myparams)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + } + SECTION("Verifying with different memCopy node kind from D2H -> H2D") { + memset(&myparams, 0x0, sizeof(hipMemcpy3DParms)); + myparams.srcPos = make_hipPos(0, 0, 0); + myparams.dstPos = make_hipPos(0, 0, 0); + myparams.extent = make_hipExtent(Nbytes, 1, 1); + myparams.dstPtr = make_hipPitchedPtr(C_d1, Nbytes, Nbytes, 1); + myparams.srcPtr = make_hipPitchedPtr(C_h1, Nbytes, Nbytes, 1); + myparams.kind = hipMemcpyHostToDevice; + + HIP_CHECK(hipGraphExecMemcpyNodeSetParams(graphExec, memCpy3, &myparams)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + } + SECTION("Verifying with different memCopy node kind from D2H -> D2H") { + memset(&myparams, 0x0, sizeof(hipMemcpy3DParms)); + myparams.srcPos = make_hipPos(0, 0, 0); + myparams.dstPos = make_hipPos(0, 0, 0); + myparams.extent = make_hipExtent(Nbytes, 1, 1); + myparams.dstPtr = make_hipPitchedPtr(C_h, Nbytes, Nbytes, 1); + myparams.srcPtr = make_hipPitchedPtr(C_d, Nbytes, Nbytes, 1); + myparams.kind = hipMemcpyDeviceToHost; + + HIP_CHECK(hipGraphExecMemcpyNodeSetParams(graphExec, memCpy3, &myparams)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, B_h, C_h, N); + } +#endif + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HipTest::freeArrays(A_d1, B_d1, C_d1, A_h1, B_h1, C_h1, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipStreamDestroy(stream)); +} + +static void hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop( + const hipStream_t& stream) { + constexpr int kNumNode = 35; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3; + std::vector kNode(kNumNode); + hipGraph_t graph; + hipGraphExec_t graphExec; + + int harray1D[N]{}; + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + int *A_d1, *B_d1, *C_d1, *A_h1, *B_h1, *C_h1; + HipTest::initArrays(&A_d1, &B_d1, &C_d1, &A_h1, &B_h1, &C_h1, N, false); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + + hipMemcpy3DParms myparams; + memset(&myparams, 0x0, sizeof(hipMemcpy3DParms)); + myparams.srcPos = make_hipPos(0, 0, 0); + myparams.dstPos = make_hipPos(0, 0, 0); + myparams.extent = make_hipExtent(Nbytes, 1, 1); + myparams.dstPtr = make_hipPitchedPtr(harray1D, Nbytes, Nbytes, 1); + myparams.srcPtr = make_hipPitchedPtr(C_d, Nbytes, Nbytes, 1); + myparams.kind = hipMemcpyDeviceToHost; + + HIP_CHECK(hipGraphAddMemcpyNode(&memCpy3, graph, nullptr, 0, &myparams)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, B_h, harray1D, N); + + memset(&myparams, 0x0, sizeof(hipMemcpy3DParms)); + myparams.srcPos = make_hipPos(0, 0, 0); + myparams.dstPos = make_hipPos(0, 0, 0); + myparams.extent = make_hipExtent(Nbytes, 1, 1); + myparams.dstPtr = make_hipPitchedPtr(C_h, Nbytes, Nbytes, 1); + myparams.srcPtr = make_hipPitchedPtr(C_d, Nbytes, Nbytes, 1); + myparams.kind = hipMemcpyDeviceToHost; + + HIP_CHECK(hipGraphExecMemcpyNodeSetParams(graphExec, memCpy3, &myparams)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HipTest::freeArrays(A_d1, B_d1, C_d1, A_h1, B_h1, C_h1, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous + sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParams + for source memCopy3 node and copy back the result and verify. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ + +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as user created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop(stream); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop(stream); + } + } + } +} + +static void hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop( + const hipStream_t& stream) { + constexpr int kNumNode = 35; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3; + std::vector kNode(kNumNode); + hipGraph_t graph; + hipGraphExec_t graphExec; + + int *hData = reinterpret_cast(malloc(Nbytes)); + REQUIRE(hData != nullptr); + for (int i=0; i < N; ++i) + hData[i] = 2 * i + 1; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, + Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HIP_CHECK(hipGraphExecMemcpyNodeSetParams1D(graphExec, memCpy2, B_d, + hData, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, hData, C_h, N); + + free(hData); + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous + sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParams1D + for source memCopy2 node and copy back the result and verify. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ + +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as user created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop(stream); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop(stream); + } + } + } +} + +static void hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol( + const hipStream_t& stream) { + constexpr int kNumNode = 35; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4; + std::vector kNode(kNumNode); + hipGraph_t graph; + hipGraphExec_t graphExec; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNodeToSymbol(&memCpy3, graph, nullptr, 0, + HIP_SYMBOL(globalFrom1), C_d, Nbytes, 0, hipMemcpyDeviceToDevice)); + HIP_CHECK(hipGraphAddMemcpyNodeFromSymbol(&memCpy4, graph, nullptr, 0, C_h, + HIP_SYMBOL(globalFrom2), Nbytes, 0, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &memCpy4, 1)); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphExecMemcpyNodeSetParamsFromSymbol(graphExec, memCpy4, C_h, + HIP_SYMBOL(globalFrom1), Nbytes, 0, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous + sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParamsFromSymbol + for source memCopy4 node and copy back the result and verify. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ + +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as user created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol(stream); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol(stream); + } + } + } +} + +static void hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol( + const hipStream_t& stream) { + constexpr int kNumNode = 35; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4; + std::vector kNode(kNumNode); + hipGraph_t graph; + hipGraphExec_t graphExec; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorSUB); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNodeToSymbol(&memCpy3, graph, nullptr, 0, + HIP_SYMBOL(globalTo1), C_d, Nbytes, 0, hipMemcpyDeviceToDevice)); + HIP_CHECK(hipGraphAddMemcpyNodeFromSymbol(&memCpy4, graph, nullptr, 0, C_h, + HIP_SYMBOL(globalTo2), Nbytes, 0, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &memCpy4, 1)); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphExecMemcpyNodeSetParamsToSymbol(graphExec, memCpy3, + HIP_SYMBOL(globalTo2), C_d, Nbytes, 0, hipMemcpyDeviceToDevice)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorSUB(A_h, B_h, C_h, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous + sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParamsToSymbol + for source memCopy3 node and copy back the result and verify. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as user created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol(stream); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol(stream); + } + } + } +} + +static void hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams( + const hipStream_t& stream, int test) { + constexpr int kNumNode = 35; + constexpr int memSetVal = 7, memSetVal2 = 9; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3, memSet; + std::vector kNode(kNumNode); + hipGraph_t graph; + hipGraphExec_t graphExec; + int size, elementSize; + + if (test == 0) { + size = Nbytes; + elementSize = sizeof(char); + } else { + size = N; + elementSize = sizeof(int); + } + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + // Add MemSet Node + hipMemsetParams memsetParams{}; + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = A_d; + memsetParams.value = memSetVal; + memsetParams.elementSize = elementSize; + memsetParams.width = size; + memsetParams.height = 1; + HIP_CHECK(hipGraphAddMemsetNode(&memSet, graph, nullptr, 0, &memsetParams)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &memSet, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memSet, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, + Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + if (test == 0) { + memset(A_h, memSetVal, size); + } else { + for (int i=0; i < N; i++) { + A_h[i] = memSetVal; + } + } + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + // update MemSet Node using Exec + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = A_d; + memsetParams.value = memSetVal2; + memsetParams.elementSize = elementSize; + memsetParams.width = size; + memsetParams.height = 1; + HIP_CHECK(hipGraphExecMemsetNodeSetParams(graphExec, memSet, &memsetParams)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + if (test == 0) { + memset(A_h, memSetVal2, size); + } else { + for (int i=0; i < N; i++) { + A_h[i] = memSetVal2; + } + } + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy and add a memSet node, and multiple node of Kernel + call in continuous sequence and Instantiate graph and update memSet node with + hipGraphExecMemsetNodeSetParams api and verify the result. + i) Verify the memset with reset 1 byte (char size) block. + ii) Verify the memset with reset 4 byte (int size) block. + iii) Check with Multi device case. + iv) Pass stream as user created stream + v) Pass stream as default stream + vi) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as user created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 0); + hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 1); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 0); + hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 1); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 0); + hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 1); + } + } + } +} + +static void hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams( + const hipStream_t& stream) { + constexpr int kNumNode = 35; + constexpr int memSetVal = 7, memSetVal2 = 9; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4; + hipGraphNode_t memSet1, memSet2, childGraphNode; + std::vector kNode(kNumNode); + hipGraph_t graph, childGraph1, childGraph2; + hipGraphExec_t graphExec; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphCreate(&childGraph1, 0)); + HIP_CHECK(hipGraphCreate(&childGraph2, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorSUB); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + + // Add MemSet Node + hipMemsetParams memsetParams{}; + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = C_d; + memsetParams.value = memSetVal; + memsetParams.elementSize = sizeof(char); + memsetParams.width = Nbytes; + memsetParams.height = 1; + HIP_CHECK(hipGraphAddMemsetNode(&memSet1, childGraph1, + nullptr, 0, &memsetParams)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, childGraph1, nullptr, 0, + C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(childGraph1, &memSet1, &memCpy3, 1)); + // Adding childnode to graph + HIP_CHECK(hipGraphAddChildGraphNode(&childGraphNode, graph, + nullptr, 0, childGraph1)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], + &childGraphNode, 1)); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + memset(A_h, memSetVal, Nbytes); + for (unsigned int i = 0; i < N; i++) { + if (A_h[i] != C_h[i]) { + WARN("Validation failed at " << i << "\t" << C_h[i]); + REQUIRE(false); + } + } + + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = C_d; + memsetParams.value = memSetVal2; + memsetParams.elementSize = sizeof(int); + memsetParams.width = N; + memsetParams.height = 1; + HIP_CHECK(hipGraphAddMemsetNode(&memSet2, childGraph2, + nullptr, 0, &memsetParams)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, childGraph2, nullptr, 0, + C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(childGraph2, &memSet2, &memCpy4, 1)); + + // Update the childgraph node + HIP_CHECK(hipGraphExecChildGraphNodeSetParams(graphExec, childGraphNode, + childGraph2)); + // Launch Again and verify it once + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + for (unsigned int i = 0; i < N; i++) { + if (memSetVal2 != C_h[i]) { + WARN("Validation failed at " << i << "\t" << C_h[i]); + REQUIRE(false); + } + } + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +static void hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_Kernel( + const hipStream_t& stream) { + constexpr int kNumNode = 35; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4, childGraphNode; + hipGraphNode_t memCpyC11, memCpyC12, memCpyC13, kNodeC1; + hipGraphNode_t memCpyC21, memCpyC22, memCpyC23, kNodeC2; + std::vector kNode(kNumNode); + hipGraph_t graph, childGraph1, childGraph2; + hipGraphExec_t graphExec; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + int *A_d1, *B_d1, *C_d1, *A_h1, *B_h1, *C_h1; + HipTest::initArrays(&A_d1, &B_d1, &C_d1, &A_h1, &B_h1, &C_h1, N, false); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphCreate(&childGraph1, 0)); + HIP_CHECK(hipGraphCreate(&childGraph2, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorSUB); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + + // Add child graph + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC11, childGraph1, nullptr, 0, + A_d1, A_h1, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC12, childGraph1, nullptr, 0, + B_d1, B_h1, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC11, &memCpyC12, 1)); + + hipKernelNodeParams kNodeParams{}; + void* kernelArgsC[] = {&A_d1, &C_d1, reinterpret_cast(&NElem)}; + kNodeParams.func = reinterpret_cast(HipTest::vector_square); + kNodeParams.gridDim = dim3(blocks); + kNodeParams.blockDim = dim3(threadsPerBlock); + kNodeParams.sharedMemBytes = 0; + kNodeParams.kernelParams = reinterpret_cast(kernelArgsC); + kNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNodeC1, childGraph1, + nullptr, 0, &kNodeParams)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC13, childGraph1, nullptr, 0, + C_h1, C_d1, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC12, &kNodeC1, 1)); + HIP_CHECK(hipGraphAddDependencies(childGraph1, &kNodeC1, &memCpyC13, 1)); + + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, childGraph1, nullptr, 0, + C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC13, &memCpy4, 1)); + + // Adding childnode to graph + HIP_CHECK(hipGraphAddChildGraphNode(&childGraphNode, graph, + nullptr, 0, childGraph1)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], + &childGraphNode, 1)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorSUB(A_h, B_h, C_h, N); // MainGraph o/p verification + REQUIRE(true == verifyVectorSquare(A_h1, C_h1, N)); // ChildGraph o/p verify + + // new child graph + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC21, childGraph2, nullptr, 0, + A_d1, A_h1, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC22, childGraph2, nullptr, 0, + B_d1, B_h1, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC21, &memCpyC22, 1)); + + memset(&kNodeParams, 0x00, sizeof(hipKernelNodeParams)); + void* kernelArgC[] = {&A_d1, &B_d1, &C_d1, reinterpret_cast(&NElem)}; + kNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kNodeParams.gridDim = dim3(blocks); + kNodeParams.blockDim = dim3(threadsPerBlock); + kNodeParams.sharedMemBytes = 0; + kNodeParams.kernelParams = reinterpret_cast(kernelArgC); + kNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNodeC2, childGraph2, + nullptr, 0, &kNodeParams)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC23, childGraph2, nullptr, 0, + C_h1, C_d1, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC22, &kNodeC2, 1)); + HIP_CHECK(hipGraphAddDependencies(childGraph2, &kNodeC2, &memCpyC23, 1)); + + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, childGraph2, nullptr, 0, + C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC23, &memCpy3, 1)); + + // Update the childgraph node + HIP_CHECK(hipGraphExecChildGraphNodeSetParams(graphExec, childGraphNode, + childGraph2)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify modified graph execution result + HipTest::checkVectorSUB(A_h, B_h, C_h, N); // MainGraph o/p verification + HipTest::checkVectorADD(A_h1, B_h1, C_h1, N); // ChildGraph o/p verification + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HipTest::freeArrays(A_d1, B_d1, C_d1, A_h1, B_h1, C_h1, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +static void hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_mKernel( + const hipStream_t& stream) { + constexpr int kNumNode = 35; + constexpr int kNumNodeChild = 45; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4, childGraphNode; + hipGraphNode_t memCpyC11, memCpyC12, memCpyC13; + hipGraphNode_t memCpyC21, memCpyC22, memCpyC23; + std::vector kNode(kNumNode); + std::vector kNodeC1(kNumNodeChild); + std::vector kNodeC2(kNumNodeChild); + hipGraph_t graph, childGraph1, childGraph2; + hipGraphExec_t graphExec; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + int *A_d1, *B_d1, *C_d1, *A_h1, *B_h1, *C_h1; + HipTest::initArrays(&A_d1, &B_d1, &C_d1, &A_h1, &B_h1, &C_h1, N, false); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphCreate(&childGraph1, 0)); + HIP_CHECK(hipGraphCreate(&childGraph2, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorSUB); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + + // Add child graph + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC11, childGraph1, nullptr, 0, + A_d1, A_h1, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC12, childGraph1, nullptr, 0, + B_d1, B_h1, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC11, &memCpyC12, 1)); + + for (int i=0; i < kNumNodeChild; i++) { + hipKernelNodeParams kNodeParams{}; + void* kernelArgs[] = {&A_d1, &C_d1, reinterpret_cast(&NElem)}; + kNodeParams.func = reinterpret_cast(HipTest::vector_square); + kNodeParams.gridDim = dim3(blocks); + kNodeParams.blockDim = dim3(threadsPerBlock); + kNodeParams.sharedMemBytes = 0; + kNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNodeC1[i], childGraph1, nullptr, 0, + &kNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC12, + &kNodeC1[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(childGraph1, &kNodeC1[i-1], + &kNodeC1[i], 1)); + } + } + + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC13, childGraph1, nullptr, 0, + C_h1, C_d1, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(childGraph1, &kNodeC1[kNumNodeChild-1], + &memCpyC13, 1)); + + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, childGraph1, nullptr, 0, + C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC13, &memCpy4, 1)); + + // Adding childnode to graph + HIP_CHECK(hipGraphAddChildGraphNode(&childGraphNode, graph, + nullptr, 0, childGraph1)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], + &childGraphNode, 1)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorSUB(A_h, B_h, C_h, N); // MainGraph o/p verification + REQUIRE(true == verifyVectorSquare(A_h1, C_h1, N)); // ChildGraph o/p verify + + // new child graph + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC21, childGraph2, nullptr, 0, + A_d1, A_h1, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC22, childGraph2, nullptr, 0, + B_d1, B_h1, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC21, &memCpyC22, 1)); + + for (int i=0; i < kNumNodeChild; i++) { + hipKernelNodeParams kNodeParams{}; + void* kernelArgs[] = {&A_d1, &B_d1, &C_d1, + reinterpret_cast(&NElem)}; + kNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kNodeParams.gridDim = dim3(blocks); + kNodeParams.blockDim = dim3(threadsPerBlock); + kNodeParams.sharedMemBytes = 0; + kNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNodeC2[i], childGraph2, nullptr, 0, + &kNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC22, + &kNodeC2[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(childGraph2, &kNodeC2[i-1], + &kNodeC2[i], 1)); + } + } + + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC23, childGraph2, nullptr, 0, + C_h1, C_d1, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(childGraph2, &kNodeC2[kNumNodeChild-1], + &memCpyC23, 1)); + + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, childGraph2, nullptr, 0, + C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC23, &memCpy3, 1)); + + // Update the childgraph node + HIP_CHECK(hipGraphExecChildGraphNodeSetParams(graphExec, childGraphNode, + childGraph2)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify modified graph execution result + HipTest::checkVectorSUB(A_h, B_h, C_h, N); // MainGraph o/p verification + HipTest::checkVectorADD(A_h1, B_h1, C_h1, N); // ChildGraph o/p verification + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HipTest::freeArrays(A_d1, B_d1, C_d1, A_h1, B_h1, C_h1, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous + sequence and add a child graph node in the end and Instantiate graph and + update the graphExec with hipGraphExecChildGraphNodeSetParams for child + graph node which will copy back the result and verify. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + 2) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous + sequence and add a child graph which contain a kernel operation and + add this child graph as a node in the end of main graph & Instantiate graph. + update the graphExec with hipGraphExecChildGraphNodeSetParams for child + graph node with similar topology which will copy back the result and verify. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + 3) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous + sequence and add a child graph which contain a kernel call in continuous + sequence operation and add this child graph as a node in the end of main + graph & Instantiate main graph and launch and check result of main graph. + update the graphExec with hipGraphExecChildGraphNodeSetParams for child + graph node with similar topology which will copy back the result and verify. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as user created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams(stream); + hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_Kernel(stream); + hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_mKernel(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams(stream); + hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_Kernel(stream); + hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_mKernel(stream); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams(stream); + hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_Kernel(stream); + hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_mKernel(stream); + } + } + } +} + +static void hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent( + const hipStream_t& stream) { + constexpr int kNumNode = 35; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraph_t graph; + hipGraphExec_t graphExec; + hipEvent_t event_start, event1_end, event2_end; + HIP_CHECK(hipEventCreate(&event_start)); + HIP_CHECK(hipEventCreate(&event1_end)); + HIP_CHECK(hipEventCreate(&event2_end)); + + hipGraphNode_t memCpy1, memCpy2, memCpy3; + std::vector kNode(kNumNode); + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + // Create nodes with event_start and event1_end + hipGraphNode_t event_start_rec, event_end_rec; + HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, + nullptr, 0, event_start)); + + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &memCpy1, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorSUB); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, + Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, + nullptr, 0, event1_end)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_end_rec, 1)); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorSUB(A_h, B_h, C_h, N); + + float t1 = 0.0f; + HIP_CHECK(hipEventElapsedTime(&t1, event_start, event1_end)); + REQUIRE(t1 > 0.0f); + + // Change the event at event_end_rec node to event2_end + HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, event_end_rec, + event2_end)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorSUB(A_h, B_h, C_h, N); + + // Validate the changed events + float t2 = 0.0f; + HIP_CHECK(hipEventElapsedTime(&t2, event_start, event2_end)); + REQUIRE(t2 > 0.0f); + + // Validate the changed events and initial event + float t3 = 0.0f; + HIP_CHECK(hipEventElapsedTime(&t3, event1_end, event2_end)); + REQUIRE(t3 > 0.0f); + + // Free resources + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipEventDestroy(event_start)); + HIP_CHECK(hipEventDestroy(event1_end)); + HIP_CHECK(hipEventDestroy(event2_end)); +} + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added event start node at the begining and than add 2 nodes of MemCpy, + and multiple node of Kernel call in continuous sequence and add a child + graph node in the end and Instantiate graph and update the graphExec with + hipGraphExecEventRecordNodeSetEvent and added a graph node which will copy + back the result and add event end node at the end and verify the time elapse. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as user created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent(stream); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent(stream); + } + } + } +} + +static void hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent_waitKrnl( + const hipStream_t& stream) { + constexpr int kNumNode = 35; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3; + std::vector kNode(kNumNode); + hipGraph_t graph; + hipGraphExec_t graphExec; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + // Create events + hipEvent_t event, event2; + HIP_CHECK(hipEventCreate(&event)); + HIP_CHECK(hipEventCreate(&event2)); + hipGraphNode_t event_wait_node; + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, + Nbytes, hipMemcpyDeviceToHost)); + + hipGraphNode_t event_start_rec; + HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, + nullptr, 0, event2)); + HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, + nullptr, 0, event)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_start_rec, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, + &event_wait_node, 1)); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec, + event_wait_node, event2)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipEventDestroy(event)); + HIP_CHECK(hipEventDestroy(event2)); +} + +static void hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent( + const hipStream_t& stream) { + constexpr int kNumNode = 45; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3; + std::vector kNode(kNumNode); + hipGraph_t graph; + hipGraphExec_t graphExec; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + // Create events + hipEvent_t event, event2; + HIP_CHECK(hipEventCreate(&event)); + HIP_CHECK(hipEventCreate(&event2)); + hipGraphNode_t event_wait_node; + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, + Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, + nullptr, 0, event)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_wait_node, 1)); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec, + event_wait_node, event2)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipEventDestroy(event)); + HIP_CHECK(hipEventDestroy(event2)); +} + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous + sequence and add a wait event node in the end and Instantiate graph and + update the graphExec with hipGraphExecEventWaitNodeSetEvent node and a + graph node which will copy back the result and add event end node at the + end and verify. + 2) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous + sequence and add a wait event node in the end and Instantiate graph and + add a wait kernel and memcpy node to copy back the result. + update the graphExec with hipGraphExecEventWaitNodeSetEvent node and a + graph node which will copy back the result and add event end node at the + end and verify. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as user created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent(stream); + hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent_waitKrnl(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent(stream); + hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent_waitKrnl(stream); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent(stream); + hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent_waitKrnl(stream); + } + } + } +} + +void callBackFunc_1(void* A_h) { + int* A = reinterpret_cast(A_h); + for (int i = 0; i < N; i++) { + A[i] = i + i; + } +} +static void callBackFunc_1_Verify(int *C_h) { + for (int i = 0; i < N; i++) { + if (C_h[i] != (i + i)) { + INFO("Validation failed i " << i << "C_h[i] " << C_h[i]); + REQUIRE(false); + } + } +} + +void callBackFunc_2(void* A_h) { + int* A = reinterpret_cast(A_h); + for (int i = 0; i < N; i++) { + A[i] = i * i; + } +} +static void callBackFunc_2_Verify(int *C_h) { + for (int i = 0; i < N; i++) { + if (C_h[i] != (i * i)) { + INFO("Validation failed i " << i << "C_h[i] " << C_h[i]); + REQUIRE(false); + } + } +} + +static void hipGraph_PerfCheck_hipGraphExecHostNodeSetParams( + const hipStream_t& stream) { + constexpr int kNumNode = 45; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3; + std::vector kNode(kNumNode); + hipGraph_t graph; + hipGraphExec_t graphExec; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, + Nbytes, hipMemcpyDeviceToHost)); + hipGraphNode_t hostNode; + hipHostNodeParams hostParams = {0, 0}; + hostParams.fn = callBackFunc_1; + hostParams.userData = C_h; + HIP_CHECK(hipGraphAddHostNode(&hostNode, graph, nullptr, 0, &hostParams)); + + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &hostNode, 1)); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify execution result + callBackFunc_1_Verify(C_h); + + hipHostNodeParams sethostParam = {0, 0}; + sethostParam.fn = callBackFunc_2; + sethostParam.userData = C_h; + + HIP_CHECK(hipGraphExecHostNodeSetParams(graphExec, hostNode, &sethostParam)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + callBackFunc_2_Verify(C_h); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy, & multiple node of Kernel call in continuous sequence + and Instantiate graph & add a host node and launch the graph and verify result. + Now update the host node parameters using api hipGraphExecHostNodeSetParams + and verify the result which reflect modified data. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecHostNodeSetParams") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as used created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecHostNodeSetParams(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecHostNodeSetParams(stream); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecHostNodeSetParams(stream); + } + } + } +} + +static void hipGraph_PerfCheck_hipGraphExecUpdate(const hipStream_t& stream) { + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + size_t NElem{N}; + hipGraphNode_t memcpy_A, memcpy_B, memcpy_C; + hipGraphNode_t memcpy_A2, memcpy_B2, memcpy_C2; + hipGraphNode_t kernel_vecADD, kernel_vecSUB; + hipGraph_t graph1, graph2; + hipGraphExec_t graphExec; + hipGraphNode_t hErrorNode_out; + hipGraphExecUpdateResult updateResult_out; + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + HIP_CHECK(hipGraphCreate(&graph1, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, + A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, + B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, + &kernelNodeParams)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, + C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecADD, 1)); + HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecADD, 1)); + HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecADD, &memcpy_C, 1)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HIP_CHECK(hipGraphCreate(&graph2, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A2, graph2, nullptr, 0, + A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B2, graph2, nullptr, 0, + B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + memset(&kernelNodeParams, 0x00, sizeof(hipKernelNodeParams)); + void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorSUB); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs1); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, + &kernelNodeParams)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C2, graph2, nullptr, 0, + C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A2, &kernel_vecSUB, 1)); + HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B2, &kernel_vecSUB, 1)); + HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecSUB, &memcpy_C2, 1)); + HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, + &updateResult_out)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorSUB(A_h, B_h, C_h, N); + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph1)); + HIP_CHECK(hipGraphDestroy(graph2)); +} + + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy & a Kernel node and copy back result using memcpy + and Instantiate graph & update new graph with similar node structure with + api hipGraphExecUpdate and verify the result, the updated node should reflect. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecUpdate") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as used created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecUpdate(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecUpdate(stream); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecUpdate(stream); + } + } + } +} + +#if HT_NVIDIA +static void hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop( + const hipStream_t& stream) { + constexpr int kNumNode = 45; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + hipGraphNode_t memCpy1, memCpy2, memCpy3; + hipGraphNode_t memCpy21, memCpy22, memCpy23; + std::vector kNode(kNumNode); + std::vector kNode2(kNumNode); + hipGraph_t graph, graph2; + hipGraphExec_t graphExec; + hipGraphNode_t hErrorNode_out; + hipGraphExecUpdateResult updateResult_out; + + int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h; + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + int *A_d2, *B_d2, *C_d2, *A_h2, *B_h2, *C_h2; + HipTest::initArrays(&A_d2, &B_d2, &C_d2, &A_h2, &B_h2, &C_h2, N, false); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParams{}; + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + kernelNodeParams.gridDim = dim3(blocks); + kernelNodeParams.blockDim = dim3(threadsPerBlock); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kernelNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, + &kernelNodeParams)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i-1], &kNode[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, + Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode-1], &memCpy3, 1)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HIP_CHECK(hipGraphCreate(&graph2, 0)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy21, graph2, nullptr, 0, A_d2, A_h2, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy22, graph2, nullptr, 0, B_d2, B_h2, + Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddDependencies(graph2, &memCpy21, &memCpy22, 1)); + + for (int i=0; i < kNumNode; i++) { + hipKernelNodeParams kernelNodeParam{}; + void* kernelArgs2[] = {&A_d2, &B_d2, &C_d2, + reinterpret_cast(&NElem)}; + kernelNodeParam.func = reinterpret_cast(HipTest::vectorSUB); + kernelNodeParam.gridDim = dim3(blocks); + kernelNodeParam.blockDim = dim3(threadsPerBlock); + kernelNodeParam.sharedMemBytes = 0; + kernelNodeParam.kernelParams = reinterpret_cast(kernelArgs2); + kernelNodeParam.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kNode2[i], graph2, nullptr, 0, + &kernelNodeParam)); + if (i == 0) { + HIP_CHECK(hipGraphAddDependencies(graph2, &memCpy22, &kNode2[i], 1)); + } else { + HIP_CHECK(hipGraphAddDependencies(graph2, &kNode2[i-1], &kNode2[i], 1)); + } + } + HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy23, graph2, nullptr, 0, C_h2, C_d2, + Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddDependencies(graph2, &kNode2[kNumNode-1], &memCpy23, 1)); + + HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, + &updateResult_out)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + // Verify graph execution result + HipTest::checkVectorSUB(A_h2, B_h2, C_h2, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HipTest::freeArrays(A_d2, B_d2, C_d2, A_h2, B_h2, C_h2, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipGraphDestroy(graph2)); +} + +/** + * Test Description + * ------------------------ + * - Validate hipGraph performance with doorbell set. + * - DEBUG_CLR_GRAPH_PACKET_CAPTURE + * 1) Added 2 nodes of MemCpy & a Kernel node in sequence and copy back result using memcpy + and Instantiate graph & update new graph with similar node structure with + api hipGraphExecUpdate and verify the result, the updated node should reflect. + i) Check with Multi device case. + ii) Pass stream as user created stream + iii) Pass stream as default stream + iv) Pass stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphPerf.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 6.1 + */ +TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop") { + if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) { + HipTest::HIP_SKIP_TEST("Unable to turn on " + "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!"); + return; + } + + hipStream_t stream; + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + SECTION("Multi device test with different type of stream") { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + + SECTION("Pass stream as used created stream") { + HIP_CHECK(hipStreamCreate(&stream)); + hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + SECTION("Pass stream as default stream") { + stream = 0; + hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop(stream); + } + SECTION("Pass stream as hipStreamPerThread") { + stream = hipStreamPerThread; + hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop(stream); + } + } + } +} +#endif +#endif // #if __linux__