rocm-systems/projects/hip-tests/catch/unit/graph/hipGraphPerf.cc

/*
Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

/**
 * @addtogroup hipGraphPerfCheck hipGraphPerfCheck
 * @{
 * @ingroup GraphTest
 * `hipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
 *     const hipGraphNode_t* pDependencies, size_t numDependencies,
 *     const hipKernelNodeParams* pNodeParams)` -
 * Creates a kernel execution node and adds it to a graph.
 * Optimize HIPGraph Performance.
 */

#ifdef __linux__  // windows machine build failing refer ticket SWDEV-440611

#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_kernels.hh>
#include <hip_test_defgroups.hh>

#ifdef _WIN64
#define setenv(x, y, z) _putenv_s(x, y)
#endif

static constexpr int N = 1024;
static constexpr int Nbytes = N * sizeof(int);
static size_t NElem{N};
static constexpr int blocksPerCU = 6;  // to hide latency
static constexpr int threadsPerBlock = 256;

__device__ int globalTo1[N];
__device__ int globalTo2[N];
__device__ int globalFrom1[N];
__device__ int globalFrom2[N];

static bool verifyVectorSquare(int* A_h, int* C_h, size_t size) {
  for (size_t i = 0; i < size; i++) {
    if (C_h[i] != A_h[i] * A_h[i]) {
      INFO("VectorSquare A and C not matching at " << i);
      return false;
    }
  }
  return true;
}

/*  - Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
      sequence and copy back the result and verify. */
static void checkGraphcontinuousKernelCall(const unsigned int kNumNode) {
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));
  HIP_CHECK(hipGraphCreate(&graph, 0));

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipStreamDestroy(stream));
}

/*  - Added multiple nodes of MemCpy, Kernel node continuously for
      2 block & copy back result in MemCpy. */
static void checkGraphcontinuousKernelCallIn2Blocks(const unsigned int kNumNode1,
                                                    const unsigned int kNumNode2) {
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4;
  std::vector<hipGraphNode_t> kNode1(kNumNode1), kNode2(kNumNode2);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));
  HIP_CHECK(hipGraphCreate(&graph, 0));

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode1; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode1[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode1[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode1[i - 1], &kNode1[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode1[kNumNode1 - 1], &memCpy3, 1));

  for (int i = 0; i < kNumNode2; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode2[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &kNode2[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode2[i - 1], &kNode2[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode2[kNumNode2 - 1], &memCpy4, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorSUB(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipStreamDestroy(stream));
}

/*  - Added 2 nodes of MemCpy, Kernel node whicl compute the operation & copy
      back result using MemCpy node. Call this multiple times sequentially. */
static void checkGraphMemcpyKernelMixCall(const unsigned int kNumIter) {
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  constexpr int kNumNode = 3;
  std::vector<hipGraphNode_t> node(kNumIter * kNumNode);
  std::vector<hipGraphNode_t> kNode(kNumIter);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));
  HIP_CHECK(hipGraphCreate(&graph, 0));

  int i = 0;
  for (int iter = 0; iter < kNumIter; iter++) {
    i = kNumNode * iter;
    HIP_CHECK(hipGraphAddMemcpyNode1D(&node[i], graph, nullptr, 0, A_d, A_h, Nbytes,
                                      hipMemcpyHostToDevice));
    HIP_CHECK(hipGraphAddMemcpyNode1D(&node[i + 1], graph, nullptr, 0, B_d, B_h, Nbytes,
                                      hipMemcpyHostToDevice));

    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[iter], graph, nullptr, 0, &kernelNodeParams));

    HIP_CHECK(hipGraphAddMemcpyNode1D(&node[i + 2], graph, nullptr, 0, C_h, C_d, Nbytes,
                                      hipMemcpyDeviceToHost));
    if (i != 0) HIP_CHECK(hipGraphAddDependencies(graph, &node[i - 1], &node[i], 1));
    HIP_CHECK(hipGraphAddDependencies(graph, &node[i], &node[i + 1], 1));
    HIP_CHECK(hipGraphAddDependencies(graph, &node[i + 1], &kNode[iter], 1));
    HIP_CHECK(hipGraphAddDependencies(graph, &kNode[iter], &node[i + 2], 1));
  }

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipStreamDestroy(stream));
}

/*  - Added a nodes of MemCpy, MesSet, Kernel to do operation and copy back
      result using MemCpy node and call above operation in sequence. */
static void checkGraphMemcpyMemsetKernelMixCall(const unsigned int kNumIter) {
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  constexpr int kNumNode = 3;
  std::vector<hipGraphNode_t> node(kNumIter * kNumNode);
  std::vector<hipGraphNode_t> kNode(kNumIter);
  hipGraph_t graph;
  hipGraphExec_t graphExec;
  int pitch_M = 0;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));
  HIP_CHECK(hipGraphCreate(&graph, 0));

  int i = 0;
  for (int iter = 0; iter < kNumIter; iter++) {
    i = kNumNode * iter;
    HIP_CHECK(hipGraphAddMemcpyNode1D(&node[i], graph, nullptr, 0, A_d, A_h, Nbytes,
                                      hipMemcpyHostToDevice));

    hipMemsetParams memsetParams{};
    memset(&memsetParams, 0, sizeof(memsetParams));
    memsetParams.dst = reinterpret_cast<void*>(B_d);
    memsetParams.value = 2;
    memsetParams.pitch = pitch_M;
    memsetParams.elementSize = sizeof(char);
    memsetParams.width = N;
    memsetParams.height = 1;
    HIP_CHECK(hipGraphAddMemsetNode(&node[i + 1], graph, nullptr, 0, &memsetParams));

    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[iter], graph, nullptr, 0, &kernelNodeParams));

    HIP_CHECK(hipGraphAddMemcpyNode1D(&node[i + 2], graph, nullptr, 0, C_h, C_d, Nbytes,
                                      hipMemcpyDeviceToHost));
    if (i != 0) HIP_CHECK(hipGraphAddDependencies(graph, &node[i - 1], &node[i], 1));
    HIP_CHECK(hipGraphAddDependencies(graph, &node[i], &node[i + 1], 1));
    HIP_CHECK(hipGraphAddDependencies(graph, &node[i + 1], &kNode[iter], 1));
    HIP_CHECK(hipGraphAddDependencies(graph, &kNode[iter], &node[i + 2], 1));
  }

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  REQUIRE(true == verifyVectorSquare(A_h, C_h, N));

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipStreamDestroy(stream));
}

/*  - Added a EventRecordNode at start and 2 nodes of MemCpy, and multiple
      node of Kernel call in continuous sequence and copy back the result and
      add EventRecordNode at the end and verify the result. */
static void checkGraphEventcontinuousKernelCall(const unsigned int kNumNode) {
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));
  HIP_CHECK(hipGraphCreate(&graph, 0));

  hipGraphNode_t event_start, event_final;
  hipEvent_t eventstart, eventend;
  HIP_CHECK(hipEventCreate(&eventstart));
  HIP_CHECK(hipEventCreate(&eventend));

  HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, nullptr, 0, eventstart));
  HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, nullptr, 0, eventend));

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memCpy1, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_final, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipEventSynchronize(eventend));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipEventDestroy(eventstart));
  HIP_CHECK(hipEventDestroy(eventend));
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipStreamDestroy(stream));
}

/*  - Added a EventRecordNode at start and Added multiple nodes of MemCpy, Kernel
      node continuously and added one more EventRecordNode in mid for synchronize
      and do similar MemCpy, Kernel node continuously in 2nd block & copy back
      result in MemCpy and add EventRecordNode at the end for synchronize. */
static void checkGraphEventcontinuousKernelCallIn2Blocks(const unsigned int kNumNode1,
                                                         const unsigned int kNumNode2) {
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4;
  std::vector<hipGraphNode_t> kNode1(kNumNode1), kNode2(kNumNode2);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));
  HIP_CHECK(hipGraphCreate(&graph, 0));

  hipGraphNode_t event_start, event_mid, event_final;
  hipEvent_t eventstart, eventmid, eventend;
  HIP_CHECK(hipEventCreate(&eventstart));
  HIP_CHECK(hipEventCreate(&eventmid));
  HIP_CHECK(hipEventCreate(&eventend));

  HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, nullptr, 0, eventstart));
  HIP_CHECK(hipGraphAddEventRecordNode(&event_mid, graph, nullptr, 0, eventmid));
  HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, nullptr, 0, eventend));

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memCpy1, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode1; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode1[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode1[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode1[i - 1], &kNode1[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode1[kNumNode1 - 1], &memCpy3, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_mid, 1));

  for (int i = 0; i < kNumNode2; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode2[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &kNode2[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode2[i - 1], &kNode2[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode2[kNumNode2 - 1], &memCpy4, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy4, &event_final, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipEventSynchronize(eventend));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorSUB(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipEventDestroy(eventstart));
  HIP_CHECK(hipEventDestroy(eventmid));
  HIP_CHECK(hipEventDestroy(eventend));
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipStreamDestroy(stream));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Add multiple nodes of MemCpy, Kernel in sequence multiple times.
 *  2) Add multiple nodes of MemCpy, MesSet, Kernel in sequence.
 *  3) Add multiple nodes of MemCpy, Kernel node continuously & copy back result in MemCpy.
 *  4) Add multiple nodes of MemCpy, Kernel node continuously for 2 block & copy back result in
 MemCpy.
 *  5) Add a EventRecordNode at start and 2 nodes of MemCpy, and multiple
       node of Kernel call in continuous sequence and copy back the result and
       add EventRecordNode at the end and verify the result.
 *  6) Add a EventRecordNode at start and Added multiple nodes of MemCpy, Kernel
      node continuously and added one more EventRecordNode in mid for synchronize
      and do similar MemCpy, Kernel node continuously in 2nd block & copy back
      result in MemCpy and add EventRecordNode at the end for synchronize.
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */

TEST_CASE("Unit_hipGraph_PerfCheck_MemcpyKernelMixCall") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  constexpr int kNumIter1 = 25;
  constexpr int kNumIter2 = 30;
  constexpr int kNumKNode1 = 15;
  constexpr int kNumKNode2 = 45;

  checkGraphMemcpyKernelMixCall(kNumIter1);
  checkGraphMemcpyMemsetKernelMixCall(kNumIter2);
  checkGraphcontinuousKernelCall(kNumKNode1);
  checkGraphcontinuousKernelCallIn2Blocks(kNumKNode1, kNumKNode2);
  checkGraphEventcontinuousKernelCall(kNumIter1);
  checkGraphEventcontinuousKernelCallIn2Blocks(kNumKNode1, kNumKNode2);

  checkGraphMemcpyKernelMixCall(kNumIter2);
  checkGraphMemcpyMemsetKernelMixCall(kNumIter1);
  checkGraphcontinuousKernelCall(kNumKNode2);
  checkGraphcontinuousKernelCallIn2Blocks(kNumKNode2, kNumKNode1);
  checkGraphEventcontinuousKernelCall(kNumIter2);
  checkGraphEventcontinuousKernelCallIn2Blocks(kNumKNode2, kNumKNode1);
}

static void hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams(const hipStream_t& stream) {
  constexpr size_t N = 1024;
  constexpr size_t Nbytes = N * sizeof(int);
  constexpr auto blocksPerCU = 6;  // to hide latency
  constexpr auto threadsPerBlock = 256;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraph_t graph;
  hipGraphNode_t memcpyNode, kNode;
  hipKernelNodeParams kNodeParams{}, kNodeParams1{};
  int *A_d, *B_d, *C_d;
  int *A_h, *B_h, *C_h;
  std::vector<hipGraphNode_t> dependencies;
  hipGraphExec_t graphExec;
  size_t NElem{N};

  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
  HIP_CHECK(hipGraphCreate(&graph, 0));

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNode, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  dependencies.push_back(memcpyNode);
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNode, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  dependencies.push_back(memcpyNode);

  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
  kNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
  kNodeParams.gridDim = dim3(blocks);
  kNodeParams.blockDim = dim3(threadsPerBlock);
  kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
  HIP_CHECK(
      hipGraphAddKernelNode(&kNode, graph, dependencies.data(), dependencies.size(), &kNodeParams));

  dependencies.clear();
  dependencies.push_back(kNode);
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNode, graph, dependencies.data(), dependencies.size(),
                                    C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

  // Instantiate and launch the graph
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));
  HIP_CHECK(hipGraphExecDestroy(graphExec));

  // Verify graph execution result
  HipTest::checkVectorADD<int>(A_h, B_h, C_h, N);

  kNodeParams1.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
  kNodeParams1.gridDim = dim3(blocks);
  kNodeParams1.blockDim = dim3(threadsPerBlock);
  kNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs);

  // Instantiate again and launch the graph
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
  HIP_CHECK(hipGraphExecKernelNodeSetParams(graphExec, kNode, &kNodeParams1));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorSUB<int>(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 2 nodes of MemCpy, and a node of Kernel call and
       Instantiate graph and update kernelNodeParams for last kernel
       and copy back the result and verify.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams",
          "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as used created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams(stream);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams(stream);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams(stream);
      }
    }
  }
}

#if HT_NVIDIA
static void hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop(const hipStream_t& stream) {
  constexpr int kNumNode = 35;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  hipKernelNodeParams kNodeParams1{};
  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
  kNodeParams1.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
  kNodeParams1.gridDim = dim3(blocks);
  kNodeParams1.blockDim = dim3(threadsPerBlock);
  kNodeParams1.sharedMemBytes = 0;
  kNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs);
  kNodeParams1.extra = nullptr;
  HIP_CHECK(hipGraphExecKernelNodeSetParams(graphExec, kNode[kNumNode - 1], &kNodeParams1));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorSUB(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 2 nodes of MemCpy, & multiple node of Kernel call in continuous sequence
       and Instantiate graph & update kernelNodeParams with hipGraphExecKernelNodeSetParams
       for last kernel and copy back the result and verify.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop",
          "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as used created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop(stream);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop(stream);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop(stream);
      }
    }
  }
}
#endif
/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 3 nodes of MemCpy, and node of Kernel call in continuous
      sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParams
      for source memCopy3 node and copy back the result and verify.
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */

TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }
  constexpr int kNumNode = 1;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
  int *A_d1, *B_d1, *C_d1, *A_h1, *B_h1, *C_h1;
  HipTest::initArrays(&A_d1, &B_d1, &C_d1, &A_h1, &B_h1, &C_h1, N, false);

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));
  HIP_CHECK(hipGraphCreate(&graph, 0));

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  hipKernelNodeParams kernelNodeParams{};
  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
  kernelNodeParams.gridDim = dim3(blocks);
  kernelNodeParams.blockDim = dim3(threadsPerBlock);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
  kernelNodeParams.extra = nullptr;
  HIP_CHECK(hipGraphAddKernelNode(&kNode[0], graph, nullptr, 0, &kernelNodeParams));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[0], 1));

  hipMemcpy3DParms myparams;
  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
  myparams.srcPos = make_hipPos(0, 0, 0);
  myparams.dstPos = make_hipPos(0, 0, 0);
  myparams.extent = make_hipExtent(Nbytes, 1, 1);
  myparams.dstPtr = make_hipPitchedPtr(A_h1, Nbytes, Nbytes, 1);
  myparams.srcPtr = make_hipPitchedPtr(A_d1, Nbytes, Nbytes, 1);
  myparams.kind = hipMemcpyDeviceToHost;

  HIP_CHECK(hipGraphAddMemcpyNode(&memCpy3, graph, nullptr, 0, &myparams));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

  SECTION("Verifying with different memCopy node Params") {
    memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
    myparams.srcPos = make_hipPos(0, 0, 0);
    myparams.dstPos = make_hipPos(0, 0, 0);
    myparams.extent = make_hipExtent(Nbytes, 1, 1);
    myparams.dstPtr = make_hipPitchedPtr(C_h, Nbytes, Nbytes, 1);
    myparams.srcPtr = make_hipPitchedPtr(C_d, Nbytes, Nbytes, 1);
    myparams.kind = hipMemcpyDeviceToHost;

    HIP_CHECK(hipGraphExecMemcpyNodeSetParams(graphExec, memCpy3, &myparams));
    HIP_CHECK(hipGraphLaunch(graphExec, stream));
    HIP_CHECK(hipStreamSynchronize(stream));
  }

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, C_h, N);
  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HipTest::freeArrays(A_d1, B_d1, C_d1, A_h1, B_h1, C_h1, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipStreamDestroy(stream));
}

static void hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop(const hipStream_t& stream) {
  constexpr int kNumNode = 35;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int harray1D[N]{};
  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
  int *A_d1, *B_d1, *C_d1, *A_h1, *B_h1, *C_h1;
  HipTest::initArrays(&A_d1, &B_d1, &C_d1, &A_h1, &B_h1, &C_h1, N, false);

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }

  hipMemcpy3DParms myparams;
  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
  myparams.srcPos = make_hipPos(0, 0, 0);
  myparams.dstPos = make_hipPos(0, 0, 0);
  myparams.extent = make_hipExtent(Nbytes, 1, 1);
  myparams.dstPtr = make_hipPitchedPtr(harray1D, Nbytes, Nbytes, 1);
  myparams.srcPtr = make_hipPitchedPtr(C_d, Nbytes, Nbytes, 1);
  myparams.kind = hipMemcpyDeviceToHost;

  HIP_CHECK(hipGraphAddMemcpyNode(&memCpy3, graph, nullptr, 0, &myparams));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, harray1D, N);

  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
  myparams.srcPos = make_hipPos(0, 0, 0);
  myparams.dstPos = make_hipPos(0, 0, 0);
  myparams.extent = make_hipExtent(Nbytes, 1, 1);
  myparams.dstPtr = make_hipPitchedPtr(C_h, Nbytes, Nbytes, 1);
  myparams.srcPtr = make_hipPitchedPtr(C_d, Nbytes, Nbytes, 1);
  myparams.kind = hipMemcpyDeviceToHost;

  HIP_CHECK(hipGraphExecMemcpyNodeSetParams(graphExec, memCpy3, &myparams));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HipTest::freeArrays(A_d1, B_d1, C_d1, A_h1, B_h1, C_h1, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
      sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParams
      for source memCopy3 node and copy back the result and verify.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */

TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop",
          "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as user created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop(stream);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop(stream);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop(stream);
      }
    }
  }
}

static void hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop(const hipStream_t& stream) {
  constexpr int kNumNode = 35;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int* hData = reinterpret_cast<int*>(malloc(Nbytes));
  REQUIRE(hData != nullptr);
  for (int i = 0; i < N; ++i) hData[i] = 2 * i + 1;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  HIP_CHECK(hipGraphExecMemcpyNodeSetParams1D(graphExec, memCpy2, B_d, hData, Nbytes,
                                              hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, hData, C_h, N);

  free(hData);
  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
      sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParams1D
      for source memCopy2 node and copy back the result and verify.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */

TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop",
          "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as user created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop(stream);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop(stream);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop(stream);
      }
    }
  }
}

static void hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol(const hipStream_t& stream) {
  constexpr int kNumNode = 35;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNodeToSymbol(&memCpy3, graph, nullptr, 0, HIP_SYMBOL(globalFrom1), C_d,
                                          Nbytes, 0, hipMemcpyDeviceToDevice));
  HIP_CHECK(hipGraphAddMemcpyNodeFromSymbol(
      &memCpy4, graph, nullptr, 0, C_h, HIP_SYMBOL(globalFrom2), Nbytes, 0, hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &memCpy4, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphExecMemcpyNodeSetParamsFromSymbol(
      graphExec, memCpy4, C_h, HIP_SYMBOL(globalFrom1), Nbytes, 0, hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
      sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParamsFromSymbol
      for source memCopy4 node and copy back the result and verify.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */

TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol",
          "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as user created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol(stream);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol(stream);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol(stream);
      }
    }
  }
}

static void hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol(const hipStream_t& stream) {
  constexpr int kNumNode = 35;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNodeToSymbol(&memCpy3, graph, nullptr, 0, HIP_SYMBOL(globalTo1), C_d,
                                          Nbytes, 0, hipMemcpyDeviceToDevice));
  HIP_CHECK(hipGraphAddMemcpyNodeFromSymbol(&memCpy4, graph, nullptr, 0, C_h, HIP_SYMBOL(globalTo2),
                                            Nbytes, 0, hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &memCpy4, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphExecMemcpyNodeSetParamsToSymbol(graphExec, memCpy3, HIP_SYMBOL(globalTo2), C_d,
                                                    Nbytes, 0, hipMemcpyDeviceToDevice));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorSUB(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
      sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParamsToSymbol
      for source memCopy3 node and copy back the result and verify.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol",
          "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as user created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol(stream);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol(stream);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol(stream);
      }
    }
  }
}

static void hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(const hipStream_t& stream,
                                                               int test) {
  constexpr int kNumNode = 35;
  constexpr int memSetVal = 7, memSetVal2 = 9;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3, memSet;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph;
  hipGraphExec_t graphExec;
  int size, elementSize;

  if (test == 0) {
    size = Nbytes;
    elementSize = sizeof(char);
  } else {
    size = N;
    elementSize = sizeof(int);
  }

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  // Add MemSet Node
  hipMemsetParams memsetParams{};
  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = A_d;
  memsetParams.value = memSetVal;
  memsetParams.elementSize = elementSize;
  memsetParams.width = size;
  memsetParams.height = 1;
  HIP_CHECK(hipGraphAddMemsetNode(&memSet, graph, nullptr, 0, &memsetParams));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &memSet, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memSet, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  if (test == 0) {
    memset(A_h, memSetVal, size);
  } else {
    for (int i = 0; i < N; i++) {
      A_h[i] = memSetVal;
    }
  }
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  // update MemSet Node using Exec
  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = A_d;
  memsetParams.value = memSetVal2;
  memsetParams.elementSize = elementSize;
  memsetParams.width = size;
  memsetParams.height = 1;
  HIP_CHECK(hipGraphExecMemsetNodeSetParams(graphExec, memSet, &memsetParams));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  if (test == 0) {
    memset(A_h, memSetVal2, size);
  } else {
    for (int i = 0; i < N; i++) {
      A_h[i] = memSetVal2;
    }
  }
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 2 nodes of MemCpy and add a memSet node, and multiple node of Kernel
       call in continuous sequence and Instantiate graph and update memSet node with
       hipGraphExecMemsetNodeSetParams api and verify the result.
    i)   Verify the memset with reset 1 byte (char size) block.
    ii)  Verify the memset with reset 4 byte (int size) block.
    iii) Check with Multi device case.
    iv)  Pass stream as user created stream
    v)   Pass stream as default stream
    vi)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams",
          "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as user created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 0);
        hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 1);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 0);
        hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 1);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 0);
        hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 1);
      }
    }
  }
}

static void hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams(const hipStream_t& stream) {
  constexpr int kNumNode = 35;
  constexpr int memSetVal = 7, memSetVal2 = 9;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4;
  hipGraphNode_t memSet1, memSet2, childGraphNode;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph, childGraph1, childGraph2;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphCreate(&childGraph1, 0));
  HIP_CHECK(hipGraphCreate(&childGraph2, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }

  // Add MemSet Node
  hipMemsetParams memsetParams{};
  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = C_d;
  memsetParams.value = memSetVal;
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  HIP_CHECK(hipGraphAddMemsetNode(&memSet1, childGraph1, nullptr, 0, &memsetParams));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, childGraph1, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(childGraph1, &memSet1, &memCpy3, 1));
  // Adding childnode to graph
  HIP_CHECK(hipGraphAddChildGraphNode(&childGraphNode, graph, nullptr, 0, childGraph1));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &childGraphNode, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  memset(A_h, memSetVal, Nbytes);
  for (unsigned int i = 0; i < N; i++) {
    if (A_h[i] != C_h[i]) {
      WARN("Validation failed at " << i << "\t" << C_h[i]);
      REQUIRE(false);
    }
  }

  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = C_d;
  memsetParams.value = memSetVal2;
  memsetParams.elementSize = sizeof(int);
  memsetParams.width = N;
  memsetParams.height = 1;
  HIP_CHECK(hipGraphAddMemsetNode(&memSet2, childGraph2, nullptr, 0, &memsetParams));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, childGraph2, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(childGraph2, &memSet2, &memCpy4, 1));

  // Update the childgraph node
  HIP_CHECK(hipGraphExecChildGraphNodeSetParams(graphExec, childGraphNode, childGraph2));
  // Launch Again and verify it once
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  for (unsigned int i = 0; i < N; i++) {
    if (memSetVal2 != C_h[i]) {
      WARN("Validation failed at " << i << "\t" << C_h[i]);
      REQUIRE(false);
    }
  }

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipGraphDestroy(childGraph1));
  HIP_CHECK(hipGraphDestroy(childGraph2));
}

static void hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_Kernel(
    const hipStream_t& stream) {
  constexpr int kNumNode = 35;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4, childGraphNode;
  hipGraphNode_t memCpyC11, memCpyC12, memCpyC13, kNodeC1;
  hipGraphNode_t memCpyC21, memCpyC22, memCpyC23, kNodeC2;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph, childGraph1, childGraph2;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
  int *A_d1, *B_d1, *C_d1, *A_h1, *B_h1, *C_h1;
  HipTest::initArrays(&A_d1, &B_d1, &C_d1, &A_h1, &B_h1, &C_h1, N, false);

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphCreate(&childGraph1, 0));
  HIP_CHECK(hipGraphCreate(&childGraph2, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }

  // Add child graph
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC11, childGraph1, nullptr, 0, A_d1, A_h1, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC12, childGraph1, nullptr, 0, B_d1, B_h1, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC11, &memCpyC12, 1));

  hipKernelNodeParams kNodeParams{};
  void* kernelArgsC[] = {&A_d1, &C_d1, reinterpret_cast<void*>(&NElem)};
  kNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
  kNodeParams.gridDim = dim3(blocks);
  kNodeParams.blockDim = dim3(threadsPerBlock);
  kNodeParams.sharedMemBytes = 0;
  kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgsC);
  kNodeParams.extra = nullptr;
  HIP_CHECK(hipGraphAddKernelNode(&kNodeC1, childGraph1, nullptr, 0, &kNodeParams));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC13, childGraph1, nullptr, 0, C_h1, C_d1, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC12, &kNodeC1, 1));
  HIP_CHECK(hipGraphAddDependencies(childGraph1, &kNodeC1, &memCpyC13, 1));

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, childGraph1, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC13, &memCpy4, 1));

  // Adding childnode to graph
  HIP_CHECK(hipGraphAddChildGraphNode(&childGraphNode, graph, nullptr, 0, childGraph1));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &childGraphNode, 1));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorSUB(A_h, B_h, C_h, N);           // MainGraph o/p verification
  REQUIRE(true == verifyVectorSquare(A_h1, C_h1, N));  // ChildGraph o/p verify

  // new child graph
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC21, childGraph2, nullptr, 0, A_d1, A_h1, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC22, childGraph2, nullptr, 0, B_d1, B_h1, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC21, &memCpyC22, 1));

  memset(&kNodeParams, 0x00, sizeof(hipKernelNodeParams));
  void* kernelArgC[] = {&A_d1, &B_d1, &C_d1, reinterpret_cast<void*>(&NElem)};
  kNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
  kNodeParams.gridDim = dim3(blocks);
  kNodeParams.blockDim = dim3(threadsPerBlock);
  kNodeParams.sharedMemBytes = 0;
  kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgC);
  kNodeParams.extra = nullptr;
  HIP_CHECK(hipGraphAddKernelNode(&kNodeC2, childGraph2, nullptr, 0, &kNodeParams));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC23, childGraph2, nullptr, 0, C_h1, C_d1, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC22, &kNodeC2, 1));
  HIP_CHECK(hipGraphAddDependencies(childGraph2, &kNodeC2, &memCpyC23, 1));

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, childGraph2, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC23, &memCpy3, 1));

  // Update the childgraph node
  HIP_CHECK(hipGraphExecChildGraphNodeSetParams(graphExec, childGraphNode, childGraph2));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify modified graph execution result
  HipTest::checkVectorSUB(A_h, B_h, C_h, N);     // MainGraph o/p verification
  HipTest::checkVectorADD(A_h1, B_h1, C_h1, N);  // ChildGraph o/p verification

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HipTest::freeArrays(A_d1, B_d1, C_d1, A_h1, B_h1, C_h1, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipGraphDestroy(childGraph1));
  HIP_CHECK(hipGraphDestroy(childGraph2));
}

static void hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_mKernel(
    const hipStream_t& stream) {
  constexpr int kNumNode = 35;
  constexpr int kNumNodeChild = 45;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4, childGraphNode;
  hipGraphNode_t memCpyC11, memCpyC12, memCpyC13;
  hipGraphNode_t memCpyC21, memCpyC22, memCpyC23;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  std::vector<hipGraphNode_t> kNodeC1(kNumNodeChild);
  std::vector<hipGraphNode_t> kNodeC2(kNumNodeChild);
  hipGraph_t graph, childGraph1, childGraph2;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
  int *A_d1, *B_d1, *C_d1, *A_h1, *B_h1, *C_h1;
  HipTest::initArrays(&A_d1, &B_d1, &C_d1, &A_h1, &B_h1, &C_h1, N, false);

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphCreate(&childGraph1, 0));
  HIP_CHECK(hipGraphCreate(&childGraph2, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }

  // Add child graph
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC11, childGraph1, nullptr, 0, A_d1, A_h1, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC12, childGraph1, nullptr, 0, B_d1, B_h1, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC11, &memCpyC12, 1));

  for (int i = 0; i < kNumNodeChild; i++) {
    hipKernelNodeParams kNodeParams{};
    void* kernelArgs[] = {&A_d1, &C_d1, reinterpret_cast<void*>(&NElem)};
    kNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
    kNodeParams.gridDim = dim3(blocks);
    kNodeParams.blockDim = dim3(threadsPerBlock);
    kNodeParams.sharedMemBytes = 0;
    kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNodeC1[i], childGraph1, nullptr, 0, &kNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC12, &kNodeC1[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(childGraph1, &kNodeC1[i - 1], &kNodeC1[i], 1));
    }
  }

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC13, childGraph1, nullptr, 0, C_h1, C_d1, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(childGraph1, &kNodeC1[kNumNodeChild - 1], &memCpyC13, 1));

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, childGraph1, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC13, &memCpy4, 1));

  // Adding childnode to graph
  HIP_CHECK(hipGraphAddChildGraphNode(&childGraphNode, graph, nullptr, 0, childGraph1));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &childGraphNode, 1));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorSUB(A_h, B_h, C_h, N);           // MainGraph o/p verification
  REQUIRE(true == verifyVectorSquare(A_h1, C_h1, N));  // ChildGraph o/p verify

  // new child graph
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC21, childGraph2, nullptr, 0, A_d1, A_h1, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC22, childGraph2, nullptr, 0, B_d1, B_h1, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC21, &memCpyC22, 1));

  for (int i = 0; i < kNumNodeChild; i++) {
    hipKernelNodeParams kNodeParams{};
    void* kernelArgs[] = {&A_d1, &B_d1, &C_d1, reinterpret_cast<void*>(&NElem)};
    kNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kNodeParams.gridDim = dim3(blocks);
    kNodeParams.blockDim = dim3(threadsPerBlock);
    kNodeParams.sharedMemBytes = 0;
    kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNodeC2[i], childGraph2, nullptr, 0, &kNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC22, &kNodeC2[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(childGraph2, &kNodeC2[i - 1], &kNodeC2[i], 1));
    }
  }

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC23, childGraph2, nullptr, 0, C_h1, C_d1, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(childGraph2, &kNodeC2[kNumNodeChild - 1], &memCpyC23, 1));

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, childGraph2, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC23, &memCpy3, 1));

  // Update the childgraph node
  HIP_CHECK(hipGraphExecChildGraphNodeSetParams(graphExec, childGraphNode, childGraph2));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify modified graph execution result
  HipTest::checkVectorSUB(A_h, B_h, C_h, N);     // MainGraph o/p verification
  HipTest::checkVectorADD(A_h1, B_h1, C_h1, N);  // ChildGraph o/p verification

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HipTest::freeArrays(A_d1, B_d1, C_d1, A_h1, B_h1, C_h1, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipGraphDestroy(childGraph1));
  HIP_CHECK(hipGraphDestroy(childGraph2));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
       sequence and add a child graph node in the end and Instantiate graph and
       update the graphExec with hipGraphExecChildGraphNodeSetParams for child
       graph node which will copy back the result and verify.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
    2) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
       sequence and add a child graph which contain a kernel operation and
       add this child graph as a node in the end of main graph & Instantiate graph.
       update the graphExec with hipGraphExecChildGraphNodeSetParams for child
       graph node with similar topology which will copy back the result and verify.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
    3) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
       sequence and add a child graph which contain a kernel call in continuous
       sequence operation and add this child graph as a node in the end of main
       graph & Instantiate main graph and launch and check result of main graph.
       update the graphExec with hipGraphExecChildGraphNodeSetParams for child
       graph node with similar topology which will copy back the result and verify.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams",
          "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as user created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams(stream);
        hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_Kernel(stream);
        hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_mKernel(stream);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams(stream);
        hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_Kernel(stream);
        hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_mKernel(stream);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams(stream);
        hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_Kernel(stream);
        hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_mKernel(stream);
      }
    }
  }
}

static void hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent(const hipStream_t& stream) {
  constexpr int kNumNode = 35;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraph_t graph;
  hipGraphExec_t graphExec;
  hipEvent_t event_start, event1_end, event2_end;
  HIP_CHECK(hipEventCreate(&event_start));
  HIP_CHECK(hipEventCreate(&event1_end));
  HIP_CHECK(hipEventCreate(&event2_end));

  hipGraphNode_t memCpy1, memCpy2, memCpy3;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  HIP_CHECK(hipGraphCreate(&graph, 0));
  // Create nodes with event_start and event1_end
  hipGraphNode_t event_start_rec, event_end_rec;
  HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, event_start));

  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &memCpy1, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0, event1_end));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_end_rec, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorSUB(A_h, B_h, C_h, N);

  float t1 = 0.0f;
  HIP_CHECK(hipEventElapsedTime(&t1, event_start, event1_end));
  REQUIRE(t1 > 0.0f);

  // Change the event at event_end_rec node to event2_end
  HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, event_end_rec, event2_end));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorSUB(A_h, B_h, C_h, N);

  // Validate the changed events
  float t2 = 0.0f;
  HIP_CHECK(hipEventElapsedTime(&t2, event_start, event2_end));
  REQUIRE(t2 > 0.0f);

  // Validate the changed events and initial event
  float t3 = 0.0f;
  HIP_CHECK(hipEventElapsedTime(&t3, event1_end, event2_end));
  REQUIRE(t3 > 0.0f);

  // Free resources
  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipEventDestroy(event_start));
  HIP_CHECK(hipEventDestroy(event1_end));
  HIP_CHECK(hipEventDestroy(event2_end));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added event start node at the begining and than add 2 nodes of MemCpy,
       and multiple node of Kernel call in continuous sequence and add a child
       graph node in the end and Instantiate graph and update the graphExec with
       hipGraphExecEventRecordNodeSetEvent and added a graph node which will copy
       back the result and add event end node at the end and verify the time elapse.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent",
          "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as user created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent(stream);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent(stream);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent(stream);
      }
    }
  }
}

static void hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent_waitKrnl(
    const hipStream_t& stream) {
  constexpr int kNumNode = 35;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  // Create events
  hipEvent_t event, event2;
  HIP_CHECK(hipEventCreate(&event));
  HIP_CHECK(hipEventCreate(&event2));
  hipGraphNode_t event_wait_node;

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));

  hipGraphNode_t event_start_rec;
  HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, event2));
  HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, event));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_start_rec, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &event_wait_node, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec, event_wait_node, event2));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipEventDestroy(event));
  HIP_CHECK(hipEventDestroy(event2));
}

static void hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent(const hipStream_t& stream) {
  constexpr int kNumNode = 45;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  // Create events
  hipEvent_t event, event2;
  HIP_CHECK(hipEventCreate(&event));
  HIP_CHECK(hipEventCreate(&event2));
  hipGraphNode_t event_wait_node;

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, event));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_wait_node, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec, event_wait_node, event2));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipEventDestroy(event));
  HIP_CHECK(hipEventDestroy(event2));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
       sequence and add a wait event node in the end and Instantiate graph and
       update the graphExec with hipGraphExecEventWaitNodeSetEvent node and a
       graph node which will copy back the result and add event end node at the
       end and verify.
    2) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
       sequence and add a wait event node in the end and Instantiate graph and
       add a wait kernel and memcpy node to copy back the result.
       update the graphExec with hipGraphExecEventWaitNodeSetEvent node and a
       graph node which will copy back the result and add event end node at the
       end and verify.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent",
          "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as user created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent(stream);
        hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent_waitKrnl(stream);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent(stream);
        hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent_waitKrnl(stream);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent(stream);
        hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent_waitKrnl(stream);
      }
    }
  }
}

void callBackFunc_1(void* A_h) {
  int* A = reinterpret_cast<int*>(A_h);
  for (int i = 0; i < N; i++) {
    A[i] = i + i;
  }
}
static void callBackFunc_1_Verify(int* C_h) {
  for (int i = 0; i < N; i++) {
    if (C_h[i] != (i + i)) {
      INFO("Validation failed i " << i << "C_h[i] " << C_h[i]);
      REQUIRE(false);
    }
  }
}

void callBackFunc_2(void* A_h) {
  int* A = reinterpret_cast<int*>(A_h);
  for (int i = 0; i < N; i++) {
    A[i] = i * i;
  }
}
static void callBackFunc_2_Verify(int* C_h) {
  for (int i = 0; i < N; i++) {
    if (C_h[i] != (i * i)) {
      INFO("Validation failed i " << i << "C_h[i] " << C_h[i]);
      REQUIRE(false);
    }
  }
}

static void hipGraph_PerfCheck_hipGraphExecHostNodeSetParams(const hipStream_t& stream) {
  constexpr int kNumNode = 45;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  hipGraph_t graph;
  hipGraphExec_t graphExec;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  hipGraphNode_t hostNode;
  hipHostNodeParams hostParams = {0, 0};
  hostParams.fn = callBackFunc_1;
  hostParams.userData = C_h;
  HIP_CHECK(hipGraphAddHostNode(&hostNode, graph, nullptr, 0, &hostParams));

  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &hostNode, 1));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify execution result
  callBackFunc_1_Verify(C_h);

  hipHostNodeParams sethostParam = {0, 0};
  sethostParam.fn = callBackFunc_2;
  sethostParam.userData = C_h;

  HIP_CHECK(hipGraphExecHostNodeSetParams(graphExec, hostNode, &sethostParam));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  callBackFunc_2_Verify(C_h);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 2 nodes of MemCpy, & multiple node of Kernel call in continuous sequence
       and Instantiate graph & add a host node and launch the graph and verify result.
       Now update the host node parameters using api hipGraphExecHostNodeSetParams
       and verify the result which reflect modified data.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecHostNodeSetParams",
          "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as used created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecHostNodeSetParams(stream);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecHostNodeSetParams(stream);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecHostNodeSetParams(stream);
      }
    }
  }
}

static void hipGraph_PerfCheck_hipGraphExecUpdate(const hipStream_t& stream) {
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  size_t NElem{N};
  hipGraphNode_t memcpy_A, memcpy_B, memcpy_C;
  hipGraphNode_t memcpy_A2, memcpy_B2, memcpy_C2;
  hipGraphNode_t kernel_vecADD, kernel_vecSUB;
  hipGraph_t graph1, graph2;
  hipGraphExec_t graphExec;
  hipGraphNode_t hErrorNode_out;
  hipGraphExecUpdateResult updateResult_out;
  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);

  HIP_CHECK(hipGraphCreate(&graph1, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  hipKernelNodeParams kernelNodeParams{};
  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
  kernelNodeParams.gridDim = dim3(blocks);
  kernelNodeParams.blockDim = dim3(threadsPerBlock);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
  kernelNodeParams.extra = nullptr;
  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, &kernelNodeParams));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecADD, 1));
  HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecADD, 1));
  HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecADD, &memcpy_C, 1));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  HIP_CHECK(hipGraphCreate(&graph2, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A2, graph2, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B2, graph2, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  memset(&kernelNodeParams, 0x00, sizeof(hipKernelNodeParams));
  void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
  kernelNodeParams.gridDim = dim3(blocks);
  kernelNodeParams.blockDim = dim3(threadsPerBlock);
  kernelNodeParams.sharedMemBytes = 0;
  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
  kernelNodeParams.extra = nullptr;
  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, &kernelNodeParams));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C2, graph2, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A2, &kernel_vecSUB, 1));
  HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B2, &kernel_vecSUB, 1));
  HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecSUB, &memcpy_C2, 1));
  HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorSUB(A_h, B_h, C_h, N);
  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph1));
  HIP_CHECK(hipGraphDestroy(graph2));
}


/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 2 nodes of MemCpy & a Kernel node and copy back result using memcpy
       and Instantiate graph & update new graph with similar node structure with
       api hipGraphExecUpdate and verify the result, the updated node should reflect.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecUpdate", "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as used created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecUpdate(stream);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecUpdate(stream);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecUpdate(stream);
      }
    }
  }
}

#if HT_NVIDIA
static void hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop(const hipStream_t& stream) {
  constexpr int kNumNode = 45;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
  hipGraphNode_t memCpy1, memCpy2, memCpy3;
  hipGraphNode_t memCpy21, memCpy22, memCpy23;
  std::vector<hipGraphNode_t> kNode(kNumNode);
  std::vector<hipGraphNode_t> kNode2(kNumNode);
  hipGraph_t graph, graph2;
  hipGraphExec_t graphExec;
  hipGraphNode_t hErrorNode_out;
  hipGraphExecUpdateResult updateResult_out;

  int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
  int *A_d2, *B_d2, *C_d2, *A_h2, *B_h2, *C_h2;
  HipTest::initArrays(&A_d2, &B_d2, &C_d2, &A_h2, &B_h2, &C_h2, N, false);

  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParams{};
    void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
    kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
    kernelNodeParams.gridDim = dim3(blocks);
    kernelNodeParams.blockDim = dim3(threadsPerBlock);
    kernelNodeParams.sharedMemBytes = 0;
    kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
    kernelNodeParams.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorADD(A_h, B_h, C_h, N);

  HIP_CHECK(hipGraphCreate(&graph2, 0));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy21, graph2, nullptr, 0, A_d2, A_h2, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy22, graph2, nullptr, 0, B_d2, B_h2, Nbytes,
                                    hipMemcpyHostToDevice));
  HIP_CHECK(hipGraphAddDependencies(graph2, &memCpy21, &memCpy22, 1));

  for (int i = 0; i < kNumNode; i++) {
    hipKernelNodeParams kernelNodeParam{};
    void* kernelArgs2[] = {&A_d2, &B_d2, &C_d2, reinterpret_cast<void*>(&NElem)};
    kernelNodeParam.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
    kernelNodeParam.gridDim = dim3(blocks);
    kernelNodeParam.blockDim = dim3(threadsPerBlock);
    kernelNodeParam.sharedMemBytes = 0;
    kernelNodeParam.kernelParams = reinterpret_cast<void**>(kernelArgs2);
    kernelNodeParam.extra = nullptr;
    HIP_CHECK(hipGraphAddKernelNode(&kNode2[i], graph2, nullptr, 0, &kernelNodeParam));
    if (i == 0) {
      HIP_CHECK(hipGraphAddDependencies(graph2, &memCpy22, &kNode2[i], 1));
    } else {
      HIP_CHECK(hipGraphAddDependencies(graph2, &kNode2[i - 1], &kNode2[i], 1));
    }
  }
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy23, graph2, nullptr, 0, C_h2, C_d2, Nbytes,
                                    hipMemcpyDeviceToHost));
  HIP_CHECK(hipGraphAddDependencies(graph2, &kNode2[kNumNode - 1], &memCpy23, 1));

  HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out));
  HIP_CHECK(hipGraphLaunch(graphExec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  // Verify graph execution result
  HipTest::checkVectorSUB(A_h2, B_h2, C_h2, N);

  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HipTest::freeArrays(A_d2, B_d2, C_d2, A_h2, B_h2, C_h2, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipGraphDestroy(graph2));
}

/**
 * Test Description
 * ------------------------
 *  - Validate hipGraph performance with doorbell set.
 *  - DEBUG_CLR_GRAPH_PACKET_CAPTURE
 *  1) Added 2 nodes of MemCpy & a Kernel node in sequence and copy back result using memcpy
       and Instantiate graph & update new graph with similar node structure with
       api hipGraphExecUpdate and verify the result, the updated node should reflect.
    i)   Check with Multi device case.
    ii)  Pass stream as user created stream
    iii) Pass stream as default stream
    iv)  Pass stream as hipStreamPerThread
 * Test source
 * ------------------------
 *  - unit/graph/hipGraphPerf.cc
 * Test requirements
 * ------------------------
 *  - HIP_VERSION >= 6.1
 */
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop",
          "[multigpu]") {
  if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
    HipTest::HIP_SKIP_TEST(
        "Unable to turn on "
        "DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
    return;
  }

  hipStream_t stream;
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  SECTION("Multi device test with different type of stream") {
    for (int i = 0; i < numDevices; i++) {
      HIP_CHECK(hipSetDevice(i));

      SECTION("Pass stream as used created stream") {
        HIP_CHECK(hipStreamCreate(&stream));
        hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop(stream);
        HIP_CHECK(hipStreamDestroy(stream));
      }
      SECTION("Pass stream as default stream") {
        stream = 0;
        hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop(stream);
      }
      SECTION("Pass stream as hipStreamPerThread") {
        stream = hipStreamPerThread;
        hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop(stream);
      }
    }
  }
}
#endif
#endif  //  #if __linux__

/**
 * End doxygen group GraphTest.
 * @}
 */