2680 řádky
110 KiB
C++
2680 řádky
110 KiB
C++
/*
|
|
Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
/**
|
|
* @addtogroup hipGraphPerfCheck hipGraphPerfCheck
|
|
* @{
|
|
* @ingroup GraphTest
|
|
* `hipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
|
|
* const hipGraphNode_t* pDependencies, size_t numDependencies,
|
|
* const hipKernelNodeParams* pNodeParams)` -
|
|
* Creates a kernel execution node and adds it to a graph.
|
|
* Optimize HIPGraph Performance.
|
|
*/
|
|
|
|
#ifdef __linux__ // windows machine build failing refer ticket SWDEV-440611
|
|
|
|
#include <hip_test_common.hh>
|
|
#include <hip_test_checkers.hh>
|
|
#include <hip_test_kernels.hh>
|
|
#include <hip_test_defgroups.hh>
|
|
|
|
#ifdef _WIN64
|
|
#define setenv(x, y, z) _putenv_s(x, y)
|
|
#endif
|
|
|
|
static constexpr int N = 1024;
|
|
static constexpr int Nbytes = N * sizeof(int);
|
|
static size_t NElem{N};
|
|
static constexpr int blocksPerCU = 6; // to hide latency
|
|
static constexpr int threadsPerBlock = 256;
|
|
|
|
__device__ int globalTo1[N];
|
|
__device__ int globalTo2[N];
|
|
__device__ int globalFrom1[N];
|
|
__device__ int globalFrom2[N];
|
|
|
|
static bool verifyVectorSquare(int* A_h, int* C_h, size_t size) {
|
|
for (size_t i = 0; i < size; i++) {
|
|
if (C_h[i] != A_h[i] * A_h[i]) {
|
|
INFO("VectorSquare A and C not matching at " << i);
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/* - Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
|
|
sequence and copy back the result and verify. */
|
|
static void checkGraphcontinuousKernelCall(const unsigned int kNumNode) {
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
hipStream_t stream;
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
|
|
/* - Added multiple nodes of MemCpy, Kernel node continuously for
|
|
2 block & copy back result in MemCpy. */
|
|
static void checkGraphcontinuousKernelCallIn2Blocks(const unsigned int kNumNode1,
|
|
const unsigned int kNumNode2) {
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4;
|
|
std::vector<hipGraphNode_t> kNode1(kNumNode1), kNode2(kNumNode2);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
hipStream_t stream;
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode1; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode1[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode1[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode1[i - 1], &kNode1[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode1[kNumNode1 - 1], &memCpy3, 1));
|
|
|
|
for (int i = 0; i < kNumNode2; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode2[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &kNode2[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode2[i - 1], &kNode2[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode2[kNumNode2 - 1], &memCpy4, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorSUB(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
|
|
/* - Added 2 nodes of MemCpy, Kernel node whicl compute the operation & copy
|
|
back result using MemCpy node. Call this multiple times sequentially. */
|
|
static void checkGraphMemcpyKernelMixCall(const unsigned int kNumIter) {
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
constexpr int kNumNode = 3;
|
|
std::vector<hipGraphNode_t> node(kNumIter * kNumNode);
|
|
std::vector<hipGraphNode_t> kNode(kNumIter);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
hipStream_t stream;
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
|
|
int i = 0;
|
|
for (int iter = 0; iter < kNumIter; iter++) {
|
|
i = kNumNode * iter;
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&node[i], graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&node[i + 1], graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[iter], graph, nullptr, 0, &kernelNodeParams));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&node[i + 2], graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
if (i != 0) HIP_CHECK(hipGraphAddDependencies(graph, &node[i - 1], &node[i], 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &node[i], &node[i + 1], 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &node[i + 1], &kNode[iter], 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[iter], &node[i + 2], 1));
|
|
}
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
|
|
/* - Added a nodes of MemCpy, MesSet, Kernel to do operation and copy back
|
|
result using MemCpy node and call above operation in sequence. */
|
|
static void checkGraphMemcpyMemsetKernelMixCall(const unsigned int kNumIter) {
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
constexpr int kNumNode = 3;
|
|
std::vector<hipGraphNode_t> node(kNumIter * kNumNode);
|
|
std::vector<hipGraphNode_t> kNode(kNumIter);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
int pitch_M = 0;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
hipStream_t stream;
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
|
|
int i = 0;
|
|
for (int iter = 0; iter < kNumIter; iter++) {
|
|
i = kNumNode * iter;
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&node[i], graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
|
|
hipMemsetParams memsetParams{};
|
|
memset(&memsetParams, 0, sizeof(memsetParams));
|
|
memsetParams.dst = reinterpret_cast<void*>(B_d);
|
|
memsetParams.value = 2;
|
|
memsetParams.pitch = pitch_M;
|
|
memsetParams.elementSize = sizeof(char);
|
|
memsetParams.width = N;
|
|
memsetParams.height = 1;
|
|
HIP_CHECK(hipGraphAddMemsetNode(&node[i + 1], graph, nullptr, 0, &memsetParams));
|
|
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[iter], graph, nullptr, 0, &kernelNodeParams));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&node[i + 2], graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
if (i != 0) HIP_CHECK(hipGraphAddDependencies(graph, &node[i - 1], &node[i], 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &node[i], &node[i + 1], 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &node[i + 1], &kNode[iter], 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[iter], &node[i + 2], 1));
|
|
}
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
REQUIRE(true == verifyVectorSquare(A_h, C_h, N));
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
|
|
/* - Added a EventRecordNode at start and 2 nodes of MemCpy, and multiple
|
|
node of Kernel call in continuous sequence and copy back the result and
|
|
add EventRecordNode at the end and verify the result. */
|
|
static void checkGraphEventcontinuousKernelCall(const unsigned int kNumNode) {
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
hipStream_t stream;
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
|
|
hipGraphNode_t event_start, event_final;
|
|
hipEvent_t eventstart, eventend;
|
|
HIP_CHECK(hipEventCreate(&eventstart));
|
|
HIP_CHECK(hipEventCreate(&eventend));
|
|
|
|
HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, nullptr, 0, eventstart));
|
|
HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, nullptr, 0, eventend));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memCpy1, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_final, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipEventSynchronize(eventend));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipEventDestroy(eventstart));
|
|
HIP_CHECK(hipEventDestroy(eventend));
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
|
|
/* - Added a EventRecordNode at start and Added multiple nodes of MemCpy, Kernel
|
|
node continuously and added one more EventRecordNode in mid for synchronize
|
|
and do similar MemCpy, Kernel node continuously in 2nd block & copy back
|
|
result in MemCpy and add EventRecordNode at the end for synchronize. */
|
|
static void checkGraphEventcontinuousKernelCallIn2Blocks(const unsigned int kNumNode1,
|
|
const unsigned int kNumNode2) {
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4;
|
|
std::vector<hipGraphNode_t> kNode1(kNumNode1), kNode2(kNumNode2);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
hipStream_t stream;
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
|
|
hipGraphNode_t event_start, event_mid, event_final;
|
|
hipEvent_t eventstart, eventmid, eventend;
|
|
HIP_CHECK(hipEventCreate(&eventstart));
|
|
HIP_CHECK(hipEventCreate(&eventmid));
|
|
HIP_CHECK(hipEventCreate(&eventend));
|
|
|
|
HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, nullptr, 0, eventstart));
|
|
HIP_CHECK(hipGraphAddEventRecordNode(&event_mid, graph, nullptr, 0, eventmid));
|
|
HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, nullptr, 0, eventend));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memCpy1, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode1; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode1[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode1[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode1[i - 1], &kNode1[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode1[kNumNode1 - 1], &memCpy3, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_mid, 1));
|
|
|
|
for (int i = 0; i < kNumNode2; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode2[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &kNode2[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode2[i - 1], &kNode2[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode2[kNumNode2 - 1], &memCpy4, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy4, &event_final, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipEventSynchronize(eventend));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorSUB(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipEventDestroy(eventstart));
|
|
HIP_CHECK(hipEventDestroy(eventmid));
|
|
HIP_CHECK(hipEventDestroy(eventend));
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Add multiple nodes of MemCpy, Kernel in sequence multiple times.
|
|
* 2) Add multiple nodes of MemCpy, MesSet, Kernel in sequence.
|
|
* 3) Add multiple nodes of MemCpy, Kernel node continuously & copy back result in MemCpy.
|
|
* 4) Add multiple nodes of MemCpy, Kernel node continuously for 2 block & copy back result in
|
|
MemCpy.
|
|
* 5) Add a EventRecordNode at start and 2 nodes of MemCpy, and multiple
|
|
node of Kernel call in continuous sequence and copy back the result and
|
|
add EventRecordNode at the end and verify the result.
|
|
* 6) Add a EventRecordNode at start and Added multiple nodes of MemCpy, Kernel
|
|
node continuously and added one more EventRecordNode in mid for synchronize
|
|
and do similar MemCpy, Kernel node continuously in 2nd block & copy back
|
|
result in MemCpy and add EventRecordNode at the end for synchronize.
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_MemcpyKernelMixCall") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
constexpr int kNumIter1 = 25;
|
|
constexpr int kNumIter2 = 30;
|
|
constexpr int kNumKNode1 = 15;
|
|
constexpr int kNumKNode2 = 45;
|
|
|
|
checkGraphMemcpyKernelMixCall(kNumIter1);
|
|
checkGraphMemcpyMemsetKernelMixCall(kNumIter2);
|
|
checkGraphcontinuousKernelCall(kNumKNode1);
|
|
checkGraphcontinuousKernelCallIn2Blocks(kNumKNode1, kNumKNode2);
|
|
checkGraphEventcontinuousKernelCall(kNumIter1);
|
|
checkGraphEventcontinuousKernelCallIn2Blocks(kNumKNode1, kNumKNode2);
|
|
|
|
checkGraphMemcpyKernelMixCall(kNumIter2);
|
|
checkGraphMemcpyMemsetKernelMixCall(kNumIter1);
|
|
checkGraphcontinuousKernelCall(kNumKNode2);
|
|
checkGraphcontinuousKernelCallIn2Blocks(kNumKNode2, kNumKNode1);
|
|
checkGraphEventcontinuousKernelCall(kNumIter2);
|
|
checkGraphEventcontinuousKernelCallIn2Blocks(kNumKNode2, kNumKNode1);
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams(const hipStream_t& stream) {
|
|
constexpr size_t N = 1024;
|
|
constexpr size_t Nbytes = N * sizeof(int);
|
|
constexpr auto blocksPerCU = 6; // to hide latency
|
|
constexpr auto threadsPerBlock = 256;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraph_t graph;
|
|
hipGraphNode_t memcpyNode, kNode;
|
|
hipKernelNodeParams kNodeParams{}, kNodeParams1{};
|
|
int *A_d, *B_d, *C_d;
|
|
int *A_h, *B_h, *C_h;
|
|
std::vector<hipGraphNode_t> dependencies;
|
|
hipGraphExec_t graphExec;
|
|
size_t NElem{N};
|
|
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNode, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
dependencies.push_back(memcpyNode);
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNode, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
dependencies.push_back(memcpyNode);
|
|
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kNodeParams.gridDim = dim3(blocks);
|
|
kNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
HIP_CHECK(
|
|
hipGraphAddKernelNode(&kNode, graph, dependencies.data(), dependencies.size(), &kNodeParams));
|
|
|
|
dependencies.clear();
|
|
dependencies.push_back(kNode);
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNode, graph, dependencies.data(), dependencies.size(),
|
|
C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
|
|
|
// Instantiate and launch the graph
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD<int>(A_h, B_h, C_h, N);
|
|
|
|
kNodeParams1.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
|
kNodeParams1.gridDim = dim3(blocks);
|
|
kNodeParams1.blockDim = dim3(threadsPerBlock);
|
|
kNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
|
|
// Instantiate again and launch the graph
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
|
|
HIP_CHECK(hipGraphExecKernelNodeSetParams(graphExec, kNode, &kNodeParams1));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorSUB<int>(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 2 nodes of MemCpy, and a node of Kernel call and
|
|
Instantiate graph and update kernelNodeParams for last kernel
|
|
and copy back the result and verify.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams",
|
|
"[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as used created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams(stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams(stream);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams(stream);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#if HT_NVIDIA
|
|
static void hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop(const hipStream_t& stream) {
|
|
constexpr int kNumNode = 35;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
hipKernelNodeParams kNodeParams1{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kNodeParams1.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
|
kNodeParams1.gridDim = dim3(blocks);
|
|
kNodeParams1.blockDim = dim3(threadsPerBlock);
|
|
kNodeParams1.sharedMemBytes = 0;
|
|
kNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kNodeParams1.extra = nullptr;
|
|
HIP_CHECK(hipGraphExecKernelNodeSetParams(graphExec, kNode[kNumNode - 1], &kNodeParams1));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorSUB(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 2 nodes of MemCpy, & multiple node of Kernel call in continuous sequence
|
|
and Instantiate graph & update kernelNodeParams with hipGraphExecKernelNodeSetParams
|
|
for last kernel and copy back the result and verify.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop",
|
|
"[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as used created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop(stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop(stream);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecKernelNodeSetParams_inLoop(stream);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 3 nodes of MemCpy, and node of Kernel call in continuous
|
|
sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParams
|
|
for source memCopy3 node and copy back the result and verify.
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
constexpr int kNumNode = 1;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
int *A_d1, *B_d1, *C_d1, *A_h1, *B_h1, *C_h1;
|
|
HipTest::initArrays(&A_d1, &B_d1, &C_d1, &A_h1, &B_h1, &C_h1, N, false);
|
|
|
|
hipStream_t stream;
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[0], graph, nullptr, 0, &kernelNodeParams));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[0], 1));
|
|
|
|
hipMemcpy3DParms myparams;
|
|
memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
|
|
myparams.srcPos = make_hipPos(0, 0, 0);
|
|
myparams.dstPos = make_hipPos(0, 0, 0);
|
|
myparams.extent = make_hipExtent(Nbytes, 1, 1);
|
|
myparams.dstPtr = make_hipPitchedPtr(A_h1, Nbytes, Nbytes, 1);
|
|
myparams.srcPtr = make_hipPitchedPtr(A_d1, Nbytes, Nbytes, 1);
|
|
myparams.kind = hipMemcpyDeviceToHost;
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode(&memCpy3, graph, nullptr, 0, &myparams));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
|
|
SECTION("Verifying with different memCopy node Params") {
|
|
memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
|
|
myparams.srcPos = make_hipPos(0, 0, 0);
|
|
myparams.dstPos = make_hipPos(0, 0, 0);
|
|
myparams.extent = make_hipExtent(Nbytes, 1, 1);
|
|
myparams.dstPtr = make_hipPitchedPtr(C_h, Nbytes, Nbytes, 1);
|
|
myparams.srcPtr = make_hipPitchedPtr(C_d, Nbytes, Nbytes, 1);
|
|
myparams.kind = hipMemcpyDeviceToHost;
|
|
|
|
HIP_CHECK(hipGraphExecMemcpyNodeSetParams(graphExec, memCpy3, &myparams));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
}
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HipTest::freeArrays(A_d1, B_d1, C_d1, A_h1, B_h1, C_h1, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop(const hipStream_t& stream) {
|
|
constexpr int kNumNode = 35;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int harray1D[N]{};
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
int *A_d1, *B_d1, *C_d1, *A_h1, *B_h1, *C_h1;
|
|
HipTest::initArrays(&A_d1, &B_d1, &C_d1, &A_h1, &B_h1, &C_h1, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
|
|
hipMemcpy3DParms myparams;
|
|
memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
|
|
myparams.srcPos = make_hipPos(0, 0, 0);
|
|
myparams.dstPos = make_hipPos(0, 0, 0);
|
|
myparams.extent = make_hipExtent(Nbytes, 1, 1);
|
|
myparams.dstPtr = make_hipPitchedPtr(harray1D, Nbytes, Nbytes, 1);
|
|
myparams.srcPtr = make_hipPitchedPtr(C_d, Nbytes, Nbytes, 1);
|
|
myparams.kind = hipMemcpyDeviceToHost;
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode(&memCpy3, graph, nullptr, 0, &myparams));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, harray1D, N);
|
|
|
|
memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
|
|
myparams.srcPos = make_hipPos(0, 0, 0);
|
|
myparams.dstPos = make_hipPos(0, 0, 0);
|
|
myparams.extent = make_hipExtent(Nbytes, 1, 1);
|
|
myparams.dstPtr = make_hipPitchedPtr(C_h, Nbytes, Nbytes, 1);
|
|
myparams.srcPtr = make_hipPitchedPtr(C_d, Nbytes, Nbytes, 1);
|
|
myparams.kind = hipMemcpyDeviceToHost;
|
|
|
|
HIP_CHECK(hipGraphExecMemcpyNodeSetParams(graphExec, memCpy3, &myparams));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HipTest::freeArrays(A_d1, B_d1, C_d1, A_h1, B_h1, C_h1, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
|
|
sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParams
|
|
for source memCopy3 node and copy back the result and verify.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop",
|
|
"[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as user created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop(stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop(stream);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams_inLoop(stream);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop(const hipStream_t& stream) {
|
|
constexpr int kNumNode = 35;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int* hData = reinterpret_cast<int*>(malloc(Nbytes));
|
|
REQUIRE(hData != nullptr);
|
|
for (int i = 0; i < N; ++i) hData[i] = 2 * i + 1;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
HIP_CHECK(hipGraphExecMemcpyNodeSetParams1D(graphExec, memCpy2, B_d, hData, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, hData, C_h, N);
|
|
|
|
free(hData);
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
|
|
sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParams1D
|
|
for source memCopy2 node and copy back the result and verify.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop",
|
|
"[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as user created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop(stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop(stream);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParams1D_inLoop(stream);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol(const hipStream_t& stream) {
|
|
constexpr int kNumNode = 35;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNodeToSymbol(&memCpy3, graph, nullptr, 0, HIP_SYMBOL(globalFrom1), C_d,
|
|
Nbytes, 0, hipMemcpyDeviceToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNodeFromSymbol(
|
|
&memCpy4, graph, nullptr, 0, C_h, HIP_SYMBOL(globalFrom2), Nbytes, 0, hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &memCpy4, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphExecMemcpyNodeSetParamsFromSymbol(
|
|
graphExec, memCpy4, C_h, HIP_SYMBOL(globalFrom1), Nbytes, 0, hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
|
|
sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParamsFromSymbol
|
|
for source memCopy4 node and copy back the result and verify.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol",
|
|
"[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as user created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol(stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol(stream);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsFrmSymbol(stream);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol(const hipStream_t& stream) {
|
|
constexpr int kNumNode = 35;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNodeToSymbol(&memCpy3, graph, nullptr, 0, HIP_SYMBOL(globalTo1), C_d,
|
|
Nbytes, 0, hipMemcpyDeviceToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNodeFromSymbol(&memCpy4, graph, nullptr, 0, C_h, HIP_SYMBOL(globalTo2),
|
|
Nbytes, 0, hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &memCpy4, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphExecMemcpyNodeSetParamsToSymbol(graphExec, memCpy3, HIP_SYMBOL(globalTo2), C_d,
|
|
Nbytes, 0, hipMemcpyDeviceToDevice));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorSUB(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
|
|
sequence and Instantiate graph and update hipGraphExecMemcpyNodeSetParamsToSymbol
|
|
for source memCopy3 node and copy back the result and verify.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol",
|
|
"[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as user created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol(stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol(stream);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecMemcpyNodeSetParamsToSymbol(stream);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(const hipStream_t& stream,
|
|
int test) {
|
|
constexpr int kNumNode = 35;
|
|
constexpr int memSetVal = 7, memSetVal2 = 9;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3, memSet;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
int size, elementSize;
|
|
|
|
if (test == 0) {
|
|
size = Nbytes;
|
|
elementSize = sizeof(char);
|
|
} else {
|
|
size = N;
|
|
elementSize = sizeof(int);
|
|
}
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
// Add MemSet Node
|
|
hipMemsetParams memsetParams{};
|
|
memset(&memsetParams, 0, sizeof(memsetParams));
|
|
memsetParams.dst = A_d;
|
|
memsetParams.value = memSetVal;
|
|
memsetParams.elementSize = elementSize;
|
|
memsetParams.width = size;
|
|
memsetParams.height = 1;
|
|
HIP_CHECK(hipGraphAddMemsetNode(&memSet, graph, nullptr, 0, &memsetParams));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &memSet, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memSet, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
if (test == 0) {
|
|
memset(A_h, memSetVal, size);
|
|
} else {
|
|
for (int i = 0; i < N; i++) {
|
|
A_h[i] = memSetVal;
|
|
}
|
|
}
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
// update MemSet Node using Exec
|
|
memset(&memsetParams, 0, sizeof(memsetParams));
|
|
memsetParams.dst = A_d;
|
|
memsetParams.value = memSetVal2;
|
|
memsetParams.elementSize = elementSize;
|
|
memsetParams.width = size;
|
|
memsetParams.height = 1;
|
|
HIP_CHECK(hipGraphExecMemsetNodeSetParams(graphExec, memSet, &memsetParams));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
if (test == 0) {
|
|
memset(A_h, memSetVal2, size);
|
|
} else {
|
|
for (int i = 0; i < N; i++) {
|
|
A_h[i] = memSetVal2;
|
|
}
|
|
}
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 2 nodes of MemCpy and add a memSet node, and multiple node of Kernel
|
|
call in continuous sequence and Instantiate graph and update memSet node with
|
|
hipGraphExecMemsetNodeSetParams api and verify the result.
|
|
i) Verify the memset with reset 1 byte (char size) block.
|
|
ii) Verify the memset with reset 4 byte (int size) block.
|
|
iii) Check with Multi device case.
|
|
iv) Pass stream as user created stream
|
|
v) Pass stream as default stream
|
|
vi) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams",
|
|
"[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as user created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 0);
|
|
hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 1);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 0);
|
|
hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 1);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 0);
|
|
hipGraph_PerfCheck_hipGraphExecMemsetNodeSetParams(stream, 1);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams(const hipStream_t& stream) {
|
|
constexpr int kNumNode = 35;
|
|
constexpr int memSetVal = 7, memSetVal2 = 9;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4;
|
|
hipGraphNode_t memSet1, memSet2, childGraphNode;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph, childGraph1, childGraph2;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphCreate(&childGraph1, 0));
|
|
HIP_CHECK(hipGraphCreate(&childGraph2, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
|
|
// Add MemSet Node
|
|
hipMemsetParams memsetParams{};
|
|
memset(&memsetParams, 0, sizeof(memsetParams));
|
|
memsetParams.dst = C_d;
|
|
memsetParams.value = memSetVal;
|
|
memsetParams.elementSize = sizeof(char);
|
|
memsetParams.width = Nbytes;
|
|
memsetParams.height = 1;
|
|
HIP_CHECK(hipGraphAddMemsetNode(&memSet1, childGraph1, nullptr, 0, &memsetParams));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, childGraph1, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph1, &memSet1, &memCpy3, 1));
|
|
// Adding childnode to graph
|
|
HIP_CHECK(hipGraphAddChildGraphNode(&childGraphNode, graph, nullptr, 0, childGraph1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &childGraphNode, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
memset(A_h, memSetVal, Nbytes);
|
|
for (unsigned int i = 0; i < N; i++) {
|
|
if (A_h[i] != C_h[i]) {
|
|
WARN("Validation failed at " << i << "\t" << C_h[i]);
|
|
REQUIRE(false);
|
|
}
|
|
}
|
|
|
|
memset(&memsetParams, 0, sizeof(memsetParams));
|
|
memsetParams.dst = C_d;
|
|
memsetParams.value = memSetVal2;
|
|
memsetParams.elementSize = sizeof(int);
|
|
memsetParams.width = N;
|
|
memsetParams.height = 1;
|
|
HIP_CHECK(hipGraphAddMemsetNode(&memSet2, childGraph2, nullptr, 0, &memsetParams));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, childGraph2, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph2, &memSet2, &memCpy4, 1));
|
|
|
|
// Update the childgraph node
|
|
HIP_CHECK(hipGraphExecChildGraphNodeSetParams(graphExec, childGraphNode, childGraph2));
|
|
// Launch Again and verify it once
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
for (unsigned int i = 0; i < N; i++) {
|
|
if (memSetVal2 != C_h[i]) {
|
|
WARN("Validation failed at " << i << "\t" << C_h[i]);
|
|
REQUIRE(false);
|
|
}
|
|
}
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipGraphDestroy(childGraph1));
|
|
HIP_CHECK(hipGraphDestroy(childGraph2));
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_Kernel(
|
|
const hipStream_t& stream) {
|
|
constexpr int kNumNode = 35;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4, childGraphNode;
|
|
hipGraphNode_t memCpyC11, memCpyC12, memCpyC13, kNodeC1;
|
|
hipGraphNode_t memCpyC21, memCpyC22, memCpyC23, kNodeC2;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph, childGraph1, childGraph2;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
int *A_d1, *B_d1, *C_d1, *A_h1, *B_h1, *C_h1;
|
|
HipTest::initArrays(&A_d1, &B_d1, &C_d1, &A_h1, &B_h1, &C_h1, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphCreate(&childGraph1, 0));
|
|
HIP_CHECK(hipGraphCreate(&childGraph2, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
|
|
// Add child graph
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC11, childGraph1, nullptr, 0, A_d1, A_h1, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC12, childGraph1, nullptr, 0, B_d1, B_h1, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC11, &memCpyC12, 1));
|
|
|
|
hipKernelNodeParams kNodeParams{};
|
|
void* kernelArgsC[] = {&A_d1, &C_d1, reinterpret_cast<void*>(&NElem)};
|
|
kNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
|
|
kNodeParams.gridDim = dim3(blocks);
|
|
kNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kNodeParams.sharedMemBytes = 0;
|
|
kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgsC);
|
|
kNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNodeC1, childGraph1, nullptr, 0, &kNodeParams));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC13, childGraph1, nullptr, 0, C_h1, C_d1, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC12, &kNodeC1, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph1, &kNodeC1, &memCpyC13, 1));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, childGraph1, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC13, &memCpy4, 1));
|
|
|
|
// Adding childnode to graph
|
|
HIP_CHECK(hipGraphAddChildGraphNode(&childGraphNode, graph, nullptr, 0, childGraph1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &childGraphNode, 1));
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorSUB(A_h, B_h, C_h, N); // MainGraph o/p verification
|
|
REQUIRE(true == verifyVectorSquare(A_h1, C_h1, N)); // ChildGraph o/p verify
|
|
|
|
// new child graph
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC21, childGraph2, nullptr, 0, A_d1, A_h1, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC22, childGraph2, nullptr, 0, B_d1, B_h1, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC21, &memCpyC22, 1));
|
|
|
|
memset(&kNodeParams, 0x00, sizeof(hipKernelNodeParams));
|
|
void* kernelArgC[] = {&A_d1, &B_d1, &C_d1, reinterpret_cast<void*>(&NElem)};
|
|
kNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kNodeParams.gridDim = dim3(blocks);
|
|
kNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kNodeParams.sharedMemBytes = 0;
|
|
kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgC);
|
|
kNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNodeC2, childGraph2, nullptr, 0, &kNodeParams));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC23, childGraph2, nullptr, 0, C_h1, C_d1, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC22, &kNodeC2, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph2, &kNodeC2, &memCpyC23, 1));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, childGraph2, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC23, &memCpy3, 1));
|
|
|
|
// Update the childgraph node
|
|
HIP_CHECK(hipGraphExecChildGraphNodeSetParams(graphExec, childGraphNode, childGraph2));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify modified graph execution result
|
|
HipTest::checkVectorSUB(A_h, B_h, C_h, N); // MainGraph o/p verification
|
|
HipTest::checkVectorADD(A_h1, B_h1, C_h1, N); // ChildGraph o/p verification
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HipTest::freeArrays(A_d1, B_d1, C_d1, A_h1, B_h1, C_h1, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipGraphDestroy(childGraph1));
|
|
HIP_CHECK(hipGraphDestroy(childGraph2));
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_mKernel(
|
|
const hipStream_t& stream) {
|
|
constexpr int kNumNode = 35;
|
|
constexpr int kNumNodeChild = 45;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3, memCpy4, childGraphNode;
|
|
hipGraphNode_t memCpyC11, memCpyC12, memCpyC13;
|
|
hipGraphNode_t memCpyC21, memCpyC22, memCpyC23;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
std::vector<hipGraphNode_t> kNodeC1(kNumNodeChild);
|
|
std::vector<hipGraphNode_t> kNodeC2(kNumNodeChild);
|
|
hipGraph_t graph, childGraph1, childGraph2;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
int *A_d1, *B_d1, *C_d1, *A_h1, *B_h1, *C_h1;
|
|
HipTest::initArrays(&A_d1, &B_d1, &C_d1, &A_h1, &B_h1, &C_h1, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphCreate(&childGraph1, 0));
|
|
HIP_CHECK(hipGraphCreate(&childGraph2, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
|
|
// Add child graph
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC11, childGraph1, nullptr, 0, A_d1, A_h1, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC12, childGraph1, nullptr, 0, B_d1, B_h1, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC11, &memCpyC12, 1));
|
|
|
|
for (int i = 0; i < kNumNodeChild; i++) {
|
|
hipKernelNodeParams kNodeParams{};
|
|
void* kernelArgs[] = {&A_d1, &C_d1, reinterpret_cast<void*>(&NElem)};
|
|
kNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
|
|
kNodeParams.gridDim = dim3(blocks);
|
|
kNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kNodeParams.sharedMemBytes = 0;
|
|
kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNodeC1[i], childGraph1, nullptr, 0, &kNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC12, &kNodeC1[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph1, &kNodeC1[i - 1], &kNodeC1[i], 1));
|
|
}
|
|
}
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC13, childGraph1, nullptr, 0, C_h1, C_d1, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph1, &kNodeC1[kNumNodeChild - 1], &memCpyC13, 1));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy4, childGraph1, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph1, &memCpyC13, &memCpy4, 1));
|
|
|
|
// Adding childnode to graph
|
|
HIP_CHECK(hipGraphAddChildGraphNode(&childGraphNode, graph, nullptr, 0, childGraph1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &childGraphNode, 1));
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorSUB(A_h, B_h, C_h, N); // MainGraph o/p verification
|
|
REQUIRE(true == verifyVectorSquare(A_h1, C_h1, N)); // ChildGraph o/p verify
|
|
|
|
// new child graph
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC21, childGraph2, nullptr, 0, A_d1, A_h1, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC22, childGraph2, nullptr, 0, B_d1, B_h1, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC21, &memCpyC22, 1));
|
|
|
|
for (int i = 0; i < kNumNodeChild; i++) {
|
|
hipKernelNodeParams kNodeParams{};
|
|
void* kernelArgs[] = {&A_d1, &B_d1, &C_d1, reinterpret_cast<void*>(&NElem)};
|
|
kNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kNodeParams.gridDim = dim3(blocks);
|
|
kNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kNodeParams.sharedMemBytes = 0;
|
|
kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNodeC2[i], childGraph2, nullptr, 0, &kNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC22, &kNodeC2[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph2, &kNodeC2[i - 1], &kNodeC2[i], 1));
|
|
}
|
|
}
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpyC23, childGraph2, nullptr, 0, C_h1, C_d1, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph2, &kNodeC2[kNumNodeChild - 1], &memCpyC23, 1));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, childGraph2, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(childGraph2, &memCpyC23, &memCpy3, 1));
|
|
|
|
// Update the childgraph node
|
|
HIP_CHECK(hipGraphExecChildGraphNodeSetParams(graphExec, childGraphNode, childGraph2));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify modified graph execution result
|
|
HipTest::checkVectorSUB(A_h, B_h, C_h, N); // MainGraph o/p verification
|
|
HipTest::checkVectorADD(A_h1, B_h1, C_h1, N); // ChildGraph o/p verification
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HipTest::freeArrays(A_d1, B_d1, C_d1, A_h1, B_h1, C_h1, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipGraphDestroy(childGraph1));
|
|
HIP_CHECK(hipGraphDestroy(childGraph2));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
|
|
sequence and add a child graph node in the end and Instantiate graph and
|
|
update the graphExec with hipGraphExecChildGraphNodeSetParams for child
|
|
graph node which will copy back the result and verify.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
2) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
|
|
sequence and add a child graph which contain a kernel operation and
|
|
add this child graph as a node in the end of main graph & Instantiate graph.
|
|
update the graphExec with hipGraphExecChildGraphNodeSetParams for child
|
|
graph node with similar topology which will copy back the result and verify.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
3) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
|
|
sequence and add a child graph which contain a kernel call in continuous
|
|
sequence operation and add this child graph as a node in the end of main
|
|
graph & Instantiate main graph and launch and check result of main graph.
|
|
update the graphExec with hipGraphExecChildGraphNodeSetParams for child
|
|
graph node with similar topology which will copy back the result and verify.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams",
|
|
"[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as user created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams(stream);
|
|
hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_Kernel(stream);
|
|
hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_mKernel(stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams(stream);
|
|
hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_Kernel(stream);
|
|
hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_mKernel(stream);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams(stream);
|
|
hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_Kernel(stream);
|
|
hipGraph_PerfCheck_hipGraphExecChildGraphNodeSetParams_mKernel(stream);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent(const hipStream_t& stream) {
|
|
constexpr int kNumNode = 35;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
hipEvent_t event_start, event1_end, event2_end;
|
|
HIP_CHECK(hipEventCreate(&event_start));
|
|
HIP_CHECK(hipEventCreate(&event1_end));
|
|
HIP_CHECK(hipEventCreate(&event2_end));
|
|
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
// Create nodes with event_start and event1_end
|
|
hipGraphNode_t event_start_rec, event_end_rec;
|
|
HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, event_start));
|
|
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &memCpy1, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0, event1_end));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_end_rec, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorSUB(A_h, B_h, C_h, N);
|
|
|
|
float t1 = 0.0f;
|
|
HIP_CHECK(hipEventElapsedTime(&t1, event_start, event1_end));
|
|
REQUIRE(t1 > 0.0f);
|
|
|
|
// Change the event at event_end_rec node to event2_end
|
|
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, event_end_rec, event2_end));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorSUB(A_h, B_h, C_h, N);
|
|
|
|
// Validate the changed events
|
|
float t2 = 0.0f;
|
|
HIP_CHECK(hipEventElapsedTime(&t2, event_start, event2_end));
|
|
REQUIRE(t2 > 0.0f);
|
|
|
|
// Validate the changed events and initial event
|
|
float t3 = 0.0f;
|
|
HIP_CHECK(hipEventElapsedTime(&t3, event1_end, event2_end));
|
|
REQUIRE(t3 > 0.0f);
|
|
|
|
// Free resources
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipEventDestroy(event_start));
|
|
HIP_CHECK(hipEventDestroy(event1_end));
|
|
HIP_CHECK(hipEventDestroy(event2_end));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added event start node at the begining and than add 2 nodes of MemCpy,
|
|
and multiple node of Kernel call in continuous sequence and add a child
|
|
graph node in the end and Instantiate graph and update the graphExec with
|
|
hipGraphExecEventRecordNodeSetEvent and added a graph node which will copy
|
|
back the result and add event end node at the end and verify the time elapse.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent",
|
|
"[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as user created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent(stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent(stream);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecEventRecordNodeSetEvent(stream);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent_waitKrnl(
|
|
const hipStream_t& stream) {
|
|
constexpr int kNumNode = 35;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
// Create events
|
|
hipEvent_t event, event2;
|
|
HIP_CHECK(hipEventCreate(&event));
|
|
HIP_CHECK(hipEventCreate(&event2));
|
|
hipGraphNode_t event_wait_node;
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
|
|
hipGraphNode_t event_start_rec;
|
|
HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, event2));
|
|
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, event));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_start_rec, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &event_wait_node, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec, event_wait_node, event2));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipEventDestroy(event));
|
|
HIP_CHECK(hipEventDestroy(event2));
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent(const hipStream_t& stream) {
|
|
constexpr int kNumNode = 45;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
// Create events
|
|
hipEvent_t event, event2;
|
|
HIP_CHECK(hipEventCreate(&event));
|
|
HIP_CHECK(hipEventCreate(&event2));
|
|
hipGraphNode_t event_wait_node;
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, event));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &event_wait_node, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec, event_wait_node, event2));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipEventDestroy(event));
|
|
HIP_CHECK(hipEventDestroy(event2));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
|
|
sequence and add a wait event node in the end and Instantiate graph and
|
|
update the graphExec with hipGraphExecEventWaitNodeSetEvent node and a
|
|
graph node which will copy back the result and add event end node at the
|
|
end and verify.
|
|
2) Added 2 nodes of MemCpy, and multiple node of Kernel call in continuous
|
|
sequence and add a wait event node in the end and Instantiate graph and
|
|
add a wait kernel and memcpy node to copy back the result.
|
|
update the graphExec with hipGraphExecEventWaitNodeSetEvent node and a
|
|
graph node which will copy back the result and add event end node at the
|
|
end and verify.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent",
|
|
"[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as user created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent(stream);
|
|
hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent_waitKrnl(stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent(stream);
|
|
hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent_waitKrnl(stream);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent(stream);
|
|
hipGraph_PerfCheck_hipGraphExecEventWaitNodeSetEvent_waitKrnl(stream);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void callBackFunc_1(void* A_h) {
|
|
int* A = reinterpret_cast<int*>(A_h);
|
|
for (int i = 0; i < N; i++) {
|
|
A[i] = i + i;
|
|
}
|
|
}
|
|
static void callBackFunc_1_Verify(int* C_h) {
|
|
for (int i = 0; i < N; i++) {
|
|
if (C_h[i] != (i + i)) {
|
|
INFO("Validation failed i " << i << "C_h[i] " << C_h[i]);
|
|
REQUIRE(false);
|
|
}
|
|
}
|
|
}
|
|
|
|
void callBackFunc_2(void* A_h) {
|
|
int* A = reinterpret_cast<int*>(A_h);
|
|
for (int i = 0; i < N; i++) {
|
|
A[i] = i * i;
|
|
}
|
|
}
|
|
static void callBackFunc_2_Verify(int* C_h) {
|
|
for (int i = 0; i < N; i++) {
|
|
if (C_h[i] != (i * i)) {
|
|
INFO("Validation failed i " << i << "C_h[i] " << C_h[i]);
|
|
REQUIRE(false);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecHostNodeSetParams(const hipStream_t& stream) {
|
|
constexpr int kNumNode = 45;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
hipGraph_t graph;
|
|
hipGraphExec_t graphExec;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
hipGraphNode_t hostNode;
|
|
hipHostNodeParams hostParams = {0, 0};
|
|
hostParams.fn = callBackFunc_1;
|
|
hostParams.userData = C_h;
|
|
HIP_CHECK(hipGraphAddHostNode(&hostNode, graph, nullptr, 0, &hostParams));
|
|
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy3, &hostNode, 1));
|
|
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify execution result
|
|
callBackFunc_1_Verify(C_h);
|
|
|
|
hipHostNodeParams sethostParam = {0, 0};
|
|
sethostParam.fn = callBackFunc_2;
|
|
sethostParam.userData = C_h;
|
|
|
|
HIP_CHECK(hipGraphExecHostNodeSetParams(graphExec, hostNode, &sethostParam));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
callBackFunc_2_Verify(C_h);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 2 nodes of MemCpy, & multiple node of Kernel call in continuous sequence
|
|
and Instantiate graph & add a host node and launch the graph and verify result.
|
|
Now update the host node parameters using api hipGraphExecHostNodeSetParams
|
|
and verify the result which reflect modified data.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecHostNodeSetParams",
|
|
"[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as used created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecHostNodeSetParams(stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecHostNodeSetParams(stream);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecHostNodeSetParams(stream);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void hipGraph_PerfCheck_hipGraphExecUpdate(const hipStream_t& stream) {
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
size_t NElem{N};
|
|
hipGraphNode_t memcpy_A, memcpy_B, memcpy_C;
|
|
hipGraphNode_t memcpy_A2, memcpy_B2, memcpy_C2;
|
|
hipGraphNode_t kernel_vecADD, kernel_vecSUB;
|
|
hipGraph_t graph1, graph2;
|
|
hipGraphExec_t graphExec;
|
|
hipGraphNode_t hErrorNode_out;
|
|
hipGraphExecUpdateResult updateResult_out;
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph1, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, &kernelNodeParams));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecADD, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecADD, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecADD, &memcpy_C, 1));
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph2, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A2, graph2, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B2, graph2, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
memset(&kernelNodeParams, 0x00, sizeof(hipKernelNodeParams));
|
|
void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, &kernelNodeParams));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C2, graph2, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A2, &kernel_vecSUB, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B2, &kernel_vecSUB, 1));
|
|
HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecSUB, &memcpy_C2, 1));
|
|
HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorSUB(A_h, B_h, C_h, N);
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph1));
|
|
HIP_CHECK(hipGraphDestroy(graph2));
|
|
}
|
|
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 2 nodes of MemCpy & a Kernel node and copy back result using memcpy
|
|
and Instantiate graph & update new graph with similar node structure with
|
|
api hipGraphExecUpdate and verify the result, the updated node should reflect.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecUpdate", "[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as used created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecUpdate(stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecUpdate(stream);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecUpdate(stream);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#if HT_NVIDIA
|
|
static void hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop(const hipStream_t& stream) {
|
|
constexpr int kNumNode = 45;
|
|
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
|
hipGraphNode_t memCpy1, memCpy2, memCpy3;
|
|
hipGraphNode_t memCpy21, memCpy22, memCpy23;
|
|
std::vector<hipGraphNode_t> kNode(kNumNode);
|
|
std::vector<hipGraphNode_t> kNode2(kNumNode);
|
|
hipGraph_t graph, graph2;
|
|
hipGraphExec_t graphExec;
|
|
hipGraphNode_t hErrorNode_out;
|
|
hipGraphExecUpdateResult updateResult_out;
|
|
|
|
int *A_d, *B_d, *C_d, *A_h, *B_h, *C_h;
|
|
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
|
int *A_d2, *B_d2, *C_d2, *A_h2, *B_h2, *C_h2;
|
|
HipTest::initArrays(&A_d2, &B_d2, &C_d2, &A_h2, &B_h2, &C_h2, N, false);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy1, graph, nullptr, 0, A_d, A_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy2, graph, nullptr, 0, B_d, B_h, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy1, &memCpy2, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParams{};
|
|
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
|
kernelNodeParams.gridDim = dim3(blocks);
|
|
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParams.sharedMemBytes = 0;
|
|
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
|
kernelNodeParams.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode[i], graph, nullptr, 0, &kernelNodeParams));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &memCpy2, &kNode[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[i - 1], &kNode[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy3, graph, nullptr, 0, C_h, C_d, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph, &kNode[kNumNode - 1], &memCpy3, 1));
|
|
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
|
|
|
HIP_CHECK(hipGraphCreate(&graph2, 0));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy21, graph2, nullptr, 0, A_d2, A_h2, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy22, graph2, nullptr, 0, B_d2, B_h2, Nbytes,
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipGraphAddDependencies(graph2, &memCpy21, &memCpy22, 1));
|
|
|
|
for (int i = 0; i < kNumNode; i++) {
|
|
hipKernelNodeParams kernelNodeParam{};
|
|
void* kernelArgs2[] = {&A_d2, &B_d2, &C_d2, reinterpret_cast<void*>(&NElem)};
|
|
kernelNodeParam.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
|
kernelNodeParam.gridDim = dim3(blocks);
|
|
kernelNodeParam.blockDim = dim3(threadsPerBlock);
|
|
kernelNodeParam.sharedMemBytes = 0;
|
|
kernelNodeParam.kernelParams = reinterpret_cast<void**>(kernelArgs2);
|
|
kernelNodeParam.extra = nullptr;
|
|
HIP_CHECK(hipGraphAddKernelNode(&kNode2[i], graph2, nullptr, 0, &kernelNodeParam));
|
|
if (i == 0) {
|
|
HIP_CHECK(hipGraphAddDependencies(graph2, &memCpy22, &kNode2[i], 1));
|
|
} else {
|
|
HIP_CHECK(hipGraphAddDependencies(graph2, &kNode2[i - 1], &kNode2[i], 1));
|
|
}
|
|
}
|
|
HIP_CHECK(hipGraphAddMemcpyNode1D(&memCpy23, graph2, nullptr, 0, C_h2, C_d2, Nbytes,
|
|
hipMemcpyDeviceToHost));
|
|
HIP_CHECK(hipGraphAddDependencies(graph2, &kNode2[kNumNode - 1], &memCpy23, 1));
|
|
|
|
HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out));
|
|
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
|
HIP_CHECK(hipStreamSynchronize(stream));
|
|
|
|
// Verify graph execution result
|
|
HipTest::checkVectorSUB(A_h2, B_h2, C_h2, N);
|
|
|
|
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
|
HipTest::freeArrays(A_d2, B_d2, C_d2, A_h2, B_h2, C_h2, false);
|
|
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
|
HIP_CHECK(hipGraphDestroy(graph));
|
|
HIP_CHECK(hipGraphDestroy(graph2));
|
|
}
|
|
|
|
/**
|
|
* Test Description
|
|
* ------------------------
|
|
* - Validate hipGraph performance with doorbell set.
|
|
* - DEBUG_CLR_GRAPH_PACKET_CAPTURE
|
|
* 1) Added 2 nodes of MemCpy & a Kernel node in sequence and copy back result using memcpy
|
|
and Instantiate graph & update new graph with similar node structure with
|
|
api hipGraphExecUpdate and verify the result, the updated node should reflect.
|
|
i) Check with Multi device case.
|
|
ii) Pass stream as user created stream
|
|
iii) Pass stream as default stream
|
|
iv) Pass stream as hipStreamPerThread
|
|
* Test source
|
|
* ------------------------
|
|
* - unit/graph/hipGraphPerf.cc
|
|
* Test requirements
|
|
* ------------------------
|
|
* - HIP_VERSION >= 6.1
|
|
*/
|
|
TEST_CASE("Unit_hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop",
|
|
"[multigpu]") {
|
|
if ((setenv("DEBUG_CLR_GRAPH_PACKET_CAPTURE", "true", 1)) != 0) {
|
|
HipTest::HIP_SKIP_TEST(
|
|
"Unable to turn on "
|
|
"DEBUG_CLR_GRAPH_PACKET_CAPTURE, hence exit!");
|
|
return;
|
|
}
|
|
|
|
hipStream_t stream;
|
|
int numDevices = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
|
|
|
SECTION("Multi device test with different type of stream") {
|
|
for (int i = 0; i < numDevices; i++) {
|
|
HIP_CHECK(hipSetDevice(i));
|
|
|
|
SECTION("Pass stream as used created stream") {
|
|
HIP_CHECK(hipStreamCreate(&stream));
|
|
hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop(stream);
|
|
HIP_CHECK(hipStreamDestroy(stream));
|
|
}
|
|
SECTION("Pass stream as default stream") {
|
|
stream = 0;
|
|
hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop(stream);
|
|
}
|
|
SECTION("Pass stream as hipStreamPerThread") {
|
|
stream = hipStreamPerThread;
|
|
hipGraph_PerfCheck_hipGraphExecUpdate_kernel_inLoop(stream);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
#endif // #if __linux__
|
|
|
|
/**
|
|
* End doxygen group GraphTest.
|
|
* @}
|
|
*/
|