/* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include static constexpr int launches = 5; /** * The tests in this file are added to see the performance improvement with the * Alloc node detection optimization task : SWDEV-490864 */ /** * @addtogroup hipGraphLaunch hipGraphLaunch * @{ * @ingroup GraphTest * `hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream);` * - Launches an executable graph in the specified stream. */ /** * In fillKernel, all elements of the array filled with given value */ static __global__ void fillKernel(int* arr, int size, int value) { int offset = blockDim.x * blockIdx.x + threadIdx.x; int stride = blockDim.x * gridDim.x; for (int i = offset; i < size; i += stride) { arr[i] = value; } } /** * In addOneKernel, all elements of the array are incremented by 1 */ static __global__ void addOneKernel(int* arr, int size) { int offset = blockDim.x * blockIdx.x + threadIdx.x; int stride = blockDim.x * gridDim.x; for (int i = offset; i < size; i += stride) { arr[i] += 1; } } /** * In addKernel, Array1 and Array2 will be added by element wise * and stored in Array 1 */ static __global__ void addKernel(int* arr1, int* arr2, int size) { int offset = blockDim.x * blockIdx.x + threadIdx.x; int stride = blockDim.x * gridDim.x; for (int i = offset; i < size; i += stride) { arr1[i] = arr1[i] + arr2[i]; } } /** * Test Description * ------------------------ * - This test case, tests the following scenario : * - 1) Create 1024 Mem alloc Nodes, make them serial dependent. * - (Node 1 depends on Node 0, Node 2 depends on Node 1, and so on) * - 2) Create 1024 Mem free Nodes. * - (Node 0 depends on last created mem alloc node, * - Node 1 depends on Node 0, Node 2 depends on Node 1, and so on) * - 3) Launch the graph repeatedly * - 4) Capture the Graph exection time and Synchronization time. * * Test source * ------------------------ * - catch/perftests/graph/hipPerfGraphLaunch.cc * * Test requirements * ------------------------ * - HIP_VERSION >= 6.4 */ TEST_CASE("Perf_GraphWithMoreAllocFreeNodes_SingleBranchNoOperations") { constexpr int numberOfNodes = 1024; int* devMem[numberOfNodes]; for (int i = 0; i < numberOfNodes; i++) { devMem[i] = nullptr; } hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); hipGraph_t graph; HIP_CHECK(hipGraphCreate(&graph, 0)); hipGraphNode_t memAllocNode[numberOfNodes], memFreeNode[numberOfNodes]; // Prapare Mem Alloc Nodes for (int i = 0; i < numberOfNodes; i++) { hipMemAllocNodeParams memAllocNodeParams{}; memAllocNodeParams.poolProps.allocType = hipMemAllocationTypePinned; memAllocNodeParams.poolProps.handleTypes = hipMemHandleTypeNone; memAllocNodeParams.poolProps.location.type = hipMemLocationTypeDevice; memAllocNodeParams.poolProps.location.id = 0; memAllocNodeParams.bytesize = sizeof(int); if (i == 0) { HIP_CHECK(hipGraphAddMemAllocNode(&memAllocNode[i], graph, nullptr, 0, &memAllocNodeParams)); } else { ::std::vector memAllocNodeDependencies; memAllocNodeDependencies.push_back(memAllocNode[i - 1]); HIP_CHECK(hipGraphAddMemAllocNode(&memAllocNode[i], graph, memAllocNodeDependencies.data(), memAllocNodeDependencies.size(), &memAllocNodeParams)); } devMem[i] = reinterpret_cast(memAllocNodeParams.dptr); REQUIRE(devMem[i] != nullptr); } // Prapare Mem Free Nodes for (int i = 0; i < numberOfNodes; i++) { if (i == 0) { ::std::vector memFreeNodeDependencies; memFreeNodeDependencies.push_back(memAllocNode[numberOfNodes - 1]); HIP_CHECK(hipGraphAddMemFreeNode(&memFreeNode[i], graph, memFreeNodeDependencies.data(), memFreeNodeDependencies.size(), reinterpret_cast(devMem[i]))); } else { ::std::vector memFreeNodeDependencies; memFreeNodeDependencies.push_back(memFreeNode[i - 1]); HIP_CHECK(hipGraphAddMemFreeNode(&memFreeNode[i], graph, memFreeNodeDependencies.data(), memFreeNodeDependencies.size(), reinterpret_cast(devMem[i]))); } } hipGraphExec_t graphExec; HIP_CHECK(hipGraphInstantiateWithFlags(&graphExec, graph, 0)); // Warm up call HIP_CHECK(hipGraphLaunch(graphExec, stream)); HIP_CHECK(hipStreamSynchronize(stream)); std::cout << "Graph launches = " << launches << std::endl; auto launch_start = std::chrono::high_resolution_clock::now(); for (int itr = 1; itr <= launches; itr++) { HIP_CHECK(hipGraphLaunch(graphExec, stream)); } auto launch_stop = std::chrono::high_resolution_clock::now(); auto launch_result = std::chrono::duration(launch_stop - launch_start); auto sync_start = std::chrono::high_resolution_clock::now(); HIP_CHECK(hipStreamSynchronize(stream)); auto sync_stop = std::chrono::high_resolution_clock::now(); auto sync_result = std::chrono::duration(sync_stop - sync_start); std::cout << "Time taken to Execute : " << std::chrono::duration_cast(launch_result).count() << " millisecs " << std::endl; std::cout << "Time taken to Synchronize : " << std::chrono::duration_cast(sync_result).count() << " millisecs " << std::endl; HIP_CHECK(hipGraphExecDestroy(graphExec)); HIP_CHECK(hipGraphDestroy(graph)); HIP_CHECK(hipStreamDestroy(stream)); } /** * Test Description * ------------------------ * - This test case, tests the following scenario : * - 1) Create 100 Mem alloc Nodes, make them serial dependent. * - (Node 1 depends on Node 0, Node 2 depends on Node 1, and so on) * - 2) Create 100 Memset Nodes. * - (Node 0 depends on last created mem alloc node, * - Node 1 depends on Node 0, Node 2 depends on Node 1, and so on) * - 3) Create 100 Kernel Nodes. * - (Node 0 depends on last created mem set node, * - Node 1 depends on Node 0, Node 2 depends on Node 1, and so on) * - 4) Create 100 Memcpy Nodes. * - (Node 0 depends on last created kernel node, * - Node 1 depends on Node 0, Node 2 depends on Node 1, and so on) * - 5) Create 100 Mem free Nodes. * - (Node 0 depends on last created mem copy node, * - Node 1 depends on Node 0, Node 2 depends on Node 1, and so on) * - 6) Launch the graph repeatedly * - 7) Capture the Graph exection time and Synchronization time. * * Test source * ------------------------ * - catch/perftests/graph/hipPerfGraphLaunch.cc * * Test requirements * ------------------------ * - HIP_VERSION >= 6.4 */ TEST_CASE("Perf_GraphWithMoreAllocFreeNodes_SerialNodesSingleBranchWithOps") { constexpr int SIZE = 100; char* dev[SIZE]; for (int i = 0; i < SIZE; i++) { dev[i] = nullptr; } char hostDst[SIZE]; for (int i = 0; i < SIZE; i++) { hostDst[i] = 0; } hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); hipGraph_t graph; HIP_CHECK(hipGraphCreate(&graph, 0)); hipGraphNode_t memAllocNode[SIZE], memsetNode[SIZE], kernelNode[SIZE], memcpyNode[SIZE], memFreeNode[SIZE]; // Prapare Mem alloc Nodes for (int i = 0; i < SIZE; i++) { hipMemAllocNodeParams memAllocNodeParams{}; memAllocNodeParams.poolProps.allocType = hipMemAllocationTypePinned; memAllocNodeParams.poolProps.handleTypes = hipMemHandleTypeNone; memAllocNodeParams.poolProps.location.type = hipMemLocationTypeDevice; memAllocNodeParams.poolProps.location.id = 0; memAllocNodeParams.bytesize = sizeof(char); if (i == 0) { HIP_CHECK(hipGraphAddMemAllocNode(&memAllocNode[i], graph, nullptr, 0, &memAllocNodeParams)); } else { ::std::vector memAllocNodeDependencies; memAllocNodeDependencies.push_back(memAllocNode[i - 1]); HIP_CHECK(hipGraphAddMemAllocNode(&memAllocNode[i], graph, memAllocNodeDependencies.data(), memAllocNodeDependencies.size(), &memAllocNodeParams)); } dev[i] = reinterpret_cast(memAllocNodeParams.dptr); REQUIRE(dev[i] != nullptr); } // Prapare Memset Nodes for (int i = 0; i < SIZE; i++) { hipMemsetParams pMemsetParams{}; pMemsetParams.dst = reinterpret_cast(dev[i]); pMemsetParams.elementSize = 1; pMemsetParams.height = 1; pMemsetParams.pitch = 1; pMemsetParams.value = i; pMemsetParams.width = 1; ::std::vector memsetNodeDependencies; if (i == 0) { memsetNodeDependencies.push_back(memAllocNode[SIZE - 1]); } else { memsetNodeDependencies.push_back(memsetNode[i - 1]); } HIP_CHECK(hipGraphAddMemsetNode(&memsetNode[i], graph, memsetNodeDependencies.data(), memsetNodeDependencies.size(), &pMemsetParams)); } // Prapare Kernel Nodes for (int i = 0; i < SIZE; i++) { hipKernelNodeParams kernelNodeParams{}; kernelNodeParams.func = reinterpret_cast(addOneKernel); kernelNodeParams.gridDim = dim3(1, 1, 1); kernelNodeParams.blockDim = dim3(1, 1, 1); kernelNodeParams.sharedMemBytes = 0; int size = 1; void* kernelArgs[2] = {reinterpret_cast(&dev[i]), reinterpret_cast(&size)}; kernelNodeParams.kernelParams = kernelArgs; kernelNodeParams.extra = nullptr; ::std::vector kernelNodeDependencies; if (i == 0) { kernelNodeDependencies.push_back(memsetNode[SIZE - 1]); } else { kernelNodeDependencies.push_back(kernelNode[i - 1]); } HIP_CHECK(hipGraphAddKernelNode(&kernelNode[i], graph, kernelNodeDependencies.data(), kernelNodeDependencies.size(), &kernelNodeParams)); } // Prapare Memcpy Nodes for (int i = 0; i < SIZE; i++) { hipMemcpy3DParms pMemcpyParams{}; pMemcpyParams.srcPos = make_hipPos(0, 0, 0); pMemcpyParams.dstPos = make_hipPos(0, 0, 0); pMemcpyParams.srcPtr = make_hipPitchedPtr(dev[i], 1, 1, 1); pMemcpyParams.dstPtr = make_hipPitchedPtr(&hostDst[i], 1, 1, 1); pMemcpyParams.extent = make_hipExtent(1, 1, 1); pMemcpyParams.kind = hipMemcpyDeviceToHost; ::std::vector memcpyNodeDependencies; if (i == 0) { memcpyNodeDependencies.push_back(kernelNode[SIZE - 1]); } else { memcpyNodeDependencies.push_back(memcpyNode[i - 1]); } HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode[i], graph, memcpyNodeDependencies.data(), memcpyNodeDependencies.size(), &pMemcpyParams)); } // Prapare Mem free Nodes for (int i = 0; i < SIZE; i++) { if (i == 0) { ::std::vector memFreeNodeDependencies; memFreeNodeDependencies.push_back(memcpyNode[SIZE - 1]); HIP_CHECK(hipGraphAddMemFreeNode(&memFreeNode[i], graph, memFreeNodeDependencies.data(), memFreeNodeDependencies.size(), reinterpret_cast(dev[i]))); } else { ::std::vector memFreeNodeDependencies; memFreeNodeDependencies.push_back(memFreeNode[i - 1]); HIP_CHECK(hipGraphAddMemFreeNode(&memFreeNode[i], graph, memFreeNodeDependencies.data(), memFreeNodeDependencies.size(), reinterpret_cast(dev[i]))); } } hipGraphExec_t graphExec; HIP_CHECK(hipGraphInstantiateWithFlags(&graphExec, graph, 0)); // Warm up call HIP_CHECK(hipGraphLaunch(graphExec, stream)); HIP_CHECK(hipStreamSynchronize(stream)); std::cout << "Graph launches = " << launches << std::endl; auto launch_start = std::chrono::high_resolution_clock::now(); for (int itr = 1; itr <= launches; itr++) { HIP_CHECK(hipGraphLaunch(graphExec, stream)); } auto launch_stop = std::chrono::high_resolution_clock::now(); auto launch_result = std::chrono::duration(launch_stop - launch_start); auto sync_start = std::chrono::high_resolution_clock::now(); HIP_CHECK(hipStreamSynchronize(stream)); auto sync_stop = std::chrono::high_resolution_clock::now(); auto sync_result = std::chrono::duration(sync_stop - sync_start); std::cout << "Time taken to Execute : " << std::chrono::duration_cast(launch_result).count() << " millisecs " << std::endl; std::cout << "Time taken to Synchronize : " << std::chrono::duration_cast(sync_result).count() << " millisecs " << std::endl; for (int i = 0; i < SIZE; i++) { REQUIRE(hostDst[i] == (i + 1)); } HIP_CHECK(hipGraphExecDestroy(graphExec)); HIP_CHECK(hipGraphDestroy(graph)); HIP_CHECK(hipStreamDestroy(stream)); } /** * Test Description * ------------------------ * - This test case, tests the following scenario : * - 1) Create memcpy Node ( H2D ) as a root node * - 2) Create 10 Mem Alloc nodes, 10 Fill Kernel Nodes, 10 Add Kernel Nodes * - and 10 memcpy Nodes (D2H). * - a) All Mem Alloc nodes depends memcpy H2D Node. * - ( Mem Alloc Node 0 to 9, depends on memcpy H2D Node) * - b) Fill Kernel Node 0 depends on Mem Alloc Node 0, * - Fill Kernel Node 1 depends on Mem Alloc Node 1, and so on. * - c) Add Kernel Node 0 depends on Fill Kernel Node 0, * - Add Kernel Node 1 depends on Fill Kernel Node 1, and so on. * - d) MemcpyD2H Node 0 depends on Add Kernel Node 0, * - MemcpyD2H Node 1 depends on Add Kernel Node 1, and so on. * - 3) Create MemcpyH2H, which depend on all the 10 MemcpyD2H Nodes * - 4) Launch the graph repeatedly * - 5) Capture the Graph exection time and Synchronization time. * * Test source * ------------------------ * - catch/perftests/graph/hipPerfGraphLaunch.cc * * Test requirements * ------------------------ * - HIP_VERSION >= 6.4 */ TEST_CASE("Perf_GraphWithMoreAllocFreeNodes_MultipleBranches") { constexpr int SIZE = 1024; constexpr size_t NBYTES = SIZE * sizeof(int); constexpr int BRANCHES = 10; int value = 100; int* hostMemSrc = new int[SIZE]; REQUIRE(hostMemSrc != nullptr); int* devMemSrc1 = nullptr; HIP_CHECK(hipMalloc(&devMemSrc1, NBYTES)); REQUIRE(devMemSrc1 != nullptr); int* devMemSrc2[BRANCHES]; for (int i = 0; i < BRANCHES; i++) { devMemSrc2[i] = nullptr; } int hostMemDst[BRANCHES][SIZE]; int finalHostDst[BRANCHES * SIZE]; hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); hipGraph_t graph; HIP_CHECK(hipGraphCreate(&graph, 0)); hipGraphNode_t memcpyNodeH2D, memAllocNode[BRANCHES], fillKernelNode[BRANCHES], addKernelNode[BRANCHES], memcpyNodeD2H[BRANCHES], memFreeNode[BRANCHES], memcpyNodeH2H; // Add H2D Node HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNodeH2D, graph, nullptr, 0, devMemSrc1, hostMemSrc, NBYTES, hipMemcpyHostToDevice)); for (int branch = 0; branch < BRANCHES; branch++) { // Add Mem alloc Nodes ::std::vector memAllocNodeDependencies; memAllocNodeDependencies.push_back(memcpyNodeH2D); hipMemAllocNodeParams memAllocNodeParams{}; memAllocNodeParams.poolProps.allocType = hipMemAllocationTypePinned; memAllocNodeParams.poolProps.handleTypes = hipMemHandleTypeNone; memAllocNodeParams.poolProps.location.type = hipMemLocationTypeDevice; memAllocNodeParams.poolProps.location.id = 0; memAllocNodeParams.bytesize = NBYTES; HIP_CHECK(hipGraphAddMemAllocNode(&memAllocNode[branch], graph, memAllocNodeDependencies.data(), memAllocNodeDependencies.size(), &memAllocNodeParams)); devMemSrc2[branch] = reinterpret_cast(memAllocNodeParams.dptr); REQUIRE(devMemSrc2[branch] != nullptr); // Add Kernel Nodes (fillKernel) ::std::vector kernelNodeDependencies; kernelNodeDependencies.push_back(memAllocNode[branch]); hipKernelNodeParams kernelNodeParams{}; kernelNodeParams.func = reinterpret_cast(fillKernel); kernelNodeParams.gridDim = dim3(1, 1, 1); kernelNodeParams.blockDim = dim3(1, 1, 1); kernelNodeParams.sharedMemBytes = 0; int size = SIZE; void* kernelArgs[3] = {reinterpret_cast(&devMemSrc2[branch]), reinterpret_cast(&size), reinterpret_cast(&value)}; kernelNodeParams.kernelParams = kernelArgs; kernelNodeParams.extra = nullptr; HIP_CHECK(hipGraphAddKernelNode(&fillKernelNode[branch], graph, kernelNodeDependencies.data(), kernelNodeDependencies.size(), &kernelNodeParams)); // Add Kernel Nodes (addKernel) ::std::vector kernelNodeDependencies2; kernelNodeDependencies2.push_back(fillKernelNode[branch]); hipKernelNodeParams kernelNodeParams2{}; kernelNodeParams2.func = reinterpret_cast(addKernel); kernelNodeParams2.gridDim = dim3(1, 1, 1); kernelNodeParams2.blockDim = dim3(1, 1, 1); kernelNodeParams2.sharedMemBytes = 0; int size2 = SIZE; void* kernelArgs2[3] = {reinterpret_cast(&devMemSrc2[branch]), reinterpret_cast(&devMemSrc1), reinterpret_cast(&size2)}; kernelNodeParams2.kernelParams = kernelArgs2; kernelNodeParams2.extra = nullptr; HIP_CHECK(hipGraphAddKernelNode(&addKernelNode[branch], graph, kernelNodeDependencies2.data(), kernelNodeDependencies2.size(), &kernelNodeParams2)); // Add D2H Nodes ::std::vector memcpyNodeD2HDependencies; memcpyNodeD2HDependencies.push_back(addKernelNode[branch]); HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNodeD2H[branch], graph, memcpyNodeD2HDependencies.data(), memcpyNodeD2HDependencies.size(), hostMemDst[branch], devMemSrc2[branch], NBYTES, hipMemcpyDeviceToHost)); ::std::vector memFreeNodeDependencies; memFreeNodeDependencies.push_back(memcpyNodeD2H[branch]); HIP_CHECK(hipGraphAddMemFreeNode(&memFreeNode[branch], graph, memFreeNodeDependencies.data(), memFreeNodeDependencies.size(), reinterpret_cast(devMemSrc2[branch]))); } // Add H2H Node ::std::vector memcpyNodeH2HDependencies; for (int i = 0; i < BRANCHES; i++) { memcpyNodeH2HDependencies.push_back(memFreeNode[i]); } HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNodeH2H, graph, memcpyNodeH2HDependencies.data(), memcpyNodeH2HDependencies.size(), finalHostDst, hostMemDst, BRANCHES * SIZE * sizeof(int), hipMemcpyHostToHost)); hipGraphExec_t graphExec; HIP_CHECK(hipGraphInstantiateWithFlags(&graphExec, graph, 0)); // Warm up call HIP_CHECK(hipGraphLaunch(graphExec, stream)); HIP_CHECK(hipStreamSynchronize(stream)); std::cout << "Graph launches : " << launches << std::endl; auto launch_start = std::chrono::high_resolution_clock::now(); for (int launch = 1; launch <= launches; launch++) { std::fill(hostMemSrc, hostMemSrc + SIZE, launch); HIP_CHECK(hipGraphLaunch(graphExec, stream)); } auto launch_stop = std::chrono::high_resolution_clock::now(); auto launch_result = std::chrono::duration(launch_stop - launch_start); auto sync_start = std::chrono::high_resolution_clock::now(); HIP_CHECK(hipStreamSynchronize(stream)); auto sync_stop = std::chrono::high_resolution_clock::now(); auto sync_result = std::chrono::duration(sync_stop - sync_start); std::cout << "Time taken to Execute : " << std::chrono::duration_cast(launch_result).count() << " millisecs " << std::endl; std::cout << "Time taken to Synchronize : " << std::chrono::duration_cast(sync_result).count() << " millisecs " << std::endl; for (int branch = 0; branch < BRANCHES; branch++) { for (int idx = 0; idx < SIZE; idx++) { REQUIRE(hostMemDst[branch][idx] == (launches + value)); } } for (int idx = 0; idx < BRANCHES * SIZE; idx++) { REQUIRE(finalHostDst[idx] == (launches + value)); } HIP_CHECK(hipGraphExecDestroy(graphExec)); HIP_CHECK(hipGraphDestroy(graph)); HIP_CHECK(hipStreamDestroy(stream)); HIP_CHECK(hipFree(devMemSrc1)); } /** * Test Description * ------------------------ * - This test case, tests the following scenario : * - 1) Create mem alloc Node * - 2) Create memset Node, which depends on mem alloc Node * - 3) Create kernel Node, which depends on memset Node * - 4) Create memcpy Node, which depends on kernel Node * - 5) Create Mem free Node, which depends on memcpy Node * - 6) Repeat the above 5 steps 10 times, and graph will be created with * - the 10 independent branches. * - 7) Launch the graph repeatedly * - 8) Capture the Graph exection time and Synchronization time. * * Test source * ------------------------ * - catch/perftests/graph/hipPerfGraphLaunch.cc * * Test requirements * ------------------------ * - HIP_VERSION >= 6.4 */ TEST_CASE("Perf_GraphWithMoreAllocFreeNodes_MultipleIndependentBranches") { constexpr int BRANCHES = 10; char* dev[BRANCHES]; for (int i = 0; i < BRANCHES; i++) { dev[i] = nullptr; } char hostDst[BRANCHES]; for (int i = 0; i < BRANCHES; i++) { hostDst[i] = 0; } hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); hipGraph_t graph; HIP_CHECK(hipGraphCreate(&graph, 0)); hipGraphNode_t memAllocNode[BRANCHES], memsetNode[BRANCHES], kernelNode[BRANCHES], memcpyNode[BRANCHES], memFreeNode[BRANCHES]; // Prapare Mem alloc Nodes for (int i = 0; i < BRANCHES; i++) { hipMemAllocNodeParams memAllocNodeParams{}; memAllocNodeParams.poolProps.allocType = hipMemAllocationTypePinned; memAllocNodeParams.poolProps.handleTypes = hipMemHandleTypeNone; memAllocNodeParams.poolProps.location.type = hipMemLocationTypeDevice; memAllocNodeParams.poolProps.location.id = 0; memAllocNodeParams.bytesize = sizeof(char); HIP_CHECK(hipGraphAddMemAllocNode(&memAllocNode[i], graph, nullptr, 0, &memAllocNodeParams)); dev[i] = reinterpret_cast(memAllocNodeParams.dptr); REQUIRE(dev[i] != nullptr); hipMemsetParams pMemsetParams{}; pMemsetParams.dst = reinterpret_cast(dev[i]); pMemsetParams.elementSize = 1; pMemsetParams.height = 1; pMemsetParams.pitch = 1; pMemsetParams.value = i; pMemsetParams.width = 1; ::std::vector memsetNodeDependencies; memsetNodeDependencies.push_back(memAllocNode[i]); HIP_CHECK(hipGraphAddMemsetNode(&memsetNode[i], graph, memsetNodeDependencies.data(), memsetNodeDependencies.size(), &pMemsetParams)); hipKernelNodeParams kernelNodeParams{}; kernelNodeParams.func = reinterpret_cast(addOneKernel); kernelNodeParams.gridDim = dim3(1, 1, 1); kernelNodeParams.blockDim = dim3(1, 1, 1); kernelNodeParams.sharedMemBytes = 0; int size = 1; void* kernelArgs[2] = {reinterpret_cast(&dev[i]), reinterpret_cast(&size)}; kernelNodeParams.kernelParams = kernelArgs; kernelNodeParams.extra = nullptr; ::std::vector kernelNodeDependencies; kernelNodeDependencies.push_back(memsetNode[i]); HIP_CHECK(hipGraphAddKernelNode(&kernelNode[i], graph, kernelNodeDependencies.data(), kernelNodeDependencies.size(), &kernelNodeParams)); hipMemcpy3DParms pMemcpyParams{}; pMemcpyParams.srcPos = make_hipPos(0, 0, 0); pMemcpyParams.dstPos = make_hipPos(0, 0, 0); pMemcpyParams.srcPtr = make_hipPitchedPtr(dev[i], 1, 1, 1); pMemcpyParams.dstPtr = make_hipPitchedPtr(&hostDst[i], 1, 1, 1); pMemcpyParams.extent = make_hipExtent(1, 1, 1); pMemcpyParams.kind = hipMemcpyDeviceToHost; ::std::vector memcpyNodeDependencies; memcpyNodeDependencies.push_back(kernelNode[i]); HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode[i], graph, memcpyNodeDependencies.data(), memcpyNodeDependencies.size(), &pMemcpyParams)); ::std::vector memFreeNodeDependencies; memFreeNodeDependencies.push_back(memcpyNode[i]); HIP_CHECK(hipGraphAddMemFreeNode(&memFreeNode[i], graph, memFreeNodeDependencies.data(), memFreeNodeDependencies.size(), reinterpret_cast(dev[i]))); } hipGraphExec_t graphExec; HIP_CHECK(hipGraphInstantiateWithFlags(&graphExec, graph, 0)); // Warm up call HIP_CHECK(hipGraphLaunch(graphExec, stream)); HIP_CHECK(hipStreamSynchronize(stream)); std::cout << "Graph launches = " << launches << std::endl; auto launch_start = std::chrono::high_resolution_clock::now(); for (int itr = 1; itr <= launches; itr++) { HIP_CHECK(hipGraphLaunch(graphExec, stream)); } auto launch_stop = std::chrono::high_resolution_clock::now(); auto launch_result = std::chrono::duration(launch_stop - launch_start); auto sync_start = std::chrono::high_resolution_clock::now(); HIP_CHECK(hipStreamSynchronize(stream)); auto sync_stop = std::chrono::high_resolution_clock::now(); auto sync_result = std::chrono::duration(sync_stop - sync_start); std::cout << "Time taken to Execute : " << std::chrono::duration_cast(launch_result).count() << " millisecs " << std::endl; std::cout << "Time taken to Synchronize : " << std::chrono::duration_cast(sync_result).count() << " millisecs " << std::endl; for (int i = 0; i < BRANCHES; i++) { REQUIRE(hostDst[i] == (i + 1)); } HIP_CHECK(hipGraphExecDestroy(graphExec)); HIP_CHECK(hipGraphDestroy(graph)); HIP_CHECK(hipStreamDestroy(stream)); } /** * Test Description * ------------------------ * - This test case, tests the following scenario : * - 1) Create 1024 Mem alloc Nodes. And make them serial dependent. * - (Node 1 depends on Node 0, Node 2 depends on Node 1, and so on) * - 2) Instantiate the graph with hipGraphInstantiateFlagAutoFreeOnLaunch flag * - 3) Launch the graph repeatedly * - 4) Capture the Graph exection time and Synchronization time. * * Test source * ------------------------ * - catch/perftests/graph/hipPerfGraphLaunch.cc * * Test requirements * ------------------------ * - HIP_VERSION >= 6.4 */ TEST_CASE("Perf_GraphWithMoreAllocFreeNodes_OneBranchNoOps_AutoFreeOnLaunch") { constexpr int SIZE = 1024; int* devMem[SIZE]; for (int i = 0; i < SIZE; i++) { devMem[i] = nullptr; } hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); hipGraph_t graph; HIP_CHECK(hipGraphCreate(&graph, 0)); hipGraphNode_t memAllocNode[SIZE]; for (int i = 0; i < SIZE; i++) { hipMemAllocNodeParams memAllocNodeParams{}; memAllocNodeParams.poolProps.allocType = hipMemAllocationTypePinned; memAllocNodeParams.poolProps.handleTypes = hipMemHandleTypeNone; memAllocNodeParams.poolProps.location.type = hipMemLocationTypeDevice; memAllocNodeParams.poolProps.location.id = 0; memAllocNodeParams.bytesize = sizeof(int); if (i == 0) { HIP_CHECK(hipGraphAddMemAllocNode(&memAllocNode[i], graph, nullptr, 0, &memAllocNodeParams)); } else { ::std::vector memAllocNodeDependencies; memAllocNodeDependencies.push_back(memAllocNode[i - 1]); HIP_CHECK(hipGraphAddMemAllocNode(&memAllocNode[i], graph, memAllocNodeDependencies.data(), memAllocNodeDependencies.size(), &memAllocNodeParams)); } devMem[i] = reinterpret_cast(memAllocNodeParams.dptr); REQUIRE(devMem[i] != nullptr); } hipGraphExec_t graphExec; HIP_CHECK( hipGraphInstantiateWithFlags(&graphExec, graph, hipGraphInstantiateFlagAutoFreeOnLaunch)); // Warm up call HIP_CHECK(hipGraphLaunch(graphExec, stream)); HIP_CHECK(hipStreamSynchronize(stream)); std::cout << "Graph launches = " << launches << std::endl; auto launch_start = std::chrono::high_resolution_clock::now(); for (int itr = 1; itr <= launches; itr++) { HIP_CHECK(hipGraphLaunch(graphExec, stream)); } auto launch_stop = std::chrono::high_resolution_clock::now(); auto launch_result = std::chrono::duration(launch_stop - launch_start); auto sync_start = std::chrono::high_resolution_clock::now(); HIP_CHECK(hipStreamSynchronize(stream)); auto sync_stop = std::chrono::high_resolution_clock::now(); auto sync_result = std::chrono::duration(sync_stop - sync_start); std::cout << "Time taken to Execute : " << std::chrono::duration_cast(launch_result).count() << " millisecs " << std::endl; std::cout << "Time taken to Synchronize : " << std::chrono::duration_cast(sync_result).count() << " millisecs " << std::endl; HIP_CHECK(hipGraphExecDestroy(graphExec)); HIP_CHECK(hipGraphDestroy(graph)); HIP_CHECK(hipStreamDestroy(stream)); } /** * End doxygen group GraphTest. * @} */