From 7c28abc5386fba09eb7063b6cb1bd4e5effbb2d6 Mon Sep 17 00:00:00 2001 From: "Arandjelovic, Marko" Date: Wed, 14 May 2025 16:42:23 +0200 Subject: [PATCH] SWDEV-505672 - Add performance test for HtoD and DtoH with kernel in-between (#27) [ROCm/hip-tests commit: 3257c44126c8bfe768a6bac54ab5174d01e7fbe0] --- .../catch/performance/memcpy/CMakeLists.txt | 1 + .../performance/memcpy/hipMemcpyWithKernel.cc | 124 ++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 projects/hip-tests/catch/performance/memcpy/hipMemcpyWithKernel.cc diff --git a/projects/hip-tests/catch/performance/memcpy/CMakeLists.txt b/projects/hip-tests/catch/performance/memcpy/CMakeLists.txt index e3ed71d9b5..8c173ba15c 100644 --- a/projects/hip-tests/catch/performance/memcpy/CMakeLists.txt +++ b/projects/hip-tests/catch/performance/memcpy/CMakeLists.txt @@ -30,6 +30,7 @@ set(TEST_SRC hipMemcpyDtoHAsync.cc hipMemcpyHtoD.cc hipMemcpyHtoDAsync.cc + hipMemcpyWithKernel.cc hipMemcpyToSymbol.cc hipMemcpyToSymbolAsync.cc hipMemcpyFromSymbol.cc diff --git a/projects/hip-tests/catch/performance/memcpy/hipMemcpyWithKernel.cc b/projects/hip-tests/catch/performance/memcpy/hipMemcpyWithKernel.cc new file mode 100644 index 0000000000..a6afd7050f --- /dev/null +++ b/projects/hip-tests/catch/performance/memcpy/hipMemcpyWithKernel.cc @@ -0,0 +1,124 @@ +/* +Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "memcpy_performance_common.hh" +/** + * @addtogroup memcpy memcpy + * @{ + * @ingroup PerformanceTest + */ +__global__ void Sum(void* ptr, size_t size) { + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + if (index != 0 && index < size) { + atomicAdd(&((unsigned long long*)ptr)[0], ((unsigned long long*)ptr)[index]); + } +} +class MemcpyHtoDKernelDtoHv1AsyncBenchmark : public Benchmark { + public: + void operator()(void* host_mem, void* device_mem, size_t size, const hipStream_t& stream) { + size_t count = size / sizeof(size_t); + for (size_t i = 0; i < count; i++) { + ((size_t*)host_mem)[i] = i; + } + TIMED_SECTION_STREAM(kTimerTypeCpu, stream) { + HIP_CHECK(hipMemcpyHtoDAsync((hipDeviceptr_t)device_mem, host_mem, size, stream)); + int threads_num = 32; + Sum<<>>(device_mem, count); + HIP_CHECK(hipMemcpyDtoHAsync(host_mem, (hipDeviceptr_t)device_mem, size, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + } + size_t sum = ((size_t*)host_mem)[0]; + REQUIRE(sum == count * (count - 1) / 2); + } +}; +class MemcpyHtoDKernelDtoHv2AsyncBenchmark : public Benchmark { + public: + void operator()(void* host_mem, void* device_mem, size_t size, const hipStream_t& stream) { + size_t count = size / sizeof(size_t); + for (size_t i = 0; i < count; i++) { + ((size_t*)host_mem)[i] = i; + } + TIMED_SECTION_STREAM(kTimerTypeCpu, stream) { + HIP_CHECK(hipMemcpyAsync((hipDeviceptr_t)device_mem, host_mem, size, hipMemcpyHostToDevice, + stream)); + int threads_num = 32; + Sum<<>>(device_mem, count); + HIP_CHECK(hipMemcpyWithStream(host_mem, (hipDeviceptr_t)device_mem, size, + hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + } + size_t sum = ((size_t*)host_mem)[0]; + REQUIRE(sum == count * (count - 1) / 2); + } +}; +template +static void RunBenchmark(LinearAllocs host_allocation_type, LinearAllocs device_allocation_type, + size_t size) { + BenchmarkType benchmark; + if (size < 1_KB) { + benchmark.AddSectionName(std::to_string(size)); + } else if (size < 1_MB) { + benchmark.AddSectionName(std::to_string(size / 1024) + std::string(" KB")); + } else { + benchmark.AddSectionName(std::to_string(size / (1024 * 1024)) + std::string(" MB")); + } + benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type)); + const StreamGuard stream_guard(Streams::created); + const hipStream_t stream = stream_guard.stream(); + LinearAllocGuard device_allocation(device_allocation_type, size); + LinearAllocGuard host_allocation(host_allocation_type, size); + benchmark.Run(host_allocation.ptr(), device_allocation.ptr(), size, stream); +} +/** + * Test Description + * ------------------------ + * - Executes `hipMemcpyHtoDAsync->Kernel->hipMemcpyDtoHAsync` from Device to Host: + * -# Allocation size + * - Small: 4 KB + * - Medium: 4 MB + * - Large: 16 MB + * -# Allocation type + * - Source: device malloc + * - Destination: host pinned and pageable + * Test source + * ------------------------ + * - performance/memcpy/hipMemcpyDtoHAsync.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipMemcpyHtoDKernelDtoHV1Async") { + const auto allocation_size = + GENERATE(16, 128, 1_KB, 4_KB, 16_KB, 256_KB, 512_KB, 1_MB, 4_MB, 16_MB, 128_MB); + const auto device_allocation_type = LinearAllocs::hipMalloc; + const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc); + RunBenchmark(host_allocation_type, device_allocation_type, + allocation_size); +} +TEST_CASE("Performance_hipMemcpyHtoDKernelDtoHV2Async") { + const auto allocation_size = + GENERATE(16, 128, 1_KB, 4_KB, 16_KB, 256_KB, 512_KB, 1_MB, 4_MB, 16_MB, 128_MB); + const auto device_allocation_type = LinearAllocs::hipMalloc; + const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc); + RunBenchmark(host_allocation_type, device_allocation_type, + allocation_size); +} +/** + * End doxygen group memcpy. + * @} + */