EXSWHTEC-249 - Implement Performance Tests for Memcpy APIs #119

Change-Id: Ib04fe4dd3efce92d7c7bfc8f0c75abd8e9dfe7be [ROCm/hip-tests commit: e3bac85a3c]
2023-12-28 18:30:45 +01:00
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2021 - 2024 Advanced Micro Devices, Inc. All rights reserved.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -186,7 +186,8 @@ THE SOFTWARE.
 * @}
 */

- /**
+/**
+ * @defgroup ComplexTest Complex type
 * @{
 * This section describes tests for the Complex type functions.
 * @}
@@ -19,6 +19,7 @@
 # THE SOFTWARE.

 add_subdirectory(memset)
+add_subdirectory(memcpy)
 add_subdirectory(kernelLaunch)
 add_subdirectory(stream)
 add_subdirectory(event)
@@ -0,0 +1,52 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+set(TEST_SRC
+    hipMemcpy.cc
+    hipMemcpyAsync.cc
+    hipMemcpyWithStream.cc
+    hipMemcpyAtoH.cc
+    hipMemcpyHtoA.cc
+    hipMemcpyDtoD.cc
+    hipMemcpyDtoDAsync.cc
+    hipMemcpyDtoH.cc
+    hipMemcpyDtoHAsync.cc
+    hipMemcpyHtoD.cc
+    hipMemcpyHtoDAsync.cc
+    hipMemcpyToSymbol.cc
+    hipMemcpyToSymbolAsync.cc
+    hipMemcpyFromSymbol.cc
+    hipMemcpyFromSymbolAsync.cc
+    hipMemcpy2D.cc
+    hipMemcpy2DAsync.cc
+    hipMemcpy2DToArray.cc
+    hipMemcpy2DToArrayAsync.cc
+    hipMemcpy2DFromArray.cc
+    hipMemcpy2DFromArrayAsync.cc
+    hipMemcpyParam2D.cc
+    hipMemcpyParam2DAsync.cc
+    hipMemcpy3D.cc
+    hipMemcpy3DAsync.cc
+)
+
+hip_add_exe_to_target(NAME MemcpyPerformance
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests
+                      COMPILE_OPTIONS -std=c++17)
@@ -0,0 +1,190 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ * Contains performance tests for all memcpy HIP APIs.
+ */
+
+class MemcpyBenchmark : public Benchmark<MemcpyBenchmark> {
+ public:
+  void operator()(void* dst, const void* src, size_t size, hipMemcpyKind kind) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpy(dst, src, size, kind));
+    }
+  }
+};
+
+static void RunBenchmark(LinearAllocs dst_allocation_type, LinearAllocs src_allocation_type,
+                         size_t size, hipMemcpyKind kind, bool enable_peer_access=false) {
+  MemcpyBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(src_allocation_type));
+  benchmark.AddSectionName(GetAllocationSectionName(dst_allocation_type));
+
+  if (kind != hipMemcpyDeviceToDevice) {
+    LinearAllocGuard<int> src_allocation(src_allocation_type, size);
+    LinearAllocGuard<int> dst_allocation(dst_allocation_type, size);
+    benchmark.Run(dst_allocation.ptr(), src_allocation.ptr(), size, kind);
+  } else {
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+    LinearAllocGuard<int> src_allocation(src_allocation_type, size);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard<int> dst_allocation(dst_allocation_type, size);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.ptr(), src_allocation.ptr(), size, kind);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy_DeviceToHost") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy_HostToDevice") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy_HostToHost") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  const auto dst_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToDevice, true);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy_DeviceToDevice_DisablePeerAccess") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToDevice);
+}
@@ -0,0 +1,183 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy2DBenchmark : public Benchmark<Memcpy2DBenchmark> {
+ public:
+  void operator()(void* dst, size_t dst_pitch, const void* src, size_t src_pitch, size_t width,
+                  size_t height, hipMemcpyKind kind) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpy2D(dst, dst_pitch, src, src_pitch, width, height, kind));
+    }
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind, bool enable_peer_access=false) {
+  Memcpy2DBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  if (kind == hipMemcpyDeviceToHost) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc,
+                                          device_allocation.width() * height);
+    benchmark.Run(host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.ptr(), device_allocation.pitch(),
+                  device_allocation.width(), device_allocation.height(),
+                  hipMemcpyDeviceToHost);
+  } else if (kind == hipMemcpyHostToDevice) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc,
+                                          device_allocation.width() * height);
+    benchmark.Run(device_allocation.ptr(), device_allocation.pitch(),
+                  host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.width(), device_allocation.height(),
+                  hipMemcpyHostToDevice);
+  } else if (kind == hipMemcpyHostToHost) {
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    benchmark.Run(dst_allocation.ptr(), width * sizeof(int), src_allocation.ptr(),
+                  width * sizeof(int), width * sizeof(int), height, hipMemcpyHostToHost);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+    LinearAllocGuard2D<int> src_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard2D<int> dst_allocation(width, height);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.ptr(), dst_allocation.pitch(),
+                  src_allocation.ptr(), src_allocation.pitch(),
+                  dst_allocation.width(), dst_allocation.height(),
+                  hipMemcpyDeviceToDevice);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2D` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2D_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2D` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2D_HostToDevice") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2D` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2D_HostToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2D` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2D_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2D` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2D_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,188 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy2DAsyncBenchmark : public Benchmark<Memcpy2DAsyncBenchmark> {
+ public:
+  void operator()(void* dst, size_t dst_pitch, const void* src, size_t src_pitch, size_t width,
+                  size_t height, hipMemcpyKind kind, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpy2DAsync(dst, dst_pitch, src, src_pitch, width, height, kind, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind, bool enable_peer_access=false) {
+  Memcpy2DAsyncBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+
+  if (kind == hipMemcpyDeviceToHost) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc,
+                                          device_allocation.width() * height);
+    benchmark.Run(host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.ptr(), device_allocation.pitch(),
+                  device_allocation.width(), device_allocation.height(),
+                  hipMemcpyDeviceToHost, stream);
+  } else if (kind == hipMemcpyHostToDevice) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc,
+                                          device_allocation.width() * height);
+    benchmark.Run(device_allocation.ptr(), device_allocation.pitch(),
+                  host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.width(), device_allocation.height(),
+                  hipMemcpyHostToDevice, stream);
+  } else if (kind == hipMemcpyHostToHost) {
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    benchmark.Run(dst_allocation.ptr(), width * sizeof(int), src_allocation.ptr(),
+                  width * sizeof(int), width * sizeof(int), height, hipMemcpyHostToHost, stream);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+    LinearAllocGuard2D<int> src_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard2D<int> dst_allocation(width, height);
+
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.ptr(), dst_allocation.pitch(),
+                  src_allocation.ptr(), src_allocation.pitch(),
+                  dst_allocation.width(), dst_allocation.height(),
+                  hipMemcpyDeviceToDevice, stream);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DAsync` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DAsync_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DAsync` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DAsync_HostToDevice") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DAsync` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DAsync_HostToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DAsync` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DAsync_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2D` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DAsync_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,127 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy2DFromArrayBenchmark : public Benchmark<Memcpy2DFromArrayBenchmark> {
+ public:
+  void operator()(void* dst, size_t dst_pitch, hipArray_const_t src, size_t width, size_t height, hipMemcpyKind kind) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpy2DFromArray(dst, dst_pitch, src, 0, 0, width, height, kind));
+    }
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind,
+                         bool enable_peer_access=false) {
+  Memcpy2DFromArrayBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  if (kind == hipMemcpyDeviceToHost) {
+    size_t allocation_size = width * height * sizeof(int);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, allocation_size);
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    benchmark.Run(host_allocation.ptr(), width * sizeof(int), array_allocation.ptr(),
+                  width * sizeof(int), height, hipMemcpyDeviceToHost);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(device_allocation.ptr(), device_allocation.pitch(),
+                  array_allocation.ptr(), device_allocation.width(),
+                  device_allocation.height(), hipMemcpyDeviceToDevice);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DFromArray` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DFromArray.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DFromArray_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DFromArray` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DFromArray.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DFromArray_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DFromArray` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DFromArray.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DFromArray_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,133 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy2DFromArrayAsyncBenchmark : public Benchmark<Memcpy2DFromArrayAsyncBenchmark> {
+ public:
+  void operator()(void* dst, size_t dst_pitch, hipArray_const_t src, size_t width, size_t height,
+                  hipMemcpyKind kind, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpy2DFromArrayAsync(dst, dst_pitch, src, 0, 0, width, height, kind, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind,
+                         bool enable_peer_access=false) {
+  Memcpy2DFromArrayAsyncBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+
+  if (kind == hipMemcpyDeviceToHost) {
+    size_t allocation_size = width * height * sizeof(int);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, allocation_size);
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    benchmark.Run(host_allocation.ptr(), width * sizeof(int),
+                  array_allocation.ptr(), width * sizeof(int),
+                  height, hipMemcpyDeviceToHost, stream);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(device_allocation.ptr(), device_allocation.pitch(),
+                  array_allocation.ptr(), device_allocation.width(),
+                  device_allocation.height(), hipMemcpyDeviceToDevice, stream);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DFromArrayAsync` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DFromArrayAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DFromArrayAsync_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DFromArrayAsync` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DFromArrayAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DFromArrayAsync_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DFromArrayAsync` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DFromArrayAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DFromArrayAsync_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,127 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy2DToArrayBenchmark : public Benchmark<Memcpy2DToArrayBenchmark> {
+ public:
+  void operator()(hipArray* dst, const void* src, size_t src_pitch, size_t width,
+                  size_t height, hipMemcpyKind kind) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpy2DToArray(dst, 0, 0, src, src_pitch, width, height, kind));
+    }
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind,
+                         bool enable_peer_access=false) {
+  Memcpy2DToArrayBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  if (kind == hipMemcpyHostToDevice) {
+    size_t allocation_size = width * height * sizeof(int);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, allocation_size);
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    benchmark.Run(array_allocation.ptr(), host_allocation.ptr(), width * sizeof(int),
+                  width * sizeof(int), height, hipMemcpyHostToDevice);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(array_allocation.ptr(), device_allocation.ptr(), device_allocation.pitch(),
+                  device_allocation.width(), device_allocation.height(), hipMemcpyDeviceToDevice);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DToArray` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DToArray.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DToArray_HostToDevice") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DToArray` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DToArray.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DToArray_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DToArray` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DToArray.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DToArray_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,133 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy2DToArrayAsyncBenchmark : public Benchmark<Memcpy2DToArrayAsyncBenchmark> {
+ public:
+  void operator()(hipArray* dst, const void* src, size_t src_pitch, size_t width,
+                  size_t height, hipMemcpyKind kind, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpy2DToArrayAsync(dst, 0, 0, src, src_pitch, width, height, kind, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind,
+                         bool enable_peer_access=false) {
+  Memcpy2DToArrayAsyncBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+
+  if (kind == hipMemcpyHostToDevice) {
+    size_t allocation_size = width * height * sizeof(int);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, allocation_size);
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    benchmark.Run(array_allocation.ptr(), host_allocation.ptr(),
+                  width * sizeof(int), width * sizeof(int), height,
+                  hipMemcpyHostToDevice, stream);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(array_allocation.ptr(), device_allocation.ptr(), device_allocation.pitch(),
+                  device_allocation.width(), device_allocation.height(),
+                  hipMemcpyDeviceToDevice, stream);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DToArrayAsync` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DToArrayAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DToArrayAsync_HostToDevice") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DToArrayAsync` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DToArrayAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DToArrayAsync_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DToArrayAsync` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DToArrayAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DToArrayAsync_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,189 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy3DBenchmark : public Benchmark<Memcpy3DBenchmark> {
+ public:
+  void operator()(const hipPitchedPtr& dst_ptr, const hipPitchedPtr& src_ptr,
+                  const hipExtent extent, hipMemcpyKind kind) {
+    hipMemcpy3DParms params = CreateMemcpy3DParam(dst_ptr, make_hipPos(0, 0, 0),
+                                                  src_ptr, make_hipPos(0, 0, 0),
+                                                  extent, kind);
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpy3D(&params));
+    }
+  }
+};
+
+static void RunBenchmark(const hipExtent extent, hipMemcpyKind kind, bool enable_peer_access=false) {
+  Memcpy3DBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(extent.width) + ", " + std::to_string(extent.height)
+                           + ", " + std::to_string(extent.depth) + ")");
+
+  if (kind == hipMemcpyDeviceToHost) {
+    LinearAllocGuard3D<int> device_allocation(extent);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.width() * 
+                                          device_allocation.height() * device_allocation.depth());
+    benchmark.Run(make_hipPitchedPtr(host_allocation.ptr(), device_allocation.width(), 
+                                     device_allocation.width(), device_allocation.height()),
+                  device_allocation.pitched_ptr(), device_allocation.extent(), kind);
+  } else if (kind == hipMemcpyHostToDevice) {
+    LinearAllocGuard3D<int> device_allocation(extent);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.pitch() * 
+                                          device_allocation.height() * device_allocation.depth());
+    benchmark.Run(device_allocation.pitched_ptr(),
+                  make_hipPitchedPtr(host_allocation.ptr(), device_allocation.pitch(),
+                                     device_allocation.width(), device_allocation.height()),
+                  device_allocation.extent(), kind);
+  } else if (kind == hipMemcpyHostToHost) {
+    LinearAllocGuard3D<int> device_allocation(extent);
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipHostMalloc, extent.width * 
+                                         extent.height * extent.depth);
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipHostMalloc, extent.width * 
+                                         extent.height * extent.depth);
+    benchmark.Run(make_hipPitchedPtr(dst_allocation.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPitchedPtr(src_allocation.ptr(), extent.width, extent.width, extent.height),
+                  extent, kind);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard3D<int> src_allocation(extent);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard3D<int> dst_allocation(extent);
+
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.pitched_ptr(), src_allocation.pitched_ptr(),
+                  dst_allocation.extent(), kind);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3D` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3D_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3D` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3D_HostToDevice") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3D` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3D_HostToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3D` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3D_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3D` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3D.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3D_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,192 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy3DAsyncBenchmark : public Benchmark<Memcpy3DAsyncBenchmark> {
+ public:
+  void operator()(const hipPitchedPtr& dst_ptr, const hipPitchedPtr& src_ptr,
+                  const hipExtent extent, hipMemcpyKind kind, const hipStream_t& stream) {
+    hipMemcpy3DParms params = CreateMemcpy3DParam(dst_ptr, make_hipPos(0, 0, 0),
+                                                  src_ptr, make_hipPos(0, 0, 0),
+                                                  extent, kind);
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpy3DAsync(&params, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(const hipExtent extent, hipMemcpyKind kind, bool enable_peer_access=false) {
+  Memcpy3DAsyncBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(extent.width) + ", " + std::to_string(extent.height)
+                           + ", " + std::to_string(extent.depth) + ")");
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+
+  if (kind == hipMemcpyDeviceToHost) {
+    LinearAllocGuard3D<int> device_allocation(extent);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.width() * 
+                                          device_allocation.height() * device_allocation.depth());
+    benchmark.Run(make_hipPitchedPtr(host_allocation.ptr(), device_allocation.width(),
+                                     device_allocation.width(), device_allocation.height()),
+                  device_allocation.pitched_ptr(), device_allocation.extent(), kind, stream);
+  } else if (kind == hipMemcpyHostToDevice) {
+    LinearAllocGuard3D<int> device_allocation(extent);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.pitch() * 
+                                          device_allocation.height() * device_allocation.depth());
+    benchmark.Run(device_allocation.pitched_ptr(),
+                  make_hipPitchedPtr(host_allocation.ptr(),
+                                     device_allocation.pitch(),
+                                     device_allocation.width(),
+                                     device_allocation.height()),
+                  device_allocation.extent(), kind, stream);
+  } else if (kind == hipMemcpyHostToHost) {
+    LinearAllocGuard3D<int> device_allocation(extent);
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipHostMalloc, extent.width * 
+                                         extent.height * extent.depth);
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipHostMalloc, extent.width * 
+                                         extent.height * extent.depth);
+    benchmark.Run(make_hipPitchedPtr(dst_allocation.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPitchedPtr(src_allocation.ptr(), extent.width, extent.width, extent.height),
+                  extent, kind, stream);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard3D<int> src_allocation(extent);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard3D<int> dst_allocation(extent);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.pitched_ptr(), src_allocation.pitched_ptr(),
+                  dst_allocation.extent(), kind, stream);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3DAsync` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3DAsync_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3DAsync` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3DAsync_HostToDevice") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3DAsync` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3DAsync_HostToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3DAsync` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3DAsync_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3DAsync` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3DAsync_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,192 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyAsyncBenchmark : public Benchmark<MemcpyAsyncBenchmark> {
+ public:
+  void operator()(void* dst, const void* src, size_t size, hipMemcpyKind kind, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpyAsync(dst, src, size, kind, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(LinearAllocs dst_allocation_type, LinearAllocs src_allocation_type,
+                         size_t size, hipMemcpyKind kind, bool enable_peer_access=false) {
+  MemcpyAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(src_allocation_type));
+  benchmark.AddSectionName(GetAllocationSectionName(dst_allocation_type));
+
+  const StreamGuard stream_guard{Streams::created};
+  const hipStream_t stream = stream_guard.stream();
+  if (kind != hipMemcpyDeviceToDevice) {
+    LinearAllocGuard<int> src_allocation(src_allocation_type, size);
+    LinearAllocGuard<int> dst_allocation(dst_allocation_type, size);
+    benchmark.Run(dst_allocation.ptr(), src_allocation.ptr(), size, kind, stream);
+  } else {
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard<int> src_allocation(src_allocation_type, size);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard<int> dst_allocation(dst_allocation_type, size);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.ptr(), src_allocation.ptr(), size, kind, stream);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyAsync` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyAsync_DeviceToHost") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyAsync` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyAsync_HostToDevice") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyAsync` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyAsync_HostToHost") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  const auto dst_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyAsync` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyAsync_DeviceToDevice_DisablePeerAccess") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyAsync` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyAsync_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,69 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyAtoHBenchmark : public Benchmark<MemcpyAtoHBenchmark> {
+ public:
+  void operator()(void* dst, hipArray* src_array, size_t allocation_size) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyAtoH(dst, src_array, 0, allocation_size));
+    }
+  }
+};
+
+static void RunBenchmark(LinearAllocs host_allocation_type, size_t width) {
+  MemcpyAtoHBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(width));
+  benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
+
+  size_t allocation_size = width * sizeof(int);
+  LinearAllocGuard<int> host_allocation(host_allocation_type, allocation_size);
+  ArrayAllocGuard<int> array_allocation(make_hipExtent(width, 0, 0), hipArrayDefault);
+  benchmark.Run(host_allocation.ptr(), array_allocation.ptr(), allocation_size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyAtoH` from Device array to Host:
+ *    -# Allocation size
+ *      - Small: 512 B
+ *      - Medium: 1024 B
+ *      - Large: 4096 B
+ *    -# Allocation type
+ *      - Host: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyAtoH.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyAtoH") {
+  const auto allocation_size = GENERATE(512, 1024, 4096);
+  const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(host_allocation_type, allocation_size);
+}
@@ -0,0 +1,103 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyDtoDBenchmark : public Benchmark<MemcpyDtoDBenchmark> {
+ public:
+  void operator()(hipDeviceptr_t& dst, const hipDeviceptr_t& src, size_t size) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyDtoD(dst, src, size));
+    }
+  }
+};
+
+static void RunBenchmark(size_t size, bool enable_peer_access=false) {
+  MemcpyDtoDBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+
+  int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+  int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+  LinearAllocGuard<int> src_allocation(LinearAllocs::hipMalloc, size);
+  HIP_CHECK(hipSetDevice(dst_device));
+  LinearAllocGuard<int> dst_allocation(LinearAllocs::hipMalloc, size);
+  HIP_CHECK(hipSetDevice(src_device));
+
+  benchmark.Run(reinterpret_cast<hipDeviceptr_t>(dst_allocation.ptr()),
+                reinterpret_cast<hipDeviceptr_t>(src_allocation.ptr()), size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyDtoD` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyDtoD.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyDtoD_PeerAccessEnabled") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(allocation_size, true);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyDtoD` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyDtoD.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyDtoD_PeerAccessDisabled") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(allocation_size);
+}
@@ -0,0 +1,106 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyDtoDAsyncBenchmark : public Benchmark<MemcpyDtoDAsyncBenchmark> {
+ public:
+  void operator()(hipDeviceptr_t& dst, const hipDeviceptr_t& src, size_t size, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpyDtoDAsync(dst, src, size, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(size_t size, bool enable_peer_access=false) {
+  MemcpyDtoDAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+  int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+  int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+  LinearAllocGuard<int> src_allocation(LinearAllocs::hipMalloc, size);
+  HIP_CHECK(hipSetDevice(dst_device));
+  LinearAllocGuard<int> dst_allocation(LinearAllocs::hipMalloc, size);
+  HIP_CHECK(hipSetDevice(src_device));
+  benchmark.Run(reinterpret_cast<hipDeviceptr_t>(dst_allocation.ptr()),
+                reinterpret_cast<hipDeviceptr_t>(src_allocation.ptr()),
+                size, stream);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyDtoDAsync` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyDtoDAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyDtoDAsync_PeerAccessEnabled") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(allocation_size, true);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyDtoD` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyDtoDAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyDtoDAsync_PeerAccessDisabled") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(allocation_size);
+}
@@ -0,0 +1,72 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyDtoHBenchmark : public Benchmark<MemcpyDtoHBenchmark> {
+ public:
+  void operator()(void* dst, const hipDeviceptr_t& src, size_t size) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyDtoH(dst, src, size));
+    }
+  }
+};
+
+static void RunBenchmark(LinearAllocs host_allocation_type, LinearAllocs device_allocation_type, size_t size) {
+  MemcpyDtoHBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
+
+  LinearAllocGuard<int> device_allocation(device_allocation_type, size);
+  LinearAllocGuard<int> host_allocation(host_allocation_type, size);
+  benchmark.Run(host_allocation.ptr(),
+                reinterpret_cast<hipDeviceptr_t>(device_allocation.ptr()),
+                size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyDtoH` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyDtoH.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyDtoH") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto device_allocation_type = LinearAllocs::hipMalloc;
+  const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(host_allocation_type, device_allocation_type, allocation_size);
+}
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyDtoHAsyncBenchmark : public Benchmark<MemcpyDtoHAsyncBenchmark> {
+ public:
+  void operator()(void* dst, const hipDeviceptr_t& src, size_t size, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpyDtoHAsync(dst, src, size, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(LinearAllocs host_allocation_type, LinearAllocs device_allocation_type, size_t size) {
+  MemcpyDtoHAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+  LinearAllocGuard<int> device_allocation(device_allocation_type, size);
+  LinearAllocGuard<int> host_allocation(host_allocation_type, size);
+  benchmark.Run(host_allocation.ptr(),
+                reinterpret_cast<hipDeviceptr_t>(device_allocation.ptr()),
+                size, stream);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyDtoHAsync` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyDtoHAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyDtoHAsync") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto device_allocation_type = LinearAllocs::hipMalloc;
+  const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(host_allocation_type, device_allocation_type, allocation_size);
+}
@@ -0,0 +1,116 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+#pragma clang diagnostic ignored "-Wvla-extension"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+__device__ int devSymbol[1_MB];
+
+class MemcpyFromSymbolBenchmark : public Benchmark<MemcpyFromSymbolBenchmark> {
+ public:
+  void operator()(const void* source, void* result, size_t size, size_t offset) {
+    HIP_CHECK(hipMemcpyToSymbol(HIP_SYMBOL(devSymbol), source, size, offset));
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyFromSymbol(result, HIP_SYMBOL(devSymbol), size, offset));
+    }
+  }
+};
+
+static void RunBenchmark(const void* source, void* result, size_t size=1, size_t offset=0) {
+  MemcpyFromSymbolBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(std::to_string(offset));
+  benchmark.Run(source, result, size, offset);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyFromSymbol` from Device to Host.
+ *  - Utilizes sigular integer values.
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyFromSymbol.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyFromSymbol_SingularValue") {
+  int set{42};
+  int result{0};
+  RunBenchmark(&set, &result);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyFromSymbol` from Device to Host.
+ *  - Utilizes array integers:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 512 KB
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyFromSymbol.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyFromSymbol_ArrayValue") {
+  size_t size = GENERATE(1_KB, 4_KB, 512_KB);
+  int array[size];
+  std::fill_n(array, size, 42);
+  int result[size];
+  std::fill_n(result, size, 0);
+
+  RunBenchmark(array, result, sizeof(int) * size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyFromSymbol` from Device to Host.
+ *  - Utilizes array integers with offsets:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 512 KB
+ *  - Offset: 0 and size/2
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyFromSymbol.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyFromSymbol_WithOffset") {
+  size_t size = GENERATE(1_KB, 4_KB, 512_KB);
+  int array[size];
+  std::fill_n(array, size, 42);
+  int result[size];
+  std::fill_n(result, size, 0);
+
+  size_t offset = GENERATE_REF(0, size / 2);
+  RunBenchmark(array + offset, result + offset, sizeof(int) * (size - offset), offset * sizeof(int));
+}
@@ -0,0 +1,122 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+#pragma clang diagnostic ignored "-Wvla-extension"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+__device__ int devSymbol[1_MB];
+
+class MemcpyFromSymbolAsyncBenchmark : public Benchmark<MemcpyFromSymbolAsyncBenchmark> {
+ public:
+  void operator()(const void* source, void* result, size_t size, size_t offset, const hipStream_t& stream) {
+    HIP_CHECK(hipMemcpyToSymbolAsync(HIP_SYMBOL(devSymbol), source, size, offset,
+                                     hipMemcpyHostToDevice, stream));
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpyFromSymbolAsync(result, HIP_SYMBOL(devSymbol), size, offset,
+                                         hipMemcpyDeviceToHost, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(const void* source, void* result, size_t size=1, size_t offset=0) {
+  MemcpyFromSymbolAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(std::to_string(offset));
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+  benchmark.Run(source, result, size, offset, stream);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyFromSymbolAsync` from Device to Host.
+ *  - Utilizes sigular integer values.
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyFromSymbolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyFromSymbolAsync_SingularValue") {
+  int set{42};
+  int result{0};
+  RunBenchmark(&set, &result);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyFromSymbolAsync` from Device to Host.
+ *  - Utilizes array integers:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 512 KB
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyFromSymbolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyFromSymbolAsync_ArrayValue") {
+  size_t size = GENERATE(1_KB, 4_KB, 512_KB);
+  int array[size];
+  std::fill_n(array, size, 42);
+  int result[size];
+  std::fill_n(result, size, 0);
+
+  RunBenchmark(array, result, sizeof(int) * size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyFromSymbolAsync` from Device to Host.
+ *  - Utilizes array integers with offsets:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 512 KB
+ *  - Offset: 0 and size/2
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyFromSymbolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyFromSymbolAsync_WithOffset") {
+  size_t size = GENERATE(1_KB, 4_KB, 512_KB);
+  int array[size];
+  std::fill_n(array, size, 42);
+  int result[size];
+  std::fill_n(result, size, 0);
+
+  size_t offset = GENERATE_REF(0, size / 2);
+  RunBenchmark(array + offset, result + offset, sizeof(int) * (size - offset), offset * sizeof(int));
+}
@@ -0,0 +1,69 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyHtoABenchmark : public Benchmark<MemcpyHtoABenchmark> {
+ public:
+  void operator()(hipArray* dst_array, const void* src, size_t allocation_size) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyHtoA(dst_array, 0, src, allocation_size));
+    }
+  }
+};
+
+static void RunBenchmark(LinearAllocs host_allocation_type, size_t width) {
+  MemcpyHtoABenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(width));
+  benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
+
+  size_t allocation_size = width * sizeof(int);
+  ArrayAllocGuard<int> array_allocation(make_hipExtent(width, 0, 0), hipArrayDefault);
+  LinearAllocGuard<int> host_allocation(host_allocation_type, allocation_size);
+  benchmark.Run(array_allocation.ptr(), host_allocation.ptr(), allocation_size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyHtoA` from Host to Device array:
+ *    -# Allocation size
+ *      - Small: 512 B
+ *      - Medium: 1024 B
+ *      - Large: 4096 B
+ *    -# Allocation type
+ *      - Host: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyHtoA.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyHtoA") {
+  const auto allocation_size = GENERATE(512, 1024, 4096);
+  const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(host_allocation_type, allocation_size);
+}
@@ -0,0 +1,70 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyHtoDBenchmark : public Benchmark<MemcpyHtoDBenchmark> {
+ public:
+  void operator()(hipDeviceptr_t& dst, void* src, size_t size) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyHtoD(dst, src, size));
+    }
+  }
+};
+
+static void RunBenchmark(LinearAllocs host_allocation_type, LinearAllocs device_allocation_type, size_t size) {
+  MemcpyHtoDBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
+
+  LinearAllocGuard<int> device_allocation(device_allocation_type, size);
+  LinearAllocGuard<int> host_allocation(host_allocation_type, size);
+  benchmark.Run(reinterpret_cast<hipDeviceptr_t>(device_allocation.ptr()), host_allocation.ptr(), size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyHtoD` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyHtoD.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyHtoD") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto device_allocation_type = LinearAllocs::hipMalloc;
+  const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(host_allocation_type, device_allocation_type, allocation_size);
+}
@@ -0,0 +1,74 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyHtoDAsyncBenchmark : public Benchmark<MemcpyHtoDAsyncBenchmark> {
+ public:
+  void operator()(hipDeviceptr_t& dst, void* src, size_t size, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpyHtoDAsync(dst, src, size, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(LinearAllocs host_allocation_type, LinearAllocs device_allocation_type, size_t size) {
+  MemcpyHtoDAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+  LinearAllocGuard<int> device_allocation(device_allocation_type, size);
+  LinearAllocGuard<int> host_allocation(host_allocation_type, size);
+  benchmark.Run(reinterpret_cast<hipDeviceptr_t>(device_allocation.ptr()),
+                host_allocation.ptr(), size, stream);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyHtoD` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyHtoDAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyHtoDAsync") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto device_allocation_type = LinearAllocs::hipMalloc;
+  const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(host_allocation_type, device_allocation_type, allocation_size);
+}
@@ -0,0 +1,188 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyParam2DBenchmark : public Benchmark<MemcpyParam2DBenchmark> {
+ public:
+  void operator()(void* dst, size_t dst_pitch, void* src, size_t src_pitch,
+                  size_t width, size_t height, hipMemcpyKind kind) {
+    hip_Memcpy2D params = CreateMemcpy2DParam(dst, dst_pitch, src, src_pitch,
+                                              width, height, kind);
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyParam2D(&params));
+    }
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind,
+                         bool enable_peer_access=false) {
+  MemcpyParam2DBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  if (kind == hipMemcpyDeviceToHost) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.width() * height);
+    benchmark.Run(host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.ptr(), device_allocation.pitch(),
+                  device_allocation.width(), device_allocation.height(), kind);
+  } else if (kind == hipMemcpyHostToDevice) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.width() * height);
+    benchmark.Run(device_allocation.ptr(), device_allocation.pitch(),
+                  host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.width(), device_allocation.height(), kind);
+  } else if (kind == hipMemcpyHostToHost) {
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    benchmark.Run(dst_allocation.ptr(), width * sizeof(int),
+                  src_allocation.ptr(), width * sizeof(int),
+                  width * sizeof(int), height, kind);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard2D<int> src_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard2D<int> dst_allocation(width, height);
+    HIP_CHECK(hipSetDevice(src_device));
+
+    benchmark.Run(dst_allocation.ptr(), dst_allocation.pitch(),
+                  src_allocation.ptr(), src_allocation.pitch(),
+                  dst_allocation.width(), dst_allocation.height(),
+                  kind);
+  }
+}
+
+#if HT_NVIDIA
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2D` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2D_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToHost);
+}
+#endif
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2D` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2D_HostToDevice") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToDevice);
+}
+
+#if HT_NVIDIA
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2D` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2D_HostToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToHost);
+}
+#endif
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2D` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2D_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2D` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2D.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2D_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,193 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyParam2DBenchmark : public Benchmark<MemcpyParam2DBenchmark> {
+ public:
+  void operator()(void* dst, size_t dst_pitch, void* src, size_t src_pitch,
+                  size_t width, size_t height, hipMemcpyKind kind, const hipStream_t& stream) {
+    hip_Memcpy2D params = CreateMemcpy2DParam(dst, dst_pitch, src, src_pitch,
+                                              width, height, kind);
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyParam2DAsync(&params, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind,
+                         bool enable_peer_access=false) {
+  MemcpyParam2DBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+
+  if (kind == hipMemcpyDeviceToHost) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.width() * height);
+    benchmark.Run(host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.ptr(), device_allocation.pitch(),
+                  device_allocation.width(), device_allocation.height(),
+                  kind, stream);
+  } else if (kind == hipMemcpyHostToDevice) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.width() * height);
+    benchmark.Run(device_allocation.ptr(), device_allocation.pitch(),
+                  host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.width(), device_allocation.height(),
+                  kind, stream);
+  } else if (kind == hipMemcpyHostToHost) {
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    benchmark.Run(dst_allocation.ptr(), width * sizeof(int),
+                  src_allocation.ptr(), width * sizeof(int),
+                  width * sizeof(int), height, kind, stream);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard2D<int> src_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard2D<int> dst_allocation(width, height);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.ptr(), dst_allocation.pitch(),
+                  src_allocation.ptr(), src_allocation.pitch(),
+                  dst_allocation.width(), dst_allocation.height(),
+                  kind, stream);
+  }
+}
+
+#if HT_NVIDIA
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2DAsync` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2DAsync_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToHost);
+}
+#endif
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2DAsync` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2DAsync_HostToDevice") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToDevice);
+}
+
+#if HT_NVIDIA
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2DAsync` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2DAsync_HostToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToHost);
+}
+#endif
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2DAsync` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2DAsync_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2DAsync` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2DAsync_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,109 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+#pragma clang diagnostic ignored "-Wvla-extension"
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+__device__ int devSymbol[1_MB];
+
+class MemcpyToSymbolBenchmark : public Benchmark<MemcpyToSymbolBenchmark> {
+ public:
+  void operator()(const void* source, size_t size, size_t offset) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyToSymbol(HIP_SYMBOL(devSymbol), source, size, offset));
+    }
+  }
+};
+
+static void RunBenchmark(const void* source, size_t size=1, size_t offset=0) {
+  MemcpyToSymbolBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(std::to_string(offset));
+  benchmark.Run(source, size, offset);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyToSymbol` from Host to Device.
+ *  - Utilizes sigular integer values.
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyToSymbol.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyToSymbol_SingularValue") {
+  int set{42};
+  RunBenchmark(&set);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyToSymbol` from Host to Device.
+ *  - Utilizes array integers:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 1 MB
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyToSymbol.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyToSymbol_ArrayValue") {
+  size_t size = GENERATE(1_KB, 4_KB, 1_MB);
+  int array[size];
+  std::fill_n(array, size, 42);
+
+  RunBenchmark(array, sizeof(int) * size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyToSymbol` from Host to Device.
+ *  - Utilizes array integers with offsets:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 1 MB
+ *  - Offset: 0 and size/2
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyToSymbol.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyToSymbol_WithOffset") {
+  size_t size = GENERATE(1_KB, 4_KB, 1_MB);
+  int array[size];
+  std::fill_n(array, size, 42);
+
+  size_t offset = GENERATE_REF(0, size / 2);
+  RunBenchmark(array + offset, sizeof(int) * (size - offset), offset * sizeof(int));
+}
@@ -0,0 +1,116 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+#pragma clang diagnostic ignored "-Wvla-extension"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+__device__ int devSymbol[1_MB];
+
+class MemcpyToSymbolAsyncBenchmark : public Benchmark<MemcpyToSymbolAsyncBenchmark> {
+ public:
+  void operator()(const void* source, size_t size, size_t offset, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpyToSymbolAsync(HIP_SYMBOL(devSymbol), source, size, offset,
+                                       hipMemcpyHostToDevice, stream));
+    }
+
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(const void* source, size_t size=1, size_t offset=0) {
+  MemcpyToSymbolAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(std::to_string(offset));
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+  benchmark.Run(source, size, offset, stream);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyToSymbolAsync` from Host to Device.
+ *  - Utilizes sigular integer values.
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyToSymbolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyToSymbolAsync_SingularValue") {
+  int set{42};
+  RunBenchmark(&set);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyToSymbolAsync` from Host to Device.
+ *  - Utilizes array integers:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 1 MB
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyToSymbolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyToSymbolAsync_ArrayValue") {
+  size_t size = GENERATE(1_KB, 4_KB, 1_MB);
+  int array[size];
+  std::fill_n(array, size, 42);
+
+  RunBenchmark(array, sizeof(int) * size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyToSymbolAsync` from Host to Device.
+ *  - Utilizes array integers with offsets:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 1 MB
+ *  - Offset: 0 and size/2
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyToSymbolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyToSymbolAsync_WithOffset") {
+  size_t size = GENERATE(1_KB, 4_KB, 1_MB);
+  int array[size];
+  std::fill_n(array, size, 42);
+
+  size_t offset = GENERATE_REF(0, size / 2);
+  RunBenchmark(array + offset, sizeof(int) * (size - offset), offset * sizeof(int));
+}
@@ -0,0 +1,192 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyWithStreamBenchmark : public Benchmark<MemcpyWithStreamBenchmark> {
+ public:
+  void operator()(void* dst, const void* src, size_t size, hipMemcpyKind kind, hipStream_t stream) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyWithStream(dst, src, size, kind, stream));
+    }
+  }
+};
+
+static void RunBenchmark(LinearAllocs dst_allocation_type, LinearAllocs src_allocation_type,
+                         size_t size, hipMemcpyKind kind, bool enable_peer_access=false) {
+  MemcpyWithStreamBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(src_allocation_type));
+  benchmark.AddSectionName(GetAllocationSectionName(dst_allocation_type));
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+
+  if (kind != hipMemcpyDeviceToDevice) {
+    LinearAllocGuard<int> src_allocation(src_allocation_type, size);
+    LinearAllocGuard<int> dst_allocation(dst_allocation_type, size);
+    benchmark.Run(dst_allocation.ptr(), src_allocation.ptr(), size, kind, stream);
+  } else {
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipMalloc, size);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipMalloc, size);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.ptr(), src_allocation.ptr(), size, kind, stream);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyWithStream` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyWithStream.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyWithStream_DeviceToHost") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyWithStream` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyWithStream.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyWithStream_HostToDevice") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyWithStream` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyWithStream.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyWithStream_HostToHost") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  const auto dst_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyWithStream.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyWithStream_DeviceToDevice_DisablePeerAccess") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyWithStream` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyWithStream.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyWithStream_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,117 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <performance_common.hh>
+
+static hip_Memcpy2D CreateMemcpy2DParam(void* dst, size_t dpitch, void* src, size_t spitch,
+                                        size_t width, size_t height, hipMemcpyKind kind) {
+  hip_Memcpy2D params = {};
+  const hipExtent src_offset = {};
+  const hipExtent dst_offset = {};
+  params.dstPitch = dpitch;
+  switch (kind) {
+    case hipMemcpyDeviceToHost:
+    case hipMemcpyHostToHost:
+      #if HT_AMD
+        params.dstMemoryType = hipMemoryTypeHost;
+      #else
+        params.dstMemoryType = CU_MEMORYTYPE_HOST;
+      #endif
+      params.dstHost = dst;
+      break;
+    case hipMemcpyDeviceToDevice:
+    case hipMemcpyHostToDevice:
+      #if HT_AMD
+        params.dstMemoryType = hipMemoryTypeDevice;
+      #else
+        params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+      #endif
+      params.dstDevice = reinterpret_cast<hipDeviceptr_t>(dst);
+      break;
+    default:
+      REQUIRE(false);
+  }
+
+  params.srcPitch = dpitch;
+  switch (kind) {
+    case hipMemcpyDeviceToHost:
+    case hipMemcpyHostToHost:
+      #if HT_AMD
+        params.srcMemoryType = hipMemoryTypeHost;
+      #else
+        params.srcMemoryType = CU_MEMORYTYPE_HOST;
+      #endif
+      params.srcHost = src;
+      break;
+    case hipMemcpyDeviceToDevice:
+    case hipMemcpyHostToDevice:
+      #if HT_AMD
+        params.srcMemoryType = hipMemoryTypeDevice;
+      #else
+        params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+      #endif
+      params.srcDevice = reinterpret_cast<hipDeviceptr_t>(src);
+      break;
+    default:
+      REQUIRE(false);
+  }
+
+  params.WidthInBytes = width;
+  params.Height = height;
+  params.srcXInBytes = src_offset.width;
+  params.srcY = src_offset.height;
+  params.dstXInBytes = dst_offset.width;
+  params.dstY = dst_offset.height;
+
+  return params;
+}
+
+static hipMemcpy3DParms CreateMemcpy3DParam(hipPitchedPtr dst_ptr, hipPos dst_pos,
+                                            hipPitchedPtr src_ptr, hipPos src_pos,
+                                            hipExtent extent, hipMemcpyKind kind) {
+  hipMemcpy3DParms params = {};
+  params.dstPtr = dst_ptr;
+  params.dstPos = dst_pos;
+  params.srcPtr = src_ptr;
+  params.srcPos = src_pos;
+  params.extent = extent;
+  params.kind = kind;
+  return params;
+}
+
+static std::tuple<int, int> GetDeviceIds(bool enable_peer_access) {
+  int src_device = 0;
+  int dst_device = 1;
+
+  if (enable_peer_access) {
+    int can_access_peer = 0;
+    HIP_CHECK(hipDeviceCanAccessPeer(&can_access_peer, src_device, dst_device));
+    if (!can_access_peer) {
+      INFO("Peer access cannot be enabled between devices " << src_device << " and " << dst_device);
+      REQUIRE(can_access_peer);
+    }
+    HIP_CHECK(hipDeviceEnablePeerAccess(dst_device, 0));
+  } else {
+    dst_device = 0;
+  }
+
+  return {src_device, dst_device};
+}