EXSWHTEC-98 - Implement tests for hipMemcpy3D APIs (#55)

- Implement tests for hipMemcpy3D APIs - Implement basic behavior checks in all copy directions - Implement synchronization behavior checks for expected behavior based on cuda docs - Implement positive tests for zero sized width and/or height copies, where no copy is expected to happen - Implement negative parameter tests - Implement all of the above for hipMemcpy3D and hipMemcpy3DAsync. - Disable failing tests on AMD. - Fix copyright disclaimer. [ROCm/hip-tests commit: 113a36c0eb]
2023-06-28 09:20:42 +02:00
@@ -1,5 +1,6 @@
 /*
 Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
@@ -0,0 +1,585 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <variant>
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <utils.hh>
+#include <resource_guards.hh>
+
+using PtrVariant = std::variant<hipPitchedPtr, hipArray_t>;
+
+static hipMemcpyKind ReverseMemcpyDirection(const hipMemcpyKind direction) {
+  switch (direction) {
+    case hipMemcpyHostToDevice:
+      return hipMemcpyDeviceToHost;
+    case hipMemcpyDeviceToHost:
+      return hipMemcpyHostToDevice;
+    default:
+      return direction;
+  }
+};
+
+static hipMemcpy3DParms GetMemcpy3DParms(PtrVariant dst_ptr, hipPos dst_pos, PtrVariant src_ptr,
+                                         hipPos src_pos, hipExtent extent, hipMemcpyKind kind) {
+  hipMemcpy3DParms parms = {0};
+  if (std::holds_alternative<hipArray_t>(dst_ptr)) {
+    parms.dstArray = std::get<hipArray_t>(dst_ptr);
+  } else {
+    parms.dstPtr = std::get<hipPitchedPtr>(dst_ptr);
+  }
+  parms.dstPos = dst_pos;
+  if (std::holds_alternative<hipArray_t>(src_ptr)) {
+    parms.srcArray = std::get<hipArray_t>(src_ptr);
+  } else {
+    parms.srcPtr = std::get<hipPitchedPtr>(src_ptr);
+  }
+  parms.srcPos = src_pos;
+  parms.extent = extent;
+  parms.kind = kind;
+
+  return parms;
+}
+
+static bool operator==(const hipPitchedPtr& lhs, const hipPitchedPtr& rhs) {
+  // not checking for xsize currently as hipGraphMemcpyNodeGetParams returns incorrect value
+  return lhs.ptr == rhs.ptr && lhs.pitch == rhs.pitch && lhs.ysize == rhs.ysize;
+}
+
+static bool operator==(const hipPos& lhs, const hipPos& rhs) {
+  return lhs.x == rhs.x && lhs.y == rhs.y && lhs.z == rhs.z;
+}
+
+static bool operator==(const hipExtent& lhs, const hipExtent& rhs) {
+  return lhs.width == rhs.width && lhs.height == rhs.height && lhs.depth == rhs.depth;
+}
+
+static bool operator==(const hipMemcpy3DParms& lhs, const hipMemcpy3DParms& rhs) {
+  return lhs.dstArray == rhs.dstArray && lhs.dstPtr == rhs.dstPtr && lhs.dstPos == rhs.dstPos &&
+      lhs.srcArray == rhs.srcArray && lhs.srcPtr == rhs.srcPtr && lhs.srcPos == rhs.srcPos &&
+      lhs.extent == rhs.extent && lhs.kind == rhs.kind;
+}
+
+template <bool async = false, bool graph = false, bool set_params = false>
+hipError_t Memcpy3DWrapper(PtrVariant dst_ptr, hipPos dst_pos, PtrVariant src_ptr, hipPos src_pos,
+                           hipExtent extent, hipMemcpyKind kind, hipStream_t stream = nullptr) {
+  auto parms = GetMemcpy3DParms(dst_ptr, dst_pos, src_ptr, src_pos, extent, kind);
+
+  if constexpr (graph) {
+    hipGraph_t g = nullptr;
+    HIP_CHECK(hipGraphCreate(&g, 0));
+    hipGraphNode_t node = nullptr;
+
+    if constexpr (set_params) {
+      auto reversed_parms = GetMemcpy3DParms(src_ptr, src_pos, dst_ptr, dst_pos, extent,
+                                             ReverseMemcpyDirection(kind));
+      HIP_CHECK(hipGraphAddMemcpyNode(&node, g, nullptr, 0, &reversed_parms));
+      HIP_CHECK(hipGraphMemcpyNodeSetParams(node, &parms));
+    } else {
+      HIP_CHECK(hipGraphAddMemcpyNode(&node, g, nullptr, 0, &parms));
+    }
+
+    hipMemcpy3DParms retrieved_params = {0};
+    HIP_CHECK(hipGraphMemcpyNodeGetParams(node, &retrieved_params));
+    REQUIRE(parms == retrieved_params);
+
+    hipGraphExec_t graph_exec = nullptr;
+    HIP_CHECK(hipGraphInstantiate(&graph_exec, g, nullptr, nullptr, 0));
+    HIP_CHECK(hipGraphLaunch(graph_exec, hipStreamPerThread));
+    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
+
+    HIP_CHECK(hipGraphExecDestroy(graph_exec));
+    HIP_CHECK(hipGraphDestroy(g));
+
+    return hipSuccess;
+  }
+
+  if constexpr (async) {
+    return hipMemcpy3DAsync(&parms, stream);
+  } else {
+    return hipMemcpy3D(&parms);
+  }
+}
+
+template <bool should_synchronize, typename F>
+void Memcpy3DDeviceToHostShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
+  const auto kind = GENERATE(hipMemcpyDeviceToHost, hipMemcpyDefault);
+
+  constexpr hipExtent extent{127 * sizeof(int), 128, 8};
+
+  LinearAllocGuard3D<int> device_alloc(extent);
+
+  const size_t host_pitch = GENERATE_REF(device_alloc.width(), device_alloc.width() + 64);
+  LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc,
+                                   host_pitch * device_alloc.height() * device_alloc.depth());
+
+  const dim3 threads_per_block(32, 32);
+  const dim3 blocks(device_alloc.width_logical() / threads_per_block.x + 1,
+                    device_alloc.height() / threads_per_block.y + 1, device_alloc.depth());
+  Iota<<<blocks, threads_per_block>>>(device_alloc.ptr(), device_alloc.pitch(),
+                                      device_alloc.width_logical(), device_alloc.height(),
+                                      device_alloc.depth());
+  HIP_CHECK(hipGetLastError());
+
+  HIP_CHECK(memcpy_func(
+      make_hipPitchedPtr(host_alloc.ptr(), host_pitch, device_alloc.width(), device_alloc.height()),
+      make_hipPos(0, 0, 0), device_alloc.pitched_ptr(), make_hipPos(0, 0, 0), device_alloc.extent(),
+      kind, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  const auto f = [extent](size_t x, size_t y, size_t z) {
+    constexpr auto width_logical = extent.width / sizeof(int);
+    return z * width_logical * extent.height + y * width_logical + x;
+  };
+  PitchedMemoryVerify(host_alloc.ptr(), host_pitch, device_alloc.width_logical(),
+                      device_alloc.height(), device_alloc.depth(), f);
+}
+
+template <bool should_synchronize, bool enable_peer_access, typename F>
+void Memcpy3DDeviceToDeviceShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
+  const auto kind = GENERATE(hipMemcpyDeviceToDevice, hipMemcpyDefault);
+
+  constexpr hipExtent extent{127 * sizeof(int), 128, 8};
+
+  const auto device_count = HipTest::getDeviceCount();
+  const auto src_device = GENERATE_COPY(range(0, device_count));
+  const auto dst_device = GENERATE_COPY(range(0, device_count));
+  const size_t src_cols_mult = GENERATE(1, 2);
+
+  INFO("Src device: " << src_device << ", Dst device: " << dst_device);
+
+  HIP_CHECK(hipSetDevice(src_device));
+  if constexpr (enable_peer_access) {
+    if (src_device == dst_device) {
+      return;
+    }
+    int can_access_peer = 0;
+    HIP_CHECK(hipDeviceCanAccessPeer(&can_access_peer, src_device, dst_device));
+    if (!can_access_peer) {
+      INFO("Peer access cannot be enabled between devices " << src_device << " " << dst_device);
+      REQUIRE(can_access_peer);
+    }
+    HIP_CHECK(hipDeviceEnablePeerAccess(dst_device, 0));
+  }
+
+  LinearAllocGuard3D<int> src_alloc(extent);
+  HIP_CHECK(hipSetDevice(src_device));
+  LinearAllocGuard3D<int> dst_alloc(extent);
+  HIP_CHECK(hipSetDevice(src_device));
+  LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc,
+                                   dst_alloc.width() * dst_alloc.height() * dst_alloc.depth());
+
+  const dim3 threads_per_block(32, 32);
+  const dim3 blocks(dst_alloc.width_logical() / threads_per_block.x + 1,
+                    dst_alloc.height() / threads_per_block.y + 1, dst_alloc.depth());
+  // Using dst_alloc width and height to set only the elements that will be copied over to
+  // dst_alloc
+  Iota<<<blocks, threads_per_block>>>(src_alloc.ptr(), src_alloc.pitch(), dst_alloc.width_logical(),
+                                      dst_alloc.height(), dst_alloc.depth());
+  HIP_CHECK(hipGetLastError());
+
+  HIP_CHECK(memcpy_func(dst_alloc.pitched_ptr(), make_hipPos(0, 0, 0), src_alloc.pitched_ptr(),
+                        make_hipPos(0, 0, 0), dst_alloc.extent(), kind, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  HIP_CHECK(Memcpy3DWrapper(make_hipPitchedPtr(host_alloc.ptr(), dst_alloc.width(),
+                                               dst_alloc.width(), dst_alloc.height()),
+                            make_hipPos(0, 0, 0), dst_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+                            dst_alloc.extent(), hipMemcpyDeviceToHost));
+
+  const auto f = [extent](size_t x, size_t y, size_t z) {
+    constexpr auto width_logical = extent.width / sizeof(int);
+    return z * width_logical * extent.height + y * width_logical + x;
+  };
+  PitchedMemoryVerify(host_alloc.ptr(), dst_alloc.width(), dst_alloc.width_logical(),
+                      dst_alloc.height(), dst_alloc.depth(), f);
+}
+
+template <bool should_synchronize, typename F>
+void Memcpy3DHostToDeviceShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
+  const auto kind = GENERATE(hipMemcpyHostToDevice, hipMemcpyDefault);
+
+  constexpr hipExtent extent{127 * sizeof(int), 128, 8};
+
+  LinearAllocGuard3D<int> device_alloc(extent);
+
+  const size_t host_pitch = GENERATE_REF(device_alloc.pitch(), 2 * device_alloc.pitch());
+
+  LinearAllocGuard<int> src_host_alloc(LinearAllocs::hipHostMalloc,
+                                       host_pitch * device_alloc.height() * device_alloc.depth());
+  LinearAllocGuard<int> dst_host_alloc(
+      LinearAllocs::hipHostMalloc,
+      device_alloc.width() * device_alloc.height() * device_alloc.depth());
+
+  const auto f = [extent](size_t x, size_t y, size_t z) {
+    constexpr auto width_logical = extent.width / sizeof(int);
+    return z * width_logical * extent.height + y * width_logical + x;
+  };
+  PitchedMemorySet(src_host_alloc.ptr(), host_pitch, device_alloc.width_logical(),
+                   device_alloc.height(), device_alloc.depth(), f);
+
+  std::fill_n(dst_host_alloc.ptr(),
+              device_alloc.width_logical() * device_alloc.height() * device_alloc.depth(), 0);
+
+  HIP_CHECK(memcpy_func(device_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+                        make_hipPitchedPtr(src_host_alloc.ptr(), host_pitch, device_alloc.width(),
+                                           device_alloc.height()),
+                        make_hipPos(0, 0, 0), device_alloc.extent(), kind, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  HIP_CHECK(Memcpy3DWrapper(make_hipPitchedPtr(dst_host_alloc.ptr(), device_alloc.width(),
+                                               device_alloc.width(), device_alloc.height()),
+                            make_hipPos(0, 0, 0), device_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+                            device_alloc.extent(), hipMemcpyDeviceToHost));
+
+  PitchedMemoryVerify(dst_host_alloc.ptr(), device_alloc.width(), device_alloc.width_logical(),
+                      device_alloc.height(), device_alloc.depth(), f);
+}
+
+template <bool should_synchronize, typename F>
+void Memcpy3DHostToHostShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
+  const auto kind = GENERATE(hipMemcpyHostToHost, hipMemcpyDefault);
+
+  constexpr hipExtent extent{127 * sizeof(int), 128, 8};
+
+  const size_t padding = GENERATE_COPY(0, 64);
+  const size_t src_pitch = extent.width + padding;
+
+  LinearAllocGuard<int> src_host(LinearAllocs::hipHostMalloc,
+                                 src_pitch * extent.height * extent.depth);
+  LinearAllocGuard<int> dst_host(LinearAllocs::hipHostMalloc,
+                                 extent.width * extent.height * extent.depth);
+
+  const auto f = [extent](size_t x, size_t y, size_t z) {
+    constexpr auto width_logical = extent.width / sizeof(int);
+    return z * width_logical * extent.height + y * width_logical + x;
+  };
+  PitchedMemorySet(src_host.ptr(), src_pitch, extent.width / sizeof(int), extent.height,
+                   extent.depth, f);
+
+  HIP_CHECK(
+      memcpy_func(make_hipPitchedPtr(dst_host.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPos(0, 0, 0),
+                  make_hipPitchedPtr(src_host.ptr(), src_pitch, extent.width, extent.height),
+                  make_hipPos(0, 0, 0), extent, kind, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  PitchedMemoryVerify(dst_host.ptr(), extent.width, extent.width / sizeof(int), extent.height,
+                      extent.depth, f);
+}
+
+template <bool should_synchronize, typename F>
+void Memcpy3DArrayHostShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
+  constexpr hipExtent extent{127, 128, 8};
+
+  LinearAllocGuard<int> src_host(LinearAllocs::hipHostMalloc,
+                                 extent.width * sizeof(int) * extent.height * extent.depth);
+  LinearAllocGuard<int> dst_host(LinearAllocs::hipHostMalloc,
+                                 extent.width * sizeof(int) * extent.height * extent.depth);
+
+  ArrayAllocGuard<int> src_array(extent);
+  ArrayAllocGuard<int> dst_array(extent);
+
+  const auto f = [extent](size_t x, size_t y, size_t z) {
+    return z * extent.width * extent.height + y * extent.width + x;
+  };
+  PitchedMemorySet(src_host.ptr(), extent.width * sizeof(int), extent.width, extent.height,
+                   extent.depth, f);
+
+  // Host -> Array
+  HIP_CHECK(memcpy_func(src_array.ptr(), make_hipPos(0, 0, 0),
+                        make_hipPitchedPtr(src_host.ptr(), extent.width * sizeof(int),
+                                           extent.width * sizeof(int), extent.height),
+                        make_hipPos(0, 0, 0), extent, hipMemcpyHostToDevice, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  // Array -> Array
+  HIP_CHECK(memcpy_func(dst_array.ptr(), make_hipPos(0, 0, 0), src_array.ptr(),
+                        make_hipPos(0, 0, 0), extent, hipMemcpyDeviceToDevice, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  // Array -> Host
+  HIP_CHECK(memcpy_func(make_hipPitchedPtr(dst_host.ptr(), extent.width * sizeof(int),
+                                           extent.width * sizeof(int), extent.height),
+                        make_hipPos(0, 0, 0), dst_array.ptr(), make_hipPos(0, 0, 0), extent,
+                        hipMemcpyDeviceToHost, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  PitchedMemoryVerify(dst_host.ptr(), extent.width * sizeof(int), extent.width, extent.height,
+                      extent.depth, f);
+}
+
+template <bool should_synchronize, typename F>
+void Memcpy3DArrayDeviceShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
+  constexpr hipExtent extent{127, 128, 8};
+
+  LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc,
+                                   extent.width * sizeof(int) * extent.height * extent.depth);
+
+  ArrayAllocGuard<int> src_array(extent);
+  ArrayAllocGuard<int> dst_array(extent);
+
+  LinearAllocGuard3D<int> src_device(extent.width, extent.height, extent.depth);
+  LinearAllocGuard3D<int> dst_device(extent.width, extent.height, extent.depth);
+
+  const dim3 threads_per_block(32, 32);
+  const dim3 blocks(src_device.width_logical() / threads_per_block.x + 1,
+                    src_device.height() / threads_per_block.y + 1, src_device.depth());
+  Iota<<<blocks, threads_per_block>>>(src_device.ptr(), src_device.pitch(),
+                                      src_device.width_logical(), src_device.height(),
+                                      src_device.depth());
+  HIP_CHECK(hipGetLastError());
+
+  // Device -> Array
+  HIP_CHECK(memcpy_func(src_array.ptr(), make_hipPos(0, 0, 0), src_device.pitched_ptr(),
+                        make_hipPos(0, 0, 0), extent, hipMemcpyDeviceToDevice, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  // Array -> Array
+  HIP_CHECK(memcpy_func(dst_array.ptr(), make_hipPos(0, 0, 0), src_array.ptr(),
+                        make_hipPos(0, 0, 0), extent, hipMemcpyDeviceToDevice, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  // Array -> Device
+  HIP_CHECK(memcpy_func(dst_device.pitched_ptr(), make_hipPos(0, 0, 0), dst_array.ptr(),
+                        make_hipPos(0, 0, 0), extent, hipMemcpyDeviceToDevice, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  // Device -> Host
+  HIP_CHECK(memcpy_func(make_hipPitchedPtr(host_alloc.ptr(), extent.width * sizeof(int),
+                                           extent.width * sizeof(int), extent.height),
+                        make_hipPos(0, 0, 0), dst_device.pitched_ptr(), make_hipPos(0, 0, 0),
+                        dst_device.extent(), hipMemcpyDeviceToHost, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  const auto f = [extent](size_t x, size_t y, size_t z) {
+    return z * extent.width * extent.height + y * extent.width + x;
+  };
+  PitchedMemoryVerify(host_alloc.ptr(), extent.width * sizeof(int), extent.width, extent.height,
+                      extent.depth, f);
+}
+
+template <typename F>
+void Memcpy3DHtoDSyncBehavior(F memcpy_func, const bool should_sync,
+                              const hipStream_t kernel_stream = nullptr) {
+  using LA = LinearAllocs;
+  const auto host_alloc_type = GENERATE(LA::malloc, LA::hipHostMalloc);
+  LinearAllocGuard3D<int> device_alloc(make_hipExtent(32 * sizeof(int), 32, 8));
+  LinearAllocGuard<int> host_alloc(
+      host_alloc_type, device_alloc.width() * device_alloc.height() * device_alloc.depth());
+  MemcpySyncBehaviorCheck(
+      std::bind(memcpy_func, device_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+                make_hipPitchedPtr(host_alloc.ptr(), device_alloc.width(), device_alloc.width(),
+                                   device_alloc.height()),
+                make_hipPos(0, 0, 0), device_alloc.extent(), hipMemcpyHostToDevice, kernel_stream),
+      should_sync, kernel_stream);
+}
+
+template <typename F>
+void Memcpy3DDtoHPageableSyncBehavior(F memcpy_func, const bool should_sync,
+                                      const hipStream_t kernel_stream = nullptr) {
+  LinearAllocGuard3D<int> device_alloc(make_hipExtent(32 * sizeof(int), 32, 8));
+  LinearAllocGuard<int> host_alloc(
+      LinearAllocs::malloc, device_alloc.width() * device_alloc.height() * device_alloc.depth());
+  MemcpySyncBehaviorCheck(
+      std::bind(memcpy_func,
+                make_hipPitchedPtr(host_alloc.ptr(), device_alloc.width(), device_alloc.width(),
+                                   device_alloc.height()),
+                make_hipPos(0, 0, 0), device_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+                device_alloc.extent(), hipMemcpyDeviceToHost, kernel_stream),
+      should_sync, kernel_stream);
+}
+
+template <typename F>
+void Memcpy3DDtoHPinnedSyncBehavior(F memcpy_func, const bool should_sync,
+                                    const hipStream_t kernel_stream = nullptr) {
+  LinearAllocGuard3D<int> device_alloc(make_hipExtent(32 * sizeof(int), 32, 8));
+  LinearAllocGuard<int> host_alloc(
+      LinearAllocs::hipHostMalloc,
+      device_alloc.width() * device_alloc.height() * device_alloc.depth());
+  MemcpySyncBehaviorCheck(
+      std::bind(memcpy_func,
+                make_hipPitchedPtr(host_alloc.ptr(), device_alloc.width(), device_alloc.width(),
+                                   device_alloc.height()),
+                make_hipPos(0, 0, 0), device_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+                device_alloc.extent(), hipMemcpyDeviceToHost, kernel_stream),
+      should_sync, kernel_stream);
+}
+
+template <typename F>
+void Memcpy3DDtoDSyncBehavior(F memcpy_func, const bool should_sync,
+                              const hipStream_t kernel_stream = nullptr) {
+  LinearAllocGuard3D<int> src_alloc(make_hipExtent(32 * sizeof(int), 32, 8));
+  LinearAllocGuard3D<int> dst_alloc(make_hipExtent(32 * sizeof(int), 32, 8));
+  MemcpySyncBehaviorCheck(
+      std::bind(memcpy_func, dst_alloc.pitched_ptr(), make_hipPos(0, 0, 0), src_alloc.pitched_ptr(),
+                make_hipPos(0, 0, 0), dst_alloc.extent(), hipMemcpyDeviceToDevice, kernel_stream),
+      should_sync, kernel_stream);
+}
+
+template <typename F>
+void Memcpy3DHtoHSyncBehavior(F memcpy_func, const bool should_sync,
+                              const hipStream_t kernel_stream = nullptr) {
+  using LA = LinearAllocs;
+  const auto src_alloc_type = GENERATE(LA::malloc, LA::hipHostMalloc);
+  const auto dst_alloc_type = GENERATE(LA::malloc, LA::hipHostMalloc);
+
+  LinearAllocGuard<int> src_alloc(src_alloc_type, 32 * sizeof(int) * 32 * 8);
+  LinearAllocGuard<int> dst_alloc(dst_alloc_type, 32 * sizeof(int) * 32 * 8);
+  MemcpySyncBehaviorCheck(
+      std::bind(memcpy_func,
+                make_hipPitchedPtr(dst_alloc.ptr(), 32 * sizeof(int), 32 * sizeof(int), 32),
+                make_hipPos(0, 0, 0),
+                make_hipPitchedPtr(src_alloc.ptr(), 32 * sizeof(int), 32 * sizeof(int), 32),
+                make_hipPos(0, 0, 0), make_hipExtent(32 * sizeof(int), 32, 8), hipMemcpyHostToHost,
+                kernel_stream),
+      should_sync, kernel_stream);
+}
+
+template <bool should_synchronize, typename F>
+void Memcpy3DZeroWidthHeightDepth(F memcpy_func, const hipStream_t stream = nullptr) {
+  constexpr hipExtent extent{127 * sizeof(int), 128, 8};
+
+  const auto [width_mult, height_mult, depth_mult] =
+      GENERATE(std::make_tuple(0, 1, 1), std::make_tuple(1, 0, 1), std::make_tuple(1, 1, 0));
+
+  SECTION("Device to Host") {
+    LinearAllocGuard3D<uint8_t> device_alloc(extent);
+    LinearAllocGuard<uint8_t> host_alloc(
+        LinearAllocs::hipHostMalloc,
+        device_alloc.width() * device_alloc.height() * device_alloc.depth());
+    std::fill_n(host_alloc.ptr(),
+                device_alloc.width_logical() * device_alloc.height() * device_alloc.depth(), 42);
+    HIP_CHECK(hipMemset3D(device_alloc.pitched_ptr(), 1, device_alloc.extent()));
+    HIP_CHECK(memcpy_func(
+        make_hipPitchedPtr(host_alloc.ptr(), device_alloc.width(), device_alloc.width(),
+                           device_alloc.height()),
+        make_hipPos(0, 0, 0), device_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+        make_hipExtent(device_alloc.width() * width_mult, device_alloc.height() * height_mult,
+                       device_alloc.depth() * depth_mult),
+        hipMemcpyDeviceToHost, stream));
+    if constexpr (should_synchronize) {
+      HIP_CHECK(hipStreamSynchronize(stream));
+    }
+    ArrayFindIfNot(host_alloc.ptr(), static_cast<uint8_t>(42),
+                   device_alloc.width_logical() * device_alloc.height() * device_alloc.depth());
+  }
+
+  SECTION("Device to Device") {
+    LinearAllocGuard3D<uint8_t> src_alloc(extent);
+    LinearAllocGuard3D<uint8_t> dst_alloc(extent);
+    LinearAllocGuard<uint8_t> host_alloc(
+        LinearAllocs::hipHostMalloc, dst_alloc.width() * dst_alloc.height() * dst_alloc.depth());
+    HIP_CHECK(hipMemset3D(src_alloc.pitched_ptr(), 1, src_alloc.extent()));
+    HIP_CHECK(hipMemset3D(dst_alloc.pitched_ptr(), 42, dst_alloc.extent()));
+    HIP_CHECK(
+        memcpy_func(dst_alloc.pitched_ptr(), make_hipPos(0, 0, 0), src_alloc.pitched_ptr(),
+                    make_hipPos(0, 0, 0),
+                    make_hipExtent(dst_alloc.width() * width_mult, dst_alloc.height() * height_mult,
+                                   dst_alloc.depth() * depth_mult),
+                    hipMemcpyDeviceToDevice, stream));
+    if constexpr (should_synchronize) {
+      HIP_CHECK(hipStreamSynchronize(stream));
+    }
+    HIP_CHECK(Memcpy3DWrapper(make_hipPitchedPtr(host_alloc.ptr(), dst_alloc.width(),
+                                                 dst_alloc.width(), dst_alloc.height()),
+                              make_hipPos(0, 0, 0), dst_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+                              dst_alloc.extent(), hipMemcpyDeviceToHost));
+    ArrayFindIfNot(host_alloc.ptr(), static_cast<uint8_t>(42),
+                   dst_alloc.width_logical() * dst_alloc.height());
+  }
+
+  SECTION("Host to Device") {
+    LinearAllocGuard3D<uint8_t> device_alloc(extent);
+    LinearAllocGuard<uint8_t> src_host_alloc(
+        LinearAllocs::hipHostMalloc,
+        device_alloc.width() * device_alloc.height() * device_alloc.depth());
+    LinearAllocGuard<uint8_t> dst_host_alloc(
+        LinearAllocs::hipHostMalloc,
+        device_alloc.width() * device_alloc.height() * device_alloc.depth());
+    std::fill_n(src_host_alloc.ptr(),
+                device_alloc.width_logical() * device_alloc.height() * device_alloc.depth(), 1);
+    HIP_CHECK(hipMemset3D(device_alloc.pitched_ptr(), 42, device_alloc.extent()));
+    HIP_CHECK(memcpy_func(
+        device_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+        make_hipPitchedPtr(src_host_alloc.ptr(), device_alloc.width(), device_alloc.width(),
+                           device_alloc.height()),
+        make_hipPos(0, 0, 0),
+        make_hipExtent(device_alloc.width() * width_mult, device_alloc.height() * height_mult,
+                       device_alloc.depth() * depth_mult),
+        hipMemcpyHostToDevice, stream));
+    if constexpr (should_synchronize) {
+      HIP_CHECK(hipStreamSynchronize(stream));
+    }
+    HIP_CHECK(Memcpy3DWrapper(make_hipPitchedPtr(dst_host_alloc.ptr(), device_alloc.width(),
+                                                 device_alloc.width(), device_alloc.height()),
+                              make_hipPos(0, 0, 0), device_alloc.pitched_ptr(),
+                              make_hipPos(0, 0, 0), device_alloc.extent(), hipMemcpyDeviceToHost));
+    ArrayFindIfNot(dst_host_alloc.ptr(), static_cast<uint8_t>(42),
+                   device_alloc.width_logical() * device_alloc.height());
+  }
+
+  SECTION("Host to Host") {
+    const auto alloc_size = extent.width * extent.height * extent.depth;
+    LinearAllocGuard<uint8_t> src_alloc(LinearAllocs::hipHostMalloc, alloc_size);
+    LinearAllocGuard<uint8_t> dst_alloc(LinearAllocs::hipHostMalloc, alloc_size);
+    std::fill_n(src_alloc.ptr(), alloc_size, 1);
+    std::fill_n(dst_alloc.ptr(), alloc_size, 42);
+    HIP_CHECK(
+        memcpy_func(make_hipPitchedPtr(dst_alloc.ptr(), extent.width, extent.width, extent.height),
+                    make_hipPos(0, 0, 0),
+                    make_hipPitchedPtr(src_alloc.ptr(), extent.width, extent.width, extent.height),
+                    make_hipPos(0, 0, 0),
+                    make_hipExtent(extent.width * width_mult, extent.height * height_mult,
+                                   extent.depth * depth_mult),
+                    hipMemcpyHostToHost, stream));
+    if constexpr (should_synchronize) {
+      HIP_CHECK(hipStreamSynchronize(stream));
+    }
+    ArrayFindIfNot(dst_alloc.ptr(), static_cast<uint8_t>(42), alloc_size);
+  }
+}
@@ -29,7 +29,9 @@ set(TEST_SRC
    hipMemcpy2DToArrayAsync.cc
    hipMemcpy2DToArrayAsync_old.cc
    hipMemcpy3D.cc
+    hipMemcpy3D_old.cc
    hipMemcpy3DAsync.cc
+    hipMemcpy3DAsync_old.cc
    hipMemcpyParam2D.cc
    hipMemcpyParam2DAsync.cc
    hipMemcpy2D.cc
@@ -1,13 +1,16 @@
 /*
-Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
+
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -17,611 +20,209 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/*
- * This testfile verifies the following scenarios of hipMemcpy3D API
- *
- * 1. Verifying hipMemcpy3D API for H2D,D2D and D2H scenarios for
-      different datatypes and sizes.
- * 2. Verifying Negative Scenarios
- * 3. Verifying Extent validation scenarios by passing 0
- * 4. Verifying hipMemcpy3D API by allocating Memory in
- *    one GPU and trigger hipMemcpy3D from peer GPU
- *
- */
+#include <memcpy1d_tests_common.hh>
+#include <memcpy3d_tests_common.hh>

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
+#include <hip/hip_runtime_api.h>
+#include <resource_guards.hh>
+#include <utils.hh>

-static constexpr auto width{10};
-static constexpr auto height{10};
-static constexpr auto depth{10};
+TEST_CASE("Unit_hipMemcpy3D_Positive_Basic") {
+  constexpr bool async = false;

-template <typename T>
-class Memcpy3D {
-  int width, height, depth;
-  unsigned int size;
-  hipArray *arr, *arr1;
-  hipChannelFormatKind formatKind;
-  hipMemcpy3DParms myparms;
-  T* hData;
- public:
-  Memcpy3D(int l_width, int l_height, int l_depth,
-      hipChannelFormatKind l_format);
-  void simple_Memcpy3D();
-  void Extent_Validation();
-  void NegativeTests();
-  void AllocateMemory();
-  void DeAllocateMemory();
-  void SetDefaultData();
-  void D2D_DeviceMem_OnDiffDevice();
-  void D2H_H2D_DeviceMem_OnDiffDevice();
-};
+  SECTION("Device to Host") { Memcpy3DDeviceToHostShell<async>(Memcpy3DWrapper<>); }

-/*
- * This API sets the default values of hipMemcpy3DParms structure
- */
-template <typename T>
-void Memcpy3D<T>::SetDefaultData() {
-  myparms.srcPos = make_hipPos(0, 0, 0);
-  myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.extent = make_hipExtent(width , height, depth);
-}
-
-/*
- * Constructor initalized width,depth and height
- */
-template <typename T>
-Memcpy3D<T>::Memcpy3D(int l_width, int l_height, int l_depth,
-                      hipChannelFormatKind l_format) {
-  width = l_width;
-  height = l_height;
-  depth = l_depth;
-  formatKind = l_format;
-}
-
-/*
- * Allocating Memory and initalizing data for both
- * device and host variables
- */
-template <typename T>
-void Memcpy3D<T>::AllocateMemory() {
-  size = width * height * depth * sizeof(T);
-  hData = reinterpret_cast<T*>(malloc(size));
-  memset(hData, 0, size);
-  for (int i = 0; i < depth; i++) {
-    for (int j = 0; j < height; j++) {
-      for (int k = 0; k < width; k++) {
-        hData[i*width*height + j*width +k] = i*width*height + j*width + k;
-      }
+  SECTION("Device to Device") {
+    SECTION("Peer access disabled") {
+      Memcpy3DDeviceToDeviceShell<async, false>(Memcpy3DWrapper<>);
    }
+    SECTION("Peer access enabled") { Memcpy3DDeviceToDeviceShell<async, true>(Memcpy3DWrapper<>); }
  }
-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(T)*8,
-                                                          0, 0, 0, formatKind);
-  HIP_CHECK(hipMalloc3DArray(&arr, &channelDesc, make_hipExtent(width, height,
-                                                depth), hipArrayDefault));
-  HIP_CHECK(hipMalloc3DArray(&arr1, &channelDesc, make_hipExtent(width, height,
-                                                 depth), hipArrayDefault));
+
+  SECTION("Host to Device") { Memcpy3DHostToDeviceShell<async>(Memcpy3DWrapper<>); }
+
+  SECTION("Host to Host") { Memcpy3DHostToHostShell<async>(Memcpy3DWrapper<>); }
 }

-/*
- * DeAllocates the Memory of device and host variables
- */
-template <typename T>
-void Memcpy3D<T>::DeAllocateMemory() {
-    HIP_CHECK(hipFreeArray(arr));
-    HIP_CHECK(hipFreeArray(arr1));
-    free(hData);
+TEST_CASE("Unit_hipMemcpy3D_Positive_Synchronization_Behavior") {
+  HIP_CHECK(hipDeviceSynchronize());
+
+  SECTION("Host to Device") { Memcpy3DHtoDSyncBehavior(Memcpy3DWrapper<>, true); }
+
+  SECTION("Device to Pageable Host") { Memcpy3DDtoHPageableSyncBehavior(Memcpy3DWrapper<>, true); }
+
+  SECTION("Device to Pinned Host") { Memcpy3DDtoHPinnedSyncBehavior(Memcpy3DWrapper<>, true); }
+
+  SECTION("Device to Device") {
+#if HT_NVIDIA
+    Memcpy3DDtoDSyncBehavior(Memcpy3DWrapper<>, false);
+#else
+    Memcpy3DDtoDSyncBehavior(Memcpy3DWrapper<>, true);
+#endif
+  }
+
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-232
+  SECTION("Host to Host") { Memcpy3DHtoHSyncBehavior(Memcpy3DWrapper<>, true); }
+#endif
 }

-/*
- * This API verifies both H2D & D2H functionalities of hipMemcpy3D API
- * by allocating memory in one GPU and calling the hipMemcpy3D API
- * from another GPU.
- * H2D case:
- * Input : "hData" is  initialized with the respective offset value
- * Output: Destination array "arr" variable.
- *
- * D2H case:
- * Input: "arr" array variable from the above output
- * Output: "hOutputData" variable data is copied from "arr" variable
- *
- * Validating the result by comparing "hData" and "hOutputData" variables
- */
-template <typename T>
-void Memcpy3D<T>::D2H_H2D_DeviceMem_OnDiffDevice() {
-  HIP_CHECK(hipSetDevice(0));
-  int peerAccess = 0;
-  HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
-  if (peerAccess) {
-    AllocateMemory();
-    // Memory is allocated on device 0 and Memcpy3DAsync triggered from device 1
-    HIP_CHECK(hipSetDevice(1));
-
-    // H2D Scenario
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    SetDefaultData();
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    myparms.dstArray = arr;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyHostToDevice;
-#else
-    myparms.kind = hipMemcpyHostToDevice;
-#endif
-    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
-    HIP_CHECK(hipDeviceSynchronize());
-
-    // Device to host
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    T *hOutputData = reinterpret_cast<T*>(malloc(size));
-    memset(hOutputData, 0,  size);
-    SetDefaultData();
-    myparms.dstPtr = make_hipPitchedPtr(hOutputData,
-                                        width * sizeof(T),
-                                        width, height);
-    myparms.srcArray = arr;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToHost;
-#else
-    myparms.kind = hipMemcpyDeviceToHost;
-#endif
-    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
-    HIP_CHECK(hipDeviceSynchronize());
-
-    // Validating the result
-    HipTest::checkArray(hData, hOutputData, width, height, depth);
-    free(hOutputData);
-    DeAllocateMemory();
-  } else {
-    SUCCEED("Skipped the test as there is no peer access\n");
-  }
-}
-/*
- * This API verifies both D2D functionalities of hipMemcpy3D API
- * by allocating memory in one GPU and calling the hipMemcpy3D API
- * from another GPU.
- *
- * D2D case:
- * Input : "arr" variable  is  initialized with the "hData" variable in GPU-0
- * Output: "arr2" variable in GPU-0
- *
- * hipMemcpy3D API is triggered from GPU-1
- * The "arr2" variable is then copied to "hOutputData" for validating 
- * the result
- *
- * Validating the result by comparing "hData" and "hOutputData" variables
- */
-template <typename T>
-void Memcpy3D<T>::D2D_DeviceMem_OnDiffDevice() {
-  HIP_CHECK(hipSetDevice(0));
-  int peerAccess = 0;
-  HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 0, 1));
-  if (peerAccess) {
-    AllocateMemory();
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    SetDefaultData();
-
-    // Host to device copy
-    myparms.srcPtr = make_hipPitchedPtr(hData,
-        width * sizeof(T),
-        width, height);
-    myparms.dstArray = arr;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyHostToDevice;
-#else
-    myparms.kind = hipMemcpyHostToDevice;
-#endif
-    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
-    hipArray *arr2;
-    hipChannelFormatDesc channelDesc1 = hipCreateChannelDesc(sizeof(T)*8,
-        0, 0, 0, formatKind);
-    HIP_CHECK(hipMalloc3DArray(&arr2, &channelDesc1,
-          make_hipExtent(width, height,
-            depth), hipArrayDefault));
-
-    // Allocating Mem on GPU device 0 and trigger hipMemcpy3D from GPU 1
-    HIP_CHECK(hipSetDevice(1));
-
-    // D2D Scenario
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    SetDefaultData();
-    myparms.srcArray = arr;
-    myparms.dstArray = arr2;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-    myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
-    HIP_CHECK(hipDeviceSynchronize());
-
-    // For validating the D2D copy copying it again to hOutputData and
-    // verifying it with iniital data hData
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    T *hOutputData = reinterpret_cast<T*>(malloc(size));
-    memset(hOutputData, 0, size);
-    SetDefaultData();
-
-    // Device to host
-    myparms.dstPtr = make_hipPitchedPtr(hOutputData,
-        width * sizeof(T),
-        width, height);
-    myparms.srcArray = arr2;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToHost;
-#else
-    myparms.kind = hipMemcpyDeviceToHost;
-#endif
-    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
-    HIP_CHECK(hipDeviceSynchronize());
-    HipTest::checkArray(hData, hOutputData, width, height, depth);
-
-    // DeAllocating Memory
-    free(hOutputData);
-    DeAllocateMemory();
-  } else {
-    SUCCEED("Skipped the test as there is no peer access\n");
-  }
-}
-/*
- * This API verifies all the negative scenarios of hipMemcpy3D API
- */
-template <typename T>
-void Memcpy3D<T>::NegativeTests() {
-  HIP_CHECK(hipSetDevice(0));
-  AllocateMemory();
-
-  // Initialization of data
-  memset(&myparms, 0, sizeof(myparms));
-  myparms.srcPos = make_hipPos(0, 0, 0);
-  myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.extent = make_hipExtent(width , height, depth);
-#ifdef __HIP_PLATFORM_NVCC__
-  myparms.kind = cudaMemcpyHostToDevice;
-#else
-  myparms.kind = hipMemcpyHostToDevice;
-#endif
-
-  SECTION("Nullptr to destination array") {
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    myparms.dstArray = nullptr;
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Nullptr to source array") {
-    myparms.srcArray = nullptr;
-    myparms.dstPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing both Source ptr and array") {
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    myparms.srcArray = arr;
-    myparms.dstArray = arr1;
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing both destination ptr and array") {
-    myparms.dstPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    myparms.dstArray = arr;
-    myparms.srcArray = arr1;
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing Max value to extent") {
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    myparms.dstArray = arr;
-    myparms.extent = make_hipExtent(std::numeric_limits<int>::max(),
-                                    std::numeric_limits<int>::max(),
-                                    std::numeric_limits<int>::max());
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing Source pitchedPtr as nullptr") {
-    myparms.srcPtr = make_hipPitchedPtr(nullptr, width * sizeof(T),
-                                        width, height);
-    myparms.dstArray = arr;
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing Dst pitchedPtr as nullptr") {
-    myparms.dstPtr = make_hipPitchedPtr(nullptr, width * sizeof(T),
-                                        width, height);
-    myparms.srcArray = arr;
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing width > max width size in extent") {
-    myparms.extent = make_hipExtent(width+1 , height, depth);
-    myparms.dstArray = arr;
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing hgt > max width size in extent") {
-    myparms.extent = make_hipExtent(width , height+1, depth);
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    myparms.dstArray = arr;
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing depth > max width size in extent") {
-    myparms.extent = make_hipExtent(width , height, depth+1);
-    myparms.dstArray = arr;
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing dst width pos > max allocated width") {
-    myparms.dstPos = make_hipPos(width+1, 0, 0);
-    myparms.dstArray = arr;
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing dst height pos > max allocated hgt") {
-    myparms.dstPos = make_hipPos(0, height+1, 0);
-    myparms.dstArray = arr;
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing dst depth pos > max allocated depth") {
-    myparms.dstPos = make_hipPos(0, 0, depth+1);
-    myparms.dstArray = arr;
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing src width pos > max allocated width") {
-    myparms.srcPos = make_hipPos(width+1, 0, 0);
-    myparms.srcArray = arr;
-    myparms.dstArray = arr1;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-    myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing src height pos > max allocated hgt") {
-    myparms.srcPos = make_hipPos(0, height+1, 0);
-    myparms.srcArray = arr;
-    myparms.dstArray = arr1;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-    myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing src height pos > max allocated hgt") {
-    myparms.srcPos = make_hipPos(0, 0, depth+1);
-    myparms.srcArray = arr;
-    myparms.dstArray = arr1;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-    myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  SECTION("Passing src array size  > dst array size") {
-    // Passing src array size greater than destination array size
-    hipArray *arr2;
-    hipChannelFormatDesc channelDesc1 = hipCreateChannelDesc(sizeof(T)*8,
-                                                    0, 0, 0, formatKind);
-    HIP_CHECK(hipMalloc3DArray(&arr2, &channelDesc1,
-                              make_hipExtent(3, 3
-                              , 3), hipArrayDefault));
-    myparms.srcArray = arr;
-    myparms.dstArray = arr2;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-    myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
-  }
-
-  // DeAllocation of memory
-  DeAllocateMemory();
+TEST_CASE("Unit_hipMemcpy3D_Positive_Parameters") {
+  constexpr bool async = false;
+  Memcpy3DZeroWidthHeightDepth<async>(Memcpy3DWrapper<async>);
 }

-/*
- * This API verifies the Extent validation Scenarios
- */
-template <typename T>
-void Memcpy3D<T>::Extent_Validation() {
-  HIP_CHECK(hipSetDevice(0));
-  AllocateMemory();
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-  myparms.srcPos = make_hipPos(0, 0, 0);
-  myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                      width, height);
-  myparms.dstArray = arr;
-#ifdef __HIP_PLATFORM_NVCC__
-  myparms.kind = cudaMemcpyHostToDevice;
-#else
-  myparms.kind = hipMemcpyHostToDevice;
+TEST_CASE("Unit_hipMemcpy3D_Positive_Array") {
+  constexpr bool async = false;
+  SECTION("Array from/to Host") { Memcpy3DArrayHostShell<async>(Memcpy3DWrapper<async>); }
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-238
+  SECTION("Array from/to Device") { Memcpy3DArrayDeviceShell<async>(Memcpy3DWrapper<async>); }
 #endif
-  SECTION("Passing Extent as 0") {
-    myparms.extent = make_hipExtent(0 , 0, 0);
-    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
-  }
-  SECTION("Passing Width 0 in Extent") {
-    myparms.extent = make_hipExtent(0 , height, depth);
-    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
-  }
-  SECTION("Passing Height 0 in Extent") {
-    myparms.extent = make_hipExtent(width , 0, depth);
-    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
-  }
-  SECTION("Passing Depth 0 in Extent") {
-    myparms.extent = make_hipExtent(width , height, 0);
-    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
-  }
-  SECTION("Passing Depth 0 in Extent") {
-    REQUIRE(hipMemcpy3D(nullptr) != hipSuccess);
-  }
-  DeAllocateMemory();
 }

-/*
- * This API verifies H2H-D2D-D2H functionalities of hipMemcpy3D API
- *
- * Input : "arr" variable  is  initialized with the "hData" variable in GPU-0
- * Output: "arr1" variable in GPU-0
- *
- * The "arr1" variable is then copied to "hOutputData" for validating
- * the result
- *
- * Validating the result by comparing "hData" and "hOutputData" variables
- */
+TEST_CASE("Unit_hipMemcpy3D_Negative_Parameters") {
+  constexpr hipExtent extent{128 * sizeof(int), 128, 8};

-template <typename T>
-void Memcpy3D<T>::simple_Memcpy3D() {
-  HIP_CHECK(hipSetDevice(0));
-  AllocateMemory();
-
-  // Host to Device
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-  SetDefaultData();
-  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                      width, height);
-  myparms.dstArray = arr;
-#ifdef __HIP_PLATFORM_NVCC__
-  myparms.kind = cudaMemcpyHostToDevice;
-#else
-  myparms.kind = hipMemcpyHostToDevice;
-#endif
-  REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
-
-  // Array to Array
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-  SetDefaultData();
-  myparms.srcArray = arr;
-  myparms.dstArray = arr1;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-    myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-  REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
-  T *hOutputData = reinterpret_cast<T*>(malloc(size));
-  memset(hOutputData, 0,  size);
-
-  // Device to host
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-  SetDefaultData();
-  myparms.dstPtr = make_hipPitchedPtr(hOutputData,
-                   width * sizeof(T), width, height);
-  myparms.srcArray = arr1;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToHost;
-#else
-    myparms.kind = hipMemcpyDeviceToHost;
-#endif
-  REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
-
-  // Validating the result
-  HipTest::checkArray(hData, hOutputData, width, height, depth);
-
-  // DeAllocating the Memory
-  free(hOutputData);
-  DeAllocateMemory();
-}
-/*
- This testcase performs hipMemcpy3D API validation for 
- different datatypes and different sizes
-*/
-TEMPLATE_TEST_CASE("Unit_hipMemcpy3D_Basic", "[hipMemcpy3D]",
-    int, unsigned int, float) {
-  int device = -1;
-  HIP_CHECK(hipGetDevice(&device));
-  hipDeviceProp_t prop;
-  HIP_CHECK(hipGetDeviceProperties(&prop,device));
-  auto i = GENERATE_COPY(10, 100, 1024, prop.maxTexture3D[0]);
-  auto j = GENERATE(10, 100);
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    if (std::is_same<TestType, float>::value) {
-      Memcpy3D<TestType> memcpy3d_obj(i, j, j, hipChannelFormatKindFloat);
-      memcpy3d_obj.simple_Memcpy3D();
-    } else if (std::is_same<TestType, unsigned int>::value)  {
-      Memcpy3D<TestType> memcpy3d_obj(i, j, j, hipChannelFormatKindUnsigned);
-      memcpy3d_obj.simple_Memcpy3D();
-    } else if (std::is_same<TestType, int>::value)  {
-      Memcpy3D<TestType> memcpy3d_obj(i, j, j, hipChannelFormatKindSigned);
-      memcpy3d_obj.simple_Memcpy3D();
-    }
-  } else {
-    SUCCEED("skipping the testcases as numDevices < 2");
-  }
-}
-
-/*
-This testcase performs the extent validation scenarios of
-hipMemcpy3D API
-*/
-TEST_CASE("Unit_hipMemcpy3D_ExtentValidation") {
-  Memcpy3D<int> memcpy3d(width, height, depth,
-                         hipChannelFormatKindSigned);
-  memcpy3d.Extent_Validation();
-}
-
-/*
-This testcase performs the negative scenarios of
-hipMemcpy3D API
-*/
-TEST_CASE("Unit_hipMemcpy3D_multiDevice-Negative") {
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    Memcpy3D<int> memcpy3d(width, height, depth,
-                           hipChannelFormatKindSigned);
-    memcpy3d.NegativeTests();
-  } else {
-    SUCCEED("skipping the testcases as numDevices < 2");
-  }
-}
-
-/*
-This testcase performs the D2H,H2D and D2D on peer
-GPU device
-*/
-TEST_CASE("Unit_hipMemcpy3D_multiDevice-OnPeerDevice") {
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    SECTION("D2H & H2D On DiffDevice") {
-      Memcpy3D<float> memcpy3d_d2h_obj(width, height, depth,
-                                       hipChannelFormatKindFloat);
-      memcpy3d_d2h_obj.D2H_H2D_DeviceMem_OnDiffDevice();
+  constexpr auto NegativeTests = [](hipPitchedPtr dst_ptr, hipPos dst_pos, hipPitchedPtr src_ptr,
+                                    hipPos src_pos, hipExtent extent, hipMemcpyKind kind) {
+    SECTION("dst_ptr.ptr == nullptr") {
+      hipPitchedPtr invalid_ptr = dst_ptr;
+      invalid_ptr.ptr = nullptr;
+      HIP_CHECK_ERROR(Memcpy3DWrapper(invalid_ptr, dst_pos, src_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
    }

-    SECTION("D2D On DiffDevice") {
-      Memcpy3D<float> memcpy3d_d2d_obj(width, height, depth,
-                                       hipChannelFormatKindFloat);
-      memcpy3d_d2d_obj.D2D_DeviceMem_OnDiffDevice();
+    SECTION("src_ptr.ptr == nullptr") {
+      hipPitchedPtr invalid_ptr = src_ptr;
+      invalid_ptr.ptr = nullptr;
+      HIP_CHECK_ERROR(Memcpy3DWrapper(dst_ptr, dst_pos, invalid_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
    }
-  } else {
-    SUCCEED("skipping the testcases as numDevices < 2");
+
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-239
+    SECTION("dst_ptr.pitch < width") {
+      hipPitchedPtr invalid_ptr = dst_ptr;
+      invalid_ptr.pitch = extent.width - 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper(invalid_ptr, dst_pos, src_ptr, src_pos, extent, kind),
+                      hipErrorInvalidPitchValue);
+    }
+
+    SECTION("src_ptr.pitch < width") {
+      hipPitchedPtr invalid_ptr = src_ptr;
+      invalid_ptr.pitch = extent.width - 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper(dst_ptr, dst_pos, invalid_ptr, src_pos, extent, kind),
+                      hipErrorInvalidPitchValue);
+    }
+#endif
+
+    SECTION("dst_ptr.pitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      hipPitchedPtr invalid_ptr = dst_ptr;
+      invalid_ptr.pitch = attr;
+      HIP_CHECK_ERROR(Memcpy3DWrapper(invalid_ptr, dst_pos, src_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("src_ptr.pitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      hipPitchedPtr invalid_ptr = src_ptr;
+      invalid_ptr.pitch = attr;
+      HIP_CHECK_ERROR(Memcpy3DWrapper(dst_ptr, dst_pos, invalid_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-237
+    SECTION("extent.width + dst_pos.x > dst_ptr.pitch") {
+      hipPos invalid_pos = dst_pos;
+      invalid_pos.x = dst_ptr.pitch - extent.width + 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper(dst_ptr, invalid_pos, src_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("extent.width + src_pos.x > src_ptr.pitch") {
+      hipPos invalid_pos = src_pos;
+      invalid_pos.x = src_ptr.pitch - extent.width + 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper(dst_ptr, dst_pos, src_ptr, invalid_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("dst_pos.y out of bounds") {
+      hipPos invalid_pos = dst_pos;
+      invalid_pos.y = 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper(dst_ptr, invalid_pos, src_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("src_pos.y out of bounds") {
+      hipPos invalid_pos = src_pos;
+      invalid_pos.y = 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper(dst_ptr, dst_pos, src_ptr, invalid_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("dst_pos.z out of bounds") {
+      hipPos invalid_pos = dst_pos;
+      invalid_pos.z = 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper(dst_ptr, invalid_pos, src_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("src_pos.z out of bounds") {
+      hipPos invalid_pos = src_pos;
+      invalid_pos.z = 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper(dst_ptr, dst_pos, src_ptr, invalid_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+#endif
+
+#if HT_NVIDIA  // Disable on AMD due to defect - EXSWHTEC-234
+    SECTION("Invalid MemcpyKind") {
+      HIP_CHECK_ERROR(Memcpy3DWrapper(dst_ptr, dst_pos, src_ptr, src_pos, extent,
+                                      static_cast<hipMemcpyKind>(-1)),
+                      hipErrorInvalidMemcpyDirection);
+    }
+#endif
+  };
+
+  SECTION("Host to Device") {
+    LinearAllocGuard3D<int> device_alloc(extent);
+    LinearAllocGuard<int> host_alloc(
+        LinearAllocs::hipHostMalloc,
+        device_alloc.pitch() * device_alloc.height() * device_alloc.depth());
+    NegativeTests(device_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+                  make_hipPitchedPtr(host_alloc.ptr(), device_alloc.pitch(), device_alloc.width(),
+                                     device_alloc.height()),
+                  make_hipPos(0, 0, 0), extent, hipMemcpyHostToDevice);
+  }
+
+  SECTION("Device to Host") {
+    LinearAllocGuard3D<int> device_alloc(extent);
+    LinearAllocGuard<int> host_alloc(
+        LinearAllocs::hipHostMalloc,
+        device_alloc.pitch() * device_alloc.height() * device_alloc.depth());
+    NegativeTests(make_hipPitchedPtr(host_alloc.ptr(), device_alloc.pitch(), device_alloc.width(),
+                                     device_alloc.height()),
+                  make_hipPos(0, 0, 0), device_alloc.pitched_ptr(), make_hipPos(0, 0, 0), extent,
+                  hipMemcpyDeviceToHost);
+  }
+
+  SECTION("Host to Host") {
+    LinearAllocGuard<int> src_alloc(LinearAllocs::hipHostMalloc,
+                                    extent.width * extent.height * extent.depth);
+    LinearAllocGuard<int> dst_alloc(LinearAllocs::hipHostMalloc,
+                                    extent.width * extent.height * extent.depth);
+    NegativeTests(make_hipPitchedPtr(dst_alloc.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPos(0, 0, 0),
+                  make_hipPitchedPtr(src_alloc.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPos(0, 0, 0), extent, hipMemcpyHostToHost);
+  }
+
+  SECTION("Device to Device") {
+    LinearAllocGuard3D<int> src_alloc(extent);
+    LinearAllocGuard3D<int> dst_alloc(extent);
+    NegativeTests(dst_alloc.pitched_ptr(), make_hipPos(0, 0, 0), src_alloc.pitched_ptr(),
+                  make_hipPos(0, 0, 0), extent, hipMemcpyDeviceToDevice);
  }
 }
@@ -1,13 +1,16 @@
 /*
-Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
+
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -17,739 +20,228 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/*
- * This testfile verifies the following Scenarios of hipMemcpy3DAsync API
-
- * 1. Verifying hipMemcpy3DAsync API for H2D,D2D and D2H scenarios
- * 2. Verifying Negative Scenarios
- * 3. Verifying Extent validation scenarios by passing 0
- * 4. Verifying hipMemcpy3DAsync API by allocating Memory in
- *    one GPU and trigger hipMemcpy3D from peer GPU
- * 5. D2D where src and dst memory on GPU-0 and stream on GPU-1
-*/
+#include <memcpy1d_tests_common.hh>
+#include <memcpy3d_tests_common.hh>

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
+#include <hip/hip_runtime_api.h>
+#include <resource_guards.hh>
+#include <utils.hh>

-static constexpr auto width{10};
-static constexpr auto height{10};
-static constexpr auto depth{10};
+TEST_CASE("Unit_hipMemcpy3DAsync_Positive_Basic") {
+  constexpr bool async = true;

-template <typename T>
-class Memcpy3DAsync {
-    int width, height, depth;
-    unsigned int size;
-    hipArray *arr, *arr1;
-    hipChannelFormatKind formatKind;
-    hipMemcpy3DParms myparms;
-    T* hData;
-    hipStream_t stream;
- public:
-    Memcpy3DAsync(int l_width, int l_height, int l_depth,
-                  hipChannelFormatKind l_format);
-    void simple_Memcpy3DAsync();
-    void Extent_Validation();
-    void NegativeTests();
-    void AllocateMemory();
-    void DeAllocateMemory();
-    void SetDefaultData();
-    void D2D_SameDeviceMem_StreamDiffDevice();
-    void D2D_DeviceMem_OnDiffDevice();
-    void D2H_H2D_DeviceMem_OnDiffDevice();
-};
+  const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
+  const StreamGuard stream_guard(stream_type);
+  const hipStream_t stream = stream_guard.stream();

-/*
- * This API sets the default values of hipMemcpy3DParms structure
- */
-template <typename T>
-void Memcpy3DAsync<T>::SetDefaultData() {
-  myparms.srcPos = make_hipPos(0, 0, 0);
-  myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.extent = make_hipExtent(width , height, depth);
-}
+  SECTION("Device to Host") { Memcpy3DDeviceToHostShell<async>(Memcpy3DWrapper<async>, stream); }

-/*
- * Constructor initalized width,depth and height
- */
-template <typename T>
-Memcpy3DAsync<T>::Memcpy3DAsync(int l_width, int l_height, int l_depth,
-                      hipChannelFormatKind l_format) {
-  width = l_width;
-  height = l_height;
-  depth = l_depth;
-  formatKind = l_format;
-}
-
-/*
- * Allocating Memory and initalizing data for both
- * device and host variables
- */
-template <typename T>
-void Memcpy3DAsync<T>::AllocateMemory() {
-  size = width * height * depth * sizeof(T);
-  hData = reinterpret_cast<T*>(malloc(size));
-  memset(hData, 0, size);
-  for (int i = 0; i < depth; i++) {
-    for (int j = 0; j < height; j++) {
-      for (int k = 0; k < width; k++) {
-        hData[i*width*height + j*width +k] = i*width*height + j*width + k;
-      }
+  SECTION("Device to Device") {
+    SECTION("Peer access disabled") {
+      Memcpy3DDeviceToDeviceShell<async, false>(Memcpy3DWrapper<async>, stream);
+    }
+    SECTION("Peer access enabled") {
+      Memcpy3DDeviceToDeviceShell<async, true>(Memcpy3DWrapper<async>, stream);
    }
  }
-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(T)*8,
-                                                          0, 0, 0, formatKind);
-  HIP_CHECK(hipMalloc3DArray(&arr, &channelDesc, make_hipExtent(width, height,
-                                                depth), hipArrayDefault));
-  HIP_CHECK(hipMalloc3DArray(&arr1, &channelDesc, make_hipExtent(width, height,
-                                                 depth), hipArrayDefault));
+
+  SECTION("Host to Device") { Memcpy3DHostToDeviceShell<async>(Memcpy3DWrapper<async>, stream); }
+
+  SECTION("Host to Host") { Memcpy3DHostToHostShell<async>(Memcpy3DWrapper<async>, stream); }
 }

-/*
- * DeAllocates the Memory of device and host variables
- */
-template <typename T>
-void Memcpy3DAsync<T>::DeAllocateMemory() {
-  HIP_CHECK(hipFreeArray(arr));
-  HIP_CHECK(hipFreeArray(arr1));
-  free(hData);
-  HIP_CHECK(hipStreamDestroy(stream));
+TEST_CASE("Unit_hipMemcpy3DAsync_Positive_Synchronization_Behavior") {
+  constexpr bool async = true;
+
+  HIP_CHECK(hipDeviceSynchronize());
+
+  SECTION("Host to Device") { Memcpy3DHtoDSyncBehavior(Memcpy3DWrapper<async>, false); }
+
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-233
+  SECTION("Device to Pageable Host") {
+    Memcpy3DDtoHPageableSyncBehavior(Memcpy3DWrapper<async>, true);
+  }
+#endif
+
+  SECTION("Device to Pinned Host") {
+    Memcpy3DDtoHPinnedSyncBehavior(Memcpy3DWrapper<async>, false);
+  }
+
+  SECTION("Device to Device") { Memcpy3DDtoDSyncBehavior(Memcpy3DWrapper<async>, false); }
+
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-233
+  SECTION("Host to Host") { Memcpy3DHtoHSyncBehavior(Memcpy3DWrapper<async>, true); }
+#endif
 }

-/*
- * This API verifies both H2D & D2H functionalities of hipMemcpy3DAsync API
- * by allocating memory in one GPU and calling the hipMemcpy3DAsync API
- * from another GPU.
- * H2D case:
- * Input : "hData" is  initialized with the respective offset value
- * Output: Destination array "arr" variable.
- *
- * D2H case:
- * Input: "arr" array variable from the above output
- * Output: "hOutputData" variable data is copied from "arr" variable
- *
- * Validating the result by comparing "hData" and "hOutputData" variables
- */
-template <typename T>
-void Memcpy3DAsync<T>::D2H_H2D_DeviceMem_OnDiffDevice() {
-  HIP_CHECK(hipSetDevice(0));
-  int peerAccess = 0;
-  HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
-  if (peerAccess) {
-    AllocateMemory();
-
-    // Memory is allocated on device 0 and Memcpy3DAsyncAsync
-    // triggered from device 1
-    HIP_CHECK(hipSetDevice(1));
-    HIP_CHECK(hipStreamCreate(&stream));
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    SetDefaultData();
-
-    // Host to Device
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-        width, height);
-    myparms.dstArray = arr;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyHostToDevice;
-#else
-    myparms.kind = hipMemcpyHostToDevice;
-#endif
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-    HIP_CHECK(hipStreamSynchronize(stream));
-
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    T *hOutputData = reinterpret_cast<T*>(malloc(size));
-    memset(hOutputData, 0,  size);
-    SetDefaultData();
-
-    // Device to host
-    myparms.dstPtr = make_hipPitchedPtr(hOutputData,
-        width * sizeof(T),
-        width, height);
-    myparms.srcArray = arr;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToHost;
-#else
-    myparms.kind = hipMemcpyDeviceToHost;
-#endif
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-    HIP_CHECK(hipStreamSynchronize(stream));
-
-    // Validating the result
-    HipTest::checkArray(hData, hOutputData, width, height, depth);
-    free(hOutputData);
-
-    // DeAllocating the Memory
-    DeAllocateMemory();
-  } else {
-    SUCCEED("Skipped the test as there is no peer access");
-  }
+TEST_CASE("Unit_hipMemcpy3DAsync_Positive_Parameters") {
+  constexpr bool async = true;
+  Memcpy3DZeroWidthHeightDepth<async>(Memcpy3DWrapper<async>);
 }

-/*
- * This API verifies both D2D functionalities of hipMemcpy3DAsync API
- * by allocating memory in one GPU and calling the hipMemcpy3DAsync API
- * from another GPU.
- *
- * D2D case:
- * Input : "arr" variable  is  initialized with the "hData" variable in GPU-0
- * Output: "arr2" variable in GPU-0
- *
- * hipMemcpy3DAsync API is triggered from GPU-1
- * The "arr2" variable is then copied to "hOutputData" for validating
- * the result
- *
- * Validating the result by comparing "hData" and "hOutputData" variables
- */
-template <typename T>
-void Memcpy3DAsync<T>::D2D_DeviceMem_OnDiffDevice() {
-  HIP_CHECK(hipSetDevice(0));
-  int peerAccess = 0;
-  HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 0, 1));
-  if (peerAccess) {
-    // Allocating Memory and setting default data
-    AllocateMemory();
-    hipStream_t stream1;
-    HIP_CHECK(hipStreamCreate(&stream1));
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    SetDefaultData();
-
-    // Host to Device Scenario
-    myparms.srcPtr = make_hipPitchedPtr(hData,
-                                        width * sizeof(T),
-                                        width, height);
-    myparms.dstArray = arr;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyHostToDevice;
-#else
-    myparms.kind = hipMemcpyHostToDevice;
+TEST_CASE("Unit_hipMemcpy3DAsync_Positive_Array") {
+  constexpr bool async = true;
+  SECTION("Array from/to Host") { Memcpy3DArrayHostShell<async>(Memcpy3DWrapper<async>); }
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-238
+  SECTION("Array from/to Device") { Memcpy3DArrayDeviceShell<async>(Memcpy3DWrapper<async>); }
 #endif
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream1) == hipSuccess);
-    HIP_CHECK(hipStreamSynchronize(stream1));
-
-    // Allocating Mem on GPU device 0 and trigger hipMemcpy3DAsync from GPU 1
-    HIP_CHECK(hipSetDevice(1));
-    HIP_CHECK(hipStreamCreate(&stream));
-    hipArray *arr2;
-    hipChannelFormatDesc channelDesc1 = hipCreateChannelDesc(sizeof(T)*8,
-                                                    0, 0, 0, formatKind);
-    HIP_CHECK(hipMalloc3DArray(&arr2, &channelDesc1,
-                              make_hipExtent(width, height,
-                              depth), hipArrayDefault));
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    SetDefaultData();
-
-    // Device to Device
-    myparms.srcArray = arr;
-    myparms.dstArray = arr2;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-    myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-    HIP_CHECK(hipStreamSynchronize(stream));
-
-    // For validating the D2D copy copying it again to hOutputData and
-    // verifying it with iniital data hData
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    T *hOutputData = reinterpret_cast<T*>(malloc(size));
-    memset(hOutputData, 0, size);
-    SetDefaultData();
-
-    // Device to host
-    myparms.dstPtr = make_hipPitchedPtr(hOutputData,
-                                        width * sizeof(T),
-                                        width, height);
-    myparms.srcArray = arr2;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToHost;
-#else
-    myparms.kind = hipMemcpyDeviceToHost;
-#endif
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream)== hipSuccess);
-    HIP_CHECK(hipStreamSynchronize(stream));
-
-    // Validating the result
-    HipTest::checkArray(hData, hOutputData, width, height, depth);
-
-    // Deleting the memory
-    free(hOutputData);
-    DeAllocateMemory();
-  } else {
-    SUCCEED("Skipped the test as there is no peer access");
-  }
 }

-/*
- * This API verifies all the negative scenarios of hipMemcpy3D API
-*/
-template <typename T>
-void Memcpy3DAsync<T>::NegativeTests() {
-  HIP_CHECK(hipSetDevice(0));
-  AllocateMemory();
-  HIP_CHECK(hipStreamCreate(&stream));
+TEST_CASE("Unit_hipMemcpy3DAsync_Negative_Parameters") {
+  constexpr bool async = true;
+  constexpr hipExtent extent{128 * sizeof(int), 128, 8};

-  // Initialization of data
-  memset(&myparms, 0, sizeof(myparms));
-  myparms.srcPos = make_hipPos(0, 0, 0);
-  myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.extent = make_hipExtent(width , height, depth);
-#ifdef __HIP_PLATFORM_NVCC__
-  myparms.kind = cudaMemcpyHostToDevice;
-#else
-  myparms.kind = hipMemcpyHostToDevice;
-#endif
-
-  SECTION("Nullptr to destination array") {
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-        width, height);
-    myparms.dstArray = nullptr;
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Nullptr to source array") {
-    myparms.srcArray = nullptr;
-    myparms.dstPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-        width, height);
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing both Source ptr and array") {
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    myparms.srcArray = arr;
-    myparms.dstArray = arr1;
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing both destination ptr and array") {
-    myparms.dstPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    myparms.dstArray = arr;
-    myparms.srcArray = arr1;
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing Max value to extent") {
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    myparms.dstArray = arr;
-    myparms.extent = make_hipExtent(std::numeric_limits<int>::max(),
-                                    std::numeric_limits<int>::max(),
-                                    std::numeric_limits<int>::max());
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing Source pitchedPtr as nullptr") {
-    myparms.srcPtr = make_hipPitchedPtr(nullptr, width * sizeof(T),
-                                        width, height);
-    myparms.dstArray = arr;
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing Dst pitchedPtr as nullptr") {
-    myparms.dstPtr = make_hipPitchedPtr(nullptr, width * sizeof(T),
-                                        width, height);
-    myparms.srcArray = arr;
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing width > max width size in extent") {
-    myparms.extent = make_hipExtent(width+1 , height, depth);
-    myparms.dstArray = arr;
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing hgt > max width size in extent") {
-    myparms.extent = make_hipExtent(width , height+1, depth);
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    myparms.dstArray = arr;
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing depth > max width size in extent") {
-    myparms.extent = make_hipExtent(width , height, depth+1);
-    myparms.dstArray = arr;
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing dst width pos > max allocated width") {
-    myparms.dstPos = make_hipPos(width+1, 0, 0);
-    myparms.dstArray = arr;
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing dst height pos > max allocated hgt") {
-    myparms.dstPos = make_hipPos(0, height+1, 0);
-    myparms.dstArray = arr;
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing dst depth pos > max allocated depth") {
-    myparms.dstPos = make_hipPos(0, 0, depth+1);
-    myparms.dstArray = arr;
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
-                                        width, height);
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing src width pos > max allocated width") {
-    myparms.srcPos = make_hipPos(width+1, 0, 0);
-    myparms.srcArray = arr;
-    myparms.dstArray = arr1;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-    myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing src height pos > max allocated hgt") {
-    myparms.srcPos = make_hipPos(0, height+1, 0);
-    myparms.srcArray = arr;
-    myparms.dstArray = arr1;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-    myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing src height pos > max allocated hgt") {
-    myparms.srcPos = make_hipPos(0, 0, depth+1);
-    myparms.srcArray = arr;
-    myparms.dstArray = arr1;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-    myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing src array size  > dst array size") {
-    hipArray *arr2;
-    hipChannelFormatDesc channelDesc1 = hipCreateChannelDesc(sizeof(T)*8,
-        0, 0, 0, formatKind);
-    HIP_CHECK(hipMalloc3DArray(&arr2, &channelDesc1,
-                              make_hipExtent(3, 3
-                              , 3), hipArrayDefault));
-    myparms.srcArray = arr;
-    myparms.dstArray = arr2;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-    myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
-  }
-
-  SECTION("Passing nullptr to hipMemcpy3DAsync") {
-    REQUIRE(hipMemcpy3DAsync(nullptr, stream) != hipSuccess);
-  }
-  // DeAllocating of memory
-  DeAllocateMemory();
-}
-
-/*
- * This API verifies both D2D functionalities of hipMemcpy3DAsync API
- * by allocating memory in one GPU and calling the hipMemcpy3DAsync API
- * from another GPU.
- *
- * D2D case:
- * Input : "arr" variable  is  initialized with the "hData" variable in GPU-0
- * Output: "arr1" variable in GPU-0
- *
- * Stream on GPU-1
- * The "arr2" variable is then copied to "hOutputData" for validating
- * the result
- *
- * Validating the result by comparing "hData" and "hOutputData" variables
- */
-template <typename T>
-void Memcpy3DAsync<T>::D2D_SameDeviceMem_StreamDiffDevice() {
-  HIP_CHECK(hipSetDevice(0));
-  // Allocating the Memory
-  int peerAccess = 0;
-  HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 0, 1));
-  if (peerAccess) {
-    AllocateMemory();
-    HIP_CHECK(hipSetDevice(1));
-    HIP_CHECK(hipStreamCreate(&stream));
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    SetDefaultData();
-
-    // Host to Device
-    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T), width, height);
-    myparms.dstArray = arr;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyHostToDevice;
-#else
-    myparms.kind = hipMemcpyHostToDevice;
-#endif
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-    HIP_CHECK(hipStreamSynchronize(stream));
-
-    // Array to Array
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    SetDefaultData();
-    myparms.srcArray = arr;
-    myparms.dstArray = arr1;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-    myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-    HIP_CHECK(hipStreamSynchronize(stream));
-    T *hOutputData = reinterpret_cast<T*>(malloc(size));
-    memset(hOutputData, 0,  size);
-
-    // Device to host
-    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-    SetDefaultData();
-    myparms.dstPtr = make_hipPitchedPtr(hOutputData,
-        width * sizeof(T), width, height);
-    myparms.srcArray = arr1;
-#ifdef __HIP_PLATFORM_NVCC__
-    myparms.kind = cudaMemcpyDeviceToHost;
-#else
-    myparms.kind = hipMemcpyDeviceToHost;
-#endif
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-    HIP_CHECK(hipStreamSynchronize(stream));
-
-    // Validating the result
-    HipTest::checkArray(hData, hOutputData, width, height, depth);
-
-    // Deallocating the memory
-    free(hOutputData);
-    DeAllocateMemory();
-  } else {
-    SUCCEED("Skipped the test as there is no peer access");
-  }
-}
-
-/*
- * This API verifies the Extent validation Scenarios
- */
-template <typename T>
-void Memcpy3DAsync<T>::Extent_Validation() {
-  HIP_CHECK(hipSetDevice(0));
-  AllocateMemory();
-  HIP_CHECK(hipStreamCreate(&stream));
-  // Passing extent as 0
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-  myparms.srcPos = make_hipPos(0, 0, 0);
-  myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T), width, height);
-  myparms.dstArray = arr;
-#ifdef __HIP_PLATFORM_NVCC__
-  myparms.kind = cudaMemcpyHostToDevice;
-#else
-  myparms.kind = hipMemcpyHostToDevice;
-#endif
-  SECTION("Passing Extent as 0") {
-    myparms.extent = make_hipExtent(0 , 0, 0);
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-  }
-  SECTION("Passing Width 0 in Extent") {
-    myparms.extent = make_hipExtent(0 , height, depth);
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-  }
-  SECTION("Passing Height 0 in Extent") {
-    myparms.extent = make_hipExtent(width , 0, depth);
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-  }
-  SECTION("Passing Depth 0 in Extent") {
-    myparms.extent = make_hipExtent(width , height, 0);
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-  }
-  DeAllocateMemory();
-}
-
-/*
- * This API verifies H2H-D2D-D2H functionalities of hipMemcpy3DAsync API
- *
- * Input : "arr" variable  is  initialized with the "hData" variable in GPU-0
- * Output: "arr1" variable in GPU-0
- *
- * The "arr1" variable is then copied to "hOutputData" for validating
- * the result
- *
- * Validating the result by comparing "hData" and "hOutputData" variables
- */
-template <typename T>
-void Memcpy3DAsync<T>::simple_Memcpy3DAsync() {
-  HIP_CHECK(hipSetDevice(0));
-
-  // Allocating the Memory
-  AllocateMemory();
-  HIP_CHECK(hipStreamCreate(&stream));
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-  SetDefaultData();
-
-  // Host to Device
-  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T), width, height);
-  myparms.dstArray = arr;
-#ifdef __HIP_PLATFORM_NVCC__
-  myparms.kind = cudaMemcpyHostToDevice;
-#else
-  myparms.kind = hipMemcpyHostToDevice;
-#endif
-  SECTION("Calling hipMemcpy3DAsync() using user declared stream obj") {
-    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-    HIP_CHECK(hipStreamSynchronize(stream));
-  }
-  SECTION("Calling hipMemcpy3DAsync() using hipStreamPerThread") {
-    REQUIRE(hipMemcpy3DAsync(&myparms, hipStreamPerThread) == hipSuccess);
-    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
-  }
-
-  // Array to Array
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-  SetDefaultData();
-  myparms.srcArray = arr;
-  myparms.dstArray = arr1;
-#ifdef __HIP_PLATFORM_NVCC__
-  myparms.kind = cudaMemcpyDeviceToDevice;
-#else
-  myparms.kind = hipMemcpyDeviceToDevice;
-#endif
-  REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-  HIP_CHECK(hipStreamSynchronize(stream));
-  T *hOutputData = reinterpret_cast<T*>(malloc(size));
-  memset(hOutputData, 0,  size);
-
-  // Device to host
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-  SetDefaultData();
-  myparms.dstPtr = make_hipPitchedPtr(hOutputData,
-      width * sizeof(T), width, height);
-  myparms.srcArray = arr1;
-#ifdef __HIP_PLATFORM_NVCC__
-  myparms.kind = cudaMemcpyDeviceToHost;
-#else
-  myparms.kind = hipMemcpyDeviceToHost;
-#endif
-  REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
-  HIP_CHECK(hipStreamSynchronize(stream));
-
-  // Validating the result
-  HipTest::checkArray(hData, hOutputData, width, height, depth);
-
-  // DeAllocating the memory
-  free(hOutputData);
-  DeAllocateMemory();
-}
-/*
-This testcase verifies hipMemcpyAsync for different datatypes
-and different sizes
-*/
-TEMPLATE_TEST_CASE("Unit_hipMemcpy3DAsync_Basic",
-                   "[hipMemcpy3DAsync]",
-                   int, unsigned int, float) {
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  int device = -1;
-  HIP_CHECK(hipGetDevice(&device));
-  hipDeviceProp_t prop;
-  HIP_CHECK(hipGetDeviceProperties(&prop,device));
-  auto i = GENERATE_COPY(10, 100, 1024, prop.maxTexture3D[0]);
-  auto j = GENERATE(10, 100);
-  if (numDevices > 1) {
-      if (std::is_same<TestType, int>::value)  {
-        Memcpy3DAsync<TestType> memcpy3d_obj(i, j, j,
-                                             hipChannelFormatKindSigned);
-        memcpy3d_obj.simple_Memcpy3DAsync();
-      } else if (std::is_same<TestType, unsigned int>::value)  {
-        Memcpy3DAsync<TestType> memcpy3d_obj(i, j, j,
-                                             hipChannelFormatKindUnsigned);
-        memcpy3d_obj.simple_Memcpy3DAsync();
-      } else if (std::is_same<TestType, float>::value) {
-        Memcpy3DAsync<TestType> memcpy3d_obj(i, j, j,
-                                             hipChannelFormatKindFloat);
-        memcpy3d_obj.simple_Memcpy3DAsync();
-      }
-  } else {
-    SUCCEED("skipping the testcases as numDevices < 2");
-  }
-}
-
-/*
-This testcase performs the extent validation scenarios of
-hipMemcpy3D API
-*/
-TEST_CASE("Unit_hipMemcpy3DAsync_ExtentValidation") {
-  Memcpy3DAsync<int> memcpy3d(width, height, depth,
-                              hipChannelFormatKindSigned);
-  memcpy3d.Extent_Validation();
-}
-
-/*
-This testcase performs the negative scenarios of
-hipMemcpy3DAsync API
-*/
-TEST_CASE("Unit_hipMemcpy3DAsync_multiDevice-Negative") {
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    Memcpy3DAsync<int> memcpy3d(width, height, depth,
-                                hipChannelFormatKindSigned);
-    memcpy3d.NegativeTests();
-  } else {
-    SUCCEED("skipping the testcases as numDevices < 2");
-  }
-}
-
-/*
-This testcase performs the D2H,H2D and D2D on peer
-GPU device
-*/
-TEST_CASE("Unit_hipMemcpy3DAsync_multiDevice-D2D") {
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    SECTION("D2D on different Device") {
-      Memcpy3DAsync<float> memcpy3d_d2d_obj(width, height, depth,
-                                            hipChannelFormatKindFloat);
-      memcpy3d_d2d_obj.D2D_DeviceMem_OnDiffDevice();
+  constexpr auto NegativeTests = [](hipPitchedPtr dst_ptr, hipPos dst_pos, hipPitchedPtr src_ptr,
+                                    hipPos src_pos, hipExtent extent, hipMemcpyKind kind) {
+    SECTION("dst_ptr.ptr == nullptr") {
+      hipPitchedPtr invalid_ptr = dst_ptr;
+      invalid_ptr.ptr = nullptr;
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(invalid_ptr, dst_pos, src_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
    }

-    SECTION("D2H and H2D on different device") {
-      Memcpy3DAsync<float> memcpy3d_d2h_obj(width, height, depth,
-                                            hipChannelFormatKindFloat);
-      memcpy3d_d2h_obj.D2H_H2D_DeviceMem_OnDiffDevice();
+    SECTION("src_ptr.ptr == nullptr") {
+      hipPitchedPtr invalid_ptr = src_ptr;
+      invalid_ptr.ptr = nullptr;
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(dst_ptr, dst_pos, invalid_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
    }
-  } else {
-    SUCCEED("skipping the testcases as numDevices < 2");
-  }
-}

-/*
-This testcase checks hipMemcpy3DAsync API by 
-allocating memory in one GPU and creating stream
-in another GPU
-*/
-TEST_CASE("Unit_hipMemcpy3DAsync_multiDevice-DiffStream") {
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    Memcpy3DAsync<float> memcpy3dAsync(width, height, depth,
-                                     hipChannelFormatKindFloat);
-    memcpy3dAsync.D2D_SameDeviceMem_StreamDiffDevice();
-  } else {
-    SUCCEED("skipping the testcases as numDevices < 2");
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-239
+    SECTION("dst_ptr.pitch < width") {
+      hipPitchedPtr invalid_ptr = dst_ptr;
+      invalid_ptr.pitch = extent.width - 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(invalid_ptr, dst_pos, src_ptr, src_pos, extent, kind),
+                      hipErrorInvalidPitchValue);
+    }
+
+    SECTION("src_ptr.pitch < width") {
+      hipPitchedPtr invalid_ptr = src_ptr;
+      invalid_ptr.pitch = extent.width - 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(dst_ptr, dst_pos, invalid_ptr, src_pos, extent, kind),
+                      hipErrorInvalidPitchValue);
+    }
+#endif
+
+    SECTION("dst_ptr.pitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      hipPitchedPtr invalid_ptr = dst_ptr;
+      invalid_ptr.pitch = attr;
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(invalid_ptr, dst_pos, src_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("src_ptr.pitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      hipPitchedPtr invalid_ptr = src_ptr;
+      invalid_ptr.pitch = attr;
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(dst_ptr, dst_pos, invalid_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-237
+    SECTION("extent.width + dst_pos.x > dst_ptr.pitch") {
+      hipPos invalid_pos = dst_pos;
+      invalid_pos.x = dst_ptr.pitch - extent.width + 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(dst_ptr, invalid_pos, src_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("extent.width + src_pos.x > src_ptr.pitch") {
+      hipPos invalid_pos = src_pos;
+      invalid_pos.x = src_ptr.pitch - extent.width + 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(dst_ptr, dst_pos, src_ptr, invalid_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("dst_pos.y out of bounds") {
+      hipPos invalid_pos = dst_pos;
+      invalid_pos.y = 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(dst_ptr, invalid_pos, src_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("src_pos.y out of bounds") {
+      hipPos invalid_pos = src_pos;
+      invalid_pos.y = 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(dst_ptr, dst_pos, src_ptr, invalid_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("dst_pos.z out of bounds") {
+      hipPos invalid_pos = dst_pos;
+      invalid_pos.z = 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(dst_ptr, invalid_pos, src_ptr, src_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("src_pos.z out of bounds") {
+      hipPos invalid_pos = src_pos;
+      invalid_pos.z = 1;
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(dst_ptr, dst_pos, src_ptr, invalid_pos, extent, kind),
+                      hipErrorInvalidValue);
+    }
+#endif
+
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-234
+    SECTION("Invalid MemcpyKind") {
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(dst_ptr, dst_pos, src_ptr, src_pos, extent,
+                                             static_cast<hipMemcpyKind>(-1)),
+                      hipErrorInvalidMemcpyDirection);
+    }
+#endif
+
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-235
+    SECTION("Invalid stream") {
+      StreamGuard stream_guard(Streams::created);
+      HIP_CHECK(hipStreamDestroy(stream_guard.stream()));
+      HIP_CHECK_ERROR(Memcpy3DWrapper<async>(dst_ptr, dst_pos, src_ptr, src_pos, extent, kind,
+                                             stream_guard.stream()),
+                      hipErrorContextIsDestroyed);
+    }
+#endif
+  };
+
+  SECTION("Host to Device") {
+    LinearAllocGuard3D<int> device_alloc(extent);
+    LinearAllocGuard<int> host_alloc(
+        LinearAllocs::hipHostMalloc,
+        device_alloc.pitch() * device_alloc.height() * device_alloc.depth());
+    NegativeTests(device_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+                  make_hipPitchedPtr(host_alloc.ptr(), device_alloc.pitch(), device_alloc.width(),
+                                     device_alloc.height()),
+                  make_hipPos(0, 0, 0), extent, hipMemcpyHostToDevice);
+  }
+
+  SECTION("Device to Host") {
+    LinearAllocGuard3D<int> device_alloc(extent);
+    LinearAllocGuard<int> host_alloc(
+        LinearAllocs::hipHostMalloc,
+        device_alloc.pitch() * device_alloc.height() * device_alloc.depth());
+    NegativeTests(make_hipPitchedPtr(host_alloc.ptr(), device_alloc.pitch(), device_alloc.width(),
+                                     device_alloc.height()),
+                  make_hipPos(0, 0, 0), device_alloc.pitched_ptr(), make_hipPos(0, 0, 0), extent,
+                  hipMemcpyDeviceToHost);
+  }
+
+  SECTION("Host to Host") {
+    LinearAllocGuard<int> src_alloc(LinearAllocs::hipHostMalloc,
+                                    extent.width * extent.height * extent.depth);
+    LinearAllocGuard<int> dst_alloc(LinearAllocs::hipHostMalloc,
+                                    extent.width * extent.height * extent.depth);
+    NegativeTests(make_hipPitchedPtr(dst_alloc.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPos(0, 0, 0),
+                  make_hipPitchedPtr(src_alloc.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPos(0, 0, 0), extent, hipMemcpyHostToHost);
+  }
+
+  SECTION("Device to Device") {
+    LinearAllocGuard3D<int> src_alloc(extent);
+    LinearAllocGuard3D<int> dst_alloc(extent);
+    NegativeTests(dst_alloc.pitched_ptr(), make_hipPos(0, 0, 0), src_alloc.pitched_ptr(),
+                  make_hipPos(0, 0, 0), extent, hipMemcpyDeviceToDevice);
  }
 }
@@ -0,0 +1,755 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+ * This testfile verifies the following Scenarios of hipMemcpy3DAsync API
+
+ * 1. Verifying hipMemcpy3DAsync API for H2D,D2D and D2H scenarios
+ * 2. Verifying Negative Scenarios
+ * 3. Verifying Extent validation scenarios by passing 0
+ * 4. Verifying hipMemcpy3DAsync API by allocating Memory in
+ *    one GPU and trigger hipMemcpy3D from peer GPU
+ * 5. D2D where src and dst memory on GPU-0 and stream on GPU-1
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+
+static constexpr auto width{10};
+static constexpr auto height{10};
+static constexpr auto depth{10};
+
+template <typename T>
+class Memcpy3DAsync {
+    int width, height, depth;
+    unsigned int size;
+    hipArray *arr, *arr1;
+    hipChannelFormatKind formatKind;
+    hipMemcpy3DParms myparms;
+    T* hData;
+    hipStream_t stream;
+ public:
+    Memcpy3DAsync(int l_width, int l_height, int l_depth,
+                  hipChannelFormatKind l_format);
+    void simple_Memcpy3DAsync();
+    void Extent_Validation();
+    void NegativeTests();
+    void AllocateMemory();
+    void DeAllocateMemory();
+    void SetDefaultData();
+    void D2D_SameDeviceMem_StreamDiffDevice();
+    void D2D_DeviceMem_OnDiffDevice();
+    void D2H_H2D_DeviceMem_OnDiffDevice();
+};
+
+/*
+ * This API sets the default values of hipMemcpy3DParms structure
+ */
+template <typename T>
+void Memcpy3DAsync<T>::SetDefaultData() {
+  myparms.srcPos = make_hipPos(0, 0, 0);
+  myparms.dstPos = make_hipPos(0, 0, 0);
+  myparms.extent = make_hipExtent(width , height, depth);
+}
+
+/*
+ * Constructor initalized width,depth and height
+ */
+template <typename T>
+Memcpy3DAsync<T>::Memcpy3DAsync(int l_width, int l_height, int l_depth,
+                      hipChannelFormatKind l_format) {
+  width = l_width;
+  height = l_height;
+  depth = l_depth;
+  formatKind = l_format;
+}
+
+/*
+ * Allocating Memory and initalizing data for both
+ * device and host variables
+ */
+template <typename T>
+void Memcpy3DAsync<T>::AllocateMemory() {
+  size = width * height * depth * sizeof(T);
+  hData = reinterpret_cast<T*>(malloc(size));
+  memset(hData, 0, size);
+  for (int i = 0; i < depth; i++) {
+    for (int j = 0; j < height; j++) {
+      for (int k = 0; k < width; k++) {
+        hData[i*width*height + j*width +k] = i*width*height + j*width + k;
+      }
+    }
+  }
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(T)*8,
+                                                          0, 0, 0, formatKind);
+  HIP_CHECK(hipMalloc3DArray(&arr, &channelDesc, make_hipExtent(width, height,
+                                                depth), hipArrayDefault));
+  HIP_CHECK(hipMalloc3DArray(&arr1, &channelDesc, make_hipExtent(width, height,
+                                                 depth), hipArrayDefault));
+}
+
+/*
+ * DeAllocates the Memory of device and host variables
+ */
+template <typename T>
+void Memcpy3DAsync<T>::DeAllocateMemory() {
+  HIP_CHECK(hipFreeArray(arr));
+  HIP_CHECK(hipFreeArray(arr1));
+  free(hData);
+  HIP_CHECK(hipStreamDestroy(stream));
+}
+
+/*
+ * This API verifies both H2D & D2H functionalities of hipMemcpy3DAsync API
+ * by allocating memory in one GPU and calling the hipMemcpy3DAsync API
+ * from another GPU.
+ * H2D case:
+ * Input : "hData" is  initialized with the respective offset value
+ * Output: Destination array "arr" variable.
+ *
+ * D2H case:
+ * Input: "arr" array variable from the above output
+ * Output: "hOutputData" variable data is copied from "arr" variable
+ *
+ * Validating the result by comparing "hData" and "hOutputData" variables
+ */
+template <typename T>
+void Memcpy3DAsync<T>::D2H_H2D_DeviceMem_OnDiffDevice() {
+  HIP_CHECK(hipSetDevice(0));
+  int peerAccess = 0;
+  HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
+  if (peerAccess) {
+    AllocateMemory();
+
+    // Memory is allocated on device 0 and Memcpy3DAsyncAsync
+    // triggered from device 1
+    HIP_CHECK(hipSetDevice(1));
+    HIP_CHECK(hipStreamCreate(&stream));
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    SetDefaultData();
+
+    // Host to Device
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+        width, height);
+    myparms.dstArray = arr;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyHostToDevice;
+#else
+    myparms.kind = hipMemcpyHostToDevice;
+#endif
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+    HIP_CHECK(hipStreamSynchronize(stream));
+
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    T *hOutputData = reinterpret_cast<T*>(malloc(size));
+    memset(hOutputData, 0,  size);
+    SetDefaultData();
+
+    // Device to host
+    myparms.dstPtr = make_hipPitchedPtr(hOutputData,
+        width * sizeof(T),
+        width, height);
+    myparms.srcArray = arr;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToHost;
+#else
+    myparms.kind = hipMemcpyDeviceToHost;
+#endif
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+    HIP_CHECK(hipStreamSynchronize(stream));
+
+    // Validating the result
+    HipTest::checkArray(hData, hOutputData, width, height, depth);
+    free(hOutputData);
+
+    // DeAllocating the Memory
+    DeAllocateMemory();
+  } else {
+    SUCCEED("Skipped the test as there is no peer access");
+  }
+}
+
+/*
+ * This API verifies both D2D functionalities of hipMemcpy3DAsync API
+ * by allocating memory in one GPU and calling the hipMemcpy3DAsync API
+ * from another GPU.
+ *
+ * D2D case:
+ * Input : "arr" variable  is  initialized with the "hData" variable in GPU-0
+ * Output: "arr2" variable in GPU-0
+ *
+ * hipMemcpy3DAsync API is triggered from GPU-1
+ * The "arr2" variable is then copied to "hOutputData" for validating
+ * the result
+ *
+ * Validating the result by comparing "hData" and "hOutputData" variables
+ */
+template <typename T>
+void Memcpy3DAsync<T>::D2D_DeviceMem_OnDiffDevice() {
+  HIP_CHECK(hipSetDevice(0));
+  int peerAccess = 0;
+  HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 0, 1));
+  if (peerAccess) {
+    // Allocating Memory and setting default data
+    AllocateMemory();
+    hipStream_t stream1;
+    HIP_CHECK(hipStreamCreate(&stream1));
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    SetDefaultData();
+
+    // Host to Device Scenario
+    myparms.srcPtr = make_hipPitchedPtr(hData,
+                                        width * sizeof(T),
+                                        width, height);
+    myparms.dstArray = arr;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyHostToDevice;
+#else
+    myparms.kind = hipMemcpyHostToDevice;
+#endif
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream1) == hipSuccess);
+    HIP_CHECK(hipStreamSynchronize(stream1));
+
+    // Allocating Mem on GPU device 0 and trigger hipMemcpy3DAsync from GPU 1
+    HIP_CHECK(hipSetDevice(1));
+    HIP_CHECK(hipStreamCreate(&stream));
+    hipArray *arr2;
+    hipChannelFormatDesc channelDesc1 = hipCreateChannelDesc(sizeof(T)*8,
+                                                    0, 0, 0, formatKind);
+    HIP_CHECK(hipMalloc3DArray(&arr2, &channelDesc1,
+                              make_hipExtent(width, height,
+                              depth), hipArrayDefault));
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    SetDefaultData();
+
+    // Device to Device
+    myparms.srcArray = arr;
+    myparms.dstArray = arr2;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+    HIP_CHECK(hipStreamSynchronize(stream));
+
+    // For validating the D2D copy copying it again to hOutputData and
+    // verifying it with iniital data hData
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    T *hOutputData = reinterpret_cast<T*>(malloc(size));
+    memset(hOutputData, 0, size);
+    SetDefaultData();
+
+    // Device to host
+    myparms.dstPtr = make_hipPitchedPtr(hOutputData,
+                                        width * sizeof(T),
+                                        width, height);
+    myparms.srcArray = arr2;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToHost;
+#else
+    myparms.kind = hipMemcpyDeviceToHost;
+#endif
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream)== hipSuccess);
+    HIP_CHECK(hipStreamSynchronize(stream));
+
+    // Validating the result
+    HipTest::checkArray(hData, hOutputData, width, height, depth);
+
+    // Deleting the memory
+    free(hOutputData);
+    DeAllocateMemory();
+  } else {
+    SUCCEED("Skipped the test as there is no peer access");
+  }
+}
+
+/*
+ * This API verifies all the negative scenarios of hipMemcpy3D API
+*/
+template <typename T>
+void Memcpy3DAsync<T>::NegativeTests() {
+  HIP_CHECK(hipSetDevice(0));
+  AllocateMemory();
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  // Initialization of data
+  memset(&myparms, 0, sizeof(myparms));
+  myparms.srcPos = make_hipPos(0, 0, 0);
+  myparms.dstPos = make_hipPos(0, 0, 0);
+  myparms.extent = make_hipExtent(width , height, depth);
+#ifdef __HIP_PLATFORM_NVCC__
+  myparms.kind = cudaMemcpyHostToDevice;
+#else
+  myparms.kind = hipMemcpyHostToDevice;
+#endif
+
+  SECTION("Nullptr to destination array") {
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+        width, height);
+    myparms.dstArray = nullptr;
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Nullptr to source array") {
+    myparms.srcArray = nullptr;
+    myparms.dstPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+        width, height);
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing both Source ptr and array") {
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    myparms.srcArray = arr;
+    myparms.dstArray = arr1;
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing both destination ptr and array") {
+    myparms.dstPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    myparms.dstArray = arr;
+    myparms.srcArray = arr1;
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing Max value to extent") {
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    myparms.dstArray = arr;
+    myparms.extent = make_hipExtent(std::numeric_limits<int>::max(),
+                                    std::numeric_limits<int>::max(),
+                                    std::numeric_limits<int>::max());
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing Source pitchedPtr as nullptr") {
+    myparms.srcPtr = make_hipPitchedPtr(nullptr, width * sizeof(T),
+                                        width, height);
+    myparms.dstArray = arr;
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing Dst pitchedPtr as nullptr") {
+    myparms.dstPtr = make_hipPitchedPtr(nullptr, width * sizeof(T),
+                                        width, height);
+    myparms.srcArray = arr;
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing width > max width size in extent") {
+    myparms.extent = make_hipExtent(width+1 , height, depth);
+    myparms.dstArray = arr;
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing hgt > max width size in extent") {
+    myparms.extent = make_hipExtent(width , height+1, depth);
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    myparms.dstArray = arr;
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing depth > max width size in extent") {
+    myparms.extent = make_hipExtent(width , height, depth+1);
+    myparms.dstArray = arr;
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing dst width pos > max allocated width") {
+    myparms.dstPos = make_hipPos(width+1, 0, 0);
+    myparms.dstArray = arr;
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing dst height pos > max allocated hgt") {
+    myparms.dstPos = make_hipPos(0, height+1, 0);
+    myparms.dstArray = arr;
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing dst depth pos > max allocated depth") {
+    myparms.dstPos = make_hipPos(0, 0, depth+1);
+    myparms.dstArray = arr;
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing src width pos > max allocated width") {
+    myparms.srcPos = make_hipPos(width+1, 0, 0);
+    myparms.srcArray = arr;
+    myparms.dstArray = arr1;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing src height pos > max allocated hgt") {
+    myparms.srcPos = make_hipPos(0, height+1, 0);
+    myparms.srcArray = arr;
+    myparms.dstArray = arr1;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing src height pos > max allocated hgt") {
+    myparms.srcPos = make_hipPos(0, 0, depth+1);
+    myparms.srcArray = arr;
+    myparms.dstArray = arr1;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing src array size  > dst array size") {
+    hipArray *arr2;
+    hipChannelFormatDesc channelDesc1 = hipCreateChannelDesc(sizeof(T)*8,
+        0, 0, 0, formatKind);
+    HIP_CHECK(hipMalloc3DArray(&arr2, &channelDesc1,
+                              make_hipExtent(3, 3
+                              , 3), hipArrayDefault));
+    myparms.srcArray = arr;
+    myparms.dstArray = arr2;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) != hipSuccess);
+  }
+
+  SECTION("Passing nullptr to hipMemcpy3DAsync") {
+    REQUIRE(hipMemcpy3DAsync(nullptr, stream) != hipSuccess);
+  }
+  // DeAllocating of memory
+  DeAllocateMemory();
+}
+
+/*
+ * This API verifies both D2D functionalities of hipMemcpy3DAsync API
+ * by allocating memory in one GPU and calling the hipMemcpy3DAsync API
+ * from another GPU.
+ *
+ * D2D case:
+ * Input : "arr" variable  is  initialized with the "hData" variable in GPU-0
+ * Output: "arr1" variable in GPU-0
+ *
+ * Stream on GPU-1
+ * The "arr2" variable is then copied to "hOutputData" for validating
+ * the result
+ *
+ * Validating the result by comparing "hData" and "hOutputData" variables
+ */
+template <typename T>
+void Memcpy3DAsync<T>::D2D_SameDeviceMem_StreamDiffDevice() {
+  HIP_CHECK(hipSetDevice(0));
+  // Allocating the Memory
+  int peerAccess = 0;
+  HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 0, 1));
+  if (peerAccess) {
+    AllocateMemory();
+    HIP_CHECK(hipSetDevice(1));
+    HIP_CHECK(hipStreamCreate(&stream));
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    SetDefaultData();
+
+    // Host to Device
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T), width, height);
+    myparms.dstArray = arr;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyHostToDevice;
+#else
+    myparms.kind = hipMemcpyHostToDevice;
+#endif
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+    HIP_CHECK(hipStreamSynchronize(stream));
+
+    // Array to Array
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    SetDefaultData();
+    myparms.srcArray = arr;
+    myparms.dstArray = arr1;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+    HIP_CHECK(hipStreamSynchronize(stream));
+    T *hOutputData = reinterpret_cast<T*>(malloc(size));
+    memset(hOutputData, 0,  size);
+
+    // Device to host
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    SetDefaultData();
+    myparms.dstPtr = make_hipPitchedPtr(hOutputData,
+        width * sizeof(T), width, height);
+    myparms.srcArray = arr1;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToHost;
+#else
+    myparms.kind = hipMemcpyDeviceToHost;
+#endif
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+    HIP_CHECK(hipStreamSynchronize(stream));
+
+    // Validating the result
+    HipTest::checkArray(hData, hOutputData, width, height, depth);
+
+    // Deallocating the memory
+    free(hOutputData);
+    DeAllocateMemory();
+  } else {
+    SUCCEED("Skipped the test as there is no peer access");
+  }
+}
+
+/*
+ * This API verifies the Extent validation Scenarios
+ */
+template <typename T>
+void Memcpy3DAsync<T>::Extent_Validation() {
+  HIP_CHECK(hipSetDevice(0));
+  AllocateMemory();
+  HIP_CHECK(hipStreamCreate(&stream));
+  // Passing extent as 0
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+  myparms.srcPos = make_hipPos(0, 0, 0);
+  myparms.dstPos = make_hipPos(0, 0, 0);
+  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T), width, height);
+  myparms.dstArray = arr;
+#ifdef __HIP_PLATFORM_NVCC__
+  myparms.kind = cudaMemcpyHostToDevice;
+#else
+  myparms.kind = hipMemcpyHostToDevice;
+#endif
+  SECTION("Passing Extent as 0") {
+    myparms.extent = make_hipExtent(0 , 0, 0);
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+  }
+  SECTION("Passing Width 0 in Extent") {
+    myparms.extent = make_hipExtent(0 , height, depth);
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+  }
+  SECTION("Passing Height 0 in Extent") {
+    myparms.extent = make_hipExtent(width , 0, depth);
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+  }
+  SECTION("Passing Depth 0 in Extent") {
+    myparms.extent = make_hipExtent(width , height, 0);
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+  }
+  DeAllocateMemory();
+}
+
+/*
+ * This API verifies H2H-D2D-D2H functionalities of hipMemcpy3DAsync API
+ *
+ * Input : "arr" variable  is  initialized with the "hData" variable in GPU-0
+ * Output: "arr1" variable in GPU-0
+ *
+ * The "arr1" variable is then copied to "hOutputData" for validating
+ * the result
+ *
+ * Validating the result by comparing "hData" and "hOutputData" variables
+ */
+template <typename T>
+void Memcpy3DAsync<T>::simple_Memcpy3DAsync() {
+  HIP_CHECK(hipSetDevice(0));
+
+  // Allocating the Memory
+  AllocateMemory();
+  HIP_CHECK(hipStreamCreate(&stream));
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+  SetDefaultData();
+
+  // Host to Device
+  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T), width, height);
+  myparms.dstArray = arr;
+#ifdef __HIP_PLATFORM_NVCC__
+  myparms.kind = cudaMemcpyHostToDevice;
+#else
+  myparms.kind = hipMemcpyHostToDevice;
+#endif
+  SECTION("Calling hipMemcpy3DAsync() using user declared stream obj") {
+    REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+  SECTION("Calling hipMemcpy3DAsync() using hipStreamPerThread") {
+    REQUIRE(hipMemcpy3DAsync(&myparms, hipStreamPerThread) == hipSuccess);
+    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
+  }
+
+  // Array to Array
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+  SetDefaultData();
+  myparms.srcArray = arr;
+  myparms.dstArray = arr1;
+#ifdef __HIP_PLATFORM_NVCC__
+  myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+  myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+  REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+  HIP_CHECK(hipStreamSynchronize(stream));
+  T *hOutputData = reinterpret_cast<T*>(malloc(size));
+  memset(hOutputData, 0,  size);
+
+  // Device to host
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+  SetDefaultData();
+  myparms.dstPtr = make_hipPitchedPtr(hOutputData,
+      width * sizeof(T), width, height);
+  myparms.srcArray = arr1;
+#ifdef __HIP_PLATFORM_NVCC__
+  myparms.kind = cudaMemcpyDeviceToHost;
+#else
+  myparms.kind = hipMemcpyDeviceToHost;
+#endif
+  REQUIRE(hipMemcpy3DAsync(&myparms, stream) == hipSuccess);
+  HIP_CHECK(hipStreamSynchronize(stream));
+
+  // Validating the result
+  HipTest::checkArray(hData, hOutputData, width, height, depth);
+
+  // DeAllocating the memory
+  free(hOutputData);
+  DeAllocateMemory();
+}
+/*
+This testcase verifies hipMemcpyAsync for different datatypes
+and different sizes
+*/
+TEMPLATE_TEST_CASE("Unit_hipMemcpy3DAsync_Basic",
+                   "[hipMemcpy3DAsync]",
+                   int, unsigned int, float) {
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  int device = -1;
+  HIP_CHECK(hipGetDevice(&device));
+  hipDeviceProp_t prop;
+  HIP_CHECK(hipGetDeviceProperties(&prop,device));
+  auto i = GENERATE_COPY(10, 100, 1024, prop.maxTexture3D[0]);
+  auto j = GENERATE(10, 100);
+  if (numDevices > 1) {
+      if (std::is_same<TestType, int>::value)  {
+        Memcpy3DAsync<TestType> memcpy3d_obj(i, j, j,
+                                             hipChannelFormatKindSigned);
+        memcpy3d_obj.simple_Memcpy3DAsync();
+      } else if (std::is_same<TestType, unsigned int>::value)  {
+        Memcpy3DAsync<TestType> memcpy3d_obj(i, j, j,
+                                             hipChannelFormatKindUnsigned);
+        memcpy3d_obj.simple_Memcpy3DAsync();
+      } else if (std::is_same<TestType, float>::value) {
+        Memcpy3DAsync<TestType> memcpy3d_obj(i, j, j,
+                                             hipChannelFormatKindFloat);
+        memcpy3d_obj.simple_Memcpy3DAsync();
+      }
+  } else {
+    SUCCEED("skipping the testcases as numDevices < 2");
+  }
+}
+
+/*
+This testcase performs the extent validation scenarios of
+hipMemcpy3D API
+*/
+TEST_CASE("Unit_hipMemcpy3DAsync_ExtentValidation") {
+  Memcpy3DAsync<int> memcpy3d(width, height, depth,
+                              hipChannelFormatKindSigned);
+  memcpy3d.Extent_Validation();
+}
+
+/*
+This testcase performs the negative scenarios of
+hipMemcpy3DAsync API
+*/
+TEST_CASE("Unit_hipMemcpy3DAsync_multiDevice-Negative") {
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    Memcpy3DAsync<int> memcpy3d(width, height, depth,
+                                hipChannelFormatKindSigned);
+    memcpy3d.NegativeTests();
+  } else {
+    SUCCEED("skipping the testcases as numDevices < 2");
+  }
+}
+
+/*
+This testcase performs the D2H,H2D and D2D on peer
+GPU device
+*/
+TEST_CASE("Unit_hipMemcpy3DAsync_multiDevice-D2D") {
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    SECTION("D2D on different Device") {
+      Memcpy3DAsync<float> memcpy3d_d2d_obj(width, height, depth,
+                                            hipChannelFormatKindFloat);
+      memcpy3d_d2d_obj.D2D_DeviceMem_OnDiffDevice();
+    }
+
+    SECTION("D2H and H2D on different device") {
+      Memcpy3DAsync<float> memcpy3d_d2h_obj(width, height, depth,
+                                            hipChannelFormatKindFloat);
+      memcpy3d_d2h_obj.D2H_H2D_DeviceMem_OnDiffDevice();
+    }
+  } else {
+    SUCCEED("skipping the testcases as numDevices < 2");
+  }
+}
+
+/*
+This testcase checks hipMemcpy3DAsync API by 
+allocating memory in one GPU and creating stream
+in another GPU
+*/
+TEST_CASE("Unit_hipMemcpy3DAsync_multiDevice-DiffStream") {
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    Memcpy3DAsync<float> memcpy3dAsync(width, height, depth,
+                                     hipChannelFormatKindFloat);
+    memcpy3dAsync.D2D_SameDeviceMem_StreamDiffDevice();
+  } else {
+    SUCCEED("skipping the testcases as numDevices < 2");
+  }
+}
@@ -0,0 +1,627 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+ * This testfile verifies the following scenarios of hipMemcpy3D API
+ *
+ * 1. Verifying hipMemcpy3D API for H2D,D2D and D2H scenarios for
+      different datatypes and sizes.
+ * 2. Verifying Negative Scenarios
+ * 3. Verifying Extent validation scenarios by passing 0
+ * 4. Verifying hipMemcpy3D API by allocating Memory in
+ *    one GPU and trigger hipMemcpy3D from peer GPU
+ *
+ */
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+
+static constexpr auto width{10};
+static constexpr auto height{10};
+static constexpr auto depth{10};
+
+template <typename T>
+class Memcpy3D {
+  int width, height, depth;
+  unsigned int size;
+  hipArray *arr, *arr1;
+  hipChannelFormatKind formatKind;
+  hipMemcpy3DParms myparms;
+  T* hData;
+ public:
+  Memcpy3D(int l_width, int l_height, int l_depth,
+      hipChannelFormatKind l_format);
+  void simple_Memcpy3D();
+  void Extent_Validation();
+  void NegativeTests();
+  void AllocateMemory();
+  void DeAllocateMemory();
+  void SetDefaultData();
+  void D2D_DeviceMem_OnDiffDevice();
+  void D2H_H2D_DeviceMem_OnDiffDevice();
+};
+
+/*
+ * This API sets the default values of hipMemcpy3DParms structure
+ */
+template <typename T>
+void Memcpy3D<T>::SetDefaultData() {
+  myparms.srcPos = make_hipPos(0, 0, 0);
+  myparms.dstPos = make_hipPos(0, 0, 0);
+  myparms.extent = make_hipExtent(width , height, depth);
+}
+
+/*
+ * Constructor initalized width,depth and height
+ */
+template <typename T>
+Memcpy3D<T>::Memcpy3D(int l_width, int l_height, int l_depth,
+                      hipChannelFormatKind l_format) {
+  width = l_width;
+  height = l_height;
+  depth = l_depth;
+  formatKind = l_format;
+}
+
+/*
+ * Allocating Memory and initalizing data for both
+ * device and host variables
+ */
+template <typename T>
+void Memcpy3D<T>::AllocateMemory() {
+  size = width * height * depth * sizeof(T);
+  hData = reinterpret_cast<T*>(malloc(size));
+  memset(hData, 0, size);
+  for (int i = 0; i < depth; i++) {
+    for (int j = 0; j < height; j++) {
+      for (int k = 0; k < width; k++) {
+        hData[i*width*height + j*width +k] = i*width*height + j*width + k;
+      }
+    }
+  }
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(T)*8,
+                                                          0, 0, 0, formatKind);
+  HIP_CHECK(hipMalloc3DArray(&arr, &channelDesc, make_hipExtent(width, height,
+                                                depth), hipArrayDefault));
+  HIP_CHECK(hipMalloc3DArray(&arr1, &channelDesc, make_hipExtent(width, height,
+                                                 depth), hipArrayDefault));
+}
+
+/*
+ * DeAllocates the Memory of device and host variables
+ */
+template <typename T>
+void Memcpy3D<T>::DeAllocateMemory() {
+    HIP_CHECK(hipFreeArray(arr));
+    HIP_CHECK(hipFreeArray(arr1));
+    free(hData);
+}
+
+/*
+ * This API verifies both H2D & D2H functionalities of hipMemcpy3D API
+ * by allocating memory in one GPU and calling the hipMemcpy3D API
+ * from another GPU.
+ * H2D case:
+ * Input : "hData" is  initialized with the respective offset value
+ * Output: Destination array "arr" variable.
+ *
+ * D2H case:
+ * Input: "arr" array variable from the above output
+ * Output: "hOutputData" variable data is copied from "arr" variable
+ *
+ * Validating the result by comparing "hData" and "hOutputData" variables
+ */
+template <typename T>
+void Memcpy3D<T>::D2H_H2D_DeviceMem_OnDiffDevice() {
+  HIP_CHECK(hipSetDevice(0));
+  int peerAccess = 0;
+  HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
+  if (peerAccess) {
+    AllocateMemory();
+    // Memory is allocated on device 0 and Memcpy3DAsync triggered from device 1
+    HIP_CHECK(hipSetDevice(1));
+
+    // H2D Scenario
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    SetDefaultData();
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    myparms.dstArray = arr;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyHostToDevice;
+#else
+    myparms.kind = hipMemcpyHostToDevice;
+#endif
+    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Device to host
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    T *hOutputData = reinterpret_cast<T*>(malloc(size));
+    memset(hOutputData, 0,  size);
+    SetDefaultData();
+    myparms.dstPtr = make_hipPitchedPtr(hOutputData,
+                                        width * sizeof(T),
+                                        width, height);
+    myparms.srcArray = arr;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToHost;
+#else
+    myparms.kind = hipMemcpyDeviceToHost;
+#endif
+    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Validating the result
+    HipTest::checkArray(hData, hOutputData, width, height, depth);
+    free(hOutputData);
+    DeAllocateMemory();
+  } else {
+    SUCCEED("Skipped the test as there is no peer access\n");
+  }
+}
+/*
+ * This API verifies both D2D functionalities of hipMemcpy3D API
+ * by allocating memory in one GPU and calling the hipMemcpy3D API
+ * from another GPU.
+ *
+ * D2D case:
+ * Input : "arr" variable  is  initialized with the "hData" variable in GPU-0
+ * Output: "arr2" variable in GPU-0
+ *
+ * hipMemcpy3D API is triggered from GPU-1
+ * The "arr2" variable is then copied to "hOutputData" for validating 
+ * the result
+ *
+ * Validating the result by comparing "hData" and "hOutputData" variables
+ */
+template <typename T>
+void Memcpy3D<T>::D2D_DeviceMem_OnDiffDevice() {
+  HIP_CHECK(hipSetDevice(0));
+  int peerAccess = 0;
+  HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 0, 1));
+  if (peerAccess) {
+    AllocateMemory();
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    SetDefaultData();
+
+    // Host to device copy
+    myparms.srcPtr = make_hipPitchedPtr(hData,
+        width * sizeof(T),
+        width, height);
+    myparms.dstArray = arr;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyHostToDevice;
+#else
+    myparms.kind = hipMemcpyHostToDevice;
+#endif
+    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
+    hipArray *arr2;
+    hipChannelFormatDesc channelDesc1 = hipCreateChannelDesc(sizeof(T)*8,
+        0, 0, 0, formatKind);
+    HIP_CHECK(hipMalloc3DArray(&arr2, &channelDesc1,
+          make_hipExtent(width, height,
+            depth), hipArrayDefault));
+
+    // Allocating Mem on GPU device 0 and trigger hipMemcpy3D from GPU 1
+    HIP_CHECK(hipSetDevice(1));
+
+    // D2D Scenario
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    SetDefaultData();
+    myparms.srcArray = arr;
+    myparms.dstArray = arr2;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // For validating the D2D copy copying it again to hOutputData and
+    // verifying it with iniital data hData
+    memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+    T *hOutputData = reinterpret_cast<T*>(malloc(size));
+    memset(hOutputData, 0, size);
+    SetDefaultData();
+
+    // Device to host
+    myparms.dstPtr = make_hipPitchedPtr(hOutputData,
+        width * sizeof(T),
+        width, height);
+    myparms.srcArray = arr2;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToHost;
+#else
+    myparms.kind = hipMemcpyDeviceToHost;
+#endif
+    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
+    HIP_CHECK(hipDeviceSynchronize());
+    HipTest::checkArray(hData, hOutputData, width, height, depth);
+
+    // DeAllocating Memory
+    free(hOutputData);
+    DeAllocateMemory();
+  } else {
+    SUCCEED("Skipped the test as there is no peer access\n");
+  }
+}
+/*
+ * This API verifies all the negative scenarios of hipMemcpy3D API
+ */
+template <typename T>
+void Memcpy3D<T>::NegativeTests() {
+  HIP_CHECK(hipSetDevice(0));
+  AllocateMemory();
+
+  // Initialization of data
+  memset(&myparms, 0, sizeof(myparms));
+  myparms.srcPos = make_hipPos(0, 0, 0);
+  myparms.dstPos = make_hipPos(0, 0, 0);
+  myparms.extent = make_hipExtent(width , height, depth);
+#ifdef __HIP_PLATFORM_NVCC__
+  myparms.kind = cudaMemcpyHostToDevice;
+#else
+  myparms.kind = hipMemcpyHostToDevice;
+#endif
+
+  SECTION("Nullptr to destination array") {
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    myparms.dstArray = nullptr;
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Nullptr to source array") {
+    myparms.srcArray = nullptr;
+    myparms.dstPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing both Source ptr and array") {
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    myparms.srcArray = arr;
+    myparms.dstArray = arr1;
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing both destination ptr and array") {
+    myparms.dstPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    myparms.dstArray = arr;
+    myparms.srcArray = arr1;
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing Max value to extent") {
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    myparms.dstArray = arr;
+    myparms.extent = make_hipExtent(std::numeric_limits<int>::max(),
+                                    std::numeric_limits<int>::max(),
+                                    std::numeric_limits<int>::max());
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing Source pitchedPtr as nullptr") {
+    myparms.srcPtr = make_hipPitchedPtr(nullptr, width * sizeof(T),
+                                        width, height);
+    myparms.dstArray = arr;
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing Dst pitchedPtr as nullptr") {
+    myparms.dstPtr = make_hipPitchedPtr(nullptr, width * sizeof(T),
+                                        width, height);
+    myparms.srcArray = arr;
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing width > max width size in extent") {
+    myparms.extent = make_hipExtent(width+1 , height, depth);
+    myparms.dstArray = arr;
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing hgt > max width size in extent") {
+    myparms.extent = make_hipExtent(width , height+1, depth);
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    myparms.dstArray = arr;
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing depth > max width size in extent") {
+    myparms.extent = make_hipExtent(width , height, depth+1);
+    myparms.dstArray = arr;
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing dst width pos > max allocated width") {
+    myparms.dstPos = make_hipPos(width+1, 0, 0);
+    myparms.dstArray = arr;
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing dst height pos > max allocated hgt") {
+    myparms.dstPos = make_hipPos(0, height+1, 0);
+    myparms.dstArray = arr;
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing dst depth pos > max allocated depth") {
+    myparms.dstPos = make_hipPos(0, 0, depth+1);
+    myparms.dstArray = arr;
+    myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                        width, height);
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing src width pos > max allocated width") {
+    myparms.srcPos = make_hipPos(width+1, 0, 0);
+    myparms.srcArray = arr;
+    myparms.dstArray = arr1;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing src height pos > max allocated hgt") {
+    myparms.srcPos = make_hipPos(0, height+1, 0);
+    myparms.srcArray = arr;
+    myparms.dstArray = arr1;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing src height pos > max allocated hgt") {
+    myparms.srcPos = make_hipPos(0, 0, depth+1);
+    myparms.srcArray = arr;
+    myparms.dstArray = arr1;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  SECTION("Passing src array size  > dst array size") {
+    // Passing src array size greater than destination array size
+    hipArray *arr2;
+    hipChannelFormatDesc channelDesc1 = hipCreateChannelDesc(sizeof(T)*8,
+                                                    0, 0, 0, formatKind);
+    HIP_CHECK(hipMalloc3DArray(&arr2, &channelDesc1,
+                              make_hipExtent(3, 3
+                              , 3), hipArrayDefault));
+    myparms.srcArray = arr;
+    myparms.dstArray = arr2;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+    REQUIRE(hipMemcpy3D(&myparms) != hipSuccess);
+  }
+
+  // DeAllocation of memory
+  DeAllocateMemory();
+}
+
+/*
+ * This API verifies the Extent validation Scenarios
+ */
+template <typename T>
+void Memcpy3D<T>::Extent_Validation() {
+  HIP_CHECK(hipSetDevice(0));
+  AllocateMemory();
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+  myparms.srcPos = make_hipPos(0, 0, 0);
+  myparms.dstPos = make_hipPos(0, 0, 0);
+  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                      width, height);
+  myparms.dstArray = arr;
+#ifdef __HIP_PLATFORM_NVCC__
+  myparms.kind = cudaMemcpyHostToDevice;
+#else
+  myparms.kind = hipMemcpyHostToDevice;
+#endif
+  SECTION("Passing Extent as 0") {
+    myparms.extent = make_hipExtent(0 , 0, 0);
+    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
+  }
+  SECTION("Passing Width 0 in Extent") {
+    myparms.extent = make_hipExtent(0 , height, depth);
+    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
+  }
+  SECTION("Passing Height 0 in Extent") {
+    myparms.extent = make_hipExtent(width , 0, depth);
+    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
+  }
+  SECTION("Passing Depth 0 in Extent") {
+    myparms.extent = make_hipExtent(width , height, 0);
+    REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
+  }
+  SECTION("Passing Depth 0 in Extent") {
+    REQUIRE(hipMemcpy3D(nullptr) != hipSuccess);
+  }
+  DeAllocateMemory();
+}
+
+/*
+ * This API verifies H2H-D2D-D2H functionalities of hipMemcpy3D API
+ *
+ * Input : "arr" variable  is  initialized with the "hData" variable in GPU-0
+ * Output: "arr1" variable in GPU-0
+ *
+ * The "arr1" variable is then copied to "hOutputData" for validating
+ * the result
+ *
+ * Validating the result by comparing "hData" and "hOutputData" variables
+ */
+
+template <typename T>
+void Memcpy3D<T>::simple_Memcpy3D() {
+  HIP_CHECK(hipSetDevice(0));
+  AllocateMemory();
+
+  // Host to Device
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+  SetDefaultData();
+  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(T),
+                                      width, height);
+  myparms.dstArray = arr;
+#ifdef __HIP_PLATFORM_NVCC__
+  myparms.kind = cudaMemcpyHostToDevice;
+#else
+  myparms.kind = hipMemcpyHostToDevice;
+#endif
+  REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
+
+  // Array to Array
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+  SetDefaultData();
+  myparms.srcArray = arr;
+  myparms.dstArray = arr1;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToDevice;
+#else
+    myparms.kind = hipMemcpyDeviceToDevice;
+#endif
+  REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
+  T *hOutputData = reinterpret_cast<T*>(malloc(size));
+  memset(hOutputData, 0,  size);
+
+  // Device to host
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+  SetDefaultData();
+  myparms.dstPtr = make_hipPitchedPtr(hOutputData,
+                   width * sizeof(T), width, height);
+  myparms.srcArray = arr1;
+#ifdef __HIP_PLATFORM_NVCC__
+    myparms.kind = cudaMemcpyDeviceToHost;
+#else
+    myparms.kind = hipMemcpyDeviceToHost;
+#endif
+  REQUIRE(hipMemcpy3D(&myparms) == hipSuccess);
+
+  // Validating the result
+  HipTest::checkArray(hData, hOutputData, width, height, depth);
+
+  // DeAllocating the Memory
+  free(hOutputData);
+  DeAllocateMemory();
+}
+/*
+ This testcase performs hipMemcpy3D API validation for 
+ different datatypes and different sizes
+*/
+TEMPLATE_TEST_CASE("Unit_hipMemcpy3D_Basic", "[hipMemcpy3D]",
+    int, unsigned int, float) {
+  int device = -1;
+  HIP_CHECK(hipGetDevice(&device));
+  hipDeviceProp_t prop;
+  HIP_CHECK(hipGetDeviceProperties(&prop,device));
+  auto i = GENERATE_COPY(10, 100, 1024, prop.maxTexture3D[0]);
+  auto j = GENERATE(10, 100);
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    if (std::is_same<TestType, float>::value) {
+      Memcpy3D<TestType> memcpy3d_obj(i, j, j, hipChannelFormatKindFloat);
+      memcpy3d_obj.simple_Memcpy3D();
+    } else if (std::is_same<TestType, unsigned int>::value)  {
+      Memcpy3D<TestType> memcpy3d_obj(i, j, j, hipChannelFormatKindUnsigned);
+      memcpy3d_obj.simple_Memcpy3D();
+    } else if (std::is_same<TestType, int>::value)  {
+      Memcpy3D<TestType> memcpy3d_obj(i, j, j, hipChannelFormatKindSigned);
+      memcpy3d_obj.simple_Memcpy3D();
+    }
+  } else {
+    SUCCEED("skipping the testcases as numDevices < 2");
+  }
+}
+
+/*
+This testcase performs the extent validation scenarios of
+hipMemcpy3D API
+*/
+TEST_CASE("Unit_hipMemcpy3D_ExtentValidation") {
+  Memcpy3D<int> memcpy3d(width, height, depth,
+                         hipChannelFormatKindSigned);
+  memcpy3d.Extent_Validation();
+}
+
+/*
+This testcase performs the negative scenarios of
+hipMemcpy3D API
+*/
+TEST_CASE("Unit_hipMemcpy3D_multiDevice-Negative") {
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    Memcpy3D<int> memcpy3d(width, height, depth,
+                           hipChannelFormatKindSigned);
+    memcpy3d.NegativeTests();
+  } else {
+    SUCCEED("skipping the testcases as numDevices < 2");
+  }
+}
+
+/*
+This testcase performs the D2H,H2D and D2D on peer
+GPU device
+*/
+TEST_CASE("Unit_hipMemcpy3D_multiDevice-OnPeerDevice") {
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    SECTION("D2H & H2D On DiffDevice") {
+      Memcpy3D<float> memcpy3d_d2h_obj(width, height, depth,
+                                       hipChannelFormatKindFloat);
+      memcpy3d_d2h_obj.D2H_H2D_DeviceMem_OnDiffDevice();
+    }
+
+    SECTION("D2D On DiffDevice") {
+      Memcpy3D<float> memcpy3d_d2d_obj(width, height, depth,
+                                       hipChannelFormatKindFloat);
+      memcpy3d_d2d_obj.D2D_DeviceMem_OnDiffDevice();
+    }
+  } else {
+    SUCCEED("skipping the testcases as numDevices < 2");
+  }
+}