SWDEV-1 - Merge github PRs to amd-staging

- https://github.com/ROCm/hip-tests/pull/119 - https://github.com/ROCm/hip-tests/pull/151 - https://github.com/ROCm/hip-tests/pull/57 - https://github.com/ROCm/hip-tests/pull/58 - https://github.com/ROCm/hip-tests/pull/59 - https://github.com/ROCm/hip-tests/pull/60 - https://github.com/ROCm/hip-tests/pull/99 - https://github.com/ROCm/hip-tests/pull/139 - https://github.com/ROCm/hip-tests/pull/152 - https://github.com/ROCm/hip-tests/pull/48 - https://github.com/ROCm/hip-tests/pull/54 - https://github.com/ROCm/hip-tests/pull/53 - https://github.com/ROCm/hip-tests/pull/24 - https://github.com/ROCm/hip-tests/pull/23 - https://github.com/ROCm/hip-tests/pull/22 - https://github.com/ROCm/hip-tests/pull/21 - https://github.com/ROCm/hip-tests/pull/20 - https://github.com/ROCm/hip-tests/pull/14 - https://github.com/ROCm/hip-tests/pull/8 Change-Id: I1eea54cd1436f3ddbfd5c1b3b2f672eb81d03cd4 [ROCm/hip-tests commit: 96df1fde80]
2024-02-22 18:31:56 +05:30
@@ -48,11 +48,14 @@
        "Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout",
        "Unit_hipFuncSetAttribute_Positive_Parameters",
        "Unit_hipFuncSetAttribute_Negative_Parameters",
-        "NOTE: The following 4 tests are disabled due to defect - EXSWHTEC-240",
-        "Unit_hipFuncSetCacheConfig_Negative_Not_Supported",
-        "Unit_hipFuncSetSharedMemConfig_Negative_Not_Supported",
-        "Unit_hipFuncSetAttribute_Positive_MaxDynamicSharedMemorySize_Not_Supported",
-        "Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout_Not_Supported",
+        "NOTE: The following test is disabled due to defect - EXSWHTEC-241",
+        "Unit_hipFuncGetAttributes_Negative_Parameters",
+        "NOTE: The following test is disabled due to defect - EXSWHTEC-242",
+        "Unit_hipFuncGetAttributes_Positive_Basic",
+        "NOTE: The following test is disabled due to defect - EXSWHTEC-243",
+        "Unit_hipExtLaunchKernel_Negative_Parameters",
+        "NOTE: The following test is disabled due to defect - EXSWHTEC-244",
+        "Unit_hipExtLaunchMultiKernelMultiDevice_Negative_Parameters",
        "Unit_hipOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
        "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters",
        "Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters",
@@ -127,6 +130,7 @@
        "Unit_deviceAllocation_InOneThread_AccessInAllThreads",
        "=== Patch which removes the typetraits implementation from std namespace in hiprtc is reverted ===",
        "Unit_hiprtc_stdheaders",
+        "Unit_hipGraphAddMemcpyNode_Negative_Parameters",
        "Unit_hipMemAddressFree_negative",
        "Unit_hipMemAddressReserve_AlignmentTest",
        "Unit_hipMemAddressReserve_Negative",
@@ -257,6 +261,54 @@
        "Unit_Device_Complex_hipCfma_Negative_Parameters_RTC",
        "Unit_Device_make_Complex_Negative_Parameters_RTC",
        "Unit_Device_Complex_Cast_Negative_Parameters_RTC",
+        "Note: Test disabled due to defect - EXSWHTEC-151",
+        "Unit_hipModuleLoad_Negative_Load_From_A_File_That_Is_Not_A_Module",
+        "Note: Following two tests disabled due to defect - EXSWHTEC-153",
+        "Unit_hipModuleLoadData_Negative_Image_Is_An_Empty_String",
+        "Unit_hipModuleLoadDataEx_Negative_Image_Is_An_Empty_String",
+        "Note: Test disabled due to defect - EXSWHTEC-163",
+        "Unit_hipModuleGetGlobal_Negative_Hmod_Is_Nullptr",
+        "Note: Test disabled due to defect - EXSWHTEC-164",
+        "Unit_hipModuleGetGlobal_Negative_Name_Is_Empty_String",
+        "Note: Test disabled due to defect - EXSWHTEC-165",
+        "Unit_hipModuleGetGlobal_Negative_Dptr_And_Bytes_Are_Nullptr",
+        "Note: Test disabled due to defect - EXSWHTEC-166",
+        "Unit_hipModuleGetTexRef_Negative_Hmod_Is_Nullptr",
+        "Note: Test disabled due to defect - EXSWHTEC-167",
+        "Unit_hipModuleGetTexRef_Negative_Name_Is_Empty_String",
+        "SWDEV-441785: Below tests failing in stress test on 05/01/24 ===",
+        "Unit_hipMemcpyParam2DAsync_Positive_Basic",
+        "Unit_hipMemcpy2DAsync_Positive_Basic",
+        "SWDEV-442583: Below tests failing in stress test on 12/01/24 ===",
+        "Unit_hipLaunchCooperativeKernelMultiDevice_Negative_Parameters",
+        "Unit_hipLaunchCooperativeKernelMultiDevice_Negative_MultiKernelSameDevice",
+        "Unit_hipExtLaunchMultiKernelMultiDevice_Negative_MultiKernelSameDevice",
+        "=== Below tests are failing PSDB ===",
+        "Unit_hipGraphExecMemcpyNodeSetParams1D_Negative_Changing_Memcpy_Direction",
+        "Unit_hipGraphExecMemcpyNodeSetParams_Positive_Basic",
+        "Unit_hipGraphExecMemcpyNodeSetParams_Negative_Parameters",
+        "Unit_hipGraphExecMemcpyNodeSetParams_Negative_Changing_Memcpy_Direction",
+        "Unit_hipGraphMemcpyNodeSetParams_Negative_Parameters",
+        "Unit_hipGraphKernelNodeSetAttribute_Positive_AccessPolicyWindow",
+        "Unit_hipGraphKernelNodeSetAttribute_Negative_Parameters",
+        "Unit_hipMemcpy3D_Positive_Synchronization_Behavior",
+        "Unit_hipMemcpyParam2D_Positive_Synchronization_Behavior",
+        "Unit_hipMemcpyParam2D_Positive_Array",
+        "Unit_hipMemcpyParam2DAsync_Positive_Array",
+        "Unit_hipMemcpy2D_Positive_Synchronization_Behavior",
+        "Unit_hipMemcpyAsync_Positive_Synchronization_Behavior",
+        "Unit_hipDrvMemcpy3D_Positive_Synchronization_Behavior",
+        "Unit_Thread_Block_Tile_Dynamic_Getters_Positive_Basic",
+        "Unit_hipFuncSetCacheConfig_Negative_Not_Supported",
+        "Unit_hipFuncSetSharedMemConfig_Negative_Not_Supported",
+        "Unit_hipFuncSetAttribute_Positive_MaxDynamicSharedMemorySize_Not_Supported",
+        "Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout_Not_Supported",
+        "Unit_hipLaunchCooperativeKernel_Negative_Parameters",
+        "Performance_hipMemsetD16",
+        "Performance_hipMemsetD16Async",
+        "Performance_hipMemsetD32",
+        "Performance_hipMemsetD32Async",
+        "Unit_hipGraphKernelNodeGetAttribute_Negative_Parameters",
    #endif
    #if defined VEGA20
        "=== SWDEV-419112 Below tests fail in stress test on 29/08/23 ===",
@@ -115,11 +115,6 @@
        "Unit_hipEventCreateWithFlags_DefaultFlg_NonCohHstMem",
        "Unit_hipEventCreateWithFlags_DisableSystemFence_CohHstMem",
        "Unit_hipEventCreateWithFlags_DefaultFlg_CohHstMem",
-        "NOTE: The following 4 tests are disabled due to defect - EXSWHTEC-240",
-        "Unit_hipFuncSetCacheConfig_Negative_Not_Supported",
-        "Unit_hipFuncSetSharedMemConfig_Negative_Not_Supported",
-        "Unit_hipFuncSetAttribute_Positive_MaxDynamicSharedMemorySize_Not_Supported",
-        "Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout_Not_Supported",
        "Unit_hipOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
        "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters",
        "Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters",
@@ -218,7 +213,17 @@
        "Unit_hipVectorTypes_test_on_device",
        "=== Patch which removes the typetraits implementation from std namespace in hiprtc is reverted ===",
        "Unit_hiprtc_stdheaders",
+        "NOTE: The following test is disabled due to defect - EXSWHTEC-241",
+        "Unit_hipFuncGetAttributes_Negative_Parameters",
+        "NOTE: The following test is disabled due to defect - EXSWHTEC-242",
+        "Unit_hipFuncGetAttributes_Positive_Basic",
+        "NOTE: The following test is disabled due to defect - EXSWHTEC-243",
+        "Unit_hipExtLaunchKernel_Negative_Parameters",
+        "NOTE: The following test is disabled due to defect - EXSWHTEC-244",
+        "Unit_hipExtLaunchMultiKernelMultiDevice_Negative_Parameters",
+        "Unit_hipMemAddressFree_negative",
        "Unit_hipMemAddressReserve_AlignmentTest",
+        "Unit_hipGraphAddMemcpyNode_Negative_Parameters",
        "Unit_hipMemCreate_ChkWithKerLaunch",
        "Unit_hipMemCreate_MapNonContiguousChunks",
        "Unit_hipMemMap_MapPartialPhysicalMem",
@@ -356,6 +361,85 @@
        "Unit_hipGetMipmappedArrayLevel_Negative",
        "Unit_hipFreeMipmappedArray_Negative_DoubleFree",
        "Unit_hipFreeMipmappedArrayMultiTArray - int",
+        "Unit_hipGraphExecMemcpyNodeSetParams1D_Negative_Changing_Memcpy_Direction",
+        "Unit_hipGraphExecMemcpyNodeSetParams_Positive_Basic",
+        "Unit_hipGraphExecMemcpyNodeSetParams_Negative_Parameters",
+        "Unit_hipGraphExecMemcpyNodeSetParams_Negative_Changing_Memcpy_Direction",
+        "Unit_hipGraphMemcpyNodeSetParams_Negative_Parameters",
+        "Unit_hipGraphKernelNodeGetAttribute_Negative_Parameters",
+        "Unit_hipGraphKernelNodeSetAttribute_Positive_AccessPolicyWindow",
+        "Unit_hipGraphKernelNodeSetAttribute_Negative_Parameters",
+        "Unit_hipMemcpy3D_Positive_Synchronization_Behavior",
+        "Unit_hipMemcpyParam2D_Positive_Synchronization_Behavior",
+        "Unit_hipMemcpyParam2D_Positive_Array",
+        "Unit_hipMemcpyParam2DAsync_Positive_Array",
+        "Unit_hipMemcpy2D_Positive_Synchronization_Behavior",
+        "Unit_hipMemcpyAsync_Positive_Synchronization_Behavior",
+        "Unit_hipDrvMemcpy3D_Positive_Synchronization_Behavior",
+        "Unit_Thread_Block_Tile_Dynamic_Getters_Positive_Basic",
+        "Unit_hipFuncSetCacheConfig_Negative_Not_Supported",
+        "Unit_hipFuncSetSharedMemConfig_Negative_Not_Supported",
+        "Unit_hipFuncSetAttribute_Positive_MaxDynamicSharedMemorySize_Not_Supported",
+        "Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout_Not_Supported",
+        "Performance_hipMemsetD16",
+        "Performance_hipMemsetD16Async",
+        "Performance_hipMemsetD32",
+        "Performance_hipMemsetD32Async",
+        "Note: Test disabled due to defect - EXSWHTEC-151",
+        "Unit_hipModuleLoad_Negative_Load_From_A_File_That_Is_Not_A_Module",
+        "Note: Test disabled due to defect - EXSWHTEC-152",
+        "Unit_hipModuleUnload_Negative_Module_Is_Nullptr",
+        "Note: Following two tests disabled due to defect - EXSWHTEC-153",
+        "Unit_hipModuleLoadData_Negative_Image_Is_An_Empty_String",
+        "Unit_hipModuleLoadDataEx_Negative_Image_Is_An_Empty_String",
+        "Note: Test disabled due to defect - EXSWHTEC-163",
+        "Unit_hipModuleGetGlobal_Negative_Hmod_Is_Nullptr",
+        "Note: Test disabled due to defect - EXSWHTEC-164",
+        "Unit_hipModuleGetGlobal_Negative_Name_Is_Empty_String",
+        "Note: Test disabled due to defect - EXSWHTEC-165",
+        "Unit_hipModuleGetGlobal_Negative_Dptr_And_Bytes_Are_Nullptr",
+        "Note: Test disabled due to defect - EXSWHTEC-166",
+        "Unit_hipModuleGetTexRef_Negative_Hmod_Is_Nullptr",
+        "Note: Test disabled due to defect - EXSWHTEC-167",
+        "Unit_hipModuleGetTexRef_Negative_Name_Is_Empty_String",
+        "Below tests hang in Jenkins PSDB",
+        "Unit_Thread_Block_Tile_Sync_Positive_Basic - uint8_t",
+        "Unit_Thread_Block_Tile_Sync_Positive_Basic - uint16_t",
+        "Unit_Thread_Block_Tile_Sync_Positive_Basic - uint32_t",
+        "=== SWDEV-441604: Below tests take long time to run in stress test on 12/01/24 ===",
+        "Unit_Thread_Block_Tile_Shfl_Up_Positive_Basic - int",
+        "Unit_Thread_Block_Tile_Shfl_Up_Positive_Basic - unsigned int",
+        "Unit_Thread_Block_Tile_Shfl_Up_Positive_Basic - long",
+        "Unit_Thread_Block_Tile_Shfl_Up_Positive_Basic - unsigned long",
+        "Unit_Thread_Block_Tile_Shfl_Up_Positive_Basic - long long",
+        "Unit_Thread_Block_Tile_Shfl_Up_Positive_Basic - unsigned long long",
+        "Unit_Thread_Block_Tile_Shfl_Up_Positive_Basic - float",
+        "Unit_Thread_Block_Tile_Shfl_Up_Positive_Basic - double",
+        "Unit_Thread_Block_Tile_Shfl_Down_Positive_Basic - int",
+        "Unit_Thread_Block_Tile_Shfl_Down_Positive_Basic - unsigned int",
+        "Unit_Thread_Block_Tile_Shfl_Down_Positive_Basic - long",
+        "Unit_Thread_Block_Tile_Shfl_Down_Positive_Basic - unsigned long",
+        "Unit_Thread_Block_Tile_Shfl_Down_Positive_Basic - long long",
+        "Unit_Thread_Block_Tile_Shfl_Down_Positive_Basic - unsigned long long",
+        "Unit_Thread_Block_Tile_Shfl_Down_Positive_Basic - float",
+        "Unit_Thread_Block_Tile_Shfl_Down_Positive_Basic - double",
+        "Unit_Thread_Block_Tile_Shfl_XOR_Positive_Basic - int",
+        "Unit_Thread_Block_Tile_Shfl_XOR_Positive_Basic - unsigned int",
+        "Unit_Thread_Block_Tile_Shfl_XOR_Positive_Basic - long",
+        "Unit_Thread_Block_Tile_Shfl_XOR_Positive_Basic - unsigned long",
+        "Unit_Thread_Block_Tile_Shfl_XOR_Positive_Basic - long long",
+        "Unit_Thread_Block_Tile_Shfl_XOR_Positive_Basic - unsigned long long",
+        "Unit_Thread_Block_Tile_Shfl_XOR_Positive_Basic - float",
+        "Unit_Thread_Block_Tile_Shfl_XOR_Positive_Basic - double",
+        "Unit_Thread_Block_Tile_Shfl_Positive_Basic - int",
+        "Unit_Thread_Block_Tile_Shfl_Positive_Basic - unsigned int",
+        "Unit_Thread_Block_Tile_Shfl_Positive_Basic - long",
+        "Unit_Thread_Block_Tile_Shfl_Positive_Basic - unsigned long",
+        "Unit_Thread_Block_Tile_Shfl_Positive_Basic - long long",
+        "Unit_Thread_Block_Tile_Shfl_Positive_Basic - unsigned long long",
+        "Unit_Thread_Block_Tile_Shfl_Positive_Basic - float",
+        "Unit_Thread_Block_Tile_Shfl_Positive_Basic - double",
+        "Unit_Thread_Block_Tile_Getters_Positive_Basic",
    #endif
        "End of json"
    ]
@@ -53,6 +53,7 @@
        "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int",
        "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long long",
        "Unit_atomicExch_system_Positive_Host_And_GPU - float",
+        "Unit_hipModuleUnload_Negative_Double_Unload",
        "=== Below tests fail in external CI for PR https://github.com/ROCm-Developer-Tools/hip-tests/pull/356 ===",
        "Unit_Device_Complex_Unary_Negative_Parameters_RTC",
        "Unit_Device_Complex_Binary_Negative_Parameters_RTC",
@@ -70,6 +71,24 @@
        "Unit_hipFreeMipmappedArrayMultiTArray - int",
        "Unit_hipFreeMipmappedArray_Negative_Parameters",
        "Unit_hipCreateSurfaceObject_Negative_Parameters",
-        "Unit_hipDestroySurfaceObject_Negative_Parameters"
+        "Unit_hipDestroySurfaceObject_Negative_Parameters",
+        "Unit_hipMemcpy3D_Positive_Synchronization_Behavior",
+        "Unit_hipMemcpy2D_Positive_Synchronization_Behavior",
+        "Unit_hipMemcpyAsync_Positive_Synchronization_Behavior",
+        "Unit_hipDrvMemcpy3D_Positive_Synchronization_Behavior",
+        "Unit_hipFreeMipmappedArray_Negative_DoubleFree",
+        "Unit_hipModuleLoad_Positive_Basic",
+        "Unit_hipModuleLoad_Negative_Load_From_A_File_That_Is_Not_A_Module",
+        "Unit_hipModuleLoadData_Positive_Basic",
+        "Unit_hipModuleLoadData_Negative_Parameters",
+        "Unit_hipModuleLoadDataEx_Positive_Basic",
+        "Unit_hipModuleLoadDataEx_Negative_Parameters",
+        "Unit_hipModuleGetTexRef_Positive_Basic",
+        "Performance_hipMemsetD16",
+        "Performance_hipMemsetD16Async",
+        "Performance_hipMemsetD32",
+        "Performance_hipMemsetD32Async",
+        "Unit_hipMemcpyParam2D_Positive_Synchronization_Behavior",
+        "Unit_hipMemcpy_Positive_Synchronization_Behavior"
    ]
 }
@@ -16,6 +16,7 @@
        "Unit_ChannelDescriptor_Positive_Basic_3D - long3",
        "Unit_ChannelDescriptor_Positive_Basic_4D - ulong4",
        "Unit_ChannelDescriptor_Positive_Basic_4D - long4",
+        "Unit_hipModuleUnload_Negative_Double_Unload",
        "=== Below tests fail in external CI for PR https://github.com/ROCm-Developer-Tools/hip-tests/pull/356 ===",
        "Unit_Device_Complex_Unary_Negative_Parameters_RTC",
        "Unit_Device_Complex_Binary_Negative_Parameters_RTC",
@@ -24,6 +25,25 @@
        "Unit_Device_Complex_Cast_Negative_Parameters_RTC",
        "=== Below 2 tests are disabled due to defect EXSWHTEC-342 ===",
        "Unit_hipDeviceSetLimit_Negative_Parameters",
-        "Unit_hipDeviceGetLimit_Negative_Parameters"
+        "Unit_hipDeviceGetLimit_Negative_Parameters",
+        "=== Below tests tests fail in PSDB ===",
+        "Unit_hipMemcpy3D_Positive_Synchronization_Behavior",
+        "Unit_hipMemcpy2D_Positive_Synchronization_Behavior",
+        "Unit_hipMemcpyAsync_Positive_Synchronization_Behavior",
+        "Unit_hipDrvMemcpy3D_Positive_Synchronization_Behavior",
+        "Unit_hipFreeMipmappedArray_Negative_DoubleFree",
+        "Unit_hipModuleLoad_Positive_Basic",
+        "Unit_hipModuleLoad_Negative_Load_From_A_File_That_Is_Not_A_Module",
+        "Unit_hipModuleLoadData_Positive_Basic",
+        "Unit_hipModuleLoadData_Negative_Parameters",
+        "Unit_hipModuleLoadDataEx_Positive_Basic",
+        "Unit_hipModuleLoadDataEx_Negative_Parameters",
+        "Unit_hipModuleGetTexRef_Positive_Basic",
+        "Performance_hipMemsetD16",
+        "Performance_hipMemsetD16Async",
+        "Performance_hipMemsetD32",
+        "Performance_hipMemsetD32Async",
+        "Unit_hipMemcpyParam2D_Positive_Synchronization_Behavior",
+        "Unit_hipMemcpy_Positive_Synchronization_Behavior"
    ]
 }
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2021 - 2024 Advanced Micro Devices, Inc. All rights reserved.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -144,21 +144,6 @@ THE SOFTWARE.
 * @}
 */

- /**
- * @defgroup StreamOTest Ordered Memory Allocator
- * @{
- * This section describes the tests for Stream Ordered Memory Allocator functions of HIP runtime
- * API.
- * @}
- */
-
-/**
- * @defgroup StreamOTest Ordered Memory Allocator
- * @{
- * This section describes the tests for Stream Ordered Memory Allocator functions of HIP runtime
- * API.
- */
-
 /**
 * @defgroup StreamOTest Ordered Memory Allocator
 * @{
@@ -201,7 +186,7 @@ THE SOFTWARE.
 * @}
 */

- /**
+/**
 * @defgroup ComplexTest Complex type
 * @{
 * This section describes tests for the Complex type functions.
@@ -24,10 +24,10 @@ THE SOFTWARE.

 #include <functional>

-#include <hip_test_common.hh>
 #include <hip/hip_runtime_api.h>
-#include <utils.hh>
+#include <hip_test_common.hh>
 #include <resource_guards.hh>
+#include <utils.hh>

 static inline unsigned int GenerateLinearAllocationFlagCombinations(
    const LinearAllocs allocation_type) {
@@ -169,8 +169,8 @@ void MemcpyDeviceToDeviceShell(F memcpy_func, const hipStream_t kernel_stream =
  HIP_CHECK(
      hipMemcpy(result.host_ptr(), dst_allocation.ptr(), allocation_size, hipMemcpyDeviceToHost));
  if constexpr (enable_peer_access) {
-    // If we've gotten this far, EnablePeerAccess must have succeeded, so we only need to check this
-    // condition
+    // If we've gotten this far, EnablePeerAccess must have succeeded, so we
+    // only need to check this condition
    HIP_CHECK(hipDeviceDisablePeerAccess(dst_device));
  }

@@ -238,7 +238,6 @@ void MemcpySyncBehaviorCheck(F memcpy_func, const bool should_sync,
  LaunchDelayKernel(std::chrono::milliseconds{100}, kernel_stream);
  HIP_CHECK(memcpy_func());
  if (should_sync) {
-    HIP_CHECK(hipStreamSynchronize(kernel_stream));
    HIP_CHECK(hipStreamQuery(kernel_stream));
  } else {
    HIP_CHECK_ERROR(hipStreamQuery(kernel_stream), hipErrorNotReady);
@@ -23,6 +23,7 @@ THE SOFTWARE.
 #pragma once
 #pragma clang diagnostic ignored "-Wmissing-field-initializers"
 #pragma clang diagnostic ignored "-Wunused-lambda-capture"
+
 #include <variant>

 #include <hip_test_common.hh>
@@ -169,3 +169,9 @@ inline bool DeviceAttributesSupport(const int device, Attributes... attributes)
  };
  return (... && DeviceAttributeSupport(device, attributes));
 }
+
+inline int GetDeviceAttribute(int device, const hipDeviceAttribute_t attr) {
+  int value = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&value, attr, device));
+  return value;
+}
@@ -18,6 +18,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.

+add_subdirectory(memset)
+add_subdirectory(memcpy)
+add_subdirectory(kernelLaunch)
 add_subdirectory(stream)
 add_subdirectory(event)
 add_subdirectory(example)
@@ -0,0 +1,37 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+set(TEST_SRC
+    kernel_launch_common.cc
+    triple_chevron.cc
+    hipLaunchKernel.cc
+    hipLaunchCooperativeKernel.cc
+)
+
+if(HIP_PLATFORM MATCHES "amd")
+    set(TEST_SRC ${TEST_SRC}
+        hipExtLaunchKernel.cc
+    )
+endif()
+
+hip_add_exe_to_target(NAME KernelLaunchPerformance
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests
+                      COMPILE_OPTIONS -std=c++17)
@@ -0,0 +1,120 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "kernel_launch_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup kernelLaunch kernel launch
+ * @{
+ * @ingroup PerformanceTest
+ * Contains performance tests for kernel launch overhead benchmarking.
+ */
+
+template <KernelType kernel_type, bool timer_type>
+class ExtLaunchKernelBenchmark
+    : public KernelLaunchBenchmark<ExtLaunchKernelBenchmark<kernel_type, timer_type>, timer_type> {
+ public:
+  constexpr void LaunchKernel() {
+    if constexpr (kernel_type == KernelType::kNull) {
+      error_ = hipExtLaunchKernel(reinterpret_cast<void*>(NullKernel), 1, 1, nullptr, 0, nullptr,
+                                  events_[0], events_[1], 0u);
+    } else if constexpr (kernel_type == KernelType::kSmall) {
+      error_ = hipExtLaunchKernel(reinterpret_cast<void*>(KernelWithSmallArgs), 1, 1,
+                                  small_kernel_args_, 0, nullptr, events_[0], events_[1], 0u);
+    } else if constexpr (kernel_type == KernelType::kMedium) {
+      error_ = hipExtLaunchKernel(reinterpret_cast<void*>(KernelWithMediumArgs), 1, 1,
+                                  medium_kernel_args_, 0, nullptr, events_[0], events_[1], 0u);
+    } else if constexpr (kernel_type == KernelType::kLarge) {
+      error_ = hipExtLaunchKernel(reinterpret_cast<void*>(KernelWithLargeArgs), 1, 1,
+                                  large_kernel_args_, 0, nullptr, events_[0], events_[1], 0u);
+    } else
+      ;
+  }
+
+  hipError_t GetError() { return error_; }
+
+ private:
+  EventsGuard events_{2};
+  hipError_t error_;
+
+  char* out_ = nullptr;
+  void* small_kernel_args_[2] = {&small_kernel_args, &out_};
+  void* medium_kernel_args_[2] = {&medium_kernel_args, &out_};
+  void* large_kernel_args_[2] = {&large_kernel_args, &out_};
+};
+
+template <KernelType kernel_type, bool timer_type> static void RunBenchmark(bool sync) {
+  ExtLaunchKernelBenchmark<kernel_type, timer_type> benchmark;
+  benchmark.AddSectionName(GetSynchronizationSectionName(sync));
+  benchmark.AddSectionName(GetKernelTypeSectionName<kernel_type>());
+  benchmark.AddSectionName(GetTimerTypeSectionName<timer_type>());
+  benchmark.Run(sync);
+  HIP_CHECK(benchmark.GetError());
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Calls an empty kernel using hipExtLaunchKernel:
+ *    -# With different timing methods:
+ *      - CPU-based
+ *      - Event-based
+ *    -# With different synchronization behavior:
+ *      - Using a stream synchronization between each iteration
+ *      - Without any synchronization between iterations
+ *    -# With different kernel argument sizes
+ * Test source
+ * ------------------------
+ *  - performance/kernelLaunch/hipExtLaunchKernel.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipExtLaunchKernel") {
+  bool sync = GENERATE(true, false);
+
+  SECTION("null kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kNull, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kNull, kTimerTypeEvent>(sync); }
+  }
+
+  SECTION("small kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kSmall, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kSmall, kTimerTypeEvent>(sync); }
+  }
+
+  SECTION("medium kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kMedium, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kMedium, kTimerTypeEvent>(sync); }
+  }
+
+  SECTION("large kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kLarge, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kLarge, kTimerTypeEvent>(sync); }
+  }
+}
@@ -0,0 +1,130 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "kernel_launch_common.hh"
+
+#include <hip_test_common.hh>
+#include <utils.hh>
+
+/**
+ * @addtogroup kernelLaunch kernel launch
+ * @{
+ * @ingroup PerformanceTest
+ * Contains performance tests for kernel launch overhead benchmarking.
+ */
+
+template <KernelType kernel_type, bool timer_type>
+class LaunchCooperativeKernelBenchmark
+    : public KernelLaunchBenchmark<LaunchCooperativeKernelBenchmark<kernel_type, timer_type>,
+                                   timer_type> {
+ public:
+  constexpr void LaunchKernel() {
+    if constexpr (kernel_type == KernelType::kNull) {
+      error_ = hipLaunchCooperativeKernel(reinterpret_cast<void*>(NullKernel), dim3{1, 1, 1},
+                                          dim3{1, 1, 1}, nullptr, 0, nullptr);
+    } else if constexpr (kernel_type == KernelType::kSmall) {
+      error_ =
+          hipLaunchCooperativeKernel(reinterpret_cast<void*>(KernelWithSmallArgs), dim3{1, 1, 1},
+                                     dim3{1, 1, 1}, small_kernel_args_, 0, nullptr);
+    } else if constexpr (kernel_type == KernelType::kMedium) {
+      error_ =
+          hipLaunchCooperativeKernel(reinterpret_cast<void*>(KernelWithMediumArgs), dim3{1, 1, 1},
+                                     dim3{1, 1, 1}, medium_kernel_args_, 0, nullptr);
+    } else if constexpr (kernel_type == KernelType::kLarge) {
+      error_ =
+          hipLaunchCooperativeKernel(reinterpret_cast<void*>(KernelWithLargeArgs), dim3{1, 1, 1},
+                                     dim3{1, 1, 1}, large_kernel_args_, 0, nullptr);
+    } else
+      ;
+  }
+
+  hipError_t GetError() { return error_; }
+
+ private:
+  hipError_t error_;
+
+  char* out_ = nullptr;
+  void* small_kernel_args_[2] = {&small_kernel_args, &out_};
+  void* medium_kernel_args_[2] = {&medium_kernel_args, &out_};
+  void* large_kernel_args_[2] = {&large_kernel_args, &out_};
+};
+
+template <KernelType kernel_type, bool timer_type> static void RunBenchmark(bool sync) {
+  LaunchCooperativeKernelBenchmark<kernel_type, timer_type> benchmark;
+  benchmark.AddSectionName(GetSynchronizationSectionName(sync));
+  benchmark.AddSectionName(GetKernelTypeSectionName<kernel_type>());
+  benchmark.AddSectionName(GetTimerTypeSectionName<timer_type>());
+  benchmark.Run(sync);
+  HIP_CHECK(benchmark.GetError());
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Calls an empty kernel using hipLaunchCooperativeKernel:
+ *    -# With different timing methods:
+ *      - CPU-based
+ *      - Event-based
+ *    -# With different synchronization behavior:
+ *      - Using a stream synchronization between each iteration
+ *      - Without any synchronization between iterations
+ *    -# With different kernel argument sizes
+ * Test source
+ * ------------------------
+ *  - performance/kernelLaunch/hipLaunchCooperativeKernel.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports CooperativeLaunch
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipLaunchCooperativeKernel") {
+  if (!DeviceAttributesSupport(0, hipDeviceAttributeCooperativeLaunch)) {
+    HipTest::HIP_SKIP_TEST("CooperativeLaunch not supported");
+    return;
+  }
+
+  bool sync = GENERATE(true, false);
+
+  SECTION("null kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kNull, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kNull, kTimerTypeEvent>(sync); }
+  }
+
+  SECTION("small kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kSmall, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kSmall, kTimerTypeEvent>(sync); }
+  }
+
+  SECTION("medium kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kMedium, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kMedium, kTimerTypeEvent>(sync); }
+  }
+
+  SECTION("large kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kLarge, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kLarge, kTimerTypeEvent>(sync); }
+  }
+}
@@ -0,0 +1,118 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "kernel_launch_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup kernelLaunch kernel launch
+ * @{
+ * @ingroup PerformanceTest
+ * Contains performance tests for kernel launch overhead benchmarking.
+ */
+
+template <KernelType kernel_type, bool timer_type>
+class LaunchKernelBenchmark
+    : public KernelLaunchBenchmark<LaunchKernelBenchmark<kernel_type, timer_type>, timer_type> {
+ public:
+  constexpr void LaunchKernel() {
+    if constexpr (kernel_type == KernelType::kNull) {
+      error_ = hipLaunchKernel(reinterpret_cast<void*>(NullKernel), 1, 1, nullptr, 0, nullptr);
+    } else if constexpr (kernel_type == KernelType::kSmall) {
+      error_ = hipLaunchKernel(reinterpret_cast<void*>(KernelWithSmallArgs), 1, 1,
+                               small_kernel_args_, 0, nullptr);
+    } else if constexpr (kernel_type == KernelType::kMedium) {
+      error_ = hipLaunchKernel(reinterpret_cast<void*>(KernelWithMediumArgs), 1, 1,
+                               medium_kernel_args_, 0, nullptr);
+    } else if constexpr (kernel_type == KernelType::kLarge) {
+      error_ = hipLaunchKernel(reinterpret_cast<void*>(KernelWithLargeArgs), 1, 1,
+                               large_kernel_args_, 0, nullptr);
+    } else
+      ;
+  }
+
+  hipError_t GetError() { return error_; }
+
+ private:
+  hipError_t error_;
+
+  char* out_ = nullptr;
+  void* small_kernel_args_[2] = {&small_kernel_args, &out_};
+  void* medium_kernel_args_[2] = {&medium_kernel_args, &out_};
+  void* large_kernel_args_[2] = {&large_kernel_args, &out_};
+};
+
+template <KernelType kernel_type, bool timer_type> static void RunBenchmark(bool sync) {
+  LaunchKernelBenchmark<kernel_type, timer_type> benchmark;
+  benchmark.AddSectionName(GetSynchronizationSectionName(sync));
+  benchmark.AddSectionName(GetKernelTypeSectionName<kernel_type>());
+  benchmark.AddSectionName(GetTimerTypeSectionName<timer_type>());
+  benchmark.Run(sync);
+  HIP_CHECK(benchmark.GetError());
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Calls an empty kernel using hipLaunchKernel:
+ *    -# With different timing methods:
+ *      - CPU-based
+ *      - Event-based
+ *    -# With different synchronization behavior:
+ *      - Using a stream synchronization between each iteration
+ *      - Without any synchronization between iterations
+ *    -# With different kernel argument sizes
+ * Test source
+ * ------------------------
+ *  - performance/kernelLaunch/hipLaunchKernel.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipLaunchKernel") {
+  bool sync = GENERATE(true, false);
+
+  SECTION("null kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kNull, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kNull, kTimerTypeEvent>(sync); }
+  }
+
+  SECTION("small kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kSmall, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kSmall, kTimerTypeEvent>(sync); }
+  }
+
+  SECTION("medium kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kMedium, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kMedium, kTimerTypeEvent>(sync); }
+  }
+
+  SECTION("large kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kLarge, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kLarge, kTimerTypeEvent>(sync); }
+  }
+}
@@ -0,0 +1,39 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "kernel_launch_common.hh"
+
+#define DO_NOT_OPTIMIZE_AWAY                                                                       \
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;                                              \
+  if (out) *out = args.args[i];
+
+__global__ void NullKernel() {}
+
+__global__ void KernelWithSmallArgs(SmallKernelArgs args, char* out) { DO_NOT_OPTIMIZE_AWAY; }
+
+__global__ void KernelWithMediumArgs(MediumKernelArgs args, char* out) { DO_NOT_OPTIMIZE_AWAY; }
+
+__global__ void KernelWithLargeArgs(LargeKernelArgs args, char* out) { DO_NOT_OPTIMIZE_AWAY; }
+
+SmallKernelArgs small_kernel_args;
+MediumKernelArgs medium_kernel_args;
+LargeKernelArgs large_kernel_args;
@@ -0,0 +1,116 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+
+struct SmallKernelArgs {
+  char args[16];
+};
+
+struct MediumKernelArgs {
+  char args[256];
+};
+
+struct LargeKernelArgs {
+  char args[4080];
+};
+
+extern SmallKernelArgs small_kernel_args;
+extern MediumKernelArgs medium_kernel_args;
+extern LargeKernelArgs large_kernel_args;
+
+__global__ void NullKernel();
+
+__global__ void KernelWithSmallArgs(SmallKernelArgs, char*);
+
+__global__ void KernelWithMediumArgs(MediumKernelArgs, char*);
+
+__global__ void KernelWithLargeArgs(LargeKernelArgs, char*);
+
+enum class KernelType { kNull = 0, kSmall, kMedium, kLarge };
+
+template <typename Derived, bool timer_type>
+class KernelLaunchBenchmark : public Benchmark<KernelLaunchBenchmark<Derived, timer_type>> {
+ public:
+  void operator()(bool sync = true) {
+    auto& derived = static_cast<Derived&>(*this);
+
+    if (sync) {
+      TIMED_SECTION(timer_type) { derived.LaunchKernel(); }
+    } else {
+      if (this->current() != this->kWarmup)  // if not warmup
+        RunWithoutSynchronization();
+    }
+  }
+
+ private:
+  void RunWithoutSynchronization() {
+    auto iterations = this->iterations();
+    auto warmups = this->warmups();
+
+    // manually handle iterations here to avoid synchronization after each iteration
+    this->Configure(1, 0);
+
+    this->RegisterModifier([iterations](float time) { return time / iterations; });
+
+    auto& derived = static_cast<Derived&>(*this);
+
+    for (size_t i = 0u; i < warmups; ++i) {
+      derived.LaunchKernel();
+    }
+
+    TIMED_SECTION(timer_type) {
+      for (size_t i = 0u; i < iterations; ++i) {
+        derived.LaunchKernel();
+      }
+    }
+  }
+};
+
+static std::string GetSynchronizationSectionName(bool sync) {
+  return sync ? "with synchronization" : "without synchronization";
+}
+
+template <KernelType kernel_type> std::string GetKernelTypeSectionName() {
+  if constexpr (kernel_type == KernelType::kNull) {
+    return "null kernel";
+  } else if constexpr (kernel_type == KernelType::kSmall) {
+    return "small kernel";
+  } else if constexpr (kernel_type == KernelType::kMedium) {
+    return "medium kernel";
+  } else if constexpr (kernel_type == KernelType::kLarge) {
+    return "large kernel";
+  } else {
+    return "unknown kernel type";
+  }
+}
+
+template <bool timer_type> std::string GetTimerTypeSectionName() {
+  if constexpr (timer_type == kTimerTypeEvent) {
+    return "event based";
+  } else {
+    return "cpu based";
+  }
+}
@@ -0,0 +1,105 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "kernel_launch_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup kernelLaunch kernel launch
+ * @{
+ * @ingroup PerformanceTest
+ * Contains performance tests for kernel launch overhead benchmarking.
+ */
+
+template <KernelType kernel_type, bool timer_type>
+class TripleChevronBenchmark
+    : public KernelLaunchBenchmark<TripleChevronBenchmark<kernel_type, timer_type>, timer_type> {
+ public:
+  constexpr void LaunchKernel() {
+    if constexpr (kernel_type == KernelType::kNull) {
+      NullKernel<<<1, 1>>>();
+    } else if constexpr (kernel_type == KernelType::kSmall) {
+      KernelWithSmallArgs<<<1, 1>>>(small_kernel_args, nullptr);
+    } else if constexpr (kernel_type == KernelType::kMedium) {
+      KernelWithMediumArgs<<<1, 1>>>(medium_kernel_args, nullptr);
+    } else if constexpr (kernel_type == KernelType::kLarge) {
+      KernelWithLargeArgs<<<1, 1>>>(large_kernel_args, nullptr);
+    } else
+      ;
+  }
+};
+
+template <KernelType kernel_type, bool timer_type> static void RunBenchmark(bool sync) {
+  TripleChevronBenchmark<kernel_type, timer_type> benchmark;
+  benchmark.AddSectionName(GetSynchronizationSectionName(sync));
+  benchmark.AddSectionName(GetKernelTypeSectionName<kernel_type>());
+  benchmark.AddSectionName(GetTimerTypeSectionName<timer_type>());
+  benchmark.Run(sync);
+  HIP_CHECK(hipGetLastError());
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Calls an empty kernel using triple chevron annotation:
+ *    -# With different timing methods:
+ *      - CPU-based
+ *      - Event-based
+ *    -# With different synchronization behavior:
+ *      - Using a stream synchronization between each iteration
+ *      - Without any synchronization between iterations
+ *    -# With different kernel argument sizes
+ * Test source
+ * ------------------------
+ *  - performance/kernelLaunch/triple_chevron.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_Triple_Chevron") {
+  bool sync = GENERATE(true, false);
+
+  SECTION("null kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kNull, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kNull, kTimerTypeEvent>(sync); }
+  }
+
+  SECTION("small kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kSmall, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kSmall, kTimerTypeEvent>(sync); }
+  }
+
+  SECTION("medium kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kMedium, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kMedium, kTimerTypeEvent>(sync); }
+  }
+
+  SECTION("large kernel") {
+    SECTION("cpu-based timing") { RunBenchmark<KernelType::kLarge, kTimerTypeCpu>(sync); }
+
+    SECTION("event-based timing") { RunBenchmark<KernelType::kLarge, kTimerTypeEvent>(sync); }
+  }
+}
@@ -0,0 +1,52 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+set(TEST_SRC
+    hipMemcpy.cc
+    hipMemcpyAsync.cc
+    hipMemcpyWithStream.cc
+    hipMemcpyAtoH.cc
+    hipMemcpyHtoA.cc
+    hipMemcpyDtoD.cc
+    hipMemcpyDtoDAsync.cc
+    hipMemcpyDtoH.cc
+    hipMemcpyDtoHAsync.cc
+    hipMemcpyHtoD.cc
+    hipMemcpyHtoDAsync.cc
+    hipMemcpyToSymbol.cc
+    hipMemcpyToSymbolAsync.cc
+    hipMemcpyFromSymbol.cc
+    hipMemcpyFromSymbolAsync.cc
+    hipMemcpy2D.cc
+    hipMemcpy2DAsync.cc
+    hipMemcpy2DToArray.cc
+    hipMemcpy2DToArrayAsync.cc
+    hipMemcpy2DFromArray.cc
+    hipMemcpy2DFromArrayAsync.cc
+    hipMemcpyParam2D.cc
+    hipMemcpyParam2DAsync.cc
+    hipMemcpy3D.cc
+    hipMemcpy3DAsync.cc
+)
+
+hip_add_exe_to_target(NAME MemcpyPerformance
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests
+                      COMPILE_OPTIONS -std=c++17)
@@ -0,0 +1,190 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ * Contains performance tests for all memcpy HIP APIs.
+ */
+
+class MemcpyBenchmark : public Benchmark<MemcpyBenchmark> {
+ public:
+  void operator()(void* dst, const void* src, size_t size, hipMemcpyKind kind) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpy(dst, src, size, kind));
+    }
+  }
+};
+
+static void RunBenchmark(LinearAllocs dst_allocation_type, LinearAllocs src_allocation_type,
+                         size_t size, hipMemcpyKind kind, bool enable_peer_access=false) {
+  MemcpyBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(src_allocation_type));
+  benchmark.AddSectionName(GetAllocationSectionName(dst_allocation_type));
+
+  if (kind != hipMemcpyDeviceToDevice) {
+    LinearAllocGuard<int> src_allocation(src_allocation_type, size);
+    LinearAllocGuard<int> dst_allocation(dst_allocation_type, size);
+    benchmark.Run(dst_allocation.ptr(), src_allocation.ptr(), size, kind);
+  } else {
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+    LinearAllocGuard<int> src_allocation(src_allocation_type, size);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard<int> dst_allocation(dst_allocation_type, size);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.ptr(), src_allocation.ptr(), size, kind);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy_DeviceToHost") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy_HostToDevice") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy_HostToHost") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  const auto dst_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToDevice, true);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy_DeviceToDevice_DisablePeerAccess") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToDevice);
+}
@@ -0,0 +1,183 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy2DBenchmark : public Benchmark<Memcpy2DBenchmark> {
+ public:
+  void operator()(void* dst, size_t dst_pitch, const void* src, size_t src_pitch, size_t width,
+                  size_t height, hipMemcpyKind kind) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpy2D(dst, dst_pitch, src, src_pitch, width, height, kind));
+    }
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind, bool enable_peer_access=false) {
+  Memcpy2DBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  if (kind == hipMemcpyDeviceToHost) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc,
+                                          device_allocation.width() * height);
+    benchmark.Run(host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.ptr(), device_allocation.pitch(),
+                  device_allocation.width(), device_allocation.height(),
+                  hipMemcpyDeviceToHost);
+  } else if (kind == hipMemcpyHostToDevice) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc,
+                                          device_allocation.width() * height);
+    benchmark.Run(device_allocation.ptr(), device_allocation.pitch(),
+                  host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.width(), device_allocation.height(),
+                  hipMemcpyHostToDevice);
+  } else if (kind == hipMemcpyHostToHost) {
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    benchmark.Run(dst_allocation.ptr(), width * sizeof(int), src_allocation.ptr(),
+                  width * sizeof(int), width * sizeof(int), height, hipMemcpyHostToHost);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+    LinearAllocGuard2D<int> src_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard2D<int> dst_allocation(width, height);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.ptr(), dst_allocation.pitch(),
+                  src_allocation.ptr(), src_allocation.pitch(),
+                  dst_allocation.width(), dst_allocation.height(),
+                  hipMemcpyDeviceToDevice);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2D` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2D_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2D` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2D_HostToDevice") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2D` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2D_HostToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2D` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2D_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2D` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2D_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,188 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy2DAsyncBenchmark : public Benchmark<Memcpy2DAsyncBenchmark> {
+ public:
+  void operator()(void* dst, size_t dst_pitch, const void* src, size_t src_pitch, size_t width,
+                  size_t height, hipMemcpyKind kind, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpy2DAsync(dst, dst_pitch, src, src_pitch, width, height, kind, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind, bool enable_peer_access=false) {
+  Memcpy2DAsyncBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+
+  if (kind == hipMemcpyDeviceToHost) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc,
+                                          device_allocation.width() * height);
+    benchmark.Run(host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.ptr(), device_allocation.pitch(),
+                  device_allocation.width(), device_allocation.height(),
+                  hipMemcpyDeviceToHost, stream);
+  } else if (kind == hipMemcpyHostToDevice) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc,
+                                          device_allocation.width() * height);
+    benchmark.Run(device_allocation.ptr(), device_allocation.pitch(),
+                  host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.width(), device_allocation.height(),
+                  hipMemcpyHostToDevice, stream);
+  } else if (kind == hipMemcpyHostToHost) {
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    benchmark.Run(dst_allocation.ptr(), width * sizeof(int), src_allocation.ptr(),
+                  width * sizeof(int), width * sizeof(int), height, hipMemcpyHostToHost, stream);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+    LinearAllocGuard2D<int> src_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard2D<int> dst_allocation(width, height);
+
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.ptr(), dst_allocation.pitch(),
+                  src_allocation.ptr(), src_allocation.pitch(),
+                  dst_allocation.width(), dst_allocation.height(),
+                  hipMemcpyDeviceToDevice, stream);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DAsync` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DAsync_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DAsync` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DAsync_HostToDevice") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DAsync` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DAsync_HostToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DAsync` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DAsync_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2D` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DAsync_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,127 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy2DFromArrayBenchmark : public Benchmark<Memcpy2DFromArrayBenchmark> {
+ public:
+  void operator()(void* dst, size_t dst_pitch, hipArray_const_t src, size_t width, size_t height, hipMemcpyKind kind) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpy2DFromArray(dst, dst_pitch, src, 0, 0, width, height, kind));
+    }
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind,
+                         bool enable_peer_access=false) {
+  Memcpy2DFromArrayBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  if (kind == hipMemcpyDeviceToHost) {
+    size_t allocation_size = width * height * sizeof(int);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, allocation_size);
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    benchmark.Run(host_allocation.ptr(), width * sizeof(int), array_allocation.ptr(),
+                  width * sizeof(int), height, hipMemcpyDeviceToHost);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(device_allocation.ptr(), device_allocation.pitch(),
+                  array_allocation.ptr(), device_allocation.width(),
+                  device_allocation.height(), hipMemcpyDeviceToDevice);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DFromArray` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DFromArray.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DFromArray_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DFromArray` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DFromArray.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DFromArray_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DFromArray` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DFromArray.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DFromArray_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,133 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy2DFromArrayAsyncBenchmark : public Benchmark<Memcpy2DFromArrayAsyncBenchmark> {
+ public:
+  void operator()(void* dst, size_t dst_pitch, hipArray_const_t src, size_t width, size_t height,
+                  hipMemcpyKind kind, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpy2DFromArrayAsync(dst, dst_pitch, src, 0, 0, width, height, kind, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind,
+                         bool enable_peer_access=false) {
+  Memcpy2DFromArrayAsyncBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+
+  if (kind == hipMemcpyDeviceToHost) {
+    size_t allocation_size = width * height * sizeof(int);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, allocation_size);
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    benchmark.Run(host_allocation.ptr(), width * sizeof(int),
+                  array_allocation.ptr(), width * sizeof(int),
+                  height, hipMemcpyDeviceToHost, stream);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(device_allocation.ptr(), device_allocation.pitch(),
+                  array_allocation.ptr(), device_allocation.width(),
+                  device_allocation.height(), hipMemcpyDeviceToDevice, stream);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DFromArrayAsync` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DFromArrayAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DFromArrayAsync_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DFromArrayAsync` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DFromArrayAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DFromArrayAsync_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DFromArrayAsync` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DFromArrayAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DFromArrayAsync_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,127 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy2DToArrayBenchmark : public Benchmark<Memcpy2DToArrayBenchmark> {
+ public:
+  void operator()(hipArray_t dst, const void* src, size_t src_pitch, size_t width,
+                  size_t height, hipMemcpyKind kind) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpy2DToArray(dst, 0, 0, src, src_pitch, width, height, kind));
+    }
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind,
+                         bool enable_peer_access=false) {
+  Memcpy2DToArrayBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  if (kind == hipMemcpyHostToDevice) {
+    size_t allocation_size = width * height * sizeof(int);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, allocation_size);
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    benchmark.Run(array_allocation.ptr(), host_allocation.ptr(), width * sizeof(int),
+                  width * sizeof(int), height, hipMemcpyHostToDevice);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(array_allocation.ptr(), device_allocation.ptr(), device_allocation.pitch(),
+                  device_allocation.width(), device_allocation.height(), hipMemcpyDeviceToDevice);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DToArray` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DToArray.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DToArray_HostToDevice") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DToArray` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DToArray.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DToArray_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DToArray` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DToArray.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DToArray_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,133 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy2DToArrayAsyncBenchmark : public Benchmark<Memcpy2DToArrayAsyncBenchmark> {
+ public:
+  void operator()(hipArray_t dst, const void* src, size_t src_pitch, size_t width,
+                  size_t height, hipMemcpyKind kind, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpy2DToArrayAsync(dst, 0, 0, src, src_pitch, width, height, kind, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind,
+                         bool enable_peer_access=false) {
+  Memcpy2DToArrayAsyncBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+
+  if (kind == hipMemcpyHostToDevice) {
+    size_t allocation_size = width * height * sizeof(int);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, allocation_size);
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    benchmark.Run(array_allocation.ptr(), host_allocation.ptr(),
+                  width * sizeof(int), width * sizeof(int), height,
+                  hipMemcpyHostToDevice, stream);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    ArrayAllocGuard<int> array_allocation(make_hipExtent(width, height, 0), hipArrayDefault);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(array_allocation.ptr(), device_allocation.ptr(), device_allocation.pitch(),
+                  device_allocation.width(), device_allocation.height(),
+                  hipMemcpyDeviceToDevice, stream);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DToArrayAsync` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DToArrayAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DToArrayAsync_HostToDevice") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DToArrayAsync` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DToArrayAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DToArrayAsync_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy2DToArrayAsync` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 8 KB x 32 B
+ *      - Large: 16 KB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy2DToArrayAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy2DToArrayAsync_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 8_KB, 16_KB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,189 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy3DBenchmark : public Benchmark<Memcpy3DBenchmark> {
+ public:
+  void operator()(const hipPitchedPtr& dst_ptr, const hipPitchedPtr& src_ptr,
+                  const hipExtent extent, hipMemcpyKind kind) {
+    hipMemcpy3DParms params = CreateMemcpy3DParam(dst_ptr, make_hipPos(0, 0, 0),
+                                                  src_ptr, make_hipPos(0, 0, 0),
+                                                  extent, kind);
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpy3D(&params));
+    }
+  }
+};
+
+static void RunBenchmark(const hipExtent extent, hipMemcpyKind kind, bool enable_peer_access=false) {
+  Memcpy3DBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(extent.width) + ", " + std::to_string(extent.height)
+                           + ", " + std::to_string(extent.depth) + ")");
+
+  if (kind == hipMemcpyDeviceToHost) {
+    LinearAllocGuard3D<int> device_allocation(extent);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.width() * 
+                                          device_allocation.height() * device_allocation.depth());
+    benchmark.Run(make_hipPitchedPtr(host_allocation.ptr(), device_allocation.width(), 
+                                     device_allocation.width(), device_allocation.height()),
+                  device_allocation.pitched_ptr(), device_allocation.extent(), kind);
+  } else if (kind == hipMemcpyHostToDevice) {
+    LinearAllocGuard3D<int> device_allocation(extent);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.pitch() * 
+                                          device_allocation.height() * device_allocation.depth());
+    benchmark.Run(device_allocation.pitched_ptr(),
+                  make_hipPitchedPtr(host_allocation.ptr(), device_allocation.pitch(),
+                                     device_allocation.width(), device_allocation.height()),
+                  device_allocation.extent(), kind);
+  } else if (kind == hipMemcpyHostToHost) {
+    LinearAllocGuard3D<int> device_allocation(extent);
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipHostMalloc, extent.width * 
+                                         extent.height * extent.depth);
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipHostMalloc, extent.width * 
+                                         extent.height * extent.depth);
+    benchmark.Run(make_hipPitchedPtr(dst_allocation.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPitchedPtr(src_allocation.ptr(), extent.width, extent.width, extent.height),
+                  extent, kind);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard3D<int> src_allocation(extent);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard3D<int> dst_allocation(extent);
+
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.pitched_ptr(), src_allocation.pitched_ptr(),
+                  dst_allocation.extent(), kind);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3D` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3D_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3D` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3D_HostToDevice") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3D` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3D_HostToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3D` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3D_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3D` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3D.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3D_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,192 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memcpy3DAsyncBenchmark : public Benchmark<Memcpy3DAsyncBenchmark> {
+ public:
+  void operator()(const hipPitchedPtr& dst_ptr, const hipPitchedPtr& src_ptr,
+                  const hipExtent extent, hipMemcpyKind kind, const hipStream_t& stream) {
+    hipMemcpy3DParms params = CreateMemcpy3DParam(dst_ptr, make_hipPos(0, 0, 0),
+                                                  src_ptr, make_hipPos(0, 0, 0),
+                                                  extent, kind);
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpy3DAsync(&params, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(const hipExtent extent, hipMemcpyKind kind, bool enable_peer_access=false) {
+  Memcpy3DAsyncBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(extent.width) + ", " + std::to_string(extent.height)
+                           + ", " + std::to_string(extent.depth) + ")");
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+
+  if (kind == hipMemcpyDeviceToHost) {
+    LinearAllocGuard3D<int> device_allocation(extent);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.width() * 
+                                          device_allocation.height() * device_allocation.depth());
+    benchmark.Run(make_hipPitchedPtr(host_allocation.ptr(), device_allocation.width(),
+                                     device_allocation.width(), device_allocation.height()),
+                  device_allocation.pitched_ptr(), device_allocation.extent(), kind, stream);
+  } else if (kind == hipMemcpyHostToDevice) {
+    LinearAllocGuard3D<int> device_allocation(extent);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.pitch() * 
+                                          device_allocation.height() * device_allocation.depth());
+    benchmark.Run(device_allocation.pitched_ptr(),
+                  make_hipPitchedPtr(host_allocation.ptr(),
+                                     device_allocation.pitch(),
+                                     device_allocation.width(),
+                                     device_allocation.height()),
+                  device_allocation.extent(), kind, stream);
+  } else if (kind == hipMemcpyHostToHost) {
+    LinearAllocGuard3D<int> device_allocation(extent);
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipHostMalloc, extent.width * 
+                                         extent.height * extent.depth);
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipHostMalloc, extent.width * 
+                                         extent.height * extent.depth);
+    benchmark.Run(make_hipPitchedPtr(dst_allocation.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPitchedPtr(src_allocation.ptr(), extent.width, extent.width, extent.height),
+                  extent, kind, stream);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard3D<int> src_allocation(extent);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard3D<int> dst_allocation(extent);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.pitched_ptr(), src_allocation.pitched_ptr(),
+                  dst_allocation.extent(), kind, stream);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3DAsync` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3DAsync_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3DAsync` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3DAsync_HostToDevice") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3DAsync` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3DAsync_HostToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3DAsync` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3DAsync_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy3DAsync` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpy3DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpy3DAsync_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(make_hipExtent(width, 16, 4), hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,192 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyAsyncBenchmark : public Benchmark<MemcpyAsyncBenchmark> {
+ public:
+  void operator()(void* dst, const void* src, size_t size, hipMemcpyKind kind, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpyAsync(dst, src, size, kind, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(LinearAllocs dst_allocation_type, LinearAllocs src_allocation_type,
+                         size_t size, hipMemcpyKind kind, bool enable_peer_access=false) {
+  MemcpyAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(src_allocation_type));
+  benchmark.AddSectionName(GetAllocationSectionName(dst_allocation_type));
+
+  const StreamGuard stream_guard{Streams::created};
+  const hipStream_t stream = stream_guard.stream();
+  if (kind != hipMemcpyDeviceToDevice) {
+    LinearAllocGuard<int> src_allocation(src_allocation_type, size);
+    LinearAllocGuard<int> dst_allocation(dst_allocation_type, size);
+    benchmark.Run(dst_allocation.ptr(), src_allocation.ptr(), size, kind, stream);
+  } else {
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard<int> src_allocation(src_allocation_type, size);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard<int> dst_allocation(dst_allocation_type, size);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.ptr(), src_allocation.ptr(), size, kind, stream);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyAsync` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyAsync_DeviceToHost") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyAsync` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyAsync_HostToDevice") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyAsync` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyAsync_HostToHost") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  const auto dst_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyAsync` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyAsync_DeviceToDevice_DisablePeerAccess") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyAsync` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyAsync_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,69 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyAtoHBenchmark : public Benchmark<MemcpyAtoHBenchmark> {
+ public:
+  void operator()(void* dst, hipArray_t src_array, size_t allocation_size) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyAtoH(dst, src_array, 0, allocation_size));
+    }
+  }
+};
+
+static void RunBenchmark(LinearAllocs host_allocation_type, size_t width) {
+  MemcpyAtoHBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(width));
+  benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
+
+  size_t allocation_size = width * sizeof(int);
+  LinearAllocGuard<int> host_allocation(host_allocation_type, allocation_size);
+  ArrayAllocGuard<int> array_allocation(make_hipExtent(width, 0, 0), hipArrayDefault);
+  benchmark.Run(host_allocation.ptr(), array_allocation.ptr(), allocation_size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyAtoH` from Device array to Host:
+ *    -# Allocation size
+ *      - Small: 512 B
+ *      - Medium: 1024 B
+ *      - Large: 4096 B
+ *    -# Allocation type
+ *      - Host: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyAtoH.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyAtoH") {
+  const auto allocation_size = GENERATE(512, 1024, 4096);
+  const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(host_allocation_type, allocation_size);
+}
@@ -0,0 +1,103 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyDtoDBenchmark : public Benchmark<MemcpyDtoDBenchmark> {
+ public:
+  void operator()(hipDeviceptr_t& dst, const hipDeviceptr_t& src, size_t size) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyDtoD(dst, src, size));
+    }
+  }
+};
+
+static void RunBenchmark(size_t size, bool enable_peer_access=false) {
+  MemcpyDtoDBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+
+  int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+  int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+  LinearAllocGuard<int> src_allocation(LinearAllocs::hipMalloc, size);
+  HIP_CHECK(hipSetDevice(dst_device));
+  LinearAllocGuard<int> dst_allocation(LinearAllocs::hipMalloc, size);
+  HIP_CHECK(hipSetDevice(src_device));
+
+  benchmark.Run(reinterpret_cast<hipDeviceptr_t>(dst_allocation.ptr()),
+                reinterpret_cast<hipDeviceptr_t>(src_allocation.ptr()), size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyDtoD` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyDtoD.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyDtoD_PeerAccessEnabled") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(allocation_size, true);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyDtoD` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyDtoD.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyDtoD_PeerAccessDisabled") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(allocation_size);
+}
@@ -0,0 +1,106 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyDtoDAsyncBenchmark : public Benchmark<MemcpyDtoDAsyncBenchmark> {
+ public:
+  void operator()(hipDeviceptr_t& dst, const hipDeviceptr_t& src, size_t size, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpyDtoDAsync(dst, src, size, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(size_t size, bool enable_peer_access=false) {
+  MemcpyDtoDAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+  int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+  int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+
+  LinearAllocGuard<int> src_allocation(LinearAllocs::hipMalloc, size);
+  HIP_CHECK(hipSetDevice(dst_device));
+  LinearAllocGuard<int> dst_allocation(LinearAllocs::hipMalloc, size);
+  HIP_CHECK(hipSetDevice(src_device));
+  benchmark.Run(reinterpret_cast<hipDeviceptr_t>(dst_allocation.ptr()),
+                reinterpret_cast<hipDeviceptr_t>(src_allocation.ptr()),
+                size, stream);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyDtoDAsync` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyDtoDAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyDtoDAsync_PeerAccessEnabled") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(allocation_size, true);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyDtoD` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyDtoDAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyDtoDAsync_PeerAccessDisabled") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(allocation_size);
+}
@@ -0,0 +1,72 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyDtoHBenchmark : public Benchmark<MemcpyDtoHBenchmark> {
+ public:
+  void operator()(void* dst, const hipDeviceptr_t& src, size_t size) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyDtoH(dst, src, size));
+    }
+  }
+};
+
+static void RunBenchmark(LinearAllocs host_allocation_type, LinearAllocs device_allocation_type, size_t size) {
+  MemcpyDtoHBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
+
+  LinearAllocGuard<int> device_allocation(device_allocation_type, size);
+  LinearAllocGuard<int> host_allocation(host_allocation_type, size);
+  benchmark.Run(host_allocation.ptr(),
+                reinterpret_cast<hipDeviceptr_t>(device_allocation.ptr()),
+                size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyDtoH` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyDtoH.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyDtoH") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto device_allocation_type = LinearAllocs::hipMalloc;
+  const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(host_allocation_type, device_allocation_type, allocation_size);
+}
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyDtoHAsyncBenchmark : public Benchmark<MemcpyDtoHAsyncBenchmark> {
+ public:
+  void operator()(void* dst, const hipDeviceptr_t& src, size_t size, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpyDtoHAsync(dst, src, size, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(LinearAllocs host_allocation_type, LinearAllocs device_allocation_type, size_t size) {
+  MemcpyDtoHAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+  LinearAllocGuard<int> device_allocation(device_allocation_type, size);
+  LinearAllocGuard<int> host_allocation(host_allocation_type, size);
+  benchmark.Run(host_allocation.ptr(),
+                reinterpret_cast<hipDeviceptr_t>(device_allocation.ptr()),
+                size, stream);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyDtoHAsync` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyDtoHAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyDtoHAsync") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto device_allocation_type = LinearAllocs::hipMalloc;
+  const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(host_allocation_type, device_allocation_type, allocation_size);
+}
@@ -0,0 +1,116 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+#pragma clang diagnostic ignored "-Wvla-extension"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+__device__ int devSymbol[1_MB];
+
+class MemcpyFromSymbolBenchmark : public Benchmark<MemcpyFromSymbolBenchmark> {
+ public:
+  void operator()(const void* source, void* result, size_t size, size_t offset) {
+    HIP_CHECK(hipMemcpyToSymbol(HIP_SYMBOL(devSymbol), source, size, offset));
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyFromSymbol(result, HIP_SYMBOL(devSymbol), size, offset));
+    }
+  }
+};
+
+static void RunBenchmark(const void* source, void* result, size_t size=1, size_t offset=0) {
+  MemcpyFromSymbolBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(std::to_string(offset));
+  benchmark.Run(source, result, size, offset);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyFromSymbol` from Device to Host.
+ *  - Utilizes sigular integer values.
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyFromSymbol.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyFromSymbol_SingularValue") {
+  int set{42};
+  int result{0};
+  RunBenchmark(&set, &result);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyFromSymbol` from Device to Host.
+ *  - Utilizes array integers:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 512 KB
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyFromSymbol.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyFromSymbol_ArrayValue") {
+  size_t size = GENERATE(1_KB, 4_KB, 512_KB);
+  std::vector<int> array(size);
+  std::fill_n(array.data(), size, 42);
+  std::vector<int> result(size);
+  std::fill_n(result.data(), size, 0);
+
+  RunBenchmark(array.data(), result.data(), sizeof(int) * size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyFromSymbol` from Device to Host.
+ *  - Utilizes array integers with offsets:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 512 KB
+ *  - Offset: 0 and size/2
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyFromSymbol.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyFromSymbol_WithOffset") {
+  size_t size = GENERATE(1_KB, 4_KB, 512_KB);
+  std::vector<int> array(size);
+  std::fill_n(array.data(), size, 42);
+  std::vector<int> result(size);
+  std::fill_n(result.data(), size, 0);
+
+  size_t offset = GENERATE_REF(0, size / 2);
+  RunBenchmark(array.data() + offset, result.data() + offset, sizeof(int) * (size - offset), offset * sizeof(int));
+}
@@ -0,0 +1,122 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+#pragma clang diagnostic ignored "-Wvla-extension"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+__device__ int devSymbol[1_MB];
+
+class MemcpyFromSymbolAsyncBenchmark : public Benchmark<MemcpyFromSymbolAsyncBenchmark> {
+ public:
+  void operator()(const void* source, void* result, size_t size, size_t offset, const hipStream_t& stream) {
+    HIP_CHECK(hipMemcpyToSymbolAsync(HIP_SYMBOL(devSymbol), source, size, offset,
+                                     hipMemcpyHostToDevice, stream));
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpyFromSymbolAsync(result, HIP_SYMBOL(devSymbol), size, offset,
+                                         hipMemcpyDeviceToHost, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(const void* source, void* result, size_t size=1, size_t offset=0) {
+  MemcpyFromSymbolAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(std::to_string(offset));
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+  benchmark.Run(source, result, size, offset, stream);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyFromSymbolAsync` from Device to Host.
+ *  - Utilizes sigular integer values.
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyFromSymbolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyFromSymbolAsync_SingularValue") {
+  int set{42};
+  int result{0};
+  RunBenchmark(&set, &result);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyFromSymbolAsync` from Device to Host.
+ *  - Utilizes array integers:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 512 KB
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyFromSymbolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyFromSymbolAsync_ArrayValue") {
+  size_t size = GENERATE(1_KB, 4_KB, 512_KB);
+  std::vector<int> array(size);
+  std::fill_n(array.data(), size, 42);
+  std::vector<int> result(size);
+  std::fill_n(result.data(), size, 0);
+
+  RunBenchmark(array.data(), result.data(), sizeof(int) * size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyFromSymbolAsync` from Device to Host.
+ *  - Utilizes array integers with offsets:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 512 KB
+ *  - Offset: 0 and size/2
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyFromSymbolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyFromSymbolAsync_WithOffset") {
+  size_t size = GENERATE(1_KB, 4_KB, 512_KB);
+  std::vector<int> array(size);
+  std::fill_n(array.data(), size, 42);
+  std::vector<int> result(size);
+  std::fill_n(result.data(), size, 0);
+
+  size_t offset = GENERATE_REF(0, size / 2);
+  RunBenchmark(array.data() + offset, result.data() + offset, sizeof(int) * (size - offset), offset * sizeof(int));
+}
@@ -0,0 +1,69 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyHtoABenchmark : public Benchmark<MemcpyHtoABenchmark> {
+ public:
+  void operator()(hipArray_t dst_array, const void* src, size_t allocation_size) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyHtoA(dst_array, 0, src, allocation_size));
+    }
+  }
+};
+
+static void RunBenchmark(LinearAllocs host_allocation_type, size_t width) {
+  MemcpyHtoABenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(width));
+  benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
+
+  size_t allocation_size = width * sizeof(int);
+  ArrayAllocGuard<int> array_allocation(make_hipExtent(width, 0, 0), hipArrayDefault);
+  LinearAllocGuard<int> host_allocation(host_allocation_type, allocation_size);
+  benchmark.Run(array_allocation.ptr(), host_allocation.ptr(), allocation_size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyHtoA` from Host to Device array:
+ *    -# Allocation size
+ *      - Small: 512 B
+ *      - Medium: 1024 B
+ *      - Large: 4096 B
+ *    -# Allocation type
+ *      - Host: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyHtoA.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyHtoA") {
+  const auto allocation_size = GENERATE(512, 1024, 4096);
+  const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(host_allocation_type, allocation_size);
+}
@@ -0,0 +1,70 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyHtoDBenchmark : public Benchmark<MemcpyHtoDBenchmark> {
+ public:
+  void operator()(hipDeviceptr_t& dst, void* src, size_t size) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyHtoD(dst, src, size));
+    }
+  }
+};
+
+static void RunBenchmark(LinearAllocs host_allocation_type, LinearAllocs device_allocation_type, size_t size) {
+  MemcpyHtoDBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
+
+  LinearAllocGuard<int> device_allocation(device_allocation_type, size);
+  LinearAllocGuard<int> host_allocation(host_allocation_type, size);
+  benchmark.Run(reinterpret_cast<hipDeviceptr_t>(device_allocation.ptr()), host_allocation.ptr(), size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyHtoD` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyHtoD.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyHtoD") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto device_allocation_type = LinearAllocs::hipMalloc;
+  const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(host_allocation_type, device_allocation_type, allocation_size);
+}
@@ -0,0 +1,74 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyHtoDAsyncBenchmark : public Benchmark<MemcpyHtoDAsyncBenchmark> {
+ public:
+  void operator()(hipDeviceptr_t& dst, void* src, size_t size, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpyHtoDAsync(dst, src, size, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(LinearAllocs host_allocation_type, LinearAllocs device_allocation_type, size_t size) {
+  MemcpyHtoDAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+  LinearAllocGuard<int> device_allocation(device_allocation_type, size);
+  LinearAllocGuard<int> host_allocation(host_allocation_type, size);
+  benchmark.Run(reinterpret_cast<hipDeviceptr_t>(device_allocation.ptr()),
+                host_allocation.ptr(), size, stream);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyHtoD` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyHtoDAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyHtoDAsync") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto device_allocation_type = LinearAllocs::hipMalloc;
+  const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(host_allocation_type, device_allocation_type, allocation_size);
+}
@@ -0,0 +1,188 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyParam2DBenchmark : public Benchmark<MemcpyParam2DBenchmark> {
+ public:
+  void operator()(void* dst, size_t dst_pitch, void* src, size_t src_pitch,
+                  size_t width, size_t height, hipMemcpyKind kind) {
+    hip_Memcpy2D params = CreateMemcpy2DParam(dst, dst_pitch, src, src_pitch,
+                                              width, height, kind);
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyParam2D(&params));
+    }
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind,
+                         bool enable_peer_access=false) {
+  MemcpyParam2DBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  if (kind == hipMemcpyDeviceToHost) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.width() * height);
+    benchmark.Run(host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.ptr(), device_allocation.pitch(),
+                  device_allocation.width(), device_allocation.height(), kind);
+  } else if (kind == hipMemcpyHostToDevice) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.width() * height);
+    benchmark.Run(device_allocation.ptr(), device_allocation.pitch(),
+                  host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.width(), device_allocation.height(), kind);
+  } else if (kind == hipMemcpyHostToHost) {
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    benchmark.Run(dst_allocation.ptr(), width * sizeof(int),
+                  src_allocation.ptr(), width * sizeof(int),
+                  width * sizeof(int), height, kind);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard2D<int> src_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard2D<int> dst_allocation(width, height);
+    HIP_CHECK(hipSetDevice(src_device));
+
+    benchmark.Run(dst_allocation.ptr(), dst_allocation.pitch(),
+                  src_allocation.ptr(), src_allocation.pitch(),
+                  dst_allocation.width(), dst_allocation.height(),
+                  kind);
+  }
+}
+
+#if HT_NVIDIA
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2D` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2D_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToHost);
+}
+#endif
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2D` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2D_HostToDevice") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToDevice);
+}
+
+#if HT_NVIDIA
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2D` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2D_HostToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToHost);
+}
+#endif
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2D` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2D_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2D` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2D.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2D_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,193 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyParam2DBenchmark : public Benchmark<MemcpyParam2DBenchmark> {
+ public:
+  void operator()(void* dst, size_t dst_pitch, void* src, size_t src_pitch,
+                  size_t width, size_t height, hipMemcpyKind kind, const hipStream_t& stream) {
+    hip_Memcpy2D params = CreateMemcpy2DParam(dst, dst_pitch, src, src_pitch,
+                                              width, height, kind);
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyParam2DAsync(&params, stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(size_t width, size_t height, hipMemcpyKind kind,
+                         bool enable_peer_access=false) {
+  MemcpyParam2DBenchmark benchmark;
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+
+  if (kind == hipMemcpyDeviceToHost) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.width() * height);
+    benchmark.Run(host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.ptr(), device_allocation.pitch(),
+                  device_allocation.width(), device_allocation.height(),
+                  kind, stream);
+  } else if (kind == hipMemcpyHostToDevice) {
+    LinearAllocGuard2D<int> device_allocation(width, height);
+    LinearAllocGuard<int> host_allocation(LinearAllocs::hipHostMalloc, device_allocation.width() * height);
+    benchmark.Run(device_allocation.ptr(), device_allocation.pitch(),
+                  host_allocation.ptr(), device_allocation.width(),
+                  device_allocation.width(), device_allocation.height(),
+                  kind, stream);
+  } else if (kind == hipMemcpyHostToHost) {
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipHostMalloc, width * sizeof(int) * height);
+    benchmark.Run(dst_allocation.ptr(), width * sizeof(int),
+                  src_allocation.ptr(), width * sizeof(int),
+                  width * sizeof(int), height, kind, stream);
+  } else {
+    // hipMemcpyDeviceToDevice
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard2D<int> src_allocation(width, height);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard2D<int> dst_allocation(width, height);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.ptr(), dst_allocation.pitch(),
+                  src_allocation.ptr(), src_allocation.pitch(),
+                  dst_allocation.width(), dst_allocation.height(),
+                  kind, stream);
+  }
+}
+
+#if HT_NVIDIA
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2DAsync` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2DAsync_DeviceToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToHost);
+}
+#endif
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2DAsync` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2DAsync_HostToDevice") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToDevice);
+}
+
+#if HT_NVIDIA
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2DAsync` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2DAsync_HostToHost") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyHostToHost);
+}
+#endif
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2DAsync` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2DAsync_DeviceToDevice_DisablePeerAccess") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyParam2DAsync` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyParam2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyParam2DAsync_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,109 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+#pragma clang diagnostic ignored "-Wvla-extension"
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+__device__ int devSymbol[1_MB];
+
+class MemcpyToSymbolBenchmark : public Benchmark<MemcpyToSymbolBenchmark> {
+ public:
+  void operator()(const void* source, size_t size, size_t offset) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyToSymbol(HIP_SYMBOL(devSymbol), source, size, offset));
+    }
+  }
+};
+
+static void RunBenchmark(const void* source, size_t size=1, size_t offset=0) {
+  MemcpyToSymbolBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(std::to_string(offset));
+  benchmark.Run(source, size, offset);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyToSymbol` from Host to Device.
+ *  - Utilizes sigular integer values.
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyToSymbol.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyToSymbol_SingularValue") {
+  int set{42};
+  RunBenchmark(&set);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyToSymbol` from Host to Device.
+ *  - Utilizes array integers:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 1 MB
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyToSymbol.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyToSymbol_ArrayValue") {
+  size_t size = GENERATE(1_KB, 4_KB, 1_MB);
+  std::vector<int> array(size);
+  std::fill_n(array.data(), size, 42);
+
+  RunBenchmark(array.data(), sizeof(int) * size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyToSymbol` from Host to Device.
+ *  - Utilizes array integers with offsets:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 1 MB
+ *  - Offset: 0 and size/2
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyToSymbol.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyToSymbol_WithOffset") {
+  size_t size = GENERATE(1_KB, 4_KB, 1_MB);
+  std::vector<int> array(size);
+  std::fill_n(array.data(), size, 42);
+
+  size_t offset = GENERATE_REF(0, size / 2);
+  RunBenchmark(array.data() + offset, sizeof(int) * (size - offset), offset * sizeof(int));
+}
@@ -0,0 +1,116 @@
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+#pragma clang diagnostic ignored "-Wvla-extension"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+__device__ int devSymbol[1_MB];
+
+class MemcpyToSymbolAsyncBenchmark : public Benchmark<MemcpyToSymbolAsyncBenchmark> {
+ public:
+  void operator()(const void* source, size_t size, size_t offset, const hipStream_t& stream) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMemcpyToSymbolAsync(HIP_SYMBOL(devSymbol), source, size, offset,
+                                       hipMemcpyHostToDevice, stream));
+    }
+
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+};
+
+static void RunBenchmark(const void* source, size_t size=1, size_t offset=0) {
+  MemcpyToSymbolAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(std::to_string(offset));
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+  benchmark.Run(source, size, offset, stream);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyToSymbolAsync` from Host to Device.
+ *  - Utilizes sigular integer values.
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyToSymbolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyToSymbolAsync_SingularValue") {
+  int set{42};
+  RunBenchmark(&set);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyToSymbolAsync` from Host to Device.
+ *  - Utilizes array integers:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 1 MB
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyToSymbolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyToSymbolAsync_ArrayValue") {
+  size_t size = GENERATE(1_KB, 4_KB, 1_MB);
+  std::vector<int> array(size);
+  std::fill_n(array.data(), size, 42);
+
+  RunBenchmark(array.data(), sizeof(int) * size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyToSymbolAsync` from Host to Device.
+ *  - Utilizes array integers with offsets:
+ *    - Small: 1 KB
+ *    - Medium: 4 KB
+ *    - Large: 1 MB
+ *  - Offset: 0 and size/2
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyToSymbolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyToSymbolAsync_WithOffset") {
+  size_t size = GENERATE(1_KB, 4_KB, 1_MB);
+  std::vector<int> array(size);
+  std::fill_n(array.data(), size, 42);
+
+  size_t offset = GENERATE_REF(0, size / 2);
+  RunBenchmark(array.data() + offset, sizeof(int) * (size - offset), offset * sizeof(int));
+}
@@ -0,0 +1,192 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "memcpy_performance_common.hh"
+
+/**
+ * @addtogroup memcpy memcpy
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemcpyWithStreamBenchmark : public Benchmark<MemcpyWithStreamBenchmark> {
+ public:
+  void operator()(void* dst, const void* src, size_t size, hipMemcpyKind kind, hipStream_t stream) {
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemcpyWithStream(dst, src, size, kind, stream));
+    }
+  }
+};
+
+static void RunBenchmark(LinearAllocs dst_allocation_type, LinearAllocs src_allocation_type,
+                         size_t size, hipMemcpyKind kind, bool enable_peer_access=false) {
+  MemcpyWithStreamBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(src_allocation_type));
+  benchmark.AddSectionName(GetAllocationSectionName(dst_allocation_type));
+
+  const StreamGuard stream_guard(Streams::created);
+  const hipStream_t stream = stream_guard.stream();
+
+  if (kind != hipMemcpyDeviceToDevice) {
+    LinearAllocGuard<int> src_allocation(src_allocation_type, size);
+    LinearAllocGuard<int> dst_allocation(dst_allocation_type, size);
+    benchmark.Run(dst_allocation.ptr(), src_allocation.ptr(), size, kind, stream);
+  } else {
+    int src_device = std::get<0>(GetDeviceIds(enable_peer_access));
+    int dst_device = std::get<1>(GetDeviceIds(enable_peer_access));
+  
+    LinearAllocGuard<int> src_allocation(LinearAllocs::hipMalloc, size);
+    HIP_CHECK(hipSetDevice(dst_device));
+    LinearAllocGuard<int> dst_allocation(LinearAllocs::hipMalloc, size);
+    HIP_CHECK(hipSetDevice(src_device));
+    benchmark.Run(dst_allocation.ptr(), src_allocation.ptr(), size, kind, stream);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyWithStream` from Device to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyWithStream.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyWithStream_DeviceToHost") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyWithStream` from Host to Device:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyWithStream.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyWithStream_HostToDevice") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyHostToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyWithStream` from Host to Host:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: host pinned and pageable
+ *      - Destination: host pinned and pageable
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyWithStream.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyWithStream_HostToHost") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  const auto dst_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyHostToHost);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpy` from Device to Device with peer access disabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyWithStream.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyWithStream_DeviceToDevice_DisablePeerAccess") {
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToDevice);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemcpyWithStream` from Device to Device with peer access enabled:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - Source: device malloc
+ *      - Destination: device malloc
+ * Test source
+ * ------------------------
+ *  - performance/memcpy/hipMemcpyWithStream.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - Device supports Peer-to-Peer access
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemcpyWithStream_DeviceToDevice_EnablePeerAccess") {
+  if (HipTest::getDeviceCount() < 2) {
+    HipTest::HIP_SKIP_TEST("This test requires 2 GPUs. Skipping.");
+    return;
+  }
+  const auto allocation_size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto src_allocation_type = LinearAllocs::hipMalloc;
+  const auto dst_allocation_type = LinearAllocs::hipMalloc;
+  RunBenchmark(dst_allocation_type, src_allocation_type, allocation_size, hipMemcpyDeviceToDevice, true);
+}
@@ -0,0 +1,105 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <performance_common.hh>
+
+static hip_Memcpy2D CreateMemcpy2DParam(void* dst, size_t dpitch, void* src, size_t spitch,
+                                        size_t width, size_t height, hipMemcpyKind kind) {
+  hip_Memcpy2D params = {};
+  memset(&params, 0, sizeof(hip_Memcpy2D));
+
+  const hipExtent src_offset = {0, 0, 0};
+  const hipExtent dst_offset = {0, 0, 0};
+
+  params.dstPitch = dpitch;
+  switch (kind) {
+    case hipMemcpyDeviceToHost:
+    case hipMemcpyHostToHost:
+        params.dstMemoryType = hipMemoryTypeHost;
+      params.dstHost = dst;
+      break;
+    case hipMemcpyDeviceToDevice:
+    case hipMemcpyHostToDevice:
+        params.dstMemoryType = hipMemoryTypeDevice;
+      params.dstDevice = reinterpret_cast<hipDeviceptr_t>(dst);
+      break;
+    default:
+      REQUIRE(false);
+  }
+
+  params.srcPitch = dpitch;
+  switch (kind) {
+    case hipMemcpyDeviceToHost:
+    case hipMemcpyHostToHost:
+        params.srcMemoryType = hipMemoryTypeHost;
+      params.srcHost = src;
+      break;
+    case hipMemcpyDeviceToDevice:
+    case hipMemcpyHostToDevice:
+        params.srcMemoryType = hipMemoryTypeDevice;
+      params.srcDevice = reinterpret_cast<hipDeviceptr_t>(src);
+      break;
+    default:
+      REQUIRE(false);
+  }
+
+  params.WidthInBytes = width;
+  params.Height = height;
+  params.srcXInBytes = src_offset.width;
+  params.srcY = src_offset.height;
+  params.dstXInBytes = dst_offset.width;
+  params.dstY = dst_offset.height;
+
+  return params;
+}
+
+static hipMemcpy3DParms CreateMemcpy3DParam(hipPitchedPtr dst_ptr, hipPos dst_pos,
+                                            hipPitchedPtr src_ptr, hipPos src_pos,
+                                            hipExtent extent, hipMemcpyKind kind) {
+  hipMemcpy3DParms params = {};
+  memset(&params, 0, sizeof(hipMemcpy3DParms));
+  params.dstPtr = dst_ptr;
+  params.dstPos = dst_pos;
+  params.srcPtr = src_ptr;
+  params.srcPos = src_pos;
+  params.extent = extent;
+  params.kind = kind;
+  return params;
+}
+
+static std::tuple<int, int> GetDeviceIds(bool enable_peer_access) {
+  int src_device = 0;
+  int dst_device = 1;
+
+  if (enable_peer_access) {
+    int can_access_peer = 0;
+    HIP_CHECK(hipDeviceCanAccessPeer(&can_access_peer, src_device, dst_device));
+    if (!can_access_peer) {
+      INFO("Peer access cannot be enabled between devices " << src_device << " and " << dst_device);
+      REQUIRE(can_access_peer);
+    }
+    HIP_CHECK(hipDeviceEnablePeerAccess(dst_device, 0));
+  } else {
+    dst_device = 0;
+  }
+
+  return {src_device, dst_device};
+}
@@ -0,0 +1,39 @@
+# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+set(TEST_SRC
+    hipMemset.cc
+    hipMemsetAsync.cc
+    hipMemsetD8.cc
+    hipMemsetD8Async.cc
+    hipMemsetD16.cc
+    hipMemsetD16Async.cc
+    hipMemsetD32.cc
+    hipMemsetD32Async.cc
+    hipMemset2D.cc
+    hipMemset2DAsync.cc
+    hipMemset3D.cc
+    hipMemset3DAsync.cc
+)
+
+hip_add_exe_to_target(NAME MemsetPerformance
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests
+                      COMPILE_OPTIONS -std=c++17)
@@ -0,0 +1,79 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup memset memset
+ * @{
+ * @ingroup PerformanceTest
+ * Contains performance tests for all memset HIP APIs.
+ */
+
+class MemsetBenchmark : public Benchmark<MemsetBenchmark> {
+ public:
+  MemsetBenchmark(LinearAllocs allocation_type, size_t size)
+      : dst_(allocation_type, size), size_(size) {}
+
+  void operator()() {
+    TIMED_SECTION(kTimerTypeEvent) { HIP_CHECK(hipMemset(dst_.ptr(), 17, size_)); }
+  }
+
+ private:
+  LinearAllocGuard<void> dst_;
+  const size_t size_;
+};
+
+static void RunBenchmark(LinearAllocs allocation_type, size_t size) {
+  MemsetBenchmark benchmark(allocation_type, size);
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(allocation_type));
+  benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemset`:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - device
+ *      - host
+ *      - managed
+ * Test source
+ * ------------------------
+ *  - performance/memset/hipMemset.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemset") {
+  const auto size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto allocation_type = GENERATE(LinearAllocs::hipMalloc, LinearAllocs::hipHostMalloc,
+                                        LinearAllocs::hipMallocManaged);
+  RunBenchmark(allocation_type, size);
+}
@@ -0,0 +1,71 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup memset memset
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memset2DBenchmark : public Benchmark<Memset2DBenchmark> {
+ public:
+  Memset2DBenchmark(size_t width, size_t height) : dst_(width, height) {}
+
+  void operator()() {
+    TIMED_SECTION(kTimerTypeEvent) {
+      HIP_CHECK(hipMemset2D(dst_.ptr(), dst_.pitch(), 17, dst_.width(), dst_.height()));
+    }
+  }
+
+ private:
+  LinearAllocGuard2D<char> dst_;
+};
+
+static void RunBenchmark(size_t width, size_t height) {
+  Memset2DBenchmark benchmark(width, height);
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+  benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemset2D`:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memset/hipMemset2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemset2D") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32);
+}
@@ -0,0 +1,74 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup memset memset
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memset2DAsyncBenchmark : public Benchmark<Memset2DAsyncBenchmark> {
+ public:
+  Memset2DAsyncBenchmark(size_t width, size_t height)
+      : dst_(width, height), stream_(Streams::created) {}
+
+  void operator()(size_t width, size_t height) {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream_.stream()) {
+      HIP_CHECK(hipMemset2DAsync(dst_.ptr(), dst_.pitch(), 17, dst_.width(), dst_.height(),
+                                 stream_.stream()));
+    }
+  }
+
+ private:
+  LinearAllocGuard2D<char> dst_;
+  StreamGuard stream_;
+};
+
+static void RunBenchmark(size_t width, size_t height) {
+  Memset2DAsyncBenchmark benchmark(width, height);
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ")");
+  benchmark.Run(width, height);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemset2DAsync`:
+ *    -# Allocation size
+ *      - Small: 4 KB x 32 B
+ *      - Medium: 4 MB x 32 B
+ *      - Large: 16 MB x 32 B
+ * Test source
+ * ------------------------
+ *  - performance/memset/hipMemset2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemset2DAsync") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 32);
+}
@@ -0,0 +1,72 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup memset memset
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memset3DBenchmark : public Benchmark<Memset3DBenchmark> {
+ public:
+  Memset3DBenchmark(size_t width, size_t height, size_t depth) : dst_(width, height, depth) {}
+
+  void operator()() {
+    TIMED_SECTION(kTimerTypeEvent) {
+      HIP_CHECK(hipMemset3D(dst_.pitched_ptr(), 17, dst_.extent()));
+    }
+  }
+
+ private:
+  LinearAllocGuard3D<char> dst_;
+};
+
+static void RunBenchmark(size_t width, size_t height, size_t depth) {
+  Memset3DBenchmark benchmark(width, height, depth);
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ", " +
+                           std::to_string(depth) + ")");
+  benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemset3D`:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memset/hipMemset3D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemset3D") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 16, 4);
+}
@@ -0,0 +1,74 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup memset memset
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class Memset3DAsyncBenchmark : public Benchmark<Memset3DAsyncBenchmark> {
+ public:
+  Memset3DAsyncBenchmark(size_t width, size_t height, size_t depth)
+      : dst_(width, height, depth), stream_(Streams::created) {}
+
+  void operator()() {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream_.stream()) {
+      HIP_CHECK(hipMemset3DAsync(dst_.pitched_ptr(), 17, dst_.extent(), stream_.stream()));
+    }
+  }
+
+ private:
+  LinearAllocGuard3D<char> dst_;
+  StreamGuard stream_;
+};
+
+static void RunBenchmark(size_t width, size_t height, size_t depth) {
+  Memset3DAsyncBenchmark benchmark(width, height, depth);
+  benchmark.AddSectionName("(" + std::to_string(width) + ", " + std::to_string(height) + ", " +
+                           std::to_string(depth) + ")");
+  benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemset3DAsync`:
+ *    -# Allocation size
+ *      - Small: 4 KB x 16 B x 4 B
+ *      - Medium: 4 MB x 16 B x 4 B
+ *      - Large: 16 MB x 16 B x 4 B
+ * Test source
+ * ------------------------
+ *  - performance/memset/hipMemset3DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemset3DAsync") {
+  const auto width = GENERATE(4_KB, 4_MB, 16_MB);
+  RunBenchmark(width, 16, 4);
+}
@@ -0,0 +1,81 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup memset memset
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemsetAsyncBenchmark : public Benchmark<MemsetAsyncBenchmark> {
+ public:
+  MemsetAsyncBenchmark(LinearAllocs allocation_type, size_t size)
+      : dst_(allocation_type, size), size_(size), stream_(Streams::created) {}
+
+  void operator()() {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream_.stream()) {
+      HIP_CHECK(hipMemsetAsync(dst_.ptr(), 17, size_, stream_.stream()));
+    }
+  }
+
+ private:
+  LinearAllocGuard<void> dst_;
+  const size_t size_;
+  StreamGuard stream_;
+};
+
+static void RunBenchmark(LinearAllocs allocation_type, size_t size) {
+  MemsetAsyncBenchmark benchmark(allocation_type, size);
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(allocation_type));
+  benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemsetAsync`:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - device
+ *      - host
+ *      - managed
+ * Test source
+ * ------------------------
+ *  - performance/memset/hipMemsetAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemsetAsync") {
+  const auto size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto allocation_type = GENERATE(LinearAllocs::hipMalloc, LinearAllocs::hipHostMalloc,
+                                        LinearAllocs::hipMallocManaged);
+  RunBenchmark(allocation_type, size);
+}
@@ -0,0 +1,80 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup memset memset
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemsetD16Benchmark : public Benchmark<MemsetD16Benchmark> {
+ public:
+  MemsetD16Benchmark(LinearAllocs allocation_type, size_t size)
+      : dst_(allocation_type, size), size_(size) {}
+
+  void operator()() {
+    TIMED_SECTION(kTimerTypeEvent) {
+      HIP_CHECK(hipMemsetD16(reinterpret_cast<hipDeviceptr_t>(dst_.ptr()), 311, size_));
+    }
+  }
+
+ private:
+  LinearAllocGuard<void> dst_;
+  const size_t size_;
+};
+
+static void RunBenchmark(LinearAllocs allocation_type, size_t size) {
+  MemsetD16Benchmark benchmark(allocation_type, size);
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(allocation_type));
+  benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemsetD16`:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - device
+ *      - host
+ *      - managed
+ * Test source
+ * ------------------------
+ *  - performance/memset/hipMemsetD16.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemsetD16") {
+  const auto size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto allocation_type = GENERATE(LinearAllocs::hipMalloc, LinearAllocs::hipHostMalloc,
+                                        LinearAllocs::hipMallocManaged);
+  RunBenchmark(allocation_type, size);
+}
@@ -0,0 +1,82 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup memset memset
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemsetD16AsyncBenchmark : public Benchmark<MemsetD16AsyncBenchmark> {
+ public:
+  MemsetD16AsyncBenchmark(LinearAllocs allocation_type, size_t size)
+      : dst_(allocation_type, size), size_(size), stream_(Streams::created) {}
+
+  void operator()() {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream_.stream()) {
+      HIP_CHECK(hipMemsetD16Async(reinterpret_cast<hipDeviceptr_t>(dst_.ptr()), 311, size_,
+                                  stream_.stream()));
+    }
+  }
+
+ private:
+  LinearAllocGuard<void> dst_;
+  const size_t size_;
+  StreamGuard stream_;
+};
+
+static void RunBenchmark(LinearAllocs allocation_type, size_t size) {
+  MemsetD16AsyncBenchmark benchmark(allocation_type, size);
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(allocation_type));
+  benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemsetD16Async`:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - device
+ *      - host
+ *      - managed
+ * Test source
+ * ------------------------
+ *  - performance/memset/hipMemsetD16Async.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemsetD16Async") {
+  const auto size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto allocation_type = GENERATE(LinearAllocs::hipMalloc, LinearAllocs::hipHostMalloc,
+                                        LinearAllocs::hipMallocManaged);
+  RunBenchmark(allocation_type, size);
+}
@@ -0,0 +1,80 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup memset memset
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemsetD32Benchmark : public Benchmark<MemsetD32Benchmark> {
+ public:
+  MemsetD32Benchmark(LinearAllocs allocation_type, size_t size)
+      : dst_(allocation_type, size), size_(size) {}
+
+  void operator()() {
+    TIMED_SECTION(kTimerTypeEvent) {
+      HIP_CHECK(hipMemsetD32(reinterpret_cast<hipDeviceptr_t>(dst_.ptr()), 123'456, size_));
+    }
+  }
+
+ private:
+  LinearAllocGuard<void> dst_;
+  const size_t size_;
+};
+
+static void RunBenchmark(LinearAllocs allocation_type, size_t size) {
+  MemsetD32Benchmark benchmark(allocation_type, size);
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(allocation_type));
+  benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemsetD32`:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - device
+ *      - host
+ *      - managed
+ * Test source
+ * ------------------------
+ *  - performance/memset/hipMemsetD32.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemsetD32") {
+  const auto size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto allocation_type = GENERATE(LinearAllocs::hipMalloc, LinearAllocs::hipHostMalloc,
+                                        LinearAllocs::hipMallocManaged);
+  RunBenchmark(allocation_type, size);
+}
@@ -0,0 +1,82 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup memset memset
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemsetD32AsyncBenchmark : public Benchmark<MemsetD32AsyncBenchmark> {
+ public:
+  MemsetD32AsyncBenchmark(LinearAllocs allocation_type, size_t size)
+      : dst_(allocation_type, size), size_(size), stream_(Streams::created) {}
+
+  void operator()() {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream_.stream()) {
+      HIP_CHECK(hipMemsetD32Async(reinterpret_cast<hipDeviceptr_t>(dst_.ptr()), 123'456, size_,
+                                  stream_.stream()));
+    }
+  }
+
+ private:
+  LinearAllocGuard<void> dst_;
+  const size_t size_;
+  StreamGuard stream_;
+};
+
+static void RunBenchmark(LinearAllocs allocation_type, size_t size) {
+  MemsetD32AsyncBenchmark benchmark(allocation_type, size);
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(allocation_type));
+  benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemsetD32Async`:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - device
+ *      - host
+ *      - managed
+ * Test source
+ * ------------------------
+ *  - performance/memset/hipMemsetD32Async.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemsetD32Async") {
+  const auto size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto allocation_type = GENERATE(LinearAllocs::hipMalloc, LinearAllocs::hipHostMalloc,
+                                        LinearAllocs::hipMallocManaged);
+  RunBenchmark(allocation_type, size);
+}
@@ -0,0 +1,80 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup memset memset
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemsetD8Benchmark : public Benchmark<MemsetD8Benchmark> {
+ public:
+  MemsetD8Benchmark(LinearAllocs allocation_type, size_t size)
+      : dst_(allocation_type, size), size_(size) {}
+
+  void operator()() {
+    TIMED_SECTION(kTimerTypeEvent) {
+      HIP_CHECK(hipMemsetD8(reinterpret_cast<hipDeviceptr_t>(dst_.ptr()), 17, size_));
+    }
+  }
+
+ private:
+  LinearAllocGuard<void> dst_;
+  const size_t size_;
+};
+
+static void RunBenchmark(LinearAllocs allocation_type, size_t size) {
+  MemsetD8Benchmark benchmark(allocation_type, size);
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(allocation_type));
+  benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemsetD8`:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - device
+ *      - host
+ *      - managed
+ * Test source
+ * ------------------------
+ *  - performance/memset/hipMemsetD8.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemsetD8") {
+  const auto size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto allocation_type = GENERATE(LinearAllocs::hipMalloc, LinearAllocs::hipHostMalloc,
+                                        LinearAllocs::hipMallocManaged);
+  RunBenchmark(allocation_type, size);
+}
@@ -0,0 +1,82 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <performance_common.hh>
+#include <resource_guards.hh>
+
+/**
+ * @addtogroup memset memset
+ * @{
+ * @ingroup PerformanceTest
+ */
+
+class MemsetD8AsyncBenchmark : public Benchmark<MemsetD8AsyncBenchmark> {
+ public:
+  MemsetD8AsyncBenchmark(LinearAllocs allocation_type, size_t size)
+      : dst_(allocation_type, size), size_(size), stream_(Streams::created) {}
+
+  void operator()() {
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream_.stream()) {
+      HIP_CHECK(hipMemsetD8Async(reinterpret_cast<hipDeviceptr_t>(dst_.ptr()), 17, size_,
+                                 stream_.stream()));
+    }
+  }
+
+ private:
+  LinearAllocGuard<void> dst_;
+  const size_t size_;
+  StreamGuard stream_;
+};
+
+static void RunBenchmark(LinearAllocs allocation_type, size_t size) {
+  MemsetD8AsyncBenchmark benchmark(allocation_type, size);
+  benchmark.AddSectionName(std::to_string(size));
+  benchmark.AddSectionName(GetAllocationSectionName(allocation_type));
+  benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemsetD8Async`:
+ *    -# Allocation size
+ *      - Small: 4 KB
+ *      - Medium: 4 MB
+ *      - Large: 16 MB
+ *    -# Allocation type
+ *      - device
+ *      - host
+ *      - managed
+ * Test source
+ * ------------------------
+ *  - performance/memset/hipMemsetD8Async.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemsetD8Async") {
+  const auto size = GENERATE(4_KB, 4_MB, 16_MB);
+  const auto allocation_type = GENERATE(LinearAllocs::hipMalloc, LinearAllocs::hipHostMalloc,
+                                        LinearAllocs::hipMallocManaged);
+  RunBenchmark(allocation_type, size);
+}
@@ -35,9 +35,9 @@ add_subdirectory(multiThread)
 add_subdirectory(compiler)
 add_subdirectory(errorHandling)
 add_subdirectory(cooperativeGrps)
-add_subdirectory(warp)
 add_subdirectory(context)
 add_subdirectory(device_memory)
+add_subdirectory(warp)
 add_subdirectory(dynamicLoading)
 add_subdirectory(g++)
 add_subdirectory(module)
@@ -1,5 +1,7 @@
 # Common Tests - Test independent of all platforms
 set(TEST_SRC
+  thread_block.cc
+  thread_block_tile.cc
  hipCGThreadBlockType_old.cc
  hipCGMultiGridGroupType_old.cc
  hipCGGridGroupType_old.cc
@@ -31,6 +31,14 @@ constexpr size_t kWarpSize = 64;
 constexpr int kMaxGPUs = 8;
 }  // namespace

+constexpr int MaxGPUs = 8;
+
+inline bool operator==(const dim3& l, const dim3& r) {
+  return l.x == r.x && l.y == r.y && l.z == r.z;
+}
+
+inline bool operator!=(const dim3& l, const dim3& r) { return !(l == r); }
+
 __device__ inline unsigned int thread_rank_in_grid() {
  const auto block_size = blockDim.x * blockDim.y * blockDim.z;
  const auto block_rank_in_grid = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
@@ -67,4 +75,4 @@ template <class T> bool CheckDimensions(unsigned int device, T kernel, dim3 bloc
  }

  return true;
-}
+}
@@ -0,0 +1,350 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "cooperative_groups_common.hh"
+
+#include <cpu_grid.h>
+#include <optional>
+#include <resource_guards.hh>
+#include <utils.hh>
+
+#include <cmd_options.hh>
+
+/**
+ * @addtogroup thread_block thread_block
+ * @{
+ * @ingroup DeviceLanguageTest
+ * Contains unit tests for all thread_block APIs
+ */
+
+namespace cg = cooperative_groups;
+
+template <typename BaseType = cg::thread_block>
+static __global__ void thread_block_size_getter(unsigned int* sizes) {
+  const BaseType group = cg::this_thread_block();
+  sizes[thread_rank_in_grid()] = group.size();
+}
+
+template <typename BaseType = cg::thread_block>
+static __global__ void thread_block_thread_rank_getter(unsigned int* thread_ranks) {
+  const BaseType group = cg::this_thread_block();
+  thread_ranks[thread_rank_in_grid()] = group.thread_rank();
+}
+
+static __global__ void thread_block_group_indices_getter(dim3* group_indices) {
+  group_indices[thread_rank_in_grid()] = cg::this_thread_block().group_index();
+}
+
+static __global__ void thread_block_thread_indices_getter(dim3* thread_indices) {
+  thread_indices[thread_rank_in_grid()] = cg::this_thread_block().thread_index();
+}
+
+static __global__ void thread_block_non_member_size_getter(unsigned int* sizes) {
+  sizes[thread_rank_in_grid()] = cg::group_size(cg::this_thread_block());
+}
+
+static __global__ void thread_block_non_member_thread_rank_getter(unsigned int* thread_ranks) {
+  thread_ranks[thread_rank_in_grid()] = cg::thread_rank(cg::this_thread_block());
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches kernels that write the return values of size, thread_rank, group_index, and
+ * thread_index member functions to an output array that is validated on the host side. The kernels
+ * are run sequentially, reusing the output array, to avoid running out of device memory for large
+ * kernel launches.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Thread_Block_Getters_Positive_Basic") {
+  const auto blocks = GenerateBlockDimensions();
+  const auto threads = GenerateThreadDimensions();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  const CPUGrid grid(blocks, threads);
+
+  {
+    LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
+                                                grid.thread_count_ * sizeof(unsigned int));
+    LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
+                                            grid.thread_count_ * sizeof(unsigned int));
+
+    thread_block_size_getter<<<blocks, threads>>>(uint_arr_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                        grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+    thread_block_thread_rank_getter<<<blocks, threads>>>(uint_arr_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+
+    // Validate thread_block.size() values
+    ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
+               [size = grid.threads_in_block_count_](uint32_t) { return size; });
+
+    HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                        grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Validate thread_block.thread_rank() values
+    ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
+               [&grid](uint32_t i) { return grid.thread_rank_in_block(i).value(); });
+  }
+
+  {
+    LinearAllocGuard<dim3> dim3_arr_dev(LinearAllocs::hipMalloc, grid.thread_count_ * sizeof(dim3));
+    LinearAllocGuard<dim3> dim3_arr(LinearAllocs::hipHostMalloc, grid.thread_count_ * sizeof(dim3));
+
+    thread_block_group_indices_getter<<<blocks, threads>>>(dim3_arr_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(dim3_arr.ptr(), dim3_arr_dev.ptr(),
+                        grid.thread_count_ * sizeof(*dim3_arr.ptr()), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+    thread_block_thread_indices_getter<<<blocks, threads>>>(dim3_arr_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+
+    // Validate thread_block.group_index() values
+    ArrayAllOf(dim3_arr.ptr(), grid.thread_count_,
+               [&grid](uint32_t i) { return grid.block_idx(i).value(); });
+
+    HIP_CHECK(hipMemcpy(dim3_arr.ptr(), dim3_arr_dev.ptr(),
+                        grid.thread_count_ * sizeof(*dim3_arr.ptr()), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Validate thread_block.thread_index() values
+    ArrayAllOf(dim3_arr.ptr(), grid.thread_count_,
+               [&grid](uint32_t i) { return grid.thread_idx(i).value(); });
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches kernels that write the return values of size and thread_rank member functions to an
+ * output array that is validated on the host side, while treating the thread block as a thread
+ * group. The kernels are run sequentially, reusing the output array, to avoid running out of device
+ * memory for large kernel launches.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Thread_Block_Getters_Via_Base_Type_Positive_Basic") {
+  const auto blocks = GenerateBlockDimensions();
+  const auto threads = GenerateThreadDimensions();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+
+  const CPUGrid grid(blocks, threads);
+
+  LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
+                                              grid.thread_count_ * sizeof(unsigned int));
+  LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
+                                          grid.thread_count_ * sizeof(unsigned int));
+
+  thread_block_size_getter<cg::thread_group><<<blocks, threads>>>(uint_arr_dev.ptr());
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+  thread_block_thread_rank_getter<cg::thread_group><<<blocks, threads>>>(uint_arr_dev.ptr());
+  HIP_CHECK(hipGetLastError());
+
+  // Validate thread_block.size() values
+  ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
+             [size = grid.threads_in_block_count_](uint32_t) { return size; });
+
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Validate thread_block.thread_rank() values
+  ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
+             [&grid](uint32_t i) { return grid.thread_rank_in_block(i).value(); });
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches kernels that write the return values of size and thread_rank non-member functions
+ * to an output array that is validated on the host side. The kernels are run sequentially, reusing
+ * the output array, to avoid running out of device memory for large kernel launches.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Thread_Block_Getters_Via_Non_Member_Functions_Positive_Basic") {
+  const auto blocks = GenerateBlockDimensions();
+  const auto threads = GenerateThreadDimensions();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+
+  const CPUGrid grid(blocks, threads);
+
+  LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc,
+                                              grid.thread_count_ * sizeof(unsigned int));
+  LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc,
+                                          grid.thread_count_ * sizeof(unsigned int));
+
+  thread_block_non_member_size_getter<<<blocks, threads>>>(uint_arr_dev.ptr());
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+  thread_block_non_member_thread_rank_getter<<<blocks, threads>>>(uint_arr_dev.ptr());
+  HIP_CHECK(hipGetLastError());
+
+  // Validate thread_block.size() values
+  ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
+             [size = grid.threads_in_block_count_](uint32_t) { return size; });
+
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(),
+                      grid.thread_count_ * sizeof(*uint_arr.ptr()), hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Validate thread_block.thread_rank() values
+  ArrayAllOf(uint_arr.ptr(), grid.thread_count_,
+             [&grid](uint32_t i) { return grid.thread_rank_in_block(i).value(); });
+}
+
+
+template <bool use_global, typename T>
+__global__ void thread_block_sync_check(T* global_data, unsigned int* wait_modifiers,
+                                        unsigned int* read_offsets) {
+  extern __shared__ uint8_t shared_data[];
+  T* const data = use_global ? global_data : reinterpret_cast<T*>(shared_data);
+  const auto block = cg::this_thread_block();
+  constexpr T divisor = 255;
+  const auto tid = block.thread_rank();
+  const auto wait_modifier = wait_modifiers[tid];
+  const auto read_offset = read_offsets[tid];
+  busy_wait(wait_modifier);
+  data[tid] = tid % divisor;
+  block.sync();
+  bool valid = true;
+  for (auto i = 0; i < block.size(); ++i) {
+    const auto offset = block.size() + read_offset;
+    const auto expected = (tid + offset + i) % block.size();
+    if (!(valid &= (data[expected] == expected % divisor))) {
+      break;
+    }
+  }
+  block.sync();
+  data[tid] = valid;
+  if constexpr (!use_global) {
+    global_data[tid] = data[tid];
+  }
+}
+
+static inline std::mt19937& GetRandomGenerator() {
+  // With a static seed the tests will remain consistent between runs, yet it relieves the problem
+  // of predetermining a set of modifiers by hand. The sets of modifiers could actually be
+  // determined at compile time if std::random objects could operate in a constexpr context.
+  static std::mt19937 mt(17);
+  return mt;
+}
+
+template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
+  std::uniform_int_distribution<T> dist(min, max);
+  return dist(GetRandomGenerator());
+}
+
+template <bool global_memory, typename T> void ThreadBlockSyncTest() {
+  const auto randomized_run_count = GENERATE(range(0, cmd_options.cg_iterations));
+  INFO("Run number: " << randomized_run_count + 1);
+  const auto blocks = dim3(1, 1, 1);
+  const auto threads = GenerateThreadDimensions();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  CPUGrid grid(blocks, threads);
+
+  const auto alloc_size = grid.thread_count_ * sizeof(T);
+  int max_shared_mem_per_block = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&max_shared_mem_per_block,
+                                  hipDeviceAttributeMaxSharedMemoryPerBlock, 0));
+  if (!global_memory && max_shared_mem_per_block < alloc_size) {
+    return;
+  }
+  LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+  LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+  LinearAllocGuard<unsigned int> wait_modifiers_dev(LinearAllocs::hipMalloc,
+                                                    grid.thread_count_ * sizeof(unsigned int));
+  LinearAllocGuard<unsigned int> wait_modifiers(LinearAllocs::hipHostMalloc,
+                                                grid.thread_count_ * sizeof(unsigned int));
+  std::generate(wait_modifiers.ptr(), wait_modifiers.ptr() + grid.thread_count_,
+                [&] { return GenerateRandomInteger(0u, 1500u); });
+
+  LinearAllocGuard<unsigned int> read_offsets_dev(LinearAllocs::hipMalloc,
+                                                  grid.thread_count_ * sizeof(unsigned int));
+  std::vector<unsigned int> read_offsets(grid.thread_count_, 0u);
+  if (randomized_run_count != 0) {
+    std::generate(read_offsets.begin(), read_offsets.end(),
+                  [&] { return GenerateRandomInteger(0u, grid.thread_count_); });
+  }
+
+  const auto shared_memory_size = global_memory ? 0u : alloc_size;
+  HIP_CHECK(hipMemcpy(wait_modifiers_dev.ptr(), wait_modifiers.ptr(),
+                      grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(read_offsets_dev.ptr(), read_offsets.data(),
+                      grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice));
+
+  thread_block_sync_check<global_memory><<<blocks, threads, shared_memory_size>>>(
+      arr_dev.ptr(), wait_modifiers_dev.ptr(), read_offsets_dev.ptr());
+  HIP_CHECK(hipGetLastError());
+
+  HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  REQUIRE(std::all_of(arr.ptr(), arr.ptr() + grid.thread_count_, [](unsigned int e) { return e; }));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches a kernel wherein every thread writes its grid-wide linear index into an array. The
+ * array is either in global or dynamic shared memory based on a compile time switch, and the test
+ * is run for arrays of 1, 2, and 4 byte elements. Before the write each thread executes a busy wait
+ * loop for a random amount of clock cycles, the amount being read from an input array. After the
+ * write a block-wide sync is performed and each thread validates that it can read the expected
+ * values that other threads have written to their respective array slots. Each thread begins the
+ * validation from a given offset from its own index. For the first run of the test, all the offsets
+ * are zero, so memory reads should be coalesced as adjacent threads read from adjacent memory
+ * locations. On subsequent runs the offsets are randomized for each thread, leading to
+ * non-coalesced reads and cache thrashing.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Thread_Block_Sync_Positive_Basic", "", uint8_t, uint16_t, uint32_t) {
+  SECTION("Global memory") { ThreadBlockSyncTest<true, TestType>(); }
+  SECTION("Shared memory") { ThreadBlockSyncTest<false, TestType>(); }
+}
@@ -0,0 +1,553 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "cooperative_groups_common.hh"
+
+#include <bitset>
+#include <array>
+
+#include <cmd_options.hh>
+#include <cpu_grid.h>
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+#include <resource_guards.hh>
+#include <utils.hh>
+
+
+/**
+ * @addtogroup thread_block_tile thread_block_tile
+ * @{
+ * @ingroup DeviceLanguageTest
+ * Contains unit tests for all thread_block_tile APIs and dynamic block partitioning
+ */
+
+namespace cg = cooperative_groups;
+
+template <bool dynamic, unsigned int tile_size>
+__global__ void thread_block_partition_size_getter(unsigned int* sizes) {
+  const auto group = cg::this_thread_block();
+  if constexpr (dynamic) {
+    sizes[thread_rank_in_grid()] = cg::tiled_partition(group, tile_size).size();
+  } else {
+    cg::thread_block_tile<tile_size> tiled_partition = cg::tiled_partition<tile_size>(group);
+    sizes[thread_rank_in_grid()] = tiled_partition.size();
+  }
+}
+
+template <bool dynamic, unsigned int tile_size>
+__global__ void thread_block_partition_thread_rank_getter(unsigned int* thread_ranks) {
+  const auto group = cg::this_thread_block();
+  if constexpr (dynamic) {
+    thread_ranks[thread_rank_in_grid()] = cg::tiled_partition(group, tile_size).thread_rank();
+  } else {
+    cg::thread_block_tile<tile_size> tiled_partition = cg::tiled_partition<tile_size>(group);
+    thread_ranks[thread_rank_in_grid()] = tiled_partition.thread_rank();
+  }
+}
+
+template <bool dynamic, size_t tile_size> void BlockPartitionGettersBasicTestImpl() {
+  DYNAMIC_SECTION("Tile size: " << tile_size) {
+    auto blocks = GenerateBlockDimensions();
+    auto threads = GenerateThreadDimensions();
+    INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+    INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+    CPUGrid grid(blocks, threads);
+
+    const auto alloc_size = grid.thread_count_ * sizeof(unsigned int);
+    LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc, alloc_size);
+    LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+    thread_block_partition_size_getter<dynamic, tile_size><<<blocks, threads>>>(uint_arr_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+    thread_block_partition_thread_rank_getter<dynamic, tile_size>
+        <<<blocks, threads>>>(uint_arr_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+
+    ArrayAllOf(uint_arr.ptr(), grid.thread_count_, [&grid](unsigned int i) {
+      if constexpr (!dynamic) {
+        return tile_size;
+      }
+
+      const auto partitions_in_block = (grid.threads_in_block_count_ + tile_size - 1) / tile_size;
+      const auto rank_in_block = grid.thread_rank_in_block(i).value();
+
+      const auto tail = partitions_in_block * tile_size - grid.threads_in_block_count_;
+      return tile_size - tail * (rank_in_block >= (partitions_in_block - 1) * tile_size);
+    });
+
+    HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    ArrayAllOf(uint_arr.ptr(), grid.thread_count_, [&grid](unsigned int i) {
+      return grid.thread_rank_in_block(i).value() % tile_size;
+    });
+  }
+}
+
+template <bool dynamic, size_t... tile_sizes> void BlockPartitionGettersBasicTest() {
+  static_cast<void>((BlockPartitionGettersBasicTestImpl<dynamic, tile_sizes>(), ...));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Creates tiled partitions for each of the valid sizes{2, 4, 8, 16, 32, 64(if AMD)} and writes
+ * the return values of size and thread_rank member functions to an output array that is validated
+ * on the host side.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block_tile.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Thread_Block_Tile_Getters_Positive_Basic") {
+  BlockPartitionGettersBasicTest<false, 2, 4, 8, 16, 32>();
+#if HT_AMD && (__GFX8__ || __GFX9__)
+  BlockPartitionGettersBasicTest<false, 64>();
+#endif
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Creates tiled partitions for each of the valid sizes{2, 4, 8, 16, 32, 64(if AMD)} via the
+ * dynamic tiled partition api and writes the return values of size and thread_rank member functions
+ * to an output array that is validated on host.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block_tile.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Thread_Block_Tile_Dynamic_Getters_Positive_Basic") {
+  BlockPartitionGettersBasicTest<true, 2, 4, 8, 16, 32>();
+#if HT_AMD && (__GFX8__ || __GFX9__)
+  BlockPartitionGettersBasicTest<true, 64>();
+#endif
+}
+
+
+template <typename T, size_t tile_size>
+__global__ void block_tile_shfl_up(T* const out, const unsigned int delta) {
+  const cg::thread_block_tile<tile_size> partition =
+      cg::tiled_partition<tile_size>(cg::this_thread_block());
+  T var = static_cast<T>(partition.thread_rank());
+  out[thread_rank_in_grid()] = partition.shfl_up(var, delta);
+}
+
+template <typename T, size_t tile_size> void BlockTileShflUpTestImpl() {
+  DYNAMIC_SECTION("Tile size: " << tile_size) {
+    auto blocks = GenerateBlockDimensionsForShuffle();
+    auto threads = GenerateThreadDimensionsForShuffle();
+    INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+    INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+    auto delta = GENERATE(range(static_cast<size_t>(0), tile_size));
+    INFO("Delta: " << delta);
+    CPUGrid grid(blocks, threads);
+
+    const auto alloc_size = grid.thread_count_ * sizeof(T);
+    LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+    LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+    block_tile_shfl_up<T, tile_size><<<blocks, threads>>>(arr_dev.ptr(), delta);
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    ArrayAllOf(arr.ptr(), grid.thread_count_, [delta, &grid](unsigned int i) -> std::optional<T> {
+      const int rank_in_partition = grid.thread_rank_in_block(i).value() % tile_size;
+      const int target = rank_in_partition - delta;
+      return target < 0 ? rank_in_partition : target;
+    });
+  }
+}
+
+template <typename T, size_t... tile_sizes> void BlockTileShflUpTest() {
+  static_cast<void>((BlockTileShflUpTestImpl<T, tile_sizes>(), ...));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Validates the shuffle up behavior of thread block tiles of all valid sizes{2, 4, 8, 16, 32,
+ * 64(if AMD)} for delta values of [0, tile size). The test is run for all overloads of shfl_up.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block_tile.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Thread_Block_Tile_Shfl_Up_Positive_Basic", "", int, unsigned int, long,
+                   unsigned long, long long, unsigned long long, float, double) {
+  BlockTileShflUpTest<TestType, 2, 4, 8, 16, 32>();
+#if HT_AMD && (__GFX8__ || __GFX9__)
+  BlockTileShflUpTest<TestType, 64>();
+#endif
+}
+
+
+template <typename T, size_t tile_size>
+__global__ void block_tile_shfl_down(T* const out, const unsigned int delta) {
+  const cg::thread_block_tile<tile_size> partition =
+      cg::tiled_partition<tile_size>(cg::this_thread_block());
+  T var = static_cast<T>(partition.thread_rank());
+  out[thread_rank_in_grid()] = partition.shfl_down(var, delta);
+}
+
+template <typename T, size_t tile_size> void BlockTileShflDownTestImpl() {
+  DYNAMIC_SECTION("Tile size: " << tile_size) {
+    auto blocks = GenerateBlockDimensionsForShuffle();
+    auto threads = GenerateThreadDimensionsForShuffle();
+    INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+    INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+    auto delta = GENERATE(range(static_cast<size_t>(0), tile_size));
+    INFO("Delta: " << delta);
+    CPUGrid grid(blocks, threads);
+
+    const auto alloc_size = grid.thread_count_ * sizeof(T);
+    LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+    LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+    block_tile_shfl_down<T, tile_size><<<blocks, threads>>>(arr_dev.ptr(), delta);
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    ArrayAllOf(arr.ptr(), grid.thread_count_, [delta, &grid](unsigned int i) -> std::optional<T> {
+      const auto partitions_in_block = (grid.threads_in_block_count_ + tile_size - 1) / tile_size;
+      const auto rank_in_block = grid.thread_rank_in_block(i).value();
+      const auto rank_in_group = rank_in_block % tile_size;
+      const auto target = rank_in_group + delta;
+      if (rank_in_block < (partitions_in_block - 1) * tile_size) {
+        return target < tile_size ? target : rank_in_group;
+      } else {
+        // If the number of threads in a block is not an integer multiple of tile_size, the
+        // final(tail end) tile will contain inactive threads.
+        // Shuffling from an inactive thread returns an undefined value, accordingly threads that
+        // shuffle from one must be skipped
+        const auto tail = partitions_in_block * tile_size - grid.threads_in_block_count_;
+        return target < tile_size - tail ? std::optional(target) : std::nullopt;
+      }
+    });
+  }
+}
+
+template <typename T, size_t... tile_sizes> void BlockTileShflDownTest() {
+  static_cast<void>((BlockTileShflDownTestImpl<T, tile_sizes>(), ...));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Validates the shuffle down behavior of thread block tiles of all valid sizes{2, 4, 8, 16,
+ * 32, 64(if AMD)} for delta values of [0, tile size). The test is run for all overloads of
+ * shfl_down.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block_tile.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Thread_Block_Tile_Shfl_Down_Positive_Basic", "", int, unsigned int, long,
+                   unsigned long, long long, unsigned long long, float, double) {
+  BlockTileShflDownTest<TestType, 2, 4, 8, 16, 32>();
+#if HT_AMD && (__GFX8__ || __GFX9__)
+  BlockTileShflDownTest<TestType, 64>();
+#endif
+}
+
+
+template <typename T, size_t tile_size>
+__global__ void block_tile_shfl_xor(T* const out, const unsigned mask) {
+  const cg::thread_block_tile<tile_size> partition =
+      cg::tiled_partition<tile_size>(cg::this_thread_block());
+  T var = static_cast<T>(partition.thread_rank());
+  out[thread_rank_in_grid()] = partition.shfl_xor(var, mask);
+}
+
+template <typename T, size_t tile_size> void BlockTileShflXORTestImpl() {
+  DYNAMIC_SECTION("Tile size: " << tile_size) {
+    auto blocks = GenerateBlockDimensionsForShuffle();
+    auto threads = GenerateThreadDimensionsForShuffle();
+    INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+    INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+    const auto mask = GENERATE(range(static_cast<size_t>(0), tile_size));
+    INFO("Mask: 0x" << std::hex << mask);
+    CPUGrid grid(blocks, threads);
+
+    const auto alloc_size = grid.thread_count_ * sizeof(T);
+    LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+    LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+    block_tile_shfl_xor<T, tile_size><<<blocks, threads>>>(arr_dev.ptr(), mask);
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    const auto f = [mask, &grid](unsigned int i) -> std::optional<T> {
+      const auto partitions_in_block = (grid.threads_in_block_count_ + tile_size - 1) / tile_size;
+      const auto rank_in_block = grid.thread_rank_in_block(i).value();
+      const int rank_in_partition = rank_in_block % tile_size;
+      const auto target = rank_in_partition ^ mask;
+      if (rank_in_block < (partitions_in_block - 1) * tile_size) {
+        return target;
+      }
+      const auto tail = partitions_in_block * tile_size - grid.threads_in_block_count_;
+      return target < tile_size - tail ? std::optional(target) : std::nullopt;
+    };
+    ArrayAllOf(arr.ptr(), grid.thread_count_, f);
+  }
+}
+
+template <typename T, size_t... tile_sizes> void BlockTileShflXORTest() {
+  static_cast<void>((BlockTileShflXORTestImpl<T, tile_sizes>(), ...));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Validates the shuffle xor behavior of thread block tiles of all valid sizes{2, 4, 8, 16, 32,
+ * 64(if AMD)} for mask values of [0, tile size). The test is run for all overloads of shfl_xor.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block_tile.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Thread_Block_Tile_Shfl_XOR_Positive_Basic", "", int, unsigned int, long,
+                   unsigned long, long long, unsigned long long, float, double) {
+  BlockTileShflXORTest<TestType, 2, 4, 8, 16, 32>();
+#if HT_AMD && (__GFX8__ || __GFX9__)
+  BlockTileShflXORTest<TestType, 64>();
+#endif
+}
+
+template <typename T, size_t tile_size>
+__global__ void block_tile_shfl(T* const out, uint8_t* target_lanes) {
+  const cg::thread_block_tile<tile_size> partition =
+      cg::tiled_partition<tile_size>(cg::this_thread_block());
+  T var = static_cast<T>(partition.thread_rank());
+  out[thread_rank_in_grid()] = partition.shfl(var, target_lanes[partition.thread_rank()]);
+}
+
+static inline std::mt19937& GetRandomGenerator() {
+  static std::mt19937 mt(11);
+  return mt;
+}
+
+template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
+  std::uniform_int_distribution<T> dist(min, max);
+  return dist(GetRandomGenerator());
+}
+
+template <typename T, size_t tile_size> void BlockTileShflTestImpl() {
+  DYNAMIC_SECTION("Tile size: " << tile_size) {
+    auto blocks = GenerateBlockDimensionsForShuffle();
+    auto threads = GenerateThreadDimensionsForShuffle();
+    INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+    INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+    CPUGrid grid(blocks, threads);
+
+    const auto alloc_size = grid.thread_count_ * sizeof(T);
+    LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+    LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+    LinearAllocGuard<uint8_t> target_lanes_dev(LinearAllocs::hipMalloc,
+                                               tile_size * sizeof(uint8_t));
+    LinearAllocGuard<uint8_t> target_lanes(LinearAllocs::hipHostMalloc,
+                                           tile_size * sizeof(uint8_t));
+    std::generate(target_lanes.ptr(), target_lanes.ptr() + tile_size,
+                  [] { return GenerateRandomInteger(0, static_cast<int>(2 * tile_size)); });
+
+    HIP_CHECK(hipMemcpy(target_lanes_dev.ptr(), target_lanes.ptr(), tile_size * sizeof(uint8_t),
+                        hipMemcpyHostToDevice));
+    block_tile_shfl<T, tile_size><<<blocks, threads>>>(arr_dev.ptr(), target_lanes_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    const auto f = [&target_lanes, &grid](unsigned int i) -> std::optional<T> {
+      const auto partitions_in_block = (grid.threads_in_block_count_ + tile_size - 1) / tile_size;
+      const auto rank_in_block = grid.thread_rank_in_block(i).value();
+      const int rank_in_partition = rank_in_block % tile_size;
+      const auto target = target_lanes.ptr()[rank_in_partition] % tile_size;
+      if (rank_in_block < (partitions_in_block - 1) * tile_size) {
+        return target;
+      }
+      const auto tail = partitions_in_block * tile_size - grid.threads_in_block_count_;
+      return target < tile_size - tail ? std::optional(target) : std::nullopt;
+    };
+    ArrayAllOf(arr.ptr(), grid.thread_count_, f);
+  }
+}
+
+template <typename T, size_t... tile_sizes> void BlockTileShflTest() {
+  static_cast<void>((BlockTileShflTestImpl<T, tile_sizes>(), ...));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Validates the shuffle behavior of thread block tiles of all valid sizes{2, 4, 8, 16, 32,
+ * 64(if AMD)} for generated shuffle target lanes. The test is run for all overloads of shfl. Test
+ * source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block_tile.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Thread_Block_Tile_Shfl_Positive_Basic", "", int, unsigned int, long,
+                   unsigned long, long long, unsigned long long, float, double) {
+  BlockTileShflTest<TestType, 2, 4, 8, 16, 32>();
+#if HT_AMD && (__GFX8__ || __GFX9__)
+  BlockTileShflTest<TestType, 64>();
+#endif
+}
+
+
+template <bool use_global, size_t tile_size, typename T>
+__global__ void block_tile_sync_check(T* global_data, unsigned int* wait_modifiers) {
+  extern __shared__ uint8_t shared_data[];
+  T* const data = use_global ? global_data : reinterpret_cast<T*>(shared_data);
+  const auto tid = cg::this_grid().thread_rank();
+  const auto block = cg::this_thread_block();
+  const cg::thread_block_tile<tile_size> partition =
+      cg::tiled_partition<tile_size>(cg::this_thread_block());
+
+  const auto data_idx = [&block](unsigned int i) { return use_global ? i : (i % block.size()); };
+
+  const auto partitions_in_block = (block.size() + partition.size() - 1) / partition.size();
+  const auto partition_rank = block.thread_rank() / partition.size();
+  const auto tail = partitions_in_block * partition.size() - block.size();
+  const auto window_size = partition.size() - tail * (partition_rank == partitions_in_block - 1);
+
+  const auto block_base_idx = tid / block.size() * block.size();
+  const auto tile_base_idx = block_base_idx + partition_rank * partition.size();
+
+  const auto wait_modifier = wait_modifiers[tid];
+  busy_wait(wait_modifier);
+  data[data_idx(tid)] = partition.thread_rank();
+  partition.sync();
+  bool valid = true;
+  for (auto i = 0; i < window_size; ++i) {
+    const auto expected = (partition.thread_rank() + i) % window_size;
+
+    if (!(valid &= (data[data_idx(tile_base_idx + expected)] == expected))) {
+      break;
+    }
+  }
+  partition.sync();
+  data[data_idx(tid)] = valid;
+  if constexpr (!use_global) {
+    global_data[tid] = data[data_idx(tid)];
+  }
+}
+
+template <bool global_memory, typename T, size_t tile_size> void BlockTileSyncTestImpl() {
+  DYNAMIC_SECTION("Tile size: " << tile_size) {
+    const auto randomized_run_count = GENERATE(range(0, cmd_options.cg_iterations));
+    INFO("Run number: " << randomized_run_count + 1);
+    auto blocks = GenerateBlockDimensions();
+    auto threads = GenerateThreadDimensions();
+    INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+    INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+    CPUGrid grid(blocks, threads);
+
+    const auto alloc_size = grid.thread_count_ * sizeof(T);
+    const auto alloc_size_per_block = alloc_size / grid.block_count_;
+    int max_shared_mem_per_block = 0;
+    HIP_CHECK(hipDeviceGetAttribute(&max_shared_mem_per_block,
+                                    hipDeviceAttributeMaxSharedMemoryPerBlock, 0));
+    if (!global_memory && (max_shared_mem_per_block < alloc_size_per_block)) {
+      return;
+    }
+
+    LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+    LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+    LinearAllocGuard<unsigned int> wait_modifiers_dev(LinearAllocs::hipMalloc,
+                                                      grid.thread_count_ * sizeof(unsigned int));
+    LinearAllocGuard<unsigned int> wait_modifiers(LinearAllocs::hipHostMalloc,
+                                                  grid.thread_count_ * sizeof(unsigned int));
+    if (randomized_run_count != 0) {
+      std::generate(wait_modifiers.ptr(), wait_modifiers.ptr() + grid.thread_count_,
+                    [] { return GenerateRandomInteger(0u, 1500u); });
+    } else {
+      std::fill_n(wait_modifiers.ptr(), grid.thread_count_, 0u);
+    }
+
+    const auto shared_memory_size = global_memory ? 0u : alloc_size_per_block;
+    HIP_CHECK(hipMemcpy(wait_modifiers_dev.ptr(), wait_modifiers.ptr(),
+                        grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice));
+
+    block_tile_sync_check<global_memory, tile_size>
+        <<<blocks, threads, shared_memory_size>>>(arr_dev.ptr(), wait_modifiers_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+
+    HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    REQUIRE(
+        std::all_of(arr.ptr(), arr.ptr() + grid.thread_count_, [](unsigned int e) { return e; }));
+  }
+}
+
+template <bool global_memory, typename T, size_t... tile_sizes> void BlockTileSyncTest() {
+  static_cast<void>((BlockTileSyncTestImpl<global_memory, T, tile_sizes>(), ...));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches a kernel wherein blocks are divided into tiled partitions(size of 2, 4, 8, 16, 32,
+ * 64 if AMD) and every thread writes its intra-tile rank into an array slot determined by its
+ * grid-wide linear index. The array is either in global or dynamic shared memory based on a compile
+ * time switch, and the test is run for arrays of 1, 2, and 4 byte elements. Before the write each
+ * thread executes a busy wait loop for a random amount of clock cycles, the amount being read from
+ * an input array. After the write a tile-wide sync is performed and each thread validates that it
+ * can read the expected values that other threads within the same tile have written to their
+ * respective array slots.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/thread_block_tile.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Thread_Block_Tile_Sync_Positive_Basic", "", uint8_t, uint16_t, uint32_t) {
+  SECTION("Global memory") {
+    BlockTileSyncTest<true, TestType, 2, 4, 8, 16, 32>();
+#if HT_AMD && (__GFX8__ || __GFX9__)
+    BlockTileSyncTest<true, TestType, 64>();
+#endif
+  }
+  SECTION("Shared memory") {
+    BlockTileSyncTest<false, TestType, 2, 4, 8, 16, 32>();
+#if HT_AMD && (__GFX8__ || __GFX9__)
+    BlockTileSyncTest<true, TestType, 64>();
+#endif
+  }
+}
@@ -3,9 +3,19 @@ set(TEST_SRC
    hipFuncSetCacheConfig.cc
    hipFuncSetSharedMemConfig.cc
    hipFuncSetAttribute.cc
+    hipFuncGetAttributes.cc
+    hipLaunchCooperativeKernel.cc
+    hipLaunchCooperativeKernelMultiDevice.cc
 )

+if(HIP_PLATFORM MATCHES "amd")
+    set(TEST_SRC ${TEST_SRC}
+        hipExtLaunchKernel.cc
+        hipExtLaunchMultiKernelMultiDevice.cc
+    )
+endif()
+
 hip_add_exe_to_target(NAME ExecutionControlTest
                      TEST_SRC ${TEST_SRC}
                      TEST_TARGET_NAME build_tests
-                      COMPILE_OPTIONS -std=c++17)
+                      COMPILE_OPTIONS -std=c++17)
@@ -23,5 +23,15 @@ THE SOFTWARE.
 #include "execution_control_common.hh"

 #include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>

-__global__ void kernel() {}
+__global__ void kernel() {}
+
+__global__ void kernel2() {}
+
+__global__ void kernel_42(int* val) { *val = 42; }
+
+__global__ void coop_kernel() {
+  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+  grid.sync();
+}
@@ -22,4 +22,10 @@ THE SOFTWARE.

 #pragma once

-__global__ void kernel();
+__global__ void kernel();
+
+__global__ void kernel2();
+
+__global__ void kernel_42(int* val);
+
+__global__ void coop_kernel();
@@ -0,0 +1,176 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "execution_control_common.hh"
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <resource_guards.hh>
+#include <utils.hh>
+
+TEST_CASE("Unit_hipExtLaunchKernel_Positive_Basic") {
+  SECTION("Kernel with no arguments") {
+    HIP_CHECK(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1}, dim3{1, 1, 1},
+                                 nullptr, 0, nullptr, nullptr, nullptr, 0u));
+    HIP_CHECK(hipDeviceSynchronize());
+  }
+
+  SECTION("Kernel with arguments using kernelParams") {
+    LinearAllocGuard<int> result_dev(LinearAllocs::hipMalloc, sizeof(int));
+    HIP_CHECK(hipMemset(result_dev.ptr(), 0, sizeof(*result_dev.ptr())));
+    int* result_ptr = result_dev.ptr();
+    void* kernel_args[1] = {&result_ptr};
+    HIP_CHECK(hipExtLaunchKernel(reinterpret_cast<void*>(kernel_42), dim3{1, 1, 1}, dim3{1, 1, 1},
+                                 kernel_args, 0, nullptr, nullptr, nullptr, 0u));
+    int result = 0;
+    HIP_CHECK(hipMemcpy(&result, result_dev.ptr(), sizeof(result), hipMemcpyDefault));
+    REQUIRE(result == 42);
+  }
+}
+
+TEST_CASE("Unit_hipExtLaunchKernel_Positive_Parameters") {
+  SECTION("blockDim.x == maxBlockDimX") {
+    const unsigned int x = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimX);
+    HIP_CHECK(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1}, dim3{x, 1, 1},
+                                 nullptr, 0, nullptr, nullptr, nullptr, 0u));
+  }
+
+  SECTION("blockDim.y == maxBlockDimY") {
+    const unsigned int y = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimY);
+    HIP_CHECK(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1}, dim3{y, 1, 1},
+                                 nullptr, 0, nullptr, nullptr, nullptr, 0u));
+  }
+
+  SECTION("blockDim.z == maxBlockDimZ") {
+    const unsigned int z = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimZ);
+    HIP_CHECK(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1}, dim3{z, 1, 1},
+                                 nullptr, 0, nullptr, nullptr, nullptr, 0u));
+  }
+}
+
+TEST_CASE("Unit_hipExtLaunchKernel_Negative_Parameters") {
+  SECTION("f == nullptr") {
+    HIP_CHECK_ERROR(hipExtLaunchKernel(nullptr, dim3{1, 1, 1}, dim3{1, 1, 1}, nullptr, 0, nullptr,
+                                       nullptr, nullptr, 0u),
+                    hipErrorInvalidDeviceFunction);
+  }
+
+  SECTION("gridDim.x == 0") {
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{0, 1, 1},
+                                       dim3{1, 1, 1}, nullptr, 0, nullptr, nullptr, nullptr, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("gridDim.y == 0") {
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 0, 1},
+                                       dim3{1, 1, 1}, nullptr, 0, nullptr, nullptr, nullptr, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("gridDim.z == 0") {
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 0},
+                                       dim3{1, 1, 1}, nullptr, 0, nullptr, nullptr, nullptr, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("blockDim.x == 0") {
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                       dim3{0, 1, 1}, nullptr, 0, nullptr, nullptr, nullptr, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("blockDim.y == 0") {
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                       dim3{1, 0, 1}, nullptr, 0, nullptr, nullptr, nullptr, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("blockDim.z == 0") {
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                       dim3{1, 1, 0}, nullptr, 0, nullptr, nullptr, nullptr, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("blockDim.x > maxBlockDimX") {
+    const unsigned int x = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimX) + 1u;
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                       dim3{x, 1, 1}, nullptr, 0, nullptr, nullptr, nullptr, 0u),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION("blockDim.y > maxBlockDimY") {
+    const unsigned int y = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimY) + 1u;
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                       dim3{1, y, 1}, nullptr, 0, nullptr, nullptr, nullptr, 0u),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION("blockDim.z > maxBlockDimZ") {
+    const unsigned int z = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimZ) + 1u;
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                       dim3{1, 1, z}, nullptr, 0, nullptr, nullptr, nullptr, 0u),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION("blockDim.x * blockDim.y * blockDim.z > maxThreadsPerBlock") {
+    const unsigned int max = GetDeviceAttribute(0, hipDeviceAttributeMaxThreadsPerBlock);
+    const unsigned int dim = std::ceil(std::cbrt(max));
+    HIP_CHECK_ERROR(
+        hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1}, dim3{dim, dim, dim},
+                           nullptr, 0, nullptr, nullptr, nullptr, 0u),
+        hipErrorInvalidConfiguration);
+  }
+
+  SECTION("sharedMemBytes > maxSharedMemoryPerBlock") {
+    const unsigned int max = GetDeviceAttribute(0, hipDeviceAttributeMaxSharedMemoryPerBlock) + 1u;
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                       dim3{1, 1, 1}, nullptr, max, nullptr, nullptr, nullptr, 0u),
+                    hipErrorOutOfMemory);
+  }
+
+  SECTION("Invalid stream") {
+    hipStream_t stream = nullptr;
+    HIP_CHECK(hipStreamCreate(&stream));
+    HIP_CHECK(hipStreamDestroy(stream));
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                       dim3{1, 1, 1}, nullptr, 0, stream, nullptr, nullptr, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("Invalid startEvent") {
+    hipEvent_t event = nullptr;
+    HIP_CHECK(hipEventCreate(&event));
+    HIP_CHECK(hipEventDestroy(event));
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                       dim3{1, 1, 1}, nullptr, 0, nullptr, event, nullptr, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("Invalid endEvent") {
+    hipEvent_t event = nullptr;
+    HIP_CHECK(hipEventCreate(&event));
+    HIP_CHECK(hipEventDestroy(event));
+    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                       dim3{1, 1, 1}, nullptr, 0, nullptr, nullptr, event, 0u),
+                    hipErrorInvalidValue);
+  }
+}
@@ -0,0 +1,144 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "execution_control_common.hh"
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <resource_guards.hh>
+#include <utils.hh>
+
+TEST_CASE("Unit_hipExtLaunchMultiKernelMultiDevice_Positive_Basic") {
+  const auto device_count = HipTest::getDeviceCount();
+
+  std::vector<hipLaunchParams> params_list(device_count);
+
+  int device = 0;
+  for (auto& params : params_list) {
+    params.func = reinterpret_cast<void*>(kernel);
+    params.gridDim = dim3{1, 1, 1};
+    params.blockDim = dim3{1, 1, 1};
+    params.args = nullptr;
+    params.sharedMem = 0;
+    HIP_CHECK(hipSetDevice(device++));
+    HIP_CHECK(hipStreamCreate(&params.stream));
+  }
+
+  HIP_CHECK(hipExtLaunchMultiKernelMultiDevice(params_list.data(), device_count, 0u));
+
+  for (const auto params : params_list) {
+    HIP_CHECK(hipStreamSynchronize(params.stream));
+  }
+
+  for (const auto params : params_list) {
+    HIP_CHECK(hipStreamDestroy(params.stream));
+  }
+}
+
+TEST_CASE("Unit_hipExtLaunchMultiKernelMultiDevice_Negative_Parameters") {
+  const auto device_count = HipTest::getDeviceCount();
+
+  std::vector<hipLaunchParams> params_list(device_count);
+
+  int device = 0;
+  for (auto& params : params_list) {
+    params.func = reinterpret_cast<void*>(kernel);
+    params.gridDim = dim3{1, 1, 1};
+    params.blockDim = dim3{1, 1, 1};
+    params.args = nullptr;
+    params.sharedMem = 0;
+    HIP_CHECK(hipSetDevice(device++));
+    HIP_CHECK(hipStreamCreate(&params.stream));
+  }
+
+  SECTION("launchParamsList == nullptr") {
+    HIP_CHECK_ERROR(hipExtLaunchMultiKernelMultiDevice(nullptr, device_count, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("numDevices == 0") {
+    HIP_CHECK_ERROR(hipExtLaunchMultiKernelMultiDevice(params_list.data(), 0, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("numDevices > device count") {
+    HIP_CHECK_ERROR(hipExtLaunchMultiKernelMultiDevice(params_list.data(), device_count + 1, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("invalid flags") {
+    HIP_CHECK_ERROR(hipExtLaunchMultiKernelMultiDevice(params_list.data(), device_count, 999),
+                    hipErrorInvalidValue);
+  }
+
+  if (device_count > 1) {
+    SECTION("launchParamsList.func doesn't match across all devices") {
+      params_list[1].func = reinterpret_cast<void*>(kernel2);
+      HIP_CHECK_ERROR(hipExtLaunchMultiKernelMultiDevice(params_list.data(), device_count, 0u),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("launchParamsList.gridDim doesn't match across all kernels") {
+      params_list[1].gridDim = dim3{2, 2, 2};
+      HIP_CHECK_ERROR(hipExtLaunchMultiKernelMultiDevice(params_list.data(), device_count, 0u),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("launchParamsList.blockDim doesn't match across all kernels") {
+      params_list[1].blockDim = dim3{2, 2, 2};
+      HIP_CHECK_ERROR(hipExtLaunchMultiKernelMultiDevice(params_list.data(), device_count, 0u),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("launchParamsList.sharedMem doesn't match across all kernels") {
+      params_list[1].sharedMem = 1024;
+      HIP_CHECK_ERROR(hipExtLaunchMultiKernelMultiDevice(params_list.data(), device_count, 0u),
+                      hipErrorInvalidValue);
+    }
+  }
+
+  for (const auto params : params_list) {
+    HIP_CHECK(hipStreamDestroy(params.stream));
+  }
+}
+
+TEST_CASE("Unit_hipExtLaunchMultiKernelMultiDevice_Negative_MultiKernelSameDevice") {
+  HIP_CHECK(hipSetDevice(0));
+
+  std::vector<hipLaunchParams> params_list(2);
+
+  for (auto& params : params_list) {
+    params.func = reinterpret_cast<void*>(kernel);
+    params.gridDim = dim3{1, 1, 1};
+    params.blockDim = dim3{1, 1, 1};
+    params.args = nullptr;
+    params.sharedMem = 0;
+    HIP_CHECK(hipStreamCreate(&params.stream));
+  }
+
+  HIP_CHECK_ERROR(hipExtLaunchMultiKernelMultiDevice(params_list.data(), 2, 0u),
+                  hipErrorInvalidValue);
+
+  for (const auto params : params_list) {
+    HIP_CHECK(hipStreamDestroy(params.stream));
+  }
+}
@@ -0,0 +1,73 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <utils.hh>
+
+constexpr size_t kConstSizeBytes = 128;
+__constant__ char const_data[kConstSizeBytes];
+
+__global__ void attribute_test_kernel() {}
+
+TEST_CASE("Unit_hipFuncGetAttributes_Positive_Basic") {
+  hipFuncAttributes attr;
+  HIP_CHECK(hipFuncGetAttributes(&attr, reinterpret_cast<void*>(attribute_test_kernel)));
+
+  SECTION("binaryVersion") {
+#if HT_NVIDIA
+    const auto major = GetDeviceAttribute(0, hipDeviceAttributeComputeCapabilityMajor);
+    const auto minor = GetDeviceAttribute(0, hipDeviceAttributeComputeCapabilityMinor);
+    REQUIRE(attr.binaryVersion == major * 10 + minor);
+#elif HT_AMD
+    REQUIRE(attr.binaryVersion > 0);
+#endif
+  }
+
+  SECTION("cacheModeCA") { REQUIRE((attr.cacheModeCA == 0 || attr.cacheModeCA == 1)); }
+
+  SECTION("constSizeBytes") { REQUIRE(attr.constSizeBytes == kConstSizeBytes); }
+
+  SECTION("maxThreadsPerBlock") {
+    REQUIRE(attr.maxThreadsPerBlock == GetDeviceAttribute(0, hipDeviceAttributeMaxThreadsPerBlock));
+  }
+
+  SECTION("numRegs") { REQUIRE(attr.numRegs >= 0); }
+
+  SECTION("ptxVersion") { REQUIRE(attr.ptxVersion > 0); }
+
+  SECTION("sharedSizeBytes") {
+    REQUIRE(attr.sharedSizeBytes <=
+            GetDeviceAttribute(0, hipDeviceAttributeMaxSharedMemoryPerBlock));
+  }
+}
+
+TEST_CASE("Unit_hipFuncGetAttributes_Negative_Parameters") {
+  SECTION("attr == nullptr") {
+    HIP_CHECK_ERROR(hipFuncGetAttributes(nullptr, reinterpret_cast<void*>(attribute_test_kernel)),
+                    hipErrorInvalidValue);
+  }
+  SECTION("func == nullptr") {
+    hipFuncAttributes attr;
+    HIP_CHECK_ERROR(hipFuncGetAttributes(&attr, nullptr), hipErrorInvalidDeviceFunction);
+  }
+}
@@ -0,0 +1,188 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "execution_control_common.hh"
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <resource_guards.hh>
+#include <utils.hh>
+
+TEST_CASE("Unit_hipLaunchCooperativeKernel_Positive_Basic") {
+  if (!DeviceAttributesSupport(0, hipDeviceAttributeCooperativeLaunch)) {
+    HipTest::HIP_SKIP_TEST("CooperativeLaunch not supported");
+    return;
+  }
+
+  SECTION("Cooperative kernel with no arguments") {
+    HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(coop_kernel), dim3{2, 2, 1},
+                                         dim3{1, 1, 1}, nullptr, 0, nullptr));
+    HIP_CHECK(hipDeviceSynchronize());
+  }
+
+  SECTION("Kernel with arguments using kernelParams") {
+    LinearAllocGuard<int> result_dev(LinearAllocs::hipMalloc, sizeof(int));
+    HIP_CHECK(hipMemset(result_dev.ptr(), 0, sizeof(*result_dev.ptr())));
+
+    int* result_ptr = result_dev.ptr();
+    void* kernel_args[1] = {&result_ptr};
+    HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel_42), dim3{1, 1, 1},
+                                         dim3{1, 1, 1}, kernel_args, 0, nullptr));
+
+    int result = 0;
+    HIP_CHECK(hipMemcpy(&result, result_dev.ptr(), sizeof(result), hipMemcpyDefault));
+    REQUIRE(result == 42);
+  }
+}
+
+TEST_CASE("Unit_hipLaunchCooperativeKernel_Positive_Parameters") {
+  if (!DeviceAttributesSupport(0, hipDeviceAttributeCooperativeLaunch)) {
+    HipTest::HIP_SKIP_TEST("CooperativeLaunch not supported");
+    return;
+  }
+
+  SECTION("blockDim.x == maxBlockDimX") {
+    const unsigned int x = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimX);
+    HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                         dim3{x, 1, 1}, nullptr, 0, nullptr));
+  }
+
+  SECTION("blockDim.y == maxBlockDimY") {
+    const unsigned int y = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimY);
+    HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                         dim3{y, 1, 1}, nullptr, 0, nullptr));
+  }
+
+  SECTION("blockDim.z == maxBlockDimZ") {
+    const unsigned int z = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimZ);
+    HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                         dim3{z, 1, 1}, nullptr, 0, nullptr));
+  }
+}
+
+TEST_CASE("Unit_hipLaunchCooperativeKernel_Negative_Parameters") {
+  if (!DeviceAttributesSupport(0, hipDeviceAttributeCooperativeLaunch)) {
+    HipTest::HIP_SKIP_TEST("CooperativeLaunch not supported");
+    return;
+  }
+
+  SECTION("f == nullptr") {
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(static_cast<void*>(nullptr), dim3{1, 1, 1},
+                                               dim3{1, 1, 1}, nullptr, 0, nullptr),
+                    hipErrorInvalidDeviceFunction);
+  }
+
+  SECTION("gridDim.x == 0") {
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{0, 1, 1},
+                                               dim3{1, 1, 1}, nullptr, 0, nullptr),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION("gridDim.y == 0") {
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 0, 1},
+                                               dim3{1, 1, 1}, nullptr, 0, nullptr),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION("gridDim.z == 0") {
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 0},
+                                               dim3{1, 1, 1}, nullptr, 0, nullptr),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION("blockDim.x == 0") {
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                               dim3{0, 1, 1}, nullptr, 0, nullptr),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION("blockDim.y == 0") {
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                               dim3{1, 0, 1}, nullptr, 0, nullptr),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION("blockDim.z == 0") {
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                               dim3{1, 1, 0}, nullptr, 0, nullptr),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION("blockDim.x > maxBlockDimX") {
+    const unsigned int x = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimX) + 1u;
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                               dim3{x, 1, 1}, nullptr, 0, nullptr),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION("blockDim.y > maxBlockDimY") {
+    const unsigned int y = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimY) + 1u;
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                               dim3{1, y, 1}, nullptr, 0, nullptr),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION("blockDim.z > maxBlockDimZ") {
+    const unsigned int z = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimZ) + 1u;
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                               dim3{1, 1, z}, nullptr, 0, nullptr),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION("blockDim.x * blockDim.y * blockDim.z > maxThreadsPerBlock") {
+    const unsigned int max = GetDeviceAttribute(0, hipDeviceAttributeMaxThreadsPerBlock);
+    const unsigned int dim = std::ceil(std::cbrt(max));
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                               dim3{dim, dim, dim}, nullptr, 0, nullptr),
+                    hipErrorInvalidConfiguration);
+  }
+
+  SECTION(
+      "gridDim.x * gridDim.y * gridDim.z > maxActiveBlocksPerMultiprocessor * "
+      "multiProcessorCount") {
+    int max_blocks;
+    HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks,
+                                                           reinterpret_cast<void*>(kernel), 1, 0));
+    const unsigned int multiproc_count =
+        GetDeviceAttribute(0, hipDeviceAttributeMultiprocessorCount);
+    const unsigned int dim = std::ceil(std::cbrt(max_blocks * multiproc_count));
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{dim, dim, dim},
+                                               dim3{1, 1, 1}, nullptr, 0, nullptr),
+                    hipErrorCooperativeLaunchTooLarge);
+  }
+
+  SECTION("sharedMemBytes > maxSharedMemoryPerBlock") {
+    const unsigned int max = GetDeviceAttribute(0, hipDeviceAttributeMaxSharedMemoryPerBlock) + 1u;
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                               dim3{1, 1, 1}, nullptr, max, nullptr),
+                    hipErrorCooperativeLaunchTooLarge);
+  }
+
+  SECTION("Invalid stream") {
+    hipStream_t stream = nullptr;
+    HIP_CHECK(hipStreamCreate(&stream));
+    HIP_CHECK(hipStreamDestroy(stream));
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
+                                               dim3{1, 1, 1}, nullptr, 0, stream),
+                    hipErrorContextIsDestroyed);
+  }
+}
@@ -0,0 +1,159 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "execution_control_common.hh"
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <resource_guards.hh>
+#include <utils.hh>
+
+TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Positive_Basic") {
+  if (!DeviceAttributesSupport(0, hipDeviceAttributeCooperativeLaunch)) {
+    HipTest::HIP_SKIP_TEST("CooperativeLaunch not supported");
+    return;
+  }
+
+  const auto device_count = HipTest::getDeviceCount();
+
+  std::vector<hipLaunchParams> params_list(device_count);
+
+  int device = 0;
+  for (auto& params : params_list) {
+    params.func = reinterpret_cast<void*>(coop_kernel);
+    params.gridDim = dim3{1, 1, 1};
+    params.blockDim = dim3{1, 1, 1};
+    params.args = nullptr;
+    params.sharedMem = 0;
+    HIP_CHECK(hipSetDevice(device++));
+    HIP_CHECK(hipStreamCreate(&params.stream));
+  }
+
+  HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(params_list.data(), device_count, 0u));
+
+  for (const auto params : params_list) {
+    HIP_CHECK(hipStreamSynchronize(params.stream));
+  }
+
+  for (const auto params : params_list) {
+    HIP_CHECK(hipStreamDestroy(params.stream));
+  }
+}
+
+TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Negative_Parameters") {
+  if (!DeviceAttributesSupport(0, hipDeviceAttributeCooperativeLaunch)) {
+    HipTest::HIP_SKIP_TEST("CooperativeLaunch not supported");
+    return;
+  }
+
+  const auto device_count = HipTest::getDeviceCount();
+
+  std::vector<hipLaunchParams> params_list(device_count);
+
+  int device = 0;
+  for (auto& params : params_list) {
+    params.func = reinterpret_cast<void*>(coop_kernel);
+    params.gridDim = dim3{1, 1, 1};
+    params.blockDim = dim3{1, 1, 1};
+    params.args = nullptr;
+    params.sharedMem = 0;
+    HIP_CHECK(hipSetDevice(device++));
+    HIP_CHECK(hipStreamCreate(&params.stream));
+  }
+
+  SECTION("launchParamsList == nullptr") {
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernelMultiDevice(nullptr, device_count, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("numDevices == 0") {
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernelMultiDevice(params_list.data(), 0, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("numDevices > device count") {
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernelMultiDevice(params_list.data(), device_count + 1, 0u),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("invalid flags") {
+    HIP_CHECK_ERROR(hipLaunchCooperativeKernelMultiDevice(params_list.data(), device_count, 999),
+                    hipErrorInvalidValue);
+  }
+
+  if (device_count > 1) {
+    SECTION("launchParamsList.func doesn't match across all devices") {
+      params_list[1].func = reinterpret_cast<void*>(kernel);
+      HIP_CHECK_ERROR(hipLaunchCooperativeKernelMultiDevice(params_list.data(), device_count, 0u),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("launchParamsList.gridDim doesn't match across all kernels") {
+      params_list[1].gridDim = dim3{2, 2, 2};
+      HIP_CHECK_ERROR(hipLaunchCooperativeKernelMultiDevice(params_list.data(), device_count, 0u),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("launchParamsList.blockDim doesn't match across all kernels") {
+      params_list[1].blockDim = dim3{2, 2, 2};
+      HIP_CHECK_ERROR(hipLaunchCooperativeKernelMultiDevice(params_list.data(), device_count, 0u),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("launchParamsList.sharedMem doesn't match across all kernels") {
+      params_list[1].sharedMem = 1024;
+      HIP_CHECK_ERROR(hipLaunchCooperativeKernelMultiDevice(params_list.data(), device_count, 0u),
+                      hipErrorInvalidValue);
+    }
+  }
+
+  for (const auto params : params_list) {
+    HIP_CHECK(hipStreamDestroy(params.stream));
+  }
+}
+
+TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Negative_MultiKernelSameDevice") {
+  if (!DeviceAttributesSupport(0, hipDeviceAttributeCooperativeLaunch)) {
+    HipTest::HIP_SKIP_TEST("CooperativeLaunch not supported");
+    return;
+  }
+
+  HIP_CHECK(hipSetDevice(0));
+
+  std::vector<hipLaunchParams> params_list(2);
+
+  for (auto& params : params_list) {
+    params.func = reinterpret_cast<void*>(coop_kernel);
+    params.gridDim = dim3{1, 1, 1};
+    params.blockDim = dim3{1, 1, 1};
+    params.args = nullptr;
+    params.sharedMem = 0;
+    HIP_CHECK(hipStreamCreate(&params.stream));
+  }
+
+  HIP_CHECK_ERROR(hipLaunchCooperativeKernelMultiDevice(params_list.data(), 2, 0u),
+                  hipErrorInvalidValue);
+
+  for (const auto params : params_list) {
+    HIP_CHECK(hipStreamDestroy(params.stream));
+  }
+}
@@ -32,6 +32,7 @@ set(TEST_SRC
  hipGraph.cc
  hipSimpleGraphWithKernel.cc
  hipGraphAddMemcpyNode.cc
+  hipGraphAddMemcpyNode_old.cc
  hipGraphClone.cc
  hipGraphInstantiateWithFlags.cc
  hipGraphAddHostNode.cc
@@ -54,6 +55,7 @@ set(TEST_SRC
  hipGraphAddMemcpyNode1D.cc
  hipGraphAddChildGraphNode.cc
  hipGraphNodeGetType.cc
+  hipGraphExecMemcpyNodeSetParams1D_old.cc
  hipGraphExecMemcpyNodeSetParams1D.cc
  hipGraphGetEdges.cc
  hipGraphGetEdges_old.cc
@@ -71,7 +73,10 @@ set(TEST_SRC
  hipGraphEventRecordNodeSetEvent.cc
  hipGraphEventWaitNodeGetEvent.cc
  hipGraphExecMemcpyNodeSetParams.cc
+  hipGraphExecMemcpyNodeSetParams_old.cc
  hipStreamBeginCapture.cc
+  hipGraphAddMemcpyNode1D_old.cc
+  hipGraphAddMemcpyNode1D.cc
  hipStreamBeginCapture_old.cc
  hipStreamIsCapturing.cc
  hipStreamIsCapturing_old.cc)
@@ -98,13 +103,16 @@ set(TEST_SRC
  hipGraphAddMemsetNode.cc
  hipGraphAddKernelNode.cc
  hipGraphMemcpyNodeGetParams.cc
+  hipGraphMemcpyNodeGetParams_old.cc
  hipGraphMemcpyNodeSetParams.cc
+  hipGraphMemcpyNodeSetParams_old.cc
  hipGraphKernelNodeGetParams.cc
  hipGraphKernelNodeSetParams.cc
  hipGraphExecKernelNodeSetParams.cc
  hipGraphLaunch.cc
  hipGraphLaunch_old.cc
  hipGraphMemcpyNodeSetParams1D.cc
+  hipGraphMemcpyNodeSetParams1D_old.cc
  hipGraphExecMemcpyNodeSetParamsToSymbol_old.cc
  hipGraphExecMemcpyNodeSetParamsToSymbol.cc
  hipGraphNodeGetDependentNodes.cc
@@ -1,576 +1,287 @@
 /*
 Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/**
-Testcase Scenarios : Negative
-1) Pass pGraphNode as nullptr and check if api returns error.
-2) When graph is un-initialized argument(skipping graph creation),
-   api should return error code.
-3) Passing pDependencies as nullptr, api should return success.
-4) When numDependencies is max(size_t) and pDependencies is not valid ptr,
-   api expected to return error code.
-5) When pDependencies is nullptr, but numDependencies is non-zero,
-   api expected to return error.
-6) When pCopyParams is nullptr, api expected to return error code.
-7) API expects atleast one memcpy src pointer to be set.
-   When hipMemcpy3DParms::srcArray and hipMemcpy3DParms::srcPtr.ptr both
-   are nullptr, api expected to return error code.
-8) API expects atleast one memcpy dst pointer to be set.
-   When hipMemcpy3DParms::dstArray and hipMemcpy3DParms::dstPtr.ptr both
-   are nullptr, api expected to return error code.
-9) Passing different element size for hipMemcpy3DParms::srcArray and
-   hipMemcpy3DParms::dstArray is expected to return error code.
-
-Testcase Scenarios : Functional
-1) Add memcpy node to graph and verify memcpy operation is success for all
-   memcpy kinds(H2D, D2H and D2D).
-   Memcpy nodes are added and assigned to default device.
-2) Perform memcpy operation for 1D, 2D and 3D arrays on default device and
-   verify the results.
-3) Add memcpy node to graph and verify memcpy operation is success for all
-   memcpy kinds(H2D, D2H and D2D).
-   Memcpy nodes are added and assigned to Peer device.
-4) Perform memcpy operation for 1D, 2D and 3D arrays on Peer device and
-   verify the results.
-5) Create two host pointers, copy the data between them by the api
-   hipGraphAddMemcpyNode with data transfer kind hipMemcpyHostToHost.
-   Validate the output.
-*/
+#include <functional>

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
-#include <vector>
-#include <numeric>
+#include <hip_test_defgroups.hh>
+#include <memcpy3d_tests_common.hh>

-#define ZSIZE 32
-#define YSIZE 32
-#define XSIZE 32
+#include "graph_tests_common.hh"

-/* Test verifies hipGraphAddMemcpyNode API Negative scenarios.
+/**
+ * @addtogroup hipGraphAddMemcpyNode hipGraphAddMemcpyNode
+ * @{
+ * @ingroup GraphTest
+ * `hipGraphAddMemcpyNode(hipGraphNode_t *pGraphNode, hipGraph_t graph, const
+ * hipGraphNode_t *pDependencies, size_t numDependencies, const hipMemcpy3DParms
+ * *pCopyParams)` - Creates a memcpy node and adds it to a graph
 */

-TEST_CASE("Unit_hipGraphAddMemcpyNode_Negative") {
-  CHECK_IMAGE_SUPPORT
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify basic API behavior. A Memcpy node is created with parameters set according to the
+ * test run, after which the graph is run and the memcpy results are verified.
+ * The test is run for all possible memcpy directions, with both the corresponding memcpy
+ * kind and hipMemcpyDefault, as well as half page and full page allocation sizes.
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphAddMemcpyNode.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphAddMemcpyNode_Positive_Basic") {
+  constexpr bool async = false;

-  constexpr int width{10}, height{10}, depth{10};
-  hipArray_t devArray1;
-  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
-  hipMemcpy3DParms myparams;
-  uint32_t size = width * height * depth * sizeof(int);
-  hipGraph_t graph;
-  hipGraphNode_t memcpyNode;
-  hipStream_t streamForGraph;
-  hipError_t ret;
+  SECTION("Device to host") { Memcpy3DDeviceToHostShell<async>(Memcpy3DWrapper<async, true>); }

-  int *hData = reinterpret_cast<int*>(malloc(size));
-  int *hOutputData = reinterpret_cast<int *>(malloc(size));
+  SECTION("Device to host with default kind") {
+    Memcpy3DDeviceToHostShell<async>(Memcpy3DWrapper<async, true>);
+  }

-  REQUIRE(hData != nullptr);
-  REQUIRE(hOutputData != nullptr);
-  memset(hData, 0, size);
-  memset(hOutputData, 0,  size);
+  SECTION("Host to device") { Memcpy3DHostToDeviceShell<async>(Memcpy3DWrapper<async, true>); }

-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-  HIP_CHECK(hipGraphCreate(&graph, 0));
+  SECTION("Host to device with default kind") {
+    Memcpy3DHostToDeviceShell<async>(Memcpy3DWrapper<async, true>);
+  }

-  // Initialize host buffer
-  for (int i = 0; i < depth; i++) {
-    for (int j = 0; j < height; j++) {
-      for (int k = 0; k < width; k++) {
-        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
-      }
+  SECTION("Host to host") { Memcpy3DHostToHostShell<async>(Memcpy3DWrapper<async, true>); }
+
+  SECTION("Host to host with default kind") {
+    Memcpy3DHostToHostShell<async>(Memcpy3DWrapper<async, true>);
+  }
+
+  SECTION("Device to device") {
+    SECTION("Peer access enabled") {
+      Memcpy3DDeviceToDeviceShell<async, true>(Memcpy3DWrapper<async, true>);
+    }
+    SECTION("Peer access disabled") {
+      Memcpy3DDeviceToDeviceShell<async, false>(Memcpy3DWrapper<async, true>);
    }
  }

-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
-                                                          0, 0, 0, formatKind);
-  HIP_CHECK(hipMalloc3DArray(&devArray1, &channelDesc,
-                       make_hipExtent(width, height, depth), hipArrayDefault));
-
-  // Host to Device
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.extent = make_hipExtent(width , height, depth);
-  myparams.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
-                                      width, height);
-  myparams.dstArray = devArray1;
-  myparams.kind = hipMemcpyHostToDevice;
-
-  SECTION("Pass pGraphNode as nullptr") {
-    ret = hipGraphAddMemcpyNode(nullptr, graph, nullptr, 0, &myparams);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("When graph is nullptr") {
-    ret = hipGraphAddMemcpyNode(&memcpyNode, nullptr,  nullptr, 0, &myparams);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Passing pDependencies as nullptr") {
-    ret = hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams);
-    REQUIRE(hipSuccess == ret);
-  }
-  SECTION("When numDependencies is max and pDependencies is not valid ptr") {
-    ret = hipGraphAddMemcpyNode(&memcpyNode, graph,
-                                nullptr, INT_MAX, &myparams);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("When pDependencies is nullptr, but numDependencies is non-zero") {
-    ret = hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 11, &myparams);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass pCopyParams as nullptr") {
-    ret = hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, nullptr);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("API expects atleast one memcpy src pointer to be set") {
-    memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-    myparams.srcPos = make_hipPos(0, 0, 0);
-    myparams.dstPos = make_hipPos(0, 0, 0);
-    myparams.extent = make_hipExtent(width , height, depth);
-    myparams.dstArray = devArray1;
-    myparams.kind = hipMemcpyHostToDevice;
-    ret = hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("API expects atleast one memcpy dst pointer to be set") {
-    memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-    myparams.srcPos = make_hipPos(0, 0, 0);
-    myparams.dstPos = make_hipPos(0, 0, 0);
-    myparams.extent = make_hipExtent(width , height, depth);
-    myparams.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
-                                      width, height);
-    myparams.kind = hipMemcpyHostToDevice;
-    ret = hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Passing different element size for hipMemcpy3DParms::srcArray"
-                   "and hipMemcpy3DParms::dstArray") {
-    myparams.srcArray = devArray1;
-    hipArray_t devArray2;
-    HIP_CHECK(hipMalloc3DArray(&devArray2, &channelDesc,
-              make_hipExtent(width+1, height+1, depth+1), hipArrayDefault));
-    myparams.dstArray = devArray2;
-    ret = hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams);
-    REQUIRE(hipErrorInvalidValue == ret);
-    HIP_CHECK(hipFreeArray(devArray2));
-  }
-
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
-  HIP_CHECK(hipFreeArray(devArray1));
-  free(hData);
-  free(hOutputData);
-}
-
-static void validateMemcpyNode3DArray(bool peerAccess = false) {
-  constexpr int width{10}, height{10}, depth{10};
-  hipArray_t devArray1, devArray2;
-  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
-  hipMemcpy3DParms myparams;
-  uint32_t size = width * height * depth * sizeof(int);
-  hipGraph_t graph;
-  hipGraphNode_t memcpyNode;
-  std::vector<hipGraphNode_t> dependencies;
-  hipStream_t streamForGraph;
-  hipGraphExec_t graphExec;
-
-  HIP_CHECK(hipSetDevice(0));
-  int *hData = reinterpret_cast<int*>(malloc(size));
-  int *hOutputData = reinterpret_cast<int *>(malloc(size));
-
-  REQUIRE(hData != nullptr);
-  REQUIRE(hOutputData != nullptr);
-  memset(hData, 0, size);
-  memset(hOutputData, 0,  size);
-
-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-
-  // Initialize host buffer
-  for (int i = 0; i < depth; i++) {
-    for (int j = 0; j < height; j++) {
-      for (int k = 0; k < width; k++) {
-        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
-      }
+  SECTION("Device to device with default kind") {
+    SECTION("Peer access enabled") {
+      Memcpy3DDeviceToDeviceShell<async, true>(Memcpy3DWrapper<async, true>);
+    }
+    SECTION("Peer access disabled") {
+      Memcpy3DDeviceToDeviceShell<async, false>(Memcpy3DWrapper<async, true>);
    }
  }

-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
-                                                          0, 0, 0, formatKind);
-  HIP_CHECK(hipMalloc3DArray(&devArray1, &channelDesc,
-                       make_hipExtent(width, height, depth), hipArrayDefault));
-  HIP_CHECK(hipMalloc3DArray(&devArray2, &channelDesc,
-                       make_hipExtent(width, height, depth), hipArrayDefault));
-  HIP_CHECK(hipGraphCreate(&graph, 0));
+  SECTION("Array from/to Host") { Memcpy3DArrayHostShell<async>(Memcpy3DWrapper<async, true>); }

-  // For peer access test, Memory is allocated on device(0)
-  // while memcpy nodes are allocated and assigned to peer device(1)
-  if (peerAccess) {
-    HIP_CHECK(hipSetDevice(1));
-  }
-
-  // Host to Device
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.extent = make_hipExtent(width , height, depth);
-  myparams.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
-                                      width, height);
-  myparams.dstArray = devArray1;
-  myparams.kind = hipMemcpyHostToDevice;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams));
-  dependencies.push_back(memcpyNode);
-
-  // Device to Device
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.srcArray = devArray1;
-  myparams.dstArray = devArray2;
-  myparams.extent = make_hipExtent(width, height, depth);
-  myparams.kind = hipMemcpyDeviceToDevice;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
-                                             dependencies.size(), &myparams));
-  dependencies.clear();
-  dependencies.push_back(memcpyNode);
-
-  // Device to host
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.dstPtr = make_hipPitchedPtr(hOutputData, width * sizeof(int),
-                                      width, height);
-  myparams.srcArray = devArray2;
-  myparams.extent = make_hipExtent(width, height, depth);
-  myparams.kind = hipMemcpyDeviceToHost;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
-                                             dependencies.size(), &myparams));
-
-  // Instantiate and launch the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-  HIP_CHECK(hipStreamSynchronize(streamForGraph));
-
-  // Check result
-  HipTest::checkArray(hData, hOutputData, width, height, depth);
-
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
-  HIP_CHECK(hipFreeArray(devArray1));
-  HIP_CHECK(hipFreeArray(devArray2));
-  free(hData);
-  free(hOutputData);
-}
-
-static void validateMemcpyNode2DArray(bool peerAccess = false) {
-  int harray2D[YSIZE][XSIZE]{};
-  int harray2Dres[YSIZE][XSIZE]{};
-  constexpr int width{XSIZE}, height{YSIZE};
-  hipArray_t devArray1, devArray2;
-  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
-  hipMemcpy3DParms myparams;
-  hipGraph_t graph;
-  hipGraphNode_t memcpyNode;
-  std::vector<hipGraphNode_t> dependencies;
-  hipStream_t streamForGraph;
-  hipGraphExec_t graphExec;
-
-  HIP_CHECK(hipSetDevice(0));
-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-  // Initialize 2D object
-  for (int i = 0; i < YSIZE; i++) {
-    for (int j = 0; j < XSIZE; j++) {
-      harray2D[i][j] = i + j + 1;
-    }
-  }
-
-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
-                                                          0, 0, 0, formatKind);
-  // Allocate 2D device array by passing depth(0)
-  HIP_CHECK(hipMalloc3DArray(&devArray1, &channelDesc,
-                       make_hipExtent(width, height, 0), hipArrayDefault));
-  HIP_CHECK(hipMalloc3DArray(&devArray2, &channelDesc,
-                       make_hipExtent(width, height, 0), hipArrayDefault));
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-
-  // For peer access test, Memory is allocated on device(0)
-  // while memcpy nodes are allocated and assigned to peer device(1)
-  if (peerAccess) {
-    HIP_CHECK(hipSetDevice(1));
-  }
-
-  // Host to Device
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.extent = make_hipExtent(width, height, 1);
-  myparams.srcPtr = make_hipPitchedPtr(harray2D, width * sizeof(int),
-                                      width, height);
-  myparams.dstArray = devArray1;
-  myparams.kind = hipMemcpyHostToDevice;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams));
-  dependencies.push_back(memcpyNode);
-
-  // Device to Device
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.srcArray = devArray1;
-  myparams.dstArray = devArray2;
-  myparams.extent = make_hipExtent(width, height, 1);
-  myparams.kind = hipMemcpyDeviceToDevice;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
-                                             dependencies.size(), &myparams));
-  dependencies.clear();
-  dependencies.push_back(memcpyNode);
-
-  // Device to host
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.extent = make_hipExtent(width, height, 1);
-  myparams.dstPtr = make_hipPitchedPtr(harray2Dres, width * sizeof(int),
-                                      width, height);
-  myparams.srcArray = devArray2;
-  myparams.kind = hipMemcpyDeviceToHost;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
-                                             dependencies.size(), &myparams));
-
-  // Instantiate and launch the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-  HIP_CHECK(hipStreamSynchronize(streamForGraph));
-
-  // Validate result
-  for (int i = 0; i < YSIZE; i++) {
-    for (int j = 0; j < XSIZE; j++) {
-      if (harray2D[i][j] != harray2Dres[i][j]) {
-        INFO("harray2D: " << harray2D[i][j] << "harray2Dres: "
-              << harray2Dres[i][j] << " mismatch at (i,j) : " << i << j);
-        REQUIRE(false);
-      }
-    }
-  }
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
-  HIP_CHECK(hipFreeArray(devArray1));
-  HIP_CHECK(hipFreeArray(devArray2));
-}
-
-static void validateMemcpyNode1DArray(bool peerAccess = false) {
-  int harray1D[XSIZE]{};
-  int harray1Dres[XSIZE]{};
-  constexpr int width{XSIZE};
-  hipArray_t devArray1, devArray2;
-  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
-  hipMemcpy3DParms myparams;
-  hipGraph_t graph;
-  hipGraphNode_t memcpyNode;
-  std::vector<hipGraphNode_t> dependencies;
-  hipStream_t streamForGraph;
-  hipGraphExec_t graphExec;
-
-  HIP_CHECK(hipSetDevice(0));
-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-  // Initialize 1D object
-  for (int i = 0; i < XSIZE; i++) {
-    harray1D[i] = i + 1;
-  }
-
-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
-                                                          0, 0, 0, formatKind);
-  // Allocate 1D device array by passing depth(0), height(0)
-  HIP_CHECK(hipMalloc3DArray(&devArray1, &channelDesc,
-                       make_hipExtent(width, 0, 0), hipArrayDefault));
-  HIP_CHECK(hipMalloc3DArray(&devArray2, &channelDesc,
-                       make_hipExtent(width, 0, 0), hipArrayDefault));
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-
-  // For peer access test, Memory is allocated on device(0)
-  // while memcpy nodes are allocated and assigned to peer device(1)
-  if (peerAccess) {
-    HIP_CHECK(hipSetDevice(1));
-  }
-
-  // Host to Device
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.extent = make_hipExtent(width, 1, 1);
-  myparams.srcPtr = make_hipPitchedPtr(harray1D, width * sizeof(int),
-                                      width, 1);
-  myparams.dstArray = devArray1;
-  myparams.kind = hipMemcpyHostToDevice;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams));
-  dependencies.push_back(memcpyNode);
-
-  // Device to Device
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.srcArray = devArray1;
-  myparams.dstArray = devArray2;
-  myparams.extent = make_hipExtent(width, 1, 1);
-  myparams.kind = hipMemcpyDeviceToDevice;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
-                                             dependencies.size(), &myparams));
-  dependencies.clear();
-  dependencies.push_back(memcpyNode);
-
-  // Device to host
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.extent = make_hipExtent(width, 1, 1);
-  myparams.dstPtr = make_hipPitchedPtr(harray1Dres, width * sizeof(int),
-                                      width, 1);
-  myparams.srcArray = devArray2;
-  myparams.kind = hipMemcpyDeviceToHost;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
-                                              dependencies.size(), &myparams));
-
-  // Instantiate and launch the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-  HIP_CHECK(hipStreamSynchronize(streamForGraph));
-
-  // Validate result
-  for (int i = 0; i < XSIZE; i++) {
-    if (harray1D[i] != harray1Dres[i]) {
-      INFO("harray1D: " << harray1D[i] << " harray1Dres: " << harray1Dres[i]
-            << " mismatch at : " << i);
-      REQUIRE(false);
-    }
-  }
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
-  HIP_CHECK(hipFreeArray(devArray1));
-  HIP_CHECK(hipFreeArray(devArray2));
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-220
+  SECTION("Array from/to Device") { Memcpy3DArrayDeviceShell<async>(Memcpy3DWrapper<async, true>); }
+#endif
 }

 /**
- * Basic Functional Tests adds memcpy nodes of types H2D, D2D and D2H to graph
- * and verifies execution sequence by launching graph on default device.
- * Tests also verify memcpy node addition with 1D, 2D and 3D objects.
+ * Test Description
+ * ------------------------
+ *    - Verify API behaviour with invalid arguments:
+ *        -# node is nullptr
+ *        -# graph is nullptr
+ *        -# pDependencies is nullptr when numDependencies is not zero
+ *        -# A node in pDependencies originates from a different graph
+ *        -# numDependencies is invalid
+ *        -# A node is duplicated in pDependencies
+ *        -# dst is nullptr
+ *        -# src is nullptr
+ *        -# kind is an invalid enum value
+ *        -# count is zero
+ *        -# count is larger than dst allocation size
+ *        -# count is larger than src allocation size
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphAddMemcpyNode.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
 */
-TEST_CASE("Unit_hipGraphAddMemcpyNode_BasicFunctional") {
-  CHECK_IMAGE_SUPPORT
+TEST_CASE("Unit_hipGraphAddMemcpyNode_Negative_Parameters") {
+  using namespace std::placeholders;

-  SECTION("Memcpy with 3D array on default device") {
-    validateMemcpyNode3DArray();
+  constexpr hipExtent extent{128 * sizeof(int), 128, 8};
+
+  constexpr auto NegativeTests = [](hipPitchedPtr dst_ptr, hipPos dst_pos, hipPitchedPtr src_ptr,
+                                    hipPos src_pos, hipExtent extent, hipMemcpyKind kind) {
+    hipGraph_t graph = nullptr;
+    HIP_CHECK(hipGraphCreate(&graph, 0));
+    hipGraphNode_t node = nullptr;
+
+    auto params = GetMemcpy3DParms(dst_ptr, dst_pos, src_ptr, src_pos, extent, kind);
+    GraphAddNodeCommonNegativeTests(std::bind(hipGraphAddMemcpyNode, _1, _2, _3, _4, &params),
+                                    graph);
+
+    SECTION("dst_ptr.ptr == nullptr") {
+      hipPitchedPtr invalid_ptr = dst_ptr;
+      invalid_ptr.ptr = nullptr;
+      auto params = GetMemcpy3DParms(invalid_ptr, dst_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("src_ptr.ptr == nullptr") {
+      hipPitchedPtr invalid_ptr = src_ptr;
+      invalid_ptr.ptr = nullptr;
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, invalid_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("dst_ptr.pitch < width") {
+      hipPitchedPtr invalid_ptr = dst_ptr;
+      invalid_ptr.pitch = extent.width - 1;
+      auto params = GetMemcpy3DParms(invalid_ptr, dst_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidPitchValue);
+    }
+
+    SECTION("src_ptr.pitch < width") {
+      hipPitchedPtr invalid_ptr = src_ptr;
+      invalid_ptr.pitch = extent.width - 1;
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, invalid_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidPitchValue);
+    }
+
+    SECTION("dst_ptr.pitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      hipPitchedPtr invalid_ptr = dst_ptr;
+      invalid_ptr.pitch = attr;
+      auto params = GetMemcpy3DParms(invalid_ptr, dst_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("src_ptr.pitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      hipPitchedPtr invalid_ptr = src_ptr;
+      invalid_ptr.pitch = attr;
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, invalid_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("extent.width + dst_pos.x > dst_ptr.pitch") {
+      hipPos invalid_pos = dst_pos;
+      invalid_pos.x = dst_ptr.pitch - extent.width + 1;
+      auto params = GetMemcpy3DParms(dst_ptr, invalid_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("extent.width + src_pos.x > src_ptr.pitch") {
+      hipPos invalid_pos = src_pos;
+      invalid_pos.x = src_ptr.pitch - extent.width + 1;
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, src_ptr, invalid_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("dst_pos.y out of bounds") {
+      hipPos invalid_pos = dst_pos;
+      invalid_pos.y = 1;
+      auto params = GetMemcpy3DParms(dst_ptr, invalid_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("src_pos.y out of bounds") {
+      hipPos invalid_pos = src_pos;
+      invalid_pos.y = 1;
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, src_ptr, invalid_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("dst_pos.z out of bounds") {
+      hipPos invalid_pos = dst_pos;
+      invalid_pos.z = 1;
+      auto params = GetMemcpy3DParms(dst_ptr, invalid_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("src_pos.z out of bounds") {
+      hipPos invalid_pos = src_pos;
+      invalid_pos.z = 1;
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, src_ptr, invalid_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("Invalid MemcpyKind") {
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, src_ptr, src_pos, extent,
+                                     static_cast<hipMemcpyKind>(-1));
+      HIP_CHECK_ERROR(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params),
+                      hipErrorInvalidMemcpyDirection);
+    }
+
+    HIP_CHECK(hipGraphDestroy(graph));
+  };
+
+  SECTION("Host to Device") {
+    LinearAllocGuard3D<int> device_alloc(extent);
+    LinearAllocGuard<int> host_alloc(
+        LinearAllocs::hipHostMalloc,
+        device_alloc.pitch() * device_alloc.height() * device_alloc.depth());
+    NegativeTests(device_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+                  make_hipPitchedPtr(host_alloc.ptr(), device_alloc.pitch(), device_alloc.width(),
+                                     device_alloc.height()),
+                  make_hipPos(0, 0, 0), extent, hipMemcpyHostToDevice);
  }

-  SECTION("Memcpy with 2D array on default device") {
-    validateMemcpyNode2DArray();
+  SECTION("Device to Host") {
+    LinearAllocGuard3D<int> device_alloc(extent);
+    LinearAllocGuard<int> host_alloc(
+        LinearAllocs::hipHostMalloc,
+        device_alloc.pitch() * device_alloc.height() * device_alloc.depth());
+    NegativeTests(make_hipPitchedPtr(host_alloc.ptr(), device_alloc.pitch(), device_alloc.width(),
+                                     device_alloc.height()),
+                  make_hipPos(0, 0, 0), device_alloc.pitched_ptr(), make_hipPos(0, 0, 0), extent,
+                  hipMemcpyDeviceToHost);
  }

-  SECTION("Memcpy with 1D array on default device") {
-    validateMemcpyNode1DArray();
+  SECTION("Host to Host") {
+    LinearAllocGuard<int> src_alloc(LinearAllocs::hipHostMalloc,
+                                    extent.width * extent.height * extent.depth);
+    LinearAllocGuard<int> dst_alloc(LinearAllocs::hipHostMalloc,
+                                    extent.width * extent.height * extent.depth);
+    NegativeTests(make_hipPitchedPtr(dst_alloc.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPos(0, 0, 0),
+                  make_hipPitchedPtr(src_alloc.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPos(0, 0, 0), extent, hipMemcpyHostToHost);
+  }
+
+  SECTION("Device to Device") {
+    LinearAllocGuard3D<int> src_alloc(extent);
+    LinearAllocGuard3D<int> dst_alloc(extent);
+    NegativeTests(dst_alloc.pitched_ptr(), make_hipPos(0, 0, 0), src_alloc.pitched_ptr(),
+                  make_hipPos(0, 0, 0), extent, hipMemcpyDeviceToDevice);
  }
 }
-
-/**
- * Peer access tests adds and assigns memcpy nodes of types H2D, D2D and D2H
- * to peer device. Memory allocations happen on device(0) and memcpy operations
- * are performed from device(1).
- * Tests also verify memcpy node addition with 1D, 2D and 3D objects.
- */
-TEST_CASE("Unit_hipGraphAddMemcpyNode_PeerAccessFunctional") {
-  CHECK_IMAGE_SUPPORT
-
-  int numDevices{}, peerAccess{};
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
-  }
-
-  if (!peerAccess) {
-    WARN("Skipping test as peer device access is not found!");
-    return;
-  }
-
-  SECTION("Memcpy with 3D array on peer device") {
-    validateMemcpyNode3DArray(true);
-  }
-
-  SECTION("Memcpy with 2D array on peer device") {
-    validateMemcpyNode2DArray(true);
-  }
-
-  SECTION("Memcpy with 1D array on peer device") {
-    validateMemcpyNode1DArray(true);
-  }
-}
-/*
-* Create two host pointers, copy the data between them by the api
-* hipGraphAddMemcpyNode with data transfer kind hipMemcpyHostToHost.
-* Validate the output.
-*/
-TEST_CASE("Unit_hipGraphAddMemcpyNode_HostToHost") {
-  constexpr size_t size = 1024;
-  size_t numW = size * sizeof(int);
-  // Host Vectors
-  std::vector<int> A_h(numW);
-  std::vector<int> B_h(numW);
-  // Initialization
-  std::iota(A_h.begin(), A_h.end(), 0);
-  std::fill_n(B_h.begin(), size, 0);
-
-  hipGraph_t graph;
-  hipStream_t streamForGraph;
-  hipGraphExec_t graphExec;
-  hipGraphNode_t memcpyH2H;
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-
-  hipMemcpy3DParms myparms{};
-  myparms.srcPos = make_hipPos(0, 0, 0);
-  myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.srcPtr = make_hipPitchedPtr(A_h.data(), numW, numW, 1);
-  myparms.dstPtr = make_hipPitchedPtr(B_h.data(), numW, numW, 1);
-  myparms.extent = make_hipExtent(numW, 1, 1);
-  myparms.kind = hipMemcpyHostToHost;
-
-  // Host to Host
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyH2H, graph, nullptr,
-                                            0, &myparms));
-
-  // Instantiate and launch the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-  HIP_CHECK(hipStreamSynchronize(streamForGraph));
-
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
-
-  // Validation
-  REQUIRE(memcmp(A_h.data(), B_h.data(), numW) == 0);
-}
@@ -6,237 +6,179 @@ in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/**
-Testcase Scenarios :
-Functional -
-1) Add 1D memcpy node to graph and verify memcpy operation is success for all memcpy kinds(H2D, D2H and D2D).
- Memcpy nodes are added and assigned to default device.
-2) Allocate memory on default device(Dev 0), Perform memcpy operation for 1D arrays on Peer device(Dev 1) and
- verify the results.
-3) Create two host pointers, copy the data between them by the api hipGraphAddMemcpyNode1D with data transfer
- kind hipMemcpyHostToHost. Validate the output.
-
-Negative -
-1) Pass pGraphNode as nullptr and check if api returns error.
-2) When graph is un-initialized argument(skipping graph creation), api should return error code.
-3) Passing pDependencies as nullptr, api should return success.
-4) When numDependencies is max(size_t) and pDependencies is not valid ptr, api expected to return error code.
-5) When pDependencies is nullptr, but numDependencies is non-zero, api expected to return error.
-6) When destination ptr  is nullptr, api expected to return error code.
-7) When source ptr is nullptr, api expected to return error code.
-8) If count is more than allocated size for source and destination ptr, error code is returned.
-9) If count is less than or equal to allocated size of source and destination ptr, api should return success.
-*/
+#include <functional>

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
-#include <vector>
-#include <numeric>
-
-static void validateMemcpyNode1DArray(bool peerAccess) {
-  constexpr int SIZE{32};
-  int harray1D[SIZE]{};
-  int harray1Dres[SIZE]{};
-  hipGraph_t graph;
-  hipArray_t devArray1, devArray2;
-  hipGraphNode_t memcpyH2D, memcpyD2H, memcpyD2D;
-  constexpr int numBytes{SIZE * sizeof(int)};
-  hipStream_t streamForGraph;
-  hipGraphExec_t graphExec;
-
-  HIP_CHECK(hipSetDevice(0));
-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-  HIP_CHECK(hipMalloc(&devArray1, numBytes));
-  HIP_CHECK(hipMalloc(&devArray2, numBytes));
-
-  // Initialize 1D object
-  for (int i = 0; i < SIZE; i++) {
-    harray1D[i] = i + 1;
-  }
-
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-
-  // For peer access test, Memory is allocated on device(0)
-  // while memcpy nodes are allocated and assigned to peer device(1)
-  if (peerAccess) {
-    HIP_CHECK(hipSetDevice(1));
-  }
-
-  // Host to Device (harray1D -> devArray1)
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0,
-                     devArray1, harray1D, numBytes, hipMemcpyHostToDevice));
-
-  // Device to Device (devArray1 -> devArray2)
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2D, graph, &memcpyH2D, 1,
-                     devArray2, devArray1, numBytes, hipMemcpyDeviceToDevice));
-
-  // Device to host (devArray2 -> harray1Dres)
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, &memcpyD2D, 1,
-                     harray1Dres, devArray2, numBytes, hipMemcpyDeviceToHost));
-
-  // Instantiate and launch the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-  HIP_CHECK(hipStreamSynchronize(streamForGraph));
-
-  // Validate result
-  for (int i = 0; i < SIZE; i++) {
-    if (harray1D[i] != harray1Dres[i]) {
-      INFO("harray1D: " << harray1D[i] << " harray1Dres: " << harray1Dres[i]
-            << " mismatch at : " << i);
-      REQUIRE(false);
-    }
-  }
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
-  HIP_CHECK(hipFree(devArray1));
-  HIP_CHECK(hipFree(devArray2));
-}
+#include <hip_test_defgroups.hh>
+#include <memcpy1d_tests_common.hh>

+#include "graph_tests_common.hh"

 /**
- * Functional Tests adds memcpy 1D nodes of types H2D, D2D and D2H to graph
- * and verifies execution sequence by launching graph.
- *
- * For Default device test: Memory allocations and memory operations
- * are performed from device(0).
- * For Peer device test: Memory allocations happen on device(0) and memcpy operations
- * are performed from device(1).
+ * @addtogroup hipGraphAddMemcpyNode1D hipGraphAddMemcpyNode1D
+ * @{
+ * @ingroup GraphTest
+ * `hipGraphAddMemcpyNode1D(hipGraphNode_t *pGraphNode, hipGraph_t graph, const hipGraphNode_t
+ * *pDependencies, size_t numDependencies, void *dst, const void *src, size_t count, hipMemcpyKind
+ * kind)` - Creates a 1D memcpy node and adds it to a graph
 */
-TEST_CASE("Unit_hipGraphAddMemcpyNode1D_Functional") {
-  SECTION("Memcpy with 1D array on default device") {
-    validateMemcpyNode1DArray(false);
-  }
-
-  SECTION("Memcpy with 1D array on peer device") {
-    int numDevices{}, peerAccess{};
-    HIP_CHECK(hipGetDeviceCount(&numDevices));
-    if (numDevices > 1) {
-      HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
-    }
-
-    if (!peerAccess) {
-      WARN("Skipping test as peer device access is not found!");
-      return;
-    }
-    validateMemcpyNode1DArray(true);
-  }
-}
-
-

 /**
- * Negative Test for API hipGraphAddMemcpyNode1D
+ * Test Description
+ * ------------------------
+ *    - Verify basic API behavior. A Memcpy1D node is created with parameters set according to the
+ * test run, after which the graph is run and the memcpy results are verified.
+ * The test is run for all possible memcpy directions, with both the corresponding memcpy
+ * kind and hipMemcpyDefault, as well as half page and full page allocation sizes.
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphAddMemcpyNode1D.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
 */
-TEST_CASE("Unit_hipGraphAddMemcpyNode1D_Negative") {
-  constexpr size_t N = 1024;
-  constexpr size_t Nbytes = N * sizeof(int);
-  int *A_d, *A_h;
-  hipGraph_t graph;
-  hipGraphNode_t memcpyNode{};
-  hipError_t ret;
+TEST_CASE("Unit_hipGraphAddMemcpyNode1D_Positive_Basic") {
+  constexpr auto f = [](void* dst, void* src, size_t count, hipMemcpyKind direction) {
+    hipGraph_t graph = nullptr;
+    HIP_CHECK(hipGraphCreate(&graph, 0));
+    hipGraphNode_t node = nullptr;
+    HIP_CHECK(hipGraphAddMemcpyNode1D(&node, graph, nullptr, 0, dst, src, count, direction));
+    hipGraphExec_t graph_exec = nullptr;
+    HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
+    HIP_CHECK(hipGraphLaunch(graph_exec, hipStreamPerThread));
+    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));

-  HIP_CHECK(hipMalloc(&A_d, Nbytes));
-  HIP_CHECK(hipMalloc(&A_h, Nbytes));
+    HIP_CHECK(hipGraphExecDestroy(graph_exec));
+    HIP_CHECK(hipGraphDestroy(graph));
+
+    return hipSuccess;
+  };
+
+#if HT_NVIDIA
+  MemcpyWithDirectionCommonTests<false>(f);
+#else
+  using namespace std::placeholders;
+
+  SECTION("Device to host") {
+    MemcpyDeviceToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDeviceToHost));
+  }
+
+  SECTION("Device to host with default kind") {
+    MemcpyDeviceToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+  }
+
+  SECTION("Host to device") {
+    MemcpyHostToDeviceShell<false>(std::bind(f, _1, _2, _3, hipMemcpyHostToDevice));
+  }
+
+  SECTION("Host to device with default kind") {
+    MemcpyHostToDeviceShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+  }
+
+// Disabled on AMD due to defect - EXSWHTEC-209
+#if 0
+  SECTION("Host to host") {
+    MemcpyHostToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyHostToHost));
+  }
+
+  SECTION("Host to host with default kind") {
+    MemcpyHostToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+  }
+#endif
+
+  SECTION("Device to device") {
+    SECTION("Peer access enabled") {
+      MemcpyDeviceToDeviceShell<false, true>(std::bind(f, _1, _2, _3, hipMemcpyDeviceToDevice));
+    }
+    SECTION("Peer access disabled") {
+      MemcpyDeviceToDeviceShell<false, false>(std::bind(f, _1, _2, _3, hipMemcpyDeviceToDevice));
+    }
+  }
+
+  SECTION("Device to device with default kind") {
+    SECTION("Peer access enabled") {
+      MemcpyDeviceToDeviceShell<false, true>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+    }
+    SECTION("Peer access disabled") {
+      MemcpyDeviceToDeviceShell<false, false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+    }
+  }
+#endif
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify API behaviour with invalid arguments:
+ *        -# node is nullptr
+ *        -# graph is nullptr
+ *        -# pDependencies is nullptr when numDependencies is not zero
+ *        -# A node in pDependencies originates from a different graph
+ *        -# numDependencies is invalid
+ *        -# A node is duplicated in pDependencies
+ *        -# dst is nullptr
+ *        -# src is nullptr
+ *        -# kind is an invalid enum value
+ *        -# count is zero
+ *        -# count is larger than dst allocation size
+ *        -# count is larger than src allocation size
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphAddMemcpyNode1D.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphAddMemcpyNode1D_Negative_Parameters") {
+  using namespace std::placeholders;
+  hipGraph_t graph = nullptr;
  HIP_CHECK(hipGraphCreate(&graph, 0));
+  hipGraphNode_t node = nullptr;
+  int src[2] = {}, dst[2] = {};

-  SECTION("Pass pGraphNode as nullptr") {
-    ret = hipGraphAddMemcpyNode1D(nullptr, graph,
-            nullptr, 0, A_d, A_h, Nbytes, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
+  GraphAddNodeCommonNegativeTests(
+      std::bind(hipGraphAddMemcpyNode1D, _1, _2, _3, _4, dst, src, sizeof(dst), hipMemcpyDefault),
+      graph);
+
+  MemcpyWithDirectionCommonNegativeTests(
+      std::bind(hipGraphAddMemcpyNode1D, &node, graph, nullptr, 0, _1, _2, _3, _4), dst, src,
+      sizeof(dst), hipMemcpyDefault);
+
+// Disabled on AMD due to defect - EXSWHTEC-211
+#if HT_NVIDIA
+  SECTION("count == 0") {
+    HIP_CHECK_ERROR(
+        hipGraphAddMemcpyNode1D(&node, graph, nullptr, 0, dst, src, 0, hipMemcpyDefault),
+        hipErrorInvalidValue);
  }
-  SECTION("Pass graph as nullptr") {
-    ret = hipGraphAddMemcpyNode1D(&memcpyNode, nullptr,
-            nullptr, 0, A_d, A_h, Nbytes, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
+#endif
+
+  SECTION("count larger than dst allocation size") {
+    LinearAllocGuard<int> dev_dst(LinearAllocs::hipMalloc, sizeof(int));
+    HIP_CHECK_ERROR(hipGraphAddMemcpyNode1D(&node, graph, nullptr, 0, dev_dst.ptr(), src,
+                                            sizeof(src), hipMemcpyDefault),
+                    hipErrorInvalidValue);
  }
-  SECTION("Pass pDependencies as nullptr") {
-    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
-            nullptr, 0, A_d, A_h, Nbytes, hipMemcpyHostToDevice);
-    REQUIRE(hipSuccess == ret);
+
+  SECTION("count larger than src allocation size") {
+    LinearAllocGuard<int> dev_src(LinearAllocs::hipMalloc, sizeof(int));
+    HIP_CHECK_ERROR(hipGraphAddMemcpyNode1D(&node, graph, nullptr, 0, dst, dev_src.ptr(),
+                                            sizeof(dst), hipMemcpyDefault),
+                    hipErrorInvalidValue);
  }
-  SECTION("Pass numDependencies is max and pDependencies is not valid ptr") {
-    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
-            nullptr, INT_MAX, A_d, A_h, Nbytes, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass pDependencies as nullptr, but numDependencies is non-zero") {
-    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
-            nullptr, 9, A_d, A_h, Nbytes, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass destination ptr as nullptr") {
-    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
-            nullptr, 0, nullptr, A_h, Nbytes, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass source ptr as nullptr") {
-    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
-            nullptr, 0, A_d, nullptr, Nbytes, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass count as more than allocated size for source ptr") {
-    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
-            nullptr, 0, A_d, A_h, Nbytes+10, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass count as less than allocated size for destination ptr") {
-    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
-            nullptr, 0, A_d, A_h, Nbytes-10, hipMemcpyHostToDevice);
-    REQUIRE(hipSuccess == ret);
-  }
-  HIP_CHECK(hipFree(A_d));
-  HIP_CHECK(hipFree(A_h));
+
  HIP_CHECK(hipGraphDestroy(graph));
 }
-/*
- * Create two host pointers, copy the data between them by the api
- * hipGraphAddMemcpyNode1D with data transfer kind hipMemcpyHostToHost.
- * Validate the output.
-*/
-TEST_CASE("Unit_hipGraphAddMemcpyNode1D_HostToHost") {
-  constexpr size_t size = 1024;
-  size_t numBytes{size * sizeof(int)};
-
-  // Host Vectors
-  std::vector<int> A_h(size);
-  std::vector<int> B_h(size);
-  // Initialization
-  std::iota(A_h.begin(), A_h.end(), 0);
-  std::fill_n(B_h.begin(), size, 0);
-
-  hipGraph_t graph;
-  hipStream_t streamForGraph;
-  hipGraphExec_t graphExec;
-  hipGraphNode_t memcpyH2H;
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-
-  // Host to Host
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2H, graph, nullptr, 0,
-                     B_h.data(), A_h.data(), numBytes, hipMemcpyHostToHost));
-
-  // Instantiate and launch the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-  HIP_CHECK(hipStreamSynchronize(streamForGraph));
-
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
-
-  // Validation
-  REQUIRE(std::equal(A_h.begin(), A_h.end(), B_h.begin(), B_h.end()));
-}
@@ -0,0 +1,242 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+Testcase Scenarios :
+Functional -
+1) Add 1D memcpy node to graph and verify memcpy operation is success for all memcpy kinds(H2D, D2H and D2D).
+ Memcpy nodes are added and assigned to default device.
+2) Allocate memory on default device(Dev 0), Perform memcpy operation for 1D arrays on Peer device(Dev 1) and
+ verify the results.
+3) Create two host pointers, copy the data between them by the api hipGraphAddMemcpyNode1D with data transfer
+ kind hipMemcpyHostToHost. Validate the output.
+
+Negative -
+1) Pass pGraphNode as nullptr and check if api returns error.
+2) When graph is un-initialized argument(skipping graph creation), api should return error code.
+3) Passing pDependencies as nullptr, api should return success.
+4) When numDependencies is max(size_t) and pDependencies is not valid ptr, api expected to return error code.
+5) When pDependencies is nullptr, but numDependencies is non-zero, api expected to return error.
+6) When destination ptr  is nullptr, api expected to return error code.
+7) When source ptr is nullptr, api expected to return error code.
+8) If count is more than allocated size for source and destination ptr, error code is returned.
+9) If count is less than or equal to allocated size of source and destination ptr, api should return success.
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+#include <vector>
+#include <numeric>
+
+static void validateMemcpyNode1DArray(bool peerAccess) {
+  constexpr int SIZE{32};
+  int harray1D[SIZE]{};
+  int harray1Dres[SIZE]{};
+  hipGraph_t graph;
+  hipArray_t devArray1, devArray2;
+  hipGraphNode_t memcpyH2D, memcpyD2H, memcpyD2D;
+  constexpr int numBytes{SIZE * sizeof(int)};
+  hipStream_t streamForGraph;
+  hipGraphExec_t graphExec;
+
+  HIP_CHECK(hipSetDevice(0));
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+  HIP_CHECK(hipMalloc(&devArray1, numBytes));
+  HIP_CHECK(hipMalloc(&devArray2, numBytes));
+
+  // Initialize 1D object
+  for (int i = 0; i < SIZE; i++) {
+    harray1D[i] = i + 1;
+  }
+
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  // For peer access test, Memory is allocated on device(0)
+  // while memcpy nodes are allocated and assigned to peer device(1)
+  if (peerAccess) {
+    HIP_CHECK(hipSetDevice(1));
+  }
+
+  // Host to Device (harray1D -> devArray1)
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0,
+                     devArray1, harray1D, numBytes, hipMemcpyHostToDevice));
+
+  // Device to Device (devArray1 -> devArray2)
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2D, graph, &memcpyH2D, 1,
+                     devArray2, devArray1, numBytes, hipMemcpyDeviceToDevice));
+
+  // Device to host (devArray2 -> harray1Dres)
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, &memcpyD2D, 1,
+                     harray1Dres, devArray2, numBytes, hipMemcpyDeviceToHost));
+
+  // Instantiate and launch the graph
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+  HIP_CHECK(hipStreamSynchronize(streamForGraph));
+
+  // Validate result
+  for (int i = 0; i < SIZE; i++) {
+    if (harray1D[i] != harray1Dres[i]) {
+      INFO("harray1D: " << harray1D[i] << " harray1Dres: " << harray1Dres[i]
+            << " mismatch at : " << i);
+      REQUIRE(false);
+    }
+  }
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+  HIP_CHECK(hipFree(devArray1));
+  HIP_CHECK(hipFree(devArray2));
+}
+
+
+/**
+ * Functional Tests adds memcpy 1D nodes of types H2D, D2D and D2H to graph
+ * and verifies execution sequence by launching graph.
+ *
+ * For Default device test: Memory allocations and memory operations
+ * are performed from device(0).
+ * For Peer device test: Memory allocations happen on device(0) and memcpy operations
+ * are performed from device(1).
+ */
+TEST_CASE("Unit_hipGraphAddMemcpyNode1D_Functional") {
+  SECTION("Memcpy with 1D array on default device") {
+    validateMemcpyNode1DArray(false);
+  }
+
+  SECTION("Memcpy with 1D array on peer device") {
+    int numDevices{}, peerAccess{};
+    HIP_CHECK(hipGetDeviceCount(&numDevices));
+    if (numDevices > 1) {
+      HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
+    }
+
+    if (!peerAccess) {
+      WARN("Skipping test as peer device access is not found!");
+      return;
+    }
+    validateMemcpyNode1DArray(true);
+  }
+}
+
+
+
+/**
+ * Negative Test for API hipGraphAddMemcpyNode1D
+ */
+TEST_CASE("Unit_hipGraphAddMemcpyNode1D_Negative") {
+  constexpr size_t N = 1024;
+  constexpr size_t Nbytes = N * sizeof(int);
+  int *A_d, *A_h;
+  hipGraph_t graph;
+  hipGraphNode_t memcpyNode{};
+  hipError_t ret;
+
+  HIP_CHECK(hipMalloc(&A_d, Nbytes));
+  HIP_CHECK(hipMalloc(&A_h, Nbytes));
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  SECTION("Pass pGraphNode as nullptr") {
+    ret = hipGraphAddMemcpyNode1D(nullptr, graph,
+            nullptr, 0, A_d, A_h, Nbytes, hipMemcpyHostToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass graph as nullptr") {
+    ret = hipGraphAddMemcpyNode1D(&memcpyNode, nullptr,
+            nullptr, 0, A_d, A_h, Nbytes, hipMemcpyHostToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass pDependencies as nullptr") {
+    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
+            nullptr, 0, A_d, A_h, Nbytes, hipMemcpyHostToDevice);
+    REQUIRE(hipSuccess == ret);
+  }
+  SECTION("Pass numDependencies is max and pDependencies is not valid ptr") {
+    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
+            nullptr, INT_MAX, A_d, A_h, Nbytes, hipMemcpyHostToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass pDependencies as nullptr, but numDependencies is non-zero") {
+    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
+            nullptr, 9, A_d, A_h, Nbytes, hipMemcpyHostToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass destination ptr as nullptr") {
+    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
+            nullptr, 0, nullptr, A_h, Nbytes, hipMemcpyHostToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass source ptr as nullptr") {
+    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
+            nullptr, 0, A_d, nullptr, Nbytes, hipMemcpyHostToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass count as more than allocated size for source ptr") {
+    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
+            nullptr, 0, A_d, A_h, Nbytes+10, hipMemcpyHostToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass count as less than allocated size for destination ptr") {
+    ret = hipGraphAddMemcpyNode1D(&memcpyNode, graph,
+            nullptr, 0, A_d, A_h, Nbytes-10, hipMemcpyHostToDevice);
+    REQUIRE(hipSuccess == ret);
+  }
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(A_h));
+  HIP_CHECK(hipGraphDestroy(graph));
+}
+/*
+ * Create two host pointers, copy the data between them by the api
+ * hipGraphAddMemcpyNode1D with data transfer kind hipMemcpyHostToHost.
+ * Validate the output.
+*/
+TEST_CASE("Unit_hipGraphAddMemcpyNode1D_HostToHost") {
+  constexpr size_t size = 1024;
+  size_t numBytes{size * sizeof(int)};
+
+  // Host Vectors
+  std::vector<int> A_h(size);
+  std::vector<int> B_h(size);
+  // Initialization
+  std::iota(A_h.begin(), A_h.end(), 0);
+  std::fill_n(B_h.begin(), size, 0);
+
+  hipGraph_t graph;
+  hipStream_t streamForGraph;
+  hipGraphExec_t graphExec;
+  hipGraphNode_t memcpyH2H;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+
+  // Host to Host
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2H, graph, nullptr, 0,
+                     B_h.data(), A_h.data(), numBytes, hipMemcpyHostToHost));
+
+  // Instantiate and launch the graph
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+  HIP_CHECK(hipStreamSynchronize(streamForGraph));
+
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+
+  // Validation
+  REQUIRE(std::equal(A_h.begin(), A_h.end(), B_h.begin(), B_h.end()));
+}
@@ -0,0 +1,576 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+Testcase Scenarios : Negative
+1) Pass pGraphNode as nullptr and check if api returns error.
+2) When graph is un-initialized argument(skipping graph creation),
+   api should return error code.
+3) Passing pDependencies as nullptr, api should return success.
+4) When numDependencies is max(size_t) and pDependencies is not valid ptr,
+   api expected to return error code.
+5) When pDependencies is nullptr, but numDependencies is non-zero,
+   api expected to return error.
+6) When pCopyParams is nullptr, api expected to return error code.
+7) API expects atleast one memcpy src pointer to be set.
+   When hipMemcpy3DParms::srcArray and hipMemcpy3DParms::srcPtr.ptr both
+   are nullptr, api expected to return error code.
+8) API expects atleast one memcpy dst pointer to be set.
+   When hipMemcpy3DParms::dstArray and hipMemcpy3DParms::dstPtr.ptr both
+   are nullptr, api expected to return error code.
+9) Passing different element size for hipMemcpy3DParms::srcArray and
+   hipMemcpy3DParms::dstArray is expected to return error code.
+
+Testcase Scenarios : Functional
+1) Add memcpy node to graph and verify memcpy operation is success for all
+   memcpy kinds(H2D, D2H and D2D).
+   Memcpy nodes are added and assigned to default device.
+2) Perform memcpy operation for 1D, 2D and 3D arrays on default device and
+   verify the results.
+3) Add memcpy node to graph and verify memcpy operation is success for all
+   memcpy kinds(H2D, D2H and D2D).
+   Memcpy nodes are added and assigned to Peer device.
+4) Perform memcpy operation for 1D, 2D and 3D arrays on Peer device and
+   verify the results.
+5) Create two host pointers, copy the data between them by the api
+   hipGraphAddMemcpyNode with data transfer kind hipMemcpyHostToHost.
+   Validate the output.
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+#include <vector>
+#include <numeric>
+
+#define ZSIZE 32
+#define YSIZE 32
+#define XSIZE 32
+
+/* Test verifies hipGraphAddMemcpyNode API Negative scenarios.
+ */
+
+TEST_CASE("Unit_hipGraphAddMemcpyNode_Negative") {
+  CHECK_IMAGE_SUPPORT
+
+  constexpr int width{10}, height{10}, depth{10};
+  hipArray_t devArray1;
+  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
+  hipMemcpy3DParms myparams;
+  uint32_t size = width * height * depth * sizeof(int);
+  hipGraph_t graph;
+  hipGraphNode_t memcpyNode;
+  hipStream_t streamForGraph;
+  hipError_t ret;
+
+  int *hData = reinterpret_cast<int*>(malloc(size));
+  int *hOutputData = reinterpret_cast<int *>(malloc(size));
+
+  REQUIRE(hData != nullptr);
+  REQUIRE(hOutputData != nullptr);
+  memset(hData, 0, size);
+  memset(hOutputData, 0,  size);
+
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  // Initialize host buffer
+  for (int i = 0; i < depth; i++) {
+    for (int j = 0; j < height; j++) {
+      for (int k = 0; k < width; k++) {
+        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
+      }
+    }
+  }
+
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
+                                                          0, 0, 0, formatKind);
+  HIP_CHECK(hipMalloc3DArray(&devArray1, &channelDesc,
+                       make_hipExtent(width, height, depth), hipArrayDefault));
+
+  // Host to Device
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.extent = make_hipExtent(width , height, depth);
+  myparams.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
+                                      width, height);
+  myparams.dstArray = devArray1;
+  myparams.kind = hipMemcpyHostToDevice;
+
+  SECTION("Pass pGraphNode as nullptr") {
+    ret = hipGraphAddMemcpyNode(nullptr, graph, nullptr, 0, &myparams);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("When graph is nullptr") {
+    ret = hipGraphAddMemcpyNode(&memcpyNode, nullptr,  nullptr, 0, &myparams);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Passing pDependencies as nullptr") {
+    ret = hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams);
+    REQUIRE(hipSuccess == ret);
+  }
+  SECTION("When numDependencies is max and pDependencies is not valid ptr") {
+    ret = hipGraphAddMemcpyNode(&memcpyNode, graph,
+                                nullptr, INT_MAX, &myparams);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("When pDependencies is nullptr, but numDependencies is non-zero") {
+    ret = hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 11, &myparams);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass pCopyParams as nullptr") {
+    ret = hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, nullptr);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("API expects atleast one memcpy src pointer to be set") {
+    memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+    myparams.srcPos = make_hipPos(0, 0, 0);
+    myparams.dstPos = make_hipPos(0, 0, 0);
+    myparams.extent = make_hipExtent(width , height, depth);
+    myparams.dstArray = devArray1;
+    myparams.kind = hipMemcpyHostToDevice;
+    ret = hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("API expects atleast one memcpy dst pointer to be set") {
+    memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+    myparams.srcPos = make_hipPos(0, 0, 0);
+    myparams.dstPos = make_hipPos(0, 0, 0);
+    myparams.extent = make_hipExtent(width , height, depth);
+    myparams.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
+                                      width, height);
+    myparams.kind = hipMemcpyHostToDevice;
+    ret = hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Passing different element size for hipMemcpy3DParms::srcArray"
+                   "and hipMemcpy3DParms::dstArray") {
+    myparams.srcArray = devArray1;
+    hipArray_t devArray2;
+    HIP_CHECK(hipMalloc3DArray(&devArray2, &channelDesc,
+              make_hipExtent(width+1, height+1, depth+1), hipArrayDefault));
+    myparams.dstArray = devArray2;
+    ret = hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams);
+    REQUIRE(hipErrorInvalidValue == ret);
+    HIP_CHECK(hipFreeArray(devArray2));
+  }
+
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+  HIP_CHECK(hipFreeArray(devArray1));
+  free(hData);
+  free(hOutputData);
+}
+
+static void validateMemcpyNode3DArray(bool peerAccess = false) {
+  constexpr int width{10}, height{10}, depth{10};
+  hipArray_t devArray1, devArray2;
+  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
+  hipMemcpy3DParms myparams;
+  uint32_t size = width * height * depth * sizeof(int);
+  hipGraph_t graph;
+  hipGraphNode_t memcpyNode;
+  std::vector<hipGraphNode_t> dependencies;
+  hipStream_t streamForGraph;
+  hipGraphExec_t graphExec;
+
+  HIP_CHECK(hipSetDevice(0));
+  int *hData = reinterpret_cast<int*>(malloc(size));
+  int *hOutputData = reinterpret_cast<int *>(malloc(size));
+
+  REQUIRE(hData != nullptr);
+  REQUIRE(hOutputData != nullptr);
+  memset(hData, 0, size);
+  memset(hOutputData, 0,  size);
+
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+
+  // Initialize host buffer
+  for (int i = 0; i < depth; i++) {
+    for (int j = 0; j < height; j++) {
+      for (int k = 0; k < width; k++) {
+        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
+      }
+    }
+  }
+
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
+                                                          0, 0, 0, formatKind);
+  HIP_CHECK(hipMalloc3DArray(&devArray1, &channelDesc,
+                       make_hipExtent(width, height, depth), hipArrayDefault));
+  HIP_CHECK(hipMalloc3DArray(&devArray2, &channelDesc,
+                       make_hipExtent(width, height, depth), hipArrayDefault));
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  // For peer access test, Memory is allocated on device(0)
+  // while memcpy nodes are allocated and assigned to peer device(1)
+  if (peerAccess) {
+    HIP_CHECK(hipSetDevice(1));
+  }
+
+  // Host to Device
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.extent = make_hipExtent(width , height, depth);
+  myparams.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
+                                      width, height);
+  myparams.dstArray = devArray1;
+  myparams.kind = hipMemcpyHostToDevice;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams));
+  dependencies.push_back(memcpyNode);
+
+  // Device to Device
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.srcArray = devArray1;
+  myparams.dstArray = devArray2;
+  myparams.extent = make_hipExtent(width, height, depth);
+  myparams.kind = hipMemcpyDeviceToDevice;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
+                                             dependencies.size(), &myparams));
+  dependencies.clear();
+  dependencies.push_back(memcpyNode);
+
+  // Device to host
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.dstPtr = make_hipPitchedPtr(hOutputData, width * sizeof(int),
+                                      width, height);
+  myparams.srcArray = devArray2;
+  myparams.extent = make_hipExtent(width, height, depth);
+  myparams.kind = hipMemcpyDeviceToHost;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
+                                             dependencies.size(), &myparams));
+
+  // Instantiate and launch the graph
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+  HIP_CHECK(hipStreamSynchronize(streamForGraph));
+
+  // Check result
+  HipTest::checkArray(hData, hOutputData, width, height, depth);
+
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+  HIP_CHECK(hipFreeArray(devArray1));
+  HIP_CHECK(hipFreeArray(devArray2));
+  free(hData);
+  free(hOutputData);
+}
+
+static void validateMemcpyNode2DArray(bool peerAccess = false) {
+  int harray2D[YSIZE][XSIZE]{};
+  int harray2Dres[YSIZE][XSIZE]{};
+  constexpr int width{XSIZE}, height{YSIZE};
+  hipArray_t devArray1, devArray2;
+  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
+  hipMemcpy3DParms myparams;
+  hipGraph_t graph;
+  hipGraphNode_t memcpyNode;
+  std::vector<hipGraphNode_t> dependencies;
+  hipStream_t streamForGraph;
+  hipGraphExec_t graphExec;
+
+  HIP_CHECK(hipSetDevice(0));
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+  // Initialize 2D object
+  for (int i = 0; i < YSIZE; i++) {
+    for (int j = 0; j < XSIZE; j++) {
+      harray2D[i][j] = i + j + 1;
+    }
+  }
+
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
+                                                          0, 0, 0, formatKind);
+  // Allocate 2D device array by passing depth(0)
+  HIP_CHECK(hipMalloc3DArray(&devArray1, &channelDesc,
+                       make_hipExtent(width, height, 0), hipArrayDefault));
+  HIP_CHECK(hipMalloc3DArray(&devArray2, &channelDesc,
+                       make_hipExtent(width, height, 0), hipArrayDefault));
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  // For peer access test, Memory is allocated on device(0)
+  // while memcpy nodes are allocated and assigned to peer device(1)
+  if (peerAccess) {
+    HIP_CHECK(hipSetDevice(1));
+  }
+
+  // Host to Device
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.extent = make_hipExtent(width, height, 1);
+  myparams.srcPtr = make_hipPitchedPtr(harray2D, width * sizeof(int),
+                                      width, height);
+  myparams.dstArray = devArray1;
+  myparams.kind = hipMemcpyHostToDevice;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams));
+  dependencies.push_back(memcpyNode);
+
+  // Device to Device
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.srcArray = devArray1;
+  myparams.dstArray = devArray2;
+  myparams.extent = make_hipExtent(width, height, 1);
+  myparams.kind = hipMemcpyDeviceToDevice;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
+                                             dependencies.size(), &myparams));
+  dependencies.clear();
+  dependencies.push_back(memcpyNode);
+
+  // Device to host
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.extent = make_hipExtent(width, height, 1);
+  myparams.dstPtr = make_hipPitchedPtr(harray2Dres, width * sizeof(int),
+                                      width, height);
+  myparams.srcArray = devArray2;
+  myparams.kind = hipMemcpyDeviceToHost;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
+                                             dependencies.size(), &myparams));
+
+  // Instantiate and launch the graph
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+  HIP_CHECK(hipStreamSynchronize(streamForGraph));
+
+  // Validate result
+  for (int i = 0; i < YSIZE; i++) {
+    for (int j = 0; j < XSIZE; j++) {
+      if (harray2D[i][j] != harray2Dres[i][j]) {
+        INFO("harray2D: " << harray2D[i][j] << "harray2Dres: "
+              << harray2Dres[i][j] << " mismatch at (i,j) : " << i << j);
+        REQUIRE(false);
+      }
+    }
+  }
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+  HIP_CHECK(hipFreeArray(devArray1));
+  HIP_CHECK(hipFreeArray(devArray2));
+}
+
+static void validateMemcpyNode1DArray(bool peerAccess = false) {
+  int harray1D[XSIZE]{};
+  int harray1Dres[XSIZE]{};
+  constexpr int width{XSIZE};
+  hipArray_t devArray1, devArray2;
+  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
+  hipMemcpy3DParms myparams;
+  hipGraph_t graph;
+  hipGraphNode_t memcpyNode;
+  std::vector<hipGraphNode_t> dependencies;
+  hipStream_t streamForGraph;
+  hipGraphExec_t graphExec;
+
+  HIP_CHECK(hipSetDevice(0));
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+  // Initialize 1D object
+  for (int i = 0; i < XSIZE; i++) {
+    harray1D[i] = i + 1;
+  }
+
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
+                                                          0, 0, 0, formatKind);
+  // Allocate 1D device array by passing depth(0), height(0)
+  HIP_CHECK(hipMalloc3DArray(&devArray1, &channelDesc,
+                       make_hipExtent(width, 0, 0), hipArrayDefault));
+  HIP_CHECK(hipMalloc3DArray(&devArray2, &channelDesc,
+                       make_hipExtent(width, 0, 0), hipArrayDefault));
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  // For peer access test, Memory is allocated on device(0)
+  // while memcpy nodes are allocated and assigned to peer device(1)
+  if (peerAccess) {
+    HIP_CHECK(hipSetDevice(1));
+  }
+
+  // Host to Device
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.extent = make_hipExtent(width, 1, 1);
+  myparams.srcPtr = make_hipPitchedPtr(harray1D, width * sizeof(int),
+                                      width, 1);
+  myparams.dstArray = devArray1;
+  myparams.kind = hipMemcpyHostToDevice;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams));
+  dependencies.push_back(memcpyNode);
+
+  // Device to Device
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.srcArray = devArray1;
+  myparams.dstArray = devArray2;
+  myparams.extent = make_hipExtent(width, 1, 1);
+  myparams.kind = hipMemcpyDeviceToDevice;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
+                                             dependencies.size(), &myparams));
+  dependencies.clear();
+  dependencies.push_back(memcpyNode);
+
+  // Device to host
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.extent = make_hipExtent(width, 1, 1);
+  myparams.dstPtr = make_hipPitchedPtr(harray1Dres, width * sizeof(int),
+                                      width, 1);
+  myparams.srcArray = devArray2;
+  myparams.kind = hipMemcpyDeviceToHost;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
+                                              dependencies.size(), &myparams));
+
+  // Instantiate and launch the graph
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+  HIP_CHECK(hipStreamSynchronize(streamForGraph));
+
+  // Validate result
+  for (int i = 0; i < XSIZE; i++) {
+    if (harray1D[i] != harray1Dres[i]) {
+      INFO("harray1D: " << harray1D[i] << " harray1Dres: " << harray1Dres[i]
+            << " mismatch at : " << i);
+      REQUIRE(false);
+    }
+  }
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+  HIP_CHECK(hipFreeArray(devArray1));
+  HIP_CHECK(hipFreeArray(devArray2));
+}
+
+/**
+ * Basic Functional Tests adds memcpy nodes of types H2D, D2D and D2H to graph
+ * and verifies execution sequence by launching graph on default device.
+ * Tests also verify memcpy node addition with 1D, 2D and 3D objects.
+ */
+TEST_CASE("Unit_hipGraphAddMemcpyNode_BasicFunctional") {
+  CHECK_IMAGE_SUPPORT
+
+  SECTION("Memcpy with 3D array on default device") {
+    validateMemcpyNode3DArray();
+  }
+
+  SECTION("Memcpy with 2D array on default device") {
+    validateMemcpyNode2DArray();
+  }
+
+  SECTION("Memcpy with 1D array on default device") {
+    validateMemcpyNode1DArray();
+  }
+}
+
+/**
+ * Peer access tests adds and assigns memcpy nodes of types H2D, D2D and D2H
+ * to peer device. Memory allocations happen on device(0) and memcpy operations
+ * are performed from device(1).
+ * Tests also verify memcpy node addition with 1D, 2D and 3D objects.
+ */
+TEST_CASE("Unit_hipGraphAddMemcpyNode_PeerAccessFunctional") {
+  CHECK_IMAGE_SUPPORT
+
+  int numDevices{}, peerAccess{};
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
+  }
+
+  if (!peerAccess) {
+    WARN("Skipping test as peer device access is not found!");
+    return;
+  }
+
+  SECTION("Memcpy with 3D array on peer device") {
+    validateMemcpyNode3DArray(true);
+  }
+
+  SECTION("Memcpy with 2D array on peer device") {
+    validateMemcpyNode2DArray(true);
+  }
+
+  SECTION("Memcpy with 1D array on peer device") {
+    validateMemcpyNode1DArray(true);
+  }
+}
+/*
+* Create two host pointers, copy the data between them by the api
+* hipGraphAddMemcpyNode with data transfer kind hipMemcpyHostToHost.
+* Validate the output.
+*/
+TEST_CASE("Unit_hipGraphAddMemcpyNode_HostToHost") {
+  constexpr size_t size = 1024;
+  size_t numW = size * sizeof(int);
+  // Host Vectors
+  std::vector<int> A_h(numW);
+  std::vector<int> B_h(numW);
+  // Initialization
+  std::iota(A_h.begin(), A_h.end(), 0);
+  std::fill_n(B_h.begin(), size, 0);
+
+  hipGraph_t graph;
+  hipStream_t streamForGraph;
+  hipGraphExec_t graphExec;
+  hipGraphNode_t memcpyH2H;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+
+  hipMemcpy3DParms myparms{};
+  myparms.srcPos = make_hipPos(0, 0, 0);
+  myparms.dstPos = make_hipPos(0, 0, 0);
+  myparms.srcPtr = make_hipPitchedPtr(A_h.data(), numW, numW, 1);
+  myparms.dstPtr = make_hipPitchedPtr(B_h.data(), numW, numW, 1);
+  myparms.extent = make_hipExtent(numW, 1, 1);
+  myparms.kind = hipMemcpyHostToHost;
+
+  // Host to Host
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyH2H, graph, nullptr,
+                                            0, &myparms));
+
+  // Instantiate and launch the graph
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+  HIP_CHECK(hipStreamSynchronize(streamForGraph));
+
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+
+  // Validation
+  REQUIRE(memcmp(A_h.data(), B_h.data(), numW) == 0);
+}
@@ -1,13 +1,16 @@
 /*
 Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
+
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -17,247 +20,248 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/**
-Testcase Scenarios :
-Functional-
-1) Instantiate a graph with memcpy node, obtain executable graph and update the hipMemcpy3DParms node params with set. Make sure they are taking effect.
-Negative-
-1) Pass hGraphExec as nullptr and verify api returns error code.
-2) Pass node as nullptr and verify api returns error code.
-3) Pass pNodeParams as nullptr and verify api returns error code.
-4) Pass pNodeParams as empty structure object and verify api returns error code.
-5) API expects atleast one memcpy src pointer to be set. When hipMemcpy3DParms::srcArray and hipMemcpy3DParms::srcPtr.ptr both are nullptr, api expected to return error code.
-6) API expects atleast one memcpy dst pointer to be set. When hipMemcpy3DParms::dstArray and hipMemcpy3DParms::dstPtr.ptr both are nullptr, api expected to return error code.
-7) Passing different element size for hipMemcpy3DParms::srcArray and hipMemcpy3DParms::dstArray is expected to return error code.
-8) Pass node of different graph and verify api returns error code.
-*/
+#include <functional>

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
+#include <hip_test_defgroups.hh>
+#include <memcpy1d_tests_common.hh>
+#include <memcpy3d_tests_common.hh>

-/* Test verifies hipGraphExecMemcpyNodeSetParams API Negative scenarios.
+#include "graph_tests_common.hh"
+
+/**
+ * @addtogroup hipGraphExecMemcpyNodeSetParams hipGraphExecMemcpyNodeSetParams
+ * @{
+ * @ingroup GraphTest
+ * `hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, hipMemcpy3DParms
+ * *pNodeParams)` - Sets the parameters for a memcpy node in the given graphExec
 */
-TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams_Negative") {
-  CHECK_IMAGE_SUPPORT

-  constexpr int width{10}, height{10}, depth{10};
-  hipArray_t devArray, devArray2;
-  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
-  hipMemcpy3DParms myparms;
-  hipError_t ret;
-  int* hData;
-  uint32_t size = width * height * depth * sizeof(int);
-  hData = reinterpret_cast<int*>(malloc(size));
-  REQUIRE(hData != nullptr);
-  memset(hData, 0, size);
-  for (int i = 0; i < depth; i++) {
-    for (int j = 0; j < height; j++) {
-      for (int k = 0; k < width; k++) {
-        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
-      }
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify that node parameters get updated correctly by creating a node with valid but
+ * incorrect parameters, and the setting them to the correct values in the executable graph. The
+ * executable graph is run and the results of the memcpy verified. The test is run for all possible
+ * memcpy directions, with both the corresponding memcpy kind and hipMemcpyDefault, as well as half
+ * page and full page allocation sizes. Test source
+ * ------------------------
+ *    - unit/graph/hipGraphExecMemcpyNodeSetParams.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams_Positive_Basic") {
+  constexpr auto f = [](void* dst, void* src, size_t count, hipMemcpyKind direction) {
+    hipGraph_t graph = nullptr;
+    HIP_CHECK(hipGraphCreate(&graph, 0));
+    hipGraphNode_t node = nullptr;
+    const auto offset_src = reinterpret_cast<uint8_t*>(src) + 1;
+    const auto offset_dst = reinterpret_cast<uint8_t*>(dst) + 1;
+    auto params =
+        GetMemcpy3DParms(make_hipPitchedPtr(offset_dst, 0, count - 1, 0), make_hipPos(0, 0, 0),
+                         make_hipPitchedPtr(offset_src, 0, count - 1, 0), make_hipPos(0, 0, 0),
+                         make_hipExtent(count - 1, 1, 1), direction);
+    HIP_CHECK(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params));
+    hipGraphExec_t graph_exec = nullptr;
+    HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
+    params = GetMemcpy3DParms(make_hipPitchedPtr(dst, 0, count, 0), make_hipPos(0, 0, 0),
+                              make_hipPitchedPtr(src, 0, count, 0), make_hipPos(0, 0, 0),
+                              make_hipExtent(count, 1, 1), direction);
+    HIP_CHECK(hipGraphExecMemcpyNodeSetParams(graph_exec, node, &params));
+    HIP_CHECK(hipGraphLaunch(graph_exec, hipStreamPerThread));
+    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
+
+    HIP_CHECK(hipGraphExecDestroy(graph_exec));
+    HIP_CHECK(hipGraphDestroy(graph));
+
+    return hipSuccess;
+  };
+
+#if HT_NVIDIA
+  MemcpyWithDirectionCommonTests<false>(f);
+#else
+  using namespace std::placeholders;
+
+  SECTION("Device to host") {
+    MemcpyDeviceToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDeviceToHost));
+  }
+
+  SECTION("Host to device") {
+    MemcpyHostToDeviceShell<false>(std::bind(f, _1, _2, _3, hipMemcpyHostToDevice));
+  }
+
+  SECTION("Device to device") {
+    SECTION("Peer access enabled") {
+      MemcpyDeviceToDeviceShell<false, true>(std::bind(f, _1, _2, _3, hipMemcpyDeviceToDevice));
+    }
+    SECTION("Peer access disabled") {
+      MemcpyDeviceToDeviceShell<false, false>(std::bind(f, _1, _2, _3, hipMemcpyDeviceToDevice));
    }
  }
-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
-                                              0, 0, 0, formatKind);
-  HIP_CHECK(hipMalloc3DArray(&devArray, &channelDesc, make_hipExtent(width,
-                             height, depth), hipArrayDefault));
-  HIP_CHECK(hipMalloc3DArray(&devArray2, &channelDesc, make_hipExtent(width+1,
-                             height+1, depth+1), hipArrayDefault));
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-  myparms.srcPos = make_hipPos(0, 0, 0);
-  myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.extent = make_hipExtent(width , height, depth);
-  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
-                                      width, height);
-  myparms.dstArray = devArray;
-  myparms.kind = hipMemcpyHostToDevice;

-  hipGraph_t graph;
-  hipGraphNode_t memcpyNode;
-  hipGraphExec_t graphExec;
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &myparms));
-
-  // Instantiate the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
-  SECTION("Pass hGraphExec as nullptr") {
-    ret = hipGraphExecMemcpyNodeSetParams(nullptr, memcpyNode, &myparms);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass node as nullptr") {
-    ret = hipGraphExecMemcpyNodeSetParams(graphExec, nullptr, &myparms);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass pNodeParams as nullptr") {
-    ret = hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode, nullptr);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass pNodeParams as empty structure object") {
-    hipMemcpy3DParms temp{};
-    ret = hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode, &temp);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("API expects atleast one memcpy src pointer to be set") {
-    hipMemcpy3DParms temp;
-    memset(&temp, 0x0, sizeof(hipMemcpy3DParms));
-    temp.srcPos = make_hipPos(0, 0, 0);
-    temp.dstPos = make_hipPos(0, 0, 0);
-    temp.extent = make_hipExtent(width , height, depth);
-    temp.dstArray = devArray;
-    temp.kind = hipMemcpyHostToDevice;
-    ret = hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode, &temp);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("API expects atleast one memcpy dst pointer to be set") {
-    hipMemcpy3DParms temp;
-    memset(&temp, 0x0, sizeof(hipMemcpy3DParms));
-    temp.srcPos = make_hipPos(0, 0, 0);
-    temp.dstPos = make_hipPos(0, 0, 0);
-    temp.extent = make_hipExtent(width , height, depth);
-    temp.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
-                                      width, height);
-    temp.kind = hipMemcpyHostToDevice;
-    ret = hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode, &temp);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Passing different element size for hipMemcpy3DParms::srcArray"
-                   "and hipMemcpy3DParms::dstArray") {
-    hipMemcpy3DParms temp;
-    memset(&temp, 0x0, sizeof(hipMemcpy3DParms));
-    temp.srcPos = make_hipPos(0, 0, 0);
-    temp.dstPos = make_hipPos(0, 0, 0);
-    temp.extent = make_hipExtent(width , height, depth);
-    temp.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
-                                      width, height);
-    temp.kind = hipMemcpyHostToDevice;
-    temp.srcArray = devArray;
-    temp.dstArray = devArray2;
-    ret = hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode, &temp);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Check with other graph node") {
-    hipGraph_t graph1;
-    hipGraphNode_t memcpyNode1;
-    HIP_CHECK(hipGraphCreate(&graph1, 0));
-    HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode1, graph1, NULL, 0, &myparms));
-    ret = hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode1, &myparms);
-    REQUIRE(hipErrorInvalidValue == ret);
-    HIP_CHECK(hipGraphDestroy(graph1));
-  }
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipFreeArray(devArray));
-  HIP_CHECK(hipFreeArray(devArray2));
-  free(hData);
-}
-
-/* Test verifies hipGraphExecMemcpyNodeSetParams API Functional scenarios.
- */
-TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams_Functional") {
-  CHECK_IMAGE_SUPPORT
-
-  constexpr int XSIZE = 1024;
-  int harray1D[XSIZE]{};
-  int harray1Dres[XSIZE]{};
-  constexpr int width{XSIZE};
-  hipArray_t devArray1, devArray2;
-  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
-  hipMemcpy3DParms myparams;
-  hipGraph_t graph;
-  hipGraphNode_t memcpyNode;
-  std::vector<hipGraphNode_t> dependencies;
-  hipStream_t streamForGraph;
-  hipGraphExec_t graphExec;
-
-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-  // Initialize 1D object
-  for (int i = 0; i < XSIZE; i++) {
-    harray1D[i] = i + 1;
-  }
-
-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
-                                                          0, 0, 0, formatKind);
-  // Allocate 1D device array by passing depth(0), height(0)
-  HIP_CHECK(hipMalloc3DArray(&devArray1, &channelDesc,
-                       make_hipExtent(width, 0, 0), hipArrayDefault));
-  HIP_CHECK(hipMalloc3DArray(&devArray2, &channelDesc,
-                       make_hipExtent(width, 0, 0), hipArrayDefault));
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-
-  // Host to Device
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.extent = make_hipExtent(width, 1, 1);
-  myparams.srcPtr = make_hipPitchedPtr(harray1D, width * sizeof(int),
-                                       width, 1);
-  myparams.dstArray = devArray1;
-  myparams.kind = hipMemcpyHostToDevice;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams));
-  dependencies.push_back(memcpyNode);
-
-  // Device to Device
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.srcArray = devArray1;
-  myparams.dstArray = devArray2;
-  myparams.extent = make_hipExtent(width, 1, 1);
-  myparams.kind = hipMemcpyDeviceToDevice;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
-                                  dependencies.size(), &myparams));
-  dependencies.clear();
-  dependencies.push_back(memcpyNode);
-
-  // Device to host
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.extent = make_hipExtent(width, 1, 1);
-  myparams.dstPtr = make_hipPitchedPtr(harray1Dres, width * sizeof(int),
-                                       width, 1);
-  myparams.srcArray = devArray2;
-  myparams.kind = hipMemcpyDeviceToHost;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
-                                  dependencies.size(), &myparams));
-
-  // Instantiate the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-
-  int harray1Dupdate[XSIZE]{};
-  hipArray_t devArray3;
-  HIP_CHECK(hipMalloc3DArray(&devArray3, &channelDesc,
-                       make_hipExtent(width, 0, 0), hipArrayDefault));
-
-  // D2H updated with different pointer harray1Dres -> harray1Dupdate
-  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
-  myparams.srcPos = make_hipPos(0, 0, 0);
-  myparams.dstPos = make_hipPos(0, 0, 0);
-  myparams.extent = make_hipExtent(width, 1, 1);
-  myparams.dstPtr = make_hipPitchedPtr(harray1Dupdate, width * sizeof(int),
-                                      width, 1);
-  myparams.srcArray = devArray2;
-  myparams.kind = hipMemcpyDeviceToHost;
-
-  HIP_CHECK(hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode, &myparams));
-
-  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-  HIP_CHECK(hipStreamSynchronize(streamForGraph));
-
-  // Validate result
-  for (int i = 0; i < XSIZE; i++) {
-    if (harray1D[i] != harray1Dupdate[i]) {
-      INFO("harray1D: " << harray1D[i] << " harray1Dupdate: " <<
-                      harray1Dupdate[i] << " mismatch at : " << i);
-      REQUIRE(false);
+  SECTION("Device to device with default kind") {
+    SECTION("Peer access enabled") {
+      MemcpyDeviceToDeviceShell<false, true>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+    }
+    SECTION("Peer access disabled") {
+      MemcpyDeviceToDeviceShell<false, false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
    }
  }
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
-  HIP_CHECK(hipFreeArray(devArray1));
-  HIP_CHECK(hipFreeArray(devArray2));
+
+// Disabled on AMD due to defect - EXSWHTEC-209
+#if 0
+  SECTION("Host to host") {
+    MemcpyHostToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyHostToHost));
+  }
+
+  SECTION("Host to host with default kind") {
+    MemcpyHostToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+  }
+#endif
+
+// Disabled on AMD due to defect - EXSWHTEC-210
+#if 0
+  SECTION("Device to host with default kind") {
+    MemcpyDeviceToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+  }
+
+  SECTION("Host to device with default kind") {
+    MemcpyHostToDeviceShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+  }
+#endif
+
+#endif
 }
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify API behaviour with invalid arguments:
+ *        -# pGraphExec is nullptr
+ *        -# node is nullptr
+ *        -# graph is nullptr
+ *        -# pDependencies is nullptr when numDependencies is not zero
+ *        -# A node in pDependencies originates from a different graph
+ *        -# numDependencies is invalid
+ *        -# A node is duplicated in pDependencies
+ *        -# dst is nullptr
+ *        -# src is nullptr
+ *        -# kind is an invalid enum value
+ *        -# count is zero
+ *        -# count is larger than dst allocation size
+ *        -# count is larger than src allocation size
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphAddMemcpyNode.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams_Negative_Parameters") {
+  using namespace std::placeholders;
+  hipGraph_t graph = nullptr;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  int src[2] = {}, dst[2] = {};
+
+  auto params = GetMemcpy3DParms(make_hipPitchedPtr(dst, 0, sizeof(dst), 0), make_hipPos(0, 0, 0),
+                                 make_hipPitchedPtr(src, 0, sizeof(src), 0), make_hipPos(0, 0, 0),
+                                 make_hipExtent(sizeof(dst), 1, 1), hipMemcpyDefault);
+
+  hipGraphNode_t node = nullptr;
+  HIP_CHECK(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params));
+
+  hipGraphExec_t graph_exec = nullptr;
+  HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
+
+  SECTION("pGraphExec == nullptr") {
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams(nullptr, node, &params), hipErrorInvalidValue);
+  }
+
+  SECTION("node == nullptr") {
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams(graph_exec, nullptr, &params),
+                    hipErrorInvalidValue);
+  }
+
+  auto f = [&](void* dst, void* src, size_t count, hipMemcpyKind kind) {
+    auto params = GetMemcpy3DParms(make_hipPitchedPtr(dst, 0, sizeof(dst), 0), make_hipPos(0, 0, 0),
+                                   make_hipPitchedPtr(src, 0, sizeof(src), 0), make_hipPos(0, 0, 0),
+                                   make_hipExtent(count, 1, 1), kind);
+    return hipGraphExecMemcpyNodeSetParams(graph_exec, node, &params);
+  };
+  MemcpyWithDirectionCommonNegativeTests(f, dst, src, sizeof(dst), hipMemcpyDefault);
+
+  SECTION("count == 0") {
+    HIP_CHECK_ERROR(
+        hipGraphExecMemcpyNodeSetParams1D(graph_exec, node, dst, src, 0, hipMemcpyDefault),
+        hipErrorInvalidValue);
+  }
+
+  SECTION("count larger than dst allocation size") {
+    LinearAllocGuard<int> dev_dst(LinearAllocs::hipMalloc, sizeof(int));
+    params.dstPtr = make_hipPitchedPtr(dev_dst.ptr(), 0, sizeof(int), 0);
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams(graph_exec, node, &params),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("count larger than src allocation size") {
+    LinearAllocGuard<int> dev_src(LinearAllocs::hipMalloc, sizeof(int));
+    params.dstPtr = make_hipPitchedPtr(dev_src.ptr(), 0, sizeof(int), 0);
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams(graph_exec, node, &params),
+                    hipErrorInvalidValue);
+  }
+
+  HIP_CHECK(hipGraphExecDestroy(graph_exec));
+  HIP_CHECK(hipGraphDestroy(graph));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify that memcpy direction cannot be altered in an executable graph. The test is run for
+ * all memcpy directions with appropriate memory allocations.
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphExecMemcpyNodeSetParams.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams_Negative_Changing_Memcpy_Direction") {
+  int host;
+  LinearAllocGuard<int> dev(LinearAllocs::hipMalloc, sizeof(int));
+
+  const auto [dir, src, dst] =
+      GENERATE_REF(std::make_tuple(hipMemcpyHostToHost, &host, &host),
+                   std::make_tuple(hipMemcpyHostToDevice, &host, dev.ptr()),
+                   std::make_tuple(hipMemcpyDeviceToHost, dev.ptr(), &host),
+                   std::make_tuple(hipMemcpyDeviceToDevice, dev.ptr(), dev.ptr()));
+
+  hipGraph_t graph = nullptr;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  auto params = GetMemcpy3DParms(make_hipPitchedPtr(dst, 0, sizeof(int), 0), make_hipPos(0, 0, 0),
+                                 make_hipPitchedPtr(src, 0, sizeof(int), 0), make_hipPos(0, 0, 0),
+                                 make_hipExtent(sizeof(int), 1, 1), dir);
+
+  hipGraphNode_t node = nullptr;
+  HIP_CHECK(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params));
+
+  hipGraphExec_t graph_exec = nullptr;
+  HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
+
+  const auto set_dir = GENERATE(hipMemcpyHostToHost, hipMemcpyHostToDevice, hipMemcpyDeviceToHost,
+                                hipMemcpyDeviceToDevice, hipMemcpyDefault);
+  if (dir == set_dir) {
+    HIP_CHECK(hipGraphExecDestroy(graph_exec));
+    HIP_CHECK(hipGraphDestroy(graph));
+    return;
+  }
+
+  params.kind = set_dir;
+  HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams(graph_exec, node, &params), hipErrorInvalidValue);
+
+  HIP_CHECK(hipGraphExecDestroy(graph_exec));
+  HIP_CHECK(hipGraphDestroy(graph));
+}
@@ -6,8 +6,10 @@ in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
+
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -17,182 +19,235 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/*
-Testcase Scenarios :
-Functional-
-1) Instantiate a graph with memcpy node, obtain executable graph and update the
-   node params with set exec api call. Make sure they are taking effect.
-Negative-
-1) Pass hGraphExec as nullptr and check if api returns error.
-2) Pass GraphNode as nullptr and check if api returns error.
-3) Pass destination ptr is nullptr, api expected to return error code.
-4) Pass source ptr is nullptr, api expected to return error code.
-5) Pass count as zero, api expected to return error code.
-6) Pass same pointer as source ptr and destination ptr, api expected to return error code.
-7) Pass overlap memory address as source ptr and destination ptr, api expected to return error code.
-7) Pass overlap memory as source ptr and destination ptr where source ptr is ahead of destination ptr, api expected to return error code.
-8) Pass overlap memory as source ptr and destination ptr where destination ptr is ahead of source ptr, api expected to return error code.
-9) If count is more than allocated size for source and destination ptr, api should return error code.
-10) If count is less than allocated size for source and destination ptr, api should return error code.
-11) Change the hipMemcpyKind from H2D to D2H but allocate pointer memory for H2D, api should return error code.
-*/
+#include <functional>

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_kernels.hh>
+#include <hip_test_defgroups.hh>
+#include <memcpy1d_tests_common.hh>

-/* Test verifies hipGraphExecMemcpyNodeSetParams1D API Negative scenarios.
+#include "graph_tests_common.hh"
+
+/**
+ * @addtogroup hipGraphExecMemcpyNodeSetParams1D hipGraphExecMemcpyNodeSetParams1D
+ * @{
+ * @ingroup GraphTest
+ * `hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraphNode_t node, void *dst,
+ * const void *src, size_t count, hipMemcpyKind kind)` - Sets the parameters for a memcpy node in
+ * the given graphExec to perform a 1-dimensional copy
 */
-TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams1D_Negative") {
-  constexpr size_t N = 1024;
-  constexpr size_t Nbytes = N * sizeof(int);

-  int *A_d;
-  HIP_CHECK(hipMalloc(&A_d, Nbytes));
-  int *A_h = reinterpret_cast<int*>(malloc(Nbytes));
-  REQUIRE(A_h != nullptr);
-  memset(A_h, 0, Nbytes);
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify that node parameters get updated correctly by creating a node with valid but
+ * incorrect parameters, and the setting them to the correct values in the executable graph. The
+ * executable graph is run and the results of the memcpy verified. The test is run for all possible
+ * memcpy directions, with both the corresponding memcpy kind and hipMemcpyDefault, as well as half
+ * page and full page allocation sizes. Test source
+ * ------------------------
+ *    - unit/graph/hipGraphExecMemcpyNodeSetParams1D.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams1D_Positive_Basic") {
+  constexpr auto f = [](void* dst, void* src, size_t count, hipMemcpyKind direction) {
+    hipGraph_t graph = nullptr;
+    HIP_CHECK(hipGraphCreate(&graph, 0));
+    hipGraphNode_t node = nullptr;
+    const auto offset_src = reinterpret_cast<uint8_t*>(src) + 1;
+    const auto offset_dst = reinterpret_cast<uint8_t*>(dst) + 1;
+    HIP_CHECK(hipGraphAddMemcpyNode1D(&node, graph, nullptr, 0, offset_dst, offset_src, count - 1,
+                                      direction));
+    hipGraphExec_t graph_exec = nullptr;
+    HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
+    HIP_CHECK(hipGraphExecMemcpyNodeSetParams1D(graph_exec, node, dst, src, count, direction));
+    HIP_CHECK(hipGraphLaunch(graph_exec, hipStreamPerThread));
+    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));

-  hipError_t ret;
-  hipGraphNode_t memcpyH2D;
-  hipGraph_t graph;
-  hipGraphExec_t graphExec;
+    HIP_CHECK(hipGraphExecDestroy(graph_exec));
+    HIP_CHECK(hipGraphDestroy(graph));

+    return hipSuccess;
+  };
+
+#if HT_NVIDIA
+  MemcpyWithDirectionCommonTests<false>(f);
+#else
+  using namespace std::placeholders;
+
+  SECTION("Device to host") {
+    MemcpyDeviceToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDeviceToHost));
+  }
+
+  SECTION("Host to device") {
+    MemcpyHostToDeviceShell<false>(std::bind(f, _1, _2, _3, hipMemcpyHostToDevice));
+  }
+
+  SECTION("Device to device") {
+    SECTION("Peer access enabled") {
+      MemcpyDeviceToDeviceShell<false, true>(std::bind(f, _1, _2, _3, hipMemcpyDeviceToDevice));
+    }
+    SECTION("Peer access disabled") {
+      MemcpyDeviceToDeviceShell<false, false>(std::bind(f, _1, _2, _3, hipMemcpyDeviceToDevice));
+    }
+  }
+
+  SECTION("Device to device with default kind") {
+    SECTION("Peer access enabled") {
+      MemcpyDeviceToDeviceShell<false, true>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+    }
+    SECTION("Peer access disabled") {
+      MemcpyDeviceToDeviceShell<false, false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+    }
+  }
+
+// Disabled on AMD due to defect - EXSWHTEC-209
+#if 0
+  SECTION("Host to host") {
+    MemcpyHostToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyHostToHost));
+  }
+
+  SECTION("Host to host with default kind") {
+    MemcpyHostToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+  }
+#endif
+
+// Disabled on AMD due to defect - EXSWHTEC-210
+#if 0
+  SECTION("Device to host with default kind") {
+    MemcpyDeviceToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+  }
+
+  SECTION("Host to device with default kind") {
+    MemcpyHostToDeviceShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+  }
+#endif
+
+#endif
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify API behaviour with invalid arguments:
+ *        -# pGraphExec is nullptr
+ *        -# node is nullptr
+ *        -# graph is nullptr
+ *        -# pDependencies is nullptr when numDependencies is not zero
+ *        -# A node in pDependencies originates from a different graph
+ *        -# numDependencies is invalid
+ *        -# A node is duplicated in pDependencies
+ *        -# dst is nullptr
+ *        -# src is nullptr
+ *        -# kind is an invalid enum value
+ *        -# count is zero
+ *        -# count is larger than dst allocation size
+ *        -# count is larger than src allocation size
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphAddMemcpyNode1D.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams1D_Negative_Parameters") {
+  using namespace std::placeholders;
+  hipGraph_t graph = nullptr;
  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  // Instantiate the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));

-  SECTION("Pass hGraphExec as nullptr") {
-    ret = hipGraphExecMemcpyNodeSetParams1D(nullptr, memcpyH2D, A_d, A_h,
-                                            Nbytes, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
+  int src[2] = {}, dst[2] = {};
+
+  hipGraphNode_t node = nullptr;
+  HIP_CHECK(
+      hipGraphAddMemcpyNode1D(&node, graph, nullptr, 0, dst, src, sizeof(dst), hipMemcpyDefault));
+
+  hipGraphExec_t graph_exec = nullptr;
+  HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
+
+  SECTION("pGraphExec == nullptr") {
+    HIP_CHECK_ERROR(
+        hipGraphExecMemcpyNodeSetParams1D(nullptr, node, dst, src, sizeof(dst), hipMemcpyDefault),
+        hipErrorInvalidValue);
  }
-  SECTION("Pass GraphNode as nullptr") {
-    ret = hipGraphExecMemcpyNodeSetParams1D(graphExec, nullptr, A_d, A_h,
-                                            Nbytes, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+  SECTION("node == nullptr") {
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams1D(graph_exec, nullptr, dst, src, sizeof(dst),
+                                                      hipMemcpyDefault),
+                    hipErrorInvalidValue);
  }
-  SECTION("Pass destination ptr is nullptr") {
-    ret = hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, nullptr, A_h,
-                                            Nbytes, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+  MemcpyWithDirectionCommonNegativeTests(
+      std::bind(hipGraphExecMemcpyNodeSetParams1D, graph_exec, node, _1, _2, _3, _4), dst, src,
+      sizeof(dst), hipMemcpyDefault);
+
+  SECTION("count == 0") {
+    HIP_CHECK_ERROR(
+        hipGraphExecMemcpyNodeSetParams1D(graph_exec, node, dst, src, 0, hipMemcpyDefault),
+        hipErrorInvalidValue);
  }
-  SECTION("Pass source ptr is nullptr") {
-    ret = hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d, nullptr,
-                                            Nbytes, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+  SECTION("count larger than dst allocation size") {
+    LinearAllocGuard<int> dev_dst(LinearAllocs::hipMalloc, sizeof(int));
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams1D(graph_exec, node, dev_dst.ptr(), src,
+                                                      sizeof(src), hipMemcpyDefault),
+                    hipErrorInvalidValue);
  }
-  SECTION("Pass count as zero") {
-    ret = hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d, A_h,
-                                            0, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+  SECTION("count larger than src allocation size") {
+    LinearAllocGuard<int> dev_src(LinearAllocs::hipMalloc, sizeof(int));
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams1D(graph_exec, node, dst, dev_src.ptr(),
+                                                      sizeof(dst), hipMemcpyDefault),
+                    hipErrorInvalidValue);
  }
-  SECTION("Pass same pointer as source ptr and destination ptr") {
-    ret = hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d, A_d,
-                                            Nbytes, hipMemcpyDeviceToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass overlap memory where destination ptr is ahead of source ptr") {
-    ret = hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d, A_d-5,
-                                            Nbytes, hipMemcpyDeviceToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass overlap memory where source ptr is ahead of destination ptr") {
-    ret = hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d+5, A_d,
-                                            Nbytes, hipMemcpyDeviceToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Copy more than allocated memory") {
-    ret = hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d, A_h,
-                                            Nbytes+8, hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Copy less than allocated memory") {
-    ret = hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d, A_h,
-                                            Nbytes-8, hipMemcpyHostToDevice);
-    REQUIRE(hipSuccess == ret);
-  }
-  SECTION("Change the hipMemcpyKind from H2D to D2H") {
-    ret = hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d, A_h,
-                                            Nbytes, hipMemcpyDeviceToHost);
-    REQUIRE(hipSuccess != ret);
-  }
-  HIP_CHECK(hipFree(A_d));
-  free(A_h);
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
+
+  HIP_CHECK(hipGraphExecDestroy(graph_exec));
  HIP_CHECK(hipGraphDestroy(graph));
 }

-/* Test verifies hipGraphExecMemcpyNodeSetParams1D API Functional scenarios.
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify that memcpy direction cannot be altered in an executable graph. The test is run for
+ * all memcpy directions with appropriate memory allocations.
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphExecMemcpyNodeSetParams1D.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
 */
-TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams1D_Functional") {
-  constexpr size_t N = 1024;
-  constexpr size_t Nbytes = N * sizeof(int);
-  constexpr auto blocksPerCU = 6;  // to hide latency
-  constexpr auto threadsPerBlock = 256;
-  int *A_d, *B_d, *C_d;
-  int *A_h, *B_h, *C_h;
-  size_t NElem{N};
+TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams1D_Negative_Changing_Memcpy_Direction") {
+  int host;
+  LinearAllocGuard<int> dev(LinearAllocs::hipMalloc, sizeof(int));

-  int *hData = reinterpret_cast<int*>(malloc(Nbytes));
-  REQUIRE(hData != nullptr);
-  memset(hData, 0, Nbytes);
-
-  hipGraphNode_t memcpyH2D_A, memcpyH2D_B, memcpyD2H_C;
-  hipGraphNode_t kernel_vecAdd;
-  hipKernelNodeParams kernelNodeParams{};
-  hipGraph_t graph;
-  hipGraphExec_t graphExec;
-  hipStream_t streamForGraph;
-
-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-
-  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  const auto [dir, src, dst] =
+      GENERATE_REF(std::make_tuple(hipMemcpyHostToHost, &host, &host),
+                   std::make_tuple(hipMemcpyHostToDevice, &host, dev.ptr()),
+                   std::make_tuple(hipMemcpyDeviceToHost, dev.ptr(), &host),
+                   std::make_tuple(hipMemcpyDeviceToDevice, dev.ptr(), dev.ptr()));

+  hipGraph_t graph = nullptr;
  HIP_CHECK(hipGraphCreate(&graph, 0));

-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_A, graph, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+  hipGraphNode_t node = nullptr;
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&node, graph, nullptr, 0, dst, src, sizeof(int), dir));

-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_B, graph, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+  hipGraphExec_t graph_exec = nullptr;
+  HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));

-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_C, graph, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));
+  const auto set_dir = GENERATE(hipMemcpyHostToHost, hipMemcpyHostToDevice, hipMemcpyDeviceToHost,
+                                hipMemcpyDeviceToDevice, hipMemcpyDefault);
+  if (dir == set_dir) {
+    HIP_CHECK(hipGraphExecDestroy(graph_exec));
+    HIP_CHECK(hipGraphDestroy(graph));
+    return;
+  }

-  void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
-  kernelNodeParams.gridDim = dim3(blocks);
-  kernelNodeParams.blockDim = dim3(threadsPerBlock);
-  kernelNodeParams.sharedMemBytes = 0;
-  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs2);
-  kernelNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph, nullptr, 0,
-                                                        &kernelNodeParams));
+  HIP_CHECK_ERROR(
+      hipGraphExecMemcpyNodeSetParams1D(graph_exec, node, dst, src, sizeof(int), set_dir),
+      hipErrorInvalidValue);

-  // Create dependencies
-  HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D_A, &kernel_vecAdd, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D_B, &kernel_vecAdd, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph, &kernel_vecAdd, &memcpyD2H_C, 1));
-
-  // Instantiate the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-
-  HIP_CHECK(hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyD2H_C, hData,
-                                        C_d, Nbytes, hipMemcpyDeviceToHost));
-
-  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-  HIP_CHECK(hipStreamSynchronize(streamForGraph));
-
-  // Verify graph execution result
-  HipTest::checkVectorADD(A_h, B_h, hData, N);
-
-  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
+  HIP_CHECK(hipGraphExecDestroy(graph_exec));
  HIP_CHECK(hipGraphDestroy(graph));
-  free(hData);
 }
@@ -0,0 +1,201 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+Testcase Scenarios :
+Functional-
+1) Instantiate a graph with memcpy node, obtain executable graph and update the
+   node params with set exec api call. Make sure they are taking effect.
+Negative-
+1) Pass hGraphExec as nullptr and check if api returns error.
+2) Pass GraphNode as nullptr and check if api returns error.
+3) Pass destination ptr is nullptr, api expected to return error code.
+4) Pass source ptr is nullptr, api expected to return error code.
+5) Pass count as zero, api expected to return error code.
+6) Pass same pointer as source ptr and destination ptr, api expected to return error code.
+7) Pass overlap memory address as source ptr and destination ptr, api expected to return error code.
+7) Pass overlap memory as source ptr and destination ptr where source ptr is ahead of destination ptr, api expected to return error code.
+8) Pass overlap memory as source ptr and destination ptr where destination ptr is ahead of source ptr, api expected to return error code.
+9) If count is more than allocated size for source and destination ptr, api should return error code.
+10) If count is less than allocated size for source and destination ptr, api should return error code.
+11) Change the hipMemcpyKind from H2D to D2H but allocate pointer memory for H2D, api should return error code.
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_kernels.hh>
+#include <memcpy1d_tests_common.hh>
+
+/* Test verifies hipGraphExecMemcpyNodeSetParams1D API Functional scenarios.
+ */
+TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams1D_Functional") {
+  constexpr size_t N = 1024;
+  constexpr size_t Nbytes = N * sizeof(int);
+  constexpr auto blocksPerCU = 6;  // to hide latency
+  constexpr auto threadsPerBlock = 256;
+  int *A_d, *B_d, *C_d;
+  int *A_h, *B_h, *C_h;
+  size_t NElem{N};
+
+  int *hData = reinterpret_cast<int*>(malloc(Nbytes));
+  REQUIRE(hData != nullptr);
+  memset(hData, 0, Nbytes);
+
+  hipGraphNode_t memcpyH2D_A, memcpyH2D_B, memcpyD2H_C;
+  hipGraphNode_t kernel_vecAdd;
+  hipKernelNodeParams kernelNodeParams{};
+  hipGraph_t graph;
+  hipGraphExec_t graphExec;
+  hipStream_t streamForGraph;
+
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_A, graph, nullptr, 0, A_d, A_h,
+                                    Nbytes, hipMemcpyHostToDevice));
+
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_B, graph, nullptr, 0, B_d, B_h,
+                                    Nbytes, hipMemcpyHostToDevice));
+
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_C, graph, nullptr, 0, C_h, C_d,
+                                    Nbytes, hipMemcpyDeviceToHost));
+
+  void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
+  kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
+  kernelNodeParams.gridDim = dim3(blocks);
+  kernelNodeParams.blockDim = dim3(threadsPerBlock);
+  kernelNodeParams.sharedMemBytes = 0;
+  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs2);
+  kernelNodeParams.extra = nullptr;
+  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph, nullptr, 0,
+                                                        &kernelNodeParams));
+
+  // Create dependencies
+  HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D_A, &kernel_vecAdd, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D_B, &kernel_vecAdd, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph, &kernel_vecAdd, &memcpyD2H_C, 1));
+
+  // Instantiate the graph
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+
+  HIP_CHECK(hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyD2H_C, hData,
+                                        C_d, Nbytes, hipMemcpyDeviceToHost));
+
+  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+  HIP_CHECK(hipStreamSynchronize(streamForGraph));
+
+  // Verify graph execution result
+  HipTest::checkVectorADD(A_h, B_h, hData, N);
+
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+  HIP_CHECK(hipGraphDestroy(graph));
+  free(hData);
+}
+
+/* Test verifies hipGraphExecMemcpyNodeSetParams1D API Negative scenarios.
+ */
+TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams1D_Negative") {
+  constexpr size_t N = 1024;
+  constexpr size_t Nbytes = N * sizeof(int);
+
+  LinearAllocGuard<int> A_d(LinearAllocs::hipMalloc, Nbytes);
+  LinearAllocGuard<int> A_h(LinearAllocs::malloc, Nbytes);
+  memset(A_h.ptr(), 0, Nbytes);
+
+  hipGraph_t graph;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  hipGraphNode_t memcpyH2D;
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0, A_d.ptr(), A_h.ptr(),
+                                    Nbytes, hipMemcpyHostToDevice));
+  // Instantiate the graph
+  hipGraphExec_t graphExec;
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
+
+  SECTION("Pass hGraphExec as nullptr") {
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams1D(nullptr, memcpyH2D, A_d.ptr(),
+                                                      A_h.ptr(), Nbytes,
+                                                      hipMemcpyHostToDevice),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("Pass GraphNode as nullptr") {
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams1D(graphExec, nullptr, A_d.ptr(),
+                                                      A_h.ptr(), Nbytes,
+                                                      hipMemcpyHostToDevice),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("Pass destination ptr is nullptr") {
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D,
+                                                      nullptr, A_h.ptr(), Nbytes,
+                                                      hipMemcpyHostToDevice),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("Pass source ptr is nullptr") {
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d.ptr(),
+                                                      nullptr, Nbytes,
+                                                      hipMemcpyHostToDevice),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("Pass count as zero") {
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d.ptr(),
+                                                      A_h.ptr(), 0,
+                                                      hipMemcpyHostToDevice),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("Pass same pointer as source ptr and destination ptr") {
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d.ptr(),
+                                                      A_d.ptr(), Nbytes,
+                                                      hipMemcpyDeviceToDevice),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("Pass overlap memory where destination ptr is ahead of source ptr") {
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d.ptr(),
+                                                      A_d.ptr() - 5, Nbytes,
+                                                      hipMemcpyDeviceToDevice),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("Pass overlap memory where source ptr is ahead of destination ptr") {
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D,
+                                                      A_d.ptr() + 5, A_d.ptr(), Nbytes,
+                                                      hipMemcpyDeviceToDevice),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("Copy more than allocated memory") {
+    HIP_CHECK_ERROR(hipGraphExecMemcpyNodeSetParams1D(graphExec, memcpyH2D, A_d.ptr(),
+                                                      A_h.ptr(), Nbytes + 8,
+                                                      hipMemcpyHostToDevice),
+                    hipErrorInvalidValue);
+  }
+
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+}
@@ -0,0 +1,263 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+Testcase Scenarios :
+Functional-
+1) Instantiate a graph with memcpy node, obtain executable graph and update the hipMemcpy3DParms node params with set. Make sure they are taking effect.
+Negative-
+1) Pass hGraphExec as nullptr and verify api returns error code.
+2) Pass node as nullptr and verify api returns error code.
+3) Pass pNodeParams as nullptr and verify api returns error code.
+4) Pass pNodeParams as empty structure object and verify api returns error code.
+5) API expects atleast one memcpy src pointer to be set. When hipMemcpy3DParms::srcArray and hipMemcpy3DParms::srcPtr.ptr both are nullptr, api expected to return error code.
+6) API expects atleast one memcpy dst pointer to be set. When hipMemcpy3DParms::dstArray and hipMemcpy3DParms::dstPtr.ptr both are nullptr, api expected to return error code.
+7) Passing different element size for hipMemcpy3DParms::srcArray and hipMemcpy3DParms::dstArray is expected to return error code.
+8) Pass node of different graph and verify api returns error code.
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+
+/* Test verifies hipGraphExecMemcpyNodeSetParams API Negative scenarios.
+ */
+TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams_Negative") {
+  CHECK_IMAGE_SUPPORT
+
+  constexpr int width{10}, height{10}, depth{10};
+  hipArray_t devArray, devArray2;
+  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
+  hipMemcpy3DParms myparms;
+  hipError_t ret;
+  int* hData;
+  uint32_t size = width * height * depth * sizeof(int);
+  hData = reinterpret_cast<int*>(malloc(size));
+  REQUIRE(hData != nullptr);
+  memset(hData, 0, size);
+  for (int i = 0; i < depth; i++) {
+    for (int j = 0; j < height; j++) {
+      for (int k = 0; k < width; k++) {
+        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
+      }
+    }
+  }
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
+                                              0, 0, 0, formatKind);
+  HIP_CHECK(hipMalloc3DArray(&devArray, &channelDesc, make_hipExtent(width,
+                             height, depth), hipArrayDefault));
+  HIP_CHECK(hipMalloc3DArray(&devArray2, &channelDesc, make_hipExtent(width+1,
+                             height+1, depth+1), hipArrayDefault));
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+  myparms.srcPos = make_hipPos(0, 0, 0);
+  myparms.dstPos = make_hipPos(0, 0, 0);
+  myparms.extent = make_hipExtent(width , height, depth);
+  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
+                                      width, height);
+  myparms.dstArray = devArray;
+  myparms.kind = hipMemcpyHostToDevice;
+
+  hipGraph_t graph;
+  hipGraphNode_t memcpyNode;
+  hipGraphExec_t graphExec;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &myparms));
+
+  // Instantiate the graph
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
+  SECTION("Pass hGraphExec as nullptr") {
+    ret = hipGraphExecMemcpyNodeSetParams(nullptr, memcpyNode, &myparms);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass node as nullptr") {
+    ret = hipGraphExecMemcpyNodeSetParams(graphExec, nullptr, &myparms);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass pNodeParams as nullptr") {
+    ret = hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode, nullptr);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass pNodeParams as empty structure object") {
+    hipMemcpy3DParms temp{};
+    ret = hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode, &temp);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("API expects atleast one memcpy src pointer to be set") {
+    hipMemcpy3DParms temp;
+    memset(&temp, 0x0, sizeof(hipMemcpy3DParms));
+    temp.srcPos = make_hipPos(0, 0, 0);
+    temp.dstPos = make_hipPos(0, 0, 0);
+    temp.extent = make_hipExtent(width , height, depth);
+    temp.dstArray = devArray;
+    temp.kind = hipMemcpyHostToDevice;
+    ret = hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode, &temp);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("API expects atleast one memcpy dst pointer to be set") {
+    hipMemcpy3DParms temp;
+    memset(&temp, 0x0, sizeof(hipMemcpy3DParms));
+    temp.srcPos = make_hipPos(0, 0, 0);
+    temp.dstPos = make_hipPos(0, 0, 0);
+    temp.extent = make_hipExtent(width , height, depth);
+    temp.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
+                                      width, height);
+    temp.kind = hipMemcpyHostToDevice;
+    ret = hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode, &temp);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Passing different element size for hipMemcpy3DParms::srcArray"
+                   "and hipMemcpy3DParms::dstArray") {
+    hipMemcpy3DParms temp;
+    memset(&temp, 0x0, sizeof(hipMemcpy3DParms));
+    temp.srcPos = make_hipPos(0, 0, 0);
+    temp.dstPos = make_hipPos(0, 0, 0);
+    temp.extent = make_hipExtent(width , height, depth);
+    temp.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
+                                      width, height);
+    temp.kind = hipMemcpyHostToDevice;
+    temp.srcArray = devArray;
+    temp.dstArray = devArray2;
+    ret = hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode, &temp);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Check with other graph node") {
+    hipGraph_t graph1;
+    hipGraphNode_t memcpyNode1;
+    HIP_CHECK(hipGraphCreate(&graph1, 0));
+    HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode1, graph1, NULL, 0, &myparms));
+    ret = hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode1, &myparms);
+    REQUIRE(hipErrorInvalidValue == ret);
+    HIP_CHECK(hipGraphDestroy(graph1));
+  }
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipFreeArray(devArray));
+  HIP_CHECK(hipFreeArray(devArray2));
+  free(hData);
+}
+
+/* Test verifies hipGraphExecMemcpyNodeSetParams API Functional scenarios.
+ */
+TEST_CASE("Unit_hipGraphExecMemcpyNodeSetParams_Functional") {
+  CHECK_IMAGE_SUPPORT
+
+  constexpr int XSIZE = 1024;
+  int harray1D[XSIZE]{};
+  int harray1Dres[XSIZE]{};
+  constexpr int width{XSIZE};
+  hipArray_t devArray1, devArray2;
+  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
+  hipMemcpy3DParms myparams;
+  hipGraph_t graph;
+  hipGraphNode_t memcpyNode;
+  std::vector<hipGraphNode_t> dependencies;
+  hipStream_t streamForGraph;
+  hipGraphExec_t graphExec;
+
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+  // Initialize 1D object
+  for (int i = 0; i < XSIZE; i++) {
+    harray1D[i] = i + 1;
+  }
+
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
+                                                          0, 0, 0, formatKind);
+  // Allocate 1D device array by passing depth(0), height(0)
+  HIP_CHECK(hipMalloc3DArray(&devArray1, &channelDesc,
+                       make_hipExtent(width, 0, 0), hipArrayDefault));
+  HIP_CHECK(hipMalloc3DArray(&devArray2, &channelDesc,
+                       make_hipExtent(width, 0, 0), hipArrayDefault));
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  // Host to Device
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.extent = make_hipExtent(width, 1, 1);
+  myparams.srcPtr = make_hipPitchedPtr(harray1D, width * sizeof(int),
+                                       width, 1);
+  myparams.dstArray = devArray1;
+  myparams.kind = hipMemcpyHostToDevice;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, nullptr, 0, &myparams));
+  dependencies.push_back(memcpyNode);
+
+  // Device to Device
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.srcArray = devArray1;
+  myparams.dstArray = devArray2;
+  myparams.extent = make_hipExtent(width, 1, 1);
+  myparams.kind = hipMemcpyDeviceToDevice;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
+                                  dependencies.size(), &myparams));
+  dependencies.clear();
+  dependencies.push_back(memcpyNode);
+
+  // Device to host
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.extent = make_hipExtent(width, 1, 1);
+  myparams.dstPtr = make_hipPitchedPtr(harray1Dres, width * sizeof(int),
+                                       width, 1);
+  myparams.srcArray = devArray2;
+  myparams.kind = hipMemcpyDeviceToHost;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
+                                  dependencies.size(), &myparams));
+
+  // Instantiate the graph
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+
+  int harray1Dupdate[XSIZE]{};
+  hipArray_t devArray3;
+  HIP_CHECK(hipMalloc3DArray(&devArray3, &channelDesc,
+                       make_hipExtent(width, 0, 0), hipArrayDefault));
+
+  // D2H updated with different pointer harray1Dres -> harray1Dupdate
+  memset(&myparams, 0x0, sizeof(hipMemcpy3DParms));
+  myparams.srcPos = make_hipPos(0, 0, 0);
+  myparams.dstPos = make_hipPos(0, 0, 0);
+  myparams.extent = make_hipExtent(width, 1, 1);
+  myparams.dstPtr = make_hipPitchedPtr(harray1Dupdate, width * sizeof(int),
+                                      width, 1);
+  myparams.srcArray = devArray2;
+  myparams.kind = hipMemcpyDeviceToHost;
+
+  HIP_CHECK(hipGraphExecMemcpyNodeSetParams(graphExec, memcpyNode, &myparams));
+
+  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+  HIP_CHECK(hipStreamSynchronize(streamForGraph));
+
+  // Validate result
+  for (int i = 0; i < XSIZE; i++) {
+    if (harray1D[i] != harray1Dupdate[i]) {
+      INFO("harray1D: " << harray1D[i] << " harray1Dupdate: " <<
+                      harray1Dupdate[i] << " mismatch at : " << i);
+      REQUIRE(false);
+    }
+  }
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+  HIP_CHECK(hipFreeArray(devArray1));
+  HIP_CHECK(hipFreeArray(devArray2));
+}
@@ -1,13 +1,16 @@
 /*
 Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
+
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -18,178 +21,66 @@ THE SOFTWARE.
 */

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
 #include <hip_test_kernels.hh>

-/**
-* @addtogroup hipGraphKernelNodeGetAttribute hipGraphKernelNodeGetAttribute
-* @{
-* @ingroup GraphTest
-* `hipGraphKernelNodeGetAttribute(hipGraphNode_t hNode,
-*          hipKernelNodeAttrID attr, hipKernelNodeAttrValue* value_out )` -
-* Queries node attribute.
-*/
+#define THREADS_PER_BLOCK 512

-/**
-* Test Description
-* ------------------------
-*  - Functional Test for API - hipGraphKernelNodeGetAttribute
-*    1) GetKernelAttribute for ID hipKernelNodeAttributeCooperative
-*    2) GetKernelAttribute for ID hipKernelNodeAttributeAccessPolicyWindow
-* Test source
-* ------------------------
-*  - unit/graph/hipGraphKernelNodeGetAttribute.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+TEST_CASE("Unit_hipGraphKernelNodeGetAttribute_Negative_Parameters") {
+  constexpr int N = 1024;

-TEST_CASE("Unit_hipGraphKernelNodeGetAttribute_Functional") {
-  constexpr size_t N = 1024;
-  constexpr size_t Nbytes = N * sizeof(int);
-  constexpr auto blocksPerCU = 6;  // to hide latency
-  constexpr auto threadsPerBlock = 256;
-  hipGraph_t graph;
-  hipGraphExec_t graphExec;
-  hipGraphNode_t memcpy_A, memcpy_B, memcpy_C, kernel_vecAdd;
-  hipKernelNodeParams kNodeParams{};
-  hipStream_t stream;
  int *A_d, *B_d, *C_d;
-  int *A_h, *B_h, *C_h;
-  size_t NElem{N};
+  HIP_CHECK(hipMalloc(&A_d, sizeof(int) * N));
+  HIP_CHECK(hipMalloc(&B_d, sizeof(int) * N));
+  HIP_CHECK(hipMalloc(&C_d, sizeof(int) * N));

-  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
-
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipStreamCreate(&stream));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));
-
-  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
-  kNodeParams.gridDim = dim3(blocks);
-  kNodeParams.blockDim = dim3(threadsPerBlock);
-  kNodeParams.sharedMemBytes = 0;
-  kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
-  kNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph, nullptr, 0,
-                                  &kNodeParams));
-
-  // Create dependencies
-  HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_A, &kernel_vecAdd, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_B, &kernel_vecAdd, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph, &kernel_vecAdd, &memcpy_C, 1));
-
-  hipKernelNodeAttrValue value_out;
-  memset(&value_out, 0, sizeof(hipKernelNodeAttrValue));
-
-  SECTION("GetKernelAttribute for hipKernelNodeAttributeCooperative") {
-    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeCooperative, &value_out));
-  }
-  SECTION("GetKernelAttribute for hipKernelNodeAttributeAccessPolicyWindow") {
-    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_out));
-  }
-
-  // Instantiate and launch the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
-  HIP_CHECK(hipGraphLaunch(graphExec, stream));
-  HIP_CHECK(hipStreamSynchronize(stream));
-
-  // Verify graph execution result
-  HipTest::checkVectorADD<int>(A_h, B_h, C_h, N);
-
-  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(stream));
-}
-
-/**
-* Test Description
-* ------------------------
-*  - Negative Test for API - hipGraphKernelNodeGetAttribute
-*    1) Pass kernel node as nullptr for Get attribute api & verify
-*    2) Pass KernelNodeAttrID as negative value for Get attribute api & verify
-*    3) Pass KernelNodeAttrID as INT_MAX value for Get attribute api & verify
-*    4) Pass KernelNodeAttrValue as nullptr for Get attribute api & verify
-* Test source
-* ------------------------
-*  - unit/graph/hipGraphKernelNodeGetAttribute.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
-
-TEST_CASE("Unit_hipGraphKernelNodeGetAttribute_Negative") {
-  constexpr size_t N = 1024;
-  constexpr size_t Nbytes = N * sizeof(int);
-  constexpr auto blocksPerCU = 6;  // to hide latency
-  constexpr auto threadsPerBlock = 256;
  hipGraph_t graph;
-  hipGraphNode_t memcpy_A, memcpy_B, memcpy_C, kernel_vecAdd;
-  hipKernelNodeParams kNodeParams{};
-  hipStream_t stream;
-  int *A_d, *B_d, *C_d;
-  int *A_h, *B_h, *C_h;
-  size_t NElem{N};
-  hipError_t ret;
-
-  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
-
  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipStreamCreate(&stream));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));

-  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
-  kNodeParams.gridDim = dim3(blocks);
-  kNodeParams.blockDim = dim3(threadsPerBlock);
-  kNodeParams.sharedMemBytes = 0;
-  kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
-  kNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph, nullptr, 0,
-                                  &kNodeParams));
+  hipKernelNodeParams node_params{};
+  node_params.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
+  node_params.gridDim = dim3(N / THREADS_PER_BLOCK, 1, 1);
+  node_params.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);

-  hipKernelNodeAttrValue value_out;
-  memset(&value_out, 0, sizeof(hipKernelNodeAttrValue));
+  size_t N_elem{N};
+  void* kernel_params[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&N_elem)};
+  node_params.kernelParams = reinterpret_cast<void**>(kernel_params);

-  SECTION("Pass kernel node as nullptr for Get attribute api") {
-    ret = hipGraphKernelNodeGetAttribute(nullptr,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_out);
-    REQUIRE(hipErrorInvalidValue == ret);
+  hipGraphNode_t graph_node;
+  HIP_CHECK(hipGraphAddKernelNode(&graph_node, graph, nullptr, 0, &node_params));
+
+  hipKernelNodeAttrValue node_attribute;
+
+  SECTION("node == nullptr") {
+    HIP_CHECK_ERROR(hipGraphKernelNodeGetAttribute(
+                        nullptr, hipKernelNodeAttributeAccessPolicyWindow, &node_attribute),
+                    hipErrorInvalidValue);
  }
-  SECTION("Pass KernelNodeAttrID as negative value for Get attribute api") {
-    ret = hipGraphKernelNodeGetAttribute(kernel_vecAdd,
-                         hipKernelNodeAttrID(-1), &value_out);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+  SECTION("node is not a kernel node") {
+    hipGraphNode_t empty_node;
+    HIP_CHECK(hipGraphAddEmptyNode(&empty_node, graph, nullptr, 0));
+    HIP_CHECK_ERROR(hipGraphKernelNodeGetAttribute(
+                        empty_node, hipKernelNodeAttributeAccessPolicyWindow, &node_attribute),
+                    hipErrorInvalidValue);
  }
-  SECTION("Pass KernelNodeAttrID as INT_MAX value for Get attribute api") {
-    ret = hipGraphKernelNodeGetAttribute(kernel_vecAdd,
-                         hipKernelNodeAttrID(INT_MAX), &value_out);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+  SECTION("invalid attribute") {
+    HIP_CHECK_ERROR(hipGraphKernelNodeGetAttribute(graph_node, static_cast<hipKernelNodeAttrID>(-1),
+                                                   &node_attribute),
+                    hipErrorInvalidValue);
  }
-#if HT_AMD  // getting SIGSEGV error in Cuda Setup
-  SECTION("Pass KernelNodeAttrValue as nullptr for Get attribute api") {
-    ret = hipGraphKernelNodeGetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, nullptr);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+#if HT_AMD  // segfaults on NVIDIA
+  SECTION("value == nullptr") {
+    HIP_CHECK_ERROR(hipGraphKernelNodeGetAttribute(
+                        graph_node, hipKernelNodeAttributeAccessPolicyWindow, nullptr),
+                    hipErrorInvalidValue);
  }
 #endif

-  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(stream));
+
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(B_d));
+  HIP_CHECK(hipFree(C_d));
 }
@@ -1,13 +1,16 @@
 /*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
+
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -18,353 +21,214 @@ THE SOFTWARE.
 */

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
 #include <hip_test_kernels.hh>

-/**
-* @addtogroup hipGraphKernelNodeSetAttribute hipGraphKernelNodeSetAttribute
-* @{
-* @ingroup GraphTest
-* `hipGraphKernelNodeSetAttribute(hipGraphNode_t hNode,
-*          hipKernelNodeAttrID attr, const hipKernelNodeAttrValue* value )` -
-* Sets node attribute.
-*/
+#define THREADS_PER_BLOCK 512

-/**
-* Test Description
-* ------------------------
-*  - Functional Test for API - hipGraphKernelNodeSetAttribute
-*    1) Check hipGraphKernelNodeSetAttribute for AccessPolicyWindow attributes
-*    2) Check hipGraphKernelNodeSetAttribute for cooperative attributes
-*    3) Check hipGraphKernelNodeSetAttribute for window cooperative attributes
-* Test source
-* ------------------------
-*  - unit/graph/hipGraphKernelNodeGetAttribute.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+namespace {
+constexpr std::array<hipAccessProperty, 3> kAccessProperties{
+    hipAccessPropertyNormal, hipAccessPropertyStreaming, hipAccessPropertyPersisting};
+}  // anonymous namespace

-static bool validateKernelNodeAttrValue(hipKernelNodeAttrValue in,
-                                        hipKernelNodeAttrValue out) {
-  if ((in.accessPolicyWindow.base_ptr  != out.accessPolicyWindow.base_ptr)  ||
-      (in.accessPolicyWindow.hitProp   != out.accessPolicyWindow.hitProp)   ||
-      (in.accessPolicyWindow.hitRatio  != out.accessPolicyWindow.hitRatio)  ||
-      (in.accessPolicyWindow.missProp  != out.accessPolicyWindow.missProp)  ||
-      (in.accessPolicyWindow.num_bytes != out.accessPolicyWindow.num_bytes) ||
-      (in.cooperative != out.cooperative)) {
-    return false;
-  }
-  return true;
+static bool CompareAccessPolicyWindow(const hipKernelNodeAttrValue& lhs,
+                                      const hipKernelNodeAttrValue& rhs) {
+  return lhs.accessPolicyWindow.base_ptr == rhs.accessPolicyWindow.base_ptr &&
+      lhs.accessPolicyWindow.num_bytes == rhs.accessPolicyWindow.num_bytes &&
+      lhs.accessPolicyWindow.hitRatio == rhs.accessPolicyWindow.hitRatio &&
+      lhs.accessPolicyWindow.hitProp == rhs.accessPolicyWindow.hitProp &&
+      lhs.accessPolicyWindow.missProp == rhs.accessPolicyWindow.missProp;
 }

-TEST_CASE("Unit_hipGraphKernelNodeSetAttribute_Functional") {
-  constexpr size_t N = 1024;
-  constexpr size_t Nbytes = N * sizeof(int);
-  constexpr auto blocksPerCU = 6;  // to hide latency
-  constexpr auto threadsPerBlock = 256;
-  hipGraph_t graph;
-  hipGraphExec_t graphExec;
-  hipGraphNode_t memcpy_A, memcpy_B, memcpy_C, kernel_vecAdd;
-  hipKernelNodeParams kNodeParams{};
-  hipStream_t stream;
+TEST_CASE("Unit_hipGraphKernelNodeSetAttribute_Positive_AccessPolicyWindow") {
+  constexpr int N = 1024;
+
+  const auto hit_prop = GENERATE(from_range(begin(kAccessProperties), end(kAccessProperties)));
+  const auto miss_prop = GENERATE(from_range(begin(kAccessProperties), end(kAccessProperties) - 1));
+
  int *A_d, *B_d, *C_d;
-  int *A_h, *B_h, *C_h;
-  size_t NElem{N};
-
-  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HIP_CHECK(hipMalloc(&A_d, sizeof(int) * N));
+  HIP_CHECK(hipMalloc(&B_d, sizeof(int) * N));
+  HIP_CHECK(hipMalloc(&C_d, sizeof(int) * N));

+  hipGraph_t graph;
  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipStreamCreate(&stream));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));

-  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
-  kNodeParams.gridDim = dim3(blocks);
-  kNodeParams.blockDim = dim3(threadsPerBlock);
-  kNodeParams.sharedMemBytes = 0;
-  kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
-  kNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph, nullptr, 0,
-                                  &kNodeParams));
+  hipKernelNodeParams node_params{};
+  node_params.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
+  node_params.gridDim = dim3(N / THREADS_PER_BLOCK, 1, 1);
+  node_params.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);

-  // Create dependencies
-  HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_A, &kernel_vecAdd, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_B, &kernel_vecAdd, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph, &kernel_vecAdd, &memcpy_C, 1));
+  size_t N_elem{N};
+  void* kernel_params[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&N_elem)};
+  node_params.kernelParams = reinterpret_cast<void**>(kernel_params);

-  hipKernelNodeAttrValue value_in, value_out;
+  hipGraphNode_t graph_node;
+  HIP_CHECK(hipGraphAddKernelNode(&graph_node, graph, nullptr, 0, &node_params));

-  SECTION("Check hipGraphKernelNodeSetAttribute for AccessPolicyWindow") {
-    memset(&value_in, 0, sizeof(hipKernelNodeAttrValue));
-    memset(&value_out, 0, sizeof(hipKernelNodeAttrValue));
+  int max_window_size;
+  HIP_CHECK(
+      hipDeviceGetAttribute(&max_window_size, hipDeviceAttributeAccessPolicyMaxWindowSize, 0));

-    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
-              hipKernelNodeAttributeAccessPolicyWindow, &value_in));
+  hipKernelNodeAttrValue node_attribute_1;
+  node_attribute_1.accessPolicyWindow.base_ptr = reinterpret_cast<void*>(A_d);
+  node_attribute_1.accessPolicyWindow.num_bytes =
+      std::min<unsigned long>(static_cast<unsigned long>(max_window_size), sizeof(int) * N);
+  node_attribute_1.accessPolicyWindow.hitRatio = 0.6;
+  node_attribute_1.accessPolicyWindow.hitProp = hit_prop;
+  node_attribute_1.accessPolicyWindow.missProp = miss_prop;

-    value_in.accessPolicyWindow.hitRatio = 0.8;
-    value_in.accessPolicyWindow.hitProp = hipAccessPropertyPersisting;
-    value_in.accessPolicyWindow.missProp = hipAccessPropertyStreaming;
+  HIP_CHECK(hipGraphKernelNodeSetAttribute(graph_node, hipKernelNodeAttributeAccessPolicyWindow,
+                                           &node_attribute_1));

-    HIP_CHECK(hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-              hipKernelNodeAttributeAccessPolicyWindow, &value_in));
+  hipKernelNodeAttrValue node_attribute_2;
+  HIP_CHECK(hipGraphKernelNodeGetAttribute(graph_node, hipKernelNodeAttributeAccessPolicyWindow,
+                                           &node_attribute_2));

-    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
-              hipKernelNodeAttributeAccessPolicyWindow, &value_out));
-    REQUIRE(true == validateKernelNodeAttrValue(value_in, value_out));
-  }
-  SECTION("Check hipGraphKernelNodeSetAttribute for cooperative") {
-    memset(&value_in, 0, sizeof(hipKernelNodeAttrValue));
-    memset(&value_out, 0, sizeof(hipKernelNodeAttrValue));
+  REQUIRE(CompareAccessPolicyWindow(node_attribute_1, node_attribute_2));

-    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
-              hipKernelNodeAttributeAccessPolicyWindow, &value_in));
-
-    value_in.cooperative = 2;
-
-    HIP_CHECK(hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-              hipKernelNodeAttributeAccessPolicyWindow, &value_in));
-
-    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
-              hipKernelNodeAttributeAccessPolicyWindow, &value_out));
-    REQUIRE(true == validateKernelNodeAttrValue(value_in, value_out));
-  }
-
-  SECTION("Check hipGraphKernelNodeSetAttribute for window and cooperative") {
-    memset(&value_in, 0, sizeof(hipKernelNodeAttrValue));
-    memset(&value_out, 0, sizeof(hipKernelNodeAttrValue));
-
-    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
-              hipKernelNodeAttributeAccessPolicyWindow, &value_in));
-
-    value_in.cooperative = 8;
-    value_in.accessPolicyWindow.hitRatio = 0.1;
-    value_in.accessPolicyWindow.hitProp = hipAccessPropertyPersisting;
-    value_in.accessPolicyWindow.missProp = hipAccessPropertyNormal;
-
-    HIP_CHECK(hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-              hipKernelNodeAttributeAccessPolicyWindow, &value_in));
-
-    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
-              hipKernelNodeAttributeAccessPolicyWindow, &value_out));
-    REQUIRE(true == validateKernelNodeAttrValue(value_in, value_out));
-  }
-
-  // Instantiate and launch the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
-  HIP_CHECK(hipGraphLaunch(graphExec, stream));
-  HIP_CHECK(hipStreamSynchronize(stream));
-
-  // Verify graph execution result
-  HipTest::checkVectorADD<int>(A_h, B_h, C_h, N);
-
-  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(stream));
+
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(B_d));
+  HIP_CHECK(hipFree(C_d));
 }

-/**
-* Test Description
-* ------------------------
-*  - Negative/argument Test for API - hipGraphKernelNodeSetAttribute
-*    1) Pass kernel node as nullptr for Set attribute api and verify
-*    2) Pass KernelNodeAttrID as invalid value for Set attribute api and verify
-*    3) Pass KernelNodeAttrID as INT_MAX value for Get attribute api and verify
-*    4) Pass KernelNodeAttrValue as nullptr for Set attribute api and verify
-*    5) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
-*            and pass value missProp as hipAccessPropertyPersisting
-*    6) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
-*            and pass value hitProp as hipAccessPropertyPersisting
-*    7) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
-*            and pass value accessPolicyWindow.hitRatio as 1.4
-*    8) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
-*            and pass value accessPolicyWindow.hitRatio as 0
-*    9) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
-*            and pass value accessPolicyWindow.hitRatio as 1
-*    10) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
-*            and pass value accessPolicyWindow.hitRatio as -1.8
-*    11) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
-*            and pass value accessPolicyWindow.hitRatio as -0.6
-*    12) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
-*            and pass accessPolicyWindow.num_bytes as 1024 & hitRatio as 0.6
-*    13) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
-*            and pass accessPolicyWindow.num_bytes as 1 GB & hitRatio as -0.6
-*    14) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
-*            and pass value accessPolicyWindow.num_bytes as 1 MB
-*    15) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
-*            and pass value base_ptr as nullptr
-* Test source
-* ------------------------
-*  - unit/graph/hipGraphKernelNodeSetAttribute.cc
-* Test requirements
-* ------------------------
-*  - HIP_VERSION >= 5.6
-*/
+TEST_CASE("Unit_hipGraphKernelNodeSetAttribute_Positive_Cooperative") {
+  constexpr int N = 1024;

-TEST_CASE("Unit_hipGraphKernelNodeSetAttribute_Negative") {
-  constexpr size_t N = 1024;
-  constexpr size_t Nbytes = N * sizeof(int);
-  constexpr auto blocksPerCU = 6;  // to hide latency
-  constexpr auto threadsPerBlock = 256;
-  hipGraph_t graph;
-  hipGraphNode_t memcpy_A, memcpy_B, memcpy_C, kernel_vecAdd;
-  hipKernelNodeParams kNodeParams{};
-  hipStream_t stream;
  int *A_d, *B_d, *C_d;
-  int *A_h, *B_h, *C_h;
-  size_t NElem{N};
-  hipError_t ret;
-
-  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  HIP_CHECK(hipMalloc(&A_d, sizeof(int) * N));
+  HIP_CHECK(hipMalloc(&B_d, sizeof(int) * N));
+  HIP_CHECK(hipMalloc(&C_d, sizeof(int) * N));

+  hipGraph_t graph;
  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipStreamCreate(&stream));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));

-  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
-  kNodeParams.gridDim = dim3(blocks);
-  kNodeParams.blockDim = dim3(threadsPerBlock);
-  kNodeParams.sharedMemBytes = 0;
-  kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
-  kNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph, nullptr, 0,
-                                  &kNodeParams));
+  hipKernelNodeParams node_params{};
+  node_params.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
+  node_params.gridDim = dim3(N / THREADS_PER_BLOCK, 1, 1);
+  node_params.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);

-  hipKernelNodeAttrValue value_in, value_out;
-  memset(&value_in, 0, sizeof(hipKernelNodeAttrValue));
-  memset(&value_out, 0, sizeof(hipKernelNodeAttrValue));
-  HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
-            hipKernelNodeAttributeAccessPolicyWindow, &value_in));
-  memcpy(&value_out, &value_in, sizeof(hipKernelNodeAttrValue));
+  size_t N_elem{N};
+  void* kernel_params[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&N_elem)};
+  node_params.kernelParams = reinterpret_cast<void**>(kernel_params);

-  SECTION("Pass kernel node as nullptr for Set attribute api") {
-    ret = hipGraphKernelNodeSetAttribute(nullptr,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
-    REQUIRE(hipErrorInvalidValue == ret);
+  hipGraphNode_t graph_node;
+  HIP_CHECK(hipGraphAddKernelNode(&graph_node, graph, nullptr, 0, &node_params));
+
+  hipKernelNodeAttrValue node_attribute_1;
+  node_attribute_1.cooperative = 2;
+
+  HIP_CHECK(hipGraphKernelNodeSetAttribute(graph_node, hipKernelNodeAttributeCooperative,
+                                           &node_attribute_1));
+
+  hipKernelNodeAttrValue node_attribute_2;
+  HIP_CHECK(hipGraphKernelNodeGetAttribute(graph_node, hipKernelNodeAttributeCooperative,
+                                           &node_attribute_2));
+
+  REQUIRE(node_attribute_1.cooperative == node_attribute_2.cooperative);
+
+  HIP_CHECK(hipGraphDestroy(graph));
+
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(B_d));
+  HIP_CHECK(hipFree(C_d));
+}
+
+TEST_CASE("Unit_hipGraphKernelNodeSetAttribute_Negative_Parameters") {
+  constexpr int N = 1024;
+
+  int *A_d, *B_d, *C_d;
+  HIP_CHECK(hipMalloc(&A_d, sizeof(int) * N));
+  HIP_CHECK(hipMalloc(&B_d, sizeof(int) * N));
+  HIP_CHECK(hipMalloc(&C_d, sizeof(int) * N));
+
+  hipGraph_t graph;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  hipKernelNodeParams node_params{};
+  node_params.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
+  node_params.gridDim = dim3(N / THREADS_PER_BLOCK, 1, 1);
+  node_params.blockDim = dim3(THREADS_PER_BLOCK, 1, 1);
+
+  size_t N_elem{N};
+  void* kernel_params[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&N_elem)};
+  node_params.kernelParams = reinterpret_cast<void**>(kernel_params);
+
+  hipGraphNode_t graph_node;
+  HIP_CHECK(hipGraphAddKernelNode(&graph_node, graph, nullptr, 0, &node_params));
+
+  int max_window_size;
+  HIP_CHECK(
+      hipDeviceGetAttribute(&max_window_size, hipDeviceAttributeAccessPolicyMaxWindowSize, 0));
+
+  hipKernelNodeAttrValue node_attribute;
+  node_attribute.accessPolicyWindow.base_ptr = reinterpret_cast<void*>(A_d);
+  node_attribute.accessPolicyWindow.num_bytes =
+      std::min<unsigned long>(static_cast<unsigned long>(max_window_size), sizeof(int) * N);
+  node_attribute.accessPolicyWindow.hitRatio = 0.6;
+  node_attribute.accessPolicyWindow.hitProp = hipAccessPropertyPersisting;
+  node_attribute.accessPolicyWindow.missProp = hipAccessPropertyStreaming;
+
+  SECTION("node == nullptr") {
+    HIP_CHECK_ERROR(hipGraphKernelNodeSetAttribute(
+                        nullptr, hipKernelNodeAttributeAccessPolicyWindow, &node_attribute),
+                    hipErrorInvalidValue);
  }
-  SECTION("Pass KernelNodeAttrID as invalid value for Set attribute api") {
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                         hipKernelNodeAttrID(-1), &value_in);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+  SECTION("node is not a kernel node") {
+    hipGraphNode_t empty_node;
+    HIP_CHECK(hipGraphAddEmptyNode(&empty_node, graph, nullptr, 0));
+    HIP_CHECK_ERROR(hipGraphKernelNodeSetAttribute(
+                        empty_node, hipKernelNodeAttributeAccessPolicyWindow, &node_attribute),
+                    hipErrorInvalidValue);
  }
-  SECTION("Pass KernelNodeAttrID as INT_MAX value for Set attribute api") {
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                         hipKernelNodeAttrID(INT_MAX), &value_in);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+  SECTION("invalid attribute") {
+    HIP_CHECK_ERROR(hipGraphKernelNodeSetAttribute(graph_node, static_cast<hipKernelNodeAttrID>(-1),
+                                                   &node_attribute),
+                    hipErrorInvalidValue);
  }
-#if HT_AMD  // getting SIGSEGV error in Cuda Setup
-  SECTION("Pass KernelNodeAttrValue as nullptr for Set attribute api") {
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, nullptr);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+#if HT_AMD  // segfaults on NVIDIA
+  SECTION("value == nullptr") {
+    HIP_CHECK_ERROR(hipGraphKernelNodeSetAttribute(
+                        graph_node, hipKernelNodeAttributeAccessPolicyWindow, nullptr),
+                    hipErrorInvalidValue);
  }
 #endif
-  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
-          " and pass value missProp as hipAccessPropertyPersisting") {
-    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
-    value_in.accessPolicyWindow.missProp = hipAccessPropertyPersisting;
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
-          " and pass value hitProp as hipAccessPropertyPersisting") {
-    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
-    value_in.accessPolicyWindow.hitProp = hipAccessPropertyPersisting;
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
-    REQUIRE(hipSuccess == ret);
-  }
-  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
-          " and pass value accessPolicyWindow.hitRatio as 1.4") {
-    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
-    value_in.accessPolicyWindow.hitRatio = 1.4;
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
-          " and pass value accessPolicyWindow.hitRatio as 0") {
-    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
-    value_in.accessPolicyWindow.hitRatio = 0;
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
-    REQUIRE(hipSuccess == ret);
-  }
-  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
-          " and pass value accessPolicyWindow.hitRatio as 1") {
-    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
-    value_in.accessPolicyWindow.hitRatio = 1;
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
-    REQUIRE(hipSuccess == ret);
-  }
-  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
-          " and pass value accessPolicyWindow.hitRatio as -1.8") {
-    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
-    value_in.accessPolicyWindow.hitRatio = -1.8;
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
-          " and pass value accessPolicyWindow.hitRatio as -0.6") {
-    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
-    value_in.accessPolicyWindow.hitRatio = -0.6;
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
-          " & pass accessPolicyWindow.num_bytes as 1024 & hitRatio as 0.6") {
-    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
-    value_in.accessPolicyWindow.num_bytes = 1024;
-    value_in.accessPolicyWindow.hitRatio = 0.6;
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
-          " & pass accessPolicyWindow.num_bytes as 1 GB & hitRatio as -0.6") {
-    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
-    value_in.accessPolicyWindow.num_bytes = 1024 * 1024 * 1024;
-    value_in.accessPolicyWindow.hitRatio = -0.6;
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
-          " and pass value accessPolicyWindow.num_bytes as 1 MB") {
-    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
-    value_in.accessPolicyWindow.num_bytes = 1024 * 1024;
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
-          " and pass value base_ptr as nullptr") {
-    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
-    value_in.accessPolicyWindow.base_ptr = nullptr;
-    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
-                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
-    REQUIRE(hipSuccess == ret);
+
+  SECTION("accessPolicyWindow.num_bytes > accessPolicyMaxWindowSize") {
+    node_attribute.accessPolicyWindow.num_bytes = max_window_size + 1;
+    HIP_CHECK_ERROR(hipGraphKernelNodeSetAttribute(
+                        graph_node, hipKernelNodeAttributeAccessPolicyWindow, &node_attribute),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("accessPolicyWindow.hitRatio < 0") {
+    node_attribute.accessPolicyWindow.hitRatio = -0.6;
+    HIP_CHECK_ERROR(hipGraphKernelNodeSetAttribute(
+                        graph_node, hipKernelNodeAttributeAccessPolicyWindow, &node_attribute),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("accessPolicyWindow.hitRatio > 1.0") {
+    node_attribute.accessPolicyWindow.hitRatio = 1.1;
+    HIP_CHECK_ERROR(hipGraphKernelNodeSetAttribute(
+                        graph_node, hipKernelNodeAttributeAccessPolicyWindow, &node_attribute),
+                    hipErrorInvalidValue);
+  }
+
+  SECTION("accessPolicyWindow.missProp == hipAccessPropertyPersisting") {
+    node_attribute.accessPolicyWindow.missProp = hipAccessPropertyPersisting;
+    HIP_CHECK_ERROR(hipGraphKernelNodeSetAttribute(
+                        graph_node, hipKernelNodeAttributeAccessPolicyWindow, &node_attribute),
+                    hipErrorInvalidValue);
  }

-  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(stream));
-}
+
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(B_d));
+  HIP_CHECK(hipFree(C_d));
+}
@@ -0,0 +1,370 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_kernels.hh>
+
+/**
+* @addtogroup hipGraphKernelNodeSetAttribute hipGraphKernelNodeSetAttribute
+* @{
+* @ingroup GraphTest
+* `hipGraphKernelNodeSetAttribute(hipGraphNode_t hNode,
+*          hipKernelNodeAttrID attr, const hipKernelNodeAttrValue* value )` -
+* Sets node attribute.
+*/
+
+/**
+* Test Description
+* ------------------------
+*  - Functional Test for API - hipGraphKernelNodeSetAttribute
+*    1) Check hipGraphKernelNodeSetAttribute for AccessPolicyWindow attributes
+*    2) Check hipGraphKernelNodeSetAttribute for cooperative attributes
+*    3) Check hipGraphKernelNodeSetAttribute for window cooperative attributes
+* Test source
+* ------------------------
+*  - unit/graph/hipGraphKernelNodeGetAttribute.cc
+* Test requirements
+* ------------------------
+*  - HIP_VERSION >= 5.6
+*/
+
+static bool validateKernelNodeAttrValue(hipKernelNodeAttrValue in,
+                                        hipKernelNodeAttrValue out) {
+  if ((in.accessPolicyWindow.base_ptr  != out.accessPolicyWindow.base_ptr)  ||
+      (in.accessPolicyWindow.hitProp   != out.accessPolicyWindow.hitProp)   ||
+      (in.accessPolicyWindow.hitRatio  != out.accessPolicyWindow.hitRatio)  ||
+      (in.accessPolicyWindow.missProp  != out.accessPolicyWindow.missProp)  ||
+      (in.accessPolicyWindow.num_bytes != out.accessPolicyWindow.num_bytes) ||
+      (in.cooperative != out.cooperative)) {
+    return false;
+  }
+  return true;
+}
+
+TEST_CASE("Unit_hipGraphKernelNodeSetAttribute_Functional") {
+  constexpr size_t N = 1024;
+  constexpr size_t Nbytes = N * sizeof(int);
+  constexpr auto blocksPerCU = 6;  // to hide latency
+  constexpr auto threadsPerBlock = 256;
+  hipGraph_t graph;
+  hipGraphExec_t graphExec;
+  hipGraphNode_t memcpy_A, memcpy_B, memcpy_C, kernel_vecAdd;
+  hipKernelNodeParams kNodeParams{};
+  hipStream_t stream;
+  int *A_d, *B_d, *C_d;
+  int *A_h, *B_h, *C_h;
+  size_t NElem{N};
+
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  HIP_CHECK(hipStreamCreate(&stream));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h,
+                                    Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h,
+                                    Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d,
+                                    Nbytes, hipMemcpyDeviceToHost));
+
+  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
+  kNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
+  kNodeParams.gridDim = dim3(blocks);
+  kNodeParams.blockDim = dim3(threadsPerBlock);
+  kNodeParams.sharedMemBytes = 0;
+  kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
+  kNodeParams.extra = nullptr;
+  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph, nullptr, 0,
+                                  &kNodeParams));
+
+  // Create dependencies
+  HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_A, &kernel_vecAdd, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_B, &kernel_vecAdd, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph, &kernel_vecAdd, &memcpy_C, 1));
+
+  hipKernelNodeAttrValue value_in, value_out;
+
+  SECTION("Check hipGraphKernelNodeSetAttribute for AccessPolicyWindow") {
+    memset(&value_in, 0, sizeof(hipKernelNodeAttrValue));
+    memset(&value_out, 0, sizeof(hipKernelNodeAttrValue));
+
+    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
+              hipKernelNodeAttributeAccessPolicyWindow, &value_in));
+
+    value_in.accessPolicyWindow.hitRatio = 0.8;
+    value_in.accessPolicyWindow.hitProp = hipAccessPropertyPersisting;
+    value_in.accessPolicyWindow.missProp = hipAccessPropertyStreaming;
+
+    HIP_CHECK(hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+              hipKernelNodeAttributeAccessPolicyWindow, &value_in));
+
+    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
+              hipKernelNodeAttributeAccessPolicyWindow, &value_out));
+    REQUIRE(true == validateKernelNodeAttrValue(value_in, value_out));
+  }
+  SECTION("Check hipGraphKernelNodeSetAttribute for cooperative") {
+    memset(&value_in, 0, sizeof(hipKernelNodeAttrValue));
+    memset(&value_out, 0, sizeof(hipKernelNodeAttrValue));
+
+    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
+              hipKernelNodeAttributeAccessPolicyWindow, &value_in));
+
+    value_in.cooperative = 2;
+
+    HIP_CHECK(hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+              hipKernelNodeAttributeAccessPolicyWindow, &value_in));
+
+    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
+              hipKernelNodeAttributeAccessPolicyWindow, &value_out));
+    REQUIRE(true == validateKernelNodeAttrValue(value_in, value_out));
+  }
+
+  SECTION("Check hipGraphKernelNodeSetAttribute for window and cooperative") {
+    memset(&value_in, 0, sizeof(hipKernelNodeAttrValue));
+    memset(&value_out, 0, sizeof(hipKernelNodeAttrValue));
+
+    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
+              hipKernelNodeAttributeAccessPolicyWindow, &value_in));
+
+    value_in.cooperative = 8;
+    value_in.accessPolicyWindow.hitRatio = 0.1;
+    value_in.accessPolicyWindow.hitProp = hipAccessPropertyPersisting;
+    value_in.accessPolicyWindow.missProp = hipAccessPropertyNormal;
+
+    HIP_CHECK(hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+              hipKernelNodeAttributeAccessPolicyWindow, &value_in));
+
+    HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
+              hipKernelNodeAttributeAccessPolicyWindow, &value_out));
+    REQUIRE(true == validateKernelNodeAttrValue(value_in, value_out));
+  }
+
+  // Instantiate and launch the graph
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
+  HIP_CHECK(hipGraphLaunch(graphExec, stream));
+  HIP_CHECK(hipStreamSynchronize(stream));
+
+  // Verify graph execution result
+  HipTest::checkVectorADD<int>(A_h, B_h, C_h, N);
+
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipStreamDestroy(stream));
+}
+
+/**
+* Test Description
+* ------------------------
+*  - Negative/argument Test for API - hipGraphKernelNodeSetAttribute
+*    1) Pass kernel node as nullptr for Set attribute api and verify
+*    2) Pass KernelNodeAttrID as invalid value for Set attribute api and verify
+*    3) Pass KernelNodeAttrID as INT_MAX value for Get attribute api and verify
+*    4) Pass KernelNodeAttrValue as nullptr for Set attribute api and verify
+*    5) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
+*            and pass value missProp as hipAccessPropertyPersisting
+*    6) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
+*            and pass value hitProp as hipAccessPropertyPersisting
+*    7) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
+*            and pass value accessPolicyWindow.hitRatio as 1.4
+*    8) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
+*            and pass value accessPolicyWindow.hitRatio as 0
+*    9) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
+*            and pass value accessPolicyWindow.hitRatio as 1
+*    10) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
+*            and pass value accessPolicyWindow.hitRatio as -1.8
+*    11) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
+*            and pass value accessPolicyWindow.hitRatio as -0.6
+*    12) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
+*            and pass accessPolicyWindow.num_bytes as 1024 & hitRatio as 0.6
+*    13) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
+*            and pass accessPolicyWindow.num_bytes as 1 GB & hitRatio as -0.6
+*    14) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
+*            and pass value accessPolicyWindow.num_bytes as 1 MB
+*    15) Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow
+*            and pass value base_ptr as nullptr
+* Test source
+* ------------------------
+*  - unit/graph/hipGraphKernelNodeSetAttribute.cc
+* Test requirements
+* ------------------------
+*  - HIP_VERSION >= 5.6
+*/
+
+TEST_CASE("Unit_hipGraphKernelNodeSetAttribute_Negative") {
+  constexpr size_t N = 1024;
+  constexpr size_t Nbytes = N * sizeof(int);
+  constexpr auto blocksPerCU = 6;  // to hide latency
+  constexpr auto threadsPerBlock = 256;
+  hipGraph_t graph;
+  hipGraphNode_t memcpy_A, memcpy_B, memcpy_C, kernel_vecAdd;
+  hipKernelNodeParams kNodeParams{};
+  hipStream_t stream;
+  int *A_d, *B_d, *C_d;
+  int *A_h, *B_h, *C_h;
+  size_t NElem{N};
+  hipError_t ret;
+
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  HIP_CHECK(hipStreamCreate(&stream));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h,
+                                    Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h,
+                                    Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d,
+                                    Nbytes, hipMemcpyDeviceToHost));
+
+  void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
+  kNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
+  kNodeParams.gridDim = dim3(blocks);
+  kNodeParams.blockDim = dim3(threadsPerBlock);
+  kNodeParams.sharedMemBytes = 0;
+  kNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
+  kNodeParams.extra = nullptr;
+  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph, nullptr, 0,
+                                  &kNodeParams));
+
+  hipKernelNodeAttrValue value_in, value_out;
+  memset(&value_in, 0, sizeof(hipKernelNodeAttrValue));
+  memset(&value_out, 0, sizeof(hipKernelNodeAttrValue));
+  HIP_CHECK(hipGraphKernelNodeGetAttribute(kernel_vecAdd,
+            hipKernelNodeAttributeAccessPolicyWindow, &value_in));
+  memcpy(&value_out, &value_in, sizeof(hipKernelNodeAttrValue));
+
+  SECTION("Pass kernel node as nullptr for Set attribute api") {
+    ret = hipGraphKernelNodeSetAttribute(nullptr,
+                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass KernelNodeAttrID as invalid value for Set attribute api") {
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                         hipKernelNodeAttrID(-1), &value_in);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass KernelNodeAttrID as INT_MAX value for Set attribute api") {
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                         hipKernelNodeAttrID(INT_MAX), &value_in);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+#if HT_AMD  // getting SIGSEGV error in Cuda Setup
+  SECTION("Pass KernelNodeAttrValue as nullptr for Set attribute api") {
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                   hipKernelNodeAttributeAccessPolicyWindow, nullptr);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+#endif
+  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
+          " and pass value missProp as hipAccessPropertyPersisting") {
+    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
+    value_in.accessPolicyWindow.missProp = hipAccessPropertyPersisting;
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
+          " and pass value hitProp as hipAccessPropertyPersisting") {
+    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
+    value_in.accessPolicyWindow.hitProp = hipAccessPropertyPersisting;
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
+    REQUIRE(hipSuccess == ret);
+  }
+  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
+          " and pass value accessPolicyWindow.hitRatio as 1.4") {
+    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
+    value_in.accessPolicyWindow.hitRatio = 1.4;
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
+          " and pass value accessPolicyWindow.hitRatio as 0") {
+    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
+    value_in.accessPolicyWindow.hitRatio = 0;
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
+    REQUIRE(hipSuccess == ret);
+  }
+  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
+          " and pass value accessPolicyWindow.hitRatio as 1") {
+    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
+    value_in.accessPolicyWindow.hitRatio = 1;
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
+    REQUIRE(hipSuccess == ret);
+  }
+  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
+          " and pass value accessPolicyWindow.hitRatio as -1.8") {
+    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
+    value_in.accessPolicyWindow.hitRatio = -1.8;
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
+          " and pass value accessPolicyWindow.hitRatio as -0.6") {
+    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
+    value_in.accessPolicyWindow.hitRatio = -0.6;
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
+          " & pass accessPolicyWindow.num_bytes as 1024 & hitRatio as 0.6") {
+    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
+    value_in.accessPolicyWindow.num_bytes = 1024;
+    value_in.accessPolicyWindow.hitRatio = 0.6;
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
+          " & pass accessPolicyWindow.num_bytes as 1 GB & hitRatio as -0.6") {
+    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
+    value_in.accessPolicyWindow.num_bytes = 1024 * 1024 * 1024;
+    value_in.accessPolicyWindow.hitRatio = -0.6;
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
+          " and pass value accessPolicyWindow.num_bytes as 1 MB") {
+    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
+    value_in.accessPolicyWindow.num_bytes = 1024 * 1024;
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass KernelNodeAttrID as hipKernelNodeAttributeAccessPolicyWindow"
+          " and pass value base_ptr as nullptr") {
+    memcpy(&value_in, &value_out, sizeof(hipKernelNodeAttrValue));
+    value_in.accessPolicyWindow.base_ptr = nullptr;
+    ret = hipGraphKernelNodeSetAttribute(kernel_vecAdd,
+                   hipKernelNodeAttributeAccessPolicyWindow, &value_in);
+    REQUIRE(hipSuccess == ret);
+  }
+
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipStreamDestroy(stream));
+}
@@ -1,13 +1,16 @@
 /*
 Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
+
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -17,220 +20,69 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/**
-Testcase Scenarios :
-Negative -
-1) Pass node as nullptr and verify api returns error code.
-2) Pass un-initialize node and verify api returns error code.
-3) Pass pNodeParams as nullptr and verify api returns error code.
-Functional -
-1) Create a graph, add Memcpy node to graph with desired node params.
-   Verify api fetches the node params mentioned while adding Memcpy node.
-2) Set Memcpy node params with hipGraphMemcpyNodeSetParams,
-   now get the params and verify both are same.
-*/
-
+#include <hip_test_defgroups.hh>
 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
+#include <resource_guards.hh>

-#define SIZE 10
-#define UPDATESIZE 8
-
-/* Test verifies hipGraphMemcpyNodeGetParams API Negative scenarios.
- */
-TEST_CASE("Unit_hipGraphMemcpyNodeGetParams_Negative") {
-  CHECK_IMAGE_SUPPORT
-
-  constexpr int width{SIZE}, height{SIZE}, depth{SIZE};
-  hipArray_t devArray;
-  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
-  hipMemcpy3DParms myparms;
-  int* hData;
-  uint32_t size = width * height * depth * sizeof(int);
-  hData = reinterpret_cast<int*>(malloc(size));
-  REQUIRE(hData != nullptr);
-  memset(hData, 0, size);
-  for (int i = 0; i < depth; i++) {
-    for (int j = 0; j < height; j++) {
-      for (int k = 0; k < width; k++) {
-        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
-      }
-    }
-  }
-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
-                                              0, 0, 0, formatKind);
-  HIP_CHECK(hipMalloc3DArray(&devArray, &channelDesc, make_hipExtent(width,
-                             height, depth), hipArrayDefault));
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-  myparms.srcPos = make_hipPos(0, 0, 0);
-  myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.extent = make_hipExtent(width , height, depth);
-  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
-                                      width, height);
-  myparms.dstArray = devArray;
-  myparms.kind = hipMemcpyHostToDevice;
-
-  hipGraph_t graph;
-  hipError_t ret;
-  hipGraphNode_t memcpyNode;
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &myparms));
-
-  SECTION("Pass node as nullptr") {
-    ret = hipGraphMemcpyNodeGetParams(nullptr, &myparms);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass un-initilize node") {
-    hipGraphNode_t memcpyNode_uninit{};
-    ret = hipGraphMemcpyNodeGetParams(memcpyNode_uninit, &myparms);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass GetNodeParams as nullptr") {
-    ret = hipGraphMemcpyNodeGetParams(memcpyNode, nullptr);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  HIP_CHECK(hipFreeArray(devArray));
-  free(hData);
-  HIP_CHECK(hipGraphDestroy(graph));
-}
-
-/* Test verifies hipGraphMemcpyNodeGetParams API Functional scenarios.
+/**
+ * @addtogroup hipGraphMemcpyNodeGetParams hipGraphMemcpyNodeGetParams
+ * @{
+ * @ingroup GraphTest
+ * `hipGraphMemcpyNodeGetParams(hipGraphNode_t node, hipMemcpy3DParms *pNodeParams)` -
+ * 	Gets a memcpy node's parameters
+ * ________________________
+ * Test cases from other APIs:
+ *  - @ref Unit_hipGraphMemcpyNodeSetParams_Positive_Basic
 */

-static bool compareHipPos(hipPos hPos1, hipPos hPos2) {
-  if ((hPos1.x == hPos2.x) && (hPos1.y == hPos2.y) && (hPos1.z == hPos2.z))
-    return true;
-  else
-    return false;
-}
-static bool compareHipExtent(hipExtent hExt1, hipExtent hExt2) {
-  if ((hExt1.width == hExt2.width) && (hExt1.height == hExt2.height) &&
-      (hExt1.depth == hExt2.depth))
-    return true;
-  else
-    return false;
-}
-static bool compareHipPitchedPtr(hipPitchedPtr hpPtr1, hipPitchedPtr hpPtr2) {
-  if ((reinterpret_cast<int *>(hpPtr1.ptr) ==
-       reinterpret_cast<int *>(hpPtr2.ptr))
-       && (hpPtr1.pitch == hpPtr2.pitch)
-       #if HT_AMD
-       && (hpPtr1.xsize == hpPtr2.xsize)
-       /* xsize check below is disabled on nvidia as xsize value
-        * is not being updated properly due to issue with CUDA api */
-       #endif
-       && (hpPtr1.ysize == hpPtr2.ysize))
-    return true;
-  else
-    return false;
-}
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify API behaviour with invalid arguments:
+ *      -# node is nullptr
+ *      -# pNodeParams is nullptr
+ *      -# node is destroyed
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphMemcpyNodeGetParams.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphMemcpyNodeGetParams_Negative_Parameters") {
+  constexpr hipExtent extent{128 * sizeof(int), 128, 8};

-static bool memcpyNodeCompare(hipMemcpy3DParms *mNode1,
-                              hipMemcpy3DParms *mNode2) {
-  if (mNode1->srcArray != mNode2->srcArray)
-    return false;
-  if (!compareHipPos(mNode1->srcPos, mNode2->srcPos))
-    return false;
-  if (!compareHipPitchedPtr(mNode1->srcPtr, mNode2->srcPtr))
-    return false;
-  if (mNode1->dstArray != mNode2->dstArray)
-    return false;
-  if (!compareHipPos(mNode1->dstPos, mNode2->dstPos))
-    return false;
-  if (!compareHipPitchedPtr(mNode1->dstPtr, mNode2->dstPtr))
-    return false;
-  if (!compareHipExtent(mNode1->extent, mNode2->extent))
-    return false;
-  if (mNode1->kind != mNode2->kind)
-    return false;
-  return true;
-}
+  LinearAllocGuard3D<int> src_alloc(extent);
+  LinearAllocGuard3D<int> dst_alloc(extent);

-TEST_CASE("Unit_hipGraphMemcpyNodeGetParams_Functional") {
-  CHECK_IMAGE_SUPPORT
+  hipMemcpy3DParms params = {};
+  params.srcPtr = src_alloc.pitched_ptr();
+  params.srcPos = make_hipPos(0, 0, 0);
+  params.dstPtr = dst_alloc.pitched_ptr();
+  params.dstPos = make_hipPos(0, 0, 0);
+  params.extent = extent;
+  params.kind = hipMemcpyDeviceToDevice;

-  constexpr int width{SIZE}, height{SIZE}, depth{SIZE};
-  hipArray_t devArray;
-  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
-  hipMemcpy3DParms myparms;
-  int* hData;
-  uint32_t size = width * height * depth * sizeof(int);
-  hData = reinterpret_cast<int*>(malloc(size));
-  REQUIRE(hData != nullptr);
-  memset(hData, 0, size);
-  for (int i = 0; i < depth; i++) {
-    for (int j = 0; j < height; j++) {
-      for (int k = 0; k < width; k++) {
-        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
-      }
-    }
-  }
-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
-                                              0, 0, 0, formatKind);
-  HIP_CHECK(hipMalloc3DArray(&devArray, &channelDesc, make_hipExtent(width,
-                             height, depth), hipArrayDefault));
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-  myparms.srcPos = make_hipPos(0, 0, 0);
-  myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.extent = make_hipExtent(width , height, depth);
-  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
-                                      width, height);
-  myparms.dstArray = devArray;
-  myparms.kind = hipMemcpyHostToDevice;
+  hipGraph_t graph = nullptr;
+  hipGraphNode_t node = nullptr;

-  hipGraph_t graph;
-  hipGraphNode_t memcpyNode;
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &myparms));
-
-  SECTION("Get Memcpy Param and verify.") {
-    hipMemcpy3DParms m3DGetParams;
-    REQUIRE(hipSuccess == hipGraphMemcpyNodeGetParams(memcpyNode,
-                                                      &m3DGetParams));
-    // Validating the result
-    REQUIRE(true == memcpyNodeCompare(&myparms, &m3DGetParams));
+  SECTION("node == nullptr") {
+    HIP_CHECK_ERROR(hipGraphMemcpyNodeGetParams(nullptr, &params), hipErrorInvalidValue);
  }

-  SECTION("Set memcpy params and Get param and verify.") {
-    hipMemcpy3DParms myparms1, m3DGetParams1;
-    constexpr int width1{UPDATESIZE}, height1{UPDATESIZE}, depth1{UPDATESIZE};
-    hipArray_t devArray1;
-    hipChannelFormatKind formatKind1 = hipChannelFormatKindSigned;
-    int* hData1;
-    uint32_t size1 = width1 * height1 * depth1 * sizeof(int);
-    hData1 = reinterpret_cast<int*>(malloc(size1));
-    REQUIRE(hData1 != nullptr);
-    memset(hData1, 0, size1);
-    for (int i = 0; i < depth1; i++) {
-      for (int j = 0; j < height1; j++) {
-        for (int k = 0; k < width1; k++) {
-          hData1[i*width1*height1 + j*width1 + k] = i*width1*height1 +
-                                                    j*width1 + k;
-        }
-      }
-    }
-    hipChannelFormatDesc channelDesc1 = hipCreateChannelDesc(sizeof(int)*8,
-                                              0, 0, 0, formatKind1);
-    HIP_CHECK(hipMalloc3DArray(&devArray1, &channelDesc1,
-              make_hipExtent(width1, height1, depth1), hipArrayDefault));
-    memset(&myparms1, 0x0, sizeof(hipMemcpy3DParms));
-    myparms1.srcPos = make_hipPos(0, 0, 0);
-    myparms1.dstPos = make_hipPos(0, 0, 0);
-    myparms1.extent = make_hipExtent(width1 , height1, depth1);
-    myparms1.srcPtr = make_hipPitchedPtr(hData1, width1 * sizeof(int),
-                                         width1, height1);
-    myparms1.dstArray = devArray1;
-    myparms1.kind = hipMemcpyHostToDevice;
-
-    REQUIRE(hipSuccess == hipGraphMemcpyNodeSetParams(memcpyNode, &myparms1));
-    REQUIRE(hipSuccess == hipGraphMemcpyNodeGetParams(memcpyNode,
-                                                      &m3DGetParams1));
-    REQUIRE(true == memcpyNodeCompare(&myparms1, &m3DGetParams1));
-
-    HIP_CHECK(hipFreeArray(devArray1));
-    free(hData1);
+  SECTION("pNodeParams == nullptr") {
+    HIP_CHECK(hipGraphCreate(&graph, 0));
+    HIP_CHECK(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params));
+    HIP_CHECK_ERROR(hipGraphMemcpyNodeGetParams(node, nullptr), hipErrorInvalidValue);
+    HIP_CHECK(hipGraphDestroy(graph));
  }
-  HIP_CHECK(hipFreeArray(devArray));
-  free(hData);
-  HIP_CHECK(hipGraphDestroy(graph));
-}
+
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-208
+  SECTION("Node is destroyed") {
+    HIP_CHECK(hipGraphCreate(&graph, 0));
+    HIP_CHECK(hipGraphAddMemcpyNode(&node, graph, nullptr, 0, &params));
+    HIP_CHECK(hipGraphDestroy(graph));
+    HIP_CHECK_ERROR(hipGraphMemcpyNodeGetParams(node, &params), hipErrorInvalidValue);
+  }
+#endif
+}
@@ -0,0 +1,236 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+Testcase Scenarios :
+Negative -
+1) Pass node as nullptr and verify api returns error code.
+2) Pass un-initialize node and verify api returns error code.
+3) Pass pNodeParams as nullptr and verify api returns error code.
+Functional -
+1) Create a graph, add Memcpy node to graph with desired node params.
+   Verify api fetches the node params mentioned while adding Memcpy node.
+2) Set Memcpy node params with hipGraphMemcpyNodeSetParams,
+   now get the params and verify both are same.
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+
+#define SIZE 10
+#define UPDATESIZE 8
+
+/* Test verifies hipGraphMemcpyNodeGetParams API Negative scenarios.
+ */
+TEST_CASE("Unit_hipGraphMemcpyNodeGetParams_Negative") {
+  CHECK_IMAGE_SUPPORT
+
+  constexpr int width{SIZE}, height{SIZE}, depth{SIZE};
+  hipArray_t devArray;
+  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
+  hipMemcpy3DParms myparms;
+  int* hData;
+  uint32_t size = width * height * depth * sizeof(int);
+  hData = reinterpret_cast<int*>(malloc(size));
+  REQUIRE(hData != nullptr);
+  memset(hData, 0, size);
+  for (int i = 0; i < depth; i++) {
+    for (int j = 0; j < height; j++) {
+      for (int k = 0; k < width; k++) {
+        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
+      }
+    }
+  }
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
+                                              0, 0, 0, formatKind);
+  HIP_CHECK(hipMalloc3DArray(&devArray, &channelDesc, make_hipExtent(width,
+                             height, depth), hipArrayDefault));
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+  myparms.srcPos = make_hipPos(0, 0, 0);
+  myparms.dstPos = make_hipPos(0, 0, 0);
+  myparms.extent = make_hipExtent(width , height, depth);
+  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
+                                      width, height);
+  myparms.dstArray = devArray;
+  myparms.kind = hipMemcpyHostToDevice;
+
+  hipGraph_t graph;
+  hipError_t ret;
+  hipGraphNode_t memcpyNode;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &myparms));
+
+  SECTION("Pass node as nullptr") {
+    ret = hipGraphMemcpyNodeGetParams(nullptr, &myparms);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass un-initilize node") {
+    hipGraphNode_t memcpyNode_uninit{};
+    ret = hipGraphMemcpyNodeGetParams(memcpyNode_uninit, &myparms);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass GetNodeParams as nullptr") {
+    ret = hipGraphMemcpyNodeGetParams(memcpyNode, nullptr);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  HIP_CHECK(hipFreeArray(devArray));
+  free(hData);
+  HIP_CHECK(hipGraphDestroy(graph));
+}
+
+/* Test verifies hipGraphMemcpyNodeGetParams API Functional scenarios.
+ */
+
+static bool compareHipPos(hipPos hPos1, hipPos hPos2) {
+  if ((hPos1.x == hPos2.x) && (hPos1.y == hPos2.y) && (hPos1.z == hPos2.z))
+    return true;
+  else
+    return false;
+}
+static bool compareHipExtent(hipExtent hExt1, hipExtent hExt2) {
+  if ((hExt1.width == hExt2.width) && (hExt1.height == hExt2.height) &&
+      (hExt1.depth == hExt2.depth))
+    return true;
+  else
+    return false;
+}
+static bool compareHipPitchedPtr(hipPitchedPtr hpPtr1, hipPitchedPtr hpPtr2) {
+  if ((reinterpret_cast<int *>(hpPtr1.ptr) ==
+       reinterpret_cast<int *>(hpPtr2.ptr))
+       && (hpPtr1.pitch == hpPtr2.pitch)
+       #if HT_AMD
+       && (hpPtr1.xsize == hpPtr2.xsize)
+       /* xsize check below is disabled on nvidia as xsize value
+        * is not being updated properly due to issue with CUDA api */
+       #endif
+       && (hpPtr1.ysize == hpPtr2.ysize))
+    return true;
+  else
+    return false;
+}
+
+static bool memcpyNodeCompare(hipMemcpy3DParms *mNode1,
+                              hipMemcpy3DParms *mNode2) {
+  if (mNode1->srcArray != mNode2->srcArray)
+    return false;
+  if (!compareHipPos(mNode1->srcPos, mNode2->srcPos))
+    return false;
+  if (!compareHipPitchedPtr(mNode1->srcPtr, mNode2->srcPtr))
+    return false;
+  if (mNode1->dstArray != mNode2->dstArray)
+    return false;
+  if (!compareHipPos(mNode1->dstPos, mNode2->dstPos))
+    return false;
+  if (!compareHipPitchedPtr(mNode1->dstPtr, mNode2->dstPtr))
+    return false;
+  if (!compareHipExtent(mNode1->extent, mNode2->extent))
+    return false;
+  if (mNode1->kind != mNode2->kind)
+    return false;
+  return true;
+}
+
+TEST_CASE("Unit_hipGraphMemcpyNodeGetParams_Functional") {
+  CHECK_IMAGE_SUPPORT
+
+  constexpr int width{SIZE}, height{SIZE}, depth{SIZE};
+  hipArray_t devArray;
+  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
+  hipMemcpy3DParms myparms;
+  int* hData;
+  uint32_t size = width * height * depth * sizeof(int);
+  hData = reinterpret_cast<int*>(malloc(size));
+  REQUIRE(hData != nullptr);
+  memset(hData, 0, size);
+  for (int i = 0; i < depth; i++) {
+    for (int j = 0; j < height; j++) {
+      for (int k = 0; k < width; k++) {
+        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
+      }
+    }
+  }
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
+                                              0, 0, 0, formatKind);
+  HIP_CHECK(hipMalloc3DArray(&devArray, &channelDesc, make_hipExtent(width,
+                             height, depth), hipArrayDefault));
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+  myparms.srcPos = make_hipPos(0, 0, 0);
+  myparms.dstPos = make_hipPos(0, 0, 0);
+  myparms.extent = make_hipExtent(width , height, depth);
+  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
+                                      width, height);
+  myparms.dstArray = devArray;
+  myparms.kind = hipMemcpyHostToDevice;
+
+  hipGraph_t graph;
+  hipGraphNode_t memcpyNode;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &myparms));
+
+  SECTION("Get Memcpy Param and verify.") {
+    hipMemcpy3DParms m3DGetParams;
+    REQUIRE(hipSuccess == hipGraphMemcpyNodeGetParams(memcpyNode,
+                                                      &m3DGetParams));
+    // Validating the result
+    REQUIRE(true == memcpyNodeCompare(&myparms, &m3DGetParams));
+  }
+
+  SECTION("Set memcpy params and Get param and verify.") {
+    hipMemcpy3DParms myparms1, m3DGetParams1;
+    constexpr int width1{UPDATESIZE}, height1{UPDATESIZE}, depth1{UPDATESIZE};
+    hipArray_t devArray1;
+    hipChannelFormatKind formatKind1 = hipChannelFormatKindSigned;
+    int* hData1;
+    uint32_t size1 = width1 * height1 * depth1 * sizeof(int);
+    hData1 = reinterpret_cast<int*>(malloc(size1));
+    REQUIRE(hData1 != nullptr);
+    memset(hData1, 0, size1);
+    for (int i = 0; i < depth1; i++) {
+      for (int j = 0; j < height1; j++) {
+        for (int k = 0; k < width1; k++) {
+          hData1[i*width1*height1 + j*width1 + k] = i*width1*height1 +
+                                                    j*width1 + k;
+        }
+      }
+    }
+    hipChannelFormatDesc channelDesc1 = hipCreateChannelDesc(sizeof(int)*8,
+                                              0, 0, 0, formatKind1);
+    HIP_CHECK(hipMalloc3DArray(&devArray1, &channelDesc1,
+              make_hipExtent(width1, height1, depth1), hipArrayDefault));
+    memset(&myparms1, 0x0, sizeof(hipMemcpy3DParms));
+    myparms1.srcPos = make_hipPos(0, 0, 0);
+    myparms1.dstPos = make_hipPos(0, 0, 0);
+    myparms1.extent = make_hipExtent(width1 , height1, depth1);
+    myparms1.srcPtr = make_hipPitchedPtr(hData1, width1 * sizeof(int),
+                                         width1, height1);
+    myparms1.dstArray = devArray1;
+    myparms1.kind = hipMemcpyHostToDevice;
+
+    REQUIRE(hipSuccess == hipGraphMemcpyNodeSetParams(memcpyNode, &myparms1));
+    REQUIRE(hipSuccess == hipGraphMemcpyNodeGetParams(memcpyNode,
+                                                      &m3DGetParams1));
+    REQUIRE(true == memcpyNodeCompare(&myparms1, &m3DGetParams1));
+
+    HIP_CHECK(hipFreeArray(devArray1));
+    free(hData1);
+  }
+  HIP_CHECK(hipFreeArray(devArray));
+  free(hData);
+  HIP_CHECK(hipGraphDestroy(graph));
+}
@@ -1,13 +1,16 @@
 /*
 Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
+
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -17,203 +20,264 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/**
-Testcase Scenarios :
-Negative -
-1) Pass node as nullptr and verify api returns error code.
-2) Pass un-initialize node and verify api returns error code.
-3) Pass pNodeParams as nullptr and verify api returns error code.
-Functional -
-1) Add Memcpy node to graph, update the Memcpy node params with set and
-   launch the graph and check updated params are taking effect.
-2) Add Memcpy node to graph, launch graph, then update the Memcpy node params
-   with set and launch the graph and check updated params are taking effect.
-*/
+#include <functional>

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
+#include <hip_test_defgroups.hh>
+#include <memcpy3d_tests_common.hh>

-#define SIZE 10
+#include "graph_tests_common.hh"

-/* Test verifies hipGraphMemcpyNodeSetParams API Negative scenarios.
+/**
+ * @addtogroup hipGraphMemcpyNodeSetParams hipGraphMemcpyNodeSetParams
+ * @{
+ * @ingroup GraphTest
+ * `hipGraphMemcpyNodeSetParams (hipGraphNode_t node, const hipMemcpy3DParms *pNodeParams)` - Sets a
+ * memcpy node's parameters
 */
-TEST_CASE("Unit_hipGraphMemcpyNodeSetParams_Negative") {
-  CHECK_IMAGE_SUPPORT

-  constexpr int width{SIZE}, height{SIZE}, depth{SIZE};
-  hipArray_t devArray;
-  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
-  hipMemcpy3DParms myparms;
-  int* hData;
-  uint32_t size = width * height * depth * sizeof(int);
-  hData = reinterpret_cast<int*>(malloc(size));
-  REQUIRE(hData != nullptr);
-  memset(hData, 0, size);
-  for (int i = 0; i < depth; i++) {
-    for (int j = 0; j < height; j++) {
-      for (int k = 0; k < width; k++) {
-        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
-      }
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify that node parameters get updated correctly by creating a node with valid but
+ * incorrect parameters, and the setting them to the correct values after which the graph is
+ * executed and the results of the memcpy verified.
+ * The test is run for all possible memcpy directions, with both the corresponding memcpy
+ * kind and hipMemcpyDefault, as well as half page and full page allocation sizes.
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphMemcpyNodeSetParams.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphMemcpyNodeSetParams_Positive_Basic") {
+  constexpr bool async = false;
+
+  SECTION("Device to host") {
+    Memcpy3DDeviceToHostShell<async>(Memcpy3DWrapper<async, true, true>);
+  }
+
+  SECTION("Device to host with default kind") {
+    Memcpy3DDeviceToHostShell<async>(Memcpy3DWrapper<async, true, true>);
+  }
+
+  SECTION("Host to device") {
+    Memcpy3DHostToDeviceShell<async>(Memcpy3DWrapper<async, true, true>);
+  }
+
+  SECTION("Host to device with default kind") {
+    Memcpy3DHostToDeviceShell<async>(Memcpy3DWrapper<async, true, true>);
+  }
+
+  SECTION("Host to host") { Memcpy3DHostToHostShell<async>(Memcpy3DWrapper<async, true, true>); }
+
+  SECTION("Host to host with default kind") {
+    Memcpy3DHostToHostShell<async>(Memcpy3DWrapper<async, true, true>);
+  }
+
+  SECTION("Device to device") {
+    SECTION("Peer access enabled") {
+      Memcpy3DDeviceToDeviceShell<async, true>(Memcpy3DWrapper<async, true, true>);
+    }
+    SECTION("Peer access disabled") {
+      Memcpy3DDeviceToDeviceShell<async, false>(Memcpy3DWrapper<async, true, true>);
    }
  }
-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
-                                              0, 0, 0, formatKind);
-  HIP_CHECK(hipMalloc3DArray(&devArray, &channelDesc, make_hipExtent(width,
-                             height, depth), hipArrayDefault));
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
-  myparms.srcPos = make_hipPos(0, 0, 0);
-  myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.extent = make_hipExtent(width , height, depth);
-  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
-                                      width, height);
-  myparms.dstArray = devArray;
-  myparms.kind = hipMemcpyHostToDevice;

-  hipGraph_t graph;
-  hipError_t ret;
-  hipGraphNode_t memcpyNode;
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &myparms));
-
-  SECTION("Pass node as nullptr") {
-    ret = hipGraphMemcpyNodeSetParams(nullptr, &myparms);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass un-initialize node") {
-    hipGraphNode_t memcpyNode_uninit{};
-    ret = hipGraphMemcpyNodeSetParams(memcpyNode_uninit, &myparms);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass SetNodeParams as nullptr") {
-    ret = hipGraphMemcpyNodeSetParams(memcpyNode, nullptr);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  HIP_CHECK(hipFreeArray(devArray));
-  free(hData);
-  HIP_CHECK(hipGraphDestroy(graph));
-}
-
-/* Test verifies hipGraphMemcpyNodeSetParams API Functional scenarios.
- */
-TEST_CASE("Unit_hipGraphMemcpyNodeSetParams_Functional") {
-  CHECK_IMAGE_SUPPORT
-
-  constexpr int width{SIZE}, height{SIZE}, depth{SIZE};
-  hipArray_t devArray;
-  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
-  hipMemcpy3DParms myparms, myparms1;
-  uint32_t size = width * height * depth * sizeof(int);
-
-  int *hData = reinterpret_cast<int*>(malloc(size));
-  REQUIRE(hData != nullptr);
-  memset(hData, 0, size);
-  int *hDataTemp = reinterpret_cast<int*>(malloc(size));
-  REQUIRE(hDataTemp != nullptr);
-  memset(hDataTemp, 0, size);
-  int *hOutputData = reinterpret_cast<int *>(malloc(size));
-  REQUIRE(hOutputData != nullptr);
-  memset(hOutputData, 0,  size);
-  int *hOutputData1 = reinterpret_cast<int *>(malloc(size));
-  REQUIRE(hOutputData1 != nullptr);
-  memset(hOutputData1, 0,  size);
-
-  for (int i = 0; i < depth; i++) {
-    for (int j = 0; j < height; j++) {
-      for (int k = 0; k < width; k++) {
-        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
-      }
+  SECTION("Device to device with default kind") {
+    SECTION("Peer access enabled") {
+      Memcpy3DDeviceToDeviceShell<async, true>(Memcpy3DWrapper<async, true, true>);
+    }
+    SECTION("Peer access disabled") {
+      Memcpy3DDeviceToDeviceShell<async, false>(Memcpy3DWrapper<async, true, true>);
    }
  }
-  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
-                                              0, 0, 0, formatKind);
-  HIP_CHECK(hipMalloc3DArray(&devArray, &channelDesc, make_hipExtent(width,
-                             height, depth), hipArrayDefault));
-  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));

-  // Host to Device
-  myparms.srcPos = make_hipPos(0, 0, 0);
-  myparms.dstPos = make_hipPos(0, 0, 0);
-  myparms.extent = make_hipExtent(width , height, depth);
-  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
-                                      width, height);
-  myparms.dstArray = devArray;
-  myparms.kind = hipMemcpyHostToDevice;
-
-  hipGraph_t graph;
-  hipGraphNode_t memcpyNode;
-  std::vector<hipGraphNode_t> dependencies;
-  hipStream_t streamForGraph;
-  hipGraphExec_t graphExec;
-
-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &myparms));
-  dependencies.push_back(memcpyNode);
-
-  // Device to host
-  memset(&myparms1, 0x0, sizeof(hipMemcpy3DParms));
-  myparms1.srcPos = make_hipPos(0, 0, 0);
-  myparms1.dstPos = make_hipPos(0, 0, 0);
-  myparms1.dstPtr = make_hipPitchedPtr(hDataTemp, width * sizeof(int),
-                                      width, height);
-  myparms1.srcArray = devArray;
-  myparms1.extent = make_hipExtent(width, height, depth);
-  myparms1.kind = hipMemcpyDeviceToHost;
-
-  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
-                                  dependencies.size(), &myparms1));
-
-  SECTION("Update the memcpyNode and check") {
-    // Device to host with updated host ptr hDataTemp -> hOutputData
-    memset(&myparms1, 0x0, sizeof(hipMemcpy3DParms));
-    myparms1.srcPos = make_hipPos(0, 0, 0);
-    myparms1.dstPos = make_hipPos(0, 0, 0);
-    myparms1.dstPtr = make_hipPitchedPtr(hOutputData, width * sizeof(int),
-                                        width, height);
-    myparms1.srcArray = devArray;
-    myparms1.extent = make_hipExtent(width, height, depth);
-    myparms1.kind = hipMemcpyDeviceToHost;
-
-    HIP_CHECK(hipGraphMemcpyNodeSetParams(memcpyNode, &myparms1));
-
-    // Instantiate and launch the graph
-    HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-    HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-    HIP_CHECK(hipStreamSynchronize(streamForGraph));
-
-    // Check result
-    HipTest::checkArray(hData, hOutputData, width, height, depth);
+  SECTION("Array from/to Host") {
+    Memcpy3DArrayHostShell<async>(Memcpy3DWrapper<async, true, true>);
  }

-  SECTION("Update the memcpyNode again and check") {
-    // Device to host with updated host ptr hOutputData -> hOutputData1
-    memset(&myparms1, 0x0, sizeof(hipMemcpy3DParms));
-    myparms1.srcPos = make_hipPos(0, 0, 0);
-    myparms1.dstPos = make_hipPos(0, 0, 0);
-    myparms1.dstPtr = make_hipPitchedPtr(hOutputData1, width * sizeof(int),
-                                        width, height);
-    myparms1.srcArray = devArray;
-    myparms1.extent = make_hipExtent(width, height, depth);
-    myparms1.kind = hipMemcpyDeviceToHost;
-
-    HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
-                                    dependencies.size(), &myparms1));
-    HIP_CHECK(hipGraphMemcpyNodeSetParams(memcpyNode, &myparms1));
-
-    // Instantiate and launch the graph
-    HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-    HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-    HIP_CHECK(hipStreamSynchronize(streamForGraph));
-
-    // Check result
-    HipTest::checkArray(hData, hOutputData1, width, height, depth);
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-220
+  SECTION("Array from/to Device") {
+    Memcpy3DArrayDeviceShell<async>(Memcpy3DWrapper<async, true, true>);
  }
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipGraphDestroy(graph));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
-  HIP_CHECK(hipFreeArray(devArray));
-  free(hData);
-  free(hDataTemp);
-  free(hOutputData);
-  free(hOutputData1);
+#endif
 }
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify API behaviour with invalid arguments:
+ *        -# node is nullptr
+ *        -# graph is nullptr
+ *        -# pDependencies is nullptr when numDependencies is not zero
+ *        -# A node in pDependencies originates from a different graph
+ *        -# numDependencies is invalid
+ *        -# A node is duplicated in pDependencies
+ *        -# dst is nullptr
+ *        -# src is nullptr
+ *        -# kind is an invalid enum value
+ *        -# count is zero
+ *        -# count is larger than dst allocation size
+ *        -# count is larger than src allocation size
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphAddMemcpyNode.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphMemcpyNodeSetParams_Negative_Parameters") {
+  using namespace std::placeholders;
+
+  constexpr hipExtent extent{128 * sizeof(int), 128, 8};
+
+  constexpr auto NegativeTests = [](hipPitchedPtr dst_ptr, hipPos dst_pos, hipPitchedPtr src_ptr,
+                                    hipPos src_pos, hipExtent extent, hipMemcpyKind kind) {
+    hipGraph_t graph = nullptr;
+    HIP_CHECK(hipGraphCreate(&graph, 0));
+    hipGraphNode_t node = nullptr;
+
+    SECTION("node == nullptr") {
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(nullptr, &params), hipErrorInvalidValue);
+    }
+
+    SECTION("dst_ptr.ptr == nullptr") {
+      hipPitchedPtr invalid_ptr = dst_ptr;
+      invalid_ptr.ptr = nullptr;
+      auto params = GetMemcpy3DParms(invalid_ptr, dst_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidValue);
+    }
+
+    SECTION("src_ptr.ptr == nullptr") {
+      hipPitchedPtr invalid_ptr = src_ptr;
+      invalid_ptr.ptr = nullptr;
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, invalid_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidValue);
+    }
+
+    SECTION("dst_ptr.pitch < width") {
+      hipPitchedPtr invalid_ptr = dst_ptr;
+      invalid_ptr.pitch = extent.width - 1;
+      auto params = GetMemcpy3DParms(invalid_ptr, dst_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidPitchValue);
+    }
+
+    SECTION("src_ptr.pitch < width") {
+      hipPitchedPtr invalid_ptr = src_ptr;
+      invalid_ptr.pitch = extent.width - 1;
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, invalid_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidPitchValue);
+    }
+
+    SECTION("dst_ptr.pitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      hipPitchedPtr invalid_ptr = dst_ptr;
+      invalid_ptr.pitch = attr;
+      auto params = GetMemcpy3DParms(invalid_ptr, dst_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidValue);
+    }
+
+    SECTION("src_ptr.pitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      hipPitchedPtr invalid_ptr = src_ptr;
+      invalid_ptr.pitch = attr;
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, invalid_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidValue);
+    }
+
+    SECTION("extent.width + dst_pos.x > dst_ptr.pitch") {
+      hipPos invalid_pos = dst_pos;
+      invalid_pos.x = dst_ptr.pitch - extent.width + 1;
+      auto params = GetMemcpy3DParms(dst_ptr, invalid_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidValue);
+    }
+
+    SECTION("extent.width + src_pos.x > src_ptr.pitch") {
+      hipPos invalid_pos = src_pos;
+      invalid_pos.x = src_ptr.pitch - extent.width + 1;
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, src_ptr, invalid_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidValue);
+    }
+
+    SECTION("dst_pos.y out of bounds") {
+      hipPos invalid_pos = dst_pos;
+      invalid_pos.y = 1;
+      auto params = GetMemcpy3DParms(dst_ptr, invalid_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidValue);
+    }
+
+    SECTION("src_pos.y out of bounds") {
+      hipPos invalid_pos = src_pos;
+      invalid_pos.y = 1;
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, src_ptr, invalid_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidValue);
+    }
+
+    SECTION("dst_pos.z out of bounds") {
+      hipPos invalid_pos = dst_pos;
+      invalid_pos.z = 1;
+      auto params = GetMemcpy3DParms(dst_ptr, invalid_pos, src_ptr, src_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidValue);
+    }
+
+    SECTION("src_pos.z out of bounds") {
+      hipPos invalid_pos = src_pos;
+      invalid_pos.z = 1;
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, src_ptr, invalid_pos, extent, kind);
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidValue);
+    }
+
+    SECTION("Invalid MemcpyKind") {
+      auto params = GetMemcpy3DParms(dst_ptr, dst_pos, src_ptr, src_pos, extent,
+                                     static_cast<hipMemcpyKind>(-1));
+      HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams(node, &params), hipErrorInvalidMemcpyDirection);
+    }
+
+    HIP_CHECK(hipGraphDestroy(graph));
+  };
+
+  SECTION("Host to Device") {
+    LinearAllocGuard3D<int> device_alloc(extent);
+    LinearAllocGuard<int> host_alloc(
+        LinearAllocs::hipHostMalloc,
+        device_alloc.pitch() * device_alloc.height() * device_alloc.depth());
+    NegativeTests(device_alloc.pitched_ptr(), make_hipPos(0, 0, 0),
+                  make_hipPitchedPtr(host_alloc.ptr(), device_alloc.pitch(), device_alloc.width(),
+                                     device_alloc.height()),
+                  make_hipPos(0, 0, 0), extent, hipMemcpyHostToDevice);
+  }
+
+  SECTION("Device to Host") {
+    LinearAllocGuard3D<int> device_alloc(extent);
+    LinearAllocGuard<int> host_alloc(
+        LinearAllocs::hipHostMalloc,
+        device_alloc.pitch() * device_alloc.height() * device_alloc.depth());
+    NegativeTests(make_hipPitchedPtr(host_alloc.ptr(), device_alloc.pitch(), device_alloc.width(),
+                                     device_alloc.height()),
+                  make_hipPos(0, 0, 0), device_alloc.pitched_ptr(), make_hipPos(0, 0, 0), extent,
+                  hipMemcpyDeviceToHost);
+  }
+
+  SECTION("Host to Host") {
+    LinearAllocGuard<int> src_alloc(LinearAllocs::hipHostMalloc,
+                                    extent.width * extent.height * extent.depth);
+    LinearAllocGuard<int> dst_alloc(LinearAllocs::hipHostMalloc,
+                                    extent.width * extent.height * extent.depth);
+    NegativeTests(make_hipPitchedPtr(dst_alloc.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPos(0, 0, 0),
+                  make_hipPitchedPtr(src_alloc.ptr(), extent.width, extent.width, extent.height),
+                  make_hipPos(0, 0, 0), extent, hipMemcpyHostToHost);
+  }
+
+  SECTION("Device to Device") {
+    LinearAllocGuard3D<int> src_alloc(extent);
+    LinearAllocGuard3D<int> dst_alloc(extent);
+    NegativeTests(dst_alloc.pitched_ptr(), make_hipPos(0, 0, 0), src_alloc.pitched_ptr(),
+                  make_hipPos(0, 0, 0), extent, hipMemcpyDeviceToDevice);
+  }
+}
@@ -6,8 +6,10 @@ in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
+
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -17,169 +19,180 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/**
-Testcase Scenarios :
-Functional-
-1) Create a graph, add Memcpy node to graph, update the Memcpy node params with set and make sure they are taking effect.
-Negative-
-1) Pass pGraphNode as nullptr and check if api returns error.
-2) Pass destination ptr is nullptr, api expected to return error code.
-3) Pass source ptr is nullptr, api expected to return error code.
-4) Pass count as zero, api expected to return error code.
-5) Pass same pointer as source ptr and destination ptr, api expected to return error code.
-6) Pass overlap memory as source ptr and destination ptr where source ptr is ahead of destination ptr, api expected to return error code.
-7) Pass overlap memory as source ptr and destination ptr where destination ptr is ahead of source ptr, api expected to return error code.
-8) If count is more than allocated size for source and destination ptr, api should return error code.
-9) If count is less than allocated size for source and destination ptr, api should return error code.
-*/
+#include <functional>

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_kernels.hh>
+#include <hip_test_defgroups.hh>
+#include <memcpy1d_tests_common.hh>

-/* Test verifies hipGraphMemcpyNodeSetParams1D API Negative scenarios.
+#include "graph_tests_common.hh"
+
+static inline hipMemcpyKind ReverseMemcpyDirection(const hipMemcpyKind direction) {
+  switch (direction) {
+    case hipMemcpyHostToDevice:
+      return hipMemcpyDeviceToHost;
+    case hipMemcpyDeviceToHost:
+      return hipMemcpyHostToDevice;
+    default:
+      return direction;
+  }
+};
+
+/**
+ * @addtogroup hipGraphMemcpyNodeSetParams1D hipGraphMemcpyNodeSetParams1D
+ * @{
+ * @ingroup GraphTest
+ * `hipGraphMemcpyNodeSetParams1D(hipGraphNode_t node, void *dst, const void *src, size_t count,
+ * hipMemcpyKind kind)` - 	Sets a memcpy node's parameters to perform a 1-dimensional copy
 */
-TEST_CASE("Unit_hipGraphMemcpyNodeSetParams1D_Negative") {
-  constexpr size_t N = 1024;
-  constexpr size_t Nbytes = N * sizeof(int);
-  int *A_d, *A_h;
-  hipGraphNode_t memcpyNode{};
-  hipError_t ret;

-  HIP_CHECK(hipMalloc(&A_d, Nbytes));
-  HIP_CHECK(hipMalloc(&A_h, Nbytes));
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify that node parameters get updated correctly by creating a node with valid but
+ * incorrect parameters, and the setting them to the correct values after which the graph is
+ * executed and the results of the memcpy verified.
+ * The test is run for all possible memcpy directions, with both the corresponding memcpy
+ * kind and hipMemcpyDefault, as well as half page and full page allocation sizes.
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphMemcpyNodeSetParams1D.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_hipGraphMemcpyNodeSetParams1D_Positive_Basic") {
+  constexpr auto f = [](void* dst, void* src, size_t count, hipMemcpyKind direction) {
+    hipGraph_t graph = nullptr;
+    HIP_CHECK(hipGraphCreate(&graph, 0));
+    hipGraphNode_t node = nullptr;
+    HIP_CHECK(hipGraphAddMemcpyNode1D(&node, graph, nullptr, 0, src, dst, count / 2,
+                                      ReverseMemcpyDirection(direction)));
+    HIP_CHECK(hipGraphMemcpyNodeSetParams1D(node, dst, src, count, direction));
+    hipGraphExec_t graph_exec = nullptr;
+    HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
+    HIP_CHECK(hipGraphLaunch(graph_exec, hipStreamPerThread));
+    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));

-  hipGraph_t graph;
-  HIP_CHECK(hipGraphCreate(&graph, 0));
-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNode, graph, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+    HIP_CHECK(hipGraphExecDestroy(graph_exec));
+    HIP_CHECK(hipGraphDestroy(graph));

-  SECTION("Pass pGraphNode as nullptr") {
-    ret = hipGraphMemcpyNodeSetParams1D(nullptr, A_d, A_h, Nbytes,
-                                        hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
+    return hipSuccess;
+  };
+
+#if HT_NVIDIA
+  MemcpyWithDirectionCommonTests<false>(f);
+#else
+  using namespace std::placeholders;
+
+  SECTION("Device to host") {
+    MemcpyDeviceToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDeviceToHost));
  }
-  SECTION("Pass destination ptr is nullptr") {
-    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, nullptr, A_h, Nbytes,
-                                        hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+  SECTION("Host to device") {
+    MemcpyHostToDeviceShell<false>(std::bind(f, _1, _2, _3, hipMemcpyHostToDevice));
  }
-  SECTION("Pass source ptr is nullptr") {
-    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, nullptr, Nbytes,
-                                        hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+  SECTION("Device to device") {
+    SECTION("Peer access enabled") {
+      MemcpyDeviceToDeviceShell<false, true>(std::bind(f, _1, _2, _3, hipMemcpyDeviceToDevice));
+    }
+    SECTION("Peer access disabled") {
+      MemcpyDeviceToDeviceShell<false, false>(std::bind(f, _1, _2, _3, hipMemcpyDeviceToDevice));
+    }
  }
-  SECTION("Pass count as zero") {
-    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, A_h, 0,
-                                        hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+  SECTION("Device to device with default kind") {
+    SECTION("Peer access enabled") {
+      MemcpyDeviceToDeviceShell<false, true>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+    }
+    SECTION("Peer access disabled") {
+      MemcpyDeviceToDeviceShell<false, false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+    }
  }
-#if HT_AMD
-  SECTION("Pass same pointer as source ptr and destination ptr") {
-    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, A_d, Nbytes,
-                                        hipMemcpyDeviceToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
+
+// Disabled on AMD due to defect - EXSWHTEC-209
+#if 0
+  SECTION("Host to host") {
+    MemcpyHostToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyHostToHost));
+  }
+
+  SECTION("Host to host with default kind") {
+    MemcpyHostToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
  }
 #endif
-  SECTION("Pass overlap memory where destination ptr is ahead of source ptr") {
-    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, A_d-5, Nbytes,
-                                        hipMemcpyDeviceToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Pass overlap memory where source ptr is ahead of destination ptr") {
-    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d+5, A_d, Nbytes-5,
-                                        hipMemcpyDeviceToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Copy more than allocated memory") {
-    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, A_h, Nbytes+8,
-                                        hipMemcpyHostToDevice);
-    REQUIRE(hipErrorInvalidValue == ret);
-  }
-  SECTION("Copy less than allocated memory") {
-    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, A_h, Nbytes-8,
-                                        hipMemcpyHostToDevice);
-    REQUIRE(hipSuccess == ret);
-  }
-  SECTION("Change the kind from H2D to D2H") {
-    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, A_h, Nbytes,
-                                        hipMemcpyDeviceToHost);
-    REQUIRE(hipSuccess == ret);
+
+// Disabled on AMD due to defect - EXSWHTEC-210
+#if 0
+  SECTION("Device to host with default kind") {
+    MemcpyDeviceToHostShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
  }

-  HIP_CHECK(hipFree(A_d));
-  HIP_CHECK(hipFree(A_h));
-  HIP_CHECK(hipGraphDestroy(graph));
+  SECTION("Host to device with default kind") {
+    MemcpyHostToDeviceShell<false>(std::bind(f, _1, _2, _3, hipMemcpyDefault));
+  }
+#endif
+
+#endif
 }

-/* Test verifies hipGraphMemcpyNodeSetParams1D API Functional scenarios.
+/**
+ * Test Description
+ * ------------------------
+ *    - Verify API behaviour with invalid arguments:
+ *        -# node is nullptr
+ *        -# dst is nullptr
+ *        -# src is nullptr
+ *        -# kind is an invalid enum value
+ *        -# count is zero
+ *        -# count is larger than dst allocation size
+ *        -# count is larger than src allocation size
+ * Test source
+ * ------------------------
+ *    - unit/graph/hipGraphMemcpyNodeSetParams1D.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
 */
-TEST_CASE("Unit_hipGraphMemcpyNodeSetParams1D_Functional") {
-  constexpr size_t N = 1024;
-  constexpr size_t Nbytes = N * sizeof(int);
-  constexpr auto blocksPerCU = 6;  // to hide latency
-  constexpr auto threadsPerBlock = 256;
-  int *A_d, *B_d, *C_d;
-  int *A_h, *B_h, *C_h;
-  size_t NElem{N};
-
-  int *hData = reinterpret_cast<int*>(malloc(Nbytes));
-  REQUIRE(hData != nullptr);
-  memset(hData, 0, Nbytes);
-
-  hipGraphNode_t memcpyH2D_A, memcpyH2D_B, memcpyD2H_C;
-  hipGraphNode_t kernel_vecAdd;
-  hipKernelNodeParams kernelNodeParams{};
-  hipGraph_t graph;
-  hipGraphExec_t graphExec;
-  hipStream_t streamForGraph;
-
-  HIP_CHECK(hipStreamCreate(&streamForGraph));
-
-  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
-
+TEST_CASE("Unit_hipGraphMemcpyNodeSetParams1D_Negative_Parameters") {
+  using namespace std::placeholders;
+  hipGraph_t graph = nullptr;
  HIP_CHECK(hipGraphCreate(&graph, 0));

-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_A, graph, nullptr, 0, A_d, A_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+  int src[2] = {}, dst[2] = {};

-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_B, graph, nullptr, 0, B_d, B_h,
-                                    Nbytes, hipMemcpyHostToDevice));
+  hipGraphNode_t node = nullptr;
+  HIP_CHECK(
+      hipGraphAddMemcpyNode1D(&node, graph, nullptr, 0, dst, src, sizeof(dst), hipMemcpyDefault));

-  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_C, graph, nullptr, 0, C_h, C_d,
-                                    Nbytes, hipMemcpyDeviceToHost));

-  HIP_CHECK(hipGraphMemcpyNodeSetParams1D(memcpyD2H_C, hData, C_d, Nbytes,
-                                          hipMemcpyDeviceToHost));
+  SECTION("node == nullptr") {
+    HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams1D(nullptr, dst, src, sizeof(dst), hipMemcpyDefault),
+                    hipErrorInvalidValue);
+  }

-  void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
-  kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
-  kernelNodeParams.gridDim = dim3(blocks);
-  kernelNodeParams.blockDim = dim3(threadsPerBlock);
-  kernelNodeParams.sharedMemBytes = 0;
-  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs2);
-  kernelNodeParams.extra = nullptr;
-  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph, nullptr, 0,
-                                                        &kernelNodeParams));
+  MemcpyWithDirectionCommonNegativeTests(
+      std::bind(hipGraphMemcpyNodeSetParams1D, node, _1, _2, _3, _4), dst, src, sizeof(dst),
+      hipMemcpyDefault);

-  // Create dependencies
-  HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D_A, &kernel_vecAdd, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D_B, &kernel_vecAdd, 1));
-  HIP_CHECK(hipGraphAddDependencies(graph, &kernel_vecAdd, &memcpyD2H_C, 1));
+  SECTION("count == 0") {
+    HIP_CHECK_ERROR(hipGraphMemcpyNodeSetParams1D(node, dst, src, 0, hipMemcpyDefault),
+                    hipErrorInvalidValue);
+  }

-  // Instantiate and launch the graph
-  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
-  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
-  HIP_CHECK(hipStreamSynchronize(streamForGraph));
+  SECTION("count larger than dst allocation size") {
+    LinearAllocGuard<int> dev_dst(LinearAllocs::hipMalloc, sizeof(int));
+    HIP_CHECK_ERROR(
+        hipGraphMemcpyNodeSetParams1D(node, dev_dst.ptr(), src, sizeof(src), hipMemcpyDefault),
+        hipErrorInvalidValue);
+  }

-  // Verify graph execution result
-  HipTest::checkVectorADD(A_h, B_h, hData, N);
+  SECTION("count larger than src allocation size") {
+    LinearAllocGuard<int> dev_src(LinearAllocs::hipMalloc, sizeof(int));
+    HIP_CHECK_ERROR(
+        hipGraphMemcpyNodeSetParams1D(node, dst, dev_src.ptr(), sizeof(dst), hipMemcpyDefault),
+        hipErrorInvalidValue);
+  }

-  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
-  HIP_CHECK(hipGraphExecDestroy(graphExec));
-  HIP_CHECK(hipStreamDestroy(streamForGraph));
  HIP_CHECK(hipGraphDestroy(graph));
-  free(hData);
 }
-
@@ -0,0 +1,172 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+Testcase Scenarios :
+Functional-
+1) Create a graph, add Memcpy node to graph, update the Memcpy node params with set and make sure
+they are taking effect. Negative- 1) Pass pGraphNode as nullptr and check if api returns error. 2)
+Pass destination ptr is nullptr, api expected to return error code. 3) Pass source ptr is nullptr,
+api expected to return error code. 4) Pass count as zero, api expected to return error code. 5) Pass
+same pointer as source ptr and destination ptr, api expected to return error code. 6) Pass overlap
+memory as source ptr and destination ptr where source ptr is ahead of destination ptr, api expected
+to return error code. 7) Pass overlap memory as source ptr and destination ptr where destination ptr
+is ahead of source ptr, api expected to return error code. 8) If count is more than allocated size
+for source and destination ptr, api should return error code. 9) If count is less than allocated
+size for source and destination ptr, api should return error code.
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_kernels.hh>
+
+/* Test verifies hipGraphMemcpyNodeSetParams1D API Negative scenarios.
+ */
+TEST_CASE("Unit_hipGraphMemcpyNodeSetParams1D_Negative") {
+  constexpr size_t N = 1024;
+  constexpr size_t Nbytes = N * sizeof(int);
+  int *A_d, *A_h;
+  hipGraphNode_t memcpyNode{};
+  hipError_t ret;
+
+  HIP_CHECK(hipMalloc(&A_d, Nbytes));
+  HIP_CHECK(hipMalloc(&A_h, Nbytes));
+
+  hipGraph_t graph;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyNode, graph, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+
+  SECTION("Pass pGraphNode as nullptr") {
+    ret = hipGraphMemcpyNodeSetParams1D(nullptr, A_d, A_h, Nbytes, hipMemcpyHostToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass destination ptr is nullptr") {
+    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, nullptr, A_h, Nbytes, hipMemcpyHostToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass source ptr is nullptr") {
+    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, nullptr, Nbytes, hipMemcpyHostToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass count as zero") {
+    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, A_h, 0, hipMemcpyHostToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+#if HT_AMD
+  SECTION("Pass same pointer as source ptr and destination ptr") {
+    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, A_d, Nbytes, hipMemcpyDeviceToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+#endif
+  SECTION("Pass overlap memory where destination ptr is ahead of source ptr") {
+    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, A_d - 5, Nbytes, hipMemcpyDeviceToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass overlap memory where source ptr is ahead of destination ptr") {
+    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d + 5, A_d, Nbytes - 5,
+                                        hipMemcpyDeviceToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Copy more than allocated memory") {
+    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, A_h, Nbytes + 8, hipMemcpyHostToDevice);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Copy less than allocated memory") {
+    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, A_h, Nbytes - 8, hipMemcpyHostToDevice);
+    REQUIRE(hipSuccess == ret);
+  }
+  SECTION("Change the kind from H2D to D2H") {
+    ret = hipGraphMemcpyNodeSetParams1D(memcpyNode, A_d, A_h, Nbytes, hipMemcpyDeviceToHost);
+    REQUIRE(hipSuccess == ret);
+  }
+
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(A_h));
+  HIP_CHECK(hipGraphDestroy(graph));
+}
+
+/* Test verifies hipGraphMemcpyNodeSetParams1D API Functional scenarios.
+ */
+TEST_CASE("Unit_hipGraphMemcpyNodeSetParams1D_Functional") {
+  constexpr size_t N = 1024;
+  constexpr size_t Nbytes = N * sizeof(int);
+  constexpr auto blocksPerCU = 6;  // to hide latency
+  constexpr auto threadsPerBlock = 256;
+  int *A_d, *B_d, *C_d;
+  int *A_h, *B_h, *C_h;
+  size_t NElem{N};
+
+  int* hData = reinterpret_cast<int*>(malloc(Nbytes));
+  REQUIRE(hData != nullptr);
+  memset(hData, 0, Nbytes);
+
+  hipGraphNode_t memcpyH2D_A, memcpyH2D_B, memcpyD2H_C;
+  hipGraphNode_t kernel_vecAdd;
+  hipKernelNodeParams kernelNodeParams{};
+  hipGraph_t graph;
+  hipGraphExec_t graphExec;
+  hipStream_t streamForGraph;
+
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_A, graph, nullptr, 0, A_d, A_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_B, graph, nullptr, 0, B_d, B_h, Nbytes,
+                                    hipMemcpyHostToDevice));
+
+  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_C, graph, nullptr, 0, C_h, C_d, Nbytes,
+                                    hipMemcpyDeviceToHost));
+
+  HIP_CHECK(hipGraphMemcpyNodeSetParams1D(memcpyD2H_C, hData, C_d, Nbytes, hipMemcpyDeviceToHost));
+
+  void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
+  kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
+  kernelNodeParams.gridDim = dim3(blocks);
+  kernelNodeParams.blockDim = dim3(threadsPerBlock);
+  kernelNodeParams.sharedMemBytes = 0;
+  kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs2);
+  kernelNodeParams.extra = nullptr;
+  HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph, nullptr, 0, &kernelNodeParams));
+
+  // Create dependencies
+  HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D_A, &kernel_vecAdd, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D_B, &kernel_vecAdd, 1));
+  HIP_CHECK(hipGraphAddDependencies(graph, &kernel_vecAdd, &memcpyD2H_C, 1));
+
+  // Instantiate and launch the graph
+  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+  HIP_CHECK(hipStreamSynchronize(streamForGraph));
+
+  // Verify graph execution result
+  HipTest::checkVectorADD(A_h, B_h, hData, N);
+
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+  HIP_CHECK(hipGraphDestroy(graph));
+  free(hData);
+}
@@ -0,0 +1,219 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+Testcase Scenarios :
+Negative -
+1) Pass node as nullptr and verify api returns error code.
+2) Pass un-initialize node and verify api returns error code.
+3) Pass pNodeParams as nullptr and verify api returns error code.
+Functional -
+1) Add Memcpy node to graph, update the Memcpy node params with set and
+   launch the graph and check updated params are taking effect.
+2) Add Memcpy node to graph, launch graph, then update the Memcpy node params
+   with set and launch the graph and check updated params are taking effect.
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+
+#define SIZE 10
+
+/* Test verifies hipGraphMemcpyNodeSetParams API Negative scenarios.
+ */
+TEST_CASE("Unit_hipGraphMemcpyNodeSetParams_Negative") {
+  CHECK_IMAGE_SUPPORT
+
+  constexpr int width{SIZE}, height{SIZE}, depth{SIZE};
+  hipArray_t devArray;
+  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
+  hipMemcpy3DParms myparms;
+  int* hData;
+  uint32_t size = width * height * depth * sizeof(int);
+  hData = reinterpret_cast<int*>(malloc(size));
+  REQUIRE(hData != nullptr);
+  memset(hData, 0, size);
+  for (int i = 0; i < depth; i++) {
+    for (int j = 0; j < height; j++) {
+      for (int k = 0; k < width; k++) {
+        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
+      }
+    }
+  }
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
+                                              0, 0, 0, formatKind);
+  HIP_CHECK(hipMalloc3DArray(&devArray, &channelDesc, make_hipExtent(width,
+                             height, depth), hipArrayDefault));
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+  myparms.srcPos = make_hipPos(0, 0, 0);
+  myparms.dstPos = make_hipPos(0, 0, 0);
+  myparms.extent = make_hipExtent(width , height, depth);
+  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
+                                      width, height);
+  myparms.dstArray = devArray;
+  myparms.kind = hipMemcpyHostToDevice;
+
+  hipGraph_t graph;
+  hipError_t ret;
+  hipGraphNode_t memcpyNode;
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &myparms));
+
+  SECTION("Pass node as nullptr") {
+    ret = hipGraphMemcpyNodeSetParams(nullptr, &myparms);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass un-initialize node") {
+    hipGraphNode_t memcpyNode_uninit{};
+    ret = hipGraphMemcpyNodeSetParams(memcpyNode_uninit, &myparms);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  SECTION("Pass SetNodeParams as nullptr") {
+    ret = hipGraphMemcpyNodeSetParams(memcpyNode, nullptr);
+    REQUIRE(hipErrorInvalidValue == ret);
+  }
+  HIP_CHECK(hipFreeArray(devArray));
+  free(hData);
+  HIP_CHECK(hipGraphDestroy(graph));
+}
+
+/* Test verifies hipGraphMemcpyNodeSetParams API Functional scenarios.
+ */
+TEST_CASE("Unit_hipGraphMemcpyNodeSetParams_Functional") {
+  CHECK_IMAGE_SUPPORT
+
+  constexpr int width{SIZE}, height{SIZE}, depth{SIZE};
+  hipArray_t devArray;
+  hipChannelFormatKind formatKind = hipChannelFormatKindSigned;
+  hipMemcpy3DParms myparms, myparms1;
+  uint32_t size = width * height * depth * sizeof(int);
+
+  int *hData = reinterpret_cast<int*>(malloc(size));
+  REQUIRE(hData != nullptr);
+  memset(hData, 0, size);
+  int *hDataTemp = reinterpret_cast<int*>(malloc(size));
+  REQUIRE(hDataTemp != nullptr);
+  memset(hDataTemp, 0, size);
+  int *hOutputData = reinterpret_cast<int *>(malloc(size));
+  REQUIRE(hOutputData != nullptr);
+  memset(hOutputData, 0,  size);
+  int *hOutputData1 = reinterpret_cast<int *>(malloc(size));
+  REQUIRE(hOutputData1 != nullptr);
+  memset(hOutputData1, 0,  size);
+
+  for (int i = 0; i < depth; i++) {
+    for (int j = 0; j < height; j++) {
+      for (int k = 0; k < width; k++) {
+        hData[i*width*height + j*width + k] = i*width*height + j*width + k;
+      }
+    }
+  }
+  hipChannelFormatDesc channelDesc = hipCreateChannelDesc(sizeof(int)*8,
+                                              0, 0, 0, formatKind);
+  HIP_CHECK(hipMalloc3DArray(&devArray, &channelDesc, make_hipExtent(width,
+                             height, depth), hipArrayDefault));
+  memset(&myparms, 0x0, sizeof(hipMemcpy3DParms));
+
+  // Host to Device
+  myparms.srcPos = make_hipPos(0, 0, 0);
+  myparms.dstPos = make_hipPos(0, 0, 0);
+  myparms.extent = make_hipExtent(width , height, depth);
+  myparms.srcPtr = make_hipPitchedPtr(hData, width * sizeof(int),
+                                      width, height);
+  myparms.dstArray = devArray;
+  myparms.kind = hipMemcpyHostToDevice;
+
+  hipGraph_t graph;
+  hipGraphNode_t memcpyNode;
+  std::vector<hipGraphNode_t> dependencies;
+  hipStream_t streamForGraph;
+  hipGraphExec_t graphExec;
+
+  HIP_CHECK(hipStreamCreate(&streamForGraph));
+  HIP_CHECK(hipGraphCreate(&graph, 0));
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &myparms));
+  dependencies.push_back(memcpyNode);
+
+  // Device to host
+  memset(&myparms1, 0x0, sizeof(hipMemcpy3DParms));
+  myparms1.srcPos = make_hipPos(0, 0, 0);
+  myparms1.dstPos = make_hipPos(0, 0, 0);
+  myparms1.dstPtr = make_hipPitchedPtr(hDataTemp, width * sizeof(int),
+                                      width, height);
+  myparms1.srcArray = devArray;
+  myparms1.extent = make_hipExtent(width, height, depth);
+  myparms1.kind = hipMemcpyDeviceToHost;
+
+  HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
+                                  dependencies.size(), &myparms1));
+
+  SECTION("Update the memcpyNode and check") {
+    // Device to host with updated host ptr hDataTemp -> hOutputData
+    memset(&myparms1, 0x0, sizeof(hipMemcpy3DParms));
+    myparms1.srcPos = make_hipPos(0, 0, 0);
+    myparms1.dstPos = make_hipPos(0, 0, 0);
+    myparms1.dstPtr = make_hipPitchedPtr(hOutputData, width * sizeof(int),
+                                        width, height);
+    myparms1.srcArray = devArray;
+    myparms1.extent = make_hipExtent(width, height, depth);
+    myparms1.kind = hipMemcpyDeviceToHost;
+
+    HIP_CHECK(hipGraphMemcpyNodeSetParams(memcpyNode, &myparms1));
+
+    // Instantiate and launch the graph
+    HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+    HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+    HIP_CHECK(hipStreamSynchronize(streamForGraph));
+
+    // Check result
+    HipTest::checkArray(hData, hOutputData, width, height, depth);
+  }
+
+  SECTION("Update the memcpyNode again and check") {
+    // Device to host with updated host ptr hOutputData -> hOutputData1
+    memset(&myparms1, 0x0, sizeof(hipMemcpy3DParms));
+    myparms1.srcPos = make_hipPos(0, 0, 0);
+    myparms1.dstPos = make_hipPos(0, 0, 0);
+    myparms1.dstPtr = make_hipPitchedPtr(hOutputData1, width * sizeof(int),
+                                        width, height);
+    myparms1.srcArray = devArray;
+    myparms1.extent = make_hipExtent(width, height, depth);
+    myparms1.kind = hipMemcpyDeviceToHost;
+
+    HIP_CHECK(hipGraphAddMemcpyNode(&memcpyNode, graph, dependencies.data(),
+                                    dependencies.size(), &myparms1));
+    HIP_CHECK(hipGraphMemcpyNodeSetParams(memcpyNode, &myparms1));
+
+    // Instantiate and launch the graph
+    HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
+    HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
+    HIP_CHECK(hipStreamSynchronize(streamForGraph));
+
+    // Check result
+    HipTest::checkArray(hData, hOutputData1, width, height, depth);
+  }
+  HIP_CHECK(hipGraphExecDestroy(graphExec));
+  HIP_CHECK(hipGraphDestroy(graph));
+  HIP_CHECK(hipStreamDestroy(streamForGraph));
+  HIP_CHECK(hipFreeArray(devArray));
+  free(hData);
+  free(hDataTemp);
+  free(hOutputData);
+  free(hOutputData1);
+}
@@ -38,9 +38,13 @@ set(TEST_SRC
    hipMemcpy3DAsync.cc
    hipMemcpy3DAsync_old.cc
    hipMemcpyParam2D.cc
+    hipMemcpyParam2D_old.cc
    hipMemcpyParam2DAsync.cc
+    hipMemcpyParam2DAsync_old.cc
    hipMemcpy2D.cc
+    hipMemcpy2D_old.cc
    hipMemcpy2DAsync.cc
+    hipMemcpy2DAsync_old.cc
    hipMemcpy2DFromArray.cc
    hipMemcpy2DFromArray_old.cc
    hipMemcpy2DFromArrayAsync.cc
@@ -1,496 +1,151 @@
 /*
-Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/**
- * @addtogroup hipMemcpy2D hipMemcpy2D
- * @{
- * @ingroup MemcpyTest
- * `hipMemcpy2D(void* dst, size_t dpitch, const void* src,
- *              size_t spitch, size_t width, size_t height,
- *              hipMemcpyKind kind)` -
- * Copies data between host and device.
- */
-
-// Testcase Description:
-// 1) Verifies the working of Memcpy2D API negative scenarios by
-//    Pass NULL to destination pointer
-//    Pass NULL to Source pointer
-//    Pass width greater than spitch/dpitch
-// 2) Verifies hipMemcpy2D API by
-//    pass 0 to destionation pitch
-//    pass 0 to source pitch
-//    pass 0 to width
-//    pass 0 to height
-// 3) Verifies working of Memcpy2D API on host memory and pinned host memory by
-//    performing D2H, D2D and H2D memory kind copies on same GPU
-// 4) Verifies working of Memcpy2D API for the following scenarios
-//      H2D-D2D-D2H on host and device memory
-//      H2D-D2D-D2H on pinned host and device memory
-//      H2D-D2D-D2H functionalities where memory is allocated in GPU-0
-//      and API is triggered from GPU-1
+#include "memcpy2d_tests_common.hh"

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
+#include <hip/hip_runtime_api.h>
+#include <resource_guards.hh>
+#include <utils.hh>

-static constexpr auto NUM_W{16};
-static constexpr auto NUM_H{16};
-static constexpr auto COLUMNS{8};
-static constexpr auto ROWS{8};
+TEST_CASE("Unit_hipMemcpy2D_Positive_Basic") {
+  constexpr bool async = false;

-/**
- * Test Description
- * ------------------------
- *  - This testcases performs the following scenarios of hipMemcpy2D API on same GPU
-    1. H2D-D2D-D2H for Host Memory<-->Device Memory
-    2. H2D-D2D-D2H for Pinned Host Memory<-->Device Memory
+  SECTION("Device to Host") { Memcpy2DDeviceToHostShell<async>(hipMemcpy2D); }

-    Input : "A_h" initialized based on data type
-             "A_h" --> "A_d" using H2D copy
-             "A_d" --> "B_d" using D2D copy
-             "B_d" --> "B_h" using D2H copy
-    Output: Validating A_h with B_h both should be equal for
-            the number of COLUMNS and ROWS copied
- * Test source
- * ------------------------
- *  - unit/memory/hipMemcpy2D.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 6.0
- */
-
-TEMPLATE_TEST_CASE("Unit_hipMemcpy2D_H2D-D2D-D2H", ""
-                   , int, float, double) {
-  CHECK_IMAGE_SUPPORT
-  // 1 refers to pinned host memory
-  auto mem_type = GENERATE(0, 1);
-  HIP_CHECK(hipSetDevice(0));
-  TestType  *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}, *A_d{nullptr},
-            *B_d{nullptr};
-  size_t pitch_A, pitch_B;
-  size_t width{NUM_W * sizeof(TestType)};
-
-  // Allocating memory
-  if (mem_type) {
-    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                  &A_h, &B_h, &C_h, NUM_W*NUM_H, true);
-  } else {
-    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                  &A_h, &B_h, &C_h, NUM_W*NUM_H, false);
+  SECTION("Device to Device") {
+    SECTION("Peer access disabled") { Memcpy2DDeviceToDeviceShell<async, false>(hipMemcpy2D); }
+    SECTION("Peer access enabled") { Memcpy2DDeviceToDeviceShell<async, true>(hipMemcpy2D); }
  }
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-                          &pitch_A, width, NUM_H));
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&B_d),
-                          &pitch_B, width, NUM_H));

-  // Initialize the data
-  HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, B_h, C_h);
+  SECTION("Host to Device") { Memcpy2DHostToDeviceShell<async>(hipMemcpy2D); }

-  // Host to Device
-  HIP_CHECK(hipMemcpy2D(A_d, pitch_A, A_h, COLUMNS*sizeof(TestType),
-                        COLUMNS*sizeof(TestType), ROWS,
-                        hipMemcpyHostToDevice));
-
-  // Performs D2D on same GPU device
-  HIP_CHECK(hipMemcpy2D(B_d, pitch_B, A_d,
-                        pitch_A, COLUMNS*sizeof(TestType),
-                        ROWS, hipMemcpyDeviceToDevice));
-
-  // hipMemcpy2D Device to Host
-  HIP_CHECK(hipMemcpy2D(B_h, COLUMNS*sizeof(TestType), B_d, pitch_B,
-                        COLUMNS*sizeof(TestType), ROWS,
-                        hipMemcpyDeviceToHost));
-
-  // Validating the result
-  REQUIRE(HipTest::checkArray<TestType>(A_h, B_h, COLUMNS, ROWS) == true);
-
-  // DeAllocating the memory
-  HIP_CHECK(hipFree(A_d));
-  HIP_CHECK(hipFree(B_d));
-  if (mem_type) {
-    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-                                  A_h, B_h, C_h, true);
-  } else {
-    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-                                  A_h, B_h, C_h, false);
-  }
+  SECTION("Host to Host") { Memcpy2DHostToHostShell<async>(hipMemcpy2D); }
 }

-/**
- * Test Description
- * ------------------------
- *  - This testcase performs the following scenarios of hipMemcpy2D API on same GPU.
-    1. H2D-D2D-D2H for Host Memory<-->Device Memory
-    2. H2D-D2D-D2H for Pinned Host Memory<-->Device Memory
-    The src and dst input pointers to hipMemCpy2D add an offset to the pointers
-    returned by the allocation functions.
+TEST_CASE("Unit_hipMemcpy2D_Positive_Synchronization_Behavior") {
+  HIP_CHECK(hipDeviceSynchronize());

-    Input : "A_h" initialized based on data type
-             "A_h" --> "A_d" using H2D copy
-             "A_d" --> "B_d" using D2D copy
-             "B_d" --> "B_h" using D2H copy
-    Output: Validating A_h with B_h both should be equal for
-            the number of COLUMNS and ROWS copied
- * Test source
- * ------------------------
- *  - unit/memory/hipMemcpy2D.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 6.0
- */
+  SECTION("Host to Device") { Memcpy2DHtoDSyncBehavior(hipMemcpy2D, true); }

-TEMPLATE_TEST_CASE("Unit_hipMemcpy2D_H2D-D2D-D2H_WithOffset", ""
-                   , int, float, double) {
-  CHECK_IMAGE_SUPPORT
-  // 1 refers to pinned host memory
-  auto mem_type = GENERATE(0, 1);
-  HIP_CHECK(hipSetDevice(0));
-  TestType  *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}, *A_d{nullptr},
-            *B_d{nullptr};
-  size_t pitch_A, pitch_B;
-  size_t width{NUM_W * sizeof(TestType)};
-
-  // Allocating memory
-  if (mem_type) {
-    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                  &A_h, &B_h, &C_h, NUM_W*NUM_H, true);
-  } else {
-    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                  &A_h, &B_h, &C_h, NUM_W*NUM_H, false);
+  SECTION("Device to Host") {
+    Memcpy2DDtoHPageableSyncBehavior(hipMemcpy2D, true);
+    Memcpy2DDtoHPinnedSyncBehavior(hipMemcpy2D, true);
  }
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-                          &pitch_A, width, NUM_H));
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&B_d),
-                          &pitch_B, width, NUM_H));

-  // Initialize the data
-  HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, B_h, C_h);
-
-  // Host to Device
-  HIP_CHECK(hipMemcpy2D(A_d+COLUMNS*sizeof(TestType), pitch_A, A_h,
-                        COLUMNS*sizeof(TestType), COLUMNS*sizeof(TestType),
-                        ROWS, hipMemcpyHostToDevice));
-
-  // Performs D2D on same GPU device
-  HIP_CHECK(hipMemcpy2D(B_d+COLUMNS*sizeof(TestType), pitch_B,
-                        A_d+COLUMNS*sizeof(TestType),
-                        pitch_A, COLUMNS*sizeof(TestType),
-                        ROWS, hipMemcpyDeviceToDevice));
-
-  // hipMemcpy2D Device to Host
-  HIP_CHECK(hipMemcpy2D(B_h, COLUMNS*sizeof(TestType),
-                        B_d+COLUMNS*sizeof(TestType), pitch_B,
-                        COLUMNS*sizeof(TestType), ROWS,
-                        hipMemcpyDeviceToHost));
-
-
-  // Validating the result
-  REQUIRE(HipTest::checkArray<TestType>(A_h, B_h, COLUMNS, ROWS) == true);
-
-
-  // DeAllocating the memory
-  HIP_CHECK(hipFree(A_d));
-  HIP_CHECK(hipFree(B_d));
-  if (mem_type) {
-    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-                                  A_h, B_h, C_h, true);
-  } else {
-    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-                                  A_h, B_h, C_h, false);
+  SECTION("Device to Device") {
+#if HT_NVIDIA
+    Memcpy2DDtoDSyncBehavior(hipMemcpy2D, false);
+#else
+    Memcpy2DDtoDSyncBehavior(hipMemcpy2D, true);
+#endif
  }
+
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-232
+  SECTION("Host to Host") { Memcpy2DHtoHSyncBehavior(hipMemcpy2D, true); }
+#endif
 }

-/**
- * Test Description
- * ------------------------
- *  - This testcases performs the following scenarios of hipMemcpy2D API on Peer GPU
-    1. H2D-D2D-D2H for Host Memory<-->Device Memory
-    2. H2D-D2D-D2H for Pinned Host Memory<-->Device Memory
-    3. Device context change where memory is allocated in GPU-0
-       and API is trigerred from GPU-1
+TEST_CASE("Unit_hipMemcpy2D_Positive_Parameters") {
+  constexpr bool async = false;
+  Memcpy2DZeroWidthHeight<async>(hipMemcpy2D);
+}

-    Input : "A_h" initialized based on data type
-             "A_h" --> "A_d" using H2D copy
-             "A_d" --> "X_d" using D2D copy
-             "X_d" --> "B_h" using D2H copy
-    Output: Validating A_h with B_h both should be equal for
-            the number of COLUMNS and ROWS copied
- * Test source
- * ------------------------
- *  - unit/memory/hipMemcpy2D.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 6.0
- */
+TEST_CASE("Unit_hipMemcpy2D_Negative_Parameters") {
+  constexpr size_t cols = 128;
+  constexpr size_t rows = 128;

-TEMPLATE_TEST_CASE("Unit_hipMemcpy2D_multiDevice-D2D", ""
-                   , int, float, double) {
-  CHECK_IMAGE_SUPPORT
-  auto mem_type = GENERATE(0, 1);
-  int numDevices = 0;
-  int canAccessPeer = 0;
-  TestType* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}, *A_d{nullptr};
-  size_t pitch_A;
-  size_t width{NUM_W * sizeof(TestType)};
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1));
-    if (canAccessPeer) {
-      HIP_CHECK(hipSetDevice(0));
-
-      // Allocating memory
-      if (mem_type) {
-        HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-            &A_h, &B_h, &C_h, NUM_W*NUM_H, true);
-      } else {
-        HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-            &A_h, &B_h, &C_h, NUM_W*NUM_H, false);
-      }
-      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-            &pitch_A, width, NUM_H));
-
-      // Initialize the data
-      HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, B_h, C_h);
-
-      char *X_d{nullptr};
-      size_t pitch_X;
-      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&X_d),
-                               &pitch_X, width, NUM_H));
-
-      // Change device
-      HIP_CHECK(hipSetDevice(1));
-
-      // Host to Device
-      HIP_CHECK(hipMemcpy2D(A_d, pitch_A, A_h, COLUMNS*sizeof(TestType),
-            COLUMNS*sizeof(TestType), ROWS, hipMemcpyHostToDevice));
-
-      // Device to Device
-      HIP_CHECK(hipMemcpy2D(X_d, pitch_X, A_d,
-            pitch_A, COLUMNS*sizeof(TestType),
-            ROWS, hipMemcpyDeviceToDevice));
-
-      // Device to Host
-      HIP_CHECK(hipMemcpy2D(B_h, COLUMNS*sizeof(TestType), X_d,
-            pitch_X, COLUMNS*sizeof(TestType), ROWS, hipMemcpyDeviceToHost));
-
-      // Validating the result
-      REQUIRE(HipTest::checkArray<TestType>(A_h, B_h, COLUMNS, ROWS) == true);
-
-      // DeAllocating the memory
-      HIP_CHECK(hipFree(A_d));
-      if (mem_type) {
-        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-            A_h, B_h, C_h, true);
-      } else {
-        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-            A_h, B_h, C_h, false);
-      }
-      HIP_CHECK(hipFree(X_d));
-    } else {
-      SUCCEED("Machine does not seem to have P2P");
+  constexpr auto NegativeTests = [](void* dst, size_t dpitch, const void* src, size_t spitch,
+                                    size_t width, size_t height, hipMemcpyKind kind) {
+    SECTION("dst == nullptr") {
+      HIP_CHECK_ERROR(hipMemcpy2D(nullptr, dpitch, src, spitch, width, height, kind),
+                      hipErrorInvalidValue);
    }
-  } else {
-    SUCCEED("skipped the testcase as no of devices is less than 2");
-  }
-}

-/**
- * Test Description
- * ------------------------
- *  - This Testcase verifies the null size checks of hipMemcpy2D API
- * Test source
- * ------------------------
- *  - unit/memory/hipMemcpy2D.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 6.0
- */
-
-TEST_CASE("Unit_hipMemcpy2D_SizeCheck") {
-  CHECK_IMAGE_SUPPORT
-  HIP_CHECK(hipSetDevice(0));
-  int* A_h{nullptr}, *A_d{nullptr};
-  size_t pitch_A;
-  size_t width{NUM_W * sizeof(int)};
-
-  // Allocating memory
-  HipTest::initArrays<int>(nullptr, nullptr, nullptr,
-      &A_h, nullptr, nullptr, NUM_W*NUM_H);
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-        &pitch_A, width, NUM_H));
-
-  // Initialize the data
-  HipTest::setDefaultData<int>(NUM_W*NUM_H, A_h, nullptr, nullptr);
-
-  SECTION("hipMemcpy2D API where Source Pitch is zero") {
-    REQUIRE(hipMemcpy2D(A_h, 0, A_d,
-            pitch_A, NUM_W, NUM_H,
-            hipMemcpyDeviceToHost) != hipSuccess);
-  }
-
-  SECTION("hipMemcpy2D API where Destination Pitch is zero") {
-    REQUIRE(hipMemcpy2D(A_h, width, A_d,
-            0, NUM_W, NUM_H,
-            hipMemcpyDeviceToHost) != hipSuccess);
-  }
-
-  SECTION("hipMemcpy2D API where height is zero") {
-    REQUIRE(hipMemcpy2D(A_h, width, A_d,
-            pitch_A, NUM_W, 0,
-            hipMemcpyDeviceToHost) == hipSuccess);
-  }
-
-  SECTION("hipMemcpy2D API where width is zero") {
-    REQUIRE(hipMemcpy2D(A_h, width, A_d,
-            pitch_A, 0, NUM_H,
-            hipMemcpyDeviceToHost) == hipSuccess);
-  }
-
-  // DeAllocating the memory
-  HIP_CHECK(hipFree(A_d));
-  free(A_h);
-}
-
-/**
- * Test Description
- * ------------------------
- *  - This Testcase verifies all the negative scenarios of hipMemcpy2D API
- * Test source
- * ------------------------
- *  - unit/memory/hipMemcpy2D.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 6.0
- */
-
-TEST_CASE("Unit_hipMemcpy2D_Negative") {
-  CHECK_IMAGE_SUPPORT
-  HIP_CHECK(hipSetDevice(0));
-  int* A_h{nullptr}, *A_d{nullptr};
-  size_t pitch_A;
-  size_t width{NUM_W * sizeof(int)};
-
-  // Allocating memory
-  HipTest::initArrays<int>(nullptr, nullptr, nullptr,
-      &A_h, nullptr, nullptr, NUM_W*NUM_H);
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-        &pitch_A, width, NUM_H));
-
-  // Initialize the data
-  HipTest::setDefaultData<int>(NUM_W*NUM_H, A_h, nullptr, nullptr);
-
-  SECTION("hipMemcpy2D API by Passing nullptr to destination") {
-    REQUIRE(hipMemcpy2D(nullptr, width, A_d,
-          pitch_A, COLUMNS*sizeof(int), ROWS,
-          hipMemcpyDeviceToHost) != hipSuccess);
-  }
-
-  SECTION("hipMemcpy2D API by Passing nullptr to destination") {
-    REQUIRE(hipMemcpy2D(nullptr, width, nullptr,
-          pitch_A, COLUMNS*sizeof(int), ROWS,
-          hipMemcpyDeviceToHost) != hipSuccess);
-  }
-
-  SECTION("hipMemcpy2D API where width is greater than destination pitch") {
-    REQUIRE(hipMemcpy2D(A_h, 10, A_d, pitch_A,
-          COLUMNS*sizeof(int), ROWS,
-          hipMemcpyDeviceToHost) != hipSuccess);
-  }
-
-  // DeAllocating the memory
-  HIP_CHECK(hipFree(A_d));
-  free(A_h);
-}
-
-static void hipMemcpy2D_Basic_Size_Test(size_t inc) {
-  constexpr int defaultProgramSize = 256 * 1024 * 1024;
-  constexpr int N = 2;
-  constexpr int value = 42;
-  int *in, *out, *dev;
-  size_t newSize = 0, inp = 0;
-  size_t size = sizeof(int) * N * inc;
-
-  size_t free, total;
-  HIP_CHECK(hipMemGetInfo(&free, &total));
-
-  if ( free < 2 * size )
-    newSize = ( free - defaultProgramSize ) / 2;
-  else
-    newSize = size;
-
-  INFO("Array size: " << size/1024.0/1024.0 << " MB or " << size << " Bytes.");
-  INFO("Free memory: " << free/1024.0/1024.0 << " MB or " << free << " Bytes");
-  INFO("NewSize:" << newSize/1024.0/1024.0 << "MB or " << newSize << " Bytes");
-
-  HIP_CHECK(hipHostMalloc(&in, newSize));
-  HIP_CHECK(hipHostMalloc(&out, newSize));
-  HIP_CHECK(hipMalloc(&dev, newSize));
-
-  inp = newSize / (sizeof(int) * N);
-  for (size_t i=0; i < N; i++) {
-    in[i * inp] = value;
-  }
-
-  size_t pitch = sizeof(int) * inp;
-
-  HIP_CHECK(hipMemcpy2D(dev, pitch, in, pitch, sizeof(int),
-                        N, hipMemcpyHostToDevice));
-  HIP_CHECK(hipMemcpy2D(out, pitch, dev, pitch, sizeof(int),
-                        N, hipMemcpyDeviceToHost));
-
-  for (size_t i=0; i < N; i++) {
-    REQUIRE(out[i * inp] == value);
-  }
-
-  HIP_CHECK(hipFree(dev));
-  HIP_CHECK(hipHostFree(in));
-  HIP_CHECK(hipHostFree(out));
-}
-
-/**
- * Test Description
- * ------------------------
- *  - This testcase performs multidevice size check on hipMemcpy2D API
-      1. Verify hipMemcpy2D with 1 << 20 size
-      2. Verify hipMemcpy2D with 1 << 21 size
- * Test source
- * ------------------------
- *  - unit/memory/hipMemcpy2D.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 6.0
- */
-
-TEST_CASE("Unit_hipMemcpy2D_multiDevice_Basic_Size_Test") {
-  CHECK_IMAGE_SUPPORT
-  size_t input = 1 << 20;
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-
-  for (int i=0; i < numDevices; i++) {
-    HIP_CHECK(hipSetDevice(i));
-
-    SECTION("Verify hipMemcpy2D with 1 << 20 size") {
-      hipMemcpy2D_Basic_Size_Test(input);
+    SECTION("src == nullptr") {
+      HIP_CHECK_ERROR(hipMemcpy2D(dst, dpitch, nullptr, spitch, width, height, kind),
+                      hipErrorInvalidValue);
    }
-    SECTION("Verify hipMemcpy2D with 1 << 21 size") {
-      input <<= 1;
-      hipMemcpy2D_Basic_Size_Test(input);
+
+    SECTION("dpitch < width") {
+      HIP_CHECK_ERROR(hipMemcpy2D(dst, width - 1, src, spitch, width, height, kind),
+                      hipErrorInvalidPitchValue);
    }
+
+    SECTION("spitch < width") {
+      HIP_CHECK_ERROR(hipMemcpy2D(dst, dpitch, src, width - 1, width, height, kind),
+                      hipErrorInvalidPitchValue);
+    }
+
+    SECTION("dpitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      HIP_CHECK_ERROR(
+          hipMemcpy2D(dst, static_cast<size_t>(attr) + 1, src, spitch, width, height, kind),
+          hipErrorInvalidValue);
+    }
+
+    SECTION("spitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      HIP_CHECK_ERROR(
+          hipMemcpy2D(dst, dpitch, src, static_cast<size_t>(attr) + 1, width, height, kind),
+          hipErrorInvalidValue);
+    }
+
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-234
+    SECTION("Invalid MemcpyKind") {
+      HIP_CHECK_ERROR(
+          hipMemcpy2D(dst, dpitch, src, spitch, width, height, static_cast<hipMemcpyKind>(-1)),
+          hipErrorInvalidMemcpyDirection);
+    }
+#endif
+  };
+
+  SECTION("Host to Device") {
+    LinearAllocGuard2D<int> device_alloc(cols, rows);
+    LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, device_alloc.pitch() * rows);
+    NegativeTests(device_alloc.ptr(), device_alloc.pitch(), host_alloc.ptr(), device_alloc.pitch(),
+                  device_alloc.width(), device_alloc.height(), hipMemcpyHostToDevice);
+  }
+
+  SECTION("Device to Host") {
+    LinearAllocGuard2D<int> device_alloc(cols, rows);
+    LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, device_alloc.pitch() * rows);
+    NegativeTests(host_alloc.ptr(), device_alloc.pitch(), device_alloc.ptr(), device_alloc.pitch(),
+                  device_alloc.width(), device_alloc.height(), hipMemcpyDeviceToHost);
+  }
+
+  SECTION("Host to Host") {
+    LinearAllocGuard<int> src_alloc(LinearAllocs::hipHostMalloc, cols * rows * sizeof(int));
+    LinearAllocGuard<int> dst_alloc(LinearAllocs::hipHostMalloc, cols * rows * sizeof(int));
+    NegativeTests(dst_alloc.ptr(), cols * sizeof(int), src_alloc.ptr(), cols * sizeof(int),
+                  cols * sizeof(int), rows, hipMemcpyHostToHost);
+  }
+
+  SECTION("Device to Device") {
+    LinearAllocGuard2D<int> src_alloc(cols, rows);
+    LinearAllocGuard2D<int> dst_alloc(cols, rows);
+    NegativeTests(dst_alloc.ptr(), dst_alloc.pitch(), src_alloc.ptr(), src_alloc.pitch(),
+                  dst_alloc.width(), dst_alloc.height(), hipMemcpyDeviceToDevice);
  }
 }
@@ -1,555 +1,188 @@
 /*
-Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/**
- * @addtogroup hipMemcpy2DAsync hipMemcpy2DAsync
- * @{
- * @ingroup MemcpyTest
- * `hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src,
- *                   size_t spitch, size_t width, size_t height,
- *                   hipMemcpyKind kind, hipStream_t stream = 0 )` -
- * Copies data between host and device.
- */
-
-// Testcase Description:
-// 1) Verifies the working of Memcpy2DAsync API negative scenarios by
-//    Pass NULL to destination pointer
-//    Pass NULL to Source pointer
-//    Pass width greater than spitch/dpitch
-// 2) Verifies hipMemcpy2DAsync API by
-//    pass 0 to destionation pitch
-//    pass 0 to source pitch
-//    pass 0 to width
-//    pass 0 to height
-// 3) Verifies working of Memcpy2DAsync API on host memory
-//    and pinned host memory by
-//    performing D2H, D2D and H2D memory kind copies on same GPU
-// 4) Verifies working of Memcpy2DAsync API on host memory
-//    and pinned host memory by
-//    performing D2H, D2D and H2D memory kind copies on peer GPU
-// 5) Verifies working of Memcpy2DAsync API where memory is allocated
-//    in GPU-0 and stream is created on GPU-1
+#include "memcpy2d_tests_common.hh"

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
+#include <hip/hip_runtime_api.h>
+#include <resource_guards.hh>
+#include <utils.hh>

-static constexpr auto NUM_W{16};
-static constexpr auto NUM_H{16};
-static constexpr auto COLUMNS{6};
-static constexpr auto ROWS{6};
+TEST_CASE("Unit_hipMemcpy2DAsync_Positive_Basic") {
+  using namespace std::placeholders;

-/**
- * Test Description
- * ------------------------
- *  - This performs the following scenarios of hipMemcpy2DAsync API on same GPU
-      1. H2D-D2D-D2H for Host Memory<-->Device Memory
-      2. H2D-D2D-D2H for Pinned Host Memory<-->Device Memory
+  constexpr bool async = true;

-      Input : "A_h" initialized based on data type
-         "A_h" --> "A_d" using H2D copy
-         "A_d" --> "B_d" using D2D copy
-         "B_d" --> "B_h" using D2H copy
-      Output: Validating A_h with B_h both should be equal for
-        the number of COLUMNS and ROWS copied
- * Test source
- * ------------------------
- *  - unit/memory/hipMemcpy2DAsync.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 5.2
- */
+  const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
+  const StreamGuard stream_guard(stream_type);
+  const hipStream_t stream = stream_guard.stream();

-TEMPLATE_TEST_CASE("Unit_hipMemcpy2DAsync_Host&PinnedMem", ""
-                   , int, float, double) {
-  CHECK_IMAGE_SUPPORT
-  // 1 refers to pinned host memory
-  auto mem_type = GENERATE(0, 1);
-  HIP_CHECK(hipSetDevice(0));
-  TestType  *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}, *A_d{nullptr},
-            *B_d{nullptr};
-  size_t pitch_A, pitch_B;
-  size_t width{NUM_W * sizeof(TestType)};
-  hipStream_t stream;
-  HIP_CHECK(hipStreamCreate(&stream));
-
-  // Allocating memory
-  if (mem_type) {
-    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                  &A_h, &B_h, &C_h, NUM_W*NUM_H, true);
-  } else {
-    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                  &A_h, &B_h, &C_h, NUM_W*NUM_H, false);
-  }
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-                          &pitch_A, width, NUM_H));
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&B_d),
-                          &pitch_B, width, NUM_H));
-
-  // Initialize the data
-  HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, B_h, C_h);
-  SECTION("Calling Async apis with stream object created by user") {
-    // Host to Device
-    HIP_CHECK(hipMemcpy2DAsync(A_d, pitch_A, A_h, COLUMNS*sizeof(TestType),
-                               COLUMNS*sizeof(TestType), ROWS,
-                               hipMemcpyHostToDevice, stream));
-
-    // Performs D2D on same GPU device
-    HIP_CHECK(hipMemcpy2DAsync(B_d, pitch_B, A_d,
-                               pitch_A, COLUMNS*sizeof(TestType),
-                               ROWS, hipMemcpyDeviceToDevice, stream));
-
-    // hipMemcpy2DAsync Device to Host
-    HIP_CHECK(hipMemcpy2DAsync(B_h, COLUMNS*sizeof(TestType), B_d, pitch_B,
-                               COLUMNS*sizeof(TestType), ROWS,
-                               hipMemcpyDeviceToHost, stream));
-    HIP_CHECK(hipStreamSynchronize(stream));
-  }
-  SECTION("Calling Async apis with hipStreamPerThread") {
-    // Host to Device
-    HIP_CHECK(hipMemcpy2DAsync(A_d, pitch_A, A_h, COLUMNS*sizeof(TestType),
-                               COLUMNS*sizeof(TestType), ROWS,
-                               hipMemcpyHostToDevice, hipStreamPerThread));
-
-    // Performs D2D on same GPU device
-    HIP_CHECK(hipMemcpy2DAsync(B_d, pitch_B, A_d, pitch_A,
-                               COLUMNS*sizeof(TestType), ROWS,
-                               hipMemcpyDeviceToDevice, hipStreamPerThread));
-
-    // hipMemcpy2DAsync Device to Host
-    HIP_CHECK(hipMemcpy2DAsync(B_h, COLUMNS*sizeof(TestType), B_d, pitch_B,
-                               COLUMNS*sizeof(TestType), ROWS,
-                               hipMemcpyDeviceToHost, hipStreamPerThread));
-    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
+  SECTION("Device to Host") {
+    Memcpy2DDeviceToHostShell<async>(
+        std::bind(hipMemcpy2DAsync, _1, _2, _3, _4, _5, _6, _7, stream), stream);
  }

-  // Validating the result
-  REQUIRE(HipTest::checkArray<TestType>(A_h, B_h, COLUMNS, ROWS) == true);
-
-
-  // DeAllocating the memory
-  HIP_CHECK(hipFree(A_d));
-  HIP_CHECK(hipFree(B_d));
-  if (mem_type) {
-    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-                                  A_h, B_h, C_h, true);
-  } else {
-    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-                                  A_h, B_h, C_h, false);
-  }
-  HIP_CHECK(hipStreamDestroy(stream));
-}
-
-/**
- * Test Description
- * ------------------------
- *  - This testcases performs the following scenarios of hipMemcpy2DAsync API on Peer GPU
-      1. H2D-D2D-D2H for Host Memory<-->Device Memory
-      2. H2D-D2D-D2H for Pinned Host Memory<-->Device Memory
-
-      Input : "A_h" initialized based on data type
-               "A_h" --> "A_d" using H2D copy
-               "A_d" --> "X_d" using D2D copy
-               "X_d" --> "B_h" using D2H copy
-      Output: Validating A_h with B_h both should be equal for
-              the number of COLUMNS and ROWS copied
- * Test source
- * ------------------------
- *  - unit/memory/hipMemcpy2DAsync.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 5.2
- */
-
-TEMPLATE_TEST_CASE("Unit_hipMemcpy2DAsync_multiDevice-Host&PinnedMem", ""
-                   , int, float, double) {
-  CHECK_IMAGE_SUPPORT
-  auto mem_type = GENERATE(0, 1);
-  int numDevices = 0;
-  int canAccessPeer = 0;
-  TestType* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}, *A_d{nullptr};
-  size_t pitch_A;
-  size_t width{NUM_W * sizeof(TestType)};
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  hipStream_t stream;
-
-  if (numDevices > 1) {
-    HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1));
-    if (canAccessPeer) {
-      HIP_CHECK(hipSetDevice(0));
-      HIP_CHECK(hipStreamCreate(&stream));
-
-      // Allocating memory
-      if (mem_type) {
-        HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-            &A_h, &B_h, &C_h, NUM_W*NUM_H, true);
-      } else {
-        HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-            &A_h, &B_h, &C_h, NUM_W*NUM_H, false);
-      }
-      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-            &pitch_A, width, NUM_H));
-
-      // Initialize the data
-      HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, B_h, C_h);
-
-      // Host to Device
-      HIP_CHECK(hipMemcpy2DAsync(A_d, pitch_A, A_h, COLUMNS*sizeof(TestType),
-            COLUMNS*sizeof(TestType), ROWS, hipMemcpyHostToDevice, stream));
-
-      // Change device
-      HIP_CHECK(hipSetDevice(1));
-
-      char *X_d{nullptr};
-      size_t pitch_X;
-      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&X_d),
-            &pitch_X, width, NUM_H));
-
-      // Device to Device
-      HIP_CHECK(hipMemcpy2DAsync(X_d, pitch_X, A_d,
-            pitch_A, COLUMNS*sizeof(TestType),
-            ROWS, hipMemcpyDeviceToDevice, stream));
-
-      // Device to Host
-      HIP_CHECK(hipMemcpy2DAsync(B_h, COLUMNS*sizeof(TestType), X_d,
-                                 pitch_X, COLUMNS*sizeof(TestType), ROWS,
-                                 hipMemcpyDeviceToHost, stream));
-      HIP_CHECK(hipStreamSynchronize(stream));
-
-      // Validating the result
-      REQUIRE(HipTest::checkArray<TestType>(A_h, B_h, COLUMNS, ROWS) == true);
-
-      // DeAllocating the memory
-      HIP_CHECK(hipFree(A_d));
-      if (mem_type) {
-        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-            A_h, B_h, C_h, true);
-      } else {
-        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-            A_h, B_h, C_h, false);
-      }
-      HIP_CHECK(hipFree(X_d));
-      HIP_CHECK(hipStreamDestroy(stream));
-    } else {
-      SUCCEED("Machine does not seem to have P2P");
+  SECTION("Device to Device") {
+    SECTION("Peer access disabled") {
+      Memcpy2DDeviceToDeviceShell<async, false>(
+          std::bind(hipMemcpy2DAsync, _1, _2, _3, _4, _5, _6, _7, stream), stream);
    }
-  } else {
-    SUCCEED("skipped the testcase as no of devices is less than 2");
-  }
-}
-
-/**
- * Test Description
- * ------------------------
- *  - This testcases performs the following scenarios of hipMemcpy2DAsync API on Peer GPU
-      1. H2D-D2D-D2H for Host Memory<-->Device Memory
-      2. H2D-D2D-D2H for Pinned Host Memory<-->Device Memory
-      Memory is allocated in GPU-0 and Stream is created in GPU-1
-
-      Input : "A_h" initialized based on data type
-               "A_h" --> "A_d" using H2D copy
-               "A_d" --> "X_d" using D2D copy
-               "X_d" --> "B_h" using D2H copy
-      Output: Validating A_h with B_h both should be equal for
-              the number of COLUMNS and ROWS copied
- * Test source
- * ------------------------
- *  - unit/memory/hipMemcpy2DAsync.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 5.2
- */
-
-TEMPLATE_TEST_CASE("Unit_hipMemcpy2DAsync_multiDevice-StreamOnDiffDevice", ""
-                   , int, float, double) {
-  CHECK_IMAGE_SUPPORT
-  auto mem_type = GENERATE(0, 1);
-  int numDevices = 0;
-  int canAccessPeer = 0;
-  TestType* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}, *A_d{nullptr};
-  size_t pitch_A;
-  size_t width{NUM_W * sizeof(TestType)};
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  hipStream_t stream;
-
-  if (numDevices > 1) {
-    HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1));
-    if (canAccessPeer) {
-      HIP_CHECK(hipSetDevice(0));
-
-      // Allocating memory
-      if (mem_type) {
-        HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-            &A_h, &B_h, &C_h, NUM_W*NUM_H, true);
-      } else {
-        HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-            &A_h, &B_h, &C_h, NUM_W*NUM_H, false);
-      }
-      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-            &pitch_A, width, NUM_H));
-      char *X_d{nullptr};
-      size_t pitch_X;
-      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&X_d),
-            &pitch_X, width, NUM_H));
-
-      // Initialize the data
-      HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, B_h, C_h);
-
-      // Change device
-      HIP_CHECK(hipSetDevice(1));
-      HIP_CHECK(hipStreamCreate(&stream));
-
-      // Host to Device
-      HIP_CHECK(hipMemcpy2DAsync(A_d, pitch_A, A_h, COLUMNS*sizeof(TestType),
-            COLUMNS*sizeof(TestType), ROWS, hipMemcpyHostToDevice, stream));
-
-      // Device to Device
-      HIP_CHECK(hipMemcpy2DAsync(X_d, pitch_X, A_d,
-            pitch_A, COLUMNS*sizeof(TestType),
-            ROWS, hipMemcpyDeviceToDevice, stream));
-
-      // Device to Host
-      HIP_CHECK(hipMemcpy2DAsync(B_h, COLUMNS*sizeof(TestType), X_d,
-                                 pitch_X, COLUMNS*sizeof(TestType), ROWS,
-                                 hipMemcpyDeviceToHost, stream));
-      HIP_CHECK(hipStreamSynchronize(stream));
-
-      // Validating the result
-      REQUIRE(HipTest::checkArray<TestType>(A_h, B_h, COLUMNS, ROWS) == true);
-
-      // DeAllocating the memory
-      HIP_CHECK(hipFree(A_d));
-      if (mem_type) {
-        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-            A_h, B_h, C_h, true);
-      } else {
-        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-            A_h, B_h, C_h, false);
-      }
-      HIP_CHECK(hipFree(X_d));
-      HIP_CHECK(hipStreamDestroy(stream));
-    } else {
-      SUCCEED("Machine does not seem to have P2P");
-    }
-  } else {
-    SUCCEED("skipped the testcase as no of devices is less than 2");
-  }
-}
-
-/**
- * Test Description
- * ------------------------
- *  - This testcase verifies the null checks of hipMemcpy2DAsync API
-      1. hipMemcpy2DAsync API where Source Pitch is zero
-      2. hipMemcpy2DAsync API where Destination Pitch is zero
-      3. hipMemcpy2DAsync API where height is zero
-      4. hipMemcpy2DAsync API where width is zero
- * Test source
- * ------------------------
- *  - unit/memory/hipMemcpy2DAsync.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 5.2
- */
-
-TEST_CASE("Unit_hipMemcpy2DAsync_SizeCheck") {
-  CHECK_IMAGE_SUPPORT
-  HIP_CHECK(hipSetDevice(0));
-  int* A_h{nullptr}, *A_d{nullptr};
-  size_t pitch_A;
-  size_t width{NUM_W * sizeof(int)};
-  hipStream_t stream;
-  HIP_CHECK(hipStreamCreate(&stream));
-
-  // Allocating memory
-  HipTest::initArrays<int>(nullptr, nullptr, nullptr,
-      &A_h, nullptr, nullptr, NUM_W*NUM_H);
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-        &pitch_A, width, NUM_H));
-
-  // Initialize the data
-  HipTest::setDefaultData<int>(NUM_W*NUM_H, A_h, nullptr, nullptr);
-
-  SECTION("hipMemcpy2DAsync API where Source Pitch is zero") {
-    REQUIRE(hipMemcpy2DAsync(A_h, 0, A_d,
-            pitch_A, NUM_W, NUM_H,
-            hipMemcpyDeviceToHost, stream) != hipSuccess);
-  }
-
-  SECTION("hipMemcpy2DAsync API where Destination Pitch is zero") {
-    REQUIRE(hipMemcpy2DAsync(A_h, width, A_d,
-            0, NUM_W, NUM_H,
-            hipMemcpyDeviceToHost, stream) != hipSuccess);
-  }
-
-  SECTION("hipMemcpy2DAsync API where height is zero") {
-    REQUIRE(hipMemcpy2DAsync(A_h, width, A_d,
-            pitch_A, NUM_W, 0,
-            hipMemcpyDeviceToHost, stream) == hipSuccess);
-  }
-
-  SECTION("hipMemcpy2DAsync API where width is zero") {
-    REQUIRE(hipMemcpy2DAsync(A_h, width, A_d,
-            pitch_A, 0, NUM_H,
-            hipMemcpyDeviceToHost, stream) == hipSuccess);
-  }
-
-  // DeAllocating the memory
-  HIP_CHECK(hipFree(A_d));
-  free(A_h);
-}
-
-/**
- * Test Description
- * ------------------------
- *  - This testcase performs the negative scenarios of hipMemcpy2DAsync API
-      1. hipMemcpy2DAsync API by Passing nullptr to destination
-      2. hipMemcpy2DAsync API by Passing nullptr to source
-      3. hipMemcpy2DAsync API where width is > destination pitch
- * Test source
- * ------------------------
- *  - unit/memory/hipMemcpy2DAsync.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 5.2
- */
-
-TEST_CASE("Unit_hipMemcpy2DAsync_Negative") {
-  CHECK_IMAGE_SUPPORT
-  HIP_CHECK(hipSetDevice(0));
-  int* A_h{nullptr}, *A_d{nullptr};
-  size_t pitch_A;
-  size_t width{NUM_W * sizeof(int)};
-  hipStream_t stream;
-  HIP_CHECK(hipStreamCreate(&stream));
-
-  // Allocating memory
-  HipTest::initArrays<int>(nullptr, nullptr, nullptr,
-      &A_h, nullptr, nullptr, NUM_W*NUM_H);
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-        &pitch_A, width, NUM_H));
-
-  // Initialize the data
-  HipTest::setDefaultData<int>(NUM_W*NUM_H, A_h, nullptr, nullptr);
-
-  SECTION("hipMemcpy2DAsync API by Passing nullptr to destination") {
-    REQUIRE(hipMemcpy2DAsync(nullptr, width, A_d,
-            pitch_A, COLUMNS*sizeof(int), ROWS,
-            hipMemcpyDeviceToHost, stream) != hipSuccess);
-  }
-
-  SECTION("hipMemcpy2DAsync API by Passing nullptr to source") {
-    REQUIRE(hipMemcpy2DAsync(A_h, width, nullptr,
-            pitch_A, COLUMNS*sizeof(int), ROWS,
-            hipMemcpyDeviceToHost, stream) != hipSuccess);
-  }
-
-  SECTION("hipMemcpy2DAsync API where width is > destination pitch") {
-    REQUIRE(hipMemcpy2DAsync(A_h, 10, A_d, pitch_A,
-            COLUMNS*sizeof(int), ROWS,
-            hipMemcpyDeviceToHost, stream) != hipSuccess);
-  }
-
-  // DeAllocating the memory
-  HIP_CHECK(hipFree(A_d));
-  HIP_CHECK(hipStreamDestroy(stream));
-  free(A_h);
-}
-
-static void hipMemcpy2DAsync_Basic_Size_Test(size_t inc) {
-  constexpr int defaultProgramSize = 256 * 1024 * 1024;
-  constexpr int N = 2;
-  constexpr int value = 42;
-  int *in, *out, *dev;
-  size_t newSize = 0, inp = 0;
-  size_t size = sizeof(int) * N * inc;
-
-  size_t free, total;
-  HIP_CHECK(hipMemGetInfo(&free, &total));
-
-  if ( free < 2 * size )
-    newSize = ( free - defaultProgramSize ) / 2;
-  else
-    newSize = size;
-
-  INFO("Array size: " << size/1024.0/1024.0 << " MB or " << size << " Bytes.");
-  INFO("Free memory: " << free/1024.0/1024.0 << " MB or " << free << " Bytes");
-  INFO("NewSize:" << newSize/1024.0/1024.0 << "MB or " << newSize << " Bytes");
-
-  HIP_CHECK(hipHostMalloc(&in, newSize));
-  HIP_CHECK(hipHostMalloc(&out, newSize));
-  HIP_CHECK(hipMalloc(&dev, newSize));
-
-  inp = newSize / (sizeof(int) * N);
-  for (size_t i=0; i < N; i++) {
-    in[i * inp] = value;
-  }
-
-  size_t pitch = sizeof(int) * inp;
-
-  hipStream_t stream;
-  HIP_CHECK(hipStreamCreate(&stream));
-
-  HIP_CHECK(hipMemcpy2DAsync(dev, pitch, in, pitch, sizeof(int),
-                             N, hipMemcpyHostToDevice, stream));
-  HIP_CHECK(hipMemcpy2DAsync(out, pitch, dev, pitch, sizeof(int),
-                             N, hipMemcpyDeviceToHost, stream));
-  HIP_CHECK(hipStreamSynchronize(stream));
-
-  for (size_t i=0; i < N; i++) {
-    REQUIRE(out[i * inp] == value);
-  }
-
-  HIP_CHECK(hipFree(dev));
-  HIP_CHECK(hipHostFree(in));
-  HIP_CHECK(hipHostFree(out));
-  HIP_CHECK(hipStreamDestroy(stream));
-}
-
-/**
- * Test Description
- * ------------------------
- *  - This testcase performs multidevice size check on hipMemcpy2DAsync API
-      1. Verify hipMemcpy2DAsync with 1 << 20 size
-      2. Verify hipMemcpy2DAsync with 1 << 21 size
- * Test source
- * ------------------------
- *  - unit/memory/hipMemcpy2DAsync.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 6.0
- */
-
-TEST_CASE("Unit_hipMemcpy2DAsync_multiDevice_Basic_Size_Test") {
-  CHECK_IMAGE_SUPPORT
-  size_t input = 1 << 20;
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-
-  for (int i=0; i < numDevices; i++) {
-    HIP_CHECK(hipSetDevice(i));
-
-    SECTION("Verify hipMemcpy2DAsync with 1 << 20 size") {
-      hipMemcpy2DAsync_Basic_Size_Test(input);
-    }
-    SECTION("Verify hipMemcpy2DAsync with 1 << 21 size") {
-      input <<= 1;
-      hipMemcpy2DAsync_Basic_Size_Test(input);
+    SECTION("Peer access enabled") {
+      Memcpy2DDeviceToDeviceShell<async, true>(
+          std::bind(hipMemcpy2DAsync, _1, _2, _3, _4, _5, _6, _7, stream), stream);
    }
  }
+
+  SECTION("Host to Device") {
+    Memcpy2DHostToDeviceShell<async>(
+        std::bind(hipMemcpy2DAsync, _1, _2, _3, _4, _5, _6, _7, stream), stream);
+  }
+
+  SECTION("Host to Host") {
+    Memcpy2DHostToHostShell<async>(std::bind(hipMemcpy2DAsync, _1, _2, _3, _4, _5, _6, _7, stream),
+                                   stream);
+  }
+}
+
+TEST_CASE("Unit_hipMemcpy2DAsync_Positive_Synchronization_Behavior") {
+  using namespace std::placeholders;
+
+  HIP_CHECK(hipDeviceSynchronize());
+
+  SECTION("Host to Device") {
+    Memcpy2DHtoDSyncBehavior(std::bind(hipMemcpy2DAsync, _1, _2, _3, _4, _5, _6, _7, nullptr),
+                             false);
+  }
+
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-233
+  SECTION("Device to Pageable Host") {
+    Memcpy2DDtoHPageableSyncBehavior(
+        std::bind(hipMemcpy2DAsync, _1, _2, _3, _4, _5, _6, _7, nullptr), true);
+  }
+#endif
+
+  SECTION("Device to Pinned Host") {
+    Memcpy2DDtoHPinnedSyncBehavior(std::bind(hipMemcpy2DAsync, _1, _2, _3, _4, _5, _6, _7, nullptr),
+                                   false);
+  }
+
+  SECTION("Device to Device") {
+    Memcpy2DDtoDSyncBehavior(std::bind(hipMemcpy2DAsync, _1, _2, _3, _4, _5, _6, _7, nullptr),
+                             false);
+  }
+
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-233
+  SECTION("Host to Host") {
+    Memcpy2DHtoHSyncBehavior(std::bind(hipMemcpy2DAsync, _1, _2, _3, _4, _5, _6, _7, nullptr),
+                             true);
+  }
+#endif
+}
+
+TEST_CASE("Unit_hipMemcpy2DAsync_Positive_Parameters") {
+  using namespace std::placeholders;
+  constexpr bool async = true;
+  Memcpy2DZeroWidthHeight<async>(std::bind(hipMemcpy2DAsync, _1, _2, _3, _4, _5, _6, _7, nullptr));
+}
+
+TEST_CASE("Unit_hipMemcpy2DAsync_Negative_Parameters") {
+  constexpr size_t cols = 128;
+  constexpr size_t rows = 128;
+
+  constexpr auto NegativeTests = [](void* dst, size_t dpitch, const void* src, size_t spitch,
+                                    size_t width, size_t height, hipMemcpyKind kind) {
+    SECTION("dst == nullptr") {
+      HIP_CHECK_ERROR(hipMemcpy2DAsync(nullptr, dpitch, src, spitch, width, height, kind, nullptr),
+                      hipErrorInvalidValue);
+    }
+    SECTION("src == nullptr") {
+      HIP_CHECK_ERROR(hipMemcpy2DAsync(dst, dpitch, nullptr, spitch, width, height, kind, nullptr),
+                      hipErrorInvalidValue);
+    }
+    SECTION("dpitch < width") {
+      HIP_CHECK_ERROR(hipMemcpy2DAsync(dst, width - 1, src, spitch, width, height, kind, nullptr),
+                      hipErrorInvalidPitchValue);
+    }
+    SECTION("spitch < width") {
+      HIP_CHECK_ERROR(hipMemcpy2DAsync(dst, dpitch, src, width - 1, width, height, kind, nullptr),
+                      hipErrorInvalidPitchValue);
+    }
+    SECTION("dpitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      HIP_CHECK_ERROR(hipMemcpy2DAsync(dst, static_cast<size_t>(attr) + 1, src, spitch, width,
+                                       height, kind, nullptr),
+                      hipErrorInvalidValue);
+    }
+    SECTION("spitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      HIP_CHECK_ERROR(hipMemcpy2DAsync(dst, dpitch, src, static_cast<size_t>(attr) + 1, width,
+                                       height, kind, nullptr),
+                      hipErrorInvalidValue);
+    }
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-234
+    SECTION("Invalid MemcpyKind") {
+      HIP_CHECK_ERROR(hipMemcpy2DAsync(dst, dpitch, src, spitch, width, height,
+                                       static_cast<hipMemcpyKind>(-1), nullptr),
+                      hipErrorInvalidMemcpyDirection);
+    }
+#endif
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-235
+    SECTION("Invalid stream") {
+      StreamGuard stream_guard(Streams::created);
+      HIP_CHECK(hipStreamDestroy(stream_guard.stream()));
+      HIP_CHECK_ERROR(
+          hipMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream_guard.stream()),
+          hipErrorContextIsDestroyed);
+    }
+#endif
+  };
+
+  SECTION("Host to device") {
+    LinearAllocGuard2D<int> device_alloc(cols, rows);
+    LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, device_alloc.pitch() * rows);
+    NegativeTests(device_alloc.ptr(), device_alloc.pitch(), host_alloc.ptr(), device_alloc.pitch(),
+                  device_alloc.width(), device_alloc.height(), hipMemcpyHostToDevice);
+  }
+
+  SECTION("Device to host") {
+    LinearAllocGuard2D<int> device_alloc(cols, rows);
+    LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, device_alloc.pitch() * rows);
+    NegativeTests(host_alloc.ptr(), device_alloc.pitch(), device_alloc.ptr(), device_alloc.pitch(),
+                  device_alloc.width(), device_alloc.height(), hipMemcpyDeviceToHost);
+  }
+
+  SECTION("Host to host") {
+    LinearAllocGuard<int> src_alloc(LinearAllocs::hipHostMalloc, cols * rows * sizeof(int));
+    LinearAllocGuard<int> dst_alloc(LinearAllocs::hipHostMalloc, cols * rows * sizeof(int));
+    NegativeTests(dst_alloc.ptr(), cols * sizeof(int), src_alloc.ptr(), cols * sizeof(int),
+                  cols * sizeof(int), rows, hipMemcpyHostToHost);
+  }
+
+  SECTION("Device to device") {
+    LinearAllocGuard2D<int> src_alloc(cols, rows);
+    LinearAllocGuard2D<int> dst_alloc(cols, rows);
+    NegativeTests(dst_alloc.ptr(), dst_alloc.pitch(), src_alloc.ptr(), src_alloc.pitch(),
+                  dst_alloc.width(), dst_alloc.height(), hipMemcpyDeviceToDevice);
+  }
 }
@@ -0,0 +1,555 @@
+/*
+Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @addtogroup hipMemcpy2DAsync hipMemcpy2DAsync
+ * @{
+ * @ingroup MemcpyTest
+ * `hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src,
+ *                   size_t spitch, size_t width, size_t height,
+ *                   hipMemcpyKind kind, hipStream_t stream = 0 )` -
+ * Copies data between host and device.
+ */
+
+// Testcase Description:
+// 1) Verifies the working of Memcpy2DAsync API negative scenarios by
+//    Pass NULL to destination pointer
+//    Pass NULL to Source pointer
+//    Pass width greater than spitch/dpitch
+// 2) Verifies hipMemcpy2DAsync API by
+//    pass 0 to destionation pitch
+//    pass 0 to source pitch
+//    pass 0 to width
+//    pass 0 to height
+// 3) Verifies working of Memcpy2DAsync API on host memory
+//    and pinned host memory by
+//    performing D2H, D2D and H2D memory kind copies on same GPU
+// 4) Verifies working of Memcpy2DAsync API on host memory
+//    and pinned host memory by
+//    performing D2H, D2D and H2D memory kind copies on peer GPU
+// 5) Verifies working of Memcpy2DAsync API where memory is allocated
+//    in GPU-0 and stream is created on GPU-1
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+
+static constexpr auto NUM_W{16};
+static constexpr auto NUM_H{16};
+static constexpr auto COLUMNS{6};
+static constexpr auto ROWS{6};
+
+/**
+ * Test Description
+ * ------------------------
+ *  - This performs the following scenarios of hipMemcpy2DAsync API on same GPU
+      1. H2D-D2D-D2H for Host Memory<-->Device Memory
+      2. H2D-D2D-D2H for Pinned Host Memory<-->Device Memory
+
+      Input : "A_h" initialized based on data type
+         "A_h" --> "A_d" using H2D copy
+         "A_d" --> "B_d" using D2D copy
+         "B_d" --> "B_h" using D2H copy
+      Output: Validating A_h with B_h both should be equal for
+        the number of COLUMNS and ROWS copied
+ * Test source
+ * ------------------------
+ *  - unit/memory/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+
+TEMPLATE_TEST_CASE("Unit_hipMemcpy2DAsync_Host&PinnedMem", ""
+                   , int, float, double) {
+  CHECK_IMAGE_SUPPORT
+  // 1 refers to pinned host memory
+  auto mem_type = GENERATE(0, 1);
+  HIP_CHECK(hipSetDevice(0));
+  TestType  *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}, *A_d{nullptr},
+            *B_d{nullptr};
+  size_t pitch_A, pitch_B;
+  size_t width{NUM_W * sizeof(TestType)};
+  hipStream_t stream;
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  // Allocating memory
+  if (mem_type) {
+    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                  &A_h, &B_h, &C_h, NUM_W*NUM_H, true);
+  } else {
+    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                  &A_h, &B_h, &C_h, NUM_W*NUM_H, false);
+  }
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+                          &pitch_A, width, NUM_H));
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&B_d),
+                          &pitch_B, width, NUM_H));
+
+  // Initialize the data
+  HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, B_h, C_h);
+  SECTION("Calling Async apis with stream object created by user") {
+    // Host to Device
+    HIP_CHECK(hipMemcpy2DAsync(A_d, pitch_A, A_h, COLUMNS*sizeof(TestType),
+                               COLUMNS*sizeof(TestType), ROWS,
+                               hipMemcpyHostToDevice, stream));
+
+    // Performs D2D on same GPU device
+    HIP_CHECK(hipMemcpy2DAsync(B_d, pitch_B, A_d,
+                               pitch_A, COLUMNS*sizeof(TestType),
+                               ROWS, hipMemcpyDeviceToDevice, stream));
+
+    // hipMemcpy2DAsync Device to Host
+    HIP_CHECK(hipMemcpy2DAsync(B_h, COLUMNS*sizeof(TestType), B_d, pitch_B,
+                               COLUMNS*sizeof(TestType), ROWS,
+                               hipMemcpyDeviceToHost, stream));
+    HIP_CHECK(hipStreamSynchronize(stream));
+  }
+  SECTION("Calling Async apis with hipStreamPerThread") {
+    // Host to Device
+    HIP_CHECK(hipMemcpy2DAsync(A_d, pitch_A, A_h, COLUMNS*sizeof(TestType),
+                               COLUMNS*sizeof(TestType), ROWS,
+                               hipMemcpyHostToDevice, hipStreamPerThread));
+
+    // Performs D2D on same GPU device
+    HIP_CHECK(hipMemcpy2DAsync(B_d, pitch_B, A_d, pitch_A,
+                               COLUMNS*sizeof(TestType), ROWS,
+                               hipMemcpyDeviceToDevice, hipStreamPerThread));
+
+    // hipMemcpy2DAsync Device to Host
+    HIP_CHECK(hipMemcpy2DAsync(B_h, COLUMNS*sizeof(TestType), B_d, pitch_B,
+                               COLUMNS*sizeof(TestType), ROWS,
+                               hipMemcpyDeviceToHost, hipStreamPerThread));
+    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
+  }
+
+  // Validating the result
+  REQUIRE(HipTest::checkArray<TestType>(A_h, B_h, COLUMNS, ROWS) == true);
+
+
+  // DeAllocating the memory
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(B_d));
+  if (mem_type) {
+    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+                                  A_h, B_h, C_h, true);
+  } else {
+    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+                                  A_h, B_h, C_h, false);
+  }
+  HIP_CHECK(hipStreamDestroy(stream));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - This testcases performs the following scenarios of hipMemcpy2DAsync API on Peer GPU
+      1. H2D-D2D-D2H for Host Memory<-->Device Memory
+      2. H2D-D2D-D2H for Pinned Host Memory<-->Device Memory
+
+      Input : "A_h" initialized based on data type
+               "A_h" --> "A_d" using H2D copy
+               "A_d" --> "X_d" using D2D copy
+               "X_d" --> "B_h" using D2H copy
+      Output: Validating A_h with B_h both should be equal for
+              the number of COLUMNS and ROWS copied
+ * Test source
+ * ------------------------
+ *  - unit/memory/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+
+TEMPLATE_TEST_CASE("Unit_hipMemcpy2DAsync_multiDevice-Host&PinnedMem", ""
+                   , int, float, double) {
+  CHECK_IMAGE_SUPPORT
+  auto mem_type = GENERATE(0, 1);
+  int numDevices = 0;
+  int canAccessPeer = 0;
+  TestType* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}, *A_d{nullptr};
+  size_t pitch_A;
+  size_t width{NUM_W * sizeof(TestType)};
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  hipStream_t stream;
+
+  if (numDevices > 1) {
+    HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1));
+    if (canAccessPeer) {
+      HIP_CHECK(hipSetDevice(0));
+      HIP_CHECK(hipStreamCreate(&stream));
+
+      // Allocating memory
+      if (mem_type) {
+        HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+            &A_h, &B_h, &C_h, NUM_W*NUM_H, true);
+      } else {
+        HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+            &A_h, &B_h, &C_h, NUM_W*NUM_H, false);
+      }
+      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+            &pitch_A, width, NUM_H));
+
+      // Initialize the data
+      HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, B_h, C_h);
+
+      // Host to Device
+      HIP_CHECK(hipMemcpy2DAsync(A_d, pitch_A, A_h, COLUMNS*sizeof(TestType),
+            COLUMNS*sizeof(TestType), ROWS, hipMemcpyHostToDevice, stream));
+
+      // Change device
+      HIP_CHECK(hipSetDevice(1));
+
+      char *X_d{nullptr};
+      size_t pitch_X;
+      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&X_d),
+            &pitch_X, width, NUM_H));
+
+      // Device to Device
+      HIP_CHECK(hipMemcpy2DAsync(X_d, pitch_X, A_d,
+            pitch_A, COLUMNS*sizeof(TestType),
+            ROWS, hipMemcpyDeviceToDevice, stream));
+
+      // Device to Host
+      HIP_CHECK(hipMemcpy2DAsync(B_h, COLUMNS*sizeof(TestType), X_d,
+                                 pitch_X, COLUMNS*sizeof(TestType), ROWS,
+                                 hipMemcpyDeviceToHost, stream));
+      HIP_CHECK(hipStreamSynchronize(stream));
+
+      // Validating the result
+      REQUIRE(HipTest::checkArray<TestType>(A_h, B_h, COLUMNS, ROWS) == true);
+
+      // DeAllocating the memory
+      HIP_CHECK(hipFree(A_d));
+      if (mem_type) {
+        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+            A_h, B_h, C_h, true);
+      } else {
+        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+            A_h, B_h, C_h, false);
+      }
+      HIP_CHECK(hipFree(X_d));
+      HIP_CHECK(hipStreamDestroy(stream));
+    } else {
+      SUCCEED("Machine does not seem to have P2P");
+    }
+  } else {
+    SUCCEED("skipped the testcase as no of devices is less than 2");
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - This testcases performs the following scenarios of hipMemcpy2DAsync API on Peer GPU
+      1. H2D-D2D-D2H for Host Memory<-->Device Memory
+      2. H2D-D2D-D2H for Pinned Host Memory<-->Device Memory
+      Memory is allocated in GPU-0 and Stream is created in GPU-1
+
+      Input : "A_h" initialized based on data type
+               "A_h" --> "A_d" using H2D copy
+               "A_d" --> "X_d" using D2D copy
+               "X_d" --> "B_h" using D2H copy
+      Output: Validating A_h with B_h both should be equal for
+              the number of COLUMNS and ROWS copied
+ * Test source
+ * ------------------------
+ *  - unit/memory/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+
+TEMPLATE_TEST_CASE("Unit_hipMemcpy2DAsync_multiDevice-StreamOnDiffDevice", ""
+                   , int, float, double) {
+  CHECK_IMAGE_SUPPORT
+  auto mem_type = GENERATE(0, 1);
+  int numDevices = 0;
+  int canAccessPeer = 0;
+  TestType* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}, *A_d{nullptr};
+  size_t pitch_A;
+  size_t width{NUM_W * sizeof(TestType)};
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  hipStream_t stream;
+
+  if (numDevices > 1) {
+    HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1));
+    if (canAccessPeer) {
+      HIP_CHECK(hipSetDevice(0));
+
+      // Allocating memory
+      if (mem_type) {
+        HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+            &A_h, &B_h, &C_h, NUM_W*NUM_H, true);
+      } else {
+        HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+            &A_h, &B_h, &C_h, NUM_W*NUM_H, false);
+      }
+      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+            &pitch_A, width, NUM_H));
+      char *X_d{nullptr};
+      size_t pitch_X;
+      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&X_d),
+            &pitch_X, width, NUM_H));
+
+      // Initialize the data
+      HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, B_h, C_h);
+
+      // Change device
+      HIP_CHECK(hipSetDevice(1));
+      HIP_CHECK(hipStreamCreate(&stream));
+
+      // Host to Device
+      HIP_CHECK(hipMemcpy2DAsync(A_d, pitch_A, A_h, COLUMNS*sizeof(TestType),
+            COLUMNS*sizeof(TestType), ROWS, hipMemcpyHostToDevice, stream));
+
+      // Device to Device
+      HIP_CHECK(hipMemcpy2DAsync(X_d, pitch_X, A_d,
+            pitch_A, COLUMNS*sizeof(TestType),
+            ROWS, hipMemcpyDeviceToDevice, stream));
+
+      // Device to Host
+      HIP_CHECK(hipMemcpy2DAsync(B_h, COLUMNS*sizeof(TestType), X_d,
+                                 pitch_X, COLUMNS*sizeof(TestType), ROWS,
+                                 hipMemcpyDeviceToHost, stream));
+      HIP_CHECK(hipStreamSynchronize(stream));
+
+      // Validating the result
+      REQUIRE(HipTest::checkArray<TestType>(A_h, B_h, COLUMNS, ROWS) == true);
+
+      // DeAllocating the memory
+      HIP_CHECK(hipFree(A_d));
+      if (mem_type) {
+        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+            A_h, B_h, C_h, true);
+      } else {
+        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+            A_h, B_h, C_h, false);
+      }
+      HIP_CHECK(hipFree(X_d));
+      HIP_CHECK(hipStreamDestroy(stream));
+    } else {
+      SUCCEED("Machine does not seem to have P2P");
+    }
+  } else {
+    SUCCEED("skipped the testcase as no of devices is less than 2");
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - This testcase verifies the null checks of hipMemcpy2DAsync API
+      1. hipMemcpy2DAsync API where Source Pitch is zero
+      2. hipMemcpy2DAsync API where Destination Pitch is zero
+      3. hipMemcpy2DAsync API where height is zero
+      4. hipMemcpy2DAsync API where width is zero
+ * Test source
+ * ------------------------
+ *  - unit/memory/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+
+TEST_CASE("Unit_hipMemcpy2DAsync_SizeCheck") {
+  CHECK_IMAGE_SUPPORT
+  HIP_CHECK(hipSetDevice(0));
+  int* A_h{nullptr}, *A_d{nullptr};
+  size_t pitch_A;
+  size_t width{NUM_W * sizeof(int)};
+  hipStream_t stream;
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  // Allocating memory
+  HipTest::initArrays<int>(nullptr, nullptr, nullptr,
+      &A_h, nullptr, nullptr, NUM_W*NUM_H);
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+        &pitch_A, width, NUM_H));
+
+  // Initialize the data
+  HipTest::setDefaultData<int>(NUM_W*NUM_H, A_h, nullptr, nullptr);
+
+  SECTION("hipMemcpy2DAsync API where Source Pitch is zero") {
+    REQUIRE(hipMemcpy2DAsync(A_h, 0, A_d,
+            pitch_A, NUM_W, NUM_H,
+            hipMemcpyDeviceToHost, stream) != hipSuccess);
+  }
+
+  SECTION("hipMemcpy2DAsync API where Destination Pitch is zero") {
+    REQUIRE(hipMemcpy2DAsync(A_h, width, A_d,
+            0, NUM_W, NUM_H,
+            hipMemcpyDeviceToHost, stream) != hipSuccess);
+  }
+
+  SECTION("hipMemcpy2DAsync API where height is zero") {
+    REQUIRE(hipMemcpy2DAsync(A_h, width, A_d,
+            pitch_A, NUM_W, 0,
+            hipMemcpyDeviceToHost, stream) == hipSuccess);
+  }
+
+  SECTION("hipMemcpy2DAsync API where width is zero") {
+    REQUIRE(hipMemcpy2DAsync(A_h, width, A_d,
+            pitch_A, 0, NUM_H,
+            hipMemcpyDeviceToHost, stream) == hipSuccess);
+  }
+
+  // DeAllocating the memory
+  HIP_CHECK(hipFree(A_d));
+  free(A_h);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - This testcase performs the negative scenarios of hipMemcpy2DAsync API
+      1. hipMemcpy2DAsync API by Passing nullptr to destination
+      2. hipMemcpy2DAsync API by Passing nullptr to source
+      3. hipMemcpy2DAsync API where width is > destination pitch
+ * Test source
+ * ------------------------
+ *  - unit/memory/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+
+TEST_CASE("Unit_hipMemcpy2DAsync_Negative") {
+  CHECK_IMAGE_SUPPORT
+  HIP_CHECK(hipSetDevice(0));
+  int* A_h{nullptr}, *A_d{nullptr};
+  size_t pitch_A;
+  size_t width{NUM_W * sizeof(int)};
+  hipStream_t stream;
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  // Allocating memory
+  HipTest::initArrays<int>(nullptr, nullptr, nullptr,
+      &A_h, nullptr, nullptr, NUM_W*NUM_H);
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+        &pitch_A, width, NUM_H));
+
+  // Initialize the data
+  HipTest::setDefaultData<int>(NUM_W*NUM_H, A_h, nullptr, nullptr);
+
+  SECTION("hipMemcpy2DAsync API by Passing nullptr to destination") {
+    REQUIRE(hipMemcpy2DAsync(nullptr, width, A_d,
+            pitch_A, COLUMNS*sizeof(int), ROWS,
+            hipMemcpyDeviceToHost, stream) != hipSuccess);
+  }
+
+  SECTION("hipMemcpy2DAsync API by Passing nullptr to source") {
+    REQUIRE(hipMemcpy2DAsync(A_h, width, nullptr,
+            pitch_A, COLUMNS*sizeof(int), ROWS,
+            hipMemcpyDeviceToHost, stream) != hipSuccess);
+  }
+
+  SECTION("hipMemcpy2DAsync API where width is > destination pitch") {
+    REQUIRE(hipMemcpy2DAsync(A_h, 10, A_d, pitch_A,
+            COLUMNS*sizeof(int), ROWS,
+            hipMemcpyDeviceToHost, stream) != hipSuccess);
+  }
+
+  // DeAllocating the memory
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipStreamDestroy(stream));
+  free(A_h);
+}
+
+static void hipMemcpy2DAsync_Basic_Size_Test(size_t inc) {
+  constexpr int defaultProgramSize = 256 * 1024 * 1024;
+  constexpr int N = 2;
+  constexpr int value = 42;
+  int *in, *out, *dev;
+  size_t newSize = 0, inp = 0;
+  size_t size = sizeof(int) * N * inc;
+
+  size_t free, total;
+  HIP_CHECK(hipMemGetInfo(&free, &total));
+
+  if ( free < 2 * size )
+    newSize = ( free - defaultProgramSize ) / 2;
+  else
+    newSize = size;
+
+  INFO("Array size: " << size/1024.0/1024.0 << " MB or " << size << " Bytes.");
+  INFO("Free memory: " << free/1024.0/1024.0 << " MB or " << free << " Bytes");
+  INFO("NewSize:" << newSize/1024.0/1024.0 << "MB or " << newSize << " Bytes");
+
+  HIP_CHECK(hipHostMalloc(&in, newSize));
+  HIP_CHECK(hipHostMalloc(&out, newSize));
+  HIP_CHECK(hipMalloc(&dev, newSize));
+
+  inp = newSize / (sizeof(int) * N);
+  for (size_t i=0; i < N; i++) {
+    in[i * inp] = value;
+  }
+
+  size_t pitch = sizeof(int) * inp;
+
+  hipStream_t stream;
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  HIP_CHECK(hipMemcpy2DAsync(dev, pitch, in, pitch, sizeof(int),
+                             N, hipMemcpyHostToDevice, stream));
+  HIP_CHECK(hipMemcpy2DAsync(out, pitch, dev, pitch, sizeof(int),
+                             N, hipMemcpyDeviceToHost, stream));
+  HIP_CHECK(hipStreamSynchronize(stream));
+
+  for (size_t i=0; i < N; i++) {
+    REQUIRE(out[i * inp] == value);
+  }
+
+  HIP_CHECK(hipFree(dev));
+  HIP_CHECK(hipHostFree(in));
+  HIP_CHECK(hipHostFree(out));
+  HIP_CHECK(hipStreamDestroy(stream));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - This testcase performs multidevice size check on hipMemcpy2DAsync API
+      1. Verify hipMemcpy2DAsync with 1 << 20 size
+      2. Verify hipMemcpy2DAsync with 1 << 21 size
+ * Test source
+ * ------------------------
+ *  - unit/memory/hipMemcpy2DAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.0
+ */
+
+TEST_CASE("Unit_hipMemcpy2DAsync_multiDevice_Basic_Size_Test") {
+  CHECK_IMAGE_SUPPORT
+  size_t input = 1 << 20;
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+
+  for (int i=0; i < numDevices; i++) {
+    HIP_CHECK(hipSetDevice(i));
+
+    SECTION("Verify hipMemcpy2DAsync with 1 << 20 size") {
+      hipMemcpy2DAsync_Basic_Size_Test(input);
+    }
+    SECTION("Verify hipMemcpy2DAsync with 1 << 21 size") {
+      input <<= 1;
+      hipMemcpy2DAsync_Basic_Size_Test(input);
+    }
+  }
+}
@@ -0,0 +1,496 @@
+/*
+Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @addtogroup hipMemcpy2D hipMemcpy2D
+ * @{
+ * @ingroup MemcpyTest
+ * `hipMemcpy2D(void* dst, size_t dpitch, const void* src,
+ *              size_t spitch, size_t width, size_t height,
+ *              hipMemcpyKind kind)` -
+ * Copies data between host and device.
+ */
+
+// Testcase Description:
+// 1) Verifies the working of Memcpy2D API negative scenarios by
+//    Pass NULL to destination pointer
+//    Pass NULL to Source pointer
+//    Pass width greater than spitch/dpitch
+// 2) Verifies hipMemcpy2D API by
+//    pass 0 to destionation pitch
+//    pass 0 to source pitch
+//    pass 0 to width
+//    pass 0 to height
+// 3) Verifies working of Memcpy2D API on host memory and pinned host memory by
+//    performing D2H, D2D and H2D memory kind copies on same GPU
+// 4) Verifies working of Memcpy2D API for the following scenarios
+//      H2D-D2D-D2H on host and device memory
+//      H2D-D2D-D2H on pinned host and device memory
+//      H2D-D2D-D2H functionalities where memory is allocated in GPU-0
+//      and API is triggered from GPU-1
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+
+static constexpr auto NUM_W{16};
+static constexpr auto NUM_H{16};
+static constexpr auto COLUMNS{8};
+static constexpr auto ROWS{8};
+
+/**
+ * Test Description
+ * ------------------------
+ *  - This testcases performs the following scenarios of hipMemcpy2D API on same GPU
+    1. H2D-D2D-D2H for Host Memory<-->Device Memory
+    2. H2D-D2D-D2H for Pinned Host Memory<-->Device Memory
+
+    Input : "A_h" initialized based on data type
+             "A_h" --> "A_d" using H2D copy
+             "A_d" --> "B_d" using D2D copy
+             "B_d" --> "B_h" using D2H copy
+    Output: Validating A_h with B_h both should be equal for
+            the number of COLUMNS and ROWS copied
+ * Test source
+ * ------------------------
+ *  - unit/memory/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.0
+ */
+
+TEMPLATE_TEST_CASE("Unit_hipMemcpy2D_H2D-D2D-D2H", ""
+                   , int, float, double) {
+  CHECK_IMAGE_SUPPORT
+  // 1 refers to pinned host memory
+  auto mem_type = GENERATE(0, 1);
+  HIP_CHECK(hipSetDevice(0));
+  TestType  *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}, *A_d{nullptr},
+            *B_d{nullptr};
+  size_t pitch_A, pitch_B;
+  size_t width{NUM_W * sizeof(TestType)};
+
+  // Allocating memory
+  if (mem_type) {
+    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                  &A_h, &B_h, &C_h, NUM_W*NUM_H, true);
+  } else {
+    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                  &A_h, &B_h, &C_h, NUM_W*NUM_H, false);
+  }
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+                          &pitch_A, width, NUM_H));
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&B_d),
+                          &pitch_B, width, NUM_H));
+
+  // Initialize the data
+  HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, B_h, C_h);
+
+  // Host to Device
+  HIP_CHECK(hipMemcpy2D(A_d, pitch_A, A_h, COLUMNS*sizeof(TestType),
+                        COLUMNS*sizeof(TestType), ROWS,
+                        hipMemcpyHostToDevice));
+
+  // Performs D2D on same GPU device
+  HIP_CHECK(hipMemcpy2D(B_d, pitch_B, A_d,
+                        pitch_A, COLUMNS*sizeof(TestType),
+                        ROWS, hipMemcpyDeviceToDevice));
+
+  // hipMemcpy2D Device to Host
+  HIP_CHECK(hipMemcpy2D(B_h, COLUMNS*sizeof(TestType), B_d, pitch_B,
+                        COLUMNS*sizeof(TestType), ROWS,
+                        hipMemcpyDeviceToHost));
+
+  // Validating the result
+  REQUIRE(HipTest::checkArray<TestType>(A_h, B_h, COLUMNS, ROWS) == true);
+
+  // DeAllocating the memory
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(B_d));
+  if (mem_type) {
+    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+                                  A_h, B_h, C_h, true);
+  } else {
+    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+                                  A_h, B_h, C_h, false);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - This testcase performs the following scenarios of hipMemcpy2D API on same GPU.
+    1. H2D-D2D-D2H for Host Memory<-->Device Memory
+    2. H2D-D2D-D2H for Pinned Host Memory<-->Device Memory
+    The src and dst input pointers to hipMemCpy2D add an offset to the pointers
+    returned by the allocation functions.
+
+    Input : "A_h" initialized based on data type
+             "A_h" --> "A_d" using H2D copy
+             "A_d" --> "B_d" using D2D copy
+             "B_d" --> "B_h" using D2H copy
+    Output: Validating A_h with B_h both should be equal for
+            the number of COLUMNS and ROWS copied
+ * Test source
+ * ------------------------
+ *  - unit/memory/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.0
+ */
+
+TEMPLATE_TEST_CASE("Unit_hipMemcpy2D_H2D-D2D-D2H_WithOffset", ""
+                   , int, float, double) {
+  CHECK_IMAGE_SUPPORT
+  // 1 refers to pinned host memory
+  auto mem_type = GENERATE(0, 1);
+  HIP_CHECK(hipSetDevice(0));
+  TestType  *A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}, *A_d{nullptr},
+            *B_d{nullptr};
+  size_t pitch_A, pitch_B;
+  size_t width{NUM_W * sizeof(TestType)};
+
+  // Allocating memory
+  if (mem_type) {
+    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                  &A_h, &B_h, &C_h, NUM_W*NUM_H, true);
+  } else {
+    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                  &A_h, &B_h, &C_h, NUM_W*NUM_H, false);
+  }
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+                          &pitch_A, width, NUM_H));
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&B_d),
+                          &pitch_B, width, NUM_H));
+
+  // Initialize the data
+  HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, B_h, C_h);
+
+  // Host to Device
+  HIP_CHECK(hipMemcpy2D(A_d+COLUMNS*sizeof(TestType), pitch_A, A_h,
+                        COLUMNS*sizeof(TestType), COLUMNS*sizeof(TestType),
+                        ROWS, hipMemcpyHostToDevice));
+
+  // Performs D2D on same GPU device
+  HIP_CHECK(hipMemcpy2D(B_d+COLUMNS*sizeof(TestType), pitch_B,
+                        A_d+COLUMNS*sizeof(TestType),
+                        pitch_A, COLUMNS*sizeof(TestType),
+                        ROWS, hipMemcpyDeviceToDevice));
+
+  // hipMemcpy2D Device to Host
+  HIP_CHECK(hipMemcpy2D(B_h, COLUMNS*sizeof(TestType),
+                        B_d+COLUMNS*sizeof(TestType), pitch_B,
+                        COLUMNS*sizeof(TestType), ROWS,
+                        hipMemcpyDeviceToHost));
+
+
+  // Validating the result
+  REQUIRE(HipTest::checkArray<TestType>(A_h, B_h, COLUMNS, ROWS) == true);
+
+
+  // DeAllocating the memory
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(B_d));
+  if (mem_type) {
+    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+                                  A_h, B_h, C_h, true);
+  } else {
+    HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+                                  A_h, B_h, C_h, false);
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - This testcases performs the following scenarios of hipMemcpy2D API on Peer GPU
+    1. H2D-D2D-D2H for Host Memory<-->Device Memory
+    2. H2D-D2D-D2H for Pinned Host Memory<-->Device Memory
+    3. Device context change where memory is allocated in GPU-0
+       and API is trigerred from GPU-1
+
+    Input : "A_h" initialized based on data type
+             "A_h" --> "A_d" using H2D copy
+             "A_d" --> "X_d" using D2D copy
+             "X_d" --> "B_h" using D2H copy
+    Output: Validating A_h with B_h both should be equal for
+            the number of COLUMNS and ROWS copied
+ * Test source
+ * ------------------------
+ *  - unit/memory/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.0
+ */
+
+TEMPLATE_TEST_CASE("Unit_hipMemcpy2D_multiDevice-D2D", ""
+                   , int, float, double) {
+  CHECK_IMAGE_SUPPORT
+  auto mem_type = GENERATE(0, 1);
+  int numDevices = 0;
+  int canAccessPeer = 0;
+  TestType* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr}, *A_d{nullptr};
+  size_t pitch_A;
+  size_t width{NUM_W * sizeof(TestType)};
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    HIP_CHECK(hipDeviceCanAccessPeer(&canAccessPeer, 0, 1));
+    if (canAccessPeer) {
+      HIP_CHECK(hipSetDevice(0));
+
+      // Allocating memory
+      if (mem_type) {
+        HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+            &A_h, &B_h, &C_h, NUM_W*NUM_H, true);
+      } else {
+        HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+            &A_h, &B_h, &C_h, NUM_W*NUM_H, false);
+      }
+      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+            &pitch_A, width, NUM_H));
+
+      // Initialize the data
+      HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, B_h, C_h);
+
+      char *X_d{nullptr};
+      size_t pitch_X;
+      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&X_d),
+                               &pitch_X, width, NUM_H));
+
+      // Change device
+      HIP_CHECK(hipSetDevice(1));
+
+      // Host to Device
+      HIP_CHECK(hipMemcpy2D(A_d, pitch_A, A_h, COLUMNS*sizeof(TestType),
+            COLUMNS*sizeof(TestType), ROWS, hipMemcpyHostToDevice));
+
+      // Device to Device
+      HIP_CHECK(hipMemcpy2D(X_d, pitch_X, A_d,
+            pitch_A, COLUMNS*sizeof(TestType),
+            ROWS, hipMemcpyDeviceToDevice));
+
+      // Device to Host
+      HIP_CHECK(hipMemcpy2D(B_h, COLUMNS*sizeof(TestType), X_d,
+            pitch_X, COLUMNS*sizeof(TestType), ROWS, hipMemcpyDeviceToHost));
+
+      // Validating the result
+      REQUIRE(HipTest::checkArray<TestType>(A_h, B_h, COLUMNS, ROWS) == true);
+
+      // DeAllocating the memory
+      HIP_CHECK(hipFree(A_d));
+      if (mem_type) {
+        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+            A_h, B_h, C_h, true);
+      } else {
+        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+            A_h, B_h, C_h, false);
+      }
+      HIP_CHECK(hipFree(X_d));
+    } else {
+      SUCCEED("Machine does not seem to have P2P");
+    }
+  } else {
+    SUCCEED("skipped the testcase as no of devices is less than 2");
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - This Testcase verifies the null size checks of hipMemcpy2D API
+ * Test source
+ * ------------------------
+ *  - unit/memory/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.0
+ */
+
+TEST_CASE("Unit_hipMemcpy2D_SizeCheck") {
+  CHECK_IMAGE_SUPPORT
+  HIP_CHECK(hipSetDevice(0));
+  int* A_h{nullptr}, *A_d{nullptr};
+  size_t pitch_A;
+  size_t width{NUM_W * sizeof(int)};
+
+  // Allocating memory
+  HipTest::initArrays<int>(nullptr, nullptr, nullptr,
+      &A_h, nullptr, nullptr, NUM_W*NUM_H);
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+        &pitch_A, width, NUM_H));
+
+  // Initialize the data
+  HipTest::setDefaultData<int>(NUM_W*NUM_H, A_h, nullptr, nullptr);
+
+  SECTION("hipMemcpy2D API where Source Pitch is zero") {
+    REQUIRE(hipMemcpy2D(A_h, 0, A_d,
+            pitch_A, NUM_W, NUM_H,
+            hipMemcpyDeviceToHost) != hipSuccess);
+  }
+
+  SECTION("hipMemcpy2D API where Destination Pitch is zero") {
+    REQUIRE(hipMemcpy2D(A_h, width, A_d,
+            0, NUM_W, NUM_H,
+            hipMemcpyDeviceToHost) != hipSuccess);
+  }
+
+  SECTION("hipMemcpy2D API where height is zero") {
+    REQUIRE(hipMemcpy2D(A_h, width, A_d,
+            pitch_A, NUM_W, 0,
+            hipMemcpyDeviceToHost) == hipSuccess);
+  }
+
+  SECTION("hipMemcpy2D API where width is zero") {
+    REQUIRE(hipMemcpy2D(A_h, width, A_d,
+            pitch_A, 0, NUM_H,
+            hipMemcpyDeviceToHost) == hipSuccess);
+  }
+
+  // DeAllocating the memory
+  HIP_CHECK(hipFree(A_d));
+  free(A_h);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - This Testcase verifies all the negative scenarios of hipMemcpy2D API
+ * Test source
+ * ------------------------
+ *  - unit/memory/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.0
+ */
+
+TEST_CASE("Unit_hipMemcpy2D_Negative") {
+  CHECK_IMAGE_SUPPORT
+  HIP_CHECK(hipSetDevice(0));
+  int* A_h{nullptr}, *A_d{nullptr};
+  size_t pitch_A;
+  size_t width{NUM_W * sizeof(int)};
+
+  // Allocating memory
+  HipTest::initArrays<int>(nullptr, nullptr, nullptr,
+      &A_h, nullptr, nullptr, NUM_W*NUM_H);
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+        &pitch_A, width, NUM_H));
+
+  // Initialize the data
+  HipTest::setDefaultData<int>(NUM_W*NUM_H, A_h, nullptr, nullptr);
+
+  SECTION("hipMemcpy2D API by Passing nullptr to destination") {
+    REQUIRE(hipMemcpy2D(nullptr, width, A_d,
+          pitch_A, COLUMNS*sizeof(int), ROWS,
+          hipMemcpyDeviceToHost) != hipSuccess);
+  }
+
+  SECTION("hipMemcpy2D API by Passing nullptr to destination") {
+    REQUIRE(hipMemcpy2D(nullptr, width, nullptr,
+          pitch_A, COLUMNS*sizeof(int), ROWS,
+          hipMemcpyDeviceToHost) != hipSuccess);
+  }
+
+  SECTION("hipMemcpy2D API where width is greater than destination pitch") {
+    REQUIRE(hipMemcpy2D(A_h, 10, A_d, pitch_A,
+          COLUMNS*sizeof(int), ROWS,
+          hipMemcpyDeviceToHost) != hipSuccess);
+  }
+
+  // DeAllocating the memory
+  HIP_CHECK(hipFree(A_d));
+  free(A_h);
+}
+
+static void hipMemcpy2D_Basic_Size_Test(size_t inc) {
+  constexpr int defaultProgramSize = 256 * 1024 * 1024;
+  constexpr int N = 2;
+  constexpr int value = 42;
+  int *in, *out, *dev;
+  size_t newSize = 0, inp = 0;
+  size_t size = sizeof(int) * N * inc;
+
+  size_t free, total;
+  HIP_CHECK(hipMemGetInfo(&free, &total));
+
+  if ( free < 2 * size )
+    newSize = ( free - defaultProgramSize ) / 2;
+  else
+    newSize = size;
+
+  INFO("Array size: " << size/1024.0/1024.0 << " MB or " << size << " Bytes.");
+  INFO("Free memory: " << free/1024.0/1024.0 << " MB or " << free << " Bytes");
+  INFO("NewSize:" << newSize/1024.0/1024.0 << "MB or " << newSize << " Bytes");
+
+  HIP_CHECK(hipHostMalloc(&in, newSize));
+  HIP_CHECK(hipHostMalloc(&out, newSize));
+  HIP_CHECK(hipMalloc(&dev, newSize));
+
+  inp = newSize / (sizeof(int) * N);
+  for (size_t i=0; i < N; i++) {
+    in[i * inp] = value;
+  }
+
+  size_t pitch = sizeof(int) * inp;
+
+  HIP_CHECK(hipMemcpy2D(dev, pitch, in, pitch, sizeof(int),
+                        N, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy2D(out, pitch, dev, pitch, sizeof(int),
+                        N, hipMemcpyDeviceToHost));
+
+  for (size_t i=0; i < N; i++) {
+    REQUIRE(out[i * inp] == value);
+  }
+
+  HIP_CHECK(hipFree(dev));
+  HIP_CHECK(hipHostFree(in));
+  HIP_CHECK(hipHostFree(out));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - This testcase performs multidevice size check on hipMemcpy2D API
+      1. Verify hipMemcpy2D with 1 << 20 size
+      2. Verify hipMemcpy2D with 1 << 21 size
+ * Test source
+ * ------------------------
+ *  - unit/memory/hipMemcpy2D.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 6.0
+ */
+
+TEST_CASE("Unit_hipMemcpy2D_multiDevice_Basic_Size_Test") {
+  CHECK_IMAGE_SUPPORT
+  size_t input = 1 << 20;
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+
+  for (int i=0; i < numDevices; i++) {
+    HIP_CHECK(hipSetDevice(i));
+
+    SECTION("Verify hipMemcpy2D with 1 << 20 size") {
+      hipMemcpy2D_Basic_Size_Test(input);
+    }
+    SECTION("Verify hipMemcpy2D with 1 << 21 size") {
+      input <<= 1;
+      hipMemcpy2D_Basic_Size_Test(input);
+    }
+  }
+}
@@ -1,337 +1,195 @@
 /*
-Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/*
-This testfile verifies the following scenarios of hipMemcpyParam2D API
-1. Negative Scenarios
-2. Extent Validation Scenarios
-3. D2D copy for different datatypes
-4. H2D and D2H copy for different datatypes
-*/
+#include "memcpy2d_tests_common.hh"

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
+#include <hip/hip_runtime_api.h>
+#include <resource_guards.hh>
+#include <utils.hh>

-static constexpr size_t NUM_W{10};
-static constexpr size_t NUM_H{10};
-/*
- * This testcase verifies D2D functionality of hipMemcpyParam2D API
- * Input: Intializing "A_d" device variable with "C_h" host variable
- * Output: "A_d" device variable to "E_d" device variable
- *
- * Validating the result by copying "E_d" to "A_h" and checking
- * it with the initalized data "C_h".
- *
- */
-TEMPLATE_TEST_CASE("Unit_hipMemcpyParam2D_multiDevice-D2D", "[hipMemcpyParam2D]", char, float, int,
-                   double, long double) {
-  CHECK_IMAGE_SUPPORT
+TEST_CASE("Unit_hipMemcpyParam2D_Positive_Basic") {
+  constexpr bool async = false;

-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    // Initialize and Allocating Memory
-    HIP_CHECK(hipSetDevice(0));
-    TestType* A_h{nullptr}, *C_h{nullptr}, *A_d{nullptr};
-    size_t pitch_A;
-    size_t width{NUM_W * sizeof(TestType)};
-    HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-                            &pitch_A, width, NUM_H));
-    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                  &A_h, nullptr, &C_h,
-                                  width*NUM_H, false);
-    HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, nullptr, C_h);
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-236
+  SECTION("Device to Host") { Memcpy2DDeviceToHostShell<async>(MemcpyParam2DAdapter<async>()); }
+#endif

-    int peerAccess = 0;
-    HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
-    if (!peerAccess) {
-      SUCCEED("Skipped the test as there is no peer access");
-    } else {
-      HIP_CHECK(hipSetDevice(1));
-      char *E_d;
-      size_t pitch_E;
-      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&E_d),
-            &pitch_E, width, NUM_H));
-
-      // Initalizing A_d with C_h
-      HIP_CHECK(hipMemcpy2D(A_d, pitch_A, C_h, width,
-            NUM_W * sizeof(TestType), NUM_H, hipMemcpyHostToDevice));
-
-      // Device to Device
-      hip_Memcpy2D desc = {};
-      desc.srcMemoryType = hipMemoryTypeDevice;
-      desc.srcHost = A_d;
-      desc.srcDevice = hipDeviceptr_t(A_d);
-      desc.srcPitch = pitch_A;
-      desc.dstMemoryType = hipMemoryTypeDevice;
-      desc.dstHost = E_d;
-      desc.dstDevice = hipDeviceptr_t(E_d);
-      desc.dstPitch = pitch_E;
-      desc.WidthInBytes = NUM_W * sizeof(TestType);
-      desc.Height = NUM_H;
-      REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
-
-      // Copying E_d to A_h
-      HIP_CHECK(hipMemcpy2D(A_h, width, E_d, pitch_E,
-            NUM_W * sizeof(TestType), NUM_H,
-            hipMemcpyDeviceToHost));
-
-      // Validating the result
-      REQUIRE(HipTest::checkArray<TestType>(A_h, C_h, NUM_W, NUM_H) == true);
-
-      // DeAllocating the memory
-      HIP_CHECK(hipFree(A_d));
-      HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-                                    A_h, nullptr, C_h, false);
+  SECTION("Device to Device") {
+    SECTION("Peer access disabled") {
+      Memcpy2DDeviceToDeviceShell<async, false>(MemcpyParam2DAdapter<async>());
    }
-  } else {
-    SUCCEED("skipping the testcases as numDevices < 2");
-  }
-}
-
-/*
- * This testcase verifies H2D & D2H functionality of hipMemcpyParam2D API
- * H2D case:
- * Input: "C_h" host variable initialized with default data
- * Output: "A_d" device variable
- *
- * D2H case:
- * Input: "A_d" device variable from the previous output
- * OutPut: "A_h" variable
- *
- * Validating the result by comparing "A_h" to "C_h"
- */
-TEMPLATE_TEST_CASE("Unit_hipMemcpyParam2D_multiDevice-H2D-D2H", "[hipMemcpyParam2D]", char, float,
-                   int, double, long double) {
-  CHECK_IMAGE_SUPPORT
-
-  // 1 refers to pinned host memory and 0 refers
-  // to unpinned memory
-  auto memory_type = GENERATE(0, 1);
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    HIP_CHECK(hipSetDevice(0));
-
-    // Initialize and Allocating Memory
-    TestType* A_h{nullptr}, *C_h{nullptr},
-             *A_d{nullptr};
-    size_t pitch_A;
-    size_t width{NUM_W * sizeof(TestType)};
-
-    HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-                            &pitch_A, width, NUM_H));
-
-    // Based on memory type (pinned/unpinned) allocating memory
-    if (memory_type) {
-      HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                    &A_h, nullptr, &C_h,
-                                    width*NUM_H, true);
-    } else {
-      HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                    &A_h, nullptr, &C_h,
-                                    width*NUM_H, false);
+    SECTION("Peer access enabled") {
+      Memcpy2DDeviceToDeviceShell<async, true>(MemcpyParam2DAdapter<async>());
    }
-    HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, nullptr, C_h);
-    int peerAccess = 0;
-    HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
-    if (!peerAccess) {
-      SUCCEED("Skipped the test as there is no peer access");
-    } else {
-      // Host to Device
-      hip_Memcpy2D desc = {};
-      desc.srcMemoryType = hipMemoryTypeHost;
-      desc.srcHost = C_h;
-      desc.srcDevice = hipDeviceptr_t(C_h);
-      desc.srcPitch = width;
-      desc.dstMemoryType = hipMemoryTypeDevice;
-      desc.dstHost = A_d;
-      desc.dstDevice = hipDeviceptr_t(A_d);
-      desc.dstPitch = pitch_A;
-      desc.WidthInBytes = NUM_W*sizeof(TestType);
-      desc.Height = NUM_H;
-      REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
+  }

-      // Device to Host
-      memset(&desc, 0x0, sizeof(hip_Memcpy2D));
-      desc.srcMemoryType = hipMemoryTypeDevice;
-      desc.srcHost = A_d;
-      desc.srcDevice = hipDeviceptr_t(A_d);
-      desc.srcPitch = pitch_A;
-      desc.dstMemoryType = hipMemoryTypeHost;
-      desc.dstHost = A_h;
-      desc.dstDevice = hipDeviceptr_t(A_h);
-      desc.dstPitch = width;
-      desc.WidthInBytes = NUM_W*sizeof(TestType);
-      desc.Height = NUM_H;
-      REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
+  SECTION("Host to Device") { Memcpy2DHostToDeviceShell<async>(MemcpyParam2DAdapter<async>()); }

-      // Validating the result
-      REQUIRE(HipTest::checkArray<TestType>(A_h, C_h, NUM_W, NUM_H) == true);
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-236
+  SECTION("Host to Host") { Memcpy2DHostToHostShell<async>(MemcpyParam2DAdapter<async>()); }
+#endif
+}

-      // DeAllocating the Memory
-      HIP_CHECK(hipFree(A_d));
-      if (memory_type) {
-        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-                                      A_h, nullptr, C_h, true);
-      } else {
-        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-                                      A_h, nullptr, C_h, false);
-      }
+TEST_CASE("Unit_hipMemcpyParam2D_Positive_Synchronization_Behavior") {
+  HIP_CHECK(hipDeviceSynchronize());
+
+  SECTION("Host to Device") { Memcpy2DHtoDSyncBehavior(MemcpyParam2DAdapter<>(), true); }
+
+  SECTION("Device to Pageable Host") {
+    Memcpy2DDtoHPageableSyncBehavior(MemcpyParam2DAdapter<>(), true);
+  }
+
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-236
+  SECTION("Device to Pinned Host") {
+    Memcpy2DDtoHPinnedSyncBehavior(MemcpyParam2DAdapter<>(), true);
+  }
+#endif
+
+  SECTION("Device to Device") {
+#if HT_NVIDIA
+    Memcpy2DDtoDSyncBehavior(MemcpyParam2DAdapter<>(), false);
+#else
+    Memcpy2DDtoDSyncBehavior(MemcpyParam2DAdapter<>(), true);
+#endif
+  }
+
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-232
+  SECTION("Host to Host") { Memcpy2DHtoHSyncBehavior(MemcpyParam2DAdapter<>(), true); }
+#endif
+}
+
+TEST_CASE("Unit_hipMemcpyParam2D_Positive_Parameters") {
+  constexpr bool async = false;
+  Memcpy2DZeroWidthHeight<async>(MemcpyParam2DAdapter<async>());
+}
+
+TEST_CASE("Unit_hipMemcpyParam2D_Positive_Array") {
+  constexpr bool async = false;
+  SECTION("Array from/to Host") {
+    MemcpyParam2DArrayHostShell<async>(MemcpyParam2DAdapter<async>());
+  }
+  SECTION("Array from/to Device") {
+    MemcpyParam2DArrayDeviceShell<async>(MemcpyParam2DAdapter<async>());
+  }
+}
+
+TEST_CASE("Unit_hipMemcpyParam2D_Negative_Parameters") {
+  constexpr size_t cols = 128;
+  constexpr size_t rows = 128;
+
+  constexpr auto NegativeTests = [](void* dst, size_t dpitch, void* src, size_t spitch,
+                                    size_t width, size_t height, hipMemcpyKind kind) {
+    SECTION("dst == nullptr") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<>()(static_cast<void*>(nullptr), dpitch, src, spitch,
+                                               width, height, kind),
+                      hipErrorInvalidValue);
    }
-  } else {
-    SUCCEED("skipping the testcases as numDevices < 2");
-  }
-}
-/*
- * This testcase verifies the extent validation scenarios
- */
-TEST_CASE("Unit_hipMemcpyParam2D_ExtentValidation") {
-  CHECK_IMAGE_SUPPORT

-  // Allocating memory and Initializing the data
-  HIP_CHECK(hipSetDevice(0));
-  char* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr},
-    * A_d{nullptr};
-  size_t pitch_A;
-  size_t width{NUM_W * sizeof(char)};
-  constexpr auto memsetval{100};
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-        &pitch_A, width, NUM_H));
-  HipTest::initArrays<char>(nullptr, nullptr, nullptr,
-      &A_h, nullptr, &C_h,
-      width*NUM_H, false);
-  HipTest::initArrays<char>(nullptr, nullptr, nullptr,
-      &B_h, nullptr, nullptr,
-      width*NUM_H, false);
-  HipTest::setDefaultData<char>(NUM_W*NUM_H, A_h, nullptr, C_h);
-  HipTest::setDefaultData<char>(NUM_W*NUM_H, B_h, nullptr, nullptr);
-  HIP_CHECK(hipMemset2D(A_d, pitch_A, memsetval, NUM_W, NUM_H));
+    SECTION("src == nullptr") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<>()(dst, dpitch, static_cast<void*>(nullptr), spitch,
+                                               width, height, kind),
+                      hipErrorInvalidValue);
+    }

-  // Device to Host
-  hip_Memcpy2D desc = {};
-  desc.srcMemoryType = hipMemoryTypeDevice;
-  desc.srcHost = A_d;
-  desc.srcDevice = hipDeviceptr_t(A_d);
-  desc.srcPitch = pitch_A;
-  desc.dstMemoryType = hipMemoryTypeHost;
-  desc.dstHost = A_h;
-  desc.dstDevice = hipDeviceptr_t(A_h);
-  desc.dstPitch = width;
-  desc.WidthInBytes = NUM_W;
-  desc.Height = NUM_H;
+    SECTION("dstPitch < WithInBytes") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<>()(dst, width - 1, src, spitch, width, height, kind),
+                      hipErrorInvalidValue);
+    }

-  SECTION("Destination Pitch is 0") {
-    desc.dstPitch = 0;
-    REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
+    SECTION("srcPitch < WidthInBytes") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<>()(dst, dpitch, src, width - 1, width, height, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("dstPitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<>()(dst, static_cast<size_t>(attr) + 1, src, spitch,
+                                               width, height, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("srcPitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<>()(dst, dpitch, src, static_cast<size_t>(attr) + 1,
+                                               width, height, kind),
+                      hipErrorInvalidValue);
+    }
+
+#if HT_NVIDIA  // Disabled on AMD due to defect - EXSWHTEC-237
+    SECTION("WidthInBytes + srcXInBytes > srcPitch") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<>(make_hipExtent(spitch - width + 1, 0, 0))(
+                          dst, dpitch, src, spitch, width, height, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("WidthInBytes + dstXInBytes > dstPitch") {
+      HIP_CHECK_ERROR(
+          MemcpyParam2DAdapter<>(make_hipExtent(0, 0, 0), make_hipExtent(dpitch - width + 1, 0, 0))(
+              dst, dpitch, src, spitch, width, height, kind),
+          hipErrorInvalidValue);
+    }
+
+    SECTION("srcY out of bounds") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<>(make_hipExtent(0, 1, 0))(dst, dpitch, src, spitch,
+                                                                      width, height, kind),
+                      hipErrorInvalidValue);
+    }
+
+    SECTION("dstY out of bounds") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<>(make_hipExtent(0, 0, 0), make_hipExtent(0, 1, 0))(
+                          dst, dpitch, src, spitch, width, height, kind),
+                      hipErrorInvalidValue);
+    }
+#endif
+  };
+
+  SECTION("Host to Device") {
+    LinearAllocGuard2D<int> device_alloc(cols, rows);
+    LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, device_alloc.pitch() * rows);
+    NegativeTests(device_alloc.ptr(), device_alloc.pitch(), host_alloc.ptr(), device_alloc.pitch(),
+                  device_alloc.width(), device_alloc.height(), hipMemcpyHostToDevice);
  }

-  SECTION("Source Pitch is 0") {
-    desc.srcPitch = 0;
-    REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
+  SECTION("Device to Host") {
+    LinearAllocGuard2D<int> device_alloc(cols, rows);
+    LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, device_alloc.pitch() * rows);
+    NegativeTests(host_alloc.ptr(), device_alloc.pitch(), device_alloc.ptr(), device_alloc.pitch(),
+                  device_alloc.width(), device_alloc.height(), hipMemcpyDeviceToHost);
  }

-  SECTION("Height is 0") {
-    desc.Height = 0;
-    REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
-    REQUIRE(HipTest::checkArray<char>(A_h, B_h, NUM_W, NUM_H) == true);
+  SECTION("Host to Host") {
+    LinearAllocGuard<int> src_alloc(LinearAllocs::hipHostMalloc, cols * rows * sizeof(int));
+    LinearAllocGuard<int> dst_alloc(LinearAllocs::hipHostMalloc, cols * rows * sizeof(int));
+    NegativeTests(dst_alloc.ptr(), cols * sizeof(int), src_alloc.ptr(), cols * sizeof(int),
+                  cols * sizeof(int), rows, hipMemcpyHostToHost);
  }

-  SECTION("Width is 0") {
-    desc.WidthInBytes = 0;
-    REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
-    REQUIRE(HipTest::checkArray<char>(A_h, B_h, NUM_W, NUM_H) == true);
+  SECTION("Device to Device") {
+    LinearAllocGuard2D<int> src_alloc(cols, rows);
+    LinearAllocGuard2D<int> dst_alloc(cols, rows);
+    NegativeTests(dst_alloc.ptr(), dst_alloc.pitch(), src_alloc.ptr(), src_alloc.pitch(),
+                  dst_alloc.width(), dst_alloc.height(), hipMemcpyDeviceToDevice);
  }
-
-  // DeAllocating the Memory
-  HIP_CHECK(hipFree(A_d));
-  HipTest::freeArrays<char>(nullptr, nullptr, nullptr,
-                                A_h, B_h, C_h, false);
-}
-
-/*
- * This testcase verifies the negative scenarios
- */
-TEST_CASE("Unit_hipMemcpyParam2D_Negative") {
-  CHECK_IMAGE_SUPPORT
-
-  HIP_CHECK(hipSetDevice(0));
-
-  // Allocating and Initializing the data
-  float* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr},
-       * A_d{nullptr};
-  size_t pitch_A;
-  size_t width{NUM_W * sizeof(float)};
-  constexpr auto memsetval{100};
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-                          &pitch_A, width, NUM_H));
-  HipTest::initArrays<float>(nullptr, nullptr, nullptr,
-                                &A_h, &B_h, &C_h,
-                                width*NUM_H, false);
-  HipTest::setDefaultData<float>(NUM_W*NUM_H, A_h, B_h, C_h);
-  HIP_CHECK(hipMemset2D(A_d, pitch_A, memsetval, NUM_W, NUM_H));
-
-  hip_Memcpy2D desc = {};
-  desc.srcMemoryType = hipMemoryTypeDevice;
-  desc.srcHost = A_d;
-  desc.srcDevice = hipDeviceptr_t(A_d);
-  desc.srcPitch = pitch_A;
-  desc.dstMemoryType = hipMemoryTypeHost;
-  desc.dstHost = A_h;
-  desc.dstDevice = hipDeviceptr_t(A_h);
-  desc.dstPitch = width;
-  desc.WidthInBytes = NUM_W;
-  desc.Height = NUM_H;
-
-  SECTION("Null Pointer to Source Device Pointer") {
-    desc.srcDevice = hipDeviceptr_t(nullptr);
-    REQUIRE(hipMemcpyParam2D(&desc) != hipSuccess);
-  }
-
-  SECTION("Null Pointer to Destination Device Pointer") {
-    memset(&desc, 0x0, sizeof(hip_Memcpy2D));
-    desc.srcMemoryType = hipMemoryTypeHost;
-    desc.srcHost = A_h;
-    desc.srcDevice = hipDeviceptr_t(A_h);
-    desc.srcPitch = width;
-    desc.dstMemoryType = hipMemoryTypeDevice;
-    desc.dstHost = A_d;
-    desc.dstDevice = hipDeviceptr_t(nullptr);
-    desc.dstPitch = pitch_A;
-    desc.WidthInBytes = NUM_W;
-    desc.Height = NUM_H;
-    REQUIRE(hipMemcpyParam2D(&desc) != hipSuccess);
-  }
-
-  SECTION("Null Pointer to both Src & Dst Device Pointer") {
-    desc.srcDevice = hipDeviceptr_t(nullptr);
-    desc.dstDevice = hipDeviceptr_t(nullptr);
-    REQUIRE(hipMemcpyParam2D(&desc) != hipSuccess);
-  }
-
-  SECTION("Width > src/dest pitches") {
-    desc.WidthInBytes = pitch_A+1;
-    REQUIRE(hipMemcpyParam2D(&desc) != hipSuccess);
-  }
-
-  // DeAllocating the Memory
-  HIP_CHECK(hipFree(A_d));
-  HipTest::freeArrays<float>(nullptr, nullptr, nullptr,
-                                A_h, B_h, C_h, false);
-}
+}
@@ -1,441 +1,220 @@
 /*
 Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-/*
-This testfile verifies the following scenarios of hipMemcpyParam2DAsync API
-1. Negative Scenarios
-2. Extent Validation Scenarios
-3. D2D copy for different datatypes
-4. H2D and D2H copy for different datatypes
-5. Device context change scenario where memory allocated in one GPU
-   stream created in another GPU
-*/
+#include "memcpy2d_tests_common.hh"

 #include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
+#include <hip/hip_runtime_api.h>
+#include <resource_guards.hh>
+#include <utils.hh>

-static constexpr size_t NUM_W{10};
-static constexpr size_t NUM_H{10};
-/*
- * This testcase verifies D2D functionality of hipMemcpyParam2DAsync API
- * Where Memory is allocated in GPU-0 and stream is created in GPU-1
- *
- * Input: Intializing "A_d" device variable with "C_h" host variable
- * Output: "A_d" device variable to "E_d" device variable
- *
- * Validating the result by copying "E_d" to "A_h" and checking
- * it with the initalized data "C_h".
- *
- */
-TEMPLATE_TEST_CASE("Unit_hipMemcpyParam2DAsync_multiDevice-StreamOnDiffDevice",
-                   "[hipMemcpyParam2DAsync]", char, float, int, double, long double) {
-  CHECK_IMAGE_SUPPORT
+TEST_CASE("Unit_hipMemcpyParam2DAsync_Positive_Basic") {
+  using namespace std::placeholders;

-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    // Allocating and Initializing the data
-    HIP_CHECK(hipSetDevice(0));
-    TestType* A_h{nullptr}, *C_h{nullptr}, *A_d{nullptr};
-    size_t pitch_A;
-    size_t width{NUM_W * sizeof(TestType)};
-    HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-                            &pitch_A, width, NUM_H));
-    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                  &A_h, nullptr, &C_h,
-                                  width*NUM_H, false);
-    HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, nullptr, C_h);
-    int peerAccess = 0;
-    HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
-    if (!peerAccess) {
-      SUCCEED("Skipped the test as there is no peer access");
-    } else {
-      TestType *E_d{nullptr};
-      size_t pitch_E;
-      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&E_d),
-                              &pitch_E, width, NUM_H));
+  constexpr bool async = true;

-      // Initalizing A_d with C_h
-      HIP_CHECK(hipSetDevice(1));
-      hipStream_t stream;
-      HIP_CHECK(hipStreamCreate(&stream));
+  const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
+  const StreamGuard stream_guard(stream_type);
+  const hipStream_t stream = stream_guard.stream();

-      HIP_CHECK(hipMemcpy2DAsync(A_d, pitch_A, C_h, width,
-                           NUM_W*sizeof(TestType), NUM_H,
-                           hipMemcpyHostToDevice, stream));
-      HIP_CHECK(hipStreamSynchronize(stream));
-      // Device to Device
-      hip_Memcpy2D desc = {};
-      desc.srcMemoryType = hipMemoryTypeDevice;
-      desc.srcHost = A_d;
-      desc.srcDevice = hipDeviceptr_t(A_d);
-      desc.srcPitch = pitch_A;
-      desc.dstMemoryType = hipMemoryTypeDevice;
-      desc.dstHost = E_d;
-      desc.dstDevice = hipDeviceptr_t(E_d);
-      desc.dstPitch = pitch_E;
-      desc.WidthInBytes = NUM_W*sizeof(TestType);
-      desc.Height = NUM_H;
-      REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
-      HIP_CHECK(hipStreamSynchronize(stream));
-
-      // Copying the result E_d to A_h host variable
-      HIP_CHECK(hipMemcpy2D(A_h, width, E_d, pitch_E,
-            NUM_W*sizeof(TestType), NUM_H,
-            hipMemcpyDeviceToHost));
-      HIP_CHECK(hipDeviceSynchronize());
-      // Validating the result
-      REQUIRE(HipTest::checkArray<TestType>(A_h, C_h, NUM_W, NUM_H) == true);
-
-      // DeAllocating the memory
-      HIP_CHECK(hipFree(E_d));
-      HIP_CHECK(hipFree(A_d));
-      HIP_CHECK(hipStreamDestroy(stream));
-      HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-          A_h, nullptr, C_h, false);
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-236
+  SECTION("Device to Host") {
+    Memcpy2DDeviceToHostShell<async>(
+        std::bind(MemcpyParam2DAdapter<async>(), _1, _2, _3, _4, _5, _6, _7, stream), stream);
+  }
+#endif
+  SECTION("Device to Device") {
+    SECTION("Peer access disabled") {
+      Memcpy2DDeviceToDeviceShell<async, false>(
+          std::bind(MemcpyParam2DAdapter<async>(), _1, _2, _3, _4, _5, _6, _7, stream), stream);
    }
-  } else {
-    SUCCEED("skipping the testcases as numDevices < 2");
-  }
-}
-
-/*
- * This testcase verifies D2D functionality of hipMemcpyParam2DAsync API
- * Input: Intializing "A_d" device variable with "C_h" host variable
- * Output: "A_d" device variable to "E_d" device variable
- *
- * Validating the result by copying "E_d" to "A_h" and checking
- * it with the initalized data "C_h".
- *
- */
-TEMPLATE_TEST_CASE("Unit_hipMemcpyParam2DAsync_multiDevice-D2D", "[hipMemcpyParam2DAsync]", char,
-                   int, float, double, long double) {
-  CHECK_IMAGE_SUPPORT
-
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    // Allocating and Initializing the data
-    HIP_CHECK(hipSetDevice(0));
-    TestType* A_h{nullptr}, *C_h{nullptr}, *A_d{nullptr};
-    size_t pitch_A;
-    size_t width{NUM_W * sizeof(TestType)};
-    hipStream_t stream;
-    HIP_CHECK(hipStreamCreate(&stream));
-    HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-                            &pitch_A, width, NUM_H));
-    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                  &A_h, nullptr, &C_h,
-                                  width*NUM_H, false);
-    HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, nullptr, C_h);
-
-    int peerAccess = 0;
-    HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
-    if (!peerAccess) {
-      SUCCEED("Skipped the test as there is no peer access");
-    } else {
-      HIP_CHECK(hipSetDevice(1));
-      TestType *E_d;
-      size_t pitch_E;
-      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&E_d),
-            &pitch_E, width, NUM_H));
-
-      // Initializing A_d  with C_h
-      HIP_CHECK(hipMemcpy2D(A_d, pitch_A, C_h, width,
-            NUM_W*sizeof(TestType), NUM_H, hipMemcpyHostToDevice));
-
-      // Device to Device
-      hip_Memcpy2D desc = {};
-      desc.srcMemoryType = hipMemoryTypeDevice;
-      desc.srcHost = A_d;
-      desc.srcDevice = hipDeviceptr_t(A_d);
-      desc.srcPitch = pitch_A;
-      desc.dstMemoryType = hipMemoryTypeDevice;
-      desc.dstHost = E_d;
-      desc.dstDevice = hipDeviceptr_t(E_d);
-      desc.dstPitch = pitch_E;
-      desc.WidthInBytes = NUM_W*sizeof(TestType);
-      desc.Height = NUM_H;
-      REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
-      HIP_CHECK(hipStreamSynchronize(stream));
-
-      // Copying the result E_d to A_h host variable
-      HIP_CHECK(hipMemcpy2D(A_h, width, E_d, pitch_E,
-            NUM_W*sizeof(TestType), NUM_H, hipMemcpyDeviceToHost));
-
-      // Validating the result
-      REQUIRE(HipTest::checkArray<TestType>(A_h, C_h, NUM_W, NUM_H) == true);
-
-      // DeAllocating the memory
-      HIP_CHECK(hipFree(A_d));
-      HIP_CHECK(hipStreamDestroy(stream));
-      HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-          A_h, nullptr, C_h, false);
+    SECTION("Peer access enabled") {
+      Memcpy2DDeviceToDeviceShell<async, true>(
+          std::bind(MemcpyParam2DAdapter<async>(), _1, _2, _3, _4, _5, _6, _7, stream), stream);
    }
-  } else {
-    SUCCEED("skipping the testcases as numDevices < 2");
+  }
+  SECTION("Host to Device") {
+    Memcpy2DHostToDeviceShell<async>(
+        std::bind(MemcpyParam2DAdapter<async>(), _1, _2, _3, _4, _5, _6, _7, stream), stream);
+  }
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-236
+  SECTION("Host to Host") {
+    Memcpy2DHostToHostShell<async>(
+        std::bind(MemcpyParam2DAdapter<async>(), _1, _2, _3, _4, _5, _6, _7, stream), stream);
+  }
+#endif
+}
+
+TEST_CASE("Unit_hipMemcpyParam2DAsync_Positive_Synchronization_Behavior") {
+  using namespace std::placeholders;
+
+  constexpr bool async = true;
+
+  HIP_CHECK(hipDeviceSynchronize());
+
+  SECTION("Host to Device") {
+    Memcpy2DHtoDSyncBehavior(
+        std::bind(MemcpyParam2DAdapter<async>(), _1, _2, _3, _4, _5, _6, _7, nullptr), false);
+  }
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-233
+  SECTION("Device to Pageable Host") {
+    Memcpy2DDtoHPageableSyncBehavior(
+        std::bind(MemcpyParam2DAdapter<async>(), _1, _2, _3, _4, _5, _6, _7, nullptr), true);
+  }
+#endif
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-236
+  SECTION("Device to Pinned Host") {
+    Memcpy2DDtoHPinnedSyncBehavior(
+        std::bind(MemcpyParam2DAdapter<async>(), _1, _2, _3, _4, _5, _6, _7, nullptr), false);
+  }
+#endif
+  SECTION("Device to Device") {
+    Memcpy2DDtoDSyncBehavior(
+        std::bind(MemcpyParam2DAdapter<async>(), _1, _2, _3, _4, _5, _6, _7, nullptr), false);
+  }
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-233
+  SECTION("Host to Host") {
+    Memcpy2DHtoHSyncBehavior(
+        std::bind(MemcpyParam2DAdapter<async>(), _1, _2, _3, _4, _5, _6, _7, nullptr), true);
+  }
+#endif
+}
+
+TEST_CASE("Unit_hipMemcpyParam2DAsync_Positive_Parameters") {
+  constexpr bool async = true;
+  Memcpy2DZeroWidthHeight<async>(MemcpyParam2DAdapter<async>());
+}
+
+TEST_CASE("Unit_hipMemcpyParam2DAsync_Positive_Array") {
+  constexpr bool async = true;
+  SECTION("Array from/to Host") {
+    MemcpyParam2DArrayHostShell<async>(MemcpyParam2DAdapter<async>());
+  }
+  SECTION("Array from/to Device") {
+    MemcpyParam2DArrayDeviceShell<async>(MemcpyParam2DAdapter<async>());
  }
 }

-/*
- * This testcase verifies H2D & D2H functionality of hipMemcpyParam2DAsync API
- * H2D case:
- * Input: "C_h" host variable initialized with default data
- * Output: "A_d" device variable
- *
- * D2H case:
- * Input: "A_d" device variable from the previous output
- * OutPut: "A_h" variable
- *
- * Validating the result by comparing "A_h" to "C_h"
- */
-TEMPLATE_TEST_CASE("Unit_hipMemcpyParam2DAsync_multiDevice-H2D-D2H", "[hipMemcpyParam2DAsync]",
-                   char, int, float, double, long double) {
-  CHECK_IMAGE_SUPPORT
+TEST_CASE("Unit_hipMemcpyParam2DAsync_Negative_Parameters") {
+  constexpr bool async = true;

-  // 1 refers to pinned host memory and 0 refers
-  // to unpinned memory
-  auto memory_type = GENERATE(0, 1);
-  int numDevices = 0;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices > 1) {
-    // Allocating and Initializing the data
-    HIP_CHECK(hipSetDevice(0));
-    TestType* A_h{nullptr}, *C_h{nullptr},
-      *A_d{nullptr};
-    size_t pitch_A;
-    size_t width{NUM_W * sizeof(TestType)};
-    hipStream_t stream;
+  constexpr size_t cols = 128;
+  constexpr size_t rows = 128;

-    HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-                            &pitch_A, width, NUM_H));
-
-    // Based on memory type (pinned/unpinned) allocating memory
-    if (memory_type) {
-      HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                    &A_h, nullptr, &C_h,
-                                    width*NUM_H, true);
-    } else {
-      HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
-                                    &A_h, nullptr, &C_h,
-                                    width*NUM_H, false);
+  constexpr auto NegativeTests = [](void* dst, size_t dpitch, void* src, size_t spitch,
+                                    size_t width, size_t height, hipMemcpyKind kind) {
+    SECTION("dst == nullptr") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<async>()(static_cast<void*>(nullptr), dpitch, src,
+                                                    spitch, width, height, kind),
+                      hipErrorInvalidValue);
    }
-    HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, nullptr, C_h);
-    int peerAccess = 0;
-    HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
-    if (!peerAccess) {
-      SUCCEED("Skipped the test as there is no peer access");
-    } else {
-      // Host to Device
-      hip_Memcpy2D desc = {};
-      HIP_CHECK(hipStreamCreate(&stream));
-      desc.srcMemoryType = hipMemoryTypeHost;
-      desc.srcHost = C_h;
-      desc.srcDevice = hipDeviceptr_t(C_h);
-      desc.srcPitch = width;
-      desc.dstMemoryType = hipMemoryTypeDevice;
-      desc.dstHost = A_d;
-      desc.dstDevice = hipDeviceptr_t(A_d);
-      desc.dstPitch = pitch_A;
-      desc.WidthInBytes = NUM_W*sizeof(TestType);
-      desc.Height = NUM_H;
-      REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
-      HIP_CHECK(hipStreamSynchronize(stream));
-
-      // Device to Host
-      memset(&desc, 0x0, sizeof(hip_Memcpy2D));
-      desc.srcMemoryType = hipMemoryTypeDevice;
-      desc.srcHost = A_d;
-      desc.srcDevice = hipDeviceptr_t(A_d);
-      desc.srcPitch = pitch_A;
-      desc.dstMemoryType = hipMemoryTypeHost;
-      desc.dstHost = A_h;
-      desc.dstDevice = hipDeviceptr_t(A_h);
-      desc.dstPitch = width;
-      desc.WidthInBytes = NUM_W*sizeof(TestType);
-      desc.Height = NUM_H;
-      REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
-      HIP_CHECK(hipStreamSynchronize(stream));
-
-      // Validating the result
-      REQUIRE(HipTest::checkArray<TestType>(A_h, C_h, NUM_W, NUM_H) == true);
-
-      // DeAllocating the memory
-      HIP_CHECK(hipFree(A_d));
-      HIP_CHECK(hipStreamDestroy(stream));
-      if (memory_type) {
-        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-                                      A_h, nullptr, C_h, true);
-      } else {
-        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
-                                      A_h, nullptr, C_h, false);
-      }
+    SECTION("src == nullptr") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<async>()(dst, dpitch, static_cast<void*>(nullptr),
+                                                    spitch, width, height, kind),
+                      hipErrorInvalidValue);
    }
-  } else {
-      SUCCEED("skipping the testcases as numDevices < 2");
-  }
-}
-/*
- * This testcase verifies the extent validation scenarios
- */
-TEST_CASE("Unit_hipMemcpyParam2DAsync_ExtentValidation") {
-  CHECK_IMAGE_SUPPORT
+    SECTION("dstPitch < WidthInBytes") {
+      HIP_CHECK_ERROR(
+          MemcpyParam2DAdapter<async>()(dst, width - 1, src, spitch, width, height, kind),
+          hipErrorInvalidValue);
+    }
+    SECTION("srcPitch < WidthInBytes") {
+      HIP_CHECK_ERROR(
+          MemcpyParam2DAdapter<async>()(dst, dpitch, src, width - 1, width, height, kind),
+          hipErrorInvalidValue);
+    }
+    SECTION("dpitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<async>()(dst, static_cast<size_t>(attr) + 1, src, spitch,
+                                                    width, height, kind),
+                      hipErrorInvalidValue);
+    }
+    SECTION("spitch > max pitch") {
+      int attr = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&attr, hipDeviceAttributeMaxPitch, 0));
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<async>()(dst, dpitch, src, static_cast<size_t>(attr) + 1,
+                                                    width, height, kind),
+                      hipErrorInvalidValue);
+    }
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-237
+    SECTION("WidthInBytes + srcXInBytes > srcPitch") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<async>(make_hipExtent(spitch - width + 1, 0, 0))(
+                          dst, dpitch, src, spitch, width, height, kind),
+                      hipErrorInvalidValue);
+    }
+    SECTION("WidthInBytes + dstXInBytes > dstPitch") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<async>(make_hipExtent(0, 0, 0),
+                                                  make_hipExtent(dpitch - width + 1, 0, 0))(
+                          dst, dpitch, src, spitch, width, height, kind),
+                      hipErrorInvalidValue);
+    }
+    SECTION("srcY out of bounds") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<async>(make_hipExtent(0, 1, 0))(dst, dpitch, src, spitch,
+                                                                           width, height, kind),
+                      hipErrorInvalidValue);
+    }
+    SECTION("dstY out of bounds") {
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<async>(make_hipExtent(0, 0, 0), make_hipExtent(0, 1, 0))(
+                          dst, dpitch, src, spitch, width, height, kind),
+                      hipErrorInvalidValue);
+    }
+#endif
+#if HT_NVIDIA // Disabled on AMD due to defect - EXSWHTEC-235
+    SECTION("Invalid stream") {
+      StreamGuard stream_guard(Streams::created);
+      HIP_CHECK(hipStreamDestroy(stream_guard.stream()));
+      HIP_CHECK_ERROR(MemcpyParam2DAdapter<async>()(dst, dpitch, src, spitch, width, height, kind,
+                                                    stream_guard.stream()),
+                      hipErrorContextIsDestroyed);
+    }
+#endif
+  };

-  HIP_CHECK(hipSetDevice(0));
-  char* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr},
-      * A_d{nullptr};
-  size_t pitch_A;
-  size_t width{NUM_W * sizeof(char)};
-  constexpr auto memsetval{100};
-  hipStream_t stream;
-  HIP_CHECK(hipStreamCreate(&stream));
-
-  // Allocating and Initializing the data
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-                          &pitch_A, width, NUM_H));
-  HipTest::initArrays<char>(nullptr, nullptr, nullptr,
-                            &A_h, nullptr, &C_h,
-                            width*NUM_H, false);
-  HipTest::initArrays<char>(nullptr, nullptr, nullptr,
-                            &B_h, nullptr, nullptr,
-                            width*NUM_H, false);
-  HipTest::setDefaultData<char>(NUM_W*NUM_H, A_h, nullptr, C_h);
-  HipTest::setDefaultData<char>(NUM_W*NUM_H, B_h, nullptr, nullptr);
-  HIP_CHECK(hipMemset2D(A_d, pitch_A, memsetval, NUM_W, NUM_H));
-
-  // Device to Host
-  hip_Memcpy2D desc = {};
-  desc.srcMemoryType = hipMemoryTypeDevice;
-  desc.srcHost = A_d;
-  desc.srcDevice = hipDeviceptr_t(A_d);
-  desc.srcPitch = pitch_A;
-  desc.dstMemoryType = hipMemoryTypeHost;
-  desc.dstHost = A_h;
-  desc.dstDevice = hipDeviceptr_t(A_h);
-  desc.dstPitch = width;
-  desc.WidthInBytes = NUM_W;
-  desc.Height = NUM_H;
-
-  SECTION("Destination Pitch is 0") {
-    desc.dstPitch = 0;
-    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
+  SECTION("Host to device") {
+    LinearAllocGuard2D<int> device_alloc(cols, rows);
+    LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, device_alloc.pitch() * rows);
+    NegativeTests(device_alloc.ptr(), device_alloc.pitch(), host_alloc.ptr(), device_alloc.pitch(),
+                  device_alloc.width(), device_alloc.height(), hipMemcpyHostToDevice);
  }

-  SECTION("Source Pitch is 0") {
-    desc.srcPitch = 0;
-    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
+  SECTION("Device to host") {
+    LinearAllocGuard2D<int> device_alloc(cols, rows);
+    LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, device_alloc.pitch() * rows);
+    NegativeTests(host_alloc.ptr(), device_alloc.pitch(), device_alloc.ptr(), device_alloc.pitch(),
+                  device_alloc.width(), device_alloc.height(), hipMemcpyDeviceToHost);
  }

-  SECTION("Height is 0") {
-    desc.Height = 0;
-    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
-    HIP_CHECK(hipStreamSynchronize(stream));
-    REQUIRE(HipTest::checkArray<char>(A_h, B_h, NUM_W, NUM_H) == true);
+  SECTION("Host to host") {
+    LinearAllocGuard<int> src_alloc(LinearAllocs::hipHostMalloc, cols * rows * sizeof(int));
+    LinearAllocGuard<int> dst_alloc(LinearAllocs::hipHostMalloc, cols * rows * sizeof(int));
+    NegativeTests(dst_alloc.ptr(), cols * sizeof(int), src_alloc.ptr(), cols * sizeof(int),
+                  cols * sizeof(int), rows, hipMemcpyHostToHost);
  }

-  SECTION("Width is 0") {
-    desc.Height = 0;
-    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
-    HIP_CHECK(hipStreamSynchronize(stream));
-    REQUIRE(HipTest::checkArray<char>(A_h, B_h, NUM_W, NUM_H) == true);
+  SECTION("Device to device") {
+    LinearAllocGuard2D<int> src_alloc(cols, rows);
+    LinearAllocGuard2D<int> dst_alloc(cols, rows);
+    NegativeTests(dst_alloc.ptr(), dst_alloc.pitch(), src_alloc.ptr(), src_alloc.pitch(),
+                  dst_alloc.width(), dst_alloc.height(), hipMemcpyDeviceToDevice);
  }
-
-  // DeAllocating the Memory
-  HIP_CHECK(hipFree(A_d));
-  HIP_CHECK(hipStreamDestroy(stream));
-  HipTest::freeArrays<char>(nullptr, nullptr, nullptr,
-                                A_h, B_h, C_h, false);
-}
-
-/*
- * This testcase verifies the negative scenarios
- */
-TEST_CASE("Unit_hipMemcpyParam2DAsync_Negative") {
-  CHECK_IMAGE_SUPPORT
-
-  HIP_CHECK(hipSetDevice(0));
-  float* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr},
-       * A_d{nullptr};
-  size_t pitch_A;
-  size_t width{NUM_W * sizeof(float)};
-  constexpr auto memsetval{100};
-  hipStream_t stream;
-  HIP_CHECK(hipStreamCreate(&stream));
-
-  // Allocating and Initializing the data
-  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
-                          &pitch_A, width, NUM_H));
-  HipTest::initArrays<float>(nullptr, nullptr, nullptr,
-                                &A_h, &B_h, &C_h,
-                                width*NUM_H, false);
-  HipTest::setDefaultData<float>(NUM_W*NUM_H, A_h, B_h, C_h);
-  HIP_CHECK(hipMemset2D(A_d, pitch_A, memsetval, NUM_W, NUM_H));
-
-  // Device to Host
-  hip_Memcpy2D desc = {};
-  desc.srcMemoryType = hipMemoryTypeDevice;
-  desc.srcHost = A_d;
-  desc.srcDevice = hipDeviceptr_t(A_d);
-  desc.srcPitch = pitch_A;
-  desc.dstMemoryType = hipMemoryTypeHost;
-  desc.dstHost = A_h;
-  desc.dstDevice = hipDeviceptr_t(A_h);
-  desc.dstPitch = width;
-  desc.WidthInBytes = NUM_W;
-  desc.Height = NUM_H;
-
-  SECTION("Null Pointer to Source Device Pointer") {
-    desc.srcDevice = hipDeviceptr_t(nullptr);
-    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) != hipSuccess);
-  }
-
-  SECTION("Null Pointer to Destination Device Pointer") {
-    memset(&desc, 0x0, sizeof(hip_Memcpy2D));
-    desc.srcMemoryType = hipMemoryTypeHost;
-    desc.srcHost = A_h;
-    desc.srcDevice = hipDeviceptr_t(A_h);
-    desc.srcPitch = width;
-    desc.dstMemoryType = hipMemoryTypeDevice;
-    desc.dstHost = A_d;
-    desc.dstDevice = hipDeviceptr_t(nullptr);
-    desc.dstPitch = pitch_A;
-    desc.WidthInBytes = NUM_W;
-    desc.Height = NUM_H;
-
-    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) != hipSuccess);
-  }
-
-  SECTION("Null Pointer to both Src & Dst Device Pointer") {
-    desc.srcDevice = hipDeviceptr_t(nullptr);
-    desc.dstDevice = hipDeviceptr_t(nullptr);
-    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) != hipSuccess);
-  }
-
-  SECTION("Width > src/dest pitches") {
-    desc.WidthInBytes = pitch_A+1;
-    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) != hipSuccess);
-  }
-
-  // DeAllocating the memory
-  HIP_CHECK(hipFree(A_d));
-  HIP_CHECK(hipStreamSynchronize(stream));
-  HIP_CHECK(hipStreamDestroy(stream));
-  HipTest::freeArrays<float>(nullptr, nullptr, nullptr,
-                                A_h, B_h, C_h, false);
-}
+}
@@ -0,0 +1,441 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+This testfile verifies the following scenarios of hipMemcpyParam2DAsync API
+1. Negative Scenarios
+2. Extent Validation Scenarios
+3. D2D copy for different datatypes
+4. H2D and D2H copy for different datatypes
+5. Device context change scenario where memory allocated in one GPU
+   stream created in another GPU
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+
+static constexpr size_t NUM_W{10};
+static constexpr size_t NUM_H{10};
+/*
+ * This testcase verifies D2D functionality of hipMemcpyParam2DAsync API
+ * Where Memory is allocated in GPU-0 and stream is created in GPU-1
+ *
+ * Input: Intializing "A_d" device variable with "C_h" host variable
+ * Output: "A_d" device variable to "E_d" device variable
+ *
+ * Validating the result by copying "E_d" to "A_h" and checking
+ * it with the initalized data "C_h".
+ *
+ */
+TEMPLATE_TEST_CASE("Unit_hipMemcpyParam2DAsync_multiDevice-StreamOnDiffDevice",
+                   "[hipMemcpyParam2DAsync]", char, float, int, double, long double) {
+  CHECK_IMAGE_SUPPORT
+
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    // Allocating and Initializing the data
+    HIP_CHECK(hipSetDevice(0));
+    TestType* A_h{nullptr}, *C_h{nullptr}, *A_d{nullptr};
+    size_t pitch_A;
+    size_t width{NUM_W * sizeof(TestType)};
+    HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+                            &pitch_A, width, NUM_H));
+    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                  &A_h, nullptr, &C_h,
+                                  width*NUM_H, false);
+    HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, nullptr, C_h);
+    int peerAccess = 0;
+    HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
+    if (!peerAccess) {
+      SUCCEED("Skipped the test as there is no peer access");
+    } else {
+      TestType *E_d{nullptr};
+      size_t pitch_E;
+      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&E_d),
+                              &pitch_E, width, NUM_H));
+
+      // Initalizing A_d with C_h
+      HIP_CHECK(hipSetDevice(1));
+      hipStream_t stream;
+      HIP_CHECK(hipStreamCreate(&stream));
+
+      HIP_CHECK(hipMemcpy2DAsync(A_d, pitch_A, C_h, width,
+                           NUM_W*sizeof(TestType), NUM_H,
+                           hipMemcpyHostToDevice, stream));
+      HIP_CHECK(hipStreamSynchronize(stream));
+      // Device to Device
+      hip_Memcpy2D desc = {};
+      desc.srcMemoryType = hipMemoryTypeDevice;
+      desc.srcHost = A_d;
+      desc.srcDevice = hipDeviceptr_t(A_d);
+      desc.srcPitch = pitch_A;
+      desc.dstMemoryType = hipMemoryTypeDevice;
+      desc.dstHost = E_d;
+      desc.dstDevice = hipDeviceptr_t(E_d);
+      desc.dstPitch = pitch_E;
+      desc.WidthInBytes = NUM_W*sizeof(TestType);
+      desc.Height = NUM_H;
+      REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
+      HIP_CHECK(hipStreamSynchronize(stream));
+
+      // Copying the result E_d to A_h host variable
+      HIP_CHECK(hipMemcpy2D(A_h, width, E_d, pitch_E,
+            NUM_W*sizeof(TestType), NUM_H,
+            hipMemcpyDeviceToHost));
+      HIP_CHECK(hipDeviceSynchronize());
+      // Validating the result
+      REQUIRE(HipTest::checkArray<TestType>(A_h, C_h, NUM_W, NUM_H) == true);
+
+      // DeAllocating the memory
+      HIP_CHECK(hipFree(E_d));
+      HIP_CHECK(hipFree(A_d));
+      HIP_CHECK(hipStreamDestroy(stream));
+      HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+          A_h, nullptr, C_h, false);
+    }
+  } else {
+    SUCCEED("skipping the testcases as numDevices < 2");
+  }
+}
+
+/*
+ * This testcase verifies D2D functionality of hipMemcpyParam2DAsync API
+ * Input: Intializing "A_d" device variable with "C_h" host variable
+ * Output: "A_d" device variable to "E_d" device variable
+ *
+ * Validating the result by copying "E_d" to "A_h" and checking
+ * it with the initalized data "C_h".
+ *
+ */
+TEMPLATE_TEST_CASE("Unit_hipMemcpyParam2DAsync_multiDevice-D2D", "[hipMemcpyParam2DAsync]", char,
+                   int, float, double, long double) {
+  CHECK_IMAGE_SUPPORT
+
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    // Allocating and Initializing the data
+    HIP_CHECK(hipSetDevice(0));
+    TestType* A_h{nullptr}, *C_h{nullptr}, *A_d{nullptr};
+    size_t pitch_A;
+    size_t width{NUM_W * sizeof(TestType)};
+    hipStream_t stream;
+    HIP_CHECK(hipStreamCreate(&stream));
+    HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+                            &pitch_A, width, NUM_H));
+    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                  &A_h, nullptr, &C_h,
+                                  width*NUM_H, false);
+    HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, nullptr, C_h);
+
+    int peerAccess = 0;
+    HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
+    if (!peerAccess) {
+      SUCCEED("Skipped the test as there is no peer access");
+    } else {
+      HIP_CHECK(hipSetDevice(1));
+      TestType *E_d;
+      size_t pitch_E;
+      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&E_d),
+            &pitch_E, width, NUM_H));
+
+      // Initializing A_d  with C_h
+      HIP_CHECK(hipMemcpy2D(A_d, pitch_A, C_h, width,
+            NUM_W*sizeof(TestType), NUM_H, hipMemcpyHostToDevice));
+
+      // Device to Device
+      hip_Memcpy2D desc = {};
+      desc.srcMemoryType = hipMemoryTypeDevice;
+      desc.srcHost = A_d;
+      desc.srcDevice = hipDeviceptr_t(A_d);
+      desc.srcPitch = pitch_A;
+      desc.dstMemoryType = hipMemoryTypeDevice;
+      desc.dstHost = E_d;
+      desc.dstDevice = hipDeviceptr_t(E_d);
+      desc.dstPitch = pitch_E;
+      desc.WidthInBytes = NUM_W*sizeof(TestType);
+      desc.Height = NUM_H;
+      REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
+      HIP_CHECK(hipStreamSynchronize(stream));
+
+      // Copying the result E_d to A_h host variable
+      HIP_CHECK(hipMemcpy2D(A_h, width, E_d, pitch_E,
+            NUM_W*sizeof(TestType), NUM_H, hipMemcpyDeviceToHost));
+
+      // Validating the result
+      REQUIRE(HipTest::checkArray<TestType>(A_h, C_h, NUM_W, NUM_H) == true);
+
+      // DeAllocating the memory
+      HIP_CHECK(hipFree(A_d));
+      HIP_CHECK(hipStreamDestroy(stream));
+      HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+          A_h, nullptr, C_h, false);
+    }
+  } else {
+    SUCCEED("skipping the testcases as numDevices < 2");
+  }
+}
+
+/*
+ * This testcase verifies H2D & D2H functionality of hipMemcpyParam2DAsync API
+ * H2D case:
+ * Input: "C_h" host variable initialized with default data
+ * Output: "A_d" device variable
+ *
+ * D2H case:
+ * Input: "A_d" device variable from the previous output
+ * OutPut: "A_h" variable
+ *
+ * Validating the result by comparing "A_h" to "C_h"
+ */
+TEMPLATE_TEST_CASE("Unit_hipMemcpyParam2DAsync_multiDevice-H2D-D2H", "[hipMemcpyParam2DAsync]",
+                   char, int, float, double, long double) {
+  CHECK_IMAGE_SUPPORT
+
+  // 1 refers to pinned host memory and 0 refers
+  // to unpinned memory
+  auto memory_type = GENERATE(0, 1);
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    // Allocating and Initializing the data
+    HIP_CHECK(hipSetDevice(0));
+    TestType* A_h{nullptr}, *C_h{nullptr},
+      *A_d{nullptr};
+    size_t pitch_A;
+    size_t width{NUM_W * sizeof(TestType)};
+    hipStream_t stream;
+
+    HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+                            &pitch_A, width, NUM_H));
+
+    // Based on memory type (pinned/unpinned) allocating memory
+    if (memory_type) {
+      HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                    &A_h, nullptr, &C_h,
+                                    width*NUM_H, true);
+    } else {
+      HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                    &A_h, nullptr, &C_h,
+                                    width*NUM_H, false);
+    }
+    HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, nullptr, C_h);
+    int peerAccess = 0;
+    HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
+    if (!peerAccess) {
+      SUCCEED("Skipped the test as there is no peer access");
+    } else {
+      // Host to Device
+      hip_Memcpy2D desc = {};
+      HIP_CHECK(hipStreamCreate(&stream));
+      desc.srcMemoryType = hipMemoryTypeHost;
+      desc.srcHost = C_h;
+      desc.srcDevice = hipDeviceptr_t(C_h);
+      desc.srcPitch = width;
+      desc.dstMemoryType = hipMemoryTypeDevice;
+      desc.dstHost = A_d;
+      desc.dstDevice = hipDeviceptr_t(A_d);
+      desc.dstPitch = pitch_A;
+      desc.WidthInBytes = NUM_W*sizeof(TestType);
+      desc.Height = NUM_H;
+      REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
+      HIP_CHECK(hipStreamSynchronize(stream));
+
+      // Device to Host
+      memset(&desc, 0x0, sizeof(hip_Memcpy2D));
+      desc.srcMemoryType = hipMemoryTypeDevice;
+      desc.srcHost = A_d;
+      desc.srcDevice = hipDeviceptr_t(A_d);
+      desc.srcPitch = pitch_A;
+      desc.dstMemoryType = hipMemoryTypeHost;
+      desc.dstHost = A_h;
+      desc.dstDevice = hipDeviceptr_t(A_h);
+      desc.dstPitch = width;
+      desc.WidthInBytes = NUM_W*sizeof(TestType);
+      desc.Height = NUM_H;
+      REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
+      HIP_CHECK(hipStreamSynchronize(stream));
+
+      // Validating the result
+      REQUIRE(HipTest::checkArray<TestType>(A_h, C_h, NUM_W, NUM_H) == true);
+
+      // DeAllocating the memory
+      HIP_CHECK(hipFree(A_d));
+      HIP_CHECK(hipStreamDestroy(stream));
+      if (memory_type) {
+        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+                                      A_h, nullptr, C_h, true);
+      } else {
+        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+                                      A_h, nullptr, C_h, false);
+      }
+    }
+  } else {
+      SUCCEED("skipping the testcases as numDevices < 2");
+  }
+}
+/*
+ * This testcase verifies the extent validation scenarios
+ */
+TEST_CASE("Unit_hipMemcpyParam2DAsync_ExtentValidation") {
+  CHECK_IMAGE_SUPPORT
+
+  HIP_CHECK(hipSetDevice(0));
+  char* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr},
+      * A_d{nullptr};
+  size_t pitch_A;
+  size_t width{NUM_W * sizeof(char)};
+  constexpr auto memsetval{100};
+  hipStream_t stream;
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  // Allocating and Initializing the data
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+                          &pitch_A, width, NUM_H));
+  HipTest::initArrays<char>(nullptr, nullptr, nullptr,
+                            &A_h, nullptr, &C_h,
+                            width*NUM_H, false);
+  HipTest::initArrays<char>(nullptr, nullptr, nullptr,
+                            &B_h, nullptr, nullptr,
+                            width*NUM_H, false);
+  HipTest::setDefaultData<char>(NUM_W*NUM_H, A_h, nullptr, C_h);
+  HipTest::setDefaultData<char>(NUM_W*NUM_H, B_h, nullptr, nullptr);
+  HIP_CHECK(hipMemset2D(A_d, pitch_A, memsetval, NUM_W, NUM_H));
+
+  // Device to Host
+  hip_Memcpy2D desc = {};
+  desc.srcMemoryType = hipMemoryTypeDevice;
+  desc.srcHost = A_d;
+  desc.srcDevice = hipDeviceptr_t(A_d);
+  desc.srcPitch = pitch_A;
+  desc.dstMemoryType = hipMemoryTypeHost;
+  desc.dstHost = A_h;
+  desc.dstDevice = hipDeviceptr_t(A_h);
+  desc.dstPitch = width;
+  desc.WidthInBytes = NUM_W;
+  desc.Height = NUM_H;
+
+  SECTION("Destination Pitch is 0") {
+    desc.dstPitch = 0;
+    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
+  }
+
+  SECTION("Source Pitch is 0") {
+    desc.srcPitch = 0;
+    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
+  }
+
+  SECTION("Height is 0") {
+    desc.Height = 0;
+    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
+    HIP_CHECK(hipStreamSynchronize(stream));
+    REQUIRE(HipTest::checkArray<char>(A_h, B_h, NUM_W, NUM_H) == true);
+  }
+
+  SECTION("Width is 0") {
+    desc.Height = 0;
+    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) == hipSuccess);
+    HIP_CHECK(hipStreamSynchronize(stream));
+    REQUIRE(HipTest::checkArray<char>(A_h, B_h, NUM_W, NUM_H) == true);
+  }
+
+  // DeAllocating the Memory
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipStreamDestroy(stream));
+  HipTest::freeArrays<char>(nullptr, nullptr, nullptr,
+                                A_h, B_h, C_h, false);
+}
+
+/*
+ * This testcase verifies the negative scenarios
+ */
+TEST_CASE("Unit_hipMemcpyParam2DAsync_Negative") {
+  CHECK_IMAGE_SUPPORT
+
+  HIP_CHECK(hipSetDevice(0));
+  float* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr},
+       * A_d{nullptr};
+  size_t pitch_A;
+  size_t width{NUM_W * sizeof(float)};
+  constexpr auto memsetval{100};
+  hipStream_t stream;
+  HIP_CHECK(hipStreamCreate(&stream));
+
+  // Allocating and Initializing the data
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+                          &pitch_A, width, NUM_H));
+  HipTest::initArrays<float>(nullptr, nullptr, nullptr,
+                                &A_h, &B_h, &C_h,
+                                width*NUM_H, false);
+  HipTest::setDefaultData<float>(NUM_W*NUM_H, A_h, B_h, C_h);
+  HIP_CHECK(hipMemset2D(A_d, pitch_A, memsetval, NUM_W, NUM_H));
+
+  // Device to Host
+  hip_Memcpy2D desc = {};
+  desc.srcMemoryType = hipMemoryTypeDevice;
+  desc.srcHost = A_d;
+  desc.srcDevice = hipDeviceptr_t(A_d);
+  desc.srcPitch = pitch_A;
+  desc.dstMemoryType = hipMemoryTypeHost;
+  desc.dstHost = A_h;
+  desc.dstDevice = hipDeviceptr_t(A_h);
+  desc.dstPitch = width;
+  desc.WidthInBytes = NUM_W;
+  desc.Height = NUM_H;
+
+  SECTION("Null Pointer to Source Device Pointer") {
+    desc.srcDevice = hipDeviceptr_t(nullptr);
+    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) != hipSuccess);
+  }
+
+  SECTION("Null Pointer to Destination Device Pointer") {
+    memset(&desc, 0x0, sizeof(hip_Memcpy2D));
+    desc.srcMemoryType = hipMemoryTypeHost;
+    desc.srcHost = A_h;
+    desc.srcDevice = hipDeviceptr_t(A_h);
+    desc.srcPitch = width;
+    desc.dstMemoryType = hipMemoryTypeDevice;
+    desc.dstHost = A_d;
+    desc.dstDevice = hipDeviceptr_t(nullptr);
+    desc.dstPitch = pitch_A;
+    desc.WidthInBytes = NUM_W;
+    desc.Height = NUM_H;
+
+    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) != hipSuccess);
+  }
+
+  SECTION("Null Pointer to both Src & Dst Device Pointer") {
+    desc.srcDevice = hipDeviceptr_t(nullptr);
+    desc.dstDevice = hipDeviceptr_t(nullptr);
+    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) != hipSuccess);
+  }
+
+  SECTION("Width > src/dest pitches") {
+    desc.WidthInBytes = pitch_A+1;
+    REQUIRE(hipMemcpyParam2DAsync(&desc, stream) != hipSuccess);
+  }
+
+  // DeAllocating the memory
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipStreamSynchronize(stream));
+  HIP_CHECK(hipStreamDestroy(stream));
+  HipTest::freeArrays<float>(nullptr, nullptr, nullptr,
+                                A_h, B_h, C_h, false);
+}
@@ -0,0 +1,337 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+This testfile verifies the following scenarios of hipMemcpyParam2D API
+1. Negative Scenarios
+2. Extent Validation Scenarios
+3. D2D copy for different datatypes
+4. H2D and D2H copy for different datatypes
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+
+static constexpr size_t NUM_W{10};
+static constexpr size_t NUM_H{10};
+/*
+ * This testcase verifies D2D functionality of hipMemcpyParam2D API
+ * Input: Intializing "A_d" device variable with "C_h" host variable
+ * Output: "A_d" device variable to "E_d" device variable
+ *
+ * Validating the result by copying "E_d" to "A_h" and checking
+ * it with the initalized data "C_h".
+ *
+ */
+TEMPLATE_TEST_CASE("Unit_hipMemcpyParam2D_multiDevice-D2D", "[hipMemcpyParam2D]", char, float, int,
+                   double, long double) {
+  CHECK_IMAGE_SUPPORT
+
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    // Initialize and Allocating Memory
+    HIP_CHECK(hipSetDevice(0));
+    TestType* A_h{nullptr}, *C_h{nullptr}, *A_d{nullptr};
+    size_t pitch_A;
+    size_t width{NUM_W * sizeof(TestType)};
+    HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+                            &pitch_A, width, NUM_H));
+    HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                  &A_h, nullptr, &C_h,
+                                  width*NUM_H, false);
+    HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, nullptr, C_h);
+
+    int peerAccess = 0;
+    HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
+    if (!peerAccess) {
+      SUCCEED("Skipped the test as there is no peer access");
+    } else {
+      HIP_CHECK(hipSetDevice(1));
+      char *E_d;
+      size_t pitch_E;
+      HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&E_d),
+            &pitch_E, width, NUM_H));
+
+      // Initalizing A_d with C_h
+      HIP_CHECK(hipMemcpy2D(A_d, pitch_A, C_h, width,
+            NUM_W * sizeof(TestType), NUM_H, hipMemcpyHostToDevice));
+
+      // Device to Device
+      hip_Memcpy2D desc = {};
+      desc.srcMemoryType = hipMemoryTypeDevice;
+      desc.srcHost = A_d;
+      desc.srcDevice = hipDeviceptr_t(A_d);
+      desc.srcPitch = pitch_A;
+      desc.dstMemoryType = hipMemoryTypeDevice;
+      desc.dstHost = E_d;
+      desc.dstDevice = hipDeviceptr_t(E_d);
+      desc.dstPitch = pitch_E;
+      desc.WidthInBytes = NUM_W * sizeof(TestType);
+      desc.Height = NUM_H;
+      REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
+
+      // Copying E_d to A_h
+      HIP_CHECK(hipMemcpy2D(A_h, width, E_d, pitch_E,
+            NUM_W * sizeof(TestType), NUM_H,
+            hipMemcpyDeviceToHost));
+
+      // Validating the result
+      REQUIRE(HipTest::checkArray<TestType>(A_h, C_h, NUM_W, NUM_H) == true);
+
+      // DeAllocating the memory
+      HIP_CHECK(hipFree(A_d));
+      HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+                                    A_h, nullptr, C_h, false);
+    }
+  } else {
+    SUCCEED("skipping the testcases as numDevices < 2");
+  }
+}
+
+/*
+ * This testcase verifies H2D & D2H functionality of hipMemcpyParam2D API
+ * H2D case:
+ * Input: "C_h" host variable initialized with default data
+ * Output: "A_d" device variable
+ *
+ * D2H case:
+ * Input: "A_d" device variable from the previous output
+ * OutPut: "A_h" variable
+ *
+ * Validating the result by comparing "A_h" to "C_h"
+ */
+TEMPLATE_TEST_CASE("Unit_hipMemcpyParam2D_multiDevice-H2D-D2H", "[hipMemcpyParam2D]", char, float,
+                   int, double, long double) {
+  CHECK_IMAGE_SUPPORT
+
+  // 1 refers to pinned host memory and 0 refers
+  // to unpinned memory
+  auto memory_type = GENERATE(0, 1);
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices > 1) {
+    HIP_CHECK(hipSetDevice(0));
+
+    // Initialize and Allocating Memory
+    TestType* A_h{nullptr}, *C_h{nullptr},
+             *A_d{nullptr};
+    size_t pitch_A;
+    size_t width{NUM_W * sizeof(TestType)};
+
+    HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+                            &pitch_A, width, NUM_H));
+
+    // Based on memory type (pinned/unpinned) allocating memory
+    if (memory_type) {
+      HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                    &A_h, nullptr, &C_h,
+                                    width*NUM_H, true);
+    } else {
+      HipTest::initArrays<TestType>(nullptr, nullptr, nullptr,
+                                    &A_h, nullptr, &C_h,
+                                    width*NUM_H, false);
+    }
+    HipTest::setDefaultData<TestType>(NUM_W*NUM_H, A_h, nullptr, C_h);
+    int peerAccess = 0;
+    HIP_CHECK(hipDeviceCanAccessPeer(&peerAccess, 1, 0));
+    if (!peerAccess) {
+      SUCCEED("Skipped the test as there is no peer access");
+    } else {
+      // Host to Device
+      hip_Memcpy2D desc = {};
+      desc.srcMemoryType = hipMemoryTypeHost;
+      desc.srcHost = C_h;
+      desc.srcDevice = hipDeviceptr_t(C_h);
+      desc.srcPitch = width;
+      desc.dstMemoryType = hipMemoryTypeDevice;
+      desc.dstHost = A_d;
+      desc.dstDevice = hipDeviceptr_t(A_d);
+      desc.dstPitch = pitch_A;
+      desc.WidthInBytes = NUM_W*sizeof(TestType);
+      desc.Height = NUM_H;
+      REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
+
+      // Device to Host
+      memset(&desc, 0x0, sizeof(hip_Memcpy2D));
+      desc.srcMemoryType = hipMemoryTypeDevice;
+      desc.srcHost = A_d;
+      desc.srcDevice = hipDeviceptr_t(A_d);
+      desc.srcPitch = pitch_A;
+      desc.dstMemoryType = hipMemoryTypeHost;
+      desc.dstHost = A_h;
+      desc.dstDevice = hipDeviceptr_t(A_h);
+      desc.dstPitch = width;
+      desc.WidthInBytes = NUM_W*sizeof(TestType);
+      desc.Height = NUM_H;
+      REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
+
+      // Validating the result
+      REQUIRE(HipTest::checkArray<TestType>(A_h, C_h, NUM_W, NUM_H) == true);
+
+      // DeAllocating the Memory
+      HIP_CHECK(hipFree(A_d));
+      if (memory_type) {
+        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+                                      A_h, nullptr, C_h, true);
+      } else {
+        HipTest::freeArrays<TestType>(nullptr, nullptr, nullptr,
+                                      A_h, nullptr, C_h, false);
+      }
+    }
+  } else {
+    SUCCEED("skipping the testcases as numDevices < 2");
+  }
+}
+/*
+ * This testcase verifies the extent validation scenarios
+ */
+TEST_CASE("Unit_hipMemcpyParam2D_ExtentValidation") {
+  CHECK_IMAGE_SUPPORT
+
+  // Allocating memory and Initializing the data
+  HIP_CHECK(hipSetDevice(0));
+  char* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr},
+    * A_d{nullptr};
+  size_t pitch_A;
+  size_t width{NUM_W * sizeof(char)};
+  constexpr auto memsetval{100};
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+        &pitch_A, width, NUM_H));
+  HipTest::initArrays<char>(nullptr, nullptr, nullptr,
+      &A_h, nullptr, &C_h,
+      width*NUM_H, false);
+  HipTest::initArrays<char>(nullptr, nullptr, nullptr,
+      &B_h, nullptr, nullptr,
+      width*NUM_H, false);
+  HipTest::setDefaultData<char>(NUM_W*NUM_H, A_h, nullptr, C_h);
+  HipTest::setDefaultData<char>(NUM_W*NUM_H, B_h, nullptr, nullptr);
+  HIP_CHECK(hipMemset2D(A_d, pitch_A, memsetval, NUM_W, NUM_H));
+
+  // Device to Host
+  hip_Memcpy2D desc = {};
+  desc.srcMemoryType = hipMemoryTypeDevice;
+  desc.srcHost = A_d;
+  desc.srcDevice = hipDeviceptr_t(A_d);
+  desc.srcPitch = pitch_A;
+  desc.dstMemoryType = hipMemoryTypeHost;
+  desc.dstHost = A_h;
+  desc.dstDevice = hipDeviceptr_t(A_h);
+  desc.dstPitch = width;
+  desc.WidthInBytes = NUM_W;
+  desc.Height = NUM_H;
+
+  SECTION("Destination Pitch is 0") {
+    desc.dstPitch = 0;
+    REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
+  }
+
+  SECTION("Source Pitch is 0") {
+    desc.srcPitch = 0;
+    REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
+  }
+
+  SECTION("Height is 0") {
+    desc.Height = 0;
+    REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
+    REQUIRE(HipTest::checkArray<char>(A_h, B_h, NUM_W, NUM_H) == true);
+  }
+
+  SECTION("Width is 0") {
+    desc.WidthInBytes = 0;
+    REQUIRE(hipMemcpyParam2D(&desc) == hipSuccess);
+    REQUIRE(HipTest::checkArray<char>(A_h, B_h, NUM_W, NUM_H) == true);
+  }
+
+  // DeAllocating the Memory
+  HIP_CHECK(hipFree(A_d));
+  HipTest::freeArrays<char>(nullptr, nullptr, nullptr,
+                                A_h, B_h, C_h, false);
+}
+
+/*
+ * This testcase verifies the negative scenarios
+ */
+TEST_CASE("Unit_hipMemcpyParam2D_Negative") {
+  CHECK_IMAGE_SUPPORT
+
+  HIP_CHECK(hipSetDevice(0));
+
+  // Allocating and Initializing the data
+  float* A_h{nullptr}, *B_h{nullptr}, *C_h{nullptr},
+       * A_d{nullptr};
+  size_t pitch_A;
+  size_t width{NUM_W * sizeof(float)};
+  constexpr auto memsetval{100};
+  HIP_CHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d),
+                          &pitch_A, width, NUM_H));
+  HipTest::initArrays<float>(nullptr, nullptr, nullptr,
+                                &A_h, &B_h, &C_h,
+                                width*NUM_H, false);
+  HipTest::setDefaultData<float>(NUM_W*NUM_H, A_h, B_h, C_h);
+  HIP_CHECK(hipMemset2D(A_d, pitch_A, memsetval, NUM_W, NUM_H));
+
+  hip_Memcpy2D desc = {};
+  desc.srcMemoryType = hipMemoryTypeDevice;
+  desc.srcHost = A_d;
+  desc.srcDevice = hipDeviceptr_t(A_d);
+  desc.srcPitch = pitch_A;
+  desc.dstMemoryType = hipMemoryTypeHost;
+  desc.dstHost = A_h;
+  desc.dstDevice = hipDeviceptr_t(A_h);
+  desc.dstPitch = width;
+  desc.WidthInBytes = NUM_W;
+  desc.Height = NUM_H;
+
+  SECTION("Null Pointer to Source Device Pointer") {
+    desc.srcDevice = hipDeviceptr_t(nullptr);
+    REQUIRE(hipMemcpyParam2D(&desc) != hipSuccess);
+  }
+
+  SECTION("Null Pointer to Destination Device Pointer") {
+    memset(&desc, 0x0, sizeof(hip_Memcpy2D));
+    desc.srcMemoryType = hipMemoryTypeHost;
+    desc.srcHost = A_h;
+    desc.srcDevice = hipDeviceptr_t(A_h);
+    desc.srcPitch = width;
+    desc.dstMemoryType = hipMemoryTypeDevice;
+    desc.dstHost = A_d;
+    desc.dstDevice = hipDeviceptr_t(nullptr);
+    desc.dstPitch = pitch_A;
+    desc.WidthInBytes = NUM_W;
+    desc.Height = NUM_H;
+    REQUIRE(hipMemcpyParam2D(&desc) != hipSuccess);
+  }
+
+  SECTION("Null Pointer to both Src & Dst Device Pointer") {
+    desc.srcDevice = hipDeviceptr_t(nullptr);
+    desc.dstDevice = hipDeviceptr_t(nullptr);
+    REQUIRE(hipMemcpyParam2D(&desc) != hipSuccess);
+  }
+
+  SECTION("Width > src/dest pitches") {
+    desc.WidthInBytes = pitch_A+1;
+    REQUIRE(hipMemcpyParam2D(&desc) != hipSuccess);
+  }
+
+  // DeAllocating the Memory
+  HIP_CHECK(hipFree(A_d));
+  HipTest::freeArrays<float>(nullptr, nullptr, nullptr,
+                                A_h, B_h, C_h, false);
+}
@@ -0,0 +1,521 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <variant>
+
+#include <hip_test_common.hh>
+#include <hip/hip_runtime_api.h>
+#include <utils.hh>
+#include <resource_guards.hh>
+#include <hip/driver_types.h>
+
+template <bool should_synchronize, typename F>
+void Memcpy2DDeviceToHostShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
+  const auto kind = GENERATE(hipMemcpyDeviceToHost, hipMemcpyDefault);
+
+  constexpr size_t cols = 127;
+  constexpr size_t rows = 128;
+
+  LinearAllocGuard2D<int> device_alloc(cols, rows);
+
+  const size_t host_pitch = GENERATE_REF(device_alloc.width(), device_alloc.width() + 64);
+  LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, host_pitch * rows);
+
+  const dim3 threads_per_block(32, 32);
+  const dim3 blocks(cols / threads_per_block.x + 1, rows / threads_per_block.y + 1);
+  Iota<<<blocks, threads_per_block>>>(device_alloc.ptr(), device_alloc.pitch(),
+                                      device_alloc.width_logical(), device_alloc.height(), 1);
+  HIP_CHECK(hipGetLastError());
+
+  HIP_CHECK(memcpy_func(host_alloc.ptr(), host_pitch, device_alloc.ptr(), device_alloc.pitch(),
+                        device_alloc.width(), device_alloc.height(), kind));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  constexpr auto f = [](size_t x, size_t y, size_t z) { return z * cols * rows + y * cols + x; };
+  PitchedMemoryVerify(host_alloc.ptr(), host_pitch, device_alloc.width_logical(),
+                      device_alloc.height(), 1, f);
+}
+
+template <bool should_synchronize, bool enable_peer_access, typename F>
+void Memcpy2DDeviceToDeviceShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
+  const auto kind = GENERATE(hipMemcpyDeviceToDevice, hipMemcpyDefault);
+
+  constexpr size_t cols = 127;
+  constexpr size_t rows = 128;
+
+  const auto device_count = HipTest::getDeviceCount();
+  const auto src_device = GENERATE_COPY(range(0, device_count));
+  const auto dst_device = GENERATE_COPY(range(0, device_count));
+  const size_t src_cols_mult = GENERATE(1, 2);
+
+  INFO("Src device: " << src_device << ", Dst device: " << dst_device);
+
+  HIP_CHECK(hipSetDevice(src_device));
+  if constexpr (enable_peer_access) {
+    if (src_device == dst_device) {
+      return;
+    }
+    int can_access_peer = 0;
+    HIP_CHECK(hipDeviceCanAccessPeer(&can_access_peer, src_device, dst_device));
+    if (!can_access_peer) {
+      INFO("Peer access cannot be enabled between devices " << src_device << " " << dst_device);
+      REQUIRE(can_access_peer);
+    }
+    HIP_CHECK(hipDeviceEnablePeerAccess(dst_device, 0));
+  }
+
+  LinearAllocGuard2D<int> src_alloc(cols * src_cols_mult, rows);
+  HIP_CHECK(hipSetDevice(src_device));
+  LinearAllocGuard2D<int> dst_alloc(cols, rows);
+  HIP_CHECK(hipSetDevice(src_device));
+  LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, dst_alloc.width() * rows);
+
+  const dim3 threads_per_block(32, 32);
+  const dim3 blocks(cols / threads_per_block.x + 1, rows / threads_per_block.y + 1);
+  // Using dst_alloc width and height to set only the elements that will be copied over to
+  // dst_alloc
+  Iota<<<blocks, threads_per_block>>>(src_alloc.ptr(), src_alloc.pitch(), dst_alloc.width_logical(),
+                                      dst_alloc.height(), 1);
+  HIP_CHECK(hipGetLastError());
+
+  HIP_CHECK(memcpy_func(dst_alloc.ptr(), dst_alloc.pitch(), src_alloc.ptr(), src_alloc.pitch(),
+                        dst_alloc.width(), dst_alloc.height(), kind));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  HIP_CHECK(hipMemcpy2D(host_alloc.ptr(), dst_alloc.width(), dst_alloc.ptr(), dst_alloc.pitch(),
+                        dst_alloc.width(), dst_alloc.height(), hipMemcpyDeviceToHost));
+  constexpr auto f = [](size_t x, size_t y, size_t z) { return z * cols * rows + y * cols + x; };
+  PitchedMemoryVerify(host_alloc.ptr(), dst_alloc.width(), dst_alloc.width_logical(),
+                      dst_alloc.height(), 1, f);
+}
+
+template <bool should_synchronize, typename F>
+void Memcpy2DHostToDeviceShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
+  const auto kind = GENERATE(hipMemcpyHostToDevice, hipMemcpyDefault);
+
+  constexpr size_t cols = 127;
+  constexpr size_t rows = 128;
+
+  LinearAllocGuard2D<int> device_alloc(cols, rows);
+
+  const size_t host_pitch = GENERATE_REF(device_alloc.pitch(), 2 * device_alloc.pitch());
+
+  LinearAllocGuard<int> src_host_alloc(LinearAllocs::hipHostMalloc, host_pitch * rows);
+  LinearAllocGuard<int> dst_host_alloc(LinearAllocs::hipHostMalloc, device_alloc.width() * rows);
+
+  constexpr auto f = [](size_t x, size_t y, size_t z) { return z * cols * rows + y * cols + x; };
+  PitchedMemorySet(src_host_alloc.ptr(), host_pitch, device_alloc.width_logical(),
+                   device_alloc.height(), 1, f);
+
+  std::fill_n(dst_host_alloc.ptr(), device_alloc.width_logical() * rows, 0);
+
+  HIP_CHECK(memcpy_func(device_alloc.ptr(), device_alloc.pitch(), src_host_alloc.ptr(), host_pitch,
+                        device_alloc.width(), device_alloc.height(), kind));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  HIP_CHECK(hipMemcpy2D(dst_host_alloc.ptr(), device_alloc.width(), device_alloc.ptr(),
+                        device_alloc.pitch(), device_alloc.width(), device_alloc.height(),
+                        hipMemcpyDeviceToHost));
+
+  PitchedMemoryVerify(dst_host_alloc.ptr(), device_alloc.width(), device_alloc.width_logical(),
+                      device_alloc.height(), 1, f);
+}
+
+template <bool should_synchronize, typename F>
+void Memcpy2DHostToHostShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
+  const auto kind = GENERATE(hipMemcpyHostToHost, hipMemcpyDefault);
+
+  constexpr size_t cols = 127;
+  constexpr size_t rows = 128;
+
+  const size_t src_pitch = GENERATE_REF(cols * sizeof(int), cols * sizeof(int) + 64);
+
+  LinearAllocGuard<int> src_host(LinearAllocs::hipHostMalloc, src_pitch * rows);
+  LinearAllocGuard<int> dst_host(LinearAllocs::hipHostMalloc, cols * sizeof(int) * rows);
+
+  constexpr auto f = [](size_t x, size_t y, size_t z) { return z * cols * rows + y * cols + x; };
+  PitchedMemorySet(src_host.ptr(), src_pitch, cols, rows, 1, f);
+
+  HIP_CHECK(memcpy_func(dst_host.ptr(), cols * sizeof(int), src_host.ptr(), src_pitch,
+                        cols * sizeof(int), rows, kind));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  PitchedMemoryVerify(dst_host.ptr(), cols * sizeof(int), cols, rows, 1, f);
+}
+
+// Synchronization behavior checks
+template <typename F>
+void MemcpySyncBehaviorCheck(F memcpy_func, const bool should_sync,
+                             const hipStream_t kernel_stream) {
+  LaunchDelayKernel(std::chrono::milliseconds{300}, kernel_stream);
+  HIP_CHECK(memcpy_func());
+  if (should_sync) {
+    HIP_CHECK(hipStreamQuery(kernel_stream));
+  } else {
+    HIP_CHECK_ERROR(hipStreamQuery(kernel_stream), hipErrorNotReady);
+  }
+}
+
+template <typename F>
+void Memcpy2DHtoDSyncBehavior(F memcpy_func, const bool should_sync,
+                              const hipStream_t kernel_stream = nullptr) {
+  using LA = LinearAllocs;
+  const auto host_alloc_type = GENERATE(LA::malloc, LA::hipHostMalloc);
+  LinearAllocGuard<int> host_alloc(host_alloc_type, 32 * sizeof(int) * 32);
+  LinearAllocGuard2D<int> device_alloc(32, 32);
+  MemcpySyncBehaviorCheck(std::bind(memcpy_func, device_alloc.ptr(), device_alloc.pitch(),
+                                    host_alloc.ptr(), device_alloc.width(), device_alloc.width(),
+                                    device_alloc.height(), hipMemcpyHostToDevice),
+                          should_sync, kernel_stream);
+}
+
+template <typename F>
+void Memcpy2DDtoHPageableSyncBehavior(F memcpy_func, const bool should_sync,
+                                      const hipStream_t kernel_stream = nullptr) {
+  LinearAllocGuard<int> host_alloc(LinearAllocs::malloc, 32 * sizeof(int) * 32);
+  LinearAllocGuard2D<int> device_alloc(32, 32);
+  MemcpySyncBehaviorCheck(std::bind(memcpy_func, host_alloc.ptr(), device_alloc.width(),
+                                    device_alloc.ptr(), device_alloc.pitch(), device_alloc.width(),
+                                    device_alloc.height(), hipMemcpyDeviceToHost),
+                          should_sync, kernel_stream);
+}
+
+template <typename F>
+void Memcpy2DDtoHPinnedSyncBehavior(F memcpy_func, const bool should_sync,
+                                    const hipStream_t kernel_stream = nullptr) {
+  LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc, 32 * sizeof(int) * 32);
+  LinearAllocGuard2D<int> device_alloc(32, 32);
+  MemcpySyncBehaviorCheck(std::bind(memcpy_func, host_alloc.ptr(), device_alloc.width(),
+                                    device_alloc.ptr(), device_alloc.pitch(), device_alloc.width(),
+                                    device_alloc.height(), hipMemcpyDeviceToHost),
+                          should_sync, kernel_stream);
+}
+
+template <typename F>
+void Memcpy2DDtoDSyncBehavior(F memcpy_func, const bool should_sync,
+                              const hipStream_t kernel_stream = nullptr) {
+  LinearAllocGuard2D<int> src_alloc(32, 32);
+  LinearAllocGuard2D<int> dst_alloc(32, 32);
+  MemcpySyncBehaviorCheck(
+      std::bind(memcpy_func, dst_alloc.ptr(), dst_alloc.pitch(), src_alloc.ptr(), src_alloc.pitch(),
+                dst_alloc.width(), dst_alloc.height(), hipMemcpyDeviceToDevice),
+      should_sync, kernel_stream);
+}
+
+template <typename F>
+void Memcpy2DHtoHSyncBehavior(F memcpy_func, const bool should_sync,
+                              const hipStream_t kernel_stream = nullptr) {
+  using LA = LinearAllocs;
+  const auto src_alloc_type = GENERATE(LA::malloc, LA::hipHostMalloc);
+  const auto dst_alloc_type = GENERATE(LA::malloc, LA::hipHostMalloc);
+
+  LinearAllocGuard<int> src_alloc(src_alloc_type, 32 * sizeof(int) * 32);
+  LinearAllocGuard<int> dst_alloc(dst_alloc_type, 32 * sizeof(int) * 32);
+  MemcpySyncBehaviorCheck(std::bind(memcpy_func, dst_alloc.ptr(), 32 * sizeof(int), src_alloc.ptr(),
+                                    32 * sizeof(int), 32 * sizeof(int), 32, hipMemcpyHostToHost),
+                          should_sync, kernel_stream);
+}
+
+template <bool should_synchronize, typename F>
+void Memcpy2DZeroWidthHeight(F memcpy_func, const hipStream_t stream = nullptr) {
+  constexpr size_t cols = 63;
+  constexpr size_t rows = 64;
+
+  const auto [width_mult, height_mult] =
+      GENERATE(std::make_pair(0, 1), std::make_pair(1, 0), std::make_pair(0, 0));
+
+  SECTION("Device to Host") {
+    LinearAllocGuard2D<uint8_t> device_alloc(cols, rows);
+    LinearAllocGuard<uint8_t> host_alloc(LinearAllocs::hipHostMalloc, device_alloc.width() * rows);
+    std::fill_n(host_alloc.ptr(), device_alloc.width_logical() * device_alloc.height(), 42);
+    HIP_CHECK(hipMemset2D(device_alloc.ptr(), device_alloc.pitch(), 1, device_alloc.width(),
+                          device_alloc.height()));
+
+    HIP_CHECK(memcpy_func(host_alloc.ptr(), device_alloc.width(), device_alloc.ptr(),
+                          device_alloc.pitch(), device_alloc.width() * width_mult,
+                          device_alloc.height() * height_mult, hipMemcpyDeviceToHost));
+    if constexpr (should_synchronize) {
+      HIP_CHECK(hipStreamSynchronize(stream));
+    }
+    ArrayFindIfNot(host_alloc.ptr(), static_cast<uint8_t>(42),
+                   device_alloc.width_logical() * device_alloc.height());
+  }
+
+  SECTION("Device to Device") {
+    LinearAllocGuard2D<uint8_t> src_alloc(cols, rows);
+    LinearAllocGuard2D<uint8_t> dst_alloc(cols, rows);
+    LinearAllocGuard<uint8_t> host_alloc(LinearAllocs::hipHostMalloc, dst_alloc.width() * rows);
+    HIP_CHECK(
+        hipMemset2D(src_alloc.ptr(), src_alloc.pitch(), 1, src_alloc.width(), src_alloc.height()));
+    HIP_CHECK(
+        hipMemset2D(dst_alloc.ptr(), dst_alloc.pitch(), 42, dst_alloc.width(), dst_alloc.height()));
+    HIP_CHECK(memcpy_func(dst_alloc.ptr(), dst_alloc.pitch(), src_alloc.ptr(), src_alloc.pitch(),
+                          dst_alloc.width() * width_mult, dst_alloc.height() * height_mult,
+                          hipMemcpyDeviceToDevice));
+    if constexpr (should_synchronize) {
+      HIP_CHECK(hipStreamSynchronize(stream));
+    }
+    HIP_CHECK(hipMemcpy2D(host_alloc.ptr(), dst_alloc.width(), dst_alloc.ptr(), dst_alloc.pitch(),
+                          dst_alloc.width(), dst_alloc.height(), hipMemcpyDeviceToHost));
+    ArrayFindIfNot(host_alloc.ptr(), static_cast<uint8_t>(42),
+                   dst_alloc.width_logical() * dst_alloc.height());
+  }
+
+  SECTION("Host to Device") {
+    LinearAllocGuard2D<uint8_t> device_alloc(cols, rows);
+    LinearAllocGuard<uint8_t> src_host_alloc(LinearAllocs::hipHostMalloc,
+                                             device_alloc.width() * rows);
+    LinearAllocGuard<uint8_t> dst_host_alloc(LinearAllocs::hipHostMalloc,
+                                             device_alloc.width() * rows);
+    std::fill_n(src_host_alloc.ptr(), device_alloc.width_logical() * device_alloc.height(), 1);
+    HIP_CHECK(hipMemset2D(device_alloc.ptr(), device_alloc.pitch(), 42, device_alloc.width(),
+                          device_alloc.height()));
+    HIP_CHECK(memcpy_func(device_alloc.ptr(), device_alloc.pitch(), src_host_alloc.ptr(),
+                          device_alloc.width(), device_alloc.width() * width_mult,
+                          device_alloc.height() * height_mult, hipMemcpyHostToDevice));
+    if constexpr (should_synchronize) {
+      HIP_CHECK(hipStreamSynchronize(stream));
+    }
+    HIP_CHECK(hipMemcpy2D(dst_host_alloc.ptr(), device_alloc.width(), device_alloc.ptr(),
+                          device_alloc.pitch(), device_alloc.width(), device_alloc.height(),
+                          hipMemcpyDeviceToHost));
+    ArrayFindIfNot(dst_host_alloc.ptr(), static_cast<uint8_t>(42),
+                   device_alloc.width_logical() * device_alloc.height());
+  }
+
+  SECTION("Host to Host") {
+    const auto alloc_size = cols * rows;
+    LinearAllocGuard<uint8_t> src_alloc(LinearAllocs::hipHostMalloc, alloc_size);
+    LinearAllocGuard<uint8_t> dst_alloc(LinearAllocs::hipHostMalloc, alloc_size);
+    std::fill_n(src_alloc.ptr(), alloc_size, 1);
+    std::fill_n(dst_alloc.ptr(), alloc_size, 42);
+    HIP_CHECK(memcpy_func(dst_alloc.ptr(), cols, src_alloc.ptr(), cols, cols * width_mult,
+                          rows * height_mult, hipMemcpyHostToHost));
+    if constexpr (should_synchronize) {
+      HIP_CHECK(hipStreamSynchronize(stream));
+    }
+    ArrayFindIfNot(dst_alloc.ptr(), static_cast<uint8_t>(42), alloc_size);
+  }
+}
+
+constexpr auto MemTypeHost() {
+  return hipMemoryTypeHost;
+}
+
+constexpr auto MemTypeDevice() {
+  return hipMemoryTypeDevice;
+}
+
+constexpr auto MemTypeArray() {
+  return hipMemoryTypeArray;
+}
+
+constexpr auto MemTypeUnified() {
+  return hipMemoryTypeUnified;
+}
+
+using PtrVariant = std::variant<void*, hipArray_t>;
+
+template <bool async = false>
+constexpr auto MemcpyParam2DAdapter(const hipExtent src_offset = {0, 0, 0},
+                                    const hipExtent dst_offset = {0, 0, 0}) {
+  return [=](PtrVariant dst, size_t dpitch, PtrVariant src, size_t spitch, size_t width,
+             size_t height, hipMemcpyKind kind, hipStream_t stream = nullptr) {
+    hip_Memcpy2D parms = {};
+    memset(&parms, 0x0, sizeof(hip_Memcpy2D));
+
+    if (std::holds_alternative<hipArray_t>(dst)) {
+      parms.dstMemoryType = MemTypeArray();
+      parms.dstArray = std::get<hipArray_t>(dst);
+    } else {
+      parms.dstPitch = dpitch;
+      auto ptr = std::get<void*>(dst);
+      switch (kind) {
+        case hipMemcpyDeviceToHost:
+        case hipMemcpyHostToHost:
+          parms.dstMemoryType = MemTypeHost();
+          parms.dstHost = ptr;
+          break;
+        case hipMemcpyDeviceToDevice:
+        case hipMemcpyHostToDevice:
+          parms.dstMemoryType = MemTypeDevice();
+          parms.dstDevice = reinterpret_cast<hipDeviceptr_t>(ptr);
+          break;
+        case hipMemcpyDefault:
+          parms.dstMemoryType = MemTypeUnified();
+          parms.dstDevice = reinterpret_cast<hipDeviceptr_t>(ptr);
+          break;
+        default:
+          assert(false);
+      }
+    }
+
+    if (std::holds_alternative<hipArray_t>(src)) {
+      parms.srcMemoryType = MemTypeArray();
+      parms.srcArray = std::get<hipArray_t>(src);
+    } else {
+      parms.srcPitch = spitch;
+      auto ptr = std::get<void*>(src);
+      switch (kind) {
+        case hipMemcpyDeviceToHost:
+        case hipMemcpyDeviceToDevice:
+          parms.srcMemoryType = MemTypeDevice();
+          parms.srcDevice = reinterpret_cast<hipDeviceptr_t>(ptr);
+          break;
+        case hipMemcpyHostToDevice:
+        case hipMemcpyHostToHost:
+          parms.srcMemoryType = MemTypeHost();
+          parms.srcHost = ptr;
+          break;
+        case hipMemcpyDefault:
+          parms.srcMemoryType = MemTypeUnified();
+          parms.srcDevice = reinterpret_cast<hipDeviceptr_t>(ptr);
+          break;
+        default:
+          assert(false);
+      }
+    }
+
+    parms.WidthInBytes = width;
+    parms.Height = height;
+    parms.srcXInBytes = src_offset.width;
+    parms.srcY = src_offset.height;
+    parms.dstXInBytes = dst_offset.width;
+    parms.dstY = dst_offset.height;
+
+    if constexpr (async) {
+      return hipMemcpyParam2DAsync(&parms, stream);
+    } else {
+      return hipMemcpyParam2D(&parms);
+    }
+  };
+}
+
+template <bool should_synchronize, typename F>
+void MemcpyParam2DArrayHostShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
+  hipExtent extent{127 * sizeof(int), 128, 1};
+
+  LinearAllocGuard<int> src_host(LinearAllocs::hipHostMalloc,
+                                 extent.width * extent.height * extent.depth);
+  LinearAllocGuard<int> dst_host(LinearAllocs::hipHostMalloc,
+                                 extent.width * extent.height * extent.depth);
+
+  DrvArrayAllocGuard<int> src_array(extent);
+  DrvArrayAllocGuard<int> dst_array(extent);
+
+  const auto f = [extent](size_t x, size_t y, size_t z) {
+    auto width_logical = extent.width / sizeof(int);
+    return z * width_logical * extent.height + y * width_logical + x;
+  };
+  PitchedMemorySet(src_host.ptr(), extent.width, extent.width / sizeof(int), extent.height,
+                   extent.depth, f);
+
+  // Host -> Array
+  HIP_CHECK(memcpy_func(src_array.ptr(), 0, src_host.ptr(), extent.width, extent.width,
+                        extent.height, hipMemcpyHostToDevice, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  // Array -> Array
+  HIP_CHECK(memcpy_func(dst_array.ptr(), 0, src_array.ptr(), 0, extent.width, extent.height,
+                        hipMemcpyDeviceToDevice, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  // Array -> Host
+  HIP_CHECK(memcpy_func(dst_host.ptr(), extent.width, dst_array.ptr(), 0, extent.width,
+                        extent.height, hipMemcpyDeviceToHost, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  PitchedMemoryVerify(dst_host.ptr(), extent.width, extent.width / sizeof(int), extent.height,
+                      extent.depth, f);
+}
+
+template <bool should_synchronize, typename F>
+void MemcpyParam2DArrayDeviceShell(F memcpy_func, const hipStream_t kernel_stream = nullptr) {
+  hipExtent extent{127 * sizeof(int), 128, 1};
+
+  LinearAllocGuard<int> host_alloc(LinearAllocs::hipHostMalloc,
+                                   extent.width * extent.height * extent.depth);
+
+  DrvArrayAllocGuard<int> src_array(extent);
+  DrvArrayAllocGuard<int> dst_array(extent);
+
+  LinearAllocGuard3D<int> src_device(extent);
+  LinearAllocGuard3D<int> dst_device(extent);
+
+  const dim3 threads_per_block(32, 32);
+  const dim3 blocks(src_device.width_logical() / threads_per_block.x + 1,
+                    src_device.height() / threads_per_block.y + 1, src_device.depth());
+  Iota<<<blocks, threads_per_block>>>(src_device.ptr(), src_device.pitch(),
+                                      src_device.width_logical(), src_device.height(),
+                                      src_device.depth());
+  HIP_CHECK(hipGetLastError());
+
+  // Device -> Array
+  HIP_CHECK(memcpy_func(src_array.ptr(), 0, src_device.ptr(), src_device.pitch(), extent.width,
+                        extent.height, hipMemcpyDeviceToDevice, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  // Array -> Array
+  HIP_CHECK(memcpy_func(dst_array.ptr(), 0, src_array.ptr(), 0, extent.width, extent.height,
+                        hipMemcpyDeviceToDevice, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  // Array -> Device
+  HIP_CHECK(memcpy_func(dst_device.ptr(), dst_device.pitch(), dst_array.ptr(), 0, extent.width,
+                        extent.height, hipMemcpyDeviceToDevice, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  HIP_CHECK(memcpy_func(host_alloc.ptr(), extent.width, dst_device.ptr(), dst_device.pitch(),
+                        extent.width, extent.height, hipMemcpyDeviceToHost, kernel_stream));
+  if constexpr (should_synchronize) {
+    HIP_CHECK(hipStreamSynchronize(kernel_stream));
+  }
+
+  const auto f = [extent](size_t x, size_t y, size_t z) {
+    auto width_logical = extent.width / sizeof(int);
+    return z * width_logical * extent.height + y * width_logical + x;
+  };
+  PitchedMemoryVerify(host_alloc.ptr(), extent.width, extent.width / sizeof(int), extent.height,
+                      extent.depth, f);
+}
@@ -19,14 +19,61 @@
 # SOFTWARE.

 # Common Tests - Test independent of all platforms
-if(HIP_PLATFORM MATCHES "amd")
 set(TEST_SRC
-    hipExtModuleLaunchKernel.cc
+    hip_module_common.cc
+    hipModuleLoad.cc
+    hipModuleLoadData.cc
+    hipModuleLoadDataEx.cc
+    hipModuleUnload.cc
+    hipModuleGetFunction.cc
+    hipModuleLaunchKernel.cc
+    hipModuleGetGlobal.cc
+    hipModuleGetTexRef.cc
 )

+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/get_function_module.code
+                   COMMAND ${CMAKE_CXX_COMPILER} --genco --std=c++17 ${CMAKE_CURRENT_SOURCE_DIR}/get_function_module.cc
+                   -o get_function_module.code
+                   -I${ROCM_PATH}/include/ --rocm-path=${ROCM_PATH}
+                   DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/get_function_module.cc)
+add_custom_target(get_function_module ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/get_function_module.code)
+
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/launch_kernel_module.code
+                   COMMAND ${CMAKE_CXX_COMPILER} --genco --std=c++17 ${CMAKE_CURRENT_SOURCE_DIR}/launch_kernel_module.cc
+                   -o launch_kernel_module.code
+                   -I${ROCM_PATH}/include/ --rocm-path=${ROCM_PATH}
+                   DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/launch_kernel_module.cc)
+add_custom_target(launch_kernel_module ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/launch_kernel_module.code)
+
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/get_global_test_module.code
+                   COMMAND ${CMAKE_CXX_COMPILER} --genco --std=c++17 ${CMAKE_CURRENT_SOURCE_DIR}/get_global_test_module.cc
+                   -o get_global_test_module.code
+                   -I${ROCM_PATH}/include/ --rocm-path=${ROCM_PATH}
+                   DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/get_global_test_module.cc)
+add_custom_target(get_global_test_module ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/get_global_test_module.code)
+
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/get_tex_ref_module.code
+                   COMMAND ${CMAKE_CXX_COMPILER} --genco --std=c++17 ${CMAKE_CURRENT_SOURCE_DIR}/get_tex_ref_module.cc
+                   -o get_tex_ref_module.code
+                   -I${ROCM_PATH}/include/ --rocm-path=${ROCM_PATH}
+                   DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/get_tex_ref_module.cc)
+add_custom_target(get_tex_ref_module ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/get_tex_ref_module.code)
+
 # Note to pass arch use format like -DOFFLOAD_ARCH_STR="--offload-arch=gfx900  --offload-arch=gfx906"
 # having space at the start/end of OFFLOAD_ARCH_STR can cause build failures

+if(HIP_PLATFORM MATCHES "amd")
+set(TEST_SRC
+    ${TEST_SRC}
+    hipExtModuleLaunchKernel.cc)
+
+add_custom_target(empty_module.code
+                  COMMAND ${CMAKE_CXX_COMPILER} --genco ${OFFLOAD_ARCH_STR}
+                  ${CMAKE_CURRENT_SOURCE_DIR}/empty_module.cc
+                  -o ${CMAKE_CURRENT_BINARY_DIR}/../../unit/module/empty_module.code
+                  -I${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/
+                  -I${CMAKE_CURRENT_SOURCE_DIR}/../../include --rocm-path=${ROCM_PATH})
+
 add_custom_target(copyKernel.code
                  COMMAND ${CMAKE_CXX_COMPILER} -mcode-object-version=5 --genco ${OFFLOAD_ARCH_STR}
                  ${CMAKE_CURRENT_SOURCE_DIR}/copyKernel.cc
@@ -100,14 +147,30 @@ add_custom_target(copiousArgKernel17.code
                  -I${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/
                  -I${CMAKE_CURRENT_SOURCE_DIR}/../../include --rocm-path=${ROCM_PATH})
 endif()
+endif()

+if(HIP_PLATFORM MATCHES "amd")
+    set(RTCLIB "hiprtc")
+else()
+    set(RTCLIB "nvrtc")
+endif()
 hip_add_exe_to_target(NAME ModuleTest
  TEST_SRC ${TEST_SRC}
-  TEST_TARGET_NAME build_tests COMMON_SHARED_SRC ${COMMON_SHARED_SRC})
+  TEST_TARGET_NAME build_tests
+  LINKER_LIBS ${RTCLIB}
+  COMMON_SHARED_SRC ${COMMON_SHARED_SRC}
+  COMPILE_OPTIONS -std=c++17)

+add_dependencies(ModuleTest get_function_module)
+add_dependencies(ModuleTest launch_kernel_module)
+add_dependencies(ModuleTest get_global_test_module)
+add_dependencies(ModuleTest get_tex_ref_module)
+
+if(HIP_PLATFORM MATCHES "amd")
+add_dependencies(build_tests empty_module.code)
 add_dependencies(build_tests copyKernel.code copyKernel.s)
 if(UNIX)
 add_dependencies(build_tests copiousArgKernel.code copiousArgKernel0.code copiousArgKernel1.code copiousArgKernel2.code
 copiousArgKernel3.code copiousArgKernel16.code copiousArgKernel17.code)
 endif()
-endif()
+endif()
@@ -0,0 +1,20 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
@@ -0,0 +1,28 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip/hip_runtime_api.h>
+
+extern "C" {
+__global__ void GlobalKernel() {}
+
+__device__ void DeviceKernel() {}
+}
--- a/Daha Fazla Göster
+++ b/Daha Fazla Göster