SWDEV-1 - Merge github PRs to amd-staging

Change-Id: I2944a63ddc2eec8dc1403d9790ffffbaec343385
2024-03-04 11:51:34 +05:30
@@ -211,6 +211,10 @@
        "Unit_hipHostMalloc_AllocateUseMoreThanAvailGPUMemory",
        "=== SWDEV-432250:Below tests failed in stress test on 10/11/23 ===",
        "Unit_hipVectorTypes_test_on_device",
+        "Unit_Layered1DTexture_Check_DeviceBufferToFromLayered1DArray - ushort4",
+        "Unit_Layered2DTexture_Check_DeviceBufferToFromLayered2DArray - float4",
+        "=== Below test is disabled due to defect EXSWHTEC-347 ===",
+        "Unit_hipPointerSetAttribute_Positive_SyncMemops",
        "=== Patch which removes the typetraits implementation from std namespace in hiprtc is reverted ===",
        "Unit_hiprtc_stdheaders",
        "NOTE: The following test is disabled due to defect - EXSWHTEC-241",
@@ -222,6 +226,12 @@
        "NOTE: The following test is disabled due to defect - EXSWHTEC-244",
        "Unit_hipExtLaunchMultiKernelMultiDevice_Negative_Parameters",
        "Unit_hipMemAddressFree_negative",
+        "=== Below 2 tests are disabled due to defect EXSWHTEC-369 ===",
+        "Unit_Device_ilogbf_Accuracy_Positive",
+        "Unit_Device_ilogb_Accuracy_Positive",
+        "NOTE: The following test is disabled due to defect - EXSWHTEC-245",
+        "Unit_hipFuncGetAttribute_Negative_Parameters",
+        "Unit_hipMemAddressFree_negative",
        "Unit_hipMemAddressReserve_AlignmentTest",
        "Unit_hipGraphAddMemcpyNode_Negative_Parameters",
        "Unit_hipMemCreate_ChkWithKerLaunch",
@@ -387,6 +397,261 @@
        "Performance_hipMemsetD32Async",
        "Performance_hipMemcpy2D_HostToHost",
        "Performance_hipMemcpy2DAsync_HostToHost",
+        "Unit_hipDeviceGetGraphMemAttribute_Positive_ReuseMemory",
+        "Unit_hipGraphAddNodeTypeEventWait_Positive_Basic",
+        "Unit_hipDrvGraphAddMemsetNode_Negative_Parameters",
+        "Unit_hipDrvGraphAddMemsetNode_hipMallocPitch_2D",
+        "Unit_hipDrvGraphAddMemsetNode_hipMallocPitch_1D",
+        "Unit_hipDrvGraphAddMemsetNode_hipMalloc3D_2D",
+        "Unit_hipDrvGraphAddMemsetNode_hipMalloc3D_1D",
+        "Unit_hipDrvGraphAddMemsetNode_hipMalloc_1D",
+        "Unit_hipDrvGraphAddMemsetNode_hipMallocManaged",
+        "Unit_hipDrvGraphAddMemcpyNode_Negative_Parameters",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - char",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - short",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - int",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - float",
+        "Unit_tex1Dfetch_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1Dfetch_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1Dfetch_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1Dfetch_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex1D_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1D_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1D_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1D_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex1DLayered_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1DLayered_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1DLayered_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1DLayered_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex1DGrad_Positive_ReadModeElementType - char",
+        "Unit_tex1DGrad_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex1DGrad_Positive_ReadModeElementType - short",
+        "Unit_tex1DGrad_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex1DGrad_Positive_ReadModeElementType - int",
+        "Unit_tex1DGrad_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex1DGrad_Positive_ReadModeElementType - float",
+        "Unit_tex1DGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1DGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1DGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1DGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeElementType - char",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeElementType - short",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeElementType - int",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeElementType - float",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex1DLayeredLod_Positive_ReadModeElementType - char",
+        "Unit_tex1DLayeredLod_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex1DLayeredLod_Positive_ReadModeElementType - short",
+        "Unit_tex1DLayeredLod_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex1DLayeredLod_Positive_ReadModeElementType - int",
+        "Unit_tex1DLayeredLod_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex1DLayeredLod_Positive_ReadModeElementType - float",
+        "Unit_tex1DLayeredLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1DLayeredLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1DLayeredLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1DLayeredLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex1DLod_Positive_ReadModeElementType - char",
+        "Unit_tex1DLod_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex1DLod_Positive_ReadModeElementType - short",
+        "Unit_tex1DLod_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex1DLod_Positive_ReadModeElementType - int",
+        "Unit_tex1DLod_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex1DLod_Positive_ReadModeElementType - float",
+        "Unit_tex1DLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1DLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1DLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1DLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex3D_Positive_ReadModeElementType - char",
+        "Unit_tex3D_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex3D_Positive_ReadModeElementType - short",
+        "Unit_tex3D_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex3D_Positive_ReadModeElementType - int",
+        "Unit_tex3D_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex3D_Positive_ReadModeElementType - float",
+        "Unit_tex3D_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex3D_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex3D_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex3D_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex3DLod_Positive_ReadModeElementType - char",
+        "Unit_tex3DLod_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex3DLod_Positive_ReadModeElementType - short",
+        "Unit_tex3DLod_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex3DLod_Positive_ReadModeElementType - int",
+        "Unit_tex3DLod_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex3DLod_Positive_ReadModeElementType - float",
+        "Unit_tex3DLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex3DLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex3DLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex3DLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex3DGrad_Positive_ReadModeElementType - char",
+        "Unit_tex3DGrad_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex3DGrad_Positive_ReadModeElementType - short",
+        "Unit_tex3DGrad_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex3DGrad_Positive_ReadModeElementType - int",
+        "Unit_tex3DGrad_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex3DGrad_Positive_ReadModeElementType - float",
+        "Unit_tex3DGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex3DGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex3DGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex3DGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_texCubemap_Positive_ReadModeElementType - char",
+        "Unit_texCubemap_Positive_ReadModeElementType - unsigned char",
+        "Unit_texCubemap_Positive_ReadModeElementType - short",
+        "Unit_texCubemap_Positive_ReadModeElementType - unsigned short",
+        "Unit_texCubemap_Positive_ReadModeElementType - int",
+        "Unit_texCubemap_Positive_ReadModeElementType - unsigned int",
+        "Unit_texCubemap_Positive_ReadModeElementType - float",
+        "Unit_texCubemap_Positive_ReadModeNormalizedFloat - char",
+        "Unit_texCubemap_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_texCubemap_Positive_ReadModeNormalizedFloat - short",
+        "Unit_texCubemap_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - char",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - unsigned char",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - short",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - unsigned short",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - int",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - unsigned int",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - float",
+        "Unit_texCubemapLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_texCubemapLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_texCubemapLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_texCubemapLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - char",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - unsigned char",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - short",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - unsigned short",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - int",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - unsigned int",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - float",
+        "Unit_texCubemapGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_texCubemapGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_texCubemapGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_texCubemapGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - char",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - unsigned char",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - short",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - unsigned short",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - int",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - unsigned int",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - float",
+        "Unit_texCubemapLayered_Positive_ReadModeNormalizedFloat - char",
+        "Unit_texCubemapLayered_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_texCubemapLayered_Positive_ReadModeNormalizedFloat - short",
+        "Unit_texCubemapLayered_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - char",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - unsigned char",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - short",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - unsigned short",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - int",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - unsigned int",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - float",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - char",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - unsigned char",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - short",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - unsigned short",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - int",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - unsigned int",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - float",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex2Dgather_Positive_ReadModeElementType - char",
+        "Unit_tex2Dgather_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex2Dgather_Positive_ReadModeElementType - short",
+        "Unit_tex2Dgather_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex2Dgather_Positive_ReadModeElementType - int",
+        "Unit_tex2Dgather_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex2Dgather_Positive_ReadModeElementType - float",
+        "Unit_tex2D_Positive_ReadModeElementType - char",
+        "Unit_tex2D_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex2D_Positive_ReadModeElementType - short",
+        "Unit_tex2D_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex2D_Positive_ReadModeElementType - int",
+        "Unit_tex2D_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex2D_Positive_ReadModeElementType - float",
+        "Unit_tex2D_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex2D_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex2D_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex2D_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex2DLayered_Positive_ReadModeElementType - char",
+        "Unit_tex2DLayered_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex2DLayered_Positive_ReadModeElementType - short",
+        "Unit_tex2DLayered_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex2DLayered_Positive_ReadModeElementType - int",
+        "Unit_tex2DLayered_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex2DLayered_Positive_ReadModeElementType - float",
+        "Unit_tex2DLayered_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex2DLayered_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex2DLayered_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex2DLayered_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex2DGrad_Positive_ReadModeElementType - char",
+        "Unit_tex2DGrad_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex2DGrad_Positive_ReadModeElementType - short",
+        "Unit_tex2DGrad_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex2DGrad_Positive_ReadModeElementType - int",
+        "Unit_tex2DGrad_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex2DGrad_Positive_ReadModeElementType - float",
+        "Unit_tex2DGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex2DGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex2DGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex2DGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeElementType - char",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeElementType - short",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeElementType - int",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeElementType - float",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex2DLod_Positive_ReadModeElementType - char",
+        "Unit_tex2DLod_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex2DLod_Positive_ReadModeElementType - short",
+        "Unit_tex2DLod_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex2DLod_Positive_ReadModeElementType - int",
+        "Unit_tex2DLod_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex2DLod_Positive_ReadModeElementType - float",
+        "Unit_tex2DLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex2DLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex2DLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex2DLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex2DLayeredLod_Positive_ReadModeElementType - char",
+        "Unit_tex2DLayeredLod_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex2DLayeredLod_Positive_ReadModeElementType - short",
+        "Unit_tex2DLayeredLod_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex2DLayeredLod_Positive_ReadModeElementType - int",
+        "Unit_tex2DLayeredLod_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex2DLayeredLod_Positive_ReadModeElementType - float",
+        "Unit_tex2DLayeredLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex2DLayeredLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex2DLayeredLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex2DLayeredLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_hipDrvGetErrorName_Positive_Basic",
+        "Unit_hipDrvGetErrorString_Positive_Basic",
+        "Unit_hipModuleLaunchKernel_Negative_Parameters",
+        "Unit_hipModuleGetTexRef_Positive_Basic",
+        "Unit_hipExtModuleLaunchKernel_Positive_Basic",
+        "Unit_hipExtModuleLaunchKernel_Negative_Parameters",
+        "Unit_hipLaunchKernel_Negative_Parameters",
+        "Unit_Kernel_Launch_bounds_Negative_OutOfBounds",
+        "Unit_Kernel_Launch_bounds_Negative_Parameters_RTC",
+        "Unit_AtomicBuiltins_Negative_Parameters_RTC",
        "Note: Test disabled due to defect - EXSWHTEC-151",
        "Unit_hipModuleLoad_Negative_Load_From_A_File_That_Is_Not_A_Module",
        "Note: Test disabled due to defect - EXSWHTEC-152",
@@ -446,6 +711,710 @@
        "Unit_hipGraphExecUpdate_Negative_MultiDevice_Context_Changed",
        "Unit_hipGraphMem_Alloc_Free_NodeGetParams_Functional_MultiDevice",
        "Unit_hipGraphUpload_Functional_multidevice_test",
+        "=== Below tests fail in external CI for PR https://github.com/ROCm-Developer-Tools/hip-tests/pull/210 ===",
+        "Unit_StaticAssert_Positive_Basic_RTC",
+        "Unit_Assert_Positive_Basic_KernelFail",
+        "=== Below tests are disabled due to defect EXSWHTEC-356 ===",
+        "Unit_Device___hisinf2_Accuracy_Positive",
+        "Unit_Device___hisnan2_Accuracy_Positive",
+        "Unit_Device___hbequ2_Accuracy_Positive",
+        "Unit_Device___hne_Accuracy_Positive",
+        "Unit_Device___hne2_Accuracy_Positive",
+        "Unit_Device___hbne2_Accuracy_Positive",
+        "Unit_Device___hbgeu2_Accuracy_Positive",
+        "Unit_Device___hbgtu2_Accuracy_Positive",
+        "Unit_Device___hbleu2_Accuracy_Positive",
+        "Unit_Device___hbltu2_Accuracy_Positive",
+        "=== Below 4 tests are disabled due to defect EXSWHTEC-355 ===",
+        "Unit_Device___hadd_Sanity_Positive",
+        "Unit_Device___uhadd_Sanity_Positive",
+        "Unit_Device___rhadd_Sanity_Positive",
+        "Unit_Device___urhadd_Sanity_Positive",
+        "=== SWDEV-435667 : Below tests failed in stress test on 19/01/24 ===",
+        "Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - int",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned int",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - long",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned long",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - long long",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - unsigned long long",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - float",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic - double",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - int",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned int",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - long",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned long",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - long long",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - unsigned long long",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - float",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic - double",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - int",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned int",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - long",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned long",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - long long",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - unsigned long long",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - float",
+        "Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic - double",
+        "Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint8_t",
+        "Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint16_t",
+        "Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic - uint32_t",
+        "=== Below tests failed in stress test on 25/01/24 ===",
+        "Unit_atomicAnd_Positive_SameAddress - int",
+        "Unit_atomicAnd_Positive_SameAddress - unsigned int",
+        "Unit_atomicAnd_Positive_SameAddress - unsigned long",
+        "Unit_atomicAnd_Positive_SameAddress - unsigned long long",
+        "Unit_atomicAnd_Positive_Adjacent_Addresses - int",
+        "Unit_atomicAnd_Positive_Adjacent_Addresses - unsigned int",
+        "Unit_atomicAnd_Positive_Adjacent_Addresses - unsigned long",
+        "Unit_atomicAnd_Positive_Adjacent_Addresses - unsigned long long",
+        "Unit_atomicAnd_Positive_Scattered_Addresses - int",
+        "Unit_atomicAnd_Positive_Scattered_Addresses - unsigned int",
+        "Unit_atomicAnd_Positive_Scattered_Addresses - unsigned long",
+        "Unit_atomicAnd_Positive_Scattered_Addresses - unsigned long long",
+        "Unit_atomicAnd_Positive_Multi_Kernel_Same_Address - int",
+        "Unit_atomicAnd_Positive_Multi_Kernel_Same_Address - unsigned int",
+        "Unit_atomicAnd_Positive_Multi_Kernel_Same_Address - unsigned long",
+        "Unit_atomicAnd_Positive_Multi_Kernel_Same_Address - unsigned long long",
+        "Unit_atomicAnd_Positive_Multi_Kernel_Adjacent_Addresses - int",
+        "Unit_atomicAnd_Positive_Multi_Kernel_Adjacent_Addresses - unsigned int",
+        "Unit_atomicAnd_Positive_Multi_Kernel_Adjacent_Addresses - unsigned long",
+        "Unit_atomicAnd_Positive_Multi_Kernel_Adjacent_Addresses - unsigned long long",
+        "Unit_atomicAnd_Positive_Multi_Kernel_Scattered_Addresses - int",
+        "Unit_atomicAnd_Positive_Multi_Kernel_Scattered_Addresses - unsigned int",
+        "Unit_atomicAnd_Positive_Multi_Kernel_Scattered_Addresses - unsigned long",
+        "Unit_atomicAnd_Positive_Multi_Kernel_Scattered_Addresses - unsigned long long",
+        "Unit_atomicAnd_Negative_Parameters_RTC",
+        "Unit_atomicOr_Positive_SameAddress - int",
+        "Unit_atomicOr_Positive_SameAddress - unsigned int",
+        "Unit_atomicOr_Positive_SameAddress - unsigned long",
+        "Unit_atomicOr_Positive_SameAddress - unsigned long long",
+        "Unit_atomicOr_Positive_Adjacent_Addresses - int",
+        "Unit_atomicOr_Positive_Adjacent_Addresses - unsigned int",
+        "Unit_atomicOr_Positive_Adjacent_Addresses - unsigned long",
+        "Unit_atomicOr_Positive_Adjacent_Addresses - unsigned long long",
+        "Unit_atomicOr_Positive_Scattered_Addresses - int",
+        "Unit_atomicOr_Positive_Scattered_Addresses - unsigned int",
+        "Unit_atomicOr_Positive_Scattered_Addresses - unsigned long",
+        "Unit_atomicOr_Positive_Scattered_Addresses - unsigned long long",
+        "Unit_atomicOr_Positive_Multi_Kernel_Same_Address - int",
+        "Unit_atomicOr_Positive_Multi_Kernel_Same_Address - unsigned int",
+        "Unit_atomicOr_Positive_Multi_Kernel_Same_Address - unsigned long",
+        "Unit_atomicOr_Positive_Multi_Kernel_Same_Address - unsigned long long",
+        "Unit_atomicOr_Positive_Multi_Kernel_Adjacent_Addresses - int",
+        "Unit_atomicOr_Positive_Multi_Kernel_Adjacent_Addresses - unsigned int",
+        "Unit_atomicOr_Positive_Multi_Kernel_Adjacent_Addresses - unsigned long",
+        "Unit_atomicOr_Positive_Multi_Kernel_Adjacent_Addresses - unsigned long long",
+        "Unit_atomicOr_Positive_Multi_Kernel_Scattered_Addresses - int",
+        "Unit_atomicOr_Positive_Multi_Kernel_Scattered_Addresses - unsigned int",
+        "Unit_atomicOr_Positive_Multi_Kernel_Scattered_Addresses - unsigned long",
+        "Unit_atomicOr_Positive_Multi_Kernel_Scattered_Addresses - unsigned long long",
+        "Unit_atomicOr_Negative_Parameters_RTC",
+        "Unit_atomicXor_Positive_SameAddress - int",
+        "Unit_atomicXor_Positive_SameAddress - unsigned int",
+        "Unit_atomicXor_Positive_SameAddress - unsigned long",
+        "Unit_atomicXor_Positive_SameAddress - unsigned long long",
+        "Unit_atomicXor_Positive_Adjacent_Addresses - int",
+        "Unit_atomicXor_Positive_Adjacent_Addresses - unsigned int",
+        "Unit_atomicXor_Positive_Adjacent_Addresses - unsigned long",
+        "Unit_atomicXor_Positive_Adjacent_Addresses - unsigned long long",
+        "Unit_atomicXor_Positive_Scattered_Addresses - int",
+        "Unit_atomicXor_Positive_Scattered_Addresses - unsigned int",
+        "Unit_atomicXor_Positive_Scattered_Addresses - unsigned long",
+        "Unit_atomicXor_Positive_Scattered_Addresses - unsigned long long",
+        "Unit_atomicXor_Positive_Multi_Kernel_Same_Address - int",
+        "Unit_atomicXor_Positive_Multi_Kernel_Same_Address - unsigned int",
+        "Unit_atomicXor_Positive_Multi_Kernel_Same_Address - unsigned long",
+        "Unit_atomicXor_Positive_Multi_Kernel_Same_Address - unsigned long long",
+        "Unit_atomicXor_Positive_Multi_Kernel_Adjacent_Addresses - int",
+        "Unit_atomicXor_Positive_Multi_Kernel_Adjacent_Addresses - unsigned int",
+        "Unit_atomicXor_Positive_Multi_Kernel_Adjacent_Addresses - unsigned long",
+        "Unit_atomicXor_Positive_Multi_Kernel_Adjacent_Addresses - unsigned long long",
+        "Unit_atomicXor_Positive_Multi_Kernel_Scattered_Addresses - int",
+        "Unit_atomicXor_Positive_Multi_Kernel_Scattered_Addresses - unsigned int",
+        "Unit_atomicXor_Positive_Multi_Kernel_Scattered_Addresses - unsigned long",
+        "Unit_atomicXor_Positive_Multi_Kernel_Scattered_Addresses - unsigned long long",
+        "Unit_atomicXor_Negative_Parameters_RTC",
+        "Unit_atomicMin_Positive_SameAddress - int",
+        "Unit_atomicMin_Positive_SameAddress - unsigned int",
+        "Unit_atomicMin_Positive_SameAddress - unsigned long",
+        "Unit_atomicMin_Positive_SameAddress - unsigned long long",
+        "Unit_atomicMin_Positive_Adjacent_Addresses - int",
+        "Unit_atomicMin_Positive_Adjacent_Addresses - unsigned int",
+        "Unit_atomicMin_Positive_Adjacent_Addresses - unsigned long",
+        "Unit_atomicMin_Positive_Adjacent_Addresses - unsigned long long",
+        "Unit_atomicMin_Positive_Adjacent_Addresses - float",
+        "Unit_atomicMin_Positive_Adjacent_Addresses - double",
+        "Unit_atomicMin_Positive_Scattered_Addresses - int",
+        "Unit_atomicMin_Positive_Scattered_Addresses - unsigned int",
+        "Unit_atomicMin_Positive_Scattered_Addresses - unsigned long",
+        "Unit_atomicMin_Positive_Scattered_Addresses - unsigned long long",
+        "Unit_atomicMin_Positive_Scattered_Addresses - float",
+        "Unit_atomicMin_Positive_Scattered_Addresses - double",
+        "Unit_atomicMin_Positive_Multi_Kernel_Same_Address - int",
+        "Unit_atomicMin_Positive_Multi_Kernel_Same_Address - unsigned int",
+        "Unit_atomicMin_Positive_Multi_Kernel_Same_Address - unsigned long",
+        "Unit_atomicMin_Positive_Multi_Kernel_Same_Address - unsigned long long",
+        "Unit_atomicMin_Positive_Multi_Kernel_Adjacent_Addresses - int",
+        "Unit_atomicMin_Positive_Multi_Kernel_Adjacent_Addresses - unsigned int",
+        "Unit_atomicMin_Positive_Multi_Kernel_Adjacent_Addresses - unsigned long",
+        "Unit_atomicMin_Positive_Multi_Kernel_Adjacent_Addresses - unsigned long long",
+        "Unit_atomicMin_Positive_Multi_Kernel_Adjacent_Addresses - float",
+        "Unit_atomicMin_Positive_Multi_Kernel_Adjacent_Addresses - double",
+        "Unit_atomicMin_Positive_Multi_Kernel_Scattered_Addresses - int",
+        "Unit_atomicMin_Positive_Multi_Kernel_Scattered_Addresses - unsigned int",
+        "Unit_atomicMin_Positive_Multi_Kernel_Scattered_Addresses - unsigned long",
+        "Unit_atomicMin_Positive_Multi_Kernel_Scattered_Addresses - unsigned long long",
+        "Unit_atomicMin_Positive_Multi_Kernel_Scattered_Addresses - float",
+        "Unit_atomicMin_Positive_Multi_Kernel_Scattered_Addresses - double",
+        "Unit_atomicMin_Negative_Parameters_RTC",
+        "Unit_atomicMax_Positive_SameAddress - int",
+        "Unit_atomicMax_Positive_SameAddress - unsigned int",
+        "Unit_atomicMax_Positive_SameAddress - unsigned long",
+        "Unit_atomicMax_Positive_SameAddress - unsigned long long",
+        "Unit_atomicMax_Positive_Adjacent_Addresses - int",
+        "Unit_atomicMax_Positive_Adjacent_Addresses - unsigned int",
+        "Unit_atomicMax_Positive_Adjacent_Addresses - unsigned long",
+        "Unit_atomicMax_Positive_Adjacent_Addresses - unsigned long long",
+        "Unit_atomicMax_Positive_Adjacent_Addresses - float",
+        "Unit_atomicMax_Positive_Adjacent_Addresses - double",
+        "Unit_atomicMax_Positive_Scattered_Addresses - int",
+        "Unit_atomicMax_Positive_Scattered_Addresses - unsigned int",
+        "Unit_atomicMax_Positive_Scattered_Addresses - unsigned long",
+        "Unit_atomicMax_Positive_Scattered_Addresses - unsigned long long",
+        "Unit_atomicMax_Positive_Scattered_Addresses - float",
+        "Unit_atomicMax_Positive_Scattered_Addresses - double",
+        "Unit_atomicMax_Positive_Multi_Kernel_Same_Address - int",
+        "Unit_atomicMax_Positive_Multi_Kernel_Same_Address - unsigned int",
+        "Unit_atomicMax_Positive_Multi_Kernel_Same_Address - unsigned long",
+        "Unit_atomicMax_Positive_Multi_Kernel_Same_Address - unsigned long long",
+        "Unit_atomicMax_Positive_Multi_Kernel_Adjacent_Addresses - int",
+        "Unit_atomicMax_Positive_Multi_Kernel_Adjacent_Addresses - unsigned int",
+        "Unit_atomicMax_Positive_Multi_Kernel_Adjacent_Addresses - unsigned long",
+        "Unit_atomicMax_Positive_Multi_Kernel_Adjacent_Addresses - unsigned long long",
+        "Unit_atomicMax_Positive_Multi_Kernel_Adjacent_Addresses - float",
+        "Unit_atomicMax_Positive_Multi_Kernel_Adjacent_Addresses - double",
+        "Unit_atomicMax_Positive_Multi_Kernel_Scattered_Addresses - int",
+        "Unit_atomicMax_Positive_Multi_Kernel_Scattered_Addresses - unsigned int",
+        "Unit_atomicMax_Positive_Multi_Kernel_Scattered_Addresses - unsigned long",
+        "Unit_atomicMax_Positive_Multi_Kernel_Scattered_Addresses - unsigned long long",
+        "Unit_atomicMax_Positive_Multi_Kernel_Scattered_Addresses - float",
+        "Unit_atomicMax_Positive_Multi_Kernel_Scattered_Addresses - double",
+        "Unit_atomicMax_Negative_Parameters_RTC",
+        "Unit_safeAtomicMin_Positive_Adjacent_Addresses - float",
+        "Unit_safeAtomicMin_Positive_Adjacent_Addresses - double",
+        "Unit_safeAtomicMin_Positive_Scattered_Addresses - float",
+        "Unit_safeAtomicMin_Positive_Scattered_Addresses - double",
+        "Unit_safeAtomicMin_Positive_Multi_Kernel_Adjacent_Addresses - float",
+        "Unit_safeAtomicMin_Positive_Multi_Kernel_Adjacent_Addresses - double",
+        "Unit_safeAtomicMin_Positive_Multi_Kernel_Scattered_Addresses - float",
+        "Unit_safeAtomicMin_Positive_Multi_Kernel_Scattered_Addresses - double",
+        "Unit_unsafeAtomicMin_Positive_Adjacent_Addresses - float",
+        "Unit_unsafeAtomicMin_Positive_Adjacent_Addresses - double",
+        "Unit_unsafeAtomicMin_Positive_Scattered_Addresses - float",
+        "Unit_unsafeAtomicMin_Positive_Scattered_Addresses - double",
+        "Unit_unsafeAtomicMin_Positive_Multi_Kernel_Adjacent_Addresses - float",
+        "Unit_unsafeAtomicMin_Positive_Multi_Kernel_Adjacent_Addresses - double",
+        "Unit_unsafeAtomicMin_Positive_Multi_Kernel_Scattered_Addresses - float",
+        "Unit_unsafeAtomicMin_Positive_Multi_Kernel_Scattered_Addresses - double",
+        "Unit_safeAtomicMax_Positive_Adjacent_Addresses - float",
+        "Unit_safeAtomicMax_Positive_Adjacent_Addresses - double",
+        "Unit_safeAtomicMax_Positive_Scattered_Addresses - float",
+        "Unit_safeAtomicMax_Positive_Scattered_Addresses - double",
+        "Unit_safeAtomicMax_Positive_Multi_Kernel_Adjacent_Addresses - float",
+        "Unit_safeAtomicMax_Positive_Multi_Kernel_Adjacent_Addresses - double",
+        "Unit_safeAtomicMax_Positive_Multi_Kernel_Scattered_Addresses - float",
+        "Unit_safeAtomicMax_Positive_Multi_Kernel_Scattered_Addresses - double",
+        "Unit_unsafeAtomicMax_Positive_Adjacent_Addresses - float",
+        "Unit_unsafeAtomicMax_Positive_Adjacent_Addresses - double",
+        "Unit_unsafeAtomicMax_Positive_Scattered_Addresses - float",
+        "Unit_unsafeAtomicMax_Positive_Scattered_Addresses - double",
+        "Unit_unsafeAtomicMax_Positive_Multi_Kernel_Adjacent_Addresses - float",
+        "Unit_unsafeAtomicMax_Positive_Multi_Kernel_Adjacent_Addresses - double",
+        "Unit_unsafeAtomicMax_Positive_Multi_Kernel_Scattered_Addresses - float",
+        "Unit_unsafeAtomicMax_Positive_Multi_Kernel_Scattered_Addresses - double",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_SameAddress - int",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_SameAddress - unsigned int",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_SameAddress - unsigned long",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_SameAddress - unsigned long long",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_Adjacent_Addresses - int",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_Adjacent_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_Adjacent_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_Adjacent_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_Adjacent_Addresses - float",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_Adjacent_Addresses - double",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_Scattered_Addresses - int",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_Scattered_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_Scattered_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_Scattered_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_Scattered_Addresses - float",
+        "Unit___hip_atomic_fetch_min_Positive_Wavefront_Scattered_Addresses - double",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_SameAddress - int",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_SameAddress - unsigned int",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_SameAddress - unsigned long",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_SameAddress - unsigned long long",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_Adjacent_Addresses - int",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_Adjacent_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_Adjacent_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_Adjacent_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_Adjacent_Addresses - float",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_Adjacent_Addresses - double",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_Scattered_Addresses - int",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_Scattered_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_Scattered_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_Scattered_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_min_Positive_Workgroup_Scattered_Addresses - float",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_SameAddress - int",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_SameAddress - unsigned int",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_SameAddress - unsigned long",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_SameAddress - unsigned long long",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_Adjacent_Addresses - int",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_Adjacent_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_Adjacent_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_Adjacent_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_Adjacent_Addresses - float",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_Adjacent_Addresses - double",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_Scattered_Addresses - int",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_Scattered_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_Scattered_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_Scattered_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_Scattered_Addresses - float",
+        "Unit___hip_atomic_fetch_max_Positive_Wavefront_Scattered_Addresses - double",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_SameAddress - int",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_SameAddress - unsigned int",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_SameAddress - unsigned long",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_SameAddress - unsigned long long",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_Adjacent_Addresses - int",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_Adjacent_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_Adjacent_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_Adjacent_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_Adjacent_Addresses - float",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_Adjacent_Addresses - double",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_Scattered_Addresses - int",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_Scattered_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_Scattered_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_Scattered_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_Scattered_Addresses - float",
+        "Unit___hip_atomic_fetch_max_Positive_Workgroup_Scattered_Addresses - double",
+        "Unit_atomicExch_Positive - int",
+        "Unit_atomicExch_Positive - unsigned int",
+        "Unit_atomicExch_Positive - unsigned long",
+        "Unit_atomicExch_Positive - unsigned long long",
+        "Unit_atomicExch_Positive - float",
+        "Unit_atomicExch_Positive - double",
+        "Unit___hip_atomic_fetch_and_Positive_Wavefront_SameAddress - int",
+        "Unit___hip_atomic_fetch_and_Positive_Wavefront_SameAddress - unsigned int",
+        "Unit___hip_atomic_fetch_and_Positive_Wavefront_SameAddress - unsigned long",
+        "Unit___hip_atomic_fetch_and_Positive_Wavefront_SameAddress - unsigned long long",
+        "Unit___hip_atomic_fetch_and_Positive_Wavefront_Adjacent_Addresses - int",
+        "Unit___hip_atomic_fetch_and_Positive_Wavefront_Adjacent_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_and_Positive_Wavefront_Adjacent_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_and_Positive_Wavefront_Adjacent_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_and_Positive_Wavefront_Scattered_Addresses - int",
+        "Unit___hip_atomic_fetch_and_Positive_Wavefront_Scattered_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_and_Positive_Wavefront_Scattered_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_and_Positive_Wavefront_Scattered_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_and_Positive_Workgroup_SameAddress - int",
+        "Unit___hip_atomic_fetch_and_Positive_Workgroup_SameAddress - unsigned int",
+        "Unit___hip_atomic_fetch_and_Positive_Workgroup_SameAddress - unsigned long",
+        "Unit___hip_atomic_fetch_and_Positive_Workgroup_SameAddress - unsigned long long",
+        "Unit___hip_atomic_fetch_and_Positive_Workgroup_Adjacent_Addresses - int",
+        "Unit___hip_atomic_fetch_and_Positive_Workgroup_Adjacent_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_and_Positive_Workgroup_Adjacent_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_and_Positive_Workgroup_Adjacent_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_and_Positive_Workgroup_Scattered_Addresses - int",
+        "Unit___hip_atomic_fetch_and_Positive_Workgroup_Scattered_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_and_Positive_Workgroup_Scattered_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_and_Positive_Workgroup_Scattered_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_or_Positive_Wavefront_SameAddress - int",
+        "Unit___hip_atomic_fetch_or_Positive_Wavefront_SameAddress - unsigned int",
+        "Unit___hip_atomic_fetch_or_Positive_Wavefront_SameAddress - unsigned long",
+        "Unit___hip_atomic_fetch_or_Positive_Wavefront_SameAddress - unsigned long long",
+        "Unit___hip_atomic_fetch_or_Positive_Wavefront_Adjacent_Addresses - int",
+        "Unit___hip_atomic_fetch_or_Positive_Wavefront_Adjacent_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_or_Positive_Wavefront_Adjacent_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_or_Positive_Wavefront_Adjacent_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_or_Positive_Wavefront_Scattered_Addresses - int",
+        "Unit___hip_atomic_fetch_or_Positive_Wavefront_Scattered_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_or_Positive_Wavefront_Scattered_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_or_Positive_Wavefront_Scattered_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_or_Positive_Workgroup_SameAddress - int",
+        "Unit___hip_atomic_fetch_or_Positive_Workgroup_SameAddress - unsigned int",
+        "Unit___hip_atomic_fetch_or_Positive_Workgroup_SameAddress - unsigned long",
+        "Unit___hip_atomic_fetch_or_Positive_Workgroup_SameAddress - unsigned long long",
+        "Unit___hip_atomic_fetch_or_Positive_Workgroup_Adjacent_Addresses - int",
+        "Unit___hip_atomic_fetch_or_Positive_Workgroup_Adjacent_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_or_Positive_Workgroup_Adjacent_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_or_Positive_Workgroup_Adjacent_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_or_Positive_Workgroup_Scattered_Addresses - int",
+        "Unit___hip_atomic_fetch_or_Positive_Workgroup_Scattered_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_or_Positive_Workgroup_Scattered_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_or_Positive_Workgroup_Scattered_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_xor_Positive_Wavefront_SameAddress - int",
+        "Unit___hip_atomic_fetch_xor_Positive_Wavefront_SameAddress - unsigned int",
+        "Unit___hip_atomic_fetch_xor_Positive_Wavefront_SameAddress - unsigned long",
+        "Unit___hip_atomic_fetch_xor_Positive_Wavefront_SameAddress - unsigned long long",
+        "Unit___hip_atomic_fetch_xor_Positive_Wavefront_Adjacent_Addresses - int",
+        "Unit___hip_atomic_fetch_xor_Positive_Wavefront_Adjacent_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_xor_Positive_Wavefront_Adjacent_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_xor_Positive_Wavefront_Adjacent_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_xor_Positive_Wavefront_Scattered_Addresses - int",
+        "Unit___hip_atomic_fetch_xor_Positive_Wavefront_Scattered_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_xor_Positive_Wavefront_Scattered_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_xor_Positive_Wavefront_Scattered_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_xor_Positive_Workgroup_SameAddress - int",
+        "Unit___hip_atomic_fetch_xor_Positive_Workgroup_SameAddress - unsigned int",
+        "Unit___hip_atomic_fetch_xor_Positive_Workgroup_SameAddress - unsigned long",
+        "Unit___hip_atomic_fetch_xor_Positive_Workgroup_SameAddress - unsigned long long",
+        "Unit___hip_atomic_fetch_xor_Positive_Workgroup_Adjacent_Addresses - int",
+        "Unit___hip_atomic_fetch_xor_Positive_Workgroup_Adjacent_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_xor_Positive_Workgroup_Adjacent_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_xor_Positive_Workgroup_Adjacent_Addresses - unsigned long long",
+        "Unit___hip_atomic_fetch_xor_Positive_Workgroup_Scattered_Addresses - int",
+        "Unit___hip_atomic_fetch_xor_Positive_Workgroup_Scattered_Addresses - unsigned int",
+        "Unit___hip_atomic_fetch_xor_Positive_Workgroup_Scattered_Addresses - unsigned long",
+        "Unit___hip_atomic_fetch_xor_Positive_Workgroup_Scattered_Addresses - unsigned long long",
+        "Unit___hip_atomic_exchange_Positive_Wavefront - int",
+        "Unit___hip_atomic_exchange_Positive_Wavefront - unsigned int",
+        "Unit___hip_atomic_exchange_Positive_Wavefront - unsigned long",
+        "Unit___hip_atomic_exchange_Positive_Wavefront - unsigned long long",
+        "Unit___hip_atomic_exchange_Positive_Wavefront - float",
+        "Unit___hip_atomic_exchange_Positive_Wavefront - double",
+        "Unit___hip_atomic_exchange_Positive_Workgroup - int",
+        "Unit___hip_atomic_exchange_Positive_Workgroup - unsigned int",
+        "Unit___hip_atomic_exchange_Positive_Workgroup - unsigned long",
+        "Unit___hip_atomic_exchange_Positive_Workgroup - unsigned long long",
+        "Unit___hip_atomic_exchange_Positive_Workgroup - float",
+        "Unit___hip_atomic_exchange_Positive_Workgroup - double",
+        "=== Below tests cause timeout in stress test of 09/02/24 ===",
+        "Unit_Device___half2half2_Accuracy_Positive",
+        "Unit_Device_make_half2_Accuracy_Positive",
+        "Unit_Device___halves2half2_Accuracy_Positive",
+        "Unit_Device___low2half_Accuracy_Positive",
+        "Unit_Device___high2half_Accuracy_Positive",
+        "Unit_Device___low2half2_Accuracy_Positive",
+        "Unit_Device___high2half2_Accuracy_Positive",
+        "Unit_Device___lowhigh2highlow_Accuracy_Positive",
+        "Unit_Device___lows2half2_Accuracy_Positive",
+        "Unit_Device___highs2half2_Accuracy_Positive",
+        "Unit_Device___float2half2_rn_Accuracy_Positive",
+        "Unit_Device___floats2half2_rn_Accuracy_Positive",
+        "Unit_Device___float22half2_rn_Accuracy_Positive",
+        "Unit_Device___low2float_Accuracy_Positive",
+        "Unit_Device___high2float_Accuracy_Positive",
+        "Unit_Device___half22float2_Accuracy_Positive",
+        "Unit_Device_hcos_Accuracy_Positive",
+        "Unit_Device_h2cos_Accuracy_Positive",
+        "Unit_Device_hsin_Accuracy_Positive",
+        "Unit_Device_h2sin_Accuracy_Positive",
+        "Unit_Device_hexp_Accuracy_Positive",
+        "Unit_Device_h2exp_Accuracy_Positive",
+        "Unit_Device_hexp10_Accuracy_Positive",
+        "Unit_Device_h2exp10_Accuracy_Positive",
+        "Unit_Device_hexp2_Accuracy_Positive",
+        "Unit_Device_h2exp2_Accuracy_Positive",
+        "Unit_Device_hlog_Accuracy_Positive",
+        "Unit_Device_h2log_Accuracy_Positive",
+        "Unit_Device_hlog10_Accuracy_Positive",
+        "Unit_Device_h2log10_Accuracy_Positive",
+        "Unit_Device_hlog2_Accuracy_Positive",
+        "Unit_Device_h2log2_Accuracy_Positive",
+        "Unit_Device_hsqrt_Accuracy_Positive",
+        "Unit_Device_h2sqrt_Accuracy_Positive",
+        "Unit_Device_hceil_Accuracy_Positive",
+        "Unit_Device_h2ceil_Accuracy_Positive",
+        "Unit_Device_hfloor_Accuracy_Positive",
+        "Unit_Device_h2floor_Accuracy_Positive",
+        "Unit_Device_htrunc_Accuracy_Positive",
+        "Unit_Device_h2trunc_Accuracy_Positive",
+        "Unit_Device_hrcp_Accuracy_Positive",
+        "Unit_Device_h2rcp_Accuracy_Positive",
+        "Unit_Device_hrsqrt_Accuracy_Positive",
+        "Unit_Device_h2rsqrt_Accuracy_Positive",
+        "Unit_Device_hrint_Accuracy_Positive",
+        "Unit_Device_h2rint_Accuracy_Positive",
+        "Unit_Device___habs_Accuracy_Positive",
+        "Unit_Device___habs2_Accuracy_Positive",
+        "Unit_Device___hneg_Accuracy_Positive",
+        "Unit_Device___hneg2_Accuracy_Positive",
+        "Unit_Device___hadd_wrapper_Accuracy_Positive",
+        "Unit_Device___hadd2_Accuracy_Positive",
+        "Unit_Device___hadd_sat_Accuracy_Positive",
+        "Unit_Device___hadd2_sat_Accuracy_Positive",
+        "Unit_Device___hsub_Accuracy_Positive",
+        "Unit_Device___hsub2_Accuracy_Positive",
+        "Unit_Device___hsub_sat_Accuracy_Positive",
+        "Unit_Device___hsub2_sat_Accuracy_Positive",
+        "Unit_Device___hmul_Accuracy_Positive",
+        "Unit_Device___hmul2_Accuracy_Positive",
+        "Unit_Device___hmul_sat_Accuracy_Positive",
+        "Unit_Device___hmul2_sat_Accuracy_Positive",
+        "Unit_Device___hdiv_Accuracy_Positive",
+        "Unit_Device___h2div_Accuracy_Positive",
+        "Unit_Device___hfma_Accuracy_Positive",
+        "Unit_Device___hfma2_Accuracy_Positive",
+        "Unit_Device___hfma_sat_Accuracy_Positive",
+        "Unit_Device___hfma2_sat_Accuracy_Positive",
+        "Unit_Device___hisinf_Accuracy_Positive",
+        "Unit_Device___hisinf2_Accuracy_Positive",
+        "Unit_Device___hisnan_Accuracy_Positive",
+        "Unit_Device___hisnan2_Accuracy_Positive",
+        "Unit_Device___heq_Accuracy_Positive",
+        "Unit_Device___hbeq2_Accuracy_Positive",
+        "Unit_Device___hequ_Accuracy_Positive",
+        "Unit_Device___hbequ2_Accuracy_Positive",
+        "Unit_Device___heq2_Accuracy_Positive",
+        "Unit_Device___hequ2_Accuracy_Positive",
+        "Unit_Device___hne_Accuracy_Positive",
+        "Unit_Device___hbne2_Accuracy_Positive",
+        "Unit_Device___hneu_Accuracy_Positive",
+        "Unit_Device___hbneu2_Accuracy_Positive",
+        "Unit_Device___hne2_Accuracy_Positive",
+        "Unit_Device___hneu2_Accuracy_Positive",
+        "Unit_Device___hge_Accuracy_Positive",
+        "Unit_Device___hbge2_Accuracy_Positive",
+        "Unit_Device___hgeu_Accuracy_Positive",
+        "Unit_Device___hbgeu2_Accuracy_Positive",
+        "Unit_Device___hge2_Accuracy_Positive",
+        "Unit_Device___hgeu2_Accuracy_Positive",
+        "Unit_Device___hgt_Accuracy_Positive",
+        "Unit_Device___hbgt2_Accuracy_Positive",
+        "Unit_Device___hgtu_Accuracy_Positive",
+        "Unit_Device___hbgtu2_Accuracy_Positive",
+        "Unit_Device___hgt2_Accuracy_Positive",
+        "Unit_Device___hgtu2_Accuracy_Positive",
+        "Unit_Device___hle_Accuracy_Positive",
+        "Unit_Device___hble2_Accuracy_Positive",
+        "Unit_Device___hleu_Accuracy_Positive",
+        "Unit_Device___hbleu2_Accuracy_Positive",
+        "Unit_Device___hle2_Accuracy_Positive",
+        "Unit_Device___hleu2_Accuracy_Positive",
+        "Unit_Device___hlt_Accuracy_Positive",
+        "Unit_Device___hblt2_Accuracy_Positive",
+        "Unit_Device___hltu_Accuracy_Positive",
+        "Unit_Device___hbltu2_Accuracy_Positive",
+        "Unit_Device___hlt2_Accuracy_Positive",
+        "Unit_Device___hltu2_Accuracy_Positive",
+        "Unit_Device___hmax_Accuracy_Positive",
+        "Unit_Device___hmin_Accuracy_Positive",
+        "Unit_Device___hmax_nan_Accuracy_Positive",
+        "Unit_Device___hmin_nan_Accuracy_Positive",
+        "Unit_Device___half2int_rn_Accuracy_Positive",
+        "Unit_Device___half2int_rz_Accuracy_Positive",
+        "Unit_Device___half2int_rd_Accuracy_Positive",
+        "Unit_Device___half2int_ru_Accuracy_Positive",
+        "Unit_Device___half2uint_rn_Accuracy_Positive",
+        "Unit_Device___half2uint_rz_Accuracy_Positive",
+        "Unit_Device___half2uint_rd_Accuracy_Positive",
+        "Unit_Device___half2uint_ru_Accuracy_Positive",
+        "Unit_Device___half2short_rn_Accuracy_Positive",
+        "Unit_Device___half2short_rz_Accuracy_Positive",
+        "Unit_Device___half2short_rd_Accuracy_Positive",
+        "Unit_Device___half2short_ru_Accuracy_Positive",
+        "Unit_Device___half2ushort_rn_Accuracy_Positive",
+        "Unit_Device___half2ushort_rz_Accuracy_Positive",
+        "Unit_Device___half2ushort_rd_Accuracy_Positive",
+        "Unit_Device___half2ushort_ru_Accuracy_Positive",
+        "Unit_Device___half2ll_rn_Accuracy_Positive",
+        "Unit_Device___half2ll_rz_Accuracy_Positive",
+        "Unit_Device___half2ll_rd_Accuracy_Positive",
+        "Unit_Device___half2ll_ru_Accuracy_Positive",
+        "Unit_Device___half2ull_rn_Accuracy_Positive",
+        "Unit_Device___half2ull_rz_Accuracy_Positive",
+        "Unit_Device___half2ull_rd_Accuracy_Positive",
+        "Unit_Device___half2ull_ru_Accuracy_Positive",
+        "Unit_Device___half_as_short_Accuracy_Positive",
+        "Unit_Device___half_as_ushort_Accuracy_Positive",
+        "Unit_Device___int2half_rn_Accuracy_Positive",
+        "Unit_Device___int2half_rz_Accuracy_Positive",
+        "Unit_Device___int2half_rd_Accuracy_Positive",
+        "Unit_Device___int2half_ru_Accuracy_Positive",
+        "Unit_Device___uint2half_rn_Accuracy_Positive",
+        "Unit_Device___uint2half_rz_Accuracy_Positive",
+        "Unit_Device___uint2half_rd_Accuracy_Positive",
+        "Unit_Device___uint2half_ru_Accuracy_Positive",
+        "Unit_Device___short2half_rn_Accuracy_Positive",
+        "Unit_Device___short2half_rz_Accuracy_Positive",
+        "Unit_Device___short2half_rd_Accuracy_Positive",
+        "Unit_Device___short2half_ru_Accuracy_Positive",
+        "Unit_Device___ushort2half_rn_Accuracy_Positive",
+        "Unit_Device___ushort2half_rz_Accuracy_Positive",
+        "Unit_Device___ushort2half_rd_Accuracy_Positive",
+        "Unit_Device___ushort2half_ru_Accuracy_Positive",
+        "Unit_Device___ll2half_rn_Accuracy_Positive",
+        "Unit_Device___ll2half_rz_Accuracy_Positive",
+        "Unit_Device___ll2half_rd_Accuracy_Positive",
+        "Unit_Device___ll2half_ru_Accuracy_Positive",
+        "Unit_Device___ull2half_rn_Accuracy_Positive",
+        "Unit_Device___ull2half_rz_Accuracy_Positive",
+        "Unit_Device___ull2half_rd_Accuracy_Positive",
+        "Unit_Device___ull2half_ru_Accuracy_Positive",
+        "Unit_Device___short_as_half_Accuracy_Positive",
+        "Unit_Device___ushort_as_half_Accuracy_Positive",
+        "Unit_Device___float2half_rn_Accuracy_Positive",
+        "Unit_Device___float2half_Accuracy_Positive",
+        "Unit_Device___half2float_Accuracy_Positive",
+        "Unit_Device___frcp_rn_Accuracy_Positive",
+        "Unit_Device___fsqrt_rn_Accuracy_Positive",
+        "Unit_Device___frsqrt_rn_Accuracy_Positive",
+        "Unit_Device___expf_Accuracy_Positive",
+        "Unit_Device___exp10f_Accuracy_Positive",
+        "Unit_Device___logf_Accuracy_Positive",
+        "Unit_Device___log2f_Accuracy_Positive",
+        "Unit_Device___log10f_Accuracy_Positive",
+        "Unit_Device___sinf_Accuracy_Positive",
+        "Unit_Device___sincosf_sin_Accuracy_Positive",
+        "Unit_Device___cosf_Accuracy_Positive",
+        "Unit_Device___sincosf_cos_Accuracy_Positive",
+        "Unit_Device___fadd_rn_Accuracy_Positive",
+        "Unit_Device___fsub_rn_Accuracy_Positive",
+        "Unit_Device___fmul_rn_Accuracy_Positive",
+        "Unit_Device___fdiv_rn_Accuracy_Positive",
+        "Unit_Device___fdividef_Accuracy_Positive",
+        "Unit_Device___fmaf_rn_Accuracy_Positive",
+        "Unit_Device___drcp_rn_Accuracy_Positive",
+        "Unit_Device___dsqrt_rn_Accuracy_Positive",
+        "Unit_Device___dadd_rn_Accuracy_Positive",
+        "Unit_Device___dsub_rn_Accuracy_Positive",
+        "Unit_Device___dmul_rn_Accuracy_Positive",
+        "Unit_Device___ddiv_rn_Accuracy_Positive",
+        "Unit_Device___fma_rn_Accuracy_Positive",
+        "Unit___hip_atomic_load_store_Positive_Acquire_Release",
+        "Unit___hip_atomic_exchange_Positive_Acquire_Release",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Acquire_Release",
+        "Unit___hip_atomic_compare_exchange_weak_Positive_Acquire_Release",
+        "Unit___hip_atomic_fetch_add_Positive_Acquire_Release",
+        "Unit___hip_atomic_fetch_and_Positive_Acquire_Release",
+        "Unit___hip_atomic_fetch_or_Positive_Acquire_Release",
+        "Unit___hip_atomic_fetch_xor_Positive_Acquire_Release",
+        "Unit___hip_atomic_fetch_min_Positive_Acquire_Release",
+        "Unit___hip_atomic_fetch_max_Positive_Acquire_Release",
+        "Unit___hip_atomic_load_store_Positive_Sequential_Consistency",
+        "Unit___hip_atomic_exchange_Positive_Sequential_Consistency",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Sequential_Consistency",
+        "Unit___hip_atomic_compare_exchange_weak_Positive_Sequential_Consistency",
+        "Unit___hip_atomic_fetch_add_Positive_Sequential_Consistency",
+        "Unit___hip_atomic_fetch_and_Positive_Sequential_Consistency",
+        "Unit___hip_atomic_fetch_or_Positive_Sequential_Consistency",
+        "Unit___hip_atomic_fetch_xor_Positive_Sequential_Consistency",
+        "Unit___hip_atomic_fetch_min_Positive_Sequential_Consistency",
+        "Unit___hip_atomic_fetch_max_Positive_Sequential_Consistency",
+        "Unit___hip_atomic_fetch_add_Positive_Wavefront - int",
+        "Unit___hip_atomic_fetch_add_Positive_Wavefront - unsigned int",
+        "Unit___hip_atomic_fetch_add_Positive_Wavefront - unsigned long",
+        "Unit___hip_atomic_fetch_add_Positive_Wavefront - unsigned long long",
+        "Unit___hip_atomic_fetch_add_Positive_Wavefront - float",
+        "Unit___hip_atomic_fetch_add_Positive_Wavefront - double",
+        "Unit___hip_atomic_fetch_add_Positive_Workgroup - int",
+        "Unit___hip_atomic_fetch_add_Positive_Workgroup - unsigned int",
+        "Unit___hip_atomic_fetch_add_Positive_Workgroup - unsigned long",
+        "Unit___hip_atomic_fetch_add_Positive_Workgroup - unsigned long long",
+        "Unit___hip_atomic_fetch_add_Positive_Workgroup - float",
+        "Unit___hip_atomic_fetch_add_Positive_Workgroup - double",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Wavefront - int",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Wavefront - unsigned int",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Wavefront - unsigned long",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Wavefront - unsigned long long",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Wavefront - float",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Wavefront - double",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Workgroup - int",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Workgroup - unsigned int",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Workgroup - unsigned long",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Workgroup - unsigned long long",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Workgroup - float",
+        "Unit___hip_atomic_compare_exchange_strong_Positive_Workgroup - double",
+        "Unit_atomicAdd_Positive - int",
+        "Unit_atomicAdd_Positive - unsigned int",
+        "Unit_atomicAdd_Positive - unsigned long",
+        "Unit_atomicAdd_Positive - unsigned long long",
+        "Unit_atomicAdd_Positive - float",
+        "Unit_atomicAdd_Positive - double",
+        "Unit_atomicAdd_Positive_Multi_Kernel - int",
+        "Unit_atomicAdd_Positive_Multi_Kernel - unsigned int",
+        "Unit_atomicAdd_Positive_Multi_Kernel - unsigned long",
+        "Unit_atomicAdd_Positive_Multi_Kernel - unsigned long long",
+        "Unit_atomicAdd_Positive_Multi_Kernel - float",
+        "Unit_atomicAdd_Positive_Multi_Kernel - double",
+        "Unit_atomicAdd_Negative_Parameters_RTC",
+        "Unit_atomicAdd_system_Positive_Peer_GPUs - int",
+        "Unit_atomicAdd_system_Positive_Peer_GPUs - unsigned int",
+        "Unit_atomicAdd_system_Positive_Peer_GPUs - unsigned long",
+        "Unit_atomicAdd_system_Positive_Peer_GPUs - unsigned long long",
+        "Unit_atomicAdd_system_Positive_Peer_GPUs - float",
+        "Unit_atomicAdd_system_Positive_Peer_GPUs - double",
+        "Unit_atomicAdd_system_Positive_Host_And_GPU - int",
+        "Unit_atomicAdd_system_Positive_Host_And_GPU - unsigned int",
+        "Unit_atomicAdd_system_Positive_Host_And_GPU - unsigned long",
+        "Unit_atomicAdd_system_Positive_Host_And_GPU - unsigned long long",
+        "Unit_atomicAdd_system_Positive_Host_And_GPU - float",
+        "Unit_atomicAdd_system_Positive_Host_And_GPU - double",
+        "Unit_atomicAdd_system_Positive_Host_And_Peer_GPUs - int",
+        "Unit_atomicAdd_system_Positive_Host_And_Peer_GPUs - unsigned int",
+        "Unit_atomicAdd_system_Positive_Host_And_Peer_GPUs - unsigned long",
+        "Unit_atomicAdd_system_Positive_Host_And_Peer_GPUs - unsigned long long",
+        "Unit_atomicAdd_system_Positive_Host_And_Peer_GPUs - float",
+        "Unit_atomicAdd_system_Positive_Host_And_Peer_GPUs - double",
+        "Unit_unsafeAtomicAdd_Positive - float",
+        "Unit_unsafeAtomicAdd_Positive - double",
+        "Unit_unsafeAtomicAdd_Positive_Multi_Kernel - float",
+        "Unit_unsafeAtomicAdd_Positive_Multi_Kernel - double",
+        "Unit_safeAtomicAdd_Positive - float",
+        "Unit_safeAtomicAdd_Positive - double",
+        "Unit_safeAtomicAdd_Positive_Multi_Kernel - float",
+        "Unit_safeAtomicAdd_Positive_Multi_Kernel - double",
+        "Unit_atomicSub_Positive - int",
+        "Unit_atomicSub_Positive - unsigned int",
+        "Unit_atomicSub_Positive - unsigned long",
+        "Unit_atomicSub_Positive - unsigned long long",
+        "Unit_atomicSub_Positive - float",
+        "Unit_atomicSub_Positive - double",
+        "Unit_atomicSub_Positive_Multi_Kernel - int",
+        "Unit_atomicSub_Positive_Multi_Kernel - unsigned int",
+        "Unit_atomicSub_Positive_Multi_Kernel - unsigned long",
+        "Unit_atomicSub_Positive_Multi_Kernel - unsigned long long",
+        "Unit_atomicSub_Positive_Multi_Kernel - float",
+        "Unit_atomicSub_Positive_Multi_Kernel - double",
+        "Unit_atomicSub_Negative_Parameters_RTC",
+        "Unit_atomicSub_system_Positive_Peer_GPUs - int",
+        "Unit_atomicSub_system_Positive_Peer_GPUs - unsigned int",
+        "Unit_atomicSub_system_Positive_Peer_GPUs - unsigned long",
+        "Unit_atomicSub_system_Positive_Peer_GPUs - unsigned long long",
+        "Unit_atomicSub_system_Positive_Peer_GPUs - float",
+        "Unit_atomicSub_system_Positive_Peer_GPUs - double",
+        "Unit_atomicSub_system_Positive_Host_And_GPU - int",
+        "Unit_atomicSub_system_Positive_Host_And_GPU - unsigned int",
+        "Unit_atomicSub_system_Positive_Host_And_GPU - unsigned long",
+        "Unit_atomicSub_system_Positive_Host_And_GPU - unsigned long long",
+        "Unit_atomicSub_system_Positive_Host_And_GPU - float",
+        "Unit_atomicSub_system_Positive_Host_And_GPU - double",
+        "Unit_atomicSub_system_Positive_Host_And_Peer_GPUs - int",
+        "Unit_atomicSub_system_Positive_Host_And_Peer_GPUs - unsigned int",
+        "Unit_atomicSub_system_Positive_Host_And_Peer_GPUs - unsigned long",
+        "Unit_atomicSub_system_Positive_Host_And_Peer_GPUs - unsigned long long",
+        "Unit_atomicSub_system_Positive_Host_And_Peer_GPUs - float",
+        "Unit_atomicSub_system_Positive_Host_And_Peer_GPUs - double",
+        "Unit_atomicInc_Positive - unsigned int",
+        "Unit_atomicInc_Positive_Multi_Kernel - unsigned int",
+        "Unit_atomicInc_Negative_Parameters_RTC",
+        "Unit_atomicDec_Positive - unsigned int",
+        "Unit_atomicDec_Positive_Multi_Kernel - unsigned int",
+        "Unit_atomicDec_Negative_Parameters_RTC",
+        "Unit_atomicCAS_Positive - int",
+        "Unit_atomicCAS_Positive - unsigned int",
+        "Unit_atomicCAS_Positive - unsigned long long",
+        "Unit_atomicCAS_Positive_Multi_Kernel - int",
+        "Unit_atomicCAS_Positive_Multi_Kernel - unsigned int",
+        "Unit_atomicCAS_Positive_Multi_Kernel - unsigned long long",
+        "Unit_atomicCAS_Negative_Parameters_RTC",
+        "Unit_atomicCAS_system_Positive_Peer_GPUs - int",
+        "Unit_atomicCAS_system_Positive_Peer_GPUs - unsigned int",
+        "Unit_atomicCAS_system_Positive_Peer_GPUs - unsigned long long",
+        "Unit_atomicCAS_system_Positive_Host_And_GPU - int",
+        "Unit_atomicCAS_system_Positive_Host_And_GPU - unsigned int",
+        "Unit_atomicCAS_system_Positive_Host_And_GPU - unsigned long long",
+        "Unit_atomicCAS_system_Positive_Host_And_Peer_GPUs - int",
+        "Unit_atomicCAS_system_Positive_Host_And_Peer_GPUs - unsigned int",
+        "Unit_atomicCAS_system_Positive_Host_And_Peer_GPUs - unsigned long long",
    #endif
        "End of json"
    ]
@@ -89,6 +89,149 @@
        "Performance_hipMemsetD32",
        "Performance_hipMemsetD32Async",
        "Unit_hipMemcpyParam2D_Positive_Synchronization_Behavior",
-        "Unit_hipMemcpy_Positive_Synchronization_Behavior"
+        "Unit_hipMemcpy_Positive_Synchronization_Behavior",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - char",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - unsigned char",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - short",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - unsigned short",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - int",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - unsigned int",
+        "Unit_tex1Dfetch_Positive_ReadModeElementType - float",
+        "Unit_tex1Dfetch_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1Dfetch_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1Dfetch_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1Dfetch_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex1D_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1D_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1D_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1D_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex1DLayered_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1DLayered_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1DLayered_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1DLayered_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex1DGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1DGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1DGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1DGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1DLayeredGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex1DLayeredLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1DLayeredLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1DLayeredLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1DLayeredLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex1DLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex1DLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex1DLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex1DLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex3D_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex3D_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex3D_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex3D_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex3DLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex3DLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex3DLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex3DLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex3DGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex3DGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex3DGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex3DGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_texCubemap_Positive_ReadModeElementType - char",
+        "Unit_texCubemap_Positive_ReadModeElementType - unsigned char",
+        "Unit_texCubemap_Positive_ReadModeElementType - short",
+        "Unit_texCubemap_Positive_ReadModeElementType - unsigned short",
+        "Unit_texCubemap_Positive_ReadModeElementType - int",
+        "Unit_texCubemap_Positive_ReadModeElementType - unsigned int",
+        "Unit_texCubemap_Positive_ReadModeElementType - float",
+        "Unit_texCubemap_Positive_ReadModeNormalizedFloat - char",
+        "Unit_texCubemap_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_texCubemap_Positive_ReadModeNormalizedFloat - short",
+        "Unit_texCubemap_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - char",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - unsigned char",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - short",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - unsigned short",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - int",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - unsigned int",
+        "Unit_texCubemapLod_Positive_ReadModeElementType - float",
+        "Unit_texCubemapLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_texCubemapLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_texCubemapLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_texCubemapLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - char",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - unsigned char",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - short",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - unsigned short",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - int",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - unsigned int",
+        "Unit_texCubemapGrad_Positive_ReadModeElementType - float",
+        "Unit_texCubemapGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_texCubemapGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_texCubemapGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_texCubemapGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - char",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - unsigned char",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - short",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - unsigned short",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - int",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - unsigned int",
+        "Unit_texCubemapLayered_Positive_ReadModeElementType - float",
+        "Unit_texCubemapLayered_Positive_ReadModeNormalizedFloat - char",
+        "Unit_texCubemapLayered_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_texCubemapLayered_Positive_ReadModeNormalizedFloat - short",
+        "Unit_texCubemapLayered_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - char",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - unsigned char",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - short",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - unsigned short",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - int",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - unsigned int",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeElementType - float",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_texCubemapLayeredLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - char",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - unsigned char",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - short",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - unsigned short",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - int",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - unsigned int",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeElementType - float",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_texCubemapLayeredGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex2D_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex2D_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex2D_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex2D_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex2DLayered_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex2DLayered_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex2DLayered_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex2DLayered_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex2DGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex2DGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex2DGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex2DGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex2DLayeredGrad_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex2DLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex2DLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex2DLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex2DLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_tex2DLayeredLod_Positive_ReadModeNormalizedFloat - char",
+        "Unit_tex2DLayeredLod_Positive_ReadModeNormalizedFloat - unsigned char",
+        "Unit_tex2DLayeredLod_Positive_ReadModeNormalizedFloat - short",
+        "Unit_tex2DLayeredLod_Positive_ReadModeNormalizedFloat - unsigned short",
+        "Unit_hipDrvGetErrorString_Positive_Basic",
+        "Unit_hipLaunchKernel_Negative_Parameters",
+        "Unit_Assert_Positive_Basic_KernelFail",
+        "=== Below tests fail in external CI for PR https://github.com/ROCm-Developer-Tools/hip-tests/pull/210 ===",
+        "Unit_hipMemImportFromShareableHandle_Positive_MultiProc",
+        "Unit_hipMemMapArrayAsync_Positive_Basic"
    ]
 }
@@ -44,6 +44,7 @@
        "Performance_hipMemsetD32",
        "Performance_hipMemsetD32Async",
        "Unit_hipMemcpyParam2D_Positive_Synchronization_Behavior",
-        "Unit_hipMemcpy_Positive_Synchronization_Behavior"
+        "Unit_hipMemcpy_Positive_Synchronization_Behavior",
+        "Unit_hipMemMapArrayAsync_Positive_Basic"
    ]
 }
@@ -36,6 +36,12 @@ int main(int argc, char** argv) {
    | Opt(cmd_options.cg_iterations, "cg_iterations")
        ["-C"]["--cg-iterations"]
        ("Number of iterations used for cooperative groups sync tests (default: 5)")
+    | Opt(cmd_options.accuracy_iterations, "accuracy_iterations")
+        ["-A"]["--accuracy-iterations"]
+        ("Number of iterations used for math accuracy tests with randomly generated inputs (default: 2^32)")
+    | Opt(cmd_options.accuracy_max_memory, "accuracy_max_memory")
+        ["-M"]["--accuracy-max-memory"]
+        ("Percentage of global device memory allowed for math accuracy tests (default: 80%)")
  ;
  // clang-format on

@@ -22,6 +22,9 @@ THE SOFTWARE.

 #pragma once

+#include <cstdint>
+#include <limits>
+
 struct CmdOptions {
  int iterations = 10;
  int warmups = 100;
@@ -29,6 +32,8 @@ struct CmdOptions {
  int cg_iterations = 5;
  bool no_display = false;
  bool progress = false;
+  uint64_t accuracy_iterations = std::numeric_limits<uint32_t>::max() + 1ull;
+  int accuracy_max_memory = 80;
 };

 extern CmdOptions cmd_options;
@@ -129,6 +129,19 @@ THE SOFTWARE.
    }                                                                                              \
  }

+// Evaluates errorExpr once, logs expected vs. actual hiprtc error via INFO, REQUIREs they match.
+#define HIPRTC_CHECK_ERROR(errorExpr, expectedError)                                               \
+  {                                                                                                \
+    auto localError = errorExpr;                                                                   \
+    INFO("Matching Errors: "                                                                       \
+         << "\n    Expected Error: " << hiprtcGetErrorString(expectedError)                        \
+         << "\n    Expected Code: " << expectedError << '\n'                                       \
+         << "                  Actual Error:   " << hiprtcGetErrorString(localError)               \
+         << "\n    Actual Code:   " << localError << "\nStr: " << #errorExpr                       \
+         << "\n    In File: " << __FILE__ << "\n    At line: " << __LINE__);                       \
+    REQUIRE(localError == expectedError);                                                          \
+  }
+
 #define HIPASSERT(condition)                                                                       \
  if (!(condition)) {                                                                              \
    printf("assertion %s at %s:%d \n", #condition, __FILE__, __LINE__);                            \
@@ -165,7 +178,7 @@ static inline bool IsGfx11() {
  hipDeviceProp_t props{};
  HIP_CHECK(hipGetDevice(&device));
  HIP_CHECK(hipGetDeviceProperties(&props, device));
-   // Get GCN Arch Name and compare to check if it is gfx11
+  // Get GCN Arch Name and compare to check if it is gfx11
  std::string arch = std::string(props.gcnArchName);
  auto pos = arch.find("gfx11");
  if (pos != std::string::npos)
@@ -173,7 +186,7 @@ static inline bool IsGfx11() {
  else
    return false;
 #else
-  std::cout<<"Have to be either Nvidia or AMD platform, asserting"<<std::endl;
+  std::cout << "Have to be either Nvidia or AMD platform, asserting" << std::endl;
  assert(false);
 #endif
 }
@@ -308,7 +321,7 @@ void launchKernel(K kernel, Dim numBlocks, Dim numThreads, std::uint32_t memPerB
  launchRTCKernel<Typenames...>(kernel, numBlocks, numThreads, memPerBlock, stream,
                                std::forward<Args>(packedArgs)...);
 #endif
-HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipGetLastError());
 }

 //---
@@ -39,6 +39,13 @@ THE SOFTWARE.
 * @}
 */

+/**
+ * @defgroup AtomicsTest Device Atomics
+ * @{
+ * This section describes tests for the Device Atomic APIs.
+ * @}
+ */
+
 /**
 * @defgroup DeviceLanguageTest Device Language
 * @{
@@ -96,16 +103,23 @@ THE SOFTWARE.
 */

 /**
-* @defgroup KernelTest Kernel Functions Management
-* @{
-* This section describes the various kernel functions invocation.
-* @}
-*/
+ * @defgroup KernelTest Kernel Functions Management
+ * @{
+ * This section describes the various kernel functions invocation.
+ * @}
+ */

 /**
- * @defgroup AtomicsTest Device Atomics
+ * @defgroup SyncthreadsTest Synchronization Functions
 * @{
- * This section describes tests for the Device Atomic APIs.
+ * This section describes tests for Synchronization Functions.
+ * @}
+ */
+
+/**
+ * @defgroup ThreadfenceTest Memory Fence Functions
+ * @{
+ * This section describes tests for Memory Fence Functions.
 * @}
 */

@@ -119,7 +133,8 @@ THE SOFTWARE.
 /**
 * @defgroup PeerToPeerTest PeerToPeer Device Memory Access
 * @{
- * This section describes tests for the PeerToPeer device memory access functions of HIP runtime API.
+ * This section describes tests for the PeerToPeer device memory access functions of HIP runtime
+ * API.
 * @warning PeerToPeer support is experimental.
 * @}
 */
@@ -135,6 +150,7 @@ THE SOFTWARE.
 * @defgroup ShflTest warp shuffle function Management
 * @{
 * This section describes the warp shuffle types & functions of HIP runtime API.
+ * @}
 */

 /**
@@ -158,6 +174,13 @@ THE SOFTWARE.
 * @}
 */

+/**
+ * @defgroup ModuleTest Module Management
+ * @{
+ * This section describes the module management types & functions of HIP runtime API.
+ * @}
+ */
+
 /**
 * @defgroup TextureTest Texture Management
 * @{
@@ -172,6 +195,13 @@ THE SOFTWARE.
 * @}
 */

+/**
+ * @defgroup MathTest Math Device Functions
+ * @{
+ * This section describes tests for device math functions of HIP runtime API.
+ * @}
+ */
+
 /**
 * @defgroup PrintfTest Printf API Management
 * @{
@@ -192,3 +222,10 @@ THE SOFTWARE.
 * This section describes tests for the Complex type functions.
 * @}
 */
+
+/**
+ * @defgroup VirtualMemoryManagementTest Virtual Memory Management APIs
+ * @{
+ * This section describes the virtual memory management types & functions of HIP runtime API.
+ * @}
+ */
@@ -23,7 +23,7 @@ THE SOFTWARE.
 #pragma once
 #pragma clang diagnostic ignored "-Wmissing-field-initializers"
 #pragma clang diagnostic ignored "-Wunused-lambda-capture"
-
+#pragma clang diagnostic ignored "-Wunused-parameter"
 #include <variant>

 #include <hip_test_common.hh>
@@ -44,8 +44,9 @@ static inline hipMemcpyKind ReverseMemcpyDirection(const hipMemcpyKind direction
  }
 };

-static hipMemcpy3DParms GetMemcpy3DParms(PtrVariant dst_ptr, hipPos dst_pos, PtrVariant src_ptr,
-                                         hipPos src_pos, hipExtent extent, hipMemcpyKind kind) {
+static inline hipMemcpy3DParms GetMemcpy3DParms(PtrVariant dst_ptr, hipPos dst_pos,
+                                                PtrVariant src_ptr, hipPos src_pos,
+                                                hipExtent extent, hipMemcpyKind kind) {
  hipMemcpy3DParms parms = {0};
  if (std::holds_alternative<hipArray_t>(dst_ptr)) {
    parms.dstArray = std::get<hipArray_t>(dst_ptr);
@@ -185,7 +186,7 @@ void Memcpy3DDeviceToDeviceShell(F memcpy_func, hipStream_t kernel_stream = null
    HIP_CHECK(hipDeviceCanAccessPeer(&can_access_peer, src_device, dst_device));
    if (!can_access_peer) {
      std::string msg = "Skipped as peer access cannot be enabled between devices " +
-                         std::to_string(src_device) + " " + std::to_string(dst_device);
+          std::to_string(src_device) + " " + std::to_string(dst_device);
      HipTest::HIP_SKIP_TEST(msg.c_str());
      return;
    }
@@ -205,7 +206,8 @@ void Memcpy3DDeviceToDeviceShell(F memcpy_func, hipStream_t kernel_stream = null
  // Using dst_alloc width and height to set only the elements that will be copied over to
  // dst_alloc
  Iota<<<blocks, threads_per_block, 0, kernel_stream>>>(src_alloc.ptr(), src_alloc.pitch(),
-                          dst_alloc.width_logical(),dst_alloc.height(), dst_alloc.depth());
+                                                        dst_alloc.width_logical(),
+                                                        dst_alloc.height(), dst_alloc.depth());
  HIP_CHECK(hipGetLastError());

  HIP_CHECK(memcpy_func(dst_alloc.pitched_ptr(), make_hipPos(0, 0, 0), src_alloc.pitched_ptr(),
@@ -626,15 +628,14 @@ constexpr auto MemTypeUnified() {

 using DrvPtrVariant = std::variant<hipPitchedPtr, hipArray_t>;

-template <bool async = false>
-hipError_t DrvMemcpy3DWrapper(DrvPtrVariant dst_ptr, hipPos dst_pos, DrvPtrVariant src_ptr,
-                              hipPos src_pos, hipExtent extent, hipMemcpyKind kind,
-                              hipStream_t stream = nullptr) {
+static inline HIP_MEMCPY3D GetDrvMemcpy3DParms(DrvPtrVariant dst_ptr, hipPos dst_pos,
+                                               DrvPtrVariant src_ptr, hipPos src_pos,
+                                               hipExtent extent, hipMemcpyKind kind) {
  HIP_MEMCPY3D parms = {0};

  if (std::holds_alternative<hipArray_t>(dst_ptr)) {
    parms.dstMemoryType = hipMemoryTypeArray;
-    parms.dstArray = std::get<hipArray_t>(dst_ptr);  
+    parms.dstArray = std::get<hipArray_t>(dst_ptr);
  } else {
    auto ptr = std::get<hipPitchedPtr>(dst_ptr);
    parms.dstPitch = ptr.pitch;
@@ -694,6 +695,84 @@ hipError_t DrvMemcpy3DWrapper(DrvPtrVariant dst_ptr, hipPos dst_pos, DrvPtrVaria
  parms.dstY = dst_pos.y;
  parms.dstZ = dst_pos.z;

+  return parms;
+}
+
+// Field-wise equality for HIP_MEMCPY3D. Array handles and device/host pointers are compared only
+// when set in lhs; when lhs has no array on a side, that side's pitch is compared instead.
+static inline bool operator==(const HIP_MEMCPY3D& lhs, const HIP_MEMCPY3D& rhs) {
+  const bool pos_eq = lhs.dstXInBytes == rhs.dstXInBytes && lhs.dstY == rhs.dstY &&
+      lhs.dstZ == rhs.dstZ && lhs.srcXInBytes == rhs.srcXInBytes && lhs.srcY == rhs.srcY &&
+      lhs.srcZ == rhs.srcZ;
+  const bool extent_eq =
+      lhs.WidthInBytes == rhs.WidthInBytes && lhs.Height == rhs.Height && lhs.Depth == rhs.Depth;
+
+  // Destination side: memory type plus either the array handle or the pitch.
+  bool mem_eq = lhs.dstMemoryType == rhs.dstMemoryType &&
+      (lhs.dstArray ? lhs.dstArray == rhs.dstArray : lhs.dstPitch == rhs.dstPitch);
+  // Source side: combined with && so that a destination mismatch is not overwritten.
+  mem_eq = mem_eq && lhs.srcMemoryType == rhs.srcMemoryType &&
+      (lhs.srcArray ? lhs.srcArray == rhs.srcArray : lhs.srcPitch == rhs.srcPitch);
+
+  if (lhs.dstDevice) {
+    mem_eq = mem_eq && (lhs.dstDevice == rhs.dstDevice);
+  }
+  if (lhs.dstHost) {
+    // Compare the host pointers here, not dstDevice again.
+    mem_eq = mem_eq && (lhs.dstHost == rhs.dstHost);
+  }
+  if (lhs.srcDevice) {
+    mem_eq = mem_eq && (lhs.srcDevice == rhs.srcDevice);
+  }
+  if (lhs.srcHost) {
+    mem_eq = mem_eq && (lhs.srcHost == rhs.srcHost);
+  }
+
+  return pos_eq && extent_eq && mem_eq;
+}
+
+// Disabled: hipDrvGraphMemcpyNodeGetParams/SetParams are not yet implemented in HIP runtime.
+#if 0
+template <bool set_params = false>
+hipError_t DrvMemcpy3DGraphWrapper(DrvPtrVariant dst_ptr, hipPos dst_pos, DrvPtrVariant src_ptr,
+                                   hipPos src_pos, hipExtent extent, hipMemcpyKind kind,
+                                   hipCtx_t context, hipStream_t stream = nullptr) {
+  auto parms = GetDrvMemcpy3DParms(dst_ptr, dst_pos, src_ptr, src_pos, extent, kind);
+  // Build a one-node graph; with set_params the node starts reversed and is then overwritten.
+  hipGraph_t g = nullptr;
+  HIP_CHECK(hipGraphCreate(&g, 0));
+  hipGraphNode_t node = nullptr;
+  if constexpr (set_params) {
+    auto reversed_parms = GetDrvMemcpy3DParms(src_ptr, src_pos, dst_ptr, dst_pos, extent,
+                                              ReverseMemcpyDirection(kind));
+    HIP_CHECK(hipDrvGraphAddMemcpyNode(&node, g, nullptr, 0, &reversed_parms, context));
+    HIP_CHECK(hipDrvGraphMemcpyNodeSetParams(node, &parms));
+  } else {
+    HIP_CHECK(hipDrvGraphAddMemcpyNode(&node, g, nullptr, 0, &parms, context));
+  }
+  // The node must report back the parameters it was given.
+  HIP_MEMCPY3D retrieved_params = {0};
+  HIP_CHECK(hipDrvGraphMemcpyNodeGetParams(node, &retrieved_params));
+  REQUIRE(parms == retrieved_params);
+  // Run the copy. NOTE(review): `stream` is unused; hipStreamPerThread is used instead.
+  hipGraphExec_t graph_exec = nullptr;
+  HIP_CHECK(hipGraphInstantiate(&graph_exec, g, nullptr, nullptr, 0));
+  HIP_CHECK(hipGraphLaunch(graph_exec, hipStreamPerThread));
+  HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
+
+  HIP_CHECK(hipGraphExecDestroy(graph_exec));
+  HIP_CHECK(hipGraphDestroy(g));
+
+  return hipSuccess;
+}
+#endif  // if 0
+
+template <bool async = false>
+hipError_t DrvMemcpy3DWrapper(DrvPtrVariant dst_ptr, hipPos dst_pos, DrvPtrVariant src_ptr,
+                              hipPos src_pos, hipExtent extent, hipMemcpyKind kind,
+                              hipStream_t stream = nullptr) {
+  auto parms = GetDrvMemcpy3DParms(dst_ptr, dst_pos, src_ptr, src_pos, extent, kind);
+
  if constexpr (async) {
    return hipDrvMemcpy3DAsync(&parms, stream);
  } else {
@@ -805,4 +884,4 @@ void DrvMemcpy3DArrayDeviceShell(F memcpy_func, const hipStream_t kernel_stream
  };
  PitchedMemoryVerify(host_alloc.ptr(), extent.width, extent.width / sizeof(int), extent.height,
                      extent.depth, f);
-}
+}
@@ -35,15 +35,15 @@ enum class LinearAllocs {
 inline std::string to_string(const LinearAllocs allocation_type) {
  switch (allocation_type) {
    case LinearAllocs::malloc:
-      return "host pageable";
+      return "malloc";
    case LinearAllocs::mallocAndRegister:
-      return "registered";
+      return "malloc + hipHostRegister";
    case LinearAllocs::hipHostMalloc:
-      return "host pinned";
+      return "hipHostMalloc";
    case LinearAllocs::hipMalloc:
-      return "device malloc";
+      return "hipMalloc";
    case LinearAllocs::hipMallocManaged:
-      return "managed";
+      return "hipMallocManaged";
    default:
      return "unknown alloc type";
  }
@@ -83,24 +83,38 @@ template <typename T> class LinearAllocGuard {

  LinearAllocGuard(const LinearAllocGuard&) = delete;

-  LinearAllocGuard(LinearAllocGuard&& o)
-      : allocation_type_{o.allocation_type_}, ptr_{o.ptr_}, host_ptr_{o.host_ptr_} {
-    o.allocation_type_ = LinearAllocs::noAlloc;
-    o.ptr_ = nullptr;
-    o.host_ptr_ = nullptr;
-  }
+  LinearAllocGuard(LinearAllocGuard&& o) { *this = std::move(o); }

  LinearAllocGuard& operator=(LinearAllocGuard&& o) {
-    allocation_type_ = o.allocation_type_;
-    ptr_ = o.ptr_;
-    host_ptr_ = o.host_ptr_;
+    if (this != &o) {
+      dealloc();

-    o.allocation_type_ = LinearAllocs::noAlloc;
-    o.ptr_ = nullptr;
-    o.host_ptr_ = nullptr;
+      allocation_type_ = o.allocation_type_;
+      ptr_ = o.ptr_;
+      host_ptr_ = o.host_ptr_;
+
+      o.allocation_type_ = LinearAllocs::noAlloc;
+      o.ptr_ = nullptr;
+      o.host_ptr_ = nullptr;
+    }
+
+    return *this;
  }

-  ~LinearAllocGuard() {
+  ~LinearAllocGuard() { dealloc(); }
+
+  T* ptr() const { return ptr_; };
+  T* host_ptr() const { return host_ptr_; }
+
+ private:
+  LinearAllocs allocation_type_ = LinearAllocs::noAlloc;
+  T* ptr_ = nullptr;
+  T* host_ptr_ = nullptr;
+
+  void dealloc() {
+    if (ptr_ == nullptr) {
+      return;
+    }
    // No Catch macros, don't want to possibly throw in the destructor
    if (ptr_ != nullptr) {
      switch (allocation_type_) {
@@ -123,14 +137,6 @@ template <typename T> class LinearAllocGuard {
      }
    }
  }
-
-  T* ptr() const { return ptr_; };
-  T* host_ptr() const { return host_ptr_; }
-
- private:
-  LinearAllocs allocation_type_ = LinearAllocs::noAlloc;
-  T* ptr_ = nullptr;
-  T* host_ptr_ = nullptr;
 };

 template <typename T> class LinearAllocGuardMultiDim {
@@ -210,6 +216,42 @@ template <typename T> class ArrayAllocGuard {
  const hipExtent extent_;
 };

+// Owns a mipmapped array of channel type T: allocated on construction, freed on destruction.
+template <typename T> class MipmappedArrayAllocGuard {
+ public:
+  // extent should contain logical width
+  MipmappedArrayAllocGuard(const hipExtent extent, const unsigned int levels,
+                           const unsigned int flags)
+      : extent_{extent}, levels_{levels} {
+    hipChannelFormatDesc desc = hipCreateChannelDesc<T>();
+    HIP_CHECK(hipMallocMipmappedArray(&ptr_, &desc, extent_, levels_, flags));
+  }
+
+  // Single-level overload: delegates with levels == 1.
+  MipmappedArrayAllocGuard(const hipExtent extent, const unsigned int flags = 0u)
+      : MipmappedArrayAllocGuard{extent, 1, flags} {}
+
+  ~MipmappedArrayAllocGuard() { static_cast<void>(hipFreeMipmappedArray(ptr_)); }
+  MipmappedArrayAllocGuard(const MipmappedArrayAllocGuard&) = delete;
+  MipmappedArrayAllocGuard(MipmappedArrayAllocGuard&&) = delete;
+
+  hipMipmappedArray_t ptr() const { return ptr_; }
+  // Returns the array handle of the given mipmap level; the guard itself is not modified.
+  hipArray_t GetLevel(unsigned int level) const {
+    hipArray_t ret = nullptr;
+    HIP_CHECK(hipGetMipmappedArrayLevel(&ret, ptr_, level));
+    return ret;
+  }
+
+  hipExtent extent() const { return extent_; }
+  unsigned int levels() const { return levels_; }
+
+ private:
+  hipMipmappedArray_t ptr_ = nullptr;
+  const hipExtent extent_;
+  const unsigned int levels_;
+};
+
 template <typename T> class DrvArrayAllocGuard {
 public:
  // extent should contain width in bytes
@@ -266,24 +308,24 @@ class StreamGuard {

  StreamGuard(const StreamGuard&) = delete;

-  StreamGuard(StreamGuard&& o)
-      : stream_type_{o.stream_type_}, flags_{o.flags_}, priority_{o.priority_}, stream_{o.stream_} {
-    o.stream_type_ = Streams::nullstream;
-    o.flags_ = 0u;
-    o.priority_ = 0;
-    o.stream_ = nullptr;
-  }
+  StreamGuard(StreamGuard&& o) { *this = std::move(o); }

  StreamGuard& operator=(StreamGuard&& o) {
-    stream_type_ = o.stream_type_;
-    flags_ = o.flags_;
-    priority_ = o.priority_;
-    stream_ = o.stream_;
+    if (this != &o) {
+      if (stream_type_ == Streams::created) {
+        static_cast<void>(hipStreamDestroy(stream_));
+      }

-    o.stream_type_ = Streams::nullstream;
-    o.flags_ = 0u;
-    o.priority_ = 0;
-    o.stream_ = nullptr;
+      stream_type_ = o.stream_type_;
+      flags_ = o.flags_;
+      priority_ = o.priority_;
+      stream_ = o.stream_;
+
+      o.stream_type_ = Streams::nullstream;
+      o.flags_ = 0u;
+      o.priority_ = 0;
+      o.stream_ = nullptr;
+    }

    return *this;
  }
@@ -170,7 +170,7 @@ inline bool DeviceAttributesSupport(const int device, Attributes... attributes)
  return (... && DeviceAttributeSupport(device, attributes));
 }

-inline int GetDeviceAttribute(int device, const hipDeviceAttribute_t attr) {
+inline int GetDeviceAttribute(const hipDeviceAttribute_t attr, int device) {
  int value = 0;
  HIP_CHECK(hipDeviceGetAttribute(&value, attr, device));
  return value;
@@ -22,6 +22,7 @@ add_subdirectory(rtc)
 add_subdirectory(deviceLib)
 add_subdirectory(graph)
 add_subdirectory(memory)
+add_subdirectory(stream_ordered)
 add_subdirectory(stream)
 add_subdirectory(event)
 add_subdirectory(occupancy)
@@ -43,11 +44,15 @@ add_subdirectory(g++)
 add_subdirectory(module)
 add_subdirectory(channelDescriptor)
 add_subdirectory(executionControl)
+add_subdirectory(math)
 add_subdirectory(vector_types)
 add_subdirectory(atomics)
 add_subdirectory(complex)
 add_subdirectory(p2p)
 add_subdirectory(gcc)
+add_subdirectory(syncthreads)
+add_subdirectory(threadfence)
+add_subdirectory(virtualMemoryManagement)

 if(HIP_PLATFORM STREQUAL "amd")
 add_subdirectory(callback)
@@ -58,3 +63,5 @@ add_subdirectory(vulkan_interop)
 add_subdirectory(gl_interop) # Disabled on NVIDIA due to defect - EXSWHTEC-246
 endif()
 add_subdirectory(synchronization)
+add_subdirectory(launchBounds)
+add_subdirectory(assertion)
@@ -0,0 +1,49 @@
+# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+if(HIP_PLATFORM MATCHES "nvidia")
+    set(TEST_SRC
+        assert.cc
+    )
+    hip_add_exe_to_target(NAME AssertionTest
+                          TEST_SRC ${TEST_SRC}
+                          TEST_TARGET_NAME build_tests
+                          LINKER_LIBS nvrtc)
+elseif(HIP_PLATFORM MATCHES "amd")
+    set(TEST_SRC
+        static_assert.cc
+        assert.cc
+    )
+    hip_add_exe_to_target(NAME AssertionTest
+                          TEST_SRC ${TEST_SRC}
+                          TEST_TARGET_NAME build_tests
+                          LINKER_LIBS hiprtc)
+endif()
+
+# Below tests fail in PSDB
+#add_test(NAME Unit_StaticAssert_Positive_Basic
+#         COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+#         ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+#         static_assert_kernels_positive.cc 2)
+#
+#add_test(NAME Unit_StaticAssert_Negative_Basic
+#         COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+#         ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+#         static_assert_kernels_negative.cc 2)
@@ -0,0 +1,124 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <csetjmp>
+#include <csignal>
+
+/**
+ * @addtogroup assert assert
+ * @{
+ * @ingroup DeviceLanguageTest
+ * `void assert(int expression)` -
+ * Stops the kernel execution if expression is equal to zero.
+ */
+
+jmp_buf env_ignore_abort;  // jump target that on_sigabrt returns to
+volatile std::sig_atomic_t abort_raised_flag = 0;  // set to 1 by on_sigabrt
+
+void on_sigabrt(int signum) {
+  signal(signum, SIG_DFL);  // restore the default action for this signal
+  abort_raised_flag = 1;  // record that the handler ran
+  longjmp(env_ignore_abort, 1);  // resume at the setjmp in try_and_catch_abort
+}
+
+void try_and_catch_abort(void (*func)()) {
+  abort_raised_flag = 0;  // forget any abort recorded by an earlier call
+  if (setjmp(env_ignore_abort)) return;  // re-entered via longjmp from on_sigabrt
+  signal(SIGABRT, &on_sigabrt);
+  (*func)();
+  signal(SIGABRT, SIG_DFL);  // func returned normally: uninstall the handler
+}
+
+__global__ void AssertPassKernel(int* x) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  *x = tid;
+  // expected always to be true
+  assert(tid >= 0);
+}
+
+__global__ void AssertFailKernel(int* x) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  *x = tid;  // every thread stores to the same int
+  // expected to fail for the even thread indices
+  assert(tid % 2 == 1);
+}
+
+template <bool should_abort> void LaunchAssertKernel() {  // true launches the failing kernel
+  const int num_blocks = 2;
+  const int num_threads = 16;
+  int *d_a;
+  HIP_CHECK(hipMalloc(&d_a, sizeof(int)));  // single int that every thread writes
+
+  if constexpr (should_abort) {
+    AssertFailKernel<<<num_blocks, num_threads, 0, 0>>>(d_a);
+#if HT_AMD
+    HIP_CHECK(hipDeviceSynchronize());  // AMD: the failed assert is expected to raise SIGABRT
+#else
+    HIP_CHECK_ERROR(hipDeviceSynchronize(), hipErrorAssert);  // otherwise expect hipErrorAssert
+#endif
+  } else {
+    AssertPassKernel<<<num_blocks, num_threads, 0, 0>>>(d_a);
+    HIP_CHECK(hipDeviceSynchronize());
+  }
+
+  HIP_CHECK(hipFree(d_a));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Launches kernels with asserts that have an expression equal to 1.
+ *  - Expects that SIGABRT is not raised and kernels have executed successfully.
+ * Test source
+ * ------------------------
+ *  - unit/assertion/assert.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Assert_Positive_Basic_KernelPass") {
+  try_and_catch_abort(&LaunchAssertKernel<false>);
+  REQUIRE(abort_raised_flag == 0);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Launches kernels with asserts that have an expression equal to 0.
+ *  - Expects that SIGABRT is raised and kernels have been stopped on AMD.
+ *    - The HIP runtime also aborts the host code, so this test case uses signal handlers
+ *      to avoid host code abortion.
+ *  - Expects that `hipErrorAssert` is returned from `hipDeviceSynchronize` on NVIDIA.
+ *    - The host code is not aborted.
+ * Test source
+ * ------------------------
+ *  - unit/assertion/assert.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Assert_Positive_Basic_KernelFail") {
+  try_and_catch_abort(&LaunchAssertKernel<true>);
+#if HT_AMD
+  REQUIRE(abort_raised_flag == 1);
+#else
+  REQUIRE(abort_raised_flag == 0);
+#endif
+}
@@ -0,0 +1,88 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include "static_assert_kernels_rtc.hh"
+
+/**
+ * @addtogroup static_assert static_assert
+ * @{
+ * @ingroup DeviceLanguageTest
+ * `void static_assert(constexpr expression, const char* message)` -
+ * Stops the compilation if expression is equal to zero, and displays the specified message.
+ */
+
+void StaticAssertWrapper(const char* program_source) {
+  hiprtcProgram program{};
+
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "static_assert_rtc.cc", 0, nullptr, nullptr));
+  hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};
+
+  // Get the compile log and count compiler error messages
+  size_t log_size{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
+  int error_count{0};
+
+  int expected_error_count{2};
+  std::string error_message{"error:"};
+
+  size_t n_pos = log.find(error_message, 0);
+  while (n_pos != std::string::npos) {
+    ++error_count;
+    n_pos = log.find(error_message, n_pos + 1);
+  }
+
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
+  REQUIRE(error_count == expected_error_count);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Compiles kernels with static_assert calls:
+ *    -# Expected that static_assert passes and compilation is successful.
+ *    -# Expected that static_assert fails and compilation has errors.
+ *  - Uses RTC to perform compilation.
+ * Test source
+ * ------------------------
+ *  - unit/assertion/static_assert.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_StaticAssert_Positive_Basic_RTC") { StaticAssertWrapper(kStaticAssert_Positive); }
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Passes invalidly formed expressions to static_assert calls.
+ *  - Uses expressions that are not constexpr and values that are not known during compilation.
+ *  - Uses RTC to perform compilation.
+ * Test source
+ * ------------------------
+ *  - unit/assertion/static_assert.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_StaticAssert_Negative_Basic_RTC") { StaticAssertWrapper(kStaticAssert_Negative); }
@@ -0,0 +1,30 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+__global__ void StaticAssertErrorKernel1() {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  static_assert(tid % 2 == 1, "[StaticAssertErrorKernel1]");
+}
+
+__global__ void StaticAssertErrorKernel2() {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  static_assert(++tid > 2, "[StaticAssertErrorKernel2]");
+}
@@ -0,0 +1,32 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+__global__ void StaticAssertPassKernel1() {
+  static_assert(sizeof(int) < sizeof(long), "[StaticAssertPassKernel1]");
+}
+
+__global__ void StaticAssertPassKernel2() { static_assert(10 > 5, "[StaticAssertPassKernel2]"); }
+
+__global__ void StaticAssertFailKernel1() {
+  static_assert(sizeof(int) > sizeof(long), "[StaticAssertFailKernel1]");
+}
+
+__global__ void StaticAssertFailKernel2() { static_assert(10 < 5, "[StaticAssertFailKernel2]"); }
@@ -0,0 +1,56 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+Positive and negative kernels used for the static_assert Test Cases that are using RTC.
+*/
+
+static constexpr auto kStaticAssert_Positive{
+    R"(
+    __global__ void StaticAssertPassKernel1() {
+      static_assert(sizeof(int) < sizeof(long), "[StaticAssertPassKernel1]");
+    }
+
+    __global__ void StaticAssertPassKernel2() {
+      static_assert(10 > 5, "[StaticAssertPassKernel2]");
+    }
+
+    __global__ void StaticAssertFailKernel1() {
+      static_assert(sizeof(int) > sizeof(long), "[StaticAssertFailKernel1]");
+    }
+
+    __global__ void StaticAssertFailKernel2() {
+      static_assert(10 < 5, "[StaticAssertFailKernel2]");
+    }
+  )"};
+
+static constexpr auto kStaticAssert_Negative{
+    R"(
+    __global__ void StaticAssertErrorKernel1() {
+      const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+      static_assert(tid % 2 == 1, "[StaticAssertErrorKernel1]");
+    }
+
+    __global__ void StaticAssertErrorKernel2() {
+      int tid = threadIdx.x + blockIdx.x * blockDim.x;
+      static_assert(++tid > 2, "[StaticAssertErrorKernel2]");
+    }
+  )"};
@@ -18,31 +18,145 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.

-set(TEST_SRC
-    atomicExch.cc
-    atomicExch_system.cc
-)
+if(HIP_PLATFORM MATCHES "amd")
+    set(TEST_SRC
+      atomicAnd.cc
+      atomicAnd_system.cc
+      atomicOr.cc
+      atomicOr_system.cc
+      atomicXor.cc
+      atomicXor_system.cc
+      atomicMin.cc
+      atomicMin_system.cc
+      atomicMax.cc
+      atomicMax_system.cc
+      safeAtomicMin.cc
+      unsafeAtomicMin.cc
+      safeAtomicMax.cc
+      unsafeAtomicMax.cc
+      __hip_atomic_fetch_min.cc
+      __hip_atomic_fetch_max.cc
+      atomic_builtins.cc
+      acquire_release.cc
+      sequential_consistency.cc
+      atomicAdd.cc
+      atomicAdd_system.cc
+      unsafeAtomicAdd.cc
+      safeAtomicAdd.cc
+      atomicSub.cc
+      atomicSub_system.cc
+      atomicCAS.cc
+      atomicCAS_system.cc
+      __hip_atomic_fetch_add.cc
+      __hip_atomic_compare_exchange_strong.cc
+      atomicExch.cc
+      atomicExch_system.cc
+      __hip_atomic_fetch_and.cc
+      __hip_atomic_fetch_or.cc
+      __hip_atomic_fetch_xor.cc
+      __hip_atomic_exchange.cc
+    )

-if(HIP_PLATFORM MATCHES "nvidia")
-    set_source_files_properties(atomicExch_system.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
-    hip_add_exe_to_target(NAME AtomicsTest
-                        TEST_SRC ${TEST_SRC}
-                        TEST_TARGET_NAME build_tests
-                        LINKER_LIBS "nvrtc -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
-elseif(HIP_PLATFORM MATCHES "amd")
-    hip_add_exe_to_target(NAME AtomicsTest
+  #atomicInc & atomicDec tests are disabled on MI300X due to SWDEV-440688
+  set(NOT_FOR_MI300X_TEST
+      atomicInc.cc
+      atomicDec.cc
+  )
+  set(MI300X_TARGET gfx941)
+  function(CheckRejectedArchs OFFLOAD_ARCH_STR_LOCAL)
+    set(ARCH_CHECK -1 PARENT_SCOPE)  # default: no arch from MI300X_TARGET was requested
+    string(REGEX MATCHALL "--offload-arch=gfx[0-9a-z]+" OFFLOAD_ARCH_LIST ${OFFLOAD_ARCH_STR_LOCAL})
+    foreach(OFFLOAD_ARCH IN LISTS OFFLOAD_ARCH_LIST)
+      string(REGEX MATCHALL "--offload-arch=(gfx[0-9a-z]+)" matches ${OFFLOAD_ARCH})
+      if (CMAKE_MATCH_COUNT EQUAL 1)
+        if (CMAKE_MATCH_1 IN_LIST MI300X_TARGET)
+          set(ARCH_CHECK 1 PARENT_SCOPE)  # caller sees 1 when a listed arch is present
+        endif()         # CMAKE_MATCH_1
+      endif()        # CMAKE_MATCH_COUNT
+    endforeach()   # OFFLOAD_ARCH_LIST
+  endfunction()  # CheckRejectedArchs
+
+  if (DEFINED OFFLOAD_ARCH_STR)
+    CheckRejectedArchs(${OFFLOAD_ARCH_STR})
+  elseif(NOT "$ENV{HCC_AMDGPU_TARGET}" STREQUAL "")  # env var set and non-empty
+    CheckRejectedArchs($ENV{HCC_AMDGPU_TARGET})
+  else()
+    set(ARCH_CHECK -1)  # no arch information: keep the MI300X-excluded tests
+  endif()
+  if(${ARCH_CHECK} EQUAL -1)
+    message(STATUS "Adding test: ${NOT_FOR_MI300X_TEST}")
+    set(TEST_SRC ${TEST_SRC} ${NOT_FOR_MI300X_TEST})
+  else()
+    message(STATUS "Removing test: ${NOT_FOR_MI300X_TEST}")
+  endif()
+
+
+  hip_add_exe_to_target(NAME AtomicsTest
                        TEST_SRC ${TEST_SRC}
                        TEST_TARGET_NAME build_tests
                        LINKER_LIBS hiprtc)
-endif()
+  set(EXPECTED_ERRORS 48)

-# SWDEV-435667: Below 2 tests failed in stress test on 01/12/23
-#add_test(NAME Unit_atomicExch_Negative_Parameters
-#         COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
-#         ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
-#         atomicExch_negative_kernels.cc 40)
-#
-#add_test(NAME Unit_atomicExch_system_Negative_Parameters
-#         COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
-#         ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
-#         atomicExch_system_negative_kernels.cc 40)
+  # Below tests fail in PSDB
+  #add_test(NAME Unit_atomicAnd_Negative_Parameters
+  #        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #        ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #        atomicAnd_negative_kernels.cc ${EXPECTED_ERRORS})
+  #
+  #add_test(NAME Unit_atomicOr_Negative_Parameters
+  #        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #        ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #        atomicOr_negative_kernels.cc ${EXPECTED_ERRORS})
+  #
+  #add_test(NAME Unit_atomicXor_Negative_Parameters
+  #        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #        ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #        atomicXor_negative_kernels.cc ${EXPECTED_ERRORS})
+  #
+  #add_test(NAME Unit_atomicMin_Negative_Parameters
+  #        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #        ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #        atomicMin_negative_kernels.cc ${EXPECTED_ERRORS})
+  #
+  #add_test(NAME Unit_atomicMax_Negative_Parameters
+  #        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #        ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #        atomicMax_negative_kernels.cc ${EXPECTED_ERRORS})
+  #add_test(NAME Unit_AtomicBuiltins_Negative_Parameters
+  #        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #        ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #        atomic_builtins_kernels.cc 60 27) # Should be 35 warnings, see EXSWHTEC-309
+  #add_test(NAME Unit_atomicAdd_Negative_Parameters
+  #        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #        ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #        atomicAdd_negative_kernels.cc 48)
+  #add_test(NAME Unit_atomicSub_Negative_Parameters
+  #        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #        ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #        atomicSub_negative_kernels.cc 48)
+  #add_test(NAME Unit_atomicInc_Negative_Parameters
+  #        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #        ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #        atomicInc_negative_kernels.cc 8)
+  #
+  #add_test(NAME Unit_atomicDec_Negative_Parameters
+  #        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #        ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #        atomicDec_negative_kernels.cc 8)
+  #
+  #add_test(NAME Unit_atomicCAS_Negative_Parameters
+  #        COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #        ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #        atomicCAS_negative_kernels.cc 48)
+  #
+  # SWDEV-435667: Below 2 tests failed in stress test on 01/12/23
+  #add_test(NAME Unit_atomicExch_Negative_Parameters
+  #         COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #         ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #         atomicExch_negative_kernels.cc 40)
+  #
+  #add_test(NAME Unit_atomicExch_system_Negative_Parameters
+  #         COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
+  #         ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
+  #         atomicExch_system_negative_kernels.cc 40)
+endif()
@@ -0,0 +1,129 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_compare_exchange_strong __hip_atomic_compare_exchange_strong
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * addition on a target memory location. Each thread will add the same value to the memory location,
+ * storing the return value into a separate output array slot corresponding to it. Once complete,
+ * the output array and target memory is validated to contain all the expected values. Several
+ * memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of __hip_atomic_compare_exchange_strong
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Shared memory
+ *      - WAVEFRONT memory scope.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/__hip_atomic_compare_exchange_strong.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_compare_exchange_strong_Positive_Wavefront", "", int,
+                   unsigned int, unsigned long, unsigned long long, float, double) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kBuiltinCAS,
+                                   __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << current) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kBuiltinCAS,
+                                   __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << current) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kBuiltinCAS,
+                                   __HIP_MEMORY_SCOPE_WAVEFRONT>(warp_size, cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * addition on a target memory location. Each thread will add the same value to the memory location,
+ * storing the return value into a separate output array slot corresponding to it. Once complete,
+ * the output array and target memory is validated to contain all the expected values. Several
+ * memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of __hip_atomic_compare_exchange_strong
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Shared memory
+ *      - WORKGROUP memory scope.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/__hip_atomic_compare_exchange_strong.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_compare_exchange_strong_Positive_Workgroup", "", int,
+                   unsigned int, unsigned long, unsigned long long, float, double) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kBuiltinCAS,
+                                   __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << current) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kBuiltinCAS,
+                                   __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << current) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kBuiltinCAS,
+                                   __HIP_MEMORY_SCOPE_WORKGROUP>(warp_size, cache_line_size);
+    }
+  }
+}
@@ -0,0 +1,136 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "atomicExch_common.hh"
+
+/**
+ * @addtogroup __hip_atomic_exchange __hip_atomic_exchange
+ * @{
+ * @ingroup AtomicsTest
+ * ________________________
+ * Test cases from other modules:
+ *    - @ref Unit_AtomicBuiltins_Negative_Parameters_RTC
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * exchange into a runtime determined memory location. Each thread will exchange its own grid wide
+ * linear index + offset into the memory location, storing the return value into a separate output
+ * array slot corresponding to it. Once complete, the union of output array and exchange memory is
+ * validated to contain all values in the range [0, number_of_threads +
+ * number_of_exchange_memory_slots). Several memory access patterns are tested:
+ *      -# All threads exchange to a single memory location
+ *      -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of __hip_atomic_exchange
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
+ *      - Exchange memory located in shared memory
+ *      - WAVEFRONT memory scope
+ * Test source
+ * ------------------------
+ *    - unit/atomics/__hip_atomic_exchange.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_exchange_Positive_Wavefront", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  const unsigned int line_bytes = 128u;  // hard-coded; used only by the scattered section
+  int lanes = 0;                         // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Same address " << pass) {
+      AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::builtin,
+                                             __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+
+    // Second access pattern of the test description: warp-size many adjacent targets.
+    DYNAMIC_SECTION("Adjacent addresses " << pass) {
+      AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::builtin,
+                                             __HIP_MEMORY_SCOPE_WAVEFRONT>(lanes, sizeof(TestType));
+    }
+
+    // Third access pattern: same first argument, line_bytes instead of sizeof(TestType).
+    DYNAMIC_SECTION("Scattered addresses " << pass) {
+      AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::builtin,
+                                             __HIP_MEMORY_SCOPE_WAVEFRONT>(lanes, line_bytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * exchange into a runtime determined memory location. Each thread will exchange its own grid wide
+ * linear index + offset into the memory location, storing the return value into a separate output
+ * array slot corresponding to it. Once complete, the union of output array and exchange memory is
+ * validated to contain all values in the range [0, number_of_threads +
+ * number_of_exchange_memory_slots). Several memory access patterns are tested:
+ *      -# All threads exchange to a single memory location
+ *      -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of __hip_atomic_exchange
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
+ *      - Exchange memory located in shared memory
+ *      - WORKGROUP memory scope
+ * Test source
+ * ------------------------
+ *    - unit/atomics/__hip_atomic_exchange.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_exchange_Positive_Workgroup", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  const unsigned int line_bytes = 128u;  // hard-coded; used only by the scattered section
+  int lanes = 0;                         // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Same address " << pass) {
+      AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::builtin,
+                                             __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+
+    // Second access pattern of the test description: warp-size many adjacent targets.
+    DYNAMIC_SECTION("Adjacent addresses " << pass) {
+      AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::builtin,
+                                             __HIP_MEMORY_SCOPE_WORKGROUP>(lanes, sizeof(TestType));
+    }
+
+    // Third access pattern: same first argument, line_bytes instead of sizeof(TestType).
+    DYNAMIC_SECTION("Scattered addresses " << pass) {
+      AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::builtin,
+                                             __HIP_MEMORY_SCOPE_WORKGROUP>(lanes, line_bytes);
+    }
+  }
+}
@@ -0,0 +1,132 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_fetch_add __hip_atomic_fetch_add
+ * @{
+ * @ingroup AtomicsTest
+ * ________________________
+ * Test cases from other modules:
+ *    - @ref Unit_AtomicBuiltins_Negative_Parameters_RTC
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * addition on a target memory location. Each thread will add the same value to the memory location,
+ * storing the return value into a separate output array slot corresponding to it. Once complete,
+ * the output array and target memory are validated to contain all the expected values. Several
+ * memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of __hip_atomic_fetch_add
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Shared memory
+ *      - WAVEFRONT memory scope.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/__hip_atomic_fetch_add.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_add_Positive_Wavefront", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  const unsigned int line_bytes = 128u;  // hard-coded; used only by the scattered section
+  int lanes = 0;                         // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Same address " << pass) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kBuiltinAdd,
+                                   __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+    // Second access pattern of the test description: warp-size many adjacent targets.
+    DYNAMIC_SECTION("Adjacent addresses " << pass) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kBuiltinAdd,
+                                   __HIP_MEMORY_SCOPE_WAVEFRONT>(lanes, sizeof(TestType));
+    }
+    // Third access pattern: same first argument, line_bytes instead of sizeof(TestType).
+    DYNAMIC_SECTION("Scattered addresses " << pass) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kBuiltinAdd,
+                                   __HIP_MEMORY_SCOPE_WAVEFRONT>(lanes, line_bytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * addition on a target memory location. Each thread will add the same value to the memory location,
+ * storing the return value into a separate output array slot corresponding to it. Once complete,
+ * the output array and target memory are validated to contain all the expected values. Several
+ * memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of __hip_atomic_fetch_add
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Shared memory
+ *      - WORKGROUP memory scope.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/__hip_atomic_fetch_add.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_add_Positive_Workgroup", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  const unsigned int line_bytes = 128u;  // hard-coded; used only by the scattered section
+  int lanes = 0;                         // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Same address " << pass) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kBuiltinAdd,
+                                   __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+    // Second access pattern of the test description: warp-size many adjacent targets.
+    DYNAMIC_SECTION("Adjacent addresses " << pass) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kBuiltinAdd,
+                                   __HIP_MEMORY_SCOPE_WORKGROUP>(lanes, sizeof(TestType));
+    }
+    // Third access pattern: same first argument, line_bytes instead of sizeof(TestType).
+    DYNAMIC_SECTION("Scattered addresses " << pass) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kBuiltinAdd,
+                                   __HIP_MEMORY_SCOPE_WORKGROUP>(lanes, line_bytes);
+    }
+  }
+}
@@ -0,0 +1,187 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_fetch_and __hip_atomic_fetch_and
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WAVEFRONT from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Wavefront_SameAddress", "",
+                   int, unsigned int, unsigned long, unsigned long long) {
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << pass) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WAVEFRONT from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Wavefront_Adjacent_Addresses", "",
+                   int, unsigned int, unsigned long, unsigned long long) {
+  int lanes = 0;  // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  // The section name carries the pass index, so each iteration is its own section.
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Adjacent address " << pass) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(lanes, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WAVEFRONT from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Wavefront_Scattered_Addresses", "",
+                   int, unsigned int, unsigned long, unsigned long long) {
+  const unsigned int line_bytes = 128u;  // hard-coded; second argument of the helper below
+  int lanes = 0;                         // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  // The section name carries the pass index, so each iteration is its own section.
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Scattered address " << pass) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WAVEFRONT>(lanes, line_bytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WORKGROUP from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Workgroup_SameAddress", "",
+                   int, unsigned int, unsigned long, unsigned long long) {
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << pass) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WORKGROUP from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Workgroup_Adjacent_Addresses", "",
+                   int, unsigned int, unsigned long, unsigned long long) {
+  int lanes = 0;  // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  // The section name carries the pass index, so each iteration is its own section.
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Adjacent address " << pass) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(lanes, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic AND with memory scope WORKGROUP from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_and.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Workgroup_Scattered_Addresses", "",
+                   int, unsigned int, unsigned long, unsigned long long) {
+  const unsigned int line_bytes = 128u;  // hard-coded; second argument of the helper below
+  int lanes = 0;                         // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  // The section name carries the pass index, so each iteration is its own section.
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Scattered address " << pass) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kBuiltinAnd,
+                                            __HIP_MEMORY_SCOPE_WORKGROUP>(lanes, line_bytes);
+    }
+  }
+}
@@ -0,0 +1,187 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "min_max_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_fetch_max __hip_atomic_fetch_max
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic MAX with memory scope WAVEFRONT from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_max.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_max_Positive_Wavefront_SameAddress", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << pass) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kBuiltinMax,
+                                           __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic MAX with memory scope WAVEFRONT from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_max.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_max_Positive_Wavefront_Adjacent_Addresses", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  int lanes = 0;  // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  // The section name carries the pass index, so each iteration is its own section.
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Adjacent address " << pass) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kBuiltinMax,
+                                           __HIP_MEMORY_SCOPE_WAVEFRONT>(lanes, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic MAX with memory scope WAVEFRONT from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_max.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_max_Positive_Wavefront_Scattered_Addresses", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  const unsigned int line_bytes = 128u;  // hard-coded; second argument of the helper below
+  int lanes = 0;                         // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  // The section name carries the pass index, so each iteration is its own section.
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Scattered address " << pass) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kBuiltinMax,
+                                           __HIP_MEMORY_SCOPE_WAVEFRONT>(lanes, line_bytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic MAX with memory scope WORKGROUP from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_max.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_max_Positive_Workgroup_SameAddress", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << pass) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kBuiltinMax,
+                                           __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic MAX with memory scope WORKGROUP from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_max.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_max_Positive_Workgroup_Adjacent_Addresses", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  int lanes = 0;  // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  // The section name carries the pass index, so each iteration is its own section.
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Adjacent address " << pass) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kBuiltinMax,
+                                           __HIP_MEMORY_SCOPE_WORKGROUP>(lanes, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic MAX with memory scope WORKGROUP from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_max.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_max_Positive_Workgroup_Scattered_Addresses", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  const unsigned int line_bytes = 128u;  // hard-coded; second argument of the helper below
+  int lanes = 0;                         // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  // The section name carries the pass index, so each iteration is its own section.
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Scattered address " << pass) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kBuiltinMax,
+                                           __HIP_MEMORY_SCOPE_WORKGROUP>(lanes, line_bytes);
+    }
+  }
+}
@@ -0,0 +1,187 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "min_max_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_fetch_min __hip_atomic_fetch_min
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic MIN with memory scope WAVEFRONT from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_min.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_min_Positive_Wavefront_SameAddress", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << pass) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kBuiltinMin,
+                                           __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic MIN with memory scope WAVEFRONT from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_min.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_min_Positive_Wavefront_Adjacent_Addresses", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  int lanes = 0;  // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  // The section name carries the pass index, so each iteration is its own section.
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Adjacent address " << pass) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kBuiltinMin,
+                                           __HIP_MEMORY_SCOPE_WAVEFRONT>(lanes, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic MIN with memory scope WAVEFRONT from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_min.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_min_Positive_Wavefront_Scattered_Addresses", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  const unsigned int line_bytes = 128u;  // hard-coded; second argument of the helper below
+  int lanes = 0;                         // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  // The section name carries the pass index, so each iteration is its own section.
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Scattered address " << pass) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kBuiltinMin,
+                                           __HIP_MEMORY_SCOPE_WAVEFRONT>(lanes, line_bytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic MIN with memory scope WORKGROUP from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_min.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_min_Positive_Workgroup_SameAddress", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << pass) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kBuiltinMin,
+                                           __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic MIN with memory scope WORKGROUP from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_min.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_min_Positive_Workgroup_Adjacent_Addresses", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  int lanes = 0;  // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  // The section name carries the pass index, so each iteration is its own section.
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Adjacent address " << pass) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kBuiltinMin,
+                                           __HIP_MEMORY_SCOPE_WORKGROUP>(lanes, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic MIN with memory scope WORKGROUP from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_min.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_min_Positive_Workgroup_Scattered_Addresses", "",
+                   int, unsigned int, unsigned long, unsigned long long, float, double) {
+  const unsigned int line_bytes = 128u;  // hard-coded; second argument of the helper below
+  int lanes = 0;                         // out-parameter for the warp-size query on device 0
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+
+  // The section name carries the pass index, so each iteration is its own section.
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Scattered address " << pass) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kBuiltinMin,
+                                           __HIP_MEMORY_SCOPE_WORKGROUP>(lanes, line_bytes);
+    }
+  }
+}
@@ -0,0 +1,187 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_fetch_or __hip_atomic_fetch_or
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WAVEFRONT from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Wavefront_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOperation = Bitwise::AtomicOperation::kBuiltinOr;
+  for (auto run = 0; run < cmd_options.iterations; ++run) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << run) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOperation, __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WAVEFRONT from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Wavefront_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  // Ask device 0 for its warp size; the value is forwarded unchanged as the first driver argument.
+  int lanes = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+  constexpr auto kOperation = Bitwise::AtomicOperation::kBuiltinOr;
+  constexpr auto kScope = __HIP_MEMORY_SCOPE_WAVEFRONT;
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Adjacent address " << run) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOperation, kScope>(lanes, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WAVEFRONT from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Wavefront_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOperation = Bitwise::AtomicOperation::kBuiltinOr;
+  constexpr auto kScope = __HIP_MEMORY_SCOPE_WAVEFRONT;
+  // Assumed cache-line size in bytes: hard-coded here, not queried from the device.
+  const auto cache_line_bytes = 128u;
+  int lanes = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Scattered address " << run) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOperation, kScope>(lanes, cache_line_bytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WORKGROUP from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Workgroup_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOperation = Bitwise::AtomicOperation::kBuiltinOr;
+  for (auto run = 0; run < cmd_options.iterations; ++run) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << run) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOperation, __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WORKGROUP from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Workgroup_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  // Ask device 0 for its warp size; the value is forwarded unchanged as the first driver argument.
+  int lanes = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+  constexpr auto kOperation = Bitwise::AtomicOperation::kBuiltinOr;
+  constexpr auto kScope = __HIP_MEMORY_SCOPE_WORKGROUP;
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Adjacent address " << run) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOperation, kScope>(lanes, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic OR with memory scope WORKGROUP from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_or.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Workgroup_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOperation = Bitwise::AtomicOperation::kBuiltinOr;
+  constexpr auto kScope = __HIP_MEMORY_SCOPE_WORKGROUP;
+  // Assumed cache-line size in bytes: hard-coded here, not queried from the device.
+  const auto cache_line_bytes = 128u;
+  int lanes = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Scattered address " << run) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOperation, kScope>(lanes, cache_line_bytes);
+    }
+  }
+}
@@ -0,0 +1,187 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup __hip_atomic_fetch_xor __hip_atomic_fetch_xor
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WAVEFRONT from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Wavefront_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOperation = Bitwise::AtomicOperation::kBuiltinXor;
+  for (auto run = 0; run < cmd_options.iterations; ++run) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << run) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOperation, __HIP_MEMORY_SCOPE_WAVEFRONT>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WAVEFRONT from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Wavefront_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  // Ask device 0 for its warp size; the value is forwarded unchanged as the first driver argument.
+  int lanes = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+  constexpr auto kOperation = Bitwise::AtomicOperation::kBuiltinXor;
+  constexpr auto kScope = __HIP_MEMORY_SCOPE_WAVEFRONT;
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Adjacent address " << run) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOperation, kScope>(lanes, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WAVEFRONT from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Wavefront_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOperation = Bitwise::AtomicOperation::kBuiltinXor;
+  constexpr auto kScope = __HIP_MEMORY_SCOPE_WAVEFRONT;
+  // Assumed cache-line size in bytes: hard-coded here, not queried from the device.
+  const auto cache_line_bytes = 128u;
+  int lanes = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Scattered address " << run) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOperation, kScope>(lanes, cache_line_bytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WORKGROUP from multiple threads on the same
+ * address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Workgroup_SameAddress", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOperation = Bitwise::AtomicOperation::kBuiltinXor;
+  for (auto run = 0; run < cmd_options.iterations; ++run) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << run) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOperation, __HIP_MEMORY_SCOPE_WORKGROUP>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WORKGROUP from multiple threads on adjacent
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Workgroup_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  // Ask device 0 for its warp size; the value is forwarded unchanged as the first driver argument.
+  int lanes = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+  constexpr auto kOperation = Bitwise::AtomicOperation::kBuiltinXor;
+  constexpr auto kScope = __HIP_MEMORY_SCOPE_WORKGROUP;
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Adjacent address " << run) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOperation, kScope>(lanes, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs a builtin atomic XOR with memory scope WORKGROUP from multiple threads on scattered
+ * addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/__hip_atomic_fetch_xor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Workgroup_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOperation = Bitwise::AtomicOperation::kBuiltinXor;
+  constexpr auto kScope = __HIP_MEMORY_SCOPE_WORKGROUP;
+  // Assumed cache-line size in bytes: hard-coded here, not queried from the device.
+  const auto cache_line_bytes = 128u;
+  int lanes = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&lanes, hipDeviceAttributeWarpSize, 0));
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Scattered address " << run) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOperation, kScope>(lanes, cache_line_bytes);
+    }
+  }
+}
@@ -0,0 +1,551 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+#include "memory_order_common.hh"
+
+TEST_CASE("Unit___hip_atomic_load_store_Positive_Acquire_Release") {
+  // kLoadStore is instantiated for two orderings only; this case has no ACQ_REL group.
+  constexpr auto kOp = BuiltinAtomicOperation::kLoadStore;
+  SECTION("ACQUIRE/RELEASE") {
+    // Memory-order template argument shared by the four scope sections of this group.
+    constexpr auto kOrder = __ATOMIC_ACQUIRE;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("SEQ_CST") {
+    // Same four scope sections, instantiated with sequentially consistent ordering.
+    constexpr auto kOrder = __ATOMIC_SEQ_CST;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+}
+
+TEST_CASE("Unit___hip_atomic_exchange_Positive_Acquire_Release") {
+  // Instantiates the AcquireRelease drivers for kExchange under three memory orderings; every
+  // ordering is exercised at WAVEFRONT, WORKGROUP and AGENT scope plus the SystemTest variant.
+  constexpr auto kOp = BuiltinAtomicOperation::kExchange;
+  SECTION("ACQUIRE/RELEASE") {
+    // Memory-order template argument shared by the four scope sections of this group.
+    constexpr auto kOrder = __ATOMIC_ACQUIRE;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("ACQ_REL") {
+    // Same four scope sections, instantiated with combined acquire-release ordering.
+    constexpr auto kOrder = __ATOMIC_ACQ_REL;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("SEQ_CST") {
+    // Same four scope sections, instantiated with sequentially consistent ordering.
+    constexpr auto kOrder = __ATOMIC_SEQ_CST;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+}
+
+TEST_CASE("Unit___hip_atomic_compare_exchange_strong_Positive_Acquire_Release") {
+  // Instantiates the AcquireRelease drivers for kCompareExchangeStrong under three memory
+  // orderings; every ordering is exercised at WAVEFRONT, WORKGROUP and AGENT scope plus the
+  // SystemTest variant, which takes no scope argument.
+  constexpr auto kOp = BuiltinAtomicOperation::kCompareExchangeStrong;
+  SECTION("ACQUIRE/RELEASE") {
+    // Memory-order template argument shared by the four scope sections of this group.
+    constexpr auto kOrder = __ATOMIC_ACQUIRE;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+
+  SECTION("ACQ_REL") {
+    // Same four scope sections, instantiated with combined acquire-release ordering.
+    constexpr auto kOrder = __ATOMIC_ACQ_REL;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+
+  SECTION("SEQ_CST") {
+    // Same four scope sections, instantiated with sequentially consistent ordering.
+    constexpr auto kOrder = __ATOMIC_SEQ_CST;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+}
+
+TEST_CASE("Unit___hip_atomic_compare_exchange_weak_Positive_Acquire_Release") {
+  // Instantiates the AcquireRelease drivers for kCompareExchangeWeak under three orderings; each
+  // ordering is exercised at WAVEFRONT, WORKGROUP and AGENT scope plus the SystemTest variant.
+  constexpr auto kOp = BuiltinAtomicOperation::kCompareExchangeWeak;
+  SECTION("ACQUIRE/RELEASE") {
+    // Memory-order template argument shared by the four scope sections of this group.
+    constexpr auto kOrder = __ATOMIC_ACQUIRE;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("ACQ_REL") {
+    // Same four scope sections, instantiated with combined acquire-release ordering.
+    constexpr auto kOrder = __ATOMIC_ACQ_REL;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("SEQ_CST") {
+    // Same four scope sections, instantiated with sequentially consistent ordering.
+    constexpr auto kOrder = __ATOMIC_SEQ_CST;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+}
+
+TEST_CASE("Unit___hip_atomic_fetch_add_Positive_Acquire_Release") {
+  // Instantiates the AcquireRelease drivers for kAdd under three memory orderings; every
+  // ordering is exercised at WAVEFRONT, WORKGROUP and AGENT scope plus the SystemTest variant.
+  constexpr auto kOp = BuiltinAtomicOperation::kAdd;
+  SECTION("ACQUIRE/RELEASE") {
+    // Memory-order template argument shared by the four scope sections of this group.
+    constexpr auto kOrder = __ATOMIC_ACQUIRE;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("ACQ_REL") {
+    // Same four scope sections, instantiated with combined acquire-release ordering.
+    constexpr auto kOrder = __ATOMIC_ACQ_REL;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("SEQ_CST") {
+    // Same four scope sections, instantiated with sequentially consistent ordering.
+    constexpr auto kOrder = __ATOMIC_SEQ_CST;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+}
+
+TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Acquire_Release") {
+  // Instantiates the AcquireRelease drivers for kAnd under three memory orderings; every
+  // ordering is exercised at WAVEFRONT, WORKGROUP and AGENT scope plus the SystemTest variant.
+  constexpr auto kOp = BuiltinAtomicOperation::kAnd;
+  SECTION("ACQUIRE/RELEASE") {
+    // Memory-order template argument shared by the four scope sections of this group.
+    constexpr auto kOrder = __ATOMIC_ACQUIRE;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("ACQ_REL") {
+    // Same four scope sections, instantiated with combined acquire-release ordering.
+    constexpr auto kOrder = __ATOMIC_ACQ_REL;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("SEQ_CST") {
+    // Same four scope sections, instantiated with sequentially consistent ordering.
+    constexpr auto kOrder = __ATOMIC_SEQ_CST;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+}
+
+TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Acquire_Release") {
+  // Instantiates the AcquireRelease drivers for kOr under three memory orderings; every
+  // ordering is exercised at WAVEFRONT, WORKGROUP and AGENT scope plus the SystemTest variant.
+  constexpr auto kOp = BuiltinAtomicOperation::kOr;
+  SECTION("ACQUIRE/RELEASE") {
+    // Memory-order template argument shared by the four scope sections of this group.
+    constexpr auto kOrder = __ATOMIC_ACQUIRE;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("ACQ_REL") {
+    // Same four scope sections, instantiated with combined acquire-release ordering.
+    constexpr auto kOrder = __ATOMIC_ACQ_REL;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("SEQ_CST") {
+    // Same four scope sections, instantiated with sequentially consistent ordering.
+    constexpr auto kOrder = __ATOMIC_SEQ_CST;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+}
+
+TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Acquire_Release") {
+  // Instantiates the AcquireRelease drivers for kXor under three memory orderings; every
+  // ordering is exercised at WAVEFRONT, WORKGROUP and AGENT scope plus the SystemTest variant.
+  constexpr auto kOp = BuiltinAtomicOperation::kXor;
+  SECTION("ACQUIRE/RELEASE") {
+    // Memory-order template argument shared by the four scope sections of this group.
+    constexpr auto kOrder = __ATOMIC_ACQUIRE;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("ACQ_REL") {
+    // Same four scope sections, instantiated with combined acquire-release ordering.
+    constexpr auto kOrder = __ATOMIC_ACQ_REL;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+  SECTION("SEQ_CST") {
+    // Same four scope sections, instantiated with sequentially consistent ordering.
+    constexpr auto kOrder = __ATOMIC_SEQ_CST;
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, kOrder, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, kOrder>();
+    }
+  }
+}
+
+TEST_CASE("Unit___hip_atomic_fetch_min_Positive_Acquire_Release") {
+  // Operation exercised by every section of this test case. Each memory order below is run for
+  // the wavefront, workgroup and agent scopes, and through AcquireRelease::SystemTest.
+  constexpr auto kOp = BuiltinAtomicOperation::kMin;
+
+  // Sections instantiated with __ATOMIC_ACQUIRE
+  SECTION("ACQUIRE/RELEASE") {
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, __ATOMIC_ACQUIRE>();
+    }
+  }
+
+  // Sections instantiated with __ATOMIC_ACQ_REL
+  SECTION("ACQ_REL") {
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, __ATOMIC_ACQ_REL>();
+    }
+  }
+
+  // Sections instantiated with __ATOMIC_SEQ_CST
+  SECTION("SEQ_CST") {
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, __ATOMIC_SEQ_CST>();
+    }
+  }
+}
+
+TEST_CASE("Unit___hip_atomic_fetch_max_Positive_Acquire_Release") {
+  // Operation exercised by every section of this test case. Each memory order below is run for
+  // the wavefront, workgroup and agent scopes, and through AcquireRelease::SystemTest.
+  constexpr auto kOp = BuiltinAtomicOperation::kMax;
+
+  // Sections instantiated with __ATOMIC_ACQUIRE
+  SECTION("ACQUIRE/RELEASE") {
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, __ATOMIC_ACQUIRE>();
+    }
+  }
+
+  // Sections instantiated with __ATOMIC_ACQ_REL
+  SECTION("ACQ_REL") {
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, __ATOMIC_ACQ_REL>();
+    }
+  }
+
+  // Sections instantiated with __ATOMIC_SEQ_CST
+  SECTION("SEQ_CST") {
+    SECTION("WAVEFRONT") {
+      AcquireRelease::Test<kOp, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+    }
+    SECTION("WORKGROUP") {
+      AcquireRelease::Test<kOp, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_WORKGROUP>();
+    }
+    SECTION("AGENT") {
+      AcquireRelease::Test<kOp, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_AGENT>();
+    }
+    SECTION("SYSTEM") {
+      AcquireRelease::SystemTest<kOp, __ATOMIC_SEQ_CST>();
+    }
+  }
+}
@@ -0,0 +1,577 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+#include <resource_guards.hh>
+#include <cmd_options.hh>
+
+namespace cg = cooperative_groups;
+
+// Atomic operations covered by the tests that are built on this header
+enum class AtomicOperation {
+  kAdd = 0,  // atomicAdd
+  kAddSystem,  // atomicAdd_system
+  kSub,  // atomicSub
+  kSubSystem,  // atomicSub_system
+  kInc,  // atomicInc
+  kDec,  // atomicDec
+  kUnsafeAdd,  // unsafeAtomicAdd
+  kSafeAdd,  // safeAtomicAdd
+  kCASAdd,  // addition implemented with an atomicCAS loop
+  kCASAddSystem,  // addition implemented with an atomicCAS_system loop
+  kBuiltinAdd,  // __hip_atomic_fetch_add
+  kBuiltinCAS  // addition implemented with a __hip_atomic_compare_exchange_strong loop
+};
+
+// Operands passed to the atomic operations; GetTestValue() selects the one that applies
+constexpr auto kIntegerTestValue = 7;  // operand for integer test types
+constexpr auto kFloatingPointTestValue = 3.125;  // operand for floating point test types
+constexpr auto kIncDecWraparoundValue = 1023;  // operand for the increment and decrement operations
+
+// Picks the operand for `operation`: kIncDecWraparoundValue for increment and decrement, otherwise
+// kFloatingPointTestValue for floating point test types and kIntegerTestValue for integer ones
+template <typename TestType, AtomicOperation operation>
+__host__ __device__ TestType GetTestValue() {
+  if constexpr (operation == AtomicOperation::kInc || operation == AtomicOperation::kDec) {
+    return kIncDecWraparoundValue;
+  } else if constexpr (std::is_floating_point_v<TestType>) {
+    return kFloatingPointTestValue;
+  } else {
+    return kIntegerTestValue;
+  }
+}
+
+// Implements an atomic addition with a compare-and-swap retry loop built on atomicCAS
+template <typename TestType> __device__ TestType CASAtomicAdd(TestType* address, TestType val) {
+  TestType observed = *address;
+
+  // Retry until the value the sum was computed from is the value atomicCAS returned
+  for (;;) {
+    const TestType expected = observed;
+    observed = atomicCAS(address, expected, val + expected);
+    if (expected == observed) return observed;
+  }
+}
+
+// Implements an atomic addition with a compare-and-swap retry loop built on atomicCAS_system
+template <typename TestType>
+__device__ TestType CASAtomicAddSystem(TestType* address, TestType val) {
+  TestType observed = *address;
+
+  // Retry until the value the sum was computed from is the value atomicCAS_system returned
+  for (;;) {
+    const TestType expected = observed;
+    observed = atomicCAS_system(address, expected, val + expected);
+    if (expected == observed) return observed;
+  }
+}
+
+// Implements an atomic addition with a compare-and-swap retry loop built on
+// __hip_atomic_compare_exchange_strong. Both the success and the failure memory order are
+// relaxed; `memory_scope` is forwarded to the builtin. Returns the value replaced by the addition.
+template <typename TestType, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+__device__ TestType BuiltinCASAtomicAdd(TestType* address, TestType val) {
+  TestType observed = *address;
+
+  for (;;) {
+    const TestType expected = observed;
+    // `observed` is passed as the expected value. It is relied upon to stay unchanged when the
+    // exchange happens and to receive the value found in memory when it does not.
+    __hip_atomic_compare_exchange_strong(address, &observed, val + expected, __ATOMIC_RELAXED,
+                                         __ATOMIC_RELAXED, memory_scope);
+
+    // Done once the sum was computed from the value that was actually in memory
+    if (expected == observed) return observed;
+  }
+}
+
+// Applies the atomic operation selected by `operation` to `mem`, with GetTestValue() as operand,
+// and returns the result of that call. `memory_scope` is only used by the builtin operations.
+template <typename TestType, AtomicOperation operation, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+__device__ TestType PerformAtomicOperation(TestType* const mem) {
+  using Op = AtomicOperation;
+  const auto operand = GetTestValue<TestType, operation>();
+  if constexpr (operation == Op::kAdd) {
+    return atomicAdd(mem, operand);
+  } else if constexpr (operation == Op::kAddSystem) {
+    return atomicAdd_system(mem, operand);
+  } else if constexpr (operation == Op::kSub) {
+    return atomicSub(mem, operand);
+  } else if constexpr (operation == Op::kSubSystem) {
+    return atomicSub_system(mem, operand);
+  } else if constexpr (operation == Op::kInc) {
+    return atomicInc(mem, operand);
+  } else if constexpr (operation == Op::kDec) {
+    return atomicDec(mem, operand);
+  } else if constexpr (operation == Op::kUnsafeAdd) {
+    return unsafeAtomicAdd(mem, operand);
+  } else if constexpr (operation == Op::kSafeAdd) {
+    return safeAtomicAdd(mem, operand);
+  } else if constexpr (operation == Op::kCASAdd) {
+    return CASAtomicAdd(mem, operand);
+  } else if constexpr (operation == Op::kCASAddSystem) {
+    return CASAtomicAddSystem(mem, operand);
+  } else if constexpr (operation == Op::kBuiltinAdd) {
+    return __hip_atomic_fetch_add(mem, operand, __ATOMIC_RELAXED, memory_scope);
+  } else if constexpr (operation == Op::kBuiltinCAS) {
+    return BuiltinCASAtomicAdd<TestType, memory_scope>(mem, operand);
+  }
+}
+
+// Single-location variant of the test kernel: every thread of the grid executes the atomic
+// operation selected by `operation` on the same memory location and stores the value returned by
+// it in `old_vals`, at the index given by its rank in the grid.
+// If `use_shared_mem` is true, the value at `global_mem` is copied into a shared memory variable
+// first, the operations are executed on that variable, and the result is copied back afterwards.
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+__global__ void TestKernel(TestType* const global_mem, TestType* const old_vals) {
+  __shared__ TestType shared_mem;
+
+  const auto tid = cg::this_grid().thread_rank();
+  TestType* mem = global_mem;
+
+  if constexpr (use_shared_mem) {
+    mem = &shared_mem;
+    if (tid == 0) shared_mem = *global_mem;
+    __syncthreads();
+  }
+
+  old_vals[tid] = PerformAtomicOperation<TestType, operation, memory_scope>(mem);
+
+  if constexpr (use_shared_mem) {
+    __syncthreads();
+    if (tid == 0) *global_mem = shared_mem;
+  }
+}
+
+// Returns the address of element `idx` of array `ptr`, whose elements are `pitch` bytes apart
+template <typename TestType>
+__host__ __device__ TestType* PitchedOffset(TestType* const ptr, const unsigned int pitch,
+                                            const unsigned int idx) {
+  const auto byte_offset = idx * pitch;
+  return reinterpret_cast<TestType*>(reinterpret_cast<uint8_t*>(ptr) + byte_offset);
+}
+
+// Generates load-store traffic on the byte range [begin_addr, end_addr): every byte is read
+// through a volatile pointer, XOR-ed with 0xAB and written back. The function is inline because
+// it is defined in a header, like the other non-template functions of this file.
+inline __device__ void GenerateMemoryTraffic(uint8_t* const begin_addr, uint8_t* const end_addr) {
+  for (volatile uint8_t* addr = begin_addr; addr != end_addr; ++addr) {
+    const uint8_t val = *addr;
+    *addr = static_cast<uint8_t>(val ^ 0xAB);
+  }
+}
+
+// This kernel executes the atomic operation specified by the enumerator `operation` on an array
+// of `width` elements stored in `global_mem`. Consecutive elements are `pitch` bytes apart, which
+// can be more than sizeof(TestType), so that the targeted memory locations can be scattered over
+// different cache lines.
+//
+// The threads are split into two groups based on their rank in the grid (tid):
+// - The last `width` threads do not perform the atomic operation. Each of them executes plain
+//   loads and stores on the padding bytes that follow element tid % width, which interferes
+//   with the atomic operations. No such traffic is generated if `pitch` is sizeof(TestType).
+// - Every other thread performs the atomic operation on element tid % width and stores the
+//   value returned by it in `old_vals[tid]`.
+//
+// If `use_shared_mem` is true, the elements are copied to dynamic shared memory first, the
+// operations are executed on shared memory, and the elements are copied back to `global_mem`.
+//
+// For example, given that sizeof(TestType) is 1, `width` is 3, and `pitch` is 4:
+//
+//                  0     1     2     3     4     5     6     7     8     9     10     11
+// global_mem -> |  x  |     |     |     |  x  |     |     |     |  x  |     |      |      |
+//               |         pitch         |         pitch         |          pitch          |
+//
+// In this scenario, the atomic operations target the elements denoted with `x` (addresses 0, 4,
+// 8). Memory traffic is generated on the addresses in between (1, 2, 3, 5, 6, 7, 9, 10, 11).
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+__global__ void TestKernel(TestType* const global_mem, TestType* const old_vals,
+                           const unsigned int width, const unsigned int pitch) {
+  extern __shared__ uint8_t shared_mem[];
+
+  const auto tid = cg::this_grid().thread_rank();
+
+  TestType* mem = global_mem;
+  if constexpr (use_shared_mem) {
+    mem = reinterpret_cast<TestType*>(shared_mem);
+    if (tid < width) {
+      *PitchedOffset(mem, pitch, tid) = *PitchedOffset(global_mem, pitch, tid);
+    }
+    __syncthreads();
+  }
+
+  // Threads at or past this rank generate memory traffic instead of performing the operation
+  const auto first_traffic_thread = cg::this_grid().size() - width;
+  TestType* const element = PitchedOffset(mem, pitch, tid % width);
+
+  if (tid < first_traffic_thread) {
+    old_vals[tid] = PerformAtomicOperation<TestType, operation, memory_scope>(element);
+  } else {
+    // The padding spans from the end of the element up to the start of the next element
+    uint8_t* const padding_begin = reinterpret_cast<uint8_t*>(element + 1);
+    uint8_t* const padding_end = reinterpret_cast<uint8_t*>(element) + pitch;
+    GenerateMemoryTraffic(padding_begin, padding_end);
+  }
+
+  // Copy the elements back once all threads of the block are done with shared memory
+  if constexpr (use_shared_mem) {
+    __syncthreads();
+    if (tid < width) {
+      *PitchedOffset(global_mem, pitch, tid) = *PitchedOffset(mem, pitch, tid);
+    }
+  }
+}
+
+// Parameters that configure a test run
+struct TestParams {
+  auto ThreadCount() const {  // total number of threads in one kernel launch
+    return blocks.x * blocks.y * blocks.z * threads.x * threads.y * threads.z;
+  }
+
+  auto HostIterationsPerThread() const {  // number of iterations per host thread
+    return std::max(num_devices * kernel_count * ThreadCount() / 20, width);
+  }
+
+  dim3 blocks;                          // grid dimensions of a kernel launch
+  dim3 threads;                         // block dimensions of a kernel launch
+  unsigned int num_devices = 1u;        // number of devices used
+  unsigned int kernel_count = 1u;       // number of kernels launched per device
+  unsigned int width = 1u;              // number of memory locations targeted
+  unsigned int pitch = 0u;              // defines spacing between memory locations
+  unsigned int host_thread_count = 0u;  // number of host threads launched
+  LinearAllocs alloc_type = LinearAllocs::hipMalloc;  // type of allocation used
+};
+
+// Reference implementation used to verify results. Replays, sequentially on the host, the
+// operations issued by the test kernels and by the host threads. Returns the expected final value
+// of each of the `width` memory locations and the expected value returned by every operation.
+template <typename TestType, AtomicOperation operation>
+std::tuple<std::vector<TestType>, std::vector<TestType>> TestKernelHostRef(const TestParams& p) {
+  using Op = AtomicOperation;
+  constexpr bool kIsAddition = operation == Op::kAdd || operation == Op::kAddSystem ||
+      operation == Op::kUnsafeAdd || operation == Op::kSafeAdd || operation == Op::kCASAdd ||
+      operation == Op::kCASAddSystem || operation == Op::kBuiltinAdd ||
+      operation == Op::kBuiltinCAS;
+  constexpr bool kIsSubtraction = operation == Op::kSub || operation == Op::kSubSystem;
+
+  const auto val = GetTestValue<TestType, operation>();
+
+  const auto total_thread_count = p.num_devices * p.kernel_count * p.ThreadCount() +
+      p.host_thread_count * p.HostIterationsPerThread();
+
+  // Number of threads that perform the operation in one kernel launch. LaunchKernel picks the
+  // single-location TestKernel overload for width == 1 && pitch == sizeof(TestType); in that
+  // overload every thread performs the operation. In the pitched overload the last `width`
+  // threads only generate memory traffic.
+  const bool single_location = p.width == 1 && p.pitch == sizeof(TestType);
+  const auto ops_per_launch = single_location ? p.ThreadCount() : p.ThreadCount() - p.width;
+
+  std::vector<TestType> res_vals(p.width);
+  std::vector<TestType> old_vals;
+  old_vals.reserve(total_thread_count);
+
+  // Applies one operation to the location targeted by the thread or host iteration `id`
+  auto perform_op = [&](unsigned id) {
+    auto& res = res_vals[id % p.width];
+    old_vals.push_back(res);
+
+    if constexpr (kIsAddition) {
+      res = res + val;
+    } else if constexpr (kIsSubtraction) {
+      res = res - val;
+    } else if constexpr (operation == Op::kInc) {
+      res = (res >= val) ? 0 : res + 1;
+    } else if constexpr (operation == Op::kDec) {
+      res = ((res == 0) || (res > val)) ? val : res - 1;
+    }
+  };
+
+  for (auto i = 0u; i < p.num_devices; ++i) {
+    for (auto j = 0u; j < p.kernel_count; ++j) {
+      for (auto tid = 0u; tid < ops_per_launch; ++tid) {
+        perform_op(tid);
+      }
+    }
+  }
+
+  for (auto i = 0u; i < p.host_thread_count; ++i) {
+    for (auto j = 0u; j < p.HostIterationsPerThread(); ++j) {
+      perform_op(j);
+    }
+  }
+
+  return {res_vals, old_vals};
+}
+
+// Compares the results of the test kernels (`res_vals`, `old_vals`) with the results generated by
+// the reference implementation. `old_vals` is expected to use the layout produced by TestCore:
+// one span of ThreadCount() entries per kernel launch, followed by the entries written by the
+// host threads. `old_vals` is filtered and sorted in place.
+template <typename TestType, AtomicOperation operation>
+void Verify(const TestParams& p, std::vector<TestType>& res_vals, std::vector<TestType>& old_vals) {
+  auto [expected_res_vals, expected_old_vals] = TestKernelHostRef<TestType, operation>(p);
+
+  for (auto i = 0u; i < res_vals.size(); ++i) {
+    INFO("Results index: " << i);
+    REQUIRE(expected_res_vals[i] == res_vals[i]);
+  }
+
+  // The pitched TestKernel overload never writes the entries of its last `width` threads, because
+  // those threads only generate memory traffic. Drop these entries so that only values returned
+  // by an atomic operation take part in the comparison.
+  const bool single_location = p.width == 1 && p.pitch == sizeof(TestType);
+  const std::size_t per_launch = p.ThreadCount();
+  const std::size_t launches = p.num_devices * p.kernel_count;
+  if (!single_location && p.width <= per_launch && launches * per_launch <= old_vals.size()) {
+    std::vector<TestType> written;
+    written.reserve(old_vals.size());
+    for (std::size_t launch = 0; launch < launches; ++launch) {
+      const auto first = old_vals.begin() + launch * per_launch;
+      written.insert(written.end(), first, first + (per_launch - p.width));
+    }
+    written.insert(written.end(), old_vals.begin() + launches * per_launch, old_vals.end());
+    old_vals.swap(written);
+  }
+
+  // Both sequences are sorted before they are compared element by element
+  std::sort(begin(old_vals), end(old_vals));
+  std::sort(begin(expected_old_vals), end(expected_old_vals));
+
+  // Guards the indexed comparison below against reading past the end of the expected values
+  REQUIRE(expected_old_vals.size() == old_vals.size());
+  for (auto i = 0u; i < old_vals.size(); ++i) {
+    INFO("Old values index: " << i);
+    REQUIRE(expected_old_vals[i] == old_vals[i]);
+  }
+}
+
+// Launches the TestKernel overload that matches `p` on `stream`
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+void LaunchKernel(const TestParams& p, hipStream_t stream, TestType* const mem_ptr,
+                  TestType* const old_vals) {
+  const auto dynamic_shared_bytes = use_shared_mem ? p.width * p.pitch : 0u;
+  if (p.width != 1 || p.pitch != sizeof(TestType))
+    TestKernel<TestType, operation, use_shared_mem, memory_scope>
+        <<<p.blocks, p.threads, dynamic_shared_bytes, stream>>>(mem_ptr, old_vals, p.width, p.pitch);
+  else
+    TestKernel<TestType, operation, use_shared_mem, memory_scope>
+        <<<p.blocks, p.threads, dynamic_shared_bytes, stream>>>(mem_ptr, old_vals);
+}
+
+// Performs `iterations` host atomic operations; iteration k targets element k % width of `mem`
+// and stores the returned value in `old_vals[k]`. Operations without a host branch do nothing.
+template <typename TestType, AtomicOperation operation>
+void HostAtomicOperation(const unsigned int iterations, TestType* mem, TestType* const old_vals,
+                         const unsigned int width, const unsigned pitch, TestType /*base_val*/) {
+  using Op = AtomicOperation;
+  constexpr bool kIsAddition = operation == Op::kAddSystem || operation == Op::kCASAddSystem ||
+      operation == Op::kBuiltinAdd || operation == Op::kBuiltinCAS;
+  const auto rhs = GetTestValue<TestType, operation>();
+  for (auto k = 0u; k < iterations; ++k) {
+    if constexpr (kIsAddition) {
+      old_vals[k] = __atomic_fetch_add(PitchedOffset(mem, pitch, k % width), rhs, __ATOMIC_RELAXED);
+    } else if constexpr (operation == Op::kSubSystem) {
+      old_vals[k] = __atomic_fetch_sub(PitchedOffset(mem, pitch, k % width), rhs, __ATOMIC_RELAXED);
+    }
+  }
+}
+
+// Runs TestParams::host_thread_count host threads that execute HostAtomicOperation on `mem`,
+// competing with the test kernels for the same memory, and waits for all of them to finish
+template <typename TestType, AtomicOperation operation>
+void PerformHostAtomicOperation(const TestParams& p, TestType* mem, TestType* const old_vals) {
+  if (p.host_thread_count == 0) return;
+  // The entries of the host threads are stored after the entries reserved for device threads
+  const auto device_slots = p.num_devices * p.kernel_count * p.ThreadCount();
+  const auto iterations = p.HostIterationsPerThread();
+
+  std::vector<std::thread> workers;
+  workers.reserve(p.host_thread_count);
+  for (auto worker = 0u; worker < p.host_thread_count; ++worker) {
+    // Every host thread gets its own range of `iterations` entries in `old_vals`
+    const auto first_slot = device_slots + worker * iterations;
+    workers.emplace_back(HostAtomicOperation<TestType, operation>, iterations, mem,
+                         old_vals + first_slot, p.width, p.pitch, first_slot);
+  }
+
+  for (auto& worker : workers) {
+    worker.join();
+  }
+}
+
+// This is the main body of the test:
+// 1. Allocate the old-values buffers, the streams and the target memory (TestParams::alloc_type)
+// 2. Launch kernels based on TestParams::num_devices and TestParams::kernel_count
+// 3. Run host threads based on TestParams::host_thread_count
+// 4. Copy the results to the host and verify them against the reference implementation
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+void TestCore(const TestParams& p) {
+  const unsigned int flags =
+      p.alloc_type == LinearAllocs::mallocAndRegister ? hipHostRegisterMapped : 0u;
+
+  const auto old_vals_alloc_size = p.kernel_count * p.ThreadCount() * sizeof(TestType);  // per device
+  std::vector<LinearAllocGuard<TestType>> old_vals_devs;
+  std::vector<StreamGuard> streams;
+  for (auto i = 0; i < p.num_devices; ++i) {
+    HIP_CHECK(hipSetDevice(i));
+    old_vals_devs.emplace_back(LinearAllocs::hipMalloc, old_vals_alloc_size);
+    for (auto j = 0; j < p.kernel_count; ++j) {
+      streams.emplace_back(Streams::created);  // one stream per kernel launch
+    }
+  }
+
+  const auto mem_alloc_size = p.width * p.pitch;  // `width` elements of `pitch` bytes each
+  LinearAllocGuard<TestType> mem_dev(p.alloc_type, mem_alloc_size, flags);
+
+  std::vector<TestType> old_vals(p.num_devices * p.kernel_count * p.ThreadCount() +
+                                 p.host_thread_count * p.HostIterationsPerThread());
+  std::vector<TestType> res_vals(p.width);
+
+  TestType* const mem_ptr =
+      p.alloc_type == LinearAllocs::hipMalloc ? mem_dev.ptr() : mem_dev.host_ptr();
+
+  HIP_CHECK(hipMemset(mem_ptr, 0, mem_alloc_size));  // every targeted location starts at zero
+
+  // NOTE(review): hipSetDevice is not called in this loop; kernels are issued on the per-device
+  // streams created above - confirm that this is sufficient for multi-device runs
+  for (auto i = 0u; i < p.num_devices; ++i) {
+    for (auto j = 0u; j < p.kernel_count; ++j) {
+      const auto& stream = streams[i * p.kernel_count + j].stream();
+      const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount();  // slice of this launch
+      LaunchKernel<TestType, operation, use_shared_mem, memory_scope>(p, stream, mem_dev.ptr(),
+                                                                      old_vals);
+    }
+  }
+
+  PerformHostAtomicOperation<TestType, operation>(p, mem_dev.host_ptr(), old_vals.data());
+
+  for (auto i = 0u; i < p.num_devices; ++i) {
+    const auto device_offset = i * p.kernel_count * p.ThreadCount();
+    HIP_CHECK(hipMemcpy(old_vals.data() + device_offset, old_vals_devs[i].ptr(),
+                        old_vals_alloc_size, hipMemcpyDeviceToHost));
+  }
+  HIP_CHECK(hipMemcpy2D(res_vals.data(), sizeof(TestType), mem_ptr, p.pitch, sizeof(TestType),
+                        p.width, hipMemcpyDeviceToHost));  // packs the pitched elements
+
+  Verify<TestType, operation>(p, res_vals, old_vals);
+}
+
+inline dim3 GenerateThreadDimensions() { return GENERATE(dim3(16), dim3(1024)); }  // block sizes under test: 16 and 1024 threads
+
+inline dim3 GenerateBlockDimensions() {  // grid sizes under test, derived from device 0
+  int cu_count = 0;  // multiprocessor count reported for device 0
+  HIP_CHECK(hipDeviceGetAttribute(&cu_count, hipDeviceAttributeMultiprocessorCount, 0));
+  return GENERATE_COPY(dim3(cu_count), dim3(cu_count + cu_count / 2));
+}
+
+// Runs TestCore for a single kernel launch on a single device: on global memory for every
+// allocation type, and on shared memory. Launch dimensions are restricted for builtin operations.
+template <typename TestType, AtomicOperation operation, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+void SingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) {
+  constexpr bool kIsBuiltin =
+      operation == AtomicOperation::kBuiltinAdd || operation == AtomicOperation::kBuiltinCAS;
+
+  TestParams params;
+  params.num_devices = 1;
+  params.kernel_count = 1;
+  params.width = width;
+  params.pitch = pitch;
+
+  if constexpr (kIsBuiltin && memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD) {
+    params.threads = dim3(1);
+  } else if constexpr (kIsBuiltin && memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT) {
+    int warp_size = 0;
+    HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+    params.threads = dim3(warp_size);
+  } else {
+    params.threads = GenerateThreadDimensions();
+  }
+
+  SECTION("Global memory") {
+    constexpr bool kSingleBlockScope = memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD ||
+        memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT || memory_scope == __HIP_MEMORY_SCOPE_WORKGROUP;
+    if constexpr (kIsBuiltin && kSingleBlockScope) {
+      params.blocks = dim3(1);
+    } else {
+      params.blocks = GenerateBlockDimensions();
+    }
+
+    using LA = LinearAllocs;
+    for (const auto alloc_type :
+         {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+      params.alloc_type = alloc_type;
+      DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
+        TestCore<TestType, operation, false, memory_scope>(params);
+      }
+    }
+  }
+
+  SECTION("Shared memory") {
+    params.blocks = dim3(1);
+    params.alloc_type = LinearAllocs::hipMalloc;
+    TestCore<TestType, operation, true, memory_scope>(params);
+  }
+}
+
+// Runs TestCore for `kernel_count` kernel launches on a single device, once per allocation type
+template <typename TestType, AtomicOperation operation>
+void SingleDeviceMultipleKernelTest(const unsigned int kernel_count, const unsigned int width,
+                                    const unsigned int pitch) {
+  int concurrency = 0;  // set to the value of hipDeviceAttributeConcurrentKernels for device 0
+  HIP_CHECK(hipDeviceGetAttribute(&concurrency, hipDeviceAttributeConcurrentKernels, 0));
+  if (concurrency == 0) {
+    HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
+    return;
+  }
+
+  TestParams params;
+  params.num_devices = 1;
+  params.kernel_count = kernel_count;
+  params.width = width;
+  params.pitch = pitch;
+  params.blocks = GenerateBlockDimensions();
+  params.threads = GenerateThreadDimensions();
+
+  using LA = LinearAllocs;
+  for (const auto alloc_type :
+       {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+    params.alloc_type = alloc_type;
+    DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
+      TestCore<TestType, operation, false>(params);
+    }
+  }
+}
+
+// Runs TestCore with the system memory scope for `num_devices` devices, `kernel_count` kernel
+// launches per device and `host_thread_count` host threads, once per host-accessible allocation
+// type
+template <typename TestType, AtomicOperation operation>
+void MultipleDeviceMultipleKernelAndHostTest(const unsigned int num_devices,
+                                             const unsigned int kernel_count,
+                                             const unsigned int width, const unsigned int pitch,
+                                             const unsigned int host_thread_count = 0u) {
+  // Skip if more than one device is requested and fewer devices are available
+  if (num_devices > 1 && HipTest::getDeviceCount() < num_devices) {
+    std::string msg = std::to_string(num_devices) + " devices are required";
+    HipTest::HIP_SKIP_TEST(msg.c_str());
+    return;
+  }
+
+  if (kernel_count > 1) {
+    for (auto device = 0u; device < num_devices; ++device) {
+      int concurrency = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&concurrency, hipDeviceAttributeConcurrentKernels, device));
+      if (concurrency == 0) {
+        HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
+        return;
+      }
+    }
+  }
+
+  TestParams params;
+  params.num_devices = num_devices;
+  params.kernel_count = kernel_count;
+  params.host_thread_count = host_thread_count;
+  params.width = width;
+  params.pitch = pitch;
+  params.blocks = GenerateBlockDimensions();
+  params.threads = GenerateThreadDimensions();
+
+  using LA = LinearAllocs;
+  for (const auto alloc_type : {LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+    params.alloc_type = alloc_type;
+    DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
+      TestCore<TestType, operation, false, __HIP_MEMORY_SCOPE_SYSTEM>(params);
+    }
+  }
+}
@@ -0,0 +1,167 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+#include "atomicAdd_negative_kernels_rtc.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicAdd atomicAdd
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * addition on a target memory location. Each thread will add the same value to the memory location,
+ * storing the return value into a separate output array slot corresponding to it. Once complete,
+ * the output array and target memory is validated to contain all the expected values. Several
+ * memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicAdd
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Shared memory
+ *      - Several grid and block dimension combinations (only one block is used for shared memory).
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicAdd.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAdd_Positive", "", int, unsigned int, unsigned long,
+                   unsigned long long, float, double) {
+  constexpr auto kCacheLineBytes = 128u;  // byte spacing used by the "Scattered addresses" section
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kAdd>(1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kAdd>(warp_size, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kAdd>(warp_size, kCacheLineBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on a single device wherein all threads will perform
+ * an atomic addition on a target memory location. Each thread will add the same value to the memory
+ * location, storing the return value into a separate output array slot corresponding to it. Once
+ * complete, the output array and target memory is validated to contain all the expected values.
+ * Several memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicAdd
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicAdd.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAdd_Positive_Multi_Kernel", "", int, unsigned int, unsigned long,
+                   unsigned long long, float, double) {
+  constexpr auto kCacheLineBytes = 128u;  // byte spacing used by the "Scattered addresses" section
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      SingleDeviceMultipleKernelTest<TestType, AtomicOperation::kAdd>(2, 1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      SingleDeviceMultipleKernelTest<TestType, AtomicOperation::kAdd>(2, warp_size,
+                                                                      sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      SingleDeviceMultipleKernelTest<TestType, AtomicOperation::kAdd>(2, warp_size,
+                                                                      kCacheLineBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - RTCs kernels that pass combinations of arguments of invalid types for all overloads of
+ * atomicAdd.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicAdd.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicAdd_Negative_Parameters_RTC") {
+  hiprtcProgram program{};
+
+  // One run per atomicAdd overload; each source holds that overload's ill-typed kernels.
+  const auto program_source = GENERATE(kAtomicAdd_int, kAtomicAdd_uint, kAtomicAdd_ulong,
+                                       kAtomicAdd_ulonglong, kAtomicAdd_float, kAtomicAdd_double);
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "atomicAdd_negative.cc", 0, nullptr, nullptr));
+  hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};
+
+  // Retrieve the build log produced by the compilation attempt.
+  size_t log_size{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
+
+  // Count how many times the "error:" marker occurs in the log.
+  const std::string marker{"error:"};
+  int error_count{0};
+  for (auto at = log.find(marker, 0); at != std::string::npos; at = log.find(marker, at + 1)) {
+    ++error_count;
+  }
+
+  // Release the program first, then verify the compile status and the number of errors.
+  const int expected_error_count{8};
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,219 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+class Dummy {  // class type used as the pointee of the address parameter in the *_v4 kernels
+ public:
+  __device__ Dummy() {}   // user-provided device-side constructor
+  __device__ ~Dummy() {}  // user-provided device-side destructor
+};
+
+/* int atomicAdd(int* address, int val): invalid-argument kernels. */
+__global__ void atomicAdd_int_v1(int* address, int* result) { *result = atomicAdd(&address, 1234); }
+
+__global__ void atomicAdd_int_v2(int* address, int* result) {
+  *result = atomicAdd(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicAdd_int_v3(int* address, int* result) { *result = atomicAdd(1234, 1234); }
+
+__global__ void atomicAdd_int_v4(Dummy* address, int* result) {
+  *result = atomicAdd(address, 1234);  // address points to a class type
+}
+
+__global__ void atomicAdd_int_v5(char* address, int* result) { *result = atomicAdd(address, 1234); }
+
+__global__ void atomicAdd_int_v6(short* address, int* result) {
+  *result = atomicAdd(address, 1234);  // address points to short
+}
+
+__global__ void atomicAdd_int_v7(long* address, int* result) { *result = atomicAdd(address, 1234); }
+
+__global__ void atomicAdd_int_v8(long long* address, int* result) {
+  *result = atomicAdd(address, 1234);  // address points to long long
+}
+
+/* unsigned int atomicAdd(unsigned int* address, unsigned int val): invalid-argument kernels. */
+__global__ void atomicAdd_uint_v1(unsigned int* address, unsigned int* result) {
+  *result = atomicAdd(&address, 1234);  // address argument is a pointer to the pointer
+}
+
+__global__ void atomicAdd_uint_v2(unsigned int* address, unsigned int* result) {
+  *result = atomicAdd(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicAdd_uint_v3(unsigned int* address, unsigned int* result) {
+  *result = atomicAdd(1234, 1234);  // address argument is not a pointer
+}
+
+__global__ void atomicAdd_uint_v4(Dummy* address, unsigned int* result) {
+  *result = atomicAdd(address, 1234);  // address points to a class type
+}
+
+__global__ void atomicAdd_uint_v5(char* address, unsigned int* result) {
+  *result = atomicAdd(address, 1234);  // address points to char
+}
+
+__global__ void atomicAdd_uint_v6(short* address, unsigned int* result) {
+  *result = atomicAdd(address, 1234);  // address points to short
+}
+
+__global__ void atomicAdd_uint_v7(long* address, unsigned int* result) {
+  *result = atomicAdd(address, 1234);  // address points to long
+}
+
+__global__ void atomicAdd_uint_v8(long long* address, unsigned int* result) {
+  *result = atomicAdd(address, 1234);  // address points to long long
+}
+
+/* atomicAdd(unsigned long* address, unsigned long val): invalid-argument kernels. */
+__global__ void atomicAdd_ulong_v1(unsigned long* address, unsigned long* result) {
+  *result = atomicAdd(&address, 1234);  // address argument is a pointer to the pointer
+}
+
+__global__ void atomicAdd_ulong_v2(unsigned long* address, unsigned long* result) {
+  *result = atomicAdd(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicAdd_ulong_v3(unsigned long* address, unsigned long* result) {
+  *result = atomicAdd(1234, 1234);  // address argument is not a pointer
+}
+
+__global__ void atomicAdd_ulong_v4(Dummy* address, unsigned long* result) {
+  *result = atomicAdd(address, 1234);  // address points to a class type
+}
+
+__global__ void atomicAdd_ulong_v5(char* address, unsigned long* result) {
+  *result = atomicAdd(address, 1234);  // address points to char
+}
+
+__global__ void atomicAdd_ulong_v6(short* address, unsigned long* result) {
+  *result = atomicAdd(address, 1234);  // address points to short
+}
+
+__global__ void atomicAdd_ulong_v7(long* address, unsigned long* result) {
+  *result = atomicAdd(address, 1234);  // address points to long
+}
+
+__global__ void atomicAdd_ulong_v8(long long* address, unsigned long* result) {
+  *result = atomicAdd(address, 1234);  // address points to long long
+}
+
+/* atomicAdd(unsigned long long* address, unsigned long long val): invalid-argument kernels. */
+__global__ void atomicAdd_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+  *result = atomicAdd(&address, 1234);  // address argument is a pointer to the pointer
+}
+
+__global__ void atomicAdd_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+  *result = atomicAdd(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicAdd_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+  *result = atomicAdd(1234, 1234);  // address argument is not a pointer
+}
+
+__global__ void atomicAdd_ulonglong_v4(Dummy* address, unsigned long long* result) {
+  *result = atomicAdd(address, 1234);  // address points to a class type
+}
+
+__global__ void atomicAdd_ulonglong_v5(char* address, unsigned long long* result) {
+  *result = atomicAdd(address, 1234);  // address points to char
+}
+
+__global__ void atomicAdd_ulonglong_v6(short* address, unsigned long long* result) {
+  *result = atomicAdd(address, 1234);  // address points to short
+}
+
+__global__ void atomicAdd_ulonglong_v7(long* address, unsigned long long* result) {
+  *result = atomicAdd(address, 1234);  // address points to long
+}
+
+__global__ void atomicAdd_ulonglong_v8(long long* address, unsigned long long* result) {
+  *result = atomicAdd(address, 1234);  // address points to long long
+}
+
+/* atomicAdd(float* address, float val): invalid-argument kernels. */
+__global__ void atomicAdd_float_v1(float* address, float* result) {
+  *result = atomicAdd(&address, 1234.f);  // address argument is a pointer to the pointer
+}
+
+__global__ void atomicAdd_float_v2(float* address, float* result) {
+  *result = atomicAdd(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicAdd_float_v3(float* address, float* result) {
+  *result = atomicAdd(1234.f, 1234.f);  // address argument is not a pointer
+}
+
+__global__ void atomicAdd_float_v4(Dummy* address, float* result) {
+  *result = atomicAdd(address, 1234.f);  // address points to a class type
+}
+
+__global__ void atomicAdd_float_v5(char* address, float* result) {
+  *result = atomicAdd(address, 1234.f);  // address points to char
+}
+
+__global__ void atomicAdd_float_v6(short* address, float* result) {
+  *result = atomicAdd(address, 1234.f);  // address points to short
+}
+
+__global__ void atomicAdd_float_v7(long* address, float* result) {
+  *result = atomicAdd(address, 1234.f);  // address points to long
+}
+
+__global__ void atomicAdd_float_v8(long long* address, float* result) {
+  *result = atomicAdd(address, 1234.f);  // address points to long long; val is a float literal
+}
+
+/* atomicAdd(double* address, double val): invalid-argument kernels. */
+__global__ void atomicAdd_double_v1(double* address, double* result) {
+  *result = atomicAdd(&address, 1234.0);  // address argument is a pointer to the pointer
+}
+
+__global__ void atomicAdd_double_v2(double* address, double* result) {
+  *result = atomicAdd(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicAdd_double_v3(double* address, double* result) {
+  *result = atomicAdd(1234.0, 1234.0);  // address argument is not a pointer
+}
+
+__global__ void atomicAdd_double_v4(Dummy* address, double* result) {
+  *result = atomicAdd(address, 1234.0);  // address points to a class type
+}
+
+__global__ void atomicAdd_double_v5(char* address, double* result) {
+  *result = atomicAdd(address, 1234.0);  // address points to char
+}
+
+__global__ void atomicAdd_double_v6(short* address, double* result) {
+  *result = atomicAdd(address, 1234.0);  // address points to short
+}
+
+__global__ void atomicAdd_double_v7(long* address, double* result) {
+  *result = atomicAdd(address, 1234.0);  // address points to long
+}
+
+__global__ void atomicAdd_double_v8(long long* address, double* result) {
+  *result = atomicAdd(address, 1234.0);  // address points to long long
+}
@@ -0,0 +1,273 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+Negative kernels used for the atomics negative Test Cases that are using RTC.
+*/
+
+static constexpr const char* kAtomicAdd_int =  // RTC source of the ill-typed int-overload kernels
+    R"(
+    __global__ void atomicAdd_int_v1(int* address, int* result) {
+      *result = atomicAdd(&address, 1234);
+    }
+
+    __global__ void atomicAdd_int_v2(int* address, int* result) {
+      *result = atomicAdd(address, address);
+    }
+
+    __global__ void atomicAdd_int_v3(int* address, int* result) {
+      *result = atomicAdd(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicAdd_int_v4(Dummy* address, int* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_int_v5(char* address, int* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_int_v6(short* address, int* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_int_v7(long* address, int* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_int_v8(long long* address, int* result) {
+      *result = atomicAdd(address, 1234);
+    }
+  )";
+
+static constexpr const char* kAtomicAdd_uint =  // RTC source of the ill-typed uint-overload kernels
+    R"(
+    __global__ void atomicAdd_uint_v1(unsigned int* address, unsigned int* result) {
+      *result = atomicAdd(&address, 1234);
+    }
+
+    __global__ void atomicAdd_uint_v2(unsigned int* address, unsigned int* result) {
+      *result = atomicAdd(address, address);
+    }
+
+    __global__ void atomicAdd_uint_v3(unsigned int* address, unsigned int* result) {
+      *result = atomicAdd(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicAdd_uint_v4(Dummy* address, unsigned int* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_uint_v5(char* address, unsigned int* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_uint_v6(short* address, unsigned int* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_uint_v7(long* address, unsigned int* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_uint_v8(long long* address, unsigned int* result) {
+      *result = atomicAdd(address, 1234);
+    }
+  )";
+
+static constexpr const char* kAtomicAdd_ulong =  // RTC source of the ill-typed ulong kernels
+    R"(
+    __global__ void atomicAdd_ulong_v1(unsigned long* address, unsigned long* result) {
+      *result = atomicAdd(&address, 1234);
+    }
+
+    __global__ void atomicAdd_ulong_v2(unsigned long* address, unsigned long* result) {
+      *result = atomicAdd(address, address);
+    }
+
+    __global__ void atomicAdd_ulong_v3(unsigned long* address, unsigned long* result) {
+      *result = atomicAdd(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicAdd_ulong_v4(Dummy* address, unsigned long* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_ulong_v5(char* address, unsigned long* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_ulong_v6(short* address, unsigned long* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_ulong_v7(long* address, unsigned long* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_ulong_v8(long long* address, unsigned long* result) {
+      *result = atomicAdd(address, 1234);
+    }
+  )";
+
+static constexpr const char* kAtomicAdd_ulonglong =  // RTC source: ill-typed ulonglong kernels
+    R"(
+    __global__ void atomicAdd_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+      *result = atomicAdd(&address, 1234);
+    }
+
+    __global__ void atomicAdd_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+      *result = atomicAdd(address, address);
+    }
+
+    __global__ void atomicAdd_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+      *result = atomicAdd(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicAdd_ulonglong_v4(Dummy* address, unsigned long long* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_ulonglong_v5(char* address, unsigned long long* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_ulonglong_v6(short* address, unsigned long long* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_ulonglong_v7(long* address, unsigned long long* result) {
+      *result = atomicAdd(address, 1234);
+    }
+
+    __global__ void atomicAdd_ulonglong_v8(long long* address, unsigned long long* result) {
+      *result = atomicAdd(address, 1234);
+    }
+  )";
+
+static constexpr auto kAtomicAdd_float{  // RTC source of the ill-typed float-overload kernels
+    R"(
+    __global__ void atomicAdd_float_v1(float* address, float* result) {
+      *result = atomicAdd(&address, 1234.f);
+    }
+
+    __global__ void atomicAdd_float_v2(float* address, float* result) {
+      *result = atomicAdd(address, address);
+    }
+
+    __global__ void atomicAdd_float_v3(float* address, float* result) {
+      *result = atomicAdd(1234.f, 1234.f);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicAdd_float_v4(Dummy* address, float* result) {
+      *result = atomicAdd(address, 1234.f);
+    }
+
+    __global__ void atomicAdd_float_v5(char* address, float* result) {
+      *result = atomicAdd(address, 1234.f);
+    }
+
+    __global__ void atomicAdd_float_v6(short* address, float* result) {
+      *result = atomicAdd(address, 1234.f);
+    }
+
+    __global__ void atomicAdd_float_v7(long* address, float* result) {
+      *result = atomicAdd(address, 1234.f);
+    }
+
+    __global__ void atomicAdd_float_v8(long long* address, float* result) {
+      *result = atomicAdd(address, 1234.f);
+    }
+  )"};
+
+static constexpr const char* kAtomicAdd_double =  // RTC source of the ill-typed double kernels
+    R"(
+    __global__ void atomicAdd_double_v1(double* address, double* result) {
+      *result = atomicAdd(&address, 1234.0);
+    }
+
+    __global__ void atomicAdd_double_v2(double* address, double* result) {
+      *result = atomicAdd(address, address);
+    }
+
+    __global__ void atomicAdd_double_v3(double* address, double* result) {
+      *result = atomicAdd(1234.0, 1234.0);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicAdd_double_v4(Dummy* address, double* result) {
+      *result = atomicAdd(address, 1234.0);
+    }
+
+    __global__ void atomicAdd_double_v5(char* address, double* result) {
+      *result = atomicAdd(address, 1234.0);
+    }
+
+    __global__ void atomicAdd_double_v6(short* address, double* result) {
+      *result = atomicAdd(address, 1234.0);
+    }
+
+    __global__ void atomicAdd_double_v7(long* address, double* result) {
+      *result = atomicAdd(address, 1234.0);
+    }
+
+    __global__ void atomicAdd_double_v8(long long* address, double* result) {
+      *result = atomicAdd(address, 1234.0);
+    }
+  )";
@@ -0,0 +1,177 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicAdd_system atomicAdd_system
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on two devices wherein all threads will perform
+ * an atomic addition on a target memory location. Each thread will add the same value to the memory
+ * location, storing the return value into a separate output array slot corresponding to it. Once
+ * complete, the output array and target memory is validated to contain all the expected values.
+ * Several memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicAdd_system
+ *      - hipHostMalloc, hipMallocManaged and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicAdd_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAdd_system_Positive_Peer_GPUs", "", int, unsigned int, unsigned long,
+                   unsigned long long, float, double) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  constexpr auto kCacheLineBytes = 128u;
+  // Every section uses num_devices = 2 and kernel_count = 2; only width and pitch differ.
+  const auto run_pattern = [](const unsigned int width, const unsigned int pitch) {
+    MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kAddSystem>(
+        2, 2, width, pitch);
+  };
+
+  for (auto iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      run_pattern(1, sizeof(TestType));
+    }
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      run_pattern(warp_size, sizeof(TestType));
+    }
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      run_pattern(warp_size, kCacheLineBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel on a single device wherein all threads will perform
+ * an atomic addition on a target memory location. Each thread will add the same value to the memory
+ * location, storing the return value into a separate output array slot corresponding to it. While
+ * the kernel is running, the host performs atomic additions, in 4 threads, on the same memory
+ * location(s). Once complete, the output array and target memory is validated to contain all the
+ * expected values. Several memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicAdd_system
+ *      - hipHostMalloc, hipMallocManaged and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicAdd_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAdd_system_Positive_Host_And_GPU", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  constexpr auto kCacheLineBytes = 128u;
+  // Every section uses num_devices = 1, kernel_count = 1 and host_thread_count = 4.
+  const auto run_pattern = [](const unsigned int width, const unsigned int pitch) {
+    MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kAddSystem>(
+        1, 1, width, pitch, 4);
+  };
+
+  for (auto iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      run_pattern(1, sizeof(TestType));
+    }
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      run_pattern(warp_size, sizeof(TestType));
+    }
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      run_pattern(warp_size, kCacheLineBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times on two devices wherein all threads will perform
+ * an atomic addition on a target memory location. Each thread will add the same value to the memory
+ * location, storing the return value into a separate output array slot corresponding to it. While
+ * the kernel is running, the host performs atomic additions, in 4 threads, on the same memory
+ * location(s). Once complete, the output array and target memory is validated to contain all the
+ * expected values. Several memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicAdd_system
+ *      - hipHostMalloc, hipMallocManaged and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicAdd_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAdd_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  constexpr auto kCacheLineBytes = 128u;
+  // Every section uses num_devices = 2, kernel_count = 2 and host_thread_count = 4.
+  const auto run_pattern = [](const unsigned int width, const unsigned int pitch) {
+    MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kAddSystem>(
+        2, 2, width, pitch, 4);
+  };
+
+  for (auto iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      run_pattern(1, sizeof(TestType));
+    }
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      run_pattern(warp_size, sizeof(TestType));
+    }
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      run_pattern(warp_size, kCacheLineBytes);
+    }
+  }
+}
@@ -0,0 +1,222 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "atomicAnd_negative_kernels_rtc.hh"
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicAnd atomicAnd
+ * @{
+ * @ingroup AtomicsTest
+ * `atomicAnd(TestType* address, TestType val)` -
+ * performs atomic bitwise AND between address and val, returns old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicAnd from multiple threads on the same address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicAnd.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAnd_Positive_SameAddress", "", int, unsigned int, unsigned long,
+                   unsigned long long) {
+  constexpr auto kOp = Bitwise::AtomicOperation::kAnd;
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Same address " << pass) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOp>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicAnd from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicAnd.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAnd_Positive_Adjacent_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  constexpr auto kOp = Bitwise::AtomicOperation::kAnd;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Adjacent address " << pass) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOp>(warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicAnd from multiple threads on scattered addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicAnd.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAnd_Positive_Scattered_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  constexpr auto kOp = Bitwise::AtomicOperation::kAnd;
+  // Pitch handed to the helper in place of sizeof(TestType) to scatter the target elements.
+  constexpr unsigned int kScatterPitch = 128u;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Scattered address " << pass) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, kOp>(warp_size, kScatterPitch);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicAnd from multiple threads on the same address.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicAnd.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAnd_Positive_Multi_Kernel_Same_Address", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  constexpr auto kOp = Bitwise::AtomicOperation::kAnd;
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Same address " << pass) {
+      Bitwise::SingleDeviceMultipleKernelTest<TestType, kOp>(2, 1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicAnd from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicAnd.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAnd_Positive_Multi_Kernel_Adjacent_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  constexpr auto kOp = Bitwise::AtomicOperation::kAnd;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Adjacent address " << pass) {
+      Bitwise::SingleDeviceMultipleKernelTest<TestType, kOp>(2, warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicAnd from multiple threads on scattered addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicAnd.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAnd_Positive_Multi_Kernel_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOp = Bitwise::AtomicOperation::kAnd;
+  // Pitch handed to the helper in place of sizeof(TestType) to scatter the target elements.
+  constexpr unsigned int kScatterPitch = 128u;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Scattered address " << pass) {
+      Bitwise::SingleDeviceMultipleKernelTest<TestType, kOp>(2, warp_size, kScatterPitch);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Compiles atomicAnd with invalid parameters.
+ *  - Compiles the source with RTC.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicAnd.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicAnd_Negative_Parameters_RTC") {
+  // The invalid kernels for each overload are defined in atomicAnd_negative_kernels_rtc.hh.
+  const auto program_source =
+      GENERATE(kAtomicAnd_int, kAtomicAnd_uint, kAtomicAnd_ulong, kAtomicAnd_ulonglong);
+  const int expected_error_count = 9;
+
+  hiprtcProgram program{};
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "atomicAnd_negative.cc", 0, nullptr, nullptr));
+  hiprtcResult result = hiprtcCompileProgram(program, 0, nullptr);
+
+  // Fetch the compile log so the diagnostics in it can be counted.
+  size_t log_size = 0;
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
+
+  // Count the occurrences of the "error:" tag in the log.
+  const std::string tag{"error:"};
+  int error_count = 0;
+  for (size_t hit = log.find(tag); hit != std::string::npos; hit = log.find(tag, hit + 1)) {
+    ++error_count;
+  }
+
+  // Release the program before asserting on the compilation outcome.
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,185 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+class Dummy {  // Empty class; the *_v4 kernels below take a Dummy* as their address parameter.
+ public:
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+/* int atomicAnd(int* address, int val); v1 passes &address, an int**, as the address. */
+__global__ void atomicAnd_int_v1(int* address, int* result) { *result = atomicAnd(&address, 1234); }
+// v2: the pointer `address` is passed as the value operand as well.
+__global__ void atomicAnd_int_v2(int* address, int* result) {
+  *result = atomicAnd(address, address);
+}
+// v3: an integer literal is passed where the address pointer belongs.
+__global__ void atomicAnd_int_v3(int* address, int* result) { *result = atomicAnd(1234, 1234); }
+// v4: address is a Dummy*.
+__global__ void atomicAnd_int_v4(Dummy* address, int* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v5: address is a char*.
+__global__ void atomicAnd_int_v5(char* address, int* result) { *result = atomicAnd(address, 1234); }
+// v6: address is a short*.
+__global__ void atomicAnd_int_v6(short* address, int* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v7: address is a long*.
+__global__ void atomicAnd_int_v7(long* address, int* result) { *result = atomicAnd(address, 1234); }
+// v8: address is a long long*.
+__global__ void atomicAnd_int_v8(long long* address, int* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v9: address is a float*.
+__global__ void atomicAnd_int_v9(float* address, int* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v10: address is a double*.
+__global__ void atomicAnd_int_v10(double* address, int* result) {
+  *result = atomicAnd(address, 1234);
+}
+
+/* unsigned int atomicAnd(unsigned int* address, unsigned int val) */
+__global__ void atomicAnd_uint_v1(unsigned int* address, unsigned int* result) {
+  *result = atomicAnd(&address, 1234);  // v1: &address is an unsigned int**
+}
+// v2: the pointer `address` is passed as the value operand as well.
+__global__ void atomicAnd_uint_v2(unsigned int* address, unsigned int* result) {
+  *result = atomicAnd(address, address);
+}
+// v3: an integer literal is passed where the address pointer belongs.
+__global__ void atomicAnd_uint_v3(unsigned int* address, unsigned int* result) {
+  *result = atomicAnd(1234, 1234);
+}
+// v4: address is a Dummy*.
+__global__ void atomicAnd_uint_v4(Dummy* address, unsigned int* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v5: address is a char*.
+__global__ void atomicAnd_uint_v5(char* address, unsigned int* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v6: address is a short*.
+__global__ void atomicAnd_uint_v6(short* address, unsigned int* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v7: address is a long*.
+__global__ void atomicAnd_uint_v7(long* address, unsigned int* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v8: address is a long long*.
+__global__ void atomicAnd_uint_v8(long long* address, unsigned int* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v9: address is a float*.
+__global__ void atomicAnd_uint_v9(float* address, unsigned int* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v10: address is a double*.
+__global__ void atomicAnd_uint_v10(double* address, unsigned int* result) {
+  *result = atomicAnd(address, 1234);
+}
+
+/* unsigned long atomicAnd(unsigned long* address, unsigned long val) */
+__global__ void atomicAnd_ulong_v1(unsigned long* address, unsigned long* result) {
+  *result = atomicAnd(&address, 1234);  // v1: &address is an unsigned long**
+}
+// v2: the pointer `address` is passed as the value operand as well.
+__global__ void atomicAnd_ulong_v2(unsigned long* address, unsigned long* result) {
+  *result = atomicAnd(address, address);
+}
+// v3: an integer literal is passed where the address pointer belongs.
+__global__ void atomicAnd_ulong_v3(unsigned long* address, unsigned long* result) {
+  *result = atomicAnd(1234, 1234);
+}
+// v4: address is a Dummy*.
+__global__ void atomicAnd_ulong_v4(Dummy* address, unsigned long* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v5: address is a char*.
+__global__ void atomicAnd_ulong_v5(char* address, unsigned long* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v6: address is a short*.
+__global__ void atomicAnd_ulong_v6(short* address, unsigned long* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v7: address is a (signed) long*.
+__global__ void atomicAnd_ulong_v7(long* address, unsigned long* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v8: address is a long long*.
+__global__ void atomicAnd_ulong_v8(long long* address, unsigned long* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v9: address is a float*.
+__global__ void atomicAnd_ulong_v9(float* address, unsigned long* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v10: address is a double*.
+__global__ void atomicAnd_ulong_v10(double* address, unsigned long* result) {
+  *result = atomicAnd(address, 1234);
+}
+
+/* unsigned long long atomicAnd(unsigned long long* address, unsigned long long val) */
+__global__ void atomicAnd_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+  *result = atomicAnd(&address, 1234);  // v1: &address is an unsigned long long**
+}
+// v2: the pointer `address` is passed as the value operand as well.
+__global__ void atomicAnd_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+  *result = atomicAnd(address, address);
+}
+// v3: an integer literal is passed where the address pointer belongs.
+__global__ void atomicAnd_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+  *result = atomicAnd(1234, 1234);
+}
+// v4: address is a Dummy*.
+__global__ void atomicAnd_ulonglong_v4(Dummy* address, unsigned long long* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v5: address is a char*.
+__global__ void atomicAnd_ulonglong_v5(char* address, unsigned long long* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v6: address is a short*.
+__global__ void atomicAnd_ulonglong_v6(short* address, unsigned long long* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v7: address is a long*.
+__global__ void atomicAnd_ulonglong_v7(long* address, unsigned long long* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v8: address is a (signed) long long*.
+__global__ void atomicAnd_ulonglong_v8(long long* address, unsigned long long* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v9: address is a float*.
+__global__ void atomicAnd_ulonglong_v9(float* address, unsigned long long* result) {
+  *result = atomicAnd(address, 1234);
+}
+// v10: address is a double*.
+__global__ void atomicAnd_ulonglong_v10(double* address, unsigned long long* result) {
+  *result = atomicAnd(address, 1234);
+}
@@ -0,0 +1,223 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+Negative kernels used for the atomics negative Test Cases that are using RTC.
+*/
+
+static constexpr auto kAtomicAnd_int{  // Source text: ten kernels, atomicAnd_int_v1..v10.
+    R"(
+    __global__ void atomicAnd_int_v1(int* address, int* result) {
+      *result = atomicAnd(&address, 1234);
+    }
+
+    __global__ void atomicAnd_int_v2(int* address, int* result) {
+      *result = atomicAnd(address, address);
+    }
+
+    __global__ void atomicAnd_int_v3(int* address, int* result) {
+      *result = atomicAnd(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicAnd_int_v4(Dummy* address, int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_int_v5(char* address, int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_int_v6(short* address, int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_int_v7(long* address, int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_int_v8(long long* address, int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_int_v9(float* address, int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_int_v10(double* address, int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicAnd_uint{  // Source text: ten kernels, atomicAnd_uint_v1..v10.
+    R"(
+    __global__ void atomicAnd_uint_v1(unsigned int* address, unsigned int* result) {
+      *result = atomicAnd(&address, 1234);
+    }
+
+    __global__ void atomicAnd_uint_v2(unsigned int* address, unsigned int* result) {
+      *result = atomicAnd(address, address);
+    }
+
+    __global__ void atomicAnd_uint_v3(unsigned int* address, unsigned int* result) {
+      *result = atomicAnd(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicAnd_uint_v4(Dummy* address, unsigned int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_uint_v5(char* address, unsigned int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_uint_v6(short* address, unsigned int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_uint_v7(long* address, unsigned int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_uint_v8(long long* address, unsigned int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_uint_v9(float* address, unsigned int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_uint_v10(double* address, unsigned int* result) {
+      *result = atomicAnd(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicAnd_ulong{  // Source text: ten kernels, atomicAnd_ulong_v1..v10.
+    R"(
+    __global__ void atomicAnd_ulong_v1(unsigned long* address, unsigned long* result) {
+      *result = atomicAnd(&address, 1234);
+    }
+
+    __global__ void atomicAnd_ulong_v2(unsigned long* address, unsigned long* result) {
+      *result = atomicAnd(address, address);
+    }
+
+    __global__ void atomicAnd_ulong_v3(unsigned long* address, unsigned long* result) {
+      *result = atomicAnd(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicAnd_ulong_v4(Dummy* address, unsigned long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_ulong_v5(char* address, unsigned long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_ulong_v6(short* address, unsigned long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_ulong_v7(long* address, unsigned long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_ulong_v8(long long* address, unsigned long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_ulong_v9(float* address, unsigned long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_ulong_v10(double* address, unsigned long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicAnd_ulonglong{  // Source text: atomicAnd_ulonglong_v1..v10.
+    R"(
+    __global__ void atomicAnd_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+      *result = atomicAnd(&address, 1234);
+    }
+
+    __global__ void atomicAnd_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+      *result = atomicAnd(address, address);
+    }
+
+    __global__ void atomicAnd_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+      *result = atomicAnd(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicAnd_ulonglong_v4(Dummy* address, unsigned long long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_ulonglong_v5(char* address, unsigned long long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_ulonglong_v6(short* address, unsigned long long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_ulonglong_v7(long* address, unsigned long long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_ulonglong_v8(long long* address, unsigned long long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_ulonglong_v9(float* address, unsigned long long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+
+    __global__ void atomicAnd_ulonglong_v10(double* address, unsigned long long* result) {
+      *result = atomicAnd(address, 1234);
+    }
+  )"};
@@ -0,0 +1,109 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicAnd_system atomicAnd_system
+ * @{
+ * @ingroup AtomicsTest
+ * `atomicAnd_system(TestType* address, TestType val)` -
+ * performs system-wide atomic bitwise AND between address and val, returns old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicAnd_system from multiple threads on the same address.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicAnd_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAnd_system_Positive_Peer_GPUs_Same_Address", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  constexpr auto kOp = Bitwise::AtomicOperation::kAndSystem;
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Same address " << pass) {
+      Bitwise::MultipleDeviceMultipleKernelTest<TestType, kOp>(2, 2, 1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicAnd_system from multiple threads on adjacent addresses.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicAnd_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAnd_system_Positive_Peer_GPUs_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOp = Bitwise::AtomicOperation::kAndSystem;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Adjacent address " << pass) {
+      Bitwise::MultipleDeviceMultipleKernelTest<TestType, kOp>(2, 2, warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicAnd_system from multiple threads on scattered addresses.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicAnd_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicAnd_system_Positive_Peer_GPUs_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOp = Bitwise::AtomicOperation::kAndSystem;
+  // Pitch handed to the helper in place of sizeof(TestType) to scatter the target elements.
+  constexpr unsigned int kScatterPitch = 128u;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Scattered address " << pass) {
+      Bitwise::MultipleDeviceMultipleKernelTest<TestType, kOp>(2, 2, warp_size, kScatterPitch);
+    }
+  }
+}
@@ -0,0 +1,172 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+#include "atomicCAS_negative_kernels_rtc.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicCAS atomicCAS
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+#ifdef HT_NVIDIA
+#define TYPES  // expands to nothing: no extra element types when HT_NVIDIA is defined
+#else
+#define TYPES , float, double  // appended to the TEMPLATE_TEST_CASE type lists below
+#endif
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * addition, implemented using an atomic CAS operation, on a target memory location. Each thread
+ * will add the same value to the memory location, storing the return value into a separate output
+ * array slot corresponding to it. Once complete, the output array and target memory is validated to
+ * contain all the expected values. Several memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicCAS
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Shared memory
+ *      - Several grid and block dimension combinations (only one block is used for shared memory).
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicCAS.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicCAS_Positive", "", int, unsigned int, unsigned long long TYPES) {
+  constexpr auto kOp = AtomicOperation::kCASAdd;
+  constexpr unsigned int kScatterPitch = 128u;  // pitch used by the scattered section
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Same address " << pass) {
+      SingleDeviceSingleKernelTest<TestType, kOp>(1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << pass) {
+      SingleDeviceSingleKernelTest<TestType, kOp>(warp_size, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << pass) {
+      SingleDeviceSingleKernelTest<TestType, kOp>(warp_size, kScatterPitch);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on a single device wherein all threads will perform
+ * an atomic addition, implemented using an atomic CAS operation, on a target memory location. Each
+ * thread will add the same value to the memory location, storing the return value into a separate
+ * output array slot corresponding to it. Once complete, the output array and target memory is
+ * validated to contain all the expected values. Several memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicCAS
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicCAS.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicCAS_Positive_Multi_Kernel", "", int, unsigned int,
+                   unsigned long long TYPES) {
+  constexpr auto kOp = AtomicOperation::kCASAdd;
+  // Pitch handed to the helper in place of sizeof(TestType) by the scattered section.
+  constexpr unsigned int kScatterPitch = 128u;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int pass = 0; pass < cmd_options.iterations; ++pass) {
+    DYNAMIC_SECTION("Same address " << pass) {
+      SingleDeviceMultipleKernelTest<TestType, kOp>(2, 1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << pass) {
+      SingleDeviceMultipleKernelTest<TestType, kOp>(2, warp_size, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << pass) {
+      SingleDeviceMultipleKernelTest<TestType, kOp>(2, warp_size, kScatterPitch);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - RTCs kernels that pass combinations of arguments of invalid types for all overloads of
+ * atomicCAS.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicCAS.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicCAS_Negative_Parameters_RTC") {
+  // One RTC source per overload; presumably defined in atomicCAS_negative_kernels_rtc.hh.
+  const auto program_source = GENERATE(kAtomicCAS_int, kAtomicCAS_uint, kAtomicCAS_ulong,
+                                       kAtomicCAS_ulonglong, kAtomicCAS_float, kAtomicCAS_double);
+  const int expected_error_count = 8;
+
+  hiprtcProgram program{};
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "atomicCAS_negative.cc", 0, nullptr, nullptr));
+  hiprtcResult result = hiprtcCompileProgram(program, 0, nullptr);
+
+  // Fetch the compile log so the diagnostics in it can be counted.
+  size_t log_size = 0;
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
+
+  // Count the occurrences of the "error:" tag in the log.
+  const std::string tag{"error:"};
+  int error_count = 0;
+  for (size_t hit = log.find(tag); hit != std::string::npos; hit = log.find(tag, hit + 1)) {
+    ++error_count;
+  }
+
+  // Release the program before asserting on the compilation outcome.
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,62 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+// Empty user-defined type; the kernels below pass a Dummy* as the atomicCAS address.
+struct Dummy {
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+#define ATOMIC_CAS_NEGATIVE_KERNEL(type_name) /* v1-v8: atomicCAS calls with mismatched types */   \
+  __global__ void atomicCAS_v1(type_name* address, type_name* result) {                            \
+    *result = atomicCAS(&address, 12, 13);                                                         \
+  }                                                                                                \
+  __global__ void atomicCAS_v2(type_name* address, type_name* result) {                            \
+    *result = atomicCAS(address, address, 13);                                                     \
+  }                                                                                                \
+  __global__ void atomicCAS_v3(type_name* address, type_name* result) {                            \
+    *result = atomicCAS(address, 12, address);                                                     \
+  }                                                                                                \
+  __global__ void atomicCAS_v4(Dummy* address, type_name* result) {                                \
+    *result = atomicCAS(address, 12, 13);                                                          \
+  }                                                                                                \
+  __global__ void atomicCAS_v5(char* address, type_name* result) {                                 \
+    *result = atomicCAS(address, 12, 13);                                                          \
+  }                                                                                                \
+  __global__ void atomicCAS_v6(short* address, type_name* result) {                                \
+    *result = atomicCAS(address, 12, 13);                                                          \
+  }                                                                                                \
+  __global__ void atomicCAS_v7(long* address, type_name* result) {                                 \
+    *result = atomicCAS(address, 12, 13);                                                          \
+  }                                                                                                \
+  __global__ void atomicCAS_v8(long long* address, type_name* result) {                            \
+    *result = atomicCAS(address, 12, 13);                                                          \
+  }
+// Stamp out the eight kernels once for each element type listed below.
+ATOMIC_CAS_NEGATIVE_KERNEL(int)
+ATOMIC_CAS_NEGATIVE_KERNEL(unsigned int)
+ATOMIC_CAS_NEGATIVE_KERNEL(unsigned long)
+ATOMIC_CAS_NEGATIVE_KERNEL(unsigned long long)
+ATOMIC_CAS_NEGATIVE_KERNEL(float)
+ATOMIC_CAS_NEGATIVE_KERNEL(double)
@@ -0,0 +1,273 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+Negative kernels used for the atomics negative Test Cases that are using RTC.
+*/
+
+static constexpr auto kAtomicCAS_int{  // RTC negative kernels, int atomicCAS
+    R"(
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicCAS_int_v1(int* address, int* result) {
+      *result = atomicCAS(&address, 12, 13);
+    }
+
+    __global__ void atomicCAS_int_v2(int* address, int* result) {
+      *result = atomicCAS(address, address, 13);
+    }
+
+    __global__ void atomicCAS_int_v3(int* address, int* result) {
+      *result = atomicCAS(address, 12, address);
+    }
+
+    __global__ void atomicCAS_int_v4(Dummy* address, int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_int_v5(char* address, int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_int_v6(short* address, int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_int_v7(long* address, int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_int_v8(long long* address, int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+  )"};
+
+static constexpr auto kAtomicCAS_uint{  // RTC negative kernels, unsigned int atomicCAS
+    R"(
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicCAS_uint_v1(unsigned int* address, unsigned int* result) {
+      *result = atomicCAS(&address, 12, 13);
+    }
+
+    __global__ void atomicCAS_uint_v2(unsigned int* address, unsigned int* result) {
+      *result = atomicCAS(address, address, 13);
+    }
+
+    __global__ void atomicCAS_uint_v3(unsigned int* address, unsigned int* result) {
+      *result = atomicCAS(address, 12, address);
+    }
+
+    __global__ void atomicCAS_uint_v4(Dummy* address, unsigned int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_uint_v5(char* address, unsigned int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_uint_v6(short* address, unsigned int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_uint_v7(long* address, unsigned int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_uint_v8(long long* address, unsigned int* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+  )"};
+
+static constexpr auto kAtomicCAS_ulong{  // RTC negative kernels, unsigned long atomicCAS
+    R"(
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicCAS_ulong_v1(unsigned long* address, unsigned long* result) {
+      *result = atomicCAS(&address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulong_v2(unsigned long* address, unsigned long* result) {
+      *result = atomicCAS(address, address, 13);
+    }
+
+    __global__ void atomicCAS_ulong_v3(unsigned long* address, unsigned long* result) {
+      *result = atomicCAS(address, 12, address);
+    }
+
+    __global__ void atomicCAS_ulong_v4(Dummy* address, unsigned long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulong_v5(char* address, unsigned long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulong_v6(short* address, unsigned long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulong_v7(long* address, unsigned long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulong_v8(long long* address, unsigned long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+  )"};
+
+static constexpr auto kAtomicCAS_ulonglong{  // RTC negative kernels, unsigned long long atomicCAS
+    R"(
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicCAS_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+      *result = atomicCAS(&address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+      *result = atomicCAS(address, address, 13);
+    }
+
+    __global__ void atomicCAS_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+      *result = atomicCAS(address, 12, address);
+    }
+
+    __global__ void atomicCAS_ulonglong_v4(Dummy* address, unsigned long long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulonglong_v5(char* address, unsigned long long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulonglong_v6(short* address, unsigned long long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulonglong_v7(long* address, unsigned long long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_ulonglong_v8(long long* address, unsigned long long* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+  )"};
+
+static constexpr auto kAtomicCAS_float{  // RTC negative kernels, float atomicCAS
+    R"(
+    class Dummy {
+      public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicCAS_float_v1(float* address, float* result) {
+      *result = atomicCAS(&address, 12, 13);
+    }
+
+    __global__ void atomicCAS_float_v2(float* address, float* result) {
+      *result = atomicCAS(address, address, 13);
+    }
+
+    __global__ void atomicCAS_float_v3(float* address, float* result) {
+      *result = atomicCAS(address, 12, address);
+    }
+
+    __global__ void atomicCAS_float_v4(Dummy* address, float* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_float_v5(char* address, float* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_float_v6(short* address, float* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_float_v7(long* address, float* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_float_v8(long long* address, float* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+  )"};
+
+static constexpr auto kAtomicCAS_double{  // RTC negative kernels, double atomicCAS
+    R"(
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicCAS_double_v1(double* address, double* result) {
+      *result = atomicCAS(&address, 12, 13);
+    }
+
+    __global__ void atomicCAS_double_v2(double* address, double* result) {
+      *result = atomicCAS(address, address, 13);
+    }
+
+    __global__ void atomicCAS_double_v3(double* address, double* result) {
+      *result = atomicCAS(address, 12, address);
+    }
+
+    __global__ void atomicCAS_double_v4(Dummy* address, double* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_double_v5(char* address, double* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_double_v6(short* address, double* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_double_v7(long* address, double* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+
+    __global__ void atomicCAS_double_v8(long long* address, double* result) {
+      *result = atomicCAS(address, 12, 13);
+    }
+  )"};
@@ -0,0 +1,185 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicCAS_system atomicCAS_system
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+#ifndef HT_NVIDIA
+#define TYPES , float, double  // extra TestType entries appended to the template test lists
+#else
+#define TYPES  // HT_NVIDIA builds append nothing
+#endif
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on two devices wherein all threads will perform
+ * an atomic addition, implemented using an atomic CAS operation, on a target memory location. Each
+ * thread will add the same value to the memory location, storing the return value into a separate
+ * output array slot corresponding to it. Once complete, the output array and target memory are
+ * validated to contain all the expected values. Several memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicCAS_system
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicCAS_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicCAS_system_Positive_Peer_GPUs", "", int, unsigned int,
+                   unsigned long long TYPES) {
+  constexpr unsigned int kCacheLineBytes = 128;  // pitch of the scattered access pattern
+  int warp_width{0};
+  HIP_CHECK(hipDeviceGetAttribute(&warp_width, hipDeviceAttributeWarpSize, 0));
+
+  for (auto iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {  // every thread hits one location
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          2, 2, 1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {  // warp-size elements, contiguous
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          2, 2, warp_width, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {  // warp-size elements, spread out
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          2, 2, warp_width, kCacheLineBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel on a single device wherein all threads will perform
+ * an atomic addition, implemented using an atomic CAS operation, on a target memory location.
+ * Each thread will add the same value to the memory location, storing the return value into a
+ * separate output array slot corresponding to it. While the kernel is running, the host
+ * performs atomic additions, in 4 threads, on the same memory location(s). Once complete, the
+ * output array and target memory are validated to contain all the expected values. Several
+ * memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicCAS_system
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicCAS_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicCAS_system_Positive_Host_And_GPU", "", int, unsigned int,
+                   unsigned long long TYPES) {
+  constexpr unsigned int kCacheLineBytes = 128;  // pitch of the scattered access pattern
+  int warp_width{0};
+  HIP_CHECK(hipDeviceGetAttribute(&warp_width, hipDeviceAttributeWarpSize, 0));
+
+  for (auto iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {  // every thread hits one location
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          1, 1, 1, sizeof(TestType), 4);
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {  // warp-size elements, contiguous
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          1, 1, warp_width, sizeof(TestType), 4);
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {  // warp-size elements, spread out
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          1, 1, warp_width, kCacheLineBytes, 4);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times on two devices wherein all threads will perform
+ * an atomic addition, implemented using an atomic CAS operation, on a target memory location.
+ * Each thread will add the same value to the memory location, storing the return value into a
+ * separate output array slot corresponding to it. While the kernel is running, the host
+ * performs atomic additions, in 4 threads, on the same memory location(s). Once complete, the
+ * output array and target memory are validated to contain all the expected values. Several
+ * memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicCAS_system
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicCAS_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicCAS_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int,
+                   unsigned long long TYPES) {
+  constexpr unsigned int kCacheLineBytes = 128;  // pitch of the scattered access pattern
+  int warp_width{0};
+  HIP_CHECK(hipDeviceGetAttribute(&warp_width, hipDeviceAttributeWarpSize, 0));
+
+  for (auto iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {  // every thread hits one location
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          2, 2, 1, sizeof(TestType), 4);
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {  // warp-size elements, contiguous
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          2, 2, warp_width, sizeof(TestType), 4);
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {  // warp-size elements, spread out
+      MultipleDeviceMultipleKernelAndHostTest<TestType, AtomicOperation::kCASAddSystem>(
+          2, 2, warp_width, kCacheLineBytes, 4);
+    }
+  }
+}
@@ -0,0 +1,164 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+#include "atomicDec_negative_kernels_rtc.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicDec atomicDec
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * decrement on a target memory location. Each thread will decrement the memory location,
+ * storing the return value into a separate output array slot corresponding to it. Once complete,
+ * the output array and target memory are validated to contain all the expected values. Several
+ * memory access patterns are tested:
+ *      -# All threads decrement a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicDec
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Shared memory
+ *      - Several grid and block dimension combinations (only one block is used for shared memory).
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicDec.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicDec_Positive", "", unsigned int) {
+  constexpr unsigned int kCacheLineBytes = 128;  // pitch of the scattered access pattern
+  int warp_width{0};
+  HIP_CHECK(hipDeviceGetAttribute(&warp_width, hipDeviceAttributeWarpSize, 0));
+
+  for (auto iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {  // every thread hits one location
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kDec>(1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {  // warp-size elements, contiguous
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kDec>(warp_width, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {  // warp-size elements, spread out
+      SingleDeviceSingleKernelTest<TestType, AtomicOperation::kDec>(warp_width, kCacheLineBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on a single device wherein all threads will perform
+ * an atomic decrement on a target memory location. Each thread will decrement the memory
+ * location, storing the return value into a separate output array slot corresponding to it. Once
+ * complete, the output array and target memory are validated to contain all the expected values.
+ * Several memory access patterns are tested:
+ *      -# All threads decrement a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicDec
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicDec.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicDec_Positive_Multi_Kernel", "", unsigned int) {
+  constexpr unsigned int kCacheLineBytes = 128;  // pitch of the scattered access pattern
+  int warp_width{0};
+  HIP_CHECK(hipDeviceGetAttribute(&warp_width, hipDeviceAttributeWarpSize, 0));
+
+  for (auto iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {  // every thread hits one location
+      SingleDeviceMultipleKernelTest<TestType, AtomicOperation::kDec>(2, 1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {  // warp-size elements, contiguous
+      SingleDeviceMultipleKernelTest<TestType, AtomicOperation::kDec>(2, warp_width,
+                                                                      sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {  // warp-size elements, spread out
+      SingleDeviceMultipleKernelTest<TestType, AtomicOperation::kDec>(2, warp_width,
+                                                                      kCacheLineBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Runtime-compiles (hiprtc) kernels that pass invalid argument type combinations to all
+ * overloads of atomicDec and verifies that compilation fails with the expected error count.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicDec.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicDec_Negative_Parameters_RTC") {
+  // Only one kernel source is generated for atomicDec.
+  const auto program_source = GENERATE(kAtomicDec_uint);
+
+  hiprtcProgram program{};
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "atomicDec_negative.cc", 0, nullptr, nullptr));
+  hiprtcResult result = hiprtcCompileProgram(program, 0, nullptr);
+
+  // Pull the compiler output into a host string.
+  size_t log_length = 0;
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_length));
+  std::string compile_log(log_length, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, compile_log.data()));
+
+  // Tally the "error:" markers; the source holds eight kernels and eight markers are expected.
+  const std::string marker{"error:"};
+  int expected_error_count{8};
+  int error_count{0};
+  for (auto pos = compile_log.find(marker); pos != std::string::npos;
+       pos = compile_log.find(marker, pos + 1)) {
+    ++error_count;
+  }
+
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,62 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+// Empty user-defined type; atomicDec_uint_v4 below passes a Dummy* as the atomicDec address.
+struct Dummy {
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+/* unsigned int atomicDec(unsigned int* address, unsigned int val) */
+__global__ void atomicDec_uint_v1(unsigned int* address, unsigned int* result) {
+  *result = atomicDec(&address, 1234);  // passes unsigned int** (address of the pointer)
+}
+
+__global__ void atomicDec_uint_v2(unsigned int* address, unsigned int* result) {
+  *result = atomicDec(address, address);  // passes the pointer itself as the second argument
+}
+
+__global__ void atomicDec_uint_v3(unsigned int* address, unsigned int* result) {
+  *result = atomicDec(1234, 1234);  // passes an integer literal as the first (address) argument
+}
+
+__global__ void atomicDec_uint_v4(Dummy* address, unsigned int* result) {
+  *result = atomicDec(address, 1234);  // address is Dummy*, a class type pointer
+}
+
+__global__ void atomicDec_uint_v5(char* address, unsigned int* result) {
+  *result = atomicDec(address, 1234);  // address is char*
+}
+
+__global__ void atomicDec_uint_v6(short* address, unsigned int* result) {
+  *result = atomicDec(address, 1234);  // address is short*
+}
+
+__global__ void atomicDec_uint_v7(long* address, unsigned int* result) {
+  *result = atomicDec(address, 1234);  // address is long*
+}
+
+__global__ void atomicDec_uint_v8(long long* address, unsigned int* result) {
+  *result = atomicDec(address, 1234);  // address is long long*
+}
@@ -0,0 +1,68 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+Negative kernels used for the atomics negative Test Cases that are using RTC.
+*/
+
+static constexpr auto kAtomicDec_uint{  // RTC negative kernels, unsigned int atomicDec
+    R"(
+    __global__ void atomicDec_uint_v1(unsigned int* address, unsigned int* result) {
+      *result = atomicDec(&address, 1234);
+    }
+
+    __global__ void atomicDec_uint_v2(unsigned int* address, unsigned int* result) {
+      *result = atomicDec(address, address);
+    }
+
+    __global__ void atomicDec_uint_v3(unsigned int* address, unsigned int* result) {
+      *result = atomicDec(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicDec_uint_v4(Dummy* address, unsigned int* result) {
+      *result = atomicDec(address, 1234);
+    }
+
+    __global__ void atomicDec_uint_v5(char* address, unsigned int* result) {
+      *result = atomicDec(address, 1234);
+    }
+
+    __global__ void atomicDec_uint_v6(short* address, unsigned int* result) {
+      *result = atomicDec(address, 1234);
+    }
+
+    __global__ void atomicDec_uint_v7(long* address, unsigned int* result) {
+      *result = atomicDec(address, 1234);
+    }
+
+    __global__ void atomicDec_uint_v8(long long* address, unsigned int* result) {
+      *result = atomicDec(address, 1234);
+    }
+  )"};
@@ -24,22 +24,26 @@ THE SOFTWARE.

 #include <numeric>

+#include <cmd_options.hh>
 #include <hip_test_common.hh>
 #include <resource_guards.hh>
 #include <hip/hip_cooperative_groups.h>
-#include <cmd_options.hh>

-enum class AtomicScopes { device, system };
+enum class AtomicScopes { device, system, builtin };

-template <typename T, AtomicScopes scope> __device__ T perform_atomic_exch(T* address, T val) {
+template <typename T, AtomicScopes scope, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+__device__ T perform_atomic_exch(T* address, T val) {
  if constexpr (scope == AtomicScopes::device) {
    return atomicExch(address, val);
  } else if (scope == AtomicScopes::system) {
    return atomicExch_system(address, val);
+  } else if (scope == AtomicScopes::builtin) {
+    return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, memory_scope);
  }
 }

-template <typename T, bool use_shared_mem, AtomicScopes scope>
+template <typename T, bool use_shared_mem, AtomicScopes scope,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 __global__ void atomic_exch_kernel_compile_time(T* const global_mem, T* const old_vals) {
  __shared__ T shared_mem;

@@ -52,7 +56,7 @@ __global__ void atomic_exch_kernel_compile_time(T* const global_mem, T* const ol
    __syncthreads();
  }

-  old_vals[tid] = perform_atomic_exch<T, scope>(mem, static_cast<T>(tid + 1));
+  old_vals[tid] = perform_atomic_exch<T, scope, memory_scope>(mem, static_cast<T>(tid + 1));

  if constexpr (use_shared_mem) {
    __syncthreads();
@@ -67,7 +71,16 @@ __host__ __device__ T* pitched_offset(T* const ptr, const unsigned int pitch,
  return reinterpret_cast<T*>(byte_ptr + idx * pitch);
 }

-template <typename T, bool use_shared_mem, AtomicScopes scope>
+// Reads, XORs with 0xAB and writes back each byte of [begin_addr, end_addr) via a volatile pointer.
+__device__ void generate_memory_traffic(uint8_t* const begin_addr, uint8_t* const end_addr) {
+  for (volatile uint8_t* cursor = begin_addr; cursor != end_addr; ++cursor) {
+    const uint8_t flipped = *cursor ^ 0xAB;
+    *cursor = flipped;
+  }
+}
+
+template <typename T, bool use_shared_mem, AtomicScopes scope,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 __global__ void atomic_exch_kernel(T* const global_mem, T* const old_vals, const unsigned int width,
                                   const unsigned pitch, const T base_val = 0) {
  extern __shared__ uint8_t shared_mem[];
@@ -84,8 +97,18 @@ __global__ void atomic_exch_kernel(T* const global_mem, T* const old_vals, const
    __syncthreads();
  }

-  old_vals[tid] = perform_atomic_exch<T, scope>(pitched_offset(mem, pitch, tid % width),
-                                                base_val + static_cast<T>(tid + width));
+  const auto n = cooperative_groups::this_grid().size() - width;
+
+  T* atomic_addr = pitched_offset(mem, pitch, tid % width);
+
+  if (tid < n) {
+    old_vals[tid] = perform_atomic_exch<T, scope, memory_scope>(
+        pitched_offset(mem, pitch, tid % width), base_val + static_cast<T>(tid + width));
+  } else {
+    uint8_t* const begin_addr = reinterpret_cast<uint8_t*>(atomic_addr + 1);
+    uint8_t* const end_addr = reinterpret_cast<uint8_t*>(atomic_addr) + pitch;
+    generate_memory_traffic(begin_addr, end_addr);
+  }

  if constexpr (use_shared_mem) {
    __syncthreads();
@@ -255,14 +278,16 @@ class AtomicExchCRTP {
  }
 };

-template <typename T, bool use_shared_mem, AtomicScopes scope>
+template <typename T, bool use_shared_mem, AtomicScopes scope,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 class AtomicExch
-    : public AtomicExchCRTP<AtomicExch<T, use_shared_mem, scope>, T, use_shared_mem, scope> {
+    : public AtomicExchCRTP<AtomicExch<T, use_shared_mem, scope, memory_scope>, T, use_shared_mem, scope> {
 public:
  void LaunchKernel(const unsigned int shared_mem_size, const hipStream_t stream, T* const mem,
                    T* const old_vals, const T base_val, const AtomicExchParams& p) const {
-    atomic_exch_kernel<T, use_shared_mem, scope><<<p.blocks, p.threads, shared_mem_size, stream>>>(
-        mem, old_vals, p.width, p.pitch, base_val);
+    atomic_exch_kernel<T, use_shared_mem, scope, memory_scope>
+        <<<p.blocks, p.threads, shared_mem_size, stream>>>(mem, old_vals, p.width, p.pitch,
+                                                           base_val);
  }

  void ValidateResults(std::vector<T>& old_vals) const {
@@ -281,23 +306,39 @@ inline dim3 GenerateAtomicExchBlockDimensions() {
  return GENERATE_COPY(dim3(sm_count), dim3(sm_count + sm_count / 2));
 }

-template <typename TestType, AtomicScopes scope>
+template <typename TestType, AtomicScopes scope, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
 void AtomicExchSingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) {
  AtomicExchParams params;
  params.num_devices = 1;
  params.kernel_count = 1;
-  params.threads = GenerateAtomicExchThreadDimensions();
+  if constexpr (scope == AtomicScopes::builtin && memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD) {
+    params.threads = 1;
+  } else if constexpr (scope == AtomicScopes::builtin &&
+                       memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT) {
+    int warp_size = 0;
+    HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+    params.threads = dim3(warp_size);
+  } else {
+    params.threads = GenerateAtomicExchThreadDimensions();
+  }
  params.width = width;
  params.pitch = pitch;

  SECTION("Global memory") {
-    params.blocks = GenerateAtomicExchBlockDimensions();
+    if constexpr (scope == AtomicScopes::builtin &&
+                  (memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD ||
+                   memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT ||
+                   memory_scope == __HIP_MEMORY_SCOPE_WORKGROUP)) {
+      params.blocks = dim3(1);
+    } else {
+      params.blocks = GenerateAtomicExchBlockDimensions();
+    }
    using LA = LinearAllocs;
    for (const auto alloc_type :
         {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
      params.alloc_type = alloc_type;
      DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
-        AtomicExch<TestType, false, scope>().run(params);
+        AtomicExch<TestType, false, scope, memory_scope>().run(params);
      }
    }
  }
@@ -305,7 +346,7 @@ void AtomicExchSingleDeviceSingleKernelTest(const unsigned int width, const unsi
  SECTION("Shared memory") {
    params.blocks = dim3(1);
    params.alloc_type = LinearAllocs::hipMalloc;
-    AtomicExch<TestType, true, scope>().run(params);
+    AtomicExch<TestType, true, scope, memory_scope>().run(params);
  }
 }

@@ -0,0 +1,164 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+#include "atomicInc_negative_kernels_rtc.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicInc atomicInc
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * increment on a target memory location. Each thread will increment the memory location,
+ * storing the return value into a separate output array slot corresponding to it. Once complete,
+ * the output array and target memory is validated to contain all the expected values. Several
+ * memory access patterns are tested:
+ *      -# All threads increment a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicInc
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Shared memory
+ *      - Several grid and block dimension combinations (only one block is used for shared memory).
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicInc.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicInc_Positive", "", unsigned int) {
+  constexpr auto kOp = AtomicOperation::kInc;  // atomic operation exercised by every section
+  const auto cache_line_size = 128u;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  // Each iteration runs the three access patterns in separate, uniquely named sections.
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Same address " << run) {
+      SingleDeviceSingleKernelTest<TestType, kOp>(1, sizeof(TestType));
+    }
+    DYNAMIC_SECTION("Adjacent addresses " << run) {
+      SingleDeviceSingleKernelTest<TestType, kOp>(warp_size, sizeof(TestType));
+    }
+    DYNAMIC_SECTION("Scattered addresses " << run) {
+      SingleDeviceSingleKernelTest<TestType, kOp>(warp_size, cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on a single device wherein all threads will
+ * perform an atomic increment on a target memory location. Each thread will increment the memory
+ * location, storing the return value into a separate output array slot corresponding to it. Once
+ * complete, the output array and target memory is validated to contain all the expected values.
+ * Several memory access patterns are tested:
+ *      -# All threads increment a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicInc
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicInc.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicInc_Positive_Multi_Kernel", "", unsigned int) {
+  constexpr auto kOp = AtomicOperation::kInc;  // atomic operation exercised by every section
+  const auto cache_line_size = 128u;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  // Each iteration runs the three access patterns in separate, uniquely named sections.
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Same address " << run) {
+      SingleDeviceMultipleKernelTest<TestType, kOp>(2, 1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << run) {
+      SingleDeviceMultipleKernelTest<TestType, kOp>(2, warp_size, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << run) {
+      SingleDeviceMultipleKernelTest<TestType, kOp>(2, warp_size, cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - RTCs kernels that pass combinations of arguments of invalid types for all overloads of
+ * atomicInc.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicInc.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicInc_Negative_Parameters_RTC") {
+  // The RTC source defines eight kernels; eight "error:" entries are expected in the log.
+  const int expected_error_count{8};
+  const std::string marker{"error:"};
+
+  hiprtcProgram program{};
+  const auto program_source = GENERATE(kAtomicInc_uint);
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "atomicInc_negative.cc", 0, nullptr, nullptr));
+  hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};
+
+  // Retrieve the compile log; the compilation result itself is checked at the end.
+  size_t log_size{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
+
+  // Count every occurrence of the marker in the log.
+  int error_count{0};
+  for (size_t pos = log.find(marker); pos != std::string::npos; pos = log.find(marker, pos + 1)) {
+    ++error_count;
+  }
+
+  // Destroy the program before asserting on the collected results.
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,62 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+class Dummy {  // class type whose pointer is passed as the address in atomicInc_uint_v4
+ public:
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+/* unsigned int atomicInc(unsigned int* address, unsigned int val): v1-v8 pass mismatched types */
+__global__ void atomicInc_uint_v1(unsigned int* address, unsigned int* result) {
+  *result = atomicInc(&address, 1234);  // address argument is unsigned int**
+}
+
+__global__ void atomicInc_uint_v2(unsigned int* address, unsigned int* result) {
+  *result = atomicInc(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicInc_uint_v3(unsigned int* address, unsigned int* result) {
+  *result = atomicInc(1234, 1234);  // address argument is a literal, not a pointer
+}
+
+__global__ void atomicInc_uint_v4(Dummy* address, unsigned int* result) {
+  *result = atomicInc(address, 1234);
+}
+
+__global__ void atomicInc_uint_v5(char* address, unsigned int* result) {
+  *result = atomicInc(address, 1234);
+}
+
+__global__ void atomicInc_uint_v6(short* address, unsigned int* result) {
+  *result = atomicInc(address, 1234);
+}
+
+__global__ void atomicInc_uint_v7(long* address, unsigned int* result) {
+  *result = atomicInc(address, 1234);
+}
+
+__global__ void atomicInc_uint_v8(long long* address, unsigned int* result) {
+  *result = atomicInc(address, 1234);
+}
@@ -0,0 +1,68 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+RTC source string with eight kernels (v1-v8) that call atomicInc with mismatched argument types.
+*/
+
+static constexpr auto kAtomicInc_uint{
+    R"(
+    __global__ void atomicInc_uint_v1(unsigned int* address, unsigned int* result) {
+      *result = atomicInc(&address, 1234);
+    }
+
+    __global__ void atomicInc_uint_v2(unsigned int* address, unsigned int* result) {
+      *result = atomicInc(address, address);
+    }
+
+    __global__ void atomicInc_uint_v3(unsigned int* address, unsigned int* result) {
+      *result = atomicInc(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicInc_uint_v4(Dummy* address, unsigned int* result) {
+      *result = atomicInc(address, 1234);
+    }
+
+    __global__ void atomicInc_uint_v5(char* address, unsigned int* result) {
+      *result = atomicInc(address, 1234);
+    }
+
+    __global__ void atomicInc_uint_v6(short* address, unsigned int* result) {
+      *result = atomicInc(address, 1234);
+    }
+
+    __global__ void atomicInc_uint_v7(long* address, unsigned int* result) {
+      *result = atomicInc(address, 1234);
+    }
+
+    __global__ void atomicInc_uint_v8(long long* address, unsigned int* result) {
+      *result = atomicInc(address, 1234);
+    }
+  )"};
@@ -0,0 +1,222 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "atomicMax_negative_kernels_rtc.hh"
+#include "min_max_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicMax atomicMax
+ * @{
+ * @ingroup AtomicsTest
+ * `atomicMax(TestType* address, TestType val)` -
+ * stores the maximum of the value at address and val, returns the old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMax from multiple threads on the same address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicMax_Positive_SameAddress", "", int, unsigned int, unsigned long,
+                   unsigned long long, float, double) {
+  constexpr auto kOp = MinMax::AtomicOperation::kMax;  // atomic operation under test
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Same address " << run) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOp>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMax from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicMax_Positive_Adjacent_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+  constexpr auto kOp = MinMax::AtomicOperation::kMax;  // atomic operation under test
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  // Repeat the scenario once per requested iteration, each in its own named section.
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Adjacent address " << run) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOp>(warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMax from multiple threads on scattered addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicMax_Positive_Scattered_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+  constexpr auto kOp = MinMax::AtomicOperation::kMax;  // atomic operation under test
+  const auto cache_line_size = 128u;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  // Repeat the scenario once per requested iteration, each in its own named section.
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Scattered address " << run) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOp>(warp_size, cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMax from multiple threads on the same address.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicMax_Positive_Multi_Kernel_Same_Address", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+  constexpr auto kOp = MinMax::AtomicOperation::kMax;  // atomic operation under test
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Same address " << run) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOp>(2, 1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMax from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicMax_Positive_Multi_Kernel_Adjacent_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+  constexpr auto kOp = MinMax::AtomicOperation::kMax;  // atomic operation under test
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  // Repeat the scenario once per requested iteration, each in its own named section.
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Adjacent address " << run) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOp>(2, warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMax from multiple threads on scattered addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicMax_Positive_Multi_Kernel_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long, float, double) {
+  constexpr auto kOp = MinMax::AtomicOperation::kMax;  // atomic operation under test
+  const auto cache_line_size = 128u;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  // Repeat the scenario once per requested iteration, each in its own named section.
+  for (auto run = 0; run < cmd_options.iterations; ++run) {
+    DYNAMIC_SECTION("Scattered address " << run) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOp>(2, warp_size, cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Compiles atomicMax with invalid parameters.
+ *  - Compiles the source with RTC.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicMax_Negative_Parameters_RTC") {
+  // Expected number of "error:" entries per source from atomicMax_negative_kernels_rtc.hh.
+  const int expected_error_count{8};
+  const std::string marker{"error:"};
+
+  hiprtcProgram program{};
+  const auto program_source = GENERATE(kAtomicMax_int, kAtomicMax_uint, kAtomicMax_ulong,
+                                       kAtomicMax_ulonglong, kAtomicMax_float, kAtomicMax_double);
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "atomicMax_negative.cc", 0, nullptr, nullptr));
+  hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};
+
+  // Retrieve the compile log; the compilation result itself is checked at the end.
+  size_t log_size{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
+
+  // Count every occurrence of the marker in the log.
+  int error_count{0};
+  for (size_t pos = log.find(marker); pos != std::string::npos; pos = log.find(marker, pos + 1)) {
+    ++error_count;
+  }
+
+  // Destroy the program before asserting on the collected results.
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,219 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+class Dummy {  // class type whose pointer is passed as the address by the *_v4 kernels
+ public:
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+/* int atomicMax(int* address, int val): v1-v8 pass mismatched argument types */
+__global__ void atomicMax_int_v1(int* address, int* result) { *result = atomicMax(&address, 1234); }
+
+__global__ void atomicMax_int_v2(int* address, int* result) {
+  *result = atomicMax(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicMax_int_v3(int* address, int* result) { *result = atomicMax(1234, 1234); }
+
+__global__ void atomicMax_int_v4(Dummy* address, int* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_int_v5(char* address, int* result) { *result = atomicMax(address, 1234); }
+
+__global__ void atomicMax_int_v6(short* address, int* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_int_v7(long* address, int* result) { *result = atomicMax(address, 1234); }
+
+__global__ void atomicMax_int_v8(long long* address, int* result) {
+  *result = atomicMax(address, 1234);
+}
+
+/* unsigned int atomicMax(unsigned int* address, unsigned int val): v1-v8 pass mismatched types */
+__global__ void atomicMax_uint_v1(unsigned int* address, unsigned int* result) {
+  *result = atomicMax(&address, 1234);  // address argument is unsigned int**
+}
+
+__global__ void atomicMax_uint_v2(unsigned int* address, unsigned int* result) {
+  *result = atomicMax(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicMax_uint_v3(unsigned int* address, unsigned int* result) {
+  *result = atomicMax(1234, 1234);  // address argument is a literal, not a pointer
+}
+
+__global__ void atomicMax_uint_v4(Dummy* address, unsigned int* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_uint_v5(char* address, unsigned int* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_uint_v6(short* address, unsigned int* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_uint_v7(long* address, unsigned int* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_uint_v8(long long* address, unsigned int* result) {
+  *result = atomicMax(address, 1234);
+}
+
+/* unsigned long atomicMax(unsigned long* address, unsigned long val) */
+__global__ void atomicMax_ulong_v1(unsigned long* address, unsigned long* result) {
+  *result = atomicMax(&address, 1234);  // address argument is unsigned long**
+}
+
+__global__ void atomicMax_ulong_v2(unsigned long* address, unsigned long* result) {
+  *result = atomicMax(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicMax_ulong_v3(unsigned long* address, unsigned long* result) {
+  *result = atomicMax(1234, 1234);  // address argument is a literal, not a pointer
+}
+
+__global__ void atomicMax_ulong_v4(Dummy* address, unsigned long* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_ulong_v5(char* address, unsigned long* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_ulong_v6(short* address, unsigned long* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_ulong_v7(long* address, unsigned long* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_ulong_v8(long long* address, unsigned long* result) {
+  *result = atomicMax(address, 1234);
+}
+
+/* unsigned long long atomicMax(unsigned long long* address, unsigned long long val) */
+__global__ void atomicMax_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+  *result = atomicMax(&address, 1234);  // address argument is unsigned long long**
+}
+
+__global__ void atomicMax_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+  *result = atomicMax(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicMax_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+  *result = atomicMax(1234, 1234);  // address argument is a literal, not a pointer
+}
+
+__global__ void atomicMax_ulonglong_v4(Dummy* address, unsigned long long* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_ulonglong_v5(char* address, unsigned long long* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_ulonglong_v6(short* address, unsigned long long* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_ulonglong_v7(long* address, unsigned long long* result) {
+  *result = atomicMax(address, 1234);
+}
+
+__global__ void atomicMax_ulonglong_v8(long long* address, unsigned long long* result) {
+  *result = atomicMax(address, 1234);
+}
+
+/* float atomicMax(float* address, float val) */
+__global__ void atomicMax_float_v1(float* address, float* result) {
+  *result = atomicMax(&address, 1234.f);  // address argument is float**
+}
+
+__global__ void atomicMax_float_v2(float* address, float* result) {
+  *result = atomicMax(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicMax_float_v3(float* address, float* result) {
+  *result = atomicMax(1234.f, 1234.f);  // address argument is a literal, not a pointer
+}
+
+__global__ void atomicMax_float_v4(Dummy* address, float* result) {
+  *result = atomicMax(address, 1234.f);
+}
+
+__global__ void atomicMax_float_v5(char* address, float* result) {
+  *result = atomicMax(address, 1234.f);
+}
+
+__global__ void atomicMax_float_v6(short* address, float* result) {
+  *result = atomicMax(address, 1234.f);
+}
+
+__global__ void atomicMax_float_v7(long* address, float* result) {
+  *result = atomicMax(address, 1234.f);
+}
+
+__global__ void atomicMax_float_v8(long long* address, float* result) {
+  *result = atomicMax(address, 1234);  // NOTE(review): int literal; siblings pass 1234.f - confirm
+}
+
+/* double atomicMax(double* address, double val) */
+__global__ void atomicMax_double_v1(double* address, double* result) {
+  *result = atomicMax(&address, 1234.0);  // address argument is double**
+}
+
+__global__ void atomicMax_double_v2(double* address, double* result) {
+  *result = atomicMax(address, address);  // val argument is a pointer
+}
+
+__global__ void atomicMax_double_v3(double* address, double* result) {
+  *result = atomicMax(1234.0, 1234.0);  // address argument is a literal, not a pointer
+}
+
+__global__ void atomicMax_double_v4(Dummy* address, double* result) {
+  *result = atomicMax(address, 1234.0);
+}
+
+__global__ void atomicMax_double_v5(char* address, double* result) {
+  *result = atomicMax(address, 1234.0);
+}
+
+__global__ void atomicMax_double_v6(short* address, double* result) {
+  *result = atomicMax(address, 1234.0);
+}
+
+__global__ void atomicMax_double_v7(long* address, double* result) {
+  *result = atomicMax(address, 1234.0);
+}
+
+__global__ void atomicMax_double_v8(long long* address, double* result) {
+  *result = atomicMax(address, 1234.0);
+}
@@ -0,0 +1,273 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+Negative kernels used for the atomics negative Test Cases that are using RTC.
+*/
+
+static constexpr auto kAtomicMax_int{
+    R"(
+    __global__ void atomicMax_int_v1(int* address, int* result) {
+      *result = atomicMax(&address, 1234);
+    }
+
+    __global__ void atomicMax_int_v2(int* address, int* result) {
+      *result = atomicMax(address, address);
+    }
+
+    __global__ void atomicMax_int_v3(int* address, int* result) {
+      *result = atomicMax(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicMax_int_v4(Dummy* address, int* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_int_v5(char* address, int* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_int_v6(short* address, int* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_int_v7(long* address, int* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_int_v8(long long* address, int* result) {
+      *result = atomicMax(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicMax_uint{
+    R"(
+    __global__ void atomicMax_uint_v1(unsigned int* address, unsigned int* result) {
+      *result = atomicMax(&address, 1234);
+    }
+
+    __global__ void atomicMax_uint_v2(unsigned int* address, unsigned int* result) {
+      *result = atomicMax(address, address);
+    }
+
+    __global__ void atomicMax_uint_v3(unsigned int* address, unsigned int* result) {
+      *result = atomicMax(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicMax_uint_v4(Dummy* address, unsigned int* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_uint_v5(char* address, unsigned int* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_uint_v6(short* address, unsigned int* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_uint_v7(long* address, unsigned int* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_uint_v8(long long* address, unsigned int* result) {
+      *result = atomicMax(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicMax_ulong{
+    R"(
+    __global__ void atomicMax_ulong_v1(unsigned long* address, unsigned long* result) {
+      *result = atomicMax(&address, 1234);
+    }
+
+    __global__ void atomicMax_ulong_v2(unsigned long* address, unsigned long* result) {
+      *result = atomicMax(address, address);
+    }
+
+    __global__ void atomicMax_ulong_v3(unsigned long* address, unsigned long* result) {
+      *result = atomicMax(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicMax_ulong_v4(Dummy* address, unsigned long* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_ulong_v5(char* address, unsigned long* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_ulong_v6(short* address, unsigned long* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_ulong_v7(long* address, unsigned long* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_ulong_v8(long long* address, unsigned long* result) {
+      *result = atomicMax(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicMax_ulonglong{
+    R"(
+    __global__ void atomicMax_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+      *result = atomicMax(&address, 1234);
+    }
+
+    __global__ void atomicMax_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+      *result = atomicMax(address, address);
+    }
+
+    __global__ void atomicMax_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+      *result = atomicMax(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicMax_ulonglong_v4(Dummy* address, unsigned long long* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_ulonglong_v5(char* address, unsigned long long* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_ulonglong_v6(short* address, unsigned long long* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_ulonglong_v7(long* address, unsigned long long* result) {
+      *result = atomicMax(address, 1234);
+    }
+
+    __global__ void atomicMax_ulonglong_v8(long long* address, unsigned long long* result) {
+      *result = atomicMax(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicMax_float{
+    R"(
+    __global__ void atomicMax_float_v1(float* address, float* result) {
+      *result = atomicMax(&address, 1234.f);
+    }
+
+    __global__ void atomicMax_float_v2(float* address, float* result) {
+      *result = atomicMax(address, address);
+    }
+
+    __global__ void atomicMax_float_v3(float* address, float* result) {
+      *result = atomicMax(1234.f, 1234.f);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicMax_float_v4(Dummy* address, float* result) {
+      *result = atomicMax(address, 1234.f);
+    }
+
+    __global__ void atomicMax_float_v5(char* address, float* result) {
+      *result = atomicMax(address, 1234.f);
+    }
+
+    __global__ void atomicMax_float_v6(short* address, float* result) {
+      *result = atomicMax(address, 1234.f);
+    }
+
+    __global__ void atomicMax_float_v7(long* address, float* result) {
+      *result = atomicMax(address, 1234.f);
+    }
+
+    __global__ void atomicMax_float_v8(long long* address, float* result) {
+      *result = atomicMax(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicMax_double{
+    R"(
+    __global__ void atomicMax_double_v1(double* address, double* result) {
+      *result = atomicMax(&address, 1234.0);
+    }
+
+    __global__ void atomicMax_double_v2(double* address, double* result) {
+      *result = atomicMax(address, address);
+    }
+
+    __global__ void atomicMax_double_v3(double* address, double* result) {
+      *result = atomicMax(1234.0, 1234.0);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicMax_double_v4(Dummy* address, double* result) {
+      *result = atomicMax(address, 1234.0);
+    }
+
+    __global__ void atomicMax_double_v5(char* address, double* result) {
+      *result = atomicMax(address, 1234.0);
+    }
+
+    __global__ void atomicMax_double_v6(short* address, double* result) {
+      *result = atomicMax(address, 1234.0);
+    }
+
+    __global__ void atomicMax_double_v7(long* address, double* result) {
+      *result = atomicMax(address, 1234.0);
+    }
+
+    __global__ void atomicMax_double_v8(long long* address, double* result) {
+      *result = atomicMax(address, 1234.0);
+    }
+  )"};
@@ -0,0 +1,124 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "min_max_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicMax_system atomicMax_system
+ * @{
+ * @ingroup AtomicsTest
+ * `atomicMax_system(TestType* address, TestType val)` -
+ * performs system-wide atomic maximum between address and val, returns old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMax_system from multiple threads on the same address.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMax_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+#if HT_AMD
+TEMPLATE_TEST_CASE("Unit_atomicMax_system_Positive_Peer_GPUs_Same_Address", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+#else
+TEMPLATE_TEST_CASE("Unit_atomicMax_system_Positive_Peer_GPUs_Same_Address", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+#endif
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      MinMax::MultipleDeviceMultipleKernelTest<TestType, MinMax::AtomicOperation::kMaxSystem>(
+          2, 2, 1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMax_system from multiple threads on adjacent addresses.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMax_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+#if HT_AMD
+TEMPLATE_TEST_CASE("Unit_atomicMax_system_Positive_Peer_GPUs_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long, float, double) {
+#else
+TEMPLATE_TEST_CASE("Unit_atomicMax_system_Positive_Peer_GPUs_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+#endif
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      MinMax::MultipleDeviceMultipleKernelTest<TestType, MinMax::AtomicOperation::kMaxSystem>(
+          2, 2, warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMax_system from multiple threads on scattered addresses.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMax_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+#if HT_AMD
+TEMPLATE_TEST_CASE("Unit_atomicMax_system_Positive_Peer_GPUs_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long, float, double) {
+#else
+TEMPLATE_TEST_CASE("Unit_atomicMax_system_Positive_Peer_GPUs_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+#endif
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      MinMax::MultipleDeviceMultipleKernelTest<TestType, MinMax::AtomicOperation::kMaxSystem>(
+          2, 2, warp_size, cache_line_size);
+    }
+  }
+}
@@ -0,0 +1,222 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "atomicMin_negative_kernels_rtc.hh"
+#include "min_max_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicMin atomicMin
+ * @{
+ * @ingroup AtomicsTest
+ * `atomicMin(TestType* address, TestType val)` -
+ * calculates minimum between address and val, returns old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMin from multiple threads on the same address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicMin_Positive_SameAddress", "", int, unsigned int, unsigned long,
+                   unsigned long long, float, double) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kMin>(
+          1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMin from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicMin_Positive_Adjacent_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kMin>(
+          warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMin from multiple threads on the scattered addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicMin_Positive_Scattered_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, MinMax::AtomicOperation::kMin>(
+          warp_size, cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMin from multiple threads on the same address.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicMin_Positive_Multi_Kernel_Same_Address", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Same address " << current) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, MinMax::AtomicOperation::kMin>(
+          2, 1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMin from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicMin_Positive_Multi_Kernel_Adjacent_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, MinMax::AtomicOperation::kMin>(
+          2, warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMin from multiple threads on the scattered addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicMin_Positive_Multi_Kernel_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long, float, double) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  const auto cache_line_size = 128u;
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {
+    DYNAMIC_SECTION("Scattered address " << current) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, MinMax::AtomicOperation::kMin>(
+          2, warp_size, cache_line_size);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Compiles atomicMin with invalid parameters.
+ *  - Compiles the source with RTC.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicMin_Negative_Parameters_RTC") {
+  hiprtcProgram program{};
+
+  const auto program_source = GENERATE(kAtomicMin_int, kAtomicMin_uint, kAtomicMin_ulong,
+                                       kAtomicMin_ulonglong, kAtomicMin_float, kAtomicMin_double);
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "atomicMin_negative.cc", 0, nullptr, nullptr));
+  hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};
+
+  // Fetch the compile log and count the occurrences of "error:" it contains.
+  size_t log_size{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
+  int error_count{0};
+  // Each source in atomicMin_negative_kernels_rtc.hh is expected to produce 8 compile errors.
+  int expected_error_count{8};
+  std::string error_message{"error:"};
+
+  size_t n_pos = log.find(error_message, 0);
+  while (n_pos != std::string::npos) {
+    ++error_count;
+    n_pos = log.find(error_message, n_pos + 1);
+  }
+
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,219 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+class Dummy {
+ public:
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+/* int atomicMin(int* address, int val) */
+__global__ void atomicMin_int_v1(int* address, int* result) { *result = atomicMin(&address, 1234); }
+
+__global__ void atomicMin_int_v2(int* address, int* result) {
+  *result = atomicMin(address, address);
+}
+
+__global__ void atomicMin_int_v3(int* address, int* result) { *result = atomicMin(1234, 1234); }
+
+__global__ void atomicMin_int_v4(Dummy* address, int* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_int_v5(char* address, int* result) { *result = atomicMin(address, 1234); }
+
+__global__ void atomicMin_int_v6(short* address, int* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_int_v7(long* address, int* result) { *result = atomicMin(address, 1234); }
+
+__global__ void atomicMin_int_v8(long long* address, int* result) {
+  *result = atomicMin(address, 1234);
+}
+
+/* unsigned int atomicMin(unsigned int* address, unsigned int val) */
+__global__ void atomicMin_uint_v1(unsigned int* address, unsigned int* result) {
+  *result = atomicMin(&address, 1234);
+}
+
+__global__ void atomicMin_uint_v2(unsigned int* address, unsigned int* result) {
+  *result = atomicMin(address, address);
+}
+
+__global__ void atomicMin_uint_v3(unsigned int* address, unsigned int* result) {
+  *result = atomicMin(1234, 1234);
+}
+
+__global__ void atomicMin_uint_v4(Dummy* address, unsigned int* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_uint_v5(char* address, unsigned int* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_uint_v6(short* address, unsigned int* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_uint_v7(long* address, unsigned int* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_uint_v8(long long* address, unsigned int* result) {
+  *result = atomicMin(address, 1234);
+}
+
+/* unsigned long atomicMin(unsigned long* address, unsigned long val) */
+__global__ void atomicMin_ulong_v1(unsigned long* address, unsigned long* result) {
+  *result = atomicMin(&address, 1234);
+}
+
+__global__ void atomicMin_ulong_v2(unsigned long* address, unsigned long* result) {
+  *result = atomicMin(address, address);
+}
+
+__global__ void atomicMin_ulong_v3(unsigned long* address, unsigned long* result) {
+  *result = atomicMin(1234, 1234);
+}
+
+__global__ void atomicMin_ulong_v4(Dummy* address, unsigned long* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_ulong_v5(char* address, unsigned long* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_ulong_v6(short* address, unsigned long* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_ulong_v7(long* address, unsigned long* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_ulong_v8(long long* address, unsigned long* result) {
+  *result = atomicMin(address, 1234);
+}
+
+/* unsigned long long atomicMin(unsigned long long* address, unsigned long long val) */
+__global__ void atomicMin_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+  *result = atomicMin(&address, 1234);
+}
+
+__global__ void atomicMin_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+  *result = atomicMin(address, address);
+}
+
+__global__ void atomicMin_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+  *result = atomicMin(1234, 1234);
+}
+
+__global__ void atomicMin_ulonglong_v4(Dummy* address, unsigned long long* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_ulonglong_v5(char* address, unsigned long long* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_ulonglong_v6(short* address, unsigned long long* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_ulonglong_v7(long* address, unsigned long long* result) {
+  *result = atomicMin(address, 1234);
+}
+
+__global__ void atomicMin_ulonglong_v8(long long* address, unsigned long long* result) {
+  *result = atomicMin(address, 1234);
+}
+
+/* float atomicMin(float* address, float val) */
+__global__ void atomicMin_float_v1(float* address, float* result) {
+  *result = atomicMin(&address, 1234.f);
+}
+
+__global__ void atomicMin_float_v2(float* address, float* result) {
+  *result = atomicMin(address, address);
+}
+
+__global__ void atomicMin_float_v3(float* address, float* result) {
+  *result = atomicMin(1234.f, 1234.f);
+}
+
+__global__ void atomicMin_float_v4(Dummy* address, float* result) {
+  *result = atomicMin(address, 1234.f);
+}
+
+__global__ void atomicMin_float_v5(char* address, float* result) {
+  *result = atomicMin(address, 1234.f);
+}
+
+__global__ void atomicMin_float_v6(short* address, float* result) {
+  *result = atomicMin(address, 1234.f);
+}
+
+__global__ void atomicMin_float_v7(long* address, float* result) {
+  *result = atomicMin(address, 1234.f);
+}
+
+__global__ void atomicMin_float_v8(long long* address, float* result) {
+  *result = atomicMin(address, 1234);
+}
+
+/* double atomicMin(double* address, double val) */
+__global__ void atomicMin_double_v1(double* address, double* result) {
+  *result = atomicMin(&address, 1234.0);
+}
+
+__global__ void atomicMin_double_v2(double* address, double* result) {
+  *result = atomicMin(address, address);
+}
+
+__global__ void atomicMin_double_v3(double* address, double* result) {
+  *result = atomicMin(1234.0, 1234.0);
+}
+
+__global__ void atomicMin_double_v4(Dummy* address, double* result) {
+  *result = atomicMin(address, 1234.0);
+}
+
+__global__ void atomicMin_double_v5(char* address, double* result) {
+  *result = atomicMin(address, 1234.0);
+}
+
+__global__ void atomicMin_double_v6(short* address, double* result) {
+  *result = atomicMin(address, 1234.0);
+}
+
+__global__ void atomicMin_double_v7(long* address, double* result) {
+  *result = atomicMin(address, 1234.0);
+}
+
+__global__ void atomicMin_double_v8(long long* address, double* result) {
+  *result = atomicMin(address, 1234.0);
+}
@@ -0,0 +1,273 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+Negative kernels used for the atomics negative Test Cases that are using RTC.
+*/
+
+static constexpr auto kAtomicMin_int{  // RTC source text: atomicMin_int_v1..v8 plus the Dummy class
+    R"(
+    __global__ void atomicMin_int_v1(int* address, int* result) {
+      *result = atomicMin(&address, 1234);
+    }
+
+    __global__ void atomicMin_int_v2(int* address, int* result) {
+      *result = atomicMin(address, address);
+    }
+
+    __global__ void atomicMin_int_v3(int* address, int* result) {
+      *result = atomicMin(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicMin_int_v4(Dummy* address, int* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_int_v5(char* address, int* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_int_v6(short* address, int* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_int_v7(long* address, int* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_int_v8(long long* address, int* result) {
+      *result = atomicMin(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicMin_uint{  // RTC source text: atomicMin_uint_v1..v8 plus the Dummy class
+    R"(
+    __global__ void atomicMin_uint_v1(unsigned int* address, unsigned int* result) {
+      *result = atomicMin(&address, 1234);
+    }
+
+    __global__ void atomicMin_uint_v2(unsigned int* address, unsigned int* result) {
+      *result = atomicMin(address, address);
+    }
+
+    __global__ void atomicMin_uint_v3(unsigned int* address, unsigned int* result) {
+      *result = atomicMin(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicMin_uint_v4(Dummy* address, unsigned int* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_uint_v5(char* address, unsigned int* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_uint_v6(short* address, unsigned int* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_uint_v7(long* address, unsigned int* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_uint_v8(long long* address, unsigned int* result) {
+      *result = atomicMin(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicMin_ulong{  // RTC source text: atomicMin_ulong_v1..v8 plus the Dummy class
+    R"(
+    __global__ void atomicMin_ulong_v1(unsigned long* address, unsigned long* result) {
+      *result = atomicMin(&address, 1234);
+    }
+
+    __global__ void atomicMin_ulong_v2(unsigned long* address, unsigned long* result) {
+      *result = atomicMin(address, address);
+    }
+
+    __global__ void atomicMin_ulong_v3(unsigned long* address, unsigned long* result) {
+      *result = atomicMin(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicMin_ulong_v4(Dummy* address, unsigned long* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_ulong_v5(char* address, unsigned long* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_ulong_v6(short* address, unsigned long* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_ulong_v7(long* address, unsigned long* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_ulong_v8(long long* address, unsigned long* result) {
+      *result = atomicMin(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicMin_ulonglong{  // RTC source text: atomicMin_ulonglong_v1..v8 plus Dummy
+    R"(
+    __global__ void atomicMin_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+      *result = atomicMin(&address, 1234);
+    }
+
+    __global__ void atomicMin_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+      *result = atomicMin(address, address);
+    }
+
+    __global__ void atomicMin_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+      *result = atomicMin(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicMin_ulonglong_v4(Dummy* address, unsigned long long* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_ulonglong_v5(char* address, unsigned long long* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_ulonglong_v6(short* address, unsigned long long* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_ulonglong_v7(long* address, unsigned long long* result) {
+      *result = atomicMin(address, 1234);
+    }
+
+    __global__ void atomicMin_ulonglong_v8(long long* address, unsigned long long* result) {
+      *result = atomicMin(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicMin_float{  // RTC source text: atomicMin_float_v1..v8 plus the Dummy class
+    R"(
+    __global__ void atomicMin_float_v1(float* address, float* result) {
+      *result = atomicMin(&address, 1234.f);
+    }
+
+    __global__ void atomicMin_float_v2(float* address, float* result) {
+      *result = atomicMin(address, address);
+    }
+
+    __global__ void atomicMin_float_v3(float* address, float* result) {
+      *result = atomicMin(1234.f, 1234.f);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicMin_float_v4(Dummy* address, float* result) {
+      *result = atomicMin(address, 1234.f);
+    }
+
+    __global__ void atomicMin_float_v5(char* address, float* result) {
+      *result = atomicMin(address, 1234.f);
+    }
+
+    __global__ void atomicMin_float_v6(short* address, float* result) {
+      *result = atomicMin(address, 1234.f);
+    }
+
+    __global__ void atomicMin_float_v7(long* address, float* result) {
+      *result = atomicMin(address, 1234.f);
+    }
+
+    __global__ void atomicMin_float_v8(long long* address, float* result) {
+      *result = atomicMin(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicMin_double{  // RTC source text: atomicMin_double_v1..v8 plus the Dummy class
+    R"(
+    __global__ void atomicMin_double_v1(double* address, double* result) {
+      *result = atomicMin(&address, 1234.0);
+    }
+
+    __global__ void atomicMin_double_v2(double* address, double* result) {
+      *result = atomicMin(address, address);
+    }
+
+    __global__ void atomicMin_double_v3(double* address, double* result) {
+      *result = atomicMin(1234.0, 1234.0);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicMin_double_v4(Dummy* address, double* result) {
+      *result = atomicMin(address, 1234.0);
+    }
+
+    __global__ void atomicMin_double_v5(char* address, double* result) {
+      *result = atomicMin(address, 1234.0);
+    }
+
+    __global__ void atomicMin_double_v6(short* address, double* result) {
+      *result = atomicMin(address, 1234.0);
+    }
+
+    __global__ void atomicMin_double_v7(long* address, double* result) {
+      *result = atomicMin(address, 1234.0);
+    }
+
+    __global__ void atomicMin_double_v8(long long* address, double* result) {
+      *result = atomicMin(address, 1234.0);
+    }
+  )"};
@@ -0,0 +1,124 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "min_max_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicMin_system atomicMin_system
+ * @{
+ * @ingroup AtomicsTest
+ * `atomicMin_system(TestType* address, TestType val)` -
+ * performs system-wide atomic minimum between address and val, returns old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMin_system from multiple threads on the same address.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMin_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+#if HT_AMD  // float and double are instantiated only when HT_AMD is set
+TEMPLATE_TEST_CASE("Unit_atomicMin_system_Positive_Peer_GPUs_Same_Address", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+#else
+TEMPLATE_TEST_CASE("Unit_atomicMin_system_Positive_Peer_GPUs_Same_Address", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+#endif
+  for (auto current = 0; current < cmd_options.iterations; ++current) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << current) {
+      MinMax::MultipleDeviceMultipleKernelTest<TestType, MinMax::AtomicOperation::kMinSystem>(
+          2, 2, 1, sizeof(TestType));  // NOTE(review): presumably (devices, kernels, width, pitch) — confirm
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMin_system from multiple threads on adjacent addresses.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMin_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+#if HT_AMD  // float and double are instantiated only when HT_AMD is set
+TEMPLATE_TEST_CASE("Unit_atomicMin_system_Positive_Peer_GPUs_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long, float, double) {
+#else
+TEMPLATE_TEST_CASE("Unit_atomicMin_system_Positive_Peer_GPUs_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+#endif
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));  // queried from device 0
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {  // one section per iteration
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      MinMax::MultipleDeviceMultipleKernelTest<TestType, MinMax::AtomicOperation::kMinSystem>(
+          2, 2, warp_size, sizeof(TestType));  // NOTE(review): presumably (devices, kernels, width, pitch) — confirm
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicMin_system from multiple threads on scattered addresses.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicMin_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+#if HT_AMD  // float and double are instantiated only when HT_AMD is set
+TEMPLATE_TEST_CASE("Unit_atomicMin_system_Positive_Peer_GPUs_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long, float, double) {
+#else
+TEMPLATE_TEST_CASE("Unit_atomicMin_system_Positive_Peer_GPUs_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+#endif
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));  // queried from device 0
+  const auto cache_line_size = 128u;  // fixed literal, not queried; NOTE(review): presumably bytes — confirm
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {  // one section per iteration
+    DYNAMIC_SECTION("Scattered address " << current) {
+      MinMax::MultipleDeviceMultipleKernelTest<TestType, MinMax::AtomicOperation::kMinSystem>(
+          2, 2, warp_size, cache_line_size);  // NOTE(review): presumably (devices, kernels, width, pitch) — confirm
+    }
+  }
+}
@@ -0,0 +1,222 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "atomicOr_negative_kernels_rtc.hh"
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicOr atomicOr
+ * @{
+ * @ingroup AtomicsTest
+ * `atomicOr(TestType* address, TestType val)` -
+ * performs atomic bitwise OR between address and val, returns old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicOr from multiple threads on the same address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicOr.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicOr_Positive_SameAddress", "", int, unsigned int, unsigned long,
+                   unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kOr>(
+          1, sizeof(TestType));  // NOTE(review): presumably (width, pitch) — confirm in bitwise_common.hh
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicOr from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicOr.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicOr_Positive_Adjacent_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));  // queried from device 0
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {  // one section per iteration
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kOr>(
+          warp_size, sizeof(TestType));  // NOTE(review): presumably (width, pitch) — confirm
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicOr from multiple threads on the scattered addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicOr.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicOr_Positive_Scattered_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));  // queried from device 0
+  const auto cache_line_size = 128u;  // fixed literal, not queried; NOTE(review): presumably bytes — confirm
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {  // one section per iteration
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kOr>(
+          warp_size, cache_line_size);  // NOTE(review): presumably (width, pitch) — confirm
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicOr from multiple threads on the same address.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicOr.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicOr_Positive_Multi_Kernel_Same_Address", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  for (auto current = 0; current < cmd_options.iterations; ++current) {  // one section per iteration
+    DYNAMIC_SECTION("Same address " << current) {
+      Bitwise::SingleDeviceMultipleKernelTest<TestType, Bitwise::AtomicOperation::kOr>(
+          2, 1, sizeof(TestType));  // NOTE(review): presumably (kernels, width, pitch) — confirm
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicOr from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicOr.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicOr_Positive_Multi_Kernel_Adjacent_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));  // queried from device 0
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {  // one section per iteration
+    DYNAMIC_SECTION("Adjacent address " << current) {
+      Bitwise::SingleDeviceMultipleKernelTest<TestType, Bitwise::AtomicOperation::kOr>(
+          2, warp_size, sizeof(TestType));  // NOTE(review): presumably (kernels, width, pitch) — confirm
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicOr from multiple threads on the scattered addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicOr.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicOr_Positive_Multi_Kernel_Scattered_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));  // queried from device 0
+  const auto cache_line_size = 128u;  // fixed literal, not queried; NOTE(review): presumably bytes — confirm
+
+  for (auto current = 0; current < cmd_options.iterations; ++current) {  // one section per iteration
+    DYNAMIC_SECTION("Scattered address " << current) {
+      Bitwise::SingleDeviceMultipleKernelTest<TestType, Bitwise::AtomicOperation::kOr>(
+          2, warp_size, cache_line_size);  // NOTE(review): presumably (kernels, width, pitch) — confirm
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Compiles atomicOr with invalid parameters.
+ *  - Compiles the source with RTC.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicOr.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicOr_Negative_Parameters_RTC") {
+  hiprtcProgram program{};
+
+  const auto program_source =
+      GENERATE(kAtomicOr_int, kAtomicOr_uint, kAtomicOr_ulong, kAtomicOr_ulonglong);  // one run per source
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "atomicOr_negative.cc", 0, nullptr, nullptr));
+  hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};  // checked only after the log is read
+
+  // Get the compile log and count compiler error messages
+  size_t log_size{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
+  int error_count{0};
+  // The kernel sources (kAtomicOr_*) come from atomicOr_negative_kernels_rtc.hh
+  int expected_error_count{9};  // NOTE(review): each source defines 10 kernels (v1..v10) — confirm 9
+  std::string error_message{"error:"};
+
+  size_t n_pos = log.find(error_message, 0);
+  while (n_pos != std::string::npos) {  // count every occurrence of error_message in the log
+    ++error_count;
+    n_pos = log.find(error_message, n_pos + 1);
+  }
+
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);  // the compile itself must have failed
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,177 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+class Dummy {  // empty class; pointee type of the address parameter in the *_v4 kernels
+ public:
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+/* int atomicOr(int* address, int val) */
+__global__ void atomicOr_int_v1(int* address, int* result) { *result = atomicOr(&address, 1234); }  // &address is int**, not int*
+
+__global__ void atomicOr_int_v2(int* address, int* result) { *result = atomicOr(address, address); }  // pointer passed as the value operand
+
+__global__ void atomicOr_int_v3(int* address, int* result) { *result = atomicOr(1234, 1234); }  // literal passed as the address
+
+__global__ void atomicOr_int_v4(Dummy* address, int* result) { *result = atomicOr(address, 1234); }  // Dummy* address instead of int*
+
+__global__ void atomicOr_int_v5(char* address, int* result) { *result = atomicOr(address, 1234); }  // char* address instead of int*
+
+__global__ void atomicOr_int_v6(short* address, int* result) { *result = atomicOr(address, 1234); }  // short* address instead of int*
+
+__global__ void atomicOr_int_v7(long* address, int* result) { *result = atomicOr(address, 1234); }  // long* address instead of int*
+
+__global__ void atomicOr_int_v8(long long* address, int* result) {
+  *result = atomicOr(address, 1234);  // long long* address instead of int*
+}
+
+__global__ void atomicOr_int_v9(float* address, int* result) { *result = atomicOr(address, 1234); }  // float* address instead of int*
+
+__global__ void atomicOr_int_v10(double* address, int* result) {
+  *result = atomicOr(address, 1234);  // double* address instead of int*
+}
+
+/* unsigned int atomicOr(unsigned int* address, unsigned int val) */
+__global__ void atomicOr_uint_v1(unsigned int* address, unsigned int* result) {
+  *result = atomicOr(&address, 1234);  // &address is unsigned int**, not unsigned int*
+}
+
+__global__ void atomicOr_uint_v2(unsigned int* address, unsigned int* result) {
+  *result = atomicOr(address, address);  // pointer passed as the value operand
+}
+
+__global__ void atomicOr_uint_v3(unsigned int* address, unsigned int* result) {
+  *result = atomicOr(1234, 1234);  // literal passed as the address
+}
+
+__global__ void atomicOr_uint_v4(Dummy* address, unsigned int* result) {
+  *result = atomicOr(address, 1234);  // Dummy* address instead of unsigned int*
+}
+
+__global__ void atomicOr_uint_v5(char* address, unsigned int* result) {
+  *result = atomicOr(address, 1234);  // char* address instead of unsigned int*
+}
+
+__global__ void atomicOr_uint_v6(short* address, unsigned int* result) {
+  *result = atomicOr(address, 1234);  // short* address instead of unsigned int*
+}
+
+__global__ void atomicOr_uint_v7(long* address, unsigned int* result) {
+  *result = atomicOr(address, 1234);  // long* address instead of unsigned int*
+}
+
+__global__ void atomicOr_uint_v8(long long* address, unsigned int* result) {
+  *result = atomicOr(address, 1234);  // long long* address instead of unsigned int*
+}
+
+__global__ void atomicOr_uint_v9(float* address, unsigned int* result) {
+  *result = atomicOr(address, 1234);  // float* address instead of unsigned int*
+}
+
+__global__ void atomicOr_uint_v10(double* address, unsigned int* result) {
+  *result = atomicOr(address, 1234);  // double* address instead of unsigned int*
+}
+
+/* atomicOr(unsigned long* address, unsigned long val) */
+__global__ void atomicOr_ulong_v1(unsigned long* address, unsigned long* result) {
+  *result = atomicOr(&address, 1234);  // &address is unsigned long**, not unsigned long*
+}
+
+__global__ void atomicOr_ulong_v2(unsigned long* address, unsigned long* result) {
+  *result = atomicOr(address, address);  // pointer passed as the value operand
+}
+
+__global__ void atomicOr_ulong_v3(unsigned long* address, unsigned long* result) {
+  *result = atomicOr(1234, 1234);  // literal passed as the address
+}
+
+__global__ void atomicOr_ulong_v4(Dummy* address, unsigned long* result) {
+  *result = atomicOr(address, 1234);  // Dummy* address instead of unsigned long*
+}
+
+__global__ void atomicOr_ulong_v5(char* address, unsigned long* result) {
+  *result = atomicOr(address, 1234);  // char* address instead of unsigned long*
+}
+
+__global__ void atomicOr_ulong_v6(short* address, unsigned long* result) {
+  *result = atomicOr(address, 1234);  // short* address instead of unsigned long*
+}
+
+__global__ void atomicOr_ulong_v7(long* address, unsigned long* result) {
+  *result = atomicOr(address, 1234);  // long* address instead of unsigned long*
+}
+
+__global__ void atomicOr_ulong_v8(long long* address, unsigned long* result) {
+  *result = atomicOr(address, 1234);  // long long* address instead of unsigned long*
+}
+
+__global__ void atomicOr_ulong_v9(float* address, unsigned long* result) {
+  *result = atomicOr(address, 1234);  // float* address instead of unsigned long*
+}
+
+__global__ void atomicOr_ulong_v10(double* address, unsigned long* result) {
+  *result = atomicOr(address, 1234);  // double* address instead of unsigned long*
+}
+
+/* atomicOr(unsigned long long* address, unsigned long long val) */
+__global__ void atomicOr_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+  *result = atomicOr(&address, 1234);  // &address is unsigned long long**, not unsigned long long*
+}
+
+__global__ void atomicOr_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+  *result = atomicOr(address, address);  // pointer passed as the value operand
+}
+
+__global__ void atomicOr_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+  *result = atomicOr(1234, 1234);  // literal passed as the address
+}
+
+__global__ void atomicOr_ulonglong_v4(Dummy* address, unsigned long long* result) {
+  *result = atomicOr(address, 1234);  // Dummy* address instead of unsigned long long*
+}
+
+__global__ void atomicOr_ulonglong_v5(char* address, unsigned long long* result) {
+  *result = atomicOr(address, 1234);  // char* address instead of unsigned long long*
+}
+
+__global__ void atomicOr_ulonglong_v6(short* address, unsigned long long* result) {
+  *result = atomicOr(address, 1234);  // short* address instead of unsigned long long*
+}
+
+__global__ void atomicOr_ulonglong_v7(long* address, unsigned long long* result) {
+  *result = atomicOr(address, 1234);  // long* address instead of unsigned long long*
+}
+
+__global__ void atomicOr_ulonglong_v8(long long* address, unsigned long long* result) {
+  *result = atomicOr(address, 1234);  // long long* address instead of unsigned long long*
+}
+
+__global__ void atomicOr_ulonglong_v9(float* address, unsigned long long* result) {
+  *result = atomicOr(address, 1234);  // float* address instead of unsigned long long*
+}
+
+__global__ void atomicOr_ulonglong_v10(double* address, unsigned long long* result) {
+  *result = atomicOr(address, 1234);  // double* address instead of unsigned long long*
+}
@@ -0,0 +1,223 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+Negative kernels used for the atomics negative Test Cases that are using RTC.
+*/
+
+static constexpr auto kAtomicOr_int{  // RTC source text: atomicOr_int_v1..v10 plus the Dummy class
+    R"(
+    __global__ void atomicOr_int_v1(int* address, int* result) {
+      *result = atomicOr(&address, 1234);
+    }
+
+    __global__ void atomicOr_int_v2(int* address, int* result) {
+      *result = atomicOr(address, address);
+    }
+
+    __global__ void atomicOr_int_v3(int* address, int* result) {
+      *result = atomicOr(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicOr_int_v4(Dummy* address, int* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_int_v5(char* address, int* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_int_v6(short* address, int* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_int_v7(long* address, int* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_int_v8(long long* address, int* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_int_v9(float* address, int* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_int_v10(double* address, int* result) {
+      *result = atomicOr(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicOr_uint{  // RTC source text: atomicOr_uint_v1..v10 plus the Dummy class
+    R"(
+    __global__ void atomicOr_uint_v1(unsigned int* address, unsigned int* result) {
+      *result = atomicOr(&address, 1234);
+    }
+
+    __global__ void atomicOr_uint_v2(unsigned int* address, unsigned int* result) {
+      *result = atomicOr(address, address);
+    }
+
+    __global__ void atomicOr_uint_v3(unsigned int* address, unsigned int* result) {
+      *result = atomicOr(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicOr_uint_v4(Dummy* address, unsigned int* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_uint_v5(char* address, unsigned int* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_uint_v6(short* address, unsigned int* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_uint_v7(long* address, unsigned int* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_uint_v8(long long* address, unsigned int* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_uint_v9(float* address, unsigned int* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_uint_v10(double* address, unsigned int* result) {
+      *result = atomicOr(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicOr_ulong{  // RTC source text: atomicOr_ulong_v1..v10 plus the Dummy class
+    R"(
+    __global__ void atomicOr_ulong_v1(unsigned long* address, unsigned long* result) {
+      *result = atomicOr(&address, 1234);
+    }
+
+    __global__ void atomicOr_ulong_v2(unsigned long* address, unsigned long* result) {
+      *result = atomicOr(address, address);
+    }
+
+    __global__ void atomicOr_ulong_v3(unsigned long* address, unsigned long* result) {
+      *result = atomicOr(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicOr_ulong_v4(Dummy* address, unsigned long* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_ulong_v5(char* address, unsigned long* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_ulong_v6(short* address, unsigned long* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_ulong_v7(long* address, unsigned long* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_ulong_v8(long long* address, unsigned long* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_ulong_v9(float* address, unsigned long* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_ulong_v10(double* address, unsigned long* result) {
+      *result = atomicOr(address, 1234);
+    }
+  )"};
+
+static constexpr auto kAtomicOr_ulonglong{  // RTC source text: atomicOr_ulonglong_v1..v10 plus Dummy
+    R"(
+    __global__ void atomicOr_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+      *result = atomicOr(&address, 1234);
+    }
+
+    __global__ void atomicOr_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+      *result = atomicOr(address, address);
+    }
+
+    __global__ void atomicOr_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+      *result = atomicOr(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicOr_ulonglong_v4(Dummy* address, unsigned long long* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_ulonglong_v5(char* address, unsigned long long* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_ulonglong_v6(short* address, unsigned long long* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_ulonglong_v7(long* address, unsigned long long* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_ulonglong_v8(long long* address, unsigned long long* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_ulonglong_v9(float* address, unsigned long long* result) {
+      *result = atomicOr(address, 1234);
+    }
+
+    __global__ void atomicOr_ulonglong_v10(double* address, unsigned long long* result) {
+      *result = atomicOr(address, 1234);
+    }
+  )"};
@@ -0,0 +1,109 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicOr_system atomicOr_system
+ * @{
+ * @ingroup AtomicsTest
+ * `atomicOr_system(TestType* address, TestType val)` -
+ * performs a system-wide atomic bitwise OR of the value at address with val, returns the old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicOr_system from multiple threads on the same address.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicOr_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicOr_system_Positive_Peer_GPUs_Same_Address", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  constexpr auto kOperation = Bitwise::AtomicOperation::kOrSystem;
+
+  // One dynamically named section per requested iteration.
+  auto iteration = 0;
+  while (iteration < cmd_options.iterations) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      Bitwise::MultipleDeviceMultipleKernelTest<TestType, kOperation>(2, 2, 1, sizeof(TestType));
+    }
+    ++iteration;
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicOr_system from multiple threads on adjacent addresses.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicOr_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicOr_system_Positive_Peer_GPUs_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOperation = Bitwise::AtomicOperation::kOrSystem;
+
+  // The warp size is queried from device 0 and forwarded to the test helper.
+  int lanes_per_warp{0};
+  HIP_CHECK(hipDeviceGetAttribute(&lanes_per_warp, hipDeviceAttributeWarpSize, 0));
+
+  // One dynamically named section per requested iteration.
+  auto iteration = 0;
+  while (iteration < cmd_options.iterations) {
+    DYNAMIC_SECTION("Adjacent address " << iteration) {
+      Bitwise::MultipleDeviceMultipleKernelTest<TestType, kOperation>(2, 2, lanes_per_warp,
+                                                                      sizeof(TestType));
+    }
+    ++iteration;
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicOr_system from multiple threads on scattered addresses.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicOr_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicOr_system_Positive_Peer_GPUs_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  constexpr auto kOperation = Bitwise::AtomicOperation::kOrSystem;
+  constexpr auto kCacheLineSize = 128u;
+
+  // The warp size is queried from device 0 and forwarded to the test helper.
+  int lanes_per_warp{0};
+  HIP_CHECK(hipDeviceGetAttribute(&lanes_per_warp, hipDeviceAttributeWarpSize, 0));
+
+  // One dynamically named section per requested iteration.
+  auto iteration = 0;
+  while (iteration < cmd_options.iterations) {
+    DYNAMIC_SECTION("Scattered address " << iteration) {
+      Bitwise::MultipleDeviceMultipleKernelTest<TestType, kOperation>(2, 2, lanes_per_warp,
+                                                                      kCacheLineSize);
+    }
+    ++iteration;
+  }
+}
@@ -0,0 +1,167 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+#include "atomicSub_negative_kernels_rtc.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicSub atomicSub
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * subtraction on a target memory location. Each thread will subtract the same value from the memory
+ * location, storing the return value into a separate output array slot corresponding to it. Once
+ * complete, the output array and target memory is validated to contain all the expected values.
+ * Several memory access patterns are tested:
+ *      -# All threads subtract from a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicSub
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Shared memory
+ *      - Several grid and block dimension combinations (only one block is used for shared memory).
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicSub.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicSub_Positive", "", int, unsigned int, unsigned long,
+                   unsigned long long, float, double) {
+  constexpr auto kOperation = AtomicOperation::kSub;
+  constexpr auto kCacheLineSize = 128u;
+
+  // The warp size of device 0 is used by the adjacent and scattered patterns.
+  int lanes_per_warp{0};
+  HIP_CHECK(hipDeviceGetAttribute(&lanes_per_warp, hipDeviceAttributeWarpSize, 0));
+
+  // Every iteration registers the three access patterns as separately named sections.
+  auto iteration = 0;
+  while (iteration < cmd_options.iterations) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      SingleDeviceSingleKernelTest<TestType, kOperation>(1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      SingleDeviceSingleKernelTest<TestType, kOperation>(lanes_per_warp, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      SingleDeviceSingleKernelTest<TestType, kOperation>(lanes_per_warp, kCacheLineSize);
+    }
+
+    ++iteration;
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on a single device wherein all threads will perform
+ * an atomic subtraction on a target memory location. Each thread will subtract the same value from
+ * the memory location, storing the return value into a separate output array slot corresponding to
+ * it. Once complete, the output array and target memory is validated to contain all the expected
+ * values. Several memory access patterns are tested:
+ *      -# All threads subtract from a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicSub
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicSub.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicSub_Positive_Multi_Kernel", "", int, unsigned int, unsigned long,
+                   unsigned long long, float, double) {
+  constexpr auto kOperation = AtomicOperation::kSub;
+  constexpr auto kCacheLineSize = 128u;
+
+  // The warp size of device 0 is used by the adjacent and scattered patterns.
+  int lanes_per_warp{0};
+  HIP_CHECK(hipDeviceGetAttribute(&lanes_per_warp, hipDeviceAttributeWarpSize, 0));
+
+  // Every iteration registers the three access patterns as separately named sections.
+  auto iteration = 0;
+  while (iteration < cmd_options.iterations) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      SingleDeviceMultipleKernelTest<TestType, kOperation>(2, 1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      SingleDeviceMultipleKernelTest<TestType, kOperation>(2, lanes_per_warp, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      SingleDeviceMultipleKernelTest<TestType, kOperation>(2, lanes_per_warp, kCacheLineSize);
+    }
+
+    ++iteration;
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - RTCs kernels that pass combinations of arguments of invalid types for all overloads of
+ * atomicSub.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicSub.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicSub_Negative_Parameters_RTC") {
+  // Each kAtomicSub_* source defines eight kernels; eight "error:" markers are expected in the log.
+  const int expected_error_count{8};
+  const std::string error_marker{"error:"};
+
+  hiprtcProgram program{};
+  const auto program_source = GENERATE(kAtomicSub_int, kAtomicSub_uint, kAtomicSub_ulong,
+                                       kAtomicSub_ulonglong, kAtomicSub_float, kAtomicSub_double);
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "atomicSub_negative.cc", 0, nullptr, nullptr));
+  // The compile status is only checked after the log was read and the program destroyed.
+  hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};
+
+  // Retrieve the compile log into a buffer of the reported size.
+  size_t log_size{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string compile_log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, compile_log.data()));
+
+  // Count every occurrence of the error marker in the log.
+  int error_count{0};
+  for (size_t pos = compile_log.find(error_marker, 0); pos != std::string::npos;
+       pos = compile_log.find(error_marker, pos + 1)) {
+    ++error_count;
+  }
+
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,219 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+// Empty class with a user-provided device constructor and destructor. The *_v4 kernels below take
+// a Dummy* as their address parameter.
+class Dummy {
+ public:
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+/*
+ * int atomicSub(int* address, int val)
+ *
+ * Kernels calling atomicSub with arguments that do not match the signature above:
+ *   v1     - the address of the pointer parameter as the address
+ *   v2     - the pointer itself as the value
+ *   v3     - integer literals for both arguments
+ *   v4..v8 - an address typed as Dummy*, char*, short*, long* and long long*
+ * NOTE(review): presumably each kernel is expected to be rejected at compile time - confirm.
+ */
+__global__ void atomicSub_int_v1(int* address, int* result) { *result = atomicSub(&address, 1234); }
+
+__global__ void atomicSub_int_v2(int* address, int* result) {
+  *result = atomicSub(address, address);
+}
+
+__global__ void atomicSub_int_v3(int* address, int* result) { *result = atomicSub(1234, 1234); }
+
+__global__ void atomicSub_int_v4(Dummy* address, int* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_int_v5(char* address, int* result) { *result = atomicSub(address, 1234); }
+
+__global__ void atomicSub_int_v6(short* address, int* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_int_v7(long* address, int* result) { *result = atomicSub(address, 1234); }
+
+__global__ void atomicSub_int_v8(long long* address, int* result) {
+  *result = atomicSub(address, 1234);
+}
+
+/*
+ * unsigned int atomicSub(unsigned int* address, unsigned int val)
+ *
+ * Kernels calling atomicSub with arguments that do not match the signature above:
+ *   v1     - the address of the pointer parameter as the address
+ *   v2     - the pointer itself as the value
+ *   v3     - integer literals for both arguments
+ *   v4..v8 - an address typed as Dummy*, char*, short*, long* and long long*
+ * NOTE(review): presumably each kernel is expected to be rejected at compile time - confirm.
+ */
+__global__ void atomicSub_uint_v1(unsigned int* address, unsigned int* result) {
+  *result = atomicSub(&address, 1234);
+}
+
+__global__ void atomicSub_uint_v2(unsigned int* address, unsigned int* result) {
+  *result = atomicSub(address, address);
+}
+
+__global__ void atomicSub_uint_v3(unsigned int* address, unsigned int* result) {
+  *result = atomicSub(1234, 1234);
+}
+
+__global__ void atomicSub_uint_v4(Dummy* address, unsigned int* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_uint_v5(char* address, unsigned int* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_uint_v6(short* address, unsigned int* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_uint_v7(long* address, unsigned int* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_uint_v8(long long* address, unsigned int* result) {
+  *result = atomicSub(address, 1234);
+}
+
+/*
+ * atomicSub(unsigned long* address, unsigned long val)
+ *
+ * Kernels calling atomicSub with arguments that do not match the signature above:
+ *   v1     - the address of the pointer parameter as the address
+ *   v2     - the pointer itself as the value
+ *   v3     - integer literals for both arguments
+ *   v4..v8 - an address typed as Dummy*, char*, short*, long* and long long*
+ * NOTE(review): presumably each kernel is expected to be rejected at compile time - confirm.
+ */
+__global__ void atomicSub_ulong_v1(unsigned long* address, unsigned long* result) {
+  *result = atomicSub(&address, 1234);
+}
+
+__global__ void atomicSub_ulong_v2(unsigned long* address, unsigned long* result) {
+  *result = atomicSub(address, address);
+}
+
+__global__ void atomicSub_ulong_v3(unsigned long* address, unsigned long* result) {
+  *result = atomicSub(1234, 1234);
+}
+
+__global__ void atomicSub_ulong_v4(Dummy* address, unsigned long* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_ulong_v5(char* address, unsigned long* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_ulong_v6(short* address, unsigned long* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_ulong_v7(long* address, unsigned long* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_ulong_v8(long long* address, unsigned long* result) {
+  *result = atomicSub(address, 1234);
+}
+
+/*
+ * atomicSub(unsigned long long* address, unsigned long long val)
+ *
+ * Kernels calling atomicSub with arguments that do not match the signature above:
+ *   v1     - the address of the pointer parameter as the address
+ *   v2     - the pointer itself as the value
+ *   v3     - integer literals for both arguments
+ *   v4..v8 - an address typed as Dummy*, char*, short*, long* and long long*
+ * NOTE(review): presumably each kernel is expected to be rejected at compile time - confirm.
+ */
+__global__ void atomicSub_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+  *result = atomicSub(&address, 1234);
+}
+
+__global__ void atomicSub_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+  *result = atomicSub(address, address);
+}
+
+__global__ void atomicSub_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+  *result = atomicSub(1234, 1234);
+}
+
+__global__ void atomicSub_ulonglong_v4(Dummy* address, unsigned long long* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_ulonglong_v5(char* address, unsigned long long* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_ulonglong_v6(short* address, unsigned long long* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_ulonglong_v7(long* address, unsigned long long* result) {
+  *result = atomicSub(address, 1234);
+}
+
+__global__ void atomicSub_ulonglong_v8(long long* address, unsigned long long* result) {
+  *result = atomicSub(address, 1234);
+}
+
+/*
+ * atomicSub(float* address, float val)
+ *
+ * Kernels calling atomicSub with arguments that do not match the signature above:
+ *   v1     - the address of the pointer parameter as the address
+ *   v2     - the pointer itself as the value
+ *   v3     - float literals for both arguments
+ *   v4..v8 - an address typed as Dummy*, char*, short*, long* and long long*
+ * Every kernel that passes a literal value uses a float literal, so the address argument is the
+ * only mismatch in v4..v8.
+ * NOTE(review): presumably each kernel is expected to be rejected at compile time - confirm.
+ */
+__global__ void atomicSub_float_v1(float* address, float* result) {
+  *result = atomicSub(&address, 1234.f);
+}
+
+__global__ void atomicSub_float_v2(float* address, float* result) {
+  *result = atomicSub(address, address);
+}
+
+__global__ void atomicSub_float_v3(float* address, float* result) {
+  *result = atomicSub(1234.f, 1234.f);
+}
+
+__global__ void atomicSub_float_v4(Dummy* address, float* result) {
+  *result = atomicSub(address, 1234.f);
+}
+
+__global__ void atomicSub_float_v5(char* address, float* result) {
+  *result = atomicSub(address, 1234.f);
+}
+
+__global__ void atomicSub_float_v6(short* address, float* result) {
+  *result = atomicSub(address, 1234.f);
+}
+
+__global__ void atomicSub_float_v7(long* address, float* result) {
+  *result = atomicSub(address, 1234.f);
+}
+
+// Uses a float literal like its siblings (was the int literal 1234).
+__global__ void atomicSub_float_v8(long long* address, float* result) {
+  *result = atomicSub(address, 1234.f);
+}
+
+/*
+ * atomicSub(double* address, double val)
+ *
+ * Kernels calling atomicSub with arguments that do not match the signature above:
+ *   v1     - the address of the pointer parameter as the address
+ *   v2     - the pointer itself as the value
+ *   v3     - double literals for both arguments
+ *   v4..v8 - an address typed as Dummy*, char*, short*, long* and long long*
+ * NOTE(review): presumably each kernel is expected to be rejected at compile time - confirm.
+ */
+__global__ void atomicSub_double_v1(double* address, double* result) {
+  *result = atomicSub(&address, 1234.0);
+}
+
+__global__ void atomicSub_double_v2(double* address, double* result) {
+  *result = atomicSub(address, address);
+}
+
+__global__ void atomicSub_double_v3(double* address, double* result) {
+  *result = atomicSub(1234.0, 1234.0);
+}
+
+__global__ void atomicSub_double_v4(Dummy* address, double* result) {
+  *result = atomicSub(address, 1234.0);
+}
+
+__global__ void atomicSub_double_v5(char* address, double* result) {
+  *result = atomicSub(address, 1234.0);
+}
+
+__global__ void atomicSub_double_v6(short* address, double* result) {
+  *result = atomicSub(address, 1234.0);
+}
+
+__global__ void atomicSub_double_v7(long* address, double* result) {
+  *result = atomicSub(address, 1234.0);
+}
+
+__global__ void atomicSub_double_v8(long long* address, double* result) {
+  *result = atomicSub(address, 1234.0);
+}
@@ -0,0 +1,273 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+Negative kernels used for the atomics negative Test Cases that are using RTC.
+*/
+
+// RTC source for the int flavour of the atomicSub negative test. It defines eight kernels whose
+// atomicSub calls use: the address of the pointer parameter (v1), the pointer as the value (v2),
+// integer literals for both arguments (v3), and an address typed as Dummy*, char*, short*, long*
+// and long long* (v4..v8).
+static constexpr auto kAtomicSub_int{
+    R"(
+    __global__ void atomicSub_int_v1(int* address, int* result) {
+      *result = atomicSub(&address, 1234);
+    }
+
+    __global__ void atomicSub_int_v2(int* address, int* result) {
+      *result = atomicSub(address, address);
+    }
+
+    __global__ void atomicSub_int_v3(int* address, int* result) {
+      *result = atomicSub(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicSub_int_v4(Dummy* address, int* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_int_v5(char* address, int* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_int_v6(short* address, int* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_int_v7(long* address, int* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_int_v8(long long* address, int* result) {
+      *result = atomicSub(address, 1234);
+    }
+  )"};
+
+// RTC source for the unsigned int flavour of the atomicSub negative test. It defines eight kernels
+// whose atomicSub calls use: the address of the pointer parameter (v1), the pointer as the value
+// (v2), integer literals for both arguments (v3), and an address typed as Dummy*, char*, short*,
+// long* and long long* (v4..v8).
+static constexpr auto kAtomicSub_uint{
+    R"(
+    __global__ void atomicSub_uint_v1(unsigned int* address, unsigned int* result) {
+      *result = atomicSub(&address, 1234);
+    }
+
+    __global__ void atomicSub_uint_v2(unsigned int* address, unsigned int* result) {
+      *result = atomicSub(address, address);
+    }
+
+    __global__ void atomicSub_uint_v3(unsigned int* address, unsigned int* result) {
+      *result = atomicSub(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicSub_uint_v4(Dummy* address, unsigned int* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_uint_v5(char* address, unsigned int* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_uint_v6(short* address, unsigned int* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_uint_v7(long* address, unsigned int* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_uint_v8(long long* address, unsigned int* result) {
+      *result = atomicSub(address, 1234);
+    }
+  )"};
+
+// RTC source for the unsigned long flavour of the atomicSub negative test. It defines eight
+// kernels whose atomicSub calls use: the address of the pointer parameter (v1), the pointer as the
+// value (v2), integer literals for both arguments (v3), and an address typed as Dummy*, char*,
+// short*, long* and long long* (v4..v8).
+static constexpr auto kAtomicSub_ulong{
+    R"(
+    __global__ void atomicSub_ulong_v1(unsigned long* address, unsigned long* result) {
+      *result = atomicSub(&address, 1234);
+    }
+
+    __global__ void atomicSub_ulong_v2(unsigned long* address, unsigned long* result) {
+      *result = atomicSub(address, address);
+    }
+
+    __global__ void atomicSub_ulong_v3(unsigned long* address, unsigned long* result) {
+      *result = atomicSub(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicSub_ulong_v4(Dummy* address, unsigned long* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_ulong_v5(char* address, unsigned long* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_ulong_v6(short* address, unsigned long* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_ulong_v7(long* address, unsigned long* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_ulong_v8(long long* address, unsigned long* result) {
+      *result = atomicSub(address, 1234);
+    }
+  )"};
+
+// RTC source for the unsigned long long flavour of the atomicSub negative test. It defines eight
+// kernels whose atomicSub calls use: the address of the pointer parameter (v1), the pointer as the
+// value (v2), integer literals for both arguments (v3), and an address typed as Dummy*, char*,
+// short*, long* and long long* (v4..v8).
+static constexpr auto kAtomicSub_ulonglong{
+    R"(
+    __global__ void atomicSub_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+      *result = atomicSub(&address, 1234);
+    }
+
+    __global__ void atomicSub_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+      *result = atomicSub(address, address);
+    }
+
+    __global__ void atomicSub_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+      *result = atomicSub(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicSub_ulonglong_v4(Dummy* address, unsigned long long* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_ulonglong_v5(char* address, unsigned long long* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_ulonglong_v6(short* address, unsigned long long* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_ulonglong_v7(long* address, unsigned long long* result) {
+      *result = atomicSub(address, 1234);
+    }
+
+    __global__ void atomicSub_ulonglong_v8(long long* address, unsigned long long* result) {
+      *result = atomicSub(address, 1234);
+    }
+  )"};
+
+// RTC source for the float flavour of the atomicSub negative test. It defines eight kernels whose
+// atomicSub calls use: the address of the pointer parameter (v1), the pointer as the value (v2),
+// float literals for both arguments (v3), and an address typed as Dummy*, char*, short*, long* and
+// long long* (v4..v8). Every literal value is a float literal (v8 previously passed the int
+// literal 1234), so the address argument is the only mismatch in v4..v8.
+static constexpr auto kAtomicSub_float{
+    R"(
+    __global__ void atomicSub_float_v1(float* address, float* result) {
+      *result = atomicSub(&address, 1234.f);
+    }
+
+    __global__ void atomicSub_float_v2(float* address, float* result) {
+      *result = atomicSub(address, address);
+    }
+
+    __global__ void atomicSub_float_v3(float* address, float* result) {
+      *result = atomicSub(1234.f, 1234.f);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicSub_float_v4(Dummy* address, float* result) {
+      *result = atomicSub(address, 1234.f);
+    }
+
+    __global__ void atomicSub_float_v5(char* address, float* result) {
+      *result = atomicSub(address, 1234.f);
+    }
+
+    __global__ void atomicSub_float_v6(short* address, float* result) {
+      *result = atomicSub(address, 1234.f);
+    }
+
+    __global__ void atomicSub_float_v7(long* address, float* result) {
+      *result = atomicSub(address, 1234.f);
+    }
+
+    __global__ void atomicSub_float_v8(long long* address, float* result) {
+      *result = atomicSub(address, 1234.f);
+    }
+  )"};
+
+// RTC source for the double flavour of the atomicSub negative test. It defines eight kernels whose
+// atomicSub calls use: the address of the pointer parameter (v1), the pointer as the value (v2),
+// double literals for both arguments (v3), and an address typed as Dummy*, char*, short*, long*
+// and long long* (v4..v8).
+static constexpr auto kAtomicSub_double{
+    R"(
+    __global__ void atomicSub_double_v1(double* address, double* result) {
+      *result = atomicSub(&address, 1234.0);
+    }
+
+    __global__ void atomicSub_double_v2(double* address, double* result) {
+      *result = atomicSub(address, address);
+    }
+
+    __global__ void atomicSub_double_v3(double* address, double* result) {
+      *result = atomicSub(1234.0, 1234.0);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicSub_double_v4(Dummy* address, double* result) {
+      *result = atomicSub(address, 1234.0);
+    }
+
+    __global__ void atomicSub_double_v5(char* address, double* result) {
+      *result = atomicSub(address, 1234.0);
+    }
+
+    __global__ void atomicSub_double_v6(short* address, double* result) {
+      *result = atomicSub(address, 1234.0);
+    }
+
+    __global__ void atomicSub_double_v7(long* address, double* result) {
+      *result = atomicSub(address, 1234.0);
+    }
+
+    __global__ void atomicSub_double_v8(long long* address, double* result) {
+      *result = atomicSub(address, 1234.0);
+    }
+  )"};
@@ -0,0 +1,177 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicSub_system atomicSub_system
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on two devices wherein all threads will perform
+ * an atomic subtraction on a target memory location. Each thread will subtract the same value from
+ * the memory location, storing the return value into a separate output array slot corresponding to
+ * it. Once complete, the output array and target memory is validated to contain all the expected
+ * values. Several memory access patterns are tested:
+ *      -# All threads subtract from a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicSub_system
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicSub_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicSub_system_Positive_Peer_GPUs", "", int, unsigned int, unsigned long,
+                   unsigned long long, float, double) {
+  constexpr auto kOperation = AtomicOperation::kSubSystem;
+  constexpr auto kCacheLineSize = 128u;
+
+  // The warp size of device 0 is used by the adjacent and scattered patterns.
+  int lanes_per_warp{0};
+  HIP_CHECK(hipDeviceGetAttribute(&lanes_per_warp, hipDeviceAttributeWarpSize, 0));
+
+  // Every iteration registers the three access patterns as separately named sections.
+  auto iteration = 0;
+  while (iteration < cmd_options.iterations) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, kOperation>(2, 2, 1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, kOperation>(2, 2, lanes_per_warp,
+                                                                    sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, kOperation>(2, 2, lanes_per_warp,
+                                                                    kCacheLineSize);
+    }
+
+    ++iteration;
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel on a single device wherein all threads will perform
+ * an atomic subtraction on a target memory location. Each thread will subtract the same value from
+ * the memory location, storing the return value into a separate output array slot corresponding to
+ * it. While the kernel is running, the host performs atomic subtractions, in 4 threads, on the
+ * same memory location(s). Once complete, the output array and target memory is validated to
+ * contain all the expected values. Several memory access patterns are tested:
+ *      -# All threads subtract from a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicSub_system
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicSub_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicSub_system_Positive_Host_And_GPU", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+  constexpr auto kOperation = AtomicOperation::kSubSystem;
+  constexpr auto kCacheLineSize = 128u;
+
+  // The warp size of device 0 is used by the adjacent and scattered patterns.
+  int lanes_per_warp{0};
+  HIP_CHECK(hipDeviceGetAttribute(&lanes_per_warp, hipDeviceAttributeWarpSize, 0));
+
+  // Every iteration registers the three access patterns as separately named sections.
+  auto iteration = 0;
+  while (iteration < cmd_options.iterations) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, kOperation>(1, 1, 1, sizeof(TestType), 4);
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, kOperation>(1, 1, lanes_per_warp,
+                                                                    sizeof(TestType), 4);
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, kOperation>(1, 1, lanes_per_warp,
+                                                                    kCacheLineSize, 4);
+    }
+
+    ++iteration;
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times on two devices wherein all threads will perform
+ * an atomic subtraction on a target memory location. Each thread will subtract the same value from
+ * the memory location, storing the return value into a separate output array slot corresponding to
+ * it. While the kernel is running, the host performs atomic subtractions, in 4 threads, on the
+ * same memory location(s). Once complete, the output array and target memory is validated to
+ * contain all the expected values. Several memory access patterns are tested:
+ *      -# All threads subtract from a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of atomicSub_system
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomicSub_system.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicSub_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int,
+                   unsigned long, unsigned long long, float, double) {
+  constexpr auto kOperation = AtomicOperation::kSubSystem;
+  constexpr auto kCacheLineSize = 128u;
+
+  // The warp size of device 0 is used by the adjacent and scattered patterns.
+  int lanes_per_warp{0};
+  HIP_CHECK(hipDeviceGetAttribute(&lanes_per_warp, hipDeviceAttributeWarpSize, 0));
+
+  // Every iteration registers the three access patterns as separately named sections.
+  auto iteration = 0;
+  while (iteration < cmd_options.iterations) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, kOperation>(2, 2, 1, sizeof(TestType), 4);
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, kOperation>(2, 2, lanes_per_warp,
+                                                                    sizeof(TestType), 4);
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      MultipleDeviceMultipleKernelAndHostTest<TestType, kOperation>(2, 2, lanes_per_warp,
+                                                                    kCacheLineSize, 4);
+    }
+
+    ++iteration;
+  }
+}
@@ -0,0 +1,222 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "atomicXor_negative_kernels_rtc.hh"
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicXor atomicXor
+ * @{
+ * @ingroup AtomicsTest
+ * `atomicXor(TestType* address, TestType val)` -
+ * performs an atomic bitwise XOR between the value at address and val, returns the old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicXor from multiple threads on the same address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicXor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicXor_Positive_SameAddress", "", int, unsigned int, unsigned long,
+                   unsigned long long) {
+  // One dynamically named section per requested iteration.
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      // First argument is 1 here (warp_size in the adjacent/scattered cases): same-address setup.
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kXor>(
+          1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicXor from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicXor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicXor_Positive_Adjacent_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  // Query the warp size of device 0; it is forwarded as the first helper argument.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  // One dynamically named section per requested iteration.
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Adjacent address " << iteration) {
+      // Second argument is sizeof(TestType), matching the "adjacent" naming of this case.
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kXor>(
+          warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicXor from multiple threads on scattered addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicXor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicXor_Positive_Scattered_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  // Value passed as the second helper argument in place of sizeof(TestType).
+  const auto scattered_spacing = 128u;
+
+  // Query the warp size of device 0; it is forwarded as the first helper argument.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  // One dynamically named section per requested iteration.
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Scattered address " << iteration) {
+      Bitwise::SingleDeviceSingleKernelTest<TestType, Bitwise::AtomicOperation::kXor>(
+          warp_size, scattered_spacing);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicXor from multiple threads on the same address.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicXor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicXor_Positive_Multi_Kernel_Same_Address", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  // One dynamically named section per requested iteration.
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      // Leading 2 is the extra argument of the multi-kernel helper; the remaining two arguments
+      // match the single-kernel same-address case.
+      Bitwise::SingleDeviceMultipleKernelTest<TestType, Bitwise::AtomicOperation::kXor>(
+          2, 1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicXor from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicXor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicXor_Positive_Multi_Kernel_Adjacent_Addresses", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  // Query the warp size of device 0.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  // One dynamically named section per requested iteration.
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Adjacent address " << iteration) {
+      // NOTE(review): this passes warp_size - 1, whereas the single-kernel adjacent case passes
+      // warp_size; confirm the difference is intentional.
+      Bitwise::SingleDeviceMultipleKernelTest<TestType, Bitwise::AtomicOperation::kXor>(
+          2, warp_size - 1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicXor from multiple threads on scattered addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicXor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicXor_Positive_Multi_Kernel_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  // Value passed as the last helper argument in place of sizeof(TestType).
+  const auto scattered_spacing = 128u;
+
+  // Query the warp size of device 0.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  // One dynamically named section per requested iteration.
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Scattered address " << iteration) {
+      Bitwise::SingleDeviceMultipleKernelTest<TestType, Bitwise::AtomicOperation::kXor>(
+          2, warp_size - 1, scattered_spacing);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Compiles atomicXor with invalid parameters.
+ *  - Compiles the source with RTC.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicXor.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_atomicXor_Negative_Parameters_RTC") {
+  // Each generated value is one complete source string from atomicXor_negative_kernels_rtc.hh;
+  // the body below runs once per string.
+  const auto program_source =
+      GENERATE(kAtomicXor_int, kAtomicXor_uint, kAtomicXor_ulong, kAtomicXor_ulonglong);
+
+  hiprtcProgram program{};
+  HIPRTC_CHECK(
+      hiprtcCreateProgram(&program, program_source, "atomicXor_negative.cc", 0, nullptr, nullptr));
+
+  // The compile result is stored instead of being checked immediately; it is compared against
+  // HIPRTC_ERROR_COMPILATION only after the log has been read and the program destroyed.
+  hiprtcResult result = hiprtcCompileProgram(program, 0, nullptr);
+
+  // Fetch the compile log.
+  size_t log_size{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string log(log_size, ' ');
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
+
+  // Count the occurrences of "error:" in the log, one per compiler error message.
+  const std::string error_message{"error:"};
+  int error_count{0};
+  for (size_t n_pos = log.find(error_message); n_pos != std::string::npos;
+       n_pos = log.find(error_message, n_pos + 1)) {
+    ++error_count;
+  }
+
+  // Expected count per source string; see atomicXor_negative_kernels_rtc.hh.
+  // NOTE(review): each source string defines ten kernels while nine errors are expected -
+  // confirm which call is accepted by the compiler.
+  const int expected_error_count = 9;
+
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
+  REQUIRE(error_count == expected_error_count);
+}
@@ -0,0 +1,185 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+// Empty class type with device-side constructor and destructor. Pointers to it are used as the
+// address argument of atomicXor in the *_v4 kernels below.
+class Dummy {
+ public:
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}
+};
+
+/* int atomicXor(int* address, int val) */
+// NOTE(review): every kernel in this section passes arguments that differ from the signature
+// above; presumably each call is meant to be rejected by the compiler - confirm against the build.
+
+// v1: the address argument is int** (address of the pointer parameter).
+__global__ void atomicXor_int_v1(int* address, int* result) { *result = atomicXor(&address, 1234); }
+
+// v2: the val argument is a pointer (int*) instead of an int.
+__global__ void atomicXor_int_v2(int* address, int* result) {
+  *result = atomicXor(address, address);
+}
+
+// v3: the address argument is an integer literal instead of a pointer.
+__global__ void atomicXor_int_v3(int* address, int* result) { *result = atomicXor(1234, 1234); }
+
+// v4: the address argument points to the class type Dummy.
+__global__ void atomicXor_int_v4(Dummy* address, int* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v5: the address argument is char*.
+__global__ void atomicXor_int_v5(char* address, int* result) { *result = atomicXor(address, 1234); }
+
+// v6: the address argument is short*.
+__global__ void atomicXor_int_v6(short* address, int* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v7: the address argument is long*.
+__global__ void atomicXor_int_v7(long* address, int* result) { *result = atomicXor(address, 1234); }
+
+// v8: the address argument is long long*.
+__global__ void atomicXor_int_v8(long long* address, int* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v9: the address argument is float*.
+__global__ void atomicXor_int_v9(float* address, int* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v10: the address argument is double*.
+__global__ void atomicXor_int_v10(double* address, int* result) {
+  *result = atomicXor(address, 1234);
+}
+
+/* unsigned int atomicXor(unsigned int* address, unsigned int val) */
+// NOTE(review): every kernel in this section passes arguments that differ from the signature
+// above; presumably each call is meant to be rejected by the compiler - confirm against the build.
+
+// v1: the address argument is unsigned int** (address of the pointer parameter).
+__global__ void atomicXor_uint_v1(unsigned int* address, unsigned int* result) {
+  *result = atomicXor(&address, 1234);
+}
+
+// v2: the val argument is a pointer (unsigned int*) instead of an unsigned int.
+__global__ void atomicXor_uint_v2(unsigned int* address, unsigned int* result) {
+  *result = atomicXor(address, address);
+}
+
+// v3: the address argument is an integer literal instead of a pointer.
+__global__ void atomicXor_uint_v3(unsigned int* address, unsigned int* result) {
+  *result = atomicXor(1234, 1234);
+}
+
+// v4: the address argument points to the class type Dummy.
+__global__ void atomicXor_uint_v4(Dummy* address, unsigned int* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v5: the address argument is char*.
+__global__ void atomicXor_uint_v5(char* address, unsigned int* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v6: the address argument is short*.
+__global__ void atomicXor_uint_v6(short* address, unsigned int* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v7: the address argument is long*.
+__global__ void atomicXor_uint_v7(long* address, unsigned int* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v8: the address argument is long long*.
+__global__ void atomicXor_uint_v8(long long* address, unsigned int* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v9 of the unsigned int section: the address argument is float*.
+// Named atomicXor_uint_v9 to match the rest of this section and the RTC variant of this kernel
+// (it previously reused the int-section name atomicXor_int_v9).
+__global__ void atomicXor_uint_v9(float* address, unsigned int* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v10 of the unsigned int section: the address argument is double*.
+// Named atomicXor_uint_v10 to match the rest of this section and the RTC variant of this kernel
+// (it previously reused the int-section name atomicXor_int_v10).
+__global__ void atomicXor_uint_v10(double* address, unsigned int* result) {
+  *result = atomicXor(address, 1234);
+}
+
+/* atomicXor(unsigned long* address, unsigned long val) */
+// NOTE(review): every kernel in this section passes arguments that differ from the signature
+// above; presumably each call is meant to be rejected by the compiler - confirm against the build.
+
+// v1: the address argument is unsigned long** (address of the pointer parameter).
+__global__ void atomicXor_ulong_v1(unsigned long* address, unsigned long* result) {
+  *result = atomicXor(&address, 1234);
+}
+
+// v2: the val argument is a pointer (unsigned long*) instead of an unsigned long.
+__global__ void atomicXor_ulong_v2(unsigned long* address, unsigned long* result) {
+  *result = atomicXor(address, address);
+}
+
+// v3: the address argument is an integer literal instead of a pointer.
+__global__ void atomicXor_ulong_v3(unsigned long* address, unsigned long* result) {
+  *result = atomicXor(1234, 1234);
+}
+
+// v4: the address argument points to the class type Dummy.
+__global__ void atomicXor_ulong_v4(Dummy* address, unsigned long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v5: the address argument is char*.
+__global__ void atomicXor_ulong_v5(char* address, unsigned long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v6: the address argument is short*.
+__global__ void atomicXor_ulong_v6(short* address, unsigned long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v7: the address argument is (signed) long*.
+__global__ void atomicXor_ulong_v7(long* address, unsigned long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v8: the address argument is long long*.
+__global__ void atomicXor_ulong_v8(long long* address, unsigned long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v9 of the unsigned long section: the address argument is float*.
+// Calls atomicXor (the body previously called atomicOr, unlike every other kernel in this
+// section and unlike the RTC variant of this kernel).
+__global__ void atomicXor_ulong_v9(float* address, unsigned long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v10 of the unsigned long section: the address argument is double*.
+// Calls atomicXor (the body previously called atomicOr, unlike every other kernel in this
+// section and unlike the RTC variant of this kernel).
+__global__ void atomicXor_ulong_v10(double* address, unsigned long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+/* atomicXor(unsigned long long* address, unsigned long long val) */
+// NOTE(review): every kernel in this section passes arguments that differ from the signature
+// above; presumably each call is meant to be rejected by the compiler - confirm against the build.
+
+// v1: the address argument is unsigned long long** (address of the pointer parameter).
+__global__ void atomicXor_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+  *result = atomicXor(&address, 1234);
+}
+
+// v2: the val argument is a pointer (unsigned long long*) instead of an unsigned long long.
+__global__ void atomicXor_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+  *result = atomicXor(address, address);
+}
+
+// v3: the address argument is an integer literal instead of a pointer.
+__global__ void atomicXor_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+  *result = atomicXor(1234, 1234);
+}
+
+// v4: the address argument points to the class type Dummy.
+__global__ void atomicXor_ulonglong_v4(Dummy* address, unsigned long long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v5: the address argument is char*.
+__global__ void atomicXor_ulonglong_v5(char* address, unsigned long long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v6: the address argument is short*.
+__global__ void atomicXor_ulonglong_v6(short* address, unsigned long long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v7: the address argument is long*.
+__global__ void atomicXor_ulonglong_v7(long* address, unsigned long long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v8: the address argument is (signed) long long*.
+__global__ void atomicXor_ulonglong_v8(long long* address, unsigned long long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v9 of the unsigned long long section: the address argument is float*.
+// Named atomicXor_ulonglong_v9 and calling atomicXor, matching the rest of this section and the
+// RTC variant of this kernel (it was previously named atomicOr_ulonglong_v9 and called atomicOr).
+__global__ void atomicXor_ulonglong_v9(float* address, unsigned long long* result) {
+  *result = atomicXor(address, 1234);
+}
+
+// v10 of the unsigned long long section: the address argument is double*.
+// Named atomicXor_ulonglong_v10 and calling atomicXor, matching the rest of this section and the
+// RTC variant of this kernel (it was previously named atomicOr_ulonglong_v10 and called atomicOr).
+__global__ void atomicXor_ulonglong_v10(double* address, unsigned long long* result) {
+  *result = atomicXor(address, 1234);
+}
@@ -0,0 +1,223 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+Negative kernels used for the atomics negative Test Cases that are using RTC.
+*/
+
+// HIP source text for the runtime-compiled negative test (one of the values generated in
+// Unit_atomicXor_Negative_Parameters_RTC). It defines ten kernels, each calling atomicXor with
+// an address/val argument that is not the int*/int pair: v1 int**, v2 pointer as val,
+// v3 literal as address, v4 Dummy*, v5 char*, v6 short*, v7 long*, v8 long long*, v9 float*,
+// v10 double*.
+static constexpr auto kAtomicXor_int{
+    R"(
+    __global__ void atomicXor_int_v1(int* address, int* result) {
+      *result = atomicXor(&address, 1234);
+    }
+
+    __global__ void atomicXor_int_v2(int* address, int* result) {
+      *result = atomicXor(address, address);
+    }
+
+    __global__ void atomicXor_int_v3(int* address, int* result) {
+      *result = atomicXor(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicXor_int_v4(Dummy* address, int* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_int_v5(char* address, int* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_int_v6(short* address, int* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_int_v7(long* address, int* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_int_v8(long long* address, int* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_int_v9(float* address, int* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_int_v10(double* address, int* result) {
+      *result = atomicXor(address, 1234);
+    }
+  )"};
+
+// HIP source text for the runtime-compiled negative test (one of the values generated in
+// Unit_atomicXor_Negative_Parameters_RTC). It defines ten kernels whose result type is
+// unsigned int, each calling atomicXor with a mismatched address/val argument: v1 pointer to
+// pointer, v2 pointer as val, v3 literal as address, v4 Dummy*, v5 char*, v6 short*, v7 long*,
+// v8 long long*, v9 float*, v10 double*.
+static constexpr auto kAtomicXor_uint{
+    R"(
+    __global__ void atomicXor_uint_v1(unsigned int* address, unsigned int* result) {
+      *result = atomicXor(&address, 1234);
+    }
+
+    __global__ void atomicXor_uint_v2(unsigned int* address, unsigned int* result) {
+      *result = atomicXor(address, address);
+    }
+
+    __global__ void atomicXor_uint_v3(unsigned int* address, unsigned int* result) {
+      *result = atomicXor(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicXor_uint_v4(Dummy* address, unsigned int* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_uint_v5(char* address, unsigned int* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_uint_v6(short* address, unsigned int* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_uint_v7(long* address, unsigned int* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_uint_v8(long long* address, unsigned int* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_uint_v9(float* address, unsigned int* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_uint_v10(double* address, unsigned int* result) {
+      *result = atomicXor(address, 1234);
+    }
+  )"};
+
+// HIP source text for the runtime-compiled negative test (one of the values generated in
+// Unit_atomicXor_Negative_Parameters_RTC). It defines ten kernels whose result type is
+// unsigned long, each calling atomicXor with a mismatched address/val argument: v1 pointer to
+// pointer, v2 pointer as val, v3 literal as address, v4 Dummy*, v5 char*, v6 short*, v7 long*,
+// v8 long long*, v9 float*, v10 double*.
+static constexpr auto kAtomicXor_ulong{
+    R"(
+    __global__ void atomicXor_ulong_v1(unsigned long* address, unsigned long* result) {
+      *result = atomicXor(&address, 1234);
+    }
+
+    __global__ void atomicXor_ulong_v2(unsigned long* address, unsigned long* result) {
+      *result = atomicXor(address, address);
+    }
+
+    __global__ void atomicXor_ulong_v3(unsigned long* address, unsigned long* result) {
+      *result = atomicXor(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicXor_ulong_v4(Dummy* address, unsigned long* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_ulong_v5(char* address, unsigned long* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_ulong_v6(short* address, unsigned long* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_ulong_v7(long* address, unsigned long* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_ulong_v8(long long* address, unsigned long* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_ulong_v9(float* address, unsigned long* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_ulong_v10(double* address, unsigned long* result) {
+      *result = atomicXor(address, 1234);
+    }
+  )"};
+
+// HIP source text for the runtime-compiled negative test (one of the values generated in
+// Unit_atomicXor_Negative_Parameters_RTC). It defines ten kernels whose result type is
+// unsigned long long, each calling atomicXor with a mismatched address/val argument: v1 pointer
+// to pointer, v2 pointer as val, v3 literal as address, v4 Dummy*, v5 char*, v6 short*,
+// v7 long*, v8 long long*, v9 float*, v10 double*.
+static constexpr auto kAtomicXor_ulonglong{
+    R"(
+    __global__ void atomicXor_ulonglong_v1(unsigned long long* address, unsigned long long* result) {
+      *result = atomicXor(&address, 1234);
+    }
+
+    __global__ void atomicXor_ulonglong_v2(unsigned long long* address, unsigned long long* result) {
+      *result = atomicXor(address, address);
+    }
+
+    __global__ void atomicXor_ulonglong_v3(unsigned long long* address, unsigned long long* result) {
+      *result = atomicXor(1234, 1234);
+    }
+
+    class Dummy {
+     public:
+      __device__ Dummy() {}
+      __device__ ~Dummy() {}
+    };
+
+    __global__ void atomicXor_ulonglong_v4(Dummy* address, unsigned long long* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_ulonglong_v5(char* address, unsigned long long* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_ulonglong_v6(short* address, unsigned long long* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_ulonglong_v7(long* address, unsigned long long* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_ulonglong_v8(long long* address, unsigned long long* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_ulonglong_v9(float* address, unsigned long long* result) {
+      *result = atomicXor(address, 1234);
+    }
+
+    __global__ void atomicXor_ulonglong_v10(double* address, unsigned long long* result) {
+      *result = atomicXor(address, 1234);
+    }
+  )"};
@@ -0,0 +1,109 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "bitwise_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup atomicXor_system atomicXor_system
+ * @{
+ * @ingroup AtomicsTest
+ * `atomicXor_system(TestType* address, TestType val)` -
+ * performs a system-wide atomic bitwise XOR between the value at address and val, returns the
+ * old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicXor_system from multiple threads on the same address.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicXor_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicXor_system_Positive_Peer_GPUs_Same_Address", "", int, unsigned int,
+                   unsigned long, unsigned long long) {
+  // One dynamically named section per requested iteration.
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      // Third argument is 1 here (warp_size in the adjacent/scattered cases): same-address setup.
+      Bitwise::MultipleDeviceMultipleKernelTest<TestType, Bitwise::AtomicOperation::kXorSystem>(
+          2, 2, 1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicXor_system from multiple threads on adjacent addresses.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicXor_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicXor_system_Positive_Peer_GPUs_Adjacent_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  // Query the warp size of device 0; it is forwarded as the third helper argument.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  // One dynamically named section per requested iteration.
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Adjacent address " << iteration) {
+      // Last argument is sizeof(TestType), matching the "adjacent" naming of this case.
+      Bitwise::MultipleDeviceMultipleKernelTest<TestType, Bitwise::AtomicOperation::kXorSystem>(
+          2, 2, warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs atomicXor_system from multiple threads on scattered addresses.
+ *  - Uses multiple devices and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/atomicXor_system.cc
+ * Test requirements
+ * ------------------------
+ *  - Multi-device
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_atomicXor_system_Positive_Peer_GPUs_Scattered_Addresses", "", int,
+                   unsigned int, unsigned long, unsigned long long) {
+  // Value passed as the last helper argument in place of sizeof(TestType).
+  const auto scattered_spacing = 128u;
+
+  // Query the warp size of device 0; it is forwarded as the third helper argument.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  // One dynamically named section per requested iteration.
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Scattered address " << iteration) {
+      Bitwise::MultipleDeviceMultipleKernelTest<TestType, Bitwise::AtomicOperation::kXorSystem>(
+          2, 2, warp_size, scattered_spacing);
+    }
+  }
+}
@@ -0,0 +1,458 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <resource_guards.hh>
+
+constexpr int kMemOrder = __ATOMIC_RELAXED;  // order used when the order is not under test
+constexpr int kMemScope = __HIP_MEMORY_SCOPE_SYSTEM;  // scope used unless the scope is under test
+
+// Trivially-copyable class: copy/move operations and the destructor are all defaulted.
+class DummyTC {
+ public:
+  __device__ DummyTC() {}
+  __device__ ~DummyTC() = default;
+  __device__ DummyTC(const DummyTC&) = default;
+  __device__ DummyTC& operator=(const DummyTC&) = default;
+  __device__ DummyTC(DummyTC&&) = default;
+  __device__ DummyTC& operator=(DummyTC&&) = default;
+};
+
+class Dummy {  // counterpart of DummyTC that is not trivially copyable
+ public:
+  __device__ Dummy() {}
+  __device__ ~Dummy() {}  // user-provided destructor makes the class non-trivially-copyable
+};
+
+__global__ void StoreCompileKernel(int* x) {
+  // Valid combinations: relaxed, release and seq_cst orders at system scope
+  __hip_atomic_store(x, 1, __ATOMIC_RELAXED, kMemScope);
+  __hip_atomic_store(x, 1, __ATOMIC_RELEASE, kMemScope);
+  __hip_atomic_store(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+  // Pointer to a const type (a pointer to a non-const type is required)
+  __hip_atomic_store(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+  // Value instead of pointer to the atomic builtin
+  __hip_atomic_store(*x, 1, kMemOrder, kMemScope);
+  // Consume not allowed by C++11 for store
+  __hip_atomic_store(x, 1, __ATOMIC_CONSUME, kMemScope);
+  // Acquire not allowed by C++11 for store
+  __hip_atomic_store(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+  // Acquire-Release not allowed by C++11 for store
+  __hip_atomic_store(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+  // Memory order is out of bounds
+  __hip_atomic_store(x, 1, -1, kMemScope);
+  __hip_atomic_store(x, 1, 10, kMemScope);
+  // Memory scope is out of bounds
+  __hip_atomic_store(x, 1, kMemOrder, -1);
+  __hip_atomic_store(x, 1, kMemOrder, 10);
+
+  // Storing an object that is not trivially-copyable
+  Dummy dummy_a{};
+  Dummy dummy_b{};
+  __hip_atomic_store(&dummy_a, dummy_b, kMemOrder, kMemScope);
+
+  // Storing an object that is trivially-copyable
+  DummyTC dummytc_a{};
+  DummyTC dummytc_b{};
+  __hip_atomic_store(&dummytc_a, dummytc_b, kMemOrder, kMemScope);
+}
+
+__global__ void LoadCompileKernel(int* x, int* y) {
+  // Valid combinations: relaxed, consume, acquire and seq_cst orders at system scope
+  *y = __hip_atomic_load(x, __ATOMIC_RELAXED, kMemScope);
+  *y = __hip_atomic_load(x, __ATOMIC_CONSUME, kMemScope);
+  *y = __hip_atomic_load(x, __ATOMIC_ACQUIRE, kMemScope);
+  *y = __hip_atomic_load(x, __ATOMIC_SEQ_CST, kMemScope);
+
+  // Value instead of pointer to the atomic builtin for 1st parameter
+  *y = __hip_atomic_load(*x, kMemOrder, kMemScope);
+  // Release not allowed by C++11 for load
+  *y = __hip_atomic_load(x, __ATOMIC_RELEASE, kMemScope);
+  // Acquire-Release not allowed by C++11 for load
+  *y = __hip_atomic_load(x, __ATOMIC_ACQ_REL, kMemScope);
+  // Memory order is out of bounds
+  *y = __hip_atomic_load(x, -1, kMemScope);
+  *y = __hip_atomic_load(x, 10, kMemScope);
+  // Memory scope is out of bounds
+  *y = __hip_atomic_load(x, kMemOrder, -1);
+  *y = __hip_atomic_load(x, kMemOrder, 10);
+
+  // Loading an object that is not trivially-copyable
+  Dummy dummy_a{};
+  Dummy dummy_b{};
+  dummy_a = __hip_atomic_load(&dummy_b, kMemOrder, kMemScope);
+
+  // Loading an object that is trivially-copyable
+  DummyTC dummytc_a{};
+  DummyTC dummytc_b{};
+  dummytc_a = __hip_atomic_load(&dummytc_b, kMemOrder, kMemScope);
+}
+
+__global__ void CompareWeakCompileKernel(int* x, int* expected) {
+  bool res{false};  // receives each result; never read
+  // Valid combinations of success and failure memory orders
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_CONSUME, __ATOMIC_RELAXED,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_CONSUME, __ATOMIC_CONSUME,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_ACQUIRE, __ATOMIC_CONSUME,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_RELEASE, __ATOMIC_RELAXED,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_RELEASE, __ATOMIC_CONSUME,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_RELEASE, __ATOMIC_ACQUIRE,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_ACQ_REL, __ATOMIC_CONSUME,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_CONSUME,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE,
+                                           kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_ACQ_REL,
+                                           kMemScope);  // NOTE(review): ACQ_REL as failure order
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST,
+                                           kMemScope);
+
+  // Release not allowed on fail by C++11
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, kMemOrder, __ATOMIC_RELEASE, kMemScope);
+  // Acquire-Release not allowed on fail by C++11
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, kMemOrder, __ATOMIC_ACQ_REL, kMemScope);
+  // Failure order stronger than success order
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_RELAXED, __ATOMIC_SEQ_CST,
+                                           kMemScope);
+  // Pointer to a const type (a pointer to a non-const type is required)
+  res = __hip_atomic_compare_exchange_weak(reinterpret_cast<const int*>(x), expected, 1, kMemOrder,
+                                           kMemOrder, kMemScope);
+  // Value instead of pointer to the atomic builtin
+  res = __hip_atomic_compare_exchange_weak(*x, expected, 1, kMemOrder, kMemOrder, kMemScope);
+  // Memory order on success is out of bounds
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, -1, kMemOrder, kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, 10, kMemOrder, kMemScope);
+  // Memory order on failure is out of bounds
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, kMemOrder, -1, kMemScope);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, kMemOrder, 10, kMemScope);
+  // Memory scope is out of bounds
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, kMemOrder, kMemOrder, -1);
+  res = __hip_atomic_compare_exchange_weak(x, expected, 1, kMemOrder, kMemOrder, 10);
+
+  // User-defined class is not trivially-copyable and therefore cannot be atomically copied
+  Dummy dummy_a{};
+  Dummy dummy_b{};
+  Dummy dummy_c{};
+  res = __hip_atomic_compare_exchange_weak(&dummy_a, &dummy_b, dummy_c, kMemOrder, kMemOrder,
+                                           kMemScope);
+  // User-defined class is trivially-copyable and can be atomically copied
+  DummyTC dummytc_a{};
+  DummyTC dummytc_b{};
+  DummyTC dummytc_c{};
+  res = __hip_atomic_compare_exchange_weak(&dummytc_a, &dummytc_b, dummytc_c, kMemOrder, kMemOrder,
+                                           kMemScope);
+}
+
+__global__ void CompareStrongCompileKernel(int* x, int* expected) {
+  bool res{false};  // receives each result; never read
+  // Valid combinations of success and failure memory orders
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_CONSUME, __ATOMIC_RELAXED,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_CONSUME, __ATOMIC_CONSUME,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_ACQUIRE, __ATOMIC_CONSUME,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_RELEASE, __ATOMIC_RELAXED,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_RELEASE, __ATOMIC_CONSUME,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_RELEASE, __ATOMIC_ACQUIRE,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_ACQ_REL, __ATOMIC_CONSUME,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_CONSUME,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE,
+                                             kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_ACQ_REL,
+                                             kMemScope);  // NOTE(review): ACQ_REL as failure order
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST,
+                                             kMemScope);
+
+  // Release not allowed on fail by C++11
+  res =
+      __hip_atomic_compare_exchange_strong(x, expected, 1, kMemOrder, __ATOMIC_RELEASE, kMemScope);
+  // Acquire-Release not allowed on fail by C++11
+  res =
+      __hip_atomic_compare_exchange_strong(x, expected, 1, kMemOrder, __ATOMIC_ACQ_REL, kMemScope);
+  // Failure order stronger than success order
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_RELAXED, __ATOMIC_SEQ_CST,
+                                             kMemScope);
+  // Pointer to a const type (a pointer to a non-const type is required)
+  res = __hip_atomic_compare_exchange_strong(reinterpret_cast<const int*>(x), expected, 1,
+                                             kMemOrder, kMemOrder, kMemScope);
+  // Value instead of pointer to the atomic builtin for 1st parameter
+  res = __hip_atomic_compare_exchange_strong(*x, expected, 1, kMemOrder, kMemOrder, kMemScope);
+  // Memory order on success is out of bounds
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, -1, kMemOrder, kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, 10, kMemOrder, kMemScope);
+  // Memory order on failure is out of bounds
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, kMemOrder, -1, kMemScope);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, kMemOrder, 10, kMemScope);
+  // Memory scope is out of bounds
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, kMemOrder, kMemOrder, -1);
+  res = __hip_atomic_compare_exchange_strong(x, expected, 1, kMemOrder, kMemOrder, 10);
+
+  // User-defined class is not trivially-copyable and therefore cannot be atomically copied
+  Dummy dummy_a{};
+  Dummy dummy_b{};
+  Dummy dummy_c{};
+  res = __hip_atomic_compare_exchange_strong(&dummy_a, &dummy_b, dummy_c, kMemOrder, kMemOrder,
+                                             kMemScope);
+  // User-defined class is trivially-copyable and can be atomically copied
+  DummyTC dummytc_a{};
+  DummyTC dummytc_b{};
+  DummyTC dummytc_c{};
+  res = __hip_atomic_compare_exchange_strong(&dummytc_a, &dummytc_b, dummytc_c, kMemOrder,
+                                             kMemOrder, kMemScope);
+}
+
+__global__ void ExchangeCompileKernel(int* x) {
+  int old{};  // receives each result; never read
+  // Valid combinations: all six memory orders at system scope
+  old = __hip_atomic_exchange(x, 1, __ATOMIC_RELAXED, kMemScope);
+  old = __hip_atomic_exchange(x, 1, __ATOMIC_CONSUME, kMemScope);
+  old = __hip_atomic_exchange(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+  old = __hip_atomic_exchange(x, 1, __ATOMIC_RELEASE, kMemScope);
+  old = __hip_atomic_exchange(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+  old = __hip_atomic_exchange(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+  // Pointer to a const type (a pointer to a non-const type is required)
+  old = __hip_atomic_exchange(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+  // Value instead of pointer to the atomic builtin
+  old = __hip_atomic_exchange(*x, 1, kMemOrder, kMemScope);
+  // Memory order out of bounds
+  old = __hip_atomic_exchange(x, 1, -1, kMemScope);
+  old = __hip_atomic_exchange(x, 1, 10, kMemScope);
+  // Memory scope out of bounds
+  old = __hip_atomic_exchange(x, 1, kMemOrder, -1);
+  old = __hip_atomic_exchange(x, 1, kMemOrder, 10);
+
+  // User-defined class is not trivially-copyable and therefore cannot be atomically copied
+  Dummy dummy_a{};
+  Dummy dummy_b{};
+  dummy_b = __hip_atomic_exchange(&dummy_a, dummy_b, kMemOrder, kMemScope);
+
+  // User-defined class is trivially-copyable and can be atomically copied
+  DummyTC dummytc_a{};
+  DummyTC dummytc_b{};
+  dummytc_b = __hip_atomic_exchange(&dummytc_a, dummytc_b, kMemOrder, kMemScope);
+}
+
+__global__ void FetchAddCompileKernel(int* x) {
+  int old{};  // receives each result; never read
+  // Valid combinations: all six memory orders at system scope
+  old = __hip_atomic_fetch_add(x, 1, __ATOMIC_RELAXED, kMemScope);
+  old = __hip_atomic_fetch_add(x, 1, __ATOMIC_CONSUME, kMemScope);
+  old = __hip_atomic_fetch_add(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+  old = __hip_atomic_fetch_add(x, 1, __ATOMIC_RELEASE, kMemScope);
+  old = __hip_atomic_fetch_add(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+  old = __hip_atomic_fetch_add(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+  // Pointer to a const type (a pointer to a non-const type is required)
+  old = __hip_atomic_fetch_add(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+  // Value instead of pointer to the atomic builtin
+  old = __hip_atomic_fetch_add(*x, 1, kMemOrder, kMemScope);
+  // Memory order out of bounds
+  old = __hip_atomic_fetch_add(x, 1, -1, kMemScope);
+  old = __hip_atomic_fetch_add(x, 1, 10, kMemScope);
+  // Memory scope out of bounds
+  old = __hip_atomic_fetch_add(x, 1, kMemOrder, -1);
+  old = __hip_atomic_fetch_add(x, 1, kMemOrder, 10);
+
+  Dummy dummy{};  // user-defined class type as operand, expected to be rejected
+  old = __hip_atomic_fetch_add(&dummy, 1, kMemOrder, kMemScope);
+}
+
+__global__ void FetchAndCompileKernel(int* x) {
+  int old{};  // receives each result; never read
+  // Valid combinations: all six memory orders at system scope
+  old = __hip_atomic_fetch_and(x, 1, __ATOMIC_RELAXED, kMemScope);
+  old = __hip_atomic_fetch_and(x, 1, __ATOMIC_CONSUME, kMemScope);
+  old = __hip_atomic_fetch_and(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+  old = __hip_atomic_fetch_and(x, 1, __ATOMIC_RELEASE, kMemScope);
+  old = __hip_atomic_fetch_and(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+  old = __hip_atomic_fetch_and(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+  // Pointer to a const type (a pointer to a non-const type is required)
+  old = __hip_atomic_fetch_and(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+  // Value instead of pointer to the atomic builtin
+  old = __hip_atomic_fetch_and(*x, 1, kMemOrder, kMemScope);
+  // Memory order out of bounds
+  old = __hip_atomic_fetch_and(x, 1, -1, kMemScope);
+  old = __hip_atomic_fetch_and(x, 1, 10, kMemScope);
+  // Memory scope out of bounds
+  old = __hip_atomic_fetch_and(x, 1, kMemOrder, -1);
+  old = __hip_atomic_fetch_and(x, 1, kMemOrder, 10);
+
+  // Value must be an integer: class, float and double operands are the invalid cases
+  Dummy dummy{};
+  old = __hip_atomic_fetch_and(&dummy, 1, kMemOrder, kMemScope);
+  float float_var{1.5f};
+  old = __hip_atomic_fetch_and(&float_var, 1, kMemOrder, kMemScope);
+  double double_var{1.5};
+  old = __hip_atomic_fetch_and(&double_var, 1, kMemOrder, kMemScope);
+}
+
+__global__ void FetchOrCompileKernel(int* x) {
+  int old{};  // receives each result; never read
+  // Valid combinations: all six memory orders at system scope
+  old = __hip_atomic_fetch_or(x, 1, __ATOMIC_RELAXED, kMemScope);
+  old = __hip_atomic_fetch_or(x, 1, __ATOMIC_CONSUME, kMemScope);
+  old = __hip_atomic_fetch_or(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+  old = __hip_atomic_fetch_or(x, 1, __ATOMIC_RELEASE, kMemScope);
+  old = __hip_atomic_fetch_or(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+  old = __hip_atomic_fetch_or(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+  // Pointer to a const type (a pointer to a non-const type is required)
+  old = __hip_atomic_fetch_or(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+  // Value instead of pointer to the atomic builtin
+  old = __hip_atomic_fetch_or(*x, 1, kMemOrder, kMemScope);
+  // Memory order out of bounds
+  old = __hip_atomic_fetch_or(x, 1, -1, kMemScope);
+  old = __hip_atomic_fetch_or(x, 1, 10, kMemScope);
+  // Memory scope out of bounds
+  old = __hip_atomic_fetch_or(x, 1, kMemOrder, -1);
+  old = __hip_atomic_fetch_or(x, 1, kMemOrder, 10);
+
+  // Value must be an integer: class, float and double operands are the invalid cases
+  Dummy dummy{};
+  old = __hip_atomic_fetch_or(&dummy, 1, kMemOrder, kMemScope);
+  float float_var{1.5f};
+  old = __hip_atomic_fetch_or(&float_var, 1, kMemOrder, kMemScope);
+  double double_var{1.5};
+  old = __hip_atomic_fetch_or(&double_var, 1, kMemOrder, kMemScope);
+}
+
+__global__ void FetchXorCompileKernel(int* x) {
+  int old{};  // receives each result; never read
+  // Valid combinations: all six memory orders at system scope
+  old = __hip_atomic_fetch_xor(x, 1, __ATOMIC_RELAXED, kMemScope);
+  old = __hip_atomic_fetch_xor(x, 1, __ATOMIC_CONSUME, kMemScope);
+  old = __hip_atomic_fetch_xor(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+  old = __hip_atomic_fetch_xor(x, 1, __ATOMIC_RELEASE, kMemScope);
+  old = __hip_atomic_fetch_xor(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+  old = __hip_atomic_fetch_xor(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+  // Pointer to a const type (a pointer to a non-const type is required)
+  old = __hip_atomic_fetch_xor(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+  // Value instead of pointer to the atomic builtin
+  old = __hip_atomic_fetch_xor(*x, 1, kMemOrder, kMemScope);
+  // Memory order out of bounds
+  old = __hip_atomic_fetch_xor(x, 1, -1, kMemScope);
+  old = __hip_atomic_fetch_xor(x, 1, 10, kMemScope);
+  // Memory scope out of bounds
+  old = __hip_atomic_fetch_xor(x, 1, kMemOrder, -1);
+  old = __hip_atomic_fetch_xor(x, 1, kMemOrder, 10);
+
+  // Value must be an integer: class, float and double operands are the invalid cases
+  Dummy dummy{};
+  old = __hip_atomic_fetch_xor(&dummy, 1, kMemOrder, kMemScope);
+  float float_var{1.5f};
+  old = __hip_atomic_fetch_xor(&float_var, 1, kMemOrder, kMemScope);
+  double double_var{1.5};
+  old = __hip_atomic_fetch_xor(&double_var, 1, kMemOrder, kMemScope);
+}
+
+__global__ void FetchMaxCompileKernel(int* x) {
+  int old{};  // receives each result; never read
+  // Valid combinations: all six memory orders at system scope
+  old = __hip_atomic_fetch_max(x, 1, __ATOMIC_RELAXED, kMemScope);
+  old = __hip_atomic_fetch_max(x, 1, __ATOMIC_CONSUME, kMemScope);
+  old = __hip_atomic_fetch_max(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+  old = __hip_atomic_fetch_max(x, 1, __ATOMIC_RELEASE, kMemScope);
+  old = __hip_atomic_fetch_max(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+  old = __hip_atomic_fetch_max(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+  // Pointer to a const type (a pointer to a non-const type is required)
+  old = __hip_atomic_fetch_max(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+  // Value instead of pointer to the atomic builtin
+  old = __hip_atomic_fetch_max(*x, 1, kMemOrder, kMemScope);
+  // Memory order out of bounds
+  old = __hip_atomic_fetch_max(x, 1, -1, kMemScope);
+  old = __hip_atomic_fetch_max(x, 1, 10, kMemScope);
+  // Memory scope out of bounds
+  old = __hip_atomic_fetch_max(x, 1, kMemOrder, -1);
+  old = __hip_atomic_fetch_max(x, 1, kMemOrder, 10);
+
+  // Value must be integer or floating point type; a class type is the invalid case
+  Dummy dummy{};
+  old = __hip_atomic_fetch_max(&dummy, 1, kMemOrder, kMemScope);
+}
+
+__global__ void FetchMinCompileKernel(int* x) {
+  int old{};  // receives each result; never read
+  // Valid combinations: all six memory orders at system scope
+  old = __hip_atomic_fetch_min(x, 1, __ATOMIC_RELAXED, kMemScope);
+  old = __hip_atomic_fetch_min(x, 1, __ATOMIC_CONSUME, kMemScope);
+  old = __hip_atomic_fetch_min(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+  old = __hip_atomic_fetch_min(x, 1, __ATOMIC_RELEASE, kMemScope);
+  old = __hip_atomic_fetch_min(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+  old = __hip_atomic_fetch_min(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+  // Pointer to a const type (a pointer to a non-const type is required)
+  old = __hip_atomic_fetch_min(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+  // Value instead of pointer to the atomic builtin
+  old = __hip_atomic_fetch_min(*x, 1, kMemOrder, kMemScope);
+  // Memory order out of bounds
+  old = __hip_atomic_fetch_min(x, 1, -1, kMemScope);
+  old = __hip_atomic_fetch_min(x, 1, 10, kMemScope);
+  // Memory scope out of bounds
+  old = __hip_atomic_fetch_min(x, 1, kMemOrder, -1);
+  old = __hip_atomic_fetch_min(x, 1, kMemOrder, 10);
+
+  // Value must be integer or floating point type; a class type is the invalid case
+  Dummy dummy{};
+  old = __hip_atomic_fetch_min(&dummy, 1, kMemOrder, kMemScope);
+}
@@ -0,0 +1,97 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <resource_guards.hh>
+
+#include "atomic_builtins_kernels_rtc.hh"
+
+/**
+ * @addtogroup __hip_atomic_fetch_add __hip_atomic_fetch_add
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+void AtomicBuiltinsRTCWrapper(const char* program_source, int expected_errors_num,
+                              int expected_warnings_num) {
+  hiprtcProgram program{};  // RTC-compile program_source, then count diagnostics in its log
+  HIPRTC_CHECK(hiprtcCreateProgram(&program, program_source, "atomics_builtins_kernels.cc", 0,
+                                   nullptr, nullptr));
+
+  hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};  // checked after the log is read
+
+  size_t log_size{};
+  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
+  std::string log(log_size, ' ');  // buffer the compile log is written into
+  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
+  int error_count{0};
+  int warning_count{0};
+
+  std::string error_message{"error:"};
+  std::string warning_message{"warning:"};
+
+  size_t npos_e = log.find(error_message, 0);  // count each "error:" occurrence in the log
+  while (npos_e != std::string::npos) {
+    ++error_count;
+    npos_e = log.find(error_message, npos_e + 1);
+  }
+
+  size_t npos_w = log.find(warning_message, 0);  // same scan for "warning:"
+  while (npos_w != std::string::npos) {
+    ++warning_count;
+    npos_w = log.find(warning_message, npos_w + 1);
+  }
+
+  HIPRTC_CHECK(hiprtcDestroyProgram(&program));  // destroyed before the checks below run
+  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);  // presumably requires a failed compile
+  REQUIRE(error_count == expected_errors_num);
+  REQUIRE(warning_count == expected_warnings_num);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Compiles atomic builtins with hipRTC while passing parameters that shall cause:
+ *        -# Compiler warnings
+ *        -# Compiler errors
+ * Test source
+ * ------------------------
+ *    - unit/atomics/atomic_builtins.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_AtomicBuiltins_Negative_Parameters_RTC") {
+  AtomicBuiltinsRTCWrapper(kBuiltinStore, 5, 5);  // (source, expected errors, expected warnings)
+  AtomicBuiltinsRTCWrapper(kBuiltinLoad, 4, 4);
+  /* Begin: Should be 5 errors, 6 warnings for both. See EXSWHTEC-309 */
+  AtomicBuiltinsRTCWrapper(kBuiltinCompExWeak, 5, 2);
+  AtomicBuiltinsRTCWrapper(kBuiltinCompExStrong, 5, 2);
+  /* End. */
+  AtomicBuiltinsRTCWrapper(kBuiltinExchange, 5, 2);
+  AtomicBuiltinsRTCWrapper(kBuiltinFetchAdd, 5, 2);
+  AtomicBuiltinsRTCWrapper(kBuiltinFetchAnd, 7, 2);
+  AtomicBuiltinsRTCWrapper(kBuiltinFetchOr, 7, 2);
+  AtomicBuiltinsRTCWrapper(kBuiltinFetchXor, 7, 2);
+  AtomicBuiltinsRTCWrapper(kBuiltinFetchMax, 5, 2);
+  AtomicBuiltinsRTCWrapper(kBuiltinFetchMin, 5, 2);
+}
@@ -0,0 +1,590 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+/*
+Positive and negative kernels used for the builtin atomic Test Cases that are using RTC.
+*/
+
+/*
+HIP source text defining StoreCompileKernel. It is kept as a raw string so that it can be handed
+to the runtime compiler as text; every character between the delimiters is part of that program.
+The first three __hip_atomic_store calls use the RELAXED, RELEASE and SEQ_CST orderings. The
+remaining calls pass a const-qualified pointer, a non-pointer first argument, the CONSUME, ACQUIRE
+and ACQ_REL orderings, the order and scope values -1 and 10, and operands of the class types
+Dummy and DummyTC.
+NOTE(review): which of these calls produce an error and which a warning is decided by the
+compiler; confirm against the counts expected by the RTC test that consumes this string.
+*/
+static constexpr auto kBuiltinStore{R"(
+  constexpr int kMemOrder = __ATOMIC_RELAXED;
+  constexpr int kMemScope = __HIP_MEMORY_SCOPE_SYSTEM;
+
+  class DummyTC {
+  public:
+    __device__ DummyTC() {}
+    __device__ ~DummyTC() = default;
+    __device__ DummyTC(const DummyTC&) = default;
+    __device__ DummyTC& operator=(const DummyTC&) = default;
+    __device__ DummyTC(DummyTC&&) = default;
+    __device__ DummyTC& operator=(DummyTC&&) = default;
+  };
+
+  class Dummy {
+  public:
+    __device__ Dummy() {}
+    __device__ ~Dummy() {}
+  };
+
+  __global__ void StoreCompileKernel(int* x) {
+    __hip_atomic_store(x, 1, __ATOMIC_RELAXED, kMemScope);
+    __hip_atomic_store(x, 1, __ATOMIC_RELEASE, kMemScope);
+    __hip_atomic_store(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+    __hip_atomic_store(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+    __hip_atomic_store(*x, 1, kMemOrder, kMemScope);
+    __hip_atomic_store(x, 1, __ATOMIC_CONSUME, kMemScope);
+    __hip_atomic_store(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+    __hip_atomic_store(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+    __hip_atomic_store(x, 1, -1, kMemScope);
+    __hip_atomic_store(x, 1, 10, kMemScope);
+    __hip_atomic_store(x, 1, kMemOrder, -1);
+    __hip_atomic_store(x, 1, kMemOrder, 10);
+
+    Dummy dummy_a{};
+    Dummy dummy_b{};
+    __hip_atomic_store(&dummy_a, dummy_b, kMemOrder, kMemScope);
+
+    DummyTC dummytc_a{};
+    DummyTC dummytc_b{};
+    __hip_atomic_store(&dummytc_a, dummytc_b, kMemOrder, kMemScope);
+  }
+)"};
+
+/*
+HIP source text defining LoadCompileKernel. It is kept as a raw string so that it can be handed
+to the runtime compiler as text; every character between the delimiters is part of that program.
+The first four __hip_atomic_load calls use the RELAXED, CONSUME, ACQUIRE and SEQ_CST orderings.
+The remaining calls pass a non-pointer first argument, the RELEASE and ACQ_REL orderings, the
+order and scope values -1 and 10, and pointers to the class types Dummy and DummyTC.
+NOTE(review): which of these calls produce an error and which a warning is decided by the
+compiler; confirm against the counts expected by the RTC test that consumes this string.
+*/
+static constexpr auto kBuiltinLoad{R"(
+  constexpr int kMemOrder = __ATOMIC_RELAXED;
+  constexpr int kMemScope = __HIP_MEMORY_SCOPE_SYSTEM;
+
+  class DummyTC {
+  public:
+    __device__ DummyTC() {}
+    __device__ ~DummyTC() = default;
+    __device__ DummyTC(const DummyTC&) = default;
+    __device__ DummyTC& operator=(const DummyTC&) = default;
+    __device__ DummyTC(DummyTC&&) = default;
+    __device__ DummyTC& operator=(DummyTC&&) = default;
+  };
+
+  class Dummy {
+  public:
+    __device__ Dummy() {}
+    __device__ ~Dummy() {}
+  };
+
+  __global__ void LoadCompileKernel(int* x, int* y) {
+    *y = __hip_atomic_load(x, __ATOMIC_RELAXED, kMemScope);
+    *y = __hip_atomic_load(x, __ATOMIC_CONSUME, kMemScope);
+    *y = __hip_atomic_load(x, __ATOMIC_ACQUIRE, kMemScope);
+    *y = __hip_atomic_load(x, __ATOMIC_SEQ_CST, kMemScope);
+
+    *y = __hip_atomic_load(*x, kMemOrder, kMemScope);
+    *y = __hip_atomic_load(x, __ATOMIC_RELEASE, kMemScope);
+    *y = __hip_atomic_load(x, __ATOMIC_ACQ_REL, kMemScope);
+    *y = __hip_atomic_load(x, -1, kMemScope);
+    *y = __hip_atomic_load(x, 10, kMemScope);
+    *y = __hip_atomic_load(x, kMemOrder, -1);
+    *y = __hip_atomic_load(x, kMemOrder, 10);
+
+    Dummy dummy_a{};
+    Dummy dummy_b{};
+    dummy_a = __hip_atomic_load(&dummy_b, kMemOrder, kMemScope);
+
+    DummyTC dummytc_a{};
+    DummyTC dummytc_b{};
+    dummytc_a = __hip_atomic_load(&dummytc_b, kMemOrder, kMemScope);
+  }
+)"};
+
+/*
+HIP source text defining CompareWeakCompileKernel. It is kept as a raw string so that it can be
+handed to the runtime compiler as text; every character between the delimiters is part of that
+program.
+The first group of __hip_atomic_compare_exchange_weak calls walks through success/failure
+ordering pairs, from RELAXED/RELAXED up to SEQ_CST/SEQ_CST. The second group passes RELEASE and
+ACQ_REL as the failure ordering, a SEQ_CST failure ordering with a RELAXED success ordering, a
+const-qualified pointer, a non-pointer first argument, the values -1 and 10 for the success
+ordering, the failure ordering and the scope, and operands of the class types Dummy and DummyTC.
+NOTE(review): which of these calls produce an error and which a warning is decided by the
+compiler; confirm against the counts expected by the RTC test that consumes this string.
+*/
+static constexpr auto kBuiltinCompExWeak{R"(
+  constexpr int kMemOrder = __ATOMIC_RELAXED;
+  constexpr int kMemScope = __HIP_MEMORY_SCOPE_SYSTEM;
+
+  class DummyTC {
+  public:
+    __device__ DummyTC() {}
+    __device__ ~DummyTC() = default;
+    __device__ DummyTC(const DummyTC&) = default;
+    __device__ DummyTC& operator=(const DummyTC&) = default;
+    __device__ DummyTC(DummyTC&&) = default;
+    __device__ DummyTC& operator=(DummyTC&&) = default;
+  };
+
+  class Dummy {
+  public:
+    __device__ Dummy() {}
+    __device__ ~Dummy() {}
+  };
+
+  __global__ void CompareWeakCompileKernel(int* x, int* expected) {
+    bool res{false};
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_CONSUME, __ATOMIC_RELAXED,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_CONSUME, __ATOMIC_CONSUME,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_ACQUIRE, __ATOMIC_CONSUME,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_RELEASE, __ATOMIC_RELAXED,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_RELEASE, __ATOMIC_CONSUME,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_RELEASE, __ATOMIC_ACQUIRE,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_ACQ_REL, __ATOMIC_CONSUME,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_CONSUME,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_ACQ_REL,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST,
+                                            kMemScope);
+
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, kMemOrder, __ATOMIC_RELEASE, kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, kMemOrder, __ATOMIC_ACQ_REL, kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, __ATOMIC_RELAXED, __ATOMIC_SEQ_CST,
+                                            kMemScope);
+    res = __hip_atomic_compare_exchange_weak(reinterpret_cast<const int*>(x), expected, 1, kMemOrder,
+                                            kMemOrder, kMemScope);
+    res = __hip_atomic_compare_exchange_weak(*x, expected, 1, kMemOrder, kMemOrder, kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, -1, kMemOrder, kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, 10, kMemOrder, kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, kMemOrder, -1, kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, kMemOrder, 10, kMemScope);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, kMemOrder, kMemOrder, -1);
+    res = __hip_atomic_compare_exchange_weak(x, expected, 1, kMemOrder, kMemOrder, 10);
+
+    Dummy dummy_a{};
+    Dummy dummy_b{};
+    Dummy dummy_c{};
+    res = __hip_atomic_compare_exchange_weak(&dummy_a, &dummy_b, dummy_c, kMemOrder, kMemOrder,
+                                            kMemScope);
+    DummyTC dummytc_a{};
+    DummyTC dummytc_b{};
+    DummyTC dummytc_c{};
+    res = __hip_atomic_compare_exchange_weak(&dummytc_a, &dummytc_b, dummytc_c, kMemOrder, kMemOrder,
+                                            kMemScope);
+  }
+)"};
+
+/*
+HIP source text defining CompareStrongCompileKernel. It is kept as a raw string so that it can be
+handed to the runtime compiler as text; every character between the delimiters is part of that
+program.
+The first group of __hip_atomic_compare_exchange_strong calls walks through success/failure
+ordering pairs, from RELAXED/RELAXED up to SEQ_CST/SEQ_CST. The second group passes RELEASE and
+ACQ_REL as the failure ordering, a SEQ_CST failure ordering with a RELAXED success ordering, a
+const-qualified pointer, a non-pointer first argument, the values -1 and 10 for the success
+ordering, the failure ordering and the scope, and operands of the class types Dummy and DummyTC.
+NOTE(review): which of these calls produce an error and which a warning is decided by the
+compiler; confirm against the counts expected by the RTC test that consumes this string.
+*/
+static constexpr auto kBuiltinCompExStrong{R"(
+  constexpr int kMemOrder = __ATOMIC_RELAXED;
+  constexpr int kMemScope = __HIP_MEMORY_SCOPE_SYSTEM;
+
+  class DummyTC {
+  public:
+    __device__ DummyTC() {}
+    __device__ ~DummyTC() = default;
+    __device__ DummyTC(const DummyTC&) = default;
+    __device__ DummyTC& operator=(const DummyTC&) = default;
+    __device__ DummyTC(DummyTC&&) = default;
+    __device__ DummyTC& operator=(DummyTC&&) = default;
+  };
+
+  class Dummy {
+  public:
+    __device__ Dummy() {}
+    __device__ ~Dummy() {}
+  };
+
+  __global__ void CompareStrongCompileKernel(int* x, int* expected) {
+    bool res{false};
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_CONSUME, __ATOMIC_RELAXED,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_CONSUME, __ATOMIC_CONSUME,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_ACQUIRE, __ATOMIC_CONSUME,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_RELEASE, __ATOMIC_RELAXED,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_RELEASE, __ATOMIC_CONSUME,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_RELEASE, __ATOMIC_ACQUIRE,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_ACQ_REL, __ATOMIC_CONSUME,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_CONSUME,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_ACQ_REL,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST,
+                                              kMemScope);
+
+    res =
+        __hip_atomic_compare_exchange_strong(x, expected, 1, kMemOrder, __ATOMIC_RELEASE, kMemScope);
+    res =
+        __hip_atomic_compare_exchange_strong(x, expected, 1, kMemOrder, __ATOMIC_ACQ_REL, kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, __ATOMIC_RELAXED, __ATOMIC_SEQ_CST,
+                                              kMemScope);
+    res = __hip_atomic_compare_exchange_strong(reinterpret_cast<const int*>(x), expected, 1,
+                                              kMemOrder, kMemOrder, kMemScope);
+    res = __hip_atomic_compare_exchange_strong(*x, expected, 1, kMemOrder, kMemOrder, kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, -1, kMemOrder, kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, 10, kMemOrder, kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, kMemOrder, -1, kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, kMemOrder, 10, kMemScope);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, kMemOrder, kMemOrder, -1);
+    res = __hip_atomic_compare_exchange_strong(x, expected, 1, kMemOrder, kMemOrder, 10);
+
+    Dummy dummy_a{};
+    Dummy dummy_b{};
+    Dummy dummy_c{};
+    res = __hip_atomic_compare_exchange_strong(&dummy_a, &dummy_b, dummy_c, kMemOrder, kMemOrder,
+                                              kMemScope);
+    DummyTC dummytc_a{};
+    DummyTC dummytc_b{};
+    DummyTC dummytc_c{};
+    res = __hip_atomic_compare_exchange_strong(&dummytc_a, &dummytc_b, dummytc_c, kMemOrder,
+                                              kMemOrder, kMemScope);
+  }
+)"};
+
+/*
+HIP source text defining ExchangeCompileKernel. It is kept as a raw string so that it can be
+handed to the runtime compiler as text; every character between the delimiters is part of that
+program.
+The first six __hip_atomic_exchange calls use the RELAXED, CONSUME, ACQUIRE, RELEASE, ACQ_REL and
+SEQ_CST orderings. The remaining calls pass a const-qualified pointer, a non-pointer first
+argument, the order and scope values -1 and 10, and operands of the class types Dummy and
+DummyTC.
+NOTE(review): which of these calls produce an error and which a warning is decided by the
+compiler; confirm against the counts expected by the RTC test that consumes this string.
+*/
+static constexpr auto kBuiltinExchange{R"(
+  constexpr int kMemOrder = __ATOMIC_RELAXED;
+  constexpr int kMemScope = __HIP_MEMORY_SCOPE_SYSTEM;
+
+  class DummyTC {
+  public:
+    __device__ DummyTC() {}
+    __device__ ~DummyTC() = default;
+    __device__ DummyTC(const DummyTC&) = default;
+    __device__ DummyTC& operator=(const DummyTC&) = default;
+    __device__ DummyTC(DummyTC&&) = default;
+    __device__ DummyTC& operator=(DummyTC&&) = default;
+  };
+
+  class Dummy {
+  public:
+    __device__ Dummy() {}
+    __device__ ~Dummy() {}
+  };
+
+  __global__ void ExchangeCompileKernel(int* x) {
+    int old{};
+    old = __hip_atomic_exchange(x, 1, __ATOMIC_RELAXED, kMemScope);
+    old = __hip_atomic_exchange(x, 1, __ATOMIC_CONSUME, kMemScope);
+    old = __hip_atomic_exchange(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+    old = __hip_atomic_exchange(x, 1, __ATOMIC_RELEASE, kMemScope);
+    old = __hip_atomic_exchange(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+    old = __hip_atomic_exchange(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+    old = __hip_atomic_exchange(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+    old = __hip_atomic_exchange(*x, 1, kMemOrder, kMemScope);
+    old = __hip_atomic_exchange(x, 1, -1, kMemScope);
+    old = __hip_atomic_exchange(x, 1, 10, kMemScope);
+    old = __hip_atomic_exchange(x, 1, kMemOrder, -1);
+    old = __hip_atomic_exchange(x, 1, kMemOrder, 10);
+
+    Dummy dummy_a{};
+    Dummy dummy_b{};
+    dummy_b = __hip_atomic_exchange(&dummy_a, dummy_b, kMemOrder, kMemScope);
+
+    DummyTC dummytc_a{};
+    DummyTC dummytc_b{};
+    dummytc_b = __hip_atomic_exchange(&dummytc_a, dummytc_b, kMemOrder, kMemScope);
+  }
+)"};
+
+/*
+HIP source text defining FetchAddCompileKernel. It is kept as a raw string so that it can be
+handed to the runtime compiler as text; every character between the delimiters is part of that
+program.
+The first six __hip_atomic_fetch_add calls use the RELAXED, CONSUME, ACQUIRE, RELEASE, ACQ_REL
+and SEQ_CST orderings. The remaining calls pass a const-qualified pointer, a non-pointer first
+argument, the order and scope values -1 and 10, and a pointer to the class type Dummy. DummyTC is
+declared in the text but not used by the kernel.
+NOTE(review): which of these calls produce an error and which a warning is decided by the
+compiler; confirm against the counts expected by the RTC test that consumes this string.
+*/
+static constexpr auto kBuiltinFetchAdd{R"(
+  constexpr int kMemOrder = __ATOMIC_RELAXED;
+  constexpr int kMemScope = __HIP_MEMORY_SCOPE_SYSTEM;
+
+  class DummyTC {
+  public:
+    __device__ DummyTC() {}
+    __device__ ~DummyTC() = default;
+    __device__ DummyTC(const DummyTC&) = default;
+    __device__ DummyTC& operator=(const DummyTC&) = default;
+    __device__ DummyTC(DummyTC&&) = default;
+    __device__ DummyTC& operator=(DummyTC&&) = default;
+  };
+
+  class Dummy {
+  public:
+    __device__ Dummy() {}
+    __device__ ~Dummy() {}
+  };
+
+  __global__ void FetchAddCompileKernel(int* x) {
+    int old{};
+    old = __hip_atomic_fetch_add(x, 1, __ATOMIC_RELAXED, kMemScope);
+    old = __hip_atomic_fetch_add(x, 1, __ATOMIC_CONSUME, kMemScope);
+    old = __hip_atomic_fetch_add(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+    old = __hip_atomic_fetch_add(x, 1, __ATOMIC_RELEASE, kMemScope);
+    old = __hip_atomic_fetch_add(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+    old = __hip_atomic_fetch_add(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+    old = __hip_atomic_fetch_add(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+    old = __hip_atomic_fetch_add(*x, 1, kMemOrder, kMemScope);
+    old = __hip_atomic_fetch_add(x, 1, -1, kMemScope);
+    old = __hip_atomic_fetch_add(x, 1, 10, kMemScope);
+    old = __hip_atomic_fetch_add(x, 1, kMemOrder, -1);
+    old = __hip_atomic_fetch_add(x, 1, kMemOrder, 10);
+
+    Dummy dummy{};
+    old = __hip_atomic_fetch_add(&dummy, 1, kMemOrder, kMemScope);
+  }
+)"};
+
+/*
+HIP source text defining FetchAndCompileKernel. It is kept as a raw string so that it can be
+handed to the runtime compiler as text; every character between the delimiters is part of that
+program.
+The first six __hip_atomic_fetch_and calls use the RELAXED, CONSUME, ACQUIRE, RELEASE, ACQ_REL
+and SEQ_CST orderings. The remaining calls pass a const-qualified pointer, a non-pointer first
+argument, the order and scope values -1 and 10, and pointers to a Dummy, a float and a double.
+DummyTC is declared in the text but not used by the kernel.
+NOTE(review): which of these calls produce an error and which a warning is decided by the
+compiler; confirm against the counts expected by the RTC test that consumes this string.
+*/
+static constexpr auto kBuiltinFetchAnd{R"(
+  constexpr int kMemOrder = __ATOMIC_RELAXED;
+  constexpr int kMemScope = __HIP_MEMORY_SCOPE_SYSTEM;
+
+  class DummyTC {
+  public:
+    __device__ DummyTC() {}
+    __device__ ~DummyTC() = default;
+    __device__ DummyTC(const DummyTC&) = default;
+    __device__ DummyTC& operator=(const DummyTC&) = default;
+    __device__ DummyTC(DummyTC&&) = default;
+    __device__ DummyTC& operator=(DummyTC&&) = default;
+  };
+
+  class Dummy {
+  public:
+    __device__ Dummy() {}
+    __device__ ~Dummy() {}
+  };
+
+  __global__ void FetchAndCompileKernel(int* x) {
+    int old{};
+    old = __hip_atomic_fetch_and(x, 1, __ATOMIC_RELAXED, kMemScope);
+    old = __hip_atomic_fetch_and(x, 1, __ATOMIC_CONSUME, kMemScope);
+    old = __hip_atomic_fetch_and(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+    old = __hip_atomic_fetch_and(x, 1, __ATOMIC_RELEASE, kMemScope);
+    old = __hip_atomic_fetch_and(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+    old = __hip_atomic_fetch_and(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+    old = __hip_atomic_fetch_and(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+    old = __hip_atomic_fetch_and(*x, 1, kMemOrder, kMemScope);
+    old = __hip_atomic_fetch_and(x, 1, -1, kMemScope);
+    old = __hip_atomic_fetch_and(x, 1, 10, kMemScope);
+    old = __hip_atomic_fetch_and(x, 1, kMemOrder, -1);
+    old = __hip_atomic_fetch_and(x, 1, kMemOrder, 10);
+
+    Dummy dummy{};
+    old = __hip_atomic_fetch_and(&dummy, 1, kMemOrder, kMemScope);
+    float float_var{1.5f};
+    old = __hip_atomic_fetch_and(&float_var, 1, kMemOrder, kMemScope);
+    double double_var{1.5};
+    old = __hip_atomic_fetch_and(&double_var, 1, kMemOrder, kMemScope);
+  }
+)"};
+
+/*
+HIP source text defining FetchOrCompileKernel. It is kept as a raw string so that it can be
+handed to the runtime compiler as text; every character between the delimiters is part of that
+program.
+The first six __hip_atomic_fetch_or calls use the RELAXED, CONSUME, ACQUIRE, RELEASE, ACQ_REL and
+SEQ_CST orderings. The remaining calls pass a const-qualified pointer, a non-pointer first
+argument, the order and scope values -1 and 10, and pointers to a Dummy, a float and a double.
+DummyTC is declared in the text but not used by the kernel.
+NOTE(review): which of these calls produce an error and which a warning is decided by the
+compiler; confirm against the counts expected by the RTC test that consumes this string.
+*/
+static constexpr auto kBuiltinFetchOr{R"(
+  constexpr int kMemOrder = __ATOMIC_RELAXED;
+  constexpr int kMemScope = __HIP_MEMORY_SCOPE_SYSTEM;
+
+  class DummyTC {
+  public:
+    __device__ DummyTC() {}
+    __device__ ~DummyTC() = default;
+    __device__ DummyTC(const DummyTC&) = default;
+    __device__ DummyTC& operator=(const DummyTC&) = default;
+    __device__ DummyTC(DummyTC&&) = default;
+    __device__ DummyTC& operator=(DummyTC&&) = default;
+  };
+
+  class Dummy {
+  public:
+    __device__ Dummy() {}
+    __device__ ~Dummy() {}
+  };
+
+  __global__ void FetchOrCompileKernel(int* x) {
+    int old{};
+    old = __hip_atomic_fetch_or(x, 1, __ATOMIC_RELAXED, kMemScope);
+    old = __hip_atomic_fetch_or(x, 1, __ATOMIC_CONSUME, kMemScope);
+    old = __hip_atomic_fetch_or(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+    old = __hip_atomic_fetch_or(x, 1, __ATOMIC_RELEASE, kMemScope);
+    old = __hip_atomic_fetch_or(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+    old = __hip_atomic_fetch_or(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+    old = __hip_atomic_fetch_or(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+    old = __hip_atomic_fetch_or(*x, 1, kMemOrder, kMemScope);
+    old = __hip_atomic_fetch_or(x, 1, -1, kMemScope);
+    old = __hip_atomic_fetch_or(x, 1, 10, kMemScope);
+    old = __hip_atomic_fetch_or(x, 1, kMemOrder, -1);
+    old = __hip_atomic_fetch_or(x, 1, kMemOrder, 10);
+
+    Dummy dummy{};
+    old = __hip_atomic_fetch_or(&dummy, 1, kMemOrder, kMemScope);
+    float float_var{1.5f};
+    old = __hip_atomic_fetch_or(&float_var, 1, kMemOrder, kMemScope);
+    double double_var{1.5};
+    old = __hip_atomic_fetch_or(&double_var, 1, kMemOrder, kMemScope);
+  }
+)"};
+
+/*
+HIP source text defining FetchXorCompileKernel. It is kept as a raw string so that it can be
+handed to the runtime compiler as text; every character between the delimiters is part of that
+program.
+The first six __hip_atomic_fetch_xor calls use the RELAXED, CONSUME, ACQUIRE, RELEASE, ACQ_REL
+and SEQ_CST orderings. The remaining calls pass a const-qualified pointer, a non-pointer first
+argument, the order and scope values -1 and 10, and pointers to a Dummy, a float and a double.
+DummyTC is declared in the text but not used by the kernel.
+NOTE(review): which of these calls produce an error and which a warning is decided by the
+compiler; confirm against the counts expected by the RTC test that consumes this string.
+*/
+static constexpr auto kBuiltinFetchXor{R"(
+  constexpr int kMemOrder = __ATOMIC_RELAXED;
+  constexpr int kMemScope = __HIP_MEMORY_SCOPE_SYSTEM;
+
+  class DummyTC {
+  public:
+    __device__ DummyTC() {}
+    __device__ ~DummyTC() = default;
+    __device__ DummyTC(const DummyTC&) = default;
+    __device__ DummyTC& operator=(const DummyTC&) = default;
+    __device__ DummyTC(DummyTC&&) = default;
+    __device__ DummyTC& operator=(DummyTC&&) = default;
+  };
+
+  class Dummy {
+  public:
+    __device__ Dummy() {}
+    __device__ ~Dummy() {}
+  };
+
+  __global__ void FetchXorCompileKernel(int* x) {
+    int old{};
+    old = __hip_atomic_fetch_xor(x, 1, __ATOMIC_RELAXED, kMemScope);
+    old = __hip_atomic_fetch_xor(x, 1, __ATOMIC_CONSUME, kMemScope);
+    old = __hip_atomic_fetch_xor(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+    old = __hip_atomic_fetch_xor(x, 1, __ATOMIC_RELEASE, kMemScope);
+    old = __hip_atomic_fetch_xor(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+    old = __hip_atomic_fetch_xor(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+    old = __hip_atomic_fetch_xor(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+    old = __hip_atomic_fetch_xor(*x, 1, kMemOrder, kMemScope);
+    old = __hip_atomic_fetch_xor(x, 1, -1, kMemScope);
+    old = __hip_atomic_fetch_xor(x, 1, 10, kMemScope);
+    old = __hip_atomic_fetch_xor(x, 1, kMemOrder, -1);
+    old = __hip_atomic_fetch_xor(x, 1, kMemOrder, 10);
+
+    Dummy dummy{};
+    old = __hip_atomic_fetch_xor(&dummy, 1, kMemOrder, kMemScope);
+    float float_var{1.5f};
+    old = __hip_atomic_fetch_xor(&float_var, 1, kMemOrder, kMemScope);
+    double double_var{1.5};
+    old = __hip_atomic_fetch_xor(&double_var, 1, kMemOrder, kMemScope);
+  }
+)"};
+
+/*
+HIP source text defining FetchMaxCompileKernel. It is kept as a raw string so that it can be
+handed to the runtime compiler as text; every character between the delimiters is part of that
+program.
+The first six __hip_atomic_fetch_max calls use the RELAXED, CONSUME, ACQUIRE, RELEASE, ACQ_REL
+and SEQ_CST orderings. The remaining calls pass a const-qualified pointer, a non-pointer first
+argument, the order and scope values -1 and 10, and a pointer to the class type Dummy. DummyTC is
+declared in the text but not used by the kernel.
+NOTE(review): which of these calls produce an error and which a warning is decided by the
+compiler; confirm against the counts expected by the RTC test that consumes this string.
+*/
+static constexpr auto kBuiltinFetchMax{R"(
+  constexpr int kMemOrder = __ATOMIC_RELAXED;
+  constexpr int kMemScope = __HIP_MEMORY_SCOPE_SYSTEM;
+
+  class DummyTC {
+  public:
+    __device__ DummyTC() {}
+    __device__ ~DummyTC() = default;
+    __device__ DummyTC(const DummyTC&) = default;
+    __device__ DummyTC& operator=(const DummyTC&) = default;
+    __device__ DummyTC(DummyTC&&) = default;
+    __device__ DummyTC& operator=(DummyTC&&) = default;
+  };
+
+  class Dummy {
+  public:
+    __device__ Dummy() {}
+    __device__ ~Dummy() {}
+  };
+
+  __global__ void FetchMaxCompileKernel(int* x) {
+    int old{};
+    old = __hip_atomic_fetch_max(x, 1, __ATOMIC_RELAXED, kMemScope);
+    old = __hip_atomic_fetch_max(x, 1, __ATOMIC_CONSUME, kMemScope);
+    old = __hip_atomic_fetch_max(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+    old = __hip_atomic_fetch_max(x, 1, __ATOMIC_RELEASE, kMemScope);
+    old = __hip_atomic_fetch_max(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+    old = __hip_atomic_fetch_max(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+    old = __hip_atomic_fetch_max(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+    old = __hip_atomic_fetch_max(*x, 1, kMemOrder, kMemScope);
+    old = __hip_atomic_fetch_max(x, 1, -1, kMemScope);
+    old = __hip_atomic_fetch_max(x, 1, 10, kMemScope);
+    old = __hip_atomic_fetch_max(x, 1, kMemOrder, -1);
+    old = __hip_atomic_fetch_max(x, 1, kMemOrder, 10);
+
+    Dummy dummy{};
+    old = __hip_atomic_fetch_max(&dummy, 1, kMemOrder, kMemScope);
+  }
+)"};
+
+/*
+HIP source text defining FetchMinCompileKernel. It is kept as a raw string so that it can be
+handed to the runtime compiler as text; every character between the delimiters is part of that
+program.
+The first six __hip_atomic_fetch_min calls use the RELAXED, CONSUME, ACQUIRE, RELEASE, ACQ_REL
+and SEQ_CST orderings. The remaining calls pass a const-qualified pointer, a non-pointer first
+argument, the order and scope values -1 and 10, and a pointer to the class type Dummy. DummyTC is
+declared in the text but not used by the kernel.
+NOTE(review): which of these calls produce an error and which a warning is decided by the
+compiler; confirm against the counts expected by the RTC test that consumes this string.
+*/
+static constexpr auto kBuiltinFetchMin{R"(
+  constexpr int kMemOrder = __ATOMIC_RELAXED;
+  constexpr int kMemScope = __HIP_MEMORY_SCOPE_SYSTEM;
+
+  class DummyTC {
+  public:
+    __device__ DummyTC() {}
+    __device__ ~DummyTC() = default;
+    __device__ DummyTC(const DummyTC&) = default;
+    __device__ DummyTC& operator=(const DummyTC&) = default;
+    __device__ DummyTC(DummyTC&&) = default;
+    __device__ DummyTC& operator=(DummyTC&&) = default;
+  };
+
+  class Dummy {
+  public:
+    __device__ Dummy() {}
+    __device__ ~Dummy() {}
+  };
+
+  __global__ void FetchMinCompileKernel(int* x) {
+    int old{};
+    old = __hip_atomic_fetch_min(x, 1, __ATOMIC_RELAXED, kMemScope);
+    old = __hip_atomic_fetch_min(x, 1, __ATOMIC_CONSUME, kMemScope);
+    old = __hip_atomic_fetch_min(x, 1, __ATOMIC_ACQUIRE, kMemScope);
+    old = __hip_atomic_fetch_min(x, 1, __ATOMIC_RELEASE, kMemScope);
+    old = __hip_atomic_fetch_min(x, 1, __ATOMIC_ACQ_REL, kMemScope);
+    old = __hip_atomic_fetch_min(x, 1, __ATOMIC_SEQ_CST, kMemScope);
+
+    old = __hip_atomic_fetch_min(reinterpret_cast<const int*>(x), 1, kMemOrder, kMemScope);
+    old = __hip_atomic_fetch_min(*x, 1, kMemOrder, kMemScope);
+    old = __hip_atomic_fetch_min(x, 1, -1, kMemScope);
+    old = __hip_atomic_fetch_min(x, 1, 10, kMemScope);
+    old = __hip_atomic_fetch_min(x, 1, kMemOrder, -1);
+    old = __hip_atomic_fetch_min(x, 1, kMemOrder, 10);
+
+    Dummy dummy{};
+    old = __hip_atomic_fetch_min(&dummy, 1, kMemOrder, kMemScope);
+  }
+)"};
@@ -0,0 +1,412 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <cmd_options.hh>
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+#include <resource_guards.hh>
+
+namespace cg = cooperative_groups;
+
+namespace Bitwise {
+// Selects, at compile time, which atomic bitwise call a test instantiation exercises. The
+// trailing comment on each enumerator names the call PerformAtomicOperation issues for it.
+enum class AtomicOperation {
+  kAnd = 0,     // atomicAnd
+  kAndSystem,   // atomicAnd_system
+  kOr,          // atomicOr
+  kOrSystem,    // atomicOr_system
+  kXor,         // atomicXor
+  kXorSystem,   // atomicXor_system
+  kBuiltinAnd,  // __hip_atomic_fetch_and
+  kBuiltinOr,   // __hip_atomic_fetch_or
+  kBuiltinXor   // __hip_atomic_fetch_xor
+};
+
+// Second operand of every bitwise operation, on the device and in the host reference alike.
+constexpr auto kMask{0xAAAA};
+// Value GetTestValue returns for every operation other than kAnd and kAndSystem.
+constexpr auto kTestValue{0x4545};
+// Value GetTestValue returns for kAnd and kAndSystem.
+constexpr auto kAndTestValue{0xFFFF};
+
+// Returns the value the tested memory location starts out with, converted to TestType:
+// kAndTestValue for kAnd and kAndSystem, kTestValue for every other operation.
+// NOTE(review): kBuiltinAnd is not part of the AND group here although the host reference treats
+// it as an AND operation - confirm that this is intended.
+template <typename TestType, AtomicOperation operation>
+__host__ __device__ TestType GetTestValue() {
+  if constexpr (operation != AtomicOperation::kAnd && operation != AtomicOperation::kAndSystem) {
+    return kTestValue;
+  } else {
+    return kAndTestValue;
+  }
+}
+
+// Applies the atomic call selected by `operation` to *mem with kMask as the operand and returns
+// the value that call returns. `memory_scope` is forwarded only to the __hip_atomic_fetch_*
+// builtins; the atomicXxx and atomicXxx_system calls do not take it.
+template <typename TestType, AtomicOperation operation, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+__device__ TestType PerformAtomicOperation(TestType* const mem) {
+  // Local copy of the namespace-scope constant that is passed to the selected call.
+  const auto operand = kMask;
+
+  // Exactly one of the branches below is instantiated for a given `operation`.
+  if constexpr (operation == AtomicOperation::kAnd) {
+    return atomicAnd(mem, operand);
+  }
+  if constexpr (operation == AtomicOperation::kAndSystem) {
+    return atomicAnd_system(mem, operand);
+  }
+  if constexpr (operation == AtomicOperation::kOr) {
+    return atomicOr(mem, operand);
+  }
+  if constexpr (operation == AtomicOperation::kOrSystem) {
+    return atomicOr_system(mem, operand);
+  }
+  if constexpr (operation == AtomicOperation::kXor) {
+    return atomicXor(mem, operand);
+  }
+  if constexpr (operation == AtomicOperation::kXorSystem) {
+    return atomicXor_system(mem, operand);
+  }
+  if constexpr (operation == AtomicOperation::kBuiltinAnd) {
+    return __hip_atomic_fetch_and(mem, operand, __ATOMIC_RELAXED, memory_scope);
+  }
+  if constexpr (operation == AtomicOperation::kBuiltinOr) {
+    return __hip_atomic_fetch_or(mem, operand, __ATOMIC_RELAXED, memory_scope);
+  }
+  if constexpr (operation == AtomicOperation::kBuiltinXor) {
+    return __hip_atomic_fetch_xor(mem, operand, __ATOMIC_RELAXED, memory_scope);
+  }
+}
+
+// Kernel for a single tested value: every thread performs the selected atomic operation on the
+// same location and stores the value it got back in old_vals at its grid rank.
+// With use_shared_mem the value is first copied from global_mem[0] into a __shared__ variable,
+// the operation runs on that variable, and the result is copied back afterwards.
+// NOTE(review): the copy-in and copy-out are done by the thread with grid rank 0 only, which
+// looks like it assumes a single-block launch when use_shared_mem is true - confirm with callers.
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+__global__ void TestKernel(TestType* const global_mem, TestType* const old_vals) {
+  __shared__ TestType shared_mem;
+
+  const auto rank = cg::this_grid().thread_rank();
+
+  if constexpr (use_shared_mem) {
+    const bool is_first_thread = rank == 0;
+
+    if (is_first_thread) shared_mem = *global_mem;
+    // All threads wait until the shared value is populated.
+    __syncthreads();
+
+    old_vals[rank] = PerformAtomicOperation<TestType, operation, memory_scope>(&shared_mem);
+
+    // All atomics of the block are done before the result is published.
+    __syncthreads();
+    if (is_first_thread) *global_mem = shared_mem;
+  } else {
+    old_vals[rank] = PerformAtomicOperation<TestType, operation, memory_scope>(global_mem);
+  }
+}
+
+// Returns the address of element `idx` in a pitched layout: `ptr` advanced by idx * pitch bytes
+// (not elements). The product is computed in unsigned int arithmetic.
+template <typename TestType>
+__host__ __device__ TestType* PitchedOffset(TestType* const ptr, const unsigned int pitch,
+                                            const unsigned int idx) {
+  uint8_t* const base = reinterpret_cast<uint8_t*>(ptr);
+  uint8_t* const element = base + idx * pitch;
+  return reinterpret_cast<TestType*>(element);
+}
+
+// Reads, XORs with 0xAB and writes back every byte in [begin_addr, end_addr). The accesses go
+// through a volatile pointer so that each read and write is actually performed.
+// An empty or inverted range (begin_addr >= end_addr) is a no-op.
+__device__ void GenerateMemoryTraffic(uint8_t* const begin_addr, uint8_t* const end_addr) {
+  // `<` instead of `!=`: the pitched kernel passes begin = slot + sizeof(TestType) and
+  // end = slot + pitch, so a pitch smaller than the element size yields begin > end. With `!=`
+  // that would walk past the allocation instead of doing nothing.
+  for (volatile uint8_t* addr = begin_addr; addr < end_addr; ++addr) {
+    uint8_t val = *addr;
+    val ^= 0xAB;
+    *addr = val;
+  }
+}
+
+// Kernel for `width` tested values laid out `pitch` bytes apart. A thread with grid rank tid
+// works on slot tid % width. The first (grid size - width) threads perform the selected atomic
+// operation on their slot and store the returned value in old_vals[tid]; the remaining threads
+// toggle the bytes that follow the value inside their slot (from sizeof(TestType) up to pitch).
+// With use_shared_mem the slots are first copied from global_mem into dynamic shared memory,
+// all work happens there, and the slots are copied back afterwards.
+// NOTE(review): the copy-in and copy-out are guarded by the grid rank (tid < width), which looks
+// like it assumes a single-block launch when use_shared_mem is true - confirm with callers.
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+__global__ void TestKernel(TestType* const global_mem, TestType* const old_vals,
+                           const unsigned int width, const unsigned pitch) {
+  extern __shared__ uint8_t shared_mem[];
+
+  const auto tid = cg::this_grid().thread_rank();
+
+  // Base of the slots the operations run on: global memory unless shared memory is requested.
+  TestType* mem = global_mem;
+
+  if constexpr (use_shared_mem) {
+    mem = reinterpret_cast<TestType*>(shared_mem);
+    if (tid < width) {
+      *PitchedOffset(mem, pitch, tid) = *PitchedOffset(global_mem, pitch, tid);
+    }
+    __syncthreads();
+  }
+
+  const auto atomic_thread_count = cg::this_grid().size() - width;
+  TestType* const slot = PitchedOffset(mem, pitch, tid % width);
+
+  if (tid >= atomic_thread_count) {
+    // Traffic threads touch only the bytes after the tested value within the same slot.
+    uint8_t* const slot_bytes = reinterpret_cast<uint8_t*>(slot);
+    GenerateMemoryTraffic(slot_bytes + sizeof(TestType), slot_bytes + pitch);
+  } else {
+    old_vals[tid] = PerformAtomicOperation<TestType, operation, memory_scope>(slot);
+  }
+
+  if constexpr (use_shared_mem) {
+    __syncthreads();
+    if (tid < width) {
+      *PitchedOffset(global_mem, pitch, tid) = *PitchedOffset(mem, pitch, tid);
+    }
+  }
+}
+
+// Parameter bundle describing one test configuration.
+struct TestParams {
+  dim3 blocks;   // grid dimensions
+  dim3 threads;  // block dimensions
+  // Multiplies the simulated thread count in the host reference.
+  unsigned int num_devices = 1u;
+  // Multiplies the simulated thread count in the host reference.
+  unsigned int kernel_count = 1u;
+  // Number of tested values; simulated thread tid works on value tid % width.
+  unsigned int width = 1u;
+  // NOTE(review): presumably the byte distance between consecutive values - confirm with callers.
+  unsigned int pitch = 0u;
+  // NOTE(review): not used by the code visible here; presumably the number of host threads.
+  unsigned int host_thread_count = 0u;
+  // NOTE(review): not used by the code visible here; presumably selects the allocation kind.
+  LinearAllocs alloc_type;
+
+  // Number of threads in one launch: product of all grid and block dimensions.
+  auto ThreadCount() const {
+    const auto block_count = blocks.x * blocks.y * blocks.z;
+    const auto threads_per_block = threads.x * threads.y * threads.z;
+    return block_count * threads_per_block;
+  }
+};
+
+// Host-side reference: replays the selected bitwise operation sequentially for
+// num_devices * kernel_count * ThreadCount() simulated threads, thread tid working on value
+// tid % width. Returns {final value of each of the `width` locations, value observed by each
+// simulated thread before its operation, in replay order}.
+template <typename TestType, AtomicOperation operation>
+std::tuple<std::vector<TestType>, std::vector<TestType>> TestKernelHostRef(const TestParams& p) {
+  constexpr bool is_and = operation == AtomicOperation::kAnd ||
+      operation == AtomicOperation::kAndSystem || operation == AtomicOperation::kBuiltinAnd;
+  constexpr bool is_or = operation == AtomicOperation::kOr ||
+      operation == AtomicOperation::kOrSystem || operation == AtomicOperation::kBuiltinOr;
+  constexpr bool is_xor = operation == AtomicOperation::kXor ||
+      operation == AtomicOperation::kXorSystem || operation == AtomicOperation::kBuiltinXor;
+
+  const auto total_threads = p.num_devices * p.kernel_count * p.ThreadCount();
+  const auto operand = kMask;
+
+  // Every location starts out with the operation-specific test value.
+  const TestType initial_value = GetTestValue<TestType, operation>();
+  std::vector<TestType> final_values(p.width, initial_value);
+
+  std::vector<TestType> observed;
+  observed.reserve(total_threads);
+
+  for (auto tid = 0u; tid < total_threads; ++tid) {
+    TestType& slot = final_values[tid % p.width];
+    // Record the value before the operation, as the atomic calls are expected to return it.
+    observed.push_back(slot);
+
+    if constexpr (is_and) {
+      slot = slot & operand;
+    } else if constexpr (is_or) {
+      slot = slot | operand;
+    } else if constexpr (is_xor) {
+      slot = slot ^ operand;
+    }
+  }
+
+  return {final_values, observed};
+}
+
+// Checks device results against the host reference model. Final element values are compared
+// index by index; the observed "old" values are compared as sorted sequences because the order
+// in which device threads performed their operations is not fixed. old_vals is sorted in place.
+template <typename TestType, AtomicOperation operation>
+void Verify(const TestParams& p, std::vector<TestType>& res_vals, std::vector<TestType>& old_vals) {
+  auto [expected_res_vals, expected_old_vals] = TestKernelHostRef<TestType, operation>(p);
+
+  const auto res_count = res_vals.size();
+  for (auto i = 0u; i < res_count; ++i) {
+    INFO("Results index: " << i);
+    REQUIRE(expected_res_vals[i] == res_vals[i]);
+  }
+
+  std::sort(old_vals.begin(), old_vals.end());
+  std::sort(expected_old_vals.begin(), expected_old_vals.end());
+
+  const auto old_count = old_vals.size();
+  for (auto i = 0u; i < old_count; ++i) {
+    INFO("Old values index: " << i);
+    REQUIRE(expected_old_vals[i] == old_vals[i]);
+  }
+}
+
+// Launches the matching TestKernel overload on the given stream: the two-argument kernel when
+// there is exactly one element with no padding, the pitched kernel otherwise. Dynamic shared
+// memory of width * pitch bytes is requested only when use_shared_mem is set.
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+void LaunchKernel(const TestParams& p, hipStream_t stream, TestType* const mem_ptr,
+                  TestType* const old_vals) {
+  const auto shared_mem_size = use_shared_mem ? p.width * p.pitch : 0u;
+  const bool single_unpadded_element = p.width == 1 && p.pitch == sizeof(TestType);
+
+  if (single_unpadded_element) {
+    TestKernel<TestType, operation, use_shared_mem, memory_scope>
+        <<<p.blocks, p.threads, shared_mem_size, stream>>>(mem_ptr, old_vals);
+  } else {
+    TestKernel<TestType, operation, use_shared_mem, memory_scope>
+        <<<p.blocks, p.threads, shared_mem_size, stream>>>(mem_ptr, old_vals, p.width, p.pitch);
+  }
+}
+
+// Runs one complete test configuration:
+//   1. allocates a per-device buffer for the values returned by the atomics and one stream per
+//      kernel to be launched,
+//   2. allocates the memory under test and seeds every TestType-sized slot with the test value,
+//   3. launches kernel_count kernels on each of num_devices devices,
+//   4. waits for all launches, copies the results back and checks them with Verify().
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+void TestCore(const TestParams& p) {
+  const auto old_vals_alloc_size = p.kernel_count * p.ThreadCount() * sizeof(TestType);
+  std::vector<LinearAllocGuard<TestType>> old_vals_devs;
+  std::vector<StreamGuard> streams;
+  for (auto i = 0u; i < p.num_devices; ++i) {
+    HIP_CHECK(hipSetDevice(static_cast<int>(i)));
+    old_vals_devs.emplace_back(LinearAllocs::hipMalloc, old_vals_alloc_size);
+    for (auto j = 0u; j < p.kernel_count; ++j) {
+      streams.emplace_back(Streams::created);
+    }
+  }
+
+  const auto mem_alloc_size = p.width * p.pitch;
+  LinearAllocGuard<TestType> mem_dev(p.alloc_type, mem_alloc_size);
+
+  std::vector<TestType> old_vals(p.num_devices * p.kernel_count * p.ThreadCount());
+  std::vector<TestType> res_vals(p.width);
+
+  TestType* const mem_ptr =
+      p.alloc_type == LinearAllocs::hipMalloc ? mem_dev.ptr() : mem_dev.host_ptr();
+
+  // Zero the whole allocation, then seed every TestType-sized slot (padding included) with the
+  // test value using a single copy instead of one copy per slot.
+  TestType test_value = GetTestValue<TestType, operation>();
+  HIP_CHECK(hipMemset(mem_ptr, 0, mem_alloc_size));
+  const std::vector<TestType> initial_vals(mem_alloc_size / sizeof(TestType), test_value);
+  if (!initial_vals.empty()) {
+    HIP_CHECK(hipMemcpy(mem_ptr, initial_vals.data(), initial_vals.size() * sizeof(TestType),
+                        hipMemcpyHostToDevice));
+  }
+
+  for (auto i = 0u; i < p.num_devices; ++i) {
+    for (auto j = 0u; j < p.kernel_count; ++j) {
+      const auto& stream = streams[i * p.kernel_count + j].stream();
+      const auto old_vals_dev = old_vals_devs[i].ptr() + j * p.ThreadCount();
+      LaunchKernel<TestType, operation, use_shared_mem, memory_scope>(p, stream, mem_dev.ptr(),
+                                                                      old_vals_dev);
+    }
+  }
+
+  // The kernels run on streams that may belong to a device other than the current one, so wait
+  // for each stream explicitly before reading anything back.
+  for (auto& stream_guard : streams) {
+    HIP_CHECK(hipStreamSynchronize(stream_guard.stream()));
+  }
+
+  for (auto i = 0u; i < p.num_devices; ++i) {
+    const auto device_offset = i * p.kernel_count * p.ThreadCount();
+    HIP_CHECK(hipMemcpy(old_vals.data() + device_offset, old_vals_devs[i].ptr(),
+                        old_vals_alloc_size, hipMemcpyDeviceToHost));
+  }
+  // Gather one element per pitch into the densely packed result vector.
+  HIP_CHECK(hipMemcpy2D(res_vals.data(), sizeof(TestType), mem_ptr, p.pitch, sizeof(TestType),
+                        p.width, hipMemcpyDeviceToHost));
+
+  Verify<TestType, operation>(p, res_vals, old_vals);
+}
+
+// Catch2 generator for the block sizes exercised by the tests: 16 and 1024 threads.
+inline dim3 GenerateThreadDimensions() {
+  const dim3 threads = GENERATE(dim3(16), dim3(1024));
+  return threads;
+}
+
+// Catch2 generator for the grid sizes exercised by the tests: one block per multiprocessor of
+// device 0, and one and a half times that many blocks.
+inline dim3 GenerateBlockDimensions() {
+  int multiprocessors = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, 0));
+
+  const dim3 blocks =
+      GENERATE_COPY(dim3(multiprocessors), dim3(multiprocessors + multiprocessors / 2));
+  return blocks;
+}
+
+// Single device, single kernel test driver.
+//
+// For the builtin operations the launch geometry is restricted so that all participating threads
+// are inside the requested memory scope (one thread for single-thread scope, one wavefront for
+// wavefront scope, one block for every scope up to work-group). The global-memory section runs
+// once per allocation type; the shared-memory section always uses a single block.
+//
+// memory_scope is forwarded to TestCore so that the kernel really issues its atomics at the
+// scope the geometry was chosen for.
+template <typename TestType, AtomicOperation operation, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+void SingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) {
+  constexpr bool is_builtin = operation == AtomicOperation::kBuiltinAnd ||
+      operation == AtomicOperation::kBuiltinOr || operation == AtomicOperation::kBuiltinXor;
+  constexpr bool single_block_scope = memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD ||
+      memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT ||
+      memory_scope == __HIP_MEMORY_SCOPE_WORKGROUP;
+
+  TestParams params;
+  params.num_devices = 1;
+  params.kernel_count = 1;
+  if constexpr (is_builtin && memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD) {
+    params.threads = dim3(1);
+  } else if constexpr (is_builtin && memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT) {
+    int warp_size = 0;
+    HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+    params.threads = dim3(warp_size);
+  } else {
+    params.threads = GenerateThreadDimensions();
+  }
+  params.width = width;
+  params.pitch = pitch;
+
+  SECTION("Global memory") {
+    if constexpr (is_builtin && single_block_scope) {
+      params.blocks = dim3(1);
+    } else {
+      params.blocks = GenerateBlockDimensions();
+    }
+    using LA = LinearAllocs;
+    for (const auto alloc_type :
+         {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+      params.alloc_type = alloc_type;
+      DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
+        TestCore<TestType, operation, false, memory_scope>(params);
+      }
+    }
+  }
+
+  SECTION("Shared memory") {
+    params.blocks = dim3(1);
+    params.alloc_type = LinearAllocs::hipMalloc;
+    TestCore<TestType, operation, true, memory_scope>(params);
+  }
+}
+
+// Single device, several concurrently launched kernels. Skipped when device 0 does not report
+// support for concurrent kernel execution. Runs once per allocation type, global memory only.
+template <typename TestType, AtomicOperation operation>
+void SingleDeviceMultipleKernelTest(const unsigned int kernel_count, const unsigned int width,
+                                    const unsigned int pitch) {
+  int supports_concurrent_kernels = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&supports_concurrent_kernels,
+                                  hipDeviceAttributeConcurrentKernels, 0));
+  if (supports_concurrent_kernels == 0) {
+    HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
+    return;
+  }
+
+  TestParams params;
+  params.num_devices = 1;
+  params.kernel_count = kernel_count;
+  params.blocks = GenerateBlockDimensions();
+  params.threads = GenerateThreadDimensions();
+  params.width = width;
+  params.pitch = pitch;
+
+  const LinearAllocs alloc_types[] = {LinearAllocs::hipMalloc, LinearAllocs::hipHostMalloc,
+                                      LinearAllocs::hipMallocManaged,
+                                      LinearAllocs::mallocAndRegister};
+  for (const auto alloc_type : alloc_types) {
+    params.alloc_type = alloc_type;
+    DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
+      TestCore<TestType, operation, false>(params);
+    }
+  }
+}
+
+// Several devices, each launching kernel_count kernels that operate on the same host-accessible
+// memory with system-scope atomics. Skipped when fewer than num_devices devices are present or,
+// if more than one kernel per device is requested, when any of the devices lacks support for
+// concurrent kernel execution.
+template <typename TestType, AtomicOperation operation>
+void MultipleDeviceMultipleKernelTest(const unsigned int num_devices,
+                                      const unsigned int kernel_count, const unsigned int width,
+                                      const unsigned int pitch) {
+  if (num_devices > 1 && HipTest::getDeviceCount() < num_devices) {
+    std::string msg = std::to_string(num_devices) + " devices are required";
+    HipTest::HIP_SKIP_TEST(msg.c_str());
+    return;
+  }
+
+  if (kernel_count > 1) {
+    for (auto device = 0u; device < num_devices; ++device) {
+      int supports_concurrent_kernels = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&supports_concurrent_kernels,
+                                      hipDeviceAttributeConcurrentKernels, device));
+      if (supports_concurrent_kernels == 0) {
+        HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
+        return;
+      }
+    }
+  }
+
+  TestParams params;
+  params.num_devices = num_devices;
+  params.kernel_count = kernel_count;
+  params.blocks = GenerateBlockDimensions();
+  params.threads = GenerateThreadDimensions();
+  params.width = width;
+  params.pitch = pitch;
+
+  // Plain hipMalloc is left out of this list; only the host-backed allocation kinds are used.
+  const LinearAllocs alloc_types[] = {LinearAllocs::hipHostMalloc, LinearAllocs::hipMallocManaged,
+                                      LinearAllocs::mallocAndRegister};
+  for (const auto alloc_type : alloc_types) {
+    params.alloc_type = alloc_type;
+    DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
+      TestCore<TestType, operation, false, __HIP_MEMORY_SCOPE_SYSTEM>(params);
+    }
+  }
+}
+
+}  // namespace Bitwise
@@ -0,0 +1,433 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip_test_common.hh>
+#include <resource_guards.hh>
+
+// Selects which builtin atomic operation SetFlag() and FetchFlag() use on the device to raise
+// and to poll the synchronization flag.
+enum class BuiltinAtomicOperation {
+  kLoadStore = 0,
+  kExchange,
+  kCompareExchangeStrong,
+  kCompareExchangeWeak,
+  kAdd,
+  kAnd,
+  kOr,
+  kXor,
+  kMin,
+  kMax
+};
+
+// Raises the synchronization flag.
+//
+// Device code uses the __hip_atomic builtin selected by `operation`, with the given memory
+// order and scope. Host code ignores memory_order and memory_scope and always performs a
+// release store.
+//
+// The raised value depends on the operation: kAnd clears the flag to 0, kMin lowers it to -1,
+// and the remaining operations leave it at 1 (assuming the flag holds 0 beforehand, as kAdd,
+// kXor and the compare-exchange variants build on the previous value).
+template <BuiltinAtomicOperation operation, int memory_order, int memory_scope>
+__host__ __device__ void SetFlag(int* const flag) {
+#ifdef __HIP_DEVICE_COMPILE__
+  if constexpr (operation == BuiltinAtomicOperation::kLoadStore) {
+    static_assert(memory_order != __ATOMIC_ACQ_REL);
+    __hip_atomic_store(flag, 1, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kExchange) {
+    __hip_atomic_exchange(flag, 1, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kCompareExchangeStrong) {
+    // Single attempt: swaps 0 -> 1 and stores nothing if the flag is not 0.
+    int compare = 0;
+    __hip_atomic_compare_exchange_strong(flag, &compare, 1, memory_order, __ATOMIC_RELAXED,
+                                         memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kCompareExchangeWeak) {
+    // Retry until the exchange succeeds; the expected value is reset to 0 after each failed
+    // attempt.
+    int compare = 0;
+    while (!__hip_atomic_compare_exchange_weak(flag, &compare, 1, memory_order, __ATOMIC_RELAXED,
+                                               memory_scope))
+      compare = 0;
+  } else if constexpr (operation == BuiltinAtomicOperation::kAdd) {
+    __hip_atomic_fetch_add(flag, 1, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kAnd) {
+    __hip_atomic_fetch_and(flag, 0x0, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kOr) {
+    __hip_atomic_fetch_or(flag, 0x1, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kXor) {
+    __hip_atomic_fetch_xor(flag, 0x1, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kMin) {
+    __hip_atomic_fetch_min(flag, -1, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kMax) {
+    __hip_atomic_fetch_max(flag, 1, memory_order, memory_scope);
+  }
+#else
+  // Host side: the raised value only has to match what FetchFlag() expects for the operation.
+  if constexpr (operation == BuiltinAtomicOperation::kAnd) {
+    __atomic_store_n(flag, 0, __ATOMIC_RELEASE);
+  } else {
+    __atomic_store_n(flag, 1, __ATOMIC_RELEASE);
+  }
+#endif
+}
+
+// Polls the synchronization flag and returns non-zero once it has been raised.
+//
+// Device code reads the flag through the __hip_atomic builtin selected by `operation`, using
+// operands chosen to leave a raised flag as SetFlag() wrote it. The exception is kExchange,
+// which writes 0 into the flag on every call. Host code ignores memory_order and memory_scope
+// and always performs an acquire load.
+//
+// For kAnd the result is negated, because the raised state is 0 for that operation.
+template <BuiltinAtomicOperation operation, int memory_order, int memory_scope>
+__host__ __device__ int FetchFlag(int* const flag) {
+#ifdef __HIP_DEVICE_COMPILE__
+  if constexpr (operation == BuiltinAtomicOperation::kLoadStore) {
+    static_assert(memory_order != __ATOMIC_ACQ_REL);
+    return __hip_atomic_load(flag, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kExchange) {
+    return __hip_atomic_exchange(flag, 0, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kCompareExchangeStrong) {
+    // Swaps 1 -> 1 so the flag is never changed. On failure the builtin stores the value it
+    // observed into `compare`, which is what gets returned. The failure order may not be
+    // acq_rel, so acquire is substituted in that case.
+    int compare = 1;
+    __hip_atomic_compare_exchange_strong(
+        flag, &compare, 1, memory_order,
+        memory_order == __ATOMIC_ACQ_REL ? __ATOMIC_ACQUIRE : memory_order, memory_scope);
+    return compare;
+  } else if constexpr (operation == BuiltinAtomicOperation::kCompareExchangeWeak) {
+    // Same scheme as the strong variant; the outcome of the exchange itself is not inspected,
+    // only the value left in `compare`.
+    int compare = 1;
+    __hip_atomic_compare_exchange_weak(
+        flag, &compare, 1, memory_order,
+        memory_order == __ATOMIC_ACQ_REL ? __ATOMIC_ACQUIRE : memory_order, memory_scope);
+    return compare;
+  } else if constexpr (operation == BuiltinAtomicOperation::kAdd) {
+    return __hip_atomic_fetch_add(flag, 0, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kAnd) {
+    return !__hip_atomic_fetch_and(flag, 0x1, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kOr) {
+    return __hip_atomic_fetch_or(flag, 0x0, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kXor) {
+    return __hip_atomic_fetch_xor(flag, 0x0, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kMin) {
+    return __hip_atomic_fetch_min(flag, 0, memory_order, memory_scope);
+  } else if constexpr (operation == BuiltinAtomicOperation::kMax) {
+    return __hip_atomic_fetch_max(flag, 0, memory_order, memory_scope);
+  }
+#else
+  if constexpr (operation == BuiltinAtomicOperation::kAnd) {
+    return !__atomic_load_n(flag, __ATOMIC_ACQUIRE);
+  } else {
+    return __atomic_load_n(flag, __ATOMIC_ACQUIRE);
+  }
+#endif
+}
+
+namespace AcquireRelease {
+
+// Payload the producer writes before raising the flag; the consumer must read it back.
+constexpr auto kTestValue = 42;
+
+// Producer side of the acquire/release test: writes the payload with a plain store, then
+// raises the flag with SetFlag(). A requested acquire order is replaced by release for the
+// flag update; every other order is passed through unchanged.
+template <BuiltinAtomicOperation operation, int memory_order, int memory_scope>
+__host__ __device__ void Producer(int* const flag, int* const data) {
+  constexpr int actual_memory_order =
+      memory_order == __ATOMIC_ACQUIRE ? __ATOMIC_RELEASE : memory_order;
+
+  // The payload must be written before the flag is raised.
+  data[0] = kTestValue;
+
+  SetFlag<operation, actual_memory_order, memory_scope>(flag);
+}
+
+// Consumer side of the acquire/release test: spins until FetchFlag() reports the flag as
+// raised, then copies the payload from data[0] into ret[0] with plain accesses.
+template <BuiltinAtomicOperation operation, int memory_order, int memory_scope>
+__host__ __device__ void Consumer(int* const flag, int* const data, int* const ret) {
+  while (!FetchFlag<operation, memory_order, memory_scope>(flag))
+    ;
+
+  ret[0] = data[0];
+}
+
+// Device-only acquire/release test. One thread acts as producer and one as consumer; which
+// threads take the roles depends on the memory scope:
+//   wavefront  - threads 0 and 1 of block 0
+//   work-group - threads 0 and warpSize of block 0
+//   agent      - thread 0 of block 0 and thread 0 of block 1
+// For any other scope no thread takes a role. Passing data == nullptr makes the kernel use a
+// word of shared memory as payload instead of global memory.
+//
+// NOTE(review): the flag is reset by thread 0 of block 0 and followed only by __syncthreads();
+// for agent scope the consumer lives in block 1 - confirm it cannot observe the flag before the
+// reset.
+template <BuiltinAtomicOperation operation, int memory_order, int memory_scope>
+__global__ void TestKernel(int* const flag, int* data, int* const ret) {
+  __shared__ int shared_mem;
+
+  if (data == nullptr) {
+    data = &shared_mem;
+  }
+
+  const bool in_first_block = blockIdx.x == 0;
+  const bool first_thread = in_first_block && threadIdx.x == 0;
+
+  // Put the flag into its lowered state: 1 for kAnd, 0 for everything else.
+  if (first_thread) {
+    *flag = operation == BuiltinAtomicOperation::kAnd ? 1 : 0;
+  }
+  __syncthreads();
+
+  bool producer = false;
+  bool consumer = false;
+
+  if constexpr (memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT) {
+    producer = first_thread;
+    consumer = in_first_block && threadIdx.x == 1;
+  } else if constexpr (memory_scope == __HIP_MEMORY_SCOPE_WORKGROUP) {
+    producer = first_thread;
+    consumer = in_first_block && threadIdx.x == warpSize;
+  } else if constexpr (memory_scope == __HIP_MEMORY_SCOPE_AGENT) {
+    producer = first_thread;
+    consumer = blockIdx.x == 1 && threadIdx.x == 0;
+  }
+
+  if (producer) {
+    Producer<operation, memory_order, memory_scope>(flag, data);
+    return;
+  }
+
+  if (consumer) {
+    Consumer<operation, memory_order, memory_scope>(flag, data, ret);
+  }
+}
+
+// Runs Producer() on thread 0 of block 0; every other thread exits immediately.
+template <BuiltinAtomicOperation operation, int memory_order, int memory_scope>
+__global__ void ProducerKernel(int* const flag, int* const data) {
+  const bool first_thread = blockIdx.x == 0 && threadIdx.x == 0;
+  if (first_thread) {
+    Producer<operation, memory_order, memory_scope>(flag, data);
+  }
+}
+
+// Runs Consumer() on thread 0 of block 0; every other thread exits immediately.
+template <BuiltinAtomicOperation operation, int memory_order, int memory_scope>
+__global__ void ConsumerKernel(int* const flag, int* const data, int* const ret) {
+  const bool first_thread = blockIdx.x == 0 && threadIdx.x == 0;
+  if (first_thread) {
+    Consumer<operation, memory_order, memory_scope>(flag, data, ret);
+  }
+}
+
+// Host driver of the device-only acquire/release test. Picks the smallest launch geometry that
+// contains both roles for the given scope, runs TestKernel with the payload in global memory
+// (device or managed allocation) and - for scopes narrower than agent - in shared memory, and
+// checks that the consumer read the payload written by the producer.
+template <BuiltinAtomicOperation operation, int memory_order, int memory_scope> void Test() {
+  int blocks = 1;
+  int threads = 1;
+  if constexpr (memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT) {
+    threads = 2;
+  } else if constexpr (memory_scope == __HIP_MEMORY_SCOPE_WORKGROUP) {
+    int warp_size = 0;
+    HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+    threads = warp_size * 2;
+  } else if constexpr (memory_scope == __HIP_MEMORY_SCOPE_AGENT) {
+    blocks = 2;
+  }
+
+  LinearAllocGuard<int> flag(LinearAllocs::hipMalloc, sizeof(int));
+  LinearAllocGuard<int> ret(LinearAllocs::hipMallocManaged, sizeof(int));
+
+  SECTION("Global memory") {
+    const auto alloc_type = GENERATE(LinearAllocs::hipMalloc, LinearAllocs::hipMallocManaged);
+    LinearAllocGuard<int> data(alloc_type, sizeof(int));
+    TestKernel<operation, memory_order, memory_scope>
+        <<<blocks, threads>>>(flag.ptr(), data.ptr(), ret.ptr());
+  }
+
+  // Shared memory cannot be seen from another block, so this variant only exists for scopes
+  // that keep both roles inside one block.
+  constexpr bool roles_share_a_block =
+      !(memory_scope == __HIP_MEMORY_SCOPE_AGENT || memory_scope == __HIP_MEMORY_SCOPE_SYSTEM);
+  if constexpr (roles_share_a_block) {
+    SECTION("Shared memory") {
+      TestKernel<operation, memory_order, memory_scope>
+          <<<blocks, threads>>>(flag.ptr(), nullptr, ret.ptr());
+    }
+  }
+
+  HIP_CHECK(hipDeviceSynchronize());
+
+  REQUIRE(ret.ptr()[0] == kTestValue);
+}
+
+// System-scope acquire/release test between the host and the device. One side runs Producer(),
+// the other Consumer(), both operating on the same managed flag and on a payload placed in
+// pinned host or managed memory.
+//
+// Synchronization and the final check are performed inside the section that owns the payload
+// allocation, so the payload outlives both the kernel and the host thread. The host thread
+// captures raw pointers by value rather than references to the guards.
+template <BuiltinAtomicOperation operation, int memory_order> void SystemTest() {
+  std::thread host_thread;
+
+  LinearAllocGuard<int> flag(LinearAllocs::hipMallocManaged, sizeof(int));
+  LinearAllocGuard<int> ret(LinearAllocs::hipMallocManaged, sizeof(int));
+
+  // Start from the lowered state FetchFlag() expects: 1 for kAnd (raised means 0 there),
+  // 0 for every other operation.
+  flag.ptr()[0] = operation == BuiltinAtomicOperation::kAnd ? 1 : 0;
+
+  SECTION("Global memory") {
+    const auto alloc_type = GENERATE(LinearAllocs::hipHostMalloc, LinearAllocs::hipMallocManaged);
+    LinearAllocGuard<int> data(alloc_type, sizeof(int));
+
+    int* const flag_ptr = flag.ptr();
+    int* const data_ptr = data.ptr();
+    int* const ret_ptr = ret.ptr();
+
+    SECTION("Host producer - Device consumer") {
+      ConsumerKernel<operation, memory_order, __HIP_MEMORY_SCOPE_SYSTEM>
+          <<<1, 1>>>(flag_ptr, data_ptr, ret_ptr);
+      host_thread = std::thread([=] {
+        Producer<operation, memory_order, __HIP_MEMORY_SCOPE_SYSTEM>(flag_ptr, data_ptr);
+      });
+    }
+
+    SECTION("Device producer - Host consumer") {
+      host_thread = std::thread([=] {
+        Consumer<operation, memory_order, __HIP_MEMORY_SCOPE_SYSTEM>(flag_ptr, data_ptr, ret_ptr);
+      });
+      ProducerKernel<operation, memory_order, __HIP_MEMORY_SCOPE_SYSTEM>
+          <<<1, 1>>>(flag_ptr, data_ptr);
+    }
+
+    HIP_CHECK(hipDeviceSynchronize());
+    // join() on a thread that was never started would throw.
+    if (host_thread.joinable()) {
+      host_thread.join();
+    }
+
+    REQUIRE(ret_ptr[0] == kTestValue);
+  }
+}
+
+} /* namespace AcquireRelease */
+
+namespace SequentialConsistency {
+
+// Producer side of the sequential-consistency test: raises the flag with a seq_cst store.
+//
+// The stored value has to be one that FetchFlag() reports as raised and does not undo when it
+// polls:
+//   kAnd - FetchFlag() negates the value it reads, so raised means 0;
+//   kMin - FetchFlag() performs fetch_min(flag, 0), which would reset a flag of 1 back to 0,
+//          so -1 is stored instead;
+//   all other operations - 1.
+//
+// NOTE(review): for kExchange, FetchFlag() writes 0 into the flag on every poll regardless of
+// the value stored here - confirm that operation is not used with two consumers.
+template <BuiltinAtomicOperation operation, int memory_scope>
+__host__ __device__ void Producer(int* const flag) {
+  constexpr int raised_value = operation == BuiltinAtomicOperation::kAnd
+      ? 0
+      : (operation == BuiltinAtomicOperation::kMin ? -1 : 1);
+  __atomic_store_n(flag, raised_value, __ATOMIC_SEQ_CST);
+}
+
+// Consumer side of the sequential-consistency test: spins until flag1 is reported as raised,
+// then polls flag2 once and atomically increments the counter if flag2 is raised as well.
+// Device code increments with the requested memory scope; host code uses the plain seq_cst
+// builtin.
+//
+// NOTE(review): for kExchange, FetchFlag() writes 0 into the flag it polls, so the single poll
+// of flag2 here would also lower it - confirm how that operation is meant to be used.
+template <BuiltinAtomicOperation operation, int memory_scope>
+__host__ __device__ void Consumer(int* const flag1, int* const flag2, int* const counter) {
+  while (!FetchFlag<operation, __ATOMIC_SEQ_CST, memory_scope>(flag1))
+    ;
+  if (FetchFlag<operation, __ATOMIC_SEQ_CST, memory_scope>(flag2)) {
+#ifdef __HIP_DEVICE_COMPILE__
+    __hip_atomic_fetch_add(counter, 1, __ATOMIC_SEQ_CST, memory_scope);
+#else
+    __atomic_fetch_add(counter, 1, __ATOMIC_SEQ_CST);
+#endif
+  }
+}
+
+// Device-only sequential-consistency test with two producers and two consumers. Producer 1
+// raises flag1 and producer 2 raises flag2; consumer 1 waits for flag1 and then samples flag2,
+// consumer 2 does the opposite. Each consumer bumps the counter when it sees both flags.
+//
+// Role placement per memory scope (producer1, consumer1, producer2, consumer2):
+//   wavefront  - threads 0, 1, 2, 3 of block 0
+//   work-group - threads 0, warpSize, 2 * warpSize, 3 * warpSize of block 0
+//   agent      - thread 0 of blocks 0, 1, 2, 3
+// A null flag pointer is replaced by a word of shared memory.
+//
+// NOTE(review): the flags are reset by thread 0 of block 0 and followed only by
+// __syncthreads(); for agent scope the other roles live in different blocks - confirm the reset
+// cannot race with them.
+template <BuiltinAtomicOperation operation, int memory_scope>
+__global__ void TestKernel(int* flag1, int* flag2, int* const counter) {
+  __shared__ int shared_mem[2];
+
+  if (flag1 == nullptr) {
+    flag1 = &shared_mem[0];
+  }
+  if (flag2 == nullptr) {
+    flag2 = &shared_mem[1];
+  }
+
+  const bool in_first_block = blockIdx.x == 0;
+  const bool leads_block = threadIdx.x == 0;
+
+  // Lowered state is 1 for kAnd and 0 for every other operation.
+  if (in_first_block && leads_block) {
+    const int lowered = operation == BuiltinAtomicOperation::kAnd ? 1 : 0;
+    *flag1 = lowered;
+    *flag2 = lowered;
+  }
+  __syncthreads();
+
+  bool producer1 = false;
+  bool consumer1 = false;
+  bool producer2 = false;
+  bool consumer2 = false;
+
+  if constexpr (memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT) {
+    producer1 = in_first_block && threadIdx.x == 0;
+    consumer1 = in_first_block && threadIdx.x == 1;
+    producer2 = in_first_block && threadIdx.x == 2;
+    consumer2 = in_first_block && threadIdx.x == 3;
+  } else if constexpr (memory_scope == __HIP_MEMORY_SCOPE_WORKGROUP) {
+    producer1 = in_first_block && threadIdx.x == 0;
+    consumer1 = in_first_block && threadIdx.x == warpSize;
+    producer2 = in_first_block && threadIdx.x == warpSize * 2;
+    consumer2 = in_first_block && threadIdx.x == warpSize * 3;
+  } else if constexpr (memory_scope == __HIP_MEMORY_SCOPE_AGENT) {
+    producer1 = blockIdx.x == 0 && leads_block;
+    consumer1 = blockIdx.x == 1 && leads_block;
+    producer2 = blockIdx.x == 2 && leads_block;
+    consumer2 = blockIdx.x == 3 && leads_block;
+  }
+
+  if (producer1) {
+    Producer<operation, memory_scope>(flag1);
+    return;
+  }
+
+  if (consumer1) {
+    Consumer<operation, memory_scope>(flag1, flag2, counter);
+    return;
+  }
+
+  if (producer2) {
+    Producer<operation, memory_scope>(flag2);
+    return;
+  }
+
+  if (consumer2) {
+    Consumer<operation, memory_scope>(flag2, flag1, counter);
+  }
+}
+
+// Runs Producer() on thread 0 of block 0; every other thread exits immediately.
+template <BuiltinAtomicOperation operation, int memory_scope>
+__global__ void ProducerKernel(int* const flag) {
+  const bool first_thread = blockIdx.x == 0 && threadIdx.x == 0;
+  if (first_thread) {
+    Producer<operation, memory_scope>(flag);
+  }
+}
+
+// Runs Consumer() on thread 0 of block 0; every other thread exits immediately.
+template <BuiltinAtomicOperation operation, int memory_scope>
+__global__ void ConsumerKernel(int* const flag1, int* const flag2, int* const counter) {
+  const bool first_thread = blockIdx.x == 0 && threadIdx.x == 0;
+  if (first_thread) {
+    Consumer<operation, memory_scope>(flag1, flag2, counter);
+  }
+}
+
+// Host driver of the device-only sequential-consistency test. Picks the smallest launch
+// geometry that contains all four roles for the given scope, runs TestKernel with the flags in
+// global memory and - for scopes narrower than agent - in shared memory, and checks that at
+// least one consumer observed both flags raised.
+template <BuiltinAtomicOperation operation, int memory_scope> void Test() {
+  int blocks = 1, threads = 1;
+  if (memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT) {
+    blocks = 1;
+    threads = 4;
+  } else if (memory_scope == __HIP_MEMORY_SCOPE_WORKGROUP) {
+    blocks = 1;
+    int warp_size = 0;
+    HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+    threads = warp_size * 4;
+  } else if (memory_scope == __HIP_MEMORY_SCOPE_AGENT) {
+    blocks = 4;
+    threads = 1;
+  }
+
+  LinearAllocGuard<int> counter(LinearAllocs::hipMallocManaged, sizeof(int));
+  // The counter is only ever incremented by the consumers, so it has to start from a known
+  // value for the final check to mean anything.
+  counter.ptr()[0] = 0;
+
+  SECTION("Global memory") {
+    const auto alloc_type = GENERATE(LinearAllocs::hipMalloc);
+    LinearAllocGuard<int> flag1(alloc_type, sizeof(int));
+    LinearAllocGuard<int> flag2(alloc_type, sizeof(int));
+    TestKernel<operation, memory_scope>
+        <<<blocks, threads>>>(flag1.ptr(), flag2.ptr(), counter.ptr());
+  }
+
+  // Shared memory cannot be seen from another block, so this variant only exists for scopes
+  // that keep all roles inside one block.
+  if (memory_scope != __HIP_MEMORY_SCOPE_AGENT && memory_scope != __HIP_MEMORY_SCOPE_SYSTEM) {
+    SECTION("Shared memory") {
+      TestKernel<operation, memory_scope><<<blocks, threads>>>(nullptr, nullptr, counter.ptr());
+    }
+  }
+
+  HIP_CHECK(hipDeviceSynchronize());
+
+  REQUIRE(counter.ptr()[0] != 0);
+}
+
+// System-scope sequential-consistency test with one producer and one consumer on the device
+// and one of each on the host. The device producer raises flag1 and the host producer raises
+// flag2; the device consumer waits for flag1 and samples flag2, the host consumer does the
+// opposite. At least one consumer must observe both flags raised.
+//
+// The two kernels are launched on separate streams: the consumer kernel spins until the
+// producer kernel has run, so queuing both on the same stream would keep the producer from
+// ever starting. Synchronization and the final check happen inside the section that owns the
+// flag allocations, and the host threads capture raw pointers by value, so nothing outlives
+// the memory it uses.
+//
+// NOTE(review): flag1 and flag2 are used as allocated - confirm the allocation provides the
+// lowered state FetchFlag() expects for `operation`.
+template <BuiltinAtomicOperation operation> void SystemTest() {
+  std::thread host_producer, host_consumer;
+
+  LinearAllocGuard<int> counter(LinearAllocs::hipMallocManaged, sizeof(int));
+  // The counter is only ever incremented by the consumers, so it has to start from a known
+  // value for the final check to mean anything.
+  counter.ptr()[0] = 0;
+
+  SECTION("Global memory") {
+    const auto alloc_type = GENERATE(LinearAllocs::hipMallocManaged);
+    LinearAllocGuard<int> flag1(alloc_type, sizeof(int));
+    LinearAllocGuard<int> flag2(alloc_type, sizeof(int));
+    StreamGuard consumer_stream(Streams::created);
+    StreamGuard producer_stream(Streams::created);
+
+    int* const flag1_ptr = flag1.ptr();
+    int* const flag2_ptr = flag2.ptr();
+    int* const counter_ptr = counter.ptr();
+
+    ConsumerKernel<operation, __HIP_MEMORY_SCOPE_SYSTEM>
+        <<<1, 1, 0, consumer_stream.stream()>>>(flag1_ptr, flag2_ptr, counter_ptr);
+    host_consumer = std::thread([=] {
+      Consumer<operation, __HIP_MEMORY_SCOPE_SYSTEM>(flag2_ptr, flag1_ptr, counter_ptr);
+    });
+
+    ProducerKernel<operation, __HIP_MEMORY_SCOPE_SYSTEM>
+        <<<1, 1, 0, producer_stream.stream()>>>(flag1_ptr);
+    host_producer =
+        std::thread([=] { Producer<operation, __HIP_MEMORY_SCOPE_SYSTEM>(flag2_ptr); });
+
+    HIP_CHECK(hipDeviceSynchronize());
+    // join() on a thread that was never started would throw.
+    if (host_producer.joinable()) {
+      host_producer.join();
+    }
+    if (host_consumer.joinable()) {
+      host_consumer.join();
+    }
+
+    REQUIRE(counter_ptr[0] != 0);
+  }
+}
+
+}  // namespace SequentialConsistency
@@ -0,0 +1,420 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+#include <resource_guards.hh>
+#include <cmd_options.hh>
+
+namespace cg = cooperative_groups;
+
+namespace MinMax {
+// Selects the atomic min/max flavour that PerformAtomicOperation() issues on the device.
+enum class AtomicOperation {
+  kMin = 0,
+  kMinSystem,
+  kMax,
+  kMaxSystem,
+  kSafeMin,
+  kUnsafeMin,
+  kSafeMax,
+  kUnsafeMax,
+  kBuiltinMin,
+  kBuiltinMax
+};
+
+// Base values from which GetTestValue() derives the atomic operand: it selects the
+// floating-point one for floating-point test types and the integer one otherwise.
+constexpr auto kIntegerTestValue = 5;
+constexpr auto kFloatingPointTestValue = 5.5;
+
+// Returns the operand passed to the atomic operation: the base test value minus 2 for every
+// min flavour and plus 2 for every max flavour.
+//
+// kBuiltinMin is treated like the other min flavours. With an operand above the base value a
+// min operation would never change memory that holds the base value.
+template <typename TestType, AtomicOperation operation>
+__host__ __device__ TestType GetTestValue() {
+  TestType test_value =
+      std::is_floating_point_v<TestType> ? kFloatingPointTestValue : kIntegerTestValue;
+
+  if constexpr (operation == AtomicOperation::kMin || operation == AtomicOperation::kMinSystem ||
+                operation == AtomicOperation::kUnsafeMin ||
+                operation == AtomicOperation::kSafeMin ||
+                operation == AtomicOperation::kBuiltinMin) {
+    return test_value - 2;
+  } else {
+    return test_value + 2;
+  }
+}
+
+// Applies the atomic min/max flavour selected by `operation` to *mem, using the operand from
+// GetTestValue(), and returns what the atomic call returned. memory_scope is only used by the
+// two builtin flavours, which are issued with relaxed ordering.
+template <typename TestType, AtomicOperation operation, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+__device__ TestType PerformAtomicOperation(TestType* const mem) {
+  const auto operand = GetTestValue<TestType, operation>();
+
+  // Min flavours.
+  if constexpr (operation == AtomicOperation::kMin) {
+    return atomicMin(mem, operand);
+  } else if constexpr (operation == AtomicOperation::kMinSystem) {
+    return atomicMin_system(mem, operand);
+  } else if constexpr (operation == AtomicOperation::kSafeMin) {
+    return safeAtomicMin(mem, operand);
+  } else if constexpr (operation == AtomicOperation::kUnsafeMin) {
+    return unsafeAtomicMin(mem, operand);
+  } else if constexpr (operation == AtomicOperation::kBuiltinMin) {
+    return __hip_atomic_fetch_min(mem, operand, __ATOMIC_RELAXED, memory_scope);
+  }
+  // Max flavours.
+  else if constexpr (operation == AtomicOperation::kMax) {
+    return atomicMax(mem, operand);
+  } else if constexpr (operation == AtomicOperation::kMaxSystem) {
+    return atomicMax_system(mem, operand);
+  } else if constexpr (operation == AtomicOperation::kSafeMax) {
+    return safeAtomicMax(mem, operand);
+  } else if constexpr (operation == AtomicOperation::kUnsafeMax) {
+    return unsafeAtomicMax(mem, operand);
+  } else if constexpr (operation == AtomicOperation::kBuiltinMax) {
+    return __hip_atomic_fetch_max(mem, operand, __ATOMIC_RELAXED, memory_scope);
+  }
+}
+
+// Single-element variant of the min/max test kernel: every thread of the grid applies the
+// atomic operation to the same element and stores the value returned by the operation in
+// old_vals[rank]. With use_shared_mem the element is copied into shared memory first and
+// written back to global_mem[0] once all threads are done.
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+__global__ void TestKernel(TestType* const global_mem, TestType* const old_vals) {
+  __shared__ TestType shared_mem;
+
+  const auto tid = cg::this_grid().thread_rank();
+  const bool is_first_thread = tid == 0;
+
+  TestType* const target = use_shared_mem ? &shared_mem : global_mem;
+
+  if constexpr (use_shared_mem) {
+    if (is_first_thread) {
+      *target = *global_mem;
+    }
+    __syncthreads();
+  }
+
+  old_vals[tid] = PerformAtomicOperation<TestType, operation, memory_scope>(target);
+
+  if constexpr (use_shared_mem) {
+    __syncthreads();
+    if (is_first_thread) {
+      *global_mem = *target;
+    }
+  }
+}
+
+// Returns the address of element `idx` in a layout where consecutive elements are `pitch`
+// bytes apart, i.e. ptr advanced by idx * pitch bytes.
+template <typename TestType>
+__host__ __device__ TestType* PitchedOffset(TestType* const ptr, const unsigned int pitch,
+                                            const unsigned int idx) {
+  uint8_t* const base = reinterpret_cast<uint8_t*>(ptr);
+  uint8_t* const element = base + idx * pitch;
+  return reinterpret_cast<TestType*>(element);
+}
+
+// Reads every byte in [begin_addr, end_addr), XORs it with 0xAB and writes it back. The
+// accesses go through a volatile pointer so each byte is loaded and stored individually.
+__device__ void GenerateMemoryTraffic(uint8_t* const begin_addr, uint8_t* const end_addr) {
+  volatile uint8_t* cursor = begin_addr;
+  while (cursor != end_addr) {
+    const uint8_t flipped = *cursor ^ 0xAB;
+    *cursor = flipped;
+    ++cursor;
+  }
+}
+
+// Pitched variant of the min/max test kernel.
+//
+// Threads whose grid rank is below (grid size - width) apply the atomic operation to element
+// (rank % width) of the pitched row and store the value returned by the operation in
+// old_vals[rank]. All other threads read-modify-write the bytes between the end of their element
+// and the end of its pitch. With use_shared_mem the elements are first copied into dynamic shared
+// memory and copied back to global_mem once every thread is done.
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+__global__ void TestKernel(TestType* const global_mem, TestType* const old_vals,
+                           const unsigned int width, const unsigned pitch) {
+  extern __shared__ uint8_t shared_mem[];
+
+  const auto tid = cg::this_grid().thread_rank();
+
+  TestType* const mem = use_shared_mem ? reinterpret_cast<TestType*>(shared_mem) : global_mem;
+
+  if constexpr (use_shared_mem) {
+    // Stage the elements into shared memory before anybody operates on them.
+    if (tid < width) {
+      *PitchedOffset(mem, pitch, tid) = *PitchedOffset(global_mem, pitch, tid);
+    }
+    __syncthreads();
+  }
+
+  const auto atomic_thread_count = cooperative_groups::this_grid().size() - width;
+  TestType* const slot = PitchedOffset(mem, pitch, tid % width);
+
+  if (tid >= atomic_thread_count) {
+    // Touch only the padding that follows the element, never the element itself.
+    uint8_t* const slot_bytes = reinterpret_cast<uint8_t*>(slot);
+    GenerateMemoryTraffic(slot_bytes + sizeof(TestType), slot_bytes + pitch);
+  } else {
+    old_vals[tid] = PerformAtomicOperation<TestType, operation, memory_scope>(slot);
+  }
+
+  if constexpr (use_shared_mem) {
+    __syncthreads();
+    // Publish the shared-memory results back to global memory.
+    if (tid < width) {
+      *PitchedOffset(global_mem, pitch, tid) = *PitchedOffset(mem, pitch, tid);
+    }
+  }
+}
+
+struct TestParams {  // Launch/allocation configuration consumed by the MinMax test drivers.
+  auto ThreadCount() const {  // threads in one kernel launch: grid size times block size
+    return blocks.x * blocks.y * blocks.z * threads.x * threads.y * threads.z;
+  }
+
+  dim3 blocks;                          // grid dimensions of each launch
+  dim3 threads;                         // block dimensions of each launch
+  unsigned int num_devices = 1u;        // devices that each receive their own kernels
+  unsigned int kernel_count = 1u;       // kernels launched per device, one stream each
+  unsigned int width = 1u;              // number of target elements
+  unsigned int pitch = 0u;              // byte distance between consecutive target elements
+  unsigned int host_thread_count = 0u;  // NOTE(review): no reader visible nearby - confirm use
+  LinearAllocs alloc_type = LinearAllocs::hipMalloc;  // defaulted so it is never indeterminate
+};
+
+template <typename TestType, AtomicOperation operation>
+std::tuple<std::vector<TestType>, std::vector<TestType>> TestKernelHostRef(const TestParams& p) {
+  // Host model of the kernels: replays the min/max for every launched thread in tid order and
+  // returns {final value per target element, value seen by each thread before its update}.
+  using Op = AtomicOperation;
+  constexpr bool kIsMin = operation == Op::kMin || operation == Op::kMinSystem ||
+                          operation == Op::kUnsafeMin || operation == Op::kSafeMin ||
+                          operation == Op::kBuiltinMin;
+  constexpr bool kIsMax = operation == Op::kMax || operation == Op::kMaxSystem ||
+                          operation == Op::kUnsafeMax || operation == Op::kSafeMax ||
+                          operation == Op::kBuiltinMax;
+
+  const auto operand = GetTestValue<TestType, operation>();
+  const auto total_threads = p.num_devices * p.kernel_count * p.ThreadCount();
+  const TestType initial_value =
+      std::is_floating_point_v<TestType> ? kFloatingPointTestValue : kIntegerTestValue;
+
+  std::vector<TestType> res_vals(p.width, initial_value);
+  std::vector<TestType> old_vals;
+  old_vals.reserve(total_threads);
+
+  for (auto tid = 0u; tid < total_threads; ++tid) {
+    TestType& slot = res_vals[tid % p.width];
+    old_vals.push_back(slot);  // recorded before the update, like an atomic's return value
+    if constexpr (kIsMin) {
+      slot = std::min(slot, operand);
+    } else if constexpr (kIsMax) {
+      slot = std::max(slot, operand);
+    }
+  }
+
+  return {res_vals, old_vals};
+}
+
+template <typename TestType, AtomicOperation operation>
+void Verify(const TestParams& p, std::vector<TestType>& res_vals, std::vector<TestType>& old_vals) {
+  // Checks device results against TestKernelHostRef(); `old_vals` is sorted in place.
+  auto [expected_res_vals, expected_old_vals] = TestKernelHostRef<TestType, operation>(p);
+  for (auto i = 0u; i != res_vals.size(); ++i) {
+    INFO("Results index: " << i);
+    REQUIRE(expected_res_vals[i] == res_vals[i]);
+  }
+  // Sorting makes the comparison independent of which thread produced which old value.
+  std::sort(expected_old_vals.begin(), expected_old_vals.end());
+  std::sort(old_vals.begin(), old_vals.end());
+  for (auto i = 0u; i != old_vals.size(); ++i) {
+    INFO("Old values index: " << i);
+    REQUIRE(expected_old_vals[i] == old_vals[i]);
+  }
+}
+
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+void LaunchKernel(const TestParams& p, hipStream_t stream, TestType* const mem_ptr,
+                  TestType* const old_vals) {
+  const auto smem_bytes = use_shared_mem ? p.width * p.pitch : 0u;  // dynamic shared memory size
+  if (p.width != 1 || p.pitch != sizeof(TestType))  // several targets and/or padded pitch
+    TestKernel<TestType, operation, use_shared_mem, memory_scope>
+        <<<p.blocks, p.threads, smem_bytes, stream>>>(mem_ptr, old_vals, p.width, p.pitch);
+  else  // one tightly packed target: use the two-argument kernel overload
+    TestKernel<TestType, operation, use_shared_mem, memory_scope>
+        <<<p.blocks, p.threads, smem_bytes, stream>>>(mem_ptr, old_vals);
+}
+
+template <typename TestType, AtomicOperation operation, bool use_shared_mem,
+          int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+void TestCore(const TestParams& p) {
+  // Allocates per-device result buffers and streams, seeds the targets, launches, then verifies.
+  const auto old_vals_alloc_size = p.kernel_count * p.ThreadCount() * sizeof(TestType);
+  std::vector<LinearAllocGuard<TestType>> old_vals_devs;
+  std::vector<StreamGuard> streams;
+  for (auto i = 0u; i < p.num_devices; ++i) {  // unsigned, like the counts it is compared to
+    HIP_CHECK(hipSetDevice(static_cast<int>(i)));
+    old_vals_devs.emplace_back(LinearAllocs::hipMalloc, old_vals_alloc_size);
+    for (auto j = 0u; j < p.kernel_count; ++j) {
+      streams.emplace_back(Streams::created);
+    }
+  }
+
+  const auto mem_alloc_size = p.width * p.pitch;
+  LinearAllocGuard<TestType> mem_dev(p.alloc_type, mem_alloc_size);
+  TestType* const mem_ptr =
+      p.alloc_type == LinearAllocs::hipMalloc ? mem_dev.ptr() : mem_dev.host_ptr();
+  std::vector<TestType> old_vals(p.num_devices * p.kernel_count * p.ThreadCount());
+  std::vector<TestType> res_vals(p.width);
+
+  // Zero the buffer, then seed every whole TestType slot with a single bulk copy.
+  const TestType test_value =
+      std::is_floating_point_v<TestType> ? kFloatingPointTestValue : kIntegerTestValue;
+  const std::vector<TestType> seed(mem_alloc_size / sizeof(TestType), test_value);
+  HIP_CHECK(hipMemset(mem_ptr, 0, mem_alloc_size));
+  HIP_CHECK(
+      hipMemcpy(mem_ptr, seed.data(), seed.size() * sizeof(TestType), hipMemcpyHostToDevice));
+
+  for (auto i = 0u; i < p.num_devices; ++i) {
+    for (auto j = 0u; j < p.kernel_count; ++j) {
+      const auto& stream = streams[i * p.kernel_count + j].stream();
+      const auto old_vals_dev = old_vals_devs[i].ptr() + j * p.ThreadCount();  // slice for (i, j)
+      LaunchKernel<TestType, operation, use_shared_mem, memory_scope>(p, stream, mem_dev.ptr(),
+                                                                      old_vals_dev);
+      HIP_CHECK(hipGetLastError());  // report a rejected launch here, not as a bad result later
+    }
+  }
+
+  for (auto i = 0u; i < p.num_devices; ++i) {
+    const auto device_offset = i * p.kernel_count * p.ThreadCount();
+    HIP_CHECK(hipMemcpy(old_vals.data() + device_offset, old_vals_devs[i].ptr(),
+                        old_vals_alloc_size, hipMemcpyDeviceToHost));
+  }
+  HIP_CHECK(hipMemcpy2D(res_vals.data(), sizeof(TestType), mem_ptr, p.pitch, sizeof(TestType),
+                        p.width, hipMemcpyDeviceToHost));
+  Verify<TestType, operation>(p, res_vals, old_vals);
+}
+
+inline dim3 GenerateThreadDimensions() { return GENERATE(dim3(16), dim3(1024)); }  // 16 or 1024
+
+inline dim3 GenerateBlockDimensions() {  // yields sm_count, then sm_count + sm_count / 2
+  int sm_count = 0;  // filled with the multiprocessor count of device 0
+  HIP_CHECK(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, 0));
+  return GENERATE_COPY(dim3(sm_count), dim3(sm_count + sm_count / 2));
+}
+
+template <typename TestType, AtomicOperation operation, int memory_scope = __HIP_MEMORY_SCOPE_AGENT>
+void SingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) {
+  // Runs one kernel on one device for every allocation type (global memory) and once more with
+  // the targets staged in shared memory. `memory_scope` is forwarded to TestCore so the atomics
+  // execute with the scope that the launch dimensions chosen below were sized for.
+  constexpr bool kIsBuiltin =
+      operation == AtomicOperation::kBuiltinMin || operation == AtomicOperation::kBuiltinMax;
+
+  TestParams params;
+  params.num_devices = 1;
+  params.kernel_count = 1;
+  params.width = width;
+  params.pitch = pitch;
+  if constexpr (kIsBuiltin && memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD) {
+    params.threads = 1;  // single-thread scope: launch exactly one thread per block
+  } else if constexpr (kIsBuiltin && memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT) {
+    int warp_size = 0;
+    HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+    params.threads = dim3(warp_size);  // wavefront scope: block of warp_size threads
+  } else {
+    params.threads = GenerateThreadDimensions();
+  }
+
+  SECTION("Global memory") {
+    if constexpr (kIsBuiltin && (memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD ||
+                                 memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT ||
+                                 memory_scope == __HIP_MEMORY_SCOPE_WORKGROUP)) {
+      params.blocks = dim3(1);  // scopes up to workgroup: keep the launch to a single block
+    } else {
+      params.blocks = GenerateBlockDimensions();
+    }
+    using LA = LinearAllocs;
+    for (const auto alloc_type :
+         {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+      params.alloc_type = alloc_type;
+      DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
+        TestCore<TestType, operation, false, memory_scope>(params);
+      }
+    }
+  }
+
+  SECTION("Shared memory") {
+    params.blocks = dim3(1);
+    params.alloc_type = LinearAllocs::hipMalloc;
+    TestCore<TestType, operation, true, memory_scope>(params);
+  }
+}
+
+template <typename TestType, AtomicOperation operation>
+void SingleDeviceMultipleKernelTest(const unsigned int kernel_count, const unsigned int width,
+                                    const unsigned int pitch) {
+  // Multi-kernel driver for one device; skipped when device 0 reports no concurrent kernels.
+  int concurrent_kernels = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, 0));
+  if (!concurrent_kernels) {
+    HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
+    return;
+  }
+
+  TestParams params;
+  params.num_devices = 1;
+  params.kernel_count = kernel_count;
+  params.threads = GenerateThreadDimensions();  // block size (was wrongly stored in `blocks`)
+  params.blocks = GenerateBlockDimensions();    // grid size (was wrongly stored in `threads`)
+  params.width = width;
+  params.pitch = pitch;
+  using LA = LinearAllocs;
+  for (const auto alloc_type :
+       {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+    params.alloc_type = alloc_type;
+    DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
+      TestCore<TestType, operation, false>(params);
+    }
+  }
+}
+
+template <typename TestType, AtomicOperation operation>
+void MultipleDeviceMultipleKernelTest(const unsigned int num_devices,
+                                      const unsigned int kernel_count, const unsigned int width,
+                                      const unsigned int pitch) {
+  // Multi-device driver using system-scope atomics. Skipped when fewer than `num_devices`
+  // devices exist, or when `kernel_count > 1` and a device lacks concurrent kernel support.
+  if (num_devices > 1 && HipTest::getDeviceCount() < num_devices) {
+    std::string msg = std::to_string(num_devices) + " devices are required";
+    HipTest::HIP_SKIP_TEST(msg.c_str());
+    return;
+  }
+
+  if (kernel_count > 1) {
+    for (auto i = 0u; i < num_devices; ++i) {
+      int concurrent_kernels = 0;
+      HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, i));
+      if (!concurrent_kernels) {
+        HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
+        return;
+      }
+    }
+  }
+
+  TestParams params;
+  params.num_devices = num_devices;
+  params.kernel_count = kernel_count;
+  params.threads = GenerateThreadDimensions();  // block size (was wrongly stored in `blocks`)
+  params.blocks = GenerateBlockDimensions();    // grid size (was wrongly stored in `threads`)
+  params.width = width;
+  params.pitch = pitch;
+  // NOTE(review): hipMalloc is absent from this list, presumably on purpose - confirm.
+  using LA = LinearAllocs;
+  for (const auto alloc_type : {LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
+    params.alloc_type = alloc_type;
+    DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
+      TestCore<TestType, operation, false, __HIP_MEMORY_SCOPE_SYSTEM>(params);
+    }
+  }
+}
+
+}  // namespace MinMax
@@ -0,0 +1,123 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup safeAtomicAdd safeAtomicAdd
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a single kernel on a single device wherein all threads will perform an atomic
+ * addition on a target memory location. Each thread will add the same value to the memory location,
+ * storing the return value into a separate output array slot corresponding to it. Once complete,
+ * the output array and target memory is validated to contain all the expected values. Several
+ * memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of safeAtomicAdd
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Shared memory
+ *      - Several grid and block dimension combinations (only one block is used for shared memory).
+ * Test source
+ * ------------------------
+ *    - unit/atomics/safeAtomicAdd.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicAdd_Positive", "", float, double) {
+  constexpr auto kOperation = AtomicOperation::kSafeAdd;
+  constexpr auto kStrideBytes = 128u;  // byte spacing used by the scattered pattern
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    // A single target element, tightly packed.
+    DYNAMIC_SECTION("Same address " << iteration) {
+      SingleDeviceSingleKernelTest<TestType, kOperation>(1, sizeof(TestType));
+    }
+    // warp_size target elements packed back to back.
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      SingleDeviceSingleKernelTest<TestType, kOperation>(warp_size, sizeof(TestType));
+    }
+    // warp_size target elements, each kStrideBytes apart.
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      SingleDeviceSingleKernelTest<TestType, kOperation>(warp_size, kStrideBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Executes a kernel two times concurrently on a single device wherein all threads will
+ * perform an atomic addition on a target memory location. Each thread will add the same value to
+ * the memory location, storing the return value into a separate output array slot corresponding
+ * to it. Once complete, the output array and target memory is validated to contain all the
+ * expected values. Several memory access patterns are tested:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of safeAtomicAdd
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/safeAtomicAdd.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicAdd_Positive_Multi_Kernel", "", float, double) {
+  // Every access pattern below is run with a kernel count of 2.
+  constexpr auto kOperation = AtomicOperation::kSafeAdd;
+  constexpr auto kStrideBytes = 128u;  // byte spacing used by the scattered pattern
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    // A single target element, tightly packed.
+    DYNAMIC_SECTION("Same address " << iteration) {
+      SingleDeviceMultipleKernelTest<TestType, kOperation>(2, 1, sizeof(TestType));
+    }
+    // warp_size target elements packed back to back.
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      SingleDeviceMultipleKernelTest<TestType, kOperation>(2, warp_size, sizeof(TestType));
+    }
+    // warp_size target elements, each kStrideBytes apart.
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      SingleDeviceMultipleKernelTest<TestType, kOperation>(2, warp_size, kStrideBytes);
+    }
+  }
+}
@@ -0,0 +1,175 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "min_max_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup safeAtomicMax safeAtomicMax
+ * @{
+ * @ingroup AtomicsTest
+ * `safeAtomicMax(TestType* address, TestType val)` -
+ * stores the maximum of `*address` and `val` at `address` and returns the old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs safeAtomicMax from multiple threads on the same address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/safeAtomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicMax_Positive_SameAddress", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kSafeMax;  // operation under test
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOperation>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs safeAtomicMax from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/safeAtomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicMax_Positive_Adjacent_Addresses", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kSafeMax;
+  // Use as many tightly packed target elements as device 0's warp size.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Adjacent address " << iteration) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOperation>(warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs safeAtomicMax from multiple threads on scattered addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/safeAtomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicMax_Positive_Scattered_Addresses", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kSafeMax;
+  constexpr auto kStrideBytes = 128u;  // byte spacing between neighbouring target elements
+  // Use as many target elements as device 0's warp size.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Scattered address " << iteration) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOperation>(warp_size, kStrideBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs safeAtomicMax from multiple threads on the same address.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/safeAtomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicMax_Positive_Multi_Kernel_Same_Address", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kSafeMax;  // operation under test
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOperation>(2, 1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs safeAtomicMax from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/safeAtomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicMax_Positive_Multi_Kernel_Adjacent_Addresses", "", float,
+                   double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kSafeMax;
+  // Use as many tightly packed target elements as device 0's warp size; kernel count is 2.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Adjacent address " << iteration) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOperation>(2, warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs safeAtomicMax from multiple threads on scattered addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/safeAtomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicMax_Positive_Multi_Kernel_Scattered_Addresses", "", float,
+                   double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kSafeMax;
+  constexpr auto kStrideBytes = 128u;  // byte spacing between neighbouring target elements
+  // Use as many target elements as device 0's warp size; kernel count is 2.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Scattered address " << iteration) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOperation>(2, warp_size, kStrideBytes);
+    }
+  }
+}
@@ -0,0 +1,175 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "min_max_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup safeAtomicMin safeAtomicMin
+ * @{
+ * @ingroup AtomicsTest
+ * `safeAtomicMin(TestType* address, TestType val)` -
+ * stores the minimum of `*address` and `val` at `address` and returns the old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs safeAtomicMin from multiple threads on the same address.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/safeAtomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicMin_Positive_SameAddress", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kSafeMin;  // operation under test
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOperation>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs safeAtomicMin from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/safeAtomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicMin_Positive_Adjacent_Addresses", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kSafeMin;
+  // Use as many tightly packed target elements as device 0's warp size.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Adjacent address " << iteration) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOperation>(warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs safeAtomicMin from multiple threads on scattered addresses.
+ *  - Uses only one device and launches one kernel.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/safeAtomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicMin_Positive_Scattered_Addresses", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kSafeMin;
+  constexpr auto kStrideBytes = 128u;  // byte spacing between neighbouring target elements
+  // Use as many target elements as device 0's warp size.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Scattered address " << iteration) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOperation>(warp_size, kStrideBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs safeAtomicMin from multiple threads on the same address.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/safeAtomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicMin_Positive_Multi_Kernel_Same_Address", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kSafeMin;  // operation under test
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOperation>(2, 1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs safeAtomicMin from multiple threads on adjacent addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/safeAtomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicMin_Positive_Multi_Kernel_Adjacent_Addresses", "", float,
+                   double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kSafeMin;
+  // Use as many tightly packed target elements as device 0's warp size; kernel count is 2.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Adjacent address " << iteration) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOperation>(2, warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Performs safeAtomicMin from multiple threads on scattered addresses.
+ *  - Uses only one device and launches multiple kernels.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/safeAtomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_safeAtomicMin_Positive_Multi_Kernel_Scattered_Addresses", "", float,
+                   double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kSafeMin;
+  constexpr auto kStrideBytes = 128u;  // byte spacing between neighbouring target elements
+  // Use as many target elements as device 0's warp size; kernel count is 2.
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Scattered address " << iteration) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOperation>(2, warp_size, kStrideBytes);
+    }
+  }
+}
@@ -0,0 +1,165 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+#include "memory_order_common.hh"
+
+/**
+ * Runs the SequentialConsistency drivers for BuiltinAtomicOperation::kLoadStore,
+ * one Catch2 section per scope: Test<> for WAVEFRONT/WORKGROUP/AGENT, SystemTest<> for SYSTEM.
+ */
+TEST_CASE("Unit___hip_atomic_load_store_Positive_Sequential_Consistency") {
+  constexpr auto kOperation = BuiltinAtomicOperation::kLoadStore;
+
+  SECTION("WAVEFRONT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WAVEFRONT>(); }
+  SECTION("WORKGROUP") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WORKGROUP>(); }
+  SECTION("AGENT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_AGENT>(); }
+  SECTION("SYSTEM") { SequentialConsistency::SystemTest<kOperation>(); }
+}
+
+/**
+ * Runs the SequentialConsistency drivers for BuiltinAtomicOperation::kExchange,
+ * one Catch2 section per scope: Test<> for WAVEFRONT/WORKGROUP/AGENT, SystemTest<> for SYSTEM.
+ */
+TEST_CASE("Unit___hip_atomic_exchange_Positive_Sequential_Consistency") {
+  constexpr auto kOperation = BuiltinAtomicOperation::kExchange;
+
+  SECTION("WAVEFRONT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WAVEFRONT>(); }
+  SECTION("WORKGROUP") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WORKGROUP>(); }
+  SECTION("AGENT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_AGENT>(); }
+  SECTION("SYSTEM") { SequentialConsistency::SystemTest<kOperation>(); }
+}
+
+/**
+ * Runs the SequentialConsistency drivers for BuiltinAtomicOperation::kCompareExchangeStrong,
+ * one Catch2 section per scope: Test<> for WAVEFRONT/WORKGROUP/AGENT, SystemTest<> for SYSTEM.
+ */
+TEST_CASE("Unit___hip_atomic_compare_exchange_strong_Positive_Sequential_Consistency") {
+  constexpr auto kOperation = BuiltinAtomicOperation::kCompareExchangeStrong;
+  SECTION("WAVEFRONT") {
+    SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+  }
+  SECTION("WORKGROUP") {
+    SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WORKGROUP>();
+  }
+  SECTION("AGENT") {
+    SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_AGENT>();
+  }
+  SECTION("SYSTEM") { SequentialConsistency::SystemTest<kOperation>(); }
+}
+
+/**
+ * Runs the SequentialConsistency drivers for BuiltinAtomicOperation::kCompareExchangeWeak,
+ * one Catch2 section per scope: Test<> for WAVEFRONT/WORKGROUP/AGENT, SystemTest<> for SYSTEM.
+ */
+TEST_CASE("Unit___hip_atomic_compare_exchange_weak_Positive_Sequential_Consistency") {
+  constexpr auto kOperation = BuiltinAtomicOperation::kCompareExchangeWeak;
+  SECTION("WAVEFRONT") {
+    SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WAVEFRONT>();
+  }
+  SECTION("WORKGROUP") {
+    SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WORKGROUP>();
+  }
+  SECTION("AGENT") {
+    SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_AGENT>();
+  }
+  SECTION("SYSTEM") { SequentialConsistency::SystemTest<kOperation>(); }
+}
+
+/**
+ * Runs the SequentialConsistency drivers for BuiltinAtomicOperation::kAdd,
+ * one Catch2 section per scope: Test<> for WAVEFRONT/WORKGROUP/AGENT, SystemTest<> for SYSTEM.
+ */
+TEST_CASE("Unit___hip_atomic_fetch_add_Positive_Sequential_Consistency") {
+  constexpr auto kOperation = BuiltinAtomicOperation::kAdd;
+
+  SECTION("WAVEFRONT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WAVEFRONT>(); }
+  SECTION("WORKGROUP") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WORKGROUP>(); }
+  SECTION("AGENT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_AGENT>(); }
+  SECTION("SYSTEM") { SequentialConsistency::SystemTest<kOperation>(); }
+}
+
+/**
+ * Runs the SequentialConsistency drivers for BuiltinAtomicOperation::kAnd,
+ * one Catch2 section per scope: Test<> for WAVEFRONT/WORKGROUP/AGENT, SystemTest<> for SYSTEM.
+ */
+TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Sequential_Consistency") {
+  constexpr auto kOperation = BuiltinAtomicOperation::kAnd;
+
+  SECTION("WAVEFRONT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WAVEFRONT>(); }
+  SECTION("WORKGROUP") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WORKGROUP>(); }
+  SECTION("AGENT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_AGENT>(); }
+  SECTION("SYSTEM") { SequentialConsistency::SystemTest<kOperation>(); }
+}
+
+/**
+ * Runs the SequentialConsistency drivers for BuiltinAtomicOperation::kOr,
+ * one Catch2 section per scope: Test<> for WAVEFRONT/WORKGROUP/AGENT, SystemTest<> for SYSTEM.
+ */
+TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Sequential_Consistency") {
+  constexpr auto kOperation = BuiltinAtomicOperation::kOr;
+
+  SECTION("WAVEFRONT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WAVEFRONT>(); }
+  SECTION("WORKGROUP") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WORKGROUP>(); }
+  SECTION("AGENT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_AGENT>(); }
+  SECTION("SYSTEM") { SequentialConsistency::SystemTest<kOperation>(); }
+}
+
+/**
+ * Runs the SequentialConsistency drivers for BuiltinAtomicOperation::kXor,
+ * one Catch2 section per scope: Test<> for WAVEFRONT/WORKGROUP/AGENT, SystemTest<> for SYSTEM.
+ */
+TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Sequential_Consistency") {
+  constexpr auto kOperation = BuiltinAtomicOperation::kXor;
+
+  SECTION("WAVEFRONT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WAVEFRONT>(); }
+  SECTION("WORKGROUP") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WORKGROUP>(); }
+  SECTION("AGENT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_AGENT>(); }
+  SECTION("SYSTEM") { SequentialConsistency::SystemTest<kOperation>(); }
+}
+
+/**
+ * Runs the SequentialConsistency drivers for BuiltinAtomicOperation::kMin,
+ * one Catch2 section per scope: Test<> for WAVEFRONT/WORKGROUP/AGENT, SystemTest<> for SYSTEM.
+ */
+TEST_CASE("Unit___hip_atomic_fetch_min_Positive_Sequential_Consistency") {
+  constexpr auto kOperation = BuiltinAtomicOperation::kMin;
+
+  SECTION("WAVEFRONT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WAVEFRONT>(); }
+  SECTION("WORKGROUP") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WORKGROUP>(); }
+  SECTION("AGENT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_AGENT>(); }
+  SECTION("SYSTEM") { SequentialConsistency::SystemTest<kOperation>(); }
+}
+
+/**
+ * Runs the SequentialConsistency drivers for BuiltinAtomicOperation::kMax,
+ * one Catch2 section per scope: Test<> for WAVEFRONT/WORKGROUP/AGENT, SystemTest<> for SYSTEM.
+ */
+TEST_CASE("Unit___hip_atomic_fetch_max_Positive_Sequential_Consistency") {
+  constexpr auto kOperation = BuiltinAtomicOperation::kMax;
+
+  SECTION("WAVEFRONT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WAVEFRONT>(); }
+  SECTION("WORKGROUP") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_WORKGROUP>(); }
+  SECTION("AGENT") { SequentialConsistency::Test<kOperation, __HIP_MEMORY_SCOPE_AGENT>(); }
+  SECTION("SYSTEM") { SequentialConsistency::SystemTest<kOperation>(); }
+}
@@ -0,0 +1,124 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "arithmetic_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup unsafeAtomicAdd unsafeAtomicAdd
+ * @{
+ * @ingroup AtomicsTest
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches one kernel on one device in which every thread atomically adds the same value to
+ * a target memory location and stores the value returned by the atomic into its own slot of an
+ * output array. After the kernel finishes, both the output array and the target memory are
+ * checked against the expected values. Three memory access patterns are covered:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes
+ *         (hard-coded to 128 in this test).
+ *
+ *    - The test is run for:
+ *      - All overloads of unsafeAtomicAdd
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Shared memory
+ *      - Several grid and block dimension combinations (only one block is used for shared memory).
+ * Test source
+ * ------------------------
+ *    - unit/atomics/unsafeAtomicAdd.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicAdd_Positive", "", float, double) {
+  constexpr auto kOperation = AtomicOperation::kUnsafeAdd;
+  // Spacing used by the scattered pattern; stands in for the L1 cache line size named above.
+  constexpr unsigned int kCacheLineBytes = 128;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      SingleDeviceSingleKernelTest<TestType, kOperation>(1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      SingleDeviceSingleKernelTest<TestType, kOperation>(warp_size, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      SingleDeviceSingleKernelTest<TestType, kOperation>(warp_size, kCacheLineBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Runs the same kernel twice, concurrently, on one device. In both launches every thread
+ * atomically adds the same value to a target memory location and stores the value returned by
+ * the atomic into its own slot of an output array. After completion, both the output array and
+ * the target memory are checked against the expected values. Three memory access patterns are
+ * covered:
+ *      -# All threads add to a single, compile time deducible, memory location
+ *      -# Each thread targets an array containing warp_size elements, using tid % warp_size
+ *         for indexing
+ *      -# Same as the above, but the elements are spread out by L1 cache line size bytes.
+ *
+ *    - The test is run for:
+ *      - All overloads of unsafeAtomicAdd
+ *      - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated memory
+ *      - Several grid and block dimension combinations.
+ * Test source
+ * ------------------------
+ *    - unit/atomics/unsafeAtomicAdd.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicAdd_Positive_Multi_Kernel", "", float, double) {
+  constexpr auto kOperation = AtomicOperation::kUnsafeAdd;
+  // Spacing used by the scattered pattern; stands in for the L1 cache line size named above.
+  constexpr unsigned int kCacheLineBytes = 128;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      SingleDeviceMultipleKernelTest<TestType, kOperation>(2, 1, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
+      SingleDeviceMultipleKernelTest<TestType, kOperation>(2, warp_size, sizeof(TestType));
+    }
+
+    DYNAMIC_SECTION("Scattered addresses " << iteration) {
+      SingleDeviceMultipleKernelTest<TestType, kOperation>(2, warp_size, kCacheLineBytes);
+    }
+  }
+}
@@ -0,0 +1,175 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "min_max_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup unsafeAtomicMax unsafeAtomicMax
+ * @{
+ * @ingroup AtomicsTest
+ * `unsafeAtomicMax(TestType* address, TestType val)` -
+ * stores the maximum of `*address` and `val` at `address`, returns the old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - One device, one kernel: all threads apply unsafeAtomicMax to the same address.
+ *  - Repeated cmd_options.iterations times, one dynamic section per iteration.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/unsafeAtomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicMax_Positive_SameAddress", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kUnsafeMax;
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOperation>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - One device, one kernel: threads apply unsafeAtomicMax to adjacent addresses.
+ *  - Helper gets the device 0 warp size and sizeof(TestType); runs cmd_options.iterations times.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/unsafeAtomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicMax_Positive_Adjacent_Addresses", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kUnsafeMax;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Adjacent address " << iteration) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOperation>(warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - One device, one kernel: threads apply unsafeAtomicMax to scattered addresses.
+ *  - Helper gets the device 0 warp size and the constant 128; runs cmd_options.iterations times.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/unsafeAtomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicMax_Positive_Scattered_Addresses", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kUnsafeMax;
+  constexpr unsigned int kCacheLineBytes = 128;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Scattered address " << iteration) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOperation>(warp_size, kCacheLineBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - One device, multiple kernels: all threads apply unsafeAtomicMax to the same address.
+ *  - Repeated cmd_options.iterations times, one dynamic section per iteration.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/unsafeAtomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicMax_Positive_Multi_Kernel_Same_Address", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kUnsafeMax;
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOperation>(2, 1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - One device, multiple kernels: threads apply unsafeAtomicMax to adjacent addresses.
+ *  - Helper gets the device 0 warp size and sizeof(TestType); runs cmd_options.iterations times.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/unsafeAtomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicMax_Positive_Multi_Kernel_Adjacent_Addresses", "", float,
+                   double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kUnsafeMax;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Adjacent address " << iteration) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOperation>(2, warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - One device, multiple kernels: threads apply unsafeAtomicMax to scattered addresses.
+ *  - Helper gets the device 0 warp size and the constant 128; runs cmd_options.iterations times.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/unsafeAtomicMax.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicMax_Positive_Multi_Kernel_Scattered_Addresses", "", float,
+                   double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kUnsafeMax;
+  constexpr unsigned int kCacheLineBytes = 128;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Scattered address " << iteration) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOperation>(2, warp_size, kCacheLineBytes);
+    }
+  }
+}
@@ -0,0 +1,175 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "min_max_common.hh"
+
+#include <hip_test_common.hh>
+
+/**
+ * @addtogroup unsafeAtomicMin unsafeAtomicMin
+ * @{
+ * @ingroup AtomicsTest
+ * `unsafeAtomicMin(TestType* address, TestType val)` -
+ * stores the minimum of `*address` and `val` at `address`, returns the old value.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - One device, one kernel: all threads apply unsafeAtomicMin to the same address.
+ *  - Repeated cmd_options.iterations times, one dynamic section per iteration.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/unsafeAtomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicMin_Positive_SameAddress", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kUnsafeMin;
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOperation>(1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - One device, one kernel: threads apply unsafeAtomicMin to adjacent addresses.
+ *  - Helper gets the device 0 warp size and sizeof(TestType); runs cmd_options.iterations times.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/unsafeAtomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicMin_Positive_Adjacent_Addresses", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kUnsafeMin;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Adjacent address " << iteration) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOperation>(warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - One device, one kernel: threads apply unsafeAtomicMin to scattered addresses.
+ *  - Helper gets the device 0 warp size and the constant 128; runs cmd_options.iterations times.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/unsafeAtomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicMin_Positive_Scattered_Addresses", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kUnsafeMin;
+  constexpr unsigned int kCacheLineBytes = 128;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Scattered address " << iteration) {
+      MinMax::SingleDeviceSingleKernelTest<TestType, kOperation>(warp_size, kCacheLineBytes);
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - One device, multiple kernels: all threads apply unsafeAtomicMin to the same address.
+ *  - Repeated cmd_options.iterations times, one dynamic section per iteration.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/unsafeAtomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicMin_Positive_Multi_Kernel_Same_Address", "", float, double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kUnsafeMin;
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Same address " << iteration) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOperation>(2, 1, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - One device, multiple kernels: threads apply unsafeAtomicMin to adjacent addresses.
+ *  - Helper gets the device 0 warp size and sizeof(TestType); runs cmd_options.iterations times.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/unsafeAtomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicMin_Positive_Multi_Kernel_Adjacent_Addresses", "", float,
+                   double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kUnsafeMin;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Adjacent address " << iteration) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOperation>(2, warp_size, sizeof(TestType));
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - One device, multiple kernels: threads apply unsafeAtomicMin to scattered addresses.
+ *  - Helper gets the device 0 warp size and the constant 128; runs cmd_options.iterations times.
+ * Test source
+ * ------------------------
+ *  - unit/atomics/unsafeAtomicMin.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_unsafeAtomicMin_Positive_Multi_Kernel_Scattered_Addresses", "", float,
+                   double) {
+  constexpr auto kOperation = MinMax::AtomicOperation::kUnsafeMin;
+  constexpr unsigned int kCacheLineBytes = 128;
+  int warp_size = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+
+  for (int iteration = 0; iteration < cmd_options.iterations; ++iteration) {
+    DYNAMIC_SECTION("Scattered address " << iteration) {
+      MinMax::SingleDeviceMultipleKernelTest<TestType, kOperation>(2, warp_size, kCacheLineBytes);
+    }
+  }
+}
@@ -52,7 +52,7 @@ class CompileAndCapture(unittest.TestCase):
    # HIP compiler on AMD platforms has limit of 20 errors, and some negative
    # test cases expect that more errors are detected.
    if (self.platform == 'amd'):
-      compiler_args.append('-ferror-limit=100')
+      compiler_args.append('-ferror-limit=200')
    compiler_output = subprocess.run(compiler_args, stderr=subprocess.PIPE)
    # Get the compiler output in the stdout if -V flag is raised during ctest invocation.
    compiler_stderr = compiler_output.stderr.decode('UTF-8')
@@ -2,6 +2,7 @@
 set(TEST_SRC
  thread_block.cc
  thread_block_tile.cc
+  coalesced_group_tiled_partition.cc
  hipCGThreadBlockType_old.cc
  hipCGMultiGridGroupType_old.cc
  hipCGGridGroupType_old.cc
@@ -0,0 +1,685 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "cooperative_groups_common.hh"
+
+#include <bitset>
+#include <optional>
+#include <resource_guards.hh>
+#include <utils.hh>
+
+#include <cmd_options.hh>
+#include <cpu_grid.h>
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+
+/**
+ * @addtogroup coalesced_group_tile coalesced_group_tile
+ * @{
+ * @ingroup DeviceLanguageTest
+ * Contains unit tests for partitioning of coalesced groups into tiled partitions
+ */
+
+namespace cg = cooperative_groups;
+
+namespace {
+// Bounds used when drawing random per-warp activity masks. The mask value type is 64 bits wide
+// when HT_AMD is set and 32 bits wide otherwise.
+#if HT_AMD
+using MaskValue = uint64_t;
+#else
+using MaskValue = uint32_t;
+#endif
+constexpr auto kMaskMin = std::numeric_limits<MaskValue>::min();
+constexpr auto kMaskLimit = std::numeric_limits<MaskValue>::max();
+}  // namespace
+
+// Yields the tile sizes exercised by the tests in this file: the powers of two from 2 to 32, plus
+// 64 when HT_AMD is set. The values come from the Catch2 GENERATE macro, so this helper is meant
+// to be called from inside a test case.
+static unsigned int GenerateTileSizes() {
+#if HT_AMD
+  return GENERATE(2u, 4u, 8u, 16u, 32u, 64u);
+#else
+  return GENERATE(2u, 4u, 8u, 16u, 32u);
+#endif
+}
+
+// Returns the one pseudo-random engine shared by every helper in this file. It is seeded with the
+// fixed value 11, so the sequence of generated numbers is the same on every run.
+static inline std::mt19937& GetRandomGenerator() {
+  static std::mt19937 engine(11);
+  return engine;
+}
+
+// Draws one uniformly distributed integer of type T from the closed interval [min, max], advancing
+// the shared engine.
+template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
+  using Distribution = std::uniform_int_distribution<T>;
+  return Distribution(min, max)(GetRandomGenerator());
+}
+
+/**
+ * Host-side decoding of a warp activity mask.
+ *
+ * Examines bits [0, warp_size) of `mask` and returns a tuple of
+ *  - an array whose first `count` entries are the indices of the set bits, in ascending order, and
+ *  - `count`, the number of set bits found.
+ *
+ * This must decode the mask exactly like deactivate_thread() does on the device.
+ */
+template <size_t warp_size> static auto coalesce_threads(const uint64_t mask) {
+  std::tuple<std::array<unsigned int, warp_size>, unsigned int> res;
+  auto& [threads, count] = res;
+
+  count = 0u;
+  for (auto i = 0u; i < warp_size; ++i) {
+    // The probe bit has to be 64 bits wide: shifting a 32-bit `1u` by 32 or more is undefined
+    // behaviour, so lanes 32..63 of a 64-wide warp could not be tested reliably with it.
+    if (mask & (uint64_t{1} << i)) {
+      threads[count++] = i;
+    }
+  }
+
+  return res;
+}
+
+/**
+ * Device-side decoding of a warp activity mask.
+ *
+ * Looks up the mask of the calling thread's warp (masks are stored block after block, with
+ * ceil(block size / warp_size) entries per block) and returns true when the bit matching the
+ * thread's rank within its warp is clear, i.e. when the thread is expected to exit early.
+ */
+template <size_t warp_size> __device__ bool deactivate_thread(uint64_t* active_masks) {
+  const cg::thread_block_tile<warp_size> warp =
+      cg::tiled_partition<warp_size>(cg::this_thread_block());
+  const auto block = cg::this_thread_block();
+  const auto warps_per_block = (block.size() + warp_size - 1) / warp_size;
+  const auto block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
+  const auto idx = block_rank * warps_per_block + block.thread_rank() / warp.size();
+
+  // 64-bit probe bit for the same reason as in coalesce_threads(): the rank can reach 63.
+  return !(active_masks[idx] & (uint64_t{1} << warp.thread_rank()));
+}
+
+
+// Kernel: every thread that its warp mask leaves active partitions the group of currently
+// coalesced threads into tiles of `tile_size` and stores the size reported by its own tile at its
+// grid-wide rank in `sizes`. Threads deactivated by the mask write nothing.
+template <size_t warp_size>
+__global__ void coalesced_group_tiled_partition_size_getter(uint64_t* active_masks,
+                                                            unsigned int tile_size,
+                                                            unsigned int* sizes) {
+  if (!deactivate_thread<warp_size>(active_masks)) {
+    const auto active_lanes = cg::coalesced_threads();
+    const auto tile = cg::tiled_partition(active_lanes, tile_size);
+    sizes[thread_rank_in_grid()] = tile.size();
+  }
+}
+
+// Kernel: every thread that its warp mask leaves active partitions the group of currently
+// coalesced threads into tiles of `tile_size` and stores its rank within its own tile at its
+// grid-wide rank in the output array. Note that the output parameter is called `sizes` although it
+// receives ranks here. Threads deactivated by the mask write nothing.
+template <size_t warp_size>
+__global__ void coalesced_group_tiled_partition_thread_rank_getter(uint64_t* active_masks,
+                                                                   unsigned int tile_size,
+                                                                   unsigned int* sizes) {
+  if (!deactivate_thread<warp_size>(active_masks)) {
+    const auto active_lanes = cg::coalesced_threads();
+    const auto tile = cg::tiled_partition(active_lanes, tile_size);
+    sizes[thread_rank_in_grid()] = tile.thread_rank();
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Deactivates threads based on passed in mask and creates tiled partitions over coalesced
+ * threads for each of the valid sizes{2, 4, 8, 16, 32, 64(if AMD)} and writes the return values of
+ * size and thread_rank member functions to an output array that is validated on the host side.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/coalesced_group_tiled_partition.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Getters_Positive_Basic") {
+  const auto tile_size = GenerateTileSizes();
+  INFO("Tile size: " << tile_size);
+  auto blocks = GenerateBlockDimensions();
+  auto threads = GenerateThreadDimensions();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  CPUGrid grid(blocks, threads);
+
+  // One output slot per thread of the grid, on the device and mirrored on the host.
+  const auto alloc_size = grid.thread_count_ * sizeof(unsigned int);
+  LinearAllocGuard<unsigned int> uint_arr_dev(LinearAllocs::hipMalloc, alloc_size);
+  LinearAllocGuard<unsigned int> uint_arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+  // One activity mask per warp; the last warp of a block may be only partially populated.
+  const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
+  const auto warps_in_grid = warps_in_block * grid.block_count_;
+  LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
+                                              warps_in_grid * sizeof(uint64_t));
+  LinearAllocGuard<uint64_t> active_masks(LinearAllocs::hipHostMalloc,
+                                          warps_in_grid * sizeof(uint64_t));
+
+  // Masks span the full mask width of the platform, like in the other tests of this file.
+  std::generate(active_masks.ptr(), active_masks.ptr() + warps_in_grid,
+                [] { return GenerateRandomInteger(kMaskMin, kMaskLimit); });
+  HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks.ptr(), warps_in_grid * sizeof(uint64_t),
+                      hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size));
+  // The kernels must be instantiated with the same warp size that the host uses to size and index
+  // active_masks (kWarpSize); otherwise the device would index the mask array differently from
+  // how it was allocated.
+  coalesced_group_tiled_partition_size_getter<kWarpSize>
+      <<<blocks, threads>>>(active_masks_dev.ptr(), tile_size, uint_arr_dev.ptr());
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Launch the thread_rank kernel now; the host validates the size results (already copied to
+  // uint_arr) before the rank results are copied back further below.
+  HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size));
+  coalesced_group_tiled_partition_thread_rank_getter<kWarpSize>
+      <<<blocks, threads>>>(active_masks_dev.ptr(), tile_size, uint_arr_dev.ptr());
+  HIP_CHECK(hipGetLastError());
+
+  // Number of lanes of the last warp of each block that have no thread behind them.
+  const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_;
+
+  // validate size
+  for (auto i = 0u; i < warps_in_grid; ++i) {
+    auto current_warp_mask = active_masks.ptr()[i];
+    // For the last warp of a block, clear the topmost `tail` mask bits (plus the upper 32 bits on
+    // NVIDIA) by shifting them out and back in; all other warps keep their mask unchanged.
+    const auto shift_amount =
+        (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
+    current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount;
+
+    const auto [active_threads, active_thread_count] =
+        coalesce_threads<kWarpSize>(current_warp_mask);
+
+    // Output slots are dense per thread, so the missing tail lanes of all preceding blocks have
+    // to be subtracted from the warp-based index.
+    const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block);
+    const auto num_tiles = (active_thread_count + tile_size - 1) / tile_size;
+    // How many threads the last tile of this warp falls short of a full tile.
+    const auto tile_tail = num_tiles * tile_size - active_thread_count;
+    // Step tile-sized window over active threads
+    for (auto t = 0u; t < active_thread_count; t += tile_size) {
+      const auto window_start = t;
+      const auto window_end = t + tile_size;
+      // Iterate through window
+      for (auto k = window_start; k < window_end && k < active_thread_count; ++k) {
+        const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails;
+        // Every tile is full except possibly the last one of the warp.
+        const auto expected_val = tile_size - tile_tail * (t + tile_size >= active_thread_count);
+        const auto actual_val = uint_arr.ptr()[global_thread_idx];
+        INFO("global index: " << global_thread_idx);
+        // Only invoke REQUIRE on a mismatch, so passing elements are not individually asserted.
+        if (actual_val != expected_val) {
+          REQUIRE(actual_val == expected_val);
+        }
+      }
+    }
+  }
+
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // validate rank
+  for (auto i = 0u; i < warps_in_grid; ++i) {
+    auto current_warp_mask = active_masks.ptr()[i];
+    // Same mask trimming as in the size validation above.
+    const auto shift_amount =
+        (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
+    current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount;
+
+    const auto [active_threads, active_thread_count] =
+        coalesce_threads<kWarpSize>(current_warp_mask);
+
+    const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block);
+    // Step tile-sized window over active threads
+    for (auto t = 0u; t < active_thread_count; t += tile_size) {
+      const auto window_start = t;
+      const auto window_end = t + tile_size;
+      // Iterate through window
+      for (auto k = window_start; k < window_end && k < active_thread_count; ++k) {
+        const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails;
+        // The k-th active thread of the warp is expected to have rank k modulo the tile size.
+        const auto expected_val = k % tile_size;
+        const auto actual_val = uint_arr.ptr()[global_thread_idx];
+        INFO("global index: " << global_thread_idx);
+        if (actual_val != expected_val) {
+          REQUIRE(actual_val == expected_val);
+        }
+      }
+    }
+  }
+}
+
+
+// Kernel: each thread left active by its warp mask contributes its rank within the full warp
+// (converted to T) to a shfl_up by `delta` across its tile of the coalesced group, and stores the
+// value it receives at its grid-wide rank in `out`. Deactivated threads write nothing.
+template <typename T, size_t warp_size>
+__global__ void coalesced_group_tiled_partition_shfl_up(uint64_t* active_masks, T* const out,
+                                                        const unsigned int tile_size,
+                                                        const unsigned int delta) {
+  if (deactivate_thread<warp_size>(active_masks)) {
+    return;
+  }
+
+  const cg::thread_block_tile<warp_size> full_warp =
+      cg::tiled_partition<warp_size>(cg::this_thread_block());
+  const T own_value = static_cast<T>(full_warp.thread_rank());
+
+  const auto active_lanes = cg::coalesced_threads();
+  const auto tile = cg::tiled_partition(active_lanes, tile_size);
+  const T received = tile.shfl_up(own_value, delta);
+  out[thread_rank_in_grid()] = received;
+}
+
+
+// Host driver of the shfl_up test for element type T. Launches
+// coalesced_group_tiled_partition_shfl_up with random per-warp activity masks and checks that each
+// validated thread received the warp rank of the active thread `delta` positions before it in its
+// tile. Tile size, grid/block dimensions and delta all come from Catch2 generators.
+template <typename T> static void CoalescedGroupTiledPartitonShflUpTestImpl() {
+  const auto tile_size = GenerateTileSizes();
+  INFO("Tile size: " << tile_size);
+  auto blocks = GenerateBlockDimensionsForShuffle();
+  auto threads = GenerateThreadDimensionsForShuffle();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  const auto delta = GENERATE_COPY(range(0u, tile_size));
+  INFO("Delta: " << delta);
+  CPUGrid grid(blocks, threads);
+
+  // One output element per thread of the grid, on the device and mirrored on the host.
+  const auto alloc_size = grid.thread_count_ * sizeof(T);
+  LinearAllocGuard<T> uint_arr_dev(LinearAllocs::hipMalloc, alloc_size);
+  LinearAllocGuard<T> uint_arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+  // One activity mask per warp; the last warp of a block may be only partially populated.
+  const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
+  const auto warps_in_grid = warps_in_block * grid.block_count_;
+  LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
+                                              warps_in_grid * sizeof(uint64_t));
+  LinearAllocGuard<uint64_t> active_masks(LinearAllocs::hipHostMalloc,
+                                          warps_in_grid * sizeof(uint64_t));
+
+  std::generate(active_masks.ptr(), active_masks.ptr() + warps_in_grid,
+                [] { return GenerateRandomInteger(kMaskMin, kMaskLimit); });
+  HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks.ptr(), warps_in_grid * sizeof(uint64_t),
+                      hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size));
+  coalesced_group_tiled_partition_shfl_up<T, kWarpSize>
+      <<<blocks, threads>>>(active_masks_dev.ptr(), uint_arr_dev.ptr(), tile_size, delta);
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Number of lanes of the last warp of each block that have no thread behind them.
+  const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_;
+
+  for (auto i = 0u; i < warps_in_grid; ++i) {
+    auto current_warp_mask = active_masks.ptr()[i];
+    // For the last warp of a block, clear the topmost `tail` mask bits (plus the upper 32 bits on
+    // NVIDIA) by shifting them out and back in; all other warps keep their mask unchanged.
+    const auto shift_amount =
+        (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
+    current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount;
+
+    const auto [active_threads, active_thread_count] =
+        coalesce_threads<kWarpSize>(current_warp_mask);
+
+    // Output slots are dense per thread, so the missing tail lanes of all preceding blocks have
+    // to be subtracted from the warp-based index.
+    const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block);
+    // Step tile-sized window over active threads
+    for (auto t = 0u; t < active_thread_count; t += tile_size) {
+      // The first `delta` threads of every tile are skipped: they are not validated here.
+      const auto window_start = t + delta;
+      const auto window_end = t + tile_size;
+      // Iterate through window
+      for (auto k = window_start; k < window_end && k < active_thread_count; ++k) {
+        const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails;
+        // The kernel shuffles warp ranks, so the expected value is the warp rank of the active
+        // thread `delta` positions earlier.
+        const auto expected_val = active_threads[k - delta];
+        const auto actual_val = uint_arr.ptr()[global_thread_idx];
+        INFO("global index: " << global_thread_idx);
+        // Only invoke REQUIRE on a mismatch, so passing elements are not individually asserted.
+        if (actual_val != expected_val) {
+          REQUIRE(actual_val == expected_val);
+        }
+      }
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Validates the shuffle up behavior of tiled partitions of all valid sizes{2, 4, 8, 16, 32,
+ * 64(if AMD)} for delta values of [0, tile size). The partitions are created over a coalesced
+ * group, with memberships of threads in the coalesced group being controlled via a passed in active
+ * mask. The test is run for all overloads of shfl_up.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/coalesced_group_tiled_partition.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Shfl_Up_Positive_Basic", "", int,
+                   unsigned int, long, unsigned long, long long, unsigned long long, float,
+                   double) {
+  // Thin wrapper: runs the shared shfl_up implementation once per listed element type.
+  CoalescedGroupTiledPartitonShflUpTestImpl<TestType>();
+}
+
+
+// Kernel: each thread left active by its warp mask contributes its rank within the full warp
+// (converted to T) to a shfl_down by `delta` across its tile of the coalesced group, and stores
+// the value it receives at its grid-wide rank in `out`. Deactivated threads write nothing.
+template <typename T, size_t warp_size>
+__global__ void coalesced_group_tiled_partition_shfl_down(uint64_t* active_masks, T* const out,
+                                                          const unsigned int tile_size,
+                                                          const unsigned int delta) {
+  if (deactivate_thread<warp_size>(active_masks)) {
+    return;
+  }
+
+  const cg::thread_block_tile<warp_size> full_warp =
+      cg::tiled_partition<warp_size>(cg::this_thread_block());
+  const T own_value = static_cast<T>(full_warp.thread_rank());
+
+  const auto active_lanes = cg::coalesced_threads();
+  const auto tile = cg::tiled_partition(active_lanes, tile_size);
+  const T received = tile.shfl_down(own_value, delta);
+  out[thread_rank_in_grid()] = received;
+}
+
+
+// Host driver of the shfl_down test for element type T. Launches
+// coalesced_group_tiled_partition_shfl_down with random per-warp activity masks and checks that
+// each validated thread received the warp rank of the active thread `delta` positions after it in
+// its tile. Tile size, grid/block dimensions and delta all come from Catch2 generators.
+template <typename T> static void CoalescedGroupTiledPartitonShflDownTestImpl() {
+  const auto tile_size = GenerateTileSizes();
+  INFO("Tile size: " << tile_size);
+  auto blocks = GenerateBlockDimensionsForShuffle();
+  auto threads = GenerateThreadDimensionsForShuffle();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  const auto delta = GENERATE_COPY(range(0u, tile_size));
+  INFO("Delta: " << delta);
+  CPUGrid grid(blocks, threads);
+
+  // One output element per thread of the grid, on the device and mirrored on the host.
+  const auto alloc_size = grid.thread_count_ * sizeof(T);
+  LinearAllocGuard<T> uint_arr_dev(LinearAllocs::hipMalloc, alloc_size);
+  LinearAllocGuard<T> uint_arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+  // One activity mask per warp; the last warp of a block may be only partially populated.
+  const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
+  const auto warps_in_grid = warps_in_block * grid.block_count_;
+  LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
+                                              warps_in_grid * sizeof(uint64_t));
+  LinearAllocGuard<uint64_t> active_masks(LinearAllocs::hipHostMalloc,
+                                          warps_in_grid * sizeof(uint64_t));
+
+  std::generate(active_masks.ptr(), active_masks.ptr() + warps_in_grid,
+                [] { return GenerateRandomInteger(kMaskMin, kMaskLimit); });
+  HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks.ptr(), warps_in_grid * sizeof(uint64_t),
+                      hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size));
+  coalesced_group_tiled_partition_shfl_down<T, kWarpSize>
+      <<<blocks, threads>>>(active_masks_dev.ptr(), uint_arr_dev.ptr(), tile_size, delta);
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Number of lanes of the last warp of each block that have no thread behind them.
+  const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_;
+
+  for (auto i = 0u; i < warps_in_grid; ++i) {
+    auto current_warp_mask = active_masks.ptr()[i];
+    // For the last warp of a block, clear the topmost `tail` mask bits (plus the upper 32 bits on
+    // NVIDIA) by shifting them out and back in; all other warps keep their mask unchanged.
+    const auto shift_amount =
+        (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
+    current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount;
+
+    const auto [active_threads, active_thread_count] =
+        coalesce_threads<kWarpSize>(current_warp_mask);
+
+    // With no more than `delta` active threads there is no thread with a partner `delta`
+    // positions further on; this also keeps the unsigned subtraction below from wrapping.
+    if (delta >= active_thread_count) {
+      continue;
+    }
+
+    // Output slots are dense per thread, so the missing tail lanes of all preceding blocks have
+    // to be subtracted from the warp-based index.
+    const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block);
+    // Step tile-sized window over active threads
+    for (auto t = 0u; t < active_thread_count; t += tile_size) {
+      const auto window_start = t;
+      // The last `delta` threads of every tile are skipped: they are not validated here.
+      const auto window_end = t + tile_size - delta;
+      // Iterate through window
+      for (auto k = window_start; k < window_end && k < active_thread_count - delta; ++k) {
+        const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails;
+        // The kernel shuffles warp ranks, so the expected value is the warp rank of the active
+        // thread `delta` positions later.
+        const auto expected_val = active_threads[k + delta];
+        const auto actual_val = uint_arr.ptr()[global_thread_idx];
+        INFO("global index: " << global_thread_idx);
+        // Only invoke REQUIRE on a mismatch, so passing elements are not individually asserted.
+        if (actual_val != expected_val) {
+          REQUIRE(actual_val == expected_val);
+        }
+      }
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Validates the shuffle down behavior of tiled partitions of all valid sizes{2, 4, 8, 16, 32,
+ * 64(if AMD)} for delta values of [0, tile size). The partitions are created over a coalesced
+ * group, with memberships of threads in the coalesced group being controlled via a passed in active
+ * mask. The test is run for all overloads of shfl_down.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/coalesced_group_tiled_partition.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Shfl_Down_Positive_Basic", "", int,
+                   unsigned int, long, unsigned long, long long, unsigned long long, float,
+                   double) {
+  // Thin wrapper: runs the shared shfl_down implementation once per listed element type.
+  CoalescedGroupTiledPartitonShflDownTestImpl<TestType>();
+}
+
+
+// Kernel: each thread left active by its warp mask contributes its rank within the full warp
+// (converted to T) to a shfl across its tile of the coalesced group. The source lane is read from
+// `target_lanes`, indexed by the thread's rank within the tile, and the received value is stored
+// at the thread's grid-wide rank in `out`. Deactivated threads write nothing.
+template <typename T, size_t warp_size>
+__global__ void coalesced_group_tiled_partition_shfl(uint64_t* active_masks, uint8_t* target_lanes,
+                                                     T* const out, const unsigned int tile_size) {
+  if (deactivate_thread<warp_size>(active_masks)) {
+    return;
+  }
+
+  const cg::thread_block_tile<warp_size> full_warp =
+      cg::tiled_partition<warp_size>(cg::this_thread_block());
+  const T own_value = static_cast<T>(full_warp.thread_rank());
+
+  const auto active_lanes = cg::coalesced_threads();
+  const auto tile = cg::tiled_partition(active_lanes, tile_size);
+  const T received = tile.shfl(own_value, target_lanes[tile.thread_rank()]);
+  out[thread_rank_in_grid()] = received;
+}
+
+// Host driver of the shfl test for element type T. Launches coalesced_group_tiled_partition_shfl
+// with random per-warp activity masks and one random source lane per tile rank, then checks that
+// each validated thread received the warp rank of the active thread sitting at its source lane
+// within the same tile. Tile size and grid/block dimensions come from Catch2 generators.
+template <typename T> static void CoalescedGroupTiledPartitonShflTestImpl() {
+  const auto tile_size = GenerateTileSizes();
+  INFO("Tile size: " << tile_size);
+  auto blocks = GenerateBlockDimensionsForShuffle();
+  auto threads = GenerateThreadDimensionsForShuffle();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  CPUGrid grid(blocks, threads);
+
+  // One output element per thread of the grid, on the device and mirrored on the host.
+  const auto alloc_size = grid.thread_count_ * sizeof(T);
+  LinearAllocGuard<T> uint_arr_dev(LinearAllocs::hipMalloc, alloc_size);
+  LinearAllocGuard<T> uint_arr(LinearAllocs::hipHostMalloc, alloc_size);
+
+  // One activity mask per warp; the last warp of a block may be only partially populated.
+  const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
+  const auto warps_in_grid = warps_in_block * grid.block_count_;
+  LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
+                                              warps_in_grid * sizeof(uint64_t));
+  LinearAllocGuard<uint64_t> active_masks(LinearAllocs::hipHostMalloc,
+                                          warps_in_grid * sizeof(uint64_t));
+  // One source lane per rank within a tile; the same table is used by every tile.
+  LinearAllocGuard<uint8_t> target_lanes_dev(LinearAllocs::hipMalloc, tile_size * sizeof(uint8_t));
+  LinearAllocGuard<uint8_t> target_lanes(LinearAllocs::hipHostMalloc, tile_size * sizeof(uint8_t));
+
+  // Source lanes are drawn from [0, 2 * tile_size], so some of them deliberately point outside
+  // the tile; those are passed to the kernel but skipped during validation below.
+  std::generate(target_lanes.ptr(), target_lanes.ptr() + tile_size,
+                [tile_size] { return GenerateRandomInteger(0, static_cast<int>(2 * tile_size)); });
+  std::generate(active_masks.ptr(), active_masks.ptr() + warps_in_grid,
+                [] { return GenerateRandomInteger(kMaskMin, kMaskLimit); });
+  HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks.ptr(), warps_in_grid * sizeof(uint64_t),
+                      hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(target_lanes_dev.ptr(), target_lanes.ptr(), tile_size * sizeof(uint8_t),
+                      hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemsetAsync(uint_arr_dev.ptr(), 0, alloc_size));
+  coalesced_group_tiled_partition_shfl<T, kWarpSize><<<blocks, threads>>>(
+      active_masks_dev.ptr(), target_lanes_dev.ptr(), uint_arr_dev.ptr(), tile_size);
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipMemcpy(uint_arr.ptr(), uint_arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Number of lanes of the last warp of each block that have no thread behind them.
+  const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_;
+
+  for (auto i = 0u; i < warps_in_grid; ++i) {
+    auto current_warp_mask = active_masks.ptr()[i];
+    // For the last warp of a block, clear the topmost `tail` mask bits (plus the upper 32 bits on
+    // NVIDIA) by shifting them out and back in; all other warps keep their mask unchanged.
+    const auto shift_amount =
+        (tail + 32 * TestContext::get().isNvidia()) * !((i + 1) % warps_in_block);
+    current_warp_mask = (current_warp_mask << shift_amount) >> shift_amount;
+
+    const auto [active_threads, active_thread_count] =
+        coalesce_threads<kWarpSize>(current_warp_mask);
+
+    // Output slots are dense per thread, so the missing tail lanes of all preceding blocks have
+    // to be subtracted from the warp-based index.
+    const auto tails = tail * (i / warps_in_block) * (i >= warps_in_block);
+    // Step tile-sized window over active threads
+    for (auto t = 0u; t < active_thread_count; t += tile_size) {
+      const auto window_start = t;
+      const auto window_end = t + tile_size;
+      // Iterate through window
+      for (auto k = window_start; k < window_end && k < active_thread_count; ++k) {
+        const auto global_thread_idx = i * kWarpSize + active_threads[k] - tails;
+        const auto target_lane = target_lanes.ptr()[k % tile_size];
+        // Skip source lanes that lie outside the tile or beyond the threads actually present in
+        // this (possibly partial) tile.
+        if (target_lane >= tile_size || target_lane >= active_thread_count - t) {
+          continue;
+        }
+        // The kernel shuffles warp ranks, so the expected value is the warp rank of the active
+        // thread occupying the source lane of this tile.
+        const auto expected_val = active_threads[t + target_lane];
+        const auto actual_val = uint_arr.ptr()[global_thread_idx];
+        INFO("global index: " << global_thread_idx);
+        // Only invoke REQUIRE on a mismatch, so passing elements are not individually asserted.
+        if (actual_val != expected_val) {
+          REQUIRE(actual_val == expected_val);
+        }
+      }
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Validates the shuffle behavior of tiled partitions of all valid sizes{2, 4, 8, 16, 32,
+ * 64(if AMD)} using randomly generated source lanes in [0, 2 * tile size]; results for source
+ * lanes that fall outside the tile are not validated. The partitions are created over a coalesced
+ * group, with memberships of threads in the coalesced group being controlled via a passed in active
+ * mask. The test is run for all overloads of shfl.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/coalesced_group_tiled_partition.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Shfl_Positive_Basic", "", int,
+                   unsigned int, long, unsigned long, long long, unsigned long long, float,
+                   double) {
+  // Thin wrapper: runs the shared shfl implementation once per listed element type.
+  CoalescedGroupTiledPartitonShflTestImpl<TestType>();
+}
+
+
+// Kernel checking that partition.sync() makes writes of tile peers visible.
+//
+// Threads deactivated by their warp mask exit immediately. Every remaining thread waits for
+// wait_modifiers[tid] (passed to busy_wait), writes its rank within its tile into its own slot,
+// synchronizes the tile, and then verifies that the slots of the other threads of the same tile
+// hold their respective tile ranks. The outcome (1 = all reads matched, 0 = mismatch) is finally
+// written to the thread's slot in global_data.
+//
+// use_global selects where the intermediate slots live: directly in global_data, or in dynamic
+// shared memory (indexed modulo the block size).
+template <bool use_global, size_t warp_size, typename T>
+__global__ void coalesced_group_tiled_partition_sync_check(uint64_t* active_masks, T* global_data,
+                                                           unsigned int* wait_modifiers,
+                                                           size_t tile_size) {
+  if (deactivate_thread<warp_size>(active_masks)) {
+    return;
+  }
+
+  // Dynamic shared memory, sized by the launch configuration; only used when !use_global.
+  extern __shared__ uint8_t shared_data[];
+  T* const data = use_global ? global_data : reinterpret_cast<T*>(shared_data);
+  const auto tid = cg::this_grid().thread_rank();
+  const auto block = cg::this_thread_block();
+  const auto coalesced = cg::coalesced_threads();
+  const auto partition = cg::tiled_partition(coalesced, tile_size);
+  // Maps a grid-wide slot index to an index into `data`: unchanged for global memory, wrapped to
+  // the block-local range for shared memory.
+  const auto data_idx = [&block](unsigned int i) { return use_global ? i : (i % block.size()); };
+
+  const auto wait_modifier = wait_modifiers[tid];
+
+  // Slots are assigned by position within the coalesced group rather than by lane: the active
+  // threads of a warp occupy consecutive slots starting at the first slot of that warp.
+  const auto block_rank = tid / block.size();
+  const auto warp_rank = block.thread_rank() / warp_size;
+  const auto warp_base = block_rank * block.size() + warp_rank * warp_size;
+  const auto global_idx = warp_base + coalesced.thread_rank();
+
+  busy_wait(wait_modifier);
+  data[data_idx(global_idx)] = partition.thread_rank();
+  partition.sync();
+
+  bool valid = true;
+  const auto tile_rank = coalesced.thread_rank() / tile_size;
+  for (auto i = 0u; i < tile_size; ++i) {
+    // Visit every rank of the tile once, starting from an offset that depends on the own rank.
+    const auto target_rank_in_tile = (coalesced.thread_rank() + i) % tile_size;
+    const auto target_rank_in_warp = tile_rank * tile_size + target_rank_in_tile;
+    // The last tile of a coalesced group may be partial; ranks beyond the group are skipped.
+    if (target_rank_in_warp >= coalesced.size()) {
+      continue;
+    }
+    if (!(valid &= (data[data_idx(warp_base + target_rank_in_warp)] == target_rank_in_tile))) {
+      break;
+    }
+  }
+  // Validate
+  // NOTE(review): presumably this second sync keeps a thread from overwriting its slot with the
+  // result while tile peers may still be reading the rank stored there - confirm.
+  partition.sync();
+  data[data_idx(global_idx)] = valid;
+  if constexpr (!use_global) {
+    // Results collected in shared memory still have to be published to the global output.
+    global_data[global_idx] = data[data_idx(global_idx)];
+  }
+}
+
+// Host driver of the sync test. Launches coalesced_group_tiled_partition_sync_check with random
+// per-warp activity masks and per-thread wait values, then checks that every slot written by an
+// active thread holds a non-zero (i.e. "valid") result.
+//
+// global_memory selects whether the kernel keeps its intermediate slots in global memory or in
+// dynamic shared memory; T is the element type of those slots.
+template <bool global_memory, typename T> void CoalescedGroupTiledPartitionSyncTest() {
+  const auto randomized_run_count = GENERATE(range(0, cmd_options.cg_iterations));
+  INFO("Run number: " << randomized_run_count + 1);
+  const auto tile_size = GenerateTileSizes();
+  INFO("Tile size: " << tile_size);
+  auto blocks = GenerateBlockDimensionsForShuffle();
+  auto threads = GenerateThreadDimensionsForShuffle();
+  INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+  INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+  CPUGrid grid(blocks, threads);
+
+  const auto alloc_size = grid.thread_count_ * sizeof(T);
+  const auto alloc_size_per_block = alloc_size / grid.block_count_;
+  int max_shared_mem_per_block = 0;
+  HIP_CHECK(hipDeviceGetAttribute(&max_shared_mem_per_block,
+                                  hipDeviceAttributeMaxSharedMemoryPerBlock, 0));
+  // The shared-memory variant needs one slot per thread of a block; configurations that do not
+  // fit into the device's shared memory are silently skipped.
+  if (!global_memory && (max_shared_mem_per_block < alloc_size_per_block)) {
+    return;
+  }
+
+  LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+  LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+  LinearAllocGuard<unsigned int> wait_modifiers_dev(LinearAllocs::hipMalloc,
+                                                    grid.thread_count_ * sizeof(unsigned int));
+  LinearAllocGuard<unsigned int> wait_modifiers(LinearAllocs::hipHostMalloc,
+                                                grid.thread_count_ * sizeof(unsigned int));
+  // One activity mask per warp; the last warp of a block may be only partially populated.
+  const auto warps_in_block = (grid.threads_in_block_count_ + kWarpSize - 1) / kWarpSize;
+  const auto warps_in_grid = warps_in_block * grid.block_count_;
+  LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
+                                              warps_in_grid * sizeof(uint64_t));
+  LinearAllocGuard<uint64_t> active_masks(LinearAllocs::hipHostMalloc,
+                                          warps_in_grid * sizeof(uint64_t));
+  // The first run uses no waiting at all; every later run gives each thread a random wait value
+  // in [0, 1500].
+  if (randomized_run_count != 0) {
+    std::generate(wait_modifiers.ptr(), wait_modifiers.ptr() + grid.thread_count_,
+                  [] { return GenerateRandomInteger(0u, 1500u); });
+  } else {
+    std::fill_n(wait_modifiers.ptr(), grid.thread_count_, 0u);
+  }
+  std::generate(active_masks.ptr(), active_masks.ptr() + warps_in_grid,
+                [] { return GenerateRandomInteger(kMaskMin, kMaskLimit); });
+
+  HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks.ptr(), warps_in_grid * sizeof(uint64_t),
+                      hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(wait_modifiers_dev.ptr(), wait_modifiers.ptr(),
+                      grid.thread_count_ * sizeof(unsigned int), hipMemcpyHostToDevice));
+
+  // Dynamic shared memory is only requested for the shared-memory variant.
+  const auto shared_memory_size = global_memory ? 0u : alloc_size_per_block;
+  coalesced_group_tiled_partition_sync_check<global_memory, kWarpSize>
+      <<<blocks, threads, shared_memory_size>>>(active_masks_dev.ptr(), arr_dev.ptr(),
+                                                wait_modifiers_dev.ptr(), tile_size);
+  HIP_CHECK(hipGetLastError());
+
+  HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Number of lanes of the last warp of each block that have no thread behind them.
+  const auto tail = warps_in_block * kWarpSize - grid.threads_in_block_count_;
+  for (int i = 0u; i < grid.block_count_; ++i) {
+    for (int j = 0u; j < warps_in_block; ++j) {
+      const auto warp_idx = i * warps_in_block + j;
+      auto mask = active_masks.ptr()[warp_idx];
+      // For the last warp of a block, clear the topmost `tail` mask bits (plus the upper 32 bits
+      // on NVIDIA) by shifting them out and back in; other warps keep their mask unchanged.
+      const auto shift_amount =
+          (tail + 32 * TestContext::get().isNvidia()) * !((warp_idx + 1) % warps_in_block);
+      mask = (mask << shift_amount) >> shift_amount;
+      // The kernel packs the results of a warp's active threads into consecutive slots starting
+      // at the warp's first slot, so only the first `active_count` slots are inspected.
+      const auto active_count = std::bitset<sizeof(mask) * 8>(mask).count();
+      const auto start_offset = i * grid.threads_in_block_count_ + j * kWarpSize;
+      const auto end_offset = start_offset + active_count;
+      const auto valid =
+          std::all_of(arr.ptr() + start_offset, arr.ptr() + end_offset, [](T e) { return e; });
+      // Only invoke REQUIRE on a failure, so passing warps are not individually asserted.
+      if (!valid) {
+        REQUIRE(valid);
+      }
+    }
+  }
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Launches a kernel wherein threads in each warp are deactivated based on a passed bitmask.
+ * Coalesced groups are formed and divided into tiled partitions(size of 2, 4, 8, 16, 32, 64 if AMD)
+ * and every thread writes its intra-tile rank into an array slot determined by its global warp rank
+ * and coalesced group rank. The array is either in global or dynamic shared memory based on a
+ * compile time switch, and the test is run for arrays of 1, 2, and 4 byte elements. Before the
+ * write each thread executes a busy wait loop for a random amount of clock cycles, the amount being
+ * read from an input array. After the write a tile-wide sync is performed and each thread validates
+ * that it can read the expected values that other threads within the same tile have written to
+ * their respective array slots.
+ * Test source
+ * ------------------------
+ *    - unit/cooperativeGrps/coalesced_group_tiled_partition.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.2
+ */
+TEMPLATE_TEST_CASE("Unit_Coalesced_Group_Tiled_Partition_Sync_Positive_Basic", "", uint8_t,
+                   uint16_t, uint32_t) {
+  // Both variants share one implementation; the boolean selects where the kernel keeps the slots
+  // that the threads of a tile exchange: global memory (true) or dynamic shared memory (false).
+  SECTION("Global memory") { CoalescedGroupTiledPartitionSyncTest<true, TestType>(); }
+  SECTION("Shared memory") { CoalescedGroupTiledPartitionSyncTest<false, TestType>(); }
+}
@@ -21,7 +21,7 @@ THE SOFTWARE.
 */
 #include <hip_test_common.hh>
 #include <hip/hip_cooperative_groups.h>
-#include <hip_test_defgroups.hh>
+ 

 /**
 * @addtogroup coalesced_group thread_block_tile
@@ -76,3 +76,4 @@ template <class T> bool CheckDimensions(unsigned int device, T kernel, dim3 bloc

  return true;
 }
+
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */
 #include <hip_test_common.hh>
 #include <dlfcn.h>
-#include <hip_test_defgroups.hh>
+ 
 /**
 * @addtogroup hipLaunchKernelGGL hipLaunchCooperativeKernel
 * @{
@@ -17,7 +17,7 @@ OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 #include <hip_test_common.hh>
-#include <hip_test_defgroups.hh>
+ 
 #include <stdio.h>
 #include <dlfcn.h>
 #include <vector>
@@ -1,14 +1,15 @@
 # Common Tests - Test independent of all platforms
 set(TEST_SRC
+    error_handling_common.cc
    hipGetErrorName.cc
    hipGetErrorString.cc
-    hipGetLastError.cc
-    hipPeekAtLastError.cc
    hipDrvGetErrorName.cc
    hipDrvGetErrorString.cc
+    hipGetLastError.cc
+    hipPeekAtLastError.cc
 )

 hip_add_exe_to_target(NAME ErrorHandlingTest
                      TEST_SRC ${TEST_SRC}
                      TEST_TARGET_NAME build_tests
-                      COMPILE_OPTIONS -std=c++17)
+                      COMPILE_OPTIONS -std=c++17)
@@ -0,0 +1,534 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "error_handling_common.hh"
+
+const char* ErrorName(hipError_t enumerator) {  // HIP names if HT_AMD, CUDA_* names otherwise
+  switch (enumerator) {
+#if HT_AMD  // AMD: every enumerator maps to its own identifier spelled as a string
+    case hipSuccess:
+      return "hipSuccess";
+    case hipErrorInvalidValue:
+      return "hipErrorInvalidValue";
+    case hipErrorOutOfMemory:
+      return "hipErrorOutOfMemory";
+    case hipErrorNotInitialized:
+      return "hipErrorNotInitialized";
+    case hipErrorDeinitialized:
+      return "hipErrorDeinitialized";
+    case hipErrorProfilerDisabled:
+      return "hipErrorProfilerDisabled";
+    case hipErrorProfilerNotInitialized:
+      return "hipErrorProfilerNotInitialized";
+    case hipErrorProfilerAlreadyStarted:
+      return "hipErrorProfilerAlreadyStarted";
+    case hipErrorProfilerAlreadyStopped:
+      return "hipErrorProfilerAlreadyStopped";
+    case hipErrorInvalidConfiguration:
+      return "hipErrorInvalidConfiguration";
+    case hipErrorInvalidSymbol:
+      return "hipErrorInvalidSymbol";
+    case hipErrorInvalidDevicePointer:
+      return "hipErrorInvalidDevicePointer";
+    case hipErrorInvalidMemcpyDirection:
+      return "hipErrorInvalidMemcpyDirection";
+    case hipErrorInsufficientDriver:
+      return "hipErrorInsufficientDriver";
+    case hipErrorMissingConfiguration:
+      return "hipErrorMissingConfiguration";
+    case hipErrorPriorLaunchFailure:
+      return "hipErrorPriorLaunchFailure";
+    case hipErrorInvalidDeviceFunction:
+      return "hipErrorInvalidDeviceFunction";
+    case hipErrorNoDevice:
+      return "hipErrorNoDevice";
+    case hipErrorInvalidDevice:
+      return "hipErrorInvalidDevice";
+    case hipErrorInvalidPitchValue:
+      return "hipErrorInvalidPitchValue";
+    case hipErrorInvalidImage:
+      return "hipErrorInvalidImage";
+    case hipErrorInvalidContext:
+      return "hipErrorInvalidContext";
+    case hipErrorContextAlreadyCurrent:
+      return "hipErrorContextAlreadyCurrent";
+    case hipErrorMapFailed:
+      return "hipErrorMapFailed";
+    case hipErrorUnmapFailed:
+      return "hipErrorUnmapFailed";
+    case hipErrorArrayIsMapped:
+      return "hipErrorArrayIsMapped";
+    case hipErrorAlreadyMapped:
+      return "hipErrorAlreadyMapped";
+    case hipErrorNoBinaryForGpu:
+      return "hipErrorNoBinaryForGpu";
+    case hipErrorAlreadyAcquired:
+      return "hipErrorAlreadyAcquired";
+    case hipErrorNotMapped:
+      return "hipErrorNotMapped";
+    case hipErrorNotMappedAsArray:
+      return "hipErrorNotMappedAsArray";
+    case hipErrorNotMappedAsPointer:
+      return "hipErrorNotMappedAsPointer";
+    case hipErrorECCNotCorrectable:
+      return "hipErrorECCNotCorrectable";
+    case hipErrorUnsupportedLimit:
+      return "hipErrorUnsupportedLimit";
+    case hipErrorContextAlreadyInUse:
+      return "hipErrorContextAlreadyInUse";
+    case hipErrorPeerAccessUnsupported:
+      return "hipErrorPeerAccessUnsupported";
+    case hipErrorInvalidKernelFile:
+      return "hipErrorInvalidKernelFile";
+    case hipErrorInvalidGraphicsContext:
+      return "hipErrorInvalidGraphicsContext";
+    case hipErrorInvalidSource:
+      return "hipErrorInvalidSource";
+    case hipErrorFileNotFound:
+      return "hipErrorFileNotFound";
+    case hipErrorSharedObjectSymbolNotFound:
+      return "hipErrorSharedObjectSymbolNotFound";
+    case hipErrorSharedObjectInitFailed:
+      return "hipErrorSharedObjectInitFailed";
+    case hipErrorOperatingSystem:
+      return "hipErrorOperatingSystem";
+    case hipErrorInvalidHandle:
+      return "hipErrorInvalidHandle";
+    case hipErrorIllegalState:
+      return "hipErrorIllegalState";
+    case hipErrorNotFound:
+      return "hipErrorNotFound";
+    case hipErrorNotReady:
+      return "hipErrorNotReady";
+    case hipErrorIllegalAddress:
+      return "hipErrorIllegalAddress";
+    case hipErrorLaunchOutOfResources:
+      return "hipErrorLaunchOutOfResources";
+    case hipErrorLaunchTimeOut:
+      return "hipErrorLaunchTimeOut";
+    case hipErrorPeerAccessAlreadyEnabled:
+      return "hipErrorPeerAccessAlreadyEnabled";
+    case hipErrorPeerAccessNotEnabled:
+      return "hipErrorPeerAccessNotEnabled";
+    case hipErrorSetOnActiveProcess:
+      return "hipErrorSetOnActiveProcess";
+    case hipErrorContextIsDestroyed:
+      return "hipErrorContextIsDestroyed";
+    case hipErrorAssert:
+      return "hipErrorAssert";
+    case hipErrorHostMemoryAlreadyRegistered:
+      return "hipErrorHostMemoryAlreadyRegistered";
+    case hipErrorHostMemoryNotRegistered:
+      return "hipErrorHostMemoryNotRegistered";
+    case hipErrorLaunchFailure:
+      return "hipErrorLaunchFailure";
+    case hipErrorNotSupported:
+      return "hipErrorNotSupported";
+    case hipErrorUnknown:
+      return "hipErrorUnknown";
+    case hipErrorRuntimeMemory:
+      return "hipErrorRuntimeMemory";
+    case hipErrorRuntimeOther:
+      return "hipErrorRuntimeOther";
+    case hipErrorCooperativeLaunchTooLarge:
+      return "hipErrorCooperativeLaunchTooLarge";
+    case hipErrorStreamCaptureUnsupported:
+      return "hipErrorStreamCaptureUnsupported";
+    case hipErrorStreamCaptureInvalidated:
+      return "hipErrorStreamCaptureInvalidated";
+    case hipErrorStreamCaptureMerge:
+      return "hipErrorStreamCaptureMerge";
+    case hipErrorStreamCaptureUnmatched:
+      return "hipErrorStreamCaptureUnmatched";
+    case hipErrorStreamCaptureUnjoined:
+      return "hipErrorStreamCaptureUnjoined";
+    case hipErrorStreamCaptureIsolation:
+      return "hipErrorStreamCaptureIsolation";
+    case hipErrorStreamCaptureImplicit:
+      return "hipErrorStreamCaptureImplicit";
+    case hipErrorCapturedEvent:
+      return "hipErrorCapturedEvent";
+    case hipErrorStreamCaptureWrongThread:
+      return "hipErrorStreamCaptureWrongThread";
+    case hipErrorGraphExecUpdateFailure:
+      return "hipErrorGraphExecUpdateFailure";
+    case hipErrorTbd:  // handled only in this branch; the non-AMD branch has no such label
+      return "hipErrorTbd";
+    default:  // any value without a label above
+      return "hipErrorUnknown";
+#else  // non-AMD: CUDA_* names; several codes are reported as CUDA_ERROR_UNKNOWN
+    case hipSuccess:
+      return "CUDA_SUCCESS";
+    case hipErrorInvalidValue:
+      return "CUDA_ERROR_INVALID_VALUE";
+    case hipErrorOutOfMemory:
+      return "CUDA_ERROR_OUT_OF_MEMORY";
+    case hipErrorNotInitialized:
+      return "CUDA_ERROR_NOT_INITIALIZED";
+    case hipErrorDeinitialized:
+      return "CUDA_ERROR_DEINITIALIZED";
+    case hipErrorProfilerDisabled:
+      return "CUDA_ERROR_PROFILER_DISABLED";
+    case hipErrorProfilerNotInitialized:
+      return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
+    case hipErrorProfilerAlreadyStarted:
+      return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
+    case hipErrorProfilerAlreadyStopped:
+      return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
+    case hipErrorInvalidConfiguration:
+      return "CUDA_ERROR_UNKNOWN";
+    case hipErrorInvalidSymbol:
+      return "CUDA_ERROR_UNKNOWN";
+    case hipErrorInvalidDevicePointer:
+      return "CUDA_ERROR_UNKNOWN";
+    case hipErrorInvalidMemcpyDirection:
+      return "CUDA_ERROR_UNKNOWN";
+    case hipErrorInsufficientDriver:
+      return "CUDA_ERROR_UNKNOWN";
+    case hipErrorMissingConfiguration:
+      return "CUDA_ERROR_UNKNOWN";
+    case hipErrorPriorLaunchFailure:
+      return "CUDA_ERROR_UNKNOWN";
+    case hipErrorInvalidDeviceFunction:
+      return "CUDA_ERROR_UNKNOWN";
+    case hipErrorNoDevice:
+      return "CUDA_ERROR_NO_DEVICE";
+    case hipErrorInvalidDevice:
+      return "CUDA_ERROR_INVALID_DEVICE";
+    case hipErrorInvalidPitchValue:
+      return "CUDA_ERROR_UNKNOWN";
+    case hipErrorInvalidImage:
+      return "CUDA_ERROR_INVALID_IMAGE";
+    case hipErrorInvalidContext:
+      return "CUDA_ERROR_INVALID_CONTEXT";
+    case hipErrorContextAlreadyCurrent:
+      return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
+    case hipErrorMapFailed:
+      return "CUDA_ERROR_MAP_FAILED";
+    case hipErrorUnmapFailed:
+      return "CUDA_ERROR_UNMAP_FAILED";
+    case hipErrorArrayIsMapped:
+      return "CUDA_ERROR_ARRAY_IS_MAPPED";
+    case hipErrorAlreadyMapped:
+      return "CUDA_ERROR_ALREADY_MAPPED";
+    case hipErrorNoBinaryForGpu:
+      return "CUDA_ERROR_NO_BINARY_FOR_GPU";
+    case hipErrorAlreadyAcquired:
+      return "CUDA_ERROR_ALREADY_ACQUIRED";
+    case hipErrorNotMapped:
+      return "CUDA_ERROR_NOT_MAPPED";
+    case hipErrorNotMappedAsArray:
+      return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
+    case hipErrorNotMappedAsPointer:
+      return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
+    case hipErrorECCNotCorrectable:
+      return "CUDA_ERROR_ECC_UNCORRECTABLE";
+    case hipErrorUnsupportedLimit:
+      return "CUDA_ERROR_UNSUPPORTED_LIMIT";
+    case hipErrorContextAlreadyInUse:
+      return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
+    case hipErrorPeerAccessUnsupported:
+      return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
+    case hipErrorInvalidKernelFile:  // reported under the PTX name in this branch
+      return "CUDA_ERROR_INVALID_PTX";
+    case hipErrorInvalidGraphicsContext:
+      return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
+    case hipErrorInvalidSource:
+      return "CUDA_ERROR_INVALID_SOURCE";
+    case hipErrorFileNotFound:
+      return "CUDA_ERROR_FILE_NOT_FOUND";
+    case hipErrorSharedObjectSymbolNotFound:
+      return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
+    case hipErrorSharedObjectInitFailed:
+      return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
+    case hipErrorOperatingSystem:
+      return "CUDA_ERROR_OPERATING_SYSTEM";
+    case hipErrorInvalidHandle:
+      return "CUDA_ERROR_INVALID_HANDLE";
+    case hipErrorIllegalState:
+      return "CUDA_ERROR_ILLEGAL_STATE";
+    case hipErrorNotFound:
+      return "CUDA_ERROR_NOT_FOUND";
+    case hipErrorNotReady:
+      return "CUDA_ERROR_NOT_READY";
+    case hipErrorIllegalAddress:
+      return "CUDA_ERROR_ILLEGAL_ADDRESS";
+    case hipErrorLaunchOutOfResources:
+      return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
+    case hipErrorLaunchTimeOut:
+      return "CUDA_ERROR_LAUNCH_TIMEOUT";
+    case hipErrorPeerAccessAlreadyEnabled:
+      return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
+    case hipErrorPeerAccessNotEnabled:
+      return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
+    case hipErrorSetOnActiveProcess:  // the CUDA-style name uses different wording
+      return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
+    case hipErrorContextIsDestroyed:
+      return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
+    case hipErrorAssert:
+      return "CUDA_ERROR_ASSERT";
+    case hipErrorHostMemoryAlreadyRegistered:
+      return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
+    case hipErrorHostMemoryNotRegistered:
+      return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
+    case hipErrorLaunchFailure:
+      return "CUDA_ERROR_LAUNCH_FAILED";
+    case hipErrorNotSupported:
+      return "CUDA_ERROR_NOT_SUPPORTED";
+    case hipErrorUnknown:
+      return "CUDA_ERROR_UNKNOWN";
+    case hipErrorRuntimeMemory:
+      return "CUDA_ERROR_UNKNOWN";
+    case hipErrorRuntimeOther:
+      return "CUDA_ERROR_UNKNOWN";
+    case hipErrorCooperativeLaunchTooLarge:
+      return "CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE";
+    case hipErrorStreamCaptureUnsupported:
+      return "CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED";
+    case hipErrorStreamCaptureInvalidated:
+      return "CUDA_ERROR_STREAM_CAPTURE_INVALIDATED";
+    case hipErrorStreamCaptureMerge:
+      return "CUDA_ERROR_STREAM_CAPTURE_MERGE";
+    case hipErrorStreamCaptureUnmatched:
+      return "CUDA_ERROR_STREAM_CAPTURE_UNMATCHED";
+    case hipErrorStreamCaptureUnjoined:
+      return "CUDA_ERROR_STREAM_CAPTURE_UNJOINED";
+    case hipErrorStreamCaptureIsolation:
+      return "CUDA_ERROR_STREAM_CAPTURE_ISOLATION";
+    case hipErrorStreamCaptureImplicit:
+      return "CUDA_ERROR_STREAM_CAPTURE_IMPLICIT";
+    case hipErrorCapturedEvent:
+      return "CUDA_ERROR_CAPTURED_EVENT";
+    case hipErrorStreamCaptureWrongThread:
+      return "CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD";
+    case hipErrorGraphExecUpdateFailure:
+      return "CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE";
+    default:  // any value without a label above
+      return "CUDA_ERROR_UNKNOWN";
+#endif  // HT_AMD
+  }
+}
+
+/**
+ * Maps a HIP error enumerator to a human-readable message string.
+ *
+ * Most messages are identical on every platform. Where the wording differs,
+ * HT_AMD selects the AMD text and all other builds get the NVIDIA text, the
+ * same two-way split ErrorName() uses.
+ *
+ * Nine enumerators have a dedicated message only when HT_AMD is set. In other
+ * builds their case labels are compiled out, so those values reach the default
+ * label and produce "unknown error", the same result as an unrecognised
+ * value.
+ *
+ * NOTE(review): presumably the strings mirror what hipGetErrorString() reports
+ * on each platform so that tests can compare against them - confirm with the
+ * callers of this helper.
+ *
+ * @param enumerator Error code to describe.
+ * @return Pointer to a string literal; values without a case label yield
+ *         "unknown error".
+ */
+const char* ErrorString(hipError_t enumerator) {
+  switch (enumerator) {
+    case hipSuccess:
+      return "no error";
+    case hipErrorInvalidValue:
+      return "invalid argument";
+    case hipErrorOutOfMemory:
+      return "out of memory";
+    case hipErrorNotInitialized:
+      return "initialization error";
+    case hipErrorDeinitialized:
+      return "driver shutting down";
+    case hipErrorProfilerDisabled:
+      return "profiler disabled while using external profiling tool";
+    // Platform-specific wording. #else (not #elif) guarantees a return in every
+    // build, so this label can never fall through into the next case.
+    case hipErrorProfilerNotInitialized:
+#if HT_AMD
+      return "profiler is not initialized";
+#else
+      return "profiler not initialized: call cudaProfilerInitialize()";
+#endif
+    case hipErrorProfilerAlreadyStarted:
+      return "profiler already started";
+    case hipErrorProfilerAlreadyStopped:
+      return "profiler already stopped";
+#if HT_AMD
+    // AMD-only messages. In other builds these labels are deliberately absent,
+    // so the values are handled by the default label ("unknown error"). No
+    // return statement is emitted for them outside HT_AMD: it would directly
+    // follow another return and could never execute.
+    case hipErrorInvalidConfiguration:
+      return "invalid configuration argument";
+    case hipErrorInvalidPitchValue:
+      return "invalid pitch argument";
+    case hipErrorInvalidSymbol:
+      return "invalid device symbol";
+    case hipErrorInvalidDevicePointer:
+      return "invalid device pointer";
+    case hipErrorInvalidMemcpyDirection:
+      return "invalid copy direction for memcpy";
+    case hipErrorInsufficientDriver:
+      return "driver version is insufficient for runtime version";
+    case hipErrorMissingConfiguration:
+      return "__global__ function call is not configured";
+    case hipErrorPriorLaunchFailure:
+      return "unspecified launch failure in prior launch";
+    case hipErrorInvalidDeviceFunction:
+      return "invalid device function";
+#endif
+    // The message names the platform's device type.
+    case hipErrorNoDevice:
+#if HT_AMD
+      return "no ROCm-capable device is detected";
+#else
+      return "no CUDA-capable device is detected";
+#endif
+    case hipErrorInvalidDevice:
+      return "invalid device ordinal";
+    case hipErrorInvalidImage:
+      return "device kernel image is invalid";
+    case hipErrorInvalidContext:
+      return "invalid device context";
+    case hipErrorContextAlreadyCurrent:
+#if HT_AMD
+      return "context is already current context";
+#else
+      return "context already current";
+#endif
+    case hipErrorMapFailed:
+      return "mapping of buffer object failed";
+    case hipErrorUnmapFailed:
+      return "unmapping of buffer object failed";
+    case hipErrorArrayIsMapped:
+      return "array is mapped";
+    case hipErrorAlreadyMapped:
+      return "resource already mapped";
+    case hipErrorNoBinaryForGpu:
+      return "no kernel image is available for execution on the device";
+    case hipErrorAlreadyAcquired:
+      return "resource already acquired";
+    case hipErrorNotMapped:
+      return "resource not mapped";
+    case hipErrorNotMappedAsArray:
+      return "resource not mapped as array";
+    case hipErrorNotMappedAsPointer:
+      return "resource not mapped as pointer";
+    case hipErrorECCNotCorrectable:
+      return "uncorrectable ECC error encountered";
+    case hipErrorUnsupportedLimit:
+      return "limit is not supported on this architecture";
+    case hipErrorContextAlreadyInUse:
+      return "exclusive-thread device already in use by a different thread";
+    case hipErrorPeerAccessUnsupported:
+      return "peer access is not supported between these two devices";
+    // The non-AMD text describes a PTX JIT failure rather than a kernel file.
+    case hipErrorInvalidKernelFile:
+#if HT_AMD
+      return "invalid kernel file";
+#else
+      return "a PTX JIT compilation failed";
+#endif
+    case hipErrorInvalidGraphicsContext:
+      return "invalid OpenGL or DirectX context";
+    // Same text as hipErrorInvalidImage above.
+    case hipErrorInvalidSource:
+      return "device kernel image is invalid";
+    case hipErrorFileNotFound:
+      return "file not found";
+    case hipErrorSharedObjectSymbolNotFound:
+      return "shared object symbol not found";
+    case hipErrorSharedObjectInitFailed:
+      return "shared object initialization failed";
+    case hipErrorOperatingSystem:
+      return "OS call failed or operation not supported on this OS";
+    case hipErrorInvalidHandle:
+      return "invalid resource handle";
+    case hipErrorIllegalState:
+      return "the operation cannot be performed in the present state";
+    case hipErrorNotFound:
+      return "named symbol not found";
+    case hipErrorNotReady:
+      return "device not ready";
+    case hipErrorIllegalAddress:
+      return "an illegal memory access was encountered";
+    case hipErrorLaunchOutOfResources:
+      return "too many resources requested for launch";
+    case hipErrorLaunchTimeOut:
+      return "the launch timed out and was terminated";
+    case hipErrorPeerAccessAlreadyEnabled:
+      return "peer access is already enabled";
+    case hipErrorPeerAccessNotEnabled:
+      return "peer access has not been enabled";
+    case hipErrorSetOnActiveProcess:
+      return "cannot set while device is active in this process";
+    case hipErrorContextIsDestroyed:
+      return "context is destroyed";
+    case hipErrorAssert:
+      return "device-side assert triggered";
+    case hipErrorHostMemoryAlreadyRegistered:
+      return "part or all of the requested memory range is already mapped";
+    case hipErrorHostMemoryNotRegistered:
+      return "pointer does not correspond to a registered memory region";
+    case hipErrorLaunchFailure:
+      return "unspecified launch failure";
+    case hipErrorCooperativeLaunchTooLarge:
+      return "too many blocks in cooperative launch";
+    case hipErrorNotSupported:
+      return "operation not supported";
+    // Stream capture and graph errors.
+    case hipErrorStreamCaptureUnsupported:
+      return "operation not permitted when stream is capturing";
+    case hipErrorStreamCaptureInvalidated:
+      return "operation failed due to a previous error during capture";
+    case hipErrorStreamCaptureMerge:
+      return "operation would result in a merge of separate capture sequences";
+    case hipErrorStreamCaptureUnmatched:
+      return "capture was not ended in the same stream as it began";
+    case hipErrorStreamCaptureUnjoined:
+      return "capturing stream has unjoined work";
+    case hipErrorStreamCaptureIsolation:
+      return "dependency created on uncaptured work in another stream";
+    case hipErrorStreamCaptureImplicit:
+      return "operation would make the legacy stream depend on a capturing blocking stream";  // NOLINT
+    case hipErrorCapturedEvent:
+      return "operation not permitted on an event last recorded in a capturing stream";  // NOLINT
+    case hipErrorStreamCaptureWrongThread:
+      return "attempt to terminate a thread-local capture sequence from another thread";  // NOLINT
+    // Adjacent literals are concatenated by the compiler into one message.
+    case hipErrorGraphExecUpdateFailure:
+      return "the graph update was not performed because it included changes which violated "
+             "constraints specific to instantiated graph update";  // NOLINT
+    // Returned in every build, although ErrorName() reports these two as
+    // CUDA_ERROR_UNKNOWN outside HT_AMD.
+    case hipErrorRuntimeMemory:
+      return "runtime memory call returned error";
+    case hipErrorRuntimeOther:
+      return "runtime call other than memory returned error";
+    // Listed explicitly for completeness; it shares the default message.
+    case hipErrorUnknown:
+    default:
+      return "unknown error";
+  }
+}
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -33,7 +33,7 @@ constexpr hipError_t kErrorEnumerators[] = {hipSuccess,
                                            hipErrorProfilerNotInitialized,
                                            hipErrorProfilerAlreadyStarted,
                                            hipErrorProfilerAlreadyStopped,
-                                            #if HT_AMD
+#if HT_AMD
                                            hipErrorInvalidConfiguration,
                                            hipErrorInvalidPitchValue,
                                            hipErrorInvalidSymbol,
@@ -43,7 +43,7 @@ constexpr hipError_t kErrorEnumerators[] = {hipSuccess,
                                            hipErrorMissingConfiguration,
                                            hipErrorPriorLaunchFailure,
                                            hipErrorInvalidDeviceFunction,
-                                            #endif
+#endif
                                            hipErrorNoDevice,
                                            hipErrorInvalidDevice,
                                            hipErrorInvalidImage,
@@ -97,8 +97,12 @@ constexpr hipError_t kErrorEnumerators[] = {hipSuccess,
                                            hipErrorStreamCaptureWrongThread,
                                            hipErrorGraphExecUpdateFailure,
                                            hipErrorUnknown,
-                                            #if HT_AMD
+#if HT_AMD
                                            hipErrorRuntimeMemory,
                                            hipErrorRuntimeOther
-                                            #endif
-                                            };
+#endif
+};
+
+const char* ErrorName(hipError_t enumerator);  // enumerator -> name string (HIP or CUDA_* form)
+
+const char* ErrorString(hipError_t enumerator);  // enumerator -> human-readable message string
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
@@ -17,347 +17,67 @@ OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
 #include <hip_test_common.hh>
-#include "errorEnumerators.h"

-// Local Function to return the error code in string
+#include "error_handling_common.hh"

-static const char *ErrorName(hipError_t enumerator) {
-  switch (enumerator) {
-    #if HT_AMD
-    case hipSuccess:
-        return "hipSuccess";
-    case hipErrorInvalidValue:
-        return "hipErrorInvalidValue";
-    case hipErrorOutOfMemory:
-        return "hipErrorOutOfMemory";
-    case hipErrorNotInitialized:
-        return "hipErrorNotInitialized";
-    case hipErrorDeinitialized:
-        return "hipErrorDeinitialized";
-    case hipErrorProfilerDisabled:
-        return "hipErrorProfilerDisabled";
-    case hipErrorProfilerNotInitialized:
-        return "hipErrorProfilerNotInitialized";
-    case hipErrorProfilerAlreadyStarted:
-        return "hipErrorProfilerAlreadyStarted";
-    case hipErrorProfilerAlreadyStopped:
-        return "hipErrorProfilerAlreadyStopped";
-    case hipErrorInvalidConfiguration:
-        return "hipErrorInvalidConfiguration";
-    case hipErrorInvalidSymbol:
-        return "hipErrorInvalidSymbol";
-    case hipErrorInvalidDevicePointer:
-        return "hipErrorInvalidDevicePointer";
-    case hipErrorInvalidMemcpyDirection:
-        return "hipErrorInvalidMemcpyDirection";
-    case hipErrorInsufficientDriver:
-        return "hipErrorInsufficientDriver";
-    case hipErrorMissingConfiguration:
-        return "hipErrorMissingConfiguration";
-    case hipErrorPriorLaunchFailure:
-        return "hipErrorPriorLaunchFailure";
-    case hipErrorInvalidDeviceFunction:
-        return "hipErrorInvalidDeviceFunction";
-    case hipErrorNoDevice:
-        return "hipErrorNoDevice";
-    case hipErrorInvalidDevice:
-        return "hipErrorInvalidDevice";
-    case hipErrorInvalidPitchValue:
-        return "hipErrorInvalidPitchValue";
-    case hipErrorInvalidImage:
-        return "hipErrorInvalidImage";
-    case hipErrorInvalidContext:
-        return "hipErrorInvalidContext";
-    case hipErrorContextAlreadyCurrent:
-        return "hipErrorContextAlreadyCurrent";
-    case hipErrorMapFailed:
-        return "hipErrorMapFailed";
-    case hipErrorUnmapFailed:
-        return "hipErrorUnmapFailed";
-    case hipErrorArrayIsMapped:
-        return "hipErrorArrayIsMapped";
-    case hipErrorAlreadyMapped:
-        return "hipErrorAlreadyMapped";
-    case hipErrorNoBinaryForGpu:
-        return "hipErrorNoBinaryForGpu";
-    case hipErrorAlreadyAcquired:
-        return "hipErrorAlreadyAcquired";
-    case hipErrorNotMapped:
-        return "hipErrorNotMapped";
-    case hipErrorNotMappedAsArray:
-        return "hipErrorNotMappedAsArray";
-    case hipErrorNotMappedAsPointer:
-        return "hipErrorNotMappedAsPointer";
-    case hipErrorECCNotCorrectable:
-        return "hipErrorECCNotCorrectable";
-    case hipErrorUnsupportedLimit:
-        return "hipErrorUnsupportedLimit";
-    case hipErrorContextAlreadyInUse:
-        return "hipErrorContextAlreadyInUse";
-    case hipErrorPeerAccessUnsupported:
-        return "hipErrorPeerAccessUnsupported";
-    case hipErrorInvalidKernelFile:
-        return "hipErrorInvalidKernelFile";
-    case hipErrorInvalidGraphicsContext:
-        return "hipErrorInvalidGraphicsContext";
-    case hipErrorInvalidSource:
-        return "hipErrorInvalidSource";
-    case hipErrorFileNotFound:
-        return "hipErrorFileNotFound";
-    case hipErrorSharedObjectSymbolNotFound:
-        return "hipErrorSharedObjectSymbolNotFound";
-    case hipErrorSharedObjectInitFailed:
-        return "hipErrorSharedObjectInitFailed";
-    case hipErrorOperatingSystem:
-        return "hipErrorOperatingSystem";
-    case hipErrorInvalidHandle:
-        return "hipErrorInvalidHandle";
-    case hipErrorIllegalState:
-        return "hipErrorIllegalState";
-    case hipErrorNotFound:
-        return "hipErrorNotFound";
-    case hipErrorNotReady:
-        return "hipErrorNotReady";
-    case hipErrorIllegalAddress:
-        return "hipErrorIllegalAddress";
-    case hipErrorLaunchOutOfResources:
-        return "hipErrorLaunchOutOfResources";
-    case hipErrorLaunchTimeOut:
-        return "hipErrorLaunchTimeOut";
-    case hipErrorPeerAccessAlreadyEnabled:
-        return "hipErrorPeerAccessAlreadyEnabled";
-    case hipErrorPeerAccessNotEnabled:
-        return "hipErrorPeerAccessNotEnabled";
-    case hipErrorSetOnActiveProcess:
-        return "hipErrorSetOnActiveProcess";
-    case hipErrorContextIsDestroyed:
-        return "hipErrorContextIsDestroyed";
-    case hipErrorAssert:
-        return "hipErrorAssert";
-    case hipErrorHostMemoryAlreadyRegistered:
-        return "hipErrorHostMemoryAlreadyRegistered";
-    case hipErrorHostMemoryNotRegistered:
-        return "hipErrorHostMemoryNotRegistered";
-    case hipErrorLaunchFailure:
-        return "hipErrorLaunchFailure";
-    case hipErrorNotSupported:
-        return "hipErrorNotSupported";
-    case hipErrorUnknown:
-        return "hipErrorUnknown";
-    case hipErrorRuntimeMemory:
-        return "hipErrorRuntimeMemory";
-    case hipErrorRuntimeOther:
-        return "hipErrorRuntimeOther";
-    case hipErrorCooperativeLaunchTooLarge:
-        return "hipErrorCooperativeLaunchTooLarge";
-    case hipErrorStreamCaptureUnsupported:
-        return "hipErrorStreamCaptureUnsupported";
-    case hipErrorStreamCaptureInvalidated:
-        return "hipErrorStreamCaptureInvalidated";
-    case hipErrorStreamCaptureMerge:
-        return "hipErrorStreamCaptureMerge";
-    case hipErrorStreamCaptureUnmatched:
-        return "hipErrorStreamCaptureUnmatched";
-    case hipErrorStreamCaptureUnjoined:
-        return "hipErrorStreamCaptureUnjoined";
-    case hipErrorStreamCaptureIsolation:
-        return "hipErrorStreamCaptureIsolation";
-    case hipErrorStreamCaptureImplicit:
-        return "hipErrorStreamCaptureImplicit";
-    case hipErrorCapturedEvent:
-        return "hipErrorCapturedEvent";
-    case hipErrorStreamCaptureWrongThread:
-        return "hipErrorStreamCaptureWrongThread";
-    case hipErrorGraphExecUpdateFailure:
-        return "hipErrorGraphExecUpdateFailure";
-    case hipErrorTbd:
-        return "hipErrorTbd";
-    default:
-        return "hipErrorUnknown";
-    #endif
-    #if HT_NVIDIA
-    case hipSuccess:
-        return "CUDA_SUCCESS";
-    case hipErrorInvalidValue:
-        return "CUDA_ERROR_INVALID_VALUE";
-    case hipErrorOutOfMemory:
-        return "CUDA_ERROR_OUT_OF_MEMORY";
-    case hipErrorNotInitialized:
-        return "CUDA_ERROR_NOT_INITIALIZED";
-    case hipErrorDeinitialized:
-        return "CUDA_ERROR_DEINITIALIZED";
-    case hipErrorProfilerDisabled:
-        return "CUDA_ERROR_PROFILER_DISABLED";
-    case hipErrorProfilerNotInitialized:
-        return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
-    case hipErrorProfilerAlreadyStarted:
-        return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
-    case hipErrorProfilerAlreadyStopped:
-        return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
-    case hipErrorInvalidConfiguration:
-        return "CUDA_ERROR_UNKNOWN";
-    case hipErrorInvalidSymbol:
-        return "CUDA_ERROR_UNKNOWN";
-    case hipErrorInvalidDevicePointer:
-        return "CUDA_ERROR_UNKNOWN";
-    case hipErrorInvalidMemcpyDirection:
-        return "CUDA_ERROR_UNKNOWN";
-    case hipErrorInsufficientDriver:
-        return "CUDA_ERROR_UNKNOWN";
-    case hipErrorMissingConfiguration:
-        return "CUDA_ERROR_UNKNOWN";
-    case hipErrorPriorLaunchFailure:
-        return "CUDA_ERROR_UNKNOWN";
-    case hipErrorInvalidDeviceFunction:
-        return "CUDA_ERROR_UNKNOWN";
-    case hipErrorNoDevice:
-        return "CUDA_ERROR_NO_DEVICE";
-    case hipErrorInvalidDevice:
-        return "CUDA_ERROR_INVALID_DEVICE";
-    case hipErrorInvalidPitchValue:
-        return "CUDA_ERROR_UNKNOWN";
-    case hipErrorInvalidImage:
-        return "CUDA_ERROR_INVALID_IMAGE";
-    case hipErrorInvalidContext:
-        return "CUDA_ERROR_INVALID_CONTEXT";
-    case hipErrorContextAlreadyCurrent:
-        return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
-    case hipErrorMapFailed:
-        return "CUDA_ERROR_MAP_FAILED";
-    case hipErrorUnmapFailed:
-        return "CUDA_ERROR_UNMAP_FAILED";
-    case hipErrorArrayIsMapped:
-        return "CUDA_ERROR_ARRAY_IS_MAPPED";
-    case hipErrorAlreadyMapped:
-        return "CUDA_ERROR_ALREADY_MAPPED";
-    case hipErrorNoBinaryForGpu:
-        return "CUDA_ERROR_NO_BINARY_FOR_GPU";
-    case hipErrorAlreadyAcquired:
-        return "CUDA_ERROR_ALREADY_ACQUIRED";
-    case hipErrorNotMapped:
-        return "CUDA_ERROR_NOT_MAPPED";
-    case hipErrorNotMappedAsArray:
-        return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
-    case hipErrorNotMappedAsPointer:
-        return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
-    case hipErrorECCNotCorrectable:
-        return "CUDA_ERROR_ECC_UNCORRECTABLE";
-    case hipErrorUnsupportedLimit:
-        return "CUDA_ERROR_UNSUPPORTED_LIMIT";
-    case hipErrorContextAlreadyInUse:
-        return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
-    case hipErrorPeerAccessUnsupported:
-        return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
-    case hipErrorInvalidKernelFile:
-        return "CUDA_ERROR_INVALID_PTX";
-    case hipErrorInvalidGraphicsContext:
-        return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
-    case hipErrorInvalidSource:
-        return "CUDA_ERROR_INVALID_SOURCE";
-    case hipErrorFileNotFound:
-        return "CUDA_ERROR_FILE_NOT_FOUND";
-    case hipErrorSharedObjectSymbolNotFound:
-        return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
-    case hipErrorSharedObjectInitFailed:
-        return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
-    case hipErrorOperatingSystem:
-        return "CUDA_ERROR_OPERATING_SYSTEM";
-    case hipErrorInvalidHandle:
-        return "CUDA_ERROR_INVALID_HANDLE";
-    case hipErrorIllegalState:
-        return "CUDA_ERROR_ILLEGAL_STATE";
-    case hipErrorNotFound:
-        return "CUDA_ERROR_NOT_FOUND";
-    case hipErrorNotReady:
-        return "CUDA_ERROR_NOT_READY";
-    case hipErrorIllegalAddress:
-        return "CUDA_ERROR_ILLEGAL_ADDRESS";
-    case hipErrorLaunchOutOfResources:
-        return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
-    case hipErrorLaunchTimeOut:
-        return "CUDA_ERROR_LAUNCH_TIMEOUT";
-    case hipErrorPeerAccessAlreadyEnabled:
-        return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
-    case hipErrorPeerAccessNotEnabled:
-        return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
-    case hipErrorSetOnActiveProcess:
-        return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
-    case hipErrorContextIsDestroyed:
-        return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
-    case hipErrorAssert:
-        return "CUDA_ERROR_ASSERT";
-    case hipErrorHostMemoryAlreadyRegistered:
-        return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
-    case hipErrorHostMemoryNotRegistered:
-        return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
-    case hipErrorLaunchFailure:
-        return "CUDA_ERROR_LAUNCH_FAILED";
-    case hipErrorNotSupported:
-        return "CUDA_ERROR_NOT_SUPPORTED";
-    case hipErrorUnknown:
-        return "CUDA_ERROR_UNKNOWN";
-    case hipErrorRuntimeMemory:
-        return "CUDA_ERROR_UNKNOWN";
-    case hipErrorRuntimeOther:
-        return "CUDA_ERROR_UNKNOWN";
-    case hipErrorCooperativeLaunchTooLarge:
-        return "CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE";
-    case hipErrorStreamCaptureUnsupported:
-        return "CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED";
-    case hipErrorStreamCaptureInvalidated:
-        return "CUDA_ERROR_STREAM_CAPTURE_INVALIDATED";
-    case hipErrorStreamCaptureMerge:
-        return "CUDA_ERROR_STREAM_CAPTURE_MERGE";
-    case hipErrorStreamCaptureUnmatched:
-        return "CUDA_ERROR_STREAM_CAPTURE_UNMATCHED";
-    case hipErrorStreamCaptureUnjoined:
-        return "CUDA_ERROR_STREAM_CAPTURE_UNJOINED";
-    case hipErrorStreamCaptureIsolation:
-        return "CUDA_ERROR_STREAM_CAPTURE_ISOLATION";
-    case hipErrorStreamCaptureImplicit:
-        return "CUDA_ERROR_STREAM_CAPTURE_IMPLICIT";
-    case hipErrorCapturedEvent:
-        return "CUDA_ERROR_CAPTURED_EVENT";
-    case hipErrorStreamCaptureWrongThread:
-        return "CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD";
-    case hipErrorGraphExecUpdateFailure:
-        return "CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE";
-    default:
-        return "CUDA_ERROR_UNKNOWN";
-    #endif
-    }
-}
+/**
+ * @addtogroup hipDrvGetErrorName hipDrvGetErrorName
+ * @{
+ * @ingroup ErrorTest
+ * `hipDrvGetErrorName(hipError_t hip_error, const char** error_string)` -
+ * Store the name of the given hip error, in text string form, in `error_string`.
+ */

-// Functional test case
-// Test case to verify the returned error name is same as generated error name.
-
-TEST_CASE("Unit_hipDrvGetErrorName_Functional") {
+/**
+ * Test Description
+ * ------------------------
+ *  - For every value in kErrorEnumerators, validate that hipDrvGetErrorName succeeds and
+ *    stores a non-null string equal to the one the ErrorName() helper returns for that value.
+ * Test source
+ * ------------------------
+ *  - unit/errorHandling/hipDrvGetErrorName.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.4
+ */
+TEST_CASE("Unit_hipDrvGetErrorName_Positive_Basic") {
  const char* error_string = nullptr;
-  hipError_t error_ret;
  const auto enumerator =
-      GENERATE(from_range(std::begin(kErrorEnumerators),
-                           std::end(kErrorEnumerators)));
-  error_ret = hipDrvGetErrorName(enumerator, &error_string);
+      GENERATE(from_range(std::begin(kErrorEnumerators), std::end(kErrorEnumerators)));
+  INFO("Error: " << enumerator);  // context for failure reports (Catch2 INFO)
+
+  HIP_CHECK(hipDrvGetErrorName(enumerator, &error_string));
+
  REQUIRE(error_string != nullptr);
  REQUIRE(strcmp(error_string, ErrorName(enumerator)) == 0);
-  REQUIRE(error_ret == hipSuccess);
 }

-// Negative test cases.
-
-TEST_CASE("Unit_hipDrvGetErrorName_Negative") {
+/**
+ * Test Description
+ * ------------------------
+ *  - Validate handling of invalid arguments:
+ *    -# When error enumerator is invalid (-1)
+ *      - Expected output: return "hipErrorInvalidValue"
+ *        (asserted unconditionally, i.e. on both AMD and NVIDIA builds)
+ *    -# When nullptr is passed as store location (AMD only, segfaults on NVIDIA)
+ *      - Expected output: return "hipErrorInvalidValue"
+ * Test source
+ * ------------------------
+ *  - unit/errorHandling/hipDrvGetErrorName.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.4
+ */
+TEST_CASE("Unit_hipDrvGetErrorName_Negative_Parameters") {
  const char* error_string = nullptr;
  SECTION("pass unknown value to hipError") {
-    REQUIRE((hipDrvGetErrorName(static_cast<hipError_t>(-1), &error_string))
-                                  == hipErrorInvalidValue);
+    HIP_CHECK_ERROR((hipDrvGetErrorName(static_cast<hipError_t>(-1), &error_string)),
+                    hipErrorInvalidValue);
  }
-  #if HT_AMD
+#if HT_AMD  // segfaults on NVIDIA
  SECTION("pass nullptr to error string") {
-    REQUIRE((hipDrvGetErrorString(static_cast<hipError_t>(0), nullptr))
-                                   == hipErrorInvalidValue);
+    HIP_CHECK_ERROR((hipDrvGetErrorName(hipErrorInvalidValue, nullptr)), hipErrorInvalidValue);
  }
-  #endif
+#endif  // HT_AMD
 }
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
@@ -17,247 +17,67 @@ OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
 #include <hip_test_common.hh>
-#include "errorEnumerators.h"

-// Local Function to return the error string.
+#include "error_handling_common.hh"

-static const char *ErrorString(hipError_t enumerator) {
-  switch (enumerator) {
-    case hipSuccess:
-      return "no error";
-    case hipErrorInvalidValue:
-      return "invalid argument";
-    case hipErrorOutOfMemory:
-      return "out of memory";
-    case hipErrorNotInitialized:
-      return "initialization error";
-    case hipErrorDeinitialized:
-      return "driver shutting down";
-    case hipErrorProfilerDisabled:
-      return "profiler disabled while using external profiling tool";
-    case hipErrorProfilerNotInitialized:
-    #if HT_AMD
-      return "profiler is not initialized";
-    #elif HT_NVIDIA
-      return "profiler not initialized: call cudaProfilerInitialize()";
-    #endif
-    case hipErrorProfilerAlreadyStarted:
-      return "profiler already started";
-    case hipErrorProfilerAlreadyStopped:
-      return "profiler already stopped";
-    #if HT_AMD
-    case hipErrorInvalidConfiguration:
-      return "invalid configuration argument";
-    #elif HT_NVIDIA
-      return "unknown error";
-    #endif
-    #if HT_AMD
-    case hipErrorInvalidPitchValue:
-      return "invalid pitch argument";
-    #elif HT_NVIDIA
-      return "unknown error";
-    #endif
-    #if HT_AMD
-    case hipErrorInvalidSymbol:
-      return "invalid device symbol";
-    #elif HT_NVIDIA
-      return "unknown error";
-    #endif
-    #if HT_AMD
-    case hipErrorInvalidDevicePointer:
-      return "invalid device pointer";
-    #elif HT_NVIDIA
-      return "unknown error";
-    #endif
-    #if HT_AMD
-    case hipErrorInvalidMemcpyDirection:
-      return "invalid copy direction for memcpy";
-    #elif HT_NVIDIA
-      return "unknown error";
-    #endif
-    #if HT_AMD
-    case hipErrorInsufficientDriver:
-      return "driver version is insufficient for runtime version";
-    #elif HT_NVIDIA
-      return "unknown error";
-    #endif
-    #if HT_AMD
-    case hipErrorMissingConfiguration:
-      return "__global__ function call is not configured";
-    #elif HT_NVIDIA
-      return "unknown error";
-    #endif
-    #if HT_AMD
-    case hipErrorPriorLaunchFailure:
-      return "unspecified launch failure in prior launch";
-    #elif HT_NVIDIA
-      return "unknown error";
-    #endif
-    #if HT_AMD
-    case hipErrorInvalidDeviceFunction:
-      return "invalid device function";
-    #elif HT_NVIDIA
-      return "unknown error";
-    #endif
-    case hipErrorNoDevice:
-    #if HT_AMD
-      return "no ROCm-capable device is detected";
-    #elif HT_NVIDIA
-      return "no CUDA-capable device is detected";
-    #endif
-    case hipErrorInvalidDevice:
-      return "invalid device ordinal";
-    case hipErrorInvalidImage:
-      return "device kernel image is invalid";
-    case hipErrorInvalidContext:
-      return "invalid device context";
-    case hipErrorContextAlreadyCurrent:
-    #if HT_AMD
-      return "context is already current context";
-    #elif HT_NVIDIA
-      return "context already current";
-    #endif
-    case hipErrorMapFailed:
-      return "mapping of buffer object failed";
-    case hipErrorUnmapFailed:
-      return "unmapping of buffer object failed";
-    case hipErrorArrayIsMapped:
-      return "array is mapped";
-    case hipErrorAlreadyMapped:
-      return "resource already mapped";
-    case hipErrorNoBinaryForGpu:
-      return "no kernel image is available for execution on the device";
-    case hipErrorAlreadyAcquired:
-      return "resource already acquired";
-    case hipErrorNotMapped:
-      return "resource not mapped";
-    case hipErrorNotMappedAsArray:
-      return "resource not mapped as array";
-    case hipErrorNotMappedAsPointer:
-      return "resource not mapped as pointer";
-    case hipErrorECCNotCorrectable:
-      return "uncorrectable ECC error encountered";
-    case hipErrorUnsupportedLimit:
-      return "limit is not supported on this architecture";
-    case hipErrorContextAlreadyInUse:
-      return "exclusive-thread device already in use by a different thread";
-    case hipErrorPeerAccessUnsupported:
-      return "peer access is not supported between these two devices";
-    case hipErrorInvalidKernelFile:
-    #if HT_AMD
-      return "invalid kernel file";
-    #elif HT_NVIDIA
-      return "a PTX JIT compilation failed";
-    #endif
-    case hipErrorInvalidGraphicsContext:
-      return "invalid OpenGL or DirectX context";
-    case hipErrorInvalidSource:
-      return "device kernel image is invalid";
-    case hipErrorFileNotFound:
-      return "file not found";
-    case hipErrorSharedObjectSymbolNotFound:
-      return "shared object symbol not found";
-    case hipErrorSharedObjectInitFailed:
-      return "shared object initialization failed";
-    case hipErrorOperatingSystem:
-      return "OS call failed or operation not supported on this OS";
-    case hipErrorInvalidHandle:
-      return "invalid resource handle";
-    case hipErrorIllegalState:
-      return "the operation cannot be performed in the present state";
-    case hipErrorNotFound:
-      return "named symbol not found";
-    case hipErrorNotReady:
-      return "device not ready";
-    case hipErrorIllegalAddress:
-      return "an illegal memory access was encountered";
-    case hipErrorLaunchOutOfResources:
-      return "too many resources requested for launch";
-    case hipErrorLaunchTimeOut:
-      return "the launch timed out and was terminated";
-    case hipErrorPeerAccessAlreadyEnabled:
-      return "peer access is already enabled";
-    case hipErrorPeerAccessNotEnabled:
-      return "peer access has not been enabled";
-    case hipErrorSetOnActiveProcess:
-      return "cannot set while device is active in this process";
-    case hipErrorContextIsDestroyed:
-      return "context is destroyed";
-    case hipErrorAssert:
-      return "device-side assert triggered";
-    case hipErrorHostMemoryAlreadyRegistered:
-      return "part or all of the requested memory range is already mapped";
-    case hipErrorHostMemoryNotRegistered:
-      return "pointer does not correspond to a registered memory region";
-    case hipErrorLaunchFailure:
-      return "unspecified launch failure";
-    case hipErrorCooperativeLaunchTooLarge:
-      return "too many blocks in cooperative launch";
-    case hipErrorNotSupported:
-      return "operation not supported";
-    case hipErrorStreamCaptureUnsupported:
-      return "operation not permitted when stream is capturing";
-    case hipErrorStreamCaptureInvalidated:
-      return "operation failed due to a previous error during capture";
-    case hipErrorStreamCaptureMerge:
-      return "operation would result in a merge of separate capture sequences";
-    case hipErrorStreamCaptureUnmatched:
-      return "capture was not ended in the same stream as it began";
-    case hipErrorStreamCaptureUnjoined:
-      return "capturing stream has unjoined work";
-    case hipErrorStreamCaptureIsolation:
-      return "dependency created on uncaptured work in another stream";
-    case hipErrorStreamCaptureImplicit:
-      return "operation would make the legacy stream depend on a capturing blocking stream";  //NOLINT
-    case hipErrorCapturedEvent:
-      return "operation not permitted on an event last recorded in a capturing stream";  //NOLINT
-    case hipErrorStreamCaptureWrongThread:
-      return "attempt to terminate a thread-local capture sequence from another thread";  //NOLINT
-    case hipErrorGraphExecUpdateFailure:
-      return "the graph update was not performed because it included changes which violated constraints specific to instantiated graph update";  //NOLINT
-    case hipErrorRuntimeMemory:
-      return "runtime memory call returned error";
-    case hipErrorRuntimeOther:
-      return "runtime call other than memory returned error";
-    case hipErrorUnknown:
-    default:
-    #if HT_AMD
-      return "unknown error";
-    #elif HT_NVIDIA
-      return "unknown error";
-    #endif
-  }
-}
+/**
+ * @addtogroup hipDrvGetErrorString hipDrvGetErrorString
+ * @{
+ * @ingroup ErrorTest
+ * `hipDrvGetErrorString(hipError_t hipError, const char** errorString)` -
+ * Store a handy text string message explaining the error which occurred in `errorString`.
+ */

-// Test case to verify the returned error string is
-// same as generated error string.
-
-TEST_CASE("Unit_hipDrvGetErrorString_Functional") {
+/**
+ * Test Description
+ * ------------------------
+ *  - For every value in kErrorEnumerators, validate that hipDrvGetErrorString succeeds and
+ *    stores a non-null string equal to the one the ErrorString() helper returns for that value.
+ * Test source
+ * ------------------------
+ *  - unit/errorHandling/hipDrvGetErrorString.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.4
+ */
+TEST_CASE("Unit_hipDrvGetErrorString_Positive_Basic") {
  const char* error_string = nullptr;
  const auto enumerator =
-      GENERATE(from_range(std::begin(kErrorEnumerators),
-                           std::end(kErrorEnumerators)));
-  hipError_t error_ret = hipDrvGetErrorString(enumerator, &error_string);
+      GENERATE(from_range(std::begin(kErrorEnumerators), std::end(kErrorEnumerators)));
+  INFO("Error: " << enumerator);  // context for failure reports (Catch2 INFO)
+
+  HIP_CHECK(hipDrvGetErrorString(enumerator, &error_string));
+
  REQUIRE(error_string != nullptr);
  REQUIRE(strcmp(error_string, ErrorString(enumerator)) == 0);
-  REQUIRE(error_ret == hipSuccess);
 }

-// Negative test cases.
-
-TEST_CASE("Unit_hipDrvGetErrorString_Negative") {
+/**
+ * Test Description
+ * ------------------------
+ *  - Validate handling of invalid arguments:
+ *    -# When error enumerator is invalid (-1)
+ *      - Expected output: return "hipErrorInvalidValue"
+ *    -# When nullptr is passed as store location (AMD only, segfaults on NVIDIA)
+ *      - Expected output: return "hipErrorInvalidValue"
+ * Test source
+ * ------------------------
+ *  - unit/errorHandling/hipDrvGetErrorString.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.4
+ */
+TEST_CASE("Unit_hipDrvGetErrorString_Negative_Parameters") {
  const char* error_string = nullptr;
  SECTION("pass unknown value to hipError") {
-    REQUIRE((hipDrvGetErrorString(static_cast<hipError_t>(-1), &error_string))
-                                  == hipErrorInvalidValue);
+    HIP_CHECK_ERROR((hipDrvGetErrorString(static_cast<hipError_t>(-1), &error_string)),
+                    hipErrorInvalidValue);
  }
-  #if HT_AMD
+#if HT_AMD  // segfaults on NVIDIA
  SECTION("pass nullptr to error string") {
-     REQUIRE((hipDrvGetErrorString(static_cast<hipError_t>(0), nullptr))
-                                   == hipErrorInvalidValue);
+    HIP_CHECK_ERROR((hipDrvGetErrorString(static_cast<hipError_t>(0), nullptr)),
+                    hipErrorInvalidValue);
  }
-  #endif
+#endif  // HT_AMD
 }
@@ -20,10 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-
-#include "errorEnumerators.h"
 #include <hip_test_common.hh>
-#include <hip/hip_runtime_api.h>
+
+#include "error_handling_common.hh"

 /**
 * @addtogroup hipGetErrorName hipGetErrorName
@@ -49,6 +48,7 @@ TEST_CASE("Unit_hipGetErrorName_Positive_Basic") {
  const char* error_string = nullptr;
  const auto enumerator =
      GENERATE(from_range(std::begin(kErrorEnumerators), std::end(kErrorEnumerators)));
+  INFO("Error: " << enumerator);

  error_string = hipGetErrorName(enumerator);

@@ -20,9 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */

-#include "errorEnumerators.h"
 #include <hip_test_common.hh>
-#include <hip/hip_runtime_api.h>
+
+#include "error_handling_common.hh"

 /**
 * @addtogroup hipGetErrorString hipGetErrorString
@@ -48,6 +48,7 @@ TEST_CASE("Unit_hipGetErrorString_Positive_Basic") {
  const char* error_string = nullptr;
  const auto enumerator =
      GENERATE(from_range(std::begin(kErrorEnumerators), std::end(kErrorEnumerators)));
+  INFO("Error: " << enumerator);

  error_string = hipGetErrorString(enumerator);

@@ -21,7 +21,6 @@ THE SOFTWARE.
 */

 #include <hip_test_common.hh>
-#include <hip/hip_runtime_api.h>
 #include <threaded_zig_zag_test.hh>

 /**
@@ -56,7 +55,8 @@ TEST_CASE("Unit_hipPeekAtLastError_Positive_Basic") {
 * Test Description
 * ------------------------
 *  - Validate that appropriate error is returned when working with multiple threads.
- *  - Validate that appropriate error is returned for getting the last erro when working with multiple threads.
+ *  - Validate that appropriate error is returned for getting the last error when working with
+ * multiple threads.
 *  - Cause error on purpose within one of the threads.
 * Test source
 * ------------------------
@@ -22,7 +22,7 @@ THE SOFTWARE.

 #include <hip_test_common.hh>
 #include <hip_test_kernels.hh>
-#include <hip_test_defgroups.hh>
+ 
 #include <stdlib.h>

 constexpr size_t buffer_size = (1024*1024);
@@ -4,6 +4,7 @@ set(TEST_SRC
    hipFuncSetSharedMemConfig.cc
    hipFuncSetAttribute.cc
    hipFuncGetAttributes.cc
+    hipLaunchKernel.cc
    hipLaunchCooperativeKernel.cc
    hipLaunchCooperativeKernelMultiDevice.cc
 )
@@ -12,6 +13,7 @@ if(HIP_PLATFORM MATCHES "amd")
    set(TEST_SRC ${TEST_SRC}
        hipExtLaunchKernel.cc
        hipExtLaunchMultiKernelMultiDevice.cc
+        launch_api.cc
    )
 endif()

@@ -49,19 +49,19 @@ TEST_CASE("Unit_hipExtLaunchKernel_Positive_Basic") {

 TEST_CASE("Unit_hipExtLaunchKernel_Positive_Parameters") {
  SECTION("blockDim.x == maxBlockDimX") {
-    const unsigned int x = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimX);
+    const unsigned int x = GetDeviceAttribute(hipDeviceAttributeMaxBlockDimX, 0);
    HIP_CHECK(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1}, dim3{x, 1, 1},
                                 nullptr, 0, nullptr, nullptr, nullptr, 0u));
  }

  SECTION("blockDim.y == maxBlockDimY") {
-    const unsigned int y = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimY);
+    const unsigned int y = GetDeviceAttribute(hipDeviceAttributeMaxBlockDimY, 0);
    HIP_CHECK(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1}, dim3{y, 1, 1},
                                 nullptr, 0, nullptr, nullptr, nullptr, 0u));
  }

  SECTION("blockDim.z == maxBlockDimZ") {
-    const unsigned int z = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimZ);
+    const unsigned int z = GetDeviceAttribute(hipDeviceAttributeMaxBlockDimZ, 0);
    HIP_CHECK(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1}, dim3{z, 1, 1},
                                 nullptr, 0, nullptr, nullptr, nullptr, 0u));
  }
@@ -111,28 +111,28 @@ TEST_CASE("Unit_hipExtLaunchKernel_Negative_Parameters") {
  }

  SECTION("blockDim.x > maxBlockDimX") {
-    const unsigned int x = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimX) + 1u;
+    const unsigned int x = GetDeviceAttribute(hipDeviceAttributeMaxBlockDimX, 0) + 1u;
    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
                                       dim3{x, 1, 1}, nullptr, 0, nullptr, nullptr, nullptr, 0u),
                    hipErrorInvalidConfiguration);
  }

  SECTION("blockDim.y > maxBlockDimY") {
-    const unsigned int y = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimY) + 1u;
+    const unsigned int y = GetDeviceAttribute(hipDeviceAttributeMaxBlockDimY, 0) + 1u;
    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
                                       dim3{1, y, 1}, nullptr, 0, nullptr, nullptr, nullptr, 0u),
                    hipErrorInvalidConfiguration);
  }

  SECTION("blockDim.z > maxBlockDimZ") {
-    const unsigned int z = GetDeviceAttribute(0, hipDeviceAttributeMaxBlockDimZ) + 1u;
+    const unsigned int z = GetDeviceAttribute(hipDeviceAttributeMaxBlockDimZ, 0) + 1u;
    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
                                       dim3{1, 1, z}, nullptr, 0, nullptr, nullptr, nullptr, 0u),
                    hipErrorInvalidConfiguration);
  }

  SECTION("blockDim.x * blockDim.y * blockDim.z > maxThreadsPerBlock") {
-    const unsigned int max = GetDeviceAttribute(0, hipDeviceAttributeMaxThreadsPerBlock);
+    const unsigned int max = GetDeviceAttribute(hipDeviceAttributeMaxThreadsPerBlock, 0);
    const unsigned int dim = std::ceil(std::cbrt(max));
    HIP_CHECK_ERROR(
        hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1}, dim3{dim, dim, dim},
@@ -141,7 +141,7 @@ TEST_CASE("Unit_hipExtLaunchKernel_Negative_Parameters") {
  }

  SECTION("sharedMemBytes > maxSharedMemoryPerBlock") {
-    const unsigned int max = GetDeviceAttribute(0, hipDeviceAttributeMaxSharedMemoryPerBlock) + 1u;
+    const unsigned int max = GetDeviceAttribute(hipDeviceAttributeMaxSharedMemoryPerBlock, 0) + 1u;
    HIP_CHECK_ERROR(hipExtLaunchKernel(reinterpret_cast<void*>(kernel), dim3{1, 1, 1},
                                       dim3{1, 1, 1}, nullptr, max, nullptr, nullptr, nullptr, 0u),
                    hipErrorOutOfMemory);
--- a/Показать больше
+++ b/Показать больше