SWDEV-472723 - Correct file format and remove trailing spaces

Change-Id: Ie40c763e9391fa36d6c890cd0a171659a1502a83 [ROCm/hip-tests commit: 5d042c80fa]
2024-07-10 16:06:00 -04:00
@@ -0,0 +1,20 @@
+# Set the default behavior, in case people don't have core.autocrlf set.
+* text=auto
+
+# Explicitly declare text files you want to always be normalized and converted
+# to have LF line endings on checkout.
+*.c text eol=lf
+*.cpp text eol=lf
+*.cc text eol=lf
+*.h text eol=lf
+*.hpp text eol=lf
+*.txt text eol=lf
+
+# Define files to support auto-remove trailing white space
+# Need to run the command below, before adding modified file(s) to the staging area
+# git config filter.trimspace.clean 'sed -e "s/[[:space:]]*$//g"'
+*.cpp filter=trimspace
+*.c filter=trimspace
+*.h filter=trimspace
+*.hpp filter=trimspace
+*.md filter=trimspace
@@ -180,7 +180,7 @@ hipcc <path_to_test.cpp> -I<HIP_SRC_DIR>/tests/catch/include <HIP_SRC_DIR>/tests
 ## Debugging support
 Catch2 allows multiple ways in which you can debug the test case.
 - `-b` options breaks into a debugger as soon as there is a failure encountered [Catch2 Options Reference](https://github.com/catchorg/Catch2/blob/devel/docs/command-line.md#breaking-into-the-debugger)
- Catch2 provided [logging macro](https://github.com/catchorg/Catch2/blob/v2.13.6/docs/logging.md#top) that print useful information on test case failure 
+- Catch2 provides [logging macros](https://github.com/catchorg/Catch2/blob/v2.13.6/docs/logging.md#top) that print useful information on test case failure
 - User can also call [CATCH_BREAK_INTO_DEBUGGER](https://github.com/catchorg/Catch2/blob/devel/docs/configuration.md#overriding-catchs-debug-break--b) macro to break at a certain point in a test case.
 - User can also mention filename.cc:__LineNumber__ to break into a test case via gdb.

@@ -1,119 +1,119 @@
-/*
-Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_common.hh>
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-
-// Test case to validate atomicInc and atomicDec functions.
-// if TestToRun=1, then atomicInc function will be tested and validated
-// if TestToRun=2, then atomicDec function will be tested and validated.
-
-
-// kernel function for atomicInc
-static __global__ void AtomicCheckInc(int* g_ptr) {
-  atomicInc(reinterpret_cast<unsigned int*>(&g_ptr[0]), 17);
-}
-
-// kernel function for atomicDec
-static __global__ void AtomicCheckDec(int* g_ptr) {
-  atomicDec(reinterpret_cast<unsigned int*>(&g_ptr[0]), 25);
-}
-
-// verify results for atomicInc
-static int verifyResultInc(int value) {
-  int limit = 17;
-  value = (value >= limit) ? 0 : value + 1;
-  return value;
-}
-
-// verify results for atomicDec
-static int verifyResultDec(int value) {
-  int limit = 25;
-  value = ((value == 0) || (value > limit)) ? limit : value - 1;
-  return value;
-}
-
-// common fuction to launch atomic functions kernel.
-static void launchAtomicFunction(int *Hptr, int val, int TestToRun) {
-  unsigned int memSize = sizeof(int) * 1;
-  int *dptr{nullptr};
-  // allocate device memory
-  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dptr), memSize));
-  // copy host memory to device
-  HIP_CHECK(hipMemcpy(dptr, Hptr, memSize, hipMemcpyHostToDevice));
-  // launch kernel function
-  if (TestToRun == 1) {
-    AtomicCheckInc<<<1, 1>>>(dptr);
-  } else if (TestToRun == 2) {
-    AtomicCheckDec<<<1, 1>>>(dptr);
-  }
-  // copy back from device to host
-  HIP_CHECK(hipMemcpy(Hptr, dptr, memSize, hipMemcpyDeviceToHost));
-  // verify the results.
-  if (TestToRun == 1) {
-    int result = verifyResultInc(val);
-    REQUIRE(result == Hptr[0]);
-  } else if (TestToRun == 2) {
-    int result = verifyResultDec(val);
-    REQUIRE(result == Hptr[0]);
-  }
-  // Cleanup memory
-  HIP_CHECK(hipFree(dptr));
-}
-
-TEST_CASE("Unit_AtomicFunctions_Inc") {
-  int *Hptr{nullptr};
-  int val;
-  // Allocate Host memory
-  Hptr = reinterpret_cast<int*>(malloc(sizeof(int)));
-  SECTION("Test case when value is lesser than limit") {
-    val = Hptr[0] = 10;
-    launchAtomicFunction(Hptr, val, 1);
-  }
-  SECTION("Test case when value is greater than limit") {
-    val = Hptr[0] = 20;
-    launchAtomicFunction(Hptr, val, 1);
-  }
-  SECTION("Test case when value is equal to the limit") {
-    val = Hptr[0] = 17;
-    launchAtomicFunction(Hptr, val, 1);
-  }
-  free(Hptr);
-}
-
-TEST_CASE("Unit_AtomicFunctions_Dec") {
-  int *Hptr{nullptr};
-  int val;
-  // Allocate Host memory
-  Hptr = reinterpret_cast<int*>(malloc(sizeof(int)));
-  SECTION("Test case when value is less than limit") {
-    val = Hptr[0] = 4;
-    launchAtomicFunction(Hptr, val, 2);
-  }
-  SECTION("Test case when value is greater than limit") {
-    val = Hptr[0] = 31;
-    launchAtomicFunction(Hptr, val, 2);
-  }
-  SECTION("Test case when value is equal to the limit") {
-    val = Hptr[0] = 25;
-    launchAtomicFunction(Hptr, val, 2);
-  }
-  free(Hptr);
-}
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+
+// Test case to validate atomicInc and atomicDec functions.
+// if TestToRun=1, then atomicInc function will be tested and validated
+// if TestToRun=2, then atomicDec function will be tested and validated.
+
+
+// Kernel: one atomicInc on g_ptr[0] with a wrap-around limit of 17.
+static __global__ void AtomicCheckInc(int* g_ptr) {
+  atomicInc(reinterpret_cast<unsigned int*>(g_ptr), 17);
+}
+
+// Kernel: one atomicDec on g_ptr[0] with a wrap-around limit of 25.
+static __global__ void AtomicCheckDec(int* g_ptr) {
+  atomicDec(reinterpret_cast<unsigned int*>(g_ptr), 25);
+}
+
+// Expected value after one atomicInc(ptr, 17) applied to `value`.
+static int verifyResultInc(int value) {
+  constexpr int kWrapAt = 17;  // same limit the AtomicCheckInc kernel passes
+  if (value >= kWrapAt) return 0;
+  return value + 1;
+}
+
+// Expected value after one atomicDec(ptr, 25) applied to `value`.
+static int verifyResultDec(int value) {
+  constexpr int kResetTo = 25;  // same limit the AtomicCheckDec kernel passes
+  const bool wraps = (value == 0) || (value > kResetTo);
+  return wraps ? kResetTo : value - 1;
+}
+
+// Runs one atomicInc (TestToRun == 1) or atomicDec (TestToRun == 2) kernel on
+// Hptr[0]; `val` is the value Hptr[0] held before the call.
+static void launchAtomicFunction(int *Hptr, int val, int TestToRun) {
+  const size_t memSize = sizeof(int);
+  int *dptr{nullptr};
+  // allocate device memory and copy the host value to it
+  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dptr), memSize));
+  HIP_CHECK(hipMemcpy(dptr, Hptr, memSize, hipMemcpyHostToDevice));
+  // launch kernel function
+  if (TestToRun == 1) {
+    AtomicCheckInc<<<1, 1>>>(dptr);
+  } else if (TestToRun == 2) {
+    AtomicCheckDec<<<1, 1>>>(dptr);
+  }
+  HIP_CHECK(hipGetLastError());  // surface launch failures immediately
+  // copy back from device to host
+  HIP_CHECK(hipMemcpy(Hptr, dptr, memSize, hipMemcpyDeviceToHost));
+  // Free device memory before asserting: a failing REQUIRE aborts the test
+  // case, so a hipFree placed after it would be skipped and dptr leaked.
+  HIP_CHECK(hipFree(dptr));
+  // verify the results.
+  if (TestToRun == 1) {
+    REQUIRE(verifyResultInc(val) == Hptr[0]);
+  } else if (TestToRun == 2) {
+    REQUIRE(verifyResultDec(val) == Hptr[0]);
+  }
+}
+
+TEST_CASE("Unit_AtomicFunctions_Inc") {
+  // Host value lives on the stack: no allocation to null-check, and nothing
+  // leaks when a failing REQUIRE in launchAtomicFunction aborts the test case.
+  int hostValue{0};
+  int *Hptr = &hostValue;
+  int val;
+  SECTION("Test case when value is lesser than limit") {
+    val = Hptr[0] = 10;
+    launchAtomicFunction(Hptr, val, 1);
+  }
+  SECTION("Test case when value is greater than limit") {
+    val = Hptr[0] = 20;
+    launchAtomicFunction(Hptr, val, 1);
+  }
+  SECTION("Test case when value is equal to the limit") {
+    val = Hptr[0] = 17;
+    launchAtomicFunction(Hptr, val, 1);
+  }
+}
+
+TEST_CASE("Unit_AtomicFunctions_Dec") {
+  // Host value lives on the stack: no allocation to null-check, and nothing
+  // leaks when a failing REQUIRE in launchAtomicFunction aborts the test case.
+  int hostValue{0};
+  int *Hptr = &hostValue;
+  int val;
+  SECTION("Test case when value is less than limit") {
+    val = Hptr[0] = 4;
+    launchAtomicFunction(Hptr, val, 2);
+  }
+  SECTION("Test case when value is greater than limit") {
+    val = Hptr[0] = 31;
+    launchAtomicFunction(Hptr, val, 2);
+  }
+  SECTION("Test case when value is equal to the limit") {
+    val = Hptr[0] = 25;
+    launchAtomicFunction(Hptr, val, 2);
+  }
+}
@@ -1,81 +1,81 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
-
-#pragma GCC diagnostic ignored "-Wall"
-#pragma clang diagnostic ignored "-Wunused-variable"
-
-__device__ void double_precision_intrinsics() {
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __dadd_rd(0.0, 1.0);
-#endif
-    __dadd_rn(0.0, 1.0);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __dadd_ru(0.0, 1.0);
-    __dadd_rz(0.0, 1.0);
-    __ddiv_rd(0.0, 1.0);
-#endif
-    __ddiv_rn(0.0, 1.0);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __ddiv_ru(0.0, 1.0);
-    __ddiv_rz(0.0, 1.0);
-    __dmul_rd(1.0, 2.0);
-#endif
-    __dmul_rn(1.0, 2.0);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __dmul_ru(1.0, 2.0);
-    __dmul_rz(1.0, 2.0);
-    __drcp_rd(2.0);
-#endif
-    __drcp_rn(2.0);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __drcp_ru(2.0);
-    __drcp_rz(2.0);
-    __dsqrt_rd(4.0);
-#endif
-    __dsqrt_rn(4.0);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __dsqrt_ru(4.0);
-    __dsqrt_rz(4.0);
-    __dsub_rd(2.0, 1.0);
-#endif
-    __dsub_rn(2.0, 1.0);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __dsub_ru(2.0, 1.0);
-    __dsub_rz(2.0, 1.0);
-    __fma_rd(1.0, 2.0, 3.0);
-#endif
-    __fma_rn(1.0, 2.0, 3.0);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __fma_ru(1.0, 2.0, 3.0);
-    __fma_rz(1.0, 2.0, 3.0);
-#endif
-}
-
-__global__ void compileDoublePrecisionIntrinsics(int) {
-    double_precision_intrinsics();
-}
-
-TEST_CASE("Unit_DoublePrecisionIntrinsics") {
-  hipLaunchKernelGGL(compileDoublePrecisionIntrinsics, dim3(1, 1, 1),
-                                             dim3(1, 1, 1), 0, 0, 1);
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+
+#pragma GCC diagnostic ignored "-Wall"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+__device__ void double_precision_intrinsics() {
+#ifdef OCML_BASIC_ROUNDED_OPERATIONS  // _rd/_ru/_rz variants are optional
+  __dadd_rd(0.0, 1.0);
+#endif
+  __dadd_rn(0.0, 1.0);
+#ifdef OCML_BASIC_ROUNDED_OPERATIONS
+  __dadd_ru(0.0, 1.0);
+  __dadd_rz(0.0, 1.0);
+  __ddiv_rd(0.0, 1.0);
+#endif
+  __ddiv_rn(0.0, 1.0);
+#ifdef OCML_BASIC_ROUNDED_OPERATIONS
+  __ddiv_ru(0.0, 1.0);
+  __ddiv_rz(0.0, 1.0);
+  __dmul_rd(1.0, 2.0);
+#endif
+  __dmul_rn(1.0, 2.0);
+#ifdef OCML_BASIC_ROUNDED_OPERATIONS
+  __dmul_ru(1.0, 2.0);
+  __dmul_rz(1.0, 2.0);
+  __drcp_rd(2.0);
+#endif
+  __drcp_rn(2.0);
+#ifdef OCML_BASIC_ROUNDED_OPERATIONS
+  __drcp_ru(2.0);
+  __drcp_rz(2.0);
+  __dsqrt_rd(4.0);
+#endif
+  __dsqrt_rn(4.0);
+#ifdef OCML_BASIC_ROUNDED_OPERATIONS
+  __dsqrt_ru(4.0);
+  __dsqrt_rz(4.0);
+  __dsub_rd(2.0, 1.0);
+#endif
+  __dsub_rn(2.0, 1.0);
+#ifdef OCML_BASIC_ROUNDED_OPERATIONS
+  __dsub_ru(2.0, 1.0);
+  __dsub_rz(2.0, 1.0);
+  __fma_rd(1.0, 2.0, 3.0);
+#endif
+  __fma_rn(1.0, 2.0, 3.0);
+#ifdef OCML_BASIC_ROUNDED_OPERATIONS
+  __fma_ru(1.0, 2.0, 3.0);
+  __fma_rz(1.0, 2.0, 3.0);
+#endif  // OCML_BASIC_ROUNDED_OPERATIONS
+}
+
+__global__ void compileDoublePrecisionIntrinsics(int) {
+    double_precision_intrinsics();  // the unnamed int parameter is unused
+}
+
+TEST_CASE("Unit_DoublePrecisionIntrinsics") {
+  compileDoublePrecisionIntrinsics<<<dim3(1, 1, 1), dim3(1, 1, 1), 0, 0>>>(1);
+  HIP_CHECK(hipDeviceSynchronize());  // wait for the kernel; report any failure
+}
@@ -1,133 +1,133 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
-
-
-#pragma GCC diagnostic ignored "-Wall"
-#pragma clang diagnostic ignored "-Wunused-variable"
-
-__device__ void double_precision_math_functions() {
-    int iX;
-    double fX, fY;
-
-    acos(1.0);
-    acosh(1.0);
-    asin(0.0);
-    asinh(0.0);
-    atan(0.0);
-    atan2(0.0, 1.0);
-    atanh(0.0);
-    cbrt(0.0);
-    ceil(0.0);
-    copysign(1.0, -2.0);
-    cos(0.0);
-    cosh(0.0);
-    cospi(0.0);
-    cyl_bessel_i0(0.0);
-    cyl_bessel_i1(0.0);
-    erf(0.0);
-    erfc(0.0);
-    erfcinv(2.0);
-    erfcx(0.0);
-    erfinv(1.0);
-    exp(0.0);
-    exp10(0.0);
-    exp2(0.0);
-    expm1(0.0);
-    fabs(1.0);
-    fdim(1.0, 0.0);
-    floor(0.0);
-    fma(1.0, 2.0, 3.0);
-    fmax(0.0, 0.0);
-    fmin(0.0, 0.0);
-    fmod(0.0, 1.0);
-    frexp(0.0, &iX);
-    hypot(1.0, 0.0);
-    ilogb(1.0);
-    isfinite(0.0);
-    isinf(0.0);
-    isnan(0.0);
-    j0(0.0);
-    j1(0.0);
-    jn(-1.0, 1.0);
-    ldexp(0.0, 0);
-    lgamma(1.0);
-    llrint(0.0);
-    llround(0.0);
-    log(1.0);
-    log10(1.0);
-    log1p(-1.0);
-    log2(1.0);
-    logb(1.0);
-    lrint(0.0);
-    lround(0.0);
-    modf(0.0, &fX);
-    nan("1");
-    nearbyint(0.0);
-    nextafter(0.0, 0.0);
-    fX = 1.0;
-    norm(1, &fX);
-    norm3d(1.0, 0.0, 0.0);
-    norm4d(1.0, 0.0, 0.0, 0.0);
-    normcdf(0.0);
-    normcdfinv(1.0);
-    pow(1.0, 0.0);
-    rcbrt(1.0);
-    remainder(2.0, 1.0);
-    remquo(1.0, 2.0, &iX);
-    rhypot(0.0, 1.0);
-    rint(1.0);
-    fX = 1.0;
-    rnorm(1, &fX);
-    rnorm3d(0.0, 0.0, 1.0);
-    rnorm4d(0.0, 0.0, 0.0, 1.0);
-    round(0.0);
-    rsqrt(1.0);
-    scalbln(0.0, 1);
-    scalbn(0.0, 1);
-    signbit(1.0);
-    sin(0.0);
-#if HT_AMD
-    // NV A100 has a bug in sincos(), so temporarily disbale it
-    sincos(0.0, &fX, &fY);
-#endif
-    sincospi(0.0, &fX, &fY);
-    sinh(0.0);
-    sinpi(0.0);
-    sqrt(0.0);
-    tan(0.0);
-    tanh(0.0);
-    tgamma(2.0);
-    trunc(0.0);
-    y0(1.0);
-    y1(1.0);
-    yn(1, 1.0);
-}
-
-__global__ void compileDoublePrecisionMathOnDevice(int) {
-    double_precision_math_functions();
-}
-
-TEST_CASE("Unit_DoublePrecisionMathDevice") {
-  hipLaunchKernelGGL(compileDoublePrecisionMathOnDevice, dim3(1, 1, 1),
-                                               dim3(1, 1, 1), 0, 0, 1);
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+
+
+#pragma GCC diagnostic ignored "-Wall"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+__device__ void double_precision_math_functions() {
+    int iX;  // receives the integer outputs of frexp and remquo
+    double fX, fY;  // outputs of modf/sincos/sincospi; fX also feeds norm/rnorm
+
+    acos(1.0);
+    acosh(1.0);
+    asin(0.0);
+    asinh(0.0);
+    atan(0.0);
+    atan2(0.0, 1.0);
+    atanh(0.0);
+    cbrt(0.0);
+    ceil(0.0);
+    copysign(1.0, -2.0);
+    cos(0.0);
+    cosh(0.0);
+    cospi(0.0);
+    cyl_bessel_i0(0.0);
+    cyl_bessel_i1(0.0);
+    erf(0.0);
+    erfc(0.0);
+    erfcinv(2.0);
+    erfcx(0.0);
+    erfinv(1.0);
+    exp(0.0);
+    exp10(0.0);
+    exp2(0.0);
+    expm1(0.0);
+    fabs(1.0);
+    fdim(1.0, 0.0);
+    floor(0.0);
+    fma(1.0, 2.0, 3.0);
+    fmax(0.0, 0.0);
+    fmin(0.0, 0.0);
+    fmod(0.0, 1.0);
+    frexp(0.0, &iX);
+    hypot(1.0, 0.0);
+    ilogb(1.0);
+    isfinite(0.0);
+    isinf(0.0);
+    isnan(0.0);
+    j0(0.0);
+    j1(0.0);
+    jn(-1.0, 1.0);  // NOTE(review): order is passed as a double here - confirm
+    ldexp(0.0, 0);
+    lgamma(1.0);
+    llrint(0.0);
+    llround(0.0);
+    log(1.0);
+    log10(1.0);
+    log1p(-1.0);
+    log2(1.0);
+    logb(1.0);
+    lrint(0.0);
+    lround(0.0);
+    modf(0.0, &fX);
+    nan("1");
+    nearbyint(0.0);
+    nextafter(0.0, 0.0);
+    fX = 1.0;
+    norm(1, &fX);
+    norm3d(1.0, 0.0, 0.0);
+    norm4d(1.0, 0.0, 0.0, 0.0);
+    normcdf(0.0);
+    normcdfinv(1.0);
+    pow(1.0, 0.0);
+    rcbrt(1.0);
+    remainder(2.0, 1.0);
+    remquo(1.0, 2.0, &iX);
+    rhypot(0.0, 1.0);
+    rint(1.0);
+    fX = 1.0;
+    rnorm(1, &fX);
+    rnorm3d(0.0, 0.0, 1.0);
+    rnorm4d(0.0, 0.0, 0.0, 1.0);
+    round(0.0);
+    rsqrt(1.0);
+    scalbln(0.0, 1);
+    scalbn(0.0, 1);
+    signbit(1.0);
+    sin(0.0);
+#if HT_AMD
+    // NV A100 has a bug in sincos(), so temporarily disable it
+    sincos(0.0, &fX, &fY);
+#endif
+    sincospi(0.0, &fX, &fY);
+    sinh(0.0);
+    sinpi(0.0);
+    sqrt(0.0);
+    tan(0.0);
+    tanh(0.0);
+    tgamma(2.0);
+    trunc(0.0);
+    y0(1.0);
+    y1(1.0);
+    yn(1, 1.0);
+}
+
+__global__ void compileDoublePrecisionMathOnDevice(int) {
+    double_precision_math_functions();  // the unnamed int parameter is unused
+}
+
+TEST_CASE("Unit_DoublePrecisionMathDevice") {
+  compileDoublePrecisionMathOnDevice<<<dim3(1, 1, 1), dim3(1, 1, 1), 0, 0>>>(1);
+  HIP_CHECK(hipDeviceSynchronize());  // wait for the kernel; report any failure
+}
@@ -1,117 +1,117 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_common.hh>
-#include <cmath>
-
-#pragma GCC diagnostic ignored "-Wall"
-#pragma clang diagnostic ignored "-Wunused-variable"
-
-__host__ static void double_precision_math_functions() {
-    int iX;
-    double fX, fY;
-
-    acos(1.0);
-    acosh(1.0);
-    asin(0.0);
-    asinh(0.0);
-    atan(0.0);
-    atan2(0.0, 1.0);
-    atanh(0.0);
-    cbrt(0.0);
-    ceil(0.0);
-    copysign(1.0, -2.0);
-    cos(0.0);
-    cosh(0.0);
-    erf(0.0);
-    erfc(0.0);
-    exp(0.0);
-    #ifdef __unix__
-    exp10(0.0);
-    #endif
-    exp2(0.0);
-    expm1(0.0);
-    fabs(1.0);
-    fdim(1.0, 0.0);
-    floor(0.0);
-    fma(1.0, 2.0, 3.0);
-    fmax(0.0, 0.0);
-    fmin(0.0, 0.0);
-    fmod(0.0, 1.0);
-    frexp(0.0, &iX);
-    hypot(1.0, 0.0);
-    ilogb(1.0);
-    std::isfinite(0.0);
-    std::isinf(0.0);
-    std::isnan(0.0);
-    #ifdef __unix__
-    j0(0.0);
-    j1(0.0);
-    jn(-1.0, 1.0);
-    #elif _WIN64
-    _j0(0.0);
-    _j1(0.0);
-    _jn(-1.0, 1.0);
-    #endif
-    ldexp(0.0, 0);
-    llrint(0.0);
-    llround(0.0);
-    log(1.0);
-    log10(1.0);
-    log1p(-1.0);
-    log2(1.0);
-    logb(1.0);
-    lrint(0.0);
-    lround(0.0);
-    modf(0.0, &fX);
-    nan("1");
-    nearbyint(0.0);
-    fX = 1.0;
-    pow(1.0, 0.0);
-    remainder(2.0, 1.0);
-    remquo(1.0, 2.0, &iX);
-    rint(1.0);
-    round(0.0);
-    scalbln(0.0, 1);
-    scalbn(0.0, 1);
-    std::signbit(1.0);
-    sin(0.0);
-    #ifdef _unix__
-    sincos(0.0, &fX, &fY);
-    #endif
-    sinh(0.0);
-    sqrt(0.0);
-    tan(0.0);
-    tanh(0.0);
-    tgamma(2.0);
-    trunc(0.0);
-    #ifdef __unix__
-    y0(1.0);
-    y1(1.0);
-    yn(1, 1.0);
-    #elif _WIN64
-    _y0(1.0);
-    _y1(1.0);
-    _yn(1, 1.0);
-    #endif
-}
-
-TEST_CASE("Unit_DoublePrecisionMathHost") {
-  double_precision_math_functions();
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <cmath>
+
+#pragma GCC diagnostic ignored "-Wall"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+__host__ static void double_precision_math_functions() {
+    int iX;  // receives the integer outputs of frexp and remquo
+    double fX, fY;  // receive the outputs of modf and sincos
+
+    acos(1.0);
+    acosh(1.0);
+    asin(0.0);
+    asinh(0.0);
+    atan(0.0);
+    atan2(0.0, 1.0);
+    atanh(0.0);
+    cbrt(0.0);
+    ceil(0.0);
+    copysign(1.0, -2.0);
+    cos(0.0);
+    cosh(0.0);
+    erf(0.0);
+    erfc(0.0);
+    exp(0.0);
+    #ifdef __unix__
+    exp10(0.0);
+    #endif
+    exp2(0.0);
+    expm1(0.0);
+    fabs(1.0);
+    fdim(1.0, 0.0);
+    floor(0.0);
+    fma(1.0, 2.0, 3.0);
+    fmax(0.0, 0.0);
+    fmin(0.0, 0.0);
+    fmod(0.0, 1.0);
+    frexp(0.0, &iX);
+    hypot(1.0, 0.0);
+    ilogb(1.0);
+    std::isfinite(0.0);
+    std::isinf(0.0);
+    std::isnan(0.0);
+    #ifdef __unix__
+    j0(0.0);
+    j1(0.0);
+    jn(-1.0, 1.0);
+    #elif _WIN64
+    _j0(0.0);
+    _j1(0.0);
+    _jn(-1.0, 1.0);
+    #endif
+    ldexp(0.0, 0);
+    llrint(0.0);
+    llround(0.0);
+    log(1.0);
+    log10(1.0);
+    log1p(-1.0);
+    log2(1.0);
+    logb(1.0);
+    lrint(0.0);
+    lround(0.0);
+    modf(0.0, &fX);
+    nan("1");
+    nearbyint(0.0);
+    fX = 1.0;
+    pow(1.0, 0.0);
+    remainder(2.0, 1.0);
+    remquo(1.0, 2.0, &iX);
+    rint(1.0);
+    round(0.0);
+    scalbln(0.0, 1);
+    scalbn(0.0, 1);
+    std::signbit(1.0);
+    sin(0.0);
+    #ifdef __unix__  // was misspelled "_unix__", so sincos was never compiled
+    sincos(0.0, &fX, &fY);
+    #endif
+    sinh(0.0);
+    sqrt(0.0);
+    tan(0.0);
+    tanh(0.0);
+    tgamma(2.0);
+    trunc(0.0);
+    #ifdef __unix__
+    y0(1.0);
+    y1(1.0);
+    yn(1, 1.0);
+    #elif _WIN64
+    _y0(1.0);
+    _y1(1.0);
+    _yn(1, 1.0);
+    #endif
+}
+
+TEST_CASE("Unit_DoublePrecisionMathHost") {
+  double_precision_math_functions();  // results are discarded; no assertions
+}
@@ -1,128 +1,128 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
-#include <hip/math_functions.h>
-
-__device__ void FloatMathPrecise() {
-    int iX;
-    float fX, fY;
-
-    acosf(1.0f);
-    acoshf(1.0f);
-    asinf(0.0f);
-    asinhf(0.0f);
-    atan2f(0.0f, 1.0f);
-    atanf(0.0f);
-    atanhf(0.0f);
-    cbrtf(0.0f);
-    fX = ceilf(0.0f);
-    fX = copysignf(1.0f, -2.0f);
-    cosf(0.0f);
-    coshf(0.0f);
-    cospif(0.0f);
-    cyl_bessel_i0f(0.0f);
-    cyl_bessel_i1f(0.0f);
-    erfcf(0.0f);
-    erfcinvf(2.0f);
-    erfcxf(0.0f);
-    erff(0.0f);
-    erfinvf(1.0f);
-    exp10f(0.0f);
-    exp2f(0.0f);
-    expf(0.0f);
-    expm1f(0.0f);
-    fX = fabsf(1.0f);
-    fdimf(1.0f, 0.0f);
-    fdividef(0.0f, 1.0f);
-    fX = floorf(0.0f);
-    fmaf(1.0f, 2.0f, 3.0f);
-    fX = fmaxf(0.0f, 0.0f);
-    fX = fminf(0.0f, 0.0f);
-    fmodf(0.0f, 1.0f);
-    frexpf(0.0f, &iX);
-    hypotf(1.0f, 0.0f);
-    ilogbf(1.0f);
-    isfinite(0.0f);
-    fX = isinf(0.0f);
-    fX = isnan(0.0f);
-    j0f(0.0f);
-    j1f(0.0f);
-    jnf(-1.0f, 1.0f);
-    ldexpf(0.0f, 0);
-    lgammaf(1.0f);
-    llrintf(0.0f);
-    llroundf(0.0f);
-    log10f(1.0f);
-    log1pf(-1.0f);
-    log2f(1.0f);
-    logbf(1.0f);
-    logf(1.0f);
-    lrintf(0.0f);
-    lroundf(0.0f);
-    modff(0.0f, &fX);
-    fX = nanf("1");
-    fX = nearbyintf(0.0f);
-    nextafterf(0.0f, 0.0f);
-    norm3df(1.0f, 0.0f, 0.0f);
-    norm4df(1.0f, 0.0f, 0.0f, 0.0f);
-    normcdff(0.0f);
-    normcdfinvf(1.0f);
-    fX = 1.0f;
-    normf(1, &fX);
-    powf(1.0f, 0.0f);
-    rcbrtf(1.0f);
-    remainderf(2.0f, 1.0f);
-    remquof(1.0f, 2.0f, &iX);
-    rhypotf(0.0f, 1.0f);
-    fY = rintf(1.0f);
-    rnorm3df(0.0f, 0.0f, 1.0f);
-    rnorm4df(0.0f, 0.0f, 0.0f, 1.0f);
-    fX = 1.0f;
-    rnormf(1, &fX);
-    fY = roundf(0.0f);
-    rsqrtf(1.0f);
-    scalblnf(0.0f, 1);
-    scalbnf(0.0f, 1);
-    signbit(1.0f);
-    sincosf(0.0f, &fX, &fY);
-    sincospif(0.0f, &fX, &fY);
-    sinf(0.0f);
-    sinhf(0.0f);
-    sinpif(0.0f);
-    sqrtf(0.0f);
-    tanf(0.0f);
-    tanhf(0.0f);
-    tgammaf(2.0f);
-    fY = truncf(0.0f);
-    y0f(1.0f);
-    y1f(1.0f);
-    ynf(1, 1.0f);
-}
-
-__global__ void CompileFloatMathPrecise(int) {
-  FloatMathPrecise();
-}
-
-TEST_CASE("Unit_FloatMathPrecise") {
-    hipLaunchKernelGGL(CompileFloatMathPrecise, dim3(1, 1, 1),
-                                      dim3(1, 1, 1), 0, 0, 1);
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+#include <hip/math_functions.h>
+
+__device__ void FloatMathPrecise() {
+    int iX;        // integer out-parameter for frexpf and remquof
+    float fX, fY;  // float scratch values and out-parameters
+    // Calls the single-precision math functions with constant arguments.
+    acosf(1.0f);
+    acoshf(1.0f);
+    asinf(0.0f);
+    asinhf(0.0f);
+    atan2f(0.0f, 1.0f);
+    atanf(0.0f);
+    atanhf(0.0f);
+    cbrtf(0.0f);
+    fX = ceilf(0.0f);
+    fX = copysignf(1.0f, -2.0f);
+    cosf(0.0f);
+    coshf(0.0f);
+    cospif(0.0f);
+    cyl_bessel_i0f(0.0f);
+    cyl_bessel_i1f(0.0f);
+    erfcf(0.0f);
+    erfcinvf(2.0f);
+    erfcxf(0.0f);
+    erff(0.0f);
+    erfinvf(1.0f);
+    exp10f(0.0f);
+    exp2f(0.0f);
+    expf(0.0f);
+    expm1f(0.0f);
+    fX = fabsf(1.0f);
+    fdimf(1.0f, 0.0f);
+    fdividef(0.0f, 1.0f);
+    fX = floorf(0.0f);
+    fmaf(1.0f, 2.0f, 3.0f);
+    fX = fmaxf(0.0f, 0.0f);
+    fX = fminf(0.0f, 0.0f);
+    fmodf(0.0f, 1.0f);
+    frexpf(0.0f, &iX);
+    hypotf(1.0f, 0.0f);
+    ilogbf(1.0f);
+    isfinite(0.0f);
+    fX = isinf(0.0f);
+    fX = isnan(0.0f);
+    j0f(0.0f);
+    j1f(0.0f);
+    jnf(-1.0f, 1.0f);  // NOTE(review): order is a float literal here but an int in ynf below
+    ldexpf(0.0f, 0);
+    lgammaf(1.0f);
+    llrintf(0.0f);
+    llroundf(0.0f);
+    log10f(1.0f);
+    log1pf(-1.0f);
+    log2f(1.0f);
+    logbf(1.0f);
+    logf(1.0f);
+    lrintf(0.0f);
+    lroundf(0.0f);
+    modff(0.0f, &fX);
+    fX = nanf("1");
+    fX = nearbyintf(0.0f);
+    nextafterf(0.0f, 0.0f);
+    norm3df(1.0f, 0.0f, 0.0f);
+    norm4df(1.0f, 0.0f, 0.0f, 0.0f);
+    normcdff(0.0f);
+    normcdfinvf(1.0f);
+    fX = 1.0f;  // one-element input for normf
+    normf(1, &fX);
+    powf(1.0f, 0.0f);
+    rcbrtf(1.0f);
+    remainderf(2.0f, 1.0f);
+    remquof(1.0f, 2.0f, &iX);
+    rhypotf(0.0f, 1.0f);
+    fY = rintf(1.0f);
+    rnorm3df(0.0f, 0.0f, 1.0f);
+    rnorm4df(0.0f, 0.0f, 0.0f, 1.0f);
+    fX = 1.0f;  // one-element input for rnormf
+    rnormf(1, &fX);
+    fY = roundf(0.0f);
+    rsqrtf(1.0f);
+    scalblnf(0.0f, 1);
+    scalbnf(0.0f, 1);
+    signbit(1.0f);
+    sincosf(0.0f, &fX, &fY);
+    sincospif(0.0f, &fX, &fY);
+    sinf(0.0f);
+    sinhf(0.0f);
+    sinpif(0.0f);
+    sqrtf(0.0f);
+    tanf(0.0f);
+    tanhf(0.0f);
+    tgammaf(2.0f);
+    fY = truncf(0.0f);
+    y0f(1.0f);
+    y1f(1.0f);
+    ynf(1, 1.0f);
+}
+
+__global__ void CompileFloatMathPrecise(int) {  // the int parameter is unnamed and unused
+  FloatMathPrecise();
+}
+
+TEST_CASE("Unit_FloatMathPrecise") {  // one block of one thread, kernel argument 1
+    hipLaunchKernelGGL(CompileFloatMathPrecise, dim3(1, 1, 1),
+                                      dim3(1, 1, 1), 0, 0, 1);
+}  // NOTE(review): no error check or synchronization follows the launch
@@ -1,68 +1,68 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
-#include <hip/device_functions.h>
-#include <algorithm>
-
-#pragma GCC diagnostic ignored "-Wall"
-#pragma clang diagnostic ignored "-Wunused-variable"
-
-__device__ void integer_intrinsics() {
-    __brev((unsigned int)10);
-    __brevll((uint64_t)10);
-    __byte_perm((unsigned int)0, (unsigned int)0, 0);
-    __clz(static_cast<int>(10));
-    __clzll((int64_t)10);
-    __ffs(static_cast<int>(10));
-    __ffsll((long long)(10)); // NOLINT
-    __funnelshift_l((unsigned int)0xfacefeed, (unsigned int)0xdeadbeef, 0);
-    __funnelshift_lc((unsigned int)0xfacefeed, (unsigned int)0xdeadbeef, 0);
-    __funnelshift_r((unsigned int)0xfacefeed, (unsigned int)0xdeadbeef, 0);
-    __funnelshift_rc((unsigned int)0xfacefeed, (unsigned int)0xdeadbeef, 0);
-    __hadd(static_cast<int>(1), static_cast<int>(3));
-    __mul24(static_cast<int>(1), static_cast<int>(2));
-    __mul64hi((int64_t)1, (int64_t)2);
-    __mulhi(static_cast<int>(1), static_cast<int>(2));
-    __popc((unsigned int)4);
-    __popcll((uint64_t)4);
-    int a = min(static_cast<int>(4), static_cast<int>(5));
-    int b = max(static_cast<int>(4), static_cast<int>(5));
-    __rhadd(static_cast<int>(1), static_cast<int>(2));
-    __sad(static_cast<int>(1), static_cast<int>(2), 0);
-    __uhadd((unsigned int)1, (unsigned int)3);
-    __umul24((unsigned int)1, (unsigned int)2);
-    __umul64hi((uint64_t)1, (uint64_t)2);
-    __umulhi((unsigned int)1, (unsigned int)2);
-    __urhadd((unsigned int)1, (unsigned int)2);
-    __usad((unsigned int)1, (unsigned int)2, 0);
-
-    assert(1);
-}
-
-__global__ void compileIntegerIntrinsics(int) {
-  integer_intrinsics();
-}
-
-TEST_CASE("Unit_IntegerIntrinsics") {
-    hipLaunchKernelGGL(compileIntegerIntrinsics, dim3(1, 1, 1),
-                                       dim3(1, 1, 1), 0, 0, 1);
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+#include <hip/device_functions.h>
+#include <algorithm>
+
+#pragma GCC diagnostic ignored "-Wall"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+__device__ void integer_intrinsics() {  // calls the integer intrinsics with constants
+    __brev((unsigned int)10);
+    __brevll((uint64_t)10);
+    __byte_perm((unsigned int)0, (unsigned int)0, 0);
+    __clz(static_cast<int>(10));
+    __clzll((int64_t)10);
+    __ffs(static_cast<int>(10));
+    __ffsll((long long)(10)); // NOLINT
+    __funnelshift_l((unsigned int)0xfacefeed, (unsigned int)0xdeadbeef, 0);
+    __funnelshift_lc((unsigned int)0xfacefeed, (unsigned int)0xdeadbeef, 0);
+    __funnelshift_r((unsigned int)0xfacefeed, (unsigned int)0xdeadbeef, 0);
+    __funnelshift_rc((unsigned int)0xfacefeed, (unsigned int)0xdeadbeef, 0);
+    __hadd(static_cast<int>(1), static_cast<int>(3));
+    __mul24(static_cast<int>(1), static_cast<int>(2));
+    __mul64hi((int64_t)1, (int64_t)2);
+    __mulhi(static_cast<int>(1), static_cast<int>(2));
+    __popc((unsigned int)4);
+    __popcll((uint64_t)4);
+    int a = min(static_cast<int>(4), static_cast<int>(5));  // a is never read
+    int b = max(static_cast<int>(4), static_cast<int>(5));  // b is never read
+    __rhadd(static_cast<int>(1), static_cast<int>(2));
+    __sad(static_cast<int>(1), static_cast<int>(2), 0);
+    __uhadd((unsigned int)1, (unsigned int)3);
+    __umul24((unsigned int)1, (unsigned int)2);
+    __umul64hi((uint64_t)1, (uint64_t)2);
+    __umulhi((unsigned int)1, (unsigned int)2);
+    __urhadd((unsigned int)1, (unsigned int)2);
+    __usad((unsigned int)1, (unsigned int)2, 0);
+    // assert(1) can never fire; presumably it only exercises device-side assert.
+    assert(1);
+}
+
+__global__ void compileIntegerIntrinsics(int) {  // the int parameter is unnamed and unused
+  integer_intrinsics();
+}
+
+TEST_CASE("Unit_IntegerIntrinsics") {  // one block of one thread, kernel argument 1
+    hipLaunchKernelGGL(compileIntegerIntrinsics, dim3(1, 1, 1),
+                                       dim3(1, 1, 1), 0, 0, 1);
+}  // NOTE(review): no error check or synchronization follows the launch
@@ -1,298 +1,298 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <string.h>
-#include <math.h>
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
-
-#include <algorithm>
-#include <type_traits>
-
-using namespace std;
-////////////////////////////////////////////////////////////////////////////////
-// Auto-Verification Code
-////////////////////////////////////////////////////////////////////////////////
-
-bool verifyBitwise(...) {
-    return true;
-}
-
-template<typename T, typename enable_if<is_integral<T>{}>::type* = nullptr>
-bool verifyBitwise(T* gpuData, int len) {
-  // Atomic and
-  T val = 0xff;
-  for (int i = 0; i < len; ++i) {
-    // 9th element should be 1
-    val &= (2 * i + 7);
-  }
-  REQUIRE(val == gpuData[8]);
-
-  // atomic Or
-  val = 0;
-  for (int i = 0; i < len; ++i) {
-    // 10th element should be 0xff
-    val |= (1 << i);
-  }
-  REQUIRE(val == gpuData[9]);
-
-  // atomic Xor
-  val = 0xff;
-
-  for (int i = 0; i < len; ++i) {
-    // 11th element should be 0xff
-    val ^= i;
-  }
-
-  REQUIRE(val == gpuData[10]);
-  return true;
-}
-
-bool verifySub(...) {
-  return true;
-}
-
-template<
-    typename T,
-    typename enable_if<
-        is_same<T, int>{} || is_same<T, unsigned int>{}>::type* = nullptr>
-bool verifySub(T* gpuData, int len) {
-  T val = 0;
-
-  for (int i = 0; i < len; ++i) {
-      val -= 10;
-  }
-
-  REQUIRE(val == gpuData[1]);
-  return true;
-}
-
-bool verifyExch(...) {
-  return true;
-}
-
-template<typename T, typename enable_if<!is_same<T, double> {}>::type* = nullptr> // NOLINT
-bool computeExchExch(T* gpuData, int len) {
-  T val = 0;
-
-  for (T i = 0; i < len; ++i) {
-      if (i == gpuData[2]) {
-          return true;
-          break;
-      }
-  }
-}
-
-bool VerifyIntegral(...) {
-  return true;
-}
-
-template<typename T, typename enable_if<is_integral<T>{}>::type* = nullptr>
-bool VerifyIntegral(T* gpuData, int len) {
-  // atomic Max
-  T val = 0;
-  for (int i = 0; i < len; ++i) {
-    // fourth element should be len-1
-    val = max(val, static_cast<T>(i));
-  }
-
-  REQUIRE(val == gpuData[3]);
-
-  // atomic Min
-  val = 1 << 8;
-
-  for (int i = 0; i < len; ++i) {
-      val = min(val, static_cast<T>(i));
-  }
-
-  REQUIRE(val == gpuData[4]);
-
-  // atomic Inc
-  T limit = 17;
-  val = 0;
-
-  for (int i = 0; i < len; ++i) {
-      val = (val >= limit) ? 0 : val + 1;
-  }
-
-  REQUIRE(val == gpuData[5]);
-
-  // atomic Dec
-  limit = 137;
-  val = 0;
-
-  for (int i = 0; i < len; ++i) {
-      val = ((val == 0) || (val > limit)) ? limit : val - 1;
-  }
-
-  REQUIRE(val == gpuData[6]);
-
-  // atomic CAS
-  for (int i = 0; i < len; ++i) {
-    // eighth element should be a member of [0, len)
-    if (static_cast<T>(i) == gpuData[7]) {
-      return true;
-      break;
-    }
-  }
-  return verifyBitwise(gpuData, len) && verifySub(gpuData, len);
-}
-
-template<typename T>
-bool verifyData(T* gpuData, int len) {
-  T val = 0;
-  for (int i = 0; i < len; ++i) {
-      val += 10;
-  }
-
-  REQUIRE(val == gpuData[0]);
-  return VerifyIntegral(gpuData, len) && verifyExch(gpuData, len);
-}
-
-__device__
-void testKernelExch(...) {}
-
-template<typename T, typename enable_if<!is_same<T, double>{}>::type* = nullptr>
-__device__
-void testKernelExch(T* g_odata) {
-  // access thread id
-  const T tid = blockDim.x * blockIdx.x + threadIdx.x;
-
-  // Atomic exchange
-  atomicExch(&g_odata[2], tid);
-}
-
-__device__
-void testKernelSub(...) {}
-
-template<
-    typename T,
-    typename enable_if<
-        is_same<T, int>{} || is_same<T, unsigned int>{}>::type* = nullptr>
-__device__
-void testKernelSub(T* g_odata) {
-    // Atomic subtraction (final should be 0)
-    atomicSub(&g_odata[1], 10);
-}
-
-__device__
-void testKernelIntegral(...) {}
-
-template<typename T, typename enable_if<is_integral<T>{}>::type* = nullptr>
-__device__
-void testKernelIntegral(T* g_odata) {
-  // access thread id
-  const T tid = blockDim.x * blockIdx.x + threadIdx.x;
-
-  // Atomic maximum
-  atomicMax(&g_odata[3], tid);
-
-  // Atomic minimum
-  atomicMin(&g_odata[4], tid);
-
-  // Atomic increment (modulo 17+1)
-  atomicInc((unsigned int*)&g_odata[5], 17);
-
-  // Atomic decrement
-  atomicDec((unsigned int*)&g_odata[6], 137);
-
-  // Atomic compare-and-swap
-  atomicCAS(&g_odata[7], tid - 1, tid);
-
-  // Bitwise atomic instructions
-
-  // Atomic AND
-  atomicAnd(&g_odata[8], 2 * tid + 7);
-
-  // Atomic OR
-  atomicOr(&g_odata[9], 1 << tid);
-
-  // Atomic XOR
-  atomicXor(&g_odata[10], tid);
-
-  testKernelSub(g_odata);
-}
-
-template<typename T>
-__global__ void testKernel(T* g_odata) {
-    // Atomic addition
-    atomicAdd(&g_odata[0], 10);
-    testKernelIntegral(g_odata);
-    testKernelExch(g_odata);
-}
-
-template<typename T>
-static void runTest() {
-  bool testResult = true;
-  unsigned int numThreads = 256;
-  unsigned int numBlocks = 64;
-  unsigned int numData = 11;
-  unsigned int memSize = sizeof(T) * numData;
-
-  // allocate mem for the result on host side
-  T* hOData = reinterpret_cast<T*>(malloc(memSize));
-
-  // initialize the memory
-  for (unsigned int i = 0; i < numData; i++) {
-    hOData[i] = 0;
-  }
-  // To make the AND and XOR tests generate something other than 0...
-  hOData[8] = hOData[10] = 0xff;
-
-  // allocate device memory for result
-  T* dOData;
-  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dOData), memSize));
-  // copy host memory to device to initialize to zero
-  HIP_CHECK(hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice));
-
-  // execute the kernel
-  hipLaunchKernelGGL(
-      testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData);
-
-  // Copy result from device to host
-  HIP_CHECK(hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost));
-
-  // Compute reference solution
-  REQUIRE(testResult == verifyData(hOData, numThreads * numBlocks));
-
-  // Cleanup memory
-  free(hOData);
-  HIP_CHECK(hipFree(dOData));
-}
-
-TEST_CASE("Unit_SimpleAtomicsTest") {
-  SECTION("test for int") {
-    runTest<int>();
-  }
-  SECTION("test for unsigned int") {
-    runTest<unsigned int>();
-  }
-  SECTION("test for float") {
-    runTest<float>();
-  }
-  #if HT_AMD
-  SECTION("test for unsigned long long") {
-    runTest<uint64_t>();
-  }
-  SECTION("test for double") {
-    runTest<double>();
-  }
-  #endif
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <string.h>
+#include <math.h>
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+
+#include <algorithm>
+#include <type_traits>
+
+using namespace std;
+////////////////////////////////////////////////////////////////////////////////
+// Auto-Verification Code
+////////////////////////////////////////////////////////////////////////////////
+
+bool verifyBitwise(...) {  // fallback overload: accepts anything, checks nothing
+    return true;
+}
+// Integral T: recompute AND/OR/XOR on the host and compare with elements 8-10.
+template<typename T, typename enable_if<is_integral<T>{}>::type* = nullptr>
+bool verifyBitwise(T* gpuData, int len) {
+  // Atomic and
+  T val = 0xff;
+  for (int i = 0; i < len; ++i) {
+    // 9th element should be 1
+    val &= (2 * i + 7);
+  }
+  REQUIRE(val == gpuData[8]);
+
+  // atomic Or
+  val = 0;
+  for (int i = 0; i < len; ++i) {
+    // 10th element should be 0xff; NOTE(review): 1 << i is UB for i >= int width
+    val |= (1 << i);
+  }
+  REQUIRE(val == gpuData[9]);
+
+  // atomic Xor
+  val = 0xff;
+
+  for (int i = 0; i < len; ++i) {
+    // 11th element should be 0xff
+    val ^= i;
+  }
+
+  REQUIRE(val == gpuData[10]);
+  return true;
+}
+
+bool verifySub(...) {  // fallback overload: no subtraction check for this type
+  return true;
+}
+// int / unsigned int only: replay len subtractions of 10, compare to gpuData[1].
+template<
+    typename T,
+    typename enable_if<
+        is_same<T, int>{} || is_same<T, unsigned int>{}>::type* = nullptr>
+bool verifySub(T* gpuData, int len) {
+  T val = 0;
+  int remaining = len;  // counts down; the body still runs exactly len times
+  while (remaining > 0) {
+      val -= 10;
+      --remaining;
+  }
+  REQUIRE(val == gpuData[1]);
+  return true;
+}
+
+bool verifyExch(...) {  // accepts any arguments and always reports success
+  return true;
+}
+
+template<typename T, typename enable_if<!is_same<T, double> {}>::type* = nullptr> // NOLINT
+bool computeExchExch(T* gpuData, int len) {
+  // Returns true iff gpuData[2] equals some i in [0, len), false otherwise.
+  // NOTE(review): verifyData calls verifyExch, not this — intended overload?
+  for (T i = 0; i < len; ++i) {
+      if (i == gpuData[2]) {
+          return true;
+      }
+  }
+  return false;
+}
+
+bool VerifyIntegral(...) {  // fallback: non-integral T, nothing to check
+  return true;
+}
+// Integral T: recompute every integer atomic result on the host and compare.
+template<typename T, typename enable_if<is_integral<T>{}>::type* = nullptr>
+bool VerifyIntegral(T* gpuData, int len) {
+  // atomic Max
+  T val = 0;
+  for (int i = 0; i < len; ++i) {
+    // fourth element should be len-1
+    val = max(val, static_cast<T>(i));
+  }
+
+  REQUIRE(val == gpuData[3]);
+
+  // atomic Min
+  val = 1 << 8;
+
+  for (int i = 0; i < len; ++i) {
+      val = min(val, static_cast<T>(i));
+  }
+
+  REQUIRE(val == gpuData[4]);
+
+  // atomic Inc
+  T limit = 17;
+  val = 0;
+
+  for (int i = 0; i < len; ++i) {
+      val = (val >= limit) ? 0 : val + 1;
+  }
+
+  REQUIRE(val == gpuData[5]);
+
+  // atomic Dec
+  limit = 137;
+  val = 0;
+
+  for (int i = 0; i < len; ++i) {
+      val = ((val == 0) || (val > limit)) ? limit : val - 1;
+  }
+
+  REQUIRE(val == gpuData[6]);
+
+  // atomic CAS: eighth element should be a member of [0, len)
+  bool casFound = false;
+  for (int i = 0; i < len && !casFound; ++i) {
+    casFound = (static_cast<T>(i) == gpuData[7]);
+  }
+  REQUIRE(casFound);
+
+  // Only after the CAS check passes do the bitwise and subtraction checks run.
+  return verifyBitwise(gpuData, len) && verifySub(gpuData, len);
+}
+
+template<typename T>
+bool verifyData(T* gpuData, int len) {
+  T val = 0;  // expected element 0: 10 added len times, accumulated in T
+  for (int left = len; left > 0; --left) {
+      val += 10;
+  }
+  REQUIRE(val == gpuData[0]);
+  if (!VerifyIntegral(gpuData, len)) return false;  // same short-circuit as &&
+  return verifyExch(gpuData, len);
+}
+
+__device__
+void testKernelExch(...) {}  // fallback overload: does nothing
+// Every T except double: exchange the computed thread index into element 2.
+template<typename T, typename enable_if<!is_same<T, double>{}>::type* = nullptr>
+__device__
+void testKernelExch(T* g_odata) {
+  // access thread id
+  const T tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+  // Atomic exchange
+  atomicExch(&g_odata[2], tid);
+}
+
+__device__
+void testKernelSub(...) {}  // fallback overload: does nothing
+// int / unsigned int only: atomically subtract 10 from element 1.
+template<
+    typename T,
+    typename enable_if<
+        is_same<T, int>{} || is_same<T, unsigned int>{}>::type* = nullptr>
+__device__
+void testKernelSub(T* g_odata) {
+    // Atomic subtraction; NOTE(review): verifySub expects 0 - 10 * len, not 0
+    atomicSub(&g_odata[1], 10);
+}
+
+__device__
+void testKernelIntegral(...) {}  // fallback overload: does nothing
+// Integral T: issue the integer atomics on elements 3..10, then testKernelSub.
+template<typename T, typename enable_if<is_integral<T>{}>::type* = nullptr>
+__device__
+void testKernelIntegral(T* g_odata) {
+  // access thread id
+  const T tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+  // Atomic maximum
+  atomicMax(&g_odata[3], tid);
+
+  // Atomic minimum
+  atomicMin(&g_odata[4], tid);
+
+  // Atomic increment (modulo 17+1); always goes through an unsigned int*
+  atomicInc((unsigned int*)&g_odata[5], 17);
+
+  // Atomic decrement; always goes through an unsigned int*
+  atomicDec((unsigned int*)&g_odata[6], 137);
+
+  // Atomic compare-and-swap
+  atomicCAS(&g_odata[7], tid - 1, tid);
+
+  // Bitwise atomic instructions
+
+  // Atomic AND
+  atomicAnd(&g_odata[8], 2 * tid + 7);
+
+  // Atomic OR; NOTE(review): 1 << tid is an int shift, UB once tid >= int width
+  atomicOr(&g_odata[9], 1 << tid);
+
+  // Atomic XOR
+  atomicXor(&g_odata[10], tid);
+
+  testKernelSub(g_odata);
+}
+
+template<typename T>
+__global__ void testKernel(T* g_odata) {  // runs all atomic tests on g_odata
+    // Atomic addition
+    atomicAdd(&g_odata[0], 10);
+    testKernelIntegral(g_odata);  // no-op overload unless T is integral
+    testKernelExch(g_odata);  // no-op overload when T is double
+}
+
+template<typename T>
+static void runTest() {  // init buffer, launch testKernel, verify results
+  bool testResult = true;
+  unsigned int numThreads = 256;
+  unsigned int numBlocks = 64;
+  unsigned int numData = 11;
+  unsigned int memSize = sizeof(T) * numData;
+
+  // allocate mem for the result on host side
+  T* hOData = reinterpret_cast<T*>(malloc(memSize));
+  REQUIRE(hOData != nullptr);  // fail the test instead of dereferencing null
+  // initialize the memory
+  for (unsigned int i = 0; i < numData; i++) {
+    hOData[i] = 0;
+  }
+  // To make the AND and XOR tests generate something other than 0...
+  hOData[8] = hOData[10] = 0xff;
+
+  // allocate device memory for result
+  T* dOData;
+  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dOData), memSize));
+  // copy host memory to device to initialize to zero
+  HIP_CHECK(hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice));
+
+  // execute the kernel
+  hipLaunchKernelGGL(
+      testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData);
+  HIP_CHECK(hipGetLastError());  // surface launch failures immediately
+  // Copy result from device to host
+  HIP_CHECK(hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost));
+
+  // Compute reference solution
+  REQUIRE(testResult == verifyData(hOData, numThreads * numBlocks));
+
+  // Cleanup memory
+  free(hOData);
+  HIP_CHECK(hipFree(dOData));
+}
+
+TEST_CASE("Unit_SimpleAtomicsTest") {  // one runTest instantiation per section
+  SECTION("test for int") {
+    runTest<int>();
+  }
+  SECTION("test for unsigned int") {
+    runTest<unsigned int>();
+  }
+  SECTION("test for float") {
+    runTest<float>();
+  }
+  #if HT_AMD
+  SECTION("test for unsigned long long") {
+    runTest<uint64_t>();  // instantiated with uint64_t, not unsigned long long
+  }
+  SECTION("test for double") {
+    runTest<double>();
+  }
+  #endif  // HT_AMD: the two sections above are built only when it is non-zero
+}
@@ -1,101 +1,101 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
-#include <hip/device_functions.h>
-
-#pragma GCC diagnostic ignored "-Wall"
-#pragma clang diagnostic ignored "-Wunused-variable"
-
-__device__ void single_precision_intrinsics() {
-    float fX, fY;
-
-    __cosf(0.0f);
-    __exp10f(0.0f);
-    __expf(0.0f);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __fadd_rd(0.0f, 1.0f);
-#endif
-    __fadd_rn(0.0f, 1.0f);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __fadd_ru(0.0f, 1.0f);
-    __fadd_rz(0.0f, 1.0f);
-    __fdiv_rd(4.0f, 2.0f);
-#endif
-    __fdiv_rn(4.0f, 2.0f);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __fdiv_ru(4.0f, 2.0f);
-    __fdiv_rz(4.0f, 2.0f);
-#endif
-    __fdividef(4.0f, 2.0f);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __fmaf_rd(1.0f, 2.0f, 3.0f);
-#endif
-    __fmaf_rn(1.0f, 2.0f, 3.0f);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __fmaf_ru(1.0f, 2.0f, 3.0f);
-    __fmaf_rz(1.0f, 2.0f, 3.0f);
-    __fmul_rd(1.0f, 2.0f);
-#endif
-    __fmul_rn(1.0f, 2.0f);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __fmul_ru(1.0f, 2.0f);
-    __fmul_rz(1.0f, 2.0f);
-    __frcp_rd(2.0f);
-#endif
-    __frcp_rn(2.0f);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __frcp_ru(2.0f);
-    __frcp_rz(2.0f);
-#endif
-    __frsqrt_rn(4.0f);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __fsqrt_rd(4.0f);
-#endif
-    __fsqrt_rn(4.0f);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __fsqrt_ru(4.0f);
-    __fsqrt_rz(4.0f);
-    __fsub_rd(2.0f, 1.0f);
-#endif
-    __fsub_rn(2.0f, 1.0f);
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-    __fsub_ru(2.0f, 1.0f);
-    __fsub_rz(2.0f, 1.0f);
-#endif
-    __log10f(1.0f);
-    __log2f(1.0f);
-    __logf(1.0f);
-    __powf(1.0f, 0.0f);
-    __saturatef(0.1f);
-    __sincosf(0.0f, &fX, &fY);
-    __sinf(0.0f);
-    __tanf(0.0f);
-}
-
-__global__ void compileSinglePrecisionIntrinsics(int) {
-    single_precision_intrinsics();
-}
-
-TEST_CASE("Unit_SinglePrecisionIntrinsics") {
-    hipLaunchKernelGGL(compileSinglePrecisionIntrinsics, dim3(1, 1, 1),
-                                               dim3(1, 1, 1), 0, 0, 1);
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+#include <hip/device_functions.h>
+
+#pragma GCC diagnostic ignored "-Wall"
+#pragma clang diagnostic ignored "-Wunused-variable"
+
+__device__ void single_precision_intrinsics() {
+    float fX, fY;  // out-parameters for __sincosf
+    // _rd/_ru/_rz variants need OCML_BASIC_ROUNDED_OPERATIONS to be defined.
+    __cosf(0.0f);
+    __exp10f(0.0f);
+    __expf(0.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __fadd_rd(0.0f, 1.0f);
+#endif
+    __fadd_rn(0.0f, 1.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __fadd_ru(0.0f, 1.0f);
+    __fadd_rz(0.0f, 1.0f);
+    __fdiv_rd(4.0f, 2.0f);
+#endif
+    __fdiv_rn(4.0f, 2.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __fdiv_ru(4.0f, 2.0f);
+    __fdiv_rz(4.0f, 2.0f);
+#endif
+    __fdividef(4.0f, 2.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __fmaf_rd(1.0f, 2.0f, 3.0f);
+#endif
+    __fmaf_rn(1.0f, 2.0f, 3.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __fmaf_ru(1.0f, 2.0f, 3.0f);
+    __fmaf_rz(1.0f, 2.0f, 3.0f);
+    __fmul_rd(1.0f, 2.0f);
+#endif
+    __fmul_rn(1.0f, 2.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __fmul_ru(1.0f, 2.0f);
+    __fmul_rz(1.0f, 2.0f);
+    __frcp_rd(2.0f);
+#endif
+    __frcp_rn(2.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __frcp_ru(2.0f);
+    __frcp_rz(2.0f);
+#endif
+    __frsqrt_rn(4.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __fsqrt_rd(4.0f);
+#endif
+    __fsqrt_rn(4.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __fsqrt_ru(4.0f);
+    __fsqrt_rz(4.0f);
+    __fsub_rd(2.0f, 1.0f);
+#endif
+    __fsub_rn(2.0f, 1.0f);
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+    __fsub_ru(2.0f, 1.0f);
+    __fsub_rz(2.0f, 1.0f);
+#endif
+    __log10f(1.0f);
+    __log2f(1.0f);
+    __logf(1.0f);
+    __powf(1.0f, 0.0f);
+    __saturatef(0.1f);
+    __sincosf(0.0f, &fX, &fY);
+    __sinf(0.0f);
+    __tanf(0.0f);
+}
+
+__global__ void compileSinglePrecisionIntrinsics(int) {  // the int parameter is unused
+    single_precision_intrinsics();
+}
+
+TEST_CASE("Unit_SinglePrecisionIntrinsics") {  // one block of one thread, argument 1
+    hipLaunchKernelGGL(compileSinglePrecisionIntrinsics, dim3(1, 1, 1),
+                                               dim3(1, 1, 1), 0, 0, 1);
+}  // NOTE(review): no error check or synchronization follows the launch
@@ -1,123 +1,123 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
-#include <hip/math_functions.h>
-
-#pragma GCC diagnostic ignored "-Wall"
-#pragma clang diagnostic ignored "-Wunused-variable"
-
-__device__ void single_precision_math_functions() {
-    int iX;
-    float fX, fY;
-
-    acosf(1.0f);
-    acoshf(1.0f);
-    asinf(0.0f);
-    asinhf(0.0f);
-    atan2f(0.0f, 1.0f);
-    atanf(0.0f);
-    atanhf(0.0f);
-    cbrtf(0.0f);
-    ceilf(0.0f);
-    copysignf(1.0f, -2.0f);
-    cosf(0.0f);
-    coshf(0.0f);
-    cospif(0.0f);
-    erfcf(0.0f);
-    erfcinvf(2.0f);
-    erfcxf(0.0f);
-    erff(0.0f);
-    erfinvf(1.0f);
-    exp10f(0.0f);
-    exp2f(0.0f);
-    expf(0.0f);
-    expm1f(0.0f);
-    fabsf(1.0f);
-    fdimf(1.0f, 0.0f);
-    fdividef(0.0f, 1.0f);
-    floorf(0.0f);
-    fmaf(1.0f, 2.0f, 3.0f);
-    fmaxf(0.0f, 0.0f);
-    fminf(0.0f, 0.0f);
-    fmodf(0.0f, 1.0f);
-    frexpf(0.0f, &iX);
-    hypotf(1.0f, 0.0f);
-    ilogbf(1.0f);
-    isfinite(0.0f);
-    isinf(0.0f);
-    isnan(0.0f);
-    j0f(0.0f);
-    j1f(0.0f);
-    jnf(-1.0f, 1.0f);
-    ldexpf(0.0f, 0);
-    llrintf(0.0f);
-    llroundf(0.0f);
-    log10f(1.0f);
-    log1pf(-1.0f);
-    log2f(1.0f);
-    logbf(1.0f);
-    logf(1.0f);
-    lrintf(0.0f);
-    lroundf(0.0f);
-    nanf("1");
-    nearbyintf(0.0f);
-    norm3df(1.0f, 0.0f, 0.0f);
-    norm4df(1.0f, 0.0f, 0.0f, 0.0f);
-    normcdff(0.0f);
-    normcdfinvf(1.0f);
-    fX = 1.0f;
-    normf(1, &fX);
-    powf(1.0f, 0.0f);
-    remainderf(2.0f, 1.0f);
-    rhypotf(0.0f, 1.0f);
-    rintf(1.0f);
-    rnorm3df(0.0f, 0.0f, 1.0f);
-    rnorm4df(0.0f, 0.0f, 0.0f, 1.0f);
-    fX = 1.0f;
-    rnormf(1, &fX);
-    roundf(0.0f);
-    rsqrtf(1.0f);
-    signbit(1.0f);
-    sincosf(0.0f, &fX, &fY);
-    sincospif(0.0f, &fX, &fY);
-    sinf(0.0f);
-    sinhf(0.0f);
-    sinpif(0.0f);
-    sqrtf(0.0f);
-    tanf(0.0f);
-    tanhf(0.0f);
-    tgammaf(2.0f);
-    truncf(0.0f);
-    y0f(1.0f);
-    y1f(1.0f);
-    ynf(1, 1.0f);
-}
-
-__global__ void compileSinglePrecisionMathOnDevice(int) {
-    single_precision_math_functions();
-}
-
-TEST_CASE("Unit_SinglePrecisionMathDevice") {
-    hipLaunchKernelGGL(compileSinglePrecisionMathOnDevice, dim3(1, 1, 1),
-                                                 dim3(1, 1, 1), 0, 0, 1);
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+#include <hip/math_functions.h>
+
+#pragma GCC diagnostic ignored "-Wall"
+#pragma clang diagnostic ignored "-Wunused-variable"
+// Calls each single-precision device math function once; results are unused.
+__device__ void single_precision_math_functions() {
+    int iX;  // out-param for frexpf
+    float fX, fY;  // scratch for normf/rnormf input and sincos* outputs
+
+    acosf(1.0f);
+    acoshf(1.0f);
+    asinf(0.0f);
+    asinhf(0.0f);
+    atan2f(0.0f, 1.0f);
+    atanf(0.0f);
+    atanhf(0.0f);
+    cbrtf(0.0f);
+    ceilf(0.0f);
+    copysignf(1.0f, -2.0f);
+    cosf(0.0f);
+    coshf(0.0f);
+    cospif(0.0f);
+    erfcf(0.0f);
+    erfcinvf(2.0f);
+    erfcxf(0.0f);
+    erff(0.0f);
+    erfinvf(1.0f);
+    exp10f(0.0f);
+    exp2f(0.0f);
+    expf(0.0f);
+    expm1f(0.0f);
+    fabsf(1.0f);
+    fdimf(1.0f, 0.0f);
+    fdividef(0.0f, 1.0f);
+    floorf(0.0f);
+    fmaf(1.0f, 2.0f, 3.0f);
+    fmaxf(0.0f, 0.0f);
+    fminf(0.0f, 0.0f);
+    fmodf(0.0f, 1.0f);
+    frexpf(0.0f, &iX);
+    hypotf(1.0f, 0.0f);
+    ilogbf(1.0f);
+    isfinite(0.0f);
+    isinf(0.0f);
+    isnan(0.0f);
+    j0f(0.0f);
+    j1f(0.0f);
+    jnf(-1, 1.0f);  // Bessel order is an int, as in ynf(1, ...) below
+    ldexpf(0.0f, 0);
+    llrintf(0.0f);
+    llroundf(0.0f);
+    log10f(1.0f);
+    log1pf(-1.0f);
+    log2f(1.0f);
+    logbf(1.0f);
+    logf(1.0f);
+    lrintf(0.0f);
+    lroundf(0.0f);
+    nanf("1");
+    nearbyintf(0.0f);
+    norm3df(1.0f, 0.0f, 0.0f);
+    norm4df(1.0f, 0.0f, 0.0f, 0.0f);
+    normcdff(0.0f);
+    normcdfinvf(1.0f);
+    fX = 1.0f;
+    normf(1, &fX);
+    powf(1.0f, 0.0f);
+    remainderf(2.0f, 1.0f);
+    rhypotf(0.0f, 1.0f);
+    rintf(1.0f);
+    rnorm3df(0.0f, 0.0f, 1.0f);
+    rnorm4df(0.0f, 0.0f, 0.0f, 1.0f);
+    fX = 1.0f;
+    rnormf(1, &fX);
+    roundf(0.0f);
+    rsqrtf(1.0f);
+    signbit(1.0f);
+    sincosf(0.0f, &fX, &fY);
+    sincospif(0.0f, &fX, &fY);
+    sinf(0.0f);
+    sinhf(0.0f);
+    sinpif(0.0f);
+    sqrtf(0.0f);
+    tanf(0.0f);
+    tanhf(0.0f);
+    tgammaf(2.0f);
+    truncf(0.0f);
+    y0f(1.0f);
+    y1f(1.0f);
+    ynf(1, 1.0f);
+}
+// Kernel entry point; its unnamed int argument is ignored.
+__global__ void compileSinglePrecisionMathOnDevice(int) {
+    single_precision_math_functions();
+}
+// Launches the kernel on one thread. NOTE(review): no sync/error check follows.
+TEST_CASE("Unit_SinglePrecisionMathDevice") {
+    hipLaunchKernelGGL(compileSinglePrecisionMathOnDevice, dim3(1, 1, 1),
+                                                 dim3(1, 1, 1), 0, 0, 1);
+}
@@ -1,113 +1,113 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_common.hh>
-#include <cmath>
-
-#pragma GCC diagnostic ignored "-Wall"
-#pragma clang diagnostic ignored "-Wunused-variable"
-
-__host__ static void single_precision_math_functions() {
-    int iX;
-    float fX, fY;
-
-    acosf(1.0f);
-    acoshf(1.0f);
-    asinf(0.0f);
-    asinhf(0.0f);
-    atan2f(0.0f, 1.0f);
-    atanf(0.0f);
-    atanhf(0.0f);
-    cbrtf(0.0f);
-    ceilf(0.0f);
-    copysignf(1.0f, -2.0f);
-    cosf(0.0f);
-    coshf(0.0f);
-    erfcf(0.0f);
-    erff(0.0f);
-    #ifdef __unix__
-    exp10f(0.0f);
-    #endif
-    exp2f(0.0f);
-    expf(0.0f);
-    expm1f(0.0f);
-    fabsf(1.0f);
-    fdimf(1.0f, 0.0f);
-    floorf(0.0f);
-    fmaf(1.0f, 2.0f, 3.0f);
-    fmaxf(0.0f, 0.0f);
-    fminf(0.0f, 0.0f);
-    fmodf(0.0f, 1.0f);
-    frexpf(0.0f, &iX);
-    hypotf(1.0f, 0.0f);
-    ilogbf(1.0f);
-    std::isfinite(0.0f);
-    std::isinf(0.0f);
-    std::isnan(0.0f);
-    #ifdef __unix__
-    j0f(0.0f);
-    j1f(0.0f);
-    jnf(-1.0f, 1.0f);
-    #endif
-    ldexpf(0.0f, 0);
-    lgammaf(1.0f);
-    llrintf(0.0f);
-    llroundf(0.0f);
-    log10f(1.0f);
-    log1pf(-1.0f);
-    log2f(1.0f);
-    logbf(1.0f);
-    logf(1.0f);
-    lrintf(0.0f);
-    lroundf(0.0f);
-    modff(0.0f, &fX);
-    nanf("1");
-    nearbyintf(0.0f);
-    powf(1.0f, 0.0f);
-    remainderf(2.0f, 1.0f);
-    remquof(1.0f, 2.0f, &iX);
-    rintf(1.0f);
-#if HT_AMD
-    fX = 1.0f;
-#endif
-    roundf(0.0f);
-    /// rsqrtf(1.0f);
-    scalblnf(0.0f, 1);
-    scalbnf(0.0f, 1);
-    std::signbit(1.0f);
-    #ifdef __unix__
-    sincosf(0.0f, &fX, &fY);
-    #endif
-    sinf(0.0f);
-    sinhf(0.0f);
-    sqrtf(0.0f);
-    tanf(0.0f);
-    tanhf(0.0f);
-    tgammaf(2.0f);
-    truncf(0.0f);
-    #ifdef __unix__
-    y0f(1.0f);
-    y1f(1.0f);
-    ynf(1, 1.0f);
-    #endif
-}
-
-TEST_CASE("Unit_SinglePrecisionMathHost") {
-  single_precision_math_functions();
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+#include <cmath>
+
+#pragma GCC diagnostic ignored "-Wall"
+#pragma clang diagnostic ignored "-Wunused-variable"
+// Calls each host single-precision <cmath> function once, discarding results.
+__host__ static void single_precision_math_functions() {
+    int iX;
+    float fX, fY;
+
+    acosf(1.0f);
+    acoshf(1.0f);
+    asinf(0.0f);
+    asinhf(0.0f);
+    atan2f(0.0f, 1.0f);
+    atanf(0.0f);
+    atanhf(0.0f);
+    cbrtf(0.0f);
+    ceilf(0.0f);
+    copysignf(1.0f, -2.0f);
+    cosf(0.0f);
+    coshf(0.0f);
+    erfcf(0.0f);
+    erff(0.0f);
+    // exp10f, j0f/j1f/jnf, sincosf and y0f/y1f/ynf are only called where
+    // __unix__ is defined; every other call here is unconditional.
+    #ifdef __unix__
+    exp10f(0.0f);
+    #endif
+    exp2f(0.0f);
+    expf(0.0f);
+    expm1f(0.0f);
+    fabsf(1.0f);
+    fdimf(1.0f, 0.0f);
+    floorf(0.0f);
+    fmaf(1.0f, 2.0f, 3.0f);
+    fmaxf(0.0f, 0.0f);
+    fminf(0.0f, 0.0f);
+    fmodf(0.0f, 1.0f);
+    frexpf(0.0f, &iX);
+    hypotf(1.0f, 0.0f);
+    ilogbf(1.0f);
+    std::isfinite(0.0f);
+    std::isinf(0.0f);
+    std::isnan(0.0f);
+    #ifdef __unix__
+    j0f(0.0f);
+    j1f(0.0f);
+    // The Bessel order is an int (cf. ynf(1, ...)), so pass -1, not -1.0f.
+    jnf(-1, 1.0f);
+    #endif
+    ldexpf(0.0f, 0);
+    lgammaf(1.0f);
+    llrintf(0.0f);
+    llroundf(0.0f);
+    log10f(1.0f);
+    log1pf(-1.0f);
+    log2f(1.0f);
+    logbf(1.0f);
+    logf(1.0f);
+    lrintf(0.0f);
+    lroundf(0.0f);
+    modff(0.0f, &fX);
+    nanf("1");
+    nearbyintf(0.0f);
+    powf(1.0f, 0.0f);
+    remainderf(2.0f, 1.0f);
+    remquof(1.0f, 2.0f, &iX);
+    rintf(1.0f);
+    roundf(0.0f);
+    /// rsqrtf(1.0f);
+    scalblnf(0.0f, 1);
+    scalbnf(0.0f, 1);
+    std::signbit(1.0f);
+    #ifdef __unix__
+    sincosf(0.0f, &fX, &fY);
+    #endif
+    sinf(0.0f);
+    sinhf(0.0f);
+    sqrtf(0.0f);
+    tanf(0.0f);
+    tanhf(0.0f);
+    tgammaf(2.0f);
+    truncf(0.0f);
+    #ifdef __unix__
+    y0f(1.0f);
+    y1f(1.0f);
+    ynf(1, 1.0f);
+    #endif
+}
+// Runs single_precision_math_functions() on the host; no values are checked.
+TEST_CASE("Unit_SinglePrecisionMathHost") {
+  single_precision_math_functions();
+}
@@ -1,153 +1,153 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
-#include <complex>
-
-#pragma clang diagnostic ignored "-Wunused-variable"
-// Tolerance for error
-const double tolerance = 1e-6;
-
-#define LEN 64
-
-#define ALL_FUN \
-  OP(add) \
-  OP(sub) \
-  OP(mul) \
-  OP(div) \
-  OP(abs) \
-  OP(arg) \
-  OP(sin) \
-  OP(cos)
-
-#define OP(x) CK_##x,
-enum CalcKind {
-  ALL_FUN
-};
-#undef OP
-
-#define OP(x) case CK_##x: return #x;
-std::string getName(enum CalcKind CK) {
-  switch (CK) {
-  ALL_FUN
-  }
-  return "";  // To prevent compile warning
-}
-#undef OP
-
-// Calculates function.
-// If the function has one argument, B is ignored.
-// If the function returns real number, converts it to a complex number.
-#define ONE_ARG(func) \
-  case CK_##func: \
-    return std::complex<FloatT>(func(A));
-
-template<typename FloatT>
-__device__ __host__ std::complex<FloatT> calc(std::complex<FloatT> A,
-                                        std::complex<FloatT> B,
-                                        enum CalcKind CK) {
-  switch (CK) {
-  case CK_add:
-    return A + B;
-  case CK_sub:
-    return A - B;
-  case CK_mul:
-    return A * B;
-  case CK_div:
-    return A / B;
-
-    ONE_ARG(abs)
-    ONE_ARG(arg)
-    ONE_ARG(sin)
-    ONE_ARG(cos)
-  }
-  return A;  // To prevent compile warning
-}
-
-template<typename FloatT>
-__global__ void kernel(std::complex<FloatT>* A,
-                       std::complex<FloatT>* B, std::complex<FloatT>* C,
-                       enum CalcKind CK) {
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
-    C[tx] = calc<FloatT>(A[tx], B[tx], CK);
-}
-
-template<typename FloatT>
-void test() {
-    typedef std::complex<FloatT> ComplexT;
-
-    ComplexT *A, *Ad, *B, *Bd, *C, *Cd, *D;
-    A = new ComplexT[LEN];
-    B = new ComplexT[LEN];
-    C = new ComplexT[LEN];
-    D = new ComplexT[LEN];
-    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&Ad), sizeof(ComplexT)*LEN));
-    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&Bd), sizeof(ComplexT)*LEN));
-    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&Cd), sizeof(ComplexT)*LEN));
-
-    for (uint32_t i = 0; i < LEN; i++) {
-        A[i] = ComplexT((i + 1) * 1.0f, (i + 2) * 1.0f);
-        B[i] = A[i];
-        C[i] = A[i];
-    }
-    HIP_CHECK(hipMemcpy(Ad, A, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice));
-    HIP_CHECK(hipMemcpy(Bd, B, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice));
-
-    // Run kernel for a calculation kind and verify by comparing with host
-    // calculation result. Returns false if fails.
-    auto test_fun = [&](enum CalcKind CK) {
-      hipLaunchKernelGGL(kernel<FloatT>, dim3(1), dim3(LEN), 0, 0,
-                                                   Ad, Bd, Cd, CK);
-      HIP_CHECK(hipMemcpy(C, Cd, sizeof(ComplexT)*LEN, hipMemcpyDeviceToHost));
-      bool pass = true;
-      for (int i = 0; i < LEN; i++) {
-        ComplexT Expected = calc(A[i], B[i], CK);
-        FloatT error = abs(C[i] - Expected);
-        if (abs(Expected) > tolerance)
-          error /= abs(Expected);
-        pass &= error < tolerance;
-      }
-      return pass;
-    };
-
-#define OP(x) assert(test_fun(CK_##x));
-    ALL_FUN
-#undef OP
-
-    HIP_CHECK(hipFree(Ad));
-    HIP_CHECK(hipFree(Bd));
-    HIP_CHECK(hipFree(Cd));
-    delete[] A;
-    delete[] B;
-    delete[] C;
-    delete[] D;
-}
-
-#if HT_AMD
-TEST_CASE("Unit_StdComplex") {
-  SECTION("Test run with float") {
-  test<float>();
-  }
-  SECTION("Test run with double") {
-  test<double>();
-  }
-}
-#endif
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+#include <complex>
+
+#pragma clang diagnostic ignored "-Wunused-variable"
+// Tolerance for error
+const double tolerance = 1e-6;
+
+#define LEN 64
+// X-macro list of the operations under test: define OP(x), then expand ALL_FUN.
+#define ALL_FUN \
+  OP(add) \
+  OP(sub) \
+  OP(mul) \
+  OP(div) \
+  OP(abs) \
+  OP(arg) \
+  OP(sin) \
+  OP(cos)
+// One enumerator CK_<op> per ALL_FUN entry, in list order.
+#define OP(x) CK_##x,
+enum CalcKind {
+  ALL_FUN
+};
+#undef OP
+// Returns the operation name ("add", "sin", ...) for CK; "" if CK is unmatched.
+#define OP(x) case CK_##x: return #x;
+std::string getName(enum CalcKind CK) {
+  switch (CK) {
+  ALL_FUN
+  }
+  return "";  // To prevent compile warning
+}
+#undef OP
+
+// ONE_ARG(func) expands to a switch case for CK_<func> that applies the
+// unqualified func to A only (B is ignored) and wraps the result, which may
+// be a real number, in std::complex<FloatT>.
+#define ONE_ARG(func) \
+  case CK_##func: \
+    return std::complex<FloatT>(func(A));
+// Evaluates operation CK on A (and B for the binary ones); host and device.
+template<typename FloatT>
+__device__ __host__ std::complex<FloatT> calc(std::complex<FloatT> A,
+                                        std::complex<FloatT> B,
+                                        enum CalcKind CK) {
+  switch (CK) {
+  case CK_add:
+    return A + B;
+  case CK_sub:
+    return A - B;
+  case CK_mul:
+    return A * B;
+  case CK_div:
+    return A / B;
+
+    ONE_ARG(abs)
+    ONE_ARG(arg)
+    ONE_ARG(sin)
+    ONE_ARG(cos)
+  }
+  return A;  // To prevent compile warning
+}
+// One thread per element: C[i] = calc(A[i], B[i], CK). No bounds check on i.
+template<typename FloatT>
+__global__ void kernel(std::complex<FloatT>* A, std::complex<FloatT>* B,
+                       std::complex<FloatT>* C, enum CalcKind CK) {
+    const int elem = blockIdx.x * blockDim.x + threadIdx.x;
+    const std::complex<FloatT> lhs = A[elem];
+    C[elem] = calc<FloatT>(lhs, B[elem], CK);
+}
+// Runs every ALL_FUN operation on the device over LEN complex<FloatT> inputs
+// and REQUIREs each to match the host's calc() result within `tolerance`.
+template<typename FloatT>
+void test() {
+    typedef std::complex<FloatT> ComplexT;
+
+    ComplexT *A, *Ad, *B, *Bd, *C, *Cd;
+    A = new ComplexT[LEN];
+    B = new ComplexT[LEN];
+    C = new ComplexT[LEN];
+    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&Ad), sizeof(ComplexT)*LEN));
+    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&Bd), sizeof(ComplexT)*LEN));
+    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&Cd), sizeof(ComplexT)*LEN));
+
+    for (uint32_t i = 0; i < LEN; i++) {
+        A[i] = ComplexT((i + 1) * 1.0f, (i + 2) * 1.0f);
+        B[i] = A[i];
+        C[i] = A[i];
+    }
+    HIP_CHECK(hipMemcpy(Ad, A, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(Bd, B, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice));
+
+    // Run kernel for a calculation kind and verify by comparing with host
+    // calculation result. Returns false if fails.
+    auto test_fun = [&](enum CalcKind CK) {
+      hipLaunchKernelGGL(kernel<FloatT>, dim3(1), dim3(LEN), 0, 0,
+                                                   Ad, Bd, Cd, CK);
+      HIP_CHECK(hipMemcpy(C, Cd, sizeof(ComplexT)*LEN, hipMemcpyDeviceToHost));
+      bool pass = true;
+      for (int i = 0; i < LEN; i++) {
+        ComplexT Expected = calc(A[i], B[i], CK);
+        FloatT error = abs(C[i] - Expected);
+        if (abs(Expected) > tolerance)
+          error /= abs(Expected);
+        pass &= error < tolerance;
+      }
+      return pass;
+    };
+
+    // REQUIRE, not assert(): under NDEBUG assert() would drop the test_fun call.
+#define OP(x) REQUIRE(test_fun(CK_##x));
+    ALL_FUN
+#undef OP
+
+    HIP_CHECK(hipFree(Ad));
+    HIP_CHECK(hipFree(Bd));
+    HIP_CHECK(hipFree(Cd));
+    delete[] A;
+    delete[] B;
+    delete[] C;
+}
+// Compiled only when HT_AMD is non-zero; one SECTION each for float and double.
+#if HT_AMD
+TEST_CASE("Unit_StdComplex") {
+  SECTION("Test run with float") {
+  test<float>();
+  }
+  SECTION("Test run with double") {
+  test<double>();
+  }
+}
+#endif
@@ -1,220 +1,220 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/**
-Testcase Scenarios :
- (TestCase 1)::
- 1) Execute atomicAdd in multi threaded scenario by diverging the data across
- multiple threads and validate the output at the end of all operations.
- 2) Execute atomicAddNoRet in multi threaded scenario by diverging the data
- across multiple threads and validate the output at the end of all operations.
- (TestCase 2)::
- 3) Execute atomicAdd API and validate the result.
- 4) Execute atomicAddNoRet API and validate the result.
- (TestCase 3)::
- 5) atomicadd/NoRet negative scenarios (TBD).
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
-/*
- * Defines initial and increment values
- */
-#define INCREMENT_VALUE 10
-#define INT_INITIAL_VALUE 10
-#define FLOAT_INITIAL_VALUE 10.50
-#define DOUBLE_INITIAL_VALUE 200.12
-#define LONG_INITIAL_VALUE 10000
-#define UNSIGNED_INITIAL_VALUE 20
-
-#if HT_NVIDIA
-// atomicAddNoRet is unavailable in cuda
-template <typename T>
-__device__ void atomicAddNoRet(T* x, int y) {
-  atomicAdd(x, static_cast<T>(y));
-}
-#endif
-
-bool p_atomicNoRet = false;
-
-template <typename T>
-__global__ void atomicnoret_manywaves(T* C_d) {
-  atomicAddNoRet(C_d, INCREMENT_VALUE);
-}
-
-template <typename T>
-__global__ void atomic_manywaves(T* C_d) {
-  atomicAdd(C_d, INCREMENT_VALUE);
-}
-
-template <typename T>
-__global__ void atomicnoret_simple(T* C_d) {
-  atomicAddNoRet(C_d, INCREMENT_VALUE);
-}
-
-template <typename T>
-__global__ void atomic_simple(T* C_d) {
-  atomicAdd(C_d, INCREMENT_VALUE);
-}
-
-template <typename T>
-bool atomictest_manywaves(const T& initial_val) {
-  unsigned int ThreadsperBlock = 10;
-  unsigned int numBlocks = 1;
-  T memSize = sizeof(T);
-  T* hOData = reinterpret_cast<T*>(malloc(memSize));
-  *hOData = initial_val;
-  T* dOData;
-  HIP_CHECK(hipMalloc(&dOData, memSize));
-  // copy host memory to device to initialize to zero
-  HIP_CHECK(hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice));
-
-  // execute the kernel
-  hipLaunchKernelGGL(atomic_manywaves, dim3(numBlocks),
-      dim3(ThreadsperBlock), 0, 0, dOData);
-
-  // Copy result from device to host
-  HIP_CHECK(hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost));
-  REQUIRE(hOData[0] == initial_val+
-                      static_cast<T>(INCREMENT_VALUE*(ThreadsperBlock*numBlocks)));
-
-  // Cleanup memory
-  free(hOData);
-  HIP_CHECK(hipFree(dOData));
-
-  return true;
-}
-
-template <typename T>
-bool atomictestnoret_manywaves(const T& initial_val) {
-  unsigned int ThreadsperBlock = 10;
-  unsigned int numBlocks = 1;
-  T memSize = sizeof(T);
-  T* hOData = reinterpret_cast<T*>(malloc(memSize));
-  *hOData = initial_val;
-  T* dOData;
-  HIP_CHECK(hipMalloc(&dOData, memSize));
-  // copy host memory to device to initialize to zero
-  HIP_CHECK(hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice));
-
-  // execute the kernel
-  hipLaunchKernelGGL(atomicnoret_manywaves, dim3(numBlocks),
-      dim3(ThreadsperBlock), 0, 0, dOData);
-
-  // Copy result from device to host
-  HIP_CHECK(hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost));
-  REQUIRE(hOData[0] == initial_val+
-                       (INCREMENT_VALUE*(ThreadsperBlock*numBlocks)));
-
-  // Cleanup memory
-  free(hOData);
-  HIP_CHECK(hipFree(dOData));
-
-  return true;
-}
-
-template <typename T>
-bool atomictest_simple(const T& initial_val) {
-  unsigned int ThreadsperBlock = 1;
-  unsigned int numBlocks = 1;
-  T memSize = sizeof(T);
-  T* hOData = reinterpret_cast<T*>(malloc(memSize));
-  *hOData = initial_val;
-  T* dOData;
-  HIP_CHECK(hipMalloc(&dOData, memSize));
-  // copy host memory to device to initialize to zero
-  HIP_CHECK(hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice));
-
-  // execute the kernel
-  hipLaunchKernelGGL(atomic_simple, dim3(numBlocks),
-      dim3(ThreadsperBlock), 0, 0, dOData);
-
-  // Copy result from device to host
-  HIP_CHECK(hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost));
-  REQUIRE(hOData[0] == initial_val+INCREMENT_VALUE);
-
-  // Cleanup memory
-  free(hOData);
-  HIP_CHECK(hipFree(dOData));
-
-  return true;
-}
-
-template <typename T>
-bool atomictestnoret_simple(const T& initial_val) {
-  unsigned int ThreadsperBlock = 1;
-  unsigned int numBlocks = 1;
-  T memSize = sizeof(T);
-  T* hOData = reinterpret_cast<T*>(malloc(memSize));
-  *hOData = initial_val;
-  T* dOData;
-  HIP_CHECK(hipMalloc(&dOData, memSize));
-  // copy host memory to device to initialize to zero
-  HIP_CHECK(hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice));
-
-  // execute the kernel
-  hipLaunchKernelGGL(atomicnoret_simple, dim3(numBlocks),
-      dim3(ThreadsperBlock), 0, 0, dOData);
-
-  // Copy result from device to host
-  HIP_CHECK(hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost));
-  REQUIRE(hOData[0] == initial_val+INCREMENT_VALUE);
-
-  // Cleanup memory
-  free(hOData);
-  HIP_CHECK(hipFree(dOData));
-
-  return true;
-}
-
-TEST_CASE("Unit_hipTestAtomicAdd") {
-  bool TestPassed = true;
-
-  SECTION("atomic tests with many waves") {
-    REQUIRE(TestPassed == atomictest_manywaves<int>(INT_INITIAL_VALUE));
-    REQUIRE(TestPassed ==
-            atomictest_manywaves<unsigned int>(UNSIGNED_INITIAL_VALUE));
-    REQUIRE(TestPassed == atomictest_manywaves<float>(FLOAT_INITIAL_VALUE));
-    #if HT_AMD
-    REQUIRE(TestPassed ==
-        atomictest_manywaves<uint64_t>(LONG_INITIAL_VALUE));
-    REQUIRE(TestPassed ==
-          atomictest_manywaves<double>(DOUBLE_INITIAL_VALUE));
-    #endif
-  }
-  SECTION("atomic tests with many waves and no return") {
-    REQUIRE(TestPassed ==
-            atomictestnoret_manywaves<float>(FLOAT_INITIAL_VALUE));
-  }
-  SECTION("simple atomic tests") {
-    REQUIRE(TestPassed == atomictest_simple<int>(INT_INITIAL_VALUE));
-    REQUIRE(TestPassed ==
-            atomictest_simple<unsigned int>(UNSIGNED_INITIAL_VALUE));
-    REQUIRE(TestPassed == atomictest_simple<float>(FLOAT_INITIAL_VALUE));
-    #if HT_AMD
-    REQUIRE(TestPassed ==
-            atomictest_simple<uint64_t>(LONG_INITIAL_VALUE));
-    REQUIRE(TestPassed == atomictest_simple<double>(DOUBLE_INITIAL_VALUE));
-    #endif
-  }
-  SECTION("Simple atomic test with no return") {
-    REQUIRE(TestPassed == atomictestnoret_simple<float>(FLOAT_INITIAL_VALUE));
-  }
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+Testcase Scenarios :
+ (TestCase 1)::
+ 1) Execute atomicAdd in multi threaded scenario by diverging the data across
+ multiple threads and validate the output at the end of all operations.
+ 2) Execute atomicAddNoRet in multi threaded scenario by diverging the data
+ across multiple threads and validate the output at the end of all operations.
+ (TestCase 2)::
+ 3) Execute atomicAdd API and validate the result.
+ 4) Execute atomicAddNoRet API and validate the result.
+ (TestCase 3)::
+ 5) atomicadd/NoRet negative scenarios (TBD).
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+/*
+ * Defines initial and increment values
+ */
+#define INCREMENT_VALUE 10
+#define INT_INITIAL_VALUE 10
+#define FLOAT_INITIAL_VALUE 10.50
+#define DOUBLE_INITIAL_VALUE 200.12
+#define LONG_INITIAL_VALUE 10000
+#define UNSIGNED_INITIAL_VALUE 20
+
+#if HT_NVIDIA
+// atomicAddNoRet is unavailable in cuda; forward to atomicAdd, drop the result
+template <typename T>
+__device__ void atomicAddNoRet(T* x, int y) {
+  atomicAdd(x, static_cast<T>(y));
+}
+#endif
+
+bool p_atomicNoRet = false;  // NOTE(review): not referenced in this file - confirm it is needed
+
+template <typename T>  // T: element type of the word being incremented
+__global__ void atomicnoret_manywaves(T* C_d) {
+  atomicAddNoRet(C_d, INCREMENT_VALUE);  // every launched thread adds to the same *C_d
+}
+
+template <typename T>  // T: element type of the word being incremented
+__global__ void atomic_manywaves(T* C_d) {
+  atomicAdd(C_d, INCREMENT_VALUE);  // every launched thread adds to *C_d; result is ignored
+}
+
+template <typename T>  // T: element type of the word being incremented
+__global__ void atomicnoret_simple(T* C_d) {
+  atomicAddNoRet(C_d, INCREMENT_VALUE);  // same body as atomicnoret_manywaves
+}
+
+template <typename T>  // T: element type of the word being incremented
+__global__ void atomic_simple(T* C_d) {
+  atomicAdd(C_d, INCREMENT_VALUE);  // same body as atomic_manywaves; result is ignored
+}
+
+// Runs atomic_manywaves on one block of ten threads and requires the device word
+// to end at initial_val + 10 * INCREMENT_VALUE. Returns true; failures are raised.
+template <typename T>
+bool atomictest_manywaves(const T& initial_val) {
+  const unsigned int ThreadsperBlock = 10;
+  const unsigned int numBlocks = 1;
+  // Byte count of one T: held in size_t, never in T (T may be float or double)
+  const size_t memSize = sizeof(T);
+  // Host-side copy of the value: carries the seed in and the result out
+  T hOData = initial_val;
+  T* dOData = nullptr;
+  HIP_CHECK(hipMalloc(&dOData, memSize));
+  // copy host memory to device to seed the word with initial_val
+  HIP_CHECK(hipMemcpy(dOData, &hOData, memSize, hipMemcpyHostToDevice));
+
+  // execute the kernel
+  hipLaunchKernelGGL(atomic_manywaves, dim3(numBlocks),
+      dim3(ThreadsperBlock), 0, 0, dOData);
+
+  // Copy result from device to host, then release the device word before the
+  // check so that a failing REQUIRE cannot leak it
+  HIP_CHECK(hipMemcpy(&hOData, dOData, memSize, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipFree(dOData));
+  REQUIRE(hOData == initial_val+
+                      static_cast<T>(INCREMENT_VALUE*(ThreadsperBlock*numBlocks)));
+  return true;
+}
+
+// Runs atomicnoret_manywaves on one block of ten threads and requires the device
+// word to end at initial_val + 10 * INCREMENT_VALUE. Returns true; failures are raised.
+template <typename T>
+bool atomictestnoret_manywaves(const T& initial_val) {
+  const unsigned int ThreadsperBlock = 10;
+  const unsigned int numBlocks = 1;
+  // Byte count of one T: held in size_t, never in T (T may be float or double)
+  const size_t memSize = sizeof(T);
+  // Host-side copy of the value: carries the seed in and the result out
+  T hOData = initial_val;
+  T* dOData = nullptr;
+  HIP_CHECK(hipMalloc(&dOData, memSize));
+  // copy host memory to device to seed the word with initial_val
+  HIP_CHECK(hipMemcpy(dOData, &hOData, memSize, hipMemcpyHostToDevice));
+
+  // execute the kernel
+  hipLaunchKernelGGL(atomicnoret_manywaves, dim3(numBlocks),
+      dim3(ThreadsperBlock), 0, 0, dOData);
+
+  // Copy result from device to host, then release the device word before the
+  // check so that a failing REQUIRE cannot leak it
+  HIP_CHECK(hipMemcpy(&hOData, dOData, memSize, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipFree(dOData));
+  REQUIRE(hOData == initial_val+
+                      static_cast<T>(INCREMENT_VALUE*(ThreadsperBlock*numBlocks)));
+  return true;
+}
+
+// Runs atomic_simple with a single thread and requires the device word to end
+// at initial_val + INCREMENT_VALUE. Returns true; failures are raised.
+template <typename T>
+bool atomictest_simple(const T& initial_val) {
+  const unsigned int ThreadsperBlock = 1;
+  const unsigned int numBlocks = 1;
+  // Byte count of one T: held in size_t, never in T (T may be float or double)
+  const size_t memSize = sizeof(T);
+  // Host-side copy of the value: carries the seed in and the result out
+  T hOData = initial_val;
+  T* dOData = nullptr;
+  HIP_CHECK(hipMalloc(&dOData, memSize));
+  // copy host memory to device to seed the word with initial_val
+  HIP_CHECK(hipMemcpy(dOData, &hOData, memSize, hipMemcpyHostToDevice));
+
+  // execute the kernel
+  hipLaunchKernelGGL(atomic_simple, dim3(numBlocks),
+      dim3(ThreadsperBlock), 0, 0, dOData);
+
+  // Copy result from device to host, then release the device word before the
+  // check so that a failing REQUIRE cannot leak it
+  HIP_CHECK(hipMemcpy(&hOData, dOData, memSize, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipFree(dOData));
+  REQUIRE(hOData == initial_val+INCREMENT_VALUE);
+  return true;
+}
+
+// Runs atomicnoret_simple with a single thread and requires the device word to
+// end at initial_val + INCREMENT_VALUE. Returns true; failures are raised.
+template <typename T>
+bool atomictestnoret_simple(const T& initial_val) {
+  const unsigned int ThreadsperBlock = 1;
+  const unsigned int numBlocks = 1;
+  // Byte count of one T: held in size_t, never in T (T may be float or double)
+  const size_t memSize = sizeof(T);
+  // Host-side copy of the value: carries the seed in and the result out
+  T hOData = initial_val;
+  T* dOData = nullptr;
+  HIP_CHECK(hipMalloc(&dOData, memSize));
+  // copy host memory to device to seed the word with initial_val
+  HIP_CHECK(hipMemcpy(dOData, &hOData, memSize, hipMemcpyHostToDevice));
+
+  // execute the kernel
+  hipLaunchKernelGGL(atomicnoret_simple, dim3(numBlocks),
+      dim3(ThreadsperBlock), 0, 0, dOData);
+
+  // Copy result from device to host, then release the device word before the
+  // check so that a failing REQUIRE cannot leak it
+  HIP_CHECK(hipMemcpy(&hOData, dOData, memSize, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipFree(dOData));
+  REQUIRE(hOData == initial_val+INCREMENT_VALUE);
+  return true;
+}
+
+TEST_CASE("Unit_hipTestAtomicAdd") {
+  // Every helper returns true after its own checks, so REQUIRE takes the result
+  // directly.
+  SECTION("atomic tests with many waves") {
+    // atomicAdd, ten threads, one run per element type
+    REQUIRE(atomictest_manywaves<int>(INT_INITIAL_VALUE));
+    REQUIRE(atomictest_manywaves<unsigned int>(UNSIGNED_INITIAL_VALUE));
+    REQUIRE(atomictest_manywaves<float>(FLOAT_INITIAL_VALUE));
+    #if HT_AMD
+    // uint64_t and double are exercised on AMD builds only
+    REQUIRE(atomictest_manywaves<uint64_t>(LONG_INITIAL_VALUE));
+    REQUIRE(atomictest_manywaves<double>(DOUBLE_INITIAL_VALUE));
+    #endif
+  }
+  SECTION("atomic tests with many waves and no return") {
+    // atomicAddNoRet, ten threads, float only
+    REQUIRE(atomictestnoret_manywaves<float>(FLOAT_INITIAL_VALUE));
+  }
+  SECTION("simple atomic tests") {
+    // atomicAdd, single thread, one run per element type
+    REQUIRE(atomictest_simple<int>(INT_INITIAL_VALUE));
+    REQUIRE(atomictest_simple<unsigned int>(UNSIGNED_INITIAL_VALUE));
+    REQUIRE(atomictest_simple<float>(FLOAT_INITIAL_VALUE));
+    #if HT_AMD
+    // uint64_t and double are exercised on AMD builds only
+    REQUIRE(atomictest_simple<uint64_t>(LONG_INITIAL_VALUE));
+    REQUIRE(atomictest_simple<double>(DOUBLE_INITIAL_VALUE));
+    #endif
+  }
+  SECTION("Simple atomic test with no return") {
+    // atomicAddNoRet, single thread, float only
+    REQUIRE(atomictestnoret_simple<float>(FLOAT_INITIAL_VALUE));
+  }
+}
@@ -1,51 +1,51 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
-
-#define LEN 512
-#define SIZE (LEN * sizeof(int64_t))
-
-static __global__ void kernel1(int64_t* Ad) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  Ad[tid] = clock() + clock64() + __clock() + __clock64();
-}
-
-static __global__ void kernel2(int64_t* Ad) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  Ad[tid] = clock() + clock64() + __clock() + __clock64() - Ad[tid];
-}
-
-TEST_CASE("Unit_hipTestClock") {
-  int64_t *A, *Ad;
-  A = new int64_t[LEN];
-  for (unsigned i = 0; i < LEN; i++) {
-      A[i] = 0;
-  }
-  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&Ad), SIZE));
-  HIP_CHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice));
-  hipLaunchKernelGGL(kernel1, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
-  hipLaunchKernelGGL(kernel2, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
-  HIP_CHECK(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
-  for (unsigned i = 0; i < LEN; i++) {
-    assert(0 != A[i]);
-  }
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+
+#define LEN 512
+#define SIZE (LEN * sizeof(int64_t))
+
+static __global__ void kernel1(int64_t* Ad) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;  // global thread index
+  Ad[tid] = clock() + clock64() + __clock() + __clock64();  // sum of the four clock reads
+}
+
+static __global__ void kernel2(int64_t* Ad) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;  // global thread index
+  Ad[tid] = clock() + clock64() + __clock() + __clock64() - Ad[tid];  // new sum minus old
+}
+
+TEST_CASE("Unit_hipTestClock") {
+  int64_t* A = new int64_t[LEN]();  // value-initialised: all LEN entries start at 0
+  int64_t* Ad = nullptr;
+  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&Ad), SIZE));
+  HIP_CHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice));
+  // kernel1 stores a clock sum per thread; kernel2 replaces it with new sum - old
+  hipLaunchKernelGGL(kernel1, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
+  hipLaunchKernelGGL(kernel2, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
+  HIP_CHECK(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipFree(Ad));
+  bool allNonZero = true;  // folded first so both buffers are released before REQUIRE
+  for (unsigned i = 0; i < LEN; i++) { allNonZero = allNonZero && (0 != A[i]); }
+  delete[] A;
+  REQUIRE(allNonZero);  // REQUIRE, not assert: still checked when NDEBUG is defined
+}
@@ -1,88 +1,88 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_common.hh>
-
-#include "error_handling_common.hh"
-
-/**
- * @addtogroup hipDrvGetErrorName hipDrvGetErrorName
- * @{
- * @ingroup ErrorTest
- * `hipDrvGetErrorName(hipError_t hip_error)` -
- * Return hip error as text string form.
- */
-
-/**
- * Test Description
- * ------------------------
- *  - Validate that the correct string is returned for each supported
- *    device error enumeration.
- * Test source
- * ------------------------
- *  - unit/errorHandling/hipDrvGetErrorName.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 5.4
- */
-TEST_CASE("Unit_hipDrvGetErrorName_Positive_Basic") {
-  const char* error_string = nullptr;
-  const auto enumerator =
-      GENERATE(from_range(std::begin(kErrorEnumerators), std::end(kErrorEnumerators)));
-  INFO("Error: " << enumerator);
-
-  HIP_CHECK(hipDrvGetErrorName(enumerator, &error_string));
-
-  REQUIRE(error_string != nullptr);
-  REQUIRE(strcmp(error_string, ErrorName(enumerator)) == 0);
-}
-
-/**
- * Test Description
- * ------------------------
- *  - Validate handling of invalid arguments:
- *    -# When error enumerator is invalid (-1)
- *      - AMD expected output: return "hipErrorUnknown"
- *      - NVIDIA expected output: return "cudaErrorUnknown"
- *    -# When nullptr is passed as store location
- *      - Expected output: return "hipErrorInvalidValue"
- * Test source
- * ------------------------
- *  - unit/errorHandling/hipDrvGetErrorName.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 5.4
- */
-TEST_CASE("Unit_hipDrvGetErrorName_Negative_Parameters") {
-  const char* error_string = nullptr;
-  SECTION("pass unknown value to hipError") {
-    HIP_CHECK_ERROR((hipDrvGetErrorName(static_cast<hipError_t>(-1), &error_string)),
-                    hipErrorInvalidValue);
-  }
-#if HT_AMD  // segfaults on NVIDIA
-  SECTION("pass nullptr to error string") {
-    HIP_CHECK_ERROR((hipDrvGetErrorString(hipErrorInvalidValue, nullptr)), hipErrorInvalidValue);
-  }
-#endif
-}
-
-/**
-* End doxygen group ErrorTest.
-* @}
-*/
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+#include "error_handling_common.hh"
+
+/**
+ * @addtogroup hipDrvGetErrorName hipDrvGetErrorName
+ * @{
+ * @ingroup ErrorTest
+ * `hipDrvGetErrorName(hipError_t hip_error, const char** error_string)` -
+ * Return hip error as text string form.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Validate that the correct string is returned for each supported
+ *    device error enumeration.
+ * Test source
+ * ------------------------
+ *  - unit/errorHandling/hipDrvGetErrorName.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.4
+ */
+TEST_CASE("Unit_hipDrvGetErrorName_Positive_Basic") {
+  // The test body is repeated once for every entry of kErrorEnumerators
+  const auto error_code =
+      GENERATE(from_range(std::begin(kErrorEnumerators), std::end(kErrorEnumerators)));
+  INFO("Error: " << error_code);
+  const char* reported_name = nullptr;
+  HIP_CHECK(hipDrvGetErrorName(error_code, &reported_name));
+
+  REQUIRE(nullptr != reported_name);
+  REQUIRE(0 == strcmp(reported_name, ErrorName(error_code)));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Validate handling of invalid arguments:
+ *    -# When error enumerator is invalid (-1)
+ *      - Expected output: return "hipErrorInvalidValue"
+ *        (the same value is checked on AMD and NVIDIA builds)
+ *    -# When nullptr is passed as store location
+ *      - Expected output: return "hipErrorInvalidValue"
+ * Test source
+ * ------------------------
+ *  - unit/errorHandling/hipDrvGetErrorName.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.4
+ */
+TEST_CASE("Unit_hipDrvGetErrorName_Negative_Parameters") {
+  const char* error_string = nullptr;  // output slot handed to the first section's call
+  SECTION("pass unknown value to hipError") {  // -1 is not a supported enumerator
+    HIP_CHECK_ERROR((hipDrvGetErrorName(static_cast<hipError_t>(-1), &error_string)),
+                    hipErrorInvalidValue);
+  }
+#if HT_AMD  // segfaults on NVIDIA
+  SECTION("pass nullptr to error string") {  // a null output pointer must be rejected
+    HIP_CHECK_ERROR((hipDrvGetErrorName(hipErrorInvalidValue, nullptr)), hipErrorInvalidValue);
+  }
+#endif
+}
+
+/**
+* End doxygen group ErrorTest.
+* @}
+*/
@@ -1,88 +1,88 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_common.hh>
-
-#include "error_handling_common.hh"
-
-/**
- * @addtogroup hipDrvGetErrorString hipDrvGetErrorString
- * @{
- * @ingroup ErrorTest
- * `hipDrvGetErrorString(hipError_t hipError)` -
- * Return handy text string message to explain the error which occurred.
- */
-
-/**
- * Test Description
- * ------------------------
- *  - Validate that the correct string is returned for each supported
- *    device error enumeration.
- * Test source
- * ------------------------
- *  - unit/errorHandling/hipDrvGetErrorString.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 5.4
- */
-TEST_CASE("Unit_hipDrvGetErrorString_Positive_Basic") {
-  const char* error_string = nullptr;
-  const auto enumerator =
-      GENERATE(from_range(std::begin(kErrorEnumerators), std::end(kErrorEnumerators)));
-  INFO("Error: " << enumerator);
-
-  HIP_CHECK(hipDrvGetErrorString(enumerator, &error_string));
-
-  REQUIRE(error_string != nullptr);
-  REQUIRE(strcmp(error_string, ErrorString(enumerator)) == 0);
-}
-
-/**
- * Test Description
- * ------------------------
- *  - Validate handling of invalid arguments:
- *    -# When error enumerator is invalid (-1)
- *      - Expected output: return "hipErrorInvalidValue"
- *    -# When nullptr is passed as store location
- *      - Expected output: return "hipErrorInvalidValue"
- * Test source
- * ------------------------
- *  - unit/errorHandling/hipDrvGetErrorString.cc
- * Test requirements
- * ------------------------
- *  - HIP_VERSION >= 5.4
- */
-TEST_CASE("Unit_hipDrvGetErrorString_Negative_Parameters") {
-  const char* error_string = nullptr;
-  SECTION("pass unknown value to hipError") {
-    HIP_CHECK_ERROR((hipDrvGetErrorString(static_cast<hipError_t>(-1), &error_string)),
-                    hipErrorInvalidValue);
-  }
-#if HT_AMD  // segfaults on NVIDIA
-  SECTION("pass nullptr to error string") {
-    HIP_CHECK_ERROR((hipDrvGetErrorString(static_cast<hipError_t>(0), nullptr)),
-                    hipErrorInvalidValue);
-  }
-#endif
-}
-
-/**
-* End doxygen group ErrorTest.
-* @}
-*/
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_common.hh>
+
+#include "error_handling_common.hh"
+
+/**
+ * @addtogroup hipDrvGetErrorString hipDrvGetErrorString
+ * @{
+ * @ingroup ErrorTest
+ * `hipDrvGetErrorString(hipError_t hipError, const char** error_string)` -
+ * Return handy text string message to explain the error which occurred.
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Validate that the correct string is returned for each supported
+ *    device error enumeration.
+ * Test source
+ * ------------------------
+ *  - unit/errorHandling/hipDrvGetErrorString.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.4
+ */
+TEST_CASE("Unit_hipDrvGetErrorString_Positive_Basic") {
+  // The test body is repeated once for every entry of kErrorEnumerators
+  const auto error_code =
+      GENERATE(from_range(std::begin(kErrorEnumerators), std::end(kErrorEnumerators)));
+  INFO("Error: " << error_code);
+  const char* reported_text = nullptr;
+  HIP_CHECK(hipDrvGetErrorString(error_code, &reported_text));
+
+  REQUIRE(nullptr != reported_text);
+  REQUIRE(0 == strcmp(reported_text, ErrorString(error_code)));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Validate handling of invalid arguments:
+ *    -# When error enumerator is invalid (-1)
+ *      - Expected output: return "hipErrorInvalidValue"
+ *    -# When nullptr is passed as store location
+ *      - Expected output: return "hipErrorInvalidValue"
+ * Test source
+ * ------------------------
+ *  - unit/errorHandling/hipDrvGetErrorString.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.4
+ */
+TEST_CASE("Unit_hipDrvGetErrorString_Negative_Parameters") {
+  SECTION("pass unknown value to hipError") {
+    const char* message = nullptr;
+    const auto bogus_code = static_cast<hipError_t>(-1);  // the invalid enumerator case
+    HIP_CHECK_ERROR((hipDrvGetErrorString(bogus_code, &message)), hipErrorInvalidValue);
+  }
+#if HT_AMD  // segfaults on NVIDIA
+  SECTION("pass nullptr to error string") {
+    const auto zero_code = static_cast<hipError_t>(0);  // only the null pointer is at fault
+    HIP_CHECK_ERROR((hipDrvGetErrorString(zero_code, nullptr)), hipErrorInvalidValue);
+  }
+#endif
+}
+
+/**
+* End doxygen group ErrorTest.
+* @}
+*/
@@ -1,19 +1,19 @@
-# AMD specific test
-if(HIP_PLATFORM MATCHES "amd")
-if(UNIX)
-set(TEST_SRC
-    hipMalloc.cc
-)
-# Creating Custom object file
-add_custom_target(malloc_custom COMMAND g++ -c ${CMAKE_CURRENT_SOURCE_DIR}/hipMalloc.cpp -I${HIP_PATH}/include -D__HIP_PLATFORM_AMD__ -o malloc.o BYPRODUCTS malloc.o)
-add_library(malloc_gpp OBJECT IMPORTED)
-set_property(TARGET malloc_gpp PROPERTY IMPORTED_OBJECTS "${CMAKE_CURRENT_BINARY_DIR}/malloc.o")
-
-hip_add_exe_to_target(NAME gppTests
-                      TEST_SRC ${TEST_SRC}
-                      TEST_TARGET_NAME build_tests
-                      LINKER_LIBS malloc_gpp)
-
-add_dependencies(gppTests malloc_custom)
-endif()
-endif()
+# AMD specific test (Unix only): links a test executable against an object built by g++
+if(HIP_PLATFORM MATCHES "amd")
+if(UNIX)
+set(TEST_SRC
+    hipMalloc.cc
+)
+# Creating Custom object file: g++ compiles hipMalloc.cpp into malloc.o
+add_custom_target(malloc_custom COMMAND g++ -c ${CMAKE_CURRENT_SOURCE_DIR}/hipMalloc.cpp -I${HIP_PATH}/include -D__HIP_PLATFORM_AMD__ -o malloc.o BYPRODUCTS malloc.o)
+add_library(malloc_gpp OBJECT IMPORTED)  # imported wrapper around malloc.o
+set_property(TARGET malloc_gpp PROPERTY IMPORTED_OBJECTS "${CMAKE_CURRENT_BINARY_DIR}/malloc.o")
+
+hip_add_exe_to_target(NAME gppTests
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests
+                      LINKER_LIBS malloc_gpp)
+
+add_dependencies(gppTests malloc_custom)  # malloc.o must exist before gppTests is built
+endif()  # UNIX
+endif()  # HIP_PLATFORM amd
@@ -1,54 +1,54 @@
-/*
- * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- * */
-
-#include <hip_test_common.hh>
- 
-#include "hipMalloc.h"
-/**
- * @addtogroup hipMalloc hipMalloc
- * @{
- * @ingroup MemoryTest
- * `hipError_t hipMalloc(void** ptr, size_t size)` -
- * Allocate memory on the default accelerator.
- * @}
- */
-
-/**
- * Test Description
- * ------------------------
- *    - Allocate memory by using hipMalloc API and verify hipSuccess is returned.
-
- * Test source
- * ------------------------
- *    - catch/unit/g++/hipMalloc.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.6
- */
-
-TEST_CASE("Unit_hipMalloc_gpptest") {
-  printf("calling cpp function from here\n");
-  int result = MallocFunc();
-  REQUIRE(result == 1);
-}
-
-/**
-* End doxygen group MemoryTest.
-* @}
-*/
+/*
+ * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * */
+
+#include <hip_test_common.hh>
+
+#include "hipMalloc.h"
+/**
+ * @addtogroup hipMalloc hipMalloc
+ * @{
+ * @ingroup MemoryTest
+ * `hipError_t hipMalloc(void** ptr, size_t size)` -
+ * Allocate memory on the default accelerator.
+ * @}
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Allocate memory by using hipMalloc API and verify hipSuccess is returned.
+ *
+ * Test source
+ * ------------------------
+ *    - catch/unit/g++/hipMalloc.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ */
+
+TEST_CASE("Unit_hipMalloc_gpptest") {
+  printf("calling cpp function from here\n");
+  const int status = MallocFunc();  // the test passes only when this yields 1
+  REQUIRE(1 == status);
+}
+
+/**
+* End doxygen group MemoryTest.
+* @}
+*/
@@ -1,22 +1,22 @@
-/*
- * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- * */
-
-#include <iostream>
-
+/*
+ * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * */
+
+#include <iostream>
+
 extern int MallocFunc();
@@ -1,28 +1,28 @@
-# Common Tests - Test independent of all platforms
-if(HIP_PLATFORM MATCHES "amd")
-if(UNIX)
-set(TEST_SRC
-    gccTest.cc
-    gpu.cpp
-)
-# Creating Custom object file
-add_custom_command(OUTPUT LaunchKernel.o COMMAND gcc -c ${CMAKE_CURRENT_SOURCE_DIR}/LaunchKernel.c -I${HIP_PATH}/include -D__HIP_PLATFORM_AMD__ -o LaunchKernel.o)
-add_custom_target(LaunchKernel_custom DEPENDS LaunchKernel.o)
-add_custom_command(OUTPUT hipMalloc.o COMMAND gcc -c ${CMAKE_CURRENT_SOURCE_DIR}/hipMalloc.c -I${HIP_PATH}/include -D__HIP_PLATFORM_AMD__ -o hipMalloc.o)
-add_custom_target(hipMalloc_custom DEPENDS hipMalloc.o)
-
-add_library(LaunchKernel_lib OBJECT IMPORTED)
-add_library(hipMalloc_lib OBJECT IMPORTED)
-
-set_property(TARGET LaunchKernel_lib PROPERTY IMPORTED_OBJECTS "${CMAKE_CURRENT_BINARY_DIR}/LaunchKernel.o")
-set_property(TARGET hipMalloc_lib PROPERTY IMPORTED_OBJECTS "${CMAKE_CURRENT_BINARY_DIR}/hipMalloc.o")
-
-
-hip_add_exe_to_target(NAME gccTests
-                      TEST_SRC ${TEST_SRC}
-                      TEST_TARGET_NAME build_tests
-		              LINKER_LIBS LaunchKernel_lib hipMalloc_lib)
-
-add_dependencies(gccTests LaunchKernel_custom hipMalloc_custom)
-endif()
-endif()
+# gcc interop tests: built only for the AMD platform on Unix (see the guards below)
+if(HIP_PLATFORM MATCHES "amd")
+if(UNIX)
+set(TEST_SRC
+    gccTest.cc
+    gpu.cpp
+)
+# Creating Custom object file: gcc compiles each C source into its own .o
+add_custom_command(OUTPUT LaunchKernel.o COMMAND gcc -c ${CMAKE_CURRENT_SOURCE_DIR}/LaunchKernel.c -I${HIP_PATH}/include -D__HIP_PLATFORM_AMD__ -o LaunchKernel.o)
+add_custom_target(LaunchKernel_custom DEPENDS LaunchKernel.o)  # target that drives the command
+add_custom_command(OUTPUT hipMalloc.o COMMAND gcc -c ${CMAKE_CURRENT_SOURCE_DIR}/hipMalloc.c -I${HIP_PATH}/include -D__HIP_PLATFORM_AMD__ -o hipMalloc.o)
+add_custom_target(hipMalloc_custom DEPENDS hipMalloc.o)  # target that drives the command
+
+add_library(LaunchKernel_lib OBJECT IMPORTED)  # imported wrapper around LaunchKernel.o
+add_library(hipMalloc_lib OBJECT IMPORTED)  # imported wrapper around hipMalloc.o
+
+set_property(TARGET LaunchKernel_lib PROPERTY IMPORTED_OBJECTS "${CMAKE_CURRENT_BINARY_DIR}/LaunchKernel.o")
+set_property(TARGET hipMalloc_lib PROPERTY IMPORTED_OBJECTS "${CMAKE_CURRENT_BINARY_DIR}/hipMalloc.o")
+
+
+hip_add_exe_to_target(NAME gccTests
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests
+		              LINKER_LIBS LaunchKernel_lib hipMalloc_lib)
+
+add_dependencies(gccTests LaunchKernel_custom hipMalloc_custom)  # objects first, then gccTests
+endif()  # UNIX
+endif()  # HIP_PLATFORM amd
@@ -1,64 +1,64 @@
-/*
- * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- * */
-
-#include <hip_test_common.hh>
- 
-extern "C" {
-#include "LaunchKernel.h"
-}
-
-/**
- * Test Description
- * ------------------------
- *    - calling launchKernel which is c function from catch2
- * and compile with gcc compiler and verify the results.
-
- * Test source
- * ------------------------
- *    - catch/unit/gcc/gccTest.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.6
- */
-
-TEST_CASE("Unit_LaunchKernelgccTests") {
-  printf("Calling launchKernel files from here\n");
-  int result = launchKernel();
-  REQUIRE(result == 1);
-}
-
-/**
- * Test Description
- * ------------------------
- *    - Calling hipMalloc which is c file from catch2 and compile
- * with gcc compiler and verify the results.
-
- * Test source
- * ------------------------
- *    - catch/unit/gcc/gccTest.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.6
- */
-
-TEST_CASE("Unit_hipMallocgccTests") {
-  printf("Calling hipMalloc files from here\n");
-  int result = hipMallocfunc();
-  REQUIRE(result == 1);
-}
+/*
+ * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * */
+
+#include <hip_test_common.hh>
+
+extern "C" {
+#include "LaunchKernel.h"
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Calls launchKernel(), a C function compiled with gcc, from a Catch2
+ * test case and verifies the result.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/gcc/gccTest.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ */
+
+TEST_CASE("Unit_LaunchKernelgccTests") {
+  printf("Calling launchKernel files from here\n");
+  const int status = launchKernel();
+  REQUIRE(status == 1);  // 1 is the value this test treats as success
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Calls hipMallocfunc(), from a C file compiled with gcc, in a Catch2
+ * test case and verifies the result.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/gcc/gccTest.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ */
+
+TEST_CASE("Unit_hipMallocgccTests") {
+  printf("Calling hipMalloc files from here\n");
+  const int status = hipMallocfunc();
+  REQUIRE(status == 1);  // 1 is the value this test treats as success
+}
@@ -1,176 +1,176 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
- 
-
-#pragma clang diagnostic ignored "-Wunused-parameter"
-
-unsigned threadsPerBlock = 256;
-
-template <unsigned batch, typename T>
-__device__ void sum(T* sdata, unsigned groupElements, unsigned tid) {
-  T tmp;
-  if (groupElements < batch)
-      return;
-  // sdata[tid] += sdata[tid - batch/2] does not work when block size is
-  // greater than wave size because one wave may complete before another
-  // wave.
-  if (tid >= batch/2 && tid < groupElements)
-      tmp = sdata[tid - batch/2];
-  __syncthreads();
-  if (tid >= batch/2 && tid < groupElements)
-      sdata[tid] += tmp;
-  __syncthreads();
-}
-
-template <typename T>
-__global__ void testExternSharedKernel(const T* A_d, const T* B_d, T* C_d,
-                                    size_t numElements, size_t groupElements) {
-  // declare dynamic shared memory
-  extern __shared__ double sdata0[];
-  T* sdata = reinterpret_cast<T *>(sdata0);
-
-  size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
-  size_t tid = threadIdx.x;
-
-  // initialize dynamic shared memory
-  if (tid < groupElements) {
-      sdata[tid] = static_cast<T>(tid);
-  }
-  __syncthreads();
-
-  // prefix sum inside dynamic shared memory
-  sum<512>(sdata, groupElements, tid);
-  sum<256>(sdata, groupElements, tid);
-  sum<128>(sdata, groupElements, tid);
-  sum<64>(sdata, groupElements, tid);
-  sum<32>(sdata, groupElements, tid);
-  sum<16>(sdata, groupElements, tid);
-  sum<8>(sdata, groupElements, tid);
-  sum<4>(sdata, groupElements, tid);
-  sum<2>(sdata, groupElements, tid);
-  C_d[gid] = A_d[gid] + B_d[gid] + sdata[tid % groupElements];
-}
-
-template <typename T>
-void testExternShared(size_t N, unsigned groupElements) {
-  size_t Nbytes = N * sizeof(T);
-
-  T *A_d, *B_d, *C_d;
-  T *A_h, *B_h, *C_h;
-
-  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
-  unsigned blocks = N/threadsPerBlock;
-  assert(N == blocks * threadsPerBlock);
-
-  HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-
-  // calculate the amount of dynamic shared memory required
-  size_t groupMemBytes = groupElements * sizeof(T);
-
-  // launch kernel with dynamic shared memory
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(testExternSharedKernel<T>), dim3(blocks),
-                     dim3(threadsPerBlock), groupMemBytes, 0, A_d, B_d, C_d,
-                     N, groupElements);
-
-  HIP_CHECK(hipDeviceSynchronize());
-  HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-
-  // verify
-  for (size_t i = 0; i < N; ++i) {
-    size_t tid = (i % min(threadsPerBlock, groupElements));
-    T sumFromSharedMemory = static_cast<T>(tid * (tid + 1) / 2);
-    T expected = A_h[i] + B_h[i] + sumFromSharedMemory;
-    REQUIRE(C_h[i] == expected);
-  }
-  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
-}
-
-/**
-* @addtogroup hipLaunchKernelGGL hipLaunchKernelGGL
-* @{
-* @ingroup KernelTest
-* `void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
-   std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` -
-* Method to invocate kernel functions
-*/
-
-/**
- * Test Description
- * ------------------------
- *    - launch kernel with dynamic shared memory for float and double
- * datatypes and verify the results.
-
- * Test source
- * ------------------------
- *    - catch/unit/kernel/hipDynamicShared.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.5
- */
-
-TEST_CASE("Unit_hipDynamicShared") {
-  SECTION("test case with float for least size") {
-    testExternShared<float>(1024, 4);
-    testExternShared<float>(1024, 8);
-    testExternShared<float>(1024, 16);
-    testExternShared<float>(1024, 32);
-    testExternShared<float>(1024, 64);
-  }
-
-  SECTION("test case with float for max size") {
-    testExternShared<float>(65536, 4);
-    testExternShared<float>(65536, 8);
-    testExternShared<float>(65536, 16);
-    testExternShared<float>(65536, 32);
-    testExternShared<float>(65536, 64);
-  }
-
-  SECTION("test case with double for least size") {
-    testExternShared<double>(1024, 4);
-    testExternShared<double>(1024, 8);
-    testExternShared<double>(1024, 16);
-    testExternShared<double>(1024, 32);
-    testExternShared<double>(1024, 64);
-  }
-
-  SECTION("test case with double for max size") {
-    testExternShared<double>(65536, 4);
-    testExternShared<double>(65536, 8);
-    testExternShared<double>(65536, 16);
-    testExternShared<double>(65536, 32);
-    testExternShared<double>(65536, 64);
-  }
-
-  SECTION("test case with float for max LDS size") {
-    int maxLDS = 0;
-    HIP_CHECK(hipDeviceGetAttribute(&maxLDS,
-                                  hipDeviceAttributeMaxSharedMemoryPerBlock, 0));
-    testExternShared<float>(1024, maxLDS/sizeof(float));
-  }
-}
-
-/**
-* End doxygen group KernelTest.
-* @}
-*/
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+
+
+#pragma clang diagnostic ignored "-Wunused-parameter"
+
+unsigned threadsPerBlock = 256;  // block size used for the kernel launch in this file
+
+template <unsigned batch, typename T>
+__device__ void sum(T* sdata, unsigned groupElements, unsigned tid) {
+  // One step of the shared-memory prefix sum; skipped for groups below batch.
+  if (groupElements < batch) return;
+  constexpr unsigned stride = batch / 2;
+  const bool active = (tid >= stride) && (tid < groupElements);
+  // Read, barrier, then write: sdata[tid] += sdata[tid - stride] directly does
+  // not work when block size is greater than wave size because one wave may
+  // complete before another wave.
+  T partner;
+  if (active) partner = sdata[tid - stride];
+  __syncthreads();
+  if (active) sdata[tid] += partner;
+  __syncthreads();
+}
+
+template <typename T>
+__global__ void testExternSharedKernel(const T* A_d, const T* B_d, T* C_d,
+                                    size_t numElements, size_t groupElements) {
+  // dynamic shared memory: declared as double, then viewed as an array of T
+  extern __shared__ double sdata0[];
+  T* sdata = reinterpret_cast<T *>(sdata0);
+
+  size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t tid = threadIdx.x;
+
+  // threads below groupElements store their own index into shared memory
+  if (tid < groupElements) {
+      sdata[tid] = static_cast<T>(tid);
+  }
+  __syncthreads();
+
+  // prefix sum in shared memory; sum<batch> is a no-op if groupElements < batch
+  sum<512>(sdata, groupElements, tid);
+  sum<256>(sdata, groupElements, tid);
+  sum<128>(sdata, groupElements, tid);
+  sum<64>(sdata, groupElements, tid);
+  sum<32>(sdata, groupElements, tid);
+  sum<16>(sdata, groupElements, tid);
+  sum<8>(sdata, groupElements, tid);
+  sum<4>(sdata, groupElements, tid);
+  sum<2>(sdata, groupElements, tid);
+  C_d[gid] = A_d[gid] + B_d[gid] + sdata[tid % groupElements];
+}
+
+template <typename T>
+void testExternShared(size_t N, unsigned groupElements) {
+  // Runs testExternSharedKernel<T> on N elements with room for groupElements
+  // values of T in dynamic shared memory, then checks every output on the host.
+  size_t Nbytes = N * sizeof(T);
+  T *A_d, *B_d, *C_d;
+  T *A_h, *B_h, *C_h;
+
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
+  unsigned blocks = N/threadsPerBlock;
+  REQUIRE(N == blocks * threadsPerBlock);  // unlike assert, kept under NDEBUG
+
+  HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
+
+  // calculate the amount of dynamic shared memory required
+  size_t groupMemBytes = groupElements * sizeof(T);
+  // launch kernel with dynamic shared memory
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(testExternSharedKernel<T>), dim3(blocks),
+                     dim3(threadsPerBlock), groupMemBytes, 0, A_d, B_d, C_d,
+                     N, groupElements);
+  HIP_CHECK(hipGetLastError());  // report a failed launch right away
+  HIP_CHECK(hipDeviceSynchronize());
+  HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+
+  // verify: the shared-memory term is 0 + 1 + ... + tid
+  for (size_t i = 0; i < N; ++i) {
+    size_t tid = (i % min(threadsPerBlock, groupElements));
+    T sumFromSharedMemory = static_cast<T>(tid * (tid + 1) / 2);
+    T expected = A_h[i] + B_h[i] + sumFromSharedMemory;
+    REQUIRE(C_h[i] == expected);
+  }
+  HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
+}
+
+/**
+* @addtogroup hipLaunchKernelGGL hipLaunchKernelGGL
+* @{
+* @ingroup KernelTest
+* `void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+   std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` -
+* Method to invoke kernel functions
+*/
+
+/**
+ * Test Description
+ * ------------------------
+ *    - launch kernel with dynamic shared memory for float and double
+ * datatypes and verify the results.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipDynamicShared.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ */
+
+TEST_CASE("Unit_hipDynamicShared") {
+  // Shared-memory group sizes (in elements) used by the fixed-size sections.
+  const unsigned groupSizes[] = {4, 8, 16, 32, 64};
+
+  // float, N = 1024
+  SECTION("test case with float for least size") {
+    for (unsigned groupSize : groupSizes) {
+      testExternShared<float>(1024, groupSize);
+    }
+  }
+
+  // float, N = 65536
+  SECTION("test case with float for max size") {
+    for (unsigned groupSize : groupSizes) {
+      testExternShared<float>(65536, groupSize);
+    }
+  }
+
+  // double, N = 1024
+  SECTION("test case with double for least size") {
+    for (unsigned groupSize : groupSizes) {
+      testExternShared<double>(1024, groupSize);
+    }
+  }
+
+  // double, N = 65536
+  SECTION("test case with double for max size") {
+    for (unsigned groupSize : groupSizes) {
+      testExternShared<double>(65536, groupSize);
+    }
+  }
+
+  // float, group sized from device 0's MaxSharedMemoryPerBlock attribute
+  SECTION("test case with float for max LDS size") {
+    int ldsBytes = 0;
+    HIP_CHECK(hipDeviceGetAttribute(&ldsBytes,
+                                  hipDeviceAttributeMaxSharedMemoryPerBlock, 0));
+    testExternShared<float>(1024, ldsBytes/sizeof(float));
+  }
+}
+
+/**
+* End doxygen group KernelTest.
+* @}
+*/
@@ -1,94 +1,94 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
- 
-
-#define LEN  (16 * 1024)
-#define SIZE (LEN * sizeof(float))
-
-__global__ void vectorAdd(float* Ad, float* Bd) {
-  extern __shared__ float sBd[];
-  int tx = threadIdx.x;
-  for (int i = 0; i < LEN / 64; i++) {
-    sBd[tx + i * 64] = Ad[tx + i * 64] + 1.0f;
-    Bd[tx + i * 64] = sBd[tx + i * 64];
-  }
-}
-
-/**
-* @addtogroup hipLaunchKernelGGL hipLaunchKernelGGL
-* @{
-* @ingroup KernelTest
-* `void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
-   std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` -
-* Method to invocate kernel functions
-*/
-
-/**
- * Test Description
- * ------------------------
- *    - Assign max dynamic shared memory to kernel function and 
- * verify the results.
-
- * Test source
- * ------------------------
- *    - catch/unit/kernel/hipDynamicShared2.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.5
- */
-
-TEST_CASE("Unit_hipDynamicShared2") {
-  float *A, *B, *Ad, *Bd;
-  A = new float[LEN];
-  B = new float[LEN];
-  for (int i = 0; i < LEN; i++) {
-      A[i] = 1.0f;
-      B[i] = 1.0f;
-  }
-  HIP_CHECK(hipMalloc(&Ad, SIZE));
-  HIP_CHECK(hipMalloc(&Bd, SIZE));
-  HIP_CHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice));
-  HIP_CHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
-
-  hipError_t ret = hipFuncSetAttribute(
-      reinterpret_cast<const void*>(&vectorAdd),
-      hipFuncAttributeMaxDynamicSharedMemorySize, SIZE);
-
-  REQUIRE(ret == hipSuccess);
-  hipLaunchKernelGGL(vectorAdd, dim3(1, 1, 1), dim3(64, 1, 1), SIZE, 0, Ad, Bd);
-  HIP_CHECK(hipGetLastError());
-  HIP_CHECK(hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost));
-  for (int i = 0; i < LEN; i++) {
-    assert(B[i] > 1.0f && B[i] < 3.0f);
-  }
-  HIP_CHECK(hipFree(Ad));
-  HIP_CHECK(hipFree(Bd));
-
-  delete[] A;
-  delete[] B;
-}
-
-/**
-* End doxygen group KernelTest.
-* @}
-*/
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+
+
+#define LEN  (16 * 1024)  // number of floats in each buffer
+#define SIZE (LEN * sizeof(float))  // size of one buffer in bytes
+
+__global__ void vectorAdd(float* Ad, float* Bd) {
+  extern __shared__ float sBd[];  // dynamic shared staging buffer
+  for (int chunk = 0; chunk < LEN / 64; ++chunk) {
+    const int idx = static_cast<int>(threadIdx.x) + chunk * 64;  // this thread's element
+    sBd[idx] = Ad[idx] + 1.0f;
+    Bd[idx] = sBd[idx];
+  }
+}
+
+/**
+* @addtogroup hipLaunchKernelGGL hipLaunchKernelGGL
+* @{
+* @ingroup KernelTest
+* `void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+   std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` -
+* Method to invoke kernel functions
+*/
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Raise the kernel's maximum dynamic shared memory size, launch it, and
+ * verify the results.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipDynamicShared2.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ */
+
+TEST_CASE("Unit_hipDynamicShared2") {
+  float *A, *B, *Ad, *Bd;
+  A = new float[LEN];
+  B = new float[LEN];
+  for (int i = 0; i < LEN; i++) {
+      A[i] = 1.0f;
+      B[i] = 1.0f;
+  }
+  HIP_CHECK(hipMalloc(&Ad, SIZE));
+  HIP_CHECK(hipMalloc(&Bd, SIZE));
+  HIP_CHECK(hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice));
+
+  // Let vectorAdd use SIZE bytes of dynamic shared memory.
+  hipError_t ret = hipFuncSetAttribute(
+      reinterpret_cast<const void*>(&vectorAdd),
+      hipFuncAttributeMaxDynamicSharedMemorySize, SIZE);
+  REQUIRE(ret == hipSuccess);
+  hipLaunchKernelGGL(vectorAdd, dim3(1, 1, 1), dim3(64, 1, 1), SIZE, 0, Ad, Bd);
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost));
+  // REQUIRE (not assert) so the check survives NDEBUG; extra parens for &&.
+  for (int i = 0; i < LEN; i++) {
+    REQUIRE((B[i] > 1.0f && B[i] < 3.0f));
+  }
+  HIP_CHECK(hipFree(Ad));
+  HIP_CHECK(hipFree(Bd));
+  delete[] A;
+  delete[] B;
+}
+
+/**
+* End doxygen group KernelTest.
+* @}
+*/
@@ -1,59 +1,59 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
- 
-
-#pragma clang diagnostic ignored "-Wunused-parameter"
-
-__global__ void Empty(int param) {}
-
-/**
-* @addtogroup hipLaunchKernelGGL hipLaunchKernelGGL
-* @{
-* @ingroup KernelTest
-* `void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
-   std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` -
-* Method to invocate kernel functions
-*/
-
-/**
- * Test Description
- * ------------------------
- *    - pass empty Kernel function.
-
- * Test source
- * ------------------------
- *    - catch/unit/kernel/hipEmptyKernel.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.5
- */
-
-TEST_CASE("Unit_hipEmptyKernel") {
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(Empty), dim3(1), dim3(1), 0, 0, 0);
-    HIP_CHECK(hipDeviceSynchronize());
-}
-
-/**
-* End doxygen group KernelTest.
-* @}
-*/
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+
+
+#pragma clang diagnostic ignored "-Wunused-parameter"
+
+__global__ void Empty(int param) {}  // kernel with an empty body; param is never read
+
+/**
+* @addtogroup hipLaunchKernelGGL hipLaunchKernelGGL
+* @{
+* @ingroup KernelTest
+* `void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+   std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` -
+* Method to invoke kernel functions
+*/
+
+/**
+ * Test Description
+ * ------------------------
+ *    - pass empty Kernel function.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipEmptyKernel.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ */
+
+TEST_CASE("Unit_hipEmptyKernel") {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Empty), dim3(1), dim3(1), 0, 0, 0);  // 1 block x 1 thread, no shared memory, stream 0, param = 0
+    HIP_CHECK(hipDeviceSynchronize());  // wait for the launch to finish
+}
+
+/**
+* End doxygen group KernelTest.
+* @}
+*/
@@ -1,138 +1,138 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-// Test the Grid_Launch syntax.
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
- 
-#include "hip/hip_ext.h"
-
-static unsigned threadsPerBlock = 256;
-static unsigned blocksPerCU = 6;
-
-struct _t {
-    double _a, _b, _c, _d, _e, _f, _g, _h, _i, _j;
-};
-
-typedef struct _t _T;
-
-__global__ void sKernel(_T s, double *a) {
-    *a = s._a + s._b + s._c + s._d + s._e + s._f + s._g + s._h + s._i + s._j;
-}
-
-__global__ void mKernel(char f, int16_t a, int b, double c,
-                        int16_t d, int e, double* res) {
-    *res = a + b + c + d + e + f;
-}
-
-void testMixData() {
-  double m = 0;
-  double *d_m;
-  HIP_CHECK(hipMalloc(&d_m, sizeof(double)));
-  int a = 1, e = 10;
-  int16_t b = 2, d = 4;
-  double c = 3.0;
-  char ff = 10;
-  hipExtLaunchKernelGGL(mKernel, 1, 1, 0, 0, nullptr, nullptr, 0, ff,
-                         b, a, c, d, e, d_m);
-  HIP_CHECK(hipMemcpy(&m, d_m, sizeof(double), hipMemcpyDeviceToHost));
-  REQUIRE(m == 30.0);
-  HIP_CHECK(hipFree(d_m));
-}
-
-void testStruct() {
-  double m = 0;
-  double *d_m;
-  HIP_CHECK(hipMalloc(&d_m, sizeof(double)));
-  _T s{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
-  hipExtLaunchKernelGGL(sKernel, 1, 1, 0, 0, nullptr, nullptr, 0, s, d_m);
-  HIP_CHECK(hipMemcpy(&m, d_m, sizeof(double), hipMemcpyDeviceToHost));
-  REQUIRE(m == 55.0);
-  HIP_CHECK(hipFree(d_m));
-}
-
-void test(size_t N) {
-  size_t Nbytes = N * sizeof(int);
-  int *A_d, *B_d, *C_d;
-  int *A_h, *B_h, *C_h;
-
-  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);
-
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
-
-  HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-
-  hipExtLaunchKernelGGL(HipTest::vectorADD, dim3(blocks),
-                        dim3(threadsPerBlock), 0, 0, nullptr, nullptr, 0,
-                        static_cast<const int*>(A_d),
-                        static_cast<const int*>(B_d), C_d, N);
-
-  HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-  HIP_CHECK(hipDeviceSynchronize());
-  HipTest::checkVectorADD(A_h, B_h, C_h, N);
-}
-
-/**
-* @addtogroup hipExtLaunchKernelGGL hipExtLaunchKernelGGL
-* @{
-* @ingroup KernelTest
-* `void hipExtLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
-                              std::uint32_t sharedMemBytes, hipStream_t stream,
-                              hipEvent_t startEvent, hipEvent_t stopEvent, std::uint32_t flags,
-                              Args... args)` -
-* Launches kernel with dimention parameters and shared memory on stream with templated kernel and arguments
-*/
-
-/**
- * Test Description
- * ------------------------
- *    - Test case to verify sample array with hipExtLaunchKernelGGL()
- * and verify the results.
- *    - Test case to verify struct data with hipExtLaunchKernelGGL()
- * and verify the results.
- *    - Test case to verify mix datatypes with hipExtLaunchKernelGGL()
- * and verify the results.
-
- * Test source
- * ------------------------
- *    - catch/unit/kernel/hipExtLaunchKernelGGL.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.5
- */
-
-TEST_CASE("Unit_hipExtLaunchKernelGGL") {
-  SECTION("test run") {
-    size_t N = 4 * 1024 * 1024;
-    test(N);
-  }
-  SECTION("testStruct run") {
-    testStruct();
-  }
-  SECTION("testMixData run") {
-    testMixData();
-  }
-}
-
-/**
-* End doxygen group KernelTest.
-* @}
-*/
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Test the hipExtLaunchKernelGGL launch syntax.
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+ 
+#include "hip/hip_ext.h"
+
+static unsigned threadsPerBlock = 256;
+static unsigned blocksPerCU = 6;
+
+struct _t {
+    double _a, _b, _c, _d, _e, _f, _g, _h, _i, _j;
+};
+
+typedef struct _t _T;
+
+__global__ void sKernel(_T s, double *a) {
+    *a = s._a + s._b + s._c + s._d + s._e + s._f + s._g + s._h + s._i + s._j;
+}
+
+__global__ void mKernel(char f, int16_t a, int b, double c,
+                        int16_t d, int e, double* res) {
+    *res = a + b + c + d + e + f;
+}
+
+void testMixData() {
+  double m = 0;
+  double *d_m;
+  HIP_CHECK(hipMalloc(&d_m, sizeof(double)));
+  int a = 1, e = 10;
+  int16_t b = 2, d = 4;
+  double c = 3.0;
+  char ff = 10;
+  hipExtLaunchKernelGGL(mKernel, 1, 1, 0, 0, nullptr, nullptr, 0, ff,
+                         b, a, c, d, e, d_m);
+  HIP_CHECK(hipMemcpy(&m, d_m, sizeof(double), hipMemcpyDeviceToHost));
+  REQUIRE(m == 30.0);
+  HIP_CHECK(hipFree(d_m));
+}
+
+void testStruct() {
+  double m = 0;
+  double *d_m;
+  HIP_CHECK(hipMalloc(&d_m, sizeof(double)));
+  _T s{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  hipExtLaunchKernelGGL(sKernel, 1, 1, 0, 0, nullptr, nullptr, 0, s, d_m);
+  HIP_CHECK(hipMemcpy(&m, d_m, sizeof(double), hipMemcpyDeviceToHost));
+  REQUIRE(m == 55.0);
+  HIP_CHECK(hipFree(d_m));
+}
+
+void test(size_t N) {
+  size_t Nbytes = N * sizeof(int);
+  int *A_d, *B_d, *C_d;
+  int *A_h, *B_h, *C_h;
+
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
+
+  hipExtLaunchKernelGGL(HipTest::vectorADD, dim3(blocks),
+                        dim3(threadsPerBlock), 0, 0, nullptr, nullptr, 0,
+                        static_cast<const int*>(A_d),
+                        static_cast<const int*>(B_d), C_d, N);
+
+  HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+  HipTest::checkVectorADD(A_h, B_h, C_h, N);
+}
+
+/**
+* @addtogroup hipExtLaunchKernelGGL hipExtLaunchKernelGGL
+* @{
+* @ingroup KernelTest
+* `void hipExtLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+                              std::uint32_t sharedMemBytes, hipStream_t stream,
+                              hipEvent_t startEvent, hipEvent_t stopEvent, std::uint32_t flags,
+                              Args... args)` -
+* Launches kernel with dimension parameters and shared memory on stream with templated kernel and arguments
+*/
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Test case to verify sample array with hipExtLaunchKernelGGL()
+ * and verify the results.
+ *    - Test case to verify struct data with hipExtLaunchKernelGGL()
+ * and verify the results.
+ *    - Test case to verify mix datatypes with hipExtLaunchKernelGGL()
+ * and verify the results.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipExtLaunchKernelGGL.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ */
+
+TEST_CASE("Unit_hipExtLaunchKernelGGL") {
+  SECTION("test run") {
+    size_t N = 4 * 1024 * 1024;
+    test(N);
+  }
+  SECTION("testStruct run") {
+    testStruct();
+  }
+  SECTION("testMixData run") {
+    testMixData();
+  }
+}
+
+/**
+* End doxygen group KernelTest.
+* @}
+*/
@@ -1,122 +1,122 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-// Test the Grid_Launch syntax.
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
- 
-
-static unsigned threadsPerBlock = 256;
-static unsigned blocksPerCU = 6;
-
-// __device__ maps to __attribute__((hc))
-__device__ int foo(int i) { return i + 1; }
-
-
-template <typename T>
-__global__ void vectorADD2(T* A_d, T* B_d, T* C_d, size_t N) {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x;
-
-    for (size_t i = offset; i < N; i += stride) {
-        double foo = __hiloint2double(A_d[i], B_d[i]);
-        C_d[i] = __double2loint(foo) + __double2hiint(foo);
-    }
-}
-
-int test_gl2(size_t N) {
-  size_t Nbytes = N * sizeof(int);
-  int *A_d, *B_d, *C_d;
-  int *A_h, *B_h, *C_h;
-  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);
-
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
-
-  // Full vadd in one large chunk, to get things started:
-  HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-  hipLaunchKernelGGL(vectorADD2, dim3(blocks), dim3(threadsPerBlock),
-                      0, 0, A_d, B_d, C_d, N);
-  HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-  HIP_CHECK(hipDeviceSynchronize());
-  // verify
-  HipTest::checkVectorADD(A_h, B_h, C_h, N);
-  return 0;
-}
-
-#if __HIP__
-int test_triple_chevron(size_t N) {
-  size_t Nbytes = N * sizeof(int);
-  int *A_d, *B_d, *C_d;
-  int *A_h, *B_h, *C_h;
-  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);
-
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
-  // Full vadd in one large chunk, to get things started:
-  HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-  vectorADD2<<<dim3(blocks), dim3(threadsPerBlock)>>>(A_d, B_d, C_d, N);
-  HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
-  HIP_CHECK(hipDeviceSynchronize());
-  // verify
-  HipTest::checkVectorADD(A_h, B_h, C_h, N);
-  return 0;
-}
-#endif
-
-/**
-* @addtogroup hipLaunchKernelGGL hipLaunchKernelGGL
-* @{
-* @ingroup KernelTest
-* `void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
-   std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` -
-* Method to invocate kernel functions
-*/
-
-/**
- * Test Description
- * ------------------------
- *    - Test case to verify the Grid_Launch syntax.
-
- * Test source
- * ------------------------
- *    - catch/unit/kernel/hipGridLaunch.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.5
- */
-
-TEST_CASE("Unit_hipGridLaunch") {
-  size_t N = 4 * 1024 * 1024;
-  SECTION("Test test_gl2") {
-    test_gl2(N);
-  }
-
-#if __HIP__
-  SECTION("Test triple_chevron") {
-    test_triple_chevron(N);
-  }
-#endif
-}
-
-/**
-* End doxygen group KernelTest.
-* @}
-*/
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Test the Grid_Launch syntax.
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+ 
+
+static unsigned threadsPerBlock = 256;
+static unsigned blocksPerCU = 6;
+
+// __device__ maps to __attribute__((hc))
+__device__ int foo(int i) { return i + 1; }
+
+
+template <typename T>
+__global__ void vectorADD2(T* A_d, T* B_d, T* C_d, size_t N) {
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x;
+
+    for (size_t i = offset; i < N; i += stride) {
+        double foo = __hiloint2double(A_d[i], B_d[i]);
+        C_d[i] = __double2loint(foo) + __double2hiint(foo);
+    }
+}
+
+int test_gl2(size_t N) {
+  size_t Nbytes = N * sizeof(int);
+  int *A_d, *B_d, *C_d;
+  int *A_h, *B_h, *C_h;
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+
+  // Full vadd in one large chunk, to get things started:
+  HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
+  hipLaunchKernelGGL(vectorADD2, dim3(blocks), dim3(threadsPerBlock),
+                      0, 0, A_d, B_d, C_d, N);
+  HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+  // verify
+  HipTest::checkVectorADD(A_h, B_h, C_h, N);
+  return 0;
+}
+
+#if __HIP__
+int test_triple_chevron(size_t N) {
+  size_t Nbytes = N * sizeof(int);
+  int *A_d, *B_d, *C_d;
+  int *A_h, *B_h, *C_h;
+  HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);
+
+  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
+  // Full vadd in one large chunk, to get things started:
+  HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
+  vectorADD2<<<dim3(blocks), dim3(threadsPerBlock)>>>(A_d, B_d, C_d, N);
+  HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipDeviceSynchronize());
+  // verify
+  HipTest::checkVectorADD(A_h, B_h, C_h, N);
+  return 0;
+}
+#endif
+
+/**
+* @addtogroup hipLaunchKernelGGL hipLaunchKernelGGL
+* @{
+* @ingroup KernelTest
+* `void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+   std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` -
+* Method to invoke kernel functions
+*/
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Test case to verify the Grid_Launch syntax.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipGridLaunch.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ */
+
+TEST_CASE("Unit_hipGridLaunch") {
+  size_t N = 4 * 1024 * 1024;
+  SECTION("Test test_gl2") {
+    test_gl2(N);
+  }
+
+#if __HIP__
+  SECTION("Test triple_chevron") {
+    test_triple_chevron(N);
+  }
+#endif
+}
+
+/**
+* End doxygen group KernelTest.
+* @}
+*/
@@ -1,111 +1,111 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_common.hh>
-#include <hip_test_checkers.hh>
- 
-#include <hip/math_functions.h>
-
-#pragma clang diagnostic ignored "-Wunused-variable"
-#pragma clang diagnostic ignored "-Wuninitialized"
-
-// Simple tests for variable type qualifiers:
-__device__ int deviceVar;
-
-// TODO-HCC __constant__ not working yet.
-__constant__ int constantVar1;
-
-__constant__ __device__ int constantVar2;
-
-// Test HOST space:
-__host__ void foo() { printf("foo!\n"); }
-
-__device__ __noinline__ int sum1_noinline(int a) { return a + 1; }
-__device__ __forceinline__ int sum1_forceinline(int a) { return a + 1; }
-
-
-__device__ __host__ float PlusOne(float x) { return x + 1.0; }
-
-__global__ void MyKernel(const float* a, const float* b, float* c,
-                         unsigned N) {
-  unsigned gid = threadIdx.x;
-  if (gid < N) {
-      c[gid] = a[gid] + PlusOne(b[gid]);
-  }
-}
-
-void callMyKernel() {
-  float *a, *b, *c;
-  const unsigned blockSize = 256;
-  unsigned N = blockSize;
-  hipLaunchKernelGGL(MyKernel, dim3(N / blockSize), dim3(blockSize),
-                     0, 0, a, b, c, N);
-}
-
-template <typename T>
-__global__ void vectorADD(T __restrict__* A_d, T* B_d, T* C_d, size_t N) {
-#ifdef NOT_YET
-  int a = __shfl_up(x, 1);
-#endif
-  float x = 1.0;
-#ifdef NOT_YET
-    float fastZ = __sin(x);
-#endif
-  __syncthreads();
-
-  size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-  size_t stride = blockDim.x * gridDim.x;
-
-  for (size_t i = offset; i < N; i += stride) {
-      C_d[i] = A_d[i] + B_d[i];
-  }
-}
-
-/**
-* @addtogroup hipLaunchKernelGGL hipLaunchKernelGGL
-* @{
-* @ingroup KernelTest
-* `void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
-   std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` -
-* Method to invocate kernel functions
-*/
-
-/**
- * Test Description
- * ------------------------
- *    - Collection of code to make sure that various features
- * in the hip kernel language compile.
-
- * Test source
- * ------------------------
- *    - catch/unit/kernel/hipLanguageExtensions.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.5
- */
-
-TEST_CASE("Unit_hipLanguageExtensions") {
-  REQUIRE(true);
-}
-
-/**
-* End doxygen group KernelTest.
-* @}
-*/
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_common.hh>
+#include <hip_test_checkers.hh>
+ 
+#include <hip/math_functions.h>
+
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wuninitialized"
+
+// Simple tests for variable type qualifiers:
+__device__ int deviceVar;
+
+// TODO-HCC __constant__ not working yet.
+__constant__ int constantVar1;
+
+__constant__ __device__ int constantVar2;
+
+// Test HOST space:
+__host__ void foo() { printf("foo!\n"); }
+
+__device__ __noinline__ int sum1_noinline(int a) { return a + 1; }
+__device__ __forceinline__ int sum1_forceinline(int a) { return a + 1; }
+
+
+__device__ __host__ float PlusOne(float x) { return x + 1.0; }
+
+__global__ void MyKernel(const float* a, const float* b, float* c,
+                         unsigned N) {
+  unsigned gid = threadIdx.x;
+  if (gid < N) {
+      c[gid] = a[gid] + PlusOne(b[gid]);
+  }
+}
+
+void callMyKernel() {
+  float *a, *b, *c;
+  const unsigned blockSize = 256;
+  unsigned N = blockSize;
+  hipLaunchKernelGGL(MyKernel, dim3(N / blockSize), dim3(blockSize),
+                     0, 0, a, b, c, N);
+}
+
+template <typename T>
+__global__ void vectorADD(T __restrict__* A_d, T* B_d, T* C_d, size_t N) {
+#ifdef NOT_YET
+  int a = __shfl_up(x, 1);
+#endif
+  float x = 1.0;
+#ifdef NOT_YET
+    float fastZ = __sin(x);
+#endif
+  __syncthreads();
+
+  size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = offset; i < N; i += stride) {
+      C_d[i] = A_d[i] + B_d[i];
+  }
+}
+
+/**
+* @addtogroup hipLaunchKernelGGL hipLaunchKernelGGL
+* @{
+* @ingroup KernelTest
+* `void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+   std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` -
+* Method to invoke kernel functions
+*/
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Collection of code to make sure that various features
+ * in the hip kernel language compile.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipLanguageExtensions.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ */
+
+TEST_CASE("Unit_hipLanguageExtensions") {
+  REQUIRE(true);
+}
+
+/**
+* End doxygen group KernelTest.
+* @}
+*/
@@ -1,464 +1,464 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
- 
-
-class HipFunctorTests {
- public:
-    // Test that a class functor can be passed to hiplaunchparam
-    // and can be used in kernel
-    void TestForSimpleClassFunctor(void);
-    // Test that a templated class functor can be passed to hiplaunchparam
-    // and can be used in kernel
-    void TestForClassTemplateFunctor(void);
-    // Test that a class functor object ptr  can be passed to hiplaunchparam
-    // and can be used in kernel
-    void TestForClassObjPtrFunctor(void);
-    // Test that a class object containing functor can be passed
-    // to hiplaunchparam and can be used in kernel
-    void TestForFunctorContainInClassObj(void);
-    // Test that a stuct functor can be passed to hiplaunchparam
-    // and can be used in kernel
-    void TestForSimpleStructFunctor(void);
-    // Test that a stuct functor object ptr  can be passed to hiplaunchparam
-    // and can be used in kernel
-    void TestForStructObjPtrFunctor(void);
-    // Test that a templated struct functor can be passed to hiplaunchparam
-    // and can be used in kernel
-    void TestForStructTemplateFunctor(void);
-    // Test that a struct object containing functor can be
-    // passed to hiplaunchparam and can be used in kernel
-    void TestForFunctorContainInStructObj(void);
-};
-
-static const int BLOCK_DIM_SIZE = 1024;
-static const int THREADS_PER_BLOCK = 1;
-
-// class functor tests
-
-// Simple doubler Functor
-class DoublerFunctor{
- public:
-    __device__ int operator()(int x) { return x * 2;}
-};
-
-// simple doubler functor passed to kernel
-__global__ void DoublerFunctorKernel(
-                    DoublerFunctor doubler_,
-                    bool* deviceResult) {
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  int result = doubler_(5);
-  deviceResult[x] = (result == 10);
-}
-
-void HipFunctorTests::TestForSimpleClassFunctor(void) {
-  DoublerFunctor doubler;
-  bool *deviceResults, *hostResults;
-  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
-    // initialize to false, will be set to
-    // true if the functor is called in device code
-    hostResults[k] = false;
-  }
-
-  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyHostToDevice));
-  hipLaunchKernelGGL(DoublerFunctorKernel, dim3(BLOCK_DIM_SIZE),
-                  dim3(THREADS_PER_BLOCK), 0, 0, doubler, deviceResults);
-
-  // Validation part of TestForSimpleClassFunctor
-  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyDeviceToHost));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
-    REQUIRE(hostResults[k] == true);
-  HIP_CHECK(hipHostFree(hostResults));
-  HIP_CHECK(hipFree(deviceResults));
-}
-
-// pointer functor passed to kernel
-__global__ void PtrDoublerFunctorKernel(
-                    DoublerFunctor *doubler_,
-                    bool* deviceResult) {
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  int result = (*doubler_)(5);
-  deviceResult[x] = (result == 10);
-}
-
-void HipFunctorTests::TestForClassObjPtrFunctor(void) {
-  DoublerFunctor* ptrdoubler = new DoublerFunctor[sizeof(int)];
-  bool *deviceResults, *hostResults;
-  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
-    // initialize to false, will be set to
-    // true if the functor is called in device code
-    hostResults[k] = false;
-  }
-
-  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyHostToDevice));
-  hipLaunchKernelGGL(PtrDoublerFunctorKernel, dim3(BLOCK_DIM_SIZE),
-                  dim3(THREADS_PER_BLOCK), 0, 0, ptrdoubler, deviceResults);
-
-  // Validation part of TestForClassObjPtrFunctor
-  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyDeviceToHost));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
-    REQUIRE(hostResults[k] == true);
-  HIP_CHECK(hipHostFree(hostResults));
-  HIP_CHECK(hipFree(deviceResults));
-  delete[] ptrdoubler;
-}
-
-class compare {
- public:
-    template<typename T1, typename T2>
-    __device__ bool operator()(const T1& v1, const T2& v2) {
-       return v1 > v2;
-    }
-};
-
-// template functor passed to kernel
-__global__ void TemplateFunctorKernel(
-                    compare compare_,
-                    bool* deviceResult) {
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  deviceResult[x] = compare_(2.2, 2.1);
-  deviceResult[x] = compare_(2, 1);
-  deviceResult[x] = compare_('b', 'a');
-}
-
-void HipFunctorTests::TestForClassTemplateFunctor(void) {
-  compare comparefunctor;
-  bool *deviceResults, *hostResults;
-  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
-    // initialize to false, will be set to
-    // true if the functor is called in device code
-    hostResults[k] = false;
-  }
-
-  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyHostToDevice));
-  hipLaunchKernelGGL(TemplateFunctorKernel, dim3(BLOCK_DIM_SIZE),
-                  dim3(THREADS_PER_BLOCK), 0, 0, comparefunctor, deviceResults);
-
-  // Validation part of TestForClassTemplateFunctor
-  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyDeviceToHost));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
-    REQUIRE(hostResults[k] == true);
-  HIP_CHECK(hipHostFree(hostResults));
-  HIP_CHECK(hipFree(deviceResults));
-}
-
-
-// Doubler calculator
-class DoublerCalculator {
- public:
-    int a, result;
-    // fucntor contained in class object
-    DoublerFunctor doubler;
-};
-
-// doubler functor conatined in class obj passed to kernel
-__global__ void DoublerCalculatorFunctorKernel(
-                    DoublerCalculator doubler_,
-                    bool* deviceResult) {
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  int result = doubler_.doubler(doubler_.a);
-  deviceResult[x] = (doubler_.result == result);
-}
-
-void HipFunctorTests::TestForFunctorContainInClassObj(void) {
-  DoublerCalculator Doubler;
-  bool *deviceResults, *hostResults;
-  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
-    // initialize to false, will be set to
-    // true if the functor is called in device code
-    hostResults[k] = false;
-  }
-
-  Doubler.a = 5;
-  Doubler.result = 10;
-  // pass comparefunctor to  hipLaunchParm
-
-  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyHostToDevice));
-  hipLaunchKernelGGL(DoublerCalculatorFunctorKernel, dim3(BLOCK_DIM_SIZE),
-                  dim3(THREADS_PER_BLOCK), 0, 0, Doubler, deviceResults);
-
-  // Validation part of TestForStructTemplateFunctor
-  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyDeviceToHost));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
-    REQUIRE(hostResults[k] == true);
-  HIP_CHECK(hipHostFree(hostResults));
-  HIP_CHECK(hipFree(deviceResults));
-}
-
-// Struct functor tests
-
-// Simple doubler Functor
-struct sDoublerFunctor {
- public:
-    __device__ int operator()(int x) { return x * 2;}
-};
-
-
-// simple sturct doubler functor passed to kernel
-__global__ void structDoublerFunctorKernel(
-                    sDoublerFunctor doubler_,
-                    bool* deviceResult) {
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  int result = doubler_(5);
-  deviceResult[x] = (result == 10);
-}
-
-void HipFunctorTests::TestForSimpleStructFunctor(void) {
-  sDoublerFunctor doubler;
-  bool *deviceResults, *hostResults;
-  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
-    // initialize to false, will be set to
-    // true if the functor is called in device code
-    hostResults[k] = false;
-  }
-
-  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyHostToDevice));
-  hipLaunchKernelGGL(structDoublerFunctorKernel, dim3(BLOCK_DIM_SIZE),
-                  dim3(THREADS_PER_BLOCK), 0, 0, doubler, deviceResults);
-
-  // Validation part of TestForSimpleStructFunctor
-  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyDeviceToHost));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
-    REQUIRE(hostResults[k] == true);
-  HIP_CHECK(hipHostFree(hostResults));
-  HIP_CHECK(hipFree(deviceResults));
-}
-
-// ptr functor passed to kernel
-__global__ void structPtrDoublerFunctorKernel(
-                    sDoublerFunctor *doubler_,
-                    bool* deviceResult) {
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  int result = (*doubler_)(5);
-  deviceResult[x] = (result == 10);
-}
-
-void HipFunctorTests::TestForStructObjPtrFunctor(void) {
-  sDoublerFunctor* ptrdoubler = new sDoublerFunctor[sizeof(int)];
-  bool *deviceResults, *hostResults;
-  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
-    // initialize to false, will be set to
-    // true if the functor is called in device code
-    hostResults[k] = false;
-  }
-
-  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyHostToDevice));
-  hipLaunchKernelGGL(structPtrDoublerFunctorKernel, dim3(BLOCK_DIM_SIZE),
-                  dim3(THREADS_PER_BLOCK), 0, 0, ptrdoubler, deviceResults);
-
-  // Validation part of TestForStructObjPtrFunctor
-  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyDeviceToHost));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
-    REQUIRE(hostResults[k] == true);
-  HIP_CHECK(hipHostFree(hostResults));
-  HIP_CHECK(hipFree(deviceResults));
-  delete[] ptrdoubler;
-}
-
-struct sCompare {
- public:
-    template< typename T1, typename T2 >
-    __device__ bool operator()(const T1& v1, const T2& v2) {
-    return v1 > v2;
-    }
-};
-
-// template functor passed to kernel
-__global__ void structTemplateFunctorKernel(
-                    sCompare compare_,
-                    bool* deviceResult) {
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  deviceResult[x] = compare_(2.2, 2.1);
-  deviceResult[x] = compare_(2, 1);
-  deviceResult[x] = compare_('b', 'a');
-}
-
-void HipFunctorTests::TestForStructTemplateFunctor(void) {
-  sCompare comparefunctor;
-  bool *deviceResults, *hostResults;
-  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
-    // initialize to false, will be set to
-    // true if the functor is called in device code
-    hostResults[k] = false;
-  }
-
-  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyHostToDevice));
-
-  // pass comparefunctor to  hipLaunchKernelGGL
-  hipLaunchKernelGGL(structTemplateFunctorKernel, dim3(BLOCK_DIM_SIZE),
-                  dim3(THREADS_PER_BLOCK), 0, 0, comparefunctor, deviceResults);
-
-  // Validation part of TestForStructTemplateFunctor
-  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyDeviceToHost));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
-    REQUIRE(hostResults[k] == true);
-  HIP_CHECK(hipHostFree(hostResults));
-  HIP_CHECK(hipFree(deviceResults));
-}
-
-// Doubler calculator struct
-struct sDoublerCalculator {
- public:
-    int a, result;
-    // fucntor contained in class object
-    DoublerFunctor doubler;
-};
-
-
-
-// doubler functor contained in struct passed to kernel
-__global__ void DoublerCalculatorFunctorKernel(
-                    sDoublerCalculator doubler_,
-                    bool* deviceResult) {
-  int x = blockIdx.x * blockDim.x + threadIdx.x;
-  int result = doubler_.doubler(doubler_.a);
-  deviceResult[x] = (doubler_.result == result);
-}
-
-void HipFunctorTests::TestForFunctorContainInStructObj(void) {
-  sDoublerCalculator Doubler;
-  bool *deviceResults, *hostResults;
-  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
-    // initialize to false, will be set to
-    // true if the functor is called in device code
-    hostResults[k] = false;
-  }
-
-  Doubler.a = 5;
-  Doubler.result = 10;
-  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyHostToDevice));
-
-
-  // pass comparefunctor to  hipLaunchKernelGGL
-  hipLaunchKernelGGL(DoublerCalculatorFunctorKernel, dim3(BLOCK_DIM_SIZE),
-                  dim3(THREADS_PER_BLOCK), 0, 0, Doubler, deviceResults);
-
-  // Validation part of TestForStructTemplateFunctor
-  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
-           hipMemcpyDeviceToHost));
-  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
-    REQUIRE(hostResults[k] == true);
-  HIP_CHECK(hipHostFree(hostResults));
-  HIP_CHECK(hipFree(deviceResults));
-}
-
-/**
-* @addtogroup hipLaunchKernelGGL hipLaunchKernelGGL
-* @{
-* @ingroup KernelTest
-* `void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
-   std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` -
-* Method to invocate kernel functions
-*/
-
-/**
- * Test Description
- * ------------------------
- *    - Test that a class functor can be passed to hiplaunchparam
- * and can be used in kernel.
- *    - Test that a templated class functor can be passed to hiplaunchparam
- * and can be used in kernel.
- *    - Test that a class functor object ptr  can be passed to hiplaunchparam
- * and can be used in kernel.
- *    - Test that a class object containing functor can be passed to hiplaunchparam
- * and can be used in kernel
- *    - Test that a stuct functor can be passed to hiplaunchparam
- * and can be used in kernel
- *    - Test that a stuct functor object ptr  can be passed to hiplaunchparam
- * and can be used in kernel
- *    - Test that a templated struct functor can be passed to hiplaunchparam
- * and can be used in kernel
- *    - Test that a struct object containing functor can be passed to hiplaunchparam
- * and can be used in kernel
-
- * Test source
- * ------------------------
- *    - catch/unit/kernel/hipLaunchParmFunctor.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.5
- */
-
-TEST_CASE("Unit_hipLaunchParmFunctor") {
-  HipFunctorTests FunctorTests;
-
-  SECTION("test for simple class functor") {
-    FunctorTests.TestForSimpleClassFunctor();
-  }
-  SECTION("test for class objptr functor") {
-    FunctorTests.TestForClassObjPtrFunctor();
-  }
-  SECTION("test for class templete functor") {
-    FunctorTests.TestForClassTemplateFunctor();
-  }
-  SECTION("test for simple struct functor") {
-    FunctorTests.TestForSimpleStructFunctor();
-  }
-  SECTION("test for struct objptr functor") {
-    FunctorTests.TestForStructObjPtrFunctor();
-  }
-  SECTION("test for struct templete functor") {
-    FunctorTests.TestForStructTemplateFunctor();
-  }
-  SECTION("test for functor contain in classobj") {
-    FunctorTests.TestForFunctorContainInClassObj();
-  }
-  SECTION("test for functor contain in structobj") {
-    FunctorTests.TestForFunctorContainInStructObj();
-  }
-}
-
-/**
-* End doxygen group KernelTest.
-* @}
-*/
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+ 
+
+class HipFunctorTests {  // host-side test drivers, defined below
+ public:
+    // Test that a class functor can be passed to hiplaunchparam
+    // and can be used in kernel
+    void TestForSimpleClassFunctor(void);
+    // Test that a templated class functor can be passed to hiplaunchparam
+    // and can be used in kernel
+    void TestForClassTemplateFunctor(void);
+    // Test that a class functor object ptr can be passed to hiplaunchparam
+    // and can be used in kernel
+    void TestForClassObjPtrFunctor(void);
+    // Test that a class object containing functor can be passed
+    // to hiplaunchparam and can be used in kernel
+    void TestForFunctorContainInClassObj(void);
+    // Test that a struct functor can be passed to hiplaunchparam
+    // and can be used in kernel
+    void TestForSimpleStructFunctor(void);
+    // Test that a struct functor object ptr can be passed to hiplaunchparam
+    // and can be used in kernel
+    void TestForStructObjPtrFunctor(void);
+    // Test that a templated struct functor can be passed to hiplaunchparam
+    // and can be used in kernel
+    void TestForStructTemplateFunctor(void);
+    // Test that a struct object containing functor can be
+    // passed to hiplaunchparam and can be used in kernel
+    void TestForFunctorContainInStructObj(void);
+};
+
+static const int BLOCK_DIM_SIZE = 1024;  // grid size (blocks) of each launch
+static const int THREADS_PER_BLOCK = 1;  // block size (threads per block)
+
+// class functor tests
+
+// Stateless functor whose device-side call operator doubles its argument
+class DoublerFunctor {
+ public:
+    __device__ int operator()(int x) {
+      return 2 * x;
+    }
+};
+
+// Kernel taking the functor by value: each thread applies it to 5 and
+// records at its global index whether the result equals 10.
+__global__ void DoublerFunctorKernel(DoublerFunctor doubler_,
+                                     bool* deviceResult) {
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  deviceResult[tid] = (doubler_(5) == 10);
+}
+
+// Launches DoublerFunctorKernel with a DoublerFunctor passed by value and
+// requires every launched thread to report that doubler(5) == 10.
+void HipFunctorTests::TestForSimpleClassFunctor(void) {
+  DoublerFunctor doubler;
+  bool *deviceResults, *hostResults;
+  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
+    // initialize to false, will be set to
+    // true if the functor is called in device code
+    hostResults[k] = false;
+  }
+
+  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyHostToDevice));
+  hipLaunchKernelGGL(DoublerFunctorKernel, dim3(BLOCK_DIM_SIZE),
+                  dim3(THREADS_PER_BLOCK), 0, 0, doubler, deviceResults);
+  // The launch itself returns no status, so query it explicitly; otherwise a
+  // failed launch only shows up as stale "false" results below.
+  HIP_CHECK(hipGetLastError());
+
+  // Validation part of TestForSimpleClassFunctor
+  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyDeviceToHost));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
+    REQUIRE(hostResults[k] == true);
+  HIP_CHECK(hipHostFree(hostResults));
+  HIP_CHECK(hipFree(deviceResults));
+}
+
+// Kernel taking the functor through a pointer: each thread calls it with 5
+// and records at its global index whether the result equals 10.
+__global__ void PtrDoublerFunctorKernel(DoublerFunctor *doubler_,
+                                        bool* deviceResult) {
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  deviceResult[tid] = ((*doubler_)(5) == 10);
+}
+
+// Launches PtrDoublerFunctorKernel with a pointer to a DoublerFunctor and
+// requires every launched thread to report that (*ptr)(5) == 10.
+void HipFunctorTests::TestForClassObjPtrFunctor(void) {
+  // The kernel dereferences the functor pointer on the device, so the functor
+  // is placed in device memory instead of on the host heap.
+  DoublerFunctor hostDoubler;
+  DoublerFunctor* ptrdoubler = nullptr;
+  HIP_CHECK(hipMalloc(&ptrdoubler, sizeof(DoublerFunctor)));
+  HIP_CHECK(hipMemcpy(ptrdoubler, &hostDoubler, sizeof(DoublerFunctor),
+           hipMemcpyHostToDevice));
+
+  bool *deviceResults, *hostResults;
+  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
+    // initialize to false, will be set to
+    // true if the functor is called in device code
+    hostResults[k] = false;
+  }
+
+  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyHostToDevice));
+  hipLaunchKernelGGL(PtrDoublerFunctorKernel, dim3(BLOCK_DIM_SIZE),
+                  dim3(THREADS_PER_BLOCK), 0, 0, ptrdoubler, deviceResults);
+  // The launch itself returns no status, so query it explicitly.
+  HIP_CHECK(hipGetLastError());
+
+  // Validation part of TestForClassObjPtrFunctor
+  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyDeviceToHost));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
+    REQUIRE(hostResults[k] == true);
+  HIP_CHECK(hipHostFree(hostResults));
+  HIP_CHECK(hipFree(deviceResults));
+  HIP_CHECK(hipFree(ptrdoubler));
+}
+
+// Functor with a templated device-side call operator: returns v1 > v2
+class compare {
+ public:
+    template <typename T1, typename T2>
+    __device__ bool operator()(const T1& v1, const T2& v2) {
+      const bool firstIsGreater = v1 > v2;
+      return firstIsGreater;
+    }
+};
+
+// template functor passed to kernel
+// Each thread calls the functor with double, int and char arguments and
+// records "true" only if all three instantiations report first > second.
+__global__ void TemplateFunctorKernel(
+                    compare compare_,
+                    bool* deviceResult) {
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+  // Combine the results so a failure in any instantiation is visible to the
+  // host, not just the outcome of the last call.
+  deviceResult[x] = compare_(2.2, 2.1) && compare_(2, 1) &&
+                    compare_('b', 'a');
+}
+
+// Launches TemplateFunctorKernel with a `compare` functor passed by value and
+// requires every launched thread to report "true".
+void HipFunctorTests::TestForClassTemplateFunctor(void) {
+  compare comparefunctor;
+  bool *deviceResults, *hostResults;
+  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
+    // initialize to false, will be set to
+    // true if the functor is called in device code
+    hostResults[k] = false;
+  }
+
+  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyHostToDevice));
+  hipLaunchKernelGGL(TemplateFunctorKernel, dim3(BLOCK_DIM_SIZE),
+                  dim3(THREADS_PER_BLOCK), 0, 0, comparefunctor, deviceResults);
+  // The launch itself returns no status, so query it explicitly; otherwise a
+  // failed launch only shows up as stale "false" results below.
+  HIP_CHECK(hipGetLastError());
+
+  // Validation part of TestForClassTemplateFunctor
+  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyDeviceToHost));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
+    REQUIRE(hostResults[k] == true);
+  HIP_CHECK(hipHostFree(hostResults));
+  HIP_CHECK(hipFree(deviceResults));
+}
+
+
+// Doubler calculator: bundles an input, its expected result and a functor
+class DoublerCalculator {
+ public:
+    int a, result;  // input and the value doubler(a) is compared against
+    // functor contained in class object
+    DoublerFunctor doubler;
+};
+
+// Kernel taking a class object that contains the functor: each thread
+// applies the contained functor to `a` and records at its global index
+// whether the outcome matches the object's `result` member.
+__global__ void DoublerCalculatorFunctorKernel(DoublerCalculator doubler_,
+                                               bool* deviceResult) {
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int doubled = doubler_.doubler(doubler_.a);
+  deviceResult[tid] = (doubler_.result == doubled);
+}
+
+// Passes a DoublerCalculator (a = 5, result = 10) by value to
+// DoublerCalculatorFunctorKernel and requires every launched thread to report
+// that the contained functor's output for `a` equals `result`.
+void HipFunctorTests::TestForFunctorContainInClassObj(void) {
+  DoublerCalculator Doubler;
+  bool *deviceResults, *hostResults;
+  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
+    // initialize to false, will be set to
+    // true if the functor is called in device code
+    hostResults[k] = false;
+  }
+
+  Doubler.a = 5;
+  Doubler.result = 10;
+
+  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyHostToDevice));
+  // pass the object containing the functor to hipLaunchKernelGGL
+  hipLaunchKernelGGL(DoublerCalculatorFunctorKernel, dim3(BLOCK_DIM_SIZE),
+                  dim3(THREADS_PER_BLOCK), 0, 0, Doubler, deviceResults);
+  // The launch itself returns no status, so query it explicitly; otherwise a
+  // failed launch only shows up as stale "false" results below.
+  HIP_CHECK(hipGetLastError());
+
+  // Validation part of TestForFunctorContainInClassObj
+  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyDeviceToHost));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
+    REQUIRE(hostResults[k] == true);
+  HIP_CHECK(hipHostFree(hostResults));
+  HIP_CHECK(hipFree(deviceResults));
+}
+
+// Struct functor tests
+
+// Struct flavour of the doubler functor: the device-side call operator
+// returns twice its argument
+struct sDoublerFunctor {
+    __device__ int operator()(int x) {
+      return 2 * x;
+    }
+};
+
+
+// Kernel taking the struct functor by value: each thread applies it to 5
+// and records at its global index whether the result equals 10.
+__global__ void structDoublerFunctorKernel(sDoublerFunctor doubler_,
+                                           bool* deviceResult) {
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  deviceResult[tid] = (doubler_(5) == 10);
+}
+
+// Launches structDoublerFunctorKernel with an sDoublerFunctor passed by value
+// and requires every launched thread to report that doubler(5) == 10.
+void HipFunctorTests::TestForSimpleStructFunctor(void) {
+  sDoublerFunctor doubler;
+  bool *deviceResults, *hostResults;
+  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
+    // initialize to false, will be set to
+    // true if the functor is called in device code
+    hostResults[k] = false;
+  }
+
+  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyHostToDevice));
+  hipLaunchKernelGGL(structDoublerFunctorKernel, dim3(BLOCK_DIM_SIZE),
+                  dim3(THREADS_PER_BLOCK), 0, 0, doubler, deviceResults);
+  // The launch itself returns no status, so query it explicitly; otherwise a
+  // failed launch only shows up as stale "false" results below.
+  HIP_CHECK(hipGetLastError());
+
+  // Validation part of TestForSimpleStructFunctor
+  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyDeviceToHost));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
+    REQUIRE(hostResults[k] == true);
+  HIP_CHECK(hipHostFree(hostResults));
+  HIP_CHECK(hipFree(deviceResults));
+}
+
+// Kernel taking the struct functor through a pointer: each thread calls it
+// with 5 and records at its global index whether the result equals 10.
+__global__ void structPtrDoublerFunctorKernel(sDoublerFunctor *doubler_,
+                                              bool* deviceResult) {
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  deviceResult[tid] = ((*doubler_)(5) == 10);
+}
+
+// Launches structPtrDoublerFunctorKernel with a pointer to an sDoublerFunctor
+// and requires every launched thread to report that (*ptr)(5) == 10.
+void HipFunctorTests::TestForStructObjPtrFunctor(void) {
+  // The kernel dereferences the functor pointer on the device, so the functor
+  // is placed in device memory instead of on the host heap.
+  sDoublerFunctor hostDoubler;
+  sDoublerFunctor* ptrdoubler = nullptr;
+  HIP_CHECK(hipMalloc(&ptrdoubler, sizeof(sDoublerFunctor)));
+  HIP_CHECK(hipMemcpy(ptrdoubler, &hostDoubler, sizeof(sDoublerFunctor),
+           hipMemcpyHostToDevice));
+
+  bool *deviceResults, *hostResults;
+  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
+    // initialize to false, will be set to
+    // true if the functor is called in device code
+    hostResults[k] = false;
+  }
+
+  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyHostToDevice));
+  hipLaunchKernelGGL(structPtrDoublerFunctorKernel, dim3(BLOCK_DIM_SIZE),
+                  dim3(THREADS_PER_BLOCK), 0, 0, ptrdoubler, deviceResults);
+  // The launch itself returns no status, so query it explicitly.
+  HIP_CHECK(hipGetLastError());
+
+  // Validation part of TestForStructObjPtrFunctor
+  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyDeviceToHost));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
+    REQUIRE(hostResults[k] == true);
+  HIP_CHECK(hipHostFree(hostResults));
+  HIP_CHECK(hipFree(deviceResults));
+  HIP_CHECK(hipFree(ptrdoubler));
+}
+
+// Struct functor with a templated device-side call operator: returns v1 > v2
+struct sCompare {
+    template <typename T1, typename T2>
+    __device__ bool operator()(const T1& v1, const T2& v2) {
+      const bool firstIsGreater = v1 > v2;
+      return firstIsGreater;
+    }
+};
+
+// template functor passed to kernel
+// Each thread calls the functor with double, int and char arguments and
+// records "true" only if all three instantiations report first > second.
+__global__ void structTemplateFunctorKernel(
+                    sCompare compare_,
+                    bool* deviceResult) {
+  int x = blockIdx.x * blockDim.x + threadIdx.x;
+  // Combine the results so a failure in any instantiation is visible to the
+  // host, not just the outcome of the last call.
+  deviceResult[x] = compare_(2.2, 2.1) && compare_(2, 1) &&
+                    compare_('b', 'a');
+}
+
+// Launches structTemplateFunctorKernel with an sCompare functor passed by
+// value and requires every launched thread to report "true".
+void HipFunctorTests::TestForStructTemplateFunctor(void) {
+  sCompare comparefunctor;
+  bool *deviceResults, *hostResults;
+  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
+    // initialize to false, will be set to
+    // true if the functor is called in device code
+    hostResults[k] = false;
+  }
+
+  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyHostToDevice));
+
+  // pass comparefunctor to hipLaunchKernelGGL
+  hipLaunchKernelGGL(structTemplateFunctorKernel, dim3(BLOCK_DIM_SIZE),
+                  dim3(THREADS_PER_BLOCK), 0, 0, comparefunctor, deviceResults);
+  // The launch itself returns no status, so query it explicitly; otherwise a
+  // failed launch only shows up as stale "false" results below.
+  HIP_CHECK(hipGetLastError());
+
+  // Validation part of TestForStructTemplateFunctor
+  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyDeviceToHost));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
+    REQUIRE(hostResults[k] == true);
+  HIP_CHECK(hipHostFree(hostResults));
+  HIP_CHECK(hipFree(deviceResults));
+}
+
+// Doubler calculator struct: an input, its expected result and a functor
+struct sDoublerCalculator {
+ public:
+    int a, result;  // input and the value doubler(a) is compared against
+    // functor contained in struct object (the class-based DoublerFunctor)
+    DoublerFunctor doubler;
+};
+
+
+
+// Kernel overload taking a struct object that contains the functor: each
+// thread applies the contained functor to `a` and records at its global
+// index whether the outcome matches the object's `result` member.
+__global__ void DoublerCalculatorFunctorKernel(sDoublerCalculator doubler_,
+                                               bool* deviceResult) {
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int doubled = doubler_.doubler(doubler_.a);
+  deviceResult[tid] = (doubler_.result == doubled);
+}
+
+// Passes an sDoublerCalculator (a = 5, result = 10) by value to
+// DoublerCalculatorFunctorKernel and requires every launched thread to report
+// that the contained functor's output for `a` equals `result`.
+void HipFunctorTests::TestForFunctorContainInStructObj(void) {
+  sDoublerCalculator Doubler;
+  bool *deviceResults, *hostResults;
+  HIP_CHECK(hipMalloc(&deviceResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  HIP_CHECK(hipHostMalloc(&hostResults, BLOCK_DIM_SIZE*sizeof(bool)));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k) {
+    // initialize to false, will be set to
+    // true if the functor is called in device code
+    hostResults[k] = false;
+  }
+
+  Doubler.a = 5;
+  Doubler.result = 10;
+  HIP_CHECK(hipMemcpy(deviceResults, hostResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyHostToDevice));
+
+  // pass the object containing the functor to hipLaunchKernelGGL
+  hipLaunchKernelGGL(DoublerCalculatorFunctorKernel, dim3(BLOCK_DIM_SIZE),
+                  dim3(THREADS_PER_BLOCK), 0, 0, Doubler, deviceResults);
+  // The launch itself returns no status, so query it explicitly; otherwise a
+  // failed launch only shows up as stale "false" results below.
+  HIP_CHECK(hipGetLastError());
+
+  // Validation part of TestForFunctorContainInStructObj
+  HIP_CHECK(hipMemcpy(hostResults, deviceResults, BLOCK_DIM_SIZE*sizeof(bool),
+           hipMemcpyDeviceToHost));
+  for (int k = 0; k < BLOCK_DIM_SIZE; ++k)
+    REQUIRE(hostResults[k] == true);
+  HIP_CHECK(hipHostFree(hostResults));
+  HIP_CHECK(hipFree(deviceResults));
+}
+
+/**
+* @addtogroup hipLaunchKernelGGL hipLaunchKernelGGL
+* @{
+* @ingroup KernelTest
+* `void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+*  std::uint32_t sharedMemBytes, hipStream_t stream, Args... args)` -
+* Method to invoke kernel functions
+*/
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Test that a class functor can be passed to hiplaunchparam
+ * and can be used in kernel.
+ *    - Test that a templated class functor can be passed to hiplaunchparam
+ * and can be used in kernel.
+ *    - Test that a class functor object ptr can be passed to hiplaunchparam
+ * and can be used in kernel.
+ *    - Test that a class object containing functor can be passed to hiplaunchparam
+ * and can be used in kernel.
+ *    - Test that a struct functor can be passed to hiplaunchparam
+ * and can be used in kernel.
+ *    - Test that a struct functor object ptr can be passed to hiplaunchparam
+ * and can be used in kernel.
+ *    - Test that a templated struct functor can be passed to hiplaunchparam
+ * and can be used in kernel.
+ *    - Test that a struct object containing functor can be passed to hiplaunchparam
+ * and can be used in kernel.
+ *
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipLaunchParmFunctor.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ */
+
+TEST_CASE("Unit_hipLaunchParmFunctor") {
+  HipFunctorTests FunctorTests;
+  // Each SECTION below invokes exactly one HipFunctorTests scenario.
+  SECTION("test for simple class functor") {
+    FunctorTests.TestForSimpleClassFunctor();
+  }
+  SECTION("test for class objptr functor") {
+    FunctorTests.TestForClassObjPtrFunctor();
+  }
+  SECTION("test for class templete functor") {
+    FunctorTests.TestForClassTemplateFunctor();
+  }
+  SECTION("test for simple struct functor") {
+    FunctorTests.TestForSimpleStructFunctor();
+  }
+  SECTION("test for struct objptr functor") {
+    FunctorTests.TestForStructObjPtrFunctor();
+  }
+  SECTION("test for struct templete functor") {
+    FunctorTests.TestForStructTemplateFunctor();
+  }
+  SECTION("test for functor contain in classobj") {
+    FunctorTests.TestForFunctorContainInClassObj();
+  }
+  SECTION("test for functor contain in structobj") {
+    FunctorTests.TestForFunctorContainInStructObj();
+  }
+}
+
+/**
+* End doxygen group KernelTest.
+* @}
+*/
@@ -119,7 +119,7 @@ void verify_linked_lists_on_device(hipStream_t stream, Node* pNodes,
                                          unsigned int* pNumCorrect, unsigned int numLists,
                                          unsigned int ListLength) {
  *pNumCorrect = 0;     // reset numCorrect to zero
- 
+
  verify_linked_lists_on_device<<<(numLists + 255) / 256, 256, 0, stream>>>(pNodes, pNumCorrect,
                                                                     ListLength);

@@ -1,24 +1,24 @@
-# Common Tests - Test independent of all platforms
-# moved hipDeviceGetP2PAttribute.cc from /catch/unit/device to
-# /catch/unit/p2p folder and its dependent files.
-set(TEST_SRC
-    hipDeviceGetP2PAttribute.cc
-)
-
-# only for AMD
-if(HIP_PLATFORM MATCHES "amd")
-  set(AMD_SRC
-    hipP2pLinkTypeAndHopFunc.cc
-  )
-  set(TEST_SRC ${TEST_SRC} ${AMD_SRC})
-endif()
-
-set_source_files_properties(hipDeviceGetP2PAttribute.cc PROPERTIES COMPILE_FLAGS -std=c++17)
-
-add_executable(hipDeviceGetP2PAttribute_exe EXCLUDE_FROM_ALL hipDeviceGetP2PAttribute_exe.cc)
-
-hip_add_exe_to_target(NAME p2pTests
-                      TEST_SRC ${TEST_SRC}
-                      TEST_TARGET_NAME build_tests)
-
-add_dependencies(build_tests hipDeviceGetP2PAttribute_exe)
+# Common tests - tests independent of the platform.
+# hipDeviceGetP2PAttribute.cc and its dependent files were moved from
+# /catch/unit/device to the /catch/unit/p2p folder.
+set(TEST_SRC
+    hipDeviceGetP2PAttribute.cc
+)
+
+# Sources added only when HIP_PLATFORM matches "amd"
+if(HIP_PLATFORM MATCHES "amd")
+  set(AMD_SRC
+    hipP2pLinkTypeAndHopFunc.cc
+  )
+  set(TEST_SRC ${TEST_SRC} ${AMD_SRC})
+endif()
+# hipDeviceGetP2PAttribute.cc is compiled with -std=c++17
+set_source_files_properties(hipDeviceGetP2PAttribute.cc PROPERTIES COMPILE_FLAGS -std=c++17)
+# Separate executable, excluded from the default "all" target
+add_executable(hipDeviceGetP2PAttribute_exe EXCLUDE_FROM_ALL hipDeviceGetP2PAttribute_exe.cc)
+
+hip_add_exe_to_target(NAME p2pTests
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests)
+# Building build_tests also builds the separate executable
+add_dependencies(build_tests hipDeviceGetP2PAttribute_exe)
@@ -1,356 +1,356 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include "hipP2pLinkTypeAndHopFunc.h"
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
- 
-#ifdef __linux__
-#include <unistd.h>
-#include <sys/wait.h>
-#include <dlfcn.h>
-#endif
-#include <vector>
-#define MAX_SIZE 30
-#define VISIBLE_DEVICE 0
-
-/**
- * Fetches Gpu device count
- */
-#ifdef __linux__
-void getDeviceCount(int *pdevCnt) {
-  int fd[2], val = 0;
-  pid_t childpid;
-  // create pipe descriptors
-  pipe(fd);
-  // disable visible_devices env from shell
-  unsetenv("ROCR_VISIBLE_DEVICES");
-  unsetenv("HIP_VISIBLE_DEVICES");
-
-  childpid = fork();
-  if (childpid > 0) {  // Parent
-    close(fd[1]);
-    // parent will wait to read the device cnt
-    read(fd[0], &val, sizeof(val));
-    // close the read-descriptor
-    close(fd[0]);
-    // wait for child exit
-    wait(NULL);
-    *pdevCnt = val;
-  } else if (!childpid) {  // Child
-    int devCnt = 1;
-    // writing only, no need for read-descriptor
-    close(fd[0]);
-    HIP_CHECK(hipGetDeviceCount(&devCnt));
-    // send the value on the write-descriptor:
-    write(fd[1], &devCnt, sizeof(devCnt));
-    // close the write descriptor:
-    close(fd[1]);
-    exit(0);
-  } else {  // failure
-    *pdevCnt = 1;
-    return;
-  }
-}
-
-bool testMaskedDevice(int actualNumGPUs) {
-  bool testResult = true;
-  int fd[2];
-  pipe(fd);
-
-  pid_t cPid;
-  cPid = fork();
-  if (cPid == 0) {  // child
-    hipError_t err;
-    char visibleDeviceString[MAX_SIZE] = {};
-    snprintf(visibleDeviceString, MAX_SIZE, "%d", VISIBLE_DEVICE);
-    // disable visible_devices env from shell
-    unsetenv("ROCR_VISIBLE_DEVICES");
-    unsetenv("HIP_VISIBLE_DEVICES");
-    setenv("ROCR_VISIBLE_DEVICES", visibleDeviceString, 1);
-    setenv("HIP_VISIBLE_DEVICES", visibleDeviceString, 1);
-    uint32_t linktype;
-    uint32_t hopcount;
-    for (int count = 1;
-        count < actualNumGPUs; count++) {
-      err = hipExtGetLinkTypeAndHopCount(VISIBLE_DEVICE,
-            VISIBLE_DEVICE+count, &linktype, &hopcount);
-      REQUIRE(err == hipSuccess);
-    }
-    close(fd[0]);
-    write(fd[1], &testResult, sizeof(testResult));
-    close(fd[1]);
-    exit(0);
-
-  } else if (cPid > 0) {  // parent
-    close(fd[1]);
-    read(fd[0], &testResult, sizeof(testResult));
-    close(fd[0]);
-    wait(NULL);
-
-  } else {
-    printf("Info:fork() failed\n");
-    testResult = false;
-  }
-  return testResult;
-}
-#endif
-
-bool testhipInvalidDevice(int numDevices) {
-  hipError_t ret;
-  uint32_t linktype;
-  uint32_t hopcount;
-  SECTION("Invalid device number case 1") {
-    ret = hipExtGetLinkTypeAndHopCount(-1, 0, &linktype, &hopcount);
-    REQUIRE(ret != hipSuccess);
-  }
-  SECTION("Invalid device number case 2") {
-    ret = hipExtGetLinkTypeAndHopCount(numDevices, 0, &linktype, &hopcount);
-    REQUIRE(ret != hipSuccess);
-  }
-  SECTION("Invalid device number case 3") {
-    ret = hipExtGetLinkTypeAndHopCount(0, -1, &linktype, &hopcount);
-    REQUIRE(ret != hipSuccess);
-  }
-  SECTION("Invalid device number case 4") {
-    ret = hipExtGetLinkTypeAndHopCount(0, numDevices, &linktype, &hopcount);
-    REQUIRE(ret != hipSuccess);
-  }
-  SECTION("Invalid device number case 5") {
-    ret = hipExtGetLinkTypeAndHopCount(-1, numDevices, &linktype, &hopcount);
-    REQUIRE(ret != hipSuccess);
-  }
-  return true;
-}
-
-#ifdef __linux__
-bool testhipInvalidLinkType() {
-  uint32_t hopcount;
-  REQUIRE(hipSuccess != hipExtGetLinkTypeAndHopCount(0, 1, nullptr,
-                                                     &hopcount));
-  return true;
-}
-
-bool testhipInvalidHopcount() {
-  uint32_t linktype;
-  REQUIRE(hipSuccess != hipExtGetLinkTypeAndHopCount(0, 1, &linktype, nullptr));
-  return true;
-}
-
-bool testhipSameDevice(int numGPUs) {
-  hipError_t ret;
-  uint32_t linktype = 0;
-  uint32_t hopcount = 0;
-  for (int gpuId = 0; gpuId < numGPUs; gpuId++) {
-    ret = hipExtGetLinkTypeAndHopCount(gpuId, gpuId, &linktype, &hopcount);
-    REQUIRE(ret != hipSuccess);
-  }
-  return true;
-}
-
-bool testhipLinkTypeHopcountDeviceOrderRev(int numDevices) {
-  bool TestPassed = true;
-  // Get the unique pair of devices
-  for (int x = 0; x < numDevices; x++) {
-    for (int y = x+1; y < numDevices; y++) {
-      uint32_t linktype1 = 0, linktype2 = 0;
-      uint32_t hopcount1 = 0, hopcount2 = 0;
-      HIP_CHECK(hipExtGetLinkTypeAndHopCount(x, y,
-                          &linktype1, &hopcount1));
-      HIP_CHECK(hipExtGetLinkTypeAndHopCount(y, x,
-                          &linktype2, &hopcount2));
-      if (hopcount1 != hopcount2) {
-        TestPassed = false;
-        break;
-      }
-    }
-  }
-  return TestPassed;
-}
-
-/**
- * Internal Function
- */
-bool validateLinkType(uint32_t linktype_Hip,
-                      RSMI_IO_LINK_TYPE linktype_RocmSmi) {
-  bool TestPassed = false;
-
-  if ((linktype_Hip == HSA_AMD_LINK_INFO_TYPE_PCIE) &&
-     (linktype_RocmSmi == RSMI_IOLINK_TYPE_PCIEXPRESS)) {
-    TestPassed = true;
-  } else if ((linktype_Hip == HSA_AMD_LINK_INFO_TYPE_XGMI) &&
-     (linktype_RocmSmi == RSMI_IOLINK_TYPE_XGMI)) {
-    TestPassed = true;
-  } else {
-    printf("linktype Hip = %u, linktype RocmSmi = %u\n",
-            linktype_Hip, linktype_RocmSmi);
-    TestPassed = false;
-  }
-  return TestPassed;
-}
-
-bool testhipLinkTypeHopcountDevice(int numDevices) {
-  bool TestPassed = true;
-  // Opening and initializing rocm-smi library
-  void *lib_rocm_smi_hdl;
-  rsmi_status_t (*fntopo_get_link_type)(uint32_t, uint32_t, uint64_t*,
-                      RSMI_IO_LINK_TYPE*);
-  rsmi_status_t (*fntopo_init)(uint64_t);
-  rsmi_status_t (*fntopo_shut_down)();
-
-  lib_rocm_smi_hdl = dlopen("/opt/rocm/lib/librocm_smi64.so",
-                        RTLD_LAZY);
-  REQUIRE(lib_rocm_smi_hdl);
-
-  void* fnsym = dlsym(lib_rocm_smi_hdl, "rsmi_topo_get_link_type");
-  REQUIRE(fnsym);
-
-  fntopo_get_link_type = reinterpret_cast<rsmi_status_t (*)(uint32_t,
-            uint32_t, uint64_t*, RSMI_IO_LINK_TYPE*)>(fnsym);
-
-  fnsym = dlsym(lib_rocm_smi_hdl, "rsmi_init");
-  REQUIRE(fnsym);
-  fntopo_init = reinterpret_cast<rsmi_status_t (*)(uint64_t)>(fnsym);
-
-  fnsym = dlsym(lib_rocm_smi_hdl, "rsmi_shut_down");
-  REQUIRE(fnsym);
-  fntopo_shut_down = reinterpret_cast<rsmi_status_t (*)()>(fnsym);
-
-  uint64_t init_flags = 0;
-  rsmi_status_t retsmi_init;
-  retsmi_init = fntopo_init(init_flags);
-  REQUIRE(RSMI_STATUS_SUCCESS == retsmi_init);
-
-  // Use rocm-smi API rsmi_topo_get_link_type() to validate
-  struct devicePair {
-    int device1;
-    int device2;
-  };
-  std::vector<struct devicePair> devicePairList;
-  // Get the unique pair of devices
-  for (int x = 0; x < numDevices; x++) {
-    for (int y = x+1; y < numDevices; y++) {
-      devicePairList.push_back({x, y});
-    }
-  }
-  for (auto pos=devicePairList.begin();
-       pos != devicePairList.end(); pos++) {
-    uint32_t linktype1 = 0;
-    uint32_t hopcount1 = 0;
-    RSMI_IO_LINK_TYPE linktype2 = RSMI_IOLINK_TYPE_UNDEFINED;
-    uint64_t hopcount2 = 0;
-    rsmi_status_t retsmi;
-    HIPCHECK(hipExtGetLinkTypeAndHopCount((*pos).device1,
-                (*pos).device2, &linktype1, &hopcount1));
-    retsmi = fntopo_get_link_type((*pos).device1,
-                (*pos).device2, &hopcount2, &linktype2);
-    REQUIRE(RSMI_STATUS_SUCCESS == retsmi);
-
-    // Validate linktype
-    TestPassed = validateLinkType(linktype1, linktype2);
-  }
-  fntopo_shut_down();
-  dlclose(lib_rocm_smi_hdl);
-  return TestPassed;
-}
-#endif
-
-/**
- * @addtogroup hipExtGetLinkTypeAndHopCount hipExtGetLinkTypeAndHopCount
- * @{
- * @ingroup p2pTest
- * `hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount)` -
- * Returns the link type and hop count between two devices
- * @}
- */
-
-/**
- * Test Description
- * ------------------------
- *    - Validates negative scenarios for hipExtGetLinkTypeAndHopCount
- * 1)Test Scenario to verify when device1 is visible and device2 is masked
- * 2)Test Scenario to verify Invalid Device Number(s)
- * 3)Test Scenario to verify when linktype = NULL
- * 4)Test Scenario to verify when hopcount = NULL
- * 5)Test Scenario to verify when device1 = device2
- * 6)Test Scenario: Verify (hopcount, linktype) values for (src= device1, dest = device2)
- * and (src = device2, dest = device1), where device1 and device2 are valid device numbers.
- * 7)Test Scenario: Verify (hopcount, linktype) values for all combination of
- * GPUs with the output of rocm_smi tool.
-
- * Test source
- * ------------------------
- *    - catch/unit/p2p/hipExtGetLinkTypeAndHopCount.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.5
- */
-
-TEST_CASE("Unit_hipP2pLinkTypeAndHopFunc") {
-  int numDevices = 0;
-  bool TestPassed = true;
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices < 2) {
-    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
-    return;
-  }
-  SECTION("Test running for testhipInvalidDevice") {
-    TestPassed = testhipInvalidDevice(numDevices);
-    REQUIRE(TestPassed == true);
-  }
-#ifdef __linux__
-  getDeviceCount(&numDevices);
-  if (numDevices < 2) {
-    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
-    return;
-  }
-  SECTION("Test running for testMaskedDevice") {
-    TestPassed = testMaskedDevice(numDevices);
-    REQUIRE(TestPassed == true);
-  }
-  SECTION("Test running for testhipInvalidLinkType") {
-    TestPassed = testhipInvalidLinkType();
-    REQUIRE(TestPassed == true);
-  }
-  SECTION("Test running for testhipInvalidHopcount") {
-    TestPassed = testhipInvalidHopcount();
-    REQUIRE(TestPassed == true);
-  }
-  SECTION("Test running for testhipSameDevice") {
-    TestPassed = testhipSameDevice(numDevices);
-    REQUIRE(TestPassed == true);
-  }
-  SECTION("Test running for testhipLinkTypeHopcountDeviceOrderRev") {
-    TestPassed = testhipLinkTypeHopcountDeviceOrderRev(numDevices);
-    REQUIRE(TestPassed == true);
-  }
-  SECTION("Test running for testhipLinkTypeHopcountDevice") {
-    TestPassed = testhipLinkTypeHopcountDevice(numDevices);
-    REQUIRE(TestPassed == true);
-  }
-#else
-    printf("This test is skipped due to non linux environment.\n");
-#endif
-}
-
-/**
-* End doxygen group p2pTest.
-* @}
-*/
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "hipP2pLinkTypeAndHopFunc.h"
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+ 
+#ifdef __linux__
+#include <unistd.h>
+#include <sys/wait.h>
+#include <dlfcn.h>
+#endif
+#include <vector>
+#define MAX_SIZE 30
+#define VISIBLE_DEVICE 0
+
+/**
+ * Fetches Gpu device count
+ */
+#ifdef __linux__
+// Reports the GPU count returned by hipGetDeviceCount() in a forked child,
+// after ROCR_VISIBLE_DEVICES and HIP_VISIBLE_DEVICES have been removed from
+// the environment. The child sends the value back to the parent over a pipe.
+//   pdevCnt - out: device count read from the child. Set to 1 when the pipe
+//             or fork fails, or when the child exits without reporting, so
+//             that callers requiring at least 2 devices skip.
+void getDeviceCount(int *pdevCnt) {
+  const int fallbackDevCnt = 1;
+  int fd[2], val = 0;
+  // disable visible_devices env from shell
+  unsetenv("ROCR_VISIBLE_DEVICES");
+  unsetenv("HIP_VISIBLE_DEVICES");
+  // create pipe descriptors; never continue with uninitialised descriptors
+  if (pipe(fd) != 0) {
+    *pdevCnt = fallbackDevCnt;
+    return;
+  }
+
+  const pid_t childpid = fork();
+  if (childpid > 0) {  // Parent
+    close(fd[1]);
+    // parent will wait to read the device cnt
+    const ssize_t nread = read(fd[0], &val, sizeof(val));
+    // close the read-descriptor
+    close(fd[0]);
+    // wait for exactly this child to exit
+    waitpid(childpid, nullptr, 0);
+    // a short read means the child never reported a count
+    *pdevCnt = (nread == static_cast<ssize_t>(sizeof(val))) ? val
+                                                            : fallbackDevCnt;
+  } else if (!childpid) {  // Child
+    int devCnt = 1;
+    // writing only, no need for read-descriptor
+    close(fd[0]);
+    HIP_CHECK(hipGetDeviceCount(&devCnt));
+    // send the value on the write-descriptor:
+    write(fd[1], &devCnt, sizeof(devCnt));
+    // close the write descriptor:
+    close(fd[1]);
+    exit(0);
+  } else {  // failure
+    // fork() failed: release both pipe ends before reporting the fallback
+    close(fd[0]);
+    close(fd[1]);
+    *pdevCnt = fallbackDevCnt;
+  }
+}
+
+// Runs hipExtGetLinkTypeAndHopCount() in a forked child whose
+// ROCR_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES are both set to VISIBLE_DEVICE,
+// querying VISIBLE_DEVICE against VISIBLE_DEVICE+1 .. +actualNumGPUs-1.
+//   actualNumGPUs - number of GPUs used as the upper bound of the query loop.
+// Returns true only when the child reported that every query returned
+// hipSuccess; false on pipe/fork failure or when the child did not report.
+bool testMaskedDevice(int actualNumGPUs) {
+  int fd[2];
+  if (pipe(fd) != 0) {
+    printf("Info:pipe() failed\n");
+    return false;
+  }
+
+  const pid_t cPid = fork();
+  if (cPid == 0) {  // child
+    // The outcome is recorded in a flag rather than asserted with REQUIRE:
+    // a Catch2 assertion failing in the child is not visible to the parent.
+    bool childResult = true;
+    char visibleDeviceString[MAX_SIZE] = {};
+    snprintf(visibleDeviceString, MAX_SIZE, "%d", VISIBLE_DEVICE);
+    // disable visible_devices env from shell
+    unsetenv("ROCR_VISIBLE_DEVICES");
+    unsetenv("HIP_VISIBLE_DEVICES");
+    setenv("ROCR_VISIBLE_DEVICES", visibleDeviceString, 1);
+    setenv("HIP_VISIBLE_DEVICES", visibleDeviceString, 1);
+    uint32_t linktype = 0;
+    uint32_t hopcount = 0;
+    for (int count = 1;
+        count < actualNumGPUs; count++) {
+      const hipError_t err = hipExtGetLinkTypeAndHopCount(VISIBLE_DEVICE,
+            VISIBLE_DEVICE+count, &linktype, &hopcount);
+      if (err != hipSuccess) {
+        printf("Info:query for device %d returned error %d\n",
+               VISIBLE_DEVICE+count, static_cast<int>(err));
+        childResult = false;
+        break;
+      }
+    }
+    close(fd[0]);
+    write(fd[1], &childResult, sizeof(childResult));
+    close(fd[1]);
+    exit(0);
+  }
+
+  if (cPid < 0) {
+    printf("Info:fork() failed\n");
+    close(fd[0]);
+    close(fd[1]);
+    return false;
+  }
+
+  // parent: start from "failed" so a child that dies silently is not a pass
+  bool testResult = false;
+  close(fd[1]);
+  const ssize_t nread = read(fd[0], &testResult, sizeof(testResult));
+  close(fd[0]);
+  waitpid(cPid, nullptr, 0);
+  if (nread != static_cast<ssize_t>(sizeof(testResult))) {
+    printf("Info:child exited without reporting a result\n");
+    return false;
+  }
+  return testResult;
+}
+#endif
+
+// Negative checks: each section passes -1 and/or numDevices as a device
+// number and requires hipExtGetLinkTypeAndHopCount() not to return
+// hipSuccess. Always returns true; failures surface through REQUIRE.
+bool testhipInvalidDevice(int numDevices) {
+  uint32_t linkKind;
+  uint32_t hops;
+  SECTION("Invalid device number case 1") {
+    const hipError_t status =
+        hipExtGetLinkTypeAndHopCount(-1, 0, &linkKind, &hops);
+    REQUIRE(status != hipSuccess);
+  }
+  SECTION("Invalid device number case 2") {
+    const hipError_t status =
+        hipExtGetLinkTypeAndHopCount(numDevices, 0, &linkKind, &hops);
+    REQUIRE(status != hipSuccess);
+  }
+  SECTION("Invalid device number case 3") {
+    const hipError_t status =
+        hipExtGetLinkTypeAndHopCount(0, -1, &linkKind, &hops);
+    REQUIRE(status != hipSuccess);
+  }
+  SECTION("Invalid device number case 4") {
+    const hipError_t status =
+        hipExtGetLinkTypeAndHopCount(0, numDevices, &linkKind, &hops);
+    REQUIRE(status != hipSuccess);
+  }
+  SECTION("Invalid device number case 5") {
+    const hipError_t status =
+        hipExtGetLinkTypeAndHopCount(-1, numDevices, &linkKind, &hops);
+    REQUIRE(status != hipSuccess);
+  }
+  return true;
+}
+
+#ifdef __linux__
+// Negative check: a null linktype pointer must not yield hipSuccess.
+// Always returns true; a failure surfaces through REQUIRE.
+bool testhipInvalidLinkType() {
+  uint32_t hops;
+  const hipError_t status =
+      hipExtGetLinkTypeAndHopCount(0, 1, nullptr, &hops);
+  REQUIRE(hipSuccess != status);
+  return true;
+}
+
+// Negative check: a null hopcount pointer must not yield hipSuccess.
+// Always returns true; a failure surfaces through REQUIRE.
+bool testhipInvalidHopcount() {
+  uint32_t linkKind;
+  const hipError_t status =
+      hipExtGetLinkTypeAndHopCount(0, 1, &linkKind, nullptr);
+  REQUIRE(hipSuccess != status);
+  return true;
+}
+
+// Negative check: querying a device against itself must not yield
+// hipSuccess, for every device id in [0, numGPUs).
+// Always returns true; a failure surfaces through REQUIRE.
+bool testhipSameDevice(int numGPUs) {
+  uint32_t linkKind = 0;
+  uint32_t hops = 0;
+  int dev = 0;
+  while (dev < numGPUs) {
+    const hipError_t status =
+        hipExtGetLinkTypeAndHopCount(dev, dev, &linkKind, &hops);
+    REQUIRE(status != hipSuccess);
+    ++dev;
+  }
+  return true;
+}
+
+// Checks that the query is symmetric: for every unique device pair (x, y)
+// the (linktype, hopcount) returned for x->y must equal the one for y->x.
+//   numDevices - number of devices; pairs are taken from [0, numDevices).
+// Returns false at the first pair whose two directions disagree,
+// true otherwise.
+bool testhipLinkTypeHopcountDeviceOrderRev(int numDevices) {
+  // Get the unique pair of devices
+  for (int x = 0; x < numDevices; x++) {
+    for (int y = x+1; y < numDevices; y++) {
+      uint32_t linktype1 = 0, linktype2 = 0;
+      uint32_t hopcount1 = 0, hopcount2 = 0;
+      HIP_CHECK(hipExtGetLinkTypeAndHopCount(x, y,
+                          &linktype1, &hopcount1));
+      HIP_CHECK(hipExtGetLinkTypeAndHopCount(y, x,
+                          &linktype2, &hopcount2));
+      // Both values are compared, and the function returns right away so
+      // the remaining pairs are not queried after a mismatch.
+      if ((hopcount1 != hopcount2) || (linktype1 != linktype2)) {
+        printf("devices (%d, %d): linktype %u vs %u, hopcount %u vs %u\n",
+               x, y, linktype1, linktype2, hopcount1, hopcount2);
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+/**
+ * Internal helper: tells whether HIP and rocm-smi report the same link kind.
+ * Only the PCIe/PCIe and XGMI/XGMI combinations are accepted; any other
+ * combination is printed and rejected.
+ */
+bool validateLinkType(uint32_t linktype_Hip,
+                      RSMI_IO_LINK_TYPE linktype_RocmSmi) {
+  const bool bothPcie = (linktype_Hip == HSA_AMD_LINK_INFO_TYPE_PCIE) &&
+                        (linktype_RocmSmi == RSMI_IOLINK_TYPE_PCIEXPRESS);
+  const bool bothXgmi = (linktype_Hip == HSA_AMD_LINK_INFO_TYPE_XGMI) &&
+                        (linktype_RocmSmi == RSMI_IOLINK_TYPE_XGMI);
+  if (bothPcie || bothXgmi) {
+    return true;
+  }
+  printf("linktype Hip = %u, linktype RocmSmi = %u\n",
+          linktype_Hip, linktype_RocmSmi);
+  return false;
+}
+
+// Cross-checks, for every unique device pair, the link type reported by
+// hipExtGetLinkTypeAndHopCount() against rsmi_topo_get_link_type() from
+// librocm_smi64.so, which is loaded at runtime with dlopen/dlsym.
+//   numDevices - number of devices; pairs are taken from [0, numDevices).
+// Returns true only when validateLinkType() accepts every pair.
+bool testhipLinkTypeHopcountDevice(int numDevices) {
+  using TopoGetLinkTypeFn = rsmi_status_t (*)(uint32_t, uint32_t, uint64_t*,
+                                              RSMI_IO_LINK_TYPE*);
+  using RsmiInitFn = rsmi_status_t (*)(uint64_t);
+  using RsmiShutDownFn = rsmi_status_t (*)();
+
+  bool TestPassed = true;
+  // Opening and initializing rocm-smi library
+  void *lib_rocm_smi_hdl = dlopen("/opt/rocm/lib/librocm_smi64.so",
+                                  RTLD_LAZY);
+  REQUIRE(lib_rocm_smi_hdl);
+
+  void* fnsym = dlsym(lib_rocm_smi_hdl, "rsmi_topo_get_link_type");
+  REQUIRE(fnsym);
+  TopoGetLinkTypeFn fntopo_get_link_type =
+      reinterpret_cast<TopoGetLinkTypeFn>(fnsym);
+
+  fnsym = dlsym(lib_rocm_smi_hdl, "rsmi_init");
+  REQUIRE(fnsym);
+  RsmiInitFn fntopo_init = reinterpret_cast<RsmiInitFn>(fnsym);
+
+  fnsym = dlsym(lib_rocm_smi_hdl, "rsmi_shut_down");
+  REQUIRE(fnsym);
+  RsmiShutDownFn fntopo_shut_down = reinterpret_cast<RsmiShutDownFn>(fnsym);
+
+  const uint64_t init_flags = 0;
+  const rsmi_status_t retsmi_init = fntopo_init(init_flags);
+  REQUIRE(RSMI_STATUS_SUCCESS == retsmi_init);
+
+  // Use rocm-smi API rsmi_topo_get_link_type() to validate
+  // every unique pair of devices
+  for (int x = 0; x < numDevices; x++) {
+    for (int y = x+1; y < numDevices; y++) {
+      uint32_t linktype1 = 0;
+      uint32_t hopcount1 = 0;
+      RSMI_IO_LINK_TYPE linktype2 = RSMI_IOLINK_TYPE_UNDEFINED;
+      uint64_t hopcount2 = 0;
+      HIP_CHECK(hipExtGetLinkTypeAndHopCount(x, y, &linktype1, &hopcount1));
+      const rsmi_status_t retsmi =
+          fntopo_get_link_type(x, y, &hopcount2, &linktype2);
+      REQUIRE(RSMI_STATUS_SUCCESS == retsmi);
+
+      // Validate linktype. A mismatch is latched rather than overwritten,
+      // so a later matching pair cannot hide an earlier failure; the loop
+      // keeps going so every mismatching pair gets printed.
+      if (!validateLinkType(linktype1, linktype2)) {
+        TestPassed = false;
+      }
+    }
+  }
+  fntopo_shut_down();
+  dlclose(lib_rocm_smi_hdl);
+  return TestPassed;
+}
+#endif
+
+/**
+ * @addtogroup hipExtGetLinkTypeAndHopCount hipExtGetLinkTypeAndHopCount
+ * @{
+ * @ingroup p2pTest
+ * `hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount)` -
+ * Returns the link type and hop count between two devices
+ * @}
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Validates negative scenarios for hipExtGetLinkTypeAndHopCount
+ * 1)Test Scenario to verify when device1 is visible and device2 is masked
+ * 2)Test Scenario to verify Invalid Device Number(s)
+ * 3)Test Scenario to verify when linktype = NULL
+ * 4)Test Scenario to verify when hopcount = NULL
+ * 5)Test Scenario to verify when device1 = device2
+ * 6)Test Scenario: Verify (hopcount, linktype) values for (src= device1, dest = device2)
+ * and (src = device2, dest = device1), where device1 and device2 are valid device numbers.
+ * 7)Test Scenario: Verify (hopcount, linktype) values for all combination of
+ * GPUs with the output of rocm_smi tool.
+ *
+ * Test source
+ * ------------------------
+ *    - catch/unit/p2p/hipExtGetLinkTypeAndHopCount.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ */
+
+// Entry point: skips when fewer than 2 devices are available, runs the
+// invalid-device checks, and on Linux re-reads the device count through
+// getDeviceCount() before running the remaining scenarios.
+TEST_CASE("Unit_hipP2pLinkTypeAndHopFunc") {
+  int numDevices = 0;
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices < 2) {
+    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
+    return;
+  }
+  SECTION("Test running for testhipInvalidDevice") {
+    const bool passed = testhipInvalidDevice(numDevices);
+    REQUIRE(passed == true);
+  }
+#ifdef __linux__
+  getDeviceCount(&numDevices);
+  if (numDevices < 2) {
+    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
+    return;
+  }
+  SECTION("Test running for testMaskedDevice") {
+    const bool passed = testMaskedDevice(numDevices);
+    REQUIRE(passed == true);
+  }
+  SECTION("Test running for testhipInvalidLinkType") {
+    const bool passed = testhipInvalidLinkType();
+    REQUIRE(passed == true);
+  }
+  SECTION("Test running for testhipInvalidHopcount") {
+    const bool passed = testhipInvalidHopcount();
+    REQUIRE(passed == true);
+  }
+  SECTION("Test running for testhipSameDevice") {
+    const bool passed = testhipSameDevice(numDevices);
+    REQUIRE(passed == true);
+  }
+  SECTION("Test running for testhipLinkTypeHopcountDeviceOrderRev") {
+    const bool passed = testhipLinkTypeHopcountDeviceOrderRev(numDevices);
+    REQUIRE(passed == true);
+  }
+  SECTION("Test running for testhipLinkTypeHopcountDevice") {
+    const bool passed = testhipLinkTypeHopcountDevice(numDevices);
+    REQUIRE(passed == true);
+  }
+#else
+    printf("This test is skipped due to non linux environment.\n");
+#endif
+}
+
+/**
+* End doxygen group p2pTest.
+* @}
+*/
@@ -1,110 +1,110 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#ifndef _HIP_DIRTEST_P2PLINKTYPEHOP_H_
-#define _HIP_DIRTEST_P2PLINKTYPEHOP_H_
-/**
- * rocm_smi.h enums
- */
-typedef enum {
-  RSMI_STATUS_SUCCESS = 0x0,             //!< Operation was successful
-  RSMI_STATUS_INVALID_ARGS,              //!< Passed in arguments are not valid
-  RSMI_STATUS_NOT_SUPPORTED,             //!< The requested information or
-                                         //!< action is not available for the
-                                         //!< given input, on the given system
-  RSMI_STATUS_FILE_ERROR,                //!< Problem accessing a file. This
-                                         //!< may because the operation is not
-                                         //!< supported by the Linux kernel
-                                         //!< version running on the executing
-                                         //!< machine
-  RSMI_STATUS_PERMISSION,                //!< Permission denied/EACCESS file
-                                         //!< error. Many functions require
-                                         //!< root access to run.
-  RSMI_STATUS_OUT_OF_RESOURCES,          //!< Unable to acquire memory or other
-                                         //!< resource
-  RSMI_STATUS_INTERNAL_EXCEPTION,        //!< An internal exception was caught
-  RSMI_STATUS_INPUT_OUT_OF_BOUNDS,       //!< The provided input is out of
-                                         //!< allowable or safe range
-  RSMI_STATUS_INIT_ERROR,                //!< An error occurred when rsmi
-                                         //!< initializing internal data
-                                         //!< structures
-  RSMI_INITIALIZATION_ERROR = RSMI_STATUS_INIT_ERROR,
-  RSMI_STATUS_NOT_YET_IMPLEMENTED,       //!< The requested function has not
-                                         //!< yet been implemented in the
-                                         //!< current system for the current
-                                         //!< devices
-  RSMI_STATUS_NOT_FOUND,                 //!< An item was searched for but not
-                                         //!< found
-  RSMI_STATUS_INSUFFICIENT_SIZE,         //!< Not enough resources were
-                                         //!< available for the operation
-  RSMI_STATUS_INTERRUPT,                 //!< An interrupt occurred during
-                                         //!< execution of function
-  RSMI_STATUS_UNEXPECTED_SIZE,           //!< An unexpected amount of data
-                                         //!< was read
-  RSMI_STATUS_NO_DATA,                   //!< No data was found for a given
-                                         //!< input
-  RSMI_STATUS_UNEXPECTED_DATA,           //!< The data read or provided to
-                                         //!< function is not what was expected
-  RSMI_STATUS_BUSY,                      //!< A resource or mutex could not be
-                                         //!< acquired because it is already
-                                         //!< being used
-  RSMI_STATUS_REFCOUNT_OVERFLOW,          //!< An internal reference counter
-                                         //!< exceeded INT32_MAX
-
-  RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF,  //!< An unknown error occurred
-} rsmi_status_t;
-
-/**
- * Types for IO Link returned from rocm_smi
- */
-typedef enum _RSMI_IO_LINK_TYPE {
-  RSMI_IOLINK_TYPE_UNDEFINED      = 0,          //!< unknown type.
-  RSMI_IOLINK_TYPE_PCIEXPRESS     = 1,          //!< PCI Express
-  RSMI_IOLINK_TYPE_XGMI           = 2,          //!< XGMI
-  RSMI_IOLINK_TYPE_NUMIOLINKTYPES,              //!< Number of IO Link types
-  RSMI_IOLINK_TYPE_SIZE           = 0xFFFFFFFF  //!< Max of IO Link types
-} RSMI_IO_LINK_TYPE;
-
-/**
- * Types for IO Link returned from rocm runtime
- */
-typedef enum {
-  /**
-  * Hyper-transport bus type.
-  */
-  HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0,
-  /**
-  * QPI bus type.
-  */
-  HSA_AMD_LINK_INFO_TYPE_QPI = 1,
-  /**
-  * PCIe bus type.
-  */
-  HSA_AMD_LINK_INFO_TYPE_PCIE = 2,
-  /**
-  * Infiniband bus type.
-  */
-  HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3,
-  /**
-  * xGMI link type.
-  */
-  HSA_AMD_LINK_INFO_TYPE_XGMI = 4
-} hsa_amd_link_info_type_t;
-
-#endif  // _HIP_DIRTEST_P2PLINKTYPEHOP_H_
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef _HIP_DIRTEST_P2PLINKTYPEHOP_H_
+#define _HIP_DIRTEST_P2PLINKTYPEHOP_H_
+/**
+ * Status codes taken from rocm_smi.h (rsmi_status_t).
+ * NOTE(review): presumably a local copy that must stay in sync with the
+ * installed rocm_smi.h; confirm before changing any enumerator.
+ */
+typedef enum {
+  RSMI_STATUS_SUCCESS = 0x0,             //!< Operation was successful
+  RSMI_STATUS_INVALID_ARGS,              //!< Passed in arguments are not valid
+  RSMI_STATUS_NOT_SUPPORTED,             //!< The requested information or
+                                         //!< action is not available for the
+                                         //!< given input, on the given system
+  RSMI_STATUS_FILE_ERROR,                //!< Problem accessing a file. This may
+                                         //!< be because the operation is not
+                                         //!< supported by the Linux kernel
+                                         //!< version running on the executing
+                                         //!< machine
+  RSMI_STATUS_PERMISSION,                //!< Permission denied/EACCES file
+                                         //!< error. Many functions require
+                                         //!< root access to run.
+  RSMI_STATUS_OUT_OF_RESOURCES,          //!< Unable to acquire memory or other
+                                         //!< resource
+  RSMI_STATUS_INTERNAL_EXCEPTION,        //!< An internal exception was caught
+  RSMI_STATUS_INPUT_OUT_OF_BOUNDS,       //!< The provided input is out of
+                                         //!< allowable or safe range
+  RSMI_STATUS_INIT_ERROR,                //!< An error occurred while rsmi was
+                                         //!< initializing internal data
+                                         //!< structures
+  RSMI_INITIALIZATION_ERROR = RSMI_STATUS_INIT_ERROR,
+  RSMI_STATUS_NOT_YET_IMPLEMENTED,       //!< The requested function has not
+                                         //!< yet been implemented in the
+                                         //!< current system for the current
+                                         //!< devices
+  RSMI_STATUS_NOT_FOUND,                 //!< An item was searched for but not
+                                         //!< found
+  RSMI_STATUS_INSUFFICIENT_SIZE,         //!< Not enough resources were
+                                         //!< available for the operation
+  RSMI_STATUS_INTERRUPT,                 //!< An interrupt occurred during
+                                         //!< execution of function
+  RSMI_STATUS_UNEXPECTED_SIZE,           //!< An unexpected amount of data
+                                         //!< was read
+  RSMI_STATUS_NO_DATA,                   //!< No data was found for a given
+                                         //!< input
+  RSMI_STATUS_UNEXPECTED_DATA,           //!< The data read or provided to
+                                         //!< function is not what was expected
+  RSMI_STATUS_BUSY,                      //!< A resource or mutex could not be
+                                         //!< acquired because it is already
+                                         //!< being used
+  RSMI_STATUS_REFCOUNT_OVERFLOW,          //!< An internal reference counter
+                                         //!< exceeded INT32_MAX
+
+  RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF,  //!< An unknown error occurred
+} rsmi_status_t;
+
+/**
+ * IO link types as returned by rocm_smi.
+ * NOTE(review): presumably copied from rocm_smi.h; keep the numeric values
+ * unchanged unless that header changes - confirm.
+ */
+typedef enum _RSMI_IO_LINK_TYPE {
+  RSMI_IOLINK_TYPE_UNDEFINED      = 0,          //!< Unknown type
+  RSMI_IOLINK_TYPE_PCIEXPRESS     = 1,          //!< PCI Express
+  RSMI_IOLINK_TYPE_XGMI           = 2,          //!< XGMI
+  RSMI_IOLINK_TYPE_NUMIOLINKTYPES,              //!< Number of IO Link types
+  RSMI_IOLINK_TYPE_SIZE           = 0xFFFFFFFF  //!< Max of IO Link types
+} RSMI_IO_LINK_TYPE;
+
+/**
+ * IO link types as returned by the ROCm runtime.
+ * NOTE(review): presumably a local copy of the runtime's own enum; keep the
+ * numeric values unchanged unless that definition changes - confirm.
+ */
+typedef enum {
+  /**
+  * Hyper-transport bus type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0,
+  /**
+  * QPI bus type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_QPI = 1,
+  /**
+  * PCIe bus type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_PCIE = 2,
+  /**
+  * InfiniBand bus type. The identifier is spelled INFINBAND;
+  * NOTE(review): presumably kept to match the upstream name - confirm.
+  */
+  HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3,
+  /**
+  * xGMI link type.
+  */
+  HSA_AMD_LINK_INFO_TYPE_XGMI = 4
+} hsa_amd_link_info_type_t;
+
+#endif  // _HIP_DIRTEST_P2PLINKTYPEHOP_H_
@@ -1,178 +1,178 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/*
-The Functions defined in RtcFunctions.cpp are declared here in RtcFunctions.h.
-*/
-
-#ifndef CATCH_UNIT_RTC_HEADERS_RTCFUNCTIONS_H_
-#define CATCH_UNIT_RTC_HEADERS_RTCFUNCTIONS_H_
-#include <string>
-
-bool check_architecture(const char** Combination_CO,
-                       int Combination_CO_size, int max_thread_pos,
-                       int fast_math_present);
-
-bool check_rdc(const char** Combination_CO,
-               int Combination_CO_size, int max_thread_pos,
-               int fast_math_present);
-
-bool check_denormals_enabled(const char** Combination_CO,
-                             int Combination_CO_size, int max_thread_pos,
-                             int fast_math_present);
-
-bool check_denormals_disabled(const char** Combination_CO,
-                              int Combination_CO_size, int max_thread_pos,
-                              int fast_math_present);
-
-bool check_ffp_contract_off(const char** Combination_CO,
-                            int Combination_CO_size, int max_thread_pos,
-                            int fast_math_present);
-
-bool check_ffp_contract_on(const char** Combination_CO,
-                           int Combination_CO_size, int max_thread_pos,
-                           int fast_math_present);
-
-bool check_ffp_contract_fast(const char** Combination_CO,
-                             int Combination_CO_size, int max_thread_pos,
-                             int fast_math_present);
-
-bool check_fast_math_enabled(const char** Combination_CO,
-                             int Combination_CO_size, int max_thread_pos,
-                             int fast_math_present);
-
-bool check_fast_math_disabled(const char** Combination_CO,
-                              int Combination_CO_size, int max_thread_pos,
-                              int fast_math_present);
-
-bool check_slp_vectorize_enabled(const char** Combination_CO,
-                                 int Combination_CO_size, int max_thread_pos,
-                                 int fast_math_present);
-
-bool check_slp_vectorize_disabled(const char** Combination_CO,
-                                  int Combination_CO_size, int max_thread_pos,
-                                  int fast_math_present);
-
-bool check_macro(const char** Combination_CO,
-                 int Combination_CO_size, int max_thread_pos,
-                 int fast_math_present);
-
-bool check_undef_macro(const char** Combination_CO,
-                       int Combination_CO_size, int max_thread_pos,
-                       int fast_math_present);
-
-bool check_header_dir(const char** Combination_CO,
-                      int Combination_CO_size, int max_thread_pos,
-                      int fast_math_present);
-
-bool check_warning(const char** Combination_CO,
-                   int Combination_CO_size, int max_thread_pos,
-                   int fast_math_present);
-
-bool check_Rpass_inline(const char** Combination_CO,
-                        int Combination_CO_size, int max_thread_pos,
-                        int fast_math_present);
-
-bool check_conversionerror_enabled(const char** Combination_CO,
-                                   int Combination_CO_size, int max_thread_pos,
-                                   int fast_math_present);
-
-bool check_conversionerror_disabled(const char** Combination_CO,
-                                    int Combination_CO_size,
-                                    int max_thread_pos,
-                                    int fast_math_present);
-
-bool check_conversionwarning_enabled(const char** Combination_CO,
-                                     int Combination_CO_size,
-                                     int max_thread_pos,
-                                     int fast_math_present);
-
-bool check_conversionwarning_disabled(const char** Combination_CO,
-                                      int Combination_CO_size,
-                                      int max_thread_pos,
-                                      int fast_math_present);
-
-bool check_max_thread(const char** Combination_CO,
-                      int Combination_CO_size, int max_thread_pos,
-                      int fast_math_present);
-
-bool check_unsafe_atomic_enabled(const char** Combination_CO,
-                                 int Combination_CO_size, int max_thread_pos,
-                                 int fast_math_present);
-
-bool check_unsafe_atomic_disabled(const char** Combination_CO,
-                                  int Combination_CO_size, int max_thread_pos,
-                                  int fast_math_present);
-
-bool check_infinite_num_enabled(const char** Combination_CO,
-                                int Combination_CO_size, int max_thread_pos,
-                                int fast_math_present);
-
-bool check_infinite_num_disabled(const char** Combination_CO,
-                                 int Combination_CO_size, int max_thread_pos,
-                                 int fast_math_present);
-
-bool check_NAN_num_enabled(const char** Combination_CO,
-                           int Combination_CO_size, int max_thread_pos,
-                           int fast_math_present);
-
-bool check_NAN_num_disabled(const char** Combination_CO,
-                            int Combination_CO_size, int max_thread_pos,
-                            int fast_math_present);
-
-bool check_finite_math_enabled(const char** Combination_CO,
-                               int Combination_CO_size, int max_thread_pos,
-                               int fast_math_present);
-
-bool check_finite_math_disabled(const char** Combination_CO,
-                                int Combination_CO_size, int max_thread_pos,
-                                int fast_math_present);
-
-bool check_associative_math_enabled(const char** Combination_CO,
-                                    int Combination_CO_size,
-                                    int max_thread_pos,
-                                    int fast_math_present);
-
-bool check_associative_math_disabled(const char** Combination_CO,
-                                     int Combination_CO_size,
-                                     int max_thread_pos,
-                                     int fast_math_present);
-
-bool check_signed_zeros_enabled(const char** Combination_CO,
-                                int Combination_CO_size,
-                                int max_thread_pos,
-                                int fast_math_present);
-
-bool check_signed_zeros_disabled(const char** Combination_CO,
-                                 int Combination_CO_size, int max_thread_pos,
-                                 int fast_math_present);
-
-bool check_trapping_math_enabled(const char** Combination_CO,
-                                 int Combination_CO_size, int max_thread_pos,
-                                 int fast_math_present);
-
-bool check_trapping_math_disabled(const char** Combination_CO,
-                                  int Combination_CO_size, int max_thread_pos,
-                                  int fast_math_present);
-
-std::string checking_IR(const char* kername, const char** extra_CO_IRadded,
-                        int extra_CO_IRadded_size, const char** Combination_CO,
-                        int Combination_CO_size);
-
-#endif  // CATCH_UNIT_RTC_HEADERS_RTCFUNCTIONS_H_
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+The Functions defined in RtcFunctions.cpp are declared here in RtcFunctions.h.
+*/
+
+#ifndef CATCH_UNIT_RTC_HEADERS_RTCFUNCTIONS_H_
+#define CATCH_UNIT_RTC_HEADERS_RTCFUNCTIONS_H_
+#include <string>
+
+bool check_architecture(const char** Combination_CO,
+                       int Combination_CO_size, int max_thread_pos,
+                       int fast_math_present);
+
+bool check_rdc(const char** Combination_CO,
+               int Combination_CO_size, int max_thread_pos,
+               int fast_math_present);
+
+bool check_denormals_enabled(const char** Combination_CO,
+                             int Combination_CO_size, int max_thread_pos,
+                             int fast_math_present);
+
+bool check_denormals_disabled(const char** Combination_CO,
+                              int Combination_CO_size, int max_thread_pos,
+                              int fast_math_present);
+
+bool check_ffp_contract_off(const char** Combination_CO,
+                            int Combination_CO_size, int max_thread_pos,
+                            int fast_math_present);
+
+bool check_ffp_contract_on(const char** Combination_CO,
+                           int Combination_CO_size, int max_thread_pos,
+                           int fast_math_present);
+
+bool check_ffp_contract_fast(const char** Combination_CO,
+                             int Combination_CO_size, int max_thread_pos,
+                             int fast_math_present);
+
+bool check_fast_math_enabled(const char** Combination_CO,
+                             int Combination_CO_size, int max_thread_pos,
+                             int fast_math_present);
+
+bool check_fast_math_disabled(const char** Combination_CO,
+                              int Combination_CO_size, int max_thread_pos,
+                              int fast_math_present);
+
+bool check_slp_vectorize_enabled(const char** Combination_CO,
+                                 int Combination_CO_size, int max_thread_pos,
+                                 int fast_math_present);
+
+bool check_slp_vectorize_disabled(const char** Combination_CO,
+                                  int Combination_CO_size, int max_thread_pos,
+                                  int fast_math_present);
+
+bool check_macro(const char** Combination_CO,
+                 int Combination_CO_size, int max_thread_pos,
+                 int fast_math_present);
+
+bool check_undef_macro(const char** Combination_CO,
+                       int Combination_CO_size, int max_thread_pos,
+                       int fast_math_present);
+
+bool check_header_dir(const char** Combination_CO,
+                      int Combination_CO_size, int max_thread_pos,
+                      int fast_math_present);
+
+bool check_warning(const char** Combination_CO,
+                   int Combination_CO_size, int max_thread_pos,
+                   int fast_math_present);
+
+bool check_Rpass_inline(const char** Combination_CO,
+                        int Combination_CO_size, int max_thread_pos,
+                        int fast_math_present);
+
+bool check_conversionerror_enabled(const char** Combination_CO,
+                                   int Combination_CO_size, int max_thread_pos,
+                                   int fast_math_present);
+
+bool check_conversionerror_disabled(const char** Combination_CO,
+                                    int Combination_CO_size,
+                                    int max_thread_pos,
+                                    int fast_math_present);
+
+bool check_conversionwarning_enabled(const char** Combination_CO,
+                                     int Combination_CO_size,
+                                     int max_thread_pos,
+                                     int fast_math_present);
+
+bool check_conversionwarning_disabled(const char** Combination_CO,
+                                      int Combination_CO_size,
+                                      int max_thread_pos,
+                                      int fast_math_present);
+
+bool check_max_thread(const char** Combination_CO,
+                      int Combination_CO_size, int max_thread_pos,
+                      int fast_math_present);
+
+bool check_unsafe_atomic_enabled(const char** Combination_CO,
+                                 int Combination_CO_size, int max_thread_pos,
+                                 int fast_math_present);
+
+bool check_unsafe_atomic_disabled(const char** Combination_CO,
+                                  int Combination_CO_size, int max_thread_pos,
+                                  int fast_math_present);
+
+bool check_infinite_num_enabled(const char** Combination_CO,
+                                int Combination_CO_size, int max_thread_pos,
+                                int fast_math_present);
+
+bool check_infinite_num_disabled(const char** Combination_CO,
+                                 int Combination_CO_size, int max_thread_pos,
+                                 int fast_math_present);
+
+bool check_NAN_num_enabled(const char** Combination_CO,
+                           int Combination_CO_size, int max_thread_pos,
+                           int fast_math_present);
+
+bool check_NAN_num_disabled(const char** Combination_CO,
+                            int Combination_CO_size, int max_thread_pos,
+                            int fast_math_present);
+
+bool check_finite_math_enabled(const char** Combination_CO,
+                               int Combination_CO_size, int max_thread_pos,
+                               int fast_math_present);
+
+bool check_finite_math_disabled(const char** Combination_CO,
+                                int Combination_CO_size, int max_thread_pos,
+                                int fast_math_present);
+
+bool check_associative_math_enabled(const char** Combination_CO,
+                                    int Combination_CO_size,
+                                    int max_thread_pos,
+                                    int fast_math_present);
+
+bool check_associative_math_disabled(const char** Combination_CO,
+                                     int Combination_CO_size,
+                                     int max_thread_pos,
+                                     int fast_math_present);
+
+bool check_signed_zeros_enabled(const char** Combination_CO,
+                                int Combination_CO_size,
+                                int max_thread_pos,
+                                int fast_math_present);
+
+bool check_signed_zeros_disabled(const char** Combination_CO,
+                                 int Combination_CO_size, int max_thread_pos,
+                                 int fast_math_present);
+
+bool check_trapping_math_enabled(const char** Combination_CO,
+                                 int Combination_CO_size, int max_thread_pos,
+                                 int fast_math_present);
+
+bool check_trapping_math_disabled(const char** Combination_CO,
+                                  int Combination_CO_size, int max_thread_pos,
+                                  int fast_math_present);
+
+std::string checking_IR(const char* kername, const char** extra_CO_IRadded,
+                        int extra_CO_IRadded_size, const char** Combination_CO,
+                        int Combination_CO_size);
+
+#endif  // CATCH_UNIT_RTC_HEADERS_RTCFUNCTIONS_H_
@@ -1,163 +1,163 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/*
-RtcKernels.h contains the string's with the which includes the kernel code.
-They are utilized by the compiler option functions, defined in RtcFunctions.cpp
-*/
-
-#ifndef CATCH_UNIT_RTC_HEADERS_RTCKERNELS_H_
-#define CATCH_UNIT_RTC_HEADERS_RTCKERNELS_H_
-#include <hip/hiprtc.h>
-#include <hip/hip_runtime.h>
-#include <math.h>
-
-static constexpr auto max_thread_string {
-R"(
-extern "C"
-__global__ void max_thread(int* a) {
-  int BD = blockDim.x;
-  *a = BD;
-}
-)"};
-
-static constexpr auto denormals_string {
-R"(
-extern "C"
-__global__ void denormals(double* base, double* power, double* result) {
-  float denorm = powf(*base, *power);
-  if (*result == 0 || *result ==1 )
-    *result = (denorm==0) ? 0 : 1;
-  else
-    *result = powf(*base, *power);
-}
-)"};
-
-static constexpr auto warning_string {
-R"(
-extern "C"
-__global__ void warning() {
-  #warning "Just printing a WARNING message onto the terminal";
-}
-)"};
-
-static constexpr auto fp32_div_sqrt_string {
-R"(
-extern "C"
-__global__ void fp32_div_sqrt(float* result) {
-  float input = 109.6209;
-  *result = sqrt(input);
-}
-)"};
-
-static constexpr auto error_string {
-R"(
-extern "C"
-__global__ void error() {
-  unsigned int a = -1;
-  unsigned int b = +1;
-  signed int c = -1;
-  signed int d = +1;
-}
-)"};
-
-static constexpr auto macro_string {
-R"(
-extern "C"
-__global__ void macro(int *result) {
-  *result = PI;
-}
-)"};
-
-static constexpr auto undef_macro_string {
-R"(
-extern "C"
-__global__ void undef_macro() {
-  int a = Z;
-}
-)"};
-
-static constexpr auto header_dir_string {
-R"(
-#include "RtcFact.h"
-extern "C"
-__global__ void header_dir(int* a, int* val) {
-  *a = fact(*val);
-}
-)"};
-
-static constexpr auto rdc_string {
-R"(
-extern "C"
-__global__ void rdc(float* a, float* b, float* c) {
-  *c = *a * *b;
-}
-)"};
-
-static constexpr auto ffp_contract_string {
-R"(
-extern "C"
-__global__ void ffp_contract(float* a, float* b, float* c) {
-  *c = *a * *b + *c;
-}
-)"};
-
-static constexpr auto slp_vectorize_string {
-R"(
-extern "C"
-__global__ void slp_vectorize(__half2 a, __half2 x, __half2 *y) {
-  (*y).data.x = x.data.x + a.data.x;
-  (*y).data.y = x.data.y + a.data.y;
-}
-)"};
-
-static constexpr auto unsafe_atomic_string {
-R"(
-extern "C"
-__global__ void unsafe_atomic(float* a) {
-  int id = threadIdx.x + blockIdx.x * blockDim.x;
-  if (id < 1000) {
-    unsafeAtomicAdd(&a[id], 0.2f);
-  }
-}
-)"};
-
-static constexpr auto amdgpu_ieee_string {
-R"(
-extern "C"
-__global__ void amdgpu_ieee(float* a, float* b, float* c) {
-  *c = sqrt(*a / *b);
-  printf("sqrt(a * b) = %f\n", *c);
-}
-)"};
-
-static constexpr auto associative_math_string {
-R"(
-extern "C"
-__global__ void associative_math(int* check) {
-  double x = 0.1f;
-  double y = 0.2f;
-  double z = 0.3f;
-  if((x*y)*z != x*(y*z))
-    *check = 1;
-  else *check = 0;
-}
-)"};
-
-#endif  // CATCH_UNIT_RTC_HEADERS_RTCKERNELS_H_
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+RtcKernels.h contains the strings that hold the kernel source code.
+They are utilized by the compiler option functions, defined in RtcFunctions.cpp
+*/
+
+#ifndef CATCH_UNIT_RTC_HEADERS_RTCKERNELS_H_
+#define CATCH_UNIT_RTC_HEADERS_RTCKERNELS_H_
+#include <hip/hiprtc.h>
+#include <hip/hip_runtime.h>
+#include <math.h>
+
+static constexpr auto max_thread_string {  // kernel stores blockDim.x in *a
+R"(
+extern "C"
+__global__ void max_thread(int* a) {
+  int BD = blockDim.x;
+  *a = BD;
+}
+)"};
+
+static constexpr auto denormals_string {  // powf() value or 0/1 zero-flag
+R"(
+extern "C"
+__global__ void denormals(double* base, double* power, double* result) {
+  float denorm = powf(*base, *power);
+  if (*result == 0 || *result ==1 )
+    *result = (denorm==0) ? 0 : 1;
+  else
+    *result = powf(*base, *power);
+}
+)"};
+
+static constexpr auto warning_string {  // kernel body is just a #warning
+R"(
+extern "C"
+__global__ void warning() {
+  #warning "Just printing a WARNING message onto the terminal";
+}
+)"};
+
+static constexpr auto fp32_div_sqrt_string {  // kernel: sqrt of a constant
+R"(
+extern "C"
+__global__ void fp32_div_sqrt(float* result) {
+  float input = 109.6209;
+  *result = sqrt(input);
+}
+)"};
+
+static constexpr auto error_string {  // kernel has 'unsigned int a = -1;'
+R"(
+extern "C"
+__global__ void error() {
+  unsigned int a = -1;
+  unsigned int b = +1;
+  signed int c = -1;
+  signed int d = +1;
+}
+)"};
+
+static constexpr auto macro_string {  // kernel reads PI; not defined here
+R"(
+extern "C"
+__global__ void macro(int *result) {
+  *result = PI;
+}
+)"};
+
+static constexpr auto undef_macro_string {  // kernel reads Z; not defined here
+R"(
+extern "C"
+__global__ void undef_macro() {
+  int a = Z;
+}
+)"};
+
+static constexpr auto header_dir_string {  // kernel: *a = fact(*val)
+R"(
+#include "RtcFact.h"
+extern "C"
+__global__ void header_dir(int* a, int* val) {
+  *a = fact(*val);
+}
+)"};
+
+static constexpr auto rdc_string {  // kernel: *c = *a * *b
+R"(
+extern "C"
+__global__ void rdc(float* a, float* b, float* c) {
+  *c = *a * *b;
+}
+)"};
+
+static constexpr auto ffp_contract_string {  // kernel: *c = *a * *b + *c
+R"(
+extern "C"
+__global__ void ffp_contract(float* a, float* b, float* c) {
+  *c = *a * *b + *c;
+}
+)"};
+
+static constexpr auto slp_vectorize_string {  // kernel: *y = x + a per half
+R"(
+extern "C"
+__global__ void slp_vectorize(__half2 a, __half2 x, __half2 *y) {
+  (*y).data.x = x.data.x + a.data.x;
+  (*y).data.y = x.data.y + a.data.y;
+}
+)"};
+
+static constexpr auto unsafe_atomic_string {  // unsafeAtomicAdd 0.2f, id < 1000
+R"(
+extern "C"
+__global__ void unsafe_atomic(float* a) {
+  int id = threadIdx.x + blockIdx.x * blockDim.x;
+  if (id < 1000) {
+    unsafeAtomicAdd(&a[id], 0.2f);
+  }
+}
+)"};
+
+static constexpr auto amdgpu_ieee_string {  // kernel: *c = sqrt(*a / *b)
+R"(
+extern "C"
+__global__ void amdgpu_ieee(float* a, float* b, float* c) {
+  *c = sqrt(*a / *b);
+  printf("sqrt(a / b) = %f\n", *c);
+}
+)"};
+
+static constexpr auto associative_math_string {  // (x*y)*z vs x*(y*z) check
+R"(
+extern "C"
+__global__ void associative_math(int* check) {
+  double x = 0.1f;
+  double y = 0.2f;
+  double z = 0.3f;
+  if((x*y)*z != x*(y*z))
+    *check = 1;
+  else *check = 0;
+}
+)"};
+
+#endif  // CATCH_UNIT_RTC_HEADERS_RTCKERNELS_H_
@@ -1,53 +1,53 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/*
-The Functions defined in RtcUtility.cpp are declared here in RtcUtility.h.
-*/
-
-#ifndef CATCH_UNIT_RTC_HEADERS_RTCUTILITY_H_
-#define CATCH_UNIT_RTC_HEADERS_RTCUTILITY_H_
-#include <picojson.h>
-#include <vector>
-#include <string>
-
-std::vector<std::string> get_combi_string_vec();
-
-int split_comb_string(std::string option);
-
-int calling_combination_function(std::vector<std::string> combi_vec_list);
-
-int check_positive_CO_present(std::string find_string);
-
-int check_negative_CO_present(std::string find_string);
-
-bool calling_resp_function(const std::string block_name,
-                           const char** Combination_CO,
-                           int Combination_CO_size, int max_thread_position,
-                           int fast_math_present);
-
-picojson::array getblock_fromconfig();
-
-std::string get_string_parameters(std::string para_name_to_retrieve,
-                                  std::string block_name);
-
-picojson::array get_array_parameters(std::string para_name_to_retrieve,
-                                     std::string block_name);
-
-#endif  // CATCH_UNIT_RTC_HEADERS_RTCUTILITY_H_
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/*
+The Functions defined in RtcUtility.cpp are declared here in RtcUtility.h.
+*/
+
+#ifndef CATCH_UNIT_RTC_HEADERS_RTCUTILITY_H_
+#define CATCH_UNIT_RTC_HEADERS_RTCUTILITY_H_
+#include <picojson.h>
+#include <vector>
+#include <string>
+
+std::vector<std::string> get_combi_string_vec();
+
+int split_comb_string(std::string option);
+
+int calling_combination_function(std::vector<std::string> combi_vec_list);
+
+int check_positive_CO_present(std::string find_string);
+
+int check_negative_CO_present(std::string find_string);
+
+bool calling_resp_function(const std::string block_name,
+                           const char** Combination_CO,
+                           int Combination_CO_size, int max_thread_position,
+                           int fast_math_present);
+
+picojson::array getblock_fromconfig();
+
+std::string get_string_parameters(std::string para_name_to_retrieve,
+                                  std::string block_name);
+
+picojson::array get_array_parameters(std::string para_name_to_retrieve,
+                                     std::string block_name);
+
+#endif  // CATCH_UNIT_RTC_HEADERS_RTCUTILITY_H_
@@ -1,25 +1,25 @@
-# Common Tests - Test independent of all platforms
-set(TEST_SRC
-    copy_coherency.cc
-)
-add_custom_target(memcpyInt.hsaco COMMAND ${CMAKE_CXX_COMPILER} --genco ${OFFLOAD_ARCH_STR}
-                  ${CMAKE_CURRENT_SOURCE_DIR}/memcpyIntDevice.cpp -o
-                  ${CMAKE_CURRENT_BINARY_DIR}/../synchronization/memcpyInt.hsaco -I
-                  ${HIP_PATH}/include -I
-                  ${CMAKE_CURRENT_SOURCE_DIR}/../../include -L
-                  ${HIP_PATH}/${CMAKE_INSTALL_LIBDIR}/../../include --rocm-path=${ROCM_PATH})
-# only for AMD
-if(HIP_PLATFORM MATCHES "amd")
-  set(AMD_SRC
-    cache_coherency_cpu_gpu.cc
-    cache_coherency_gpu_gpu.cc
-  )
-  set(TEST_SRC ${TEST_SRC} ${AMD_SRC})
-endif()
-
-hip_add_exe_to_target(NAME synchronizationTests
-                      TEST_SRC ${TEST_SRC}
-                      TEST_TARGET_NAME build_tests
-                      COMPILE_OPTIONS -std=c++14)
-add_dependencies(synchronizationTests memcpyInt.hsaco)
-
+# Common tests - sources built on every platform
+set(TEST_SRC
+    copy_coherency.cc
+)
+add_custom_target(memcpyInt.hsaco COMMAND ${CMAKE_CXX_COMPILER} --genco ${OFFLOAD_ARCH_STR}
+                  ${CMAKE_CURRENT_SOURCE_DIR}/memcpyIntDevice.cpp -o
+                  ${CMAKE_CURRENT_BINARY_DIR}/../synchronization/memcpyInt.hsaco -I
+                  ${HIP_PATH}/include -I
+                  ${CMAKE_CURRENT_SOURCE_DIR}/../../include -L
+                  ${HIP_PATH}/${CMAKE_INSTALL_LIBDIR}/../../include --rocm-path=${ROCM_PATH})
+# AMD-only sources, appended to TEST_SRC when HIP_PLATFORM matches "amd"
+if(HIP_PLATFORM MATCHES "amd")
+  set(AMD_SRC
+    cache_coherency_cpu_gpu.cc
+    cache_coherency_gpu_gpu.cc
+  )
+  set(TEST_SRC ${TEST_SRC} ${AMD_SRC})
+endif()
+
+hip_add_exe_to_target(NAME synchronizationTests
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests
+                      COMPILE_OPTIONS -std=c++14)
+add_dependencies(synchronizationTests memcpyInt.hsaco)
+
@@ -1,282 +1,282 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-// Simple test for Fine Grained CPU-GPU coherency.
-
-#include <hip_test_kernels.hh>
-#include <hip_test_common.hh>
-
-typedef _Atomic(unsigned int) atomic_uint;
-
-// Helper function to spin on address until address equals value.
-// If the address holds the value of -1, abort because the other thread failed.
-__device__ int
-gpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
-                                       unsigned int value) {
-  unsigned int compare;
-  bool check = false;
-  do {
-    compare = value;
-    check = __opencl_atomic_compare_exchange_strong(
-      reinterpret_cast<atomic_uint*>(address), /*expected=*/ &compare,
-       /*desired=*/ value, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
-      /*scope=*/ __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
-    if (compare == -1)
-      return -1;
-  } while (!check);
-  return 0;
-}
-
-// This kernel requires a single block, single thread dispatch.
-__global__ void
-gpu_kernel(int *A, int *B, int *X, int *Y, size_t N,
-           unsigned int *AA1, unsigned int *AA2,
-           unsigned int *BA1, unsigned int *BA2, unsigned int *dresult) {
-  for (size_t i = 0; i < N; i++) {
-    // Store data into A, system fence, and atomically mark flag.
-    // This guarantees this global write is visible by device 1.
-    A[i] = X[i];
-    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA1), 1,
-                      __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
-    // Wait on device 1's global write to B.
-    if (gpu_spin_loop_or_abort_on_negative_one(BA1, i+1) == -1) {
-      *dresult = -1;
-      break;
-    }
-
-    // Check device 1 properly stored Y into B.
-    bool stored_data_matches = (B[i] == Y[i]);
-    if (!stored_data_matches) {
-      // If the data does not match, alert other thread and abort.
-      printf("FAIL: at i=%zu, B[i]=%d, which does not match Y[i]=%d.\n",
-             i, B[i], Y[i]);
-      __opencl_atomic_exchange(reinterpret_cast<atomic_uint*>(AA2), -1,
-                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
-      *dresult = -1;
-    }
-    // Otherwise tell the other thread to continue.
-    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA2), 1,
-                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
-    // Wait on kernel gpu_cache1 to finish checking X is stored in A.
-    if (gpu_spin_loop_or_abort_on_negative_one(BA2, i+1) == -1) {
-      *dresult = -1;
-      break;
-    }
-  }
-  *dresult = 0;
-}
-
-__host__ int
-cpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
-                                       unsigned int value) {
-  unsigned int compare;
-  bool check = false;
-  do {
-    compare = value;
-    check = __atomic_compare_exchange_n(
-      address, /*expected=*/ &compare, /*desired=*/ value,
-      /*weak=*/ false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
-    if (compare == -1)
-      return -1;
-  } while (!check);
-  return 0;
-}
-
-// This host thread runs only on a single CPU thread.
-__host__ void
-cpu_thread(int *A, int *B, int *X, int *Y, size_t N,
-           unsigned int *AA1, unsigned int *AA2,
-           unsigned int *BA1, unsigned int *BA2, unsigned int *hresult) {
-  for (size_t i = 0; i < N; i++) {
-    B[i] = Y[i];
-    __atomic_fetch_add(BA1, 1, __ATOMIC_RELEASE);
-    if (cpu_spin_loop_or_abort_on_negative_one(AA1, i+1) == -1) {
-      *hresult = -1;
-      break;
-    }
-
-    bool stored_data_matches = (A[i] == X[i]);
-    if (!stored_data_matches) {
-      printf("FAIL: at i=%zu, A[i]=%d, which does not match X[i]=%d.\n",
-             i, A[i], X[i]);
-      __atomic_exchange_n(BA2, -1, __ATOMIC_RELEASE);
-      *hresult = -1;
-      break;
-    }
-    __atomic_fetch_add(BA2, 1, __ATOMIC_RELEASE);
-    if (cpu_spin_loop_or_abort_on_negative_one(AA2, i+1) == -1) {
-      *hresult = -1;
-      break;
-    }
-  }
-  *hresult = 0;
-}
-
-static bool cpu_to_gpu_coherency() {
-  int *A_d, *B_d, *X_d, *Y_d;
-  int *A_res, *A_h, *B_h, *X_h, *Y_h;
-  unsigned int hresult, dresult;
-  size_t N = 1024;
-  size_t Nbytes = N * sizeof(int);
-  int numDevices = 0;
-
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices < 1) {
-    HipTest::HIP_SKIP_TEST("Skipping because devices < 1");
-    return 0;
-  }
-
-  // Skip this test if feature is not supported.
-  static int device0 = 0;
-  hipDeviceProp_t props;
-  HIP_CHECK(hipGetDeviceProperties(&props, device0));
-  if (strncmp(props.gcnArchName, "gfx90a", 6) != 0 &&
-      strncmp(props.gcnArchName, "gfx940", 6) != 0) {
-    printf("info: skipping test on devices other than gfx90a and gfx940.\n");
-    return true;
-  }
-
-  // Allocate Host Side Memory. Coherent Fine-grained Memory for array B.
-  printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-  HIP_CHECK(hipHostMalloc(&B_h, Nbytes,
-                         (hipHostMallocCoherent | hipHostMallocMapped)));
-  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&B_d), B_h, 0));
-  X_h = reinterpret_cast<int*>(malloc(Nbytes));
-  HIP_CHECK(X_h == 0 ? hipErrorOutOfMemory : hipSuccess);
-  Y_h = reinterpret_cast<int*>(malloc(Nbytes));
-  HIP_CHECK(Y_h == 0 ? hipErrorOutOfMemory : hipSuccess);
-
-  // Initialize the arrays and atomic variables.
-  for (size_t i = 0; i < N; i++) {
-    X_h[i] = 100000000 + i;
-    Y_h[i] = 300000000 + i;
-  }
-
-  // Initialize shared atomic flags between CPU and GPU.
-  unsigned int *AA1_h, *AA2_h, *BA1_h, *BA2_h;
-  unsigned int *AA1_d, *AA2_d, *BA1_d, *BA2_d;
-  HIP_CHECK(hipHostMalloc(&AA1_h, sizeof(unsigned int), hipHostMallocCoherent));
-  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA1_d),
-                                    AA1_h, 0));
-  *AA1_h = 0;
-  HIP_CHECK(hipHostMalloc(&AA2_h, sizeof(unsigned int), hipHostMallocCoherent));
-  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA2_d),
-                                    AA2_h, 0));
-  *AA2_h = 0;
-  HIP_CHECK(hipHostMalloc(&BA1_h, sizeof(unsigned int), hipHostMallocCoherent));
-  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA1_d),
-                                    BA1_h, 0));
-  *BA1_h = 0;
-  HIP_CHECK(hipHostMalloc(&BA2_h, sizeof(unsigned int), hipHostMallocCoherent));
-  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA2_d),
-                                    BA2_h, 0));
-  *BA2_h = 0;
-
-  // Skip the first stream, ensure stream is non-blocking.
-  hipStream_t stream[2];
-  HIP_CHECK(hipStreamCreate(&stream[0]));
-  HIP_CHECK(hipSetDevice(0));
-  HIP_CHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
-
-  // Allocate Device Side Memory. Coherent Fine-grained Memory for array A.
-  printf("info: allocate device 0 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-  hipError_t status = hipExtMallocWithFlags(reinterpret_cast<void**>(&A_d),
-                                           Nbytes, hipDeviceMallocFinegrained);
-  REQUIRE(status == hipSuccess);
-  // SVM memory - host pointer is the same as device pointer to array A.
-  A_h = A_d;
-  HIP_CHECK(hipMalloc(&X_d, Nbytes));
-  HIP_CHECK(hipMalloc(&Y_d, Nbytes));
-
-  HIP_CHECK(hipMemcpy(X_d, X_h, Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipMemcpy(Y_d, Y_h, Nbytes, hipMemcpyHostToDevice));
-
-  // Launch the GPU kernel.
-  const unsigned blocks = 1;
-  const unsigned threadsPerBlock = 1;
-  hipLaunchKernelGGL(gpu_kernel, dim3(blocks), dim3(threadsPerBlock),
-                     0, stream[1],
-                     A_d, B_d, X_d, Y_d, N,
-                     AA1_d, AA2_d, BA1_d, BA2_d, &dresult);
-  // Check if launch failed.
-  HIP_CHECK(hipGetLastError());
-  REQUIRE(dresult == 0);
-
-  // Do not sync the launched stream, instead run the cpu_thread.
-  std::thread host_thread(cpu_thread,
-                          A_h, B_h, X_h, Y_h, N,
-                          AA1_h, AA2_h, BA1_h, BA2_h, &hresult);
-  host_thread.detach();
-  REQUIRE(hresult == 0);
-  // Wait for Device side to finish.
-  HIP_CHECK(hipStreamSynchronize(stream[1]));
-
-  // Evaluate the resultant arrays A and B.
-  A_res = reinterpret_cast<int*>(malloc(Nbytes));
-  HIP_CHECK(A_res == 0 ? hipErrorOutOfMemory : hipSuccess);
-  HIP_CHECK(hipMemcpy(A_res, A_d, Nbytes, hipMemcpyDeviceToHost));
-
-  for (size_t i = 0; i < N; i++)  {
-    REQUIRE(A_res[i] == (100000000 + i));
-    REQUIRE(B_h[i] == (300000000 + i));
-  }
-
-  // Free all the device and host memory allocated.
-  HIP_CHECK(hipFree(A_d));
-  HIP_CHECK(hipFree(X_d));
-  HIP_CHECK(hipFree(Y_d));
-  HIP_CHECK(hipHostFree(AA1_h));
-  HIP_CHECK(hipHostFree(AA2_h));
-  HIP_CHECK(hipHostFree(BA1_h));
-  HIP_CHECK(hipHostFree(BA2_h));
-  HIP_CHECK(hipHostFree(B_h));
-  free(X_h);
-  free(Y_h);
-  free(A_res);
-
-  return true;
-}
-
-/**
- * Test Description
- * ------------------------
- *    - This test runs on devices where XGMI enables fine-grained communication
- * between GPUs. This performs a message passing test.
- * Array A is allocated on Device 0, and remotely on host.
- * Device 0 also increments atomic ints AA1 and AA2.
- * Array B is allocated on host, and remotely on Device 0.
- * Host also increments atomic ints BA1 and BA2.
- * Kernel will launch on Device 0, and store array X into array A.
- * Host Thread will store array Y into array B.
- * Kernel will validate that the correct values of array Y are stored in B.
- * Host Thread will validate that the correct values of array X are stored in A.
-
- * Test source
- * ------------------------
- *    - catch/unit/synchronization/cache_coherency_cpu_gpu.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.5
- *    - Test to be run only on AMD.
- */
-
-TEST_CASE("Unit_cache_coherency_cpu_gpu") {
-  bool passed = true;
-  // Coherency between CPU and GPU sharing host and device memory.
-  REQUIRE(passed == cpu_to_gpu_coherency());
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Simple test for Fine Grained CPU-GPU coherency.
+
+#include <hip_test_kernels.hh>
+#include <hip_test_common.hh>
+
+typedef _Atomic(unsigned int) atomic_uint;
+
+// Busy-waits on the device until *address is observed to equal `value`.
+// Returns 0 once the two match, or -1 as soon as the location is observed to
+// hold -1, which the other side stores to signal that it has failed.
+__device__ int
+gpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
+                                       unsigned int value) {
+  for (;;) {
+    // Expected and desired are both `value`, so a successful exchange leaves
+    // the stored value unchanged; on a mismatch the current contents are
+    // copied into `observed`.
+    unsigned int observed = value;
+    const bool matched = __opencl_atomic_compare_exchange_strong(
+        reinterpret_cast<atomic_uint*>(address), /*expected=*/ &observed,
+        /*desired=*/ value, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+        /*scope=*/ __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    // The abort marker is tested before the match result.
+    if (observed == -1) {
+      return -1;
+    }
+    if (matched) {
+      return 0;
+    }
+  }
+}
+
+// Device half of the message-passing handshake with the host thread
+// (cpu_thread). This kernel requires a single block, single thread dispatch.
+//
+// For every i in [0, N) the kernel stores X[i] into A[i], increments AA1,
+// waits for BA1 to reach i+1, compares B[i] with Y[i], increments AA2 and
+// waits for BA2 to reach i+1. On a data mismatch it stores -1 into AA2 and
+// stops. *dresult receives 0 when every iteration completed and -1 otherwise.
+__global__ void
+gpu_kernel(int *A, int *B, int *X, int *Y, size_t N,
+           unsigned int *AA1, unsigned int *AA2,
+           unsigned int *BA1, unsigned int *BA2, unsigned int *dresult) {
+  const unsigned int kFailure = static_cast<unsigned int>(-1);
+  // Track the outcome locally and publish it exactly once on exit, so that a
+  // failure recorded inside the loop is never overwritten by the success value.
+  unsigned int status = 0;
+  for (size_t i = 0; i < N; i++) {
+    // Store data into A, then release-increment AA1 at all-SVM-devices scope
+    // so that the other side can wait for this element.
+    A[i] = X[i];
+    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA1), 1,
+                      __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    // Wait for the other side's matching increment of BA1.
+    if (gpu_spin_loop_or_abort_on_negative_one(BA1, i+1) == -1) {
+      status = kFailure;
+      break;
+    }
+
+    // Check that B[i] now holds Y[i].
+    if (B[i] != Y[i]) {
+      // If the data does not match, alert the other side and abort.
+      printf("FAIL: at i=%zu, B[i]=%d, which does not match Y[i]=%d.\n",
+             i, B[i], Y[i]);
+      __opencl_atomic_exchange(reinterpret_cast<atomic_uint*>(AA2), -1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+      status = kFailure;
+      // Leave the loop here: the increment of AA2 below would turn the -1
+      // abort marker into 0 and hide the failure from the other side.
+      break;
+    }
+    // Otherwise tell the other side to continue.
+    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA2), 1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    // Wait for the other side to finish checking that X is stored in A.
+    if (gpu_spin_loop_or_abort_on_negative_one(BA2, i+1) == -1) {
+      status = kFailure;
+      break;
+    }
+  }
+  *dresult = status;
+}
+
+// Host counterpart of the device spin helper: busy-waits until *address is
+// observed to equal `value`. Returns 0 once the two match, or -1 as soon as
+// the location is observed to hold -1 (the other side's failure marker).
+__host__ int
+cpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
+                                       unsigned int value) {
+  for (;;) {
+    // Expected and desired are both `value`, so a successful exchange leaves
+    // the stored value unchanged; on a mismatch the current contents are
+    // copied into `observed`.
+    unsigned int observed = value;
+    const bool matched = __atomic_compare_exchange_n(
+        address, /*expected=*/ &observed, /*desired=*/ value,
+        /*weak=*/ false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
+    // The abort marker is tested before the match result.
+    if (observed == -1) {
+      return -1;
+    }
+    if (matched) {
+      return 0;
+    }
+  }
+}
+
+// Host half of the message-passing handshake with gpu_kernel.
+// This host thread runs only on a single CPU thread.
+//
+// For every i in [0, N) it stores Y[i] into B[i], increments BA1, waits for
+// AA1 to reach i+1, compares A[i] with X[i], increments BA2 and waits for AA2
+// to reach i+1. On a data mismatch it stores -1 into BA2 and stops.
+// *hresult receives 0 when every iteration completed and -1 otherwise.
+__host__ void
+cpu_thread(int *A, int *B, int *X, int *Y, size_t N,
+           unsigned int *AA1, unsigned int *AA2,
+           unsigned int *BA1, unsigned int *BA2, unsigned int *hresult) {
+  const unsigned int kFailure = static_cast<unsigned int>(-1);
+  // Track the outcome locally and publish it exactly once on exit, so that a
+  // failure recorded inside the loop is never overwritten by the success value.
+  unsigned int status = 0;
+  for (size_t i = 0; i < N; i++) {
+    // Store data into B, then release-increment BA1 so that the kernel can
+    // wait for this element.
+    B[i] = Y[i];
+    __atomic_fetch_add(BA1, 1, __ATOMIC_RELEASE);
+    // Wait for the kernel's matching increment of AA1.
+    if (cpu_spin_loop_or_abort_on_negative_one(AA1, i+1) == -1) {
+      status = kFailure;
+      break;
+    }
+
+    // Check that A[i] now holds X[i].
+    if (A[i] != X[i]) {
+      // If the data does not match, alert the kernel and abort.
+      printf("FAIL: at i=%zu, A[i]=%d, which does not match X[i]=%d.\n",
+             i, A[i], X[i]);
+      __atomic_exchange_n(BA2, -1, __ATOMIC_RELEASE);
+      status = kFailure;
+      break;
+    }
+    // Otherwise tell the kernel to continue.
+    __atomic_fetch_add(BA2, 1, __ATOMIC_RELEASE);
+    // Wait for the kernel to finish checking that Y is stored in B.
+    if (cpu_spin_loop_or_abort_on_negative_one(AA2, i+1) == -1) {
+      status = kFailure;
+      break;
+    }
+  }
+  *hresult = status;
+}
+
+// Runs the CPU <-> GPU message-passing coherency check.
+// gpu_kernel is launched on device 0 while cpu_thread runs on a host thread;
+// the two exchange arrays A and B element by element through four shared
+// atomic flags. Returns true when the check passed or the test was skipped;
+// failures are reported through HIP_CHECK / REQUIRE.
+static bool cpu_to_gpu_coherency() {
+  int *A_d, *B_d, *X_d, *Y_d;
+  int *A_res, *A_h, *B_h, *X_h, *Y_h;
+  const size_t N = 1024;
+  const size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices < 1) {
+    HipTest::HIP_SKIP_TEST("Skipping because devices < 1");
+    // A skip must not fail the caller's REQUIRE, so report success here just
+    // like the unsupported-architecture skip below.
+    return true;
+  }
+
+  // Skip this test if feature is not supported.
+  const int device0 = 0;
+  hipDeviceProp_t props;
+  HIP_CHECK(hipGetDeviceProperties(&props, device0));
+  if (strncmp(props.gcnArchName, "gfx90a", 6) != 0 &&
+      strncmp(props.gcnArchName, "gfx940", 6) != 0) {
+    printf("info: skipping test on devices other than gfx90a and gfx940.\n");
+    return true;
+  }
+
+  // Allocate Host Side Memory. Coherent Fine-grained Memory for array B.
+  printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  HIP_CHECK(hipHostMalloc(&B_h, Nbytes,
+                         (hipHostMallocCoherent | hipHostMallocMapped)));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&B_d), B_h, 0));
+  X_h = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(X_h == nullptr ? hipErrorOutOfMemory : hipSuccess);
+  Y_h = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(Y_h == nullptr ? hipErrorOutOfMemory : hipSuccess);
+
+  // Initialize the reference arrays.
+  for (size_t i = 0; i < N; i++) {
+    X_h[i] = 100000000 + i;
+    Y_h[i] = 300000000 + i;
+  }
+
+  // Initialize shared atomic flags between CPU and GPU.
+  unsigned int *AA1_h, *AA2_h, *BA1_h, *BA2_h;
+  unsigned int *AA1_d, *AA2_d, *BA1_d, *BA2_d;
+  HIP_CHECK(hipHostMalloc(&AA1_h, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA1_d),
+                                    AA1_h, 0));
+  *AA1_h = 0;
+  HIP_CHECK(hipHostMalloc(&AA2_h, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA2_d),
+                                    AA2_h, 0));
+  *AA2_h = 0;
+  HIP_CHECK(hipHostMalloc(&BA1_h, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA1_d),
+                                    BA1_h, 0));
+  *BA1_h = 0;
+  HIP_CHECK(hipHostMalloc(&BA2_h, sizeof(unsigned int), hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA2_d),
+                                    BA2_h, 0));
+  *BA2_h = 0;
+
+  // Result words. The kernel's result is placed in coherent host memory and
+  // handed to the kernel through its device pointer: the address of a host
+  // stack variable is not, in general, something a kernel may write through.
+  // The host thread's result can stay on the stack because the thread is
+  // joined before this function returns.
+  unsigned int *dresult_h, *dresult_d;
+  HIP_CHECK(hipHostMalloc(&dresult_h, sizeof(unsigned int),
+                          hipHostMallocCoherent));
+  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&dresult_d),
+                                    dresult_h, 0));
+  *dresult_h = 0;
+  unsigned int hresult = 0;
+
+  // Skip the first stream, ensure stream is non-blocking.
+  hipStream_t stream[2];
+  HIP_CHECK(hipStreamCreate(&stream[0]));
+  HIP_CHECK(hipSetDevice(0));
+  HIP_CHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
+
+  // Allocate Device Side Memory. Coherent Fine-grained Memory for array A.
+  printf("info: allocate device 0 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  hipError_t status = hipExtMallocWithFlags(reinterpret_cast<void**>(&A_d),
+                                           Nbytes, hipDeviceMallocFinegrained);
+  REQUIRE(status == hipSuccess);
+  // SVM memory - host pointer is the same as device pointer to array A.
+  A_h = A_d;
+  HIP_CHECK(hipMalloc(&X_d, Nbytes));
+  HIP_CHECK(hipMalloc(&Y_d, Nbytes));
+
+  HIP_CHECK(hipMemcpy(X_d, X_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(Y_d, Y_h, Nbytes, hipMemcpyHostToDevice));
+
+  // Launch the GPU kernel.
+  const unsigned blocks = 1;
+  const unsigned threadsPerBlock = 1;
+  hipLaunchKernelGGL(gpu_kernel, dim3(blocks), dim3(threadsPerBlock),
+                     0, stream[1],
+                     A_d, B_d, X_d, Y_d, N,
+                     AA1_d, AA2_d, BA1_d, BA2_d, dresult_d);
+  // Check if launch failed. The kernel's result word is only meaningful once
+  // the stream has been synchronized, so it is checked further down.
+  HIP_CHECK(hipGetLastError());
+
+  // Do not sync the launched stream yet; run the cpu_thread alongside it.
+  std::thread host_thread(cpu_thread,
+                          A_h, B_h, X_h, Y_h, N,
+                          AA1_h, AA2_h, BA1_h, BA2_h, &hresult);
+  // Join instead of detaching: the thread holds pointers to hresult and to
+  // buffers freed below, and joining before any further check guarantees no
+  // failing assertion unwinds past a still-joinable thread.
+  host_thread.join();
+  // Wait for Device side to finish.
+  HIP_CHECK(hipStreamSynchronize(stream[1]));
+
+  // Both sides have finished, so their result words are now final.
+  REQUIRE(hresult == 0);
+  REQUIRE(*dresult_h == 0);
+
+  // Evaluate the resultant arrays A and B against the reference data.
+  A_res = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(A_res == nullptr ? hipErrorOutOfMemory : hipSuccess);
+  HIP_CHECK(hipMemcpy(A_res, A_d, Nbytes, hipMemcpyDeviceToHost));
+
+  for (size_t i = 0; i < N; i++)  {
+    REQUIRE(A_res[i] == X_h[i]);
+    REQUIRE(B_h[i] == Y_h[i]);
+  }
+
+  // Free all the device and host memory allocated, and the streams.
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(X_d));
+  HIP_CHECK(hipFree(Y_d));
+  HIP_CHECK(hipHostFree(AA1_h));
+  HIP_CHECK(hipHostFree(AA2_h));
+  HIP_CHECK(hipHostFree(BA1_h));
+  HIP_CHECK(hipHostFree(BA2_h));
+  HIP_CHECK(hipHostFree(dresult_h));
+  HIP_CHECK(hipHostFree(B_h));
+  HIP_CHECK(hipStreamDestroy(stream[1]));
+  HIP_CHECK(hipStreamDestroy(stream[0]));
+  free(X_h);
+  free(Y_h);
+  free(A_res);
+
+  return true;
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - This test runs on devices where XGMI enables fine-grained communication
+ * between the host CPU and a GPU. It performs a message passing test.
+ * Array A is allocated on Device 0, and remotely on host.
+ * Device 0 also increments atomic ints AA1 and AA2.
+ * Array B is allocated on host, and remotely on Device 0.
+ * Host also increments atomic ints BA1 and BA2.
+ * Kernel will launch on Device 0, and store array X into array A.
+ * Host Thread will store array Y into array B.
+ * Kernel will validate that the correct values of array Y are stored in B.
+ * Host Thread will validate that the correct values of array X are stored in A.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/synchronization/cache_coherency_cpu_gpu.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ *    - Test to be run only on AMD.
+ */
+
+TEST_CASE("Unit_cache_coherency_cpu_gpu") {
+  // Coherency between CPU and GPU sharing host and device memory.
+  // The helper reports its outcome as a bool; the test passes only when that
+  // outcome is true.
+  const bool coherent = cpu_to_gpu_coherency();
+  REQUIRE(coherent == true);
+}
@@ -1,294 +1,294 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-// Simple test for Fine Grained GPU-GPU coherency.
-
-#include <hip_test_kernels.hh>
-#include <hip_test_common.hh>
-
-typedef _Atomic(unsigned int) atomic_uint;
-
-// Helper function to spin on address until address equals value.
-// If the address holds the value of -1, abort because the other thread failed.
-__device__ int
-gpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
-                                       unsigned int value) {
-  unsigned int compare;
-  bool check = false;
-  do {
-    compare = value;
-    check = __opencl_atomic_compare_exchange_strong(
-      reinterpret_cast<atomic_uint*>(address), /*expected=*/ &compare,
-       /*desired=*/ value, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
-      /*scope=*/ __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
-    if (compare == -1)
-      return -1;
-  } while (!check);
-  return 0;
-}
-
-// This kernel requires a single block, single thread dispatch.
-__global__ void
-gpu_cache0(int *A, int *B, int *X, int *Y, size_t N,
-           unsigned int *AA1, unsigned int *AA2,
-           unsigned int *BA1, unsigned int *BA2, unsigned int *cache0_result) {
-  for (size_t i = 0; i < N; i++) {
-    // Store data into A, system fence, and atomically mark flag.
-    // This guarantees this global write is visible by device 1.
-    A[i] = X[i];
-    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA1), 1,
-                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
-    // Wait on device 1's global write to B.
-    if (gpu_spin_loop_or_abort_on_negative_one(BA1, i+1) == -1) {
-      *cache0_result = -1;
-      break;
-    }
-
-    // Check device 1 properly stored Y into B.
-    bool stored_data_matches = (B[i] == Y[i]);
-    if (!stored_data_matches) {
-      // If the data does not match, alert other thread and abort.
-      printf("FAIL: at i=%zu, B[i]=%d, which does not match Y[i]=%d.\n",
-             i, B[i], Y[i]);
-      __opencl_atomic_exchange(reinterpret_cast<atomic_uint*>(AA2), -1,
-                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
-      *cache0_result = -1;
-    }
-    // Otherwise tell the other thread to continue.
-    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA2), 1,
-                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
-    // Wait on kernel gpu_cache1 to finish checking X is stored in A.
-    if (gpu_spin_loop_or_abort_on_negative_one(BA2, i+1) == -1) {
-      *cache0_result = -1;
-      break;
-    }
-  }
-  *cache0_result = 0;
-}
-
-// This kernel requires a single block, single thread dispatch.
-__global__ void
-gpu_cache1(int *A, int *B, int *X, int *Y, size_t N,
-           unsigned int *AA1, unsigned int *AA2,
-           unsigned int *BA1, unsigned int *BA2, unsigned int *cache1_result) {
-  for (size_t i = 0; i < N; i++) {
-    B[i] = Y[i];
-    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(BA1), 1,
-                __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
-    if (gpu_spin_loop_or_abort_on_negative_one(AA1, i+1) == -1) {
-      *cache1_result = -1;
-      break;
-    }
-
-    bool stored_data_matches = (A[i] == X[i]);
-    if (!stored_data_matches) {
-      printf("FAIL: at i=%zu, A[i]=%d, which does not match X[i]=%d.\n",
-             i, A[i], X[i]);
-      __opencl_atomic_exchange(reinterpret_cast<atomic_uint*>(BA2), -1,
-                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
-      *cache1_result = -1;
-    }
-    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(BA2), 1,
-                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
-    if (gpu_spin_loop_or_abort_on_negative_one(AA2, i+1) == -1) {
-      *cache1_result = -1;
-      break;
-    }
-  }
-  *cache1_result = 0;
-}
-
-static bool gpu_to_gpu_coherency() {
-  int *A_d, *B_d, *X_d0, *X_d1, *Y_d0, *Y_d1;
-  int *A_h, *B_h, *X_h, *Y_h;
-  unsigned int cache0_result, cache1_result;
-  size_t N = 1024;
-  size_t Nbytes = N * sizeof(int);
-  int numDevices = 0;
-  int numTestDevices = 2;
-
-  HIP_CHECK(hipGetDeviceCount(&numDevices));
-  if (numDevices < numTestDevices) {
-    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
-    return 0;
-  }
-
-  // Skip this test if either device does not support this feature.
-  hipDeviceProp_t props0, props1;
-  HIP_CHECK(hipGetDeviceProperties(&props0, 0));
-  HIP_CHECK(hipGetDeviceProperties(&props1, 1));
-  if ((strncmp(props0.gcnArchName, "gfx90a", 6) != 0 ||
-       strncmp(props1.gcnArchName, "gfx90a", 6) != 0) &&
-      (strncmp(props0.gcnArchName, "gfx940", 6) != 0 ||
-       strncmp(props1.gcnArchName, "gfx940", 6) != 0)) {
-    printf("info: skipping test on devices other than gfx90a and gfx940.\n");
-    return true;
-  }
-
-  // Allocate Host Side Memory.
-  printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-  A_h = reinterpret_cast<int*>(malloc(Nbytes));
-  HIP_CHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess);
-  B_h = reinterpret_cast<int*>(malloc(Nbytes));
-  HIP_CHECK(B_h == 0 ? hipErrorOutOfMemory : hipSuccess);
-  X_h = reinterpret_cast<int*>(malloc(Nbytes));
-  HIP_CHECK(X_h == 0 ? hipErrorOutOfMemory : hipSuccess);
-  Y_h = reinterpret_cast<int*>(malloc(Nbytes));
-  HIP_CHECK(Y_h == 0 ? hipErrorOutOfMemory : hipSuccess);
-
-  // Initialize the arrays and atomic variables.
-  for (size_t i = 0; i < N; i++) {
-    X_h[i] = 100000000 + i;
-    Y_h[i] = 300000000 + i;
-  }
-
-  // Initialize shared atomic flags on host coherent memory.
-  unsigned int *AA1_h, *AA2_h, *BA1_h, *BA2_h;
-  unsigned int *AA1_d, *AA2_d, *BA1_d, *BA2_d;
-  HIP_CHECK(hipHostMalloc(&AA1_h, sizeof(unsigned int), hipHostMallocCoherent));
-  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA1_d),
-                                     AA1_h, 0));
-  *AA1_h = 0;
-  HIP_CHECK(hipHostMalloc(&AA2_h, sizeof(unsigned int), hipHostMallocCoherent));
-  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&AA2_d),
-                                     AA2_h, 0));
-  *AA2_h = 0;
-  HIP_CHECK(hipHostMalloc(&BA1_h, sizeof(unsigned int), hipHostMallocCoherent));
-  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA1_d),
-                                     BA1_h, 0));
-  *BA1_h = 0;
-  HIP_CHECK(hipHostMalloc(&BA2_h, sizeof(unsigned int), hipHostMallocCoherent));
-  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&BA2_d),
-                                     BA2_h, 0));
-  *BA2_h = 0;
-
-  // Skip the first stream.
-  hipStream_t stream[3];
-  HIP_CHECK(hipStreamCreate(&stream[0]));
-
-  // Set-up Device 0.
-  HIP_CHECK(hipSetDevice(0));
-  // Enable P2P access to Device 1.
-  HIP_CHECK(hipDeviceEnablePeerAccess(1, 0));
-  HIP_CHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
-  // Allocating Coherent Memory for Array A_d on Device 0.
-  printf("info: allocate device 0 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-  hipError_t status = hipExtMallocWithFlags(reinterpret_cast<void**>(&A_d),
-                                           Nbytes, hipDeviceMallocFinegrained);
-  REQUIRE(status == hipSuccess);
-  HIP_CHECK(hipMalloc(&X_d0, Nbytes));
-  HIP_CHECK(hipMalloc(&Y_d0, Nbytes));
-
-  // Set-up Device 1.
-  HIP_CHECK(hipSetDevice(1));
-  // Enable P2P access to Device 0.
-  HIP_CHECK(hipDeviceEnablePeerAccess(0, 0));
-  HIP_CHECK(hipStreamCreateWithFlags(&stream[2], hipStreamNonBlocking));
-  // Allocating Coherent Memory for Array B_d on Device 1.
-  printf("info: allocate device 1 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
-  status = hipExtMallocWithFlags(reinterpret_cast<void**>(&B_d),
-                                 Nbytes, hipDeviceMallocFinegrained);
-  REQUIRE(status == hipSuccess);
-  HIP_CHECK(hipMalloc(&X_d1, Nbytes));
-  HIP_CHECK(hipMalloc(&Y_d1, Nbytes));
-
-  // Transfer initialized data onto the device arrays.
-  HIP_CHECK(hipMemcpy(X_d0, X_h, Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipMemcpy(X_d1, X_h, Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipMemcpy(Y_d0, Y_h, Nbytes, hipMemcpyHostToDevice));
-  HIP_CHECK(hipMemcpy(Y_d1, Y_h, Nbytes, hipMemcpyHostToDevice));
-
-  // Prepare and launch the device kernels.
-  const unsigned blocks = 1;
-  const unsigned threadsPerBlock = 1;
-  HIP_CHECK(hipSetDevice(0));
-  hipLaunchKernelGGL(gpu_cache0, dim3(blocks), dim3(threadsPerBlock),
-                     0, stream[1],
-                     A_d, B_d, X_d0, Y_d0, N,
-                     AA1_d, AA2_d, BA1_d, BA2_d, &cache0_result);
-  // Check if launch failed.
-  HIP_CHECK(hipGetLastError());
-  REQUIRE(cache0_result == 0);
-  HIP_CHECK(hipSetDevice(1));
-  hipLaunchKernelGGL(gpu_cache1, dim3(blocks), dim3(threadsPerBlock),
-                     0, stream[2],
-                     A_d, B_d, X_d1, Y_d1, N,
-                     AA1_d, AA2_d, BA1_d, BA2_d, &cache1_result);
-  HIP_CHECK(hipGetLastError());
-  REQUIRE(cache1_result == 0);
-
-  // Wait for kernels on both devices.
-  HIP_CHECK(hipStreamSynchronize(stream[1]));
-  HIP_CHECK(hipStreamSynchronize(stream[2]));
-
-  // Evaluate the resultant arrays A and B.
-  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
-  HIP_CHECK(hipMemcpy(B_h, B_d, Nbytes, hipMemcpyDeviceToHost));
-
-  for (size_t i = 0; i < N; i++)  {
-    REQUIRE(A_h[i] == (100000000 + i));
-    REQUIRE(B_h[i] == (300000000 + i));
-  }
-
-  // Free all the device and host memory allocated.
-  HIP_CHECK(hipFree(A_d));
-  HIP_CHECK(hipFree(B_d));
-  HIP_CHECK(hipFree(X_d0));
-  HIP_CHECK(hipFree(Y_d0));
-  HIP_CHECK(hipFree(X_d1));
-  HIP_CHECK(hipFree(Y_d1));
-  HIP_CHECK(hipHostFree(AA1_h));
-  HIP_CHECK(hipHostFree(AA2_h));
-  HIP_CHECK(hipHostFree(BA1_h));
-  HIP_CHECK(hipHostFree(BA2_h));
-  free(A_h);
-  free(B_h);
-  free(X_h);
-  free(Y_h);
-
-  return true;
-}
-
-/**
- * Test Description
- * ------------------------
- *    - This test runs on devices where XGMI enables fine-grained communication
- * between GPUs. This performs a message passing test.
- * Array A is allocated on Device 0, and remotely on Device 1.
- * Device 0 also increments atomic ints AA1 and AA2.
- * Array B is allocated on Device 1, and remotely on Device 0.
- * Device 1 also increments atomic ints BA1 and BA2.
- * Kernel 0 will launch on Device 0, and store array X into array A.
- * Kernel 1 will launch on Device 1, and store array Y into array B.
- * Kernel 0 will validate that the correct values of array Y are stored in B.
- * Kernel 1 will validate that the correct values of array X are stored in A.
-
- * Test source
- * ------------------------
- *    - catch/unit/synchronization/cache_coherency_gpu_gpu.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.5
- *    - Test to be run only on AMD.
- */
-
-TEST_CASE("Unit_cache_coherency_gpu_gpu") {
-  bool passed = true;
-  // Coherency between GPUs accessing local or remote FB.
-  REQUIRE(passed == gpu_to_gpu_coherency());
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Simple test for Fine Grained GPU-GPU coherency.
+
+#include <hip_test_kernels.hh>
+#include <hip_test_common.hh>
+
+using atomic_uint = _Atomic(unsigned int);  // cast target for atomic builtins
+
+// Spins until *address equals value, polling with an acquire compare-exchange
+// of value onto itself. Returns 0 on a match, or -1 once the flag reads -1.
+__device__ int
+gpu_spin_loop_or_abort_on_negative_one(unsigned int* address,
+                                       unsigned int value) {
+  auto* flag = reinterpret_cast<atomic_uint*>(address);
+  while (true) {
+    unsigned int observed = value;
+    const bool matched = __opencl_atomic_compare_exchange_strong(
+        flag, /*expected=*/ &observed, /*desired=*/ value,
+        __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+        /*scope=*/ __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    if (observed == -1)
+      return -1;
+    if (matched)
+      return 0;
+  }
+}
+
+// Device-0 side of the message-passing test; requires a single block, single
+// thread dispatch. Leaves 0 in *cache0_result on success, -1 on failure.
+__global__ void
+gpu_cache0(int *A, int *B, int *X, int *Y, size_t N,
+           unsigned int *AA1, unsigned int *AA2,
+           unsigned int *BA1, unsigned int *BA2, unsigned int *cache0_result) {
+  *cache0_result = 0;
+  for (size_t i = 0; i < N; i++) {
+    // Store data into A, then atomically mark the flag (release, all devices).
+    A[i] = X[i];
+    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA1), 1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    // Wait on device 1's global write to B.
+    if (gpu_spin_loop_or_abort_on_negative_one(BA1, i+1) == -1) {
+      *cache0_result = -1;
+      break;
+    }
+    // Check device 1 properly stored Y into B.
+    if (B[i] != Y[i]) {
+      // Alert the other thread and abort. AA2 must stay at -1: incrementing
+      // it afterwards would wrap it to 0 and the peer would spin forever.
+      printf("FAIL: at i=%zu, B[i]=%d, which does not match Y[i]=%d.\n",
+             i, B[i], Y[i]);
+      __opencl_atomic_exchange(reinterpret_cast<atomic_uint*>(AA2), -1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+      *cache0_result = -1;
+      break;
+    }
+    // Otherwise tell the other thread to continue.
+    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(AA2), 1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    // Wait on kernel gpu_cache1 to finish checking X is stored in A.
+    if (gpu_spin_loop_or_abort_on_negative_one(BA2, i+1) == -1) {
+      *cache0_result = -1;
+      break;
+    }
+  }
+}
+
+// Device-1 side of the message-passing test; requires a single block, single
+// thread dispatch. Leaves 0 in *cache1_result on success, -1 on failure.
+__global__ void
+gpu_cache1(int *A, int *B, int *X, int *Y, size_t N,
+           unsigned int *AA1, unsigned int *AA2,
+           unsigned int *BA1, unsigned int *BA2, unsigned int *cache1_result) {
+  *cache1_result = 0;
+  for (size_t i = 0; i < N; i++) {
+    B[i] = Y[i];
+    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(BA1), 1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    if (gpu_spin_loop_or_abort_on_negative_one(AA1, i+1) == -1) {
+      *cache1_result = -1;
+      break;
+    }
+    if (A[i] != X[i]) {
+      printf("FAIL: at i=%zu, A[i]=%d, which does not match X[i]=%d.\n",
+             i, A[i], X[i]);
+      __opencl_atomic_exchange(reinterpret_cast<atomic_uint*>(BA2), -1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+      *cache1_result = -1;
+      break;  // BA2 must stay -1; the increment below would wrap it to 0
+    }
+    __opencl_atomic_fetch_add(reinterpret_cast<atomic_uint*>(BA2), 1,
+                    __ATOMIC_RELEASE, __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES);
+    if (gpu_spin_loop_or_abort_on_negative_one(AA2, i+1) == -1) {
+      *cache1_result = -1;
+      break;
+    }
+  }
+}
+
+// Runs the message-passing coherency test between devices 0 and 1. Returns
+// true when it passed or was skipped; failures raise via HIP_CHECK/REQUIRE.
+static bool gpu_to_gpu_coherency() {
+  int *A_d, *B_d, *X_d0, *X_d1, *Y_d0, *Y_d1;
+  int *A_h, *B_h, *X_h, *Y_h;
+  size_t N = 1024;
+  size_t Nbytes = N * sizeof(int);
+  int numDevices = 0;
+  int numTestDevices = 2;
+
+  HIP_CHECK(hipGetDeviceCount(&numDevices));
+  if (numDevices < numTestDevices) {
+    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
+    return true;  // a skip must not fail the caller's REQUIRE
+  }
+
+  // Skip this test if either device does not support this feature.
+  hipDeviceProp_t props0, props1;
+  HIP_CHECK(hipGetDeviceProperties(&props0, 0));
+  HIP_CHECK(hipGetDeviceProperties(&props1, 1));
+  if ((strncmp(props0.gcnArchName, "gfx90a", 6) != 0 ||
+       strncmp(props1.gcnArchName, "gfx90a", 6) != 0) &&
+      (strncmp(props0.gcnArchName, "gfx940", 6) != 0 ||
+       strncmp(props1.gcnArchName, "gfx940", 6) != 0)) {
+    printf("info: skipping test on devices other than gfx90a and gfx940.\n");
+    return true;
+  }
+
+  // Allocate Host Side Memory.
+  printf("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  A_h = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(A_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  B_h = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(B_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  X_h = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(X_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+  Y_h = reinterpret_cast<int*>(malloc(Nbytes));
+  HIP_CHECK(Y_h == 0 ? hipErrorOutOfMemory : hipSuccess);
+
+  // Initialize the arrays and atomic variables.
+  for (size_t i = 0; i < N; i++) {
+    X_h[i] = 100000000 + i;
+    Y_h[i] = 300000000 + i;
+  }
+
+  // Atomic flags and kernel result words live in coherent host memory.
+  unsigned int *AA1_h, *AA2_h, *BA1_h, *BA2_h, *res0_h, *res1_h;
+  unsigned int *AA1_d, *AA2_d, *BA1_d, *BA2_d, *res0_d, *res1_d;
+  auto alloc_word = [](unsigned int** host, unsigned int** dev) {
+    HIP_CHECK(hipHostMalloc(host, sizeof(unsigned int), hipHostMallocCoherent));
+    HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(dev), *host, 0));
+    **host = 0;
+  };
+  alloc_word(&AA1_h, &AA1_d);
+  alloc_word(&AA2_h, &AA2_d);
+  alloc_word(&BA1_h, &BA1_d);
+  alloc_word(&BA2_h, &BA2_d);
+  alloc_word(&res0_h, &res0_d);
+  alloc_word(&res1_h, &res1_d);
+
+  // Skip the first stream.
+  hipStream_t stream[3];
+  HIP_CHECK(hipStreamCreate(&stream[0]));
+
+  // Set-up Device 0.
+  HIP_CHECK(hipSetDevice(0));
+  // Enable P2P access to Device 1.
+  HIP_CHECK(hipDeviceEnablePeerAccess(1, 0));
+  HIP_CHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
+  // Allocating Coherent Memory for Array A_d on Device 0.
+  printf("info: allocate device 0 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  HIP_CHECK(hipExtMallocWithFlags(reinterpret_cast<void**>(&A_d), Nbytes,
+                                  hipDeviceMallocFinegrained));
+  HIP_CHECK(hipMalloc(&X_d0, Nbytes));
+  HIP_CHECK(hipMalloc(&Y_d0, Nbytes));
+
+  // Set-up Device 1.
+  HIP_CHECK(hipSetDevice(1));
+  // Enable P2P access to Device 0.
+  HIP_CHECK(hipDeviceEnablePeerAccess(0, 0));
+  HIP_CHECK(hipStreamCreateWithFlags(&stream[2], hipStreamNonBlocking));
+  // Allocating Coherent Memory for Array B_d on Device 1.
+  printf("info: allocate device 1 mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0);
+  HIP_CHECK(hipExtMallocWithFlags(reinterpret_cast<void**>(&B_d), Nbytes,
+                                  hipDeviceMallocFinegrained));
+  HIP_CHECK(hipMalloc(&X_d1, Nbytes));
+  HIP_CHECK(hipMalloc(&Y_d1, Nbytes));
+
+  // Transfer initialized data onto the device arrays.
+  HIP_CHECK(hipMemcpy(X_d0, X_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(X_d1, X_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(Y_d0, Y_h, Nbytes, hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(Y_d1, Y_h, Nbytes, hipMemcpyHostToDevice));
+
+  // Prepare and launch the device kernels.
+  const unsigned blocks = 1;
+  const unsigned threadsPerBlock = 1;
+  HIP_CHECK(hipSetDevice(0));
+  hipLaunchKernelGGL(gpu_cache0, dim3(blocks), dim3(threadsPerBlock),
+                     0, stream[1],
+                     A_d, B_d, X_d0, Y_d0, N,
+                     AA1_d, AA2_d, BA1_d, BA2_d, res0_d);
+  // Check if launch failed.
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipSetDevice(1));
+  hipLaunchKernelGGL(gpu_cache1, dim3(blocks), dim3(threadsPerBlock),
+                     0, stream[2],
+                     A_d, B_d, X_d1, Y_d1, N,
+                     AA1_d, AA2_d, BA1_d, BA2_d, res1_d);
+  HIP_CHECK(hipGetLastError());
+
+  // Wait for kernels on both devices.
+  HIP_CHECK(hipStreamSynchronize(stream[1]));
+  HIP_CHECK(hipStreamSynchronize(stream[2]));
+  // The kernels' result words are only valid once both have finished.
+  REQUIRE(*res0_h == 0);
+  REQUIRE(*res1_h == 0);
+
+  // Evaluate the resultant arrays A and B.
+  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
+  HIP_CHECK(hipMemcpy(B_h, B_d, Nbytes, hipMemcpyDeviceToHost));
+
+  for (size_t i = 0; i < N; i++)  {
+    REQUIRE(A_h[i] == (100000000 + i));
+    REQUIRE(B_h[i] == (300000000 + i));
+  }
+
+  // Free all the device and host memory allocated.
+  HIP_CHECK(hipFree(A_d));
+  HIP_CHECK(hipFree(B_d));
+  HIP_CHECK(hipFree(X_d0));
+  HIP_CHECK(hipFree(Y_d0));
+  HIP_CHECK(hipFree(X_d1));
+  HIP_CHECK(hipFree(Y_d1));
+  HIP_CHECK(hipHostFree(AA1_h));
+  HIP_CHECK(hipHostFree(AA2_h));
+  HIP_CHECK(hipHostFree(BA1_h));
+  HIP_CHECK(hipHostFree(BA2_h));
+  HIP_CHECK(hipHostFree(res0_h));
+  HIP_CHECK(hipHostFree(res1_h));
+  HIP_CHECK(hipStreamDestroy(stream[0]));
+  HIP_CHECK(hipStreamDestroy(stream[1]));
+  HIP_CHECK(hipStreamDestroy(stream[2]));
+  free(A_h);
+  free(B_h);
+  free(X_h);
+  free(Y_h);
+
+  return true;
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - This test runs on devices where XGMI enables fine-grained communication
+ * between GPUs. This performs a message passing test.
+ * Array A is allocated on Device 0 and accessed remotely by Device 1.
+ * Device 0 also increments atomic ints AA1 and AA2.
+ * Array B is allocated on Device 1 and accessed remotely by Device 0.
+ * Device 1 also increments atomic ints BA1 and BA2.
+ * Kernel 0 will launch on Device 0, and store array X into array A.
+ * Kernel 1 will launch on Device 1, and store array Y into array B.
+ * Kernel 0 will validate that the correct values of array Y are stored in B.
+ * Kernel 1 will validate that the correct values of array X are stored in A.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/synchronization/cache_coherency_gpu_gpu.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ *    - Test to be run only on AMD.
+ */
+
+TEST_CASE("Unit_cache_coherency_gpu_gpu") {
+  // Coherency between GPUs accessing local or remote FB (true == pass).
+  const bool coherent = gpu_to_gpu_coherency();
+  REQUIRE(coherent);
+}
@@ -1,340 +1,340 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_common.hh>
-
-unsigned threadsPerBlock = 256;
-unsigned blocksPerCU = 6;
-
-class MemcpyFunction {
- public:
-    MemcpyFunction(const char* fileName, const char* functionName) {
-      load(fileName, functionName);
-    }
-    void load(const char* fileName, const char* functionName);
-    void launch(int* dst, const int* src, size_t numElements, hipStream_t s);
-
- private:
-    hipFunction_t _function;
-    hipModule_t _module;
-};
-
-
-void MemcpyFunction::load(const char* fileName, const char* functionName) {
-    HIP_CHECK(hipModuleLoad(&_module, fileName));
-    HIP_CHECK(hipModuleGetFunction(&_function, _module, functionName));
-}
-
-void MemcpyFunction::launch(int* dst, const int* src, size_t numElements, hipStream_t s) { // NOLINT
-  struct {
-    int* _dst;
-    const int* _src;
-    size_t _numElements;
-  } args;
-
-  args._dst = dst;
-  args._src = src;
-  args._numElements = numElements;
-
-  size_t size = sizeof(args);
-  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
-                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock,
-                    numElements);
-  HIP_CHECK(hipModuleLaunchKernel(_function, blocks, 1, 1, threadsPerBlock,
-            1, 1, 0, s, NULL,
-            reinterpret_cast<void**>(&config)));
-}
-
-bool g_warnOnFail = true;
-int g_elementSizes[] = {128 * 1000, 256 * 1000, 16 * 1000 * 1000};
-
-// Set value of array to specified 32-bit integer:
-__global__ void memsetIntKernel(int* ptr, const int val, size_t numElements) {
-  int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int stride = blockDim.x * gridDim.x;
-  for (size_t i = gid; i < numElements; i += stride) {
-    ptr[i] = val;
-  }
-}
-
-__global__ void memcpyIntKernel(int* dst, const int* src, size_t numElements) {
-  int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-  int stride = blockDim.x * gridDim.x;
-  for (size_t i = gid; i < numElements; i += stride) {
-      dst[i] = src[i];
-  }
-}
-
-// Check arrays in reverse order, to more easily detect cases where
-// the copy is "partially" done.
-void checkReverse(const int* ptr, int numElements, int expected) {
-  int mismatchCnt = 0;
-  for (int i = numElements - 1; i >= 0; i--) {
-    if (!g_warnOnFail) {
-      REQUIRE(ptr[i] == expected);
-    }
-    if (++mismatchCnt >= 10) {
-        break;
-    }
-  }
-}
-
-#define ENUM_CASE_STR(x)                                                      \
-    case x:                                                                   \
-        return #x
-
-enum CmdType { COPY, KERNEL, MODULE_KERNEL, MAX_CmdType };
-
-const char* CmdTypeStr(CmdType c) {
-    switch (c) {
-        ENUM_CASE_STR(COPY);
-        ENUM_CASE_STR(KERNEL);
-        ENUM_CASE_STR(MODULE_KERNEL);
-        default:
-            return "UNKNOWN";
-    }
-}
-
-enum SyncType {
-  NONE,
-  EVENT_QUERY,
-  EVENT_SYNC,
-  STREAM_WAIT_EVENT,
-  STREAM_QUERY,
-  STREAM_SYNC,
-  DEVICE_SYNC,
-  MAX_SyncType
-};
-
-const char* SyncTypeStr(SyncType s) {
-  switch (s) {
-    ENUM_CASE_STR(NONE);
-    ENUM_CASE_STR(EVENT_QUERY);
-    ENUM_CASE_STR(EVENT_SYNC);
-    ENUM_CASE_STR(STREAM_WAIT_EVENT);
-    ENUM_CASE_STR(STREAM_QUERY);
-    ENUM_CASE_STR(STREAM_SYNC);
-    ENUM_CASE_STR(DEVICE_SYNC);
-    default:
-      return "UNKNOWN";
-  }
-}
-
-void runCmd(CmdType cmd, int* dst, const int* src, hipStream_t s,
-             size_t numElements) {
-  switch (cmd) {
-    case COPY:
-      HIP_CHECK(
-        hipMemcpyAsync(dst, src, numElements * sizeof(int),
-                        hipMemcpyDeviceToDevice, s));
-      break;
-    case KERNEL: {
-      unsigned blocks = HipTest::setNumBlocks(blocksPerCU,
-                                 threadsPerBlock, numElements);
-      hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock),
-                          0, s, dst, src, numElements);
-    } break;
-    case MODULE_KERNEL: {
-      MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel");
-      g_moduleMemcpy.launch(dst, src, numElements, s);
-    } break;
-    default:
-      printf("Info:unknown cmd=%d type", cmd);
-  }
-}
-
-void resetInputs(int* Ad, int* Bd, int* Ch,
-                 size_t numElements, int expected) {
-  unsigned blocks = HipTest::setNumBlocks(blocksPerCU,
-                                          threadsPerBlock, numElements);
-  hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock),
-                      0, hipStream_t(0), Ad, expected, numElements);
-  // poison with bad value to ensure is overwritten correctly
-  hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock),
-                      0, hipStream_t(0), Bd, 0xDEADBEEF, numElements);
-  hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock),
-                      0, hipStream_t(0), Bd, 0xF000BA55, numElements);
-  memset(Ch, 13, numElements * sizeof(int));
-  HIP_CHECK(hipDeviceSynchronize());
-}
-
-// Intended to test proper synchronization and cache flushing
-// between CMDA and CMDB. CMD are of type CmdType. All command copy memory,
-// using either hipMemcpyAsync or kernel implementations.
-// Some form of synchronization is applied. Then cmdB copies from Bd to Cd.
-// CmdA copies from Ad to Bd, Cd is then copied to host Ch using a memory copy.
-// Correct result at the end is that Ch contains the
-// contents originally in Ad (integer 0x42)
-
-void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType,
-                 hipStream_t stream1, hipStream_t stream2, int numElements,
-                 int* Ad, int* Bd, int* Cd, int* Ch, int expected) {
-  hipEvent_t e;
-  HIP_CHECK(hipEventCreateWithFlags(&e, 0));
-
-  resetInputs(Ad, Bd, Ch, numElements, expected);
-
-  const size_t sizeElements = numElements * sizeof(int);
-  fprintf(stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n", // NOLINT
-          sizeElements, static_cast<double>(sizeElements / 1024.0),
-          CmdTypeStr(cmdAType), SyncTypeStr(syncType), CmdTypeStr(cmdBType));
-
-  /*if (SKIP_MODULE_KERNEL && ((cmdAType == MODULE_KERNEL) || (cmdBType == MODULE_KERNEL))) { // NOLINT
-    fprintf(stderr, "warn: skipping since test infra does not yet support modules\n"); // NOLINT
-    return;
-  }*/
-
-  // Step A:
-  runCmd(cmdAType, Bd, Ad, stream1, numElements);
-
-  // Sync in-between?
-  switch (syncType) {
-    case NONE:
-      break;
-    case EVENT_QUERY: {
-      hipError_t st = hipErrorNotReady;
-      HIP_CHECK(hipEventRecord(e, stream1));
-      do {
-          st = hipEventQuery(e);
-      } while (st == hipErrorNotReady);
-      HIP_CHECK(st);
-    } break;
-    case EVENT_SYNC:
-      HIP_CHECK(hipEventRecord(e, stream1));
-      HIP_CHECK(hipEventSynchronize(e));
-      break;
-    case STREAM_WAIT_EVENT:
-      HIP_CHECK(hipEventRecord(e, stream1));
-      HIP_CHECK(hipStreamWaitEvent(stream2, e, 0));
-      break;
-    case STREAM_QUERY: {
-      hipError_t st = hipErrorNotReady;
-      do {
-          st = hipStreamQuery(stream1);
-      } while (st == hipErrorNotReady);
-      HIP_CHECK(st);
-    } break;
-    case STREAM_SYNC:
-      HIP_CHECK(hipStreamSynchronize(stream1));
-      break;
-    case DEVICE_SYNC:
-      HIP_CHECK(hipDeviceSynchronize());
-      break;
-    default:
-      fprintf(stderr, "warning: unknown sync type=%s", SyncTypeStr(syncType));
-      return;
-  }
-  runCmd(cmdBType, Cd, Bd, stream2, numElements);
-
-  // Copy back to host, use async copy to avoid any extra synchronization
-  //  that might mask issues.
-  HIP_CHECK(hipMemcpyAsync(Ch, Cd, sizeElements, hipMemcpyDeviceToHost,
-                            stream2));
-  HIP_CHECK(hipStreamSynchronize(stream2));
-
-  checkReverse(Ch, numElements, expected);
-
-  HIP_CHECK(hipEventDestroy(e));
-}
-
-void testWrapper(size_t numElements) {
-  const size_t sizeElements = numElements * sizeof(int);
-  const int expected = 0x42;
-  int *Ad, *Bd, *Cd, *Ch;
-
-  HIP_CHECK(hipMalloc(&Ad, sizeElements));
-  HIP_CHECK(hipMalloc(&Bd, sizeElements));
-  HIP_CHECK(hipMalloc(&Cd, sizeElements));
-  HIP_CHECK(hipHostMalloc(&Ch, sizeElements));
-
-  hipStream_t stream1, stream2;
-
-  HIP_CHECK(hipStreamCreate(&stream1));
-  HIP_CHECK(hipStreamCreate(&stream2));
-  HIP_CHECK(hipDeviceSynchronize());
-
-  runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements,
-              Ad, Bd, Cd, Ch, expected);
-
-  for (int cmdA = 0; cmdA < MAX_CmdType; cmdA++) {
-    for (int cmdB = 0; cmdB < MAX_CmdType; cmdB++) {
-      for (int syncMode = 0; syncMode < MAX_SyncType; syncMode++) {
-        switch (syncMode) {
-          // case NONE::
-          case EVENT_QUERY:
-          case EVENT_SYNC:
-          case STREAM_WAIT_EVENT:
-          // case STREAM_QUERY:
-          case STREAM_SYNC:
-          case DEVICE_SYNC:
-            runTestImpl(CmdType(cmdA), SyncType(syncMode), CmdType(cmdB),
-                      stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
-            break;
-          default:
-            break;
-        }
-      }
-    }
-  }
-
-#if 0
-  runTestImpl(COPY, STREAM_SYNC, MODULE_KERNEL, stream1, stream2,
-              numElements, Ad, Bd, Cd, Ch, expected);
-  runTestImpl(COPY, STREAM_SYNC, KERNEL, stream1, stream2, numElements,
-              Ad, Bd, Cd, Ch, expected);
-  runTestImpl(COPY, STREAM_WAIT_EVENT, MODULE_KERNEL, stream1, stream2,
-               numElements, Ad, Bd, Cd, Ch, expected);
-  runTestImpl(COPY, STREAM_WAIT_EVENT, KERNEL, stream1, stream2, numElements,
-              Ad, Bd, Cd, Ch, expected);
-#endif
-
-  HIP_CHECK(hipFree(Ad));
-  HIP_CHECK(hipFree(Bd));
-  HIP_CHECK(hipFree(Cd));
-  HIP_CHECK(hipHostFree(Ch));
-
-  HIP_CHECK(hipStreamDestroy(stream1));
-  HIP_CHECK(hipStreamDestroy(stream2));
-}
-
-/**
- * Test Description
- * ------------------------
- *    - Test cache management (fences) and synchronization between
- * kernel and copy commands. Exhaustively tests 3 command types
- * (copy, kernel, module kernel), many sync types (see SyncType), followed by
- *  another command, across a sweep of data sizes designed to stress
- * various levels of the memory hierarchy.
-
- * Test source
- * ------------------------
- *    - catch/unit/synchronization/copy_coherency.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.5
- */
-
-TEST_CASE("Unit_Copy_Coherency") {
-  for (int index = 0; index < sizeof(g_elementSizes) / sizeof(int); index++) {
-    size_t numElements = g_elementSizes[index];
-    testWrapper(numElements);
-  }
-}
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_common.hh>
+
+unsigned int threadsPerBlock = 256U;  // block size for this file's launches
+unsigned int blocksPerCU = 6U;        // first argument to HipTest::setNumBlocks
+
+// Holds a kernel that load() looks up by name in a module loaded from a file.
+class MemcpyFunction {
+    hipFunction_t _function;  // kernel handle written by load()
+    hipModule_t _module;      // module handle written by load()
+
+ public:
+    MemcpyFunction(const char* fileName, const char* functionName) {
+      load(fileName, functionName);
+    }
+    void load(const char* fileName, const char* functionName);
+    void launch(int* dst, const int* src, size_t numElements, hipStream_t s);
+};
+
+// Loads the module from fileName, then looks up functionName inside it.
+void MemcpyFunction::load(const char* fileName, const char* functionName) {
+  HIP_CHECK(hipModuleLoad(&_module, fileName));
+  HIP_CHECK(hipModuleGetFunction(&_function, _module, functionName));
+}
+
+// Launches the loaded kernel on stream s with (dst, src, numElements) packed
+// into an argument buffer that is handed over through the `extra` parameter.
+void MemcpyFunction::launch(int* dst, const int* src, size_t numElements, hipStream_t s) { // NOLINT
+  // Arguments packed in the order dst, src, numElements.
+  struct {
+    int* _dst;
+    const int* _src;
+    size_t _numElements;
+  } args = {dst, src, numElements};
+  size_t size = sizeof(args);
+
+  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
+                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};
+  const unsigned blocks =
+      HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
+  HIP_CHECK(hipModuleLaunchKernel(_function, /*grid=*/blocks, 1, 1,
+                                  /*block=*/threadsPerBlock, 1, 1,
+                                  /*sharedMemBytes=*/0, /*stream=*/s,
+                                  /*kernelParams=*/nullptr, /*extra=*/config));
+}
+
+bool g_warnOnFail = true;  // when false, checkReverse REQUIREs the values
+int g_elementSizes[] = {128000, 256000, 16000000};  // element counts to test
+
+// Sets ptr[0..numElements) to val; threads stride by the total thread count.
+__global__ void memsetIntKernel(int* ptr, const int val, size_t numElements) {
+  const int first = blockIdx.x * blockDim.x + threadIdx.x;
+  const int step = blockDim.x * gridDim.x;
+  for (size_t idx = first; idx < numElements; idx += step) {
+    ptr[idx] = val;
+  }
+}
+
+// Copies src[0..numElements) into dst; each thread handles every step-th
+// element starting from its own global index.
+__global__ void memcpyIntKernel(int* dst, const int* src, size_t numElements) {
+  const int step = blockDim.x * gridDim.x;
+  const int first = blockIdx.x * blockDim.x + threadIdx.x;
+  for (size_t idx = first; idx < numElements; idx += step) dst[idx] = src[idx];
+}
+
+// Scans back-to-front to catch a partially completed copy. Each mismatch is
+// printed (and REQUIREd unless g_warnOnFail); the scan stops after 10 of them.
+void checkReverse(const int* ptr, int numElements, int expected) {
+  int mismatchCnt = 0;
+  for (int i = numElements - 1; i >= 0; i--) {
+    if (ptr[i] == expected) continue;  // only mismatches are counted below
+    fprintf(stderr, "warn: ptr[%d]=%d, expected %d\n", i, ptr[i], expected);
+    if (!g_warnOnFail) {
+      REQUIRE(ptr[i] == expected);
+    }
+    if (++mismatchCnt >= 10) break;
+  }
+}
+
+// Emits `case x: return "x"` for use inside a switch over an enum.
+#define ENUM_CASE_STR(x) \
+  case x: return #x
+
+// Copy mechanisms a test step can use; MAX_CmdType is the enumerator count.
+enum CmdType { COPY, KERNEL, MODULE_KERNEL, MAX_CmdType };
+
+// Maps c to its enumerator name as a string literal; MAX_CmdType and any
+// other value yield "UNKNOWN".
+const char* CmdTypeStr(CmdType c) {
+  // Indexed by enumerator value, so the order must match enum CmdType.
+  static const char* const kNames[] = {"COPY", "KERNEL", "MODULE_KERNEL"};
+  const int idx = c;
+  return (idx >= 0 && idx < MAX_CmdType) ? kNames[idx] : "UNKNOWN";
+}
+
+// Synchronization applied between the two commands issued by runTestImpl.
+// MAX_SyncType is the enumerator count, used as a loop bound.
+enum SyncType {
+  NONE,
+  EVENT_QUERY, EVENT_SYNC,
+  STREAM_WAIT_EVENT,
+  STREAM_QUERY, STREAM_SYNC,
+  DEVICE_SYNC,
+  MAX_SyncType
+};
+
+// Maps s to its enumerator name; MAX_SyncType or any other value -> "UNKNOWN".
+const char* SyncTypeStr(SyncType s) {
+  static const char* const kNames[] = {
+    "NONE",
+    "EVENT_QUERY",
+    "EVENT_SYNC",
+    "STREAM_WAIT_EVENT",
+    "STREAM_QUERY",
+    "STREAM_SYNC",
+    "DEVICE_SYNC"};  // indexed by value: keep in enum SyncType order
+  const int idx = s;
+  return (idx >= 0 && idx < MAX_SyncType) ? kNames[idx] : "UNKNOWN";
+}
+
+// Enqueues one numElements-int copy from src to dst on stream s using cmd.
+void runCmd(CmdType cmd, int* dst, const int* src, hipStream_t s,
+            size_t numElements) {
+  switch (cmd) {
+    case COPY:
+      HIP_CHECK(hipMemcpyAsync(dst, src, numElements * sizeof(int),
+                               hipMemcpyDeviceToDevice, s));
+      break;
+    case KERNEL: {
+      const unsigned blocks =
+          HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
+      hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock),
+                         0, s, dst, src, numElements);
+    } break;
+    case MODULE_KERNEL: {
+      // Load once: MemcpyFunction never unloads, so a per-call load leaks.
+      static MemcpyFunction moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel");
+      moduleMemcpy.launch(dst, src, numElements, s);
+    } break;
+    default: printf("Info:unknown cmd=%d type", cmd);
+  }
+}
+
+// Sets Ad to expected, poisons Bd and host buffer Ch, then syncs the device.
+void resetInputs(int* Ad, int* Bd, int* Ch, size_t numElements, int expected) {
+  const dim3 grid(
+      HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements));
+  auto fill = [&](int* buf, int value) {
+    hipLaunchKernelGGL(memsetIntKernel, grid, dim3(threadsPerBlock), 0,
+                       hipStream_t(0), buf, value, numElements);
+  };
+  fill(Ad, expected);
+  fill(Bd, 0xDEADBEEF);  // poison; NOTE(review): overwritten by the next fill
+  fill(Bd, 0xF000BA55);  // poison with bad value to ensure it is overwritten
+  memset(Ch, 13, numElements * sizeof(int));
+  HIP_CHECK(hipDeviceSynchronize());
+}
+
+// Intended to test proper synchronization and cache flushing
+// between cmdA and cmdB, both of type CmdType. Every command copies memory,
+// using either hipMemcpyAsync or a kernel implementation.
+// cmdA copies from Ad to Bd, then some form of synchronization is applied,
+// then cmdB copies from Bd to Cd. Cd is finally copied to host Ch using a
+// memory copy. The correct result at the end is that Ch contains the
+// contents originally in Ad (integer 0x42).
+
+// Runs one coherency scenario and checks the result.
+//
+// Sequence: resetInputs() -> cmdA copies Ad to Bd on stream1 -> the requested
+// synchronization -> cmdB copies Bd to Cd on stream2 -> Cd is copied to Ch on
+// stream2 -> stream2 is synchronized -> checkReverse(Ch, numElements, expected).
+//
+//   cmdAType, cmdBType - how each copy is issued (see runCmd)
+//   syncType           - synchronization applied between the two commands
+//   stream1, stream2   - stream for cmdA, and stream for cmdB plus read-back
+//   numElements        - number of ints in each buffer
+//   Ad, Bd, Cd, Ch     - source, intermediate, destination and host buffers
+//   expected           - value passed to resetInputs() and checkReverse()
+void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType,
+                 hipStream_t stream1, hipStream_t stream2, int numElements,
+                 int* Ad, int* Bd, int* Cd, int* Ch, int expected) {
+  hipEvent_t e;
+  HIP_CHECK(hipEventCreateWithFlags(&e, 0));
+
+  resetInputs(Ad, Bd, Ch, numElements, expected);
+
+  const size_t sizeElements = numElements * sizeof(int);
+  // Report the size in megabytes (bytes / 2^20) so the "MB" label is accurate.
+  fprintf(stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n", // NOLINT
+          sizeElements, static_cast<double>(sizeElements) / (1024.0 * 1024.0),
+          CmdTypeStr(cmdAType), SyncTypeStr(syncType), CmdTypeStr(cmdBType));
+
+  /*if (SKIP_MODULE_KERNEL && ((cmdAType == MODULE_KERNEL) || (cmdBType == MODULE_KERNEL))) { // NOLINT
+    fprintf(stderr, "warn: skipping since test infra does not yet support modules\n"); // NOLINT
+    return;
+  }*/
+
+  // Step A:
+  runCmd(cmdAType, Bd, Ad, stream1, numElements);
+
+  // Sync in-between?
+  switch (syncType) {
+    case NONE:
+      break;
+    case EVENT_QUERY: {
+      // Busy-wait on the event recorded behind cmdA.
+      hipError_t st = hipErrorNotReady;
+      HIP_CHECK(hipEventRecord(e, stream1));
+      do {
+        st = hipEventQuery(e);
+      } while (st == hipErrorNotReady);
+      HIP_CHECK(st);
+    } break;
+    case EVENT_SYNC:
+      HIP_CHECK(hipEventRecord(e, stream1));
+      HIP_CHECK(hipEventSynchronize(e));
+      break;
+    case STREAM_WAIT_EVENT:
+      // Device-side dependency only: stream2 waits for the event, the host
+      // does not block here.
+      HIP_CHECK(hipEventRecord(e, stream1));
+      HIP_CHECK(hipStreamWaitEvent(stream2, e, 0));
+      break;
+    case STREAM_QUERY: {
+      // Busy-wait until stream1 reports completion.
+      hipError_t st = hipErrorNotReady;
+      do {
+        st = hipStreamQuery(stream1);
+      } while (st == hipErrorNotReady);
+      HIP_CHECK(st);
+    } break;
+    case STREAM_SYNC:
+      HIP_CHECK(hipStreamSynchronize(stream1));
+      break;
+    case DEVICE_SYNC:
+      HIP_CHECK(hipDeviceSynchronize());
+      break;
+    default:
+      fprintf(stderr, "warning: unknown sync type=%s\n",
+              SyncTypeStr(syncType));
+      // Release the event before bailing out so it is not leaked.
+      HIP_CHECK(hipEventDestroy(e));
+      return;
+  }
+  runCmd(cmdBType, Cd, Bd, stream2, numElements);
+
+  // Copy back to host, use async copy to avoid any extra synchronization
+  //  that might mask issues.
+  HIP_CHECK(hipMemcpyAsync(Ch, Cd, sizeElements, hipMemcpyDeviceToHost,
+                            stream2));
+  HIP_CHECK(hipStreamSynchronize(stream2));
+
+  checkReverse(Ch, numElements, expected);
+
+  HIP_CHECK(hipEventDestroy(e));
+}
+
+// Drives the coherency test for one problem size: allocates three device
+// buffers, one host buffer and two streams for `numElements` ints, runs one
+// fixed scenario followed by every enabled (cmdA, cmdB, sync) combination
+// through runTestImpl, then frees the buffers and destroys the streams.
+void testWrapper(size_t numElements) {
+  const int expected = 0x42;
+  const size_t sizeElements = numElements * sizeof(int);
+
+  int* Ad;
+  int* Bd;
+  int* Cd;
+  int* Ch;
+  HIP_CHECK(hipMalloc(&Ad, sizeElements));
+  HIP_CHECK(hipMalloc(&Bd, sizeElements));
+  HIP_CHECK(hipMalloc(&Cd, sizeElements));
+  HIP_CHECK(hipHostMalloc(&Ch, sizeElements));
+
+  hipStream_t stream1;
+  hipStream_t stream2;
+  HIP_CHECK(hipStreamCreate(&stream1));
+  HIP_CHECK(hipStreamCreate(&stream2));
+  HIP_CHECK(hipDeviceSynchronize());
+
+  // Fixed scenario run once before the full sweep.
+  runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements,
+              Ad, Bd, Cd, Ch, expected);
+
+  for (int cmdA = 0; cmdA < MAX_CmdType; cmdA++) {
+    for (int cmdB = 0; cmdB < MAX_CmdType; cmdB++) {
+      for (int syncMode = 0; syncMode < MAX_SyncType; syncMode++) {
+        // Only these sync modes are exercised; NONE and STREAM_QUERY (and
+        // any other value) are skipped.
+        const bool enabled =
+            syncMode == EVENT_QUERY || syncMode == EVENT_SYNC ||
+            syncMode == STREAM_WAIT_EVENT || syncMode == STREAM_SYNC ||
+            syncMode == DEVICE_SYNC;
+        if (!enabled) {
+          continue;
+        }
+        runTestImpl(CmdType(cmdA), SyncType(syncMode), CmdType(cmdB),
+                    stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
+      }
+    }
+  }
+
+#if 0
+  runTestImpl(COPY, STREAM_SYNC, MODULE_KERNEL, stream1, stream2,
+              numElements, Ad, Bd, Cd, Ch, expected);
+  runTestImpl(COPY, STREAM_SYNC, KERNEL, stream1, stream2, numElements,
+              Ad, Bd, Cd, Ch, expected);
+  runTestImpl(COPY, STREAM_WAIT_EVENT, MODULE_KERNEL, stream1, stream2,
+               numElements, Ad, Bd, Cd, Ch, expected);
+  runTestImpl(COPY, STREAM_WAIT_EVENT, KERNEL, stream1, stream2, numElements,
+              Ad, Bd, Cd, Ch, expected);
+#endif
+
+  HIP_CHECK(hipFree(Ad));
+  HIP_CHECK(hipFree(Bd));
+  HIP_CHECK(hipFree(Cd));
+  HIP_CHECK(hipHostFree(Ch));
+
+  HIP_CHECK(hipStreamDestroy(stream1));
+  HIP_CHECK(hipStreamDestroy(stream2));
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Test cache management (fences) and synchronization between
+ * kernel and copy commands. Exhaustively tests 3 command types
+ * (copy, kernel, module kernel), many sync types (see SyncType), followed by
+ *  another command, across a sweep of data sizes designed to stress
+ * various levels of the memory hierarchy.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/synchronization/copy_coherency.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.5
+ */
+
+TEST_CASE("Unit_Copy_Coherency") {
+  // Derive the entry count from the array's own element type so the sweep
+  // stays in bounds whatever type g_elementSizes is declared with, and use an
+  // unsigned index to avoid a signed/unsigned comparison.
+  const size_t numSizes = sizeof(g_elementSizes) / sizeof(g_elementSizes[0]);
+  for (size_t index = 0; index < numSizes; index++) {
+    const size_t numElements = g_elementSizes[index];
+    testWrapper(numElements);
+  }
+}
@@ -1,182 +1,182 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
-#include <hip/hip_fp16.h>
-
-#define WIDTH 4
-
-#define NUM (WIDTH * WIDTH)
-
-#define THREADS_PER_BLOCK_X 4
-#define THREADS_PER_BLOCK_Y 4
-#define THREADS_PER_BLOCK_Z 1
-
-// Device (Kernel) function, it must be void
-template <typename T> __global__ void matrixTranspose(T* out, T* in, const int width) {
-  int x = blockDim.x * blockIdx.x + threadIdx.x;
-  T val = in[x];
-  for (int i = 0; i < width; i++) {
-    for (int j = 0; j < width; j++) out[i * width + j] = __shfl(val, j * width + i);
-  }
-}
-
-// CPU implementation of matrix transpose
-template <typename T>
-void matrixTransposeCPUReference(T* output, T* input, const unsigned int width) {
-  for (unsigned int j = 0; j < width; j++) {
-    for (unsigned int i = 0; i < width; i++) {
-      output[i * width + j] = input[j * width + i];
-    }
-  }
-}
-
-static void getFactor(int* fact) { *fact = 101; }
-static void getFactor(unsigned int* fact) { *fact = static_cast<unsigned int>(INT32_MAX) + 1; }
-static void getFactor(float* fact) { *fact = 2.5; }
-static void getFactor(__half* fact) { *fact = 2.5; }
-static void getFactor(double* fact) { *fact = 2.5; }
-static void getFactor(int64_t* fact) { *fact = 303; }
-static void getFactor(uint64_t* fact) { *fact = static_cast<uint64_t>(__LONG_LONG_MAX__) + 1; }
-
-template <typename T> int compare(T* TransposeMatrix, T* cpuTransposeMatrix) {
-  int errors = 0;
-  for (int i = 0; i < NUM; i++) {
-    if (TransposeMatrix[i] != cpuTransposeMatrix[i]) {
-      errors++;
-    }
-  }
-  return errors;
-}
-
-template <> int compare<__half>(__half* TransposeMatrix, __half* cpuTransposeMatrix) {
-  int errors = 0;
-  for (int i = 0; i < NUM; i++) {
-    if (__half2float(TransposeMatrix[i]) != __half2float(cpuTransposeMatrix[i])) {  // NOLINT
-      errors++;
-    }
-  }
-  return errors;
-}
-
-template <typename T> void init(T* Matrix) {
-  // initialize the input data
-  T factor;
-  getFactor(&factor);
-  for (int i = 0; i < NUM; i++) {
-    Matrix[i] = (T)i + factor;
-  }
-}
-
-template <> void init(__half* Matrix) {
-  // initialize the input data
-  __half factor;
-  getFactor(&factor);
-  for (int i = 0; i < NUM; i++) {
-    Matrix[i] = i + __half2float(factor);
-  }
-}
-
-template <typename T> static void runTest() {
-  T* Matrix;
-  T* TransposeMatrix;
-  T* cpuTransposeMatrix;
-
-  T* gpuMatrix;
-  T* gpuTransposeMatrix;
-
-  hipDeviceProp_t devProp;
-  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
-
-  int errors = 0;
-
-  Matrix = reinterpret_cast<T*>(malloc(NUM * sizeof(T)));
-  TransposeMatrix = reinterpret_cast<T*>(malloc(NUM * sizeof(T)));
-  cpuTransposeMatrix = reinterpret_cast<T*>(malloc(NUM * sizeof(T)));
-
-  init(Matrix);
-
-  // allocate the memory on the device side
-  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&gpuMatrix), NUM * sizeof(T)));
-  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&gpuTransposeMatrix), NUM * sizeof(T)));
-
-  // Memory transfer from host to device
-  HIP_CHECK(hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(T), hipMemcpyHostToDevice));
-
-  // Lauching kernel from host
-  hipLaunchKernelGGL(matrixTranspose<T>, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
-                     0, 0, gpuTransposeMatrix, gpuMatrix, WIDTH);
-
-  // Memory transfer from device to host
-  HIP_CHECK(hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(T), hipMemcpyDeviceToHost));
-
-  // CPU MatrixTranspose computation
-  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
-
-  // verify the results
-  REQUIRE(errors == compare(TransposeMatrix, cpuTransposeMatrix));
-  // free the resources on device side
-  HIP_CHECK(hipFree(gpuMatrix));
-  HIP_CHECK(hipFree(gpuTransposeMatrix));
-
-  // free the resources on host side
-  free(Matrix);
-  free(TransposeMatrix);
-  free(cpuTransposeMatrix);
-}
-
-/**
- * @addtogroup __shfl __shfl
- * @{
- * @ingroup ShflTest
- * `T  __shfl(T var, int srcLane, int width=warpSize)` -
- * Contains wrap __shfl functions.
- * @}
- */
-
-/**
- * Test Description
- * ------------------------
- * - Test case to verify __shfl warp functions for different datatypes.
-
- * Test source
- * ------------------------
- *    - catch/unit/kernel/hipShflTests.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.6
- */
-
-TEST_CASE("Unit_hipShflTests") {
-  SECTION("run test for int") { runTest<int>(); }
-  SECTION("run test for float") { runTest<float>(); }
-  SECTION("run test for double") { runTest<double>(); }
-  // Test added to support half datatype.
-  SECTION("run test for __half") { runTest<__half>(); }
-  SECTION("run test for int64_t") { runTest<int64_t>(); }
-  SECTION("run test for unsigned int") { runTest<unsigned int>(); }
-  SECTION("run test for uint64_t") { runTest<uint64_t>(); }
-}
-
-/**
-* End doxygen group ShflTest.
-* @}
-*/
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+#include <hip/hip_fp16.h>
+
+#define WIDTH 4
+
+#define NUM (WIDTH * WIDTH)
+
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1
+
+// Device kernel (must return void). Each thread loads one input element and,
+// for every output position (row, col), stores the value __shfl returns for
+// source lane col * width + row. Every thread writes every output element.
+template <typename T>
+__global__ void matrixTranspose(T* out, T* in, const int width) {
+  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  T val = in[tid];
+  for (int row = 0; row < width; row++) {
+    for (int col = 0; col < width; col++) {
+      const int srcLane = col * width + row;
+      out[row * width + col] = __shfl(val, srcLane);
+    }
+  }
+}
+
+// Host reference transpose of a width x width row-major matrix:
+// output[c * width + r] = input[r * width + c] for every r, c in [0, width).
+template <typename T>
+void matrixTransposeCPUReference(T* output, T* input, const unsigned int width) {
+  for (unsigned int srcRow = 0; srcRow < width; ++srcRow) {
+    for (unsigned int srcCol = 0; srcCol < width; ++srcCol) {
+      const unsigned int from = srcRow * width + srcCol;
+      const unsigned int to = srcCol * width + srcRow;
+      output[to] = input[from];
+    }
+  }
+}
+
+// Per-type offset written through `fact`; init() adds it to the element
+// index when building the input matrix. One overload per tested element type.
+static void getFactor(int* fact) { *fact = 101; }
+// INT32_MAX + 1, evaluated in unsigned arithmetic.
+static void getFactor(unsigned int* fact) { *fact = static_cast<unsigned int>(INT32_MAX) + 1; }
+static void getFactor(float* fact) { *fact = 2.5; }
+static void getFactor(__half* fact) { *fact = 2.5; }
+static void getFactor(double* fact) { *fact = 2.5; }
+static void getFactor(int64_t* fact) { *fact = 303; }
+// __LONG_LONG_MAX__ + 1, evaluated in uint64_t arithmetic.
+static void getFactor(uint64_t* fact) { *fact = static_cast<uint64_t>(__LONG_LONG_MAX__) + 1; }
+
+// Returns how many of the NUM elements differ between the two matrices.
+template <typename T> int compare(T* TransposeMatrix, T* cpuTransposeMatrix) {
+  int mismatches = 0;
+  for (int idx = 0; idx < NUM; ++idx) {
+    const bool differs = TransposeMatrix[idx] != cpuTransposeMatrix[idx];
+    if (differs) {
+      ++mismatches;
+    }
+  }
+  return mismatches;
+}
+
+// __half specialization: elements are converted with __half2float before
+// being compared. Returns the number of differing elements out of NUM.
+template <> int compare<__half>(__half* TransposeMatrix, __half* cpuTransposeMatrix) {
+  int mismatches = 0;
+  for (int idx = 0; idx < NUM; ++idx) {
+    const float gpuValue = __half2float(TransposeMatrix[idx]);
+    const float cpuValue = __half2float(cpuTransposeMatrix[idx]);
+    if (gpuValue != cpuValue) {  // NOLINT
+      ++mismatches;
+    }
+  }
+  return mismatches;
+}
+
+// Fills the NUM-element matrix with index + per-type factor (see getFactor).
+template <typename T> void init(T* Matrix) {
+  T offset;
+  getFactor(&offset);
+  for (int idx = 0; idx < NUM; ++idx) {
+    const T position = static_cast<T>(idx);
+    Matrix[idx] = position + offset;
+  }
+}
+
+// __half specialization: the factor is converted to float once, and each
+// element is assigned index + that float value.
+template <> void init(__half* Matrix) {
+  __half factor;
+  getFactor(&factor);
+  const float offset = __half2float(factor);
+  for (int idx = 0; idx < NUM; ++idx) {
+    Matrix[idx] = idx + offset;
+  }
+}
+
+// Runs the __shfl transpose test for element type T: builds a NUM-element
+// input on the host, transposes it on the device with matrixTranspose<T>
+// (one block of THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y threads) and on
+// the host with matrixTransposeCPUReference, then requires that compare()
+// reports zero differing elements.
+template <typename T> static void runTest() {
+  T* Matrix;
+  T* TransposeMatrix;
+  T* cpuTransposeMatrix;
+
+  T* gpuMatrix;
+  T* gpuTransposeMatrix;
+
+  // NOTE(review): devProp is not read again in this function.
+  hipDeviceProp_t devProp;
+  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
+
+  // Expected mismatch count.
+  int errors = 0;
+
+  // Host buffers; the malloc results are not checked.
+  Matrix = reinterpret_cast<T*>(malloc(NUM * sizeof(T)));
+  TransposeMatrix = reinterpret_cast<T*>(malloc(NUM * sizeof(T)));
+  cpuTransposeMatrix = reinterpret_cast<T*>(malloc(NUM * sizeof(T)));
+
+  init(Matrix);
+
+  // allocate the memory on the device side
+  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&gpuMatrix), NUM * sizeof(T)));
+  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&gpuTransposeMatrix), NUM * sizeof(T)));
+
+  // Memory transfer from host to device
+  HIP_CHECK(hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(T), hipMemcpyHostToDevice));
+
+  // Launching kernel from host
+  hipLaunchKernelGGL(matrixTranspose<T>, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y),
+                     0, 0, gpuTransposeMatrix, gpuMatrix, WIDTH);
+
+  // Memory transfer from device to host
+  HIP_CHECK(hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(T), hipMemcpyDeviceToHost));
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  REQUIRE(errors == compare(TransposeMatrix, cpuTransposeMatrix));
+  // free the resources on device side
+  HIP_CHECK(hipFree(gpuMatrix));
+  HIP_CHECK(hipFree(gpuTransposeMatrix));
+
+  // free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+}
+
+/**
+ * @addtogroup __shfl __shfl
+ * @{
+ * @ingroup ShflTest
+ * `T  __shfl(T var, int srcLane, int width=warpSize)` -
+ * Contains warp __shfl functions.
+ * @}
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ * - Test case to verify __shfl warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipShflTests.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ */
+
+// Runs the __shfl transpose test once per element type; each SECTION
+// instantiates runTest for one type.
+TEST_CASE("Unit_hipShflTests") {
+  SECTION("run test for int") { runTest<int>(); }
+  SECTION("run test for float") { runTest<float>(); }
+  SECTION("run test for double") { runTest<double>(); }
+  // Test added to support half datatype.
+  SECTION("run test for __half") { runTest<__half>(); }
+  SECTION("run test for int64_t") { runTest<int64_t>(); }
+  SECTION("run test for unsigned int") { runTest<unsigned int>(); }
+  SECTION("run test for uint64_t") { runTest<uint64_t>(); }
+}
+
+/**
+* End doxygen group ShflTest.
+* @}
+*/
@@ -1,241 +1,241 @@
-/*
-Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#include <hip_test_kernels.hh>
-#include <hip_test_checkers.hh>
-#include <hip_test_common.hh>
-#include <hip/hip_fp16.h>
-
-const int size = 32;
-
-template <typename T> __global__ void shflDownSum(T* a, int size) {
-  T val = a[threadIdx.x];
-  for (int i = size / 2; i > 0; i /= 2) {
-    val += __shfl_down(val, i, size);
-  }
-  a[threadIdx.x] = val;
-}
-
-template <typename T> __global__ void shflUpSum(T* a, int size) {
-  T val = a[threadIdx.x];
-  for (int i = size / 2; i > 0; i /= 2) {
-    val += __shfl_up(val, i, size);
-  }
-  a[threadIdx.x] = val;
-}
-
-template <typename T> __global__ void shflXorSum(T* a, int size) {
-  T val = a[threadIdx.x];
-  for (int i = size / 2; i > 0; i /= 2) {
-    val += __shfl_xor(val, i, size);
-  }
-  a[threadIdx.x] = val;
-}
-
-static void getFactor(int* fact) { *fact = 101; }
-static void getFactor(unsigned int* fact) { *fact = static_cast<unsigned int>(INT32_MAX) + 1; }
-static void getFactor(float* fact) { *fact = 2.5; }
-static void getFactor(double* fact) { *fact = 2.5; }
-static void getFactor(__half* fact) { *fact = 2.5; }
-static void getFactor(int64_t* fact) { *fact = 303; }
-static void getFactor(uint64_t* fact) { *fact = static_cast<uint64_t>(__LONG_LONG_MAX__) + 1; }
-
-template <typename T> T sum(T* a) {
-  T cpuSum = 0;
-  T factor;
-  getFactor(&factor);
-  for (int i = 0; i < size; i++) {
-    a[i] = i + factor;
-    cpuSum += a[i];
-  }
-  return cpuSum;
-}
-
-template <> __half sum(__half* a) {
-  __half cpuSum = 0;
-  __half factor;
-  getFactor(&factor);
-  for (int i = 0; i < size; i++) {
-    a[i] = i + __half2float(factor);
-    cpuSum = __half2float(cpuSum) + __half2float(a[i]);
-  }
-  return cpuSum;
-}
-
-template <typename T> bool compare(T gpuSum, T cpuSum) {
-  if (gpuSum != cpuSum) {
-    return true;
-  }
-  return false;
-}
-
-template <> bool compare(__half gpuSum, __half cpuSum) {
-  if (__half2float(gpuSum) != __half2float(cpuSum)) {
-    return true;
-  }
-  return false;
-}
-
-template <typename T> static void runTestShflUp() {
-  const int size = 32;
-  T a[size];
-  T cpuSum = sum(a);
-  T* d_a;
-  HIP_CHECK(hipMalloc(&d_a, sizeof(T) * size));
-  HIP_CHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault));
-  hipLaunchKernelGGL(shflUpSum<T>, 1, size, 0, 0, d_a, size);
-  HIP_CHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault));
-  REQUIRE((compare(a[size - 1], cpuSum)) == 0);
-  HIP_CHECK(hipFree(d_a));
-}
-
-template <typename T> static void runTestShflDown() {
-  T a[size];
-  T cpuSum = sum(a);
-  T* d_a;
-  HIP_CHECK(hipMalloc(&d_a, sizeof(T) * size));
-  HIP_CHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault));
-  hipLaunchKernelGGL(shflDownSum<T>, 1, size, 0, 0, d_a, size);
-  HIP_CHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault));
-  REQUIRE((compare(a[0], cpuSum)) == 0);
-  HIP_CHECK(hipFree(d_a));
-}
-
-template <typename T> static void runTestShflXor() {
-  T a[size];
-  T cpuSum = sum(a);
-  T* d_a;
-  HIP_CHECK(hipMalloc(&d_a, sizeof(T) * size));
-  HIP_CHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault));
-  hipLaunchKernelGGL(shflXorSum<T>, 1, size, 0, 0, d_a, size);
-  HIP_CHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault));
-  REQUIRE((compare(a[0], cpuSum)) == 0);
-  HIP_CHECK(hipFree(d_a));
-}
-
-/**
- * @addtogroup __shfl __shfl
- * @{
- * @ingroup ShflTest
- * `T __shfl_up(T var, unsigned int lane_delta, int width = warpSize)` -
- * Contains warp __shfl_up function
- */
-
-/**
- * Test Description
- * ------------------------
- *    - Test case to verify __shfl_up warp functions for different datatypes.
-
- * Test source
- * ------------------------
- *    - catch/unit/kernel/hipShflUpDownTest.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.6
- *    - Gaurding this test against cuda with refernce to mentioned
- * ticket SWDEV-379177
- */
-
-TEST_CASE("Unit_runTestShfl_up") {
-  SECTION("runTestShflUp for int") { runTestShflUp<int>(); }
-  SECTION("runTestShflUp for float") { runTestShflUp<float>(); }
-  SECTION("runTestShflUp for double") { runTestShflUp<double>(); }
-  SECTION("runTestShflUp for __half") { runTestShflUp<__half>(); }
-  SECTION("runTestShflUp for int64_t") { runTestShflUp<int64_t>(); }
-  SECTION("runTestShflUp for unsigned int") { runTestShflUp<unsigned int>(); }
-  SECTION("runTestShflUp for uint64_t") { runTestShflUp<uint64_t>(); }
-}
-/**
- * End doxygen group __shfl.
- * @}
- */
-
-/**
- * @addtogroup __shfl __shfl
- * @{
- * @ingroup ShflTest
- * `T __shfl_down(T var, unsigned int lane_delta, int width = warpSize)` -
- * Contains warp __shfl_down function
- */
-
-/**
- * Test Description
- * ------------------------
- *    - Test case to verify __shfl_down warp functions for different datatypes.
-
- * Test source
- * ------------------------
- *    - catch/unit/kernel/hipShflUpDownTest.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.6
- *    - Gaurding this test against cuda with refernce to mentioned
- * ticket SWDEV-379177
- */
-
-TEST_CASE("Unit_runTestShfl_Down") {
-  SECTION("runTestShflDown for int") { runTestShflDown<int>(); }
-  SECTION("runTestShflDown for float") { runTestShflDown<float>(); }
-  SECTION("runTestShflDown for double") { runTestShflDown<double>(); }
-  SECTION("runTestShflDown for __half") { runTestShflDown<__half>(); }
-  SECTION("runTestShflDown for int64_t") { runTestShflDown<int64_t>(); }
-  SECTION("runTestShflDown for unsigned int") { runTestShflDown<unsigned int>(); }
-  SECTION("runTestShflDown for uint64_t") { runTestShflDown<uint64_t>(); }
-}
-/**
- * End doxygen group __shfl.
- * @}
- */
-
-/**
- * @addtogroup __shfl __shfl
- * @{
- * @ingroup ShflTest
- * `T __shfl_xor(T var, int laneMask, int width=warpSize)` -
- * Contains warp __shfl_xor function
- */
-
-/**
- * Test Description
- * ------------------------
- *    - Test case to verify __shfl_xor warp functions for different datatypes.
-
- * Test source
- * ------------------------
- *    - catch/unit/kernel/hipShflUpDownTest.cc
- * Test requirements
- * ------------------------
- *    - HIP_VERSION >= 5.6
- *    - Gaurding this test against cuda with refernce to mentioned
- * ticket SWDEV-379177
- */
-
-TEST_CASE("Unit_runTestShfl_Xor") {
-  SECTION("runTestShflXor for int") { runTestShflXor<int>(); }
-  SECTION("runTestShflXor for float") { runTestShflXor<float>(); }
-  SECTION("runTestShflXor for double") { runTestShflXor<double>(); }
-  SECTION("runTestShflXor for __half") { runTestShflXor<__half>(); }
-  SECTION("runTestShflXor for int64_t") { runTestShflXor<int64_t>(); }
-  SECTION("runTestShflXor for unsigned int") { runTestShflXor<unsigned int>(); }
-  SECTION("runTestShflXor for uint64_t") { runTestShflXor<uint64_t>(); }
-}
-/**
- * End doxygen group __shfl.
- * @}
- */
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+#include <hip/hip_fp16.h>
+
+const int size = 32;
+
+// Each thread starts from a[threadIdx.x] and repeatedly adds the value
+// __shfl_down returns for offsets size/2, size/4, ..., 1 (shuffle width
+// `size`), then writes its running total back to a[threadIdx.x].
+template <typename T>
+__global__ void shflDownSum(T* a, int size) {
+  T acc = a[threadIdx.x];
+  for (int offset = size / 2; offset > 0; offset /= 2) {
+    acc += __shfl_down(acc, offset, size);
+  }
+  a[threadIdx.x] = acc;
+}
+
+// Each thread starts from a[threadIdx.x] and repeatedly adds the value
+// __shfl_up returns for offsets size/2, size/4, ..., 1 (shuffle width
+// `size`), then writes its running total back to a[threadIdx.x].
+template <typename T>
+__global__ void shflUpSum(T* a, int size) {
+  T acc = a[threadIdx.x];
+  for (int offset = size / 2; offset > 0; offset /= 2) {
+    acc += __shfl_up(acc, offset, size);
+  }
+  a[threadIdx.x] = acc;
+}
+
+// Each thread starts from a[threadIdx.x] and repeatedly adds the value
+// __shfl_xor returns for lane masks size/2, size/4, ..., 1 (shuffle width
+// `size`), then writes its running total back to a[threadIdx.x].
+template <typename T>
+__global__ void shflXorSum(T* a, int size) {
+  T acc = a[threadIdx.x];
+  for (int laneMask = size / 2; laneMask > 0; laneMask /= 2) {
+    acc += __shfl_xor(acc, laneMask, size);
+  }
+  a[threadIdx.x] = acc;
+}
+
+// Per-type offset written through `fact`; sum() adds it to the element index
+// when building the input array. One overload per tested element type.
+static void getFactor(int* fact) { *fact = 101; }
+// INT32_MAX + 1, evaluated in unsigned arithmetic.
+static void getFactor(unsigned int* fact) { *fact = static_cast<unsigned int>(INT32_MAX) + 1; }
+static void getFactor(float* fact) { *fact = 2.5; }
+static void getFactor(double* fact) { *fact = 2.5; }
+static void getFactor(__half* fact) { *fact = 2.5; }
+static void getFactor(int64_t* fact) { *fact = 303; }
+// __LONG_LONG_MAX__ + 1, evaluated in uint64_t arithmetic.
+static void getFactor(uint64_t* fact) { *fact = static_cast<uint64_t>(__LONG_LONG_MAX__) + 1; }
+
+// Fills a[0..size) with index + per-type factor (see getFactor) and returns
+// the sum of the stored elements, accumulated in T.
+template <typename T> T sum(T* a) {
+  T offset;
+  getFactor(&offset);
+  T total = 0;
+  for (int idx = 0; idx < size; ++idx) {
+    a[idx] = idx + offset;
+    total += a[idx];
+  }
+  return total;
+}
+
+// __half specialization: elements are index + factor (factor converted to
+// float once). The running sum is stored as __half after every step; each
+// addition is done on the __half2float conversions of both operands.
+template <> __half sum(__half* a) {
+  __half total = 0;
+  __half factor;
+  getFactor(&factor);
+  const float offset = __half2float(factor);
+  for (int idx = 0; idx < size; ++idx) {
+    a[idx] = idx + offset;
+    total = __half2float(total) + __half2float(a[idx]);
+  }
+  return total;
+}
+
+// Returns true when the two sums DIFFER and false when they are equal, so
+// callers test the result against 0 for success.
+template <typename T> bool compare(T gpuSum, T cpuSum) {
+  return gpuSum != cpuSum;
+}
+
+// __half specialization: both values are converted with __half2float first.
+// Returns true when they differ, false when they are equal.
+template <> bool compare(__half gpuSum, __half cpuSum) {
+  return __half2float(gpuSum) != __half2float(cpuSum);
+}
+
+// Runs the __shfl_up reduction for element type T: sum() fills the host
+// array and returns the CPU total, the array is copied to the device,
+// shflUpSum<T> runs with one block of `size` threads, and after copying back
+// the last element (a[size - 1]) must equal the CPU total.
+//
+// Uses the file-level `size` constant, like sum() and the sibling Down/Xor
+// tests, instead of redeclaring a shadowing local copy.
+template <typename T> static void runTestShflUp() {
+  T a[size];
+  T cpuSum = sum(a);
+  T* d_a;
+  HIP_CHECK(hipMalloc(&d_a, sizeof(T) * size));
+  HIP_CHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflUpSum<T>, 1, size, 0, 0, d_a, size);
+  HIP_CHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault));
+  // compare() returns true on mismatch, so 0 means the sums agree.
+  REQUIRE((compare(a[size - 1], cpuSum)) == 0);
+  HIP_CHECK(hipFree(d_a));
+}
+
+// Runs the __shfl_down reduction for element type T: sum() fills the host
+// array and returns the CPU total, the array is copied to the device,
+// shflDownSum<T> runs with one block of `size` threads, and after copying
+// back the first element (a[0]) must equal the CPU total.
+template <typename T> static void runTestShflDown() {
+  T a[size];
+  T cpuSum = sum(a);
+  T* d_a;
+  HIP_CHECK(hipMalloc(&d_a, sizeof(T) * size));
+  HIP_CHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflDownSum<T>, 1, size, 0, 0, d_a, size);
+  HIP_CHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault));
+  // compare() returns true on mismatch, so 0 means the sums agree.
+  REQUIRE((compare(a[0], cpuSum)) == 0);
+  HIP_CHECK(hipFree(d_a));
+}
+
+// Runs the __shfl_xor reduction for element type T: sum() fills the host
+// array and returns the CPU total, the array is copied to the device,
+// shflXorSum<T> runs with one block of `size` threads, and after copying
+// back the first element (a[0]) must equal the CPU total.
+template <typename T> static void runTestShflXor() {
+  T a[size];
+  T cpuSum = sum(a);
+  T* d_a;
+  HIP_CHECK(hipMalloc(&d_a, sizeof(T) * size));
+  HIP_CHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflXorSum<T>, 1, size, 0, 0, d_a, size);
+  HIP_CHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault));
+  // compare() returns true on mismatch, so 0 means the sums agree.
+  REQUIRE((compare(a[0], cpuSum)) == 0);
+  HIP_CHECK(hipFree(d_a));
+}
+
+/**
+ * @addtogroup __shfl __shfl
+ * @{
+ * @ingroup ShflTest
+ * `T __shfl_up(T var, unsigned int lane_delta, int width = warpSize)` -
+ * Contains warp __shfl_up function
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Test case to verify __shfl_up warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipShflUpDownTest.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ *    - Guarding this test against CUDA with reference to the mentioned
+ * ticket SWDEV-379177
+ */
+
+// Runs the __shfl_up reduction test once per element type; each SECTION
+// instantiates runTestShflUp for one type.
+TEST_CASE("Unit_runTestShfl_up") {
+  SECTION("runTestShflUp for int") { runTestShflUp<int>(); }
+  SECTION("runTestShflUp for float") { runTestShflUp<float>(); }
+  SECTION("runTestShflUp for double") { runTestShflUp<double>(); }
+  SECTION("runTestShflUp for __half") { runTestShflUp<__half>(); }
+  SECTION("runTestShflUp for int64_t") { runTestShflUp<int64_t>(); }
+  SECTION("runTestShflUp for unsigned int") { runTestShflUp<unsigned int>(); }
+  SECTION("runTestShflUp for uint64_t") { runTestShflUp<uint64_t>(); }
+}
+/**
+ * End doxygen group __shfl.
+ * @}
+ */
+
+/**
+ * @addtogroup __shfl __shfl
+ * @{
+ * @ingroup ShflTest
+ * `T __shfl_down(T var, unsigned int lane_delta, int width = warpSize)` -
+ * Contains warp __shfl_down function
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Test case to verify __shfl_down warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipShflUpDownTest.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ *    - Guarding this test against CUDA with reference to the mentioned
+ * ticket SWDEV-379177
+ */
+
+// Runs the __shfl_down reduction test once per element type; each SECTION
+// instantiates runTestShflDown for one type.
+TEST_CASE("Unit_runTestShfl_Down") {
+  SECTION("runTestShflDown for int") { runTestShflDown<int>(); }
+  SECTION("runTestShflDown for float") { runTestShflDown<float>(); }
+  SECTION("runTestShflDown for double") { runTestShflDown<double>(); }
+  SECTION("runTestShflDown for __half") { runTestShflDown<__half>(); }
+  SECTION("runTestShflDown for int64_t") { runTestShflDown<int64_t>(); }
+  SECTION("runTestShflDown for unsigned int") { runTestShflDown<unsigned int>(); }
+  SECTION("runTestShflDown for uint64_t") { runTestShflDown<uint64_t>(); }
+}
+/**
+ * End doxygen group __shfl.
+ * @}
+ */
+
+/**
+ * @addtogroup __shfl __shfl
+ * @{
+ * @ingroup ShflTest
+ * `T __shfl_xor(T var, int laneMask, int width=warpSize)` -
+ * Contains warp __shfl_xor function
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Test case to verify __shfl_xor warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipShflUpDownTest.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ *    - Guarding this test against CUDA with reference to the mentioned
+ * ticket SWDEV-379177
+ */
+
+// Runs the __shfl_xor reduction test once per element type; each SECTION
+// instantiates runTestShflXor for one type.
+TEST_CASE("Unit_runTestShfl_Xor") {
+  SECTION("runTestShflXor for int") { runTestShflXor<int>(); }
+  SECTION("runTestShflXor for float") { runTestShflXor<float>(); }
+  SECTION("runTestShflXor for double") { runTestShflXor<double>(); }
+  SECTION("runTestShflXor for __half") { runTestShflXor<__half>(); }
+  SECTION("runTestShflXor for int64_t") { runTestShflXor<int64_t>(); }
+  SECTION("runTestShflXor for unsigned int") { runTestShflXor<unsigned int>(); }
+  SECTION("runTestShflXor for uint64_t") { runTestShflXor<uint64_t>(); }
+}
+/**
+ * End doxygen group __shfl.
+ * @}
+ */
@@ -1,437 +1,437 @@
-/*
- Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
-/* HIT_START
- * BUILD: %t %s ../../src/test_common.cpp
- * TEST: %t
- * HIT_END
- */
-
-#include "test_common.h"
-#include <iostream>
-#include <chrono>
-
-static unsigned int sizeList[] = {
-  256, 512, 1024, 2048, 4096, 8192,
-};
-
-static unsigned int eleNumList[] = {
-    0x100, 0x400, 0x1000, 0x4000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000,
-    0x200000, 0x400000, 0x800000, 0x1000000
-};
-
-typedef struct _dataType {
-char memsetval = 0x42;
-char memsetD8val = 0xDE;
-int16_t memsetD16val = 0xDEAD;
-int memsetD32val = 0xDEADBEEF;
-}dataType;
-
-#define NUM_ITER 1000
-
-enum MemsetType {
-  hipMemsetTypeDefault,
-  hipMemsetTypeD8,
-  hipMemsetTypeD16,
-  hipMemsetTypeD32,
-  hipMemsetTypeMax
-
-};
-
-using namespace std;
-
-class hipPerfMemset {
-  private:
-    uint64_t     bufSize_;
-    unsigned int num_elements_;
-    unsigned int testNumEle_;
-    unsigned int _numSubTests = 0;
-    unsigned int _numSubTests2D = 0;
-    unsigned int _numSubTests3D = 0;
-    unsigned int num_sizes_ =0;
-
-  public:
-    hipPerfMemset() {
-    num_elements_ = sizeof(eleNumList) / sizeof(unsigned int);
-    _numSubTests = num_elements_ * hipMemsetTypeMax;
-
-    num_sizes_ = sizeof(sizeList) / sizeof(unsigned int);
-    _numSubTests2D = num_sizes_;
-    _numSubTests3D = _numSubTests2D;
-    };
-
-    ~hipPerfMemset() {};
-
-    void open(int deviceID);
-
-    template<typename T>
-    void run1D(unsigned int test, T memsetval, enum MemsetType type, bool async);
-
-    template<typename T>
-    void run2D(unsigned int test, T memsetval, enum MemsetType type, bool async);
-
-    template<typename T>
-    void run3D(unsigned int test, T memsetval, enum MemsetType type, bool async);
-
-    uint getNumTests() {
-      return _numSubTests;
-    }
-
-    uint getNumTests2D() {
-      return _numSubTests2D;
-    }
-    uint getNumTests3D() {
-      return _numSubTests3D;
-    }
-};
-
-
-void hipPerfMemset::open(int deviceId) {
-  int nGpu = 0;
-  HIPCHECK(hipGetDeviceCount(&nGpu));
-  if (nGpu < 1) {
-    failed("No GPU!");
-  }
-
-  HIPCHECK(hipSetDevice(deviceId));
-  hipDeviceProp_t props = {0};
-  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
-  std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
-            << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
-            << std::endl;
-}
-
-template<typename T>
-void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type, bool async) {
-
-  T * A_h;
-  T * A_d;
-
-  testNumEle_ = eleNumList[test % num_elements_];
-
-  bufSize_ = testNumEle_ * sizeof(uint32_t);
-
-  HIPCHECK(hipMalloc(&A_d, bufSize_));
-
-  A_h = reinterpret_cast<T*> (malloc(bufSize_));
-
-  hipStream_t stream;
-  HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-
-  // Warm-up
-  if (async) {
-    HIPCHECK(hipMemsetAsync((void *)A_d, memsetval, bufSize_, stream));
-    HIPCHECK(hipStreamSynchronize(stream));
-  } else {
-    HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_));
-    HIPCHECK(hipDeviceSynchronize());
-  }
-
-  auto start = chrono::high_resolution_clock::now();
-  for (uint i = 0; i < NUM_ITER; i++) {
-    if (type == hipMemsetTypeDefault && !async) {
-      HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_));
-    }
-    else if (type == hipMemsetTypeDefault && async) {
-      HIPCHECK(hipMemsetAsync(A_d, memsetval, bufSize_, stream));
-    }
-    else if (type == hipMemsetTypeD8 && !async){
-      HIPCHECK(hipMemsetD8((hipDeviceptr_t)A_d, memsetval, bufSize_));
-    }
-    else if (type == hipMemsetTypeD8 && async) {
-      HIPCHECK(hipMemsetD8Async((hipDeviceptr_t)A_d, memsetval, bufSize_, stream));
-    }
-    else if (type == hipMemsetTypeD16 && !async) {
-      HIPCHECK(hipMemsetD16((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T)));
-    }
-    else if (type == hipMemsetTypeD16 && async) {
-      HIPCHECK(hipMemsetD16Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream));
-    }
-    else if (type == hipMemsetTypeD32 && !async) {
-      HIPCHECK(hipMemsetD32((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T)));
-    }
-    else if (type == hipMemsetTypeD32 && async) {
-      HIPCHECK(hipMemsetD32Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream));
-    }
-  }
-  if (async) {
-    HIPCHECK(hipStreamSynchronize(stream));
-  } else {
-    HIPCHECK(hipDeviceSynchronize());
-  }
-
-  auto end = chrono::high_resolution_clock::now();
-
-  HIPCHECK(hipMemcpy(A_h, A_d, bufSize_, hipMemcpyDeviceToHost) );
-
-  for (int i = 0; i < bufSize_ / sizeof(T); i++) {
-    if (A_h[i] != memsetval) {
-      cout << "mismatch at index " << i << " computed: " << static_cast<int> (A_h[i])
-           << ", memsetval: " << static_cast<int> (memsetval) << endl;
-      break;
-    }
-  }
-
-  HIPCHECK(hipFree(A_d));
-  free(A_h);
-
-  auto diff = std::chrono::duration<double>(end - start);
-  auto sec = diff.count();
-
-  auto perf = static_cast<double>((bufSize_ * NUM_ITER * (double)(1e-09)) / sec);
-
-  cout <<  "[" << setw(2) << test << "] " << setw(5) << bufSize_/1024 << " Kb " << setw(4)
-       << " typeSize " << (int)sizeof(T) << " : " << setw(7) << perf << " GB/s " << endl;
-}
-
-template<typename T>
-void hipPerfMemset::run2D(unsigned int test, T memsetval, enum MemsetType type, bool async) {
-
-  bufSize_ = sizeList[test % num_sizes_];
-
-  size_t numH = bufSize_;
-  size_t numW = bufSize_;
-  size_t pitch_A;
-  size_t width = numW * sizeof(char);
-  size_t sizeElements = width * numH;
-  size_t elements = numW* numH;
-
-  T * A_h;
-  T * A_d;
-
-  HIPCHECK(hipMallocPitch(reinterpret_cast<void**>(&A_d), &pitch_A, width ,
-                          numH));
-  A_h = reinterpret_cast<char*>(malloc(sizeElements));
-
-  for (size_t i=0; i < elements; i++) {
-    A_h[i] = 1;
-  }
-
-  hipStream_t stream;
-  HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-
-  // Warm-up
-  if (async) {
-    HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream));
-    HIPCHECK(hipStreamSynchronize(stream));
-  } else {
-    HIPCHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH));
-    HIPCHECK(hipDeviceSynchronize());
-  }
-
-  auto start = chrono::steady_clock::now();
-
-  for (uint i = 0; i < NUM_ITER; i++) {
-    if (type == hipMemsetTypeDefault && !async) {
-    HIPCHECK(hipMemset2D(A_d, pitch_A, memsetval, numW, numH));
-    }
-    else if (type == hipMemsetTypeDefault && async) {
-      HIPCHECK(hipMemset2DAsync(A_d, pitch_A, memsetval, numW, numH, stream));
-    }
-  }
-
-  if (async) {
-    HIPCHECK(hipStreamSynchronize(stream));
-  } else {
-    HIPCHECK(hipDeviceSynchronize());
-  }
-
-  auto end = chrono::steady_clock::now();
-
-  HIPCHECK(hipMemcpy2D(A_h, width, A_d, pitch_A, numW, numH,
-                       hipMemcpyDeviceToHost));
-
-  for (int i=0; i < elements; i++) {
-    if (A_h[i] != memsetval) {
-      cout << "mismatch at index " << i << " computed: " << static_cast<int> (A_h[i])
-           << ", memsetval: " << static_cast<int> (memsetval) << endl;
-      break;
-    }
-  }
-
-  chrono::duration<double> diff = end - start;
-
-  auto sec = diff.count();
-
-  auto perf = static_cast<double>((sizeElements* NUM_ITER * (double)(1e-09)) / sec);
-
-  cout << " hipPerf2DMemset" << (async ? "Async" : "     ") << "[" << test << "] "
-       << "  " << "(GB/s) for " << setw(5) << bufSize_
-       << " x " << setw(5) << bufSize_ << " bytes : " << setw(7) << perf <<  endl;
-
-  HIPCHECK(hipStreamDestroy(stream));
-  HIPCHECK(hipFree(A_d));
-  free(A_h);
-}
-
-template<typename T>
-void hipPerfMemset::run3D(unsigned int test, T memsetval, enum MemsetType type, bool async) {
-
-    bufSize_ = sizeList[test % num_sizes_];
-
-    size_t numH = bufSize_;
-    size_t numW = bufSize_;
-    size_t depth = 10;
-    size_t width = numW * sizeof(char);
-    size_t sizeElements = width * numH * depth;
-    size_t elements = numW* numH* depth;
-
-    hipStream_t stream;
-    HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-
-    T *A_h;
-
-    hipExtent extent = make_hipExtent(width, numH, depth);
-    hipPitchedPtr devPitchedPtr;
-
-    HIPCHECK(hipMalloc3D(&devPitchedPtr, extent));
-    A_h = (char*)malloc(sizeElements);
-    HIPASSERT(A_h != NULL);
-
-    for (size_t i=0; i<elements; i++) {
-        A_h[i] = 1;
-    }
-
-  // Warm-up
-  if (async) {
-    HIPCHECK(hipMemset3DAsync( devPitchedPtr, memsetval, extent, stream));
-    HIPCHECK(hipStreamSynchronize(stream));
-  } else {
-    HIPCHECK(hipMemset3D( devPitchedPtr, memsetval, extent));
-    HIPCHECK(hipDeviceSynchronize());
-  }
-   auto start = chrono::steady_clock::now();
-
-   for (uint i = 0; i < NUM_ITER; i++) {
-     if (type == hipMemsetTypeDefault && !async) {
-       HIPCHECK(hipMemset3D( devPitchedPtr, memsetval, extent));
-     }
-     else if (type == hipMemsetTypeDefault && async) {
-       HIPCHECK(hipMemset3DAsync(devPitchedPtr, memsetval, extent, stream));
-     }
-   }
-
-  if (async) {
-    HIPCHECK(hipStreamSynchronize(stream));
-  } else {
-    HIPCHECK(hipDeviceSynchronize());
-  }
-
-  auto end = chrono::steady_clock::now();
-
-  hipMemcpy3DParms myparms = {0};
-  myparms.srcPos = make_hipPos(0,0,0);
-  myparms.dstPos = make_hipPos(0,0,0);
-  myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
-  myparms.srcPtr = devPitchedPtr;
-  myparms.extent = extent;
-
-  myparms.kind = hipMemcpyDeviceToHost;
-
-  HIPCHECK(hipMemcpy3D(&myparms));
-
-  for (int i=0; i<elements; i++) {
-    if (A_h[i] != memsetval) {
-      cout << "mismatch at index " << i << " computed: " << static_cast<int> (A_h[i])
-           << ", memsetval: " << static_cast<int> (memsetval) << endl;
-      break;
-      }
-  }
-
-  chrono::duration<double> diff = end - start;
-
-  auto sec = diff.count();
-
-  auto perf = static_cast<double>((sizeElements * NUM_ITER * (double)(1e-09)) / sec);
-
-  cout << " hipPerf3DMemset" << (async ? "Async" : "     ") << "[" << test << "] " << "  "
-       <<  "(GB/s) for " << setw(5) << bufSize_ << " x " << setw(5)
-       << bufSize_  << " x " << depth << " bytes : " << setw(7) << perf <<  endl;
-  HIPCHECK(hipFree(devPitchedPtr.ptr));
-  free(A_h);
-}
-
-int main() {
-  hipPerfMemset hipPerfMemset;
-
-  dataType pattern;
-  int deviceId = 0;
-  hipPerfMemset.open(deviceId);
-  MemsetType type;
-
-  int numTests = hipPerfMemset.getNumTests();
-  int numTests2D = hipPerfMemset.getNumTests2D();
-  int numTests3D = hipPerfMemset.getNumTests3D();
-
-
-  cout << "--------------------- 1D buffer -------------------" << endl;
-  bool async= false;
-  for (uint i = 0; i < 2 ; i++) {
-    cout << endl;
-
-    for (auto testCase = 0; testCase < numTests; testCase++) {
-      if (testCase < sizeof(eleNumList) / sizeof(uint32_t)) {
-        cout << "API: hipMemsetD8" << (async ? "Async " : "      ");
-        hipPerfMemset.run1D(testCase, pattern.memsetval, hipMemsetTypeD8, async);
-      }
-
-      else if (testCase < 2 * sizeof(eleNumList) / sizeof(uint32_t)) {
-        cout << "API: hipMemsetD16" << (async ? "Async" : "     ");
-        hipPerfMemset.run1D(testCase,pattern.memsetD16val, hipMemsetTypeD16, async);
-      }
-
-      else if (testCase < 3 * sizeof(eleNumList) / sizeof(uint32_t)) {
-        cout << "API: hipMemsetD32" << (async ? "Async" : "     ");
-        hipPerfMemset.run1D(testCase,pattern.memsetD32val, hipMemsetTypeD32, async);
-      }
-
-      else {
-        cout << "API: hipMemset" << (async ? "Async   " : "        ");
-        hipPerfMemset.run1D(testCase,pattern.memsetval, hipMemsetTypeDefault, async);
-      }
-    }
-    async = true;
-  }
-
-  cout << endl;
-  cout << "------------------ 2D buffer arrays ---------------" << endl;
-
-  async = false;
-  for (uint i = 0; i < 2; i++) {
-    cout << endl;
-    for (uint test = 0; test < numTests2D; test++) {
-      hipPerfMemset.run2D(test, pattern.memsetval, hipMemsetTypeDefault, async);
-    }
-    async = true;
-  }
-
-  cout << endl;
-  cout << "------------------ 3D buffer arrays ---------------" << endl;
-
-  async = false;
-  for (uint i = 0; i < 2; i++) {
-    cout << endl;
-    for (uint test =0; test < numTests3D; test++) {
-      hipPerfMemset.run3D(test, pattern.memsetval, hipMemsetTypeDefault, async);
-    }
-    async = true;
-  }
-
-  passed();
-}
+/*
+ Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/* HIT_START
+ * BUILD: %t %s ../../src/test_common.cpp
+ * TEST: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+#include <iostream>
+#include <chrono>
+
+// Edge lengths consumed by run2D/run3D: the selected entry is used as both the
+// width and the height of the buffer being set.
+static unsigned int sizeList[] = {
+  256, 512, 1024, 2048, 4096, 8192,
+};
+
+// Element counts consumed by run1D: the buffer size in bytes is the selected
+// entry multiplied by sizeof(uint32_t).
+static unsigned int eleNumList[] = {
+    0x100, 0x400, 0x1000, 0x4000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000,
+    0x200000, 0x400000, 0x800000, 0x1000000
+};
+
+// Fill patterns handed to the memset runs, one member per element width.
+// The hexadecimal literals do not all fit the signed member types, so the
+// conversions are spelled out explicitly.
+typedef struct _dataType {
+  char    memsetval    = static_cast<char>(0x42);
+  char    memsetD8val  = static_cast<char>(0xDE);
+  int16_t memsetD16val = static_cast<int16_t>(0xDEAD);
+  int     memsetD32val = static_cast<int>(0xDEADBEEF);
+} dataType;
+
+#define NUM_ITER 1000
+
+// Selects which hipMemset* entry point a run exercises. hipMemsetTypeMax is
+// not a selectable type: it equals the number of enumerators before it and is
+// used as a multiplier when the 1D sub-test count is computed.
+enum MemsetType {
+  hipMemsetTypeDefault = 0,
+  hipMemsetTypeD8      = 1,
+  hipMemsetTypeD16     = 2,
+  hipMemsetTypeD32     = 3,
+  hipMemsetTypeMax     = 4
+};
+
+using namespace std;
+
+// Driver for the hipMemset performance runs. The constructor derives the
+// sub-test counts from eleNumList and sizeList; run1D/run2D/run3D each time one
+// buffer configuration and print the measured throughput.
+class hipPerfMemset {
+  private:
+    // Every member has an in-class initializer so that no run ever reads an
+    // indeterminate value.
+    uint64_t     bufSize_ = 0;        // byte size (1D) or edge length (2D/3D) of the current run
+    unsigned int num_elements_ = 0;   // number of entries in eleNumList
+    unsigned int testNumEle_ = 0;     // element count selected for the current 1D run
+    unsigned int _numSubTests = 0;    // 1D sub-tests: num_elements_ * hipMemsetTypeMax
+    unsigned int _numSubTests2D = 0;  // 2D sub-tests: one per sizeList entry
+    unsigned int _numSubTests3D = 0;  // 3D sub-tests: same count as 2D
+    unsigned int num_sizes_ = 0;      // number of entries in sizeList
+
+  public:
+    hipPerfMemset() {
+      num_elements_ = sizeof(eleNumList) / sizeof(unsigned int);
+      _numSubTests = num_elements_ * hipMemsetTypeMax;
+
+      num_sizes_ = sizeof(sizeList) / sizeof(unsigned int);
+      _numSubTests2D = num_sizes_;
+      _numSubTests3D = _numSubTests2D;
+    }
+
+    ~hipPerfMemset() {}
+
+    // Selects the device and prints a short description of it.
+    void open(int deviceID);
+
+    // Times a 1D memset; `test` indexes eleNumList (modulo its length).
+    template<typename T>
+    void run1D(unsigned int test, T memsetval, enum MemsetType type, bool async);
+
+    // Times a 2D memset; `test` indexes sizeList (modulo its length).
+    template<typename T>
+    void run2D(unsigned int test, T memsetval, enum MemsetType type, bool async);
+
+    // Times a 3D memset; `test` indexes sizeList (modulo its length).
+    template<typename T>
+    void run3D(unsigned int test, T memsetval, enum MemsetType type, bool async);
+
+    // Number of 1D sub-tests.
+    uint getNumTests() const {
+      return _numSubTests;
+    }
+
+    // Number of 2D sub-tests.
+    uint getNumTests2D() const {
+      return _numSubTests2D;
+    }
+
+    // Number of 3D sub-tests.
+    uint getNumTests3D() const {
+      return _numSubTests3D;
+    }
+};
+
+
+// Makes `deviceId` the current device and prints its PCI bus id, name and
+// compute-unit count. Calls failed() when no device is reported.
+void hipPerfMemset::open(int deviceId) {
+  int nGpu = 0;
+  HIPCHECK(hipGetDeviceCount(&nGpu));
+  if (nGpu < 1) {
+    failed("No GPU!");
+  }
+
+  HIPCHECK(hipSetDevice(deviceId));
+  hipDeviceProp_t props = {0};
+  HIPCHECK(hipGetDeviceProperties(&props, deviceId));
+  // The bus id is announced with a "0x" prefix, so it has to be written in
+  // hexadecimal; switch back to decimal for the remaining numbers.
+  std::cout << "info: running on bus " << "0x" << std::hex << props.pciBusID << std::dec
+            << " " << props.name
+            << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
+            << std::endl;
+}
+
+/**
+ * Times NUM_ITER consecutive 1D memsets of one device buffer, checks the
+ * result on the host and prints the throughput in GB/s.
+ *
+ * @param test      Sub-test index; the element count is
+ *                  eleNumList[test % num_elements_].
+ * @param memsetval Fill value. Its type T is also the element type used when
+ *                  the buffer is verified.
+ * @param type      Which hipMemset* entry point is timed.
+ * @param async     true to time the *Async variant on a dedicated stream.
+ */
+template<typename T>
+void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type, bool async) {
+
+  T * A_h;
+  T * A_d;
+
+  testNumEle_ = eleNumList[test % num_elements_];
+
+  // The byte size is always derived from a 4-byte element, whatever T is.
+  bufSize_ = testNumEle_ * sizeof(uint32_t);
+
+  HIPCHECK(hipMalloc(&A_d, bufSize_));
+
+  A_h = reinterpret_cast<T*> (malloc(bufSize_));
+  // Same host-allocation check that run3D performs.
+  HIPASSERT(A_h != NULL);
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+
+  // Warm-up
+  if (async) {
+    HIPCHECK(hipMemsetAsync((void *)A_d, memsetval, bufSize_, stream));
+    HIPCHECK(hipStreamSynchronize(stream));
+  } else {
+    HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_));
+    HIPCHECK(hipDeviceSynchronize());
+  }
+
+  auto start = chrono::high_resolution_clock::now();
+  for (uint i = 0; i < NUM_ITER; i++) {
+    if (type == hipMemsetTypeDefault && !async) {
+      HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_));
+    }
+    else if (type == hipMemsetTypeDefault && async) {
+      HIPCHECK(hipMemsetAsync(A_d, memsetval, bufSize_, stream));
+    }
+    else if (type == hipMemsetTypeD8 && !async){
+      HIPCHECK(hipMemsetD8((hipDeviceptr_t)A_d, memsetval, bufSize_));
+    }
+    else if (type == hipMemsetTypeD8 && async) {
+      HIPCHECK(hipMemsetD8Async((hipDeviceptr_t)A_d, memsetval, bufSize_, stream));
+    }
+    // The D16/D32 entry points take an element count, not a byte count.
+    else if (type == hipMemsetTypeD16 && !async) {
+      HIPCHECK(hipMemsetD16((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T)));
+    }
+    else if (type == hipMemsetTypeD16 && async) {
+      HIPCHECK(hipMemsetD16Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream));
+    }
+    else if (type == hipMemsetTypeD32 && !async) {
+      HIPCHECK(hipMemsetD32((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T)));
+    }
+    else if (type == hipMemsetTypeD32 && async) {
+      HIPCHECK(hipMemsetD32Async((hipDeviceptr_t)A_d, memsetval, bufSize_/sizeof(T), stream));
+    }
+  }
+  // Wait for all queued work so the measured interval covers its completion.
+  if (async) {
+    HIPCHECK(hipStreamSynchronize(stream));
+  } else {
+    HIPCHECK(hipDeviceSynchronize());
+  }
+
+  auto end = chrono::high_resolution_clock::now();
+
+  HIPCHECK(hipMemcpy(A_h, A_d, bufSize_, hipMemcpyDeviceToHost) );
+
+  // Report only the first mismatching element. The index is unsigned to match
+  // the unsigned bound.
+  const size_t numElements = bufSize_ / sizeof(T);
+  for (size_t i = 0; i < numElements; i++) {
+    if (A_h[i] != memsetval) {
+      cout << "mismatch at index " << i << " computed: " << static_cast<int> (A_h[i])
+           << ", memsetval: " << static_cast<int> (memsetval) << endl;
+      break;
+    }
+  }
+
+  // Release the stream as well; it was previously left alive after every call.
+  HIPCHECK(hipStreamDestroy(stream));
+  HIPCHECK(hipFree(A_d));
+  free(A_h);
+
+  auto diff = std::chrono::duration<double>(end - start);
+  auto sec = diff.count();
+
+  auto perf = static_cast<double>((bufSize_ * NUM_ITER * (double)(1e-09)) / sec);
+
+  cout <<  "[" << setw(2) << test << "] " << setw(5) << bufSize_/1024 << " Kb " << setw(4)
+       << " typeSize " << static_cast<int>(sizeof(T)) << " : " << setw(7) << perf << " GB/s " << endl;
+}
+
+/**
+ * Times NUM_ITER consecutive 2D memsets of a square, pitched device buffer,
+ * checks the result on the host and prints the throughput in GB/s.
+ *
+ * @param test      Sub-test index; the edge length is
+ *                  sizeList[test % num_sizes_].
+ * @param memsetval Fill value. The host buffer is obtained through a char*
+ *                  cast, so this only compiles for T = char.
+ * @param type      Only hipMemsetTypeDefault issues work in the timed loop.
+ * @param async     true to time hipMemset2DAsync on a dedicated stream.
+ */
+template<typename T>
+void hipPerfMemset::run2D(unsigned int test, T memsetval, enum MemsetType type, bool async) {
+  bufSize_ = sizeList[test % num_sizes_];
+
+  // Square buffer: bufSize_ rows of bufSize_ one-byte elements.
+  size_t rows = bufSize_;
+  size_t cols = bufSize_;
+  size_t devPitch;
+  size_t rowBytes = cols * sizeof(char);
+  size_t totalBytes = rowBytes * rows;
+  size_t count = cols * rows;
+
+  T * hostBuf;
+  T * devBuf;
+
+  HIPCHECK(hipMallocPitch(reinterpret_cast<void**>(&devBuf), &devPitch, rowBytes, rows));
+  hostBuf = reinterpret_cast<char*>(malloc(totalBytes));
+
+  // Pre-fill the host copy with 1 before anything is copied back into it.
+  for (size_t idx = 0; idx < count; ++idx) {
+    hostBuf[idx] = 1;
+  }
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+
+  // Warm-up
+  if (async) {
+    HIPCHECK(hipMemset2DAsync(devBuf, devPitch, memsetval, cols, rows, stream));
+    HIPCHECK(hipStreamSynchronize(stream));
+  } else {
+    HIPCHECK(hipMemset2D(devBuf, devPitch, memsetval, cols, rows));
+    HIPCHECK(hipDeviceSynchronize());
+  }
+
+  auto start = chrono::steady_clock::now();
+
+  for (uint iter = 0; iter < NUM_ITER; ++iter) {
+    // Any type other than the default one issues no work at all.
+    if (type != hipMemsetTypeDefault) {
+      continue;
+    }
+    if (async) {
+      HIPCHECK(hipMemset2DAsync(devBuf, devPitch, memsetval, cols, rows, stream));
+    } else {
+      HIPCHECK(hipMemset2D(devBuf, devPitch, memsetval, cols, rows));
+    }
+  }
+
+  // Wait for all queued work so the measured interval covers its completion.
+  if (async) {
+    HIPCHECK(hipStreamSynchronize(stream));
+  } else {
+    HIPCHECK(hipDeviceSynchronize());
+  }
+
+  auto end = chrono::steady_clock::now();
+
+  HIPCHECK(hipMemcpy2D(hostBuf, rowBytes, devBuf, devPitch, cols, rows,
+                       hipMemcpyDeviceToHost));
+
+  // Report only the first mismatching element.
+  for (size_t idx = 0; idx < count; ++idx) {
+    if (hostBuf[idx] != memsetval) {
+      cout << "mismatch at index " << idx << " computed: " << static_cast<int> (hostBuf[idx])
+           << ", memsetval: " << static_cast<int> (memsetval) << endl;
+      break;
+    }
+  }
+
+  chrono::duration<double> elapsed = end - start;
+  auto sec = elapsed.count();
+  auto perf = static_cast<double>((totalBytes * NUM_ITER * 1e-09) / sec);
+
+  cout << " hipPerf2DMemset" << (async ? "Async" : "     ") << "[" << test << "] "
+       << "  " << "(GB/s) for " << setw(5) << bufSize_
+       << " x " << setw(5) << bufSize_ << " bytes : " << setw(7) << perf <<  endl;
+
+  HIPCHECK(hipStreamDestroy(stream));
+  HIPCHECK(hipFree(devBuf));
+  free(hostBuf);
+}
+
+/**
+ * Times NUM_ITER consecutive 3D memsets of a pitched device allocation,
+ * checks the result on the host and prints the throughput in GB/s.
+ *
+ * @param test      Sub-test index; the edge length is
+ *                  sizeList[test % num_sizes_]. The depth is fixed at 10.
+ * @param memsetval Fill value. The host buffer is obtained through a char*
+ *                  cast, so this only compiles for T = char.
+ * @param type      Only hipMemsetTypeDefault issues work in the timed loop.
+ * @param async     true to time hipMemset3DAsync on a dedicated stream.
+ */
+template<typename T>
+void hipPerfMemset::run3D(unsigned int test, T memsetval, enum MemsetType type, bool async) {
+
+  bufSize_ = sizeList[test % num_sizes_];
+
+  size_t numH = bufSize_;
+  size_t numW = bufSize_;
+  size_t depth = 10;
+  size_t width = numW * sizeof(char);
+  size_t sizeElements = width * numH * depth;
+  size_t elements = numW * numH * depth;
+
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+
+  T *A_h;
+
+  hipExtent extent = make_hipExtent(width, numH, depth);
+  hipPitchedPtr devPitchedPtr;
+
+  HIPCHECK(hipMalloc3D(&devPitchedPtr, extent));
+  A_h = static_cast<char*>(malloc(sizeElements));
+  HIPASSERT(A_h != NULL);
+
+  // Pre-fill the host copy with 1 before anything is copied back into it.
+  for (size_t i = 0; i < elements; i++) {
+    A_h[i] = 1;
+  }
+
+  // Warm-up
+  if (async) {
+    HIPCHECK(hipMemset3DAsync(devPitchedPtr, memsetval, extent, stream));
+    HIPCHECK(hipStreamSynchronize(stream));
+  } else {
+    HIPCHECK(hipMemset3D(devPitchedPtr, memsetval, extent));
+    HIPCHECK(hipDeviceSynchronize());
+  }
+
+  auto start = chrono::steady_clock::now();
+
+  for (uint i = 0; i < NUM_ITER; i++) {
+    if (type == hipMemsetTypeDefault && !async) {
+      HIPCHECK(hipMemset3D(devPitchedPtr, memsetval, extent));
+    }
+    else if (type == hipMemsetTypeDefault && async) {
+      HIPCHECK(hipMemset3DAsync(devPitchedPtr, memsetval, extent, stream));
+    }
+  }
+
+  // Wait for all queued work so the measured interval covers its completion.
+  if (async) {
+    HIPCHECK(hipStreamSynchronize(stream));
+  } else {
+    HIPCHECK(hipDeviceSynchronize());
+  }
+
+  auto end = chrono::steady_clock::now();
+
+  // Copy the whole extent back into the tightly packed host buffer.
+  hipMemcpy3DParms myparms = {0};
+  myparms.srcPos = make_hipPos(0,0,0);
+  myparms.dstPos = make_hipPos(0,0,0);
+  myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
+  myparms.srcPtr = devPitchedPtr;
+  myparms.extent = extent;
+
+  myparms.kind = hipMemcpyDeviceToHost;
+
+  HIPCHECK(hipMemcpy3D(&myparms));
+
+  // Report only the first mismatching element.
+  for (size_t i = 0; i < elements; i++) {
+    if (A_h[i] != memsetval) {
+      cout << "mismatch at index " << i << " computed: " << static_cast<int> (A_h[i])
+           << ", memsetval: " << static_cast<int> (memsetval) << endl;
+      break;
+    }
+  }
+
+  chrono::duration<double> diff = end - start;
+
+  auto sec = diff.count();
+
+  auto perf = static_cast<double>((sizeElements * NUM_ITER * (double)(1e-09)) / sec);
+
+  cout << " hipPerf3DMemset" << (async ? "Async" : "     ") << "[" << test << "] " << "  "
+       <<  "(GB/s) for " << setw(5) << bufSize_ << " x " << setw(5)
+       << bufSize_  << " x " << depth << " bytes : " << setw(7) << perf <<  endl;
+
+  // Release the stream as well; it was previously left alive after every call.
+  HIPCHECK(hipStreamDestroy(stream));
+  HIPCHECK(hipFree(devPitchedPtr.ptr));
+  free(A_h);
+}
+
+// Runs every 1D, 2D and 3D sub-test twice: first with the blocking API, then
+// with the *Async API.
+int main() {
+  hipPerfMemset perfTest;
+
+  dataType pattern;
+  int deviceId = 0;
+  perfTest.open(deviceId);
+
+  const unsigned int numTests = perfTest.getNumTests();
+  const unsigned int numTests2D = perfTest.getNumTests2D();
+  const unsigned int numTests3D = perfTest.getNumTests3D();
+
+  // The 1D sub-tests are laid out as consecutive groups of this many entries,
+  // in the order D8, D16, D32, default.
+  const unsigned int groupSize = sizeof(eleNumList) / sizeof(uint32_t);
+
+  cout << "--------------------- 1D buffer -------------------" << endl;
+  for (unsigned int pass = 0; pass < 2; pass++) {
+    const bool async = (pass != 0);
+    cout << endl;
+
+    for (unsigned int testCase = 0; testCase < numTests; testCase++) {
+      if (testCase < groupSize) {
+        cout << "API: hipMemsetD8" << (async ? "Async " : "      ");
+        perfTest.run1D(testCase, pattern.memsetval, hipMemsetTypeD8, async);
+      }
+
+      else if (testCase < 2 * groupSize) {
+        cout << "API: hipMemsetD16" << (async ? "Async" : "     ");
+        perfTest.run1D(testCase, pattern.memsetD16val, hipMemsetTypeD16, async);
+      }
+
+      else if (testCase < 3 * groupSize) {
+        cout << "API: hipMemsetD32" << (async ? "Async" : "     ");
+        perfTest.run1D(testCase, pattern.memsetD32val, hipMemsetTypeD32, async);
+      }
+
+      else {
+        cout << "API: hipMemset" << (async ? "Async   " : "        ");
+        perfTest.run1D(testCase, pattern.memsetval, hipMemsetTypeDefault, async);
+      }
+    }
+  }
+
+  cout << endl;
+  cout << "------------------ 2D buffer arrays ---------------" << endl;
+
+  for (unsigned int pass = 0; pass < 2; pass++) {
+    const bool async = (pass != 0);
+    cout << endl;
+    for (unsigned int test = 0; test < numTests2D; test++) {
+      perfTest.run2D(test, pattern.memsetval, hipMemsetTypeDefault, async);
+    }
+  }
+
+  cout << endl;
+  cout << "------------------ 3D buffer arrays ---------------" << endl;
+
+  for (unsigned int pass = 0; pass < 2; pass++) {
+    const bool async = (pass != 0);
+    cout << endl;
+    for (unsigned int test = 0; test < numTests3D; test++) {
+      perfTest.run3D(test, pattern.memsetval, hipMemsetTypeDefault, async);
+    }
+  }
+
+  passed();
+}
@@ -41,4 +41,4 @@ cmake ../samples

 make package_samples

-## Note: sample 2_Cookbook/22_cmake_hip_lang is current not included in toplevel cmake. To build this sample from toplevel cmake, uncomment Line 43 inside samples/2_Cookbook/CMakeLists.txt. 
+## Note: sample 2_Cookbook/22_cmake_hip_lang is currently not included in the top-level cmake. To build this sample from the top-level cmake, uncomment Line 43 inside samples/2_Cookbook/CMakeLists.txt.