SWDEV-470698 - fix formatting, add format check workflow (#657)

2025-08-20 16:28:06 +02:00
commit f7338717ae
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+RANGE=""
+
+while [[ $# -gt 0 ]]; do
+  echo $1
+  echo $2
+  case "$1" in
+  --range)
+    RANGE="$2"
+    shift 2
+    ;;
+  *)
+    echo "Unknown arg $1" >&2
+    exit 64
+    ;;
+  esac
+done
+
+regex='\.(c|cc|cpp|cxx|h|hh|hpp|hxx)$'
+
+clang_bin="${CLANG_FORMAT:-clang-format}"
+if ! command -v "$clang_bin" >/dev/null 2>&1; then
+  if [[ -x "/c/Program Files/LLVM/bin/clang-format.exe" ]]; then
+    clang_bin="/c/Program Files/LLVM/bin/clang-format.exe"
+  fi
+fi
+
+clang_format_diff="${CLANG_FORMAT_DIFF:-clang-format-diff}"
+if ! command -v "$clang_format_diff" >/dev/null 2>&1; then
+  if [[ -x "/c/Program Files/LLVM/share/clang/clang-format-diff.py" ]]; then
+    clang_format_diff="/c/Program Files/LLVM/share/clang/clang-format-diff.py"
+  fi
+fi
+
+directories=(projects/hip projects/clr projects/hipother projects/hip-tests)
+
+for dir in ${array[*]}; do
+  cd $dir
+  if [[ -n $RANGE ]]; then
+    files=$(git diff --name-only "$RANGE" . | grep -E "$regex" || true)
+  else
+    files=$(git diff --cached --name-only --diff-filter=ACMR . | grep -E "$regex" || true)
+  fi
+  echo "Checking $files"
+  [[ -z $files ]] && exit 0
+
+  for file in $files; do
+    echo "Checking lines of $file"
+
+    if [[ -n $RANGE ]]; then
+      diff_output=$(git diff -U0 "$RANGE" -- "$file")
+    else
+      diff_output=$(git diff -U0 --cached -- "$file")
+    fi
+
+    echo "$diff_output" | "$clang_format_diff" -style=file -fallback-style=none -p1
+  done
+  cd ..
+done
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+exec "$(git rev-parse --show-toplevel)/.github/hooks/clang-format-check.sh"
@@ -0,0 +1,31 @@
+# Runs the repository's clang-format check on the lines changed by a pull request.
+name: Clang format check
+on:
+  pull_request:
+    types: [synchronize, opened]
+    paths:
+      - 'projects/hip/**'
+      - 'projects/clr/**'
+      - 'projects/hipother/**'
+      - 'projects/hip-tests/**'
+
+jobs:
+  format:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # Full history so both ends of the diff range are available locally.
+          fetch-depth: 0
+
+      - name: Install clang-format
+        run: |
+          sudo apt update && sudo apt install -y clang-format
+
+      - name: Run clang-format-check
+        id: clang-format
+        run: |
+          chmod +x .github/hooks/clang-format-check.sh
+          # Three dots: diff from the merge base, so commits that landed on the base
+          # branch after this PR forked are not checked as if the PR had made them.
+          ./.github/hooks/clang-format-check.sh --range "${{ github.event.pull_request.base.sha }}...${{ github.event.pull_request.head.sha }}"
@@ -31,8 +31,8 @@ THE SOFTWARE.

 #ifdef __cplusplus

-extern "C" HIP_PUBLIC_API
-hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);
+extern "C" HIP_PUBLIC_API hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
+                                                                    hipChannelFormatKind f);

 static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
  int e = (int)sizeof(unsigned short) * 8;
@@ -54,295 +54,248 @@ static inline hipChannelFormatDesc hipCreateChannelDescHalf4() {
  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
 }

-template <typename T>
-static inline hipChannelFormatDesc hipCreateChannelDesc() {
+template <typename T> static inline hipChannelFormatDesc hipCreateChannelDesc() {
  return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
  int e = (int)sizeof(char) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
  int e = (int)sizeof(signed char) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
  int e = (int)sizeof(unsigned char) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
  int e = (int)sizeof(unsigned char) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
  int e = (int)sizeof(signed char) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
  int e = (int)sizeof(unsigned char) * 8;
  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
  int e = (int)sizeof(signed char) * 8;
  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
 }

 #ifndef __GNUC__  // vector3 is the same as vector4
-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
  int e = (int)sizeof(unsigned char) * 8;
  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
  int e = (int)sizeof(signed char) * 8;
  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
 }
 #endif

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
  int e = (int)sizeof(unsigned char) * 8;
  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
  int e = (int)sizeof(signed char) * 8;
  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
  int e = (int)sizeof(unsigned short) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
  int e = (int)sizeof(signed short) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
  int e = (int)sizeof(unsigned short) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
  int e = (int)sizeof(signed short) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
  int e = (int)sizeof(unsigned short) * 8;
  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
  int e = (int)sizeof(signed short) * 8;
  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
 }

 #ifndef __GNUC__
-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
  int e = (int)sizeof(unsigned short) * 8;
  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
  int e = (int)sizeof(signed short) * 8;
  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
 }
 #endif

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
  int e = (int)sizeof(unsigned short) * 8;
  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
  int e = (int)sizeof(signed short) * 8;
  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
  int e = (int)sizeof(unsigned int) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
  int e = (int)sizeof(signed int) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
  int e = (int)sizeof(unsigned int) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
  int e = (int)sizeof(signed int) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
  int e = (int)sizeof(unsigned int) * 8;
  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
  int e = (int)sizeof(signed int) * 8;
  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
 }

 #ifndef __GNUC__
-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
  int e = (int)sizeof(unsigned int) * 8;
  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
  int e = (int)sizeof(signed int) * 8;
  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
 }
 #endif

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
  int e = (int)sizeof(unsigned int) * 8;
  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
  int e = (int)sizeof(signed int) * 8;
  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
  int e = (int)sizeof(float) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
  int e = (int)sizeof(float) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
  int e = (int)sizeof(float) * 8;
  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
 }

 #ifndef __GNUC__
-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
  int e = (int)sizeof(float) * 8;
  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindFloat);
 }
 #endif

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
  int e = (int)sizeof(float) * 8;
  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
 }

 #if !defined(__LP64__)

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
  int e = (int)sizeof(unsigned long) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
  int e = (int)sizeof(signed long) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
  int e = (int)sizeof(unsigned long) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
  int e = (int)sizeof(signed long) * 8;
  return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
  int e = (int)sizeof(unsigned long) * 8;
  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
  int e = (int)sizeof(signed long) * 8;
  return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
 }

 #ifndef __GNUC__
-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
  int e = (int)sizeof(unsigned long) * 8;
  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
  int e = (int)sizeof(signed long) * 8;
  return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
 }
 #endif

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
  int e = (int)sizeof(unsigned long) * 8;
  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
 }

-template <>
-inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
  int e = (int)sizeof(signed long) * 8;
  return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
 }
@@ -37,8 +37,7 @@ THE SOFTWARE.
 #if defined(__clang__) && defined(__HIP__)
 extern "C" __device__ int printf(const char* fmt, ...);
 #else
-template <typename... All>
-static inline __device__ void printf(const char* format, All... all) {}
+template <typename... All> static inline __device__ void printf(const char* format, All... all) {}
 #endif

 extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
@@ -55,9 +54,7 @@ __device__ static inline unsigned int __popcll(unsigned long long int input) {
  return __builtin_popcountll(input);
 }

-__device__ static inline int __clz(int input) {
-    return __ockl_clz_u32((uint)input);
-}
+__device__ static inline int __clz(int input) { return __ockl_clz_u32((uint)input); }

 __device__ static inline int __clzll(long long int input) {
  return __ockl_clz_u64((__hip_uint64_t)input);
@@ -80,25 +77,24 @@ __device__ static inline int __ffsll(long long int input) {
 }

 // Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
-// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
-// If not found, return -1.
-__device__  static __hip_int32_t __fns64(__hip_uint64_t mask, __hip_uint32_t base, __hip_int32_t offset) {
+// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit
+// position. If not found, return -1.
+__device__ static __hip_int32_t __fns64(__hip_uint64_t mask, __hip_uint32_t base,
+                                        __hip_int32_t offset) {
  __hip_uint64_t temp_mask = mask;
  __hip_int32_t temp_offset = offset;

  if (offset == 0) {
    temp_mask &= (1 << base);
    temp_offset = 1;
-  }
-  else if (offset < 0) {
+  } else if (offset < 0) {
    temp_mask = __builtin_bitreverse64(mask);
    base = 63 - base;
    temp_offset = -offset;
  }

  temp_mask = temp_mask & ((~0ULL) << base);
-  if (__builtin_popcountll(temp_mask) < temp_offset)
-    return -1;
+  if (__builtin_popcountll(temp_mask) < temp_offset) return -1;
  __hip_int32_t total = 0;
  for (int i = 0x20; i > 0; i >>= 1) {
    __hip_uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
@@ -107,8 +103,7 @@ __device__  static __hip_int32_t __fns64(__hip_uint64_t mask, __hip_uint32_t bas
      temp_mask = temp_mask >> i;
      temp_offset -= pcnt;
      total += i;
-    }
-    else {
+    } else {
      temp_mask = temp_mask_lo;
    }
  }
@@ -118,21 +113,20 @@ __device__  static __hip_int32_t __fns64(__hip_uint64_t mask, __hip_uint32_t bas
    return total;
 }

-__device__ static __hip_int32_t __fns32(__hip_uint64_t mask, __hip_uint32_t base, __hip_int32_t offset) {
+__device__ static __hip_int32_t __fns32(__hip_uint64_t mask, __hip_uint32_t base,
+                                        __hip_int32_t offset) {
  __hip_uint32_t temp_mask = mask;
  __hip_int32_t temp_offset = offset;
  if (offset == 0) {
    temp_mask &= (1 << base);
    temp_offset = 1;
-  }
-  else if (offset < 0) {
+  } else if (offset < 0) {
    temp_mask = __builtin_bitreverse32(mask);
    base = 31 - base;
    temp_offset = -offset;
  }
  temp_mask = temp_mask & ((~0U) << base);
-  if (__builtin_popcount(temp_mask) < temp_offset)
-    return -1;
+  if (__builtin_popcount(temp_mask) < temp_offset) return -1;
  __hip_int32_t total = 0;
  for (int i = 0x10; i > 0; i >>= 1) {
    __hip_uint32_t temp_mask_lo = temp_mask & ((1U << i) - 1);
@@ -141,8 +135,7 @@ __device__ static __hip_int32_t __fns32(__hip_uint64_t mask, __hip_uint32_t base
      temp_mask = temp_mask >> i;
      temp_offset -= pcnt;
      total += i;
-    }
-    else {
+    } else {
      temp_mask = temp_mask_lo;
    }
  }
@@ -169,51 +162,55 @@ __device__ static inline unsigned int __lastbit_u32_u64(__hip_uint64_t input) {
  return input == 0 ? -1 : __builtin_ctzl(input);
 }

-__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
+__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1,
+                                                       unsigned int src2) {
  __hip_uint32_t offset = src1 & 31;
  __hip_uint32_t width = src2 & 31;
  return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
 }

-__device__ static inline __hip_uint64_t __bitextract_u64(__hip_uint64_t src0, unsigned int src1, unsigned int src2) {
+__device__ static inline __hip_uint64_t __bitextract_u64(__hip_uint64_t src0, unsigned int src1,
+                                                         unsigned int src2) {
  __hip_uint64_t offset = src1 & 63;
  __hip_uint64_t width = src2 & 63;
  return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
 }

-__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
+__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1,
+                                                      unsigned int src2, unsigned int src3) {
  __hip_uint32_t offset = src2 & 31;
  __hip_uint32_t width = src3 & 31;
  __hip_uint32_t mask = (1 << width) - 1;
  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
 }

-__device__ static inline __hip_uint64_t __bitinsert_u64(__hip_uint64_t src0, __hip_uint64_t src1, unsigned int src2, unsigned int src3) {
+__device__ static inline __hip_uint64_t __bitinsert_u64(__hip_uint64_t src0, __hip_uint64_t src1,
+                                                        unsigned int src2, unsigned int src3) {
  __hip_uint64_t offset = src2 & 63;
  __hip_uint64_t width = src3 & 63;
  __hip_uint64_t mask = (1ULL << width) - 1;
  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
 }

-__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
-{
+__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi,
+                                               unsigned int shift) {
  __hip_uint32_t mask_shift = shift & 31;
  return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
 }

-__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
-{
+__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi,
+                                                unsigned int shift) {
  __hip_uint32_t min_shift = shift >= 32 ? 32 : shift;
  return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
 }

-__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
-{
+__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi,
+                                               unsigned int shift) {
  return __builtin_amdgcn_alignbit(hi, lo, shift);
 }

-__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
-{
+__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi,
+                                                unsigned int shift) {
  return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
 }

@@ -226,7 +223,8 @@ __device__ static int __rhadd(int x, int y);
 __device__ static unsigned int __sad(int x, int y, unsigned int z);
 __device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
 __device__ static int __umul24(unsigned int x, unsigned int y);
-__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
+__device__ static unsigned long long int __umul64hi(unsigned long long int x,
+                                                    unsigned long long int y);
 __device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
 __device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
 __device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
@@ -245,8 +243,7 @@ struct uchar2Holder {
  };
 } __attribute__((aligned(8)));

-__device__
-static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
+__device__ static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
  struct uchar2Holder cHoldVal;
  struct ucharHolder cHoldKey;
  cHoldKey.ui = s;
@@ -262,9 +259,7 @@ static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned

 __device__ static inline int __hadd(int x, int y) { return ((long long)x + (long long)y) >> 1; }

-__device__ static inline int __mul24(int x, int y) {
-    return __ockl_mul24_i32(x, y);
-}
+__device__ static inline int __mul24(int x, int y) { return __ockl_mul24_i32(x, y); }

 __device__ static inline long long __mul64hi(long long int x, long long int y) {
  unsigned long long x0 = (unsigned long long)x & 0xffffffffUL;
@@ -279,9 +274,7 @@ __device__ static inline long long __mul64hi(long long int x, long long int y) {
  return x1 * y1 + z2 + (z1 >> 32);
 }

-__device__ static inline int __mulhi(int x, int y) {
-    return __ockl_mul_hi_i32(x, y);
-}
+__device__ static inline int __mulhi(int x, int y) { return __ockl_mul_hi_i32(x, y); }

 __device__ static inline int __rhadd(int x, int y) {
  return ((long long)x + (long long)y + 1) >> 1;
@@ -299,8 +292,8 @@ __device__ static inline int __umul24(unsigned int x, unsigned int y) {
  return __ockl_mul24_u32(x, y);
 }

-__device__
-static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
+__device__ static inline unsigned long long __umul64hi(unsigned long long int x,
+                                                       unsigned long long int y) {
  unsigned long long x0 = x & 0xffffffffUL;
  unsigned long long x1 = x >> 32;
  unsigned long long y0 = y & 0xffffffffUL;
@@ -325,11 +318,13 @@ __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, uns
  return __ockl_sadd_u32(x, y, z);
 }

-__device__
-static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};
+__device__ static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {
+  return __builtin_amdgcn_mbcnt_lo(x, y);
+};

-__device__
-static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
+__device__ static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {
+  return __builtin_amdgcn_mbcnt_hi(x, y);
+};

 /*
 HIP specific device functions
@@ -376,16 +371,10 @@ __device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
  return out;
 }

-__device__ static inline float __double2float_rd(double x) {
-    return __ocml_cvtrtn_f32_f64(x);
-}
+__device__ static inline float __double2float_rd(double x) { return __ocml_cvtrtn_f32_f64(x); }
 __device__ static inline float __double2float_rn(double x) { return x; }
-__device__ static inline float __double2float_ru(double x) {
-    return __ocml_cvtrtp_f32_f64(x);
-}
-__device__ static inline float __double2float_rz(double x) {
-    return __ocml_cvtrtz_f32_f64(x);
-}
+__device__ static inline float __double2float_ru(double x) { return __ocml_cvtrtp_f32_f64(x); }
+__device__ static inline float __double2float_rz(double x) { return __ocml_cvtrtz_f32_f64(x); }

 __device__ static inline int __double2hiint(double x) {
  static_assert(sizeof(double) == 2 * sizeof(int), "");
@@ -533,7 +522,8 @@ __device__ static inline unsigned int __float_as_uint(float x) {
 __device__ static inline double __hiloint2double(int hi, int lo) {
  static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(hi) << 32ull) | static_cast<__hip_uint32_t>(lo);
  double tmp1;
  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

@@ -542,16 +532,10 @@ __device__ static inline double __hiloint2double(int hi, int lo) {

 __device__ static inline double __int2double_rn(int x) { return (double)x; }

-__device__ static inline float __int2float_rd(int x) {
-    return __ocml_cvtrtn_f32_s32(x);
-}
+__device__ static inline float __int2float_rd(int x) { return __ocml_cvtrtn_f32_s32(x); }
 __device__ static inline float __int2float_rn(int x) { return (float)x; }
-__device__ static inline float __int2float_ru(int x) {
-    return __ocml_cvtrtp_f32_s32(x);
-}
-__device__ static inline float __int2float_rz(int x) {
-    return __ocml_cvtrtz_f32_s32(x);
-}
+__device__ static inline float __int2float_ru(int x) { return __ocml_cvtrtp_f32_s32(x); }
+__device__ static inline float __int2float_rz(int x) { return __ocml_cvtrtz_f32_s32(x); }

 __device__ static inline float __int_as_float(int x) {
  static_assert(sizeof(float) == sizeof(int), "");
@@ -562,27 +546,15 @@ __device__ static inline float __int_as_float(int x) {
  return tmp;
 }

-__device__ static inline double __ll2double_rd(long long int x) {
-    return __ocml_cvtrtn_f64_s64(x);
-}
+__device__ static inline double __ll2double_rd(long long int x) { return __ocml_cvtrtn_f64_s64(x); }
 __device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
-__device__ static inline double __ll2double_ru(long long int x) {
-    return __ocml_cvtrtp_f64_s64(x);
-}
-__device__ static inline double __ll2double_rz(long long int x) {
-    return __ocml_cvtrtz_f64_s64(x);
-}
+__device__ static inline double __ll2double_ru(long long int x) { return __ocml_cvtrtp_f64_s64(x); }
+__device__ static inline double __ll2double_rz(long long int x) { return __ocml_cvtrtz_f64_s64(x); }

-__device__ static inline float __ll2float_rd(long long int x) {
-    return __ocml_cvtrtn_f32_s64(x);
-}
+__device__ static inline float __ll2float_rd(long long int x) { return __ocml_cvtrtn_f32_s64(x); }
 __device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
-__device__ static inline float __ll2float_ru(long long int x) {
-    return __ocml_cvtrtp_f32_s64(x);
-}
-__device__ static inline float __ll2float_rz(long long int x) {
-    return __ocml_cvtrtz_f32_s64(x);
-}
+__device__ static inline float __ll2float_ru(long long int x) { return __ocml_cvtrtp_f32_s64(x); }
+__device__ static inline float __ll2float_rz(long long int x) { return __ocml_cvtrtz_f32_s64(x); }

 __device__ static inline double __longlong_as_double(long long int x) {
  static_assert(sizeof(double) == sizeof(long long), "");
@@ -595,16 +567,10 @@ __device__ static inline double __longlong_as_double(long long int x) {

 __device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }

-__device__ static inline float __uint2float_rd(unsigned int x) {
-    return __ocml_cvtrtn_f32_u32(x);
-}
+__device__ static inline float __uint2float_rd(unsigned int x) { return __ocml_cvtrtn_f32_u32(x); }
 __device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
-__device__ static inline float __uint2float_ru(unsigned int x) {
-    return __ocml_cvtrtp_f32_u32(x);
-}
-__device__ static inline float __uint2float_rz(unsigned int x) {
-    return __ocml_cvtrtz_f32_u32(x);
-}
+__device__ static inline float __uint2float_ru(unsigned int x) { return __ocml_cvtrtp_f32_u32(x); }
+__device__ static inline float __uint2float_rz(unsigned int x) { return __ocml_cvtrtz_f32_u32(x); }

 __device__ static inline float __uint_as_float(unsigned int x) {
  static_assert(sizeof(float) == sizeof(unsigned int), "");
@@ -653,66 +619,44 @@ __device__ void __named_sync();
 // Clock function to return GPU core cycle count.
 // GPU can change its core clock frequency at runtime. The maximum frequency can be queried
 // through hipDeviceAttributeClockRate attribute.
-__device__
-inline  __attribute((always_inline))
-long long int __clock64() {
+__device__ inline __attribute((always_inline)) long long int __clock64() {
  return (long long int)__builtin_readcyclecounter();
 }

-__device__
-inline __attribute((always_inline))
-long long int  __clock() { return __clock64(); }
+__device__ inline __attribute((always_inline)) long long int __clock() { return __clock64(); }

 // Clock function to return wall clock count at a constant frequency that can be queried
 // through hipDeviceAttributeWallClockRate attribute.
-__device__
-inline  __attribute__((always_inline))
-long long int wall_clock64() {
+__device__ inline __attribute__((always_inline)) long long int wall_clock64() {
  return (long long int)__ockl_steadyctr_u64();
 }

-__device__
-inline  __attribute__((always_inline))
-long long int clock64() { return __clock64(); }
+__device__ inline __attribute__((always_inline)) long long int clock64() { return __clock64(); }

-__device__
-inline __attribute__((always_inline))
-long long int  clock() { return __clock(); }
+__device__ inline __attribute__((always_inline)) long long int clock() { return __clock(); }

 // hip.amdgcn.bc - named sync
-__device__
-inline
-void __named_sync() { __builtin_amdgcn_s_barrier(); }
+__device__ inline void __named_sync() { __builtin_amdgcn_s_barrier(); }

 #endif  // __HIP_DEVICE_COMPILE__

 // hip.amdgcn.bc - lanemask
-__device__
-inline
-__hip_uint64_t  __lanemask_gt()
-{
+__device__ inline __hip_uint64_t __lanemask_gt() {
  __hip_uint32_t lane = __ockl_lane_u32();
-    if (lane == 63)
-      return 0;
+  if (lane == 63) return 0;
  __hip_uint64_t ballot = __ballot64(1);
  __hip_uint64_t mask = (~((__hip_uint64_t)0)) << (lane + 1);
  return mask & ballot;
 }

-__device__
-inline
-__hip_uint64_t __lanemask_lt()
-{
+__device__ inline __hip_uint64_t __lanemask_lt() {
  __hip_uint32_t lane = __ockl_lane_u32();
  __hip_int64_t ballot = __ballot64(1);
  __hip_uint64_t mask = ((__hip_uint64_t)1 << lane) - (__hip_uint64_t)1;
  return mask & ballot;
 }

-__device__
-inline
-__hip_uint64_t  __lanemask_eq()
-{
+__device__ inline __hip_uint64_t __lanemask_eq() {
  __hip_uint32_t lane = __ockl_lane_u32();
  __hip_int64_t mask = ((__hip_uint64_t)1 << lane);
  return mask;
@@ -722,43 +666,24 @@ __hip_uint64_t  __lanemask_eq()
 __device__ inline void* __local_to_generic(void* p) { return p; }

 #ifdef __HIP_DEVICE_COMPILE__
-__device__
-inline
-void* __get_dynamicgroupbaseptr()
-{
+__device__ inline void* __get_dynamicgroupbaseptr() {
  // Get group segment base pointer.
  return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
 }
 #else
-__device__
-void* __get_dynamicgroupbaseptr();
+__device__ void* __get_dynamicgroupbaseptr();
 #endif  // __HIP_DEVICE_COMPILE__

-__device__
-inline
-void *__amdgcn_get_dynamicgroupbaseptr() {
-    return __get_dynamicgroupbaseptr();
-}
+__device__ inline void* __amdgcn_get_dynamicgroupbaseptr() { return __get_dynamicgroupbaseptr(); }

 // Memory Fence Functions
-__device__
-inline
-static void __threadfence()
-{
-    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
-}
+__device__ inline static void __threadfence() { __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); }

-__device__
-inline
-static void __threadfence_block()
-{
+__device__ inline static void __threadfence_block() {
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
 }

-__device__
-inline
-static void __threadfence_system()
-{
+__device__ inline static void __threadfence_system() {
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
 }
 __device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
@@ -771,42 +696,21 @@ __device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
  }
 }

-__device__
-inline
-static void __barrier(int n)
-{
-  __work_group_barrier((__cl_mem_fence_flags)n);
-}
+__device__ inline static void __barrier(int n) { __work_group_barrier((__cl_mem_fence_flags)n); }

-__device__
-inline
-__attribute__((convergent))
-void __syncthreads()
-{
+__device__ inline __attribute__((convergent)) void __syncthreads() {
  __barrier(__CLK_LOCAL_MEM_FENCE);
 }

-__device__
-inline
-__attribute__((convergent))
-int __syncthreads_count(int predicate)
-{
+__device__ inline __attribute__((convergent)) int __syncthreads_count(int predicate) {
  return __ockl_wgred_add_i32(!!predicate);
 }

-__device__
-inline
-__attribute__((convergent))
-int __syncthreads_and(int predicate)
-{
+__device__ inline __attribute__((convergent)) int __syncthreads_and(int predicate) {
  return __ockl_wgred_and_i32(!!predicate);
 }

-__device__
-inline
-__attribute__((convergent))
-int __syncthreads_or(int predicate)
-{
+__device__ inline __attribute__((convergent)) int __syncthreads_or(int predicate) {
  return __ockl_wgred_or_i32(!!predicate);
 }

@@ -857,8 +761,7 @@ int __syncthreads_or(int predicate)
 #define HW_ID_CU_ID_OFFSET 8
 #endif

-#if (defined(__gfx908__) || defined(__gfx90a__) || \
-     defined(__GFX11__))
+#if (defined(__gfx908__) || defined(__gfx90a__) || defined(__GFX11__))
 #define HW_ID_SE_ID_SIZE 3
 #else  // 4 SEs/XCC for 942
 #define HW_ID_SE_ID_SIZE 2
@@ -897,28 +800,25 @@ int __syncthreads_or(int predicate)
  Note: the results vary over time.
  SZ minus 1 since SIZE is 1-based.
 */
-__device__
-inline
-unsigned __smid(void)
-{
-    unsigned se_id = __builtin_amdgcn_s_getreg(
-            GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
+__device__ inline unsigned __smid(void) {
+  unsigned se_id =
+      __builtin_amdgcn_s_getreg(GETREG_IMMED(HW_ID_SE_ID_SIZE - 1, HW_ID_SE_ID_OFFSET, HW_ID));
 #if (defined(__GFX10__) || defined(__GFX11__))
-      unsigned wgp_id = __builtin_amdgcn_s_getreg(
-            GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
-      unsigned sa_id = __builtin_amdgcn_s_getreg(
-            GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
+  unsigned wgp_id =
+      __builtin_amdgcn_s_getreg(GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
+  unsigned sa_id =
+      __builtin_amdgcn_s_getreg(GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
 #if (defined(__AMDGCN_CUMODE__))
-        unsigned cu_id = __builtin_amdgcn_s_getreg(
-            GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
+  unsigned cu_id =
+      __builtin_amdgcn_s_getreg(GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
 #endif
 #else
 #if defined(__gfx94plus_clr__)
-      unsigned xcc_id = __builtin_amdgcn_s_getreg(
-            GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
+  unsigned xcc_id =
+      __builtin_amdgcn_s_getreg(GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
 #endif
-      unsigned cu_id = __builtin_amdgcn_s_getreg(
-            GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
+  unsigned cu_id =
+      __builtin_amdgcn_s_getreg(GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
 #endif
 #if (defined(__GFX10__) || defined(__GFX11__))
  unsigned temp = se_id;
@@ -28,8 +28,12 @@ THE SOFTWARE.

 template <bool B, typename T, typename F> struct Cond_t;

-template<typename T, typename F> struct Cond_t<true, T, F> { using type = T; };
-template<typename T, typename F> struct Cond_t<false, T, F> { using type = F; };
+template <typename T, typename F> struct Cond_t<true, T, F> {
+  using type = T;
+};
+template <typename T, typename F> struct Cond_t<false, T, F> {
+  using type = F;
+};

 #if !__HIP_DEVICE_COMPILE__
 // TODO: Remove this after compiler pre-defines the following Macros.
@@ -45,26 +49,17 @@ template<typename T, typename F> struct Cond_t<false, T, F> { using type = F; };
 #endif

 // Atomic expanders
-template<
-  int mem_order = __ATOMIC_SEQ_CST,
-  int mem_scope= __HIP_MEMORY_SCOPE_SYSTEM,
-  typename T,
-  typename Op,
-  typename F>
-inline
-__attribute__((always_inline, device))
-T hip_cas_expander(T* p, T x, Op op, F f) noexcept
-{
+template <int mem_order = __ATOMIC_SEQ_CST, int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM, typename T,
+          typename Op, typename F>
+inline __attribute__((always_inline, device)) T hip_cas_expander(T* p, T x, Op op, F f) noexcept {
  using FP = __attribute__((address_space(0))) const void*;

-  __device__
-  extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
+  __device__ extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");

-  if (is_shared_workaround((FP)p))
-    return f();
+  if (is_shared_workaround((FP)p)) return f();

-  using U = typename Cond_t<
-    sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
+  using U =
+      typename Cond_t<sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;

  auto q = reinterpret_cast<U*>(p);

@@ -74,204 +69,158 @@ T hip_cas_expander(T* p, T x, Op op, F f) noexcept
    tmp1 = tmp0;

    op(reinterpret_cast<T&>(tmp1), x);
-  } while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order,
-                                                 mem_order, mem_scope));
+  } while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order, mem_order, mem_scope));

  return reinterpret_cast<const T&>(tmp0);
 }

-template<
-  int mem_order = __ATOMIC_SEQ_CST,
-  int mem_scope= __HIP_MEMORY_SCOPE_SYSTEM,
-  typename T,
-  typename Cmp,
-  typename F>
-inline
-__attribute__((always_inline, device))
-T hip_cas_extrema_expander(T* p, T x, Cmp cmp, F f) noexcept
-{
+template <int mem_order = __ATOMIC_SEQ_CST, int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM, typename T,
+          typename Cmp, typename F>
+inline __attribute__((always_inline, device)) T hip_cas_extrema_expander(T* p, T x, Cmp cmp,
+                                                                         F f) noexcept {
  using FP = __attribute__((address_space(0))) const void*;

-  __device__
-  extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
+  __device__ extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");

-  if (is_shared_workaround((FP)p))
-    return f();
+  if (is_shared_workaround((FP)p)) return f();

-  using U = typename Cond_t<
-    sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
+  using U =
+      typename Cond_t<sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;

  auto q = reinterpret_cast<U*>(p);

  U tmp{__hip_atomic_load(q, mem_order, mem_scope)};
  while (cmp(x, reinterpret_cast<const T&>(tmp)) &&
-         !__hip_atomic_compare_exchange_strong(q, &tmp, x, mem_order, mem_order,
-                                               mem_scope));
+         !__hip_atomic_compare_exchange_strong(q, &tmp, x, mem_order, mem_order, mem_scope));

  return reinterpret_cast<const T&>(tmp);
 }

-__device__
-inline
-unsigned short int atomicCAS(unsigned short int* address, unsigned short int compare,
-                              unsigned short int val) {
+__device__ inline unsigned short int atomicCAS(unsigned short int* address,
+                                               unsigned short int compare, unsigned short int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
 }

-__device__
-inline
-unsigned short int atomicCAS_system(unsigned short int* address, unsigned short int compare,
+__device__ inline unsigned short int atomicCAS_system(unsigned short int* address,
+                                                      unsigned short int compare,
                                                      unsigned short int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
 }

-__device__
-inline
-int atomicCAS(int* address, int compare, int val) {
+__device__ inline int atomicCAS(int* address, int compare, int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
 }

-__device__
-inline
-int atomicCAS_system(int* address, int compare, int val) {
+__device__ inline int atomicCAS_system(int* address, int compare, int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
 }

-__device__
-inline
-unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {
+__device__ inline unsigned int atomicCAS(unsigned int* address, unsigned int compare,
+                                         unsigned int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
 }

-__device__
-inline
-unsigned int atomicCAS_system(unsigned int* address, unsigned int compare, unsigned int val) {
+__device__ inline unsigned int atomicCAS_system(unsigned int* address, unsigned int compare,
+                                                unsigned int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
 }

-__device__
-inline
-unsigned long atomicCAS(unsigned long* address, unsigned long compare, unsigned long val) {
+__device__ inline unsigned long atomicCAS(unsigned long* address, unsigned long compare,
+                                          unsigned long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
 }

-__device__
-inline
-unsigned long atomicCAS_system(unsigned long* address, unsigned long compare, unsigned long val) {
+__device__ inline unsigned long atomicCAS_system(unsigned long* address, unsigned long compare,
+                                                 unsigned long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
 }

-__device__
-inline
-unsigned long long atomicCAS(unsigned long long* address, unsigned long long compare,
-                             unsigned long long val) {
+__device__ inline unsigned long long atomicCAS(unsigned long long* address,
+                                               unsigned long long compare, unsigned long long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
 }

-__device__
-inline
-unsigned long long atomicCAS_system(unsigned long long* address, unsigned long long compare,
+__device__ inline unsigned long long atomicCAS_system(unsigned long long* address,
+                                                      unsigned long long compare,
                                                      unsigned long long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
 }

-__device__
-inline
-float atomicCAS(float* address, float compare, float val) {
+__device__ inline float atomicCAS(float* address, float compare, float val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
 }

-__device__
-inline
-float atomicCAS_system(float* address, float compare, float val) {
+__device__ inline float atomicCAS_system(float* address, float compare, float val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
 }

-__device__
-inline
-double atomicCAS(double* address, double compare, double val) {
+__device__ inline double atomicCAS(double* address, double compare, double val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_AGENT);
  return compare;
 }

-__device__
-inline
-double atomicCAS_system(double* address, double compare, double val) {
+__device__ inline double atomicCAS_system(double* address, double compare, double val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
                                       __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
 }

-__device__
-inline
-int atomicAdd(int* address, int val) {
+__device__ inline int atomicAdd(int* address, int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-int atomicAdd_system(int* address, int val) {
+__device__ inline int atomicAdd_system(int* address, int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned int atomicAdd(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicAdd(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned int atomicAdd_system(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicAdd_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long atomicAdd(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicAdd(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long atomicAdd_system(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicAdd_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long long atomicAdd(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicAdd(unsigned long long* address,
+                                               unsigned long long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long long atomicAdd_system(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicAdd_system(unsigned long long* address,
+                                                      unsigned long long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

@@ -281,9 +230,7 @@ unsigned long long atomicAdd_system(unsigned long long* address, unsigned long l
 #define __HIP_FINE_GRAINED_MEMORY
 #endif

-__device__
-inline
-float atomicAdd(float* address, float val) {
+__device__ inline float atomicAdd(float* address, float val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, val);
 #else
@@ -293,25 +240,16 @@ float atomicAdd(float* address, float val) {
 #endif
 }

-__device__
-inline
-float atomicAdd_system(float* address, float val) {
+__device__ inline float atomicAdd_system(float* address, float val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

 #if !defined(__HIPCC_RTC__)
 HIP_DEPRECATED("use atomicAdd instead")
 #endif  // !defined(__HIPCC_RTC__)
-__device__
-inline
-void atomicAddNoRet(float* address, float val)
-{
-  unsafeAtomicAdd(address, val);
-}
+__device__ inline void atomicAddNoRet(float* address, float val) { unsafeAtomicAdd(address, val); }

-__device__
-inline
-double atomicAdd(double* address, double val) {
+__device__ inline double atomicAdd(double* address, double val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, val);
 #else
@@ -321,63 +259,45 @@ double atomicAdd(double* address, double val) {
 #endif
 }

-__device__
-inline
-double atomicAdd_system(double* address, double val) {
+__device__ inline double atomicAdd_system(double* address, double val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-int atomicSub(int* address, int val) {
+__device__ inline int atomicSub(int* address, int val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-int atomicSub_system(int* address, int val) {
+__device__ inline int atomicSub_system(int* address, int val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned int atomicSub(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicSub(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned int atomicSub_system(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicSub_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long atomicSub(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicSub(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long atomicSub_system(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicSub_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long long atomicSub(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicSub(unsigned long long* address,
+                                               unsigned long long val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long long atomicSub_system(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicSub_system(unsigned long long* address,
+                                                      unsigned long long val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-float atomicSub(float* address, float val) {
+__device__ inline float atomicSub(float* address, float val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, -val);
 #else
@@ -387,15 +307,11 @@ float atomicSub(float* address, float val) {
 #endif
 }

-__device__
-inline
-float atomicSub_system(float* address, float val) {
+__device__ inline float atomicSub_system(float* address, float val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-double atomicSub(double* address, double val) {
+__device__ inline double atomicSub(double* address, double val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, -val);
 #else
@@ -405,147 +321,103 @@ double atomicSub(double* address, double val) {
 #endif
 }

-__device__
-inline
-double atomicSub_system(double* address, double val) {
+__device__ inline double atomicSub_system(double* address, double val) {
  return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-int atomicExch(int* address, int val) {
+__device__ inline int atomicExch(int* address, int val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-int atomicExch_system(int* address, int val) {
+__device__ inline int atomicExch_system(int* address, int val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned int atomicExch(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicExch(unsigned int* address, unsigned int val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned int atomicExch_system(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicExch_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long atomicExch(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicExch(unsigned long* address, unsigned long val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long atomicExch_system(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicExch_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long long atomicExch(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicExch(unsigned long long* address,
+                                                unsigned long long val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long long atomicExch_system(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicExch_system(unsigned long long* address,
+                                                       unsigned long long val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-float atomicExch(float* address, float val) {
+__device__ inline float atomicExch(float* address, float val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-float atomicExch_system(float* address, float val) {
+__device__ inline float atomicExch_system(float* address, float val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-double atomicExch(double* address, double val) {
+__device__ inline double atomicExch(double* address, double val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-double atomicExch_system(double* address, double val) {
+__device__ inline double atomicExch_system(double* address, double val) {
  return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-int atomicMin(int* address, int val) {
+__device__ inline int atomicMin(int* address, int val) {
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-int atomicMin_system(int* address, int val) {
+__device__ inline int atomicMin_system(int* address, int val) {
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned int atomicMin(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicMin(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned int atomicMin_system(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicMin_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long atomicMin(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicMin(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long atomicMin_system(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicMin_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long long atomicMin(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicMin(unsigned long long* address,
+                                               unsigned long long val) {
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long long atomicMin_system(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicMin_system(unsigned long long* address,
+                                                      unsigned long long val) {
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-long long atomicMin(long long* address, long long val) {
+__device__ inline long long atomicMin(long long* address, long long val) {
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-long long atomicMin_system(long long* address, long long val) {
+__device__ inline long long atomicMin_system(long long* address, long long val) {
  return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-float atomicMin(float* addr, float val) {
+__device__ inline float atomicMin(float* addr, float val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMin(addr, val);
 #else
@@ -555,9 +427,7 @@ float atomicMin(float* addr, float val) {
 #endif
 }

-__device__
-inline
-float atomicMin_system(float* addr, float val) {
+__device__ inline float atomicMin_system(float* addr, float val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMin(addr, val);
 #else
@@ -567,9 +437,7 @@ float atomicMin_system(float* addr, float val) {
 #endif
 }

-__device__
-inline
-double atomicMin(double* addr, double val) {
+__device__ inline double atomicMin(double* addr, double val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMin(addr, val);
 #else
@@ -579,9 +447,7 @@ double atomicMin(double* addr, double val) {
 #endif
 }

-__device__
-inline
-double atomicMin_system(double* addr, double val) {
+__device__ inline double atomicMin_system(double* addr, double val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMin(addr, val);
 #else
@@ -591,68 +457,48 @@ double atomicMin_system(double* addr, double val) {
 #endif
 }

-__device__
-inline
-int atomicMax(int* address, int val) {
+__device__ inline int atomicMax(int* address, int val) {
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-int atomicMax_system(int* address, int val) {
+__device__ inline int atomicMax_system(int* address, int val) {
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned int atomicMax(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicMax(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned int atomicMax_system(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicMax_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long atomicMax(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicMax(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long atomicMax_system(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicMax_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long long atomicMax(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicMax(unsigned long long* address,
+                                               unsigned long long val) {
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long long atomicMax_system(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicMax_system(unsigned long long* address,
+                                                      unsigned long long val) {
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }
-__device__
-inline
-long long atomicMax(long long* address, long long val) {
+__device__ inline long long atomicMax(long long* address, long long val) {
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-long long atomicMax_system(long long* address, long long val) {
+__device__ inline long long atomicMax_system(long long* address, long long val) {
  return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-float atomicMax(float* addr, float val) {
+__device__ inline float atomicMax(float* addr, float val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMax(addr, val);
 #else
@@ -662,9 +508,7 @@ float atomicMax(float* addr, float val) {
 #endif
 }

-__device__
-inline
-float atomicMax_system(float* addr, float val) {
+__device__ inline float atomicMax_system(float* addr, float val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMax(addr, val);
 #else
@@ -674,9 +518,7 @@ float atomicMax_system(float* addr, float val) {
 #endif
 }

-__device__
-inline
-double atomicMax(double* addr, double val) {
+__device__ inline double atomicMax(double* addr, double val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMax(addr, val);
 #else
@@ -686,9 +528,7 @@ double atomicMax(double* addr, double val) {
 #endif
 }

-__device__
-inline
-double atomicMax_system(double* addr, double val) {
+__device__ inline double atomicMax_system(double* addr, double val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicMax(addr, val);
 #else
@@ -698,160 +538,111 @@ double atomicMax_system(double* addr, double val) {
 #endif
 }

-__device__
-inline
-unsigned int atomicInc(unsigned int* address, unsigned int val)
-{
+__device__ inline unsigned int atomicInc(unsigned int* address, unsigned int val) {
  return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
 }

-__device__
-inline
-unsigned int atomicDec(unsigned int* address, unsigned int val)
-{
+__device__ inline unsigned int atomicDec(unsigned int* address, unsigned int val) {
  return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
 }

-__device__
-inline
-int atomicAnd(int* address, int val) {
+__device__ inline int atomicAnd(int* address, int val) {
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-int atomicAnd_system(int* address, int val) {
+__device__ inline int atomicAnd_system(int* address, int val) {
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned int atomicAnd(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicAnd(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned int atomicAnd_system(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicAnd_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long atomicAnd(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicAnd(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long atomicAnd_system(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicAnd_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long long atomicAnd(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicAnd(unsigned long long* address,
+                                               unsigned long long val) {
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long long atomicAnd_system(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicAnd_system(unsigned long long* address,
+                                                      unsigned long long val) {
  return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-int atomicOr(int* address, int val) {
+__device__ inline int atomicOr(int* address, int val) {
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-int atomicOr_system(int* address, int val) {
+__device__ inline int atomicOr_system(int* address, int val) {
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned int atomicOr(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicOr(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned int atomicOr_system(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicOr_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long atomicOr(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicOr(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long atomicOr_system(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicOr_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long long atomicOr(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicOr(unsigned long long* address, unsigned long long val) {
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long long atomicOr_system(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicOr_system(unsigned long long* address,
+                                                     unsigned long long val) {
  return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-int atomicXor(int* address, int val) {
+__device__ inline int atomicXor(int* address, int val) {
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-int atomicXor_system(int* address, int val) {
+__device__ inline int atomicXor_system(int* address, int val) {
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned int atomicXor(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicXor(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned int atomicXor_system(unsigned int* address, unsigned int val) {
+__device__ inline unsigned int atomicXor_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long atomicXor(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicXor(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long atomicXor_system(unsigned long* address, unsigned long val) {
+__device__ inline unsigned long atomicXor_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }

-__device__
-inline
-unsigned long long atomicXor(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicXor(unsigned long long* address,
+                                               unsigned long long val) {
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
 }

-__device__
-inline
-unsigned long long atomicXor_system(unsigned long long* address, unsigned long long val) {
+__device__ inline unsigned long long atomicXor_system(unsigned long long* address,
+                                                      unsigned long long val) {
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
 }
@@ -623,7 +623,11 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned s
 */
 __BF16_DEVICE_STATIC__
 __hip_bfloat16 __shfl(MAYBE_UNDEF __hip_bfloat16 var, int src_lane, int width = warpSize) {
-    union { int i; __hip_bfloat16 f; } tmp; tmp.f = var;
+  union {
+    int i;
+    __hip_bfloat16 f;
+  } tmp;
+  tmp.f = var;
  tmp.i = __shfl(tmp.i, src_lane, width);
  return tmp.f;
 }
@@ -633,9 +637,13 @@ __hip_bfloat16 __shfl(MAYBE_UNDEF __hip_bfloat16 var, int src_lane, int width =
 * \brief shfl up warp intrinsic for bfloat16
 */
 __BF16_DEVICE_STATIC__
-__hip_bfloat16 __shfl_up(MAYBE_UNDEF __hip_bfloat16 var,
-                         unsigned int lane_delta, int width = warpSize) {
-    union { int i; __hip_bfloat16 f; } tmp; tmp.f = var;
+__hip_bfloat16 __shfl_up(MAYBE_UNDEF __hip_bfloat16 var, unsigned int lane_delta,
+                         int width = warpSize) {
+  union {
+    int i;
+    __hip_bfloat16 f;
+  } tmp;
+  tmp.f = var;
  tmp.i = __shfl_up(tmp.i, lane_delta, width);
  return tmp.f;
 }
@@ -645,9 +653,13 @@ __hip_bfloat16 __shfl_up(MAYBE_UNDEF __hip_bfloat16 var,
 * \brief shfl down warp intrinsic for bfloat16
 */
 __BF16_DEVICE_STATIC__
-__hip_bfloat16 __shfl_down(MAYBE_UNDEF __hip_bfloat16 var,
-                           unsigned int lane_delta, int width = warpSize) {
-    union { int i; __hip_bfloat16 f; } tmp; tmp.f = var;
+__hip_bfloat16 __shfl_down(MAYBE_UNDEF __hip_bfloat16 var, unsigned int lane_delta,
+                           int width = warpSize) {
+  union {
+    int i;
+    __hip_bfloat16 f;
+  } tmp;
+  tmp.f = var;
  tmp.i = __shfl_down(tmp.i, lane_delta, width);
  return tmp.f;
 }
@@ -658,7 +670,11 @@ __hip_bfloat16 __shfl_down(MAYBE_UNDEF __hip_bfloat16 var,
 */
 __BF16_DEVICE_STATIC__
 __hip_bfloat16 __shfl_xor(MAYBE_UNDEF __hip_bfloat16 var, int lane_mask, int width = warpSize) {
-    union { int i; __hip_bfloat16 f; } tmp; tmp.f = var;
+  union {
+    int i;
+    __hip_bfloat16 f;
+  } tmp;
+  tmp.f = var;
  tmp.i = __shfl_xor(tmp.i, lane_mask, width);
  return tmp.f;
 }
@@ -1899,7 +1915,8 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 unsafeAtomicAdd(__hip_bfloat16 *address,
  static_assert(sizeof(unsigned short int) == sizeof(__hip_bfloat16_raw));
  unsigned short int* address_as_short = reinterpret_cast<unsigned short int*>(address);
  // Align to 4 bytes
-  unsigned int* aligned_addr = __builtin_bit_cast(unsigned int*,
+  unsigned int* aligned_addr =
+      __builtin_bit_cast(unsigned int*,
                         __builtin_bit_cast(unsigned long long int, address_as_short) &
                             (unsigned long long int)(~0x3));

@@ -1914,8 +1931,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 unsafeAtomicAdd(__hip_bfloat16 *address,

  __hip_bfloat162* in = (__hip_bfloat162*)(aligned_addr);
  __hip_bfloat162 out = unsafeAtomicAdd(in, fval);
-  if (is_lower)
-    return __low2bfloat16(out);
+  if (is_lower) return __low2bfloat16(out);
  return __high2bfloat16(out);
 }
 #endif  // defined(__clang__) && defined(__HIP__)
@@ -43,8 +43,7 @@

 #include <stdint.h>
 /*! \brief Struct to represent a 16 bit brain floating point number. */
-typedef struct
-{
+typedef struct {
  uint16_t data;
 } hip_bfloat16;

@@ -54,69 +53,52 @@ typedef struct

 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wshadow"
-struct hip_bfloat16
-{
+struct hip_bfloat16 {
  __hip_uint16_t data;

-    enum truncate_t
-    {
-        truncate
-    };
+  enum truncate_t { truncate };

  __HOST_DEVICE__ hip_bfloat16() = default;

  // round upper 16 bits of IEEE float to convert to bfloat16
-    explicit __HOST_DEVICE__ hip_bfloat16(float f)
-        : data(float_to_bfloat16(f))
-    {
-    }
+  explicit __HOST_DEVICE__ hip_bfloat16(float f) : data(float_to_bfloat16(f)) {}

  explicit __HOST_DEVICE__ hip_bfloat16(float f, truncate_t)
-        : data(truncate_float_to_bfloat16(f))
-    {
-    }
+      : data(truncate_float_to_bfloat16(f)) {}

  // zero extend lower 16 bits of bfloat16 to convert to IEEE float
-    __HOST_DEVICE__ operator float() const
-    {
-        union
-        {
+  __HOST_DEVICE__ operator float() const {
+    union {
      __hip_uint32_t int32;
      float fp32;
    } u = {__hip_uint32_t(data) << 16};
    return u.fp32;
  }

-    __HOST_DEVICE__ hip_bfloat16 &operator=(const float& f)
-    {
+  __HOST_DEVICE__ hip_bfloat16& operator=(const float& f) {
    data = float_to_bfloat16(f);
    return *this;
  }

-    static  __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f)
-    {
+  static __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f) {
    hip_bfloat16 output;
    output.data = float_to_bfloat16(f);
    return output;
  }

-    static  __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f, truncate_t)
-    {
+  static __HOST_DEVICE__ hip_bfloat16 round_to_bfloat16(float f, truncate_t) {
    hip_bfloat16 output;
    output.data = truncate_float_to_bfloat16(f);
    return output;
  }

 private:
-    static __HOST_DEVICE__ __hip_uint16_t float_to_bfloat16(float f)
-    {
-        union
-        {
+  static __HOST_DEVICE__ __hip_uint16_t float_to_bfloat16(float f) {
+    union {
      float fp32;
      __hip_uint32_t int32;
    } u = {f};
-        if(~u.int32 & 0x7f800000)
-        {
+    if (~u.int32 & 0x7f800000) {
      // When the exponent bits are not all 1s, then the value is zero, normal,
      // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
      // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
@@ -134,9 +116,7 @@ private:
      // incrementing it causes it to become an exponent of 0xFF and a mantissa
      // of 0x00, which is Inf, the next higher value to the unrounded value.
      u.int32 += 0x7fff + ((u.int32 >> 16) & 1);  // Round to nearest, round to even
-        }
-        else if(u.int32 & 0xffff)
-        {
+    } else if (u.int32 & 0xffff) {
      // When all of the exponent bits are 1, the value is Inf or NaN.
      // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
      // mantissa bit. Quiet NaN is indicated by the most significant mantissa
@@ -151,10 +131,8 @@ private:
  }

  // Truncate instead of rounding, preserving SNaN
-    static __HOST_DEVICE__ __hip_uint16_t truncate_float_to_bfloat16(float f)
-    {
-        union
-        {
+  static __HOST_DEVICE__ __hip_uint16_t truncate_float_to_bfloat16(float f) {
+    union {
      float fp32;
      __hip_uint32_t int32;
    } u = {f};
@@ -163,8 +141,7 @@ private:
 };
 #pragma clang diagnostic pop

-typedef struct
-{
+typedef struct {
  __hip_uint16_t data;
 } hip_bfloat16_public;

@@ -176,117 +153,76 @@ static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
              "hip_bfloat16 is not a trivial type, and thus is "
              "incompatible with C.");
 #if !defined(__HIPCC_RTC__)
-static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public)
-                  && offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data),
+static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public) &&
+                  offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data),
              "internal hip_bfloat16 does not match public hip_bfloat16");

-inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16)
-{
+inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16) {
  return os << float(bf16);
 }
 #endif

-inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a)
-{
-    return a;
-}
-inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a)
-{
+inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a) { return a; }
+inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a) {
  a.data ^= 0x8000;
  return a;
 }
-inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a, hip_bfloat16 b)
-{
+inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a, hip_bfloat16 b) {
  return hip_bfloat16(float(a) + float(b));
 }
-inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a, hip_bfloat16 b)
-{
+inline __HOST_DEVICE__ hip_bfloat16 operator-(hip_bfloat16 a, hip_bfloat16 b) {
  return hip_bfloat16(float(a) - float(b));
 }
-inline __HOST_DEVICE__ hip_bfloat16 operator*(hip_bfloat16 a, hip_bfloat16 b)
-{
+inline __HOST_DEVICE__ hip_bfloat16 operator*(hip_bfloat16 a, hip_bfloat16 b) {
  return hip_bfloat16(float(a) * float(b));
 }
-inline __HOST_DEVICE__ hip_bfloat16 operator/(hip_bfloat16 a, hip_bfloat16 b)
-{
+inline __HOST_DEVICE__ hip_bfloat16 operator/(hip_bfloat16 a, hip_bfloat16 b) {
  return hip_bfloat16(float(a) / float(b));
 }
-inline __HOST_DEVICE__ bool operator<(hip_bfloat16 a, hip_bfloat16 b)
-{
+inline __HOST_DEVICE__ bool operator<(hip_bfloat16 a, hip_bfloat16 b) {
  return float(a) < float(b);
 }
-inline __HOST_DEVICE__ bool operator==(hip_bfloat16 a, hip_bfloat16 b)
-{
+inline __HOST_DEVICE__ bool operator==(hip_bfloat16 a, hip_bfloat16 b) {
  return float(a) == float(b);
 }
-inline __HOST_DEVICE__ bool operator>(hip_bfloat16 a, hip_bfloat16 b)
-{
-    return b < a;
-}
-inline __HOST_DEVICE__ bool operator<=(hip_bfloat16 a, hip_bfloat16 b)
-{
-    return !(a > b);
-}
-inline __HOST_DEVICE__ bool operator!=(hip_bfloat16 a, hip_bfloat16 b)
-{
-    return !(a == b);
-}
-inline __HOST_DEVICE__ bool operator>=(hip_bfloat16 a, hip_bfloat16 b)
-{
-    return !(a < b);
-}
-inline __HOST_DEVICE__ hip_bfloat16& operator+=(hip_bfloat16& a, hip_bfloat16 b)
-{
+inline __HOST_DEVICE__ bool operator>(hip_bfloat16 a, hip_bfloat16 b) { return b < a; }
+inline __HOST_DEVICE__ bool operator<=(hip_bfloat16 a, hip_bfloat16 b) { return !(a > b); }
+inline __HOST_DEVICE__ bool operator!=(hip_bfloat16 a, hip_bfloat16 b) { return !(a == b); }
+inline __HOST_DEVICE__ bool operator>=(hip_bfloat16 a, hip_bfloat16 b) { return !(a < b); }
+inline __HOST_DEVICE__ hip_bfloat16& operator+=(hip_bfloat16& a, hip_bfloat16 b) {
  return a = a + b;
 }
-inline __HOST_DEVICE__ hip_bfloat16& operator-=(hip_bfloat16& a, hip_bfloat16 b)
-{
+inline __HOST_DEVICE__ hip_bfloat16& operator-=(hip_bfloat16& a, hip_bfloat16 b) {
  return a = a - b;
 }
-inline __HOST_DEVICE__ hip_bfloat16& operator*=(hip_bfloat16& a, hip_bfloat16 b)
-{
+inline __HOST_DEVICE__ hip_bfloat16& operator*=(hip_bfloat16& a, hip_bfloat16 b) {
  return a = a * b;
 }
-inline __HOST_DEVICE__ hip_bfloat16& operator/=(hip_bfloat16& a, hip_bfloat16 b)
-{
+inline __HOST_DEVICE__ hip_bfloat16& operator/=(hip_bfloat16& a, hip_bfloat16 b) {
  return a = a / b;
 }
-inline __HOST_DEVICE__ hip_bfloat16& operator++(hip_bfloat16& a)
-{
-    return a += hip_bfloat16(1.0f);
-}
-inline __HOST_DEVICE__ hip_bfloat16& operator--(hip_bfloat16& a)
-{
-    return a -= hip_bfloat16(1.0f);
-}
-inline __HOST_DEVICE__ hip_bfloat16 operator++(hip_bfloat16& a, int)
-{
+inline __HOST_DEVICE__ hip_bfloat16& operator++(hip_bfloat16& a) { return a += hip_bfloat16(1.0f); }
+inline __HOST_DEVICE__ hip_bfloat16& operator--(hip_bfloat16& a) { return a -= hip_bfloat16(1.0f); }
+inline __HOST_DEVICE__ hip_bfloat16 operator++(hip_bfloat16& a, int) {
  hip_bfloat16 orig = a;
  ++a;
  return orig;
 }
-inline __HOST_DEVICE__ hip_bfloat16 operator--(hip_bfloat16& a, int)
-{
+inline __HOST_DEVICE__ hip_bfloat16 operator--(hip_bfloat16& a, int) {
  hip_bfloat16 orig = a;
  --a;
  return orig;
 }

-namespace std
-{
-    constexpr __HOST_DEVICE__ bool isinf(hip_bfloat16 a)
-    {
+namespace std {
+constexpr __HOST_DEVICE__ bool isinf(hip_bfloat16 a) {
  return !(~a.data & 0x7f80) && !(a.data & 0x7f);
 }
-    constexpr __HOST_DEVICE__ bool isnan(hip_bfloat16 a)
-    {
+constexpr __HOST_DEVICE__ bool isnan(hip_bfloat16 a) {
  return !(~a.data & 0x7f80) && +(a.data & 0x7f);
 }
-    constexpr __HOST_DEVICE__ bool iszero(hip_bfloat16 a)
-    {
-        return !(a.data & 0x7fff);
-    }
-}
+constexpr __HOST_DEVICE__ bool iszero(hip_bfloat16 a) { return !(a.data & 0x7fff); }
+}  // namespace std

 #endif  // __cplusplus < 201103L || !defined(__HIPCC__)

@@ -63,9 +63,7 @@ __HOST_DEVICE__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
  return ret;
 }

-__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) {
-    return z.x * z.x + z.y * z.y;
-}
+__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) { return z.x * z.x + z.y * z.y; }

 __HOST_DEVICE__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
  return make_hipFloatComplex(p.x + q.x, p.y + q.y);
@@ -110,9 +108,7 @@ __HOST_DEVICE__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
  return ret;
 }

-__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) {
-    return z.x * z.x + z.y * z.y;
-}
+__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) { return z.x * z.x + z.y * z.y; }

 __HOST_DEVICE__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
  return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
@@ -146,7 +146,6 @@ class multi_grid_group : public thread_group {
      : thread_group(internal::cg_multi_grid, size) {}

 public:
-
  //! Number of invocations participating in this multi-grid group. In other
  //! words, the number of GPUs.
  __CG_QUALIFIER__ __hip_uint32_t num_grids() { return internal::multi_grid::num_grids(); }
@@ -155,7 +154,9 @@ class multi_grid_group : public thread_group {
  //! [0, num_grids()) of the GPU that kernel is running on.
  __CG_QUALIFIER__ __hip_uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
  //! @copydoc thread_group::thread_rank
-  __CG_QUALIFIER__ __hip_uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
+  __CG_QUALIFIER__ __hip_uint32_t thread_rank() const {
+    return internal::multi_grid::thread_rank();
+  }
  //! @copydoc thread_group::is_valid
  __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
  //! @copydoc thread_group::sync
@@ -196,7 +197,8 @@ class grid_group : public thread_group {

 protected:
  //! Construct grid thread group (through the API this_grid())
-  explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size) : thread_group(internal::cg_grid, size) {}
+  explicit __CG_QUALIFIER__ grid_group(__hip_uint32_t size)
+      : thread_group(internal::cg_grid, size) {}

 public:
  //! @copydoc thread_group::thread_rank
@@ -237,6 +239,7 @@ class thread_block : public thread_group {
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
                                                       unsigned int tile_size);
+
 protected:
  // Construct a workgroup thread group (through the API this_thread_block())
  explicit __CG_QUALIFIER__ thread_block(__hip_uint32_t size)
@@ -269,9 +272,13 @@ class thread_block : public thread_group {
  //! Returns 3-dimensional thread index within the block.
  __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
  //! @copydoc thread_group::thread_rank
-  __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
+  __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
+    return internal::workgroup::thread_rank();
+  }
  //! @copydoc thread_group::num_threads
-  __CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return internal::workgroup::num_threads(); }
+  __CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
+    return internal::workgroup::num_threads();
+  }
  //! @copydoc thread_group::size
  __CG_STATIC_QUALIFIER__ __hip_uint32_t size() { return num_threads(); }
  //! @copydoc thread_group::is_valid
@@ -335,7 +342,9 @@ class tiled_group : public thread_group {

 public:
  //! @copydoc thread_group::num_threads
-  __CG_QUALIFIER__ unsigned int num_threads() const { return (coalesced_info.tiled_info.num_threads); }
+  __CG_QUALIFIER__ unsigned int num_threads() const {
+    return (coalesced_info.tiled_info.num_threads);
+  }

  //! @copydoc thread_group::size
  __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }
@@ -346,9 +355,7 @@ class tiled_group : public thread_group {
  }

  //! @copydoc thread_group::sync
-  __CG_QUALIFIER__ void sync() const {
-    internal::tiled_group::sync();
-  }
+  __CG_QUALIFIER__ void sync() const { internal::tiled_group::sync(); }
 };

 template <unsigned int size, class ParentCGTy> class thread_block_tile;
@@ -363,8 +370,10 @@ template <unsigned int size, class ParentCGTy> class thread_block_tile;
 class coalesced_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
-  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
-  friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);
+  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
+                                                       unsigned int tile_size);
+  friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
+                                                          unsigned int tile_size);
  friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
  template <unsigned int fsize, class fparent>
  friend __CG_QUALIFIER__ coalesced_group
@@ -381,8 +390,10 @@ class coalesced_group : public thread_group {
    // prepare a mask for further partitioning it so that it stays coalesced.
    if (coalesced_info.tiled_info.is_tiled) {
      unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
-      unsigned int masklength = min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
-      lane_mask full_mask = (static_cast<int>(warpSize) == 32) ? static_cast<lane_mask>((1u << 32) - 1)
+      unsigned int masklength =
+          min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
+      lane_mask full_mask = (static_cast<int>(warpSize) == 32)
+          ? static_cast<lane_mask>((1u << 32) - 1)
          : static_cast<lane_mask>(-1ull);
      lane_mask member_mask = full_mask >> (warpSize - masklength);

@@ -425,7 +436,8 @@ class coalesced_group : public thread_group {
  explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
      : thread_group(internal::cg_coalesced_group) {
    coalesced_info.member_mask = member_mask;  // Which threads are active
-    coalesced_info.num_threads = __popcll(coalesced_info.member_mask); // How many threads are active
+    coalesced_info.num_threads =
+        __popcll(coalesced_info.member_mask);    // How many threads are active
    coalesced_info.tiled_info.is_tiled = false;  // Not a partitioned group
    coalesced_info.tiled_info.meta_group_rank = 0;
    coalesced_info.tiled_info.meta_group_size = 1;
@@ -433,14 +445,10 @@ class coalesced_group : public thread_group {

 public:
  //! @copydoc thread_group::num_threads
-   __CG_QUALIFIER__ unsigned int num_threads() const {
-     return coalesced_info.num_threads;
-   }
+  __CG_QUALIFIER__ unsigned int num_threads() const { return coalesced_info.num_threads; }

  //! @copydoc thread_group::size
-   __CG_QUALIFIER__ unsigned int size() const {
-     return num_threads();
-   }
+  __CG_QUALIFIER__ unsigned int size() const { return num_threads(); }

  //! @copydoc thread_group::thread_rank
  __CG_QUALIFIER__ unsigned int thread_rank() const {
@@ -448,9 +456,7 @@ class coalesced_group : public thread_group {
  }

  //! @copydoc thread_group::sync
-   __CG_QUALIFIER__ void sync() const {
-       internal::coalesced_group::sync();
-    }
+  __CG_QUALIFIER__ void sync() const { internal::coalesced_group::sync(); }

  //! Returns the linear rank of the group within the set of tiles partitioned
  //! from a parent group (bounded by meta_group_size).
@@ -475,13 +481,12 @@ class coalesced_group : public thread_group {
   *                  group is copied to other threads.
   *  \param srcRank [in] The source thread ID of the group for copy.
   */
-  template <class T>
-  __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
-
+  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    srcRank = srcRank % static_cast<int>(num_threads());

    int lane = (num_threads() == warpSize) ? srcRank
-             : (static_cast<int>(warpSize) == 64)     ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
+        : (static_cast<int>(warpSize) == 64)
+        ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
        : __fns32(coalesced_info.member_mask, 0, (srcRank + 1));

    return __shfl(var, lane, warpSize);
@@ -501,9 +506,7 @@ class coalesced_group : public thread_group {
   *                         between caller thread ID and source of copy thread
   *                         ID. sourceID = (threadID + lane_delta) % size()
   */
-  template <class T>
-  __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
-
+  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    // Note: The cuda implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.
@@ -515,8 +518,7 @@ class coalesced_group : public thread_group {
    int lane;
    if (static_cast<int>(warpSize) == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
-    }
-    else {
+    } else {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
    }

@@ -541,9 +543,7 @@ class coalesced_group : public thread_group {
   *                         between caller thread ID and source of copy thread
   *                         ID. sourceID = (threadID - lane_delta) % size()
   */
-  template <class T>
-  __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
-
+  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    // Note: The cuda implementation appears to use the remainder of lane_delta
    // and WARP_SIZE as the shift value rather than lane_delta itself.
    // This is not described in the documentation and is not done here.
@@ -555,8 +555,7 @@ class coalesced_group : public thread_group {
    int lane;
    if (static_cast<int>(warpSize) == 64) {
      lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
-    }
-    else if (static_cast<int>(warpSize) == 32) {
+    } else if (static_cast<int>(warpSize) == 32) {
      lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
    }

@@ -759,7 +758,9 @@ __CG_QUALIFIER__ void thread_group::sync() const {
 *           cooperative group type APIs. This function is implemented on Linux
 *           and is under development on Microsoft Windows.
 */
-template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) { return g.num_threads(); }
+template <class CGTy> __CG_QUALIFIER__ __hip_uint32_t group_size(CGTy const& g) {
+  return g.num_threads();
+}

 /** \brief   Returns the rank of thread of the group.
 *
@@ -842,16 +843,12 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
  __CG_QUALIFIER__ unsigned long long build_mask() const {
    unsigned long long mask = ~0ull >> (64 - numThreads);
    // thread_rank() gives thread id from 0..thread launch size.
-    return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) *
-                    numThreads);
+    return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) * numThreads);
  }
 #endif  // HIP_DISABLE_WARP_SYNC_BUILTINS

 public:
-
-  __CG_STATIC_QUALIFIER__ void sync() {
-    internal::tiled_group::sync();
-  }
+  __CG_STATIC_QUALIFIER__ void sync() { internal::tiled_group::sync(); }

  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    return (__shfl(var, srcRank, numThreads));
@@ -893,8 +890,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz

 /** \brief   User exposed API that captures the state of the parent group pre-partition
 */
-template <unsigned int tileSize, typename ParentCGTy>
-class parent_group_info {
+template <unsigned int tileSize, typename ParentCGTy> class parent_group_info {
 public:
  //! Returns the linear rank of the group within the set of tiles partitioned
  //! from a parent group (bounded by meta_group_size)
@@ -920,11 +916,13 @@ class thread_block_tile_type : public thread_block_tile_base<tileSize>,
                               public parent_group_info<tileSize, ParentCGTy> {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
  typedef thread_block_tile_base<numThreads> tbtBase;
+
 protected:
  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
    coalesced_info.tiled_info.num_threads = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
  }
+
 public:
  using tbtBase::num_threads;
  using tbtBase::size;
@@ -935,15 +933,14 @@ class thread_block_tile_type : public thread_block_tile_base<tileSize>,
 // Partial template specialization
 template <unsigned int tileSize>
 class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
-                               public tiled_group
-                             {
+                                               public tiled_group {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:
-
-    __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
+  __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank,
+                                          unsigned int meta_group_size)
      : tiled_group(numThreads) {
    coalesced_info.tiled_info.num_threads = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
@@ -989,12 +986,10 @@ __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsign
  if (parent.cg_type() == internal::cg_tiled_group) {
    const tiled_group* cg = static_cast<const tiled_group*>(&parent);
    return cg->new_tiled_group(tile_size);
-  }
-  else if(parent.cg_type() == internal::cg_coalesced_group) {
+  } else if (parent.cg_type() == internal::cg_coalesced_group) {
    const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
    return cg->new_tiled_group(tile_size);
-  }
-  else {
+  } else {
    const thread_block* tb = static_cast<const thread_block*>(&parent);
    return tb->new_tiled_group(tile_size);
  }
@@ -1010,7 +1005,8 @@ __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned
 }

 // If a coalesced group is passed to be partitioned, it should remain coalesced
-__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) {
+__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
+                                                 unsigned int tile_size) {
  return (parent.new_tiled_group(tile_size));
 }

@@ -1314,7 +1314,8 @@ struct __hip_fp8_e4m3_fnuz {
 }
 return static_cast<unsigned short>(fval);
 }
-};
+}
+;

 /**
 * \brief struct representing two fp8 numbers with e4m3 interpretation
@@ -1394,7 +1395,8 @@ struct __hip_fp8x2_e4m3_fnuz {
                                                      __wm, __we));
 #endif
 }
-};
+}
+;

 /**
 * \brief struct representing four fp8 numbers with e4m3 interpretation
@@ -1506,7 +1508,8 @@ struct __hip_fp8x4_e4m3_fnuz {
 #endif
  return float4(low.x, low.y, high.x, high.y);
 }
-};
+}
+;

 /**
 * \brief struct representing one fp8 number with e5m2 interpretation
@@ -1872,7 +1875,8 @@ struct __hip_fp8_e5m2_fnuz {
 }
 return static_cast<unsigned short>(fval);
 }
-};
+}
+;

 /**
 * \brief struct representing two fp8 numbers with e5m2 interpretation
@@ -1952,7 +1956,8 @@ struct __hip_fp8x2_e5m2_fnuz {
                                                      __wm, __we));
 #endif
 }
-};
+}
+;

 /**
 * \brief struct representing four fp8 numbers with e5m2 interpretation
@@ -2064,7 +2069,8 @@ struct __hip_fp8x4_e5m2_fnuz {
 #endif
  return float4(low.x, low.y, high.x, high.y);
 }
-};
+}
+;

 #endif  // ENABLE_FNUZ_HIPRTC

@@ -2430,7 +2436,8 @@ struct __hip_fp8_e4m3 {
 }
 return static_cast<unsigned short>(fval);
 }
-};
+}
+;

 /**
 * \brief struct representing two ocp fp8 numbers with e4m3 interpretation
@@ -2511,7 +2518,8 @@ struct __hip_fp8x2_e4m3 {
                                                       __wm, __we));
 #endif
 }
-};
+}
+;

 /**
 * \brief struct representing four ocp fp8 numbers with e4m3 interpretation
@@ -2624,7 +2632,8 @@ struct __hip_fp8x4_e4m3 {
 #endif
  return float4(low.x, low.y, high.x, high.y);
 }
-};
+}
+;

 /**
 * \brief struct representing  ocp fp8 numbers with e5m2 interpretation
@@ -2992,7 +3001,8 @@ struct __hip_fp8_e5m2 {
 }
 return static_cast<unsigned short>(fval);
 }
-};
+}
+;

 /**
 * \brief struct representing two ocp fp8 numbers with e5m2 interpretation
@@ -3074,7 +3084,8 @@ struct __hip_fp8x2_e5m2 {
                                             __default_saturation == __HIP_SATFINITE));
 #endif
 }
-};
+}
+;

 /**
 * \brief struct representing four ocp fp8 numbers with e5m2 interpretation
@@ -3190,6 +3201,7 @@ struct __hip_fp8x4_e5m2 {
 #endif
  return float4(low.x, low.y, high.x, high.y);
 }
-};
+}
+;
 #endif  // ENABLE_OCP_HIPRTC
 #endif  // _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_
@@ -99,8 +99,8 @@ hipError_t hipGraphicsGLRegisterBuffer(hipGraphicsResource** resource, GLuint bu
 * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorUnknown, #hipErrorInvalidResourceHandle
 *
 */
-hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint image,
-                                      GLenum target, unsigned int flags);
+hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint image, GLenum target,
+                                      unsigned int flags);
 /**
 * @}
 */
@@ -719,8 +719,8 @@ struct __hipext_ocp_fp6x32_e2m3 {
  }
 #endif

-        __OCP_FP_HOST_DEVICE__
-        __hipext_ocp_fp6x32_e2m3(const __amd_fp16x32_storage_t in, const __amd_scale_t scale)
+        __OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e2m3(const __amd_fp16x32_storage_t in,
+                                                        const __amd_scale_t scale)
 #if HIP_ENABLE_GFX950_OCP_BUILTINS
      : __x(__builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(in, __amd_scale_to_float(scale))){}
 #else
@@ -742,8 +742,8 @@ struct __hipext_ocp_fp6x32_e2m3 {
  }
 #endif

-        __OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e2m3(const __amd_bf16x32_storage_t in,
-                                                        const __amd_scale_t scale)
+        __OCP_FP_HOST_DEVICE__
+        __hipext_ocp_fp6x32_e2m3(const __amd_bf16x32_storage_t in, const __amd_scale_t scale)
 #if HIP_ENABLE_GFX950_OCP_BUILTINS
      : __x(__builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(in, __amd_scale_to_float(scale))){}
 #else
@@ -832,8 +832,8 @@ struct __hipext_ocp_fp6x32_e3m2 {
  }
 #endif

-        __OCP_FP_HOST_DEVICE__
-        __hipext_ocp_fp6x32_e3m2(const __amd_fp16x32_storage_t in, const __amd_scale_t scale)
+        __OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e3m2(const __amd_fp16x32_storage_t in,
+                                                        const __amd_scale_t scale)
 #if HIP_ENABLE_GFX950_OCP_BUILTINS
      : __x(__builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(in, __amd_scale_to_float(scale))){}
 #else
@@ -855,8 +855,8 @@ struct __hipext_ocp_fp6x32_e3m2 {
  }
 #endif

-        __OCP_FP_HOST_DEVICE__
-        __hipext_ocp_fp6x32_e3m2(const __amd_bf16x32_storage_t in, const __amd_scale_t scale)
+        __OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e3m2(const __amd_bf16x32_storage_t in,
+                                                        const __amd_scale_t scale)
 #if HIP_ENABLE_GFX950_OCP_BUILTINS
      : __x(__builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(in, __amd_scale_to_float(scale))){}
 #else
@@ -203,7 +203,8 @@ void pArgs(const std::tuple<Ts...>& formals, void** _vargs) {
 }

 template <typename... Formals, typename... Actuals>
-std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
+std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...),
+                                             std::tuple<Actuals...>(actuals)) {
  static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
  std::tuple<Formals...> to_formals{std::move(actuals)};
  return to_formals;
@@ -271,9 +272,7 @@ __DEVICE__ unsigned int __hip_get_grid_dim_z() { return __ockl_get_num_groups(2)

 #define __HIP_DEVICE_BUILTIN(DIMENSION, FUNCTION)                                                  \
  __declspec(property(get = __get_##DIMENSION)) unsigned int DIMENSION;                            \
-  __DEVICE__ unsigned int __get_##DIMENSION(void) {                     \
-    return FUNCTION;                                            \
-  }
+  __DEVICE__ unsigned int __get_##DIMENSION(void) { return FUNCTION; }

 struct __hip_builtin_threadIdx_t {
  __HIP_DEVICE_BUILTIN(x, __hip_get_thread_idx_x());
@@ -359,9 +358,7 @@ __DEFINE_HCC_FUNC(num_groups, gridDim)
 #pragma pop_macro("__DEFINE_HCC_FUNC")

 extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(unsigned int);
-inline __device__ __attribute__((always_inline)) unsigned int
-hc_get_workitem_absolute_id(int dim)
-{
+inline __device__ __attribute__((always_inline)) unsigned int hc_get_workitem_absolute_id(int dim) {
  return (unsigned int)__ockl_get_global_id(dim);
 }

@@ -105,7 +105,8 @@ hipError_t hipMemcpy2D_spt(void* dst, size_t dpitch, const void* src, size_t spi
                           size_t height, hipMemcpyKind kind);

 hipError_t hipMemcpy2DFromArray_spt(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset,
-                        size_t hOffset, size_t width, size_t height, hipMemcpyKind kind);
+                                    size_t hOffset, size_t width, size_t height,
+                                    hipMemcpyKind kind);

 hipError_t hipMemcpy3D_spt(const struct hipMemcpy3DParms* p);

@@ -116,8 +117,7 @@ hipError_t hipMemsetAsync_spt(void* dst, int value, size_t sizeBytes,

 hipError_t hipMemset2D_spt(void* dst, size_t pitch, int value, size_t width, size_t height);

-hipError_t hipMemset2DAsync_spt(void* dst, size_t pitch, int value,
-                            size_t width, size_t height,
+hipError_t hipMemset2DAsync_spt(void* dst, size_t pitch, int value, size_t width, size_t height,
                                hipStream_t stream __dparm(hipStreamPerThread));

 hipError_t hipMemset3DAsync_spt(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent,
@@ -131,8 +131,8 @@ hipError_t hipMemcpyAsync_spt(void* dst, const void* src, size_t sizeBytes, hipM
 hipError_t hipMemcpy3DAsync_spt(const hipMemcpy3DParms* p,
                                hipStream_t stream __dparm(hipStreamPerThread));

-hipError_t hipMemcpy2DAsync_spt(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
-                            size_t height, hipMemcpyKind kind,
+hipError_t hipMemcpy2DAsync_spt(void* dst, size_t dpitch, const void* src, size_t spitch,
+                                size_t width, size_t height, hipMemcpyKind kind,
                                hipStream_t stream __dparm(hipStreamPerThread));

 hipError_t hipMemcpyFromSymbolAsync_spt(void* dst, const void* symbol, size_t sizeBytes,
@@ -143,19 +143,20 @@ hipError_t hipMemcpyToSymbolAsync_spt(const void* symbol, const void* src, size_
                                      size_t offset, hipMemcpyKind kind,
                                      hipStream_t stream __dparm(hipStreamPerThread));

-hipError_t hipMemcpyFromArray_spt(void* dst, hipArray_const_t src, size_t wOffsetSrc, size_t hOffset,
-                                  size_t count, hipMemcpyKind kind);
+hipError_t hipMemcpyFromArray_spt(void* dst, hipArray_const_t src, size_t wOffsetSrc,
+                                  size_t hOffset, size_t count, hipMemcpyKind kind);

 hipError_t hipMemcpy2DToArray_spt(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src,
                                  size_t spitch, size_t width, size_t height, hipMemcpyKind kind);

 hipError_t hipMemcpy2DFromArrayAsync_spt(void* dst, size_t dpitch, hipArray_const_t src,
-                                  size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height,
-                                  hipMemcpyKind kind,
+                                         size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
+                                         size_t height, hipMemcpyKind kind,
                                         hipStream_t stream __dparm(hipStreamPerThread));

-hipError_t hipMemcpy2DToArrayAsync_spt(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src,
-                                  size_t spitch, size_t width, size_t height, hipMemcpyKind kind,
+hipError_t hipMemcpy2DToArrayAsync_spt(hipArray_t dst, size_t wOffset, size_t hOffset,
+                                       const void* src, size_t spitch, size_t width, size_t height,
+                                       hipMemcpyKind kind,
                                       hipStream_t stream __dparm(hipStreamPerThread));

 hipError_t hipStreamQuery_spt(hipStream_t stream);
@@ -164,25 +165,23 @@ hipError_t hipStreamSynchronize_spt(hipStream_t stream);

 hipError_t hipStreamGetPriority_spt(hipStream_t stream, int* priority);

-hipError_t hipStreamWaitEvent_spt(hipStream_t stream, hipEvent_t event, unsigned int flags __dparm(0));
+hipError_t hipStreamWaitEvent_spt(hipStream_t stream, hipEvent_t event,
+                                  unsigned int flags __dparm(0));

 hipError_t hipStreamGetFlags_spt(hipStream_t stream, unsigned int* flags);

-hipError_t hipStreamAddCallback_spt(hipStream_t stream, hipStreamCallback_t callback, void* userData,
-                                unsigned int flags);
+hipError_t hipStreamAddCallback_spt(hipStream_t stream, hipStreamCallback_t callback,
+                                    void* userData, unsigned int flags);

 hipError_t hipEventRecord_spt(hipEvent_t event, hipStream_t stream __dparm(hipStreamPerThread));

-hipError_t hipLaunchCooperativeKernel_spt(const void* f,
-                                      dim3 gridDim, dim3 blockDim,
+hipError_t hipLaunchCooperativeKernel_spt(const void* f, dim3 gridDim, dim3 blockDim,
                                          void** kernelParams, uint32_t sharedMemBytes,
                                          hipStream_t hStream __dparm(hipStreamPerThread));

-hipError_t hipLaunchKernel_spt(const void* function_address,
-                           dim3 numBlocks,
-                           dim3 dimBlocks,
-                           void** args,
-                           size_t sharedMemBytes, hipStream_t stream __dparm(hipStreamPerThread));
+hipError_t hipLaunchKernel_spt(const void* function_address, dim3 numBlocks, dim3 dimBlocks,
+                               void** args, size_t sharedMemBytes,
+                               hipStream_t stream __dparm(hipStreamPerThread));

 hipError_t hipGraphLaunch_spt(hipGraphExec_t graphExec, hipStream_t stream);
 hipError_t hipStreamBeginCapture_spt(hipStream_t stream, hipStreamCaptureMode mode);
@@ -190,7 +189,8 @@ hipError_t hipStreamEndCapture_spt(hipStream_t stream, hipGraph_t* pGraph);
 hipError_t hipStreamIsCapturing_spt(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus);
 hipError_t hipStreamGetCaptureInfo_spt(hipStream_t stream, hipStreamCaptureStatus* pCaptureStatus,
                                       unsigned long long* pId);
-hipError_t hipStreamGetCaptureInfo_v2_spt(hipStream_t stream, hipStreamCaptureStatus* captureStatus_out,
+hipError_t hipStreamGetCaptureInfo_v2_spt(hipStream_t stream,
+                                          hipStreamCaptureStatus* captureStatus_out,
                                          unsigned long long* id_out, hipGraph_t* graph_out,
                                          const hipGraphNode_t** dependencies_out,
                                          size_t* numDependencies_out);
@@ -58,21 +58,17 @@ THE SOFTWARE.
 * @return Original value contained in \p addr.
 */
 __device__ inline float unsafeAtomicAdd(float* addr, float value) {
-#if defined(__gfx90a__) &&                                                   \
-    __has_builtin(__builtin_amdgcn_is_shared) &&                               \
+#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_is_shared) &&                            \
    __has_builtin(__builtin_amdgcn_is_private) &&                                                  \
    __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) &&                                          \
    __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
-  if (__builtin_amdgcn_is_shared(
-        (const __attribute__((address_space(0))) void*)addr))
+  if (__builtin_amdgcn_is_shared((const __attribute__((address_space(0))) void*)addr))
    return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
-  else if (__builtin_amdgcn_is_private(
-              (const __attribute__((address_space(0))) void*)addr)) {
+  else if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
    float temp = *addr;
    *addr = temp + value;
    return temp;
-  }
-  else
+  } else
    return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
 #elif __has_builtin(__hip_atomic_fetch_add)
  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
@@ -98,14 +94,13 @@ __device__ inline float unsafeAtomicAdd(float* addr, float value) {
 * @return Original value contained in \p addr.
 */
 __device__ inline float unsafeAtomicMax(float* addr, float val) {
-  #if __has_builtin(__hip_atomic_load) && \
-      __has_builtin(__hip_atomic_compare_exchange_strong)
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
    float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    bool done = false;
    while (!done && value < val) {
-      done = __hip_atomic_compare_exchange_strong(addr, &value, val,
-                 __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                  __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }
    return value;
  }
@@ -114,8 +109,8 @@ __device__ inline float unsafeAtomicMax(float* addr, float val) {
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) < val) {
-    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
-               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED,
+                                       __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
 #endif
@@ -136,14 +131,13 @@ __device__ inline float unsafeAtomicMax(float* addr, float val) {
 * @return Original value contained in \p addr.
 */
 __device__ inline float unsafeAtomicMin(float* addr, float val) {
-  #if __has_builtin(__hip_atomic_load) && \
-      __has_builtin(__hip_atomic_compare_exchange_strong)
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
    float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    bool done = false;
    while (!done && value > val) {
-      done = __hip_atomic_compare_exchange_strong(addr, &value, val,
-                 __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                  __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }
    return value;
  }
@@ -152,8 +146,8 @@ __device__ inline float unsafeAtomicMin(float* addr, float val) {
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) > val) {
-    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
-               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED,
+                                       __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
 #endif
@@ -228,14 +222,13 @@ __device__ inline double unsafeAtomicMax(double* addr, double val) {
    __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
  return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
 #else
-  #if __has_builtin(__hip_atomic_load) && \
-      __has_builtin(__hip_atomic_compare_exchange_strong)
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
    double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    bool done = false;
    while (!done && value < val) {
-      done = __hip_atomic_compare_exchange_strong(addr, &value, val,
-                 __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                  __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }
    return value;
  }
@@ -283,14 +276,13 @@ __device__ inline double unsafeAtomicMin(double* addr, double val) {
    __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
  return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
 #else
-  #if __has_builtin(__hip_atomic_load) && \
-      __has_builtin(__hip_atomic_compare_exchange_strong)
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
    double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    bool done = false;
    while (!done && value > val) {
-      done = __hip_atomic_compare_exchange_strong(addr, &value, val,
-                 __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                  __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }
    return value;
  }
@@ -322,9 +314,9 @@ __device__ inline double unsafeAtomicMin(double* addr, double val) {
 * @return Original value contained in \p addr.
 */
 __device__ inline float safeAtomicAdd(float* addr, float value) {
-#if defined(__gfx908__)                               \
-    || ((defined(__gfx90a__) || defined(__gfx942__) || \
-    defined(__gfx950__)) && !__has_builtin(__hip_atomic_fetch_add))
+#if defined(__gfx908__) ||                                                                         \
+    ((defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) &&                        \
+     !__has_builtin(__hip_atomic_fetch_add))
  // On gfx908, we can generate unsafe FP32 atomic add that does not follow all
  // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
  // On gfx90a, gfx942 and gfx950 if we do not have the __hip_atomic_fetch_add builtin, we
@@ -335,7 +327,8 @@ __device__ inline float safeAtomicAdd(float* addr, float value) {
    old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
 #else   // !__has_builtin(__hip_atomic_load)
-  old_val = __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
+  old_val =
+      __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
 #endif  // __has_builtin(__hip_atomic_load)
  float expected, temp;
  do {
@@ -346,8 +339,8 @@ __device__ inline float safeAtomicAdd(float* addr, float value) {
                                           __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }
 #else   // !__has_builtin(__hip_atomic_compare_exchange_strong)
-    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
-                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    __atomic_compare_exchange_n(addr, &expected, old_val + value, false, __ATOMIC_RELAXED,
+                                __ATOMIC_RELAXED);
 #endif  // __has_builtin(__hip_atomic_compare_exchange_strong)
    old_val = expected;
  } while (__float_as_uint(temp) != __float_as_uint(old_val));
@@ -384,14 +377,13 @@ __device__ inline float safeAtomicAdd(float* addr, float value) {
 * @return Original value contained in \p addr.
 */
 __device__ inline float safeAtomicMax(float* addr, float val) {
-  #if __has_builtin(__hip_atomic_load) && \
-      __has_builtin(__hip_atomic_compare_exchange_strong)
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
    float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    bool done = false;
    while (!done && value < val) {
-      done = __hip_atomic_compare_exchange_strong(addr, &value, val,
-                 __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                  __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }
    return value;
  }
@@ -400,8 +392,8 @@ __device__ inline float safeAtomicMax(float* addr, float val) {
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) < val) {
-    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
-               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED,
+                                       __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
 #endif
@@ -422,14 +414,13 @@ __device__ inline float safeAtomicMax(float* addr, float val) {
 * @return Original value contained in \p addr.
 */
 __device__ inline float safeAtomicMin(float* addr, float val) {
-  #if __has_builtin(__hip_atomic_load) && \
-      __has_builtin(__hip_atomic_compare_exchange_strong)
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
  __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
    float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    bool done = false;
    while (!done && value > val) {
-      done = __hip_atomic_compare_exchange_strong(addr, &value, val,
-                 __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+      done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                  __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }
    return value;
  }
@@ -438,8 +429,8 @@ __device__ inline float safeAtomicMin(float* addr, float val) {
  unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __uint_as_float(value) > val) {
-    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
-               __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED,
+                                       __ATOMIC_RELAXED);
  }
  return __uint_as_float(value);
 #endif
@@ -477,7 +468,8 @@ __device__ inline double safeAtomicAdd(double* addr, double value) {
    old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
 #else   // !__has_builtin(__hip_atomic_load)
-  old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
+  old_val = __longlong_as_double(
+      __atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
 #endif  // __has_builtin(__hip_atomic_load)
  double expected, temp;
  do {
@@ -488,8 +480,8 @@ __device__ inline double safeAtomicAdd(double* addr, double value) {
                                           __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }
 #else   // !__has_builtin(__hip_atomic_compare_exchange_strong)
-    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
-                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    __atomic_compare_exchange_n(addr, &expected, old_val + value, false, __ATOMIC_RELAXED,
+                                __ATOMIC_RELAXED);
 #endif  // __has_builtin(__hip_atomic_compare_exchange_strong)
    old_val = expected;
  } while (__double_as_longlong(temp) != __double_as_longlong(old_val));
@@ -521,21 +513,19 @@ __device__ inline double safeAtomicAdd(double* addr, double value) {
 */
 __device__ inline double safeAtomicMax(double* addr, double val) {
 #if __has_builtin(__builtin_amdgcn_is_private)
-  if (__builtin_amdgcn_is_private(
-          (const __attribute__((address_space(0))) void*)addr)) {
+  if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmax(old, val);
    return old;
  } else {
 #endif
-  #if __has_builtin(__hip_atomic_load) && \
-      __has_builtin(__hip_atomic_compare_exchange_strong)
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
    __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
      double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
      bool done = false;
      while (!done && value < val) {
-      done = __hip_atomic_compare_exchange_strong(addr, &value, val,
-                 __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+        done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                    __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
      }
      return value;
    }
@@ -570,21 +560,19 @@ __device__ inline double safeAtomicMax(double* addr, double val) {
 */
 __device__ inline double safeAtomicMin(double* addr, double val) {
 #if __has_builtin(__builtin_amdgcn_is_private)
-  if (__builtin_amdgcn_is_private(
-           (const __attribute__((address_space(0))) void*)addr)) {
+  if (__builtin_amdgcn_is_private((const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmin(old, val);
    return old;
  } else {
 #endif
-  #if __has_builtin(__hip_atomic_load) && \
-      __has_builtin(__hip_atomic_compare_exchange_strong)
+#if __has_builtin(__hip_atomic_load) && __has_builtin(__hip_atomic_compare_exchange_strong)
    __HIP_ATOMICS_IGNORE_DENORMAL_MODE {
      double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
      bool done = false;
      while (!done && value > val) {
-      done = __hip_atomic_compare_exchange_strong(addr, &value, val,
-                 __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+        done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED,
+                                                    __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
      }
      return value;
    }
@@ -50,33 +50,27 @@ THE SOFTWARE.
 // DOT FUNCTIONS
 #if defined(__clang__) && defined(__HIP__)
 __DEVICE__
-inline
-int amd_mixed_dot(short2 a, short2 b, int c, bool saturate) {
+inline int amd_mixed_dot(short2 a, short2 b, int c, bool saturate) {
  return __ockl_sdot2(get_native_vector(a), get_native_vector(b), c, saturate);
 }
 __DEVICE__
-inline
-uint amd_mixed_dot(ushort2 a, ushort2 b, uint c, bool saturate) {
+inline uint amd_mixed_dot(ushort2 a, ushort2 b, uint c, bool saturate) {
  return __ockl_udot2(get_native_vector(a), get_native_vector(b), c, saturate);
 }
 __DEVICE__
-inline
-int amd_mixed_dot(char4 a, char4 b, int c, bool saturate) {
+inline int amd_mixed_dot(char4 a, char4 b, int c, bool saturate) {
  return __ockl_sdot4(get_native_vector(a), get_native_vector(b), c, saturate);
 }
 __DEVICE__
-inline
-uint amd_mixed_dot(uchar4 a, uchar4 b, uint c, bool saturate) {
+inline uint amd_mixed_dot(uchar4 a, uchar4 b, uint c, bool saturate) {
  return __ockl_udot4(get_native_vector(a), get_native_vector(b), c, saturate);
 }
 __DEVICE__
-inline
-int amd_mixed_dot(int a, int b, int c, bool saturate) {
+inline int amd_mixed_dot(int a, int b, int c, bool saturate) {
  return __ockl_sdot8(a, b, c, saturate);
 }
 __DEVICE__
-inline
-uint amd_mixed_dot(uint a, uint b, uint c, bool saturate) {
+inline uint amd_mixed_dot(uint a, uint b, uint c, bool saturate) {
  return __ockl_udot8(a, b, c, saturate);
 }
 #endif
@@ -150,7 +150,8 @@ static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t su
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x, int y) {
+static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x,
+                                                  int y) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  int2 coords{x, y};
@@ -170,7 +171,8 @@ static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t su
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y) {
+static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x,
+                                                   int y) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  int2 coords{x, y};
@@ -191,7 +193,8 @@ static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t su
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x, int y, int z) {
+static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x, int y,
+                                                  int z) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
  int4 coords{x, y, z, 0};
@@ -212,7 +215,8 @@ static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t su
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int z) {
+static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y,
+                                                   int z) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
  int4 coords{x, y, z, 0};
@@ -232,7 +236,8 @@ static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t su
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int layer) {
+static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x,
+                                                         int layer) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
  auto tmp = __ockl_image_load_lod_1D(i, x, layer);
@@ -251,7 +256,8 @@ static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObje
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int layer) {
+static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x,
+                                                          int layer) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
  auto tmp = __hipMapTo<float4::Native_vec_>(data);
@@ -271,7 +277,8 @@ static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObje
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int layer) {
+static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x,
+                                                         int y, int layer) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  int2 coords{x, y};
@@ -292,7 +299,8 @@ static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObje
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int layer) {
+static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x,
+                                                          int y, int layer) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  int2 coords{x, y};
@@ -313,7 +321,8 @@ static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObje
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face) {
+static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x,
+                                                       int y, int face) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  int2 coords{x, y};
@@ -334,7 +343,8 @@ static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int face) {
+static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x,
+                                                        int y, int face) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  int2 coords{x, y};
@@ -356,8 +366,8 @@ static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face,
-        int layer) {
+static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj,
+                                                              int x, int y, int face, int layer) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  int2 coords{x, y};
@@ -379,8 +389,8 @@ static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfac
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T* data, hipSurfaceObject_t surfObj, int x, int y, int face,
-        int layer) {
+static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T* data, hipSurfaceObject_t surfObj,
+                                                               int x, int y, int face, int layer) {
  __HIP_SURFACE_OBJECT_PARAMETERS_INIT
  x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
  int2 coords{x, y};
@@ -34,25 +34,45 @@ THE SOFTWARE.
 #endif

 __device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
-    union { int i; unsigned u; float f; } tmp; tmp.u = src;
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.u = src;
  tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
  return tmp.u;
 }

 __device__ static inline float __hip_ds_bpermutef(int index, float src) {
-    union { int i; unsigned u; float f; } tmp; tmp.f = src;
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = src;
  tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
  return tmp.f;
 }

 __device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
-    union { int i; unsigned u; float f; } tmp; tmp.u = src;
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.u = src;
  tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
  return tmp.u;
 }

 __device__ static inline float __hip_ds_permutef(int index, float src) {
-    union { int i; unsigned u; float f; } tmp; tmp.f = src;
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = src;
  tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
  return tmp.f;
 }
@@ -60,16 +80,24 @@ __device__ static inline float __hip_ds_permutef(int index, float src) {
 #define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
 #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))

-template <int pattern>
-__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
-    union { int i; unsigned u; float f; } tmp; tmp.u = src;
+template <int pattern> __device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.u = src;
  tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
  return tmp.u;
 }

-template <int pattern>
-__device__ static inline float __hip_ds_swizzlef_N(float src) {
-    union { int i; unsigned u; float f; } tmp; tmp.f = src;
+template <int pattern> __device__ static inline float __hip_ds_swizzlef_N(float src) {
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = src;
  tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
  return tmp.f;
 }
@@ -79,214 +107,212 @@ __device__ static inline float __hip_ds_swizzlef_N(float src) {

 template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
 __device__ static inline int __hip_move_dpp_N(int src) {
-    return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
-                                    bound_ctrl);
+  return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
 }

 inline __device__ const struct final {
-  __device__
-  __attribute__((always_inline, const))
-    operator int() const noexcept {
+  __device__ __attribute__((always_inline, const)) operator int() const noexcept {
    return __builtin_amdgcn_wavefrontsize();
  }
 } warpSize{};

 // warp vote function __all __any __ballot
-__device__
-inline
-int __all(int predicate) {
-    return __ockl_wfall_i32(predicate);
-}
+__device__ inline int __all(int predicate) { return __ockl_wfall_i32(predicate); }

-__device__
-inline
-int __any(int predicate) {
-    return __ockl_wfany_i32(predicate);
-}
+__device__ inline int __any(int predicate) { return __ockl_wfany_i32(predicate); }

-__device__
-inline
-unsigned long long int __ballot(int predicate) {
+__device__ inline unsigned long long int __ballot(int predicate) {
  return __builtin_amdgcn_ballot_w64(predicate);
 }

-__device__
-inline
-unsigned long long int __ballot64(int predicate) {
-    return __ballot(predicate);
-}
+__device__ inline unsigned long long int __ballot64(int predicate) { return __ballot(predicate); }

 // See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
 #if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
 // Since threads in a wave do not make independent progress, __activemask()
 // always returns the exact active mask, i.e, all active threads in the wave.
-__device__
-inline
-unsigned long long __activemask() {
-  return __ballot(true);
-}
+__device__ inline unsigned long long __activemask() { return __ballot(true); }
 #endif  // HIP_DISABLE_WARP_SYNC_BUILTINS

 __device__ static inline unsigned int __lane_id() {
  if (static_cast<int>(warpSize) == 32) return __builtin_amdgcn_mbcnt_lo(-1, 0);
-    return  __builtin_amdgcn_mbcnt_hi(
-        -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
+  return __builtin_amdgcn_mbcnt_hi(-1, __builtin_amdgcn_mbcnt_lo(-1, 0));
 }

-__device__
-inline
-int __shfl(MAYBE_UNDEF int var, int src_lane, int width = warpSize) {
+__device__ inline int __shfl(MAYBE_UNDEF int var, int src_lane, int width = warpSize) {
  int self = __lane_id();
  int index = (src_lane & (width - 1)) + (self & ~(width - 1));
  return __builtin_amdgcn_ds_bpermute(index << 2, var);
 }
-__device__
-inline
-unsigned int __shfl(MAYBE_UNDEF unsigned int var, int src_lane, int width = warpSize) {
-     union { int i; unsigned u; float f; } tmp; tmp.u = var;
+__device__ inline unsigned int __shfl(MAYBE_UNDEF unsigned int var, int src_lane,
+                                      int width = warpSize) {
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.u = var;
  tmp.i = __shfl(tmp.i, src_lane, width);
  return tmp.u;
 }
-__device__
-inline
-float __shfl(MAYBE_UNDEF float var, int src_lane, int width = warpSize) {
-    union { int i; unsigned u; float f; } tmp; tmp.f = var;
+__device__ inline float __shfl(MAYBE_UNDEF float var, int src_lane, int width = warpSize) {
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = var;
  tmp.i = __shfl(tmp.i, src_lane, width);
  return tmp.f;
 }
-__device__
-inline
-double __shfl(MAYBE_UNDEF double var, int src_lane, int width = warpSize) {
+__device__ inline double __shfl(MAYBE_UNDEF double var, int src_lane, int width = warpSize) {
  static_assert(sizeof(double) == 2 * sizeof(int), "");
  static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");

-    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl(tmp[0], src_lane, width);
  tmp[1] = __shfl(tmp[1], src_lane, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  double tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 }
-__device__
-inline
-long __shfl(MAYBE_UNDEF long var, int src_lane, int width = warpSize)
-{
+__device__ inline long __shfl(MAYBE_UNDEF long var, int src_lane, int width = warpSize) {
 #ifndef _MSC_VER
  static_assert(sizeof(long) == 2 * sizeof(int), "");
  static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");

-    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl(tmp[0], src_lane, width);
  tmp[1] = __shfl(tmp[1], src_lane, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 #else
  static_assert(sizeof(long) == sizeof(int), "");
  return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
 #endif
 }
-__device__
-inline
-unsigned long __shfl(MAYBE_UNDEF unsigned long var, int src_lane, int width = warpSize) {
+__device__ inline unsigned long __shfl(MAYBE_UNDEF unsigned long var, int src_lane,
+                                       int width = warpSize) {
 #ifndef _MSC_VER
  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");

-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  unsigned int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl(tmp[0], src_lane, width);
  tmp[1] = __shfl(tmp[1], src_lane, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  unsigned long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 #else
  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
  return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
 #endif
 }
-__device__
-inline
-long long __shfl(MAYBE_UNDEF long long var, int src_lane, int width = warpSize)
-{
+__device__ inline long long __shfl(MAYBE_UNDEF long long var, int src_lane, int width = warpSize) {
  static_assert(sizeof(long long) == 2 * sizeof(int), "");
  static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");

-    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl(tmp[0], src_lane, width);
  tmp[1] = __shfl(tmp[1], src_lane, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  long long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 }
-__device__
-inline
-unsigned long long __shfl(MAYBE_UNDEF unsigned long long var, int src_lane, int width = warpSize) {
+__device__ inline unsigned long long __shfl(MAYBE_UNDEF unsigned long long var, int src_lane,
+                                            int width = warpSize) {
  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");

-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  unsigned int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl(tmp[0], src_lane, width);
  tmp[1] = __shfl(tmp[1], src_lane, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  unsigned long long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 }

-__device__
-inline
-int __shfl_up(MAYBE_UNDEF int var, unsigned int lane_delta, int width = warpSize) {
+__device__ inline int __shfl_up(MAYBE_UNDEF int var, unsigned int lane_delta,
+                                int width = warpSize) {
  int self = __lane_id();
  int index = self - lane_delta;
  index = (index < (self & ~(width - 1))) ? self : index;
  return __builtin_amdgcn_ds_bpermute(index << 2, var);
 }
-__device__
-inline
-unsigned int __shfl_up(MAYBE_UNDEF unsigned int var, unsigned int lane_delta, int width = warpSize) {
-    union { int i; unsigned u; float f; } tmp; tmp.u = var;
+__device__ inline unsigned int __shfl_up(MAYBE_UNDEF unsigned int var, unsigned int lane_delta,
+                                         int width = warpSize) {
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.u = var;
  tmp.i = __shfl_up(tmp.i, lane_delta, width);
  return tmp.u;
 }
-__device__
-inline
-float __shfl_up(MAYBE_UNDEF float var, unsigned int lane_delta, int width = warpSize) {
-    union { int i; unsigned u; float f; } tmp; tmp.f = var;
+__device__ inline float __shfl_up(MAYBE_UNDEF float var, unsigned int lane_delta,
+                                  int width = warpSize) {
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = var;
  tmp.i = __shfl_up(tmp.i, lane_delta, width);
  return tmp.f;
 }
-__device__
-inline
-double __shfl_up(MAYBE_UNDEF double var, unsigned int lane_delta, int width = warpSize) {
+__device__ inline double __shfl_up(MAYBE_UNDEF double var, unsigned int lane_delta,
+                                   int width = warpSize) {
  static_assert(sizeof(double) == 2 * sizeof(int), "");
  static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");

-    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  tmp[1] = __shfl_up(tmp[1], lane_delta, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  double tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 }
-__device__
-inline
-long __shfl_up(MAYBE_UNDEF long var, unsigned int lane_delta, int width = warpSize)
-{
+__device__ inline long __shfl_up(MAYBE_UNDEF long var, unsigned int lane_delta,
+                                 int width = warpSize) {
 #ifndef _MSC_VER
  static_assert(sizeof(long) == 2 * sizeof(int), "");
  static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");

-    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  tmp[1] = __shfl_up(tmp[1], lane_delta, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 #else
  static_assert(sizeof(long) == sizeof(int), "");
@@ -294,20 +320,21 @@ long __shfl_up(MAYBE_UNDEF long var, unsigned int lane_delta, int width = warpSi
 #endif
 }

-__device__
-inline
-unsigned long __shfl_up(MAYBE_UNDEF unsigned long var, unsigned int lane_delta, int width = warpSize)
-{
+__device__ inline unsigned long __shfl_up(MAYBE_UNDEF unsigned long var, unsigned int lane_delta,
+                                          int width = warpSize) {
 #ifndef _MSC_VER
  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");

-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  unsigned int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  tmp[1] = __shfl_up(tmp[1], lane_delta, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  unsigned long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 #else
  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
@@ -315,237 +342,261 @@ unsigned long __shfl_up(MAYBE_UNDEF unsigned long var, unsigned int lane_delta,
 #endif
 }

-__device__
-inline
-long long __shfl_up(MAYBE_UNDEF long long var, unsigned int lane_delta, int width = warpSize)
-{
+__device__ inline long long __shfl_up(MAYBE_UNDEF long long var, unsigned int lane_delta,
+                                      int width = warpSize) {
  // long long overload built on the int overload: var is copied into two int-sized words,
  // each word is passed through __shfl_up with the same lane_delta/width, and the two
  // results are reassembled with tmp[1] in the upper 32 bits and tmp[0] in the lower 32 bits.
  static_assert(sizeof(long long) == 2 * sizeof(int), "");
  static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
-    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  tmp[1] = __shfl_up(tmp[1], lane_delta, width);
-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  long long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  // tmp1 carries the bit pattern of tmp0 reinterpreted as long long.
  return tmp1;
 }

-__device__
-inline
-unsigned long long __shfl_up(MAYBE_UNDEF unsigned long long var, unsigned int lane_delta, int width = warpSize)
-{
+__device__ inline unsigned long long __shfl_up(MAYBE_UNDEF unsigned long long var,
+                                               unsigned int lane_delta, int width = warpSize) {
  // unsigned long long overload built on the unsigned int overload: var is copied into two
  // unsigned-int-sized words, each word is passed through __shfl_up, and the two results
  // are reassembled with tmp[1] in the upper 32 bits and tmp[0] in the lower 32 bits.
  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  unsigned int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_up(tmp[0], lane_delta, width);
  tmp[1] = __shfl_up(tmp[1], lane_delta, width);
-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  unsigned long long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  // tmp1 carries the bit pattern of tmp0 reinterpreted as unsigned long long.
  return tmp1;
 }

-__device__
-inline
-int __shfl_down(MAYBE_UNDEF int var, unsigned int lane_delta, int width = warpSize) {
+__device__ inline int __shfl_down(MAYBE_UNDEF int var, unsigned int lane_delta,
+                                  int width = warpSize) {
  // Base int overload. The target lane is self + lane_delta; when
  // (self & (width - 1)) + lane_delta reaches width, the caller's own lane is used instead.
  // NOTE(review): the (width - 1) mask presumably assumes width is a power of two — confirm.
  int self = __lane_id();
  int index = self + lane_delta;
  index = (int)((self & (width - 1)) + lane_delta) >= width ? self : index;
  // NOTE(review): index << 2 presumably turns the lane number into the byte offset the
  // bpermute builtin expects — confirm against the builtin's documentation.
  return __builtin_amdgcn_ds_bpermute(index << 2, var);
 }
-__device__
-inline
-unsigned int __shfl_down(MAYBE_UNDEF unsigned int var, unsigned int lane_delta, int width = warpSize) {
-    union { int i; unsigned u; float f; } tmp; tmp.u = var;
+__device__ inline unsigned int __shfl_down(MAYBE_UNDEF unsigned int var, unsigned int lane_delta,
+                                           int width = warpSize) {
  // unsigned int overload: stores var in the unsigned member of a union, shuffles the int
  // member through the int overload, and returns the unsigned member of the result.
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.u = var;
  tmp.i = __shfl_down(tmp.i, lane_delta, width);
  return tmp.u;
 }
-__device__
-inline
-float __shfl_down(MAYBE_UNDEF float var, unsigned int lane_delta, int width = warpSize) {
-    union { int i; unsigned u; float f; } tmp; tmp.f = var;
+__device__ inline float __shfl_down(MAYBE_UNDEF float var, unsigned int lane_delta,
+                                    int width = warpSize) {
  // float overload: stores var in the float member of a union, shuffles the int member
  // through the int overload, and returns the float member of the result.
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = var;
  tmp.i = __shfl_down(tmp.i, lane_delta, width);
  return tmp.f;
 }
-__device__
-inline
-double __shfl_down(MAYBE_UNDEF double var, unsigned int lane_delta, int width = warpSize) {
+__device__ inline double __shfl_down(MAYBE_UNDEF double var, unsigned int lane_delta,
+                                     int width = warpSize) {
  // double overload built on the int overload: the bits of var are copied into two
  // int-sized words, each word is shuffled separately, and the results are reassembled
  // with tmp[1] in the upper 32 bits and tmp[0] in the lower 32 bits.
  static_assert(sizeof(double) == 2 * sizeof(int), "");
  static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");

-    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  tmp[1] = __shfl_down(tmp[1], lane_delta, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  double tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  // tmp1 carries the bit pattern of tmp0 reinterpreted as double (no numeric conversion).
  return tmp1;
 }
-__device__
-inline
-long __shfl_down(MAYBE_UNDEF long var, unsigned int lane_delta, int width = warpSize)
-{
+__device__ inline long __shfl_down(MAYBE_UNDEF long var, unsigned int lane_delta,
+                                   int width = warpSize) {
  // long overload with two compile-time paths. Without _MSC_VER, long is asserted to be as
  // wide as __hip_uint64_t and is shuffled as two int-sized words that are recombined
  // (tmp[1] high, tmp[0] low). With _MSC_VER, long is asserted to be int-sized and the call
  // is forwarded to the int overload through static_cast.
 #ifndef _MSC_VER
  static_assert(sizeof(long) == 2 * sizeof(int), "");
  static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");

-    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  tmp[1] = __shfl_down(tmp[1], lane_delta, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 #else
  static_assert(sizeof(long) == sizeof(int), "");
  return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
 #endif
 }
-__device__
-inline
-unsigned long __shfl_down(MAYBE_UNDEF unsigned long var, unsigned int lane_delta, int width = warpSize)
-{
+__device__ inline unsigned long __shfl_down(MAYBE_UNDEF unsigned long var, unsigned int lane_delta,
+                                            int width = warpSize) {
  // unsigned long overload with two compile-time paths. Without _MSC_VER, unsigned long is
  // asserted to be as wide as __hip_uint64_t and is shuffled as two unsigned-int-sized words
  // that are recombined (tmp[1] high, tmp[0] low). With _MSC_VER, unsigned long is asserted
  // to be unsigned-int-sized and the call is forwarded to the unsigned int overload.
 #ifndef _MSC_VER
  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");

-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  unsigned int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  tmp[1] = __shfl_down(tmp[1], lane_delta, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  unsigned long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 #else
  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
  return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
 #endif
 }
-__device__
-inline
-long long __shfl_down(MAYBE_UNDEF long long var, unsigned int lane_delta, int width = warpSize)
-{
+__device__ inline long long __shfl_down(MAYBE_UNDEF long long var, unsigned int lane_delta,
+                                        int width = warpSize) {
  // long long overload built on the int overload: var is copied into two int-sized words,
  // each word is passed through __shfl_down with the same lane_delta/width, and the results
  // are reassembled with tmp[1] in the upper 32 bits and tmp[0] in the lower 32 bits.
  static_assert(sizeof(long long) == 2 * sizeof(int), "");
  static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
-    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  tmp[1] = __shfl_down(tmp[1], lane_delta, width);
-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  long long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 }
-__device__
-inline
-unsigned long long __shfl_down(MAYBE_UNDEF unsigned long long var, unsigned int lane_delta, int width = warpSize)
-{
+__device__ inline unsigned long long __shfl_down(MAYBE_UNDEF unsigned long long var,
+                                                 unsigned int lane_delta, int width = warpSize) {
  // unsigned long long overload built on the unsigned int overload: var is copied into two
  // unsigned-int-sized words, each word is passed through __shfl_down, and the results are
  // reassembled with tmp[1] in the upper 32 bits and tmp[0] in the lower 32 bits.
  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  unsigned int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_down(tmp[0], lane_delta, width);
  tmp[1] = __shfl_down(tmp[1], lane_delta, width);
-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  unsigned long long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 }

-__device__
-inline
-int __shfl_xor(MAYBE_UNDEF int var, int lane_mask, int width = warpSize) {
+__device__ inline int __shfl_xor(MAYBE_UNDEF int var, int lane_mask, int width = warpSize) {
  // Base int overload. The target lane is self ^ lane_mask; when that index is at or beyond
  // (self + width) & ~(width - 1), the caller's own lane is used instead.
  // NOTE(review): the ~(width - 1) mask presumably assumes width is a power of two — confirm.
  int self = __lane_id();
  int index = self ^ lane_mask;
  index = index >= ((self + width) & ~(width - 1)) ? self : index;
  // NOTE(review): index << 2 presumably turns the lane number into the byte offset the
  // bpermute builtin expects — confirm against the builtin's documentation.
  return __builtin_amdgcn_ds_bpermute(index << 2, var);
 }
-__device__
-inline
-unsigned int __shfl_xor(MAYBE_UNDEF unsigned int var, int lane_mask, int width = warpSize) {
-    union { int i; unsigned u; float f; } tmp; tmp.u = var;
+__device__ inline unsigned int __shfl_xor(MAYBE_UNDEF unsigned int var, int lane_mask,
+                                          int width = warpSize) {
  // unsigned int overload: stores var in the unsigned member of a union, shuffles the int
  // member through the int overload, and returns the unsigned member of the result.
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.u = var;
  tmp.i = __shfl_xor(tmp.i, lane_mask, width);
  return tmp.u;
 }
-__device__
-inline
-float __shfl_xor(MAYBE_UNDEF float var, int lane_mask, int width = warpSize) {
-    union { int i; unsigned u; float f; } tmp; tmp.f = var;
+__device__ inline float __shfl_xor(MAYBE_UNDEF float var, int lane_mask, int width = warpSize) {
  // float overload: stores var in the float member of a union, shuffles the int member
  // through the int overload, and returns the float member of the result.
+  union {
+    int i;
+    unsigned u;
+    float f;
+  } tmp;
+  tmp.f = var;
  tmp.i = __shfl_xor(tmp.i, lane_mask, width);
  return tmp.f;
 }
-__device__
-inline
-double __shfl_xor(MAYBE_UNDEF double var, int lane_mask, int width = warpSize) {
+__device__ inline double __shfl_xor(MAYBE_UNDEF double var, int lane_mask, int width = warpSize) {
  // double overload built on the int overload: the bits of var are copied into two
  // int-sized words, each word is shuffled separately, and the results are reassembled
  // with tmp[1] in the upper 32 bits and tmp[0] in the lower 32 bits.
  static_assert(sizeof(double) == 2 * sizeof(int), "");
  static_assert(sizeof(double) == sizeof(__hip_uint64_t), "");

-    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    double tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  double tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  // tmp1 carries the bit pattern of tmp0 reinterpreted as double (no numeric conversion).
  return tmp1;
 }
-__device__
-inline
-long __shfl_xor(MAYBE_UNDEF long var, int lane_mask, int width = warpSize)
-{
+__device__ inline long __shfl_xor(MAYBE_UNDEF long var, int lane_mask, int width = warpSize) {
  // long overload with two compile-time paths. Without _MSC_VER, long is asserted to be as
  // wide as __hip_uint64_t and is shuffled as two int-sized words that are recombined
  // (tmp[1] high, tmp[0] low). With _MSC_VER, long is asserted to be int-sized and the call
  // is forwarded to the int overload through static_cast.
 #ifndef _MSC_VER
  static_assert(sizeof(long) == 2 * sizeof(int), "");
  static_assert(sizeof(long) == sizeof(__hip_uint64_t), "");

-    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 #else
  static_assert(sizeof(long) == sizeof(int), "");
  return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
 #endif
 }
-__device__
-inline
-unsigned long __shfl_xor(MAYBE_UNDEF unsigned long var, int lane_mask, int width = warpSize)
-{
+__device__ inline unsigned long __shfl_xor(MAYBE_UNDEF unsigned long var, int lane_mask,
+                                           int width = warpSize) {
  // unsigned long overload with two compile-time paths. Without _MSC_VER, unsigned long is
  // asserted to be as wide as __hip_uint64_t and is shuffled as two unsigned-int-sized words
  // that are recombined (tmp[1] high, tmp[0] low). With _MSC_VER, unsigned long is asserted
  // to be unsigned-int-sized and the call is forwarded to the unsigned int overload.
 #ifndef _MSC_VER
  static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long) == sizeof(__hip_uint64_t), "");

-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  unsigned int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);

-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    unsigned long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  unsigned long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 #else
  static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
  return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
 #endif
 }
-__device__
-inline
-long long __shfl_xor(MAYBE_UNDEF long long var, int lane_mask, int width = warpSize)
-{
+__device__ inline long long __shfl_xor(MAYBE_UNDEF long long var, int lane_mask,
+                                       int width = warpSize) {
  // long long overload built on the int overload: var is copied into two int-sized words,
  // each word is passed through __shfl_xor with the same lane_mask/width, and the results
  // are reassembled with tmp[1] in the upper 32 bits and tmp[0] in the lower 32 bits.
  static_assert(sizeof(long long) == 2 * sizeof(int), "");
  static_assert(sizeof(long long) == sizeof(__hip_uint64_t), "");
-    int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  long long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 }
-__device__
-inline
-unsigned long long __shfl_xor(MAYBE_UNDEF unsigned long long var, int lane_mask, int width = warpSize)
-{
+__device__ inline unsigned long long __shfl_xor(MAYBE_UNDEF unsigned long long var, int lane_mask,
+                                                int width = warpSize) {
  // unsigned long long overload built on the unsigned int overload: var is copied into two
  // unsigned-int-sized words, each word is passed through __shfl_xor, and the results are
  // reassembled with tmp[1] in the upper 32 bits and tmp[0] in the lower 32 bits.
  static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
  static_assert(sizeof(unsigned long long) == sizeof(__hip_uint64_t), "");
-    unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+  unsigned int tmp[2];
+  __builtin_memcpy(tmp, &var, sizeof(tmp));
  tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
  tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
-    __hip_uint64_t tmp0 = (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
-    unsigned long long tmp1;  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+  __hip_uint64_t tmp0 =
+      (static_cast<__hip_uint64_t>(tmp[1]) << 32ull) | static_cast<__hip_uint32_t>(tmp[0]);
+  unsigned long long tmp1;
+  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
  return tmp1;
 }

@@ -50,37 +50,41 @@ extern "C" __device__ __attribute__((const)) unsigned int __ockl_wfred_xor_u32(u
 #ifdef HIP_ENABLE_EXTRA_WARP_SYNC_TYPES
 // This macro enables types that are not in CUDA.
 extern "C" __device__ __attribute__((const)) long long __ockl_wfred_add_i64(long long);
-extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_add_u64(unsigned long long);
+extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_add_u64(
+    unsigned long long);
 extern "C" __device__ __attribute__((const)) float __ockl_wfred_add_f32(float);
 extern "C" __device__ __attribute__((const)) double __ockl_wfred_add_f64(double);

 extern "C" __device__ __attribute__((const)) long long __ockl_wfred_min_i64(long long);
-extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_min_u64(unsigned long long);
+extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_min_u64(
+    unsigned long long);
 extern "C" __device__ __attribute__((const)) float __ockl_wfred_min_f32(float);
 extern "C" __device__ __attribute__((const)) double __ockl_wfred_min_f64(double);

 extern "C" __device__ __attribute__((const)) long long __ockl_wfred_max_i64(long long);
-extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_max_u64(unsigned long long);
+extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_max_u64(
+    unsigned long long);
 extern "C" __device__ __attribute__((const)) float __ockl_wfred_max_f32(float);
 extern "C" __device__ __attribute__((const)) double __ockl_wfred_max_f64(double);

 extern "C" __device__ __attribute__((const)) int __ockl_wfred_and_i32(int);
 extern "C" __device__ __attribute__((const)) long long __ockl_wfred_and_i64(long long);
-extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_and_u64(unsigned long long);
+extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_and_u64(
+    unsigned long long);

 extern "C" __device__ __attribute__((const)) int __ockl_wfred_or_i32(int);
 extern "C" __device__ __attribute__((const)) long long __ockl_wfred_or_i64(long long);
-extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_or_u64(unsigned long long);
+extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_or_u64(
+    unsigned long long);

 extern "C" __device__ __attribute__((const)) int __ockl_wfred_xor_i32(int);
 extern "C" __device__ __attribute__((const)) long long __ockl_wfred_xor_i64(long long);
-extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_xor_u64(unsigned long long);
+extern "C" __device__ __attribute__((const)) unsigned long long __ockl_wfred_xor_u64(
+    unsigned long long);

 #endif

-template <typename T>
-__device__ inline
-T __hip_readfirstlane(T val) {
+template <typename T> __device__ inline T __hip_readfirstlane(T val) {
  // In theory, behaviour is undefined when reading from a union member other
  // than the member that was last assigned to, but it works in practice because
  // we rely on the compiler to do the reasonable thing.
@@ -92,8 +96,7 @@ T __hip_readfirstlane(T val) {
  // NOTE: The builtin returns int, so we first cast it to unsigned int and only
  // then extend it to 64 bits.
  unsigned long long lower = (unsigned)__builtin_amdgcn_readfirstlane(u.l);
-  unsigned long long upper =
-      (unsigned)__builtin_amdgcn_readfirstlane(u.l >> 32);
+  unsigned long long upper = (unsigned)__builtin_amdgcn_readfirstlane(u.l >> 32);
  u.l = (upper << 32) | lower;
  return u.d;
 }
@@ -181,10 +184,8 @@ template <typename MaskT> __device__ inline void __syncwarp(MaskT mask) {
 // __all_sync, __any_sync, __ballot_sync

 template <typename MaskT>
-__device__ inline
-unsigned long long __ballot_sync(MaskT mask, int predicate) {
-  static_assert(
-      __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+__device__ inline unsigned long long __ballot_sync(MaskT mask, int predicate) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
                "The mask must be a 64-bit integer. "
                "Implicitly promoting a smaller integer is almost always an error.");
  __hip_adjust_mask_for_wave32(mask);
@@ -192,22 +193,16 @@ unsigned long long __ballot_sync(MaskT mask, int predicate) {
  return __ballot(predicate) & mask;
 }

-template <typename MaskT>
-__device__ inline
-int __all_sync(MaskT mask, int predicate) {
-  static_assert(
-      __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+template <typename MaskT> __device__ inline int __all_sync(MaskT mask, int predicate) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
                "The mask must be a 64-bit integer. "
                "Implicitly promoting a smaller integer is almost always an error.");
  // NOTE(review): __hip_adjust_mask_for_wave32 is defined elsewhere; presumably it rewrites
  // mask for 32-lane wavefronts — confirm against its definition.
  __hip_adjust_mask_for_wave32(mask);
  // Nonzero only when the ballot result restricted to mask has every bit of mask set.
  return __ballot_sync(mask, predicate) == mask;
 }

-template <typename MaskT>
-__device__ inline
-int __any_sync(MaskT mask, int predicate) {
-  static_assert(
-      __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+template <typename MaskT> __device__ inline int __any_sync(MaskT mask, int predicate) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
                "The mask must be a 64-bit integer. "
                "Implicitly promoting a smaller integer is almost always an error.");
  __hip_adjust_mask_for_wave32(mask);
@@ -216,9 +211,7 @@ int __any_sync(MaskT mask, int predicate) {

 // __match_any, __match_all and sync variants

-template <typename T>
-__device__ inline
-unsigned long long __match_any(T value) {
+template <typename T> __device__ inline unsigned long long __match_any(T value) {
  static_assert(
      (__hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value) &&
          (sizeof(T) == 4 || sizeof(T) == 8),
@@ -241,10 +234,8 @@ unsigned long long __match_any(T value) {
 }

 template <typename MaskT, typename T>
-__device__ inline
-unsigned long long __match_any_sync(MaskT mask, T value) {
-  static_assert(
-      __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+__device__ inline unsigned long long __match_any_sync(MaskT mask, T value) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
                "The mask must be a 64-bit integer. "
                "Implicitly promoting a smaller integer is almost always an error.");
  __hip_adjust_mask_for_wave32(mask);
@@ -252,9 +243,7 @@ unsigned long long __match_any_sync(MaskT mask, T value) {
  return __match_any(value) & mask;
 }

-template <typename T>
-__device__ inline
-unsigned long long __match_all(T value, int* pred) {
+template <typename T> __device__ inline unsigned long long __match_all(T value, int* pred) {
  static_assert(
      (__hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value) &&
          (sizeof(T) == 4 || sizeof(T) == 8),
@@ -271,10 +260,8 @@ unsigned long long __match_all(T value, int* pred) {
 }

 template <typename MaskT, typename T>
-__device__ inline
-unsigned long long __match_all_sync(MaskT mask, T value, int* pred) {
-  static_assert(
-      __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+__device__ inline unsigned long long __match_all_sync(MaskT mask, T value, int* pred) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
                "The mask must be a 64-bit integer. "
                "Implicitly promoting a smaller integer is almost always an error.");
  MaskT retval = 0;
@@ -286,11 +273,8 @@ unsigned long long __match_all_sync(MaskT mask, T value, int* pred) {
 // various variants of shfl

 template <typename MaskT, typename T>
-__device__ inline
-T __shfl_sync(MaskT mask, T var, int srcLane,
-              int width = warpSize) {
-  static_assert(
-      __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+__device__ inline T __shfl_sync(MaskT mask, T var, int srcLane, int width = warpSize) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
                "The mask must be a 64-bit integer. "
                "Implicitly promoting a smaller integer is almost always an error.");
  __hip_adjust_mask_for_wave32(mask);
@@ -299,11 +283,8 @@ T __shfl_sync(MaskT mask, T var, int srcLane,
 }

 template <typename MaskT, typename T>
-__device__ inline
-T __shfl_up_sync(MaskT mask, T var, unsigned int delta,
-                                   int width = warpSize) {
-  static_assert(
-      __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+__device__ inline T __shfl_up_sync(MaskT mask, T var, unsigned int delta, int width = warpSize) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
                "The mask must be a 64-bit integer. "
                "Implicitly promoting a smaller integer is almost always an error.");
  __hip_adjust_mask_for_wave32(mask);
@@ -312,11 +293,8 @@ T __shfl_up_sync(MaskT mask, T var, unsigned int delta,
 }

 template <typename MaskT, typename T>
-__device__ inline
-T __shfl_down_sync(MaskT mask, T var, unsigned int delta,
-                                     int width = warpSize) {
-  static_assert(
-      __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+__device__ inline T __shfl_down_sync(MaskT mask, T var, unsigned int delta, int width = warpSize) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
                "The mask must be a 64-bit integer. "
                "Implicitly promoting a smaller integer is almost always an error.");
  __hip_adjust_mask_for_wave32(mask);
@@ -325,11 +303,8 @@ T __shfl_down_sync(MaskT mask, T var, unsigned int delta,
 }

 template <typename MaskT, typename T>
-__device__ inline
-T __shfl_xor_sync(MaskT mask, T var, int laneMask,
-                                    int width = warpSize) {
-  static_assert(
-      __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+__device__ inline T __shfl_xor_sync(MaskT mask, T var, int laneMask, int width = warpSize) {
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
                "The mask must be a 64-bit integer. "
                "Implicitly promoting a smaller integer is almost always an error.");
  __hip_adjust_mask_for_wave32(mask);
@@ -338,13 +313,11 @@ T __shfl_xor_sync(MaskT mask, T var, int laneMask,
 }

 template <typename MaskT, typename T, typename BinaryOp, typename WfReduce>
-__device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wfReduce)
-{
+__device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wfReduce) {
  using permuteType =
      typename __hip_internal::conditional<sizeof(T) == 4 || sizeof(T) == 2, T, unsigned int>::type;
  static constexpr auto kMaskNumBits = sizeof(MaskT) * 8;
-  static_assert(
-      __hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
+  static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
                "The mask must be a 64-bit integer. "
                "Implicitly promoting a smaller integer is almost always an error.");
  __hip_adjust_mask_for_wave32(mask);
@@ -361,9 +334,12 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
  int maskNumBits;
  int numIterations;
  // unsigned int[2] is used when T is 64-bit wide
-  typename __hip_internal::conditional<sizeof(T) == 4 || sizeof(T) == 2, permuteType, permuteType[2]>::type result, permuteResult;
+  typename __hip_internal::conditional<sizeof(T) == 4 || sizeof(T) == 2, permuteType,
+                                       permuteType[2]>::type result,
+      permuteResult;
  auto backwardPermute = [](int index, permuteType val) {
-    if constexpr (__hip_internal::is_integral<T>::value || __hip_internal::is_same<T, double>::value)
+    if constexpr (__hip_internal::is_integral<T>::value ||
+                  __hip_internal::is_same<T, double>::value)
      return __hip_ds_bpermute(index, val);
    else
      return __hip_ds_bpermutef(index, val);
@@ -372,7 +348,8 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
  __hip_check_mask(mask);
  maskNumBits = __popcll(mask);

-#ifdef __OPTIMIZE__ // at the time of this writing the ockl wfred functions do not compile when using -O0
+#ifdef __OPTIMIZE__  // at the time of this writing the ockl wfred functions do not compile when
+                     // using -O0
  if (maskNumBits == lastLane + 1)
    // this means the mask "does not have holes", and starts from 0; we can use a specific intrinsic
    // to calculate the aggregated result
@@ -419,7 +396,10 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
    }

    if constexpr (sizeof(T) == 2) {
-      union { int i; T f; } tmp;
+      union {
+        int i;
+        T f;
+      } tmp;

      tmp.f = result;
      tmp.i = __hip_ds_bpermute(nextBit << 2, tmp.i);
@@ -438,7 +418,8 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
        result = op(result, permuteResult);
      else {
        T tmp;
-        unsigned long long rhs = (static_cast<unsigned long long>(permuteResult[1]) << 32) | permuteResult[0];
+        unsigned long long rhs =
+            (static_cast<unsigned long long>(permuteResult[1]) << 32) | permuteResult[0];

        __builtin_memcpy(&tmp, &result, sizeof(T));
        tmp = op(tmp, *reinterpret_cast<T*>(&rhs));
@@ -451,7 +432,10 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
  }

  if constexpr (sizeof(T) == 2) {
-    union { int i; T f; } tmp;
+    union {
+      int i;
+      T f;
+    } tmp;
    tmp.f = result;
    tmp.i = __hip_ds_bpermute(firstLane << 2, tmp.i);
    return tmp.f;
@@ -464,9 +448,7 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
  }
 }

-template <typename MaskT>
-__device__ inline int __reduce_add_sync(MaskT mask, int val)
-{
+template <typename MaskT> __device__ inline int __reduce_add_sync(MaskT mask, int val) {
  // Although C++ has std::plus and other functors, we do not use them because
  // they are in the header <functional>, which was causing problems with hipRTC
  // at the time of writing.
@@ -477,17 +459,14 @@ __device__ inline int __reduce_add_sync(MaskT mask, int val)
 }

 template <typename MaskT>
-__device__ inline unsigned int __reduce_add_sync(MaskT mask, unsigned int val)
-{
+__device__ inline unsigned int __reduce_add_sync(MaskT mask, unsigned int val) {
  // unsigned int sum reduction: delegates to __reduce_op_sync with an addition lambda (op)
  // and a lambda wrapping __ockl_wfred_add_u32 (wfReduce).
  auto op = [](decltype(val)& a, decltype(val)& b) { return a + b; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_add_u32(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline int __reduce_min_sync(MaskT mask, int val)
-{
+template <typename MaskT> __device__ inline int __reduce_min_sync(MaskT mask, int val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs ? rhs : lhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_min_i32(v); };

@@ -495,17 +474,14 @@ __device__ inline int __reduce_min_sync(MaskT mask, int val)
 }

 template <typename MaskT>
-__device__ inline unsigned int __reduce_min_sync(MaskT mask, unsigned int val)
-{
+__device__ inline unsigned int __reduce_min_sync(MaskT mask, unsigned int val) {
  // unsigned int minimum reduction: delegates to __reduce_op_sync with a lambda that keeps
  // the smaller operand (lhs on ties) and a lambda wrapping __ockl_wfred_min_u32 (wfReduce).
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs ? rhs : lhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_min_u32(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline int __reduce_max_sync(MaskT mask, int val)
-{
+template <typename MaskT> __device__ inline int __reduce_max_sync(MaskT mask, int val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs ? rhs : lhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_max_i32(v); };

@@ -513,8 +489,7 @@ __device__ inline int __reduce_max_sync(MaskT mask, int val)
 }

 template <typename MaskT>
-__device__ inline unsigned int __reduce_max_sync(MaskT mask, unsigned int val)
-{
+__device__ inline unsigned int __reduce_max_sync(MaskT mask, unsigned int val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs ? rhs : lhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_max_u32(v); };

@@ -522,8 +497,7 @@ __device__ inline unsigned int __reduce_max_sync(MaskT mask, unsigned int val)
 }

 template <typename MaskT>
-__device__ inline unsigned int __reduce_or_sync(MaskT mask, unsigned int val)
-{
+__device__ inline unsigned int __reduce_or_sync(MaskT mask, unsigned int val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs || rhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_or_u32(v); };

@@ -531,8 +505,7 @@ __device__ inline unsigned int __reduce_or_sync(MaskT mask, unsigned int val)
 }

 template <typename MaskT>
-__device__ inline unsigned int __reduce_and_sync(MaskT mask, unsigned int val)
-{
+__device__ inline unsigned int __reduce_and_sync(MaskT mask, unsigned int val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs && rhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_and_u32(v); };

@@ -540,8 +513,7 @@ __device__ inline unsigned int __reduce_and_sync(MaskT mask, unsigned int val)
 }

 template <typename MaskT>
-__device__ inline unsigned int __reduce_xor_sync(MaskT mask, unsigned int val)
-{
+__device__ inline unsigned int __reduce_xor_sync(MaskT mask, unsigned int val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return (!lhs) != (!rhs) == 1; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_xor_u32(v); };

@@ -549,9 +521,7 @@ __device__ inline unsigned int __reduce_xor_sync(MaskT mask, unsigned int val)
 }

 #ifdef HIP_ENABLE_EXTRA_WARP_SYNC_TYPES
-template <typename MaskT>
-__device__ inline long long __reduce_add_sync(MaskT mask, long long val)
-{
+template <typename MaskT> __device__ inline long long __reduce_add_sync(MaskT mask, long long val) {
  auto op = [](decltype(val)& a, decltype(val)& b) { return a + b; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_add_i64(v); };

@@ -559,35 +529,28 @@ __device__ inline long long __reduce_add_sync(MaskT mask, long long val)
 }

 template <typename MaskT>
-__device__ inline unsigned long long __reduce_add_sync(MaskT mask, unsigned long long val)
-{
+__device__ inline unsigned long long __reduce_add_sync(MaskT mask, unsigned long long val) {
  auto op = [](decltype(val)& a, decltype(val)& b) { return a + b; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_add_u64(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline float __reduce_add_sync(MaskT mask, float val)
-{
+template <typename MaskT> __device__ inline float __reduce_add_sync(MaskT mask, float val) {
  auto op = [](decltype(val)& a, decltype(val)& b) { return a + b; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_add_f32(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline double __reduce_add_sync(MaskT mask, double val)
-{
+template <typename MaskT> __device__ inline double __reduce_add_sync(MaskT mask, double val) {
  auto op = [](decltype(val)& a, decltype(val)& b) { return a + b; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_add_f64(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline long long __reduce_min_sync(MaskT mask, long long val)
-{
+template <typename MaskT> __device__ inline long long __reduce_min_sync(MaskT mask, long long val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs ? rhs : lhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_min_i64(v); };

@@ -595,35 +558,28 @@ __device__ inline long long __reduce_min_sync(MaskT mask, long long val)
 }

 template <typename MaskT>
-__device__ inline unsigned long long __reduce_min_sync(MaskT mask, unsigned long long val)
-{
+__device__ inline unsigned long long __reduce_min_sync(MaskT mask, unsigned long long val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs ? rhs : lhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_min_u64(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline float __reduce_min_sync(MaskT mask, float val)
-{
+template <typename MaskT> __device__ inline float __reduce_min_sync(MaskT mask, float val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs ? rhs : lhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_min_f32(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline double __reduce_min_sync(MaskT mask, double val)
-{
+template <typename MaskT> __device__ inline double __reduce_min_sync(MaskT mask, double val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return rhs < lhs ? rhs : lhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_min_f64(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline long long __reduce_max_sync(MaskT mask, long long val)
-{
+template <typename MaskT> __device__ inline long long __reduce_max_sync(MaskT mask, long long val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs ? rhs : lhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_max_i64(v); };

@@ -631,44 +587,35 @@ __device__ inline long long __reduce_max_sync(MaskT mask, long long val)
 }

 template <typename MaskT>
-__device__ inline unsigned long long __reduce_max_sync(MaskT mask, unsigned long long val)
-{
+__device__ inline unsigned long long __reduce_max_sync(MaskT mask, unsigned long long val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs ? rhs : lhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_max_u64(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline float __reduce_max_sync(MaskT mask, float val)
-{
+template <typename MaskT> __device__ inline float __reduce_max_sync(MaskT mask, float val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs ? rhs : lhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_max_f32(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline double __reduce_max_sync(MaskT mask, double val)
-{
+template <typename MaskT> __device__ inline double __reduce_max_sync(MaskT mask, double val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs < rhs ? rhs : lhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_max_f64(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline int __reduce_and_sync(MaskT mask, int val)
-{
+template <typename MaskT> __device__ inline int __reduce_and_sync(MaskT mask, int val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs && rhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_and_i32(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline long long __reduce_and_sync(MaskT mask, long long val)
-{
+template <typename MaskT> __device__ inline long long __reduce_and_sync(MaskT mask, long long val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs && rhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_and_i64(v); };

@@ -676,26 +623,21 @@ __device__ inline long long __reduce_and_sync(MaskT mask, long long val)
 }

 template <typename MaskT>
-__device__ inline unsigned long long __reduce_and_sync(MaskT mask, unsigned long long val)
-{
+__device__ inline unsigned long long __reduce_and_sync(MaskT mask, unsigned long long val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs && rhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_and_u64(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline int __reduce_or_sync(MaskT mask, int val)
-{
+template <typename MaskT> __device__ inline int __reduce_or_sync(MaskT mask, int val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs || rhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_or_i32(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline long long __reduce_or_sync(MaskT mask, long long val)
-{
+template <typename MaskT> __device__ inline long long __reduce_or_sync(MaskT mask, long long val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs || rhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_or_i64(v); };

@@ -703,26 +645,21 @@ __device__ inline long long __reduce_or_sync(MaskT mask, long long val)
 }

 template <typename MaskT>
-__device__ inline unsigned long long __reduce_or_sync(MaskT mask, unsigned long long val)
-{
+__device__ inline unsigned long long __reduce_or_sync(MaskT mask, unsigned long long val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return lhs || rhs; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_or_u64(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline int __reduce_xor_sync(MaskT mask, int val)
-{
+template <typename MaskT> __device__ inline int __reduce_xor_sync(MaskT mask, int val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return (!lhs) != (!rhs) == 1; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_xor_i32(v); };

  return __reduce_op_sync(mask, val, op, wfReduce);
 }

-template <typename MaskT>
-__device__ inline long long __reduce_xor_sync(MaskT mask, long long val)
-{
+template <typename MaskT> __device__ inline long long __reduce_xor_sync(MaskT mask, long long val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return (!lhs) != (!rhs) == 1; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_xor_i64(v); };

@@ -730,8 +667,7 @@ __device__ inline long long __reduce_xor_sync(MaskT mask, long long val)
 }

 template <typename MaskT>
-__device__ inline unsigned long long __reduce_xor_sync(MaskT mask, unsigned long long val)
-{
+__device__ inline unsigned long long __reduce_xor_sync(MaskT mask, unsigned long long val) {
  auto op = [](decltype(val) lhs, decltype(val) rhs) { return (!lhs) != (!rhs) == 1; };
  auto wfReduce = [](decltype(val) v) { return __ockl_wfred_xor_u64(v); };

@@ -111,13 +111,14 @@ extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a
 extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);

 extern "C" __device__ __hip_uint64_t __ockl_fprintf_stderr_begin();
-extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_args(__hip_uint64_t msg_desc, __hip_uint32_t num_args,
-                                                          __hip_uint64_t value0, __hip_uint64_t value1,
-                                                          __hip_uint64_t value2, __hip_uint64_t value3,
-                                                          __hip_uint64_t value4, __hip_uint64_t value5,
+extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_args(
+    __hip_uint64_t msg_desc, __hip_uint32_t num_args, __hip_uint64_t value0, __hip_uint64_t value1,
+    __hip_uint64_t value2, __hip_uint64_t value3, __hip_uint64_t value4, __hip_uint64_t value5,
    __hip_uint64_t value6, __hip_uint32_t is_last);
-extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_string_n(__hip_uint64_t msg_desc, const char* data,
-                                                              __hip_uint64_t length, __hip_uint32_t is_last);
+extern "C" __device__ __hip_uint64_t __ockl_fprintf_append_string_n(__hip_uint64_t msg_desc,
+                                                                    const char* data,
+                                                                    __hip_uint64_t length,
+                                                                    __hip_uint32_t is_last);

 // Introduce local address space
 #define __local __attribute__((address_space(3)))
@@ -37,15 +37,12 @@ THE SOFTWARE.
 hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
                                               unsigned int flags, hip_impl::program_state& ps);

-hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
-                                    dim3 blockDim, void** args,
+hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim, void** args,
                                      size_t sharedMem, hipStream_t stream,
                                      hip_impl::program_state& ps);

-hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
-                                                 int  numDevices,
-                                                 unsigned int flags,
-                                                 hip_impl::program_state& ps);
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
+                                                 unsigned int flags, hip_impl::program_state& ps);

 #pragma GCC visibility push(hidden)

@@ -56,51 +53,39 @@ inline T round_up_to_next_multiple_nonnegative(T x, T y) {
  return tmp - tmp % y;
 }

-template <
-    std::size_t n,
-    typename... Ts,
+template <std::size_t n, typename... Ts,
          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
-inline hip_impl::kernarg make_kernarg(
-    const std::tuple<Ts...>&,
-    const kernargs_size_align&,
+inline hip_impl::kernarg make_kernarg(const std::tuple<Ts...>&, const kernargs_size_align&,
                                      hip_impl::kernarg kernarg) {
  return kernarg;
 }

-template <
-    std::size_t n,
-    typename... Ts,
+template <std::size_t n, typename... Ts,
          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
-inline hip_impl::kernarg make_kernarg(
-    const std::tuple<Ts...>& formals,
+inline hip_impl::kernarg make_kernarg(const std::tuple<Ts...>& formals,
                                      const kernargs_size_align& size_align,
                                      hip_impl::kernarg kernarg) {
  using T = typename std::tuple_element<n, std::tuple<Ts...>>::type;

-    static_assert(
-        !std::is_reference<T>{},
+  static_assert(!std::is_reference<T>{},
                "A __global__ function cannot have a reference as one of its "
                "arguments.");
 #if defined(HIP_STRICT)
-        static_assert(
-            std::is_trivially_copyable<T>{},
+  static_assert(std::is_trivially_copyable<T>{},
                "Only TriviallyCopyable types can be arguments to a __global__ "
                "function");
 #endif

-    kernarg.resize(round_up_to_next_multiple_nonnegative(
-        kernarg.size(), size_align.alignment(n)) + size_align.size(n));
+  kernarg.resize(round_up_to_next_multiple_nonnegative(kernarg.size(), size_align.alignment(n)) +
+                 size_align.size(n));

-    std::memcpy(
-        kernarg.data() + kernarg.size() - size_align.size(n),
-        &std::get<n>(formals),
+  std::memcpy(kernarg.data() + kernarg.size() - size_align.size(n), &std::get<n>(formals),
              size_align.size(n));
  return make_kernarg<n + 1>(formals, size_align, std::move(kernarg));
 }

 template <typename... Formals, typename... Actuals>
-inline hip_impl::kernarg make_kernarg(
-    void (*kernel)(Formals...), std::tuple<Actuals...> actuals) {
+inline hip_impl::kernarg make_kernarg(void (*kernel)(Formals...), std::tuple<Actuals...> actuals) {
  static_assert(sizeof...(Formals) == sizeof...(Actuals),
                "The count of formal arguments must match the count of actuals.");

@@ -112,54 +97,45 @@ inline hip_impl::kernarg make_kernarg(

  auto& ps = hip_impl::get_program_state();
  return make_kernarg<0>(to_formals,
-                           ps.get_kernargs_size_align(
-                               reinterpret_cast<std::uintptr_t>(kernel)),
+                         ps.get_kernargs_size_align(reinterpret_cast<std::uintptr_t>(kernel)),
                         std::move(kernarg));
 }


 HIP_INTERNAL_EXPORTED_API hsa_agent_t target_agent(hipStream_t stream);

-inline
-__attribute__((visibility("hidden")))
-void hipLaunchKernelGGLImpl(
-    std::uintptr_t function_address,
-    const dim3& numBlocks,
-    const dim3& dimBlocks,
-    std::uint32_t sharedMemBytes,
-    hipStream_t stream,
-    void** kernarg) {
+inline __attribute__((visibility("hidden"))) void hipLaunchKernelGGLImpl(
+    std::uintptr_t function_address, const dim3& numBlocks, const dim3& dimBlocks,
+    std::uint32_t sharedMemBytes, hipStream_t stream, void** kernarg) {
+  const auto& kd =
+      hip_impl::get_program_state().kernel_descriptor(function_address, target_agent(stream));

-    const auto& kd = hip_impl::get_program_state().kernel_descriptor(function_address,
-                                                               target_agent(stream));
-
-    hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z,
-                          dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes,
-                          stream, nullptr, kernarg);
+  hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z, dimBlocks.x, dimBlocks.y,
+                        dimBlocks.z, sharedMemBytes, stream, nullptr, kernarg);
 }
 }  // Namespace hip_impl.


 template <class T>
-inline
-hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
-    T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0) {
-
+inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, T kernel,
+                                                    size_t dynSharedMemPerBlk = 0,
+                                                    int blockSizeLimit = 0) {
  using namespace hip_impl;

  hip_impl::hip_init();
  auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
                                                 target_agent(0));

-    return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f,
-                                      dynSharedMemPerBlk, blockSizeLimit);
+  return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, dynSharedMemPerBlk,
+                                                 blockSizeLimit);
 }

 template <class T>
-inline
-hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
-    T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0, unsigned int  flags = 0 ) {
-
+inline hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
+                                                             T kernel,
+                                                             size_t dynSharedMemPerBlk = 0,
+                                                             int blockSizeLimit = 0,
+                                                             unsigned int flags = 0) {
  using namespace hip_impl;

  hip_impl::hip_init();
@@ -167,49 +143,35 @@ hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockS
  auto f = get_program_state().kernel_descriptor(reinterpret_cast<std::uintptr_t>(kernel),
                                                 target_agent(0));

-    return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f,
-                                      dynSharedMemPerBlk, blockSizeLimit);
+  return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, dynSharedMemPerBlk,
+                                                 blockSizeLimit);
 }

 template <typename... Args, typename F = void (*)(Args...)>
-inline
-void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
-                        std::uint32_t sharedMemBytes, hipStream_t stream,
-                        Args... args) {
+inline void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+                               std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
  hip_impl::hip_init();
  auto kernarg = hip_impl::make_kernarg(kernel, std::tuple<Args...>{std::move(args)...});
  std::size_t kernarg_size = kernarg.size();

-    void* config[]{
-        HIP_LAUNCH_PARAM_BUFFER_POINTER,
-        kernarg.data(),
-        HIP_LAUNCH_PARAM_BUFFER_SIZE,
-        &kernarg_size,
-        HIP_LAUNCH_PARAM_END};
+  void* config[]{HIP_LAUNCH_PARAM_BUFFER_POINTER, kernarg.data(), HIP_LAUNCH_PARAM_BUFFER_SIZE,
+                 &kernarg_size, HIP_LAUNCH_PARAM_END};

-    hip_impl::hipLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel),
-                                     numBlocks, dimBlocks, sharedMemBytes,
-                                     stream, &config[0]);
+  hip_impl::hipLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel), numBlocks, dimBlocks,
+                                   sharedMemBytes, stream, &config[0]);
 }

 template <typename F>
-inline
-__attribute__((visibility("hidden")))
-hipError_t hipLaunchCooperativeKernel(F f, dim3 gridDim, dim3 blockDim,
-                                      void** args, size_t sharedMem,
-                                      hipStream_t stream) {
+inline __attribute__((visibility("hidden"))) hipError_t hipLaunchCooperativeKernel(
+    F f, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, hipStream_t stream) {
  hip_impl::hip_init();
  auto& ps = hip_impl::get_program_state();
-    return hipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim,
-                                      blockDim, args, sharedMem, stream, ps);
+  return hipLaunchCooperativeKernel(reinterpret_cast<void*>(f), gridDim, blockDim, args, sharedMem,
+                                    stream, ps);
 }

-inline
-__attribute__((visibility("hidden")))
-hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
-                                                 int  numDevices,
-                                                 unsigned int  flags) {
-
+inline __attribute__((visibility("hidden"))) hipError_t hipLaunchCooperativeKernelMultiDevice(
+    hipLaunchParams* launchParamsList, int numDevices, unsigned int flags) {
  hip_impl::hip_init();
  auto& ps = hip_impl::get_program_state();
  return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags, ps);
@@ -10,12 +10,11 @@
 namespace hc {
 class completion_future;
 class accelerator_view;
-}
+}  // namespace hc


 // 3 dim structure for groups and grids.
-typedef struct gl_dim3
-{
+typedef struct gl_dim3 {
  int x, y, z;
  gl_dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z) {};
 } gl_dim3;
@@ -28,8 +27,7 @@ typedef enum gl_barrier_bit {


 // grid_launch_parm contains information used to launch the kernel.
-typedef struct grid_launch_parm
-{
+typedef struct grid_launch_parm {
  //! Grid dimensions
  gl_dim3 grid_dim;

@@ -3,14 +3,12 @@
 #include "grid_launch.h"
 #include "hc.hpp"

-class grid_launch_parm_cxx : public grid_launch_parm
-{
+class grid_launch_parm_cxx : public grid_launch_parm {
 public:
  grid_launch_parm_cxx() = default;

  // customized serialization: don't need av and cf in kernel
-  __attribute__((annotate("serialize")))
-  void __cxxamp_serialize(Kalmar::Serialize& s) const {
+  __attribute__((annotate("serialize"))) void __cxxamp_serialize(Kalmar::Serialize& s) const {
    s.Append(sizeof(int), &grid_dim.x);
    s.Append(sizeof(int), &grid_dim.y);
    s.Append(sizeof(int), &grid_dim.z);
@@ -19,9 +17,11 @@ public:
    s.Append(sizeof(int), &group_dim.z);
  }

-  __attribute__((annotate("user_deserialize")))
-  grid_launch_parm_cxx(int grid_dim_x,  int grid_dim_y,  int grid_dim_z,
-                   int group_dim_x, int group_dim_y, int group_dim_z) {
+  __attribute__((annotate("user_deserialize"))) grid_launch_parm_cxx(int grid_dim_x, int grid_dim_y,
+                                                                     int grid_dim_z,
+                                                                     int group_dim_x,
+                                                                     int group_dim_y,
+                                                                     int group_dim_z) {
    grid_dim.x = grid_dim_x;
    grid_dim.y = grid_dim_y;
    grid_dim.z = grid_dim_z;
@@ -47,4 +47,3 @@ extern inline void grid_launch_init(grid_launch_parm *lp) {
  lp->av = &av;
  lp->cf = NULL;
 }
-
@@ -50,40 +50,34 @@ THE SOFTWARE.
 namespace std {  // TODO: these should be removed as soon as possible.
 #if (__cplusplus < 201406L)
 #if (__cplusplus < 201402L)
-template <bool cond, typename T = void>
-using enable_if_t = typename enable_if<cond, T>::type;
+template <bool cond, typename T = void> using enable_if_t = typename enable_if<cond, T>::type;
 template <bool cond, typename T, typename U>
 using conditional_t = typename conditional<cond, T, U>::type;
-template <typename T>
-using decay_t = typename decay<T>::type;
+template <typename T> using decay_t = typename decay<T>::type;
 template <FunctionalProcedure F, typename... Ts>
 using result_of_t = typename result_of<F(Ts...)>::type;
-template <typename T>
-using remove_reference_t = typename remove_reference<T>::type;
+template <typename T> using remove_reference_t = typename remove_reference<T>::type;
 #endif
 #endif
 }  // namespace std

 namespace hip_impl {
-template <typename...>
-using void_t_ = void;
+template <typename...> using void_t_ = void;

 #if HIP_HAS_INVOCABLE
-template <typename, typename = void>
-struct is_callable_impl;
+template <typename, typename = void> struct is_callable_impl;

 template <FunctionalProcedure F, typename... Ts>
 struct is_callable_impl<F(Ts...)> : std::is_invocable<F, Ts...> {};
 #elif HIP_HAS_RESULT_OF_SFINAE
-template <typename, typename = void>
-struct is_callable_impl : std::false_type {};
+template <typename, typename = void> struct is_callable_impl : std::false_type {};

 template <FunctionalProcedure F, typename... Ts>
-struct is_callable_impl<F(Ts...), void_t_<typename std::result_of<F(Ts...)>::type > > : std::true_type {};
+struct is_callable_impl<F(Ts...), void_t_<typename std::result_of<F(Ts...)>::type> >
+    : std::true_type {};
 #else
 template <class Base, class T, class Derived>
-auto simple_invoke(T Base::*pmd, Derived&& ref)
-> decltype(static_cast<Derived&&>(ref).*pmd);
+auto simple_invoke(T Base::* pmd, Derived&& ref) -> decltype(static_cast<Derived&&>(ref).*pmd);

 template <class PMD, class Pointer>
 auto simple_invoke(PMD&& pmd, Pointer&& ptr)
@@ -99,36 +93,34 @@ auto simple_invoke(T Base::*pmf, Derived&& ref, Args&&... args)

 template <class PMF, class Pointer, class... Args>
 auto simple_invoke(PMF&& pmf, Pointer&& ptr, Args&&... args)
-> decltype(((*static_cast<Pointer&&>(ptr)).*static_cast<PMF&&>(pmf))(static_cast<Args&&>(args)...));
+    -> decltype(((*static_cast<Pointer&&>(ptr)).*
+                 static_cast<PMF&&>(pmf))(static_cast<Args&&>(args)...));

 template <class Base, class T, class Derived, class... Args>
 auto simple_invoke(T Base::* pmf, const std::reference_wrapper<Derived>& ref, Args&&... args)
    -> decltype((ref.get().*pmf)(static_cast<Args&&>(args)...));

 template <class F, class... Ts>
-auto simple_invoke(F&& f, Ts&&... xs)
-> decltype(f(static_cast<Ts&&>(xs)...));
+auto simple_invoke(F&& f, Ts&&... xs) -> decltype(f(static_cast<Ts&&>(xs)...));

-template <typename, typename = void>
-struct is_callable_impl : std::false_type {};
+template <typename, typename = void> struct is_callable_impl : std::false_type {};

 template <FunctionalProcedure F, typename... Ts>
-struct is_callable_impl<F(Ts...), void_t_<decltype(simple_invoke(std::declval<F>(), std::declval<Ts>()...))> >
+struct is_callable_impl<F(Ts...),
+                        void_t_<decltype(simple_invoke(std::declval<F>(), std::declval<Ts>()...))> >
    : std::true_type {};

 #endif

-template <typename Call>
-struct is_callable : is_callable_impl<Call> {};
+template <typename Call> struct is_callable : is_callable_impl<Call> {};

 #define count_macro_args_impl_hip_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13,     \
                                   _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25,     \
                                   _26, _27, _28, _29, _30, _31, _n, ...)                          \
  _n
 #define count_macro_args_hip_(...)                                                                 \
-    count_macro_args_impl_hip_(, ##__VA_ARGS__, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20,    \
-                               19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,  \
-                               0)
+  count_macro_args_impl_hip_(, ##__VA_ARGS__, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19,  \
+                             18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)

 #define overloaded_macro_expand_hip_(macro, arg_cnt) macro##arg_cnt
 #define overload_macro_impl_hip_(macro, arg_cnt) overloaded_macro_expand_hip_(macro, arg_cnt)
@@ -78,12 +78,12 @@ typedef void (*t___hipRegisterFunction)(void** modules, const void* hostFunction
                                        dim3* blockDim, dim3* gridDim, int* wSize);
 typedef void (*t___hipRegisterManagedVar)(void* hipModule, void** pointer, void* init_value,
                                          const char* name, size_t size, unsigned align);
-typedef void (*t___hipRegisterSurface)(void** modules, void* var, char* hostVar,
-                                       char* deviceVar, int type, int ext);
-typedef void (*t___hipRegisterTexture)(void** modules, void* var, char* hostVar,
-                                       char* deviceVar, int type, int norm, int ext);
-typedef void (*t___hipRegisterVar)(void** modules, void* var, char* hostVar,
-                                   char* deviceVar, int ext, size_t size, int constant, int global);
+typedef void (*t___hipRegisterSurface)(void** modules, void* var, char* hostVar, char* deviceVar,
+                                       int type, int ext);
+typedef void (*t___hipRegisterTexture)(void** modules, void* var, char* hostVar, char* deviceVar,
+                                       int type, int norm, int ext);
+typedef void (*t___hipRegisterVar)(void** modules, void* var, char* hostVar, char* deviceVar,
+                                   int ext, size_t size, int constant, int global);
 typedef void (*t___hipUnregisterFatBinary)(void** modules);

 // HIP tools dispatch functions
@@ -666,7 +666,8 @@ typedef hipError_t (*t_hipLinkAddData)(hipLinkState_t state, hipJitInputType typ
                                       size_t size, const char* name, unsigned int numOptions,
                                       hipJitOption* options, void** optionValues);
 typedef hipError_t (*t_hipLinkAddFile)(hipLinkState_t state, hipJitInputType type, const char* path,
-                          unsigned int numOptions, hipJitOption* options, void** optionValues);
+                                       unsigned int numOptions, hipJitOption* options,
+                                       void** optionValues);
 typedef hipError_t (*t_hipLinkComplete)(hipLinkState_t state, void** hipBinOut, size_t* sizeOut);
 typedef hipError_t (*t_hipLinkCreate)(unsigned int numOptions, hipJitOption* options,
                                      void** optionValues, hipLinkState_t* stateOut);
@@ -934,29 +935,28 @@ typedef hipError_t (*t_hipHccModuleLaunchKernel)(hipFunction_t f, uint32_t globa
                                                 hipEvent_t stopEvent);
 typedef int (*t_hipGetStreamDeviceId)(hipStream_t stream);
 typedef hipError_t (*t_hipDrvGraphAddMemsetNode)(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
-                                 const hipGraphNode_t* dependencies, size_t numDependencies,
+                                                 const hipGraphNode_t* dependencies,
+                                                 size_t numDependencies,
                                                 const hipMemsetParams* memsetParams, hipCtx_t ctx);
-typedef hipError_t (*t_hipGraphAddExternalSemaphoresWaitNode)(hipGraphNode_t* pGraphNode,
-                               hipGraph_t graph, const hipGraphNode_t* pDependencies,
-                               size_t numDependencies,
-                               const hipExternalSemaphoreWaitNodeParams* nodeParams);
-typedef hipError_t (*t_hipGraphAddExternalSemaphoresSignalNode)(hipGraphNode_t* pGraphNode,
-                               hipGraph_t graph, const hipGraphNode_t* pDependencies,
-                               size_t numDependencies,
+typedef hipError_t (*t_hipGraphAddExternalSemaphoresWaitNode)(
+    hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies,
+    size_t numDependencies, const hipExternalSemaphoreWaitNodeParams* nodeParams);
+typedef hipError_t (*t_hipGraphAddExternalSemaphoresSignalNode)(
+    hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies,
+    size_t numDependencies, const hipExternalSemaphoreSignalNodeParams* nodeParams);
+typedef hipError_t (*t_hipGraphExternalSemaphoresSignalNodeSetParams)(
+    hipGraphNode_t hNode, const hipExternalSemaphoreSignalNodeParams* nodeParams);
+typedef hipError_t (*t_hipGraphExternalSemaphoresWaitNodeSetParams)(
+    hipGraphNode_t hNode, const hipExternalSemaphoreWaitNodeParams* nodeParams);
+typedef hipError_t (*t_hipGraphExternalSemaphoresSignalNodeGetParams)(
+    hipGraphNode_t hNode, hipExternalSemaphoreSignalNodeParams* params_out);
+typedef hipError_t (*t_hipGraphExternalSemaphoresWaitNodeGetParams)(
+    hipGraphNode_t hNode, hipExternalSemaphoreWaitNodeParams* params_out);
+typedef hipError_t (*t_hipGraphExecExternalSemaphoresSignalNodeSetParams)(
+    hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
    const hipExternalSemaphoreSignalNodeParams* nodeParams);
-typedef hipError_t (*t_hipGraphExternalSemaphoresSignalNodeSetParams)(hipGraphNode_t hNode,
-                                            const hipExternalSemaphoreSignalNodeParams* nodeParams);
-typedef hipError_t (*t_hipGraphExternalSemaphoresWaitNodeSetParams)(hipGraphNode_t hNode,
-                                            const hipExternalSemaphoreWaitNodeParams* nodeParams);
-typedef hipError_t (*t_hipGraphExternalSemaphoresSignalNodeGetParams)(hipGraphNode_t hNode,
-                                            hipExternalSemaphoreSignalNodeParams* params_out);
-typedef hipError_t (*t_hipGraphExternalSemaphoresWaitNodeGetParams)(hipGraphNode_t hNode,
-                                            hipExternalSemaphoreWaitNodeParams* params_out);
-typedef hipError_t (*t_hipGraphExecExternalSemaphoresSignalNodeSetParams)(hipGraphExec_t hGraphExec,
-                                            hipGraphNode_t hNode,
-                                            const hipExternalSemaphoreSignalNodeParams* nodeParams);
-typedef hipError_t (*t_hipGraphExecExternalSemaphoresWaitNodeSetParams)(hipGraphExec_t hGraphExec,
-                                            hipGraphNode_t hNode,
+typedef hipError_t (*t_hipGraphExecExternalSemaphoresWaitNodeSetParams)(
+    hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
    const hipExternalSemaphoreWaitNodeParams* nodeParams);
 typedef hipError_t (*t_hipGraphAddNode)(hipGraphNode_t* pGraphNode, hipGraph_t graph,
                                        const hipGraphNode_t* pDependencies, size_t numDependencies,
@@ -971,7 +971,8 @@ typedef hipError_t (*t_hipTexRefGetArray)(hipArray_t* pArray, const textureRefer
 typedef hipError_t (*t_hipTexRefGetBorderColor)(float* pBorderColor,
                                                const textureReference* texRef);
 typedef hipError_t (*t_hipTexRefGetArray)(hipArray_t* pArray, const textureReference* texRef);
-typedef hipError_t (*t_hipGetProcAddress)(const char* symbol, void** pfn, int  hipVersion, uint64_t flags,
+typedef hipError_t (*t_hipGetProcAddress)(const char* symbol, void** pfn, int hipVersion,
+                                          uint64_t flags,
                                          hipDriverProcAddressQueryResult* symbolStatus);
 typedef hipError_t (*t_hipStreamBeginCaptureToGraph)(hipStream_t stream, hipGraph_t graph,
                                                     const hipGraphNode_t* dependencies,
@@ -980,15 +981,17 @@ typedef hipError_t (*t_hipStreamBeginCaptureToGraph)(hipStream_t stream, hipGrap
                                                     hipStreamCaptureMode mode);
 typedef hipError_t (*t_hipGetFuncBySymbol)(hipFunction_t* functionPtr, const void* symbolPtr);
 typedef hipError_t (*t_hipDrvGraphAddMemFreeNode)(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
-                                  const hipGraphNode_t* dependencies, size_t numDependencies,
-                                  hipDeviceptr_t dptr);
+                                                  const hipGraphNode_t* dependencies,
+                                                  size_t numDependencies, hipDeviceptr_t dptr);

 typedef hipError_t (*t_hipDrvGraphExecMemcpyNodeSetParams)(hipGraphExec_t hGraphExec,
-                                   hipGraphNode_t hNode, const HIP_MEMCPY3D* copyParams,
+                                                           hipGraphNode_t hNode,
+                                                           const HIP_MEMCPY3D* copyParams,
                                                           hipCtx_t ctx);

 typedef hipError_t (*t_hipDrvGraphExecMemsetNodeSetParams)(hipGraphExec_t hGraphExec,
-                                   hipGraphNode_t hNode, const hipMemsetParams* memsetParams,
+                                                           hipGraphNode_t hNode,
+                                                           const hipMemsetParams* memsetParams,
                                                           hipCtx_t ctx);
 typedef hipError_t (*t_hipSetValidDevices)(int* device_arr, int len);
 typedef hipError_t (*t_hipMemcpyAtoD)(hipDeviceptr_t dstDevice, hipArray_t srcArray,
@@ -1014,7 +1017,6 @@ typedef hipError_t (*t_hipGraphExecNodeSetParams)(hipGraphExec_t graphExec, hipG
                                                  hipGraphNodeParams* nodeParams);


-
 typedef hipError_t (*t_hipExternalMemoryGetMappedMipmappedArray)(
    hipMipmappedArray_t* mipmap, hipExternalMemory_t extMem,
    const hipExternalMemoryMipmappedArrayDesc* mipmapDesc);
@@ -1024,8 +1026,7 @@ typedef hipError_t (*t_hipDrvGraphMemcpyNodeGetParams)(hipGraphNode_t hNode,
 typedef hipError_t (*t_hipDrvGraphMemcpyNodeSetParams)(hipGraphNode_t hNode,
                                                       const HIP_MEMCPY3D* nodeParams);

-typedef hipError_t (*t_hipExtHostAlloc)(void **ptr, size_t size,
-                                         unsigned int flags);
+typedef hipError_t (*t_hipExtHostAlloc)(void** ptr, size_t size, unsigned int flags);

 typedef hipError_t (*t_hipDeviceGetTexture1DLinearMaxWidth)(size_t* maxWidthInElements,
                                                            const hipChannelFormatDesc* fmtDesc,
@@ -1041,7 +1042,8 @@ typedef hipError_t (*t_hipGraphBatchMemOpNodeSetParams)(hipGraphNode_t hNode,
                                                        hipBatchMemOpNodeParams* nodeParams);
 typedef hipError_t (*t_hipGraphExecBatchMemOpNodeSetParams)(
    hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const hipBatchMemOpNodeParams* nodeParams);
-typedef hipError_t (*t_hipEventRecordWithFlags)(hipEvent_t event, hipStream_t stream, unsigned int flags);
+typedef hipError_t (*t_hipEventRecordWithFlags)(hipEvent_t event, hipStream_t stream,
+                                                unsigned int flags);
 typedef hipError_t (*t_hipLaunchKernelExC)(const hipLaunchConfig_t* config, const void* fPtr,
                                           void** args);
 typedef hipError_t (*t_hipDrvLaunchKernelEx)(const HIP_LAUNCH_CONFIG* config, hipFunction_t f,
@@ -1559,8 +1561,10 @@ struct HipDispatchTable {
  t_hipGraphExternalSemaphoresWaitNodeSetParams hipGraphExternalSemaphoresWaitNodeSetParams_fn;
  t_hipGraphExternalSemaphoresSignalNodeGetParams hipGraphExternalSemaphoresSignalNodeGetParams_fn;
  t_hipGraphExternalSemaphoresWaitNodeGetParams hipGraphExternalSemaphoresWaitNodeGetParams_fn;
-  t_hipGraphExecExternalSemaphoresSignalNodeSetParams hipGraphExecExternalSemaphoresSignalNodeSetParams_fn;
-  t_hipGraphExecExternalSemaphoresWaitNodeSetParams hipGraphExecExternalSemaphoresWaitNodeSetParams_fn;
+  t_hipGraphExecExternalSemaphoresSignalNodeSetParams
+      hipGraphExecExternalSemaphoresSignalNodeSetParams_fn;
+  t_hipGraphExecExternalSemaphoresWaitNodeSetParams
+      hipGraphExecExternalSemaphoresWaitNodeSetParams_fn;
  t_hipGraphAddNode hipGraphAddNode_fn;
  t_hipGraphInstantiateWithParams hipGraphInstantiateWithParams_fn;
  t_hipExtGetLastError hipExtGetLastError_fn;
@@ -25,10 +25,7 @@ THE SOFTWARE.
 #if defined(__clang__) and defined(__HIP__)

 // abort
-extern "C" __device__ inline __attribute__((weak))
-void abort() {
-  __builtin_trap();
-}
+extern "C" __device__ inline __attribute__((weak)) void abort() { __builtin_trap(); }

 // The noinline attribute helps encapsulate the printf expansion,
 // which otherwise has a performance impact just by increasing the
@@ -36,18 +33,14 @@ void abort() {
 // allows the function to exist as a global although its definition is
 // included in every compilation unit.
 #if defined(_WIN32) || defined(_WIN64)
-extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
-void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) {
+extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) void _wassert(
+    const wchar_t* _msg, const wchar_t* _file, unsigned _line) {
  // FIXME: Need `wchar_t` support to generate assertion message.
  __builtin_trap();
 }
 #else /* defined(_WIN32) || defined(_WIN64) */
-extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
-void __assert_fail(const char *assertion,
-                   const char *file,
-                   unsigned int line,
-                   const char *function)
-{
+extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) void __assert_fail(
+    const char* assertion, const char* file, unsigned int line, const char* function) {
  const char fmt[] = "%s:%u: %s: Device-side assertion `%s' failed.\n";

  // strlen is not available as a built-in yet, so we create our own
@@ -84,9 +77,7 @@ void __assert_fail(const char *assertion,
  __builtin_trap();
 }

-extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
-void __assertfail()
-{
+extern "C" __device__ __attribute__((noinline)) __attribute__((weak)) void __assertfail() {
  // ignore all the args for now.
  __builtin_trap();
 }
@@ -97,8 +88,7 @@ void __assertfail()
 #else
 #define __hip_assert(COND)                                                                         \
  do {                                                                                             \
-    if (!(COND))                                    \
-      __builtin_trap();                             \
+    if (!(COND)) __builtin_trap();                                                                 \
  } while (0)
 #endif

@@ -63,12 +63,12 @@ template <unsigned int size>
 using is_valid_wavefront = __hip_internal::integral_constant<bool, size <= 64>;

 template <unsigned int size>
-using is_valid_tile_size =
-    __hip_internal::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;
+using is_valid_tile_size = __hip_internal::integral_constant<
+    bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;

 template <typename T>
-using is_valid_type =
-    __hip_internal::integral_constant<bool, __hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value>;
+using is_valid_type = __hip_internal::integral_constant<
+    bool, __hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value>;

 namespace internal {

@@ -110,8 +110,8 @@ namespace helper {
 *           | | | |  | | | |
 * output:    1   1    0   0
 */
-__CG_STATIC_QUALIFIER__ unsigned long long adjust_mask(
-    unsigned long long base_mask, unsigned long long input_mask) {
+__CG_STATIC_QUALIFIER__ unsigned long long adjust_mask(unsigned long long base_mask,
+                                                       unsigned long long input_mask) {
  unsigned long long out = 0;
  for (unsigned int i = 0, index = 0; i < warpSize; i++) {
    auto lane_active = base_mask & (1ull << i);
@@ -133,15 +133,20 @@ __CG_STATIC_QUALIFIER__ unsigned long long adjust_mask(
 namespace multi_grid {

 __CG_STATIC_QUALIFIER__ __hip_uint32_t num_grids() {
-  return static_cast<__hip_uint32_t>(__ockl_multi_grid_num_grids()); }
+  return static_cast<__hip_uint32_t>(__ockl_multi_grid_num_grids());
+}

 __CG_STATIC_QUALIFIER__ __hip_uint32_t grid_rank() {
-  return static_cast<__hip_uint32_t>(__ockl_multi_grid_grid_rank()); }
+  return static_cast<__hip_uint32_t>(__ockl_multi_grid_grid_rank());
+}

-__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() { return static_cast<__hip_uint32_t>(__ockl_multi_grid_size()); }
+__CG_STATIC_QUALIFIER__ __hip_uint32_t num_threads() {
+  return static_cast<__hip_uint32_t>(__ockl_multi_grid_size());
+}

 __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
-  return static_cast<__hip_uint32_t>(__ockl_multi_grid_thread_rank()); }
+  return static_cast<__hip_uint32_t>(__ockl_multi_grid_thread_rank());
+}

 __CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }

@@ -171,8 +176,8 @@ __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
      static_cast<__hip_uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));

  // Compute thread local rank within current workgroup
-  __hip_uint32_t local_thread_rank = static_cast<__hip_uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
-                                          (threadIdx.y * blockDim.x) + (threadIdx.x));
+  __hip_uint32_t local_thread_rank = static_cast<__hip_uint32_t>(
+      (threadIdx.z * blockDim.y * blockDim.x) + (threadIdx.y * blockDim.x) + (threadIdx.x));

  return (num_threads_till_current_workgroup + local_thread_rank);
 }
@@ -209,9 +214,7 @@ __CG_STATIC_QUALIFIER__ __hip_uint32_t thread_rank() {
                                      (threadIdx.y * blockDim.x) + (threadIdx.x)));
 }

-__CG_STATIC_QUALIFIER__ bool is_valid() {
-  return true;
-}
+__CG_STATIC_QUALIFIER__ bool is_valid() { return true; }

 __CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }

@@ -23,6 +23,7 @@ struct __half2_raw {
 struct __half {
 protected:
  unsigned short __x;
+
 public:
  // CREATORS
  __half() = default;
@@ -38,17 +39,16 @@ struct __half2_raw {
  // MANIPULATORS
  __half& operator=(const __half&) = default;
  __half& operator=(__half&&) = default;
-        __half& operator=(const __half_raw& x) { __x = x.x; return *this; }
+  __half& operator=(const __half_raw& x) {
+    __x = x.x;
+    return *this;
+  }
 #if !defined(__HIP_NO_HALF_CONVERSIONS__)
-            __half& operator=(float x)
-            {
+  __half& operator=(float x) {
    __x = __float2half(x).__x;
    return *this;
  }
-            __half& operator=(double x)
-            {
-                return *this = static_cast<float>(x);
-            }
+  __half& operator=(double x) { return *this = static_cast<float>(x); }
 #endif

  // ACCESSORS
@@ -66,10 +66,7 @@ struct __half2_raw {
  // CREATORS
  __half2() = default;
  __half2(const __half2_raw& ix)
-            :
-            x{reinterpret_cast<const __half&>(ix.x)},
-            y{reinterpret_cast<const __half&>(ix.y)}
-        {}
+      : x{reinterpret_cast<const __half&>(ix.x)}, y{reinterpret_cast<const __half&>(ix.y)} {}
  __half2(const __half& ix, const __half& iy) : x{ix}, y{iy} {}
  __half2(const __half2&) = default;
  __half2(__half2&&) = default;
@@ -78,27 +75,21 @@ struct __half2_raw {
  // MANIPULATORS
  __half2& operator=(const __half2&) = default;
  __half2& operator=(__half2&&) = default;
-        __half2& operator=(const __half2_raw& ix)
-        {
+  __half2& operator=(const __half2_raw& ix) {
    x = reinterpret_cast<const __half_raw&>(ix.x);
    y = reinterpret_cast<const __half_raw&>(ix.y);
    return *this;
  }

  // ACCESSORS
-        operator __half2_raw() const
-        {
-            return __half2_raw{
-                reinterpret_cast<const unsigned short&>(x),
+  operator __half2_raw() const {
+    return __half2_raw{reinterpret_cast<const unsigned short&>(x),
                       reinterpret_cast<const unsigned short&>(y)};
  }
 };
 // END STRUCT __HALF2

-    inline
-    unsigned short __internal_float2half(
-        float flt, unsigned int& sgn, unsigned int& rem)
-    {
+inline unsigned short __internal_float2half(float flt, unsigned int& sgn, unsigned int& rem) {
  unsigned int x{};
  std::memcpy(&x, &flt, sizeof(flt));

@@ -108,8 +99,7 @@ struct __half2_raw {
  // NaN/+Inf/-Inf
  if (u >= 0x7f800000U) {
    rem = 0;
-            return static_cast<unsigned short>(
-                (u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU);
+    return static_cast<unsigned short>((u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU);
  }
  // Overflows
  if (u > 0x477fefffU) {
@@ -136,9 +126,7 @@ struct __half2_raw {
  return static_cast<unsigned short>(sgn | (mantissa >> shift));
 }

-    inline
-    __half __float2half(float x)
-    {
+inline __half __float2half(float x) {
  __half_raw r;
  unsigned int sgn{};
  unsigned int rem{};
@@ -148,12 +136,9 @@ struct __half2_raw {
  return r;
 }

-    inline
-    __half __float2half_rn(float x) { return __float2half(x); }
+inline __half __float2half_rn(float x) { return __float2half(x); }

-    inline
-    __half __float2half_rz(float x)
-    {
+inline __half __float2half_rz(float x) {
  __half_raw r;
  unsigned int sgn{};
  unsigned int rem{};
@@ -162,9 +147,7 @@ struct __half2_raw {
  return r;
 }

-    inline
-    __half __float2half_rd(float x)
-    {
+inline __half __float2half_rd(float x) {
  __half_raw r;
  unsigned int sgn{};
  unsigned int rem{};
@@ -174,9 +157,7 @@ struct __half2_raw {
  return r;
 }

-    inline
-    __half __float2half_ru(float x)
-    {
+inline __half __float2half_ru(float x) {
  __half_raw r;
  unsigned int sgn{};
  unsigned int rem{};
@@ -186,21 +167,13 @@ struct __half2_raw {
  return r;
 }

-    inline
-    __half2 __float2half2_rn(float x)
-    {
-        return __half2{__float2half_rn(x), __float2half_rn(x)};
-    }
+inline __half2 __float2half2_rn(float x) { return __half2{__float2half_rn(x), __float2half_rn(x)}; }

-    inline
-    __half2 __floats2half2_rn(float x, float y)
-    {
+inline __half2 __floats2half2_rn(float x, float y) {
  return __half2{__float2half_rn(x), __float2half_rn(y)};
 }

-    inline
-    float __internal_half2float(unsigned short x)
-    {
+inline float __internal_half2float(unsigned short x) {
  unsigned int sign = ((x >> 15) & 1);
  unsigned int exponent = ((x >> 10) & 0x1f);
  unsigned int mantissa = ((x & 0x3ff) << 13);
@@ -229,27 +202,15 @@ struct __half2_raw {
  return f;
 }

-    inline
-    float __half2float(__half x)
-    {
-        return __internal_half2float(static_cast<__half_raw>(x).x);
-    }
-    inline
-    float2 __half22float2(__half2 x)
-    {
+inline float __half2float(__half x) { return __internal_half2float(static_cast<__half_raw>(x).x); }
+inline float2 __half22float2(__half2 x) {
  return float2{__internal_half2float(static_cast<__half2_raw>(x).x),
                __internal_half2float(static_cast<__half2_raw>(x).x)};
 }

-    inline
-    float __low2float(__half2 x)
-    {
-        return __internal_half2float(static_cast<__half2_raw>(x).x);
-    }
+inline float __low2float(__half2 x) { return __internal_half2float(static_cast<__half2_raw>(x).x); }

-    inline
-    float __high2float(__half2 x)
-    {
+inline float __high2float(__half2 x) {
  return __internal_half2float(static_cast<__half2_raw>(x).y);
 }

@@ -29,16 +29,14 @@ THE SOFTWARE.
 #include "host_defines.h"
 #endif
 #ifndef __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
-extern "C"
-{
+extern "C" {
 __device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
 __device__ _Float16 __ocml_cos_f16(_Float16);
 __device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
 __device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
 __device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
 __device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
-    __device__ __attribute__((const))
-    _Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
 __device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
 __device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
 __device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
@@ -83,7 +81,6 @@ extern "C"
 __device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
 __device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
 __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
-
 }
 #endif  // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
 // TODO: remove these after they get into clang header __clang_hip_libdevice_declares.h'
@@ -75,4 +75,3 @@ bool hipEnableActivityCallback(uint32_t op, bool enable);
 const char* hipGetCmdName(uint32_t id);

 #endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_RUNTIME_PROF_H
-
@@ -66,7 +66,9 @@ typedef bool_constant<true> true_type;
 typedef bool_constant<false> false_type;

 template <bool __B, class __T = void> struct enable_if {};
-template <class __T> struct enable_if<true, __T> { typedef __T type; };
+template <class __T> struct enable_if<true, __T> {
+  typedef __T type;
+};

 template <bool _B> struct true_or_false_type : public false_type {};
 template <> struct true_or_false_type<true> : public true_type {};
@@ -111,58 +113,56 @@ template<> struct is_floating_point<long double> : public true_type {};
 template <typename __T, typename __U> struct is_same : public false_type {};
 template <typename __T> struct is_same<__T, __T> : public true_type {};

-template<typename _Tp, bool = is_arithmetic<_Tp>::value>
-  struct is_signed : public false_type {};
+template <typename _Tp, bool = is_arithmetic<_Tp>::value> struct is_signed : public false_type {};
 template <typename _Tp>
 struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {};

 template <class T>
-    auto test_returnable(int) -> decltype(
-        void(static_cast<T(*)()>(nullptr)), true_type{});
-template<class>
-    auto test_returnable(...) -> false_type;
+auto test_returnable(int) -> decltype(void(static_cast<T (*)()>(nullptr)), true_type{});
+template <class> auto test_returnable(...) -> false_type;

-template<class T>
-    struct type_identity { using type = T; };
+template <class T> struct type_identity {
+  using type = T;
+};

 template <class T>  // Note that `cv void&` is a substitution failure
 auto try_add_lvalue_reference(int) -> type_identity<T&>;
 template <class T>  // Handle T = cv void case
 auto try_add_lvalue_reference(...) -> type_identity<T>;

-template<class T>
-    auto try_add_rvalue_reference(int) -> type_identity<T&&>;
-template<class T>
-    auto try_add_rvalue_reference(...) -> type_identity<T>;
+template <class T> auto try_add_rvalue_reference(int) -> type_identity<T&&>;
+template <class T> auto try_add_rvalue_reference(...) -> type_identity<T>;

-template<class T>
-struct add_lvalue_reference
-    : decltype(try_add_lvalue_reference<T>(0)) {};
+template <class T> struct add_lvalue_reference : decltype(try_add_lvalue_reference<T>(0)) {};

-template<class T>
-struct add_rvalue_reference
-    : decltype(try_add_rvalue_reference<T>(0)) {};
+template <class T> struct add_rvalue_reference : decltype(try_add_rvalue_reference<T>(0)) {};

-template<typename T>
-typename add_rvalue_reference<T>::type declval() noexcept;
+template <typename T> typename add_rvalue_reference<T>::type declval() noexcept;

 template <class From, class To>
-    auto test_implicitly_convertible(int) -> decltype(
-        void(declval<void(&)(To)>()(declval<From>())), true_type{});
+auto test_implicitly_convertible(int)
+    -> decltype(void(declval<void (&)(To)>()(declval<From>())), true_type{});

-template<class, class>
-    auto test_implicitly_convertible(...) -> false_type;
+template <class, class> auto test_implicitly_convertible(...) -> false_type;

-template<class T> struct remove_cv { typedef T type; };
-template<class T> struct remove_cv<const T> { typedef T type; };
-template<class T> struct remove_cv<volatile T> { typedef T type; };
-template<class T> struct remove_cv<const volatile T> { typedef T type; };
+template <class T> struct remove_cv {
+  typedef T type;
+};
+template <class T> struct remove_cv<const T> {
+  typedef T type;
+};
+template <class T> struct remove_cv<volatile T> {
+  typedef T type;
+};
+template <class T> struct remove_cv<const volatile T> {
+  typedef T type;
+};

-template<class T>
-struct is_void : public is_same<void, typename remove_cv<T>::type> {};
+template <class T> struct is_void : public is_same<void, typename remove_cv<T>::type> {};

 template <class From, class To>
-struct is_convertible : public integral_constant<bool,
+struct is_convertible
+    : public integral_constant<bool,
                               (decltype(test_returnable<To>(0))::value &&
                                decltype(test_implicitly_convertible<From, To>(0))::value) ||
                                   (is_void<From>::value && is_void<To>::value)> {};
@@ -174,36 +174,31 @@ typedef basic_istream<char> istream;
 typedef basic_ostream<char> ostream;

 template <typename _Tp>
-    struct is_standard_layout
-    : public integral_constant<bool, __is_standard_layout(_Tp)>
-    { };
+struct is_standard_layout : public integral_constant<bool, __is_standard_layout(_Tp)> {};

-template<typename _Tp>
-    struct is_trivial
-    : public integral_constant<bool, __is_trivial(_Tp)>
-    { };
+template <typename _Tp> struct is_trivial : public integral_constant<bool, __is_trivial(_Tp)> {};


-template <bool B, class T, class F> struct conditional { using type = T; };
-template <class T, class F> struct conditional<false, T, F> { using type = F; };
+template <bool B, class T, class F> struct conditional {
+  using type = T;
+};
+template <class T, class F> struct conditional<false, T, F> {
+  using type = F;
+};

-template<class T>
-struct alignment_of : integral_constant<size_t, alignof(T)> {};
+template <class T> struct alignment_of : integral_constant<size_t, alignof(T)> {};

-template<typename T, T... Ints>
-struct integer_sequence {
+template <typename T, T... Ints> struct integer_sequence {
  using value_type = T;
  static constexpr size_t size() noexcept { return sizeof...(Ints); }
 };

-template<size_t... Ints>
-using index_sequence = integer_sequence<size_t, Ints...>;
+template <size_t... Ints> using index_sequence = integer_sequence<size_t, Ints...>;

 template <size_t _hip_N, size_t... Ints>
 struct make_index_sequence_impl : make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {};

-template<size_t... Ints>
-struct make_index_sequence_impl<0, Ints...> {
+template <size_t... Ints> struct make_index_sequence_impl<0, Ints...> {
  using type = index_sequence<Ints...>;
 };

@@ -214,7 +209,7 @@ template <size_t... Ints>
 constexpr index_sequence<Ints...> make_index_sequence_value(index_sequence<Ints...>) {
  return {};
 }
-}
+}  // namespace __hip_internal
 typedef __hip_internal::uint8_t __hip_uint8_t;
 typedef __hip_internal::uint16_t __hip_uint16_t;
 typedef __hip_internal::uint32_t __hip_uint32_t;
@@ -241,7 +236,8 @@ typedef __hip_internal::int64_t __hip_int64_t;
 #define __forceinline__ inline __attribute__((always_inline))

 #if __HIP_NO_IMAGE_SUPPORT
-#define __hip_img_chk__ __attribute__((unavailable("The image/texture API not supported on the device")))
+#define __hip_img_chk__                                                                            \
+  __attribute__((unavailable("The image/texture API not supported on the device")))
 #else
 #define __hip_img_chk__
 #endif
@@ -51,7 +51,8 @@ inline std::uint32_t group_size(hsa_executable_symbol_t x) {

 inline hsa_isa_t isa(hsa_agent_t x) {
  hsa_isa_t r = {};
-    hsa_agent_iterate_isas(x,
+  hsa_agent_iterate_isas(
+      x,
      [](hsa_isa_t i, void* o) {
        *static_cast<hsa_isa_t*>(o) = i;  // Pick the first.

@@ -40,8 +40,7 @@ namespace {
 struct New_grid_launch_tag {};
 struct Old_grid_launch_tag {};

-template <typename C, typename D>
-class RAII_guard {
+template <typename C, typename D> class RAII_guard {
  D dtor_;

 public:
@@ -58,8 +57,7 @@ class RAII_guard {
  ~RAII_guard() { dtor_(); }
 };

-template <typename C, typename D>
-RAII_guard<C, D> make_RAII_guard(const C& ctor, D dtor) {
+template <typename C, typename D> RAII_guard<C, D> make_RAII_guard(const C& ctor, D dtor) {
  return RAII_guard<C, D>{ctor, std::move(dtor)};
 }

@@ -72,12 +70,10 @@ using is_new_grid_launch_t = typename std::conditional<is_callable<F(Ts...)>{},
 //         in, and not always assumed to be 3;

 template <FunctionalProcedure K, typename... Ts>
-requires(Domain<K> ==
-         {Ts...}) inline void grid_launch_hip_impl_(New_grid_launch_tag, dim3 num_blocks,
-                                                    dim3 dim_blocks, int group_mem_bytes,
-                                                    const hc::accelerator_view& acc_v, K k) {
-    const auto d =
-        hc::extent<3>{num_blocks.z * dim_blocks.z, num_blocks.y * dim_blocks.y,
+  requires(Domain<K> == {Ts...})
+inline void grid_launch_hip_impl_(New_grid_launch_tag, dim3 num_blocks, dim3 dim_blocks,
+                                  int group_mem_bytes, const hc::accelerator_view& acc_v, K k) {
+  const auto d = hc::extent<3>{num_blocks.z * dim_blocks.z, num_blocks.y * dim_blocks.y,
                               num_blocks.x * dim_blocks.x}
                     .tile_with_dynamic(dim_blocks.z, dim_blocks.y, dim_blocks.x, group_mem_bytes);

@@ -96,11 +92,10 @@ void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t);
 void unlock_stream_hip_(hipStream_t, void*, const char*, hc::accelerator_view*);

 template <FunctionalProcedure K, typename... Ts>
-requires(Domain<K> == {Ts...}) inline void grid_launch_hip_impl_(New_grid_launch_tag,
-                                                                 dim3 num_blocks, dim3 dim_blocks,
-                                                                 int group_mem_bytes,
-                                                                 hipStream_t stream,
-                                                                 const char* kernel_name, K k) {
+  requires(Domain<K> == {Ts...})
+inline void grid_launch_hip_impl_(New_grid_launch_tag, dim3 num_blocks, dim3 dim_blocks,
+                                  int group_mem_bytes, hipStream_t stream, const char* kernel_name,
+                                  K k) {
  void* lck_stream = nullptr;
  auto acc_v = lock_stream_hip_(stream, lck_stream);
  auto stream_guard =
@@ -118,27 +113,26 @@ requires(Domain<K> == {Ts...}) inline void grid_launch_hip_impl_(New_grid_launch
 }

 template <FunctionalProcedure K, typename... Ts>
-requires(Domain<K> ==
-         {hipLaunchParm, Ts...}) inline void grid_launch_hip_impl_(Old_grid_launch_tag,
-                                                                   dim3 num_blocks, dim3 dim_blocks,
-                                                                   int group_mem_bytes,
-                                                                   hipStream_t stream, K k) {
+  requires(Domain<K> == {hipLaunchParm, Ts...})
+inline void grid_launch_hip_impl_(Old_grid_launch_tag, dim3 num_blocks, dim3 dim_blocks,
+                                  int group_mem_bytes, hipStream_t stream, K k) {
  grid_launch_hip_impl_(New_grid_launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
                        group_mem_bytes, std::move(stream), std::move(k));
 }

 template <FunctionalProcedure K, typename... Ts>
-requires(Domain<K> == {hipLaunchParm, Ts...}) inline void grid_launch_hip_impl_(
-    Old_grid_launch_tag, dim3 num_blocks, dim3 dim_blocks, int group_mem_bytes, hipStream_t stream,
-    const char* kernel_name, K k) {
+  requires(Domain<K> == {hipLaunchParm, Ts...})
+inline void grid_launch_hip_impl_(Old_grid_launch_tag, dim3 num_blocks, dim3 dim_blocks,
+                                  int group_mem_bytes, hipStream_t stream, const char* kernel_name,
+                                  K k) {
  grid_launch_hip_impl_(New_grid_launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
                        group_mem_bytes, std::move(stream), kernel_name, std::move(k));
 }

 template <FunctionalProcedure K, typename... Ts>
-requires(Domain<K> == {Ts...}) inline std::enable_if_t<
-    !std::is_function<K>::value> grid_launch_hip_(dim3 num_blocks, dim3 dim_blocks,
-                                                  int group_mem_bytes, hipStream_t stream,
+  requires(Domain<K> == {Ts...})
+inline std::enable_if_t<!std::is_function<K>::value> grid_launch_hip_(
+    dim3 num_blocks, dim3 dim_blocks, int group_mem_bytes, hipStream_t stream,
    const char* kernel_name, K k) {
  grid_launch_hip_impl_(is_new_grid_launch_t<K, Ts...>{}, std::move(num_blocks),
                        std::move(dim_blocks), group_mem_bytes, std::move(stream), kernel_name,
@@ -146,9 +140,11 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
 }

 template <FunctionalProcedure K, typename... Ts>
-requires(Domain<K> == {Ts...}) inline std::enable_if_t<
-    !std::is_function<K>::value> grid_launch_hip_(dim3 num_blocks, dim3 dim_blocks,
-                                                  int group_mem_bytes, hipStream_t stream, K k) {
+  requires(Domain<K> == {Ts...})
+inline std::enable_if_t<!std::is_function<K>::value> grid_launch_hip_(dim3 num_blocks,
+                                                                      dim3 dim_blocks,
+                                                                      int group_mem_bytes,
+                                                                      hipStream_t stream, K k) {
  grid_launch_hip_impl_(is_new_grid_launch_t<K, Ts...>{}, std::move(num_blocks),
                        std::move(dim_blocks), group_mem_bytes, std::move(stream), std::move(k));
 }
@@ -190,9 +186,9 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p26)> _p26_;                                                             \
    std::decay_t<decltype(p27)> _p27_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_,      \
-                        _p22_, _p23_, _p24_, _p25_, _p26_, _p27_);                                 \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, _p22_, _p23_,     \
+                  _p24_, _p25_, _p26_, _p27_);                                                     \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_29(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -227,9 +223,9 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p25)> _p25_;                                                             \
    std::decay_t<decltype(p26)> _p26_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_,      \
-                        _p22_, _p23_, _p24_, _p25_, _p26_);                                        \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, _p22_, _p23_,     \
+                  _p24_, _p25_, _p26_);                                                            \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_28(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -263,9 +259,9 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p24)> _p24_;                                                             \
    std::decay_t<decltype(p25)> _p25_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_,      \
-                        _p22_, _p23_, _p24_, _p25_);                                               \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, _p22_, _p23_,     \
+                  _p24_, _p25_);                                                                   \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_27(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -298,9 +294,9 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p23)> _p23_;                                                             \
    std::decay_t<decltype(p24)> _p24_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_,      \
-                        _p22_, _p23_, _p24_);                                                      \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, _p22_, _p23_,     \
+                  _p24_);                                                                          \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_26(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -332,9 +328,8 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p22)> _p22_;                                                             \
    std::decay_t<decltype(p23)> _p23_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_,      \
-                        _p22_, _p23_);                                                             \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);    \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_25(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -365,9 +360,8 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p21)> _p21_;                                                             \
    std::decay_t<decltype(p22)> _p22_;                                                             \
    __attribute__((used, flatten)) void operator()(const hc::tiled_index<3>&) const [[hc]] {       \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_,      \
-                        _p22_);                                                                    \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, _p22_);           \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_24(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -396,8 +390,8 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p20)> _p20_;                                                             \
    std::decay_t<decltype(p21)> _p21_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_);     \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_);                  \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_23(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -425,8 +419,8 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p19)> _p19_;                                                             \
    std::decay_t<decltype(p20)> _p20_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_);            \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_);                         \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_22(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -453,8 +447,8 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p18)> _p18_;                                                             \
    std::decay_t<decltype(p19)> _p19_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_);                   \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_);                                \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_21(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -480,8 +474,8 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p17)> _p17_;                                                             \
    std::decay_t<decltype(p18)> _p18_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_);                          \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_, _p16_, _p17_, _p18_);                                       \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_20(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -506,8 +500,8 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p16)> _p16_;                                                             \
    std::decay_t<decltype(p17)> _p17_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);                                 \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_, _p16_, _p17_);                                              \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_19(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -531,8 +525,8 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p15)> _p15_;                                                             \
    std::decay_t<decltype(p16)> _p16_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_, _p16_);                                        \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_, _p16_);                                                     \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_18(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -555,8 +549,8 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p14)> _p14_;                                                             \
    std::decay_t<decltype(p15)> _p15_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_, _p15_);                                               \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_, _p15_);                                                            \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_17(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -578,8 +572,8 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p13)> _p13_;                                                             \
    std::decay_t<decltype(p14)> _p14_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_, _p14_);                                                      \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_, _p14_);                                                                   \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_16(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -600,8 +594,8 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p12)> _p12_;                                                             \
    std::decay_t<decltype(p13)> _p13_;                                                             \
    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
-            kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_,  \
-                        _p12_, _p13_);                                                             \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, _p12_, \
+                  _p13_);                                                                          \
    }                                                                                              \
  }
 #define make_kernel_functor_hip_15(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
@@ -675,8 +669,9 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p7)> _p7_;                                                               \
    std::decay_t<decltype(p8)> _p8_;                                                               \
    std::decay_t<decltype(p9)> _p9_;                                                               \
-        void operator()(const hc::tiled_index<3>&) const                                           \
-            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_); }    \
+    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_);                     \
+    }                                                                                              \
  }
 #define make_kernel_functor_hip_11(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8) \
  struct make_kernel_name_hip(function_name, 9) {                                                  \
@@ -689,8 +684,9 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p6)> _p6_;                                                               \
    std::decay_t<decltype(p7)> _p7_;                                                               \
    std::decay_t<decltype(p8)> _p8_;                                                               \
-        void operator()(const hc::tiled_index<3>&) const                                           \
-            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_); }          \
+    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);                           \
+    }                                                                                              \
  }
 #define make_kernel_functor_hip_10(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)     \
  struct make_kernel_name_hip(function_name, 8) {                                                  \
@@ -702,8 +698,9 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p5)> _p5_;                                                               \
    std::decay_t<decltype(p6)> _p6_;                                                               \
    std::decay_t<decltype(p7)> _p7_;                                                               \
-        void operator()(const hc::tiled_index<3>&) const                                           \
-            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_); }                \
+    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);                                 \
+    }                                                                                              \
  }
 #define make_kernel_functor_hip_9(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)          \
  struct make_kernel_name_hip(function_name, 7) {                                                  \
@@ -714,8 +711,9 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p4)> _p4_;                                                               \
    std::decay_t<decltype(p5)> _p5_;                                                               \
    std::decay_t<decltype(p6)> _p6_;                                                               \
-        void operator()(const hc::tiled_index<3>&) const                                           \
-            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_); }                      \
+    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);                                       \
+    }                                                                                              \
  }
 #define make_kernel_functor_hip_8(function_name, kernel_name, p0, p1, p2, p3, p4, p5)              \
  struct make_kernel_name_hip(function_name, 6) {                                                  \
@@ -725,8 +723,9 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p3)> _p3_;                                                               \
    std::decay_t<decltype(p4)> _p4_;                                                               \
    std::decay_t<decltype(p5)> _p5_;                                                               \
-        void operator()(const hc::tiled_index<3>&) const                                           \
-            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_); }                            \
+    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);                                             \
+    }                                                                                              \
  }
 #define make_kernel_functor_hip_7(function_name, kernel_name, p0, p1, p2, p3, p4)                  \
  struct make_kernel_name_hip(function_name, 5) {                                                  \
@@ -735,8 +734,9 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p2)> _p2_;                                                               \
    std::decay_t<decltype(p3)> _p3_;                                                               \
    std::decay_t<decltype(p4)> _p4_;                                                               \
-        void operator()(const hc::tiled_index<3>&) const                                           \
-            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_); }                                  \
+    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
+      kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);                                                   \
+    }                                                                                              \
  }
 #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)                      \
  struct make_kernel_name_hip(function_name, 4) {                                                  \
@@ -744,8 +744,9 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<
    std::decay_t<decltype(p1)> _p1_;                                                               \
    std::decay_t<decltype(p2)> _p2_;                                                               \
    std::decay_t<decltype(p3)> _p3_;                                                               \
-        void operator()(const hc::tiled_index<3>&) const                                           \
-            [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_); }                                        \
+    void operator()(const hc::tiled_index<3>&) const [[hc]] {                                      \
+      kernel_name(_p0_, _p1_, _p2_, _p3_);                                                         \
+    }                                                                                              \
  }
 #define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)                          \
  struct make_kernel_name_hip(function_name, 3) {                                                  \
@@ -786,8 +787,8 @@ requires(Domain<K> == {Ts...}) inline std::enable_if_t<

 #define hipLaunchKernelGGL(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)      \
  do {                                                                                             \
-        hipLaunchNamedKernelGGL(unnamed, kernel_name, num_blocks, dim_blocks, group_mem_bytes,     \
-                                stream, ##__VA_ARGS__);                                            \
+    hipLaunchNamedKernelGGL(unnamed, kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, \
+                            ##__VA_ARGS__);                                                        \
  } while (0)

 #define hipLaunchKernel(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)         \
@@ -33,661 +33,252 @@ THE SOFTWARE.

 // DOT FUNCTIONS
 #if defined(__clang__) && defined(__HIP__)
-__device__
-__attribute__((const))
-int __ockl_sdot2(
-    HIP_vector_base<short, 2>::Native_vec_,
-    HIP_vector_base<short, 2>::Native_vec_,
-    int, bool);
+__device__ __attribute__((const)) int __ockl_sdot2(HIP_vector_base<short, 2>::Native_vec_,
+                                                   HIP_vector_base<short, 2>::Native_vec_, int,
+                                                   bool);

-__device__
-__attribute__((const))
-unsigned int __ockl_udot2(
-    HIP_vector_base<unsigned short, 2>::Native_vec_,
+__device__ __attribute__((const)) unsigned int __ockl_udot2(
    HIP_vector_base<unsigned short, 2>::Native_vec_,
+    HIP_vector_base<unsigned short, 2>::Native_vec_, unsigned int, bool);
+
+__device__ __attribute__((const)) int __ockl_sdot4(HIP_vector_base<char, 4>::Native_vec_,
+                                                   HIP_vector_base<char, 4>::Native_vec_, int,
+                                                   bool);
+
+__device__ __attribute__((const)) unsigned int __ockl_udot4(
+    HIP_vector_base<unsigned char, 4>::Native_vec_, HIP_vector_base<unsigned char, 4>::Native_vec_,
    unsigned int, bool);

-__device__
-__attribute__((const))
-int __ockl_sdot4(
-    HIP_vector_base<char, 4>::Native_vec_,
-    HIP_vector_base<char, 4>::Native_vec_,
-    int, bool);
+__device__ __attribute__((const)) int __ockl_sdot8(int, int, int, bool);

-__device__
-__attribute__((const))
-unsigned int __ockl_udot4(
-    HIP_vector_base<unsigned char, 4>::Native_vec_,
-    HIP_vector_base<unsigned char, 4>::Native_vec_,
+__device__ __attribute__((const)) unsigned int __ockl_udot8(unsigned int, unsigned int,
                                                            unsigned int, bool);
-
-__device__
-__attribute__((const))
-int __ockl_sdot8(int, int, int, bool);
-
-__device__
-__attribute__((const))
-unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool);
 #endif

 #if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
 // BEGIN FLOAT
-__device__
-__attribute__((const))
-float __ocml_acos_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_acosh_f32(float);
-__device__
-__attribute__((const))
-float __ocml_asin_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_asinh_f32(float);
-__device__
-__attribute__((const))
-float __ocml_atan2_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_atan_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_atanh_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_cbrt_f32(float);
-__device__
-__attribute__((const))
-float __ocml_ceil_f32(float);
-__device__
-__attribute__((const))
-__device__
-float __ocml_copysign_f32(float, float);
-__device__
-float __ocml_cos_f32(float);
-__device__
-float __ocml_native_cos_f32(float);
-__device__
-__attribute__((pure))
-__device__
-float __ocml_cosh_f32(float);
-__device__
-float __ocml_cospi_f32(float);
-__device__
-float __ocml_i0_f32(float);
-__device__
-float __ocml_i1_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_erfc_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_erfcinv_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_erfcx_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_erf_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_erfinv_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_exp10_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_native_exp10_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_exp2_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_exp_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_native_exp_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_expm1_f32(float);
-__device__
-__attribute__((const))
-float __ocml_fabs_f32(float);
-__device__
-__attribute__((const))
-float __ocml_fdim_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_floor_f32(float);
-__device__
-__attribute__((const))
-float __ocml_fma_f32(float, float, float);
-__device__
-__attribute__((const))
-float __ocml_fmax_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_fmin_f32(float, float);
-__device__
-__attribute__((const))
-__device__
-float __ocml_fmod_f32(float, float);
-__device__
-float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
-__device__
-__attribute__((const))
-float __ocml_hypot_f32(float, float);
-__device__
-__attribute__((const))
-int __ocml_ilogb_f32(float);
-__device__
-__attribute__((const))
-int __ocml_isfinite_f32(float);
-__device__
-__attribute__((const))
-int __ocml_isinf_f32(float);
-__device__
-__attribute__((const))
-int __ocml_isnan_f32(float);
-__device__
-float __ocml_j0_f32(float);
-__device__
-float __ocml_j1_f32(float);
-__device__
-__attribute__((const))
-float __ocml_ldexp_f32(float, int);
-__device__
-float __ocml_lgamma_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_log10_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_native_log10_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_log1p_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_log2_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_native_log2_f32(float);
-__device__
-__attribute__((const))
-float __ocml_logb_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_log_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_native_log_f32(float);
-__device__
-float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
-__device__
-__attribute__((const))
-float __ocml_nearbyint_f32(float);
-__device__
-__attribute__((const))
-float __ocml_nextafter_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_len3_f32(float, float, float);
-__device__
-__attribute__((const))
-float __ocml_len4_f32(float, float, float, float);
-__device__
-__attribute__((pure))
-float __ocml_ncdf_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_ncdfinv_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_pow_f32(float, float);
-__device__
-__attribute__((pure))
-float __ocml_pown_f32(float, int);
-__device__
-__attribute__((pure))
-float __ocml_rcbrt_f32(float);
-__device__
-__attribute__((const))
-float __ocml_remainder_f32(float, float);
-__device__
-float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
-__device__
-__attribute__((const))
-float __ocml_rhypot_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_rint_f32(float);
-__device__
-__attribute__((const))
-float __ocml_rlen3_f32(float, float, float);
-__device__
-__attribute__((const))
-float __ocml_rlen4_f32(float, float, float, float);
-__device__
-__attribute__((const))
-float __ocml_round_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_rsqrt_f32(float);
-__device__
-__attribute__((const))
-float __ocml_scalb_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_scalbn_f32(float, int);
-__device__
-__attribute__((const))
-int __ocml_signbit_f32(float);
-__device__
-float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
-__device__
-float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
-__device__
-float __ocml_sin_f32(float);
-__device__
-float __ocml_native_sin_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_sinh_f32(float);
-__device__
-float __ocml_sinpi_f32(float);
-__device__
-__attribute__((const))
-float __ocml_sqrt_f32(float);
-__device__
-__attribute__((const))
-float __ocml_native_sqrt_f32(float);
-__device__
-float __ocml_tan_f32(float);
-__device__
-__attribute__((pure))
-float __ocml_tanh_f32(float);
-__device__
-float __ocml_tgamma_f32(float);
-__device__
-__attribute__((const))
-float __ocml_trunc_f32(float);
-__device__
-float __ocml_y0_f32(float);
-__device__
-float __ocml_y1_f32(float);
+__device__ __attribute__((const)) float __ocml_acos_f32(float);
+__device__ __attribute__((pure)) float __ocml_acosh_f32(float);
+__device__ __attribute__((const)) float __ocml_asin_f32(float);
+__device__ __attribute__((pure)) float __ocml_asinh_f32(float);
+__device__ __attribute__((const)) float __ocml_atan2_f32(float, float);
+__device__ __attribute__((const)) float __ocml_atan_f32(float);
+__device__ __attribute__((pure)) float __ocml_atanh_f32(float);
+__device__ __attribute__((pure)) float __ocml_cbrt_f32(float);
+__device__ __attribute__((const)) float __ocml_ceil_f32(float);
+__device__ __attribute__((const)) __device__ float __ocml_copysign_f32(float, float);
+__device__ float __ocml_cos_f32(float);
+__device__ float __ocml_native_cos_f32(float);
+__device__ __attribute__((pure)) __device__ float __ocml_cosh_f32(float);
+__device__ float __ocml_cospi_f32(float);
+__device__ float __ocml_i0_f32(float);
+__device__ float __ocml_i1_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfc_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfcx_f32(float);
+__device__ __attribute__((pure)) float __ocml_erf_f32(float);
+__device__ __attribute__((pure)) float __ocml_erfinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp10_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp2_f32(float);
+__device__ __attribute__((pure)) float __ocml_exp_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_exp_f32(float);
+__device__ __attribute__((pure)) float __ocml_expm1_f32(float);
+__device__ __attribute__((const)) float __ocml_fabs_f32(float);
+__device__ __attribute__((const)) float __ocml_fdim_f32(float, float);
+__device__ __attribute__((const)) float __ocml_floor_f32(float);
+__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
+__device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
+__device__ __attribute__((const)) __device__ float __ocml_fmod_f32(float, float);
+__device__ float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
+__device__ __attribute__((const)) float __ocml_hypot_f32(float, float);
+__device__ __attribute__((const)) int __ocml_ilogb_f32(float);
+__device__ __attribute__((const)) int __ocml_isfinite_f32(float);
+__device__ __attribute__((const)) int __ocml_isinf_f32(float);
+__device__ __attribute__((const)) int __ocml_isnan_f32(float);
+__device__ float __ocml_j0_f32(float);
+__device__ float __ocml_j1_f32(float);
+__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int);
+__device__ float __ocml_lgamma_f32(float);
+__device__ __attribute__((pure)) float __ocml_log10_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log10_f32(float);
+__device__ __attribute__((pure)) float __ocml_log1p_f32(float);
+__device__ __attribute__((pure)) float __ocml_log2_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log2_f32(float);
+__device__ __attribute__((const)) float __ocml_logb_f32(float);
+__device__ __attribute__((pure)) float __ocml_log_f32(float);
+__device__ __attribute__((pure)) float __ocml_native_log_f32(float);
+__device__ float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
+__device__ __attribute__((const)) float __ocml_nearbyint_f32(float);
+__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float);
+__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float, float);
+__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
+__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
+__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
+__device__ __attribute__((pure)) float __ocml_pown_f32(float, int);
+__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
+__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
+__device__ float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
+__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float);
+__device__ __attribute__((const)) float __ocml_rint_f32(float);
+__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float, float);
+__device__ __attribute__((const)) float __ocml_round_f32(float);
+__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float);
+__device__ __attribute__((const)) float __ocml_scalb_f32(float, float);
+__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int);
+__device__ __attribute__((const)) int __ocml_signbit_f32(float);
+__device__ float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
+__device__ float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
+__device__ float __ocml_sin_f32(float);
+__device__ float __ocml_native_sin_f32(float);
+__device__ __attribute__((pure)) float __ocml_sinh_f32(float);
+__device__ float __ocml_sinpi_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_f32(float);
+__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float);
+__device__ float __ocml_tan_f32(float);
+__device__ __attribute__((pure)) float __ocml_tanh_f32(float);
+__device__ float __ocml_tgamma_f32(float);
+__device__ __attribute__((const)) float __ocml_trunc_f32(float);
+__device__ float __ocml_y0_f32(float);
+__device__ float __ocml_y1_f32(float);

 // BEGIN INTRINSICS
-__device__
-__attribute__((const))
-float __ocml_add_rte_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_add_rtn_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_add_rtp_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_add_rtz_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_sub_rte_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_sub_rtn_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_sub_rtp_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_sub_rtz_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_mul_rte_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_mul_rtn_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_mul_rtp_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_mul_rtz_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_div_rte_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_div_rtn_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_div_rtp_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_div_rtz_f32(float, float);
-__device__
-__attribute__((const))
-float __ocml_sqrt_rte_f32(float);
-__device__
-__attribute__((const))
-float __ocml_sqrt_rtn_f32(float);
-__device__
-__attribute__((const))
-float __ocml_sqrt_rtp_f32(float);
-__device__
-__attribute__((const))
-float __ocml_sqrt_rtz_f32(float);
-__device__
-__attribute__((const))
-float __ocml_fma_rte_f32(float, float, float);
-__device__
-__attribute__((const))
-float __ocml_fma_rtn_f32(float, float, float);
-__device__
-__attribute__((const))
-float __ocml_fma_rtp_f32(float, float, float);
-__device__
-__attribute__((const))
-float __ocml_fma_rtz_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
+__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
+__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
+__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
 // END INTRINSICS
 // END FLOAT

 // BEGIN DOUBLE
-__device__
-__attribute__((const))
-double __ocml_acos_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_acosh_f64(double);
-__device__
-__attribute__((const))
-double __ocml_asin_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_asinh_f64(double);
-__device__
-__attribute__((const))
-double __ocml_atan2_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_atan_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_atanh_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_cbrt_f64(double);
-__device__
-__attribute__((const))
-double __ocml_ceil_f64(double);
-__device__
-__attribute__((const))
-double __ocml_copysign_f64(double, double);
-__device__
-double __ocml_cos_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_cosh_f64(double);
-__device__
-double __ocml_cospi_f64(double);
-__device__
-double __ocml_i0_f64(double);
-__device__
-double __ocml_i1_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_erfc_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_erfcinv_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_erfcx_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_erf_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_erfinv_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_exp10_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_exp2_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_exp_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_expm1_f64(double);
-__device__
-__attribute__((const))
-double __ocml_fabs_f64(double);
-__device__
-__attribute__((const))
-double __ocml_fdim_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_floor_f64(double);
-__device__
-__attribute__((const))
-double __ocml_fma_f64(double, double, double);
-__device__
-__attribute__((const))
-double __ocml_fmax_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_fmin_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_fmod_f64(double, double);
-__device__
-double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
-__device__
-__attribute__((const))
-double __ocml_hypot_f64(double, double);
-__device__
-__attribute__((const))
-int __ocml_ilogb_f64(double);
-__device__
-__attribute__((const))
-int __ocml_isfinite_f64(double);
-__device__
-__attribute__((const))
-int __ocml_isinf_f64(double);
-__device__
-__attribute__((const))
-int __ocml_isnan_f64(double);
-__device__
-double __ocml_j0_f64(double);
-__device__
-double __ocml_j1_f64(double);
-__device__
-__attribute__((const))
-double __ocml_ldexp_f64(double, int);
-__device__
-double __ocml_lgamma_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_log10_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_log1p_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_log2_f64(double);
-__device__
-__attribute__((const))
-double __ocml_logb_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_log_f64(double);
-__device__
-double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
-__device__
-__attribute__((const))
-double __ocml_nearbyint_f64(double);
-__device__
-__attribute__((const))
-double __ocml_nextafter_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_len3_f64(double, double, double);
-__device__
-__attribute__((const))
-double __ocml_len4_f64(double, double, double, double);
-__device__
-__attribute__((pure))
-double __ocml_ncdf_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_ncdfinv_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_pow_f64(double, double);
-__device__
-__attribute__((pure))
-double __ocml_pown_f64(double, int);
-__device__
-__attribute__((pure))
-double __ocml_rcbrt_f64(double);
-__device__
-__attribute__((const))
-double __ocml_remainder_f64(double, double);
-__device__
-double __ocml_remquo_f64(
-    double, double, __attribute__((address_space(5))) int*);
-__device__
-__attribute__((const))
-double __ocml_rhypot_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_rint_f64(double);
-__device__
-__attribute__((const))
-double __ocml_rlen3_f64(double, double, double);
-__device__
-__attribute__((const))
-double __ocml_rlen4_f64(double, double, double, double);
-__device__
-__attribute__((const))
-double __ocml_round_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_rsqrt_f64(double);
-__device__
-__attribute__((const))
-double __ocml_scalb_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_scalbn_f64(double, int);
-__device__
-__attribute__((const))
-int __ocml_signbit_f64(double);
-__device__
-double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
-__device__
-double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
-__device__
-double __ocml_sin_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_sinh_f64(double);
-__device__
-double __ocml_sinpi_f64(double);
-__device__
-__attribute__((const))
-double __ocml_sqrt_f64(double);
-__device__
-double __ocml_tan_f64(double);
-__device__
-__attribute__((pure))
-double __ocml_tanh_f64(double);
-__device__
-double __ocml_tgamma_f64(double);
-__device__
-__attribute__((const))
-double __ocml_trunc_f64(double);
-__device__
-double __ocml_y0_f64(double);
-__device__
-double __ocml_y1_f64(double);
+__device__ __attribute__((const)) double __ocml_acos_f64(double);
+__device__ __attribute__((pure)) double __ocml_acosh_f64(double);
+__device__ __attribute__((const)) double __ocml_asin_f64(double);
+__device__ __attribute__((pure)) double __ocml_asinh_f64(double);
+__device__ __attribute__((const)) double __ocml_atan2_f64(double, double);
+__device__ __attribute__((const)) double __ocml_atan_f64(double);
+__device__ __attribute__((pure)) double __ocml_atanh_f64(double);
+__device__ __attribute__((pure)) double __ocml_cbrt_f64(double);
+__device__ __attribute__((const)) double __ocml_ceil_f64(double);
+__device__ __attribute__((const)) double __ocml_copysign_f64(double, double);
+__device__ double __ocml_cos_f64(double);
+__device__ __attribute__((pure)) double __ocml_cosh_f64(double);
+__device__ double __ocml_cospi_f64(double);
+__device__ double __ocml_i0_f64(double);
+__device__ double __ocml_i1_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfc_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfcx_f64(double);
+__device__ __attribute__((pure)) double __ocml_erf_f64(double);
+__device__ __attribute__((pure)) double __ocml_erfinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp10_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp2_f64(double);
+__device__ __attribute__((pure)) double __ocml_exp_f64(double);
+__device__ __attribute__((pure)) double __ocml_expm1_f64(double);
+__device__ __attribute__((const)) double __ocml_fabs_f64(double);
+__device__ __attribute__((const)) double __ocml_fdim_f64(double, double);
+__device__ __attribute__((const)) double __ocml_floor_f64(double);
+__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_fmax_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fmin_f64(double, double);
+__device__ __attribute__((const)) double __ocml_fmod_f64(double, double);
+__device__ double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
+__device__ __attribute__((const)) double __ocml_hypot_f64(double, double);
+__device__ __attribute__((const)) int __ocml_ilogb_f64(double);
+__device__ __attribute__((const)) int __ocml_isfinite_f64(double);
+__device__ __attribute__((const)) int __ocml_isinf_f64(double);
+__device__ __attribute__((const)) int __ocml_isnan_f64(double);
+__device__ double __ocml_j0_f64(double);
+__device__ double __ocml_j1_f64(double);
+__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int);
+__device__ double __ocml_lgamma_f64(double);
+__device__ __attribute__((pure)) double __ocml_log10_f64(double);
+__device__ __attribute__((pure)) double __ocml_log1p_f64(double);
+__device__ __attribute__((pure)) double __ocml_log2_f64(double);
+__device__ __attribute__((const)) double __ocml_logb_f64(double);
+__device__ __attribute__((pure)) double __ocml_log_f64(double);
+__device__ double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
+__device__ __attribute__((const)) double __ocml_nearbyint_f64(double);
+__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double);
+__device__ __attribute__((const)) double __ocml_len3_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double, double);
+__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
+__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
+__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
+__device__ __attribute__((pure)) double __ocml_pown_f64(double, int);
+__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
+__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
+__device__ double __ocml_remquo_f64(double, double, __attribute__((address_space(5))) int*);
+__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double);
+__device__ __attribute__((const)) double __ocml_rint_f64(double);
+__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double, double, double);
+__device__ __attribute__((const)) double __ocml_round_f64(double);
+__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double);
+__device__ __attribute__((const)) double __ocml_scalb_f64(double, double);
+__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int);
+__device__ __attribute__((const)) int __ocml_signbit_f64(double);
+__device__ double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
+__device__ double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
+__device__ double __ocml_sin_f64(double);
+__device__ __attribute__((pure)) double __ocml_sinh_f64(double);
+__device__ double __ocml_sinpi_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_f64(double);
+__device__ double __ocml_tan_f64(double);
+__device__ __attribute__((pure)) double __ocml_tanh_f64(double);
+__device__ double __ocml_tgamma_f64(double);
+__device__ __attribute__((const)) double __ocml_trunc_f64(double);
+__device__ double __ocml_y0_f64(double);
+__device__ double __ocml_y1_f64(double);

 // BEGIN INTRINSICS
-__device__
-__attribute__((const))
-double __ocml_add_rte_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_add_rtn_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_add_rtp_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_add_rtz_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_sub_rte_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_sub_rtn_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_sub_rtp_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_sub_rtz_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_mul_rte_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_mul_rtn_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_mul_rtp_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_mul_rtz_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_div_rte_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_div_rtn_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_div_rtp_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_div_rtz_f64(double, double);
-__device__
-__attribute__((const))
-double __ocml_sqrt_rte_f64(double);
-__device__
-__attribute__((const))
-double __ocml_sqrt_rtn_f64(double);
-__device__
-__attribute__((const))
-double __ocml_sqrt_rtp_f64(double);
-__device__
-__attribute__((const))
-double __ocml_sqrt_rtz_f64(double);
-__device__
-__attribute__((const))
-double __ocml_fma_rte_f64(double, double, double);
-__device__
-__attribute__((const))
-double __ocml_fma_rtn_f64(double, double, double);
-__device__
-__attribute__((const))
-double __ocml_fma_rtp_f64(double, double, double);
-__device__
-__attribute__((const))
-double __ocml_fma_rtz_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
+__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
+__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double, double);
+__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double, double);
 // END INTRINSICS
 // END DOUBLE

@@ -34,105 +34,186 @@ __device__ float4::Native_vec_ __ockl_image_load_1D(unsigned int ADDRESS_SPACE_C

 __device__ float4::Native_vec_ __ockl_image_load_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i, int c);

-__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                     int2::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                    int2::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                     int4::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                    int4::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f);
+__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                    int2::Native_vec_ c, int f);

-__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f);
+__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                     int4::Native_vec_ c, int f);

-__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l);
+__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        int c, int l);

-__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
+__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                         int2::Native_vec_ c, int l);

-__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
+__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        int2::Native_vec_ c, int l);

-__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
+__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                         int4::Native_vec_ c, int l);

-__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
+__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        int4::Native_vec_ c, int l);

-__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l);
+__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        int2::Native_vec_ c, int f, int l);

-__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l);
+__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                         int4::Native_vec_ c, int f, int l);

-__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, float4::Native_vec_ p);
+__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT* i, int c,
+                                      float4::Native_vec_ p);

-__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
+__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i, int2::Native_vec_ c,
+                                       float4::Native_vec_ p);

-__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
+__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT* i, int2::Native_vec_ c,
+                                      float4::Native_vec_ p);

-__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
+__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i, int4::Native_vec_ c,
+                                       float4::Native_vec_ p);

-__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
+__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT* i, int4::Native_vec_ c,
+                                      float4::Native_vec_ p);

-__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, float4::Native_vec_ p);
+__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT* i, int2::Native_vec_ c,
+                                      int f, float4::Native_vec_ p);

-__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, float4::Native_vec_ p);
+__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i, int4::Native_vec_ c,
+                                       int f, float4::Native_vec_ p);

-__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l, float4::Native_vec_ p);
+__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT* i, int c, int l,
+                                          float4::Native_vec_ p);

-__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
+__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                           int2::Native_vec_ c, int l, float4::Native_vec_ p);

-__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
+__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                          int2::Native_vec_ c, int l, float4::Native_vec_ p);

-__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
+__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                           int4::Native_vec_ c, int l, float4::Native_vec_ p);

-__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
+__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                          int4::Native_vec_ c, int l, float4::Native_vec_ p);

-__device__ void __ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l, float4::Native_vec_ p);
+__device__ void __ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                          int2::Native_vec_ c, int f, int l, float4::Native_vec_ p);

-__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l, float4::Native_vec_ p);
+__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                           int4::Native_vec_ c, int f, int l,
+                                           float4::Native_vec_ p);

-__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c);
+__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                      unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                      float c);

-__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                       unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                       float2::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                      unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                      float2::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                       unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                       float4::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                      unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                      float4::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                      unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                      float4::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                       unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                       float4::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float dx, float dy);
+__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                           unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                           float c, float dx, float dy);

-__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float dx, float dy);
+__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                            unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                            float2::Native_vec_ c, float dx,
+                                                            float dy);

-__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
+__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                           unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                           float2::Native_vec_ c,
+                                                           float2::Native_vec_ dx,
+                                                           float2::Native_vec_ dy);

-__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
+__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                            unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                            float4::Native_vec_ c,
+                                                            float2::Native_vec_ dx,
+                                                            float2::Native_vec_ dy);

-__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float4::Native_vec_ dx, float4::Native_vec_ dy);
+__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                           unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                           float4::Native_vec_ c,
+                                                           float4::Native_vec_ dx,
+                                                           float4::Native_vec_ dy);

-__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float l);
+__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                          unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                          float c, float l);

-__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
+__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                           unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                           float2::Native_vec_ c, float l);

-__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
+__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                          unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                          float2::Native_vec_ c, float l);

-__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
+__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                           unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                           float4::Native_vec_ c, float l);

-__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
+__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                          unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                          float4::Native_vec_ c, float l);

-__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
+__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                          unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                          float4::Native_vec_ c, float l);

-__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
+__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                           unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                           float4::Native_vec_ c, float l);

-__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                        float2::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                        float2::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                        float2::Native_vec_ c);

-__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT* i,
+                                                        unsigned int ADDRESS_SPACE_CONSTANT* s,
+                                                        float2::Native_vec_ c);

 __device__ int __ockl_image_channel_data_type_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);

@@ -173,5 +254,4 @@ __device__ int __ockl_image_channel_order_3D(unsigned int ADDRESS_SPACE_CONSTANT
 __device__ int __ockl_image_channel_order_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);

 __device__ int __ockl_image_channel_order_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
-
 }
@@ -54,6 +54,7 @@ public:
  std::size_t size();
  void reserve(std::size_t);
  void resize(std::size_t);
+
 private:
  kernarg_impl* impl;
 };
@@ -66,15 +67,11 @@ public:
  ~program_state();
  program_state(const program_state&) = delete;

-    hipFunction_t kernel_descriptor(std::uintptr_t,
-                                    hsa_agent_t);
+  hipFunction_t kernel_descriptor(std::uintptr_t, hsa_agent_t);

  kernargs_size_align get_kernargs_size_align(std::uintptr_t);
-    hsa_executable_t load_executable(const char*, const size_t,
-                                     hsa_executable_t,
-                                     hsa_agent_t);
-    hsa_executable_t load_executable_no_copy(const char*, const size_t,
-                                             hsa_executable_t,
+  hsa_executable_t load_executable(const char*, const size_t, hsa_executable_t, hsa_agent_t);
+  hsa_executable_t load_executable_no_copy(const char*, const size_t, hsa_executable_t,
                                           hsa_agent_t);

  void* global_addr_by_name(const char* name);
@@ -89,6 +86,7 @@ public:
  std::size_t size(std::size_t n) const;
  std::size_t alignment(std::size_t n) const;
  const void* getHandle() const { return handle; };
+
 private:
  const void* handle;
  friend kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t);
@@ -98,9 +96,7 @@ private:
 #pragma GCC visibility pop
 #endif

-inline
-__attribute__((visibility("hidden")))
-program_state& get_program_state() {
+inline __attribute__((visibility("hidden"))) program_state& get_program_state() {
  static program_state ps;
  return ps;
 }
@@ -36,66 +36,37 @@ THE SOFTWARE.
  unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;                    \
  (void)s;

-template<typename T>
-struct __hip_is_tex_surf_scalar_channel_type
-{
-    static constexpr bool value =
-        __hip_internal::is_same<T, char>::value ||
+template <typename T> struct __hip_is_tex_surf_scalar_channel_type {
+  static constexpr bool value = __hip_internal::is_same<T, char>::value ||
      __hip_internal::is_same<T, unsigned char>::value ||
      __hip_internal::is_same<T, short>::value ||
-        __hip_internal::is_same<T, unsigned short>::value ||
-        __hip_internal::is_same<T, int>::value ||
-        __hip_internal::is_same<T, unsigned int>::value ||
-        __hip_internal::is_same<T, float>::value;
+      __hip_internal::is_same<T, unsigned short>::value || __hip_internal::is_same<T, int>::value ||
+      __hip_internal::is_same<T, unsigned int>::value || __hip_internal::is_same<T, float>::value;
 };

-template<typename T>
-struct __hip_is_tex_surf_channel_type
-{
-    static constexpr bool value =
-        __hip_is_tex_surf_scalar_channel_type<T>::value;
+template <typename T> struct __hip_is_tex_surf_channel_type {
+  static constexpr bool value = __hip_is_tex_surf_scalar_channel_type<T>::value;
 };

-template<
-    typename T,
-    unsigned int rank>
-struct __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>
-{
-    static constexpr bool value =
-        __hip_is_tex_surf_scalar_channel_type<T>::value &&
-        ((rank == 1) ||
-         (rank == 2) ||
-         (rank == 4));
+template <typename T, unsigned int rank>
+struct __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>> {
+  static constexpr bool value = __hip_is_tex_surf_scalar_channel_type<T>::value &&
+      ((rank == 1) || (rank == 2) || (rank == 4));
 };

-template<typename T>
-struct __hip_is_tex_normalized_channel_type
-{
-    static constexpr bool value =
-        __hip_internal::is_same<T, char>::value ||
+template <typename T> struct __hip_is_tex_normalized_channel_type {
+  static constexpr bool value = __hip_internal::is_same<T, char>::value ||
      __hip_internal::is_same<T, unsigned char>::value ||
-        __hip_internal::is_same<T, short>::value ||
-        __hip_internal::is_same<T, unsigned short>::value;
+      __hip_internal::is_same<T, short>::value || __hip_internal::is_same<T, unsigned short>::value;
 };

-template<
-    typename T,
-    unsigned int rank>
-struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>
-{
+template <typename T, unsigned int rank>
+struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>> {
  static constexpr bool value =
-        __hip_is_tex_normalized_channel_type<T>::value &&
-        ((rank == 1) ||
-         (rank == 2) ||
-         (rank == 4));
+      __hip_is_tex_normalized_channel_type<T>::value && ((rank == 1) || (rank == 2) || (rank == 4));
 };

-template <
-    typename T,
-    hipTextureReadMode readMode,
-    typename Enable = void>
-struct __hip_tex_ret
-{
+template <typename T, hipTextureReadMode readMode, typename Enable = void> struct __hip_tex_ret {
  static_assert(__hip_internal::is_same<Enable, void>::value, "Invalid channel type!");
 };

@@ -104,8 +75,8 @@ struct __hip_tex_ret
 */
 template <typename T, typename U>
 __forceinline__ __device__
-typename __hip_internal::enable_if<
-  __hip_is_tex_surf_scalar_channel_type<T>::value, const T>::type
+    typename __hip_internal::enable_if<__hip_is_tex_surf_scalar_channel_type<T>::value,
+                                       const T>::type
    __hipMapFrom(const U& u) {
  if constexpr (sizeof(T) < sizeof(float)) {
    union {
@@ -126,8 +97,7 @@ __hipMapFrom(const U &u) {
 * Map from device function return U to vector texture type T
 */
 template <typename T, typename U>
-__forceinline__ __device__
-typename __hip_internal::enable_if<
+__forceinline__ __device__ typename __hip_internal::enable_if<
    __hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const T>::type
 __hipMapFrom(const U& u) {
  if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
@@ -150,8 +120,8 @@ __hipMapFrom(const U &u) {
 */
 template <typename U, typename T>
 __forceinline__ __device__
-typename __hip_internal::enable_if<
-__hip_is_tex_surf_scalar_channel_type<T>::value, const U>::type
+    typename __hip_internal::enable_if<__hip_is_tex_surf_scalar_channel_type<T>::value,
+                                       const U>::type
    __hipMapTo(const T& t) {
  if constexpr (sizeof(T) < sizeof(float)) {
    union {
@@ -174,8 +144,7 @@ __hipMapTo(const T &t) {
 * Map from vector texture type T to device function input U
 */
 template <typename U, typename T>
-__forceinline__ __device__
-typename __hip_internal::enable_if<
+__forceinline__ __device__ typename __hip_internal::enable_if<
    __hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const U>::type
 __hipMapTo(const T& t) {
  if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
@@ -195,71 +164,59 @@ __hipMapTo(const T &t) {
  }
 }

-template <
-    typename T,
-    hipTextureReadMode readMode>
+template <typename T, hipTextureReadMode readMode>
 using __hip_tex_ret_t = typename __hip_tex_ret<T, readMode, bool>::type;

 template <typename T>
 struct __hip_tex_ret<
-    T,
-    hipReadModeElementType,
-    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type>
-{
+    T, hipReadModeElementType,
+    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type> {
  using type = T;
 };

-template<
-    typename T,
-    unsigned int rank>
+template <typename T, unsigned int rank>
 struct __hip_tex_ret<
-    HIP_vector_type<T, rank>,
-    hipReadModeElementType,
-    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
-{
+    HIP_vector_type<T, rank>, hipReadModeElementType,
+    typename __hip_internal::enable_if<
+        __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
  using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeElementType>, rank>;
 };

 template <typename T>
-struct __hip_tex_ret<
-    T,
-    hipReadModeNormalizedFloat,
-    typename __hip_internal::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
-{
+struct __hip_tex_ret<T, hipReadModeNormalizedFloat,
+                     typename __hip_internal::enable_if<
+                         __hip_is_tex_normalized_channel_type<T>::value, bool>::type> {
  using type = float;
 };

-template<
-    typename T,
-    unsigned int rank>
+template <typename T, unsigned int rank>
 struct __hip_tex_ret<
-    HIP_vector_type<T, rank>,
-    hipReadModeNormalizedFloat,
-    typename __hip_internal::enable_if<__hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
-{
+    HIP_vector_type<T, rank>, hipReadModeNormalizedFloat,
+    typename __hip_internal::enable_if<
+        __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
  using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeNormalizedFloat>, rank>;
 };


 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1Dfetch(texture<T, hipTextureType1D, readMode> t, int x)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1Dfetch(
+    texture<T, hipTextureType1D, readMode> t, int x) {
  TEXTURE_PARAMETERS_INIT;
  auto tmp = __ockl_image_load_1Db(i, x);
  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1D(texture<T, hipTextureType1D, readMode> t, float x)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1D(
+    texture<T, hipTextureType1D, readMode> t, float x) {
  TEXTURE_PARAMETERS_INIT;
  auto tmp = __ockl_image_sample_1D(i, s, x);
  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2D(texture<T, hipTextureType2D, readMode> t, float x, float y)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2D(
+    texture<T, hipTextureType2D, readMode> t, float x, float y) {
  TEXTURE_PARAMETERS_INIT;
  float2 coords{x, y};
  auto tmp = __ockl_image_sample_2D(i, s, get_native_vector(coords));
@@ -267,8 +224,8 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayered(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayered(
+    texture<T, hipTextureType1DLayered, readMode> t, float x, int layer) {
  TEXTURE_PARAMETERS_INIT;
  float2 coords{x, layer};
  auto tmp = __ockl_image_sample_1Da(i, s, get_native_vector(coords));
@@ -276,8 +233,8 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayered(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayered(
+    texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer) {
  TEXTURE_PARAMETERS_INIT;
  float4 coords{x, y, layer, 0.0f};
  auto tmp = __ockl_image_sample_2Da(i, s, get_native_vector(coords));
@@ -285,8 +242,8 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3D(texture<T, hipTextureType3D, readMode> t, float x, float y, float z)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3D(
+    texture<T, hipTextureType3D, readMode> t, float x, float y, float z) {
  TEXTURE_PARAMETERS_INIT;
  float4 coords{x, y, z, 0.0f};
  auto tmp = __ockl_image_sample_3D(i, s, get_native_vector(coords));
@@ -294,8 +251,8 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemap(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemap(
+    texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z) {
  TEXTURE_PARAMETERS_INIT;
  float4 coords{x, y, z, 0.0f};
  auto tmp = __ockl_image_sample_CM(i, s, get_native_vector(coords));
@@ -303,16 +260,16 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLod(texture<T, hipTextureType1D, readMode> t, float x, float level)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLod(
+    texture<T, hipTextureType1D, readMode> t, float x, float level) {
  TEXTURE_PARAMETERS_INIT;
  auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLod(texture<T, hipTextureType2D, readMode> t, float x, float y, float level)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLod(
+    texture<T, hipTextureType2D, readMode> t, float x, float y, float level) {
  TEXTURE_PARAMETERS_INIT;
  float2 coords{x, y};
  auto tmp = __ockl_image_sample_lod_2D(i, s, get_native_vector(coords), level);
@@ -320,8 +277,8 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredLod(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredLod(
+    texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level) {
  TEXTURE_PARAMETERS_INIT;
  float2 coords{x, layer};
  auto tmp = __ockl_image_sample_lod_1Da(i, s, get_native_vector(coords), level);
@@ -329,8 +286,8 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredLod(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float level)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredLod(
+    texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float level) {
  TEXTURE_PARAMETERS_INIT;
  float4 coords{x, y, layer, 0.0f};
  auto tmp = __ockl_image_sample_lod_2Da(i, s, get_native_vector(coords), level);
@@ -338,8 +295,8 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DLod(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DLod(
+    texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level) {
  TEXTURE_PARAMETERS_INIT;
  float4 coords{x, y, z, 0.0f};
  auto tmp = __ockl_image_sample_lod_3D(i, s, get_native_vector(coords), level);
@@ -347,8 +304,8 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLod(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float level)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLod(
+    texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float level) {
  TEXTURE_PARAMETERS_INIT;
  float4 coords{x, y, z, 0.0f};
  auto tmp = __ockl_image_sample_lod_CM(i, s, get_native_vector(coords), level);
@@ -356,8 +313,8 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayered(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayered(
+    texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer) {
  TEXTURE_PARAMETERS_INIT;
  float4 coords{x, y, z, layer};
  auto tmp = __ockl_image_sample_CMa(i, s, get_native_vector(coords));
@@ -365,8 +322,9 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredLod(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float level)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredLod(
+    texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer,
+    float level) {
  TEXTURE_PARAMETERS_INIT;
  float4 coords{x, y, z, layer};
  auto tmp = __ockl_image_sample_lod_CMa(i, s, get_native_vector(coords), level);
@@ -374,8 +332,9 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapGrad(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapGrad(
+    texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx,
+    float4 dPdy) {
  TEXTURE_PARAMETERS_INIT;
  (void)x;
  (void)y;
@@ -390,8 +349,9 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredGrad(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode>
+texCubemapLayeredGrad(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y,
+                      float z, int layer, float4 dPdx, float4 dPdy) {
  TEXTURE_PARAMETERS_INIT;
  (void)x;
  (void)y;
@@ -407,16 +367,16 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DGrad(texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DGrad(
+    texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy) {
  TEXTURE_PARAMETERS_INIT;
  auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DGrad(texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DGrad(
+    texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy) {
  TEXTURE_PARAMETERS_INIT;
  float2 coords{x, y};
  auto tmp = __ockl_image_sample_grad_2D(i, s, get_native_vector(coords), get_native_vector(dPdx),
@@ -425,8 +385,8 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredGrad(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float dPdx, float dPdy)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredGrad(
+    texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float dPdx, float dPdy) {
  TEXTURE_PARAMETERS_INIT;
  float2 coords{x, layer};
  auto tmp = __ockl_image_sample_grad_1Da(i, s, get_native_vector(coords), dPdx, dPdy);
@@ -434,73 +394,61 @@ static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> t
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredGrad(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredGrad(
+    texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float2 dPdx,
+    float2 dPdy) {
  TEXTURE_PARAMETERS_INIT;
  float4 coords{x, y, layer, 0.0f};
-    auto tmp = __ockl_image_sample_grad_2Da(i, s, get_native_vector(coords),
-                                            get_native_vector(dPdx), get_native_vector(dPdy));
+  auto tmp = __ockl_image_sample_grad_2Da(i, s, get_native_vector(coords), get_native_vector(dPdx),
+                                          get_native_vector(dPdy));
  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
 }

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DGrad(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DGrad(
+    texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy) {
  TEXTURE_PARAMETERS_INIT;
  float4 coords{x, y, z, 0.0f};
  float4 gradx{dPdx.x, dPdx.y, dPdx.z, 0.0f};
  float4 grady{dPdy.x, dPdy.y, dPdy.z, 0.0f};
-    auto tmp = __ockl_image_sample_grad_3D(i, s, get_native_vector(coords),
-                                           get_native_vector(gradx), get_native_vector(grady));
+  auto tmp = __ockl_image_sample_grad_3D(i, s, get_native_vector(coords), get_native_vector(gradx),
+                                         get_native_vector(grady));
  return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
 }

-template <
-    typename T,
-    hipTextureReadMode readMode,
-    typename Enable = void>
-struct __hip_tex2dgather_ret
-{
+template <typename T, hipTextureReadMode readMode, typename Enable = void>
+struct __hip_tex2dgather_ret {
  static_assert(__hip_internal::is_same<Enable, void>::value, "Invalid channel type!");
 };

-template <
-    typename T,
-    hipTextureReadMode readMode>
+template <typename T, hipTextureReadMode readMode>
 using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret<T, readMode, bool>::type;

 template <typename T>
 struct __hip_tex2dgather_ret<
-    T,
-    hipReadModeElementType,
-    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type>
-{
+    T, hipReadModeElementType,
+    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type> {
  using type = HIP_vector_type<T, 4>;
 };

-template<
-    typename T,
-    unsigned int rank>
+template <typename T, unsigned int rank>
 struct __hip_tex2dgather_ret<
-    HIP_vector_type<T, rank>,
-    hipReadModeElementType,
-    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
-{
+    HIP_vector_type<T, rank>, hipReadModeElementType,
+    typename __hip_internal::enable_if<
+        __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
  using type = HIP_vector_type<T, 4>;
 };

 template <typename T>
-struct __hip_tex2dgather_ret<
-    T,
-    hipReadModeNormalizedFloat,
-    typename __hip_internal::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
-{
+struct __hip_tex2dgather_ret<T, hipReadModeNormalizedFloat,
+                             typename __hip_internal::enable_if<
+                                 __hip_is_tex_normalized_channel_type<T>::value, bool>::type> {
  using type = float4;
 };

 template <typename T, hipTextureReadMode readMode>
-static __forceinline__ __device__ __hip_img_chk__ __hip_tex2dgather_ret_t<T, readMode> tex2Dgather(texture<T, hipTextureType2D, readMode> t, float x, float y, int comp=0)
-{
+static __forceinline__ __device__ __hip_img_chk__ __hip_tex2dgather_ret_t<T, readMode> tex2Dgather(
+    texture<T, hipTextureType2D, readMode> t, float x, float y, int comp = 0) {
  TEXTURE_PARAMETERS_INIT;
  float2 coords{x, y};
  switch (comp) {
@@ -40,8 +40,7 @@ THE SOFTWARE.
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x)
-{
+static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  auto tmp = __ockl_image_load_1Db(i, x);
  return __hipMapFrom<T>(tmp);
@@ -50,16 +49,14 @@ static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject,
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex1Dfetch(T *ptr, hipTextureObject_t textureObject, int x)
-{
+static __device__ __hip_img_chk__ void tex1Dfetch(T* ptr, hipTextureObject_t textureObject, int x) {
  *ptr = tex1Dfetch<T>(textureObject, x);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x)
-{
+static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  auto tmp = __ockl_image_sample_1D(i, s, x);
  return __hipMapFrom<T>(tmp);
@@ -68,16 +65,14 @@ static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, floa
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex1D(T *ptr, hipTextureObject_t textureObject, float x)
-{
+static __device__ __hip_img_chk__ void tex1D(T* ptr, hipTextureObject_t textureObject, float x) {
  *ptr = tex1D<T>(textureObject, x);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y)
-{
+static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float2 coords{x, y};
  auto tmp = __ockl_image_sample_2D(i, s, get_native_vector(coords));
@@ -87,16 +82,16 @@ static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, floa
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex2D(T *ptr, hipTextureObject_t textureObject, float x, float y)
-{
+static __device__ __hip_img_chk__ void tex2D(T* ptr, hipTextureObject_t textureObject, float x,
+                                             float y) {
  *ptr = tex2D<T>(textureObject, x, y);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z)
-{
+static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y,
+                                          float z) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float4 coords{x, y, z, 0.0f};
  auto tmp = __ockl_image_sample_3D(i, s, get_native_vector(coords));
@@ -106,16 +101,16 @@ static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, floa
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex3D(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
-{
+static __device__ __hip_img_chk__ void tex3D(T* ptr, hipTextureObject_t textureObject, float x,
+                                             float y, float z) {
  *ptr = tex3D<T>(textureObject, x, y, z);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer)
-{
+static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x,
+                                                 int layer) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float2 coords{x, layer};
  auto tmp = __ockl_image_sample_1Da(i, s, get_native_vector(coords));
@@ -125,16 +120,16 @@ static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObjec
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex1DLayered(T *ptr, hipTextureObject_t textureObject, float x, int layer)
-{
+static __device__ __hip_img_chk__ void tex1DLayered(T* ptr, hipTextureObject_t textureObject,
+                                                    float x, int layer) {
  *ptr = tex1DLayered<T>(textureObject, x, layer);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y, int layer)
-{
+static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y,
+                                                 int layer) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float4 coords{x, y, layer, 0.0f};
  auto tmp = __ockl_image_sample_2Da(i, s, get_native_vector(coords));
@@ -144,16 +139,16 @@ static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObjec
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex2DLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer)
-{
-  *ptr = tex1DLayered<T>(textureObject, x, y, layer);
+static __device__ __hip_img_chk__ void tex2DLayered(T* ptr, hipTextureObject_t textureObject,
+                                                    float x, float y, int layer) {
+  *ptr = tex2DLayered<T>(textureObject, x, y, layer);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__  T texCubemap(hipTextureObject_t textureObject, float x, float y, float z)
-{
+static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y,
+                                               float z) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float4 coords{x, y, z, 0.0f};
  auto tmp = __ockl_image_sample_CM(i, s, get_native_vector(coords));
@@ -163,16 +158,16 @@ static __device__ __hip_img_chk__  T texCubemap(hipTextureObject_t textureObject
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void texCubemap(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
-{
+static __device__ __hip_img_chk__ void texCubemap(T* ptr, hipTextureObject_t textureObject, float x,
+                                                  float y, float z) {
  *ptr = texCubemap<T>(textureObject, x, y, z);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x, float y, float z, int layer)
-{
+static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x,
+                                                      float y, float z, int layer) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float4 coords{x, y, z, layer};
  auto tmp = __ockl_image_sample_CMa(i, s, get_native_vector(coords));
@@ -182,16 +177,16 @@ static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t texture
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void texCubemapLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer)
-{
+static __device__ __hip_img_chk__ void texCubemapLayered(T* ptr, hipTextureObject_t textureObject,
+                                                         float x, float y, float z, int layer) {
  *ptr = texCubemapLayered<T>(textureObject, x, y, z, layer);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y, int comp = 0)
-{
+static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y,
+                                                int comp = 0) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float2 coords{x, y};
  switch (comp) {
@@ -222,16 +217,16 @@ static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex2Dgather(T *ptr, hipTextureObject_t textureObject, float x, float y, int comp = 0)
-{
-  *ptr = texCubemapLayered<T>(textureObject, x, y, comp);
+static __device__ __hip_img_chk__ void tex2Dgather(T* ptr, hipTextureObject_t textureObject,
+                                                   float x, float y, int comp = 0) {
+  *ptr = tex2Dgather<T>(textureObject, x, y, comp);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x, float level)
-{
+static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x,
+                                             float level) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
  return __hipMapFrom<T>(tmp);
@@ -240,16 +235,16 @@ static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, f
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex1DLod(T *ptr, hipTextureObject_t textureObject, float x, float level)
-{
+static __device__ __hip_img_chk__ void tex1DLod(T* ptr, hipTextureObject_t textureObject, float x,
+                                                float level) {
  *ptr = tex1DLod<T>(textureObject, x, level);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y, float level)
-{
+static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y,
+                                             float level) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float2 coords{x, y};
  auto tmp = __ockl_image_sample_lod_2D(i, s, get_native_vector(coords), level);
@@ -259,16 +254,16 @@ static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, f
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex2DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float level)
-{
+static __device__ __hip_img_chk__ void tex2DLod(T* ptr, hipTextureObject_t textureObject, float x,
+                                                float y, float level) {
  *ptr = tex2DLod<T>(textureObject, x, y, level);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
-{
+static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y,
+                                             float z, float level) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float4 coords{x, y, z, 0.0f};
  auto tmp = __ockl_image_sample_lod_3D(i, s, get_native_vector(coords), level);
@@ -278,16 +273,16 @@ static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, f
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex3DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
-{
+static __device__ __hip_img_chk__ void tex3DLod(T* ptr, hipTextureObject_t textureObject, float x,
+                                                float y, float z, float level) {
  *ptr = tex3DLod<T>(textureObject, x, y, z, level);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x, int layer, float level)
-{
+static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x,
+                                                    int layer, float level) {
  TEXTURE_OBJECT_PARAMETERS_INIT;
  (void)level;
  float2 coords{x, layer};
@@ -298,16 +293,16 @@ static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureOb
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex1DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, int layer, float level)
-{
+static __device__ __hip_img_chk__ void tex1DLayeredLod(T* ptr, hipTextureObject_t textureObject,
+                                                       float x, int layer, float level) {
  *ptr = tex1DLayeredLod<T>(textureObject, x, layer, level);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__  T tex2DLayeredLod(hipTextureObject_t textureObject, float x, float y, int layer, float level)
-{
+static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x,
+                                                    float y, int layer, float level) {
  TEXTURE_OBJECT_PARAMETERS_INIT;
  (void)level;
  float4 coords{x, y, layer, 0.0f};
@@ -318,16 +313,16 @@ static __device__ __hip_img_chk__  T tex2DLayeredLod(hipTextureObject_t textureO
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex2DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float level)
-{
+static __device__ __hip_img_chk__ void tex2DLayeredLod(T* ptr, hipTextureObject_t textureObject,
+                                                       float x, float y, int layer, float level) {
  *ptr = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
-{
+static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x,
+                                                  float y, float z, float level) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float4 coords{x, y, z, 0.0f};
  auto tmp = __ockl_image_sample_lod_CM(i, s, get_native_vector(coords), level);
@@ -337,16 +332,16 @@ static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObje
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void texCubemapLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
-{
+static __device__ __hip_img_chk__ void texCubemapLod(T* ptr, hipTextureObject_t textureObject,
+                                                     float x, float y, float z, float level) {
  *ptr = texCubemapLod<T>(textureObject, x, y, z, level);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
-{
+static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x,
+                                                   float y, float z, float4 dPdx, float4 dPdy) {
  TEXTURE_OBJECT_PARAMETERS_INIT;
  (void)x;
  (void)y;
@@ -363,16 +358,17 @@ static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObj
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void texCubemapGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
-{
+static __device__ __hip_img_chk__ void texCubemapGrad(T* ptr, hipTextureObject_t textureObject,
+                                                      float x, float y, float z, float4 dPdx,
+                                                      float4 dPdy) {
  *ptr = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
-{
+static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x,
+                                                         float y, float z, int layer, float level) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float4 coords{x, y, z, layer};
  auto tmp = __ockl_image_sample_lod_CMa(i, s, get_native_vector(coords), level);
@@ -382,16 +378,18 @@ static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t text
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void texCubemapLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
-{
+static __device__ __hip_img_chk__ void texCubemapLayeredLod(T* ptr,
+                                                            hipTextureObject_t textureObject,
+                                                            float x, float y, float z, int layer,
+                                                            float level) {
  *ptr = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
-{
+static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx,
+                                              float dPdy) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
  return __hipMapFrom<T>(tmp);
@@ -400,16 +398,16 @@ static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject,
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex1DGrad(T *ptr, hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
-{
+static __device__ __hip_img_chk__ void tex1DGrad(T* ptr, hipTextureObject_t textureObject, float x,
+                                                 float dPdx, float dPdy) {
  *ptr = tex1DGrad<T>(textureObject, x, dPdx, dPdy);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
-{
+static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y,
+                                              float2 dPdx, float2 dPdy) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float2 coords{x, y};
  auto tmp = __ockl_image_sample_grad_2D(i, s, get_native_vector(coords), get_native_vector(dPdx),
@@ -420,16 +418,16 @@ static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject,
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex2DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
-{
+static __device__ __hip_img_chk__ void tex2DGrad(T* ptr, hipTextureObject_t textureObject, float x,
+                                                 float y, float2 dPdx, float2 dPdy) {
  *ptr = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
-{
+static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y,
+                                              float z, float4 dPdx, float4 dPdy) {
  TEXTURE_OBJECT_PARAMETERS_INIT;
  (void)dPdx;
  float4 coords{x, y, z, 0.0f};
@@ -443,16 +441,16 @@ static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject,
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex3DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
-{
+static __device__ __hip_img_chk__ void tex3DGrad(T* ptr, hipTextureObject_t textureObject, float x,
+                                                 float y, float z, float4 dPdx, float4 dPdy) {
  *ptr = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
-{
+static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x,
+                                                     int layer, float dPdx, float dPdy) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float2 coords{x, layer};
  auto tmp = __ockl_image_sample_grad_1Da(i, s, get_native_vector(coords), dPdx, dPdy);
@@ -462,36 +460,39 @@ static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureO
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex1DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
-{
+static __device__ __hip_img_chk__ void tex1DLayeredGrad(T* ptr, hipTextureObject_t textureObject,
+                                                        float x, int layer, float dPdx,
+                                                        float dPdy) {
  *ptr = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
-{
+static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x,
+                                                     float y, int layer, float2 dPdx, float2 dPdy) {
  TEXTURE_OBJECT_PARAMETERS_INIT
  float4 coords{x, y, layer, 0.0f};
-    auto tmp = __ockl_image_sample_grad_2Da(i, s, get_native_vector(coords),
-                                            get_native_vector(dPdx), get_native_vector(dPdy));
+  auto tmp = __ockl_image_sample_grad_2Da(i, s, get_native_vector(coords), get_native_vector(dPdx),
+                                          get_native_vector(dPdy));
  return __hipMapFrom<T>(tmp);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void tex2DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
-{
+static __device__ __hip_img_chk__ void tex2DLayeredGrad(T* ptr, hipTextureObject_t textureObject,
+                                                        float x, float y, int layer, float2 dPdx,
+                                                        float2 dPdy) {
  *ptr = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);
 }

 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__  T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
-{
+static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x,
+                                                          float y, float z, int layer, float4 dPdx,
+                                                          float4 dPdy) {
  TEXTURE_OBJECT_PARAMETERS_INIT;
  (void)x;
  (void)y;
@@ -509,8 +510,10 @@ static __device__ __hip_img_chk__  T texCubemapLayeredGrad(hipTextureObject_t te
 template <
    typename T,
    typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
-static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
-{
+static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T* ptr,
+                                                             hipTextureObject_t textureObject,
+                                                             float x, float y, float z, int layer,
+                                                             float4 dPdx, float4 dPdy) {
  *ptr = texCubemapLayeredGrad<T>(textureObject, x, y, z, layer, dPdx, dPdy);
 }

@@ -123,8 +123,7 @@ hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device);
 hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig);
 hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
 hipError_t hipDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements,
-                                               const hipChannelFormatDesc* fmtDesc,
-                                               int device);
+                                               const hipChannelFormatDesc* fmtDesc, int device);
 hipError_t hipDeviceGetUuid(hipUUID* uuid, hipDevice_t device);
 hipError_t hipDeviceGraphMemTrim(int device);
 hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active);
@@ -551,11 +550,11 @@ hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned
 hipError_t hipLinkAddData(hipLinkState_t state, hipJitInputType type, void* data, size_t size,
                          const char* name, unsigned int numOptions, hipJitOption* options,
                          void** optionValues);
-hipError_t hipLinkAddFile(hipLinkState_t state, hipJitInputType type, const char* path, unsigned int numOptions,
-                            hipJitOption* options, void** optionValues);
+hipError_t hipLinkAddFile(hipLinkState_t state, hipJitInputType type, const char* path,
+                          unsigned int numOptions, hipJitOption* options, void** optionValues);
 hipError_t hipLinkComplete(hipLinkState_t state, void** hipBinOut, size_t* sizeOut);
-hipError_t hipLinkCreate(unsigned int numOptions, hipJitOption* options,
-                            void** optionValues, hipLinkState_t* stateOut);
+hipError_t hipLinkCreate(unsigned int numOptions, hipJitOption* options, void** optionValues,
+                         hipLinkState_t* stateOut);
 hipError_t hipLinkDestroy(hipLinkState_t state);
 hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, hipFunction_t f,
                                                              int blockSize,
@@ -842,8 +841,8 @@ hipError_t hipMemGetHandleForAddressRange(void* handle, hipDeviceptr_t dptr, siz
                                          unsigned long long flags);
 hipError_t hipMemsetD2D8(hipDeviceptr_t dst, size_t dstPitch, unsigned char value, size_t width,
                         size_t height);
-hipError_t hipMemsetD2D8Async(hipDeviceptr_t dst, size_t dstPitch, unsigned char value, size_t width,
-                              size_t height, hipStream_t stream);
+hipError_t hipMemsetD2D8Async(hipDeviceptr_t dst, size_t dstPitch, unsigned char value,
+                              size_t width, size_t height, hipStream_t stream);
 hipError_t hipMemsetD2D16(hipDeviceptr_t dst, size_t dstPitch, unsigned short value, size_t width,
                          size_t height);
 hipError_t hipMemsetD2D16Async(hipDeviceptr_t dst, size_t dstPitch, unsigned short value,
@@ -940,7 +939,8 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) {
  ptrDispatchTable->hipDeviceGetPCIBusId_fn = hip::hipDeviceGetPCIBusId;
  ptrDispatchTable->hipDeviceGetSharedMemConfig_fn = hip::hipDeviceGetSharedMemConfig;
  ptrDispatchTable->hipDeviceGetStreamPriorityRange_fn = hip::hipDeviceGetStreamPriorityRange;
-  ptrDispatchTable->hipDeviceGetTexture1DLinearMaxWidth_fn = hip::hipDeviceGetTexture1DLinearMaxWidth;
+  ptrDispatchTable->hipDeviceGetTexture1DLinearMaxWidth_fn =
+      hip::hipDeviceGetTexture1DLinearMaxWidth;
  ptrDispatchTable->hipDeviceGetUuid_fn = hip::hipDeviceGetUuid;
  ptrDispatchTable->hipDeviceGraphMemTrim_fn = hip::hipDeviceGraphMemTrim;
  ptrDispatchTable->hipDevicePrimaryCtxGetState_fn = hip::hipDevicePrimaryCtxGetState;
@@ -1464,8 +1464,7 @@ NO_VECTORIZE const HipDispatchTable* GetHipDispatchTable() {
  static auto* _v = &GetDispatchTableImpl<HipDispatchTable>();
  return _v;
 }
-NO_VECTORIZE const HipCompilerDispatchTable*
-GetHipCompilerDispatchTable() {
+NO_VECTORIZE const HipCompilerDispatchTable* GetHipCompilerDispatchTable() {
  static auto* _v = &GetDispatchTableImpl<HipCompilerDispatchTable>();
  return _v;
 }
@@ -1485,7 +1484,8 @@ constexpr auto ComputeTableOffset(size_t num_funcs) {
 // update the table versioning value before changing the value in HIP_ENFORCE_ABI_VERSIONING to make
 // this static assert pass.
 //
-// HIP_ENFORCE_ABI will cause a compiler error if the order of the members in the API table change. Do not reorder member variables and change existing HIP_ENFORCE_ABI values -- always
+// HIP_ENFORCE_ABI will cause a compiler error if the order of the members in the API table change.
+// Do not reorder member variables or change existing HIP_ENFORCE_ABI values -- always
 //
 // Please note: rocprofiler will do very strict compile time checks to make
 // sure these versioning values are appropriately updated -- so commenting out this check, only
@@ -147,6 +147,7 @@ private:
 class StatCO : public CodeObject {
  // Guards Static Code object
  amd::Monitor sclock_{true};
+
 public:
  StatCO();
  virtual ~StatCO();
@@ -171,6 +172,7 @@ public:
  // Managed variable is a defined symbol in code object
  // pointer to the alocated managed memory has to be copied to the address of symbol
  hipError_t initStatManagedVarDevicePtr(int deviceId);
+
 private:
  friend class hip::PlatformState;
  // Populated during __hipRegisterFatBinary
@@ -103,7 +103,8 @@ static bool getTargetIDValue(std::string& input, std::string& processor, char& s
 }

 bool isCodeObjectCompatibleWithDevice(std::string co_triple_target_id,
-         std::string agent_triple_target_id, unsigned& genericVersion) {
+                                      std::string agent_triple_target_id,
+                                      unsigned& genericVersion) {
  // Primitive Check
  if (co_triple_target_id == agent_triple_target_id) return true;

@@ -137,8 +138,7 @@ bool isCodeObjectCompatibleWithDevice(std::string co_triple_target_id,
  // Check for compatibility
  if (genericVersion >= EF_AMDGPU_GENERIC_VERSION_MIN) {
    // co_processor is generic target
-    if (!IsCompatibleWithGenericTarget(co_processor, agent_isa_processor))
-    return false;
+    if (!IsCompatibleWithGenericTarget(co_processor, agent_isa_processor)) return false;
  } else if (agent_isa_processor != co_processor) {
    return false;
  }
@@ -455,15 +455,13 @@ bool compileToBitCode(const amd_comgr_data_set_t compileInputs, const std::strin
 }

 bool CheckIfBundled(std::vector<char>& llvm_bitcode) {
-  std::string magic(llvm_bitcode.begin(),
-                    llvm_bitcode.begin() + bundle_magic_string_size);
+  std::string magic(llvm_bitcode.begin(), llvm_bitcode.begin() + bundle_magic_string_size);

  if (magic.compare(CLANG_OFFLOAD_BUNDLER_MAGIC_STR) == 0) {
    return true;
  }
  // File is not bundled
  return false;
-
 }
 // Unbundle Bitcode using COMGR action
 // Supports only 1 Bundle Entry ID for now
@@ -490,7 +488,8 @@ bool UnbundleUsingComgr(std::vector<char>& source, const std::string& isa,
    return false;
  }

-  if(amd::Comgr::action_info_set_bundle_entry_ids(action, bundleEntryIDs, bundleEntryIDsCount) != AMD_COMGR_STATUS_SUCCESS) {
+  if (amd::Comgr::action_info_set_bundle_entry_ids(action, bundleEntryIDs, bundleEntryIDsCount) !=
+      AMD_COMGR_STATUS_SUCCESS) {
    amd::Comgr::destroy_action_info(action);
    return false;
  }
@@ -501,8 +500,7 @@ bool UnbundleUsingComgr(std::vector<char>& source, const std::string& isa,
    return false;
  }

-  if (auto res =
-          amd::Comgr::do_action(AMD_COMGR_ACTION_UNBUNDLE, action, linkinput, output);
+  if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_UNBUNDLE, action, linkinput, output);
      res != AMD_COMGR_STATUS_SUCCESS) {
    amd::Comgr::destroy_action_info(action);
    amd::Comgr::destroy_data_set(output);
@@ -533,8 +531,7 @@ bool linkLLVMBitcode(const amd_comgr_data_set_t linkInputs, const std::string& i
  const amd_comgr_language_t lang = AMD_COMGR_LANGUAGE_HIP;
  amd_comgr_action_info_t action;

-  if (auto res = createAction(action, linkOptions, isa, lang);
-      res != AMD_COMGR_STATUS_SUCCESS) {
+  if (auto res = createAction(action, linkOptions, isa, lang); res != AMD_COMGR_STATUS_SUCCESS) {
    return false;
  }

@@ -900,7 +897,6 @@ RTCProgram::RTCProgram(std::string name) : name_(name) {
 }

 bool RTCProgram::findIsa() {
-
 #ifdef BUILD_SHARED_LIBS
  const char* libName;
 #ifdef _WIN32
@@ -1032,7 +1028,6 @@ bool LinkProgram::AddLinkerOptions(unsigned int num_options, hipJitOption* optio
 }


-
 amd_comgr_data_kind_t LinkProgram::GetCOMGRDataKind(hipJitInputType input_type) {
  amd_comgr_data_kind_t data_kind = AMD_COMGR_DATA_KIND_UNDEF;

@@ -1146,13 +1141,15 @@ bool LinkProgram::LinkComplete(void** bin_out, size_t* size_out) {
  if (data_kind_ == AMD_COMGR_DATA_KIND_SPIRV) {
    // Convert SPIRV Unbundled code object to LLVM Bitcode
    std::vector<char> llvmbc_from_spirv;
-    if (!helpers::convertSPIRVToLLVMBC(link_input_, isa_, link_options_, build_log_, llvmbc_from_spirv)) {
+    if (!helpers::convertSPIRVToLLVMBC(link_input_, isa_, link_options_, build_log_,
+                                       llvmbc_from_spirv)) {
      LogError("Error in hip Linker: unable to convert SPIRV to BC");
      return false;
    }

    std::string linkedFileName = "LLVMBitcodeFromSPIRV.bc";
-    if (!helpers::addCodeObjData(link_input, llvmbc_from_spirv, linkedFileName, AMD_COMGR_DATA_KIND_BC)) {
+    if (!helpers::addCodeObjData(link_input, llvmbc_from_spirv, linkedFileName,
+                                 AMD_COMGR_DATA_KIND_BC)) {
      LogError("Error in hip Linker: unable to add linked LLVM bitcode");
      return false;
    }
@@ -25,10 +25,8 @@ THE SOFTWARE.
 #include <hip/driver_types.h>
 #include <hip/texture_types.h>

-namespace hip
-{
-inline
-cl_channel_type getCLChannelType(const hipArray_Format hipFormat,
+namespace hip {
+inline cl_channel_type getCLChannelType(const hipArray_Format hipFormat,
                                        const hipTextureReadMode hipReadMode) {
  if (hipReadMode == hipReadModeElementType) {
    switch (hipFormat) {
@@ -74,9 +72,7 @@ cl_channel_type getCLChannelType(const hipArray_Format hipFormat,
  return {};
 }

-inline
-cl_channel_order getCLChannelOrder(const unsigned int hipNumChannels,
-                                   const int sRGB) {
+inline cl_channel_order getCLChannelOrder(const unsigned int hipNumChannels, const int sRGB) {
  switch (hipNumChannels) {
    case 1:
      return CL_R;
@@ -92,8 +88,7 @@ cl_channel_order getCLChannelOrder(const unsigned int hipNumChannels,
  return {};
 }

-inline
-cl_mem_object_type getCLMemObjectType(const unsigned int hipWidth,
+inline cl_mem_object_type getCLMemObjectType(const unsigned int hipWidth,
                                             const unsigned int hipHeight,
                                             const unsigned int hipDepth,
                                             const unsigned int flags) {
@@ -126,8 +121,7 @@ inline bool isLayered1D(const hipArray* arr) {
  return CL_MEM_OBJECT_IMAGE1D_ARRAY == getCLMemObjectType(arr);
 }

-inline
-cl_addressing_mode getCLAddressingMode(const hipTextureAddressMode hipAddressMode) {
+inline cl_addressing_mode getCLAddressingMode(const hipTextureAddressMode hipAddressMode) {
  switch (hipAddressMode) {
    case hipAddressModeWrap:
      return CL_ADDRESS_REPEAT;
@@ -143,8 +137,7 @@ cl_addressing_mode getCLAddressingMode(const hipTextureAddressMode hipAddressMod
  return {};
 }

-inline
-cl_filter_mode getCLFilterMode(const hipTextureFilterMode hipFilterMode) {
+inline cl_filter_mode getCLFilterMode(const hipTextureFilterMode hipFilterMode) {
  switch (hipFilterMode) {
    case hipFilterModePoint:
      return CL_FILTER_NEAREST;
@@ -156,8 +149,7 @@ cl_filter_mode getCLFilterMode(const hipTextureFilterMode hipFilterMode) {
  return {};
 }

-inline
-cl_mem_object_type getCLMemObjectType(const hipResourceType hipResType) {
+inline cl_mem_object_type getCLMemObjectType(const hipResourceType hipResType) {
  switch (hipResType) {
    case hipResourceTypeLinear:
      return CL_MEM_OBJECT_IMAGE1D_BUFFER;
@@ -171,8 +163,7 @@ cl_mem_object_type getCLMemObjectType(const hipResourceType hipResType) {
  return {};
 }

-inline
-hipArray_Format getCL2hipArrayFormat(const cl_channel_type type) {
+inline hipArray_Format getCL2hipArrayFormat(const cl_channel_type type) {
  switch (type) {
    case CL_SNORM_INT8:
    case CL_SIGNED_INT8:
@@ -200,8 +191,7 @@ hipArray_Format getCL2hipArrayFormat(const cl_channel_type type) {
      return HIP_AD_FORMAT_UNSIGNED_INT8;
  }
 }
-inline
-size_t getElementSize(const hipArray_const_t array) {
+inline size_t getElementSize(const hipArray_const_t array) {
  switch (array->Format) {
    case HIP_AD_FORMAT_UNSIGNED_INT8:
    case HIP_AD_FORMAT_SIGNED_INT8:
@@ -220,9 +210,7 @@ size_t getElementSize(const hipArray_const_t array) {
  return {};
 }

-inline
-hipChannelFormatDesc getChannelFormatDesc(int numChannels,
-                                          hipArray_Format arrayFormat) {
+inline hipChannelFormatDesc getChannelFormatDesc(int numChannels, hipArray_Format arrayFormat) {
  switch (arrayFormat) {
    case HIP_AD_FORMAT_UNSIGNED_INT8:
      switch (numChannels) {
@@ -302,13 +290,11 @@ hipChannelFormatDesc getChannelFormatDesc(int numChannels,
  return {};
 }

-inline
-unsigned int getNumChannels(const hipChannelFormatDesc& desc) {
+inline unsigned int getNumChannels(const hipChannelFormatDesc& desc) {
  return ((desc.x != 0) + (desc.y != 0) + (desc.z != 0) + (desc.w != 0));
 }

-inline
-bool CheckArrayFormat(const hipChannelFormatDesc& desc) {
+inline bool CheckArrayFormat(const hipChannelFormatDesc& desc) {
  if (desc.x == 0) {
    return false;
  } else {
@@ -325,16 +311,14 @@ bool CheckArrayFormat(const hipChannelFormatDesc& desc) {
  // The bit channel description should not allow any channels after a zero channel
  if (desc.y == 0) {
    return !(desc.z > 0 || desc.w > 0);
-  }
-  else if (desc.z == 0) {
+  } else if (desc.z == 0) {
    return !(desc.w > 0);
  }

  return true;
 }

-inline
-hipArray_Format getArrayFormat(const hipChannelFormatDesc& desc) {
+inline hipArray_Format getArrayFormat(const hipChannelFormatDesc& desc) {
  switch (desc.f) {
    case hipChannelFormatKindUnsigned:
      switch (desc.x) {
@@ -369,8 +353,7 @@ hipArray_Format getArrayFormat(const hipChannelFormatDesc& desc) {
  return {};
 }

-inline
-int getNumChannels(const hipResourceViewFormat hipFormat) {
+inline int getNumChannels(const hipResourceViewFormat hipFormat) {
  switch (hipFormat) {
    case hipResViewFormatUnsignedChar1:
    case hipResViewFormatSignedChar1:
@@ -407,8 +390,7 @@ int getNumChannels(const hipResourceViewFormat hipFormat) {
  return {};
 }

-inline
-hipArray_Format getArrayFormat(const hipResourceViewFormat hipFormat) {
+inline hipArray_Format getArrayFormat(const hipResourceViewFormat hipFormat) {
  switch (hipFormat) {
    case hipResViewFormatUnsignedChar1:
    case hipResViewFormatUnsignedChar2:
@@ -450,8 +432,7 @@ hipArray_Format getArrayFormat(const hipResourceViewFormat hipFormat) {
  return {};
 }

-inline
-hipResourceViewFormat getResourceViewFormat(const hipChannelFormatDesc& desc) {
+inline hipResourceViewFormat getResourceViewFormat(const hipChannelFormatDesc& desc) {
  switch (desc.f) {
    case hipChannelFormatKindUnsigned:
      switch (getNumChannels(desc)) {
@@ -545,8 +526,7 @@ hipResourceViewFormat getResourceViewFormat(const hipChannelFormatDesc& desc) {
  return {};
 }

-inline
-hipTextureDesc getTextureDesc(const textureReference* texRef) {
+inline hipTextureDesc getTextureDesc(const textureReference* texRef) {
  hipTextureDesc texDesc = {};
  std::memcpy(texDesc.addressMode, texRef->addressMode, sizeof(texDesc.addressMode));
  texDesc.filterMode = texRef->filterMode;
@@ -562,8 +542,7 @@ hipTextureDesc getTextureDesc(const textureReference* texRef) {
  return texDesc;
 }

-inline
-hipResourceViewDesc getResourceViewDesc(hipArray_const_t array,
+inline hipResourceViewDesc getResourceViewDesc(hipArray_const_t array,
                                               const hipResourceViewFormat format) {
  hipResourceViewDesc resViewDesc = {};
  resViewDesc.format = format;
@@ -578,8 +557,7 @@ hipResourceViewDesc getResourceViewDesc(hipArray_const_t array,
  return resViewDesc;
 }

-inline
-hipResourceViewDesc getResourceViewDesc(hipMipmappedArray_const_t array,
+inline hipResourceViewDesc getResourceViewDesc(hipMipmappedArray_const_t array,
                                               const hipResourceViewFormat format) {
  hipResourceViewDesc resViewDesc = {};
  resViewDesc.format = format;
@@ -594,8 +572,7 @@ hipResourceViewDesc getResourceViewDesc(hipMipmappedArray_const_t array,
  return resViewDesc;
 }

-inline
-std::pair<hipMemoryType, hipMemoryType> getMemoryType(const hipMemcpyKind kind) {
+inline std::pair<hipMemoryType, hipMemoryType> getMemoryType(const hipMemcpyKind kind) {
  switch (kind) {
    case hipMemcpyHostToHost:
      return {hipMemoryTypeHost, hipMemoryTypeHost};
@@ -614,8 +591,7 @@ std::pair<hipMemoryType, hipMemoryType> getMemoryType(const hipMemcpyKind kind)
  return {};
 }

-inline
-HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
+inline HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
  HIP_MEMCPY3D desc3D = {};

  desc3D.srcXInBytes = desc2D.srcXInBytes;
@@ -626,8 +602,7 @@ HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
  desc3D.srcHost = desc2D.srcHost;
  desc3D.srcDevice = desc2D.srcDevice;
  desc3D.srcArray = desc2D.srcArray;
-  desc3D.srcPitch = desc2D.srcPitch ? desc2D.srcPitch
-                                    : (desc2D.srcXInBytes + desc2D.WidthInBytes);
+  desc3D.srcPitch = desc2D.srcPitch ? desc2D.srcPitch : (desc2D.srcXInBytes + desc2D.WidthInBytes);
  desc3D.srcHeight = 0;

  desc3D.dstXInBytes = desc2D.dstXInBytes;
@@ -638,8 +613,7 @@ HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
  desc3D.dstHost = desc2D.dstHost;
  desc3D.dstDevice = desc2D.dstDevice;
  desc3D.dstArray = desc2D.dstArray;
-  desc3D.dstPitch = desc2D.dstPitch ? desc2D.dstPitch
-                                    : (desc2D.dstXInBytes + desc2D.WidthInBytes);
+  desc3D.dstPitch = desc2D.dstPitch ? desc2D.dstPitch : (desc2D.dstXInBytes + desc2D.WidthInBytes);
  desc3D.dstHeight = 0;

  desc3D.WidthInBytes = desc2D.WidthInBytes;
@@ -649,8 +623,7 @@ HIP_MEMCPY3D getDrvMemcpy3DDesc(const hip_Memcpy2D& desc2D) {
  return desc3D;
 }

-inline
-HIP_MEMCPY3D getDrvMemcpy3DDesc(const hipMemcpy3DParms& desc) {
+inline HIP_MEMCPY3D getDrvMemcpy3DDesc(const hipMemcpy3DParms& desc) {
  HIP_MEMCPY3D descDrv = {};

  descDrv.WidthInBytes = desc.extent.width;
@@ -702,7 +675,8 @@ HIP_MEMCPY3D getDrvMemcpy3DDesc(const hipMemcpy3DParms& desc) {
    descDrv.dstHeight = desc.dstPtr.ysize;
  }

-  // If a HIP array is participating in the copy, the extent is defined in terms of that array's elements.
+  // If a HIP array is participating in the copy, the extent is defined in terms of that array's
+  // elements.
  if ((desc.srcArray != nullptr) && (desc.dstArray == nullptr)) {
    descDrv.WidthInBytes *= getElementSize(desc.srcArray);
  } else if ((desc.srcArray == nullptr) && (desc.dstArray != nullptr)) {
@@ -733,20 +707,17 @@ HIP_MEMCPY3D getDrvMemcpy3DDesc(const hipMemcpy3DParms& desc) {
  return descDrv;
 }

-inline
-hipResourceType getResourceType(const HIPresourcetype resType) {
+inline hipResourceType getResourceType(const HIPresourcetype resType) {
  // These two enums should be isomorphic.
  return static_cast<hipResourceType>(resType);
 }

-inline
-HIPresourcetype getResourceType(const hipResourceType resType) {
+inline HIPresourcetype getResourceType(const hipResourceType resType) {
  // These two enums should be isomorphic.
  return static_cast<HIPresourcetype>(resType);
 }

-inline
-hipResourceDesc getResourceDesc(const HIP_RESOURCE_DESC& resDesc) {
+inline hipResourceDesc getResourceDesc(const HIP_RESOURCE_DESC& resDesc) {
  hipResourceDesc desc;

  desc.resType = getResourceType(resDesc.resType);
@@ -759,12 +730,14 @@ hipResourceDesc getResourceDesc(const HIP_RESOURCE_DESC& resDesc) {
      break;
    case hipResourceTypeLinear:
      desc.res.linear.devPtr = resDesc.res.linear.devPtr;
-    desc.res.linear.desc = getChannelFormatDesc(resDesc.res.linear.numChannels, resDesc.res.linear.format);
+      desc.res.linear.desc =
+          getChannelFormatDesc(resDesc.res.linear.numChannels, resDesc.res.linear.format);
      desc.res.linear.sizeInBytes = resDesc.res.linear.sizeInBytes;
      break;
    case hipResourceTypePitch2D:
      desc.res.pitch2D.devPtr = resDesc.res.pitch2D.devPtr;
-    desc.res.pitch2D.desc = getChannelFormatDesc(resDesc.res.pitch2D.numChannels, resDesc.res.pitch2D.format);
+      desc.res.pitch2D.desc =
+          getChannelFormatDesc(resDesc.res.pitch2D.numChannels, resDesc.res.pitch2D.format);
      desc.res.pitch2D.width = resDesc.res.pitch2D.width;
      desc.res.pitch2D.height = resDesc.res.pitch2D.height;
      desc.res.pitch2D.pitchInBytes = resDesc.res.pitch2D.pitchInBytes;
@@ -776,8 +749,7 @@ hipResourceDesc getResourceDesc(const HIP_RESOURCE_DESC& resDesc) {
  return desc;
 }

-inline
-HIP_RESOURCE_DESC getResourceDesc(const hipResourceDesc& resDesc) {
+inline HIP_RESOURCE_DESC getResourceDesc(const hipResourceDesc& resDesc) {
  HIP_RESOURCE_DESC desc;

  desc.resType = getResourceType(resDesc.resType);
@@ -809,32 +781,27 @@ HIP_RESOURCE_DESC getResourceDesc(const hipResourceDesc& resDesc) {
  return desc;
 }

-inline
-hipTextureAddressMode getAddressMode(const HIPaddress_mode mode) {
+inline hipTextureAddressMode getAddressMode(const HIPaddress_mode mode) {
  // These two enums should be isomorphic.
  return static_cast<hipTextureAddressMode>(mode);
 }

-inline
-HIPaddress_mode getAddressMode(const hipTextureAddressMode mode) {
+inline HIPaddress_mode getAddressMode(const hipTextureAddressMode mode) {
  // These two enums should be isomorphic.
  return static_cast<HIPaddress_mode>(mode);
 }

-inline
-hipTextureFilterMode getFilterMode(const HIPfilter_mode mode) {
+inline hipTextureFilterMode getFilterMode(const HIPfilter_mode mode) {
  // These two enums should be isomorphic.
  return static_cast<hipTextureFilterMode>(mode);
 }

-inline
-HIPfilter_mode getFilterMode(const hipTextureFilterMode mode) {
+inline HIPfilter_mode getFilterMode(const hipTextureFilterMode mode) {
  // These two enums should be isomorphic.
  return static_cast<HIPfilter_mode>(mode);
 }

-inline
-hipTextureReadMode getReadMode(const unsigned int flags) {
+inline hipTextureReadMode getReadMode(const unsigned int flags) {
  if (flags & HIP_TRSF_READ_AS_INTEGER) {
    return hipReadModeElementType;
  } else {
@@ -842,8 +809,7 @@ hipTextureReadMode getReadMode(const unsigned int flags) {
  }
 }

-inline
-unsigned int getReadMode(const hipTextureReadMode mode) {
+inline unsigned int getReadMode(const hipTextureReadMode mode) {
  if (mode == hipReadModeElementType) {
    return HIP_TRSF_READ_AS_INTEGER;
  } else {
@@ -851,8 +817,7 @@ unsigned int getReadMode(const hipTextureReadMode mode) {
  }
 }

-inline
-int getsRGB(const unsigned int flags) {
+inline int getsRGB(const unsigned int flags) {
  if (flags & HIP_TRSF_SRGB) {
    return 1;
  } else {
@@ -860,8 +825,7 @@ int getsRGB(const unsigned int flags) {
  }
 }

-inline
-unsigned int getsRGB(const int sRGB) {
+inline unsigned int getsRGB(const int sRGB) {
  if (sRGB == 1) {
    return HIP_TRSF_SRGB;
  } else {
@@ -869,8 +833,7 @@ unsigned int getsRGB(const int sRGB) {
  }
 }

-inline
-int getNormalizedCoords(const unsigned int flags) {
+inline int getNormalizedCoords(const unsigned int flags) {
  if (flags & HIP_TRSF_NORMALIZED_COORDINATES) {
    return 1;
  } else {
@@ -878,8 +841,7 @@ int getNormalizedCoords(const unsigned int flags) {
  }
 }

-inline
-unsigned int getNormalizedCoords(const int normalizedCoords) {
+inline unsigned int getNormalizedCoords(const int normalizedCoords) {
  if (normalizedCoords == 1) {
    return HIP_TRSF_NORMALIZED_COORDINATES;
  } else {
@@ -887,8 +849,7 @@ unsigned int getNormalizedCoords(const int normalizedCoords) {
  }
 }

-inline
-hipTextureDesc getTextureDesc(const HIP_TEXTURE_DESC& texDesc) {
+inline hipTextureDesc getTextureDesc(const HIP_TEXTURE_DESC& texDesc) {
  hipTextureDesc desc;

  desc.addressMode[0] = getAddressMode(texDesc.addressMode[0]);
@@ -908,8 +869,7 @@ hipTextureDesc getTextureDesc(const HIP_TEXTURE_DESC& texDesc) {
  return desc;
 }

-inline
-HIP_TEXTURE_DESC getTextureDesc(const hipTextureDesc& texDesc) {
+inline HIP_TEXTURE_DESC getTextureDesc(const hipTextureDesc& texDesc) {
  HIP_TEXTURE_DESC desc;

  desc.addressMode[0] = getAddressMode(texDesc.addressMode[0]);
@@ -930,20 +890,17 @@ HIP_TEXTURE_DESC getTextureDesc(const hipTextureDesc& texDesc) {
  return desc;
 }

-inline
-hipResourceViewFormat getResourceViewFormat(const HIPresourceViewFormat format) {
+inline hipResourceViewFormat getResourceViewFormat(const HIPresourceViewFormat format) {
  // These two enums should be isomorphic.
  return static_cast<hipResourceViewFormat>(format);
 }

-inline
-HIPresourceViewFormat getResourceViewFormat(const hipResourceViewFormat format) {
+inline HIPresourceViewFormat getResourceViewFormat(const hipResourceViewFormat format) {
  // These two enums should be isomorphic.
  return static_cast<HIPresourceViewFormat>(format);
 }

-inline
-hipResourceViewDesc getResourceViewDesc(const HIP_RESOURCE_VIEW_DESC& resViewDesc) {
+inline hipResourceViewDesc getResourceViewDesc(const HIP_RESOURCE_VIEW_DESC& resViewDesc) {
  hipResourceViewDesc desc;

  desc.format = getResourceViewFormat(resViewDesc.format);
@@ -958,8 +915,7 @@ hipResourceViewDesc getResourceViewDesc(const HIP_RESOURCE_VIEW_DESC& resViewDes
  return desc;
 }

-inline
-HIP_RESOURCE_VIEW_DESC getResourceViewDesc(const hipResourceViewDesc& resViewDesc) {
+inline HIP_RESOURCE_VIEW_DESC getResourceViewDesc(const hipResourceViewDesc& resViewDesc) {
  HIP_RESOURCE_VIEW_DESC desc;

  desc.format = getResourceViewFormat(resViewDesc.format);
@@ -974,13 +930,11 @@ HIP_RESOURCE_VIEW_DESC getResourceViewDesc(const hipResourceViewDesc& resViewDes
  return desc;
 }

-inline
-size_t getElementSize(const hipChannelFormatDesc &desc) {
+inline size_t getElementSize(const hipChannelFormatDesc& desc) {
  return (desc.x / 8) * getNumChannels(desc);
 }

-inline
-hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DBatchOp& desc) {
+inline hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DBatchOp& desc) {
  hipMemcpy3DParms params;
  params.extent = desc.extent;
  params.kind = hipMemcpyDefault;
@@ -1000,21 +954,13 @@ hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DBatchOp& desc) {
    size_t spitch = (row ? row : desc.extent.width) * elementSize;
    size_t swidth = (row ? row : desc.extent.width);
    size_t sheight = (height ? height : desc.extent.height);
-    params.srcPtr = make_hipPitchedPtr(
-        desc.src.op.ptr.ptr,
-        spitch,
-        swidth,
-        sheight
-    );
+    params.srcPtr = make_hipPitchedPtr(desc.src.op.ptr.ptr, spitch, swidth, sheight);
    params.srcPos = make_hipPos(0, 0, 0);
    params.srcArray = nullptr;
  } else if (desc.src.type == hipMemcpyOperandTypeArray) {
    params.srcArray = desc.src.op.array.array;
-    params.srcPos = make_hipPos(
-        desc.src.op.array.offset.x,
-        desc.src.op.array.offset.y,
-        desc.src.op.array.offset.z
-    );
+    params.srcPos = make_hipPos(desc.src.op.array.offset.x, desc.src.op.array.offset.y,
+                                desc.src.op.array.offset.z);
    params.srcPtr.ptr = nullptr;
  }
  // dest
@@ -1024,28 +970,19 @@ hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DBatchOp& desc) {
    size_t spitch = (row ? row : desc.extent.width) * elementSize;
    size_t swidth = (row ? row : desc.extent.width);
    size_t sheight = (height ? height : desc.extent.height);
-    params.dstPtr = make_hipPitchedPtr(
-        desc.dst.op.ptr.ptr,
-        spitch,
-        swidth,
-        sheight
-    );
+    params.dstPtr = make_hipPitchedPtr(desc.dst.op.ptr.ptr, spitch, swidth, sheight);
    params.dstPos = make_hipPos(0, 0, 0);
    params.dstArray = nullptr;
  } else if (desc.dst.type == hipMemcpyOperandTypeArray) {
    params.dstArray = desc.dst.op.array.array;
-    params.dstPos = make_hipPos(
-        desc.dst.op.array.offset.x,
-        desc.dst.op.array.offset.y,
-        desc.dst.op.array.offset.z
-    );
+    params.dstPos = make_hipPos(desc.dst.op.array.offset.x, desc.dst.op.array.offset.y,
+                                desc.dst.op.array.offset.z);
    params.dstPtr.ptr = nullptr;
  }
  return params;
 }

-inline
-hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DPeerParms& desc) {
+inline hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DPeerParms& desc) {
  hipMemcpy3DParms params;
  params.srcArray = desc.srcArray;
  params.srcPos = desc.srcPos;
@@ -1057,4 +994,4 @@ hipMemcpy3DParms getMemcpy3DParms(const hipMemcpy3DPeerParms& desc) {
  params.kind = hipMemcpyDeviceToDevice;
  return params;
 }
-};
+};  // namespace hip
@@ -166,8 +166,7 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
  amd::Command::EventWaitList eventWaitList(0);
  bool submitMarker = 0;

-  auto waitForStream = [&submitMarker,
-                         &eventWaitList](hip::Stream* stream) {
+  auto waitForStream = [&submitMarker, &eventWaitList](hip::Stream* stream) {
    if (amd::Command* command = stream->getLastQueuedCommand(true)) {
      amd::Event& event = command->event();
      // Check HW status of the ROCcrl event.
@@ -300,7 +299,8 @@ void Device::SyncAllStreams(bool cpu_wait, bool wait_blocking_streams_only) {
 bool Device::StreamCaptureBlocking() {
  std::shared_lock lock(streamSetLock);
  for (auto& it : streamSet) {
-    if (it->GetCaptureStatus() == hipStreamCaptureStatusActive && it->Flags() != hipStreamNonBlocking) {
+    if (it->GetCaptureStatus() == hipStreamCaptureStatusActive &&
+        it->Flags() != hipStreamNonBlocking) {
      return true;
    }
  }
@@ -536,8 +536,7 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_tR0600* props, int device) {
  deviceProps.cooperativeMultiDeviceUnmatchedBlockDim = info.cooperativeMultiDeviceGroups_;
  deviceProps.cooperativeMultiDeviceUnmatchedSharedMem = info.cooperativeMultiDeviceGroups_;

-  deviceProps.maxTexture1DLinear =
-      std::min(pixel_size_max * info.imageMaxBufferSize_, int32_max);
+  deviceProps.maxTexture1DLinear = std::min(pixel_size_max * info.imageMaxBufferSize_, int32_max);
  deviceProps.maxTexture1DMipmap = std::min(16 * info.imageMaxBufferSize_, int32_max);
  deviceProps.maxTexture1D = deviceProps.maxSurface1D = std::min(info.image1DMaxWidth_, int32_max);
  deviceProps.maxTexture2D[0] = deviceProps.maxSurface2D[0] =
@@ -49,8 +49,7 @@ hipError_t ihipChooseDevice(int* device, const DeviceProp* properties) {

    if constexpr (std::is_same_v<DeviceProp, hipDeviceProp_tR0600>) {
      err = ihipGetDeviceProperties(&currentProp, i);
-    }
-    else {
+    } else {
      err = hip::hipGetDevicePropertiesR0000(&currentProp, i);
    }

@@ -533,7 +532,8 @@ hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {
      *pValue = hip::getCurrentDevice()->devices()[0]->info().scratchLimitMin;
      break;
    case hipExtLimitScratchMax:
-      *pValue = hip::getCurrentDevice()->devices()[0]->info().scratchLimitMax;;
+      *pValue = hip::getCurrentDevice()->devices()[0]->info().scratchLimitMax;
+      ;
      break;
    case hipExtLimitScratchCurrent:
      *pValue = hip::getCurrentDevice()->devices()[0]->ScratchLimitCurrent();
@@ -563,10 +563,7 @@ hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device) {
  hipDeviceProp_tR0600 prop;
  HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, device));
  auto* deviceHandle = g_devices[device]->devices()[0];
-  snprintf (pciBusId, len, "%04x:%02x:%02x.%01x",
-                    prop.pciDomainID,
-                    prop.pciBusID,
-                    prop.pciDeviceID,
+  snprintf(pciBusId, len, "%04x:%02x:%02x.%01x", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID,
           deviceHandle->info().deviceTopology_.pcie.function);

  HIP_RETURN(len <= 12 ? hipErrorInvalidValue : hipSuccess);
@@ -661,8 +658,8 @@ hipError_t hipDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements,
  hipDeviceProp_tR0600 prop = {0};
  HIP_RETURN_ONFAIL(ihipGetDeviceProperties(&prop, device));
  // Calculate element size according to fmtDesc
-  size_t elementSize = (fmtDesc->x + fmtDesc->y
-  + fmtDesc->z + fmtDesc->w) / 8; // Convert from bits to bytes
+  size_t elementSize =
+      (fmtDesc->x + fmtDesc->y + fmtDesc->z + fmtDesc->w) / 8;  // Convert from bits to bytes
  if (elementSize == 0) {
    HIP_RETURN(hipErrorInvalidValue);
  }
@@ -717,15 +714,16 @@ hipError_t hipGetDeviceFlags(unsigned int* flags) {
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipGetDriverEntryPoint_common(const char* symbol, void** funcPtr, unsigned long long flags,
+hipError_t hipGetDriverEntryPoint_common(const char* symbol, void** funcPtr,
+                                         unsigned long long flags,
                                         hipDriverEntryPointQueryResult* status) {
  std::string symbolString = symbol;
  if (symbol == nullptr || symbolString == "" || funcPtr == nullptr) {
    return hipErrorInvalidValue;
  }

-  if (flags != hipEnableDefault && flags != hipEnableLegacyStream
-      && flags != hipEnablePerThreadDefaultStream) {
+  if (flags != hipEnableDefault && flags != hipEnableLegacyStream &&
+      flags != hipEnablePerThreadDefaultStream) {
    return hipErrorInvalidValue;
  }

@@ -23,31 +23,27 @@
 #include "hip_internal.hpp"

 namespace hip {
-hipError_t hipExtGetLastError()
-{
+hipError_t hipExtGetLastError() {
  HIP_INIT_API(hipExtGetLastError);
  hipError_t err = hip::tls.last_command_error_;
  hip::tls.last_command_error_ = hipSuccess;
  return err;
 }

-hipError_t hipGetLastError()
-{
+hipError_t hipGetLastError() {
  HIP_INIT_API(hipGetLastError);
  hipError_t err = hip::tls.last_error_;
  hip::tls.last_error_ = hipSuccess;
  return err;
 }

-hipError_t hipPeekAtLastError()
-{
+hipError_t hipPeekAtLastError() {
  HIP_INIT_API(hipPeekAtLastError);
  hipError_t err = hip::tls.last_error_;
  HIP_RETURN(err);
 }

-const char *ihipGetErrorName(hipError_t hip_error)
-{
+const char* ihipGetErrorName(hipError_t hip_error) {
  switch (hip_error) {
    case hipSuccess:
      return "hipSuccess";
@@ -343,7 +339,8 @@ const char *ihipGetErrorString(hipError_t hip_error) {
    case hipErrorStreamCaptureWrongThread:
      return "attempt to terminate a thread-local capture sequence from another thread";
    case hipErrorGraphExecUpdateFailure:
-            return "the graph update was not performed because it included changes which violated constraints specific to instantiated graph update";
+      return "the graph update was not performed because it included changes which violated "
+             "constraints specific to instantiated graph update";
    case hipErrorRuntimeMemory:
      return "runtime memory call returned error";
    case hipErrorRuntimeOther:
@@ -354,18 +351,11 @@ const char *ihipGetErrorString(hipError_t hip_error) {
  }
 }

-const char* hipGetErrorName(hipError_t hip_error)
-{
-  return ihipGetErrorName(hip_error);
-}
+const char* hipGetErrorName(hipError_t hip_error) { return ihipGetErrorName(hip_error); }

-const char *hipGetErrorString(hipError_t hip_error)
-{
-  return ihipGetErrorString(hip_error);
-}
+const char* hipGetErrorString(hipError_t hip_error) { return ihipGetErrorString(hip_error); }

-hipError_t hipDrvGetErrorName(hipError_t hip_error, const char** errStr)
-{
+hipError_t hipDrvGetErrorName(hipError_t hip_error, const char** errStr) {
  if (errStr == nullptr) {
    return hipErrorInvalidValue;
  }
@@ -377,8 +367,7 @@ hipError_t hipDrvGetErrorName(hipError_t hip_error, const char** errStr)
  }
 }

-hipError_t hipDrvGetErrorString(hipError_t hip_error, const char** errStr)
-{
+hipError_t hipDrvGetErrorString(hipError_t hip_error, const char** errStr) {
  if (errStr == nullptr) {
    return hipErrorInvalidValue;
  }
@@ -77,8 +77,8 @@ hipError_t Event::synchronize() {
  auto hip_device = g_devices[deviceId()];
  // Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
  static constexpr bool kWaitCompletion = true;
-  amd::SyncPolicy policy = (flags_ == hipEventBlockingSync) ? amd::SyncPolicy::Blocking :
-                                                              amd::SyncPolicy::Auto;
+  amd::SyncPolicy policy =
+      (flags_ == hipEventBlockingSync) ? amd::SyncPolicy::Blocking : amd::SyncPolicy::Auto;
  if (!hip_device->devices()[0]->IsHwEventReady(*event_, kWaitCompletion, policy)) {
    event_->awaitCompletion();
  }
@@ -86,13 +86,11 @@ hipError_t Event::synchronize() {
 }

 // ================================================================================================
-bool Event::awaitEventCompletion() {
-  return event_->awaitCompletion();
-}
+bool Event::awaitEventCompletion() { return event_->awaitCompletion(); }

 bool EventDD::awaitEventCompletion() {
-  amd::SyncPolicy policy = (flags_ == hipEventBlockingSync) ? amd::SyncPolicy::Blocking :
-                                                              amd::SyncPolicy::Auto;
+  amd::SyncPolicy policy =
+      (flags_ == hipEventBlockingSync) ? amd::SyncPolicy::Blocking : amd::SyncPolicy::Auto;
  return g_devices[deviceId()]->devices()[0]->IsHwEventReady(*event_, true, policy);
 }

@@ -135,7 +133,8 @@ hipError_t Event::elapsedTime(Event& eStop, float& ms) {
    amd::Command* command = new amd::Marker(*event_->command().queue(), kMarkerDisableFlush);
    command->enqueue();
    command->awaitCompletion();
-    ms = static_cast<float>(static_cast<int64_t>(command->event().profilingInfo().end_) - time(false)) /
+    ms = static_cast<float>(static_cast<int64_t>(command->event().profilingInfo().end_) -
+                            time(false)) /
        1000000.f;
    command->release();
  } else {
@@ -208,12 +207,11 @@ hipError_t Event::streamWait(hip::Stream* stream, uint flags) {
 }

 // ================================================================================================
-hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
-                                uint32_t ext_flags, bool batch_flush) {
+hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t ext_flags,
+                                bool batch_flush) {
  if (command == nullptr) {
    int32_t releaseFlags = ((ext_flags == 0) ? flags_ : ext_flags) &
-                            (hipEventReleaseToDevice | hipEventReleaseToSystem |
-                             hipEventDisableSystemFence);
+        (hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventDisableSystemFence);
    if (releaseFlags & hipEventDisableSystemFence) {
      releaseFlags = amd::Device::kCacheStateIgnore;
    } else {
@@ -242,8 +240,7 @@ hipError_t Event::enqueueRecordCommand(hip::Stream* stream, amd::Command* comman
 }

 // ================================================================================================
-hipError_t Event::addMarker(hip::Stream* hip_stream, amd::Command* command,
-                            bool batch_flush) {
+hipError_t Event::addMarker(hip::Stream* hip_stream, amd::Command* command, bool batch_flush) {
  // Keep the lock always at the beginning of this to avoid a race. SWDEV-277847
  amd::ScopedLock lock(lock_);
  hipError_t status = recordCommand(command, hip_stream, 0, batch_flush);
@@ -272,21 +269,21 @@ bool isValid(hipEvent_t event) {
 // ================================================================================================
 hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) {
  unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming |
-                            hipEventReleaseToDevice | hipEventReleaseToSystem |
-                            hipEventInterprocess | hipEventDisableSystemFence;
+      hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventInterprocess |
+      hipEventDisableSystemFence;

-  const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem |
-                                 hipEventDisableSystemFence);
+  const unsigned releaseFlags =
+      (hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventDisableSystemFence);
  // can't set any unsupported flags.
  // can set only one of the release flags.
  // if hipEventInterprocess flag is set, then hipEventDisableTiming flag also must be set
-  const bool illegalFlags = (flags & ~supportedFlags) ||
-                            ([](unsigned int num){
+  const bool illegalFlags = (flags & ~supportedFlags) || ([](unsigned int num) {
                              unsigned int bitcount;
                              for (bitcount = 0; num; bitcount++) {
                                num &= num - 1;
                              }
-                              return bitcount; } (flags & releaseFlags) > 1) ||
+                              return bitcount;
+                            }(flags & releaseFlags) > 1) ||
      ((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming));
  if (!illegalFlags) {
    hip::Event* e = nullptr;
@@ -33,9 +33,9 @@ namespace hip {
 class StreamCallback {
 protected:
  void* userData_;
+
 public:
-  StreamCallback(void* userData)
-      : userData_(userData) {}
+  StreamCallback(void* userData) : userData_(userData) {}

  virtual void CL_CALLBACK callback() = 0;

@@ -45,6 +45,7 @@ protected:
 class StreamAddCallback : public StreamCallback {
  hipStreamCallback_t callBack_;
  hipStream_t stream_;
+
 public:
  StreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData)
      : StreamCallback(userData) {
@@ -60,9 +61,9 @@ public:

 class LaunchHostFuncCallback : public StreamCallback {
  hipHostFn_t callBack_;
+
 public:
-  LaunchHostFuncCallback(hipHostFn_t callback, void* userData)
-      : StreamCallback(userData) {
+  LaunchHostFuncCallback(hipHostFn_t callback, void* userData) : StreamCallback(userData) {
    callBack_ = callback;
  }

@@ -100,18 +101,19 @@ class Event {
  hipStream_t captureStream_ = nullptr;
  /// Previous captured nodes before event record
  std::vector<hip::GraphNode*> nodesPrevToRecorded_;
+
 protected:
  bool CheckHwEvent() {
-    amd::SyncPolicy policy = (flags_ == hipEventBlockingSync) ? amd::SyncPolicy::Blocking :
-                                                                amd::SyncPolicy::Auto;
+    amd::SyncPolicy policy =
+        (flags_ == hipEventBlockingSync) ? amd::SyncPolicy::Blocking : amd::SyncPolicy::Auto;
    return g_devices[deviceId()]->devices()[0]->IsHwEventReady(*event_, false, policy);
  }

 public:
  constexpr static bool kBatchFlush = true;  //!< Flushes CPU command batch in direct dispatch mode

-  Event(uint32_t flags) : flags_(flags), lock_(true) /* hipEvent_t lock*/,
-                              event_(nullptr), stream_(nullptr) {
+  Event(uint32_t flags)
+      : flags_(flags), lock_(true) /* hipEvent_t lock*/, event_(nullptr), stream_(nullptr) {
    device_id_ = hip::getCurrentDevice()->deviceId();  // Created in current device ctx
  }

@@ -132,8 +134,7 @@ class Event {
  virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* stream,
                                   uint32_t flags = 0, bool batch_flush = true);
  virtual hipError_t enqueueRecordCommand(hip::Stream* stream, amd::Command* command);
-  hipError_t addMarker(hip::Stream* stream, amd::Command* command,
-                       bool batch_flush = true);
+  hipError_t addMarker(hip::Stream* stream, amd::Command* command, bool batch_flush = true);

  void BindCommand(amd::Command& command) {
    amd::ScopedLock lock(lock_);
@@ -226,8 +227,8 @@ class IPCEvent : public Event {

  hipError_t streamWait(hip::Stream* stream, uint flags);

-  hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue,
-                           uint32_t flags = 0, bool batch_flush = true) override;
+  hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue, uint32_t flags = 0,
+                           bool batch_flush = true) override;
  hipError_t enqueueRecordCommand(hip::Stream* stream, amd::Command* command);
 };

@@ -63,9 +63,8 @@ bool IPCEvent::createIpcEventShmemIfNeeded() {
  }

  // device sets 0 to this ptr when the ipc event is completed
-  hipError_t status = ihipHostRegister(&ipc_evt_.ipc_shmem_->signal,
-                                       sizeof(uint32_t) * IPC_SIGNALS_PER_EVENT,
-                                       0);
+  hipError_t status =
+      ihipHostRegister(&ipc_evt_.ipc_shmem_->signal, sizeof(uint32_t) * IPC_SIGNALS_PER_EVENT, 0);
  if (status != hipSuccess) {
    return false;
  }
@@ -110,15 +109,14 @@ hipError_t IPCEvent::streamWait(hip::Stream* stream, uint flags) {
 }

 // ================================================================================================
-hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream,
-                                   uint32_t flags, bool batch_flush) {
+hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t flags,
+                                   bool batch_flush) {
  command = new amd::Marker(*stream, kMarkerDisableFlush);
  return hipSuccess;
 }

 // ================================================================================================
 hipError_t IPCEvent::enqueueRecordCommand(hip::Stream* stream, amd::Command* command) {
-
  amd::Event& tEvent = command->event();
  createIpcEventShmemIfNeeded();
  int write_index = ipc_evt_.ipc_shmem_->write_index++;
@@ -185,9 +183,8 @@ hipError_t IPCEvent::OpenHandle(ihipIpcEventHandle_t* handle) {
  ipc_evt_.ipc_shmem_->owners += 1;
  // device sets 0 to this ptr when the ipc event is completed
  hipError_t status = hipSuccess;
-  status = ihipHostRegister(&ipc_evt_.ipc_shmem_->signal,
-                            sizeof(uint32_t) * IPC_SIGNALS_PER_EVENT,
-                            0);
+  status =
+      ihipHostRegister(&ipc_evt_.ipc_shmem_->signal, sizeof(uint32_t) * IPC_SIGNALS_PER_EVENT, 0);
  return status;
 }

@@ -273,8 +273,7 @@ static bool UncompressAndPopulateCodeObject(
    bundle_ids.push_back(bundle_id_str.c_str());
  }

-  const auto obheader =
-      reinterpret_cast<const symbols::ClangOffloadBundleCompressedHeader*>(image);
+  const auto obheader = reinterpret_cast<const symbols::ClangOffloadBundleCompressedHeader*>(image);
  const size_t size = obheader->totalSize;

  bool passed = false;
@@ -720,7 +719,8 @@ hipError_t FatBinaryInfo::AddDevProgram(hip::Device* device, const void* binary_
  }
  if (CL_SUCCESS !=
      program->addDeviceProgram(*ctx->devices()[0], binary_image, binary_size, false, nullptr,
-                                nullptr, (ufd_ != nullptr ? ufd_->fdesc_ : amd::Os::FDescInit()), binary_offset, uri_)) {
+                                nullptr, (ufd_ != nullptr ? ufd_->fdesc_ : amd::Os::FDescInit()),
+                                binary_offset, uri_)) {
    return hipErrorInvalidKernelFile;
  }
  return hipSuccess;
@@ -600,41 +600,11 @@ inline std::ostream& operator<<(std::ostream& os, const hip_api_id_t* s) {
 }

 inline std::ostream& operator<<(std::ostream& os, const hipTextureDesc& s) {
-  os << '{'
-  << '{'
-  << s.addressMode[0]
-  << ','
-  << s.addressMode[1]
-  << ','
-  << s.addressMode[2]
-  << '}'
-  << ','
-  << s.filterMode
-  << ','
-  << s.readMode
-  << ','
-  << s.sRGB
-  << ','
-  << '{'
-  << s.borderColor[0]
-  << ','
-  << s.borderColor[1]
-  << ','
-  << s.borderColor[2]
-  << ','
-  << s.borderColor[3]
-  << '}'
-  << ','
-  << s.normalizedCoords
-  << ','
-  << s.mipmapFilterMode
-  << ','
-  << s.mipmapLevelBias
-  << ','
-  << s.minMipmapLevelClamp
-  << ','
-  << s.maxMipmapLevelClamp
-  << '}';
+  os << '{' << '{' << s.addressMode[0] << ',' << s.addressMode[1] << ',' << s.addressMode[2] << '}'
+     << ',' << s.filterMode << ',' << s.readMode << ',' << s.sRGB << ',' << '{' << s.borderColor[0]
+     << ',' << s.borderColor[1] << ',' << s.borderColor[2] << ',' << s.borderColor[3] << '}' << ','
+     << s.normalizedCoords << ',' << s.mipmapFilterMode << ',' << s.mipmapLevelBias << ','
+     << s.minMipmapLevelClamp << ',' << s.maxMipmapLevelClamp << '}';
  return os;
 }

@@ -649,13 +619,7 @@ inline std::ostream& operator<<(std::ostream& os, const hipTextureDesc* s) {


 inline std::ostream& operator<<(std::ostream& os, const dim3& s) {
-  os << '{'
-  << s.x
-  << ','
-  << s.y
-  << ','
-  << s.z
-  << '}';
+  os << '{' << s.x << ',' << s.y << ',' << s.z << '}';
  return os;
 }

@@ -669,17 +633,7 @@ inline std::ostream& operator<<(std::ostream& os, const dim3* s) {
 }

 inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatDesc& s) {
-  os << '{'
-  << s.x
-  << ','
-  << s.y
-  << ','
-  << s.z
-  << ','
-  << s.w
-  << ','
-  << s.f
-  << '}';
+  os << '{' << s.x << ',' << s.y << ',' << s.z << ',' << s.w << ',' << s.f << '}';
  return os;
 }

@@ -693,16 +647,7 @@ inline std::ostream& operator<<(std::ostream& os, const hipChannelFormatDesc* s)
 }

 inline std::ostream& operator<<(std::ostream& os, const hipMipmappedArray& s) {
-  os << '{'
-  << s.data
-  << ','
-  << s.desc
-  << ','
-  << s.width
-  << ','
-  << s.height
-  << ','
-  << s.depth
+  os << '{' << s.data << ',' << s.desc << ',' << s.width << ',' << s.height << ',' << s.depth
     << '}';
  return os;
 }
@@ -718,29 +663,15 @@ inline std::ostream& operator<<(std::ostream& os, const hipMipmappedArray* s) {


 inline std::ostream& operator<<(std::ostream& os, const hipResourceDesc& s) {
-  os << '{'
-  << s.resType
-  << ','
-  << '{';
+  os << '{' << s.resType << ',' << '{';

  switch (s.resType) {
    case hipResourceTypeLinear:
-    os << s.res.linear.devPtr
-    << ','
-    << s.res.linear.desc
-    << ','
-    << s.res.linear.sizeInBytes;
+      os << s.res.linear.devPtr << ',' << s.res.linear.desc << ',' << s.res.linear.sizeInBytes;
      break;
    case hipResourceTypePitch2D:
-    os << s.res.pitch2D.devPtr
-    << ','
-    << s.res.pitch2D.desc
-    << ','
-    << s.res.pitch2D.width
-    << ','
-    << s.res.pitch2D.height
-    << ','
-    << s.res.pitch2D.pitchInBytes;
+      os << s.res.pitch2D.devPtr << ',' << s.res.pitch2D.desc << ',' << s.res.pitch2D.width << ','
+         << s.res.pitch2D.height << ',' << s.res.pitch2D.pitchInBytes;
      break;
    case hipResourceTypeArray:
      os << s.res.array.array;
@@ -767,37 +698,11 @@ inline std::ostream& operator<<(std::ostream& os, const hipResourceDesc* s) {
 }

 inline std::ostream& operator<<(std::ostream& os, const textureReference& s) {
-  os << '{'
-  << s.normalized
-  << ','
-  << s.readMode
-  << ','
-  << s.filterMode
-  << ','
-  << '{'
-  << s.addressMode[0]
-  << ','
-  << s.addressMode[1]
-  << ','
-  << s.addressMode[2]
-  << '}'
-  << ','
-  << s.channelDesc
-  << ','
-  << s.sRGB
-  << ','
-  << s.maxAnisotropy
-  << ','
-  << s.mipmapFilterMode
-  << ','
-  << s.mipmapLevelBias
-  << ','
-  << s.minMipmapLevelClamp
-  << ','
-  << s.maxMipmapLevelClamp
-  << ','
-  << s.textureObject
-  << '}';
+  os << '{' << s.normalized << ',' << s.readMode << ',' << s.filterMode << ',' << '{'
+     << s.addressMode[0] << ',' << s.addressMode[1] << ',' << s.addressMode[2] << '}' << ','
+     << s.channelDesc << ',' << s.sRGB << ',' << s.maxAnisotropy << ',' << s.mipmapFilterMode << ','
+     << s.mipmapLevelBias << ',' << s.minMipmapLevelClamp << ',' << s.maxMipmapLevelClamp << ','
+     << s.textureObject << '}';
  return os;
 }

@@ -826,22 +731,8 @@ inline std::ostream& operator<<(std::ostream& os, const hipError_t* s) {
 }

 inline std::ostream& operator<<(std::ostream& os, const hipResourceViewDesc& s) {
-  os << '{'
-  << s.format
-  << ','
-  << s.width
-  << ','
-  << s.height
-  << ','
-  << s.depth
-  << ','
-  << s.firstMipmapLevel
-  << ','
-  << s.lastMipmapLevel
-  << ','
-  << s.firstLayer
-  << ','
-  << s.lastLayer
+  os << '{' << s.format << ',' << s.width << ',' << s.height << ',' << s.depth << ','
+     << s.firstMipmapLevel << ',' << s.lastMipmapLevel << ',' << s.firstLayer << ',' << s.lastLayer
     << '}';
  return os;
 }
@@ -856,15 +747,7 @@ inline std::ostream& operator<<(std::ostream& os, const hipResourceViewDesc* s)
 }

 inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY_DESCRIPTOR& s) {
-  os << '{'
-  << s.Width
-  << ','
-  << s.Height
-  << ','
-  << s.Format
-  << ','
-  << s.NumChannels
-  << '}';
+  os << '{' << s.Width << ',' << s.Height << ',' << s.Format << ',' << s.NumChannels << '}';
  return os;
 }

@@ -878,19 +761,8 @@ inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY_DESCRIPTOR* s)
 }

 inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY3D_DESCRIPTOR& s) {
-  os << '{'
-  << s.Width
-  << ','
-  << s.Height
-  << ','
-  << s.Depth
-  << ','
-  << s.Format
-  << ','
-  << s.NumChannels
-  << ','
-  << s.Flags
-  << '}';
+  os << '{' << s.Width << ',' << s.Height << ',' << s.Depth << ',' << s.Format << ','
+     << s.NumChannels << ',' << s.Flags << '}';
  return os;
 }

@@ -904,13 +776,7 @@ inline std::ostream& operator<<(std::ostream& os, const HIP_ARRAY3D_DESCRIPTOR*
 }

 inline std::ostream& operator<<(std::ostream& os, const hipExtent& s) {
-  os << '{'
-  << s.width
-  << ','
-  << s.height
-  << ','
-  << s.depth
-  << '}';
+  os << '{' << s.width << ',' << s.height << ',' << s.depth << '}';
  return os;
 }

@@ -66,7 +66,8 @@ void setupGLInteropOnce() {

 static inline hipError_t hipSetInteropObjects(int num_objects, void** mem_objects,
                                              std::vector<amd::Memory*>& interopObjects) {
-  if ((num_objects == 0 && mem_objects != nullptr) || (num_objects != 0 && mem_objects == nullptr)) {
+  if ((num_objects == 0 && mem_objects != nullptr) ||
+      (num_objects != 0 && mem_objects == nullptr)) {
    return hipErrorUnknown;
  }

@@ -207,7 +208,8 @@ hipError_t hipGraphicsSubResourceGetMappedArray(hipArray_t* array, hipGraphicsRe
  myarray->height = view->getHeight();
  myarray->depth = view->getDepth();

-  const cl_mem_object_type image_type = hip::getCLMemObjectType(myarray->width, myarray->height, myarray->depth, hipArrayDefault);
+  const cl_mem_object_type image_type =
+      hip::getCLMemObjectType(myarray->width, myarray->height, myarray->depth, hipArrayDefault);
  myarray->type = image_type;
  amd::Image::Format f = image->getImageFormat();
  myarray->Format = hip::getCL2hipArrayFormat(f.image_channel_data_type);
@@ -448,8 +450,8 @@ hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint ima
    // In case target is GL_TEXTURE_BUFFER
    GLint backingBuffer;
    clearGLErrors(amdContext);
-    amdContext.glenv()->glGetTexLevelParameteriv_(
-        glTarget, 0, GL_TEXTURE_BUFFER_DATA_STORE_BINDING, &backingBuffer);
+    amdContext.glenv()->glGetTexLevelParameteriv_(glTarget, 0, GL_TEXTURE_BUFFER_DATA_STORE_BINDING,
+                                                  &backingBuffer);
    if (GL_NO_ERROR != (glErr = amdContext.glenv()->glGetError_())) {
      LogWarning("Cannot get backing buffer for GL \"texture buffer\" object");
      HIP_RETURN(hipErrorInvalidValue);
@@ -529,7 +531,6 @@ hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint ima

  *resource = reinterpret_cast<hipGraphicsResource*>(pImageGL);
  HIP_RETURN(hipSuccess);
-
 }

 hipError_t hipGraphicsGLRegisterBuffer(hipGraphicsResource** resource, GLuint buffer,
@@ -28,17 +28,11 @@ THE SOFTWARE.
 #include "platform/program.hpp"
 #include <hip/hip_version.h>

-const char* amd_dbgapi_get_build_name(void) {
-  return HIP_VERSION_BUILD_NAME;
-}
+const char* amd_dbgapi_get_build_name(void) { return HIP_VERSION_BUILD_NAME; }

-const char* amd_dbgapi_get_git_hash() {
-  return HIP_VERSION_GITHASH;
-}
+const char* amd_dbgapi_get_git_hash() { return HIP_VERSION_GITHASH; }

-size_t amd_dbgapi_get_build_id() {
-  return HIP_VERSION_BUILD_ID;
-}
+size_t amd_dbgapi_get_build_id() { return HIP_VERSION_BUILD_ID; }

 #ifdef __HIP_ENABLE_PCH
 extern const char __hip_pch_wave32[];
@@ -65,15 +59,10 @@ namespace hip {
 hipError_t ihipMallocManaged(void** ptr, size_t size, size_t align = 0, bool use_host_ptr = 0);

 // Device Vars
-DeviceVar::DeviceVar(std::string name,
-                     hipModule_t hmod,
-                     int deviceId) :
-                     shadowVptr(nullptr), name_(name),
-                     amd_mem_obj_(nullptr), device_ptr_(nullptr),
-                     size_(0) {
+DeviceVar::DeviceVar(std::string name, hipModule_t hmod, int deviceId)
+    : shadowVptr(nullptr), name_(name), amd_mem_obj_(nullptr), device_ptr_(nullptr), size_(0) {
  amd::Program* program = as_amd(reinterpret_cast<cl_program>(hmod));
-  device::Program* dev_program =
-                   program->getDeviceProgram(*g_devices.at(deviceId)->devices()[0]);
+  device::Program* dev_program = program->getDeviceProgram(*g_devices.at(deviceId)->devices()[0]);

  guarantee(dev_program != nullptr, "Cannot get Device Program for module: 0x%x", hmod);

@@ -112,8 +101,8 @@ DeviceVar::~DeviceVar() {
 }

 // Device Functions
-DeviceFunc::DeviceFunc(std::string name, hipModule_t hmod) : dflock_("function lock"),
-                       name_(name), kernel_(nullptr) {
+DeviceFunc::DeviceFunc(std::string name, hipModule_t hmod)
+    : dflock_("function lock"), name_(name), kernel_(nullptr) {
  amd::Program* program = as_amd(reinterpret_cast<cl_program>(hmod));

  const amd::Symbol* symbol = program->findSymbol(name.c_str());
@@ -180,7 +169,6 @@ hipError_t Function::getStatFunc(hipFunction_t* hfunc, int deviceId) {
 }

 hipError_t Function::getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId) {
-
  if (modules_ == nullptr || *modules_ == nullptr) {
    return hipErrorInvalidDeviceFunction;
  }
@@ -199,8 +187,8 @@ hipError_t Function::getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId)
  auto* device_handle = devices[deviceId];
  const device::Kernel::WorkGroupInfo* wginfo =
      kernel->getDeviceKernel(*device_handle)->workGroupInfo();
-  int binaryVersion = device_handle->isa().versionMajor() * 10 +
-                      device_handle->isa().versionMinor();
+  int binaryVersion =
+      device_handle->isa().versionMajor() * 10 + device_handle->isa().versionMinor();
  func_attr->sharedSizeBytes = static_cast<int>(wginfo->localMemSize_);
  func_attr->binaryVersion = binaryVersion;
  func_attr->cacheModeCA = 0;
@@ -216,8 +204,15 @@ hipError_t Function::getStatFuncAttr(hipFuncAttributes* func_attr, int deviceId)

 // Abstract Vars
 Var::Var(const std::string& name, DeviceVarKind dVarKind, size_t size, int type, int norm,
-         FatBinaryInfo** modules) : name_(name), dVarKind_(dVarKind), size_(size),
-         type_(type), norm_(norm), modules_(modules), managedVarPtr_(nullptr), align_(0) {
+         FatBinaryInfo** modules)
+    : name_(name),
+      dVarKind_(dVarKind),
+      size_(size),
+      type_(type),
+      norm_(norm),
+      modules_(modules),
+      managedVarPtr_(nullptr),
+      align_(0) {
  dVar_.resize(g_devices.size());
 }

@@ -246,8 +241,7 @@ hipError_t Var::getDeviceVarPtr(DeviceVar** dvar, int deviceId) {
  guarantee((deviceId >= 0), "Invalid DeviceId, less than zero");
  guarantee((static_cast<size_t>(deviceId) < g_devices.size()),
            "Invalid DeviceId, greater than no of code objects");
-  guarantee((dVar_.size() == g_devices.size()),
-             "Device Var not initialized to size");
+  guarantee((dVar_.size() == g_devices.size()), "Device Var not initialized to size");
  *dvar = dVar_[deviceId];
  return hipSuccess;
 }
@@ -256,8 +250,7 @@ hipError_t Var::getDeviceVar(DeviceVar** dvar, int deviceId, hipModule_t hmod) {
  guarantee((deviceId >= 0), "Invalid DeviceId, less than zero");
  guarantee((static_cast<size_t>(deviceId) < g_devices.size()),
            "Invalid DeviceId, greater than no of code objects");
-  guarantee((dVar_.size() == g_devices.size()),
-             "Device Var not initialized to size");
+  guarantee((dVar_.size() == g_devices.size()), "Device Var not initialized to size");

  if (dVar_[deviceId] == nullptr) {
    dVar_[deviceId] = new DeviceVar(name_, hmod, deviceId);
@@ -295,4 +288,4 @@ hipError_t Var::allocateManagedVarPtr() {
  }
  return hipSuccess;
 }
-}; //namespace: hip
+};  // namespace hip
@@ -101,12 +101,7 @@ private:
 class Var {
 public:
  // Types of variable
-  enum DeviceVarKind {
-    DVK_Variable = 0,
-    DVK_Surface,
-    DVK_Texture,
-    DVK_Managed
-  };
+  enum DeviceVarKind { DVK_Variable = 0, DVK_Surface, DVK_Texture, DVK_Managed };

  Var(const std::string& name, DeviceVarKind dVarKind, size_t size, int type, int norm,
      FatBinaryInfo** modules = nullptr);
@@ -158,5 +153,5 @@ public:
  bool allocFlag_;                // 0 : host alloc, 1: managed alloc
 };

-}; //namespace: hip
+};  // namespace hip
 #endif /* HIP_GLOBAL_HPP */
@@ -194,8 +194,8 @@ hipError_t ihipGraphAddMemsetNode(hip::GraphNode** pGraphNode, hip::Graph* graph
    if (pMemsetParams->pitch < (pMemsetParams->width * pMemsetParams->elementSize)) {
      return hipErrorInvalidValue;
    }
-    auto sizeBytes = pMemsetParams->width * pMemsetParams->height *
-                     depth * pMemsetParams->elementSize;
+    auto sizeBytes =
+        pMemsetParams->width * pMemsetParams->height * depth * pMemsetParams->elementSize;
    status = ihipMemset3D_validate(
        {pMemsetParams->dst, pMemsetParams->pitch, pMemsetParams->width, pMemsetParams->height},
        pMemsetParams->value, {pMemsetParams->width, pMemsetParams->height, depth}, sizeBytes);
@@ -307,10 +307,10 @@ hipError_t capturehipExtLaunchKernel(hipStream_t& stream, const void*& hostFunct
                                     hipEvent_t& startEvent, hipEvent_t& stopEvent, int& flags) {
  ClPrint(amd::LOG_INFO, amd::LOG_API,
          "[hipGraph] Current capture node ExtLaunchKernel on stream : %p", stream);
-  return ihipExtLaunchKernel(
-      stream, reinterpret_cast<hipFunction_t>(const_cast<void*>(hostFunction)),
-      gridDim.x, gridDim.y, gridDim.z, blockDim.x,
-      blockDim.y, blockDim.z, sharedMemBytes, args, nullptr, startEvent, stopEvent, flags);
+  return ihipExtLaunchKernel(stream,
+                             reinterpret_cast<hipFunction_t>(const_cast<void*>(hostFunction)),
+                             gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
+                             sharedMemBytes, args, nullptr, startEvent, stopEvent, flags);
 }

 hipError_t capturehipModuleLaunchKernel(hipStream_t& stream, hipFunction_t& f, uint32_t& gridDimX,
@@ -408,8 +408,7 @@ hipError_t capturehipLaunchByPtr(hipStream_t& stream, hipFunction_t func, dim3 b

 hipError_t capturehipLaunchCooperativeKernel(hipStream_t& stream, const void*& f, dim3& gridDim,
                                             dim3& blockDim, void**& kernelParams,
-                                             uint32_t& sharedMemBytes)
-{
+                                             uint32_t& sharedMemBytes) {
  ClPrint(amd::LOG_INFO, amd::LOG_API,
          "[hipGraph] Current capture node LaunchCooperativeKernel on stream : %p", stream);
  if (!hip::isValid(stream)) {
@@ -846,7 +845,8 @@ hipError_t capturehipMemcpyToSymbolAsync(hipStream_t& stream, const void*& symbo
    HIP_RETURN(status);
  }
  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
-  hip::GraphNode* pGraphNode = new hip::GraphMemcpyNodeToSymbol(symbol, src, sizeBytes, offset, kind);
+  hip::GraphNode* pGraphNode =
+      new hip::GraphMemcpyNodeToSymbol(symbol, src, sizeBytes, offset, kind);
  status = ihipGraphAddNode(pGraphNode, s->GetCaptureGraph(), s->GetLastCapturedNodes().data(),
                            s->GetLastCapturedNodes().size());
  if (status != hipSuccess) {
@@ -966,8 +966,8 @@ hipError_t capturehipLaunchHostFunc(hipStream_t& stream, hipHostFn_t& fn, void*&
 }

 // ================================================================================================
-hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool,
-                                 size_t size, void** dev_ptr) {
+hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool, size_t size,
+                                 void** dev_ptr) {
  auto s = reinterpret_cast<hip::Stream*>(stream);
  auto mpool = reinterpret_cast<hip::MemoryPool*>(mem_pool);

@@ -990,8 +990,9 @@ hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool,
  node_params.bytesize = size;

  auto mem_alloc_node = new hip::GraphMemAllocNode(&node_params);
-  auto status = ihipGraphAddNode(mem_alloc_node, s->GetCaptureGraph(),
-      s->GetLastCapturedNodes().data(), s->GetLastCapturedNodes().size());
+  auto status =
+      ihipGraphAddNode(mem_alloc_node, s->GetCaptureGraph(), s->GetLastCapturedNodes().data(),
+                       s->GetLastCapturedNodes().size());
  if (status != hipSuccess) {
    return status;
  }
@@ -1006,8 +1007,9 @@ hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool,
 hipError_t capturehipFreeAsync(hipStream_t stream, void* dev_ptr) {
  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
  auto mem_free_node = new hip::GraphMemFreeNode(dev_ptr);
-  auto status = ihipGraphAddNode(mem_free_node, s->GetCaptureGraph(),
-      s->GetLastCapturedNodes().data(), s->GetLastCapturedNodes().size());
+  auto status =
+      ihipGraphAddNode(mem_free_node, s->GetCaptureGraph(), s->GetLastCapturedNodes().data(),
+                       s->GetLastCapturedNodes().size());
  if (status != hipSuccess) {
    return status;
  }
@@ -1308,8 +1310,7 @@ hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
  hip::GraphNode* node;
  hipError_t status = ihipGraphAddMemcpyNode(
      &node, reinterpret_cast<hip::Graph*>(graph),
-      reinterpret_cast<hip::GraphNode* const*>(pDependencies), numDependencies, pCopyParams,
-      false);
+      reinterpret_cast<hip::GraphNode* const*>(pDependencies), numDependencies, pCopyParams, false);
  *pGraphNode = reinterpret_cast<hipGraphNode_t>(node);
  HIP_RETURN(status);
 }
@@ -1324,8 +1325,8 @@ hipError_t hipDrvGraphAddMemcpyNode(hipGraphNode_t* phGraphNode, hipGraph_t hGra
    HIP_RETURN(hipErrorInvalidValue);
  }
  hip::GraphNode* node;
-  hipError_t status = ihipDrvGraphAddMemcpyNode(
-    &node, reinterpret_cast<hip::Graph*>(hGraph),
+  hipError_t status =
+      ihipDrvGraphAddMemcpyNode(&node, reinterpret_cast<hip::Graph*>(hGraph),
                                reinterpret_cast<hip::GraphNode* const*>(dependencies),
                                numDependencies, copyParams, ctx, false);
  *phGraphNode = reinterpret_cast<hipGraphNode_t>(node);
@@ -1335,14 +1336,15 @@ hipError_t hipDrvGraphAddMemcpyNode(hipGraphNode_t* phGraphNode, hipGraph_t hGra
 hipError_t hipGraphAddMemcpyNode1D(hipGraphNode_t* pGraphNode, hipGraph_t graph,
                                   const hipGraphNode_t* pDependencies, size_t numDependencies,
                                   void* dst, const void* src, size_t count, hipMemcpyKind kind) {
-  HIP_INIT_API(hipGraphAddMemcpyNode1D, pGraphNode, graph, pDependencies, numDependencies, dst,
-               src, count, kind);
+  HIP_INIT_API(hipGraphAddMemcpyNode1D, pGraphNode, graph, pDependencies, numDependencies, dst, src,
+               count, kind);
  if (pGraphNode == nullptr || graph == nullptr ||
      (numDependencies > 0 && pDependencies == nullptr)) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  hip::GraphNode* node;
-  hipError_t status = ihipGraphAddMemcpyNode1D(&node, reinterpret_cast<hip::Graph*>(graph),
+  hipError_t status =
+      ihipGraphAddMemcpyNode1D(&node, reinterpret_cast<hip::Graph*>(graph),
                               reinterpret_cast<hip::GraphNode* const*>(pDependencies),
                               numDependencies, dst, src, count, kind, false);
  *pGraphNode = reinterpret_cast<hipGraphNode_t>(node);
@@ -1379,8 +1381,8 @@ hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraph
  if (oldkind != kind) {
    HIP_RETURN(hipErrorInvalidValue);
  }
-  hipError_t status = reinterpret_cast<hip::GraphMemcpyNode1D*>(clonedNode)->SetParams(dst, src,
-                                                                              count, kind);
+  hipError_t status =
+      reinterpret_cast<hip::GraphMemcpyNode1D*>(clonedNode)->SetParams(dst, src, count, kind);
  if (status != hipSuccess) {
    HIP_RETURN(status);
  }
@@ -1426,8 +1428,7 @@ hipError_t hipDrvGraphAddMemsetNode(hipGraphNode_t* phGraphNode, hipGraph_t hGra
  pmemsetParams.pitch = memsetParams->pitch;
  pmemsetParams.value = memsetParams->value;
  pmemsetParams.width = memsetParams->width;
-  hipError_t status =
-      ihipGraphAddMemsetNode(&node, reinterpret_cast<hip::Graph*>(hGraph),
+  hipError_t status = ihipGraphAddMemsetNode(&node, reinterpret_cast<hip::Graph*>(hGraph),
                                             reinterpret_cast<hip::GraphNode* const*>(dependencies),
                                             numDependencies, &pmemsetParams, false);
  *phGraphNode = reinterpret_cast<hipGraphNode_t>(node);
@@ -1538,7 +1539,6 @@ hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t g

 hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t graph,
                                         hipGraphInstantiateParams* instantiateParams) {
-
  HIP_INIT_API(hipGraphInstantiateWithParams, pGraphExec, graph, instantiateParams);
  if (pGraphExec == nullptr || graph == nullptr || instantiateParams == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
@@ -1709,8 +1709,7 @@ hipError_t hipGraphKernelNodeSetAttribute(hipGraphNode_t hNode, hipKernelNodeAtt
    HIP_RETURN(hipErrorInvalidValue);
  }
  if (attr != hipKernelNodeAttributeAccessPolicyWindow &&
-      attr != hipKernelNodeAttributeCooperative &&
-      attr != hipLaunchAttributePriority) {
+      attr != hipKernelNodeAttributeCooperative && attr != hipLaunchAttributePriority) {
    HIP_RETURN(hipErrorInvalidValue);
  }

@@ -1728,8 +1727,7 @@ hipError_t hipGraphKernelNodeGetAttribute(hipGraphNode_t hNode, hipKernelNodeAtt
    HIP_RETURN(hipErrorInvalidValue);
  }
  if (attr != hipKernelNodeAttributeAccessPolicyWindow &&
-      attr != hipKernelNodeAttributeCooperative &&
-      attr != hipLaunchAttributePriority) {
+      attr != hipKernelNodeAttributeCooperative && attr != hipLaunchAttributePriority) {
    HIP_RETURN(hipErrorInvalidValue);
  }

@@ -1753,8 +1751,7 @@ hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
                                           hipMemcpy3DParms* pNodeParams) {
  HIP_INIT_API(hipGraphExecMemcpyNodeSetParams, hGraphExec, node, pNodeParams);
  hip::GraphNode* n = reinterpret_cast<hip::GraphNode*>(node);
-  if (hGraphExec == nullptr ||
-      !hip::GraphNode::isNodeValid(reinterpret_cast<hip::GraphNode*>(n)) ||
+  if (hGraphExec == nullptr || !hip::GraphNode::isNodeValid(reinterpret_cast<hip::GraphNode*>(n)) ||
      n->GetType() != hipGraphNodeTypeMemcpy) {
    HIP_RETURN(hipErrorInvalidValue);
  }
@@ -1826,8 +1823,8 @@ hipError_t hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
  if (clonedNode == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }
-  hipError_t status = reinterpret_cast<hip::GraphMemsetNode*>(clonedNode)
-                 ->SetParams(pNodeParams, true);
+  hipError_t status =
+      reinterpret_cast<hip::GraphMemsetNode*>(clonedNode)->SetParams(pNodeParams, true);
  if (status != hipSuccess) {
    HIP_RETURN(status);
  }
@@ -1879,9 +1876,8 @@ hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo
                                           const hipKernelNodeParams* pNodeParams) {
  HIP_INIT_API(hipGraphExecKernelNodeSetParams, hGraphExec, node, pNodeParams);
  hip::GraphNode* n = reinterpret_cast<hip::GraphNode*>(node);
-  if (hGraphExec == nullptr ||
-      !hip::GraphNode::isNodeValid(n) ||
-      pNodeParams == nullptr || pNodeParams->func == nullptr || n->GetType() != hipGraphNodeTypeKernel) {
+  if (hGraphExec == nullptr || !hip::GraphNode::isNodeValid(n) || pNodeParams == nullptr ||
+      pNodeParams->func == nullptr || n->GetType() != hipGraphNodeTypeKernel) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  hip::GraphNode* clonedNode = reinterpret_cast<hip::GraphExec*>(hGraphExec)->GetClonedNode(n);
@@ -1912,15 +1908,13 @@ hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGrap
  HIP_RETURN(hipSuccess);
 }

-hipError_t validateChildGraphNodeSetParams(hip::GraphNode* n,
-                                            hip::Graph* cg, bool exec = true) {
+hipError_t validateChildGraphNodeSetParams(hip::GraphNode* n, hip::Graph* cg, bool exec = true) {
  if (cg == nullptr || n == nullptr || !hip::GraphNode::isNodeValid(n) ||
      !hip::Graph::isGraphValid(cg) || n->GetType() != hipGraphNodeTypeGraph) {
    return hipErrorInvalidValue;
  }
  // compare with parent graph fron cloned and original node
-  if (cg == n->GetParentGraph()->getOriginalGraph()
-      || cg == n->GetParentGraph()) {
+  if (cg == n->GetParentGraph()->getOriginalGraph() || cg == n->GetParentGraph()) {
    return hipErrorUnknown;
  }

@@ -2254,8 +2248,7 @@ hipError_t hipGraphDestroyNode(hipGraphNode_t node) {
    HIP_RETURN(hipErrorInvalidValue);
  }

-  if (n->GetType() == hipGraphNodeTypeMemAlloc ||
-      n->GetType() == hipGraphNodeTypeMemFree) {
+  if (n->GetType() == hipGraphNodeTypeMemAlloc || n->GetType() == hipGraphNodeTypeMemFree) {
    HIP_RETURN(hipErrorNotSupported);
  }
  // Remove the node from graph should takecare of updating edges of parent and deps of child nodes
@@ -2275,8 +2268,7 @@ hipError_t hipGraphClone(hipGraph_t* pGraphClone, hipGraph_t originalGraph) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  for (auto n : g->GetNodes()) {
-    if (n->GetType() == hipGraphNodeTypeMemAlloc ||
-        n->GetType() == hipGraphNodeTypeMemFree) {
+    if (n->GetType() == hipGraphNodeTypeMemAlloc || n->GetType() == hipGraphNodeTypeMemFree) {
      HIP_RETURN(hipErrorNotSupported);
    }
  }
@@ -2328,8 +2320,8 @@ hipError_t hipGraphAddMemcpyNodeFromSymbol(hipGraphNode_t* pGraphNode, hipGraph_
    HIP_RETURN(status);
  }
  hip::GraphNode* node = new hip::GraphMemcpyNodeFromSymbol(dst, symbol, count, offset, kind);
-  status = ihipGraphAddNode(
-      node, g, reinterpret_cast<hip::GraphNode* const*>(pDependencies), numDependencies, false);
+  status = ihipGraphAddNode(node, g, reinterpret_cast<hip::GraphNode* const*>(pDependencies),
+                            numDependencies, false);
  *pGraphNode = reinterpret_cast<hipGraphNode_t>(node);
  HIP_RETURN(status);
 }
@@ -2345,8 +2337,8 @@ hipError_t hipGraphMemcpyNodeSetParamsFromSymbol(hipGraphNode_t node, void* dst,
    HIP_RETURN(hipErrorInvalidValue);
  }

-  HIP_RETURN(reinterpret_cast<hip::GraphMemcpyNodeFromSymbol*>(node)->SetParams(
-      dst, symbol, count, offset, kind));
+  HIP_RETURN(reinterpret_cast<hip::GraphMemcpyNodeFromSymbol*>(node)->SetParams(dst, symbol, count,
+                                                                                offset, kind));
 }

 hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec, hipGraphNode_t node,
@@ -2368,7 +2360,8 @@ hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec,
    HIP_RETURN(hipErrorInvalidValue);
  }

-  hipMemcpyKind oldkind =  reinterpret_cast<hip::GraphMemcpyNodeFromSymbol*>(clonedNode)->GetMemcpyKind();
+  hipMemcpyKind oldkind =
+      reinterpret_cast<hip::GraphMemcpyNodeFromSymbol*>(clonedNode)->GetMemcpyKind();
  if (oldkind != kind) {
    HIP_RETURN(hipErrorInvalidValue);
  }
@@ -2450,7 +2443,8 @@ hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(hipGraphExec_t hGraphExec, hi
  if (clonedNode == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }
-  hipMemcpyKind oldkind =  reinterpret_cast<hip::GraphMemcpyNodeToSymbol*>(clonedNode)->GetMemcpyKind();
+  hipMemcpyKind oldkind =
+      reinterpret_cast<hip::GraphMemcpyNodeToSymbol*>(clonedNode)->GetMemcpyKind();
  if (oldkind != kind) {
    HIP_RETURN(hipErrorInvalidValue);
  }
@@ -2739,8 +2733,8 @@ hipError_t hipGraphAddMemAllocNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
  auto mem_alloc_node = new hip::GraphMemAllocNode(pNodeParams);
  hip::GraphNode* node = mem_alloc_node;
  auto hgraph = reinterpret_cast<hip::Graph*>(graph);
-  auto status = ihipGraphAddNode(node, hgraph,
-                       reinterpret_cast<hip::GraphNode* const*>(pDependencies), numDependencies);
+  auto status = ihipGraphAddNode(
+      node, hgraph, reinterpret_cast<hip::GraphNode* const*>(pDependencies), numDependencies);
  // The address must be provided during the node creation time
  pNodeParams->dptr =
      (HIP_MEM_POOL_USE_VM) ? mem_alloc_node->ReserveAddress() : mem_alloc_node->Execute();
@@ -2781,8 +2775,7 @@ hipError_t ihipGraphAddMemFreeNode(hip::GraphNode** graphNode, hip::Graph* graph

  auto mem_free_node = new hip::GraphMemFreeNode(dptr);
  *graphNode = mem_free_node;
-  auto status =
-      ihipGraphAddNode(*graphNode, graph, pDependencies, numDependencies);
+  auto status = ihipGraphAddNode(*graphNode, graph, pDependencies, numDependencies);
  HIP_RETURN(status);
 }
 // ================================================================================================
@@ -2816,10 +2809,9 @@ hipError_t hipGraphAddMemFreeNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
  if (bGraphFound == false) {
    HIP_RETURN(hipErrorInvalidValue);
  }
-  auto status =
-      ihipGraphAddMemFreeNode(&pNode,
-                reinterpret_cast<hip::Graph*>(graph),
-                reinterpret_cast<hip::GraphNode* const*>(pDependencies), numDependencies, dev_ptr);
+  auto status = ihipGraphAddMemFreeNode(&pNode, reinterpret_cast<hip::Graph*>(graph),
+                                        reinterpret_cast<hip::GraphNode* const*>(pDependencies),
+                                        numDependencies, dev_ptr);
  *pGraphNode = reinterpret_cast<hipGraphNode_t>(pNode);
  HIP_RETURN(status);
 }
@@ -2828,8 +2820,8 @@ hipError_t hipGraphAddMemFreeNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
 hipError_t hipGraphMemFreeNodeGetParams(hipGraphNode_t node, void* dev_ptr) {
  HIP_INIT_API(hipGraphMemFreeNodeGetParams, node, dev_ptr);
  hip::GraphNode* n = reinterpret_cast<hip::GraphNode*>(node);
-  if (node == nullptr || dev_ptr == nullptr || !hip::GraphNode::isNodeValid(n)
-      || n->GetType() != hipGraphNodeTypeMemFree) {
+  if (node == nullptr || dev_ptr == nullptr || !hip::GraphNode::isNodeValid(n) ||
+      n->GetType() != hipGraphNodeTypeMemFree) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  reinterpret_cast<hip::GraphMemFreeNode*>(n)->GetParams(reinterpret_cast<void**>(dev_ptr));
@@ -2848,20 +2840,20 @@ hipError_t hipDeviceGetGraphMemAttribute(int device, hipGraphMemAttributeType at
  hipError_t result = hipErrorInvalidValue;
  switch (attr) {
    case hipGraphMemAttrUsedMemCurrent:
-      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(
-          hipMemPoolAttrUsedMemCurrent, value);
+      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(hipMemPoolAttrUsedMemCurrent,
+                                                                     value);
      break;
    case hipGraphMemAttrUsedMemHigh:
-      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(
-          hipMemPoolAttrUsedMemHigh, value);
+      result =
+          g_devices[device]->GetGraphMemoryPool()->GetAttribute(hipMemPoolAttrUsedMemHigh, value);
      break;
    case hipGraphMemAttrReservedMemCurrent:
      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(
          hipMemPoolAttrReservedMemCurrent, value);
      break;
    case hipGraphMemAttrReservedMemHigh:
-      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(
-          hipMemPoolAttrReservedMemHigh, value);
+      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(hipMemPoolAttrReservedMemHigh,
+                                                                     value);
      break;
    default:
      break;
@@ -2881,12 +2873,12 @@ hipError_t hipDeviceSetGraphMemAttribute(int device, hipGraphMemAttributeType at
  hipError_t result = hipErrorInvalidValue;
  switch (attr) {
    case hipGraphMemAttrUsedMemHigh:
-      result = g_devices[device]->GetGraphMemoryPool()->SetAttribute(
-          hipMemPoolAttrUsedMemHigh, value);
+      result =
+          g_devices[device]->GetGraphMemoryPool()->SetAttribute(hipMemPoolAttrUsedMemHigh, value);
      break;
    case hipGraphMemAttrReservedMemHigh:
-      result = g_devices[device]->GetGraphMemoryPool()->SetAttribute(
-          hipMemPoolAttrReservedMemHigh, value);
+      result = g_devices[device]->GetGraphMemoryPool()->SetAttribute(hipMemPoolAttrReservedMemHigh,
+                                                                     value);
      break;
    default:
      break;
@@ -3107,28 +3099,22 @@ hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,

  switch (nodeType) {
    case hipGraphNodeTypeKernel:
-      status = ihipGraphAddKernelNode(
-        &node, reinterpret_cast<hip::Graph*>(graph),
-        reinterpret_cast<hip::GraphNode* const*>(pDependencies), numDependencies,
-        &nodeParams->kernel,
-        nullptr, false);
+      status = ihipGraphAddKernelNode(&node, reinterpret_cast<hip::Graph*>(graph),
+                                      reinterpret_cast<hip::GraphNode* const*>(pDependencies),
+                                      numDependencies, &nodeParams->kernel, nullptr, false);
      break;
    case hipGraphNodeTypeMemcpy:
-      status = ihipGraphAddMemcpyNode(
-      &node, reinterpret_cast<hip::Graph*>(graph),
-      reinterpret_cast<hip::GraphNode* const*>(pDependencies), numDependencies,
-      &nodeParams->memcpy.copyParams,
-      false);
+      status = ihipGraphAddMemcpyNode(&node, reinterpret_cast<hip::Graph*>(graph),
+                                      reinterpret_cast<hip::GraphNode* const*>(pDependencies),
+                                      numDependencies, &nodeParams->memcpy.copyParams, false);
      break;
    case hipGraphNodeTypeMemset:
-      status =
-      ihipGraphAddMemsetNode(&node, reinterpret_cast<hip::Graph*>(graph),
+      status = ihipGraphAddMemsetNode(&node, reinterpret_cast<hip::Graph*>(graph),
                                      reinterpret_cast<hip::GraphNode* const*>(pDependencies),
                                      numDependencies, &nodeParams->memset, false);
      break;
    case hipGraphNodeTypeHost:
-      if(nodeParams->host.fn == nullptr)
-      {
+      if (nodeParams->host.fn == nullptr) {
        HIP_RETURN(hipErrorInvalidValue);
      }
      node = new hip::GraphHostNode(&nodeParams->host);
@@ -3137,8 +3123,7 @@ hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,
                                numDependencies, false);
      break;
    case hipGraphNodeTypeGraph:
-      if(nodeParams->graph.graph == nullptr)
-      {
+      if (nodeParams->graph.graph == nullptr) {
        HIP_RETURN(hipErrorInvalidValue);
      }
      node = new hip::ChildGraphNode(reinterpret_cast<hip::Graph*>(nodeParams->graph.graph));
@@ -3147,8 +3132,7 @@ hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,
                                numDependencies, false);
      break;
    case hipGraphNodeTypeWaitEvent:
-      if(nodeParams->eventWait.event == nullptr)
-      {
+      if (nodeParams->eventWait.event == nullptr) {
        HIP_RETURN(hipErrorInvalidValue);
      }
      node = new hip::GraphEventWaitNode(nodeParams->eventWait.event);
@@ -3157,8 +3141,7 @@ hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,
                                numDependencies, false);
      break;
    case hipGraphNodeTypeEventRecord:
-      if(nodeParams->eventRecord.event == nullptr)
-      {
+      if (nodeParams->eventRecord.event == nullptr) {
        HIP_RETURN(hipErrorInvalidValue);
      }
      node = new hip::GraphEventRecordNode(nodeParams->eventRecord.event);
@@ -3176,15 +3159,13 @@ hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,
      break;
    case hipGraphNodeTypeMemAlloc:
      params = nodeParams->alloc;
-      if (params.bytesize == 0 ||
-            params.poolProps.allocType != hipMemAllocationTypePinned ||
+      if (params.bytesize == 0 || params.poolProps.allocType != hipMemAllocationTypePinned ||
          params.poolProps.location.type != hipMemLocationTypeDevice) {
        params.dptr = nullptr;
        HIP_RETURN(hipErrorInvalidValue);
      }
      if (params.poolProps.location.type == hipMemLocationTypeDevice) {
-        if (params.poolProps.location.id < 0 ||
-            params.poolProps.location.id >= g_devices.size()) {
+        if (params.poolProps.location.id < 0 || params.poolProps.location.id >= g_devices.size()) {
          HIP_RETURN(hipErrorInvalidValue);
        }
      }
@@ -3192,9 +3173,9 @@ hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,
      params.dptr = nullptr;
      mem_alloc_node = new hip::GraphMemAllocNode(&params);
      node = mem_alloc_node;
-      status =
-          ihipGraphAddNode(node, reinterpret_cast<hip::Graph*>(graph),
-                       reinterpret_cast<hip::GraphNode* const*>(pDependencies), numDependencies);
+      status = ihipGraphAddNode(node, reinterpret_cast<hip::Graph*>(graph),
+                                reinterpret_cast<hip::GraphNode* const*>(pDependencies),
+                                numDependencies);
      // The address must be provided during the node creation time
      nodeParams->alloc.dptr =
          (HIP_MEM_POOL_USE_VM) ? mem_alloc_node->ReserveAddress() : mem_alloc_node->Execute();
@@ -3216,9 +3197,9 @@ hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,
        }
      }
      node = new hip::GraphMemFreeNode(nodeParams->free.dptr);
-    status =
-      ihipGraphAddNode(node, reinterpret_cast<hip::Graph*>(graph),
-                       reinterpret_cast<hip::GraphNode* const*>(pDependencies), numDependencies);
+      status = ihipGraphAddNode(node, reinterpret_cast<hip::Graph*>(graph),
+                                reinterpret_cast<hip::GraphNode* const*>(pDependencies),
+                                numDependencies);
      break;
    default:
      status = hipErrorInvalidValue;
@@ -3228,21 +3209,22 @@ hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph,
  HIP_RETURN(status);
 }

-hipError_t hipGraphAddExternalSemaphoresSignalNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
-                                 const hipGraphNode_t* pDependencies, size_t numDependencies,
-                                 const hipExternalSemaphoreSignalNodeParams* nodeParams) {
+hipError_t hipGraphAddExternalSemaphoresSignalNode(
+    hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies,
+    size_t numDependencies, const hipExternalSemaphoreSignalNodeParams* nodeParams) {
  HIP_INIT_API(hipGraphAddExternalSemaphoresSignalNode, pGraphNode, graph, pDependencies,
               numDependencies, nodeParams);
  hip::GraphNode* node = new hip::hipGraphExternalSemSignalNode(nodeParams);
-  hipError_t status = ihipGraphAddNode(node, reinterpret_cast<hip::Graph*>(graph),
+  hipError_t status =
+      ihipGraphAddNode(node, reinterpret_cast<hip::Graph*>(graph),
                       reinterpret_cast<hip::GraphNode* const*>(pDependencies), numDependencies);
  *pGraphNode = reinterpret_cast<hipGraphNode_t>(node);
  HIP_RETURN(status);
 }

-hipError_t hipGraphAddExternalSemaphoresWaitNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
-                                 const hipGraphNode_t* pDependencies, size_t numDependencies,
-                                 const hipExternalSemaphoreWaitNodeParams* nodeParams) {
+hipError_t hipGraphAddExternalSemaphoresWaitNode(
+    hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies,
+    size_t numDependencies, const hipExternalSemaphoreWaitNodeParams* nodeParams) {
  HIP_INIT_API(hipGraphAddExternalSemaphoresWaitNode, pGraphNode, graph, pDependencies,
               numDependencies, nodeParams);
  if (pGraphNode == nullptr || graph == nullptr ||
@@ -3250,14 +3232,15 @@ hipError_t hipGraphAddExternalSemaphoresWaitNode(hipGraphNode_t* pGraphNode, hip
    HIP_RETURN(hipErrorInvalidValue);
  }
  hip::GraphNode* node = new hip::hipGraphExternalSemWaitNode(nodeParams);
-  hipError_t status = ihipGraphAddNode(node, reinterpret_cast<hip::Graph*>(graph),
+  hipError_t status =
+      ihipGraphAddNode(node, reinterpret_cast<hip::Graph*>(graph),
                       reinterpret_cast<hip::GraphNode* const*>(pDependencies), numDependencies);
  *pGraphNode = reinterpret_cast<hipGraphNode_t>(node);
  HIP_RETURN(status);
 }

-hipError_t hipGraphExternalSemaphoresSignalNodeSetParams(hipGraphNode_t hNode,
-                                    const hipExternalSemaphoreSignalNodeParams* nodeParams) {
+hipError_t hipGraphExternalSemaphoresSignalNodeSetParams(
+    hipGraphNode_t hNode, const hipExternalSemaphoreSignalNodeParams* nodeParams) {
  HIP_INIT_API(hipGraphExternalSemaphoresSignalNodeSetParams, hNode, nodeParams);
  hip::GraphNode* n = reinterpret_cast<hip::GraphNode*>(hNode);
  if (!hip::GraphNode::isNodeValid(n) || nodeParams == nullptr) {
@@ -3266,8 +3249,8 @@ hipError_t hipGraphExternalSemaphoresSignalNodeSetParams(hipGraphNode_t hNode,
  HIP_RETURN(reinterpret_cast<hip::hipGraphExternalSemSignalNode*>(n)->SetParams(nodeParams));
 }

-hipError_t hipGraphExternalSemaphoresWaitNodeSetParams(hipGraphNode_t hNode,
-                                      const hipExternalSemaphoreWaitNodeParams* nodeParams) {
+hipError_t hipGraphExternalSemaphoresWaitNodeSetParams(
+    hipGraphNode_t hNode, const hipExternalSemaphoreWaitNodeParams* nodeParams) {
  HIP_INIT_API(hipGraphExternalSemaphoresWaitNodeSetParams, hNode, nodeParams);
  hip::GraphNode* n = reinterpret_cast<hip::GraphNode*>(hNode);
  if (!hip::GraphNode::isNodeValid(n) || nodeParams == nullptr) {
@@ -3276,8 +3259,8 @@ hipError_t hipGraphExternalSemaphoresWaitNodeSetParams(hipGraphNode_t hNode,
  HIP_RETURN(reinterpret_cast<hip::hipGraphExternalSemWaitNode*>(n)->SetParams(nodeParams));
 }

-hipError_t hipGraphExternalSemaphoresSignalNodeGetParams(hipGraphNode_t hNode,
-                                              hipExternalSemaphoreSignalNodeParams* params_out) {
+hipError_t hipGraphExternalSemaphoresSignalNodeGetParams(
+    hipGraphNode_t hNode, hipExternalSemaphoreSignalNodeParams* params_out) {
  HIP_INIT_API(hipGraphExternalSemaphoresSignalNodeGetParams, hNode, params_out);
  hip::GraphNode* n = reinterpret_cast<hip::GraphNode*>(hNode);
  if (!hip::GraphNode::isNodeValid(n) || params_out == nullptr) {
@@ -3287,8 +3270,8 @@ hipError_t hipGraphExternalSemaphoresSignalNodeGetParams(hipGraphNode_t hNode,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipGraphExternalSemaphoresWaitNodeGetParams(hipGraphNode_t hNode,
-                                              hipExternalSemaphoreWaitNodeParams* params_out) {
+hipError_t hipGraphExternalSemaphoresWaitNodeGetParams(
+    hipGraphNode_t hNode, hipExternalSemaphoreWaitNodeParams* params_out) {
  HIP_INIT_API(hipGraphExternalSemaphoresWaitNodeGetParams, hNode, params_out);
  hip::GraphNode* n = reinterpret_cast<hip::GraphNode*>(hNode);
  if (!hip::GraphNode::isNodeValid(n) || params_out == nullptr) {
@@ -3298,8 +3281,8 @@ hipError_t hipGraphExternalSemaphoresWaitNodeGetParams(hipGraphNode_t hNode,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipGraphExecExternalSemaphoresSignalNodeSetParams(hipGraphExec_t hGraphExec,
-                                        hipGraphNode_t hNode,
+hipError_t hipGraphExecExternalSemaphoresSignalNodeSetParams(
+    hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
    const hipExternalSemaphoreSignalNodeParams* nodeParams) {
  HIP_INIT_API(hipGraphExecExternalSemaphoresSignalNodeSetParams, hGraphExec, hNode, nodeParams);
  hip::GraphNode* n = reinterpret_cast<hip::GraphNode*>(hNode);
@@ -3313,12 +3296,12 @@ hipError_t hipGraphExecExternalSemaphoresSignalNodeSetParams(hipGraphExec_t hGra
  if (clonedNode == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }
-  HIP_RETURN(reinterpret_cast<hip::hipGraphExternalSemSignalNode*>(clonedNode)->SetParams(
-      nodeParams));
+  HIP_RETURN(
+      reinterpret_cast<hip::hipGraphExternalSemSignalNode*>(clonedNode)->SetParams(nodeParams));
 }

-hipError_t hipGraphExecExternalSemaphoresWaitNodeSetParams(hipGraphExec_t hGraphExec,
-                                              hipGraphNode_t hNode,
+hipError_t hipGraphExecExternalSemaphoresWaitNodeSetParams(
+    hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
    const hipExternalSemaphoreWaitNodeParams* nodeParams) {
  HIP_INIT_API(hipGraphExecExternalSemaphoresWaitNodeSetParams, hGraphExec, hNode, nodeParams);
  hip::GraphNode* n = reinterpret_cast<hip::GraphNode*>(hNode);
@@ -3332,8 +3315,8 @@ hipError_t hipGraphExecExternalSemaphoresWaitNodeSetParams(hipGraphExec_t hGraph
  if (clonedNode == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }
-  HIP_RETURN(reinterpret_cast<hip::hipGraphExternalSemWaitNode*>(clonedNode)->SetParams(
-      nodeParams));
+  HIP_RETURN(
+      reinterpret_cast<hip::hipGraphExternalSemWaitNode*>(clonedNode)->SetParams(nodeParams));
 }

 hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
@@ -3359,10 +3342,9 @@ hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGr
    }
  }
  hip::GraphNode* pNode;
-  auto status =
-      ihipGraphAddMemFreeNode(&pNode,
-                    reinterpret_cast<hip::Graph*>(hGraph),
-                    reinterpret_cast<hip::GraphNode* const*>(dependencies), numDependencies, dptr);
+  auto status = ihipGraphAddMemFreeNode(&pNode, reinterpret_cast<hip::Graph*>(hGraph),
+                                        reinterpret_cast<hip::GraphNode* const*>(dependencies),
+                                        numDependencies, dptr);
  *phGraphNode = reinterpret_cast<hipGraphNode_t>(pNode);
  HIP_RETURN(status);
 }
@@ -3371,18 +3353,17 @@ hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGrap
                                              const HIP_MEMCPY3D* copyParams, hipCtx_t ctx) {
  HIP_INIT_API(hipDrvGraphExecMemcpyNodeSetParams, hGraphExec, hNode, copyParams);
  hip::GraphNode* n = reinterpret_cast<hip::GraphNode*>(hNode);
-  if (hGraphExec == nullptr ||
-                    !hip::GraphNode::isNodeValid(reinterpret_cast<hip::GraphNode*>(n))) {
+  if (hGraphExec == nullptr || !hip::GraphNode::isNodeValid(reinterpret_cast<hip::GraphNode*>(n))) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  if (ihipDrvMemcpy3D_validate(copyParams) != hipSuccess) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  // Check if pNodeParams passed is a empty struct
-  if (((copyParams->srcArray == 0) && (copyParams->srcHost == nullptr)
-       && (copyParams->srcDevice == nullptr)) ||
-      ((copyParams->dstArray == 0) && (copyParams->dstHost == nullptr)
-       && (copyParams->dstDevice == nullptr))) {
+  if (((copyParams->srcArray == 0) && (copyParams->srcHost == nullptr) &&
+       (copyParams->srcDevice == nullptr)) ||
+      ((copyParams->dstArray == 0) && (copyParams->dstHost == nullptr) &&
+       (copyParams->dstDevice == nullptr))) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  hip::GraphNode* clonedNode = reinterpret_cast<hip::GraphExec*>(hGraphExec)->GetClonedNode(n);
@@ -3415,8 +3396,8 @@ hipError_t hipDrvGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGrap
  if (clonedNode == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }
-  hipError_t status = reinterpret_cast<hip::GraphMemsetNode*>(clonedNode)
-                 ->SetParams(memsetParams, true);
+  hipError_t status =
+      reinterpret_cast<hip::GraphMemsetNode*>(clonedNode)->SetParams(memsetParams, true);
  if (status != hipSuccess) {
    HIP_RETURN(status);
  }
@@ -3456,20 +3437,18 @@ hipError_t ihipGraphNodeSetParams(hip::GraphNode* n, hipGraphNodeParams *nodePar
          break;
        }
      }
-      status = reinterpret_cast<hip::GraphMemcpyNode*>(n)->SetParams(
-                                                &nodeParams->memcpy.copyParams);
+      status =
+          reinterpret_cast<hip::GraphMemcpyNode*>(n)->SetParams(&nodeParams->memcpy.copyParams);
      break;
    case hipGraphNodeTypeMemset:
-      status =
-      reinterpret_cast<hip::GraphMemsetNode*>(n)->SetParams(&nodeParams->memset);
+      status = reinterpret_cast<hip::GraphMemsetNode*>(n)->SetParams(&nodeParams->memset);
      break;
    case hipGraphNodeTypeHost:
      if (nodeParams->host.fn == nullptr || nodeParams->host.userData == nullptr) {
        status = hipErrorInvalidValue;
        break;
      }
-      status =
-      reinterpret_cast<hip::GraphHostNode*>(n)->SetParams(&nodeParams->host);
+      status = reinterpret_cast<hip::GraphHostNode*>(n)->SetParams(&nodeParams->host);
      break;
    case hipGraphNodeTypeGraph:
      cg = reinterpret_cast<hip::Graph*>(nodeParams->graph.graph);
@@ -3485,16 +3464,16 @@ hipError_t ihipGraphNodeSetParams(hip::GraphNode* n, hipGraphNodeParams *nodePar
        status = hipErrorInvalidValue;
        break;
      }
-      status = reinterpret_cast<hip::GraphEventWaitNode*>(n)->SetParams(
-                                                 nodeParams->eventWait.event);
+      status =
+          reinterpret_cast<hip::GraphEventWaitNode*>(n)->SetParams(nodeParams->eventWait.event);
      break;
    case hipGraphNodeTypeEventRecord:
      if (nodeParams->eventRecord.event == nullptr) {
        status = hipErrorInvalidValue;
        break;
      }
-      status = reinterpret_cast<hip::GraphEventRecordNode*>(n)->SetParams(
-                                                 nodeParams->eventRecord.event);
+      status =
+          reinterpret_cast<hip::GraphEventRecordNode*>(n)->SetParams(nodeParams->eventRecord.event);
      break;
    case hipGraphNodeTypeExtSemaphoreSignal:
      status = reinterpret_cast<hip::hipGraphExternalSemSignalNode*>(n)->SetParams(
@@ -3619,8 +3598,7 @@ hipError_t hipGraphBatchMemOpNodeSetParams(hipGraphNode_t hNode,
  HIP_RETURN(reinterpret_cast<hip::hipGraphBatchMemOpNode*>(n)->SetParams(nodeParams));
 }

-hipError_t hipGraphExecBatchMemOpNodeSetParams(hipGraphExec_t hGraphExec,
-                                               hipGraphNode_t hNode,
+hipError_t hipGraphExecBatchMemOpNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
                                               const hipBatchMemOpNodeParams* nodeParams) {
  HIP_INIT_API(hipGraphExecBatchMemOpNodeSetParams, hGraphExec, hNode, nodeParams);
  hip::GraphNode* n = reinterpret_cast<hip::GraphNode*>(hNode);
@@ -108,7 +108,8 @@ hipError_t capturehipMemset3DAsync(hipStream_t& stream, hipPitchedPtr& pitchedDe

 hipError_t capturehipLaunchHostFunc(hipStream_t& stream, hipHostFn_t& fn, void*& userData);

-hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool, size_t size, void** dev_ptr);
+hipError_t capturehipMallocAsync(hipStream_t stream, hipMemPool_t mem_pool, size_t size,
+                                 void** dev_ptr);

 hipError_t capturehipFreeAsync(hipStream_t stream, void* dev_ptr);
-}
+}  // namespace hip
@@ -48,7 +48,7 @@ const char* GetGraphNodeTypeString(uint32_t op) {
  };
  return case_string;
 };
-}
+}  // namespace

 namespace hip {

@@ -250,8 +250,7 @@ bool Graph::TopologicalOrder(std::vector<Node>& TopoOrder) {
    }
    inDegree[entry] = entry->GetInDegree();
  }
-  while (!q.empty())
-  {
+  while (!q.empty()) {
    Node node = q.front();
    TopoOrder.push_back(node);
    q.pop();
@@ -333,8 +332,8 @@ bool GraphExec::isGraphExecValid(GraphExec* pGraphExec) {
 hipError_t GraphExec::CreateStreams(uint32_t num_streams) {
  parallel_streams_.reserve(num_streams);
  for (uint32_t i = 0; i < num_streams; ++i) {
-    auto stream = new hip::Stream(hip::getCurrentDevice(),
-                                  hip::Stream::Priority::Normal, hipStreamNonBlocking);
+    auto stream = new hip::Stream(hip::getCurrentDevice(), hip::Stream::Priority::Normal,
+                                  hipStreamNonBlocking);
    if (stream == nullptr || !stream->Create()) {
      if (stream != nullptr) {
        hip::Stream::Destroy(stream);
@@ -581,8 +580,9 @@ bool Graph::RunOneNode(Node node, bool wait) {
    // Execute the nodes in the edges list
    for (auto edge : node->GetEdges()) {
      // Don't wait in the nodes, executed on the same streams and if it has just one dependency
-      bool wait = ((i < DEBUG_HIP_FORCE_GRAPH_QUEUES) ||
-                   (edge->GetDependencies().size() > 1)) ? true : false;
+      bool wait = ((i < DEBUG_HIP_FORCE_GRAPH_QUEUES) || (edge->GetDependencies().size() > 1))
+          ? true
+          : false;
      // Execute the edge node
      if (!RunOneNode(edge, wait)) {
        return false;
@@ -599,11 +599,8 @@ bool Graph::RunOneNode(Node node, bool wait) {
 }

 // ================================================================================================
-bool Graph::RunNodes(
-    int32_t base_stream,
-    const std::vector<hip::Stream*>* parallel_streams,
+bool Graph::RunNodes(int32_t base_stream, const std::vector<hip::Stream*>* parallel_streams,
                     const amd::Command::EventWaitList* parent_waitlist) {
-
  if (parallel_streams != nullptr) {
    streams_ = *parallel_streams;
  }
@@ -594,8 +594,7 @@ class Graph {

  //! Schedules one node on a vitual stream.
  //! It will also process the nodes in edges, using recursion
-  void ScheduleOneNode(
-    Node node,      //!< Node for scheduling on a virtual stream
+  void ScheduleOneNode(Node node,     //!< Node for scheduling on a virtual stream
                       int stream_id  //!< Current active virtual stream to use for scheduling
  );

@@ -609,8 +608,7 @@ class Graph {
  );

  //! Runs one node on the assigned stream
-  bool RunOneNode(
-    Node node,    //!< Node for the execution on GPU
+  bool RunOneNode(Node node,  //!< Node for the execution on GPU
                  bool wait   //!< Wait dependencies
  );

@@ -693,17 +691,11 @@ class Graph {
    return false;
  }

-  void FreeAllMemory(hip::Stream* stream) {
-    mem_pool_->FreeAllMemory(stream);
-  }
+  void FreeAllMemory(hip::Stream* stream) { mem_pool_->FreeAllMemory(stream); }

-  bool IsGraphInstantiated() const {
-    return graphInstantiated_;
-  }
+  bool IsGraphInstantiated() const { return graphInstantiated_; }

-  void SetGraphInstantiated(bool graphInstantiate) {
-    graphInstantiated_ = graphInstantiate;
-  }
+  void SetGraphInstantiated(bool graphInstantiate) { graphInstantiated_ = graphInstantiate; }

  //! returns count of unreleased memalloc nodes
  uint32_t GetMemAllocNodeCount() const { return memalloc_nodes_; }
@@ -798,9 +790,7 @@ class GraphExec : public amd::ReferenceCountedObject, public Graph {
  void SetKernelArgManager(GraphKernelArgManager* kernArgManager) {
    kernArgManager_ = kernArgManager;
  }
-  GraphKernelArgManager* GetKernelArgManager() {
-    return kernArgManager_;
-  }
+  GraphKernelArgManager* GetKernelArgManager() { return kernArgManager_; }
  static void DecrementRefCount(cl_event event, cl_int command_exec_status, void* user_data);
  hipError_t AllocKernelArgForGraphNode();
  void GetKernelArgSizeForGraph(size_t& kernArgSizeForGraph);
@@ -838,13 +828,9 @@ class ChildGraphNode : public GraphNode, public GraphExec {

  bool GetGraphCaptureStatus() { return graphCaptureStatus_; }

-  std::vector<Node>& GetChildGraphNodeOrder() {
-    return topoOrder_;
-  }
+  std::vector<Node>& GetChildGraphNodeOrder() { return topoOrder_; }

-  void SetStream(hip::Stream* stream) override {
-    stream_ = stream;
-  }
+  void SetStream(hip::Stream* stream) override { stream_ = stream; }

  bool TopologicalOrder(std::vector<Node>& TopoOrder) override {
    return Graph::TopologicalOrder(TopoOrder);
@@ -856,8 +842,7 @@ class ChildGraphNode : public GraphNode, public GraphExec {
    } else if (max_streams_ == 1) {
      for (int i = 0; i < topoOrder_.size(); i++) {
        topoOrder_[i]->SetStream(stream_);
-        hipError_t status =
-            topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
+        hipError_t status = topoOrder_[i]->CreateCommand(topoOrder_[i]->GetQueue());
        topoOrder_[i]->EnqueueCommands(stream_);
      }
    }
@@ -964,36 +949,30 @@ class GraphKernelNode : public GraphNode {
              "%u}\n| {priority | %d}\n}",
              label_, GetID(), function->name().c_str(), kernelParams_.gridDim.x,
              kernelParams_.gridDim.y, kernelParams_.gridDim.z, kernelParams_.blockDim.x,
-              kernelParams_.blockDim.y, kernelParams_.blockDim.z,
-              kernelParams_.sharedMemBytes, this, kernelParams_.func,
-              kernelAttr_.accessPolicyWindow.base_ptr, kernelAttr_.accessPolicyWindow.num_bytes,
-              kernelAttr_.accessPolicyWindow.hitRatio, kernelAttr_.accessPolicyWindow.hitProp,
-              kernelAttr_.accessPolicyWindow.missProp, kernelAttr_.cooperative,
-              kernelAttr_.priority);
+              kernelParams_.blockDim.y, kernelParams_.blockDim.z, kernelParams_.sharedMemBytes,
+              this, kernelParams_.func, kernelAttr_.accessPolicyWindow.base_ptr,
+              kernelAttr_.accessPolicyWindow.num_bytes, kernelAttr_.accessPolicyWindow.hitRatio,
+              kernelAttr_.accessPolicyWindow.hitProp, kernelAttr_.accessPolicyWindow.missProp,
+              kernelAttr_.cooperative, kernelAttr_.priority);
      label = buffer;
-    }
-    else if (flag == hipGraphDebugDotFlagsKernelNodeAttributes) {
+    } else if (flag == hipGraphDebugDotFlagsKernelNodeAttributes) {
      sprintf(buffer,
              "{\n%s\n| {ID | %d | %s}\n"
              "| {accessPolicyWindow | {base_ptr | num_bytes | "
              "hitRatio | hitProp | missProp} |\n| {%p | %zu | %f | %d | %d}}\n| {cooperative | "
              "%u}\n| {priority | %d}\n}",
-              label_, GetID(), function->name().c_str(),
-              kernelAttr_.accessPolicyWindow.base_ptr, kernelAttr_.accessPolicyWindow.num_bytes,
-              kernelAttr_.accessPolicyWindow.hitRatio, kernelAttr_.accessPolicyWindow.hitProp,
-              kernelAttr_.accessPolicyWindow.missProp, kernelAttr_.cooperative,
-              kernelAttr_.priority);
+              label_, GetID(), function->name().c_str(), kernelAttr_.accessPolicyWindow.base_ptr,
+              kernelAttr_.accessPolicyWindow.num_bytes, kernelAttr_.accessPolicyWindow.hitRatio,
+              kernelAttr_.accessPolicyWindow.hitProp, kernelAttr_.accessPolicyWindow.missProp,
+              kernelAttr_.cooperative, kernelAttr_.priority);
      label = buffer;
-    }
-    else if (flag == hipGraphDebugDotFlagsKernelNodeParams) {
-      sprintf(buffer, "%d\n%s\n\\<\\<\\<(%u,%u,%u),(%u,%u,%u),%u\\>\\>\\>",
-              GetID(), function->name().c_str(), kernelParams_.gridDim.x,
-              kernelParams_.gridDim.y, kernelParams_.gridDim.z,
-              kernelParams_.blockDim.x, kernelParams_.blockDim.y,
+    } else if (flag == hipGraphDebugDotFlagsKernelNodeParams) {
+      sprintf(buffer, "%d\n%s\n\\<\\<\\<(%u,%u,%u),(%u,%u,%u),%u\\>\\>\\>", GetID(),
+              function->name().c_str(), kernelParams_.gridDim.x, kernelParams_.gridDim.y,
+              kernelParams_.gridDim.z, kernelParams_.blockDim.x, kernelParams_.blockDim.y,
              kernelParams_.blockDim.z, kernelParams_.sharedMemBytes);
      label = buffer;
-    }
-    else {
+    } else {
      label = std::to_string(GetID()) + "\n" + function->name() + "\n";
    }
    return label;
@@ -1241,8 +1220,7 @@ class GraphKernelNode : public GraphNode {
    int accessPolicyMaxWindowSize = prop.accessPolicyMaxWindowSize;
    // updates kernel attr params
    if (attr == hipKernelNodeAttributeAccessPolicyWindow) {
-      if (params->accessPolicyWindow.hitRatio > 1 ||
-          params->accessPolicyWindow.hitRatio < 0) {
+      if (params->accessPolicyWindow.hitRatio > 1 || params->accessPolicyWindow.hitRatio < 0) {
        return hipErrorInvalidValue;
      }

@@ -1327,9 +1305,8 @@ class GraphKernelNode : public GraphNode {
    return SetParams(&kernelNode->kernelParams_);
  }

-  static hipError_t validateKernelParams(const hipKernelNodeParams* pNodeParams,
-                                         hipFunction_t func, int devId) {
-
+  static hipError_t validateKernelParams(const hipKernelNodeParams* pNodeParams, hipFunction_t func,
+                                         int devId) {
    amd::HIPLaunchParams launch_params(pNodeParams->gridDim.x, pNodeParams->gridDim.y,
                                       pNodeParams->gridDim.z, pNodeParams->blockDim.x,
                                       pNodeParams->blockDim.y, pNodeParams->blockDim.z,
@@ -1372,15 +1349,13 @@ class GraphMemcpyNode : public GraphNode {
  }
  ~GraphMemcpyNode() {}

-  GraphMemcpyNode(const GraphMemcpyNode& rhs) : GraphNode(rhs) {
-    copyParams_ = rhs.copyParams_;
-  }
+  GraphMemcpyNode(const GraphMemcpyNode& rhs) : GraphNode(rhs) { copyParams_ = rhs.copyParams_; }

  GraphNode* clone() const override { return new GraphMemcpyNode(*this); }

  virtual hipError_t CreateCommand(hip::Stream* stream) override {
-    if ((copyParams_.kind == hipMemcpyHostToHost || copyParams_.kind == hipMemcpyDefault)
-      && IsHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr)) {
+    if ((copyParams_.kind == hipMemcpyHostToHost || copyParams_.kind == hipMemcpyDefault) &&
+        IsHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr)) {
      return hipSuccess;
    }
    hipError_t status = GraphNode::CreateCommand(stream);
@@ -1397,9 +1372,9 @@ class GraphMemcpyNode : public GraphNode {
  virtual void EnqueueCommands(hip::Stream* stream) override {
    if ((copyParams_.kind == hipMemcpyHostToHost || copyParams_.kind == hipMemcpyDefault) &&
        isEnabled_ && IsHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr)) {
-      ihipHtoHMemcpy(copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr,
-                     copyParams_.extent.width * copyParams_.extent.height *
-                     copyParams_.extent.depth, *stream);
+      ihipHtoHMemcpy(
+          copyParams_.dstPtr.ptr, copyParams_.srcPtr.ptr,
+          copyParams_.extent.width * copyParams_.extent.height * copyParams_.extent.depth, *stream);
      return;
    }
    GraphNode::EnqueueCommands(stream);
@@ -1493,9 +1468,8 @@ class GraphMemcpyNode : public GraphNode {
          copyParams_.srcPtr.ptr, copyParams_.srcPtr.xsize, copyParams_.srcPtr.ysize,
          copyParams_.dstPtr.pitch, copyParams_.dstPtr.ptr, copyParams_.dstPtr.xsize,
          copyParams_.dstPtr.ysize, copyParams_.srcPos.x, copyParams_.srcPos.y,
-          copyParams_.srcPos.z, copyParams_.dstPos.x, copyParams_.dstPos.y,
-          copyParams_.dstPos.z, copyParams_.extent.width, copyParams_.extent.height,
-          copyParams_.extent.depth);
+          copyParams_.srcPos.z, copyParams_.dstPos.x, copyParams_.dstPos.y, copyParams_.dstPos.z,
+          copyParams_.extent.width, copyParams_.extent.height, copyParams_.extent.depth);
      label = buffer;
    } else {
      label = std::to_string(GetID()) + "\nMEMCPY\n(" + memcpyDirection + ")";
@@ -1641,9 +1615,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
    }
  }

-  hipMemcpyKind GetMemcpyKind() const override {
-    return kind_;
-  }
+  hipMemcpyKind GetMemcpyKind() const override { return kind_; }

  hipError_t SetParams(void* dst, const void* src, size_t count, hipMemcpyKind kind) {
    hipError_t status = ValidateParams(dst, src, count, kind);
@@ -1699,9 +1671,9 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
          "| %zu}}\n| {{srcPos | {{x | %zu} | {y | %zu} | {z | %zu}}} | {dstPos | {{x | %zu} | {y "
          "| "
          "%zu} | {z | %zu}}} | {Extent | {{Width | %zu} | {Height | %zu} | {Depth | %zu}}}}\n}",
-          label_, GetID(), this, memcpyDirection.c_str(), (size_t)0, src_, (size_t)0,
-          (size_t)0, (size_t)0, dst_, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0,
-          (size_t)0, (size_t)0, (size_t)0, count_, (size_t)1, (size_t)1);
+          label_, GetID(), this, memcpyDirection.c_str(), (size_t)0, src_, (size_t)0, (size_t)0,
+          (size_t)0, dst_, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0,
+          (size_t)0, (size_t)0, count_, (size_t)1, (size_t)1);
      label = buffer;
    } else {
      label = std::to_string(GetID()) + "\n" + label_ + "\n(" + memcpyDirection + "," +
@@ -1805,8 +1777,8 @@ class GraphMemcpyNodeFromSymbol : public GraphMemcpyNode1D {
    if (dstMemory == nullptr && kind != hipMemcpyDeviceToHost && kind != hipMemcpyDefault) {
      return hipErrorInvalidMemcpyDirection;
    } else if (dstMemory != nullptr && dstMemory->getMemFlags() == 0 &&
-               kind != hipMemcpyDeviceToDevice && kind != hipMemcpyDeviceToDeviceNoCU
-               && kind != hipMemcpyDefault) {
+               kind != hipMemcpyDeviceToDevice && kind != hipMemcpyDeviceToDeviceNoCU &&
+               kind != hipMemcpyDefault) {
      return hipErrorInvalidMemcpyDirection;
    } else if (kind == hipMemcpyHostToHost || kind == hipMemcpyHostToDevice) {
      return hipErrorInvalidMemcpyDirection;
@@ -1905,9 +1877,8 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D {
    }
    if (srcMemory == nullptr && kind != hipMemcpyHostToDevice && kind != hipMemcpyDefault) {
      return hipErrorInvalidValue;
-    } else if (srcMemory != nullptr && srcFlag == 0 &&
-               kind != hipMemcpyDeviceToDevice && kind != hipMemcpyDeviceToDeviceNoCU
-               && kind != hipMemcpyDefault) {
+    } else if (srcMemory != nullptr && srcFlag == 0 && kind != hipMemcpyDeviceToDevice &&
+               kind != hipMemcpyDeviceToDeviceNoCU && kind != hipMemcpyDefault) {
      return hipErrorInvalidValue;
    } else if (kind == hipMemcpyHostToHost || kind == hipMemcpyDeviceToHost) {
      return hipErrorInvalidValue;
@@ -1921,8 +1892,7 @@ class GraphMemcpyNodeToSymbol : public GraphMemcpyNode1D {
  }

  virtual hipError_t SetParams(GraphNode* node) override {
-    const GraphMemcpyNodeToSymbol* memcpyNode =
-        static_cast<GraphMemcpyNodeToSymbol const*>(node);
+    const GraphMemcpyNodeToSymbol* memcpyNode = static_cast<GraphMemcpyNodeToSymbol const*>(node);
    return SetParams(memcpyNode->src_, memcpyNode->symbol_, memcpyNode->count_, memcpyNode->offset_,
                     memcpyNode->kind_);
  }
@@ -1932,6 +1902,7 @@ class GraphMemsetNode : public GraphNode {
  size_t depth_ = 1;
  size_t arrWidth_ = 1;
  size_t arrHeight_ = 1;
+
 public:
  GraphMemsetNode(const hipMemsetParams* pMemsetParams, size_t depth = 1, size_t arrWidth = 1,
                  size_t arrHeight = 1)
@@ -1966,9 +1937,8 @@ class GraphMemsetNode : public GraphNode {
      sprintf(buffer,
              "{\n%s\n| {{ID | node handle | dptr | pitch | value | elementSize | width | "
              "height | depth} | {%u | %p | %p | %zu | %u | %u | %zu | %zu | %zu}}}",
-              label_, GetID(), this, memsetParams_.dst, memsetParams_.pitch,
-              memsetParams_.value, memsetParams_.elementSize, memsetParams_.width,
-              memsetParams_.height, depth_);
+              label_, GetID(), this, memsetParams_.dst, memsetParams_.pitch, memsetParams_.value,
+              memsetParams_.elementSize, memsetParams_.width, memsetParams_.height, depth_);
      label = buffer;
    } else {
      size_t sizeBytes;
@@ -2054,8 +2024,9 @@ class GraphMemsetNode : public GraphNode {
      if (isExec) {
        // 2D - hipGraphExecMemsetNodeSetParams returns invalid value if new width or new height is
        // not same as what memset node is added with.
-        if (memsetParams_.width * memsetParams_.elementSize != params->width * params->elementSize
-         || memsetParams_.height != params->height || depth != depth_) {
+        if (memsetParams_.width * memsetParams_.elementSize !=
+                params->width * params->elementSize ||
+            memsetParams_.height != params->height || depth != depth_) {
          return hipErrorInvalidValue;
        }
      } else {
@@ -2064,9 +2035,9 @@ class GraphMemsetNode : public GraphNode {
        size_t discardOffset = 0;
        amd::Memory* memObj = getMemoryObject(params->dst, discardOffset);
        if (memObj != nullptr) {
-          if (params->width * params->elementSize > memObj->getUserData().width_
-           || params->height > memObj->getUserData().height_
-           || depth > memObj->getUserData().depth_) {
+          if (params->width * params->elementSize > memObj->getUserData().width_ ||
+              params->height > memObj->getUserData().height_ ||
+              depth > memObj->getUserData().depth_) {
            return hipErrorInvalidValue;
          }
        }
@@ -2103,9 +2074,7 @@ class GraphEventRecordNode : public GraphNode {
        event_(event) {}
  ~GraphEventRecordNode() {}

-  GraphEventRecordNode(const GraphEventRecordNode& rhs) : GraphNode(rhs) {
-    event_ = rhs.event_;
-  }
+  GraphEventRecordNode(const GraphEventRecordNode& rhs) : GraphNode(rhs) { event_ = rhs.event_; }

  GraphNode* clone() const override { return new GraphEventRecordNode(*this); }

@@ -2143,8 +2112,7 @@ class GraphEventRecordNode : public GraphNode {
  }

  hipError_t SetParams(GraphNode* node) override {
-    const GraphEventRecordNode* eventRecordNode =
-        static_cast<GraphEventRecordNode const*>(node);
+    const GraphEventRecordNode* eventRecordNode = static_cast<GraphEventRecordNode const*>(node);
    return SetParams(eventRecordNode->event_);
  }
 };
@@ -2154,14 +2122,11 @@ class GraphEventWaitNode : public GraphNode {

 public:
  GraphEventWaitNode(hipEvent_t event)
-      : GraphNode(hipGraphNodeTypeWaitEvent, "solid", "rectangle", "EVENT_WAIT"),
-        event_(event) {}
+      : GraphNode(hipGraphNodeTypeWaitEvent, "solid", "rectangle", "EVENT_WAIT"), event_(event) {}

  ~GraphEventWaitNode() {}

-  GraphEventWaitNode(const GraphEventWaitNode& rhs) : GraphNode(rhs) {
-    event_ = rhs.event_;
-  }
+  GraphEventWaitNode(const GraphEventWaitNode& rhs) : GraphNode(rhs) { event_ = rhs.event_; }

  GraphNode* clone() const override { return new GraphEventWaitNode(*this); }

@@ -2302,7 +2267,8 @@ class GraphMemAllocNode final : public GraphNode {
    VirtualMemAllocNode(amd::HostQueue& queue, const amd::Event::EventWaitList& eventWaitList,
                        amd::Memory* va, size_t size, amd::Memory* memory, Graph* graph)
        : VirtualMapCommand(queue, eventWaitList, va->getSvmPtr(), size, memory),
-          va_(va), graph_(graph) {}
+          va_(va),
+          graph_(graph) {}

    virtual void submit(device::VirtualDevice& device) final {
      // Remove VA reference from the global mapping. Runtime has to keep a dummy reference for
@@ -2341,8 +2307,8 @@ class GraphMemAllocNode final : public GraphNode {
                                     amd::Device::VmmAccess::kReadWrite);
      va_->retain();
      graph_->IncrementMemAllocNodeCount();  // Increment count of unreleased mem alloc nodes
-      ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL,
-              "Graph MemAlloc execute [%p-%p], %p", vaddr_sub_obj->getSvmPtr(),
+      ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc execute [%p-%p], %p",
+              vaddr_sub_obj->getSvmPtr(),
              reinterpret_cast<char*>(vaddr_sub_obj->getSvmPtr()) + aligned_size, memory());
    }

@@ -2357,8 +2323,7 @@ class GraphMemAllocNode final : public GraphNode {
    node_params_ = *node_params;
  }

-  GraphMemAllocNode(const GraphMemAllocNode& rhs)
-      : GraphNode(rhs) {
+  GraphMemAllocNode(const GraphMemAllocNode& rhs) : GraphNode(rhs) {
    node_params_ = rhs.node_params_;
    if (HIP_MEM_POOL_USE_VM) {
      assert(rhs.va_ != nullptr && "Graph MemAlloc runtime can't clone an invalid node!");
@@ -2392,8 +2357,8 @@ class GraphMemAllocNode final : public GraphNode {
        assert(va_ != nullptr && "Runtime can't create a command for an invalid node!");
        stream->GetDevice()->GetGraphMemoryPool()->SetGraphInUse();
        // Create command for memory mapping
-        auto cmd = new VirtualMemAllocNode(*stream, amd::Event::EventWaitList{},
-            va_, node_params_.bytesize, nullptr, graph);
+        auto cmd = new VirtualMemAllocNode(*stream, amd::Event::EventWaitList{}, va_,
+                                           node_params_.bytesize, nullptr, graph);
        commands_.push_back(cmd);
        size_t offset = 0;
        // Check if memory was already added after first reserve
@@ -2405,8 +2370,7 @@ class GraphMemAllocNode final : public GraphNode {
          // be executed again
          amd::MemObjMap::AddMemObj(node_params_.dptr, va_);
        }
-        ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc create: %p",
-            node_params_.dptr);
+        ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc create: %p", node_params_.dptr);
      }
    }
    return error;
@@ -2421,8 +2385,7 @@ class GraphMemAllocNode final : public GraphNode {
        va_ = amd::MemObjMap::FindVirtualMemObj(node_params_.dptr);
        amd::MemObjMap::AddMemObj(node_params_.dptr, va_);
      }
-      ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc reserve VA: %p",
-          node_params_.dptr);
+      ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc reserve VA: %p", node_params_.dptr);
    }
    return node_params_.dptr;
  }
@@ -2459,8 +2422,10 @@ class GraphMemFreeNode : public GraphNode {
   public:
    VirtualMemFreeNode(Graph* graph, int device_id, amd::HostQueue& queue,
                       const amd::Event::EventWaitList& eventWaitList, void* ptr, size_t size,
-        amd::Memory* memory) : VirtualMapCommand(queue, eventWaitList, ptr, size, memory)
-        , graph_(graph), device_id_(device_id) {}
+                       amd::Memory* memory)
+        : VirtualMapCommand(queue, eventWaitList, ptr, size, memory),
+          graph_(graph),
+          device_id_(device_id) {}

    virtual void submit(device::VirtualDevice& device) final {
      // Find memory object before unmap logic
@@ -2485,8 +2450,8 @@ class GraphMemFreeNode : public GraphNode {
      }
      amd::MemObjMap::AddMemObj(ptr(), vaddr_mem_obj);
      graph_->DecrementMemAllocNodeCount();  // Decrement count of unreleased memalloc nodes
-      ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p",
-          ptr(), vaddr_sub_obj);
+      ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p", ptr(),
+              vaddr_sub_obj);
    }

   private:
@@ -2496,11 +2461,8 @@ class GraphMemFreeNode : public GraphNode {

 public:
  GraphMemFreeNode(void* dptr)
-    : GraphNode(hipGraphNodeTypeMemFree, "solid", "rectangle", "MEM_FREE")
-    , device_ptr_(dptr) {}
-  GraphMemFreeNode(const GraphMemFreeNode& rhs) : GraphNode(rhs) {
-    device_ptr_ = rhs.device_ptr_;
-  }
+      : GraphNode(hipGraphNodeTypeMemFree, "solid", "rectangle", "MEM_FREE"), device_ptr_(dptr) {}
+  GraphMemFreeNode(const GraphMemFreeNode& rhs) : GraphNode(rhs) { device_ptr_ = rhs.device_ptr_; }

  virtual GraphNode* clone() const final { return new GraphMemFreeNode(*this); }

@@ -2514,8 +2476,8 @@ class GraphMemFreeNode : public GraphNode {
        const auto& dev_info = stream->device().info();
        auto va = amd::MemObjMap::FindVirtualMemObj(device_ptr_);
        // Unmap virtual address from memory
-        amd::Command* cmd = new VirtualMemFreeNode(graph, stream->DeviceId(), *stream,
-            amd::Command::EventWaitList{}, device_ptr_,
+        amd::Command* cmd = new VirtualMemFreeNode(
+            graph, stream->DeviceId(), *stream, amd::Command::EventWaitList{}, device_ptr_,
            amd::alignUp(va->getSize(), dev_info.virtualMemAllocGranularity_), nullptr);
        commands_.push_back(cmd);
        ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph FreeMem create: %p", device_ptr_);
@@ -2531,9 +2493,7 @@ class GraphMemFreeNode : public GraphNode {
    }
  }

-  void GetParams(void** params) const {
-    *params = device_ptr_;
-  }
+  void GetParams(void** params) const { *params = device_ptr_; }
 };

 class GraphDrvMemcpyNode : public GraphNode {
@@ -2578,16 +2538,13 @@ class GraphDrvMemcpyNode : public GraphNode {
    }
    if (isEnabled_ && isHtoH) {
      ihipHtoHMemcpy(copyParams_.dstHost, copyParams_.srcHost,
-                     copyParams_.WidthInBytes * copyParams_.Height *
-                     copyParams_.Depth, *stream);
+                     copyParams_.WidthInBytes * copyParams_.Height * copyParams_.Depth, *stream);
      return;
    }
    GraphNode::EnqueueCommands(stream);
  }

-  void GetParams(HIP_MEMCPY3D* params) {
-    std::memcpy(params, &copyParams_, sizeof(HIP_MEMCPY3D));
-  }
+  void GetParams(HIP_MEMCPY3D* params) { std::memcpy(params, &copyParams_, sizeof(HIP_MEMCPY3D)); }
  hipError_t SetParams(const HIP_MEMCPY3D* params) {
    hipError_t status = ValidateParams(params);
    if (status != hipSuccess) {
@@ -2608,7 +2565,6 @@ class GraphDrvMemcpyNode : public GraphNode {
    }
    return hipSuccess;
  }
-
 };

 class hipGraphExternalSemSignalNode : public GraphNode {
@@ -2621,8 +2577,7 @@ class hipGraphExternalSemSignalNode : public GraphNode {
    externalSemaphorNodeParam_ = *pNodeParams;
  }

-  hipGraphExternalSemSignalNode(const hipGraphExternalSemSignalNode& rhs)
-    : GraphNode(rhs) {
+  hipGraphExternalSemSignalNode(const hipGraphExternalSemSignalNode& rhs) : GraphNode(rhs) {
    externalSemaphorNodeParam_ = rhs.externalSemaphorNodeParam_;
  }

@@ -2639,8 +2594,8 @@ class hipGraphExternalSemSignalNode : public GraphNode {
    commands_.reserve(numExtSems);
    for (unsigned int i = 0; i < numExtSems; i++) {
      if (externalSemaphorNodeParam_.extSemArray[i] != nullptr) {
-        amd::ExternalSemaphoreCmd* command = new amd::ExternalSemaphoreCmd(*stream,
-                                      externalSemaphorNodeParam_.extSemArray[i],
+        amd::ExternalSemaphoreCmd* command = new amd::ExternalSemaphoreCmd(
+            *stream, externalSemaphorNodeParam_.extSemArray[i],
            externalSemaphorNodeParam_.paramsArray[i].params.fence.value,
            amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE);
        if (command == nullptr) {
@@ -2671,8 +2626,8 @@ class hipGraphExternalSemWaitNode : public GraphNode {

 public:
  hipGraphExternalSemWaitNode(const hipExternalSemaphoreWaitNodeParams* pNodeParams)
-    : GraphNode(hipGraphNodeTypeExtSemaphoreWait, "solid",
-                   "rectangle", "EXTERNAL_SEMAPHORE_WAIT") {
+      : GraphNode(hipGraphNodeTypeExtSemaphoreWait, "solid", "rectangle",
+                  "EXTERNAL_SEMAPHORE_WAIT") {
    externalSemaphorNodeParam_ = *pNodeParams;
  }

@@ -2687,14 +2642,13 @@ class hipGraphExternalSemWaitNode : public GraphNode {
    hipError_t status = GraphNode::CreateCommand(stream);
    if (status != hipSuccess) {
      return status;
-
    }
    unsigned int numExtSems = externalSemaphorNodeParam_.numExtSems;
    commands_.reserve(numExtSems);
    for (unsigned int i = 0; i < numExtSems; i++) {
      if (externalSemaphorNodeParam_.extSemArray[i] != nullptr) {
-        amd::ExternalSemaphoreCmd* command = new amd::ExternalSemaphoreCmd(*stream,
-                                    externalSemaphorNodeParam_.extSemArray[i],
+        amd::ExternalSemaphoreCmd* command = new amd::ExternalSemaphoreCmd(
+            *stream, externalSemaphorNodeParam_.extSemArray[i],
            externalSemaphorNodeParam_.paramsArray[i].params.fence.value,
            amd::ExternalSemaphoreCmd::COMMAND_WAIT_EXTSEMAPHORE);
        if (command == nullptr) {
@@ -39,31 +39,41 @@ static_assert(hipCpuDeviceId == amd::CpuDeviceId, "CPU device ID mismatch with R
 static_assert(hipInvalidDeviceId == amd::InvalidDeviceId,
              "Invalid device ID mismatch with ROCclr!");

-static_assert(static_cast<uint32_t>(hipMemAdviseSetReadMostly) ==
-              amd::MemoryAdvice::SetReadMostly, "Enum mismatch with ROCclr!");
+static_assert(static_cast<uint32_t>(hipMemAdviseSetReadMostly) == amd::MemoryAdvice::SetReadMostly,
+              "Enum mismatch with ROCclr!");
 static_assert(static_cast<uint32_t>(hipMemAdviseUnsetReadMostly) ==
-              amd::MemoryAdvice::UnsetReadMostly, "Enum mismatch with ROCclr!");
+                  amd::MemoryAdvice::UnsetReadMostly,
+              "Enum mismatch with ROCclr!");
 static_assert(static_cast<uint32_t>(hipMemAdviseSetPreferredLocation) ==
-              amd::MemoryAdvice::SetPreferredLocation, "Enum mismatch with ROCclr!");
+                  amd::MemoryAdvice::SetPreferredLocation,
+              "Enum mismatch with ROCclr!");
 static_assert(static_cast<uint32_t>(hipMemAdviseUnsetPreferredLocation) ==
-              amd::MemoryAdvice::UnsetPreferredLocation, "Enum mismatch with ROCclr!");
-static_assert(static_cast<uint32_t>(hipMemAdviseSetAccessedBy) ==
-              amd::MemoryAdvice::SetAccessedBy, "Enum mismatch with ROCclr!");
+                  amd::MemoryAdvice::UnsetPreferredLocation,
+              "Enum mismatch with ROCclr!");
+static_assert(static_cast<uint32_t>(hipMemAdviseSetAccessedBy) == amd::MemoryAdvice::SetAccessedBy,
+              "Enum mismatch with ROCclr!");
 static_assert(static_cast<uint32_t>(hipMemAdviseUnsetAccessedBy) ==
-              amd::MemoryAdvice::UnsetAccessedBy, "Enum mismatch with ROCclr!");
+                  amd::MemoryAdvice::UnsetAccessedBy,
+              "Enum mismatch with ROCclr!");
 static_assert(static_cast<uint32_t>(hipMemAdviseSetCoarseGrain) ==
-              amd::MemoryAdvice::SetCoarseGrain, "Enum mismatch with ROCclr!");
+                  amd::MemoryAdvice::SetCoarseGrain,
+              "Enum mismatch with ROCclr!");
 static_assert(static_cast<uint32_t>(hipMemAdviseUnsetCoarseGrain) ==
-              amd::MemoryAdvice::UnsetCoarseGrain, "Enum mismatch with ROCclr!");
+                  amd::MemoryAdvice::UnsetCoarseGrain,
+              "Enum mismatch with ROCclr!");

 static_assert(static_cast<uint32_t>(hipMemRangeAttributeReadMostly) ==
-              amd::MemRangeAttribute::ReadMostly, "Enum mismatch with ROCclr!");
+                  amd::MemRangeAttribute::ReadMostly,
+              "Enum mismatch with ROCclr!");
 static_assert(static_cast<uint32_t>(hipMemRangeAttributePreferredLocation) ==
-              amd::MemRangeAttribute::PreferredLocation, "Enum mismatch with ROCclr!");
+                  amd::MemRangeAttribute::PreferredLocation,
+              "Enum mismatch with ROCclr!");
 static_assert(static_cast<uint32_t>(hipMemRangeAttributeAccessedBy) ==
-              amd::MemRangeAttribute::AccessedBy, "Enum mismatch with ROCclr!");
+                  amd::MemRangeAttribute::AccessedBy,
+              "Enum mismatch with ROCclr!");
 static_assert(static_cast<uint32_t>(hipMemRangeAttributeLastPrefetchLocation) ==
-              amd::MemRangeAttribute::LastPrefetchLocation, "Enum mismatch with ROCclr!");
+                  amd::MemRangeAttribute::LastPrefetchLocation,
+              "Enum mismatch with ROCclr!");

 // ================================================================================================
 hipError_t hipMallocManaged(void** dev_ptr, size_t size, unsigned int flags) {
@@ -84,8 +94,7 @@ hipError_t hipMallocManaged(void** dev_ptr, size_t size, unsigned int flags) {
 }

 // ================================================================================================
-hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device,
-                               hipStream_t stream) {
+hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, hipStream_t stream) {
  HIP_INIT_API(hipMemPrefetchAsync, dev_ptr, count, device, stream);
  CHECK_STREAM_CAPTURE_SUPPORTED();
  hipMemLocation location;
@@ -147,8 +156,8 @@ hipError_t hipMemRangeGetAttribute(void* data, size_t data_size, hipMemRangeAttr
  amd::Device* dev = g_devices[0]->devices()[0];

  // Get the allocation attribute from AMD HMM
-  if (!dev->GetSvmAttributes(&data, &data_size, reinterpret_cast<int*>(&attribute), 1,
-                             dev_ptr, count)) {
+  if (!dev->GetSvmAttributes(&data, &data_size, reinterpret_cast<int*>(&attribute), 1, dev_ptr,
+                             count)) {
    HIP_RETURN(hipErrorInvalidValue);
  }

@@ -159,8 +168,8 @@ hipError_t hipMemRangeGetAttribute(void* data, size_t data_size, hipMemRangeAttr
 hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes,
                                    hipMemRangeAttribute* attributes, size_t num_attributes,
                                    const void* dev_ptr, size_t count) {
-  HIP_INIT_API(hipMemRangeGetAttributes, data, data_sizes,
-               attributes, num_attributes, dev_ptr, count);
+  HIP_INIT_API(hipMemRangeGetAttributes, data, data_sizes, attributes, num_attributes, dev_ptr,
+               count);

  if ((data == nullptr) || (data_sizes == nullptr) || (attributes == nullptr) ||
      (num_attributes == 0) || (dev_ptr == nullptr) || (count == 0)) {
@@ -188,8 +197,8 @@ hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes,
  // Shouldn't matter for which device the interface is called
  amd::Device* dev = g_devices[0]->devices()[0];
  // Get the allocation attributes from AMD HMM
-  if (!dev->GetSvmAttributes(data, data_sizes, reinterpret_cast<int*>(attributes),
-      num_attributes, dev_ptr, count)) {
+  if (!dev->GetSvmAttributes(data, data_sizes, reinterpret_cast<int*>(attributes), num_attributes,
+                             dev_ptr, count)) {
    HIP_RETURN(hipErrorInvalidValue);
  }

@@ -197,8 +206,8 @@ hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes,
 }

 // ================================================================================================
-hipError_t hipStreamAttachMemAsync(hipStream_t stream, void* dev_ptr,
-                                   size_t length, unsigned int flags) {
+hipError_t hipStreamAttachMemAsync(hipStream_t stream, void* dev_ptr, size_t length,
+                                   unsigned int flags) {
  HIP_INIT_API(hipStreamAttachMemAsync, stream, dev_ptr, length, flags);
  // stream can be null, length can be 0.
  if (dev_ptr == nullptr) {
@@ -217,8 +226,9 @@ hipError_t hipStreamAttachMemAsync(hipStream_t stream, void* dev_ptr,
  // host-accessible region of system-allocated pageable memory.
  // This type of memory may only be specified if the device associated with the
  // stream reports a non-zero value for the device attribute hipDevAttrPageableMemoryAccess.
-  hip::Stream* hip_stream = (stream == nullptr || stream == hipStreamLegacy) ?
-                             hip::getCurrentDevice()->NullStream() : hip::getStream(stream);
+  hip::Stream* hip_stream = (stream == nullptr || stream == hipStreamLegacy)
+      ? hip::getCurrentDevice()->NullStream()
+      : hip::getStream(stream);
  size_t offset = 0;
  amd::Memory* memObj = getMemoryObject(dev_ptr, offset);
  if (memObj == nullptr) {
@@ -258,9 +268,9 @@ hipError_t ihipMallocManaged(void** ptr, size_t size, size_t align, bool use_hos
  // allocation in the device driver
  if (use_host_ptr) {
    // If the host pointer is already allocated, map it to svm fine grain buffer
-    *ptr = amd::SvmBuffer::malloc(ctx, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR, size,
-                                  (align == 0) ? dev.info().memBaseAddrAlign_ : align, nullptr,
-                                  *ptr);
+    *ptr =
+        amd::SvmBuffer::malloc(ctx, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR, size,
+                               (align == 0) ? dev.info().memBaseAddrAlign_ : align, nullptr, *ptr);
  } else {
    *ptr = amd::SvmBuffer::malloc(ctx, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR, size,
                                  (align == 0) ? dev.info().memBaseAddrAlign_ : align);
@@ -96,7 +96,7 @@ typedef struct ihipIpcEventHandle_st {
 } ihipIpcEventHandle_t;

 const char* ihipGetErrorName(hipError_t hip_error);
-}
+}  // namespace hip

 #define HIP_INIT(noReturn)                                                                         \
  {                                                                                                \
@@ -124,13 +124,12 @@ const char* ihipGetErrorName(hipError_t hip_error);

 #define HIP_API_PRINT(...)                                                                         \
  uint64_t startTimeUs = 0;                                                                        \
-  HIPPrintDuration(amd::LOG_INFO, amd::LOG_API, &startTimeUs,       \
-                  "%s %s ( %s ) %s", KGRN,                          \
-                  __func__, ToString( __VA_ARGS__ ).c_str(), KNRM);
+  HIPPrintDuration(amd::LOG_INFO, amd::LOG_API, &startTimeUs, "%s %s ( %s ) %s", KGRN, __func__,   \
+                   ToString(__VA_ARGS__).c_str(), KNRM);

 #define HIP_ERROR_PRINT(err, ...)                                                                  \
-  ClPrint(amd::LOG_INFO, amd::LOG_API, "%s: Returned %s : %s",                     \
-          __func__, hip::ihipGetErrorName(err), ToString( __VA_ARGS__ ).c_str());
+  ClPrint(amd::LOG_INFO, amd::LOG_API, "%s: Returned %s : %s", __func__,                           \
+          hip::ihipGetErrorName(err), ToString(__VA_ARGS__).c_str());

 #define HIP_INIT_API_INTERNAL(noReturn, cid, ...)                                                  \
  HIP_INIT(noReturn)                                                                               \
@@ -145,10 +144,9 @@ const char* ihipGetErrorName(hipError_t hip_error);
  HIP_INIT_API_INTERNAL(0, cid, __VA_ARGS__)                                                       \
  if (hip::g_devices.size() == 0) {                                                                \
    HIP_RETURN(hipErrorNoDevice);                                                                  \
-  }                                                                                                \
+  }

-#define HIP_INIT_API_NO_RETURN(cid, ...)                                                           \
-  HIP_INIT_API_INTERNAL(1, cid, __VA_ARGS__)
+#define HIP_INIT_API_NO_RETURN(cid, ...) HIP_INIT_API_INTERNAL(1, cid, __VA_ARGS__)

 #define HIP_RETURN_DURATION(ret, ...)                                                              \
  hip::tls.last_command_error_ = ret;                                                              \
@@ -266,7 +264,7 @@ const char* ihipGetErrorName(hipError_t hip_error);
 namespace hc {
 class accelerator;
 class accelerator_view;
-};
+};  // namespace hc

 struct ihipExec_t {
  dim3 gridDim_;
@@ -280,6 +278,7 @@ namespace hip {
 class stream_per_thread {
 private:
  std::vector<hipStream_t> m_streams;
+
 public:
  stream_per_thread();
  stream_per_thread(const stream_per_thread&) = delete;
@@ -328,8 +327,9 @@ public:
  unsigned long long captureID_;

  static inline CommandQueue::Priority convertToQueuePriority(Priority p) {
-      return p == Priority::High ? amd::CommandQueue::Priority::High : p == Priority::Low ?
-                    amd::CommandQueue::Priority::Low : amd::CommandQueue::Priority::Normal;
+    return p == Priority::High ? amd::CommandQueue::Priority::High
+        : p == Priority::Low   ? amd::CommandQueue::Priority::Low
+                               : amd::CommandQueue::Priority::Normal;
  }

 public:
@@ -383,9 +383,7 @@ public:
    lastCapturedNodes_.push_back(graphNode);
  }
  /// returns updated dependencies removed
-    const std::vector<hip::GraphNode*>& GetRemovedDependencies() {
-      return removedDependencies_;
-    }
+  const std::vector<hip::GraphNode*>& GetRemovedDependencies() { return removedDependencies_; }
  /// Append captured node via the wait event cross stream
  void AddCrossCapturedNode(std::vector<hip::GraphNode*> graphNodes, bool replace = false) {
    // replace dependencies as per flag hipStreamSetCaptureDependencies
@@ -436,7 +434,8 @@ public:
  unsigned long long GetCaptureID() { return captureID_; }
  void SetCaptureEvent(hipEvent_t e) {
    amd::ScopedLock lock(lock_);
-      captureEvents_.emplace(e); }
+    captureEvents_.emplace(e);
+  }
  bool IsEventCaptured(hipEvent_t e) {
    amd::ScopedLock lock(lock_);
    auto it = captureEvents_.find(e);
@@ -500,14 +499,16 @@ public:
  std::set<MemoryPool*> mem_pools_;

 public:
-    Device(amd::Context* ctx, int devId): context_(ctx),
+  Device(amd::Context* ctx, int devId)
+      : context_(ctx),
        deviceId_(devId),
        flags_(hipDeviceScheduleSpin),
        isActive_(false),
        default_mem_pool_(nullptr),
        current_mem_pool_(nullptr),
-        graph_mem_pool_(nullptr)
-        { assert(ctx != nullptr); }
+        graph_mem_pool_(nullptr) {
+    assert(ctx != nullptr);
+  }
  ~Device();

  bool Create();
@@ -518,7 +519,8 @@ public:
  const std::vector<amd::Device*>& devices() const { return context_->devices(); }
  hipError_t EnablePeerAccess(int peerDeviceId) {
    amd::ScopedLock lock(lock_);
-      bool found = (std::find(userEnabledPeers.begin(), userEnabledPeers.end(), peerDeviceId) != userEnabledPeers.end());
+    bool found = (std::find(userEnabledPeers.begin(), userEnabledPeers.end(), peerDeviceId) !=
+                  userEnabledPeers.end());
    if (found) {
      return hipErrorPeerAccessAlreadyEnabled;
    }
@@ -527,7 +529,8 @@ public:
  }
  hipError_t DisablePeerAccess(int peerDeviceId) {
    amd::ScopedLock lock(lock_);
-      bool found = (std::find(userEnabledPeers.begin(), userEnabledPeers.end(), peerDeviceId) != userEnabledPeers.end());
+    bool found = (std::find(userEnabledPeers.begin(), userEnabledPeers.end(), peerDeviceId) !=
+                  userEnabledPeers.end());
    if (found) {
      userEnabledPeers.remove(peerDeviceId);
      return hipSuccess;
@@ -542,9 +545,7 @@ public:
  hip::Stream* NullStream(bool wait = true);
  Stream* GetNullStream() const { return null_stream_; };

-    void SetActiveStatus() {
-      isActive_ = true;
-    }
+  void SetActiveStatus() { isActive_ = true; }

  bool GetActiveStatus() {
    amd::ScopedLock lock(lock_);
@@ -621,14 +622,13 @@ public:
  stream_per_thread stream_per_thread_obj_;
  bool isSetDeviceCalled;

-    TlsAggregator(): device_(nullptr),
+  TlsAggregator()
+      : device_(nullptr),
        last_error_(hipSuccess),
        last_command_error_(hipSuccess),
        stream_capture_mode_(hipStreamCaptureModeGlobal),
-      isSetDeviceCalled(false) {
-    }
-    ~TlsAggregator() {
-    }
+        isSetDeviceCalled(false) {}
+  ~TlsAggregator() {}
 };
 extern thread_local TlsAggregator tls;

@@ -93,8 +93,8 @@ hipError_t hipMallocAsync(void** dev_ptr, size_t size, hipStream_t stream) {
    HIP_RETURN(hipSuccess);
  }
  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
-  auto hip_stream = (stream == nullptr || stream == hipStreamLegacy) ?
-    hip::getCurrentDevice()->NullStream() : s;
+  auto hip_stream =
+      (stream == nullptr || stream == hipStreamLegacy) ? hip::getCurrentDevice()->NullStream() : s;
  auto device = hip_stream->GetDevice();
  auto mem_pool = device->GetCurrentMemoryPool();

@@ -150,8 +150,8 @@ hipError_t hipFreeAsync(void* dev_ptr, hipStream_t stream) {
  getStreamPerThread(stream);

  hip::Stream* s = reinterpret_cast<hip::Stream*>(stream);
-  auto hip_stream = (stream == nullptr || stream == hipStreamLegacy) ?
-    hip::getCurrentDevice()->NullStream(): s;
+  auto hip_stream =
+      (stream == nullptr || stream == hipStreamLegacy) ? hip::getCurrentDevice()->NullStream() : s;

  auto device = hip_stream->GetDevice();
  // Return error if any stream other than the current stream is in capture mode
@@ -199,8 +199,7 @@ hipError_t hipFreeAsync(void* dev_ptr, hipStream_t stream) {
      // may block the execution
      event = new hip::Event(0);
      if (event != nullptr) {
-        if (hipSuccess !=
-            event->addMarker(hip_stream, nullptr)) {
+        if (hipSuccess != event->addMarker(hip_stream, nullptr)) {
          delete event;
          event = nullptr;
        } else {
@@ -253,9 +252,7 @@ hipError_t hipMemPoolGetAttribute(hipMemPool_t mem_pool, hipMemPoolAttr attr, vo
 }

 // ================================================================================================
-hipError_t hipMemPoolSetAccess(
-    hipMemPool_t mem_pool,
-    const hipMemAccessDesc* desc_list,
+hipError_t hipMemPoolSetAccess(hipMemPool_t mem_pool, const hipMemAccessDesc* desc_list,
                               size_t count) {
  HIP_INIT_API(hipMemPoolSetAccess, mem_pool, desc_list, count);
  if ((mem_pool == nullptr) || (desc_list == nullptr)) {
@@ -286,9 +283,7 @@ hipError_t hipMemPoolSetAccess(
 }

 // ================================================================================================
-hipError_t hipMemPoolGetAccess(
-    hipMemAccessFlags* flags,
-    hipMemPool_t mem_pool,
+hipError_t hipMemPoolGetAccess(hipMemAccessFlags* flags, hipMemPool_t mem_pool,
                               hipMemLocation* location) {
  HIP_INIT_API(hipMemPoolGetAccess, flags, mem_pool, location);
  if ((mem_pool == nullptr) || (location == nullptr) || (flags == nullptr)) {
@@ -370,10 +365,7 @@ hipError_t hipMemPoolDestroy(hipMemPool_t mem_pool) {
 }

 // ================================================================================================
-hipError_t hipMallocFromPoolAsync(
-    void** dev_ptr,
-    size_t size,
-    hipMemPool_t mem_pool,
+hipError_t hipMallocFromPoolAsync(void** dev_ptr, size_t size, hipMemPool_t mem_pool,
                                  hipStream_t stream) {
  HIP_INIT_API(hipMallocFromPoolAsync, dev_ptr, size, mem_pool, stream);
  if ((dev_ptr == nullptr) || (mem_pool == nullptr)) {
@@ -387,8 +379,9 @@ hipError_t hipMallocFromPoolAsync(
  STREAM_CAPTURE(hipMallocAsync, stream, mem_pool, size, dev_ptr);

  auto mpool = reinterpret_cast<hip::MemoryPool*>(mem_pool);
-  auto hip_stream = (stream == nullptr || stream == hipStreamLegacy) ?
-    hip::getCurrentDevice()->NullStream() : reinterpret_cast<hip::Stream*>(stream);
+  auto hip_stream = (stream == nullptr || stream == hipStreamLegacy)
+      ? hip::getCurrentDevice()->NullStream()
+      : reinterpret_cast<hip::Stream*>(stream);
  *dev_ptr = mpool->AllocateMemory(size, hip_stream);
  if (*dev_ptr == nullptr) {
    HIP_RETURN(hipErrorOutOfMemory);
@@ -397,9 +390,7 @@ hipError_t hipMallocFromPoolAsync(
 }

 // ================================================================================================
-hipError_t hipMemPoolExportToShareableHandle(
-    void*                      shared_handle,
-    hipMemPool_t               mem_pool,
+hipError_t hipMemPoolExportToShareableHandle(void* shared_handle, hipMemPool_t mem_pool,
                                             hipMemAllocationHandleType handle_type,
                                             unsigned int flags) {
  HIP_INIT_API(hipMemPoolExportToShareableHandle, shared_handle, mem_pool, handle_type, flags);
@@ -421,9 +412,7 @@ hipError_t hipMemPoolExportToShareableHandle(
 }

 // ================================================================================================
-hipError_t hipMemPoolImportFromShareableHandle(
-    hipMemPool_t*              mem_pool,
-    void*                      shared_handle,
+hipError_t hipMemPoolImportFromShareableHandle(hipMemPool_t* mem_pool, void* shared_handle,
                                               hipMemAllocationHandleType handle_type,
                                               unsigned int flags) {
  HIP_INIT_API(hipMemPoolImportFromShareableHandle, mem_pool, shared_handle, handle_type, flags);
@@ -470,8 +459,8 @@ hipError_t hipMemPoolExportPointer(hipMemPoolPtrExportData* export_data, void* p
    // Note: export_data must point to 64 bytes of shared memory
    auto shared = reinterpret_cast<hip::SharedMemPointer*>(export_data);

-    if (!g_devices[id]->devices()[0]->IpcCreate(ptr,
-      &shared->size_, &shared->handle_[0], &shared->offset_)) {
+    if (!g_devices[id]->devices()[0]->IpcCreate(ptr, &shared->size_, &shared->handle_[0],
+                                                &shared->offset_)) {
      HIP_RETURN(hipErrorOutOfMemory);
    }
  } else {
@@ -481,9 +470,7 @@ hipError_t hipMemPoolExportPointer(hipMemPoolPtrExportData* export_data, void* p
 }

 // ================================================================================================
-hipError_t hipMemPoolImportPointer(
-    void**                   ptr,
-    hipMemPool_t             mem_pool,
+hipError_t hipMemPoolImportPointer(void** ptr, hipMemPool_t mem_pool,
                                   hipMemPoolPtrExportData* export_data) {
  HIP_INIT_API(hipMemPoolImportPointer, ptr, mem_pool, export_data);
  if (mem_pool == nullptr || export_data == nullptr || ptr == nullptr) {
@@ -491,8 +478,8 @@ hipError_t hipMemPoolImportPointer(
  }
  auto mpool = reinterpret_cast<hip::MemoryPool*>(mem_pool);
  auto shared = reinterpret_cast<hip::SharedMemPointer*>(export_data);
-  if (!mpool->Device()->devices()[0]->IpcAttach(
-      &shared->handle_[0], shared->size_, shared->offset_, 0, ptr)) {
+  if (!mpool->Device()->devices()[0]->IpcAttach(&shared->handle_[0], shared->size_, shared->offset_,
+                                                0, ptr)) {
    HIP_RETURN(hipErrorOutOfMemory);
  }
  size_t offset = 0;
@@ -41,8 +41,8 @@ void Heap::AddMemory(amd::Memory* memory, const MemoryTimestamp& ts) {
 }

 // ================================================================================================
-amd::Memory* Heap::FindMemory(size_t size, Stream* stream, bool opportunistic,
-    void* dptr, MemoryTimestamp* ts) {
+amd::Memory* Heap::FindMemory(size_t size, Stream* stream, bool opportunistic, void* dptr,
+                              MemoryTimestamp* ts) {
  amd::Memory* memory = nullptr;
  auto start = allocations_.lower_bound({size, nullptr});
  for (auto it = start; it != allocations_.end();) {
@@ -204,8 +204,10 @@ void* MemoryPool::AllocateMemory(size_t size, Stream* stream, void* dptr) {
      size_t free = 0, total = 0;
      hipError_t err = hipMemGetInfo(&free, &total);
      if (err == hipSuccess) {
-        LogPrintfError("Allocation failed : Device memory : required :\
-          %zu | free :%zu | total :%zu", size, free, total);
+        LogPrintfError(
+            "Allocation failed : Device memory : required :\
+          %zu | free :%zu | total :%zu",
+            size, free, total);
      }
      return nullptr;
    }
@@ -231,8 +233,8 @@ void* MemoryPool::AllocateMemory(size_t size, Stream* stream, void* dptr) {
  ts.AddSafeStream(stream);
  busy_heap_.AddMemory(memory, ts);

-  max_total_size_ = std::max(max_total_size_, busy_heap_.GetTotalSize() +
-                                                  free_heap_.GetTotalSize());
+  max_total_size_ =
+      std::max(max_total_size_, busy_heap_.GetTotalSize() + free_heap_.GetTotalSize());
  // Increment the reference counter on the pool
  retain();

@@ -420,13 +422,14 @@ hipError_t MemoryPool::GetAttribute(hipMemPoolAttr attr, void* value) {
      break;
    case hipMemPoolAttrReservedMemCurrent:
      // All allocated memory by the pool in OS
-      *reinterpret_cast<uint64_t*>(value) = (state_.use_vm_heap_) ? MappedSize() :
-        (busy_heap_.GetTotalSize() + free_heap_.GetTotalSize());
+      *reinterpret_cast<uint64_t*>(value) = (state_.use_vm_heap_)
+          ? MappedSize()
+          : (busy_heap_.GetTotalSize() + free_heap_.GetTotalSize());
      break;
    case hipMemPoolAttrReservedMemHigh:
      // High watermark of all allocated memory in OS, since the last reset
-      *reinterpret_cast<uint64_t*>(value) = (state_.use_vm_heap_)
-          ? MaxMappedSize() : max_total_size_;
+      *reinterpret_cast<uint64_t*>(value) =
+          (state_.use_vm_heap_) ? MaxMappedSize() : max_total_size_;
      break;
    case hipMemPoolAttrUsedMemCurrent:
      // Total currently used memory by the pool
@@ -505,8 +508,8 @@ amd::Os::FileDesc MemoryPool::Export() {
  // Note: Windows can accept an unnamed allocation
  snprintf(file_name, kFileNameSize, "%p", this);
  amd::Os::FileDesc handle{};
-  shared_ = reinterpret_cast<SharedMemPool*>(amd::Os::CreateIpcMemory(
-      file_name, sizeof(SharedMemPool), &handle));
+  shared_ = reinterpret_cast<SharedMemPool*>(
+      amd::Os::CreateIpcMemory(file_name, sizeof(SharedMemPool), &handle));
  if (shared_ != nullptr) {
    shared_->handle_ = handle;
    shared_->state_ = state_.value_;
@@ -537,4 +540,4 @@ bool MemoryPool::Import(amd::Os::FileDesc handle) {
  }
  return result;
 }
-}
+}  // namespace hip
@@ -103,11 +103,11 @@ public:
  typedef std::map<std::pair<size_t, amd::Memory*>, MemoryTimestamp> SortedMap;

  Heap(hip::Device* device, amd::VmHeapArray& vm_heap)
-    : total_size_(0)
-    , max_total_size_(0)
-    , release_threshold_(0)
-    , device_(device)
-    , vm_heap_(vm_heap) {}
+      : total_size_(0),
+        max_total_size_(0),
+        release_threshold_(0),
+        device_(device),
+        vm_heap_(vm_heap) {}
  ~Heap() {}

  /// Adds allocation into the heap on a specific stream
@@ -117,8 +117,8 @@ public:
  void AddMemory(amd::Memory* memory, const MemoryTimestamp& ts);

  /// Finds memory object with the specified size
-  amd::Memory* FindMemory(size_t size, Stream* stream, bool opportunistic,
-    void* dptr, MemoryTimestamp* ts);
+  amd::Memory* FindMemory(size_t size, Stream* stream, bool opportunistic, void* dptr,
+                          MemoryTimestamp* ts);

  /// Removes allocation from the map
  bool RemoveMemory(amd::Memory* memory, MemoryTimestamp* ts = nullptr);
@@ -280,9 +280,7 @@ class MemoryPool : public amd::ReferenceCountedObject, amd::VmHeapArray {
  void ReleaseAllMemory();

  /// Place the allocated memory into the busy heap
-  void AddBusyMemory(amd::Memory* memory) {
-    busy_heap_.AddMemory(memory, nullptr);
-  }
+  void AddBusyMemory(amd::Memory* memory) { busy_heap_.AddMemory(memory, nullptr); }

  /// Add a safe stream for quick looks-ups if event dependencies option is enabled
  void AddSafeStream(Stream* event_stream, Stream* wait_stream) {
@@ -352,7 +350,8 @@ class MemoryPool : public amd::ReferenceCountedObject, amd::VmHeapArray {

  hipMemPoolProps properties_;  //!< Properties of the memory pool
  amd::Monitor lock_pool_ops_;  //!< Access to the pool must be lock protected
-  std::map<hip::Device*, hipMemAccessFlags> access_map_;  //!< Map of access to the pool from devices
+  std::map<hip::Device*, hipMemAccessFlags>
+      access_map_;  //!< Map of access to the pool from devices

  hip::Device* device_;      //!< Hip device the heap will reside
  SharedMemPool* shared_;    //!< Pointer to shared memory for IPC
@@ -360,4 +359,4 @@ class MemoryPool : public amd::ReferenceCountedObject, amd::VmHeapArray {
 };


-} // Mamespace hip
+}  // namespace hip
@@ -221,11 +221,11 @@ hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int valu
    HIP_RETURN(hipErrorInvalidDeviceFunction);
  }
  device::Kernel* d_kernel =
-                 (device::Kernel*)(kernel->getDeviceKernel(
-                  *(hip::getCurrentDevice()->devices()[0])));
+      (device::Kernel*)(kernel->getDeviceKernel(*(hip::getCurrentDevice()->devices()[0])));

  if (attr == hipFuncAttributeMaxDynamicSharedMemorySize) {
-    if ((value < 0) || (value > (d_kernel->workGroupInfo()->availableLDSSize_ -
+    if ((value < 0) ||
+        (value > (d_kernel->workGroupInfo()->availableLDSSize_ -
                  d_kernel->workGroupInfo()->localMemSize_))) {
      HIP_RETURN(hipErrorInvalidValue);
    }
@@ -244,7 +244,9 @@ hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int valu
 hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) {
  HIP_INIT_API(hipFuncSetCacheConfig, cacheConfig);

-  if (func == nullptr) { HIP_RETURN(hipErrorInvalidDeviceFunction); }
+  if (func == nullptr) {
+    HIP_RETURN(hipErrorInvalidDeviceFunction);
+  }
  if (cacheConfig != hipFuncCachePreferNone && cacheConfig != hipFuncCachePreferShared &&
      cacheConfig != hipFuncCachePreferL1 && cacheConfig != hipFuncCachePreferEqual) {
    HIP_RETURN(hipErrorInvalidValue);
@@ -257,7 +259,9 @@ hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) {
 hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config) {
  HIP_INIT_API(hipFuncSetSharedMemConfig, func, config);

-  if (func == nullptr) { HIP_RETURN(hipErrorInvalidDeviceFunction); }
+  if (func == nullptr) {
+    HIP_RETURN(hipErrorInvalidDeviceFunction);
+  }
  if (config != hipSharedMemBankSizeDefault && config != hipSharedMemBankSizeFourByte &&
      config != hipSharedMemBankSizeEightByte) {
    HIP_RETURN(hipErrorInvalidValue);
@@ -281,13 +285,13 @@ hipError_t ihipLaunchKernel_validate(hipFunction_t f, const amd::LaunchParams& l
    return hipErrorInvalidValue;
  }

-  if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
-                                     || launch_params.global_[2] == 0) {
+  if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0 ||
+      launch_params.global_[2] == 0) {
    return hipErrorInvalidConfiguration;
  }

-  if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0
-                                    || launch_params.local_[2] == 0) {
+  if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0 ||
+      launch_params.local_[2] == 0) {
    return hipErrorInvalidConfiguration;
  }

@@ -368,9 +372,9 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
    params |= amd::NDRangeKernelCommand::AnyOrderLaunch;
  }

-  amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand(*stream, waitList,
-    *kernel, ndrange, launch_params.sharedMemBytes_, params, gridId, numGrids, prevGridSum,
-    allGridSum, firstDevice, profileNDRange);
+  amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand(
+      *stream, waitList, *kernel, ndrange, launch_params.sharedMemBytes_, params, gridId, numGrids,
+      prevGridSum, allGridSum, firstDevice, profileNDRange);
  if (!kernelCommand) {
    return hipErrorOutOfMemory;
  }
@@ -403,8 +407,8 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
  }

  if (DEBUG_HIP_KERNARG_COPY_OPT) {
-    if (CL_SUCCESS != kernelCommand->AllocCaptureSetValidate(kernelParams, kernargs,
-                                                             kernargs_size)) {
+    if (CL_SUCCESS !=
+        kernelCommand->AllocCaptureSetValidate(kernelParams, kernargs, kernargs_size)) {
      kernelCommand->release();
      return hipErrorOutOfMemory;
    }
@@ -461,8 +465,8 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, amd::LaunchParams& launch_par
  amd::Kernel* kernel = function->kernel();
  amd::ScopedLock lock(DEBUG_HIP_KERNARG_COPY_OPT ? nullptr : &function->dflock_);

-  hipError_t status = ihipLaunchKernel_validate(f, launch_params, kernelParams, extra, deviceId,
-                                                params);
+  hipError_t status =
+      ihipLaunchKernel_validate(f, launch_params, kernelParams, extra, deviceId, params);
  if (status != hipSuccess) {
    return status;
  }
@@ -562,18 +566,18 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, uint32_t gridDimX, uint32_t gr
    HIP_RETURN(hipErrorInvalidValue);
  }

-  if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
-      || launch_params.global_[2] == 0) {
+  if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0 ||
+      launch_params.global_[2] == 0) {
    HIP_RETURN(hipErrorInvalidValue);
  }

-  if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0
-                                    || launch_params.local_[2] == 0) {
+  if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0 ||
+      launch_params.local_[2] == 0) {
    HIP_RETURN(hipErrorInvalidValue);
  }

-  HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, nullptr,
-                                    nullptr));
+  HIP_RETURN(
+      ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, nullptr, nullptr));
 }

 hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
@@ -594,9 +598,8 @@ hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
                 globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, sharedMemBytes,
                 kernelParams, extra, startEvent, stopEvent, flags);

-  amd::LaunchParams launch_params(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ,
-                                  localWorkSizeX, localWorkSizeY, localWorkSizeZ,
-                                  sharedMemBytes);
+  amd::LaunchParams launch_params(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX,
+                                  localWorkSizeY, localWorkSizeZ, sharedMemBytes);

  HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, hStream, kernelParams, extra, startEvent,
                                    stopEvent, flags));
@@ -649,13 +652,13 @@ hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDi
    HIP_RETURN(hipErrorInvalidValue);
  }

-  if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
-      || launch_params.global_[2] == 0) {
+  if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0 ||
+      launch_params.global_[2] == 0) {
    HIP_RETURN(hipErrorInvalidValue);
  }

-  if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0
-                                    || launch_params.local_[2] == 0) {
+  if (launch_params.local_[0] == 0 || launch_params.local_[1] == 0 ||
+      launch_params.local_[2] == 0) {
    HIP_RETURN(hipErrorInvalidValue);
  }

@@ -664,8 +667,7 @@ hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDi
 }

 hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* launchParamsList,
-                                                       unsigned int  numDevices,
-                                                       unsigned int  flags,
+                                                        unsigned int numDevices, unsigned int flags,
                                                        uint32_t extFlags) {
  int numActiveGPUs = 0;
  hipError_t result = hipSuccess;
@@ -675,8 +677,8 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
    return hipErrorInvalidValue;
  }

-  if (flags > (hipCooperativeLaunchMultiDeviceNoPostSync +
-               hipCooperativeLaunchMultiDeviceNoPreSync)) {
+  if (flags >
+      (hipCooperativeLaunchMultiDeviceNoPostSync + hipCooperativeLaunchMultiDeviceNoPreSync)) {
    return hipErrorInvalidValue;
  }

@@ -713,8 +715,7 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
  // Sync the execution streams on all devices
  if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) {
    for (int i = 0; i < numDevices; ++i) {
-      hip::Stream* hip_stream =
-          reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream);
+      hip::Stream* hip_stream = reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream);
      hip_stream->finish();
    }
  }
@@ -759,9 +760,8 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
      return hipErrorInvalidConfiguration;
    }

-    result = ihipModuleLaunchKernel(
-        launch.function, launch_params, launch.hStream, launch.kernelParams,
-        nullptr, nullptr, nullptr, flags, extFlags,
+    result = ihipModuleLaunchKernel(launch.function, launch_params, launch.hStream,
+                                    launch.kernelParams, nullptr, nullptr, nullptr, flags, extFlags,
                                    i, numDevices, prevGridSize, allGridSize, firstDevice);
    if (result != hipSuccess) {
      break;
@@ -772,8 +772,7 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
  // Sync the execution streams on all devices
  if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) {
    for (int i = 0; i < numDevices; ++i) {
-      hip::Stream* hip_stream =
-          reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream);
+      hip::Stream* hip_stream = reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream);
      hip_stream->finish();
    }
  }
@@ -798,19 +797,16 @@ hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams*
  }

  HIP_RETURN(ihipModuleLaunchCooperativeKernelMultiDevice(
-      launchParamsList,
-      numDevices,
-      flags,
+      launchParamsList, numDevices, flags,
      (amd::NDRangeKernelCommand::CooperativeGroups |
       amd::NDRangeKernelCommand::CooperativeMultiDeviceGroups)));
-
 }

 hipError_t hipGetFuncBySymbol(hipFunction_t* functionPtr, const void* symbolPtr) {
  HIP_INIT_API(hipGetFuncBySymbol, functionPtr, symbolPtr);

-  hipError_t hip_error = PlatformState::instance().getStatFunc(functionPtr,
-                         symbolPtr, ihipGetDevice());
+  hipError_t hip_error =
+      PlatformState::instance().getStatFunc(functionPtr, symbolPtr, ihipGetDevice());

  if ((hip_error != hipSuccess) || (functionPtr == nullptr)) {
    HIP_RETURN(hipErrorInvalidDeviceFunction);
@@ -819,31 +815,31 @@ hipError_t hipGetFuncBySymbol(hipFunction_t* functionPtr, const void* symbolPtr)
 }

 hipError_t hipLaunchKernel_common(const void* hostFunction, dim3 gridDim, dim3 blockDim,
-                                             void** args, size_t sharedMemBytes,
-                                             hipStream_t stream) {
+                                  void** args, size_t sharedMemBytes, hipStream_t stream) {
  STREAM_CAPTURE(hipLaunchKernel, stream, hostFunction, gridDim, blockDim, args, sharedMemBytes);
  return ihipLaunchKernel(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream, nullptr,
                          nullptr, 0);
 }

-hipError_t hipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim,
-                                      void** args, size_t sharedMemBytes, hipStream_t stream) {
+hipError_t hipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim, void** args,
+                           size_t sharedMemBytes, hipStream_t stream) {
  HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream);
-  HIP_RETURN_DURATION(hipLaunchKernel_common(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream));
+  HIP_RETURN_DURATION(
+      hipLaunchKernel_common(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream));
 }

-hipError_t hipLaunchKernel_spt(const void* hostFunction, dim3 gridDim, dim3 blockDim,
-                                          void** args, size_t sharedMemBytes, hipStream_t stream) {
+hipError_t hipLaunchKernel_spt(const void* hostFunction, dim3 gridDim, dim3 blockDim, void** args,
+                               size_t sharedMemBytes, hipStream_t stream) {
  HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream);
  PER_THREAD_DEFAULT_STREAM(stream);
  HIP_RETURN(hipLaunchKernel_common(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream));
 }

-hipError_t hipExtLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim,
-                                         void** args, size_t sharedMemBytes, hipStream_t stream,
-                                         hipEvent_t startEvent, hipEvent_t stopEvent, int flags) {
-  HIP_INIT_API(hipExtLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes,
-               stream, startEvent, stopEvent, flags);
+hipError_t hipExtLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim, void** args,
+                              size_t sharedMemBytes, hipStream_t stream, hipEvent_t startEvent,
+                              hipEvent_t stopEvent, int flags) {
+  HIP_INIT_API(hipExtLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream,
+               startEvent, stopEvent, flags);

  if (!hip::isValid(startEvent) || !hip::isValid(stopEvent)) {
    HIP_RETURN(hipErrorInvalidValue);
@@ -889,13 +885,13 @@ hipError_t hipLaunchCooperativeKernel_common(const void* f, dim3 gridDim, dim3 b
    return hipErrorCooperativeLaunchTooLarge;
  }

-  if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0
-      || launch_params.global_[2] == 0) {
+  if (launch_params.global_[0] == 0 || launch_params.global_[1] == 0 ||
+      launch_params.global_[2] == 0) {
    return hipErrorInvalidConfiguration;
  }

-  return ihipModuleLaunchKernel(func, launch_params, hStream, kernelParams, nullptr,
-                                nullptr, nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups);
+  return ihipModuleLaunchKernel(func, launch_params, hStream, kernelParams, nullptr, nullptr,
+                                nullptr, 0, amd::NDRangeKernelCommand::CooperativeGroups);
 }

 hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim,
@@ -975,10 +971,8 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL
    functionLaunchParamsList[i].kernelParams = launch.args;
  }

-  return ihipModuleLaunchCooperativeKernelMultiDevice(functionLaunchParamsList.data(),
-                                                      functionLaunchParamsList.size(),
-                                                      flags,
-                                                      extFlags);
+  return ihipModuleLaunchCooperativeKernelMultiDevice(
+      functionLaunchParamsList.data(), functionLaunchParamsList.size(), flags, extFlags);
 }

 hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
@@ -1039,8 +1033,8 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const
 hipError_t hipLinkAddData(hipLinkState_t hip_link_state, hipJitInputType input_type, void* image,
                          size_t image_size, const char* name, unsigned int num_options,
                          hipJitOption* options_ptr, void** option_values) {
-
-  HIP_INIT_API(hipLinkAddData, hip_link_state, image, image_size, name, num_options, options_ptr, option_values);
+  HIP_INIT_API(hipLinkAddData, hip_link_state, image, image_size, name, num_options, options_ptr,
+               option_values);

  if (image == nullptr || image_size <= 0) {
    HIP_RETURN(hipErrorInvalidImage);
@@ -1059,8 +1053,7 @@ hipError_t hipLinkAddData(hipLinkState_t hip_link_state, hipJitInputType input_t
    input_name = name;
  }

-  LinkProgram* hip_link_prog_ptr =
-      reinterpret_cast<LinkProgram*>(hip_link_state);
+  LinkProgram* hip_link_prog_ptr = reinterpret_cast<LinkProgram*>(hip_link_state);

  if (!LinkProgram::isLinkerValid(hip_link_prog_ptr)) {
    HIP_RETURN(hipErrorInvalidHandle);
@@ -1073,9 +1066,11 @@ hipError_t hipLinkAddData(hipLinkState_t hip_link_state, hipJitInputType input_t
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipLinkAddFile(hipLinkState_t hip_link_state, hipJitInputType input_type, const char* file_path,
-                          unsigned int num_options, hipJitOption* options_ptr, void** option_values) {
-  HIP_INIT_API(hipLinkAddFile, hip_link_state, input_type, file_path, num_options, options_ptr, option_values);
+hipError_t hipLinkAddFile(hipLinkState_t hip_link_state, hipJitInputType input_type,
+                          const char* file_path, unsigned int num_options,
+                          hipJitOption* options_ptr, void** option_values) {
+  HIP_INIT_API(hipLinkAddFile, hip_link_state, input_type, file_path, num_options, options_ptr,
+               option_values);

  if (hip_link_state == nullptr) {
    HIP_RETURN(hipErrorInvalidHandle);
@@ -1089,8 +1084,7 @@ hipError_t hipLinkAddFile(hipLinkState_t hip_link_state, hipJitInputType input_t
    HIP_RETURN(hipErrorInvalidValue);
  }

-  LinkProgram* hip_link_prog_ptr =
-      reinterpret_cast<LinkProgram*>(hip_link_state);
+  LinkProgram* hip_link_prog_ptr = reinterpret_cast<LinkProgram*>(hip_link_state);

  if (!LinkProgram::isLinkerValid(hip_link_prog_ptr)) {
    HIP_RETURN(hipErrorInvalidValue);
@@ -1175,8 +1169,7 @@ hipError_t hipLinkComplete(hipLinkState_t hip_link_state, void** bin_out, size_t
    HIP_RETURN(hipErrorInvalidValue);
  }

-  LinkProgram* hip_link_prog_ptr =
-      reinterpret_cast<LinkProgram*>(hip_link_state);
+  LinkProgram* hip_link_prog_ptr = reinterpret_cast<LinkProgram*>(hip_link_state);

  if (!LinkProgram::isLinkerValid(hip_link_prog_ptr)) {
    HIP_RETURN(hipErrorInvalidValue);
@@ -1192,8 +1185,7 @@ hipError_t hipLinkComplete(hipLinkState_t hip_link_state, void** bin_out, size_t
 hipError_t hipLinkDestroy(hipLinkState_t hip_link_state) {
  HIP_INIT_API(hipLinkDestroy, hip_link_state);

-  LinkProgram* hip_link_prog_ptr =
-      reinterpret_cast<LinkProgram*>(hip_link_state);
+  LinkProgram* hip_link_prog_ptr = reinterpret_cast<LinkProgram*>(hip_link_state);

  if (!LinkProgram::isLinkerValid(hip_link_prog_ptr)) {
    HIP_RETURN(hipErrorInvalidValue);
@@ -1263,8 +1255,7 @@ hipError_t hipDrvLaunchKernelEx(const HIP_LAUNCH_CONFIG* config, hipFunction_t f
  for (size_t attr_idx = 0; attr_idx < config->numAttrs; ++attr_idx) {
    hipLaunchAttribute& attr = config->attrs[attr_idx];
    switch (attr.id) {
-      case hipLaunchAttributeCooperative:
-      {
+      case hipLaunchAttributeCooperative: {
        if (attr.value.cooperative != 0) {
          HIP_RETURN(ihipModuleLaunchKernel(f, launch_params, config->hStream, kernelParams,
                                            nullptr, nullptr, nullptr, 0,
@@ -38,21 +38,20 @@ hipError_t canAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId){
    return hipSuccess;
  }
  /* Cannot exceed the max number of devices */
-  if (static_cast<size_t>(deviceId) >= g_devices.size()
-       || static_cast<size_t>(peerDeviceId) >= g_devices.size()) {
+  if (static_cast<size_t>(deviceId) >= g_devices.size() ||
+      static_cast<size_t>(peerDeviceId) >= g_devices.size()) {
    return hipErrorInvalidDevice;
  }
  device = g_devices[deviceId]->devices()[0];
  peer_device = g_devices[peerDeviceId]->devices()[0];
-  *canAccessPeer = static_cast<int>(std::find(device->p2pDevices_.begin(),
-                                              device->p2pDevices_.end(), as_cl(peer_device))
-                                              != device->p2pDevices_.end());
+  *canAccessPeer =
+      static_cast<int>(std::find(device->p2pDevices_.begin(), device->p2pDevices_.end(),
+                                 as_cl(peer_device)) != device->p2pDevices_.end());
  return hipSuccess;
 }

 hipError_t findLinkInfo(int device1, int device2,
                        std::vector<amd::Device::LinkAttrType>* link_attrs) {
-
  amd::Device* amd_dev_obj1 = nullptr;
  amd::Device* amd_dev_obj2 = nullptr;
  const int numDevices = static_cast<int>(g_devices.size());
@@ -71,12 +70,12 @@ hipError_t findLinkInfo(int device1, int device2,
  return hipSuccess;
 }

-hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2,
-                                        uint32_t* linktype, uint32_t* hopcount) {
+hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype,
+                                        uint32_t* hopcount) {
  HIP_INIT_API(hipExtGetLinkTypeAndHopCount, device1, device2, linktype, hopcount);

-  if (linktype == nullptr || hopcount == nullptr ||
-      device1 == device2  || device1 < 0 || device2 < 0) {
+  if (linktype == nullptr || hopcount == nullptr || device1 == device2 || device1 < 0 ||
+      device2 < 0) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  // Fill out the list of LinkAttributes
@@ -92,16 +91,16 @@ hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
-                                    int srcDevice, int dstDevice) {
+hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, int srcDevice,
+                                    int dstDevice) {
  HIP_INIT_API(hipDeviceGetP2PAttribute, value, attr, srcDevice, dstDevice);

  if (value == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

-  if (srcDevice == dstDevice || srcDevice >= static_cast<int>(g_devices.size())
-      || dstDevice >= static_cast<int>(g_devices.size())) {
+  if (srcDevice == dstDevice || srcDevice >= static_cast<int>(g_devices.size()) ||
+      dstDevice >= static_cast<int>(g_devices.size())) {
    HIP_RETURN(hipErrorInvalidDevice);
  }

@@ -193,13 +192,12 @@ hipError_t hipMemcpyPeer(void* dst, int dstDevice, const void* src, int srcDevic
  HIP_INIT_API(hipMemcpyPeer, dst, dstDevice, src, srcDevice, sizeBytes);
  CHECK_STREAM_CAPTURING();
  if (srcDevice >= static_cast<int>(g_devices.size()) ||
-      dstDevice >= static_cast<int>(g_devices.size()) ||
-      srcDevice < 0 || dstDevice < 0) {
+      dstDevice >= static_cast<int>(g_devices.size()) || srcDevice < 0 || dstDevice < 0) {
    HIP_RETURN(hipErrorInvalidDevice);
  }

-  HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice, *hip::getNullStream(),
-                        true, false));
+  HIP_RETURN(
+      ihipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice, *hip::getNullStream(), true, false));
 }

 hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int srcDevice,
@@ -207,8 +205,7 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src
  HIP_INIT_API(hipMemcpyPeerAsync, dst, dstDevice, src, srcDevice, sizeBytes, stream);

  if (srcDevice >= static_cast<int>(g_devices.size()) ||
-      dstDevice >= static_cast<int>(g_devices.size()) ||
-      srcDevice < 0 || dstDevice < 0) {
+      dstDevice >= static_cast<int>(g_devices.size()) || srcDevice < 0 || dstDevice < 0) {
    HIP_RETURN(hipErrorInvalidDevice);
  }
  getStreamPerThread(stream);
@@ -50,11 +50,13 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t h
 hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memory** amd_mem_obj,
                                  hipDeviceptr_t* dptr, size_t* bytes);

-extern hipError_t ihipModuleLaunchKernel(
-    hipFunction_t f, amd::LaunchParams& launch_params, hipStream_t hStream, void** kernelParams,
-    void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags = 0,
-    uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0,
-    uint64_t allGridSum = 0, uint32_t firstDevice = 0);
+extern hipError_t ihipModuleLaunchKernel(hipFunction_t f, amd::LaunchParams& launch_params,
+                                         hipStream_t hStream, void** kernelParams, void** extra,
+                                         hipEvent_t startEvent, hipEvent_t stopEvent,
+                                         uint32_t flags = 0, uint32_t params = 0,
+                                         uint32_t gridId = 0, uint32_t numGrids = 0,
+                                         uint64_t prevGridSum = 0, uint64_t allGridSum = 0,
+                                         uint32_t firstDevice = 0);
 static bool isCompatibleCodeObject(const std::string& codeobj_target_id, const char* device_name) {
  // Workaround for device name mismatch.
  // Device name may contain feature strings delimited by '+', e.g.
@@ -83,9 +85,8 @@ void** __hipRegisterFatBinary(const void* data) {
 }

 void __hipRegisterFunction(hip::FatBinaryInfo** modules, const void* hostFunction,
-                                      char* deviceFunction, const char* deviceName,
-                                      unsigned int threadLimit, uint3* tid, uint3* bid,
-                                      dim3* blockDim, dim3* gridDim, int* wSize) {
+                           char* deviceFunction, const char* deviceName, unsigned int threadLimit,
+                           uint3* tid, uint3* bid, dim3* blockDim, dim3* gridDim, int* wSize) {
  static int enable_deferred_loading{[]() {
    char* var = getenv("HIP_ENABLE_DEFERRED_LOADING");
    return var ? atoi(var) : 1;
@@ -106,8 +107,7 @@ void __hipRegisterFunction(hip::FatBinaryInfo** modules, const void* hostFunctio

    for (size_t dev_idx = 0; dev_idx < g_devices.size(); ++dev_idx) {
      hip_error = PlatformState::instance().getStatFunc(&hfunc, hostFunction, dev_idx);
-      guarantee((hip_error == hipSuccess), "Cannot retrieve Static function, error: %d",
-                                            hip_error);
+      guarantee((hip_error == hipSuccess), "Cannot retrieve Static function, error: %d", hip_error);
    }
  }
 }
@@ -117,8 +117,7 @@ void __hipRegisterFunction(hip::FatBinaryInfo** modules, const void* hostFunctio
 // global variable in host code. The shadow host variable is used to keep
 // track of the value of the device side global variable between kernel
 // executions.
-void __hipRegisterVar(
-    hip::FatBinaryInfo** modules,  // The device modules containing code object
+void __hipRegisterVar(hip::FatBinaryInfo** modules,  // The device modules containing code object
                      void* var,                     // The shadow variable in host code
                      char* hostVar,                 // Variable name in host code
                      char* deviceVar,               // Variable name in device code
@@ -152,7 +151,6 @@ void __hipRegisterManagedVar(
    void* init_value,  // Initial value to be copied into \p pointer
    const char* name,  // Name of the variable in code object
    size_t size, unsigned align) {
-  
  static int enable_deferred_loading{[]() {
 #ifdef _WIN32  // Don't defer loading for windows
    return 0;
@@ -237,15 +235,14 @@ void __hipRegisterTexture(void** modules, void* var, char* hostVar, char* device
 }
 void __hipRegisterVar(void** modules, void* var, char* hostVar, char* deviceVar, int ext,
                      size_t size, int constant, int global) {
-  return __hipRegisterVar(reinterpret_cast<hip::FatBinaryInfo**>(modules), var, hostVar,
-                          deviceVar, ext, size, constant, global);
+  return __hipRegisterVar(reinterpret_cast<hip::FatBinaryInfo**>(modules), var, hostVar, deviceVar,
+                          ext, size, constant, global);
 }
 void __hipUnregisterFatBinary(void** modules) {
  return __hipUnregisterFatBinary(reinterpret_cast<hip::FatBinaryInfo**>(modules));
 }

-hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem,
-                                       hipStream_t stream) {
+hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, hipStream_t stream) {
  HIP_INIT_API(hipConfigureCall, gridDim, blockDim, sharedMem, stream);

  PlatformState::instance().configureCall(gridDim, blockDim, sharedMem, stream);
@@ -345,7 +342,6 @@ hipError_t hipGetSymbolSize(size_t* sizePtr, const void* symbol) {

 hipError_t ihipCreateGlobalVarObj(const char* name, hipModule_t hmod, amd::Memory** amd_mem_obj,
                                  hipDeviceptr_t* dptr, size_t* bytes) {
-
  /* Get Device Program pointer*/
  amd::Program* program = as_amd(reinterpret_cast<cl_program>(hmod));
  device::Program* dev_program = program->getDeviceProgram(*hip::getCurrentDevice()->devices()[0]);
@@ -674,8 +670,8 @@ hipError_t ihipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDi
    return hipErrorInvalidConfiguration;
  }

-  return ihipModuleLaunchKernel(func, launch_params, stream, args, nullptr,
-                                startEvent, stopEvent, flags);
+  return ihipModuleLaunchKernel(func, launch_params, stream, args, nullptr, startEvent, stopEvent,
+                                flags);
 }

 // conversion routines between float and half precision
@@ -50,10 +50,9 @@ template <hip_api_id_t operation_id> class api_callbacks_spawner_t {
    static_assert(operation_id >= HIP_API_ID_FIRST && operation_id <= HIP_API_ID_LAST,
                  "invalid HIP_API operation id");

-    if (auto function =
-            amd::activity_prof::report_activity.load(std::memory_order_relaxed);
-        function && (enabled_ = function(ACTIVITY_DOMAIN_HIP_API, operation_id,
-                                         &trace_data_) == 0)) {
+    if (auto function = amd::activity_prof::report_activity.load(std::memory_order_relaxed);
+        function &&
+        (enabled_ = function(ACTIVITY_DOMAIN_HIP_API, operation_id, &trace_data_) == 0)) {
      amd::activity_prof::correlation_id = trace_data_.api_data.correlation_id;

      if (trace_data_.phase_enter != nullptr) {
@@ -67,9 +67,7 @@ hipError_t Stream::EndCapture() {
 }

 // ================================================================================================
-bool Stream::Create() {
-  return create();
-}
+bool Stream::Create() { return create(); }

 // ================================================================================================
 void Stream::Destroy(hip::Stream* stream, bool forceDestroy) {
@@ -106,9 +104,7 @@ bool isValid(hipStream_t& stream) {
 }

 // ================================================================================================
-int Stream::DeviceId() const {
-  return device_->deviceId();
-}
+int Stream::DeviceId() const { return device_->deviceId(); }

 // ================================================================================================
 int Stream::DeviceId(const hipStream_t hStream) {
@@ -181,8 +177,8 @@ void CL_CALLBACK ihipStreamCallback(cl_event event, cl_int command_exec_status,
 }

 // ================================================================================================
-static hipError_t ihipStreamCreate(hipStream_t* stream,
-                                  unsigned int flags, hip::Stream::Priority priority,
+static hipError_t ihipStreamCreate(hipStream_t* stream, unsigned int flags,
+                                   hip::Stream::Priority priority,
                                   const std::vector<uint32_t>& cuMask = {}) {
  if (flags != hipStreamDefault && flags != hipStreamNonBlocking) {
    return hipErrorInvalidValue;
@@ -191,8 +187,7 @@ static hipError_t ihipStreamCreate(hipStream_t* stream,

  if (hStream == nullptr) {
    return hipErrorOutOfMemory;
-  }
-  else if (!hStream->Create()) {
+  } else if (!hStream->Create()) {
    hip::Stream::Destroy(hStream);
    return hipErrorOutOfMemory;
  }
@@ -233,8 +228,8 @@ hipStream_t stream_per_thread::get() {
  // There is a scenario where hipResetDevice destroys stream per thread
  // hence isValid check is required to make sure only valid stream is used
  if (m_streams[currDev] == nullptr || !hip::isValid(m_streams[currDev])) {
-    hipError_t status = ihipStreamCreate(&m_streams[currDev], hipStreamDefault,
-                                         hip::Stream::Priority::Normal);
+    hipError_t status =
+        ihipStreamCreate(&m_streams[currDev], hipStreamDefault, hip::Stream::Priority::Normal);
    if (status != hipSuccess) {
      DevLogError("Stream creation failed");
    }
@@ -417,8 +412,8 @@ hipError_t hipStreamDestroy(hipStream_t stream) {
      g_allCapturingStreams.erase(g_it);
    }
  }
-  const auto& l_it = std::find(hip::tls.capture_streams_.begin(),
-                      hip::tls.capture_streams_.end(), s);
+  const auto& l_it =
+      std::find(hip::tls.capture_streams_.begin(), hip::tls.capture_streams_.end(), s);
  if (l_it != hip::tls.capture_streams_.end()) {
    hip::tls.capture_streams_.erase(l_it);
  }
@@ -680,7 +675,8 @@ hipError_t hipExtStreamCreateWithCUMask(hipStream_t* stream, uint32_t cuMaskSize

  const std::vector<uint32_t> cuMaskv(cuMask, cuMask + cuMaskSize);

-  HIP_RETURN(ihipStreamCreate(stream, hipStreamDefault, hip::Stream::Priority::Normal, cuMaskv), *stream);
+  HIP_RETURN(ihipStreamCreate(stream, hipStreamDefault, hip::Stream::Priority::Normal, cuMaskv),
+             *stream);
 }

 // ================================================================================================
@@ -727,8 +723,7 @@ hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32

  // find the minimum cuMaskSize required to present the CU mask bit-array in a patch of 32 bits
  // and return error if the cuMaskSize argument is less than cuMaskSizeRequired
-  uint32_t cuMaskSizeRequired = info.maxComputeUnits_ / 32 +
-    ((info.maxComputeUnits_ % 32) ? 1 : 0);
+  uint32_t cuMaskSizeRequired = info.maxComputeUnits_ / 32 + ((info.maxComputeUnits_ % 32) ? 1 : 0);

  if (cuMaskSize < cuMaskSizeRequired) {
    HIP_RETURN(hipErrorInvalidValue);
@@ -892,4 +887,4 @@ hipError_t hipStreamGetAttribute(hipStream_t stream, hipStreamAttrID attr,

  HIP_RETURN(hipSuccess);
 }
-} // hip namespace
+}  // namespace hip
@@ -108,8 +108,8 @@ hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void
  amd::Command::EventWaitList waitList;

  amd::StreamOperationCommand* command =
-    new amd::StreamOperationCommand(*hip_stream, cmdType, waitList, *memory->asBuffer(),
-                                    value, mask, outFlags, offset, sizeBytes);
+      new amd::StreamOperationCommand(*hip_stream, cmdType, waitList, *memory->asBuffer(), value,
+                                      mask, outFlags, offset, sizeBytes);

  if (command == nullptr) {
    return hipErrorOutOfMemory;
@@ -124,48 +124,30 @@ hipError_t hipStreamWaitValue32(hipStream_t stream, void* ptr, uint32_t value, u
  HIP_INIT_API(hipStreamWaitValue32, stream, ptr, value, mask, flags);
  // NOTE: ptr corresponds to a HSA Signal memeory which is 64 bits.
  // 32 bit value and mask are converted to 64-bit values.
-  HIP_RETURN_DURATION(ihipStreamOperation(
-      stream,
-      ROCCLR_COMMAND_STREAM_WAIT_VALUE,
-      ptr,
-      value,
-      mask,
-      flags,
-      sizeof(uint32_t)));
+  HIP_RETURN_DURATION(ihipStreamOperation(stream, ROCCLR_COMMAND_STREAM_WAIT_VALUE, ptr, value,
+                                          mask, flags, sizeof(uint32_t)));
 }

 hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, uint64_t value, unsigned int flags,
                                uint64_t mask) {
  HIP_INIT_API(hipStreamWaitValue64, stream, ptr, value, mask, flags);
-  HIP_RETURN_DURATION(ihipStreamOperation(
-      stream,
-      ROCCLR_COMMAND_STREAM_WAIT_VALUE,
-      ptr,
-      value,
-      mask,
-      flags,
-      sizeof(uint64_t)));
+  HIP_RETURN_DURATION(ihipStreamOperation(stream, ROCCLR_COMMAND_STREAM_WAIT_VALUE, ptr, value,
+                                          mask, flags, sizeof(uint64_t)));
 }

-hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, uint32_t value, unsigned int flags) {
+hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, uint32_t value,
+                                 unsigned int flags) {
  HIP_INIT_API(hipStreamWriteValue32, stream, ptr, value, flags);
-  HIP_RETURN_DURATION(ihipStreamOperation(
-      stream,
-      ROCCLR_COMMAND_STREAM_WRITE_VALUE,
-      ptr,
-      value,
+  HIP_RETURN_DURATION(ihipStreamOperation(stream, ROCCLR_COMMAND_STREAM_WRITE_VALUE, ptr, value,
                                          0,  // mask un-used set it to 0
                                          0,  // flags un-used for now set it to 0
                                          sizeof(uint32_t)));
 }

-hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value, unsigned int flags) {
+hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value,
+                                 unsigned int flags) {
  HIP_INIT_API(hipStreamWriteValue64, stream, ptr, value, flags);
-  HIP_RETURN_DURATION(ihipStreamOperation(
-      stream,
-      ROCCLR_COMMAND_STREAM_WRITE_VALUE,
-      ptr,
-      value,
+  HIP_RETURN_DURATION(ihipStreamOperation(stream, ROCCLR_COMMAND_STREAM_WRITE_VALUE, ptr, value,
                                          0,  // mask un-used set it to 0
                                          0,  // flags un-used for now set it to 0
                                          sizeof(uint64_t)));
@@ -174,11 +156,7 @@ hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, uint64_t value,
 hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count,
                               hipStreamBatchMemOpParams* paramArray, unsigned int flags) {
  HIP_INIT_API(hipStreamBatchMemOp, count, paramArray, flags);
-  HIP_RETURN_DURATION(ihipBatchMemOperation(
-                      stream,
-                      ROCCLR_COMMAND_BATCH_STREAM,
-                      count,
-                      paramArray,
-                      flags));
+  HIP_RETURN_DURATION(
+      ihipBatchMemOperation(stream, ROCCLR_COMMAND_BATCH_STREAM, count, paramArray, flags));
 }
 }  // namespace hip
@@ -78,8 +78,8 @@ hipError_t ihipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject,
  image = as_amd(memObj)->asImage();

  void* surfObjectBuffer = nullptr;
-  hipError_t err = ihipMalloc(&surfObjectBuffer, sizeof(__hip_surface),
-                              CL_MEM_SVM_FINE_GRAIN_BUFFER);
+  hipError_t err =
+      ihipMalloc(&surfObjectBuffer, sizeof(__hip_surface), CL_MEM_SVM_FINE_GRAIN_BUFFER);
  if (surfObjectBuffer == nullptr || err != hipSuccess) {
    return hipErrorOutOfMemory;
  }
@@ -421,13 +421,11 @@ extern "C" hipError_t hipGetDevicePropertiesR0000(hipDeviceProp_tR0000* prop, in
 }
 hipError_t hipGetDriverEntryPoint(const char* symbol, void** funcPtr, unsigned long long flags,
                                  hipDriverEntryPointQueryResult* status) {
-  return hip::GetHipDispatchTable()->hipGetDriverEntryPoint_fn(symbol, funcPtr, flags,
-                                                            status);
+  return hip::GetHipDispatchTable()->hipGetDriverEntryPoint_fn(symbol, funcPtr, flags, status);
 }
 hipError_t hipGetDriverEntryPoint_spt(const char* symbol, void** funcPtr, unsigned long long flags,
                                      hipDriverEntryPointQueryResult* status) {
-  return hip::GetHipDispatchTable()->hipGetDriverEntryPoint_spt_fn(symbol, funcPtr, flags,
-                                                                   status);
+  return hip::GetHipDispatchTable()->hipGetDriverEntryPoint_spt_fn(symbol, funcPtr, flags, status);
 }
 const char* hipGetErrorName(hipError_t hip_error) {
  return hip::GetHipDispatchTable()->hipGetErrorName_fn(hip_error);
@@ -559,8 +557,8 @@ hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
 hipError_t hipGraphAddNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
                           const hipGraphNode_t* pDependencies, size_t numDependencies,
                           hipGraphNodeParams* nodeParams) {
-  return hip::GetHipDispatchTable()->hipGraphAddNode_fn(pGraphNode, graph,
-                                            pDependencies, numDependencies, nodeParams);
+  return hip::GetHipDispatchTable()->hipGraphAddNode_fn(pGraphNode, graph, pDependencies,
+                                                        numDependencies, nodeParams);
 }
 hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph) {
  return hip::GetHipDispatchTable()->hipGraphChildGraphNodeGetGraph_fn(node, pGraph);
@@ -599,8 +597,7 @@ hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGra
 }
 hipError_t hipGraphExecNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node,
                                     hipGraphNodeParams* nodeParams) {
-  return hip::GetHipDispatchTable()->hipGraphExecNodeSetParams_fn(hGraphExec, node,
-                                                                  nodeParams);
+  return hip::GetHipDispatchTable()->hipGraphExecNodeSetParams_fn(hGraphExec, node, nodeParams);
 }
 hipError_t hipGraphExecDestroy(hipGraphExec_t graphExec) {
  return hip::GetHipDispatchTable()->hipGraphExecDestroy_fn(graphExec);
@@ -842,8 +839,8 @@ hipError_t hipImportExternalSemaphore(hipExternalSemaphore_t* extSem_out,
 hipError_t hipDrvGraphAddMemsetNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
                                    const hipGraphNode_t* dependencies, size_t numDependencies,
                                    const hipMemsetParams* memsetParams, hipCtx_t ctx) {
-  return hip::GetHipDispatchTable()->hipDrvGraphAddMemsetNode_fn(phGraphNode, hGraph,
-                                            dependencies, numDependencies, memsetParams, ctx);
+  return hip::GetHipDispatchTable()->hipDrvGraphAddMemsetNode_fn(
+      phGraphNode, hGraph, dependencies, numDependencies, memsetParams, ctx);
 }
 hipError_t hipInit(unsigned int flags) { return hip::GetHipDispatchTable()->hipInit_fn(flags); }
 hipError_t hipIpcCloseMemHandle(void* devPtr) {
@@ -1288,8 +1285,9 @@ hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned
                                                            optionValues);
 }

-hipError_t hipLinkAddData(hipLinkState_t state, hipJitInputType type, void* data, size_t size, const char* name,
-                          unsigned int numOptions, hipJitOption* options, void** optionValues) {
+hipError_t hipLinkAddData(hipLinkState_t state, hipJitInputType type, void* data, size_t size,
+                          const char* name, unsigned int numOptions, hipJitOption* options,
+                          void** optionValues) {
  return hip::GetHipDispatchTable()->hipLinkAddData_fn(state, type, data, size, name, numOptions,
                                                       options, optionValues);
 }
@@ -1304,7 +1302,8 @@ hipError_t hipLinkComplete(hipLinkState_t state, void** hipBinOut, size_t* sizeO
  return hip::GetHipDispatchTable()->hipLinkComplete_fn(state, hipBinOut, sizeOut);
 }

-hipError_t hipLinkCreate(unsigned int numOptions, hipJitOption* options, void** optionValues, hipLinkState_t* stateOut) {
+hipError_t hipLinkCreate(unsigned int numOptions, hipJitOption* options, void** optionValues,
+                         hipLinkState_t* stateOut) {
  return hip::GetHipDispatchTable()->hipLinkCreate_fn(numOptions, options, optionValues, stateOut);
 }

@@ -1609,8 +1608,9 @@ DllExport hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWo
                                              uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
                                              uint32_t localWorkSizeX, uint32_t localWorkSizeY,
                                              uint32_t localWorkSizeZ, size_t sharedMemBytes,
-                                    hipStream_t hStream, void** kernelParams, void** extra,
-                                    hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) {
+                                              hipStream_t hStream, void** kernelParams,
+                                              void** extra, hipEvent_t startEvent,
+                                              hipEvent_t stopEvent, uint32_t flags) {
  return hip::GetHipDispatchTable()->hipExtModuleLaunchKernel_fn(
      f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
      localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags);
@@ -1620,8 +1620,9 @@ DllExport hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWo
                                              uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
                                              uint32_t localWorkSizeX, uint32_t localWorkSizeY,
                                              uint32_t localWorkSizeZ, size_t sharedMemBytes,
-                                    hipStream_t hStream, void** kernelParams, void** extra,
-                                    hipEvent_t startEvent, hipEvent_t stopEvent) {
+                                              hipStream_t hStream, void** kernelParams,
+                                              void** extra, hipEvent_t startEvent,
+                                              hipEvent_t stopEvent) {
  return hip::GetHipDispatchTable()->hipHccModuleLaunchKernel_fn(
      f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
      localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent);
@@ -1785,9 +1786,7 @@ hipError_t hipLaunchHostFunc_spt(hipStream_t stream, hipHostFn_t fn, void* userD
 extern "C" int hipGetStreamDeviceId(hipStream_t stream) {
  return hip::GetHipDispatchTable()->hipGetStreamDeviceId_fn(stream);
 }
-hipError_t hipExtGetLastError() {
-  return hip::GetHipDispatchTable()->hipExtGetLastError_fn();
-}
+hipError_t hipExtGetLastError() { return hip::GetHipDispatchTable()->hipExtGetLastError_fn(); }
 hipError_t hipTexRefGetBorderColor(float* pBorderColor, const textureReference* texRef) {
  return hip::GetHipDispatchTable()->hipTexRefGetBorderColor_fn(pBorderColor, texRef);
 }
@@ -1821,9 +1820,8 @@ hipError_t hipGraphExecGetFlags(hipGraphExec_t graphExec, unsigned long long* fl
 hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
                                     const hipGraphNode_t* dependencies, size_t numDependencies,
                                     hipDeviceptr_t dptr) {
-  return hip::GetHipDispatchTable()->hipDrvGraphAddMemFreeNode_fn(phGraphNode, hGraph,
-                                  dependencies, numDependencies,
-                                  dptr);
+  return hip::GetHipDispatchTable()->hipDrvGraphAddMemFreeNode_fn(phGraphNode, hGraph, dependencies,
+                                                                  numDependencies, dptr);
 }
 hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
                                              const HIP_MEMCPY3D* copyParams, hipCtx_t ctx) {
@@ -1835,8 +1833,7 @@ hipError_t hipSetValidDevices(int* device_arr, int len) {
 }
 hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice, hipArray_t srcArray, size_t srcOffset,
                         size_t ByteCount) {
-  return hip::GetHipDispatchTable()->hipMemcpyAtoD_fn(dstDevice, srcArray, srcOffset,
-                                                      ByteCount);
+  return hip::GetHipDispatchTable()->hipMemcpyAtoD_fn(dstDevice, srcArray, srcOffset, ByteCount);
 }
 hipError_t hipMemcpyDtoA(hipArray_t dstArray, size_t dstOffset, hipDeviceptr_t srcDevice,
                         size_t ByteCount) {
@@ -1886,8 +1883,7 @@ hipError_t hipGraphBatchMemOpNodeSetParams(hipGraphNode_t hNode,
                                           hipBatchMemOpNodeParams* nodeParams) {
  return hip::GetHipDispatchTable()->hipGraphBatchMemOpNodeSetParams_fn(hNode, nodeParams);
 }
-hipError_t hipGraphExecBatchMemOpNodeSetParams(hipGraphExec_t hGraphExec,
-                                               hipGraphNode_t hNode,
+hipError_t hipGraphExecBatchMemOpNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
                                               const hipBatchMemOpNodeParams* nodeParams) {
  return hip::GetHipDispatchTable()->hipGraphExecBatchMemOpNodeSetParams_fn(hGraphExec, hNode,
                                                                            nodeParams);
@@ -1915,8 +1911,8 @@ hipError_t hipMemsetD2D8(hipDeviceptr_t dst, size_t dstPitch, unsigned char valu
                         size_t height) {
  return hip::GetHipDispatchTable()->hipMemsetD2D8_fn(dst, dstPitch, value, width, height);
 }
-hipError_t hipMemsetD2D8Async(hipDeviceptr_t dst, size_t dstPitch, unsigned char value, size_t width,
-                              size_t height, hipStream_t stream) {
+hipError_t hipMemsetD2D8Async(hipDeviceptr_t dst, size_t dstPitch, unsigned char value,
+                              size_t width, size_t height, hipStream_t stream) {
  return hip::GetHipDispatchTable()->hipMemsetD2D8Async_fn(dst, dstPitch, value, width, height,
                                                           stream);
 }
@@ -1924,8 +1920,8 @@ hipError_t hipMemsetD2D16(hipDeviceptr_t dst, size_t dstPitch, unsigned short va
                          size_t height) {
  return hip::GetHipDispatchTable()->hipMemsetD2D16_fn(dst, dstPitch, value, width, height);
 }
-hipError_t hipMemsetD2D16Async(hipDeviceptr_t dst, size_t dstPitch, unsigned short value, size_t width,
-                              size_t height, hipStream_t stream) {
+hipError_t hipMemsetD2D16Async(hipDeviceptr_t dst, size_t dstPitch, unsigned short value,
+                               size_t width, size_t height, hipStream_t stream) {
  return hip::GetHipDispatchTable()->hipMemsetD2D16Async_fn(dst, dstPitch, value, width, height,
                                                            stream);
 }
@@ -1933,8 +1929,8 @@ hipError_t hipMemsetD2D32(hipDeviceptr_t dst, size_t dstPitch, unsigned int valu
                          size_t height) {
  return hip::GetHipDispatchTable()->hipMemsetD2D32_fn(dst, dstPitch, value, width, height);
 }
-hipError_t hipMemsetD2D32Async(hipDeviceptr_t dst, size_t dstPitch, unsigned int value, size_t width,
-                              size_t height, hipStream_t stream) {
+hipError_t hipMemsetD2D32Async(hipDeviceptr_t dst, size_t dstPitch, unsigned int value,
+                               size_t width, size_t height, hipStream_t stream) {
  return hip::GetHipDispatchTable()->hipMemsetD2D32Async_fn(dst, dstPitch, value, width, height,
                                                            stream);
 }
@@ -36,8 +36,9 @@ DllExport hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWo
                                              uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
                                              uint32_t localWorkSizeX, uint32_t localWorkSizeY,
                                              uint32_t localWorkSizeZ, size_t sharedMemBytes,
-                                    hipStream_t hStream, void** kernelParams, void** extra,
-                                    hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags) {
+                                              hipStream_t hStream, void** kernelParams,
+                                              void** extra, hipEvent_t startEvent,
+                                              hipEvent_t stopEvent, uint32_t flags) {
  return hip::GetHipDispatchTable()->hipExtModuleLaunchKernel_fn(
      f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
      localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent, flags);
@@ -46,8 +47,9 @@ DllExport hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWo
                                              uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
                                              uint32_t localWorkSizeX, uint32_t localWorkSizeY,
                                              uint32_t localWorkSizeZ, size_t sharedMemBytes,
-                                    hipStream_t hStream, void** kernelParams, void** extra,
-                                    hipEvent_t startEvent, hipEvent_t stopEvent) {
+                                              hipStream_t hStream, void** kernelParams,
+                                              void** extra, hipEvent_t startEvent,
+                                              hipEvent_t stopEvent) {
  return hip::GetHipDispatchTable()->hipHccModuleLaunchKernel_fn(
      f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY,
      localWorkSizeZ, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent);
@@ -34,12 +34,9 @@ struct __hip_texture {
  hipTextureDesc texDesc;
  hipResourceViewDesc resViewDesc;

-  __hip_texture(amd::Image* image_,
-                amd::Sampler* sampler_,
-                const hipResourceDesc& resDesc_,
-                const hipTextureDesc& texDesc_,
-                const hipResourceViewDesc& resViewDesc_) :
-    image(image_),
+  __hip_texture(amd::Image* image_, amd::Sampler* sampler_, const hipResourceDesc& resDesc_,
+                const hipTextureDesc& texDesc_, const hipResourceViewDesc& resViewDesc_)
+      : image(image_),
        sampler(sampler_),
        resDesc(resDesc_),
        texDesc(texDesc_),
@@ -58,22 +55,14 @@ struct __hip_texture {
 namespace hip {

 hipError_t ihipFree(void* ptr);
-amd::Image* ihipImageCreate(const cl_channel_order channelOrder,
-                            const cl_channel_type channelType,
-                            const cl_mem_object_type imageType,
-                            const size_t imageWidth,
-                            const size_t imageHeight,
-                            const size_t imageDepth,
-                            const size_t imageArraySize,
-                            const size_t imageRowPitch,
-                            const size_t imageSlicePitch,
-                            const uint32_t numMipLevels,
-                            const size_t offset,
-                            amd::Memory* buffer,
-                            hipError_t& status);
+amd::Image* ihipImageCreate(const cl_channel_order channelOrder, const cl_channel_type channelType,
+                            const cl_mem_object_type imageType, const size_t imageWidth,
+                            const size_t imageHeight, const size_t imageDepth,
+                            const size_t imageArraySize, const size_t imageRowPitch,
+                            const size_t imageSlicePitch, const uint32_t numMipLevels,
+                            const size_t offset, amd::Memory* buffer, hipError_t& status);

-hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
-                                   const hipResourceDesc* pResDesc,
+hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResourceDesc* pResDesc,
                                   const hipTextureDesc* pTexDesc,
                                   const hipResourceViewDesc* pResViewDesc) {
  amd::Device* device = hip::getCurrentDevice()->devices()[0];
@@ -88,9 +77,11 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
    return hipErrorInvalidChannelDescriptor;
  }

-  // pResViewDesc can only be specified if the type of resource is a HIP array or a HIP mipmapped array.
+  // pResViewDesc can only be specified if the type of resource is a HIP array or a HIP mipmapped
+  // array.
  if ((pResViewDesc != nullptr) &&
-      ((pResDesc->resType != hipResourceTypeArray) && (pResDesc->resType != hipResourceTypeMipmappedArray))) {
+      ((pResDesc->resType != hipResourceTypeArray) &&
+       (pResDesc->resType != hipResourceTypeMipmappedArray))) {
    return hipErrorUnknown;
  }

@@ -132,21 +123,25 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
  }

  // If hipResourceDesc::resType is set to hipResourceTypeLinear,
-  // hipResourceDesc::res::linear::devPtr must be set to a valid device pointer, that is aligned to hipDeviceProp::textureAlignment.
-  // The total number of elements in the linear address range cannot exceed hipDeviceProp::maxTexture1DLinear.
+  // hipResourceDesc::res::linear::devPtr must be set to a valid device pointer, that is aligned to
+  // hipDeviceProp::textureAlignment. The total number of elements in the linear address range
+  // cannot exceed hipDeviceProp::maxTexture1DLinear.
  if ((pResDesc->resType == hipResourceTypeLinear) &&
      ((pResDesc->res.linear.devPtr == nullptr) ||
       (!amd::isMultipleOf(pResDesc->res.linear.devPtr, info.imageBaseAddressAlignment_)) ||
-       (pResDesc->res.linear.sizeInBytes >= info.imageMaxBufferSize_ * hip::getElementSize(pResDesc->res.linear.desc)))) {
+       (pResDesc->res.linear.sizeInBytes >=
+        info.imageMaxBufferSize_ * hip::getElementSize(pResDesc->res.linear.desc)))) {
    return hipErrorInvalidChannelDescriptor;
  }

  // If hipResourceDesc::resType is set to hipResourceTypePitch2D,
-  // hipResourceDesc::res::pitch2D::devPtr must be set to a valid device pointer, that is aligned to hipDeviceProp::textureAlignment.
-  // hipResourceDesc::res::pitch2D::width and hipResourceDesc::res::pitch2D::height specify the width and height of the array in elements,
-  // and cannot exceed hipDeviceProp::maxTexture2DLinear[0] and hipDeviceProp::maxTexture2DLinear[1] respectively.
-  // hipResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to hipDeviceProp::texturePitchAlignment.
-  // Pitch cannot exceed hipDeviceProp::maxTexture2DLinear[2].
+  // hipResourceDesc::res::pitch2D::devPtr must be set to a valid device pointer, that is aligned to
+  // hipDeviceProp::textureAlignment. hipResourceDesc::res::pitch2D::width and
+  // hipResourceDesc::res::pitch2D::height specify the width and height of the array in elements,
+  // and cannot exceed hipDeviceProp::maxTexture2DLinear[0] and hipDeviceProp::maxTexture2DLinear[1]
+  // respectively. hipResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows
+  // in bytes and has to be aligned to hipDeviceProp::texturePitchAlignment. Pitch cannot exceed
+  // hipDeviceProp::maxTexture2DLinear[2].
  if ((pResDesc->resType == hipResourceTypePitch2D) &&
      ((pResDesc->res.pitch2D.devPtr == nullptr) ||
       (!amd::isMultipleOf(pResDesc->res.pitch2D.devPtr, info.imageBaseAddressAlignment_)) ||
@@ -207,13 +202,9 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
    mipFilterMode = hip::getCLFilterMode(pTexDesc->mipmapFilterMode);
  }

-  amd::Sampler* sampler = new amd::Sampler(*hip::getCurrentDevice()->asContext(),
-                                           pTexDesc->normalizedCoords,
-                                           addressMode,
-                                           filterMode,
-                                           mipFilterMode,
-                                           pTexDesc->minMipmapLevelClamp,
-                                           pTexDesc->maxMipmapLevelClamp);
+  amd::Sampler* sampler = new amd::Sampler(
+      *hip::getCurrentDevice()->asContext(), pTexDesc->normalizedCoords, addressMode, filterMode,
+      mipFilterMode, pTexDesc->minMipmapLevelClamp, pTexDesc->maxMipmapLevelClamp);

  if (sampler == nullptr) {
    return hipErrorOutOfMemory;
@@ -243,14 +234,15 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,

      // We need to create an image view if the user requested to use normalized pixel values,
      // due to already having the image created with a different format.
-    if ((pResViewDesc != nullptr) ||
-        (readMode == hipReadModeNormalizedFloat) ||
+      if ((pResViewDesc != nullptr) || (readMode == hipReadModeNormalizedFloat) ||
          (pTexDesc->sRGB == 1)) {
        // TODO ROCclr currently right now can only change the format of the image.
-      const cl_channel_order channelOrder = (pResViewDesc != nullptr) ? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB) :
-                                                                        hip::getCLChannelOrder(pResDesc->res.array.array->NumChannels, pTexDesc->sRGB);
-      const cl_channel_type channelType = (pResViewDesc != nullptr) ? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode) :
-                                                                      hip::getCLChannelType(pResDesc->res.array.array->Format, readMode);
+        const cl_channel_order channelOrder = (pResViewDesc != nullptr)
+            ? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB)
+            : hip::getCLChannelOrder(pResDesc->res.array.array->NumChannels, pTexDesc->sRGB);
+        const cl_channel_type channelType = (pResViewDesc != nullptr)
+            ? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode)
+            : hip::getCLChannelType(pResDesc->res.array.array->Format, readMode);
        const amd::Image::Format imageFormat(cl_image_format{channelOrder, channelType});
        if (!imageFormat.isValid()) {
          return hipErrorInvalidValue;
@@ -307,16 +299,17 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
      break;
    }
    case hipResourceTypeLinear: {
-    const cl_channel_order channelOrder = hip::getCLChannelOrder(hip::getNumChannels(pResDesc->res.linear.desc), pTexDesc->sRGB);
-    const cl_channel_type channelType = hip::getCLChannelType(hip::getArrayFormat(pResDesc->res.linear.desc), pTexDesc->readMode);
+      const cl_channel_order channelOrder =
+          hip::getCLChannelOrder(hip::getNumChannels(pResDesc->res.linear.desc), pTexDesc->sRGB);
+      const cl_channel_type channelType =
+          hip::getCLChannelType(hip::getArrayFormat(pResDesc->res.linear.desc), pTexDesc->readMode);
      const amd::Image::Format imageFormat({channelOrder, channelType});
      const cl_mem_object_type imageType = hip::getCLMemObjectType(pResDesc->resType);
      const size_t imageSizeInBytes = pResDesc->res.linear.sizeInBytes;
-    amd::Memory* buffer = getMemoryObjectWithOffset(pResDesc->res.linear.devPtr, imageSizeInBytes);
+      amd::Memory* buffer =
+          getMemoryObjectWithOffset(pResDesc->res.linear.devPtr, imageSizeInBytes);
      hipError_t status = hipSuccess;
-    image = ihipImageCreate(channelOrder,
-                            channelType,
-                            imageType,
+      image = ihipImageCreate(channelOrder, channelType, imageType,
                              imageSizeInBytes / imageFormat.getElementSize(), /* imageWidth */
                              0,                                               /* imageHeight */
                              0,                                               /* imageDepth */
@@ -325,8 +318,7 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
                              0,                                               /* imageSlicePitch */
                              0,                                               /* numMipLevels */
                              0,                                               /* offset */
-                            buffer,
-                            status);
+                              buffer, status);
      if (buffer != nullptr) {
        buffer->release();
      }
@@ -337,17 +329,18 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
      break;
    }
    case hipResourceTypePitch2D: {
-    const cl_channel_order channelOrder = hip::getCLChannelOrder(hip::getNumChannels(pResDesc->res.pitch2D.desc), pTexDesc->sRGB);
-    const cl_channel_type channelType = hip::getCLChannelType(hip::getArrayFormat(pResDesc->res.pitch2D.desc), pTexDesc->readMode);
+      const cl_channel_order channelOrder =
+          hip::getCLChannelOrder(hip::getNumChannels(pResDesc->res.pitch2D.desc), pTexDesc->sRGB);
+      const cl_channel_type channelType = hip::getCLChannelType(
+          hip::getArrayFormat(pResDesc->res.pitch2D.desc), pTexDesc->readMode);
      const amd::Image::Format imageFormat({channelOrder, channelType});
      const cl_mem_object_type imageType = hip::getCLMemObjectType(pResDesc->resType);
      const size_t imageSizeInBytes = pResDesc->res.pitch2D.width * imageFormat.getElementSize() +
          pResDesc->res.pitch2D.pitchInBytes * (pResDesc->res.pitch2D.height - 1);
-    amd::Memory* buffer = getMemoryObjectWithOffset(pResDesc->res.pitch2D.devPtr, imageSizeInBytes);
+      amd::Memory* buffer =
+          getMemoryObjectWithOffset(pResDesc->res.pitch2D.devPtr, imageSizeInBytes);
      hipError_t status = hipSuccess;
-    image = ihipImageCreate(channelOrder,
-                            channelType,
-                            imageType,
+      image = ihipImageCreate(channelOrder, channelType, imageType,
                              pResDesc->res.pitch2D.width,        /* imageWidth */
                              pResDesc->res.pitch2D.height,       /* imageHeight */
                              0,                                  /* imageDepth */
@@ -356,8 +349,7 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
                              0,                                  /* imageSlicePitch */
                              0,                                  /* numMipLevels */
                              0,                                  /* offset */
-                            buffer,
-                            status);
+                              buffer, status);
      if (buffer != nullptr) {
        buffer->release();
      }
@@ -369,17 +361,19 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject,
  }

  void* texObjectBuffer = nullptr;
-  hipError_t err = ihipMalloc(&texObjectBuffer, sizeof(__hip_texture), CL_MEM_SVM_FINE_GRAIN_BUFFER);
+  hipError_t err =
+      ihipMalloc(&texObjectBuffer, sizeof(__hip_texture), CL_MEM_SVM_FINE_GRAIN_BUFFER);
  if (texObjectBuffer == nullptr || err != hipSuccess) {
    return hipErrorOutOfMemory;
  }
-  *pTexObject = new (texObjectBuffer) __hip_texture{image, sampler, *pResDesc, *pTexDesc, (pResViewDesc != nullptr) ? *pResViewDesc : hipResourceViewDesc{}};
+  *pTexObject = new (texObjectBuffer)
+      __hip_texture{image, sampler, *pResDesc, *pTexDesc,
+                    (pResViewDesc != nullptr) ? *pResViewDesc : hipResourceViewDesc{}};

  return hipSuccess;
 }

-hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
-                                  const hipResourceDesc* pResDesc,
+hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResourceDesc* pResDesc,
                                  const hipTextureDesc* pTexDesc,
                                  const hipResourceViewDesc* pResViewDesc) {
  HIP_INIT_API(hipCreateTextureObject, pTexObject, pResDesc, pTexDesc, pResViewDesc);
@@ -410,7 +404,6 @@ hipError_t ihipDestroyTextureObject(hipTextureObject_t texObject) {
 }

 hipError_t ihipUnbindTexture(textureReference* texRef) {
-
  hipError_t hip_error = hipSuccess;

  do {
@@ -487,8 +480,7 @@ hipError_t hipGetTextureObjectResourceViewDesc(hipResourceViewDesc* pResViewDesc
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipGetTextureObjectTextureDesc(hipTextureDesc* pTexDesc,
-                                          hipTextureObject_t texObject) {
+hipError_t hipGetTextureObjectTextureDesc(hipTextureDesc* pTexDesc, hipTextureObject_t texObject) {
  HIP_INIT_API(hipGetTextureObjectTextureDesc, pTexDesc, texObject);

  if ((pTexDesc == nullptr) || (texObject == nullptr)) {
@@ -506,8 +498,7 @@ hipError_t hipGetTextureObjectTextureDesc(hipTextureDesc* pTexDesc,
  HIP_RETURN(hipSuccess);
 }

-inline hipError_t ihipGetTextureAlignmentOffset(size_t* offset,
-                                          const void* devPtr) {
+inline hipError_t ihipGetTextureAlignmentOffset(size_t* offset, const void* devPtr) {
  amd::Device* device = hip::getCurrentDevice()->devices()[0];
  const device::Info& info = device->info();
  if (!info.imageSupport_) {
@@ -515,7 +506,8 @@ inline hipError_t ihipGetTextureAlignmentOffset(size_t* offset,
    return hipErrorNotSupported;
  }

-  const char* alignedDevPtr = amd::alignUp(static_cast<const char*>(devPtr), info.imageBaseAddressAlignment_);
+  const char* alignedDevPtr =
+      amd::alignUp(static_cast<const char*>(devPtr), info.imageBaseAddressAlignment_);
  const size_t alignedOffset = alignedDevPtr - static_cast<const char*>(devPtr);

  // If the device memory pointer was returned from hipMalloc(),
@@ -532,11 +524,8 @@ inline hipError_t ihipGetTextureAlignmentOffset(size_t* offset,
  return hipSuccess;
 }

-hipError_t ihipBindTexture(size_t* offset,
-                           const textureReference* texref,
-                           const void* devPtr,
-                           const hipChannelFormatDesc* desc,
-                           size_t size) {
+hipError_t ihipBindTexture(size_t* offset, const textureReference* texref, const void* devPtr,
+                           const hipChannelFormatDesc* desc, size_t size) {
  if (texref == nullptr) {
    return hipErrorUnknown;
  }
@@ -549,9 +538,8 @@ hipError_t ihipBindTexture(size_t* offset,
    return hipErrorInvalidValue;
  }

-  // Any previous address or HIP array state associated with the texture reference is superseded by this function.
-  // Any memory previously bound to hTexRef is unbound.
-  // No need to check for errors.
+  // Any previous address or HIP array state associated with the texture reference is superseded by
+  // this function. Any memory previously bound to hTexRef is unbound. No need to check for errors.
  hipError_t err = ihipDestroyTextureObject(texref->textureObject);
  if (err != hipSuccess) {
    return err;
@@ -572,26 +560,19 @@ hipError_t ihipBindTexture(size_t* offset,
      static_cast<char*>(const_cast<void*>(devPtr)) - (offset != nullptr ? *offset : 0);
  hipTextureDesc texDesc = hip::getTextureDesc(texref);

-  return ihipCreateTextureObject(const_cast<hipTextureObject_t*>(&texref->textureObject), &resDesc, &texDesc, nullptr);
+  return ihipCreateTextureObject(const_cast<hipTextureObject_t*>(&texref->textureObject), &resDesc,
+                                 &texDesc, nullptr);
 }

-hipError_t ihipBindTexture2D(size_t* offset,
-                             const textureReference* texref,
-                             const void* devPtr,
-                             const hipChannelFormatDesc* desc,
-                             size_t width,
-                             size_t height,
+hipError_t ihipBindTexture2D(size_t* offset, const textureReference* texref, const void* devPtr,
+                             const hipChannelFormatDesc* desc, size_t width, size_t height,
                             size_t pitch) {
-  if ((texref == nullptr) ||
-      (devPtr == nullptr) ||
-      (desc == nullptr) ||
-      (pitch == 0)) {
+  if ((texref == nullptr) || (devPtr == nullptr) || (desc == nullptr) || (pitch == 0)) {
    return hipErrorInvalidValue;
  }

-  // Any previous address or HIP array state associated with the texture reference is superseded by this function.
-  // Any memory previously bound to hTexRef is unbound.
-  // No need to check for errors.
+  // Any previous address or HIP array state associated with the texture reference is superseded by
+  // this function. Any memory previously bound to hTexRef is unbound. No need to check for errors.
  hipError_t err = ihipDestroyTextureObject(texref->textureObject);
  if (err != hipSuccess) {
    return err;
@@ -614,15 +595,12 @@ hipError_t ihipBindTexture2D(size_t* offset,
      static_cast<char*>(const_cast<void*>(devPtr)) - (offset != nullptr ? *offset : 0);
  hipTextureDesc texDesc = hip::getTextureDesc(texref);

-  return ihipCreateTextureObject(const_cast<hipTextureObject_t*>(&texref->textureObject), &resDesc, &texDesc, nullptr);
+  return ihipCreateTextureObject(const_cast<hipTextureObject_t*>(&texref->textureObject), &resDesc,
+                                 &texDesc, nullptr);
 }

-hipError_t hipBindTexture2D(size_t* offset,
-                            const textureReference* texref,
-                            const void* devPtr,
-                            const hipChannelFormatDesc* desc,
-                            size_t width,
-                            size_t height,
+hipError_t hipBindTexture2D(size_t* offset, const textureReference* texref, const void* devPtr,
+                            const hipChannelFormatDesc* desc, size_t width, size_t height,
                            size_t pitch) {
  HIP_INIT_API(hipBindTexture2D, offset, texref, devPtr, desc, width, height, pitch);

@@ -636,8 +614,8 @@ hipError_t hipBindTexture2D(size_t* offset,
  hipDeviceptr_t refDevPtr = nullptr;
  size_t refDevSize = 0;

-  HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(texref, ihipGetDevice(), &refDevPtr,
-                                                               &refDevSize));
+  HIP_RETURN_ONFAIL(
+      PlatformState::instance().getStatGlobalVar(texref, ihipGetDevice(), &refDevPtr, &refDevSize));

  assert(refDevSize == sizeof(textureReference));
  hipError_t err = ihipBindTexture2D(offset, texref, devPtr, desc, width, height, pitch);
@@ -649,12 +627,10 @@ hipError_t hipBindTexture2D(size_t* offset,
  HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream));
 }

-hipError_t ihipBindTextureToArray(const textureReference* texref,
-                                  hipArray_const_t array,
+hipError_t ihipBindTextureToArray(const textureReference* texref, hipArray_const_t array,
                                  const hipChannelFormatDesc* desc) {
-  // Any previous address or HIP array state associated with the texture reference is superseded by this function.
-  // Any memory previously bound to hTexRef is unbound.
-  // No need to check for errors.
+  // Any previous address or HIP array state associated with the texture reference is superseded by
+  // this function. Any memory previously bound to hTexRef is unbound. No need to check for errors.
  hipError_t err = ihipDestroyTextureObject(texref->textureObject);
  if (err != hipSuccess) {
    return err;
@@ -669,11 +645,11 @@ hipError_t ihipBindTextureToArray(const textureReference* texref,
  hipResourceViewFormat format = hip::getResourceViewFormat(*desc);
  hipResourceViewDesc resViewDesc = hip::getResourceViewDesc(array, format);

-  return ihipCreateTextureObject(const_cast<hipTextureObject_t*>(&texref->textureObject), &resDesc, &texDesc, &resViewDesc);
+  return ihipCreateTextureObject(const_cast<hipTextureObject_t*>(&texref->textureObject), &resDesc,
+                                 &texDesc, &resViewDesc);
 }

-hipError_t hipBindTextureToArray(const textureReference* texref,
-                                 hipArray_const_t array,
+hipError_t hipBindTextureToArray(const textureReference* texref, hipArray_const_t array,
                                 const hipChannelFormatDesc* desc) {
  HIP_INIT_API(hipBindTextureToArray, texref, array, desc);

@@ -683,8 +659,8 @@ hipError_t hipBindTextureToArray(const textureReference* texref,

  hipDeviceptr_t refDevPtr = nullptr;
  size_t refDevSize = 0;
-  HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(texref, ihipGetDevice(), &refDevPtr,
-                                                               &refDevSize));
+  HIP_RETURN_ONFAIL(
+      PlatformState::instance().getStatGlobalVar(texref, ihipGetDevice(), &refDevPtr, &refDevSize));

  assert(refDevSize == sizeof(textureReference));
  hipError_t err = ihipBindTextureToArray(texref, array, desc);
@@ -699,15 +675,12 @@ hipError_t hipBindTextureToArray(const textureReference* texref,
 hipError_t ihipBindTextureToMipmappedArray(const textureReference* texref,
                                           hipMipmappedArray_const_t mipmappedArray,
                                           const hipChannelFormatDesc* desc) {
-  if ((texref == nullptr) ||
-      (mipmappedArray == nullptr) ||
-      (desc == nullptr)) {
+  if ((texref == nullptr) || (mipmappedArray == nullptr) || (desc == nullptr)) {
    return hipErrorInvalidValue;
  }

-  // Any previous address or HIP array state associated with the texture reference is superseded by this function.
-  // Any memory previously bound to hTexRef is unbound.
-  // No need to check for errors.
+  // Any previous address or HIP array state associated with the texture reference is superseded by
+  // this function. Any memory previously bound to hTexRef is unbound. No need to check for errors.
  hipError_t err = ihipDestroyTextureObject(texref->textureObject);
  if (err != hipSuccess) {
    return err;
@@ -722,7 +695,8 @@ hipError_t ihipBindTextureToMipmappedArray(const textureReference* texref,
  hipResourceViewFormat format = hip::getResourceViewFormat(*desc);
  hipResourceViewDesc resViewDesc = hip::getResourceViewDesc(mipmappedArray, format);

-  return ihipCreateTextureObject(const_cast<hipTextureObject_t*>(&texref->textureObject), &resDesc, &texDesc, &resViewDesc);
+  return ihipCreateTextureObject(const_cast<hipTextureObject_t*>(&texref->textureObject), &resDesc,
+                                 &texDesc, &resViewDesc);
 }

 hipError_t hipBindTextureToMipmappedArray(const textureReference* texref,
@@ -733,8 +707,8 @@ hipError_t hipBindTextureToMipmappedArray(const textureReference* texref,
  hipDeviceptr_t refDevPtr = nullptr;
  size_t refDevSize = 0;

-  HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(texref, ihipGetDevice(), &refDevPtr,
-                                                               &refDevSize));
+  HIP_RETURN_ONFAIL(
+      PlatformState::instance().getStatGlobalVar(texref, ihipGetDevice(), &refDevPtr, &refDevSize));

  assert(refDevSize == sizeof(textureReference));
  hipError_t err = ihipBindTextureToMipmappedArray(texref, mipmappedArray, desc);
@@ -752,17 +726,14 @@ hipError_t hipUnbindTexture(const textureReference* texref) {
  HIP_RETURN(ihipUnbindTexture(const_cast<textureReference*>(texref)));
 }

-hipError_t hipBindTexture(size_t* offset,
-                          const textureReference* texref,
-                          const void* devPtr,
-                          const hipChannelFormatDesc* desc,
-                          size_t size) {
+hipError_t hipBindTexture(size_t* offset, const textureReference* texref, const void* devPtr,
+                          const hipChannelFormatDesc* desc, size_t size) {
  HIP_INIT_API(hipBindTexture, offset, texref, devPtr, desc, size);

  hipDeviceptr_t refDevPtr = nullptr;
  size_t refDevSize = 0;
-  HIP_RETURN_ONFAIL(PlatformState::instance().getStatGlobalVar(texref, ihipGetDevice(), &refDevPtr,
-                                                               &refDevSize));
+  HIP_RETURN_ONFAIL(
+      PlatformState::instance().getStatGlobalVar(texref, ihipGetDevice(), &refDevPtr, &refDevSize));
  assert(refDevSize == sizeof(textureReference));
  hipError_t err = ihipBindTexture(offset, texref, devPtr, desc, size);
  if (err != hipSuccess) {
@@ -773,8 +744,7 @@ hipError_t hipBindTexture(size_t* offset,
  HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream));
 }

-hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc,
-                             hipArray_const_t array) {
+hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array) {
  HIP_INIT_API(hipGetChannelDesc, desc, array);

  if (desc == nullptr) {
@@ -790,15 +760,15 @@ hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc,
    HIP_RETURN(hipErrorNotSupported);
  }

-  // It is UB to call hipGetChannelDesc() on an array created via hipArrayCreate()/hipArray3DCreate().
-  // This is due to hip not differentiating between runtime and driver types.
+  // It is UB to call hipGetChannelDesc() on an array created via
+  // hipArrayCreate()/hipArray3DCreate(). This is due to hip not differentiating between runtime and
+  // driver types.
  *desc = array->desc;

  HIP_RETURN(hipSuccess);
 }

-hipError_t hipGetTextureAlignmentOffset(size_t* offset,
-                                        const textureReference* texref) {
+hipError_t hipGetTextureAlignmentOffset(size_t* offset, const textureReference* texref) {
  HIP_INIT_API(hipGetTextureAlignmentOffset, offset, texref);

  if (texref == nullptr) {
@@ -839,8 +809,7 @@ hipError_t hipGetTextureReference(const textureReference** texref, const void* s
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefSetFormat(textureReference* texRef,
-                              hipArray_Format fmt,
+hipError_t hipTexRefSetFormat(textureReference* texRef, hipArray_Format fmt,
                              int NumPackedComponents) {
  HIP_INIT_API(hipTexRefSetFormat, texRef, fmt, NumPackedComponents);

@@ -860,8 +829,7 @@ hipError_t hipTexRefSetFormat(textureReference* texRef,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefSetFlags(textureReference* texRef,
-                             unsigned int Flags) {
+hipError_t hipTexRefSetFlags(textureReference* texRef, unsigned int Flags) {
  HIP_INIT_API(hipTexRefSetFlags, texRef, Flags);

  if (texRef == nullptr) {
@@ -893,8 +861,7 @@ hipError_t hipTexRefSetFlags(textureReference* texRef,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefSetFilterMode(textureReference* texRef,
-                                  hipTextureFilterMode fm) {
+hipError_t hipTexRefSetFilterMode(textureReference* texRef, hipTextureFilterMode fm) {
  HIP_INIT_API(hipTexRefSetFilterMode, texRef, fm);

  if (texRef == nullptr) {
@@ -912,8 +879,7 @@ hipError_t hipTexRefSetFilterMode(textureReference* texRef,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefGetAddressMode(hipTextureAddressMode* pam,
-                                   const textureReference* texRef,
+hipError_t hipTexRefGetAddressMode(hipTextureAddressMode* pam, const textureReference* texRef,
                                   int dim) {
  // TODO overload operator<<(ostream&, textureReference&).
  HIP_INIT_API(hipTexRefGetAddressMode, pam, texRef, dim);
@@ -942,9 +908,7 @@ hipError_t hipTexRefGetAddressMode(hipTextureAddressMode* pam,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefSetAddressMode(textureReference* texRef,
-                                   int dim,
-                                   hipTextureAddressMode am) {
+hipError_t hipTexRefSetAddressMode(textureReference* texRef, int dim, hipTextureAddressMode am) {
  HIP_INIT_API(hipTexRefSetAddressMode, texRef, dim, am);

  if (texRef == nullptr) {
@@ -970,8 +934,7 @@ hipError_t hipTexRefSetAddressMode(textureReference* texRef,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefGetArray(hipArray_t* pArray,
-                             const textureReference* texRef) {
+hipError_t hipTexRefGetArray(hipArray_t* pArray, const textureReference* texRef) {
  // TODO overload operator<<(ostream&, textureReference&).
  HIP_INIT_API(hipTexRefGetArray, pArray, texRef);

@@ -1000,9 +963,7 @@ hipError_t hipTexRefGetArray(hipArray_t* pArray,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefSetArray(textureReference* texRef,
-                             hipArray_const_t array,
-                             unsigned int flags) {
+hipError_t hipTexRefSetArray(textureReference* texRef, hipArray_const_t array, unsigned int flags) {
  HIP_INIT_API(hipTexRefSetArray, texRef, array, flags);

  if ((texRef == nullptr) || (array == nullptr)) {
@@ -1019,9 +980,8 @@ hipError_t hipTexRefSetArray(textureReference* texRef,
  HIP_RETURN_ONFAIL(PlatformState::instance().getDynTexGlobalVar(texRef, &refDevPtr, &refDevSize));
  assert(refDevSize == sizeof(textureReference));

-  // Any previous address or HIP array state associated with the texture reference is superseded by this function.
-  // Any memory previously bound to hTexRef is unbound.
-  // No need to check for errors.
+  // Any previous address or HIP array state associated with the texture reference is superseded by
+  // this function. Any memory previously bound to hTexRef is unbound. No need to check for errors.
  hipError_t err = ihipDestroyTextureObject(texRef->textureObject);
  if (err != hipSuccess) {
    HIP_RETURN(err);
@@ -1038,7 +998,8 @@ hipError_t hipTexRefSetArray(textureReference* texRef,

  hipTextureDesc texDesc = hip::getTextureDesc(texRef);

-  hipResourceViewFormat format = hip::getResourceViewFormat(hip::getChannelFormatDesc(texRef->numChannels, texRef->format));
+  hipResourceViewFormat format =
+      hip::getResourceViewFormat(hip::getChannelFormatDesc(texRef->numChannels, texRef->format));
  hipResourceViewDesc resViewDesc = hip::getResourceViewDesc(array, format);

  err = ihipCreateTextureObject(&texRef->textureObject, &resDesc, &texDesc, &resViewDesc);
@@ -1050,8 +1011,7 @@ hipError_t hipTexRefSetArray(textureReference* texRef,
  HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream));
 }

-hipError_t hipTexRefGetAddress(hipDeviceptr_t* dptr,
-                               const textureReference* texRef) {
+hipError_t hipTexRefGetAddress(hipDeviceptr_t* dptr, const textureReference* texRef) {
  // TODO overload operator<<(ostream&, textureReference&).
  HIP_INIT_API(hipTexRefGetAddress, dptr, texRef);

@@ -1087,9 +1047,7 @@ hipError_t hipTexRefGetAddress(hipDeviceptr_t* dptr,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefSetAddress(size_t* ByteOffset,
-                               textureReference* texRef,
-                               hipDeviceptr_t dptr,
+hipError_t hipTexRefSetAddress(size_t* ByteOffset, textureReference* texRef, hipDeviceptr_t dptr,
                               size_t bytes) {
  HIP_INIT_API(hipTexRefSetAddress, ByteOffset, texRef, dptr, bytes);

@@ -1102,9 +1060,8 @@ hipError_t hipTexRefSetAddress(size_t* ByteOffset,
  HIP_RETURN_ONFAIL(PlatformState::instance().getDynTexGlobalVar(texRef, &refDevPtr, &refDevSize));
  assert(refDevSize == sizeof(textureReference));

-  // Any previous address or HIP array state associated with the texture reference is superseded by this function.
-  // Any memory previously bound to hTexRef is unbound.
-  // No need to check for errors.
+  // Any previous address or HIP array state associated with the texture reference is superseded by
+  // this function. Any memory previously bound to hTexRef is unbound. No need to check for errors.
  hipError_t err = ihipDestroyTextureObject(texRef->textureObject);
  if (err != hipSuccess) {
    HIP_RETURN(err);
@@ -1133,10 +1090,8 @@ hipError_t hipTexRefSetAddress(size_t* ByteOffset,
  HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream));
 }

-hipError_t hipTexRefSetAddress2D(textureReference* texRef,
-                                 const HIP_ARRAY_DESCRIPTOR* desc,
-                                 hipDeviceptr_t dptr,
-                                 size_t Pitch) {
+hipError_t hipTexRefSetAddress2D(textureReference* texRef, const HIP_ARRAY_DESCRIPTOR* desc,
+                                 hipDeviceptr_t dptr, size_t Pitch) {
  HIP_INIT_API(hipTexRefSetAddress2D, texRef, desc, dptr, Pitch);

  if ((texRef == nullptr) || (desc == nullptr)) {
@@ -1148,9 +1103,8 @@ hipError_t hipTexRefSetAddress2D(textureReference* texRef,
  HIP_RETURN_ONFAIL(PlatformState::instance().getDynTexGlobalVar(texRef, &refDevPtr, &refDevSize));
  assert(refDevSize == sizeof(textureReference));

-  // Any previous address or HIP array state associated with the texture reference is superseded by this function.
-  // Any memory previously bound to hTexRef is unbound.
-  // No need to check for errors.
+  // Any previous address or HIP array state associated with the texture reference is superseded by
+  // this function. Any memory previously bound to hTexRef is unbound. No need to check for errors.
  hipError_t err = ihipDestroyTextureObject(texRef->textureObject);
  if (err != hipSuccess) {
    HIP_RETURN(err);
@@ -1159,7 +1113,8 @@ hipError_t hipTexRefSetAddress2D(textureReference* texRef,
  hipResourceDesc resDesc = {};
  resDesc.resType = hipResourceTypePitch2D;
  resDesc.res.linear.devPtr = dptr;
-  resDesc.res.linear.desc = hip::getChannelFormatDesc(desc->NumChannels, desc->Format); // Need to verify.
+  resDesc.res.linear.desc =
+      hip::getChannelFormatDesc(desc->NumChannels, desc->Format);  // Need to verify.
  resDesc.res.pitch2D.width = desc->Width;
  resDesc.res.pitch2D.height = desc->Height;
  resDesc.res.pitch2D.pitchInBytes = Pitch;
@@ -1179,8 +1134,7 @@ hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannel
  return {x, y, z, w, f};
 }

-hipError_t hipTexRefGetBorderColor(float* pBorderColor,
-                                   const textureReference* texRef) {
+hipError_t hipTexRefGetBorderColor(float* pBorderColor, const textureReference* texRef) {
  // TODO overload operator<<(ostream&, textureReference&).
  HIP_INIT_API(hipTexRefGetBorderColor, pBorderColor, texRef);

@@ -1201,8 +1155,7 @@ hipError_t hipTexRefGetBorderColor(float* pBorderColor,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefGetFilterMode(hipTextureFilterMode* pfm,
-                                  const textureReference* texRef) {
+hipError_t hipTexRefGetFilterMode(hipTextureFilterMode* pfm, const textureReference* texRef) {
  // TODO overload operator<<(ostream&, textureReference&).
  HIP_INIT_API(hipTexRefGetFilterMode, pfm, texRef);

@@ -1221,8 +1174,7 @@ hipError_t hipTexRefGetFilterMode(hipTextureFilterMode* pfm,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefGetFlags(unsigned int* pFlags,
-                             const textureReference* texRef) {
+hipError_t hipTexRefGetFlags(unsigned int* pFlags, const textureReference* texRef) {
  // TODO overload operator<<(ostream&, textureReference&).
  HIP_INIT_API(hipTexRefGetFlags, pFlags, texRef);

@@ -1253,14 +1205,12 @@ hipError_t hipTexRefGetFlags(unsigned int* pFlags,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefGetFormat(hipArray_Format* pFormat,
-                              int* pNumChannels,
+hipError_t hipTexRefGetFormat(hipArray_Format* pFormat, int* pNumChannels,
                              const textureReference* texRef) {
  // TODO overload operator<<(ostream&, textureReference&).
  HIP_INIT_API(hipTexRefGetFormat, pFormat, pNumChannels, texRef);

-  if ((pFormat == nullptr) || (pNumChannels == nullptr) ||
-      (texRef == nullptr)) {
+  if ((pFormat == nullptr) || (pNumChannels == nullptr) || (texRef == nullptr)) {
    HIP_RETURN(hipErrorInvalidValue);
  }
  amd::Device* device = hip::getCurrentDevice()->devices()[0];
@@ -1276,8 +1226,7 @@ hipError_t hipTexRefGetFormat(hipArray_Format* pFormat,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefGetMaxAnisotropy(int* pmaxAnsio,
-                                     const textureReference* texRef) {
+hipError_t hipTexRefGetMaxAnisotropy(int* pmaxAnsio, const textureReference* texRef) {
  // TODO overload operator<<(ostream&, textureReference&).
  HIP_INIT_API(hipTexRefGetMaxAnisotropy, pmaxAnsio, texRef);

@@ -1296,8 +1245,7 @@ hipError_t hipTexRefGetMaxAnisotropy(int* pmaxAnsio,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefGetMipmapFilterMode(hipTextureFilterMode* pfm,
-                                        const textureReference* texRef) {
+hipError_t hipTexRefGetMipmapFilterMode(hipTextureFilterMode* pfm, const textureReference* texRef) {
  // TODO overload operator<<(ostream&, textureReference&).
  HIP_INIT_API(hipTexRefGetMipmapFilterMode, pfm, texRef);

@@ -1316,8 +1264,7 @@ hipError_t hipTexRefGetMipmapFilterMode(hipTextureFilterMode* pfm,
  HIP_RETURN(hipErrorInvalidValue);
 }

-hipError_t hipTexRefGetMipmapLevelBias(float* pbias,
-                                       const textureReference* texRef) {
+hipError_t hipTexRefGetMipmapLevelBias(float* pbias, const textureReference* texRef) {
  // TODO overload operator<<(ostream&, textureReference&).
  HIP_INIT_API(hipTexRefGetMipmapLevelBias, pbias, texRef);

@@ -1336,8 +1283,7 @@ hipError_t hipTexRefGetMipmapLevelBias(float* pbias,
  HIP_RETURN(hipErrorInvalidValue);
 }

-hipError_t hipTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp,
-                                        float* pmaxMipmapLevelClamp,
+hipError_t hipTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp, float* pmaxMipmapLevelClamp,
                                        const textureReference* texRef) {
  // TODO overload operator<<(ostream&, textureReference&).
  HIP_INIT_API(hipTexRefGetMipmapLevelClamp, pminMipmapLevelClamp, pmaxMipmapLevelClamp, texRef);
@@ -1359,8 +1305,7 @@ hipError_t hipTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp,
  HIP_RETURN(hipErrorInvalidValue);
 }

-hipError_t hipTexRefGetMipMappedArray(hipMipmappedArray_t* pArray,
-                                      const textureReference* texRef) {
+hipError_t hipTexRefGetMipMappedArray(hipMipmappedArray_t* pArray, const textureReference* texRef) {
  // TODO overload operator<<(ostream&, textureReference&).
  HIP_INIT_API(hipTexRefGetMipMappedArray, pArray, &texRef);

@@ -1395,8 +1340,7 @@ hipError_t hipTexRefGetMipMappedArray(hipMipmappedArray_t* pArray,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefSetBorderColor(textureReference* texRef,
-                                   float* pBorderColor) {
+hipError_t hipTexRefSetBorderColor(textureReference* texRef, float* pBorderColor) {
  HIP_INIT_API(hipTexRefSetBorderColor, texRef, pBorderColor);

  if ((texRef == nullptr) || (pBorderColor == nullptr)) {
@@ -1416,8 +1360,7 @@ hipError_t hipTexRefSetBorderColor(textureReference* texRef,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefSetMaxAnisotropy(textureReference* texRef,
-                                     unsigned int maxAniso) {
+hipError_t hipTexRefSetMaxAnisotropy(textureReference* texRef, unsigned int maxAniso) {
  HIP_INIT_API(hipTexRefSetMaxAnisotropy, texRef, maxAniso);

  if (texRef == nullptr) {
@@ -1435,8 +1378,7 @@ hipError_t hipTexRefSetMaxAnisotropy(textureReference* texRef,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefSetMipmapFilterMode(textureReference* texRef,
-                                        hipTextureFilterMode fm) {
+hipError_t hipTexRefSetMipmapFilterMode(textureReference* texRef, hipTextureFilterMode fm) {
  HIP_INIT_API(hipTexRefSetMipmapFilterMode, texRef, fm);

  if (texRef == nullptr) {
@@ -1454,8 +1396,7 @@ hipError_t hipTexRefSetMipmapFilterMode(textureReference* texRef,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefSetMipmapLevelBias(textureReference* texRef,
-                                       float bias) {
+hipError_t hipTexRefSetMipmapLevelBias(textureReference* texRef, float bias) {
  HIP_INIT_API(hipTexRefSetMipmapLevelBias, texRef, bias);

  if (texRef == nullptr) {
@@ -1473,8 +1414,7 @@ hipError_t hipTexRefSetMipmapLevelBias(textureReference* texRef,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefSetMipmapLevelClamp(textureReference* texRef,
-                                        float minMipMapLevelClamp,
+hipError_t hipTexRefSetMipmapLevelClamp(textureReference* texRef, float minMipMapLevelClamp,
                                        float maxMipMapLevelClamp) {
  HIP_INIT_API(hipTexRefSetMipmapLevelClamp, minMipMapLevelClamp, maxMipMapLevelClamp);

@@ -1494,8 +1434,7 @@ hipError_t hipTexRefSetMipmapLevelClamp(textureReference* texRef,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexRefSetMipmappedArray(textureReference* texRef,
-                                      hipMipmappedArray* mipmappedArray,
+hipError_t hipTexRefSetMipmappedArray(textureReference* texRef, hipMipmappedArray* mipmappedArray,
                                      unsigned int Flags) {
  HIP_INIT_API(hipTexRefSetMipmappedArray, texRef, mipmappedArray, Flags);

@@ -1512,9 +1451,8 @@ hipError_t hipTexRefSetMipmappedArray(textureReference* texRef,
  HIP_RETURN_ONFAIL(PlatformState::instance().getDynTexGlobalVar(texRef, &refDevPtr, &refDevSize));
  assert(refDevSize == sizeof(textureReference));

-  // Any previous address or HIP array state associated with the texture reference is superseded by this function.
-  // Any memory previously bound to hTexRef is unbound.
-  // No need to check for errors.
+  // Any previous address or HIP array state associated with the texture reference is superseded by
+  // this function. Any memory previously bound to hTexRef is unbound. No need to check for errors.
  hipError_t err = ihipDestroyTextureObject(texRef->textureObject);
  if (err != hipSuccess) {
    HIP_RETURN(err);
@@ -1526,7 +1464,8 @@ hipError_t hipTexRefSetMipmappedArray(textureReference* texRef,

  hipTextureDesc texDesc = hip::getTextureDesc(texRef);

-  hipResourceViewFormat format = hip::getResourceViewFormat(hip::getChannelFormatDesc(texRef->numChannels, texRef->format));
+  hipResourceViewFormat format =
+      hip::getResourceViewFormat(hip::getChannelFormatDesc(texRef->numChannels, texRef->format));
  hipResourceViewDesc resViewDesc = hip::getResourceViewDesc(mipmappedArray, format);

  err = ihipCreateTextureObject(&texRef->textureObject, &resDesc, &texDesc, &resViewDesc);
@@ -1538,8 +1477,7 @@ hipError_t hipTexRefSetMipmappedArray(textureReference* texRef,
  HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream));
 }

-hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject,
-                              const HIP_RESOURCE_DESC* pResDesc,
+hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject, const HIP_RESOURCE_DESC* pResDesc,
                              const HIP_TEXTURE_DESC* pTexDesc,
                              const HIP_RESOURCE_VIEW_DESC* pResViewDesc) {
  HIP_INIT_API(hipTexObjectCreate, pTexObject, pResDesc, pTexDesc, pResViewDesc);
@@ -1565,8 +1503,7 @@ hipError_t hipTexObjectDestroy(hipTextureObject_t texObject) {
  HIP_RETURN(ihipDestroyTextureObject(texObject));
 }

-hipError_t hipTexObjectGetResourceDesc(HIP_RESOURCE_DESC* pResDesc,
-                                       hipTextureObject_t texObject) {
+hipError_t hipTexObjectGetResourceDesc(HIP_RESOURCE_DESC* pResDesc, hipTextureObject_t texObject) {
  HIP_INIT_API(hipTexObjectGetResourceDesc, pResDesc, texObject);

  if ((pResDesc == nullptr) || (texObject == nullptr)) {
@@ -1603,8 +1540,7 @@ hipError_t hipTexObjectGetResourceViewDesc(HIP_RESOURCE_VIEW_DESC* pResViewDesc,
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipTexObjectGetTextureDesc(HIP_TEXTURE_DESC* pTexDesc,
-                                      hipTextureObject_t texObject) {
+hipError_t hipTexObjectGetTextureDesc(HIP_TEXTURE_DESC* pTexDesc, hipTextureObject_t texObject) {
  HIP_INIT_API(hipTexObjectGetTextureDesc, pTexDesc, texObject);

  if ((pTexDesc == nullptr) || (texObject == nullptr)) {
@@ -23,14 +23,14 @@
 #include "hip_vm.hpp"
 namespace hip {

-static_assert(static_cast<uint32_t>(hipMemAccessFlagsProtNone)
-              == static_cast<uint32_t>(amd::Device::VmmAccess::kNone),
+static_assert(static_cast<uint32_t>(hipMemAccessFlagsProtNone) ==
+                  static_cast<uint32_t>(amd::Device::VmmAccess::kNone),
              "Mem Access Flag None mismatch with ROCclr!");
-static_assert(static_cast<uint32_t>(hipMemAccessFlagsProtRead)
-              == static_cast<uint32_t>(amd::Device::VmmAccess::kReadOnly),
+static_assert(static_cast<uint32_t>(hipMemAccessFlagsProtRead) ==
+                  static_cast<uint32_t>(amd::Device::VmmAccess::kReadOnly),
              "Mem Access Flag Read mismatch with ROCclr!");
-static_assert(static_cast<uint32_t>(hipMemAccessFlagsProtReadWrite)
-              == static_cast<uint32_t>(amd::Device::VmmAccess::kReadWrite),
+static_assert(static_cast<uint32_t>(hipMemAccessFlagsProtReadWrite) ==
+                  static_cast<uint32_t>(amd::Device::VmmAccess::kReadWrite),
              "Mem Access Flag Read Write mismatch with ROCclr!");

 hipError_t hipMemAddressFree(void* devPtr, size_t size) {
@@ -60,8 +60,8 @@ hipError_t hipMemAddressReserve(void** ptr, size_t size, size_t alignment, void*
  }

  const auto& dev_info = g_devices[0]->devices()[0]->info();
-  if (size == 0 || ((size % dev_info.virtualMemAllocGranularity_) != 0)
-      || ((alignment & (alignment - 1)) != 0)) {
+  if (size == 0 || ((size % dev_info.virtualMemAllocGranularity_) != 0) ||
+      ((alignment & (alignment - 1)) != 0)) {
    HIP_RETURN(hipErrorInvalidValue);
  }

@@ -98,8 +98,8 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size,
    HIP_RETURN(hipErrorInvalidDevice);
  }

-  if (prop->requestedHandleTypes != hipMemHandleTypeNone
-      && prop->requestedHandleTypes != hipMemHandleTypePosixFileDescriptor) {
+  if (prop->requestedHandleTypes != hipMemHandleTypeNone &&
+      prop->requestedHandleTypes != hipMemHandleTypePosixFileDescriptor) {
    HIP_RETURN(hipErrorNotSupported);
  }

@@ -128,8 +128,10 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size,
    size_t free = 0, total = 0;
    hipError_t hip_error = hipMemGetInfo(&free, &total);
    if (hip_error == hipSuccess) {
-      LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu"
-                                                "| total :%zu", size, free, total);
+      LogPrintfError(
+          "Allocation failed : Device memory : required :%zu | free :%zu"
+          "| total :%zu",
+          size, free, total);
    }
    HIP_RETURN(hipErrorOutOfMemory);
  }
@@ -183,8 +185,8 @@ hipError_t hipMemExportToShareableHandle(void* shareableHandle,
 hipError_t hipMemGetAccess(unsigned long long* flags, const hipMemLocation* location, void* ptr) {
  HIP_INIT_API(hipMemGetAccess, flags, location, ptr);

-  if (flags == nullptr || location == nullptr || ptr == nullptr
-      || location->type != hipMemLocationTypeDevice || location->id >= g_devices.size()) {
+  if (flags == nullptr || location == nullptr || ptr == nullptr ||
+      location->type != hipMemLocationTypeDevice || location->id >= g_devices.size()) {
    HIP_RETURN(hipErrorInvalidValue)
  }

@@ -219,7 +221,8 @@ hipError_t hipMemGetAllocationGranularity(size_t* granularity, const hipMemAlloc
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop, hipMemGenericAllocationHandle_t handle) {
+hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop,
+                                                   hipMemGenericAllocationHandle_t handle) {
  HIP_INIT_API(hipMemGetAllocationPropertiesFromHandle, prop, handle);

  if (handle == nullptr || prop == nullptr) {
@@ -287,7 +290,8 @@ hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocat
  HIP_RETURN(hipSuccess);
 }

-hipError_t hipMemMapArrayAsync(hipArrayMapInfo* mapInfoList, unsigned int  count, hipStream_t stream) {
+hipError_t hipMemMapArrayAsync(hipArrayMapInfo* mapInfoList, unsigned int count,
+                               hipStream_t stream) {
  HIP_INIT_API(hipMemMapArrayAsync, mapInfoList, count, stream);

  if (mapInfoList == nullptr || count == 0) {
@@ -47,12 +47,10 @@ public:
  hipMemGenericAllocationHandle_t asMemGenericAllocationHandle() {
    return reinterpret_cast<hipMemGenericAllocationHandle_t>(this);
  }
-  amd::Memory& asAmdMemory() {
-    return phys_mem_ref_;
-  }
+  amd::Memory& asAmdMemory() { return phys_mem_ref_; }

  virtual ObjectType objectType() const { return ObjectTypeVMMAlloc; }
 };
-};
+};  // namespace hip

 #endif  // HIP_SRC_HIP_VM_H
@@ -309,8 +309,7 @@ hiprtcResult hiprtcLinkCreate(unsigned int num_options, hiprtcJIT_option* option

  std::string name("LinkerProgram");
  hip::LinkProgram* rtc_link_prog_ptr = new hip::LinkProgram(name);
-  if (!rtc_link_prog_ptr->AddLinkerOptions(num_options, options_ptr,
-                                          options_vals_pptr)) {
+  if (!rtc_link_prog_ptr->AddLinkerOptions(num_options, options_ptr, options_vals_pptr)) {
    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_OPTION);
  }

@@ -335,8 +334,7 @@ hiprtcResult hiprtcLinkAddFile(hiprtcLinkState hip_link_state, hiprtcJITInputTyp
    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
  }

-  hip::LinkProgram* rtc_link_prog_ptr =
-      reinterpret_cast<hip::LinkProgram*>(hip_link_state);
+  hip::LinkProgram* rtc_link_prog_ptr = reinterpret_cast<hip::LinkProgram*>(hip_link_state);

  if (!hip::LinkProgram::isLinkerValid(rtc_link_prog_ptr)) {
    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
@@ -371,8 +369,7 @@ hiprtcResult hiprtcLinkAddData(hiprtcLinkState hip_link_state, hiprtcJITInputTyp
    input_name = name;
  }

-  hip::LinkProgram* rtc_link_prog_ptr =
-      reinterpret_cast<hip::LinkProgram*>(hip_link_state);
+  hip::LinkProgram* rtc_link_prog_ptr = reinterpret_cast<hip::LinkProgram*>(hip_link_state);

  if (!hip::LinkProgram::isLinkerValid(rtc_link_prog_ptr)) {
    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
@@ -392,8 +389,7 @@ hiprtcResult hiprtcLinkComplete(hiprtcLinkState hip_link_state, void** bin_out,
    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
  }

-  hip::LinkProgram* rtc_link_prog_ptr =
-      reinterpret_cast<hip::LinkProgram*>(hip_link_state);
+  hip::LinkProgram* rtc_link_prog_ptr = reinterpret_cast<hip::LinkProgram*>(hip_link_state);

  if (!hip::LinkProgram::isLinkerValid(rtc_link_prog_ptr)) {
    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
@@ -409,8 +405,7 @@ hiprtcResult hiprtcLinkComplete(hiprtcLinkState hip_link_state, void** bin_out,
 hiprtcResult hiprtcLinkDestroy(hiprtcLinkState hip_link_state) {
  HIPRTC_INIT_API(hip_link_state);

-  hip::LinkProgram* rtc_link_prog_ptr =
-      reinterpret_cast<hip::LinkProgram*>(hip_link_state);
+  hip::LinkProgram* rtc_link_prog_ptr = reinterpret_cast<hip::LinkProgram*>(hip_link_state);

  if (!hip::LinkProgram::isLinkerValid(rtc_link_prog_ptr)) {
    HIPRTC_RETURN(HIPRTC_ERROR_INVALID_INPUT);
@@ -90,7 +90,8 @@ bool RTCCompileProgram::addSource(const std::string& source, const std::string&
 // objects
 bool RTCCompileProgram::addSource_impl() {
  std::vector<char> vsource(source_code_.begin(), source_code_.end());
-  if (!hip::helpers::addCodeObjData(compile_input_, vsource, source_name_, AMD_COMGR_DATA_KIND_SOURCE)) {
+  if (!hip::helpers::addCodeObjData(compile_input_, vsource, source_name_,
+                                    AMD_COMGR_DATA_KIND_SOURCE)) {
    return false;
  }
  return true;
@@ -201,14 +202,15 @@ bool RTCCompileProgram::compile(const std::vector<std::string>& options, bool fg
  }

  if (fgpu_rdc_) {
-    if (!hip::helpers::compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) {
+    if (!hip::helpers::compileToBitCode(compile_input_, isa_, compileOpts, build_log_,
+                                        LLVMBitcode_)) {
      LogError("Error in hiprtc: unable to compile source to bitcode");
      return false;
    }
  } else {
    LogInfo("Using the new path of comgr");
-    if (!hip::helpers::compileToExecutable(compile_input_, isa_, compileOpts, link_options_, build_log_,
-                             executable_)) {
+    if (!hip::helpers::compileToExecutable(compile_input_, isa_, compileOpts, link_options_,
+                                           build_log_, executable_)) {
      LogError("Failing to compile to realloc");
      return false;
    }
@@ -234,7 +236,6 @@ void RTCCompileProgram::stripNamedExpression(std::string& strippedName) {
  if (strippedName.front() == '&') {
    strippedName.erase(0, 1);
  }
-
 }

 bool RTCCompileProgram::trackMangledName(std::string& name) {
@@ -249,8 +250,10 @@ bool RTCCompileProgram::trackMangledName(std::string& name) {

  std::string gcn_expr = "__amdgcn_name_expr_";
  std::string size = std::to_string(mangled_names_.size());
-  const auto var1{"\n static __device__ const void* " + gcn_expr + size + "[]= {\"" + strippedName + "\", (void*)&" + strippedName + "};"};
-  const auto var2{"\n static auto __amdgcn_name_expr_stub_" + size + " = " + gcn_expr + size + ";\n"};
+  const auto var1{"\n static __device__ const void* " + gcn_expr + size + "[]= {\"" + strippedName +
+                  "\", (void*)&" + strippedName + "};"};
+  const auto var2{"\n static auto __amdgcn_name_expr_stub_" + size + " = " + gcn_expr + size +
+                  ";\n"};
  const auto code{var1 + var2};

  source_code_ += code;
@@ -105,8 +105,7 @@ inline const char* ihipErrorString(hipError_t hip_error) {
 };

 // Building block functions:
-template <typename T>
-inline std::string ToHexString(T v) {
+template <typename T> inline std::string ToHexString(T v) {
  std::ostringstream ss;
  ss << "0x" << std::hex << v;
  return ss.str();
@@ -115,8 +114,7 @@ inline std::string ToHexString(T v) {
 //---
 // Template overloads for ToString to handle specific types

-template <typename T>
-inline std::string ToString(T* v) {
+template <typename T> inline std::string ToString(T* v) {
  std::ostringstream ss;
  if (v == NULL) {
    ss << "char array:<null>";
@@ -126,8 +124,7 @@ inline std::string ToString(T* v) {
  return ss.str();
 };

-template <typename T>
-inline std::string ToString(T** v) {
+template <typename T> inline std::string ToString(T** v) {
  std::ostringstream ss;
  if (v == NULL) {
    ss << "char array:<null>";
@@ -138,8 +135,7 @@ inline std::string ToString(T** v) {
 };

 // This is the default which works for most types:
-template <typename T>
-inline std::string ToString(T v) {
+template <typename T> inline std::string ToString(T v) {
  std::ostringstream ss;
  ss << v;
  return ss.str();
@@ -151,8 +147,7 @@ inline std::string ToString() { return (""); }
 //---
 // C++11 variadic template - peels off first argument, converts to string, and calls itself again to
 // peel the next arg. Strings are automatically separated by comma+space.
-template <typename T, typename... Args>
-inline std::string ToString(T first, Args... args) {
+template <typename T, typename... Args> inline std::string ToString(T first, Args... args) {
  return ToString(first) + ", " + ToString(args...);
 }

@@ -141,8 +141,8 @@ RUNTIME_ENTRY_RET(cl_command_queue, clCreateCommandQueueWithProperties,
  }

  if ((queueRTCUs != amd::CommandQueue::RealTimeDisabled) &&
-      ((queueRTCUs > amdDevice.info().numRTCUs_) || (queueRTCUs == 0)
-        || (queueRTCUs < amdDevice.info().granularityRTCUs_))) {
+      ((queueRTCUs > amdDevice.info().numRTCUs_) || (queueRTCUs == 0) ||
+       (queueRTCUs < amdDevice.info().granularityRTCUs_))) {
    *not_null(errcode_ret) = CL_INVALID_VALUE;
    return (cl_command_queue)0;
  }
@@ -32,32 +32,26 @@
 #include "vdi_common.hpp"

 //! Helper function to check "properties" parameter in various functions
-int checkContextProperties(
-    const cl_context_properties *properties,
-    bool*   offlineDevices);
+int checkContextProperties(const cl_context_properties* properties, bool* offlineDevices);

 namespace amd {

 template <typename T>
-static inline cl_int
-clGetInfo(
-    T& field,
-    size_t param_value_size,
-    void* param_value,
-    size_t* param_value_size_ret)
-{
+static inline cl_int clGetInfo(T& field, size_t param_value_size, void* param_value,
+                               size_t* param_value_size_ret) {
  const void* valuePtr;
  size_t valueSize;

-    std::tie(valuePtr, valueSize)
-        = detail::ParamInfo<typename std::remove_const<T>::type>::get(field);
+  std::tie(valuePtr, valueSize) =
+      detail::ParamInfo<typename std::remove_const<T>::type>::get(field);

  *not_null(param_value_size_ret) = valueSize;

  cl_int ret = CL_SUCCESS;
  if (param_value != NULL && param_value_size < valueSize) {
-        if ((param_value_size == 0) || !std::is_pointer<T>() || !std::is_same<typename
-            std::remove_const<typename std::remove_pointer<T>::type>::type, char>()) {
+    if ((param_value_size == 0) || !std::is_pointer<T>() ||
+        !std::is_same<typename std::remove_const<typename std::remove_pointer<T>::type>::type,
+                      char>()) {
      return CL_INVALID_VALUE;
    }
    // For char* and char[] params, we will at least fill up to
@@ -70,23 +64,19 @@ clGetInfo(
  if (param_value != NULL) {
    ::memcpy(param_value, valuePtr, valueSize);
    if (param_value_size > valueSize) {
-            ::memset(static_cast<address>(param_value) + valueSize,
-                '\0', param_value_size - valueSize);
+      ::memset(static_cast<address>(param_value) + valueSize, '\0', param_value_size - valueSize);
    }
  }

  return ret;
 }

-static inline cl_int
-clSetEventWaitList(
-    Command::EventWaitList& eventWaitList,
+static inline cl_int clSetEventWaitList(Command::EventWaitList& eventWaitList,
                                        const amd::HostQueue& hostQueue,
                                        cl_uint num_events_in_wait_list,
-    const cl_event* event_wait_list)
-{
-    if ((num_events_in_wait_list == 0 && event_wait_list != NULL)
-            || (num_events_in_wait_list != 0 && event_wait_list == NULL)) {
+                                        const cl_event* event_wait_list) {
+  if ((num_events_in_wait_list == 0 && event_wait_list != NULL) ||
+      (num_events_in_wait_list != 0 && event_wait_list == NULL)) {
    return CL_INVALID_EVENT_WAIT_LIST;
  }

@@ -108,14 +98,14 @@ clSetEventWaitList(
 }

 //! Common function declarations for CL-external graphics API interop
-cl_int clEnqueueAcquireExtObjectsAMD(cl_command_queue command_queue,
-    cl_uint num_objects, const cl_mem* mem_objects,
-    cl_uint num_events_in_wait_list, const cl_event* event_wait_list,
-    cl_event* event, cl_command_type cmd_type);
-cl_int clEnqueueReleaseExtObjectsAMD(cl_command_queue command_queue,
-    cl_uint num_objects, const cl_mem* mem_objects,
-    cl_uint num_events_in_wait_list, const cl_event* event_wait_list,
-    cl_event* event, cl_command_type cmd_type);
+cl_int clEnqueueAcquireExtObjectsAMD(cl_command_queue command_queue, cl_uint num_objects,
+                                     const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+                                     const cl_event* event_wait_list, cl_event* event,
+                                     cl_command_type cmd_type);
+cl_int clEnqueueReleaseExtObjectsAMD(cl_command_queue command_queue, cl_uint num_objects,
+                                     const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+                                     const cl_event* event_wait_list, cl_event* event,
+                                     cl_command_type cmd_type);
 static inline cl_int clDXTranslateErrorCode(cl_int err) {
  return err == CL_INVALID_GL_OBJECT ? CL_INVALID_MEM_OBJECT : err;
 }
@@ -125,30 +115,19 @@ static inline cl_int clDXTranslateErrorCode(cl_int err) {
 extern "C" {

 #if defined(CL_VERSION_1_1)
-extern CL_API_ENTRY cl_int CL_API_CALL
-clSetCommandQueueProperty(
-    cl_command_queue command_queue,
-    cl_command_queue_properties properties,
-    cl_bool enable,
+extern CL_API_ENTRY cl_int CL_API_CALL clSetCommandQueueProperty(
+    cl_command_queue command_queue, cl_command_queue_properties properties, cl_bool enable,
    cl_command_queue_properties* old_properties) CL_API_SUFFIX__VERSION_1_0;
 #endif  // CL_VERSION_1_1

-extern CL_API_ENTRY cl_mem CL_API_CALL
-clConvertImageAMD(
-    cl_context              context,
-    cl_mem                  image,
+extern CL_API_ENTRY cl_mem CL_API_CALL clConvertImageAMD(cl_context context, cl_mem image,
                                                         const cl_image_format* image_format,
                                                         cl_int* errcode_ret);

-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateBufferFromImageAMD(
-    cl_context              context,
-    cl_mem                  image,
+extern CL_API_ENTRY cl_mem CL_API_CALL clCreateBufferFromImageAMD(cl_context context, cl_mem image,
                                                                  cl_int* errcode_ret);

-extern CL_API_ENTRY cl_program CL_API_CALL
-clCreateProgramWithAssemblyAMD(
-    cl_context              context,
+extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithAssemblyAMD(cl_context context,
                                                                          cl_uint count,
                                                                          const char** strings,
                                                                          const size_t* lengths,
@@ -396,7 +396,6 @@ RUNTIME_ENTRY(cl_int, clEnqueueReleaseD3D10ObjectsKHR,
 RUNTIME_EXIT


-
 /*! @}
 *  \addtogroup CL-D3D10 interop helper functions
 *  @{
@@ -590,6 +589,4 @@ void amd::SyncD3D10Objects(std::vector<amd::Memory*>& memObjects) {
 }


-
-
 #endif  //_WIN32
@@ -28,34 +28,20 @@

 #include <utility>

-namespace amd
-{
+namespace amd {

 //! Functions for executing the D3D10 related stuff
-cl_mem clCreateBufferFromD3D10ResourceAMD(
-    Context&        amdContext,
-    cl_mem_flags    flags,
-    ID3D10Resource* pD3DResource,
+cl_mem clCreateBufferFromD3D10ResourceAMD(Context& amdContext, cl_mem_flags flags,
+                                          ID3D10Resource* pD3DResource, int* errcode_ret);
+cl_mem clCreateImage1DFromD3D10ResourceAMD(Context& amdContext, cl_mem_flags flags,
+                                           ID3D10Resource* pD3DResource, UINT subresource,
                                           int* errcode_ret);
-cl_mem clCreateImage1DFromD3D10ResourceAMD(
-    Context&        amdContext,
-    cl_mem_flags    flags,
-    ID3D10Resource* pD3DResource,
-    UINT            subresource,
+cl_mem clCreateImage2DFromD3D10ResourceAMD(Context& amdContext, cl_mem_flags flags,
+                                           ID3D10Resource* pD3DResource, UINT subresource,
                                           int* errcode_ret);
-cl_mem clCreateImage2DFromD3D10ResourceAMD(
-    Context&        amdContext,
-    cl_mem_flags    flags,
-    ID3D10Resource* pD3DResource,
-    UINT            subresource,
-    int*            errcode_ret);
-cl_mem clCreateImage3DFromD3D10ResourceAMD(
-    Context&        amdContext,
-    cl_mem_flags    flags,
-    ID3D10Resource* pD3DResource,
-    UINT            subresource,
+cl_mem clCreateImage3DFromD3D10ResourceAMD(Context& amdContext, cl_mem_flags flags,
+                                           ID3D10Resource* pD3DResource, UINT subresource,
                                           int* errcode_ret);
 void SyncD3D10Objects(std::vector<amd::Memory*>& memObjects);

 }  // namespace amd
-
@@ -623,6 +623,4 @@ void amd::SyncD3D11Objects(std::vector<amd::Memory*>& memObjects) {
 }


-
-
 #endif  //_WIN32
@@ -28,41 +28,24 @@

 #include <utility>

-extern CL_API_ENTRY cl_mem CL_API_CALL
-clGetPlaneFromImageAMD(
-    cl_context /* context */,
-    cl_mem     /* mem */,
-    cl_uint    /* plane */,
+extern CL_API_ENTRY cl_mem CL_API_CALL clGetPlaneFromImageAMD(cl_context /* context */,
+                                                              cl_mem /* mem */, cl_uint /* plane */,
                                                              cl_int* /* errcode_ret */);

-namespace amd
-{
+namespace amd {

 //! Functions for executing the D3D11 related stuff
-cl_mem clCreateBufferFromD3D11ResourceAMD(
-    Context&        amdContext,
-    cl_mem_flags    flags,
-    ID3D11Resource* pD3DResource,
+cl_mem clCreateBufferFromD3D11ResourceAMD(Context& amdContext, cl_mem_flags flags,
+                                          ID3D11Resource* pD3DResource, int* errcode_ret);
+cl_mem clCreateImage1DFromD3D11ResourceAMD(Context& amdContext, cl_mem_flags flags,
+                                           ID3D11Resource* pD3DResource, UINT subresource,
                                           int* errcode_ret);
-cl_mem clCreateImage1DFromD3D11ResourceAMD(
-    Context&        amdContext,
-    cl_mem_flags    flags,
-    ID3D11Resource* pD3DResource,
-    UINT            subresource,
+cl_mem clCreateImage2DFromD3D11ResourceAMD(Context& amdContext, cl_mem_flags flags,
+                                           ID3D11Resource* pD3DResource, UINT subresource,
                                           int* errcode_ret);
-cl_mem clCreateImage2DFromD3D11ResourceAMD(
-    Context&        amdContext,
-    cl_mem_flags    flags,
-    ID3D11Resource* pD3DResource,
-    UINT            subresource,
-    int*            errcode_ret);
-cl_mem clCreateImage3DFromD3D11ResourceAMD(
-    Context&        amdContext,
-    cl_mem_flags    flags,
-    ID3D11Resource* pD3DResource,
-    UINT            subresource,
+cl_mem clCreateImage3DFromD3D11ResourceAMD(Context& amdContext, cl_mem_flags flags,
+                                           ID3D11Resource* pD3DResource, UINT subresource,
                                           int* errcode_ret);
 void SyncD3D11Objects(std::vector<amd::Memory*>& memObjects);

 }  // namespace amd
-
@@ -29,18 +29,13 @@

 #include <utility>

-namespace amd
-{
+namespace amd {

-cl_mem clCreateImage2DFromD3D9ResourceAMD(
-    Context&        amdContext,
-    cl_mem_flags    flags,
+cl_mem clCreateImage2DFromD3D9ResourceAMD(Context& amdContext, cl_mem_flags flags,
                                          cl_dx9_media_adapter_type_khr adapter_type,
-    cl_dx9_surface_info_khr*  surface_info,
-    cl_uint         plane,
+                                          cl_dx9_surface_info_khr* surface_info, cl_uint plane,
                                          int* errcode_ret);

 void SyncD3D9Objects(std::vector<amd::Memory*>& memObjects);

 }  // namespace amd
-
@@ -127,7 +127,8 @@ RUNTIME_ENTRY(cl_int, clGetPlatformInfo,
      value = "Advanced Micro Devices, Inc.";
      break;
    case CL_PLATFORM_EXTENSIONS:
-      value = "cl_khr_icd "
+      value =
+          "cl_khr_icd "
 #ifdef _WIN32
          "cl_khr_d3d10_sharing "
          "cl_khr_d3d11_sharing "
@@ -403,7 +404,8 @@ RUNTIME_ENTRY(cl_int, clGetDeviceInfo,
        CASE(CL_DEVICE_WAVEFRONT_WIDTH_AMD, wavefrontWidth_);
      case CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD: {
        cl_uint globalMemChannels = as_amd(device)->info().vramBusBitWidth_ / 32;
-            return amd::clGetInfo(globalMemChannels, param_value_size, param_value, param_value_size_ret);
+        return amd::clGetInfo(globalMemChannels, param_value_size, param_value,
+                              param_value_size_ret);
      }
        CASE(CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, globalMemChannelBanks_);
        CASE(CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, globalMemChannelBankWidth_);
@@ -227,7 +227,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueNDRangeKernel,
        return CL_INVALID_WORK_GROUP_SIZE;
      }
      // >32bits global work size is not supported.
-      if ((global_work_size[dim] == 0) || (global_work_size[dim] > static_cast<size_t>(0xffffffff))) {
+      if ((global_work_size[dim] == 0) ||
+          (global_work_size[dim] > static_cast<size_t>(0xffffffff))) {
        return CL_INVALID_GLOBAL_WORK_SIZE;
      }
      numWorkItems *= local_work_size[dim];
@@ -252,8 +253,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueNDRangeKernel,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -461,8 +462,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueNativeKernel,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -642,8 +643,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueMarkerWithWaitList,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, *hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, *hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -798,8 +799,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueBarrierWithWaitList,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, *hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, *hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -964,9 +965,7 @@ RUNTIME_EXIT
 *
 */
 RUNTIME_ENTRY(cl_int, clGetDeviceAndHostTimer,
-              (cl_device_id device, cl_ulong * device_timestamp,
-               cl_ulong * host_timestamp)) {
-
+              (cl_device_id device, cl_ulong* device_timestamp, cl_ulong* host_timestamp)) {
  if (!is_valid(device)) {
    return CL_INVALID_DEVICE;
  }
@@ -1012,9 +1011,7 @@ RUNTIME_EXIT
 *    by the OpenCL implementation on the host.
 *
 */
-RUNTIME_ENTRY(cl_int, clGetHostTimer,
-              (cl_device_id device, cl_ulong * host_timestamp)) {
-
+RUNTIME_ENTRY(cl_int, clGetHostTimer, (cl_device_id device, cl_ulong* host_timestamp)) {
  if (!is_valid(device)) {
    return CL_INVALID_DEVICE;
  }
@@ -431,8 +431,9 @@ RUNTIME_EXIT
 *
 *  \version 1.0r29
 */
-RUNTIME_ENTRY_RET(cl_mem, clCreateFromGLRenderbuffer, (cl_context context, cl_mem_flags flags,
-                                                       GLuint renderbuffer, cl_int* errcode_ret)) {
+RUNTIME_ENTRY_RET(cl_mem, clCreateFromGLRenderbuffer,
+                  (cl_context context, cl_mem_flags flags, GLuint renderbuffer,
+                   cl_int* errcode_ret)) {
  cl_mem clMemObj = NULL;

  if (!is_valid(context)) {
@@ -902,8 +903,7 @@ RUNTIME_ENTRY(cl_int, clGetGLContextInfoKHR,

      cl_device_id* devices = (cl_device_id*)alloca(size);

-      errcode = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, total_devices,
-                               devices, NULL);
+      errcode = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, total_devices, devices, NULL);
      if (errcode != CL_SUCCESS) {
        return errcode;
      }
@@ -1438,10 +1438,10 @@ cl_mem clCreateFromGLTextureAMD(Context& amdContext, cl_mem_flags clFlags, GLenu
    target = (glTarget == GL_TEXTURE_CUBE_MAP) ? target : 0;

    if (wholeMipmap) {
-      pImageGL = new (amdContext)
-          ImageGL(amdContext, clType, clFlags, clImageFormat, static_cast<size_t>(gliTexWidth),
-                static_cast<size_t>(gliTexHeight), static_cast<size_t>(gliTexDepth), glTarget,
-                texture, miplevel, glInternalFormat, clGLType, numSamples, gliTexMaxLevel, target);
+      pImageGL = new (amdContext) ImageGL(
+          amdContext, clType, clFlags, clImageFormat, static_cast<size_t>(gliTexWidth),
+          static_cast<size_t>(gliTexHeight), static_cast<size_t>(gliTexDepth), glTarget, texture,
+          miplevel, glInternalFormat, clGLType, numSamples, gliTexMaxLevel, target);
    } else {
      pImageGL = new (amdContext)
          ImageGL(amdContext, clType, clFlags, clImageFormat, static_cast<size_t>(gliTexWidth),
@@ -1643,8 +1643,7 @@ cl_int clEnqueueAcquireExtObjectsAMD(cl_command_queue command_queue, cl_uint num
  }

  amd::Command::EventWaitList eventWaitList;
-  err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                event_wait_list);
+  err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -1716,8 +1715,7 @@ cl_int clEnqueueReleaseExtObjectsAMD(cl_command_queue command_queue, cl_uint num
  }

  amd::Command::EventWaitList eventWaitList;
-  err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                event_wait_list);
+  err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -86,8 +86,8 @@ cl_icd_dispatch amd::ICDDispatchedObject::icdVendorDispatch_[] = {
     WINDOWS_SWITCH(clEnqueueAcquireD3D10ObjectsKHR, NULL),
     WINDOWS_SWITCH(clEnqueueReleaseD3D10ObjectsKHR, NULL), clSetEventCallback, clCreateSubBuffer,
     clSetMemObjectDestructorCallback, clCreateUserEvent, clSetUserEventStatus,
-     clEnqueueReadBufferRect, clEnqueueWriteBufferRect, clEnqueueCopyBufferRect,
-     NULL, NULL, NULL, clCreateEventFromGLsyncKHR,
+     clEnqueueReadBufferRect, clEnqueueWriteBufferRect, clEnqueueCopyBufferRect, NULL, NULL, NULL,
+     clCreateEventFromGLsyncKHR,

     /* OpenCL 1.2*/
     clCreateSubDevices, clRetainDevice, clReleaseDevice, clCreateImage,
@@ -114,23 +114,16 @@ cl_icd_dispatch amd::ICDDispatchedObject::icdVendorDispatch_[] = {
         clEnqueueReleaseDX9MediaSurfacesKHR,
         NULL),  // KHRpfn_clEnqueueReleaseDX9MediaSurfacesKHR clEnqueueReleaseDX9MediaSurfacesKHR;

-     NULL,
-     NULL, NULL, NULL,
+     NULL, NULL, NULL, NULL,

     clCreateCommandQueueWithProperties, clCreatePipe, clGetPipeInfo, clSVMAlloc, clSVMFree,
     clEnqueueSVMFree, clEnqueueSVMMemcpy, clEnqueueSVMMemFill, clEnqueueSVMMap, clEnqueueSVMUnmap,
     clCreateSamplerWithProperties, clSetKernelArgSVMPointer, clSetKernelExecInfo,
-     clGetKernelSubGroupInfo,
-     clCloneKernel,
-     clCreateProgramWithIL,
-     clEnqueueSVMMigrateMem,
-     clGetDeviceAndHostTimer,
-     clGetHostTimer,
-     clGetKernelSubGroupInfo,
+     clGetKernelSubGroupInfo, clCloneKernel, clCreateProgramWithIL, clEnqueueSVMMigrateMem,
+     clGetDeviceAndHostTimer, clGetHostTimer, clGetKernelSubGroupInfo,
     clSetDefaultDeviceCommandQueue,

-     clSetProgramReleaseCallback,
-     clSetProgramSpecializationConstant }};
+     clSetProgramReleaseCallback, clSetProgramSpecializationConstant}};

 CL_API_ENTRY cl_int CL_API_CALL clIcdGetPlatformIDsKHR(cl_uint num_entries,
                                                       cl_platform_id* platforms,
@@ -154,8 +154,7 @@ typedef cl_program(CL_API_CALL* clCreateProgramWithSource_fn)(
    const size_t* /* lengths */, cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;

 extern CL_API_ENTRY cl_program CL_API_CALL
-clCreateProgramWithIL(cl_context /* context */,
-    const void * /* strings */, size_t /* lengths */,
+clCreateProgramWithIL(cl_context /* context */, const void* /* strings */, size_t /* lengths */,
                      cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_2_0;

 typedef cl_program(CL_API_CALL* clCreateProgramWithILKHR_fn)(
@@ -564,9 +563,9 @@ typedef cl_kernel(CL_API_CALL* clCloneKernel_fn)(

 typedef cl_int(CL_API_CALL* clEnqueueSVMMigrateMem_fn)(
    cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */,
-    const void ** /* svm_pointers */, const size_t * /* sizes */,
-    cl_mem_migration_flags /* flags */, cl_uint /* num_events_in_wait_list */,
-    const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_1;
+    const void** /* svm_pointers */, const size_t* /* sizes */, cl_mem_migration_flags /* flags */,
+    cl_uint /* num_events_in_wait_list */, const cl_event* /* event_wait_list */,
+    cl_event* /* event */) CL_API_SUFFIX__VERSION_2_1;

 typedef cl_int(CL_API_CALL* clGetDeviceAndHostTimer_fn)(
    cl_device_id /* device */, cl_ulong* /* device_timestamp */,
@@ -72,7 +72,8 @@ static bool validateFlags(cl_mem_flags flags, bool chkReadWrite = false) {

  if (temp &&
      !(CL_MEM_READ_WRITE == temp || CL_MEM_WRITE_ONLY == temp ||
-        (chkReadWrite && (CL_MEM_KERNEL_READ_AND_WRITE == temp ||
+        (chkReadWrite &&
+         (CL_MEM_KERNEL_READ_AND_WRITE == temp ||
          (CL_MEM_KERNEL_READ_AND_WRITE | CL_MEM_READ_WRITE) == temp)) ||
        CL_MEM_READ_ONLY == temp)) {
    return false;
@@ -88,8 +89,9 @@ static bool validateFlags(cl_mem_flags flags, bool chkReadWrite = false) {
  }

  if ((flags & CL_MEM_EXTERNAL_PHYSICAL_AMD) &&
-      (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR |
-                CL_MEM_READ_WRITE | CL_MEM_READ_ONLY))) {
+      (flags &
+       (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE |
+        CL_MEM_READ_ONLY))) {
    return false;
  }

@@ -231,8 +233,8 @@ static bool validateImageDescriptor(const std::vector<amd::Device*>& devices,
      }
      if (imageRowPitch == 0) {
        if (desc->mem_object != nullptr) {
-          imageRowPitch = amd::alignUp(desc->image_width,
-                                       devices[0]->info().imagePitchAlignment_) * elemSize;
+          imageRowPitch =
+              amd::alignUp(desc->image_width, devices[0]->info().imagePitchAlignment_) * elemSize;
        } else {
          imageRowPitch = desc->image_width * elemSize;
        }
@@ -331,8 +333,9 @@ class ImageViewRef : public amd::EmbeddedObject {
 *
 *  \version 1.0r33
 */
-RUNTIME_ENTRY_RET(cl_mem, clCreateBuffer, (cl_context context, cl_mem_flags flags, size_t size,
-                                           void* host_ptr, cl_int* errcode_ret)) {
+RUNTIME_ENTRY_RET(cl_mem, clCreateBuffer,
+                  (cl_context context, cl_mem_flags flags, size_t size, void* host_ptr,
+                   cl_int* errcode_ret)) {
  if (!is_valid(context)) {
    *not_null(errcode_ret) = CL_INVALID_CONTEXT;
    return NULL;
@@ -411,7 +414,8 @@ RUNTIME_ENTRY_RET(cl_mem, clCreateBuffer, (cl_context context, cl_mem_flags flag

  // check extensions flag consistency
  if ((flags & CL_MEM_USE_PERSISTENT_MEM_AMD) &&
-      (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_EXTERNAL_PHYSICAL_AMD |
+      (flags &
+       (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_EXTERNAL_PHYSICAL_AMD |
        CL_MEM_BUS_ADDRESSABLE_AMD))) {
    *not_null(errcode_ret) = CL_INVALID_VALUE;
    LogWarning("conflicting flags CL_MEM_USE_PERSISTENT_MEM_AMD and host memory specific flags");
@@ -630,16 +634,16 @@ RUNTIME_ENTRY(cl_int, clEnqueueReadBuffer,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }

  amd::CopyMetadata copyMetadata(!blocking_read, amd::CopyMetadata::CopyEnginePreference::SDMA);
-  amd::ReadMemoryCommand* command = new amd::ReadMemoryCommand(
-      hostQueue, CL_COMMAND_READ_BUFFER, eventWaitList, *srcBuffer, srcOffset, srcSize,
-      ptr, 0, 0, copyMetadata);
+  amd::ReadMemoryCommand* command =
+      new amd::ReadMemoryCommand(hostQueue, CL_COMMAND_READ_BUFFER, eventWaitList, *srcBuffer,
+                                 srcOffset, srcSize, ptr, 0, 0, copyMetadata);

  if (command == NULL) {
    return CL_OUT_OF_HOST_MEMORY;
@@ -772,16 +776,16 @@ RUNTIME_ENTRY(cl_int, clEnqueueWriteBuffer,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }

  amd::CopyMetadata copyMetadata(!blocking_write, amd::CopyMetadata::CopyEnginePreference::SDMA);
-  amd::WriteMemoryCommand* command = new amd::WriteMemoryCommand(
-      hostQueue, CL_COMMAND_WRITE_BUFFER, eventWaitList, *dstBuffer, dstOffset, dstSize,
-      ptr, 0, 0, copyMetadata);
+  amd::WriteMemoryCommand* command =
+      new amd::WriteMemoryCommand(hostQueue, CL_COMMAND_WRITE_BUFFER, eventWaitList, *dstBuffer,
+                                  dstOffset, dstSize, ptr, 0, 0, copyMetadata);

  if (command == NULL) {
    return CL_OUT_OF_HOST_MEMORY;
@@ -897,14 +901,15 @@ RUNTIME_ENTRY(cl_int, clEnqueueCopyBuffer,
    return CL_INVALID_VALUE;
  }

-  if (srcBuffer == dstBuffer && ((src_offset <= dst_offset && dst_offset < src_offset + cb) ||
+  if (srcBuffer == dstBuffer &&
+      ((src_offset <= dst_offset && dst_offset < src_offset + cb) ||
       (dst_offset <= src_offset && src_offset < dst_offset + cb))) {
    return CL_MEM_COPY_OVERLAP;
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -1086,8 +1091,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueReadBufferRect,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -1271,8 +1276,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueWriteBufferRect,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -1453,8 +1458,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueCopyBufferRect,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -2223,8 +2228,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueReadImage,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -2410,17 +2415,16 @@ RUNTIME_ENTRY(cl_int, clEnqueueWriteImage,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }

  amd::CopyMetadata copyMetadata(!blocking_write, amd::CopyMetadata::CopyEnginePreference::SDMA);
-  amd::WriteMemoryCommand* command =
-      new amd::WriteMemoryCommand(hostQueue, CL_COMMAND_WRITE_IMAGE, eventWaitList, *dstImage,
-                                  dstOrigin, dstRegion, ptr, input_row_pitch, input_slice_pitch,
-                                  copyMetadata);
+  amd::WriteMemoryCommand* command = new amd::WriteMemoryCommand(
+      hostQueue, CL_COMMAND_WRITE_IMAGE, eventWaitList, *dstImage, dstOrigin, dstRegion, ptr,
+      input_row_pitch, input_slice_pitch, copyMetadata);

  if (command == NULL) {
    return CL_OUT_OF_HOST_MEMORY;
@@ -2591,8 +2595,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueCopyImage,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -2765,8 +2769,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueCopyImageToBuffer,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -2919,8 +2923,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueCopyBufferToImage,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -3095,8 +3099,8 @@ RUNTIME_ENTRY_RET(void*, clEnqueueMapBuffer,

  // Wait for possible pending operations
  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    *not_null(errcode_ret) = err;
    return (void*)0;
@@ -3367,8 +3371,8 @@ RUNTIME_ENTRY_RET(void*, clEnqueueMapImage,

  // Wait for possible pending operations
  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    *not_null(errcode_ret) = err;
    return (void*)0;
@@ -3382,8 +3386,8 @@ RUNTIME_ENTRY_RET(void*, clEnqueueMapImage,
    return NULL;
  }
  // Attempt to allocate the map target now (whether blocking or non-blocking)
-  void* mapPtr = mem->allocMapTarget(srcOrigin, srcRegion, map_flags,
-                                     image_row_pitch, image_slice_pitch);
+  void* mapPtr =
+      mem->allocMapTarget(srcOrigin, srcRegion, map_flags, image_row_pitch, image_slice_pitch);
  if (NULL == mapPtr) {
    *not_null(errcode_ret) = CL_MAP_FAILURE;
    return NULL;
@@ -3513,8 +3517,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueUnmapMemObject,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -4035,8 +4039,7 @@ RUNTIME_ENTRY_RET(cl_mem, clCreateImage,
    size_t maxDim = std::max(image_desc->image_width, image_desc->image_height);
    maxDim = std::max(maxDim, image_desc->image_depth);
    uint mipLevels;
-    for (mipLevels = 0; maxDim > 0; maxDim >>= 1, mipLevels++)
-      ;
+    for (mipLevels = 0; maxDim > 0; maxDim >>= 1, mipLevels++);
    if (mipLevels < image_desc->num_mip_levels) {
      *not_null(errcode_ret) = CL_INVALID_MIP_LEVEL;
      LogWarning("Invalid mip level");
@@ -4116,10 +4119,10 @@ RUNTIME_ENTRY_RET(cl_mem, clCreateImage,
        return (cl_mem)0;
      }

-      image = new (amdContext) amd::Image(
-          buffer, CL_MEM_OBJECT_IMAGE1D_BUFFER, (flags != 0) ? flags : buffer.getMemFlags(),
-          imageFormat, image_desc->image_width, 1, 1, imageRowPitch, imageSlicePitch,
-          image_desc->num_mip_levels);
+      image = new (amdContext) amd::Image(buffer, CL_MEM_OBJECT_IMAGE1D_BUFFER,
+                                          (flags != 0) ? flags : buffer.getMemFlags(), imageFormat,
+                                          image_desc->image_width, 1, 1, imageRowPitch,
+                                          imageSlicePitch, image_desc->num_mip_levels);
    } break;
    case CL_MEM_OBJECT_IMAGE1D_ARRAY:
      image =
@@ -4273,8 +4276,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueFillBuffer,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -4450,8 +4453,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueFillImage,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -4601,8 +4604,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueMigrateMemObjects,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -28,9 +28,9 @@ extern "C" {
 #endif /*__cplusplus*/

 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueCopyBufferP2PAMD(
-    cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
-    size_t src_offset, size_t dst_offset, size_t cb, cl_uint num_events_in_wait_list,
-    const cl_event* event_wait_list, cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
+    cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset,
+    size_t dst_offset, size_t cb, cl_uint num_events_in_wait_list, const cl_event* event_wait_list,
+    cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;

 #ifdef __cplusplus
 } /*extern "C"*/
@@ -189,8 +189,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueBeginPerfCounterAMD,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, *hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, *hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -275,8 +275,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueEndPerfCounterAMD,
  }

  amd::Command::EventWaitList eventWaitList;
-  cl_int err = amd::clSetEventWaitList(eventWaitList, *hostQueue, num_events_in_wait_list,
-                                       event_wait_list);
+  cl_int err =
+      amd::clSetEventWaitList(eventWaitList, *hostQueue, num_events_in_wait_list, event_wait_list);
  if (err != CL_SUCCESS) {
    return err;
  }
@@ -46,25 +46,29 @@ enum PerfcounterInfo {
 * Set device clock mode data
 *********************************/
 enum cl_DeviceClockMode_AMD {
-  CL_DEVICE_CLOCK_MODE_DEFAULT_AMD = 0x0, /*Device clocks and other power settings are restored to default*/
-  CL_DEVICE_CLOCK_MODE_QUERY_AMD = 0x1, /*Queries the current device clock ratios. Leaves the clock mode of the device unchanged*/
+  CL_DEVICE_CLOCK_MODE_DEFAULT_AMD =
+      0x0, /*Device clocks and other power settings are restored to default*/
+  CL_DEVICE_CLOCK_MODE_QUERY_AMD = 0x1, /*Queries the current device clock ratios. Leaves the clock
+                                           mode of the device unchanged*/
  CL_DEVICE_CLOCK_MODE_PROFILING_AMD = 0x2, /*Scale down from peak ratio*/
-  CL_DEVICE_CLOCK_MODE_MINIMUMMEMORY_AMD = 0x3, /* Memory clock is set to the lowest available level*/
-  CL_DEVICE_CLOCK_MODE_MINIMUMENGINE_AMD = 0x4, /*Engine clock is set to the lowest available level*/
+  CL_DEVICE_CLOCK_MODE_MINIMUMMEMORY_AMD =
+      0x3, /* Memory clock is set to the lowest available level*/
+  CL_DEVICE_CLOCK_MODE_MINIMUMENGINE_AMD =
+      0x4,                             /*Engine clock is set to the lowest available level*/
  CL_DEVICE_CLOCK_MODE_PEAK_AMD = 0x5, /*Clocks set to maximum when possible. Fan set to maximum.*/
-  CL_DEVICE_CLOCK_MODE_QUERYPROFILING_AMD = 0x6, /*Queries the profiling device clock ratios. Leaves the clock mode of the device unchanged*/
-  CL_DEVICE_CLOCK_MODE_QUERYPEAK_AMD = 0x7, /*Queries the peak device clock ratios.Leaves the clock mode of the device unchanged*/
+  CL_DEVICE_CLOCK_MODE_QUERYPROFILING_AMD = 0x6, /*Queries the profiling device clock ratios. Leaves
+                                                    the clock mode of the device unchanged*/
+  CL_DEVICE_CLOCK_MODE_QUERYPEAK_AMD =
+      0x7, /*Queries the peak device clock ratios.Leaves the clock mode of the device unchanged*/
  CL_DEVICE_CLOCK_MODE_COUNT_AMD = 0x8, /*Maxmium count of device clock mode*/
 };

-typedef struct _cl_set_device_clock_mode_input_amd
-{
+typedef struct _cl_set_device_clock_mode_input_amd {
  /* specify the clock mode for AMD GPU device*/
  cl_DeviceClockMode_AMD clock_mode;
 } cl_set_device_clock_mode_input_amd;

-typedef struct _cl_set_device_clock_mode_output_amd
-{
+typedef struct _cl_set_device_clock_mode_output_amd {
  /*Ratio of current mem clock to peak clock as obtained from DeviceProperties::maxGpuClock*/
  cl_float memory_clock_ratio_to_peak;
  /*Ratio of current gpu core clock to peak clock as obtained from DeviceProperties::maxGpuClock*/
@@ -1238,9 +1238,9 @@ RUNTIME_EXIT
 * \version 2.2-3
 */
 RUNTIME_ENTRY(cl_int, clSetProgramReleaseCallback,
-              (cl_program program, void (CL_CALLBACK *pfn_notify)(
-                  cl_program program, void *user_data
-                  ), void *user_data)) {
+              (cl_program program,
+               void(CL_CALLBACK* pfn_notify)(cl_program program, void* user_data),
+               void* user_data)) {
  if (!is_valid(program)) {
    return CL_INVALID_PROGRAM;
  }
@@ -1375,8 +1375,9 @@ RUNTIME_EXIT
 *
 *  \version 1.0r33
 */
-RUNTIME_ENTRY(cl_int, clCreateKernelsInProgram, (cl_program program, cl_uint num_kernels,
-                                                 cl_kernel* kernels, cl_uint* num_kernels_ret)) {
+RUNTIME_ENTRY(cl_int, clCreateKernelsInProgram,
+              (cl_program program, cl_uint num_kernels, cl_kernel* kernels,
+               cl_uint* num_kernels_ret)) {
  if (!is_valid(program)) {
    return CL_INVALID_PROGRAM;
  }
@@ -1475,8 +1476,7 @@ RUNTIME_EXIT
 *
 *  \version 2.1r01
 */
-RUNTIME_ENTRY_RET(cl_kernel, clCloneKernel,
-                  (cl_kernel source_kernel, cl_int* errcode_ret)) {
+RUNTIME_ENTRY_RET(cl_kernel, clCloneKernel, (cl_kernel source_kernel, cl_int* errcode_ret)) {
  if (!is_valid(source_kernel)) {
    *not_null(errcode_ret) = CL_INVALID_KERNEL;
    return (cl_kernel)0;
@@ -230,9 +230,10 @@ RUNTIME_EXIT
 *
 *  \version 1.0r33
 */
-RUNTIME_ENTRY_RET(cl_sampler, clCreateSampler, (cl_context context, cl_bool normalized_coords,
-                                                cl_addressing_mode addressing_mode,
-                                                cl_filter_mode filter_mode, cl_int* errcode_ret)) {
+RUNTIME_ENTRY_RET(cl_sampler, clCreateSampler,
+                  (cl_context context, cl_bool normalized_coords,
+                   cl_addressing_mode addressing_mode, cl_filter_mode filter_mode,
+                   cl_int* errcode_ret)) {
  const cl_sampler_properties sprops[] = {CL_SAMPLER_NORMALIZED_COORDS,
                                          static_cast<cl_sampler_properties>(normalized_coords),
                                          CL_SAMPLER_ADDRESSING_MODE,
--- a/Show More
+++ b/Show More