P4 to Git Change 1780358 by gandryey@gera-win10 on 2019/05/08 18:46:22

SWDEV-79445 - OCL generic changes and code clean-up - Run google autoformat over the PAL backend. It will allow to enable autoformat in VS for the future changes. - No functional changes Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.hpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#29 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#8 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#12 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#10 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.cpp#20 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.hpp#10 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugger.hpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugmanager.cpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#52 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#133 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#37 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d10.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d11.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d9.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevicegl.cpp#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.cpp#13 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palgpuopen.hpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#78 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#28 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#24 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.hpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#93 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#38 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#73 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#27 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#79 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#22 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.hpp#4 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#132 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#60 edit [ROCm/clr commit: 699a12bfa2]
2019-05-08 19:22:02 -04:00
@@ -11,8 +11,9 @@ namespace pal {

 AppProfile::AppProfile()
    : amd::AppProfile(), enableHighPerformanceState_(true), reportAsOCL12Device_(false) {
-  propertyDataMap_.insert({"HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)});
+  propertyDataMap_.insert(
+      {"HighPerfState", PropertyData(DataType_Boolean, &enableHighPerformanceState_)});

  propertyDataMap_.insert({"OCL12Device", PropertyData(DataType_Boolean, &reportAsOCL12Device_)});
 }
-}
+}  // namespace pal
@@ -20,4 +20,4 @@ class AppProfile : public amd::AppProfile {
  bool enableHighPerformanceState_;
  bool reportAsOCL12Device_;
 };
-}
+}  // namespace pal
@@ -280,8 +280,8 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
    amd::Coord3D copySize(tmpSize, 0, 0);

    // Copy data into the temporary buffer, using CPU
-    if (!xferBuf.hostWrite(&gpu(), reinterpret_cast<const char*>(srcHost) + offset,
-        src, copySize, flags)) {
+    if (!xferBuf.hostWrite(&gpu(), reinterpret_cast<const char*>(srcHost) + offset, src, copySize,
+                           flags)) {
      return false;
    }

@@ -296,7 +296,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
    srcOffset += tmpSize;
    if ((srcOffset + tmpSize) > gpu().xferWrite().MaxSize()) {
      srcOffset = 0;
-      flags =  0;
+      flags = 0;
    } else {
      flags = Resource::NoWait;
    }
@@ -310,7 +310,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
  // Use host copy if memory has direct access or it's persistent
  if (setup_.disableWriteBuffer_ ||
      (gpuMem(dstMemory).isHostMemDirectAccess() &&
-      (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
+       (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
      gpuMem(dstMemory).isPersistentDirectMap()) {
    return HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
  } else {
@@ -335,7 +335,7 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
      // Copy memory, using pinning
      while (dstSize > 0) {
        size_t tmpSize;
-          // If it's the first iterarion, then readjust the copy size
+        // If it's the first iteration, then readjust the copy size
        // to include alignment
        if (first) {
          pinAllocSize = amd::alignUp(pinSize + partial, PinnedMemoryAlignment);
@@ -398,7 +398,7 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
  // Use host copy if memory has direct access or it's persistent
  if (setup_.disableWriteBufferRect_ ||
      (dstMemory.isHostMemDirectAccess() &&
-      (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
+       (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
      gpuMem(dstMemory).isPersistentDirectMap()) {
    return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
  } else {
@@ -586,8 +586,8 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
                                                entire, rowPitch, slicePitch);
  } else {
    // Use PAL path for a transfer
-    result = gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin,
-        size, gpuMem(dstMemory));
+    result =
+        gpuMem(srcMemory).partialMemCopyTo(gpu(), srcOrigin, dstOrigin, size, gpuMem(dstMemory));

    // Check if a HostBlit transfer is required
    if (completeOperation_ && !result) {
@@ -947,8 +947,8 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo

  void* param = kernel->parameters().values() + desc.offset_;
  assert((desc.type_ == T_POINTER || value != NULL ||
-    (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL)) &&
-    "not a valid local mem arg");
+          (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL)) &&
+         "not a valid local mem arg");

  uint32_t uint32_value = 0;
  uint64_t uint64_value = 0;
@@ -957,14 +957,15 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
  if (desc.type_ == T_POINTER && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
    if ((value == NULL) || (static_cast<const cl_mem*>(value) == NULL)) {
      reinterpret_cast<Memory**>(kernel->parameters().values() +
-        kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
+                                 kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
+          nullptr;
    } else {
      // convert cl_mem to amd::Memory*, return false if invalid.
-      LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>((
-        *static_cast<Memory* const*>(value))->virtualAddress());
+      LP64_SWITCH(uint32_value, uint64_value) =
+          static_cast<uintptr_t>((*static_cast<Memory* const*>(value))->virtualAddress());
      reinterpret_cast<Memory**>(kernel->parameters().values() +
-        kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
-        *static_cast<Memory* const*>(value);
+                                 kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
+          *static_cast<Memory* const*>(value);
      // Note: Special case for image SRD, which is 64 bit always
      if (LP64_SWITCH(true, false) &&
          (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject)) {
@@ -1018,8 +1019,8 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
  bool releaseView = false;
  bool result = false;
  amd::Image::Format newFormat(gpuMem(dstMemory).desc().format_);
-  bool swapLayer = (dstView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
-       dev().settings().gfx10Plus_;
+  bool swapLayer =
+      (dstView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_;

  // Find unsupported formats
  for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
@@ -1078,10 +1079,10 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
    // Swap the Y and Z components, apparently gfx10 HW expects
    // layer in Z
    if (swapLayer) {
-        globalWorkSize[2] = globalWorkSize[1];
-        globalWorkSize[1] = 1;
-        localWorkSize[2] = localWorkSize[1];
-        localWorkSize[1] = 1;
+      globalWorkSize[2] = globalWorkSize[1];
+      globalWorkSize[1] = 1;
+      localWorkSize[2] = localWorkSize[1];
+      localWorkSize[1] = 1;
    }
  } else {
    globalWorkSize[0] = amd::alignUp(size[0], 8);
@@ -1114,10 +1115,10 @@ bool KernelBlitManager::copyBufferToImageKernel(device::Memory& srcMemory,
  cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};

  if (swapLayer) {
-      dstOrg[2] = dstOrg[1];
-      dstOrg[1] = 0;
-      copySize[2] = copySize[1];
-      copySize[1] = 1;
+    dstOrg[2] = dstOrg[1];
+    dstOrg[1] = 0;
+    copySize[2] = copySize[1];
+    copySize[1] = 1;
  }

  setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
@@ -1338,8 +1339,8 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
  bool releaseView = false;
  bool result = false;
  amd::Image::Format newFormat(gpuMem(srcMemory).desc().format_);
-  bool swapLayer = (srcView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
-       dev().settings().gfx10Plus_;
+  bool swapLayer =
+      (srcView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_;

  // Find unsupported formats
  for (uint i = 0; i < RejectedFormatDataTotal; ++i) {
@@ -1398,10 +1399,10 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
    // Swap the Y and Z components, apparently gfx10 HW expects
    // layer in Z
    if (swapLayer) {
-        globalWorkSize[2] = globalWorkSize[1];
-        globalWorkSize[1] = 1;
-        localWorkSize[2] = localWorkSize[1];
-        localWorkSize[1] = 1;
+      globalWorkSize[2] = globalWorkSize[1];
+      globalWorkSize[1] = 1;
+      localWorkSize[2] = localWorkSize[1];
+      localWorkSize[1] = 1;
    }
  } else {
    globalWorkSize[0] = amd::alignUp(size[0], 8);
@@ -1426,10 +1427,10 @@ bool KernelBlitManager::copyImageToBufferKernel(device::Memory& srcMemory,
  cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0};
  cl_int copySize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
  if (swapLayer) {
-      srcOrg[2] = srcOrg[1];
-      srcOrg[1] = 0;
-      copySize[2] = copySize[1];
-      copySize[1] = 1;
+    srcOrg[2] = srcOrg[1];
+    srcOrg[1] = 0;
+    copySize[2] = copySize[1];
+    copySize[1] = 1;
  }
  setArgument(kernels_[blitType], 4, sizeof(srcOrg), srcOrg);
  uint32_t memFmtSize = gpuMem(srcMemory).elementSize();
@@ -1570,7 +1571,7 @@ bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dst
  // Program source origin
  cl_int srcOrg[4] = {(cl_int)srcOrigin[0], (cl_int)srcOrigin[1], (cl_int)srcOrigin[2], 0};
  if ((gpuMem(srcMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
-    dev().settings().gfx10Plus_) {
+      dev().settings().gfx10Plus_) {
    srcOrg[3] = 1;
  }
  setArgument(kernels_[blitType], 2, sizeof(srcOrg), srcOrg);
@@ -1578,7 +1579,7 @@ bool KernelBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dst
  // Program destinaiton origin
  cl_int dstOrg[4] = {(cl_int)dstOrigin[0], (cl_int)dstOrigin[1], (cl_int)dstOrigin[2], 0};
  if ((gpuMem(dstMemory).desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
-    dev().settings().gfx10Plus_) {
+      dev().settings().gfx10Plus_) {
    dstOrg[3] = 1;
  }
  setArgument(kernels_[blitType], 3, sizeof(dstOrg), dstOrg);
@@ -1700,16 +1701,15 @@ bool KernelBlitManager::writeImage(const void* srcHost, device::Memory& dstMemor
      amdMemory = pinHostMemory(srcHost, pinSize, partial);
      if (amdMemory == nullptr) {
        // Force SW copy
-        result = HostBlitManager::writeImage(srcHost, dstMemory,
-                    origin, size, rowPitch, slicePitch, entire);
+        result = HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch,
+                                             entire);
        synchronize();
        return result;
      }
      // Get device memory for this virtual device
      srcMemory = dev().getGpuMemory(amdMemory);
      pinned = true;
-    }
-    else {
+    } else {
      srcMemory = &gpu().xferWrite().Acquire(pinSize);
      srcMemory->hostWrite(&gpu(), srcHost, 0, pinSize, Resource::NoWait);
      pinned = false;
@@ -1951,7 +1951,7 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
  // Use host copy if memory has direct access or it's persistent
  if (setup_.disableWriteBuffer_ ||
      (gpuMem(dstMemory).isHostMemDirectAccess() &&
-      (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
+       (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
      (gpuMem(dstMemory).memoryType() == Resource::Persistent)) {
    result = HostBlitManager::writeBuffer(srcHost, dstMemory, origin, size, entire);
    synchronize();
@@ -2002,7 +2002,7 @@ bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dst
  // Use host copy if memory has direct access or it's persistent
  if (setup_.disableWriteBufferRect_ ||
      (gpuMem(dstMemory).isHostMemDirectAccess() &&
-      (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
+       (gpuMem(dstMemory).memoryType() != Resource::ExternalPhysical)) ||
      gpuMem(dstMemory).isPersistentDirectMap()) {
    result = HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire);
    synchronize();
@@ -2206,8 +2206,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
  size_t localWorkSize[3];
  Memory* memView = &gpuMem(memory);
  amd::Image::Format newFormat(gpuMem(memory).owner()->asImage()->getImageFormat());
-  bool swapLayer = (memView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
-       dev().settings().gfx10Plus_;
+  bool swapLayer =
+      (memView->desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) && dev().settings().gfx10Plus_;

  // Program the kernels workload depending on the fill dimensions
  fillType = FillImage;
@@ -2274,10 +2274,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
    // Swap the Y and Z components, apparently gfx10 HW expects
    // layer in Z
    if (swapLayer) {
-        globalWorkSize[2] = globalWorkSize[1];
-        globalWorkSize[1] = 1;
-        localWorkSize[2] = localWorkSize[1];
-        localWorkSize[1] = 1;
+      globalWorkSize[2] = globalWorkSize[1];
+      globalWorkSize[1] = 1;
+      localWorkSize[2] = localWorkSize[1];
+      localWorkSize[1] = 1;
    }
  } else {
    globalWorkSize[0] = amd::alignUp(globalWorkSize[0], 8);
@@ -2297,10 +2297,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
  cl_int fillOrigin[4] = {(cl_int)origin[0], (cl_int)origin[1], (cl_int)origin[2], 0};
  cl_int fillSize[4] = {(cl_int)size[0], (cl_int)size[1], (cl_int)size[2], 0};
  if (swapLayer) {
-      fillOrigin[2] = fillOrigin[1];
-      fillOrigin[1] = 0;
-      fillSize[2] = fillSize[1];
-      fillSize[1] = 1;
+    fillOrigin[2] = fillOrigin[1];
+    fillOrigin[1] = 0;
+    fillSize[2] = fillSize[1];
+    fillSize[1] = 1;
  }
  setArgument(kernels_[fillType], 4, sizeof(fillOrigin), fillOrigin);
  setArgument(kernels_[fillType], 5, sizeof(fillSize), fillSize);
@@ -27,7 +27,7 @@ class DmaBlitManager : public device::HostBlitManager {
  //! Constructor
  DmaBlitManager(VirtualGPU& gpu,       //!< Virtual GPU to be used for blits
                 Setup setup = Setup()  //!< Specifies HW accelerated blits
-                 );
+  );

  //! Destructor
  virtual ~DmaBlitManager() {}
@@ -211,7 +211,7 @@ class KernelBlitManager : public DmaBlitManager {
  //! Constructor
  KernelBlitManager(VirtualGPU& gpu,       //!< Virtual GPU to be used for blits
                    Setup setup = Setup()  //!< Specifies HW accelerated blits
-                    );
+  );

  //! Destructor
  virtual ~KernelBlitManager();
@@ -382,7 +382,7 @@ class KernelBlitManager : public DmaBlitManager {

  //! Creates a program for all blit operations
  bool createProgram(Device& device  //!< Device object
-                     );
+  );

  //! Creates a view memory object
  Memory* createView(const Memory& parent,         //!< Parent memory object
@@ -409,4 +409,5 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
    "fillImage",         "scheduler",
 };

-/*@}*/} // namespace pal
+/*@}*/
+}  // namespace pal
@@ -11,12 +11,12 @@ namespace pal {

 // ================================================================================================
 ManagedBuffer::ManagedBuffer(VirtualGPU& gpu, uint32_t size)
-  : gpu_(gpu)
-  , pool_(MaxNumberOfBuffers)
-  , activeBuffer_(0)
-  , size_(size)
-  , wrtOffset_(0)
-  , wrtAddress_(nullptr) {}
+    : gpu_(gpu),
+      pool_(MaxNumberOfBuffers),
+      activeBuffer_(0),
+      size_(size),
+      wrtOffset_(0),
+      wrtAddress_(nullptr) {}

 // ================================================================================================
 void ManagedBuffer::release() {
@@ -40,8 +40,8 @@ bool ManagedBuffer::create(Resource::MemoryType type) {
    pool_[i].buf->memRef()->gpu_ = &gpu_;
    void* wrtAddress = pool_[i].buf->map(&gpu_);
    if (wrtAddress == nullptr) {
-        LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
-        return false;
+      LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
+      return false;
    }
    // Make sure OCL touches every buffer in the queue to avoid delays on the first submit
    uint dummy = 0;
@@ -94,15 +94,10 @@ void ManagedBuffer::pinGpuEvent() {

 // ================================================================================================
 ConstantBuffer::ConstantBuffer(ManagedBuffer& mbuf, uint32_t size)
-  : mbuf_(mbuf)
-  , sys_mem_copy_(nullptr)
-  , size_(size)
-{}
+    : mbuf_(mbuf), sys_mem_copy_(nullptr), size_(size) {}

 // ================================================================================================
-ConstantBuffer::~ConstantBuffer() {
-  amd::AlignedMemory::deallocate(sys_mem_copy_);
-}
+ConstantBuffer::~ConstantBuffer() { amd::AlignedMemory::deallocate(sys_mem_copy_); }

 // ================================================================================================
 bool ConstantBuffer::Create() {
@@ -118,8 +113,8 @@ bool ConstantBuffer::Create() {

 // ================================================================================================
 uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const {
-  uint64_t  vm_address;
-  address   cpu_address = mbuf_.reserve(size, &vm_address);
+  uint64_t vm_address;
+  address cpu_address = mbuf_.reserve(size, &vm_address);
  // Update memory with new CB data
  memcpy(cpu_address, sys_mem_copy_, size);
  return vm_address;
@@ -127,8 +122,8 @@ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const {

 // ================================================================================================
 uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const {
-  uint64_t  vm_address;
-  address   cpu_address = mbuf_.reserve(size, &vm_address);
+  uint64_t vm_address;
+  address cpu_address = mbuf_.reserve(size, &vm_address);
  // Update memory with new CB data
  memcpy(cpu_address, sysmem, size);
  return vm_address;
@@ -136,9 +131,7 @@ uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const

 // ================================================================================================
 XferBuffer::XferBuffer(const Device& device, ManagedBuffer& mbuf, uint32_t size)
-  : buffer_view_(device, size)
-  , mbuf_(mbuf)
-  , size_(size) {
+    : buffer_view_(device, size), mbuf_(mbuf), size_(size) {
  // Create a view for access
  Resource::ViewParams params = {};
  params.gpu_ = &mbuf_.gpu();
@@ -151,9 +144,9 @@ XferBuffer::XferBuffer(const Device& device, ManagedBuffer& mbuf, uint32_t size)

 // ================================================================================================
 Memory& XferBuffer::Acquire(uint32_t size) {
-  uint64_t  vm_address;
+  uint64_t vm_address;
  // Reserve space in the managed buffer
-  address   cpu_address = mbuf_.reserve(size, &vm_address);
+  address cpu_address = mbuf_.reserve(size, &vm_address);
  // Update a view for access
  buffer_view_.updateView(mbuf_.activeMemory(), vm_address - mbuf_.vmAddress(), size);
  return buffer_view_;
@@ -12,9 +12,9 @@ namespace pal {
 class ManagedBuffer : public amd::EmbeddedObject {
 public:
  //! Constructor for the ConstBuffer class
-  ManagedBuffer(VirtualGPU& gpu,    //!< Virtual GPU device object
-                uint32_t    size    //!< size of the managed buffers in bytes
-                );
+  ManagedBuffer(VirtualGPU& gpu,  //!< Virtual GPU device object
+                uint32_t size     //!< size of the managed buffers in bytes
+  );
  ~ManagedBuffer() {}

  //! Creates the managed buffers
@@ -50,8 +50,8 @@ class ManagedBuffer : public amd::EmbeddedObject {

 private:
  struct TimeStampedBuffer {
-    Memory*   buf;
-    GpuEvent  events[AllEngines];
+    Memory* buf;
+    GpuEvent events[AllEngines];
  };

  //! The maximum number of the managed buffers
@@ -63,21 +63,21 @@ class ManagedBuffer : public amd::EmbeddedObject {
  //! Disable operator=
  ManagedBuffer& operator=(const ManagedBuffer&) = delete;

-  VirtualGPU& gpu_;                 //!< Virtual GPU object
-  std::vector<TimeStampedBuffer>  pool_;   //!< Buffers for management
-  uint32_t  activeBuffer_;          //!< Current active buffer
-  uint32_t  size_;                  //!< Constant buffer size
-  uint32_t  wrtOffset_;             //!< Current write offset
-  address   wrtAddress_;            //!< Write address in CB
+  VirtualGPU& gpu_;                      //!< Virtual GPU object
+  std::vector<TimeStampedBuffer> pool_;  //!< Buffers for management
+  uint32_t activeBuffer_;                //!< Current active buffer
+  uint32_t size_;                        //!< Constant buffer size
+  uint32_t wrtOffset_;                   //!< Current write offset
+  address wrtAddress_;                   //!< Write address in CB
 };

 //! Constant buffer
 class ConstantBuffer : public amd::HeapObject {
-public:
+ public:
  //! Constructor for the ConstBuffer class
  ConstantBuffer(ManagedBuffer& mbuf,  //!< Managed buffer
-                 uint32_t       size   //!< Max size of the constant buffer
-                 );
+                 uint32_t size         //!< Max size of the constant buffer
+  );

  //! Destructor for the ConstBuffer class
  ~ConstantBuffer();
@@ -86,18 +86,18 @@ public:
  bool Create();

  /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
-  *
-  *  \return GPU address for the uploaded data
-  */
+   *
+   *  \return GPU address for the uploaded data
+   */
  uint64_t UploadDataToHw(uint32_t size  //!< real data size for upload
                          ) const;

  /*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
-  *
-  *  \return GPU address for the uploaded data
-  */
+   *
+   *  \return GPU address for the uploaded data
+   */
  uint64_t UploadDataToHw(const void* sysmem,  //!< Pointer to the data for upload
-                          uint32_t    size     //!< Real data size for upload
+                          uint32_t size        //!< Real data size for upload
                          ) const;

  //! Returns a pointer to the system memory copy for CB
@@ -106,52 +106,55 @@ public:
  //! Returns active GPU buffer
  Memory* ActiveMemory() const { return mbuf_.activeMemory(); }

-private:
+ private:
  //! Disable copy constructor
  ConstantBuffer(const ConstantBuffer&) = delete;

  //! Disable operator=
  ConstantBuffer& operator=(const ConstantBuffer&) = delete;

-  ManagedBuffer&  mbuf_;    //!< Managed buffer on GPU
-  address   sys_mem_copy_;  //!< System memory copy
-  uint32_t  size_;          //!< Constant buffer size
+  ManagedBuffer& mbuf_;   //!< Managed buffer on GPU
+  address sys_mem_copy_;  //!< System memory copy
+  uint32_t size_;         //!< Constant buffer size
 };

 //! Staging buffer
 class XferBuffer : public amd::EmbeddedObject {
-public:
+ public:
  //! Constructor for the ConstBuffer class
-  XferBuffer(const Device& device,  //!< Active GPU device 
+  XferBuffer(const Device& device,  //!< Active GPU device
             ManagedBuffer& mbuf,   //!< Managed buffer
-             uint32_t       size    //!< Maximum size of the transfer buffer
+             uint32_t size          //!< Maximum size of the transfer buffer
  );

  //! Destructor for the ConstBuffer class
  ~XferBuffer() {}

  /*! \brief Acquires free memory from the managed buffer
-  *
-  *  \return GPU memory object associated with free memory
-  */
-  Memory& Acquire(uint32_t size     //!< data size for transfers
-                  );
+   *
+   *  \return GPU memory object associated with free memory
+   */
+  Memory& Acquire(uint32_t size  //!< data size for transfers
+  );

  //! Releases memory object used in the staging transfer
  void Release(Memory& mem  //!< Memory object for release
-               ) { buffer_view_.updateView(nullptr, 0, 0); }
+  ) {
+    buffer_view_.updateView(nullptr, 0, 0);
+  }

  size_t MaxSize() const { return static_cast<size_t>(size_); }

-private:
+ private:
  //! Disable copy constructor
  XferBuffer(const XferBuffer&) = delete;

  //! Disable operator=
  XferBuffer& operator=(const XferBuffer&) = delete;

-  Memory  buffer_view_;     //!< Buffer view returned in the acquire
-  ManagedBuffer&  mbuf_;    //!< Managed buffer on GPU
-  uint32_t  size_;          //!< Mx staging buffer size
+  Memory buffer_view_;   //!< Buffer view returned in the acquire
+  ManagedBuffer& mbuf_;  //!< Managed buffer on GPU
+  uint32_t size_;        //!< Max staging buffer size
 };
-/*@}*/} // namespace pal
+/*@}*/
+}  // namespace pal
@@ -676,12 +676,12 @@ void PerfCounter::convertInfo() {
      break;
    case Pal::GfxIpLevel::GfxIp10:
    case Pal::GfxIpLevel::GfxIp10_1:
-        if (info_.blockIndex_ < gfx10BlockIdPal.size()) {
-            auto p = gfx10BlockIdPal[info_.blockIndex_];
-            info_.blockIndex_ = std::get<0>(p);
-            info_.counterIndex_ = std::get<1>(p);
-        }
-        break;
+      if (info_.blockIndex_ < gfx10BlockIdPal.size()) {
+        auto p = gfx10BlockIdPal[info_.blockIndex_];
+        info_.blockIndex_ = std::get<0>(p);
+        info_.counterIndex_ = std::get<1>(p);
+      }
+      break;
    default:
      Unimplemented();
      break;
@@ -84,8 +84,7 @@ class PerfCounter : public device::PerfCounter {
              cl_uint blockIndex,           //!< HW block index
              cl_uint counterIndex,         //!< Counter index within the block
              cl_uint eventIndex)           //!< Event index for profiling
-      : gpuDevice_(device),
-        palRef_(palRef) {
+      : gpuDevice_(device), palRef_(palRef) {
    info_.blockIndex_ = blockIndex;
    info_.counterIndex_ = counterIndex;
    info_.eventIndex_ = eventIndex;
@@ -98,10 +98,10 @@ struct HwDebugWaveAddr {
 };

 /*! \brief Kernel code information
-*
-*   This structure contains the pointer of mapped kernel code for host access
-*   and its size (in bytes)
-*/
+ *
+ *   This structure contains the pointer of mapped kernel code for host access
+ *   and its size (in bytes)
+ */
 struct AqlCodeInfo {
  amd_kernel_code_t* aqlCode_;  //! pointer of AQL code to allow host access
  uint32_t aqlCodeSize_;        //! size of AQL code
@@ -143,7 +143,7 @@ void GpuDebugManager::unregisterDebugger() {

 void GpuDebugManager::flushCache(uint32_t mask) {
  HwDbgGpuCacheMask cacheMask(mask);
-  //device()->xferQueue()->flushCuCaches(cacheMask);
+  // device()->xferQueue()->flushCuCaches(cacheMask);
 }


@@ -47,9 +47,9 @@ struct GpuEvent {
  static constexpr uint32_t InvalidID = ((1 << 30) - 1);

  struct {
-    uint32_t id_ : 30;        ///< Actual event id
-    uint32_t modified_ : 1;   ///< Resource associated with the event was modified
-    uint32_t engineId_ : 1;   ///< Type of the id
+    uint32_t id_ : 30;       ///< Actual event id
+    uint32_t modified_ : 1;  ///< Resource associated with the event was modified
+    uint32_t engineId_ : 1;  ///< Type of the id
  };
  //! GPU event default constructor
  GpuEvent() : id_(InvalidID), modified_(false), engineId_(MainEngine) {}
@@ -63,8 +63,11 @@ struct GpuEvent {
  void invalidate() { id_ = InvalidID; }

  // Overwrite default assign operator to preserve modified_ field
-  GpuEvent& operator=(const GpuEvent& evt)
-    { id_ = evt.id_; engineId_ = evt.engineId_; return *this; }
+  GpuEvent& operator=(const GpuEvent& evt) {
+    id_ = evt.id_;
+    engineId_ = evt.engineId_;
+    return *this;
+  }
 };

 /*! \addtogroup PAL
@@ -113,87 +116,110 @@ const static uint HsaSamplerObjectAlignment = 16;
 const static uint DeviceQueueMaskSize = 32;

 struct AMDDeviceInfo {
-  const char* targetName_;     //!< Target name
-  const char* machineTarget_;  //!< Machine target
-  const char* machineTargetLC_;//!< Machine target for LC
-  uint simdPerCU_;             //!< Number of SIMDs per CU
-  uint simdWidth_;             //!< Number of workitems processed per SIMD
-  uint simdInstructionWidth_;  //!< Number of instructions processed per SIMD
-  uint memChannelBankWidth_;   //!< Memory channel bank width
-  uint localMemSizePerCU_;     //!< Local memory size per CU
-  uint localMemBanks_;         //!< Number of banks of local memory
-  uint gfxipVersionLC_;        //!< The core engine GFXIP version for LC
-  uint gfxipVersion_;          //!< The core engine GFXIP version
-  bool xnackEnabled_;          //!< Enable XNACK feature
+  const char* targetName_;       //!< Target name
+  const char* machineTarget_;    //!< Machine target
+  const char* machineTargetLC_;  //!< Machine target for LC
+  uint simdPerCU_;               //!< Number of SIMDs per CU
+  uint simdWidth_;               //!< Number of workitems processed per SIMD
+  uint simdInstructionWidth_;    //!< Number of instructions processed per SIMD
+  uint memChannelBankWidth_;     //!< Memory channel bank width
+  uint localMemSizePerCU_;       //!< Local memory size per CU
+  uint localMemBanks_;           //!< Number of banks of local memory
+  uint gfxipVersionLC_;          //!< The core engine GFXIP version for LC
+  uint gfxipVersion_;            //!< The core engine GFXIP version
+  bool xnackEnabled_;            //!< Enable XNACK feature
 };

 static const AMDDeviceInfo DeviceInfo[] = {
-  /* Unknown */ {"", "unknown", "", 4, 16, 1, 256, 64 * Ki, 32, 0, 0, false},
-  /* Tahiti */ {"", "tahiti", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
-  /* Pitcairn */ {"", "pitcairn", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
-  /* Capeverde */ {"", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
-  /* Oland */ {"", "oland", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
-  /* Hainan */ {"", "hainan", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
+    /* Unknown */ {"", "unknown", "", 4, 16, 1, 256, 64 * Ki, 32, 0, 0, false},
+    /* Tahiti */ {"", "tahiti", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
+    /* Pitcairn */ {"", "pitcairn", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
+    /* Capeverde */ {"", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
+    /* Oland */ {"", "oland", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},
+    /* Hainan */ {"", "hainan", "", 4, 16, 1, 256, 64 * Ki, 32, 600, 600, false},

-  /* Bonaire */ {"Bonaire", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
-  /* Hawaii */ {"Hawaii", "hawaii", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
-  /* Hawaii */ {"", "grenada", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
-  /* Hawaii */ {"", "maui", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
+    /* Bonaire */ {"Bonaire", "bonaire", "", 4, 16, 1, 256, 64 * Ki, 32, 700, 700, false},
+    /* Hawaii */ {"Hawaii", "hawaii", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
+    /* Hawaii */ {"", "grenada", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
+    /* Hawaii */ {"", "maui", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},

-  /* Kalindi */ {"Kalindi", "kalindi", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
-  /* Godavari */ {"Mullins", "mullins", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
-  /* Spectre */ {"Spectre", "spectre", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
-  /* Spooky */ {"Spooky", "spooky", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
+    /* Kalindi */ {"Kalindi", "kalindi", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
+    /* Godavari */ {"Mullins", "mullins", "", 4, 16, 1, 256, 64 * Ki, 32, 702, 702, false},
+    /* Spectre */ {"Spectre", "spectre", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},
+    /* Spooky */ {"Spooky", "spooky", "", 4, 16, 1, 256, 64 * Ki, 32, 701, 701, false},

-  /* Carrizo */ {"Carrizo", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801,false},
-  /* Bristol */ {"Bristol Ridge", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false},
-  /* Stoney */ {"Stoney", "stoney", "", 4, 16, 1, 256, 64 * Ki, 32, 810, 810, false},
+    /* Carrizo */ {"Carrizo", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false},
+    /* Bristol */ {"Bristol Ridge", "carrizo", "", 4, 16, 1, 256, 64 * Ki, 32, 801, 801, false},
+    /* Stoney */ {"Stoney", "stoney", "", 4, 16, 1, 256, 64 * Ki, 32, 810, 810, false},

-  /* Iceland */ {"Iceland", "iceland", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
-  /* Tonga */ {"Tonga", "tonga", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
-  /* Fiji */ {"Fiji", "fiji", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
-  /* Ellesmere */ {"Ellesmere", "ellesmere", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
-  /* Baffin */ {"Baffin", "baffin", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
-  /* Lexa */ {"gfx804", "gfx804", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
+    /* Iceland */ {"Iceland", "iceland", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
+    /* Tonga */ {"Tonga", "tonga", "gfx802", 4, 16, 1, 256, 64 * Ki, 32, 802, 800, false},
+    /* Fiji */ {"Fiji", "fiji", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
+    /* Ellesmere */
+    {"Ellesmere", "ellesmere", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
+    /* Baffin */ {"Baffin", "baffin", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
+    /* Lexa */ {"gfx804", "gfx804", "gfx803", 4, 16, 1, 256, 64 * Ki, 32, 803, 804, false},
 };

 // Ordering as per AsicRevision# in //depot/stg/pal/inc/core/palDevice.h and
 // http://confluence.amd.com/pages/viewpage.action?spaceKey=ASLC&title=AMDGPU+Target+Names
 static const AMDDeviceInfo Gfx9PlusSubDeviceInfo[] = {
-    /* Vega10       */{"gfx900", "gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 900, false},
-    /* Vega10 XNACK */{ LIGHTNING_SWITCH("gfx900","gfx901"), "gfx901", "gfx900",
-                        4, 16, 1, 256, 64 * Ki, 32,  900, 901, true},
-    /* Vega12       */{"gfx904", "gfx904", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 904, false},
-    /* Vega12 XNACK */{ LIGHTNING_SWITCH("gfx904","gfx905"), "gfx905", "gfx904",
-                        4, 16, 1, 256, 64 * Ki, 32, 904, 905, true},
-    /* Vega20       */{"gfx906", "gfx906", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 906, false},
-    /* Vega20 XNACK */{ LIGHTNING_SWITCH("gfx906","gfx907"), "gfx907", "gfx906",
-                        4, 16, 1, 256, 64 * Ki, 32, 906, 907, true},
-    /* Raven        */{"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
-    /* Raven XNACK  */{ LIGHTNING_SWITCH("gfx902","gfx903"), "gfx903", "gfx902",
-                        4, 16, 1, 256, 64 * Ki, 32, 902, 903, true},
-    /* Raven2       */{"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
-    /* Raven2 XNACK */{ LIGHTNING_SWITCH("gfx902","gfx903"), "gfx903", "gfx902",
-                        4, 16, 1, 256, 64 * Ki, 32, 902, 903, true},
-    /* Renoir       */{"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
-    /* Renoir XNACK */{ LIGHTNING_SWITCH("gfx902","gfx903"), "gfx903", "gfx902",
-                        4, 16, 1, 256, 64 * Ki, 32, 902, 903, true},
-    /* Navi10_A0       */{ "gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false },
-    /* Navi10_A0 XNACK */{ "gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true },
-    /* Navi10       */{"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false},
-    /* Navi10 XNACK */{"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true},
-    /* Navi10Lite       */{"gfx1000", "gfx1000","gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, false},
-    /* Navi10Lite XNACK */{"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, true},
-    /* Navi12       */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false },
-    /* Navi12 XNACK */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true },
-    /* Navi12Lite   */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false },
-    /* Navi12Lite XNACK */{ "gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true },
-    /* Navi14       */{ "gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, false },
-    /* Navi14 XNACK */{ "gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, true },
-    /* UnknownDevice3       */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false },
-    /* UnknownDevice3 XNACK */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true },
-    /* UnknownDevice2   */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false },
-    /* UnknownDevice2 XNACK */{ "gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true },
+    /* Vega10       */ {"gfx900", "gfx900", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 900, false},
+    /* Vega10 XNACK */
+    {LIGHTNING_SWITCH("gfx900", "gfx901"), "gfx901", "gfx900", 4, 16, 1, 256, 64 * Ki, 32, 900, 901,
+     true},
+    /* Vega12       */ {"gfx904", "gfx904", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 904, false},
+    /* Vega12 XNACK */
+    {LIGHTNING_SWITCH("gfx904", "gfx905"), "gfx905", "gfx904", 4, 16, 1, 256, 64 * Ki, 32, 904, 905,
+     true},
+    /* Vega20       */ {"gfx906", "gfx906", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 906, false},
+    /* Vega20 XNACK */
+    {LIGHTNING_SWITCH("gfx906", "gfx907"), "gfx907", "gfx906", 4, 16, 1, 256, 64 * Ki, 32, 906, 907,
+     true},
+    /* Raven        */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
+    /* Raven XNACK  */
+    {LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903,
+     true},
+    /* Raven2       */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
+    /* Raven2 XNACK */
+    {LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903,
+     true},
+    /* Renoir       */ {"gfx902", "gfx902", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 902, false},
+    /* Renoir XNACK */
+    {LIGHTNING_SWITCH("gfx902", "gfx903"), "gfx903", "gfx902", 4, 16, 1, 256, 64 * Ki, 32, 902, 903,
+     true},
+    /* Navi10_A0       */
+    {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false},
+    /* Navi10_A0 XNACK */
+    {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true},
+    /* Navi10       */
+    {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, false},
+    /* Navi10 XNACK */
+    {"gfx1010", "gfx1010", "gfx1010", 2, 32, 1, 256, 64 * Ki, 32, 1010, 1010, true},
+    /* Navi10Lite       */
+    {"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, false},
+    /* Navi10Lite XNACK */
+    {"gfx1000", "gfx1000", "gfx1000", 2, 32, 1, 256, 64 * Ki, 32, 1000, 1000, true},
+    /* Navi12       */
+    {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false},
+    /* Navi12 XNACK */
+    {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true},
+    /* Navi12Lite   */
+    {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, false},
+    /* Navi12Lite XNACK */
+    {"gfx1011", "gfx1011", "gfx1011", 2, 32, 1, 256, 64 * Ki, 32, 1011, 1011, true},
+    /* Navi14       */
+    {"gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, false},
+    /* Navi14 XNACK */
+    {"gfx1012", "gfx1012", "gfx1012", 2, 32, 1, 256, 64 * Ki, 32, 1012, 1012, true},
+    /* UnknownDevice3       */
+    {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false},
+    /* UnknownDevice3 XNACK */
+    {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true},
+    /* UnknownDevice2   */
+    {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, false},
+    /* UnknownDevice2 XNACK */
+    {"gfx1030", "gfx1030", "gfx1030", 2, 32, 1, 256, 64 * Ki, 32, 1030, 1030, true},

 };

@@ -53,15 +53,14 @@ void PalDeviceUnload() { pal::Device::tearDown(); }

 namespace pal {

-Util::GenericAllocator  NullDevice::allocator_;
+Util::GenericAllocator NullDevice::allocator_;
 char* Device::platformObj_;
-Pal::IPlatform*  Device::platform_;
+Pal::IPlatform* Device::platform_;

 NullDevice::Compiler* NullDevice::compiler_;
 AppProfile Device::appProfile_;

-NullDevice::NullDevice()
-    : amd::Device(), ipLevel_(Pal::GfxIpLevel::None), hwInfo_(nullptr) {}
+NullDevice::NullDevice() : amd::Device(), ipLevel_(Pal::GfxIpLevel::None), hwInfo_(nullptr) {}

 bool NullDevice::init() {
  std::vector<Device*> devices;
@@ -89,8 +88,8 @@ bool NullDevice::init() {
      driverVersion = static_cast<amd::Device*>(devices[i])->info().driverVersion_;
      if (driverVersion.find("PAL") != std::string::npos) {
        if (static_cast<NullDevice*>(devices[i])->asicRevision() == revision) {
-            foundActive = true;
-            break;
+          foundActive = true;
+          break;
        }
      }
    }
@@ -109,132 +108,130 @@ bool NullDevice::init() {
      }
    }
  }
-#endif // defined(WITH_COMPILER_LIB)
+#endif  // defined(WITH_COMPILER_LIB)

  // Loop through all supported devices and create each of them
-  for (uint id = 0;
-        id < sizeof(Gfx9PlusSubDeviceInfo)/sizeof(AMDDeviceInfo); ++id) {
-      bool foundActive = false;
-      bool foundDuplicate = false;
-      uint gfxipVersion = IS_LIGHTNING ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_ :
-        pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
+  for (uint id = 0; id < sizeof(Gfx9PlusSubDeviceInfo) / sizeof(AMDDeviceInfo); ++id) {
+    bool foundActive = false;
+    bool foundDuplicate = false;
+    uint gfxipVersion = IS_LIGHTNING ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_
+                                     : pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;

-      if (pal::Gfx9PlusSubDeviceInfo[id].targetName_[0] == '\0') {
-          continue;
-      }
+    if (pal::Gfx9PlusSubDeviceInfo[id].targetName_[0] == '\0') {
+      continue;
+    }

-      // Loop through all active PAL devices and see if we match one
-      for (uint i = 0; i < devices.size(); ++i) {
-        driverVersion = static_cast<amd::Device*>(devices[i])->info().driverVersion_;
-        if (driverVersion.find("PAL") != std::string::npos) {
-          gfxipVersion = devices[i]->settings().useLightning_ ?
-            pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_ :
-            pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
-          uint gfxIpCurrent = devices[i]->settings().useLightning_ ?
-            static_cast<NullDevice*>(devices[i])->hwInfo()->gfxipVersionLC_ :
-            static_cast<NullDevice*>(devices[i])->hwInfo()->gfxipVersion_;
-          if (gfxIpCurrent == gfxipVersion) {
-              foundActive = true;
-              break;
-          }
+    // Loop through all active PAL devices and see if we match one
+    for (uint i = 0; i < devices.size(); ++i) {
+      driverVersion = static_cast<amd::Device*>(devices[i])->info().driverVersion_;
+      if (driverVersion.find("PAL") != std::string::npos) {
+        gfxipVersion = devices[i]->settings().useLightning_
+            ? pal::Gfx9PlusSubDeviceInfo[id].gfxipVersionLC_
+            : pal::Gfx9PlusSubDeviceInfo[id].gfxipVersion_;
+        uint gfxIpCurrent = devices[i]->settings().useLightning_
+            ? static_cast<NullDevice*>(devices[i])->hwInfo()->gfxipVersionLC_
+            : static_cast<NullDevice*>(devices[i])->hwInfo()->gfxipVersion_;
+        if (gfxIpCurrent == gfxipVersion) {
+          foundActive = true;
+          break;
        }
      }
+    }

-      // Don't report an offline device if it's active
-      if (foundActive) {
-          continue;
+    // Don't report an offline device if it's active
+    if (foundActive) {
+      continue;
+    }
+
+    // Loop through all previous devices in the Gfx9PlusSubDeviceInfo list
+    // and compare them with the current entry to see if the current entry
+    // was listed previously in the Gfx9PlusSubDeviceInfo, if so, then it
+    // means the current entry already has been added in the offline device list
+    for (uint j = 0; j < id; ++j) {
+      if (pal::Gfx9PlusSubDeviceInfo[j].targetName_[0] == '\0') {
+        continue;
      }
-
-      // Loop through all previous devices in the Gfx9PlusSubDeviceInfo list
-      // and compare them with the current entry to see if the current entry
-      // was listed previously in the Gfx9PlusSubDeviceInfo, if so, then it
-      // means the current entry already has been added in the offline device list
-      for (uint j = 0; j < id; ++j) {
-          if (pal::Gfx9PlusSubDeviceInfo[j].targetName_[0] == '\0') {
-              continue;
-          }
-          if (strcmp(pal::Gfx9PlusSubDeviceInfo[j].targetName_,
-                     pal::Gfx9PlusSubDeviceInfo[id].targetName_) == 0) {
-              foundDuplicate = true;
-              break;
-          }
+      if (strcmp(pal::Gfx9PlusSubDeviceInfo[j].targetName_,
+                 pal::Gfx9PlusSubDeviceInfo[id].targetName_) == 0) {
+        foundDuplicate = true;
+        break;
      }
+    }

-      // Don't report an offline device twice
-      if (foundDuplicate) {
-          continue;
-      }
+    // Don't report an offline device twice
+    if (foundDuplicate) {
+      continue;
+    }

-      Pal::GfxIpLevel ipLevel = Pal::GfxIpLevel::_None;
-      uint ipLevelMajor = round(gfxipVersion / 100);
-      uint ipLevelMinor = round(gfxipVersion / 10 % 10);
-      switch (ipLevelMajor) {
+    Pal::GfxIpLevel ipLevel = Pal::GfxIpLevel::_None;
+    uint ipLevelMajor = round(gfxipVersion / 100);
+    uint ipLevelMinor = round(gfxipVersion / 10 % 10);
+    switch (ipLevelMajor) {
      case 9:
-          ipLevel = Pal::GfxIpLevel::GfxIp9;
-          break;
+        ipLevel = Pal::GfxIpLevel::GfxIp9;
+        break;
      case 10:
        switch (ipLevelMinor) {
-        case 0:
-          ipLevel = Pal::GfxIpLevel::GfxIp10;
-          break;
-        case 1:
-          ipLevel = Pal::GfxIpLevel::GfxIp10_1;
-          break;
-        case 2:
-          ipLevel = Pal::GfxIpLevel::GfxIp10_2;
-          break;
-        case 3:
-          ipLevel = Pal::GfxIpLevel::GfxIp10_3;
-          break;
+          case 0:
+            ipLevel = Pal::GfxIpLevel::GfxIp10;
+            break;
+          case 1:
+            ipLevel = Pal::GfxIpLevel::GfxIp10_1;
+            break;
+          case 2:
+            ipLevel = Pal::GfxIpLevel::GfxIp10_2;
+            break;
+          case 3:
+            ipLevel = Pal::GfxIpLevel::GfxIp10_3;
+            break;
        }
-      }
+    }

-      Pal::AsicRevision revision = Pal::AsicRevision::Unknown;
-      uint xNACKSupported = pal::Gfx9PlusSubDeviceInfo[id].xnackEnabled_ ? 1 : 0;
+    Pal::AsicRevision revision = Pal::AsicRevision::Unknown;
+    uint xNACKSupported = pal::Gfx9PlusSubDeviceInfo[id].xnackEnabled_ ? 1 : 0;

-      switch (gfxipVersion) {
+    switch (gfxipVersion) {
      case 901:
      case 900:
-          revision = Pal::AsicRevision::Vega10;
-          break;
+        revision = Pal::AsicRevision::Vega10;
+        break;
      case 903:
      case 902:
-          revision = Pal::AsicRevision::Raven;
-          break;
+        revision = Pal::AsicRevision::Raven;
+        break;
      case 905:
      case 904:
-          revision = Pal::AsicRevision::Vega12;
-          break;
+        revision = Pal::AsicRevision::Vega12;
+        break;
      case 907:
      case 906:
-          revision = Pal::AsicRevision::Vega20;
-          break;
+        revision = Pal::AsicRevision::Vega20;
+        break;
      case 1000:
-          revision = Pal::AsicRevision::Navi10Lite;
-          break;
+        revision = Pal::AsicRevision::Navi10Lite;
+        break;
      case 1010:
-          revision = Pal::AsicRevision::Navi10;
-          break;
+        revision = Pal::AsicRevision::Navi10;
+        break;
      case 1011:
-          revision = Pal::AsicRevision::Navi12;
-          break;
+        revision = Pal::AsicRevision::Navi12;
+        break;
      case 1012:
-          revision = Pal::AsicRevision::Navi14;
-          break;
+        revision = Pal::AsicRevision::Navi14;
+        break;
      case 1030:
-          ShouldNotReachHere();
-          break;
-      }
+        ShouldNotReachHere();
+        break;
+    }

-      NullDevice* dev = new NullDevice();
-      if (nullptr != dev) {
-          if (!dev->create(revision, ipLevel, xNACKSupported)) {
-              delete dev;
-          }
-          else {
-              dev->registerDevice();
-          }
+    NullDevice* dev = new NullDevice();
+    if (nullptr != dev) {
+      if (!dev->create(revision, ipLevel, xNACKSupported)) {
+        delete dev;
+      } else {
+        dev->registerDevice();
      }
+    }
  }

  return true;
@@ -257,10 +254,10 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
  if ((GPU_ENABLE_PAL == 1) && (ipLevel == Pal::GfxIpLevel::_None)) {
    hwInfo_ = &DeviceInfo[static_cast<uint>(asicRevision)];
  } else if (ipLevel >= Pal::GfxIpLevel::GfxIp9) {
-      subtarget = (static_cast<uint>(asicRevision_) %
-                   static_cast<uint>(Pal::AsicRevision::Vega10))
-                   << 1 | xNACKSupported;
-      hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
+    subtarget = (static_cast<uint>(asicRevision_) % static_cast<uint>(Pal::AsicRevision::Vega10))
+            << 1 |
+        xNACKSupported;
+    hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];

  } else {
    return false;
@@ -271,8 +268,7 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,

  // Report 512MB for all offline devices
  Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount];
-  heaps[Pal::GpuHeapLocal].heapSize =
-  heaps[Pal::GpuHeapLocal].physicalHeapSize = 512 * Mi;
+  heaps[Pal::GpuHeapLocal].heapSize = heaps[Pal::GpuHeapLocal].physicalHeapSize = 512 * Mi;

  Pal::WorkStationCaps wscaps = {};

@@ -295,7 +291,7 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
  info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32 : 64;

  if (settings().useLightning_) {
-#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
+#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
    //  create compilation object with cache support
    int gfxipMajor = hwInfo_->gfxipVersionLC_ / 100;
    int gfxipMinor = hwInfo_->gfxipVersionLC_ / 10 % 10;
@@ -323,16 +319,16 @@ bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
    cacheCompilation_.reset(compObj);
 #endif
  } else {
-#if  defined(WITH_COMPILER_LIB)
+#if defined(WITH_COMPILER_LIB)
    const char* library = getenv("HSA_COMPILER_LIBRARY");
-    aclCompilerOptions opts = { sizeof(aclCompilerOptions_0_8),
-      library,
-      nullptr,
-      nullptr,
-      nullptr,
-      nullptr,
-      nullptr,
-      AMD_OCL_SC_LIB };
+    aclCompilerOptions opts = {sizeof(aclCompilerOptions_0_8),
+                               library,
+                               nullptr,
+                               nullptr,
+                               nullptr,
+                               nullptr,
+                               nullptr,
+                               AMD_OCL_SC_LIB};
    // Initialize the compiler handle
    acl_error error;
    compiler_ = aclCompilerInit(&opts, &error);
@@ -370,9 +366,9 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,

  info_.maxWorkItemDimensions_ = 3;

-  info_.maxComputeUnits_ = settings().enableWgpMode_ ?
-    palProp.gfxipProperties.shaderCore.numAvailableCus / 2 :
-    palProp.gfxipProperties.shaderCore.numAvailableCus;
+  info_.maxComputeUnits_ = settings().enableWgpMode_
+      ? palProp.gfxipProperties.shaderCore.numAvailableCus / 2
+      : palProp.gfxipProperties.shaderCore.numAvailableCus;

  info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines;

@@ -427,7 +423,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
  if (GPU_ADD_HBCC_SIZE) {
    localRAM = heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize;
  } else {
-    localRAM = heaps[Pal::GpuHeapLocal].physicalHeapSize + heaps[Pal::GpuHeapInvisible].physicalHeapSize;
+    localRAM =
+        heaps[Pal::GpuHeapLocal].physicalHeapSize + heaps[Pal::GpuHeapInvisible].physicalHeapSize;
  }

  info_.globalMemSize_ = (static_cast<cl_ulong>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
@@ -445,10 +442,10 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
  // Find the largest heap form FB memory
  if (GPU_ADD_HBCC_SIZE) {
    info_.maxMemAllocSize_ = std::max(cl_ulong(heaps[Pal::GpuHeapLocal].heapSize),
-      cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize));
+                                      cl_ulong(heaps[Pal::GpuHeapInvisible].heapSize));
  } else {
    info_.maxMemAllocSize_ = std::max(cl_ulong(heaps[Pal::GpuHeapLocal].physicalHeapSize),
-      cl_ulong(heaps[Pal::GpuHeapInvisible].physicalHeapSize));
+                                      cl_ulong(heaps[Pal::GpuHeapInvisible].physicalHeapSize));
  }

 #if defined(ATI_OS_WIN)
@@ -561,7 +558,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,

  ::strcpy(info_.vendor_, "Advanced Micro Devices, Inc.");
  ::snprintf(info_.driverVersion_, sizeof(info_.driverVersion_) - 1, AMD_BUILD_STRING " (PAL%s)",
-        settings().useLightning_ ? ",LC" : ",HSAIL");
+             settings().useLightning_ ? ",LC" : ",HSAIL");

  info_.profile_ = "FULL_PROFILE";
  if (settings().oclVersion_ >= OpenCL20) {
@@ -640,15 +637,16 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
    info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray;
    info_.simdWidth_ = hwInfo()->simdWidth_;
    info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
-    info_.wavefrontWidth_ = settings().enableWave32Mode_ ? 32:
-                            palProp.gfxipProperties.shaderCore.nativeWavefrontSize;
+    info_.wavefrontWidth_ =
+        settings().enableWave32Mode_ ? 32 : palProp.gfxipProperties.shaderCore.nativeWavefrontSize;
    info_.availableSGPRs_ = palProp.gfxipProperties.shaderCore.numAvailableSgprs;

    info_.globalMemChannelBanks_ = 4;
    info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_;
    info_.localMemSizePerCU_ = hwInfo()->localMemSizePerCU_;
    info_.localMemBanks_ = hwInfo()->localMemBanks_;
-    info_.gfxipVersion_ = settings().useLightning_ ? hwInfo()->gfxipVersionLC_ : hwInfo()->gfxipVersion_;
+    info_.gfxipVersion_ =
+        settings().useLightning_ ? hwInfo()->gfxipVersionLC_ : hwInfo()->gfxipVersion_;

    info_.timeStampFrequency_ = 1000000;
    info_.numAsyncQueues_ = numComputeRings;
@@ -661,7 +659,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
    info_.pcieDeviceId_ = palProp.deviceId;
    info_.pcieRevisionId_ = palProp.revisionId;
    info_.maxThreadsPerCU_ = info_.wavefrontWidth_ * hwInfo()->simdPerCU_ *
-                             palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd;
+        palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd;
  }
 }

@@ -789,8 +787,7 @@ Device::Device()
      globalScratchBuf_(nullptr),
      srdManager_(nullptr),
      resourceList_(nullptr),
-      rgpCaptureMgr_(nullptr)
-      {}
+      rgpCaptureMgr_(nullptr) {}

 Device::~Device() {
  // remove the HW debug manager
@@ -803,8 +800,8 @@ Device::~Device() {
  }

  if (glb_ctx_ != nullptr) {
-      glb_ctx_->release();
-      glb_ctx_ = nullptr;
+    glb_ctx_->release();
+    glb_ctx_ = nullptr;
  }

  delete srdManager_;
@@ -878,19 +875,21 @@ bool Device::create(Pal::IDevice* device) {
  ipLevel_ = properties().gfxLevel;
  asicRevision_ = properties().revision;

-   // XNACK flag should be set for  PageMigration | IOMMUv2 Support
-  uint isXNACKSupported = static_cast<uint>(properties_.gpuMemoryProperties.flags.pageMigrationEnabled
-      || properties_.gpuMemoryProperties.flags.iommuv2Support);
+  // XNACK flag should be set for  PageMigration | IOMMUv2 Support
+  uint isXNACKSupported =
+      static_cast<uint>(properties_.gpuMemoryProperties.flags.pageMigrationEnabled ||
+                        properties_.gpuMemoryProperties.flags.iommuv2Support);
  uint subtarget = isXNACKSupported;

  // Update HW info for the device
  if ((GPU_ENABLE_PAL == 1) && (properties().revision <= Pal::AsicRevision::Polaris12)) {
    hwInfo_ = &DeviceInfo[static_cast<uint>(properties().revision)];
  } else if (ipLevel_ >= Pal::GfxIpLevel::GfxIp9) {
-      // For compiler sub targets
-      subtarget = (static_cast<uint>(asicRevision_) % static_cast<uint>(Pal::AsicRevision::Vega10)) << 1 |
-          subtarget;
-      hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
+    // For compiler sub targets
+    subtarget = (static_cast<uint>(asicRevision_) % static_cast<uint>(Pal::AsicRevision::Vega10))
+            << 1 |
+        subtarget;
+    hwInfo_ = &Gfx9PlusSubDeviceInfo[subtarget];
  } else {
    return false;
  }
@@ -995,7 +994,7 @@ bool Device::create(Pal::IDevice* device) {
  }

  if (settings().useLightning_) {
-#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
+#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
    //  create compilation object with cache support
    int gfxipMajor = hwInfo()->gfxipVersionLC_ / 100;
    int gfxipMinor = hwInfo()->gfxipVersionLC_ / 10 % 10;
@@ -1013,7 +1012,7 @@ bool Device::create(Pal::IDevice* device) {
    }

    amd::CacheCompilation* compObj = new amd::CacheCompilation(
-      cacheTarget.str(), "_pal", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET);
+        cacheTarget.str(), "_pal", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET);
    if (!compObj) {
      LogError("Unable to create cache compilation object!");
      return false;
@@ -1021,18 +1020,17 @@ bool Device::create(Pal::IDevice* device) {

    cacheCompilation_.reset(compObj);
 #endif
-  }
-  else {
-#if  defined(WITH_COMPILER_LIB)
+  } else {
+#if defined(WITH_COMPILER_LIB)
    const char* library = getenv("HSA_COMPILER_LIBRARY");
-    aclCompilerOptions opts = { sizeof(aclCompilerOptions_0_8),
-      library,
-      nullptr,
-      nullptr,
-      nullptr,
-      nullptr,
-      nullptr,
-      AMD_OCL_SC_LIB };
+    aclCompilerOptions opts = {sizeof(aclCompilerOptions_0_8),
+                               library,
+                               nullptr,
+                               nullptr,
+                               nullptr,
+                               nullptr,
+                               nullptr,
+                               AMD_OCL_SC_LIB};
    // Initialize the compiler handle
    acl_error error;
    compiler_ = aclCompilerInit(&opts, &error);
@@ -1056,7 +1054,7 @@ bool Device::create(Pal::IDevice* device) {

  if ((glb_ctx_ == nullptr) && (gNumDevices > 1) && (device == gDeviceList[gNumDevices - 1])) {
    std::vector<amd::Device*> devices;
-    uint32_t numDevices =  amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true);
+    uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true);
    // Add all PAL devices
    for (uint32_t i = gStartDevice; i < numDevices; ++i) {
      devices.push_back(amd::Device::devices()[i]);
@@ -1070,8 +1068,8 @@ bool Device::create(Pal::IDevice* device) {
      if (glb_ctx_ == nullptr) {
        return false;
      }
-      amd::Buffer* buf = 
-        new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
+      amd::Buffer* buf =
+          new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize);
      if ((buf != nullptr) && buf->create()) {
        p2p_stage_ = buf;
      } else {
@@ -1086,11 +1084,8 @@ bool Device::create(Pal::IDevice* device) {

 // =====================================================================================================================
 // Master function that handles developer callbacks from PAL.
-void PAL_STDCALL Device::PalDeveloperCallback(
-  void*                        pPrivateData,
-  const Pal::uint32            deviceIndex,
-  Pal::Developer::CallbackType type,
-  void*                        pCbData) {
+void PAL_STDCALL Device::PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex,
+                                              Pal::Developer::CallbackType type, void* pCbData) {
  Device* device = static_cast<Device*>(pPrivateData);
  const auto& barrier = *static_cast<const Pal::Developer::BarrierData*>(pCbData);

@@ -1099,7 +1094,7 @@ void PAL_STDCALL Device::PalDeveloperCallback(
  VirtualGPU* gpu = nullptr;
  if (pBarrierData->pCmdBuffer != nullptr) {
    // Find which queue the current command buffer belongs
-    for (const auto& it: device->vgpus()) {
+    for (const auto& it : device->vgpus()) {
      if (it->isActiveCmd(pBarrierData->pCmdBuffer)) {
        gpu = it;
        break;
@@ -1112,18 +1107,18 @@ void PAL_STDCALL Device::PalDeveloperCallback(
  }

  switch (type) {
-  case Pal::Developer::CallbackType::BarrierBegin:
-    device->rgpCaptureMgr()->WriteBarrierStartMarker(gpu, barrier);
-  break;
-  case Pal::Developer::CallbackType::BarrierEnd:
-    device->rgpCaptureMgr()->WriteBarrierEndMarker(gpu, barrier);
-  break;
-  case Pal::Developer::CallbackType::ImageBarrier:
-    assert(false);
-    break;
-  case Pal::Developer::CallbackType::DrawDispatch:
+    case Pal::Developer::CallbackType::BarrierBegin:
+      device->rgpCaptureMgr()->WriteBarrierStartMarker(gpu, barrier);
      break;
-  default:
+    case Pal::Developer::CallbackType::BarrierEnd:
+      device->rgpCaptureMgr()->WriteBarrierEndMarker(gpu, barrier);
+      break;
+    case Pal::Developer::CallbackType::ImageBarrier:
+      assert(false);
+      break;
+    case Pal::Developer::CallbackType::DrawDispatch:
+      break;
+    default:
      break;
  }
 }
@@ -1136,15 +1131,16 @@ bool Device::initializeHeapResources() {
    // Request all compute engines
    finalizeInfo.requestedEngineCounts[Pal::EngineTypeCompute].engines =
        ((1 << numComputeEngines_) - 1);
-    for (const auto& it: exclusiveComputeEnginesId_) {
+    for (const auto& it : exclusiveComputeEnginesId_) {
      // Request real time compute engines
-      finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines |= (1 << it.second);
+      finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines |=
+          (1 << it.second);
    }
    // Request all SDMA engines
    finalizeInfo.requestedEngineCounts[Pal::EngineTypeDma].engines = (1 << numDmaEngines_) - 1;

    if (iDev()->Finalize(finalizeInfo) != Pal::Result::Success) {
-        return false;
+      return false;
    }

    heapInitComplete_ = true;
@@ -1201,7 +1197,8 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
  if (queue != nullptr) {
    profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
    if (queue->asHostQueue() != nullptr) {
-      bool interopQueue = (0 != (queue->context().info().flags_ &
+      bool interopQueue = (0 !=
+                           (queue->context().info().flags_ &
                            (amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr |
                             amd::Context::D3D11DeviceKhr)));
      rtCUs = queue->rtCUs();
@@ -1233,8 +1230,7 @@ device::Program* Device::createProgram(amd::option::Options* options) {
  device::Program* program;
  if (settings().useLightning_) {
    program = new LightningProgram(*this);
-  }
-  else {
+  } else {
    program = new HSAILProgram(*this);
  }
  if (program == nullptr) {
@@ -1249,9 +1245,7 @@ typedef std::unordered_map<int, bool> requestedDevices_t;

 //! Parses the requested list of devices to be exposed to the user.
 static void parseRequestedDeviceList(const char* requestedDeviceList,
-                                     requestedDevices_t& requestedDevices,
-                                     uint32_t numDevices) {
-
+                                     requestedDevices_t& requestedDevices, uint32_t numDevices) {
  char* pch = strtok(const_cast<char*>(requestedDeviceList), ",");
  while (pch != nullptr) {
    bool deviceIdValid = true;
@@ -1263,8 +1257,7 @@ static void parseRequestedDeviceList(const char* requestedDeviceList,
        break;
      }
    }
-    if (currentDeviceIndex < 0 ||
-      static_cast<uint32_t>(currentDeviceIndex) >= numDevices) {
+    if (currentDeviceIndex < 0 || static_cast<uint32_t>(currentDeviceIndex) >= numDevices) {
      deviceIdValid = false;
    }
    // Get next token.
@@ -1310,9 +1303,9 @@ bool Device::init() {
  // Count up all the devices in the system.
  platform_->EnumerateDevices(&gNumDevices, &gDeviceList[0]);

-  const char* requestedDeviceList = amd::IS_HIP ? ((HIP_VISIBLE_DEVICES[0] != '\0') ?
-                                    HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
-                                    : GPU_DEVICE_ORDINAL;
+  const char* requestedDeviceList = amd::IS_HIP
+      ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
+      : GPU_DEVICE_ORDINAL;

  if (requestedDeviceList[0] != '\0') {
    useDeviceList = true;
@@ -1465,8 +1458,8 @@ pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const {
    if (result) {
      // Disallow permanent map for Win7 only, since OS will move buffer to sysmem
      if (IS_LINUX ||
-        // Or Win10
-        (properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs == false)) {
+          // Or Win10
+          (properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs == false)) {
        void* address = gpuMemory->map(nullptr);
        CondLog(address == nullptr, "PAL failed lock of persistent memory!");
      }
@@ -1697,9 +1690,9 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
      (memory->memoryType() != Resource::ExternalPhysical) &&
      ((owner.getHostMem() != nullptr) ||
       ((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) {
-    bool ok = memory->pinSystemMemory(owner.getHostMem(), (owner.getHostMemRef()->size())
-                                          ? owner.getHostMemRef()->size()
-                                          : owner.getSize());
+    bool ok = memory->pinSystemMemory(
+        owner.getHostMem(),
+        (owner.getHostMemRef()->size()) ? owner.getHostMemRef()->size() : owner.getSize());
    //! \note: Ignore the pinning result for now
  }

@@ -1720,9 +1713,9 @@ bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler)
 device::Memory* Device::createView(amd::Memory& owner, const device::Memory& parent) const {
  assert((owner.asImage() != nullptr) && "View supports images only");
  const amd::Image& image = *owner.asImage();
-  pal::Memory* gpuImage = new pal::Image(
-    *this, owner, image.getWidth(), image.getHeight(), image.getDepth(),
-    image.getImageFormat(), image.getType(), image.getMipLevels());
+  pal::Memory* gpuImage =
+      new pal::Image(*this, owner, image.getWidth(), image.getHeight(), image.getDepth(),
+                     image.getImageFormat(), image.getType(), image.getMipLevels());

  // Create resource
  if (nullptr != gpuImage) {
@@ -1827,19 +1820,18 @@ bool Device::globalFreeMemory(size_t* freeMemory) const {
  Pal::gpusize invisible = allocedMem[Pal::GpuHeapInvisible] - resourceCache().lclCacheSize();

  // Fill free memory info
-  freeMemory[TotalFreeMemory] = static_cast<size_t>((info().globalMemSize_ -
-    (local + invisible)) / Ki);
+  freeMemory[TotalFreeMemory] =
+      static_cast<size_t>((info().globalMemSize_ - (local + invisible)) / Ki);
  if (invisible >= heaps_[Pal::GpuHeapInvisible].heapSize) {
    invisible = 0;
-  }
-  else {
+  } else {
    invisible = heaps_[Pal::GpuHeapInvisible].heapSize - invisible;
  }
  freeMemory[LargestFreeBlock] = static_cast<size_t>(invisible) / Ki;

  if (settings().apuSystem_) {
    Pal::gpusize sysMem = allocedMem[Pal::GpuHeapGartCacheable] + allocedMem[Pal::GpuHeapGartUswc] -
-      resourceCache().cacheSize() + resourceCache().lclCacheSize();
+        resourceCache().cacheSize() + resourceCache().lclCacheSize();
    sysMem /= Ki;
    if (sysMem >= freeMemory[TotalFreeMemory]) {
      freeMemory[TotalFreeMemory] = 0;
@@ -1945,8 +1937,7 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu) {
    amd::ScopedLock lk(scratchAlloc_);
    uint sb = vgpu->hwRing();
    static const uint WaveSizeLimit = ((1 << 21) - 256);
-    const uint threadSizeLimit =
-        WaveSizeLimit / info().wavefrontWidth_;
+    const uint threadSizeLimit = WaveSizeLimit / info().wavefrontWidth_;
    if (regNum > threadSizeLimit) {
      LogError("Requested private memory is bigger than HW supports!");
      regNum = threadSizeLimit;
@@ -1968,9 +1959,8 @@ bool Device::allocScratch(uint regNum, const VirtualGPU* vgpu) {
          // Calculate the size of the scratch buffer for a queue
          uint32_t numTotalCUs = info().maxComputeUnits_;
          uint32_t numMaxWaves = settings().numScratchWavesPerCu_ * numTotalCUs;
-          scratchBuf->size_ =
-              static_cast<uint64_t>(info().wavefrontWidth_) *
-              scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t);
+          scratchBuf->size_ = static_cast<uint64_t>(info().wavefrontWidth_) * scratchBuf->regNum_ *
+              numMaxWaves * sizeof(uint32_t);
          scratchBuf->size_ = std::min(scratchBuf->size_, info().maxMemAllocSize_);
          scratchBuf->size_ = std::min(scratchBuf->size_, uint64_t(3 * Gi));
          // Note: Generic address space setup in HW requires 64KB alignment for scratch
@@ -2280,7 +2270,7 @@ void Device::SrdManager::freeSrdSlot(uint64_t addr) {
 void Device::updateAllocedMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free) const {
  if (free) {
    allocedMem[heap] -= size;
-  }  else {
+  } else {
    allocedMem[heap] += size;
  }
 }
@@ -2337,12 +2327,18 @@ cl_int Device::hwDebugManagerInit(amd::Context* context, uintptr_t messageStorag
  return status;
 }

-bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
+bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
+                          cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
  bool result = false;
  Pal::SetClockModeInput setClockMode = {};
-  Pal::DeviceClockMode palClockMode = static_cast<Pal::DeviceClockMode>(setClockModeInput.clock_mode);
+  Pal::DeviceClockMode palClockMode =
+      static_cast<Pal::DeviceClockMode>(setClockModeInput.clock_mode);
  setClockMode.clockMode = palClockMode;
-  result = (Pal::Result::Success == (iDev()->SetClockMode(setClockMode, reinterpret_cast<Pal::SetClockModeOutput*>(pSetClockModeOutput))))? true : false;
+  result = (Pal::Result::Success ==
+            (iDev()->SetClockMode(setClockMode,
+                                  reinterpret_cast<Pal::SetClockModeOutput*>(pSetClockModeOutput))))
+      ? true
+      : false;
  return result;
 }

@@ -49,7 +49,7 @@ class NullDevice : public amd::Device {
  bool create(Pal::AsicRevision asicRevision,  //!< GPU ASIC revision
              Pal::GfxIpLevel ipLevel,         //!< GPU ip level
              uint xNACKSupported = 0          //!< GPU xNACKSupported
-             );
+  );

  //! Instantiate a new virtual device
  virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = NULL) {
@@ -111,11 +111,14 @@ class NullDevice : public amd::Device {
  virtual void svmFree(void* ptr) const { return; }

  void* Alloc(const Util::AllocInfo& allocInfo) { return allocator_.Alloc(allocInfo); }
-  void  Free(const Util::FreeInfo& freeInfo) { allocator_.Free(freeInfo); }
-  virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; }
+  void Free(const Util::FreeInfo& freeInfo) { allocator_.Free(freeInfo); }
+  virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
+                            cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
+    return true;
+  }

 protected:
-  static Util::GenericAllocator allocator_; //!< Generic memory allocator in PAL
+  static Util::GenericAllocator allocator_;  //!< Generic memory allocator in PAL

  Pal::AsicRevision asicRevision_;  //!< ASIC revision
  Pal::GfxIpLevel ipLevel_;         //!< Device IP level
@@ -127,7 +130,7 @@ class NullDevice : public amd::Device {
                      size_t maxTextureSize,         //!< Maximum texture size supported in HW
                      uint numComputeRings,          //!< Number of compute rings
                      uint numExclusiveComputeRings  //!< Number of exclusive compute rings
-                      );
+  );
 };

 //! Forward declarations
@@ -148,26 +151,22 @@ class ThreadTrace;
 #ifndef CL_FILTER_NONE
 #define CL_FILTER_NONE 0x1142
 #endif
-enum class ExclusiveQueueType : uint32_t {
-  RealTime0 = 0,
-  RealTime1,
-  Medium
-};
+enum class ExclusiveQueueType : uint32_t { RealTime0 = 0, RealTime1, Medium };
 class Sampler : public device::Sampler {
 public:
  //! Constructor
-    Sampler(const Device& dev) : dev_(dev) {}
+  Sampler(const Device& dev) : dev_(dev) {}

  //! Default destructor for the device memory object
  virtual ~Sampler();

  //! Creates a device sampler from the OCL sampler state
  bool create(uint32_t oclSamplerState  //!< OCL sampler state
-              );
+  );

  //! Creates a device sampler from the OCL sampler state
  bool create(const amd::Sampler& owner  //!< AMD sampler object
-              );
+  );

 private:
  //! Disable default copy constructor
@@ -216,7 +215,7 @@ class Device : public NullDevice {
    //! Releases transfer buffer
    void release(VirtualGPU& gpu,  //!< Virual GPU object used with the buffer
                 Memory& buffer    //!< Transfer buffer for release
-                 );
+    );

    //! Returns the buffer's size for transfer
    size_t bufSize() const { return bufSize_; }
@@ -308,7 +307,7 @@ class Device : public NullDevice {
  //! Initialise a device (i.e. all parts of the constructor that could
  //! potentially fail)
  bool create(Pal::IDevice* device  //!< PAL device interface object
-              );
+  );

  //! Destructor for the physical GPU device
  virtual ~Device();
@@ -346,7 +345,8 @@ class Device : public NullDevice {
  virtual bool validateKernel(const amd::Kernel& kernel,  //!< AMD kernel object
                              const device::VirtualDevice* vdev);

-  virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
+  virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
+                            cl_set_device_clock_mode_output_amd* pSetClockModeOutput);

  //! Retrieves information about free memory on a GPU device
  virtual bool globalFreeMemory(size_t* freeMemory) const;
@@ -398,9 +398,10 @@ class Device : public NullDevice {
  //! Returns the number of available compute rings
  uint numExclusiveComputeEngines() const { return exclusiveComputeEnginesId_.size(); }

-  //! Returns the map of available exclusive compute rings with the engine index 
-  const std::map<ExclusiveQueueType, uint32_t>& exclusiveComputeEnginesId() const
-    { return exclusiveComputeEnginesId_; }
+  //! Returns the map of available exclusive compute rings with the engine index
+  const std::map<ExclusiveQueueType, uint32_t>& exclusiveComputeEnginesId() const {
+    return exclusiveComputeEnginesId_;
+  }

  //! Returns the number of available DMA engines
  uint numDMAEngines() const { return numDmaEngines_; }
@@ -526,11 +527,8 @@ class Device : public NullDevice {
  }

 private:
-  static void PAL_STDCALL PalDeveloperCallback(
-    void*                        pPrivateData,
-    const Pal::uint32            deviceIndex,
-    Pal::Developer::CallbackType type,
-    void*                        pCbData);
+  static void PAL_STDCALL PalDeveloperCallback(void* pPrivateData, const Pal::uint32 deviceIndex,
+                                               Pal::Developer::CallbackType type, void* pCbData);

  //! Disable copy constructor
  Device(const Device&);
@@ -554,36 +552,37 @@ class Device : public NullDevice {
  //! Allocates/reallocates the scratch buffer, according to the usage
  bool allocScratch(uint regNum,            //!< Number of the scratch registers
                    const VirtualGPU* vgpu  //!< Virtual GPU for the allocation
-                    );
+  );

  //! Interop for D3D devices
  bool associateD3D11Device(void* d3d11Device  //!< void* is of type ID3D11Device*
-                            );
+  );
  bool associateD3D10Device(void* d3d10Device  //!< void* is of type ID3D10Device*
-                            );
+  );
  bool associateD3D9Device(void* d3d9Device  //!< void* is of type IDirect3DDevice9*
-                           );
+  );
  //! Interop for GL device
  bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const;
  bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const;

-  static char* platformObj_;          //!< Memory allocated for PAL platform object
-  static Pal::IPlatform*  platform_;  //!< Pointer to the PAL platform object
+  static char* platformObj_;         //!< Memory allocated for PAL platform object
+  static Pal::IPlatform* platform_;  //!< Pointer to the PAL platform object

-  amd::Context* context_;       //!< A dummy context for internal allocations
-  mutable amd::Monitor lockAsyncOps_;    //!< Lock to serialise all async ops on this device
+  amd::Context* context_;              //!< A dummy context for internal allocations
+  mutable amd::Monitor lockAsyncOps_;  //!< Lock to serialise all async ops on this device
  //! Lock to serialise all async ops on initialization heap operation
-  mutable amd::Monitor lockForInitHeap_;        
-  mutable amd::Monitor lockPAL_;         //!< Lock to serialise PAL access
-  mutable amd::Monitor vgpusAccess_;     //!< Lock to serialise virtual gpu list access
-  mutable amd::Monitor scratchAlloc_;    //!< Lock to serialise scratch allocation
-  mutable amd::Monitor mapCacheOps_;     //!< Lock to serialise cache for the map resources
-  mutable amd::Monitor lockResourceOps_; //!< Lock to serialise resource access
-  XferBuffers* xferRead_;                //!< Transfer buffers read
-  std::vector<amd::Memory*>* mapCache_;  //!< Map cache info structure
-  ResourceCache* resourceCache_;         //!< Resource cache
-  uint numComputeEngines_;               //!< The number of available compute engines
-  std::map<ExclusiveQueueType, uint32_t> exclusiveComputeEnginesId_;//!< The number of available compute engines
+  mutable amd::Monitor lockForInitHeap_;
+  mutable amd::Monitor lockPAL_;          //!< Lock to serialise PAL access
+  mutable amd::Monitor vgpusAccess_;      //!< Lock to serialise virtual gpu list access
+  mutable amd::Monitor scratchAlloc_;     //!< Lock to serialise scratch allocation
+  mutable amd::Monitor mapCacheOps_;      //!< Lock to serialise cache for the map resources
+  mutable amd::Monitor lockResourceOps_;  //!< Lock to serialise resource access
+  XferBuffers* xferRead_;                 //!< Transfer buffers read
+  std::vector<amd::Memory*>* mapCache_;   //!< Map cache info structure
+  ResourceCache* resourceCache_;          //!< Resource cache
+  uint numComputeEngines_;                //!< The number of available compute engines
+  std::map<ExclusiveQueueType, uint32_t>
+      exclusiveComputeEnginesId_;        //!< Exclusive compute queue type -> engine index
  uint numDmaEngines_;                   //!< The number of available compute engines
  bool heapInitComplete_;                //!< Keep track of initialization status of heap resources
  VirtualGPU* xferQueue_;                //!< Transfer queue
@@ -594,10 +593,13 @@ class Device : public NullDevice {
  mutable bool freeCPUMem_;              //!< flag to mark GPU free SVM CPU mem
  Pal::DeviceProperties properties_;     //!< PAL device properties
  Pal::IDevice* device_;                 //!< PAL device object
-  mutable std::atomic<Pal::gpusize> allocedMem[Pal::GpuHeap::GpuHeapCount];  //!< Free memory counter
-  std::unordered_set<Resource*>* resourceList_;   //!< Active resource list
-  RgpCaptureMgr*   rgpCaptureMgr_;       //!< RGP capture manager
-  Pal::GpuMemoryHeapProperties heaps_[Pal::GpuHeapCount]; //!< Information about heaps, returned from PAL
+  mutable std::atomic<Pal::gpusize>
+      allocedMem[Pal::GpuHeap::GpuHeapCount];    //!< Allocated memory counter, per heap
+  std::unordered_set<Resource*>* resourceList_;  //!< Active resource list
+  RgpCaptureMgr* rgpCaptureMgr_;                 //!< RGP capture manager
+  Pal::GpuMemoryHeapProperties
+      heaps_[Pal::GpuHeapCount];  //!< Information about heaps, returned from PAL
 };

-/*@}*/} // namespace pal
+/*@}*/
+}  // namespace pal
@@ -3,19 +3,19 @@
 #if defined(ATI_OS_LINUX)
 namespace pal {
 bool Device::associateD3D10Device(void* d3d10Device) { return false; }
-}  // pal
+}  // namespace pal
 #else  // !ATI_OS_WIN

 #include <D3D10_1.h>

 /**************************************************************************************************************
-* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
-* This means OCL client spec will need to change to include headers directly from the DXX perforce
-*tree.
-* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
-* without notification. So it is safe to use a local copy of the relevant DXX extension interface
-*classes.
-**************************************************************************************************************/
+ * Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
+ * This means OCL client spec will need to change to include headers directly from the DXX perforce
+ * tree.
+ * However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
+ * without notification. So it is safe to use a local copy of the relevant DXX extension interface
+ * classes.
+ **************************************************************************************************************/
 #include "DxxOpenCLInteropExt.h"

 namespace pal {
@@ -127,6 +127,6 @@ bool Device::associateD3D10Device(void* d3d10Device) {
  return canInteroperate;
 }

-}  // pal
+}  // namespace pal

 #endif  // !ATI_OS_WIN
@@ -3,19 +3,19 @@
 #if defined(ATI_OS_LINUX)
 namespace pal {
 bool Device::associateD3D11Device(void* d3d11Device) { return false; }
-}
+}  // namespace pal
 #else  // !ATI_OS_LINUX

 #include <D3D11.h>

 /**************************************************************************************************************
-* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
-* This means OCL client spec will need to change to include headers directly from the DXX perforce
-*tree.
-* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
-* without notification. So it is safe to use a local copy of the relevant DXX extension interface
-*classes.
-**************************************************************************************************************/
+ * Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
+ * This means OCL client spec will need to change to include headers directly from the DXX perforce
+ * tree.
+ * However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
+ * without notification. So it is safe to use a local copy of the relevant DXX extension interface
+ * classes.
+ **************************************************************************************************************/
 #include "DxxOpenCLInteropExt.h"

 namespace pal {
@@ -128,6 +128,6 @@ bool Device::associateD3D11Device(void* d3d11Device) {
  return canInteroperate;
 }

-}  // pal
+}  // namespace pal

 #endif  // !ATI_OS_LINUX
@@ -3,20 +3,20 @@
 #if defined(ATI_OS_LINUX)
 namespace pal {
 bool Device::associateD3D9Device(void* d3dDevice) { return false; }
-}
+}  // namespace pal
 #else  // !ATI_OS_LINUX

 #include <d3d9.h>
 #include <dxgi.h>

 /**************************************************************************************************************
-* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
-* This means OCL client spec will need to change to include headers directly from the DXX perforce
-*tree.
-* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
-* without notification. So it is safe to use a local copy of the relevant DXX extension interface
-*classes.
-**************************************************************************************************************/
+ * Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
+ * This means OCL client spec will need to change to include headers directly from the DXX perforce
+ * tree.
+ * However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
+ * without notification. So it is safe to use a local copy of the relevant DXX extension interface
+ * classes.
+ **************************************************************************************************************/
 #include "DxxOpenCLInteropExt.h"

 namespace pal {
@@ -44,5 +44,5 @@ bool Device::associateD3D9Device(void* d3d9Device) {
  return canInteroperate;
 }

-}  // pal
+}  // namespace pal
 #endif  // !ATI_OS_WIN
@@ -32,34 +32,27 @@
 #include "protocols/rgpServer.h"
 #include "protocols/driverControlServer.h"

-namespace pal
-{
+namespace pal {
 // ================================================================================================
 RgpCaptureMgr::RgpCaptureMgr(Pal::IPlatform* platform, const Device& device)
-  :
-  device_(device),
-  dev_driver_server_(platform->GetDevDriverServer()),
-  user_event_(nullptr),
-  num_prep_disp_(0),
-  max_sqtt_disp_(device_.settings().rgpSqttDispCount_),
-  trace_gpu_mem_limit_(0),
-  global_disp_count_(1),      // Must start from 1 according to RGP spec
-  trace_enabled_(false),
-  inst_tracing_enabled_(false)
-{
+    : device_(device),
+      dev_driver_server_(platform->GetDevDriverServer()),
+      user_event_(nullptr),
+      num_prep_disp_(0),
+      max_sqtt_disp_(device_.settings().rgpSqttDispCount_),
+      trace_gpu_mem_limit_(0),
+      global_disp_count_(1),  // Must start from 1 according to RGP spec
+      trace_enabled_(false),
+      inst_tracing_enabled_(false) {
  memset(&trace_, 0, sizeof(trace_));
 }

 // ================================================================================================
-RgpCaptureMgr::~RgpCaptureMgr()
-{
-  DestroyRGPTracing();
-}
+RgpCaptureMgr::~RgpCaptureMgr() { DestroyRGPTracing(); }

 // ================================================================================================
 // Creates the GPU Open Developer Mode manager class.
-RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device)
-{
+RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& device) {
  RgpCaptureMgr* mgr = new RgpCaptureMgr(platform, device);

  if (mgr != nullptr && !mgr->Init(platform)) {
@@ -71,8 +64,7 @@ RgpCaptureMgr* RgpCaptureMgr::Create(Pal::IPlatform* platform, const Device& dev
 }

 // ================================================================================================
-bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
-{
+bool RgpCaptureMgr::Init(Pal::IPlatform* platform) {
  if (dev_driver_server_ == nullptr) {
    return false;
  }
@@ -105,13 +97,11 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)

    const uint32_t api_version = settings.oclVersion_;

-    trace_.gpa_session_ = new GpuUtil::GpaSession(
-        platform,
-        device_.iDev(),
-        api_version >> 4,   // OCL API version major
-        api_version & 0xf,  // OCL API version minor
-        RgpSqttInstrumentationSpecVersion,
-        RgpSqttInstrumentationApiVersion);
+    trace_.gpa_session_ = new GpuUtil::GpaSession(platform, device_.iDev(),
+                                                  api_version >> 4,   // OCL API version major
+                                                  api_version & 0xf,  // OCL API version minor
+                                                  RgpSqttInstrumentationSpecVersion,
+                                                  RgpSqttInstrumentationApiVersion);

    if (trace_.gpa_session_ == nullptr) {
      result = false;
@@ -119,7 +109,7 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
  }

  // Initialize the GPA session
-  if (result &&  (trace_.gpa_session_->Init() != Pal::Result::Success)) {
+  if (result && (trace_.gpa_session_->Init() != Pal::Result::Success)) {
    result = false;
  }

@@ -133,9 +123,9 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
  if (!result) {
    // If we've failed to initialize tracing, permanently disable traces
    if (rgp_server_ != nullptr) {
-        rgp_server_->DisableTraces();
+      rgp_server_->DisableTraces();

-        trace_enabled_ = false;
+      trace_enabled_ = false;
    }

    // Clean up if we failed
@@ -150,9 +140,8 @@ bool RgpCaptureMgr::Init(Pal::IPlatform* platform)
 // ================================================================================================
 // This function finds out all the queues in the device that we have to synchronize for RGP-traced
 // frames and initializes resources for them.
-bool RgpCaptureMgr::RegisterTimedQueue(
-  uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const
-{
+bool RgpCaptureMgr::RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue,
+                                       bool* debug_vmid) const {
  bool result = true;

  // Get the OS context handle for this queue (this is a thing that RGP needs on DX clients;
@@ -166,8 +155,8 @@ bool RgpCaptureMgr::RegisterTimedQueue(
  *debug_vmid = kernelContextInfo.flags.hasDebugVmid;

  // Register the queue with the GPA session class for timed queue operation support.
-  if (trace_.gpa_session_->RegisterTimedQueue(iQueue, queue_id,
-      kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
+  if (trace_.gpa_session_->RegisterTimedQueue(
+          iQueue, queue_id, kernelContextInfo.contextIdentifier) != Pal::Result::Success) {
    result = false;
  }

@@ -175,11 +164,8 @@ bool RgpCaptureMgr::RegisterTimedQueue(
 }

 // ================================================================================================
-Pal::Result RgpCaptureMgr::TimedQueueSubmit(
-  Pal::IQueue*  queue,
-  uint64_t      cmdId,
-  const Pal::SubmitInfo& submitInfo) const
-{
+Pal::Result RgpCaptureMgr::TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId,
+                                            const Pal::SubmitInfo& submitInfo) const {
  // Fill in extra meta-data information to associate the API command buffer data with
  // the generated timing information.
  GpuUtil::TimedSubmitInfo timedSubmitInfo = {};
@@ -205,8 +191,7 @@ Pal::Result RgpCaptureMgr::TimedQueueSubmit(
 // Called during initial device enumeration prior to calling Pal::IDevice::CommitSettingsAndInit().
 //
 // This finalizes the developer driver manager.
-void RgpCaptureMgr::Finalize()
-{
+void RgpCaptureMgr::Finalize() {
  // Figure out if the gfxip supports tracing.  We decide tracing if there is at least one
  // enumerated GPU that can support tracing.  Since we don't yet know if that GPU will be
  // picked as the target of an eventual VkDevice, this check is imperfect.
@@ -215,8 +200,8 @@ void RgpCaptureMgr::Finalize()
  bool hw_support_tracing = false;

  if ((rgp_server_->EnableTraces() == DevDriver::Result::Success)) {
-   if (GpuSupportsTracing(device_.properties(), device_.settings())) {
-     hw_support_tracing = true;
+    if (GpuSupportsTracing(device_.properties(), device_.settings())) {
+      hw_support_tracing = true;
    }
  }

@@ -234,20 +219,18 @@ void RgpCaptureMgr::Finalize()

 // ================================================================================================
 // Waits for the driver to be resumed if it's currently paused.
-void RgpCaptureMgr::WaitForDriverResume()
-{
-    auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();
+void RgpCaptureMgr::WaitForDriverResume() {
+  auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();

-    assert(pDriverControlServer != nullptr);
+  assert(pDriverControlServer != nullptr);

-    pDriverControlServer->WaitForDriverResume();
+  pDriverControlServer->WaitForDriverResume();
 }

 // ================================================================================================
 // Called before a swap chain presents.  This signals a frame-end boundary and
 // is used to coordinate RGP trace start/stop.
-void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)
-{
+void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu) {
  if (rgp_server_->TracesEnabled()) {
    // If there's currently a trace running, submit the trace-end command buffer
    if (trace_.status_ == TraceStatus::Running) {
@@ -257,8 +240,7 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)
        Pal::Result res = EndRGPHardwareTrace(gpu);
        if (Pal::Result::ErrorIncompatibleQueue == res) {
          // continue until we find the right queue...
-        }
-        else if (Pal::Result::Success == res) {
+        } else if (Pal::Result::Success == res) {
          trace_.sqtt_disp_count_ = 0;
        } else {
          FinishRGPTrace(gpu, true);
@@ -272,43 +254,42 @@ void RgpCaptureMgr::PostDispatch(VirtualGPU* gpu)

      // Currently nothing in the PresentInfo struct is used for inserting a timed present marker.
      GpuUtil::TimedQueuePresentInfo timedPresentInfo = {};
-      //Pal::Result result = trace_.gpa_session_->TimedQueuePresent(pPalQueue, timedPresentInfo);
-      //assert(result == Pal::Result::Success);
+      // Pal::Result result = trace_.gpa_session_->TimedQueuePresent(pPalQueue, timedPresentInfo);
+      // assert(result == Pal::Result::Success);
    }
  }
 }

 // ================================================================================================
-Pal::Result RgpCaptureMgr::CheckForTraceResults()
-{
+Pal::Result RgpCaptureMgr::CheckForTraceResults() {
  assert(trace_.status_ == TraceStatus::WaitingForResults);

  Pal::Result result = Pal::Result::NotReady;

  // Check if trace results are ready
-  if (trace_.gpa_session_->IsReady() && // GPA session is ready
-      (trace_.begin_queue_->isDone(&trace_.end_event_)))   // "Trace end" cmdbuf has retired
+  if (trace_.gpa_session_->IsReady() &&                   // GPA session is ready
+      (trace_.begin_queue_->isDone(&trace_.end_event_)))  // "Trace end" cmdbuf has retired
  {
    bool success = false;

    // Fetch required trace data size from GPA session
    size_t traceDataSize = 0;
-    void* pTraceData     = nullptr;
+    void* pTraceData = nullptr;

    trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, nullptr);

    // Allocate memory for trace data
    if (traceDataSize > 0) {
-        pTraceData = amd::AlignedMemory::allocate(traceDataSize, 256);
+      pTraceData = amd::AlignedMemory::allocate(traceDataSize, 256);
    }

    if (pTraceData != nullptr) {
      // Get trace data from GPA session
      if (trace_.gpa_session_->GetResults(trace_.gpa_sample_id_, &traceDataSize, pTraceData) ==
-        Pal::Result::Success) {
+          Pal::Result::Success) {
        // Transmit trace data to anyone who's listening
-        auto devResult = rgp_server_->WriteTraceData(
-            static_cast<Pal::uint8*>(pTraceData), traceDataSize);
+        auto devResult =
+            rgp_server_->WriteTraceData(static_cast<Pal::uint8*>(pTraceData), traceDataSize);

        success = (devResult == DevDriver::Result::Success);
      }
@@ -317,7 +298,7 @@ Pal::Result RgpCaptureMgr::CheckForTraceResults()
    }

    if (success) {
-        result = Pal::Result::Success;
+      result = Pal::Result::Success;
    }
  }

@@ -327,9 +308,8 @@ Pal::Result RgpCaptureMgr::CheckForTraceResults()
 // ================================================================================================
 // Called after a swap chain presents.  This signals a (next) frame-begin boundary and is
 // used to coordinate RGP trace start/stop.
-void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
-  size_t x, size_t y, size_t z)
-{
+void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel, size_t x, size_t y,
+                                size_t z) {
  // Wait for the driver to be resumed in case it's been paused.
  WaitForDriverResume();

@@ -347,8 +327,7 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
          }
        }
      }
-    }
-    else if (trace_.status_ == TraceStatus::Preparing) {
+    } else if (trace_.status_ == TraceStatus::Preparing) {
      // Wait some number of "preparation frames" before starting the trace in order to get enough
      // timer samples to sync CPU/GPU clock domains.
      trace_.prepared_disp_count_++;
@@ -370,7 +349,7 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
    // Check if we're ending a trace waiting for SQTT to turn off.
    // If SQTT has turned off, end the trace
    else if (trace_.status_ == TraceStatus::WaitingForSqtt) {
-      Pal::Result result      = Pal::Result::Success;
+      Pal::Result result = Pal::Result::Success;

      if (trace_.begin_queue_->isDone(&trace_.end_sqtt_event_)) {
        result = EndRGPTrace(gpu);
@@ -401,14 +380,17 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
      RgpSqttMarkerEventType apiEvent = RgpSqttMarkerEventType::CmdNDRangeKernel;
      if (kernel.prog().isInternal()) {
        constexpr RgpSqttMarkerEventType ApiEvents[KernelBlitManager::BlitTotal] = {
-          RgpSqttMarkerEventType::CmdCopyImage, RgpSqttMarkerEventType::CmdCopyImage,
-          RgpSqttMarkerEventType::CmdCopyImageToBuffer,
-          RgpSqttMarkerEventType::CmdCopyBufferToImage,
-          RgpSqttMarkerEventType::CmdCopyBuffer, RgpSqttMarkerEventType::CmdCopyBuffer,
-          RgpSqttMarkerEventType::CmdCopyBuffer, RgpSqttMarkerEventType::CmdCopyBuffer,
-          RgpSqttMarkerEventType::CmdFillBuffer, RgpSqttMarkerEventType::CmdFillImage,
-          RgpSqttMarkerEventType::CmdScheduler
-        };
+            RgpSqttMarkerEventType::CmdCopyImage,
+            RgpSqttMarkerEventType::CmdCopyImage,
+            RgpSqttMarkerEventType::CmdCopyImageToBuffer,
+            RgpSqttMarkerEventType::CmdCopyBufferToImage,
+            RgpSqttMarkerEventType::CmdCopyBuffer,
+            RgpSqttMarkerEventType::CmdCopyBuffer,
+            RgpSqttMarkerEventType::CmdCopyBuffer,
+            RgpSqttMarkerEventType::CmdCopyBuffer,
+            RgpSqttMarkerEventType::CmdFillBuffer,
+            RgpSqttMarkerEventType::CmdFillImage,
+            RgpSqttMarkerEventType::CmdScheduler};
        for (uint i = 0; i < KernelBlitManager::BlitTotal; ++i) {
          if (kernel.name().compare(BlitName[i]) == 0) {
            apiEvent = ApiEvents[i];
@@ -418,8 +400,8 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,
      }
      WriteUserEventMarker(gpu, RgpSqttMarkerUserEventObjectName, kernel.name());
      // Write disaptch marker
-      WriteEventWithDimsMarker(gpu, apiEvent,
-        static_cast<uint32_t>(x), static_cast<uint32_t>(y), static_cast<uint32_t>(z));
+      WriteEventWithDimsMarker(gpu, apiEvent, static_cast<uint32_t>(x), static_cast<uint32_t>(y),
+                               static_cast<uint32_t>(z));
    }
  }

@@ -428,11 +410,11 @@ void RgpCaptureMgr::PreDispatch(VirtualGPU* gpu, const HSAILKernel& kernel,

 // ================================================================================================
 // This function starts preparing for an RGP trace.  Preparation involves some N frames of
-// lead-up time during which timing samples are accumulated to synchronize CPU and GPU clock domains.
+// lead-up time during which timing samples are accumulated to synchronize CPU and GPU clock
+// domains.
 //
 // This function transitions from the Idle state to the Preparing state.
-Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
-{
+Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu) {
  assert(trace_.status_ == TraceStatus::Idle);

  // We can only trace using a single device at a time currently, so recreate RGP trace
@@ -441,32 +423,32 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)

  const auto traceParameters = rgp_server_->QueryTraceParameters();

-  num_prep_disp_   = traceParameters.captureStartIndex;
+  num_prep_disp_ = traceParameters.captureStartIndex;
  uint32_t capture_disp = traceParameters.captureStopIndex - traceParameters.captureStartIndex;
  // Validate if the captured dispatches are in the range
  if ((capture_disp > 0) && (capture_disp < max_sqtt_disp_)) {
    max_sqtt_disp_ = capture_disp;
  }

-  trace_gpu_mem_limit_  = traceParameters.gpuMemoryLimitInMb * 1024 * 1024;
+  trace_gpu_mem_limit_ = traceParameters.gpuMemoryLimitInMb * 1024 * 1024;
  inst_tracing_enabled_ = traceParameters.flags.enableInstructionTokens;

  // Notify the RGP server that we are starting a trace
  if (rgp_server_->BeginTrace() != DevDriver::Result::Success) {
-      result = Pal::Result::ErrorUnknown;
+    result = Pal::Result::ErrorUnknown;
  }

  // Tell the GPA session class we're starting a trace
  if (result == Pal::Result::Success) {
    GpuUtil::GpaSessionBeginInfo info = {};

-    info.flags.enableQueueTiming   = true;// trace_.queueTimingEnabled;
+    info.flags.enableQueueTiming = true;  // trace_.queueTimingEnabled;

    result = trace_.gpa_session_->Begin(info);
  }

  trace_.prepared_disp_count_ = 0;
-  trace_.sqtt_disp_count_     = 0;
+  trace_.sqtt_disp_count_ = 0;

  // Sample the timing clocks prior to starting a trace.
  if (result == Pal::Result::Success) {
@@ -476,7 +458,7 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
  if (result == Pal::Result::Success) {
    // Remember which queue started the trace
    trace_.prepare_queue_ = gpu;
-    trace_.begin_queue_   = nullptr;
+    trace_.begin_queue_ = nullptr;

    trace_.status_ = TraceStatus::Preparing;
  } else {
@@ -497,8 +479,7 @@ Pal::Result RgpCaptureMgr::PrepareRGPTrace(VirtualGPU* gpu)
 // the "begin trace" information command buffer.
 //
 // This function transitions from the Preparing state to the Running state.
-Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
-{
+Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) {
  assert(trace_.status_ == TraceStatus::Preparing);
  assert(trace_enabled_);

@@ -526,8 +507,8 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)

    // Fill GPU commands
    gpu->eventBegin(MainEngine);
-    trace_.gpa_sample_id_ = trace_.gpa_session_->BeginSample(
-        gpu->queue(MainEngine).iCmd(), sampleConfig);
+    trace_.gpa_sample_id_ =
+        trace_.gpa_session_->BeginSample(gpu->queue(MainEngine).iCmd(), sampleConfig);
    gpu->eventEnd(MainEngine, trace_.begin_sqtt_event_);
  }

@@ -540,7 +521,7 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)

  // Make the trace active and remember which queue started it
  if (result == Pal::Result::Success) {
-    trace_.status_      = TraceStatus::Running;
+    trace_.status_ = TraceStatus::Running;
    trace_.begin_queue_ = gpu;
  }

@@ -551,8 +532,7 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu)
 // This function submits the command buffer to stop SQTT tracing.  Full tracing still continues.
 //
 // This function transitions from the Running state to the WaitingForSqtt state.
-Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu)
-{
+Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu) {
  assert(trace_.status_ == TraceStatus::Running);

  Pal::Result result = Pal::Result::Success;
@@ -593,8 +573,7 @@ Pal::Result RgpCaptureMgr::EndRGPHardwareTrace(VirtualGPU* gpu)
 // This function ends a running RGP trace.
 //
 // This function transitions from the WaitingForSqtt state to WaitingForResults state.
-Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu)
-{
+Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu) {
  assert(trace_.status_ == TraceStatus::WaitingForSqtt);

  Pal::Result result = Pal::Result::Success;
@@ -629,8 +608,7 @@ Pal::Result RgpCaptureMgr::EndRGPTrace(VirtualGPU* gpu)
 // ================================================================================================
 // This function resets and possibly cancels a currently active (between begin/end) RGP trace.
 // It frees any dependent resources.
-void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted)
-{
+void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted) {
  if (trace_.prepare_queue_ == nullptr) {
    return;
  }
@@ -654,26 +632,25 @@ void RgpCaptureMgr::FinishRGPTrace(VirtualGPU* gpu, bool aborted)

  // Reset tracing state to idle
  trace_.prepared_disp_count_ = 0;
-  trace_.sqtt_disp_count_     = 0;
-  trace_.gpa_sample_id_       = 0;
-  trace_.status_              = TraceStatus::Idle;
-  trace_.prepare_queue_       = nullptr;
-  trace_.begin_queue_         = nullptr;
+  trace_.sqtt_disp_count_ = 0;
+  trace_.gpa_sample_id_ = 0;
+  trace_.status_ = TraceStatus::Idle;
+  trace_.prepare_queue_ = nullptr;
+  trace_.begin_queue_ = nullptr;
 }

 // ================================================================================================
 // Destroys device-persistent RGP resources
-void RgpCaptureMgr::DestroyRGPTracing()
-{
+void RgpCaptureMgr::DestroyRGPTracing() {
  if (trace_.status_ != TraceStatus::Idle) {
-   FinishRGPTrace(nullptr, true);
+    FinishRGPTrace(nullptr, true);
  }

  delete user_event_;

  // Destroy the GPA session
  if (trace_.gpa_session_ != nullptr) {
-    //Util::Destructor(trace_.gpa_session_);
+    // Util::Destructor(trace_.gpa_session_);
    delete trace_.gpa_session_;
    trace_.gpa_session_ = nullptr;
  }
@@ -683,18 +660,15 @@ void RgpCaptureMgr::DestroyRGPTracing()

 // ================================================================================================
 // Returns true if the given device properties/settings support tracing.
-bool RgpCaptureMgr::GpuSupportsTracing(
-    const Pal::DeviceProperties& props,
-    const Settings&       settings)
-{
+bool RgpCaptureMgr::GpuSupportsTracing(const Pal::DeviceProperties& props,
+                                       const Settings& settings) {
  return props.gfxipProperties.flags.supportRgpTraces && !settings.rgpSqttForceDisable_;
 }

 // ================================================================================================
 // Called when a new device is created.  This will preallocate reusable RGP trace resources
 // for that device.
-void RgpCaptureMgr::PostDeviceCreate()
-{
+void RgpCaptureMgr::PostDeviceCreate() {
  amd::ScopedLock traceLock(&trace_mutex_);

  auto* pDriverControlServer = dev_driver_server_->GetDriverControlServer();
@@ -714,8 +688,7 @@ void RgpCaptureMgr::PostDeviceCreate()
 // ================================================================================================
 // Called prior to a device's being destroyed.  This will free persistent RGP trace resources for
 // that device.
-void RgpCaptureMgr::PreDeviceDestroy()
-{
+void RgpCaptureMgr::PreDeviceDestroy() {
  amd::ScopedLock traceLock(&trace_mutex_);
  // If we are idle, we can re-initialize trace resources based on the new device.
  if (trace_.status_ == TraceStatus::Idle) {
@@ -725,9 +698,8 @@ void RgpCaptureMgr::PreDeviceDestroy()

 // ================================================================================================
 // Sets up an Event marker's basic data.
-RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(
-  const VirtualGPU* gpu, RgpSqttMarkerEventType api_type) const
-{
+RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(const VirtualGPU* gpu,
+                                                   RgpSqttMarkerEventType api_type) const {
  RgpSqttMarkerEvent marker = {};

  marker.identifier = RgpSqttMarkerIdentifierEvent;
@@ -739,24 +711,19 @@ RgpSqttMarkerEvent RgpCaptureMgr::BuildEventMarker(
 }

 // ================================================================================================
-void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const
-{
+void RgpCaptureMgr::WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const {
  assert((data_size % sizeof(uint32_t)) == 0);
  assert((data_size / sizeof(uint32_t)) > 0);

-  gpu->queue(MainEngine).iCmd()->CmdInsertRgpTraceMarker(
-    static_cast<uint32_t>(data_size / sizeof(uint32_t)), data);
+  gpu->queue(MainEngine)
+      .iCmd()
+      ->CmdInsertRgpTraceMarker(static_cast<uint32_t>(data_size / sizeof(uint32_t)), data);
 }

 // ================================================================================================
 // Inserts an RGP pre-dispatch marker
-void RgpCaptureMgr::WriteEventWithDimsMarker(
-  const VirtualGPU*      gpu,
-  RgpSqttMarkerEventType apiType,
-  uint32_t               x,
-  uint32_t               y,
-  uint32_t               z) const
-{
+void RgpCaptureMgr::WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType,
+                                             uint32_t x, uint32_t y, uint32_t z) const {
  assert(apiType != RgpSqttMarkerEventType::Invalid);

  RgpSqttMarkerEventWithDims eventWithDims = {};
@@ -771,26 +738,24 @@ void RgpCaptureMgr::WriteEventWithDimsMarker(
 }

 // ================================================================================================
-void RgpCaptureMgr::WriteBarrierStartMarker(
-  const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const
-{
+void RgpCaptureMgr::WriteBarrierStartMarker(const VirtualGPU* gpu,
+                                            const Pal::Developer::BarrierData& data) const {
  if (rgp_server_->TracesEnabled() && (trace_.status_ == TraceStatus::Running)) {
    amd::ScopedLock traceLock(&trace_mutex_);
    RgpSqttMarkerBarrierStart marker = {};

    marker.identifier = RgpSqttMarkerIdentifierBarrierStart;
-    marker.cbId       = trace_.begin_queue_->queue(MainEngine).cmdBufId();
-    marker.dword02    = data.reason;
-    marker.internal   = true;
+    marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();
+    marker.dword02 = data.reason;
+    marker.internal = true;

    WriteMarker(gpu, &marker, sizeof(marker));
  }
 }

 // ================================================================================================
-void RgpCaptureMgr::WriteBarrierEndMarker(
-  const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const
-{
+void RgpCaptureMgr::WriteBarrierEndMarker(const VirtualGPU* gpu,
+                                          const Pal::Developer::BarrierData& data) const {
  if (rgp_server_->TracesEnabled() && (trace_.status_ == TraceStatus::Running)) {
    amd::ScopedLock traceLock(&trace_mutex_);
    // Copy the operations part and include the same data from previous markers
@@ -799,28 +764,28 @@ void RgpCaptureMgr::WriteBarrierEndMarker(
    auto operations = data.operations;

    operations.pipelineStalls.u16All |= 0;
-    operations.caches.u16All         |= 0;
+    operations.caches.u16All |= 0;

    RgpSqttMarkerBarrierEnd marker = {};

-    marker.identifier           = RgpSqttMarkerIdentifierBarrierEnd;
-    marker.cbId                 = trace_.begin_queue_->queue(MainEngine).cmdBufId();
+    marker.identifier = RgpSqttMarkerIdentifierBarrierEnd;
+    marker.cbId = trace_.begin_queue_->queue(MainEngine).cmdBufId();

-    marker.waitOnEopTs          = operations.pipelineStalls.waitOnEopTsBottomOfPipe;
-    marker.vsPartialFlush       = operations.pipelineStalls.vsPartialFlush;
-    marker.psPartialFlush       = operations.pipelineStalls.psPartialFlush;
-    marker.csPartialFlush       = operations.pipelineStalls.csPartialFlush;
-    marker.pfpSyncMe            = operations.pipelineStalls.pfpSyncMe;
-    marker.syncCpDma            = operations.pipelineStalls.syncCpDma;
-    marker.invalTcp             = operations.caches.invalTcp;
-    marker.invalSqI             = operations.caches.invalSqI$;
-    marker.invalSqK             = operations.caches.invalSqK$;
-    marker.flushTcc             = operations.caches.flushTcc;
-    marker.invalTcc             = operations.caches.invalTcc;
-    marker.flushCb              = operations.caches.flushCb;
-    marker.invalCb              = operations.caches.invalCb;
-    marker.flushDb              = operations.caches.flushDb;
-    marker.invalDb              = operations.caches.invalDb;
+    marker.waitOnEopTs = operations.pipelineStalls.waitOnEopTsBottomOfPipe;
+    marker.vsPartialFlush = operations.pipelineStalls.vsPartialFlush;
+    marker.psPartialFlush = operations.pipelineStalls.psPartialFlush;
+    marker.csPartialFlush = operations.pipelineStalls.csPartialFlush;
+    marker.pfpSyncMe = operations.pipelineStalls.pfpSyncMe;
+    marker.syncCpDma = operations.pipelineStalls.syncCpDma;
+    marker.invalTcp = operations.caches.invalTcp;
+    marker.invalSqI = operations.caches.invalSqI$;
+    marker.invalSqK = operations.caches.invalSqK$;
+    marker.flushTcc = operations.caches.flushTcc;
+    marker.invalTcc = operations.caches.invalTcc;
+    marker.flushCb = operations.caches.flushCb;
+    marker.invalCb = operations.caches.invalCb;
+    marker.flushDb = operations.caches.flushDb;
+    marker.invalDb = operations.caches.invalDb;

    marker.numLayoutTransitions = 0;

@@ -830,9 +795,9 @@ void RgpCaptureMgr::WriteBarrierEndMarker(

 // ================================================================================================
 // Inserts a user event string marker
-void RgpCaptureMgr::WriteUserEventMarker(
-  const VirtualGPU* gpu, RgpSqttMarkerUserEventType eventType, const std::string& name) const
-{
+void RgpCaptureMgr::WriteUserEventMarker(const VirtualGPU* gpu,
+                                         RgpSqttMarkerUserEventType eventType,
+                                         const std::string& name) const {
  memset(user_event_, 0, sizeof(RgpSqttMarkerUserEventWithString));

  user_event_->header.identifier = RgpSqttMarkerIdentifierUserEvent;
@@ -841,7 +806,8 @@ void RgpCaptureMgr::WriteUserEventMarker(
  size_t markerSize = sizeof(user_event_->header);

  if ((eventType != RgpSqttMarkerUserEventPop)) {
-    size_t strLength = std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t));
+    size_t strLength =
+        std::min(name.size(), RgpSqttMaxUserEventStringLengthInDwords * sizeof(uint32_t));
    for (uint32_t charIdx = 0; charIdx < strLength; ++charIdx) {
      uint32_t c = static_cast<uint32_t>(name[charIdx]);
      user_event_->stringData[charIdx / 4] |= (c << (8 * (charIdx % 4)));
@@ -859,4 +825,4 @@ void RgpCaptureMgr::WriteUserEventMarker(
 }


-}; // namespace vk
+};  // namespace pal
@@ -34,42 +34,36 @@
 #include "gpuopen.h"

 // PAL forward declarations
-namespace Pal
-{
-class  ICmdBuffer;
-class  IFence;
-class  IQueueSemaphore;
+namespace Pal {
+class ICmdBuffer;
+class IFence;
+class IQueueSemaphore;
 struct PalPublicSettings;
-}
+}  // namespace Pal

 // GpuUtil forward declarations
-namespace GpuUtil
-{                     
+namespace GpuUtil {
 class GpaSession;
 };

 // GPUOpen forward declarations
-namespace DevDriver
-{
+namespace DevDriver {
 class DevDriverServer;
 class IMsgChannel;
 struct MessageBuffer;

-namespace DriverControlProtocol
-{
+namespace DriverControlProtocol {
 enum struct DeviceClockMode : uint32_t;
 class HandlerServer;
-}
+}  // namespace DriverControlProtocol

-namespace SettingsProtocol
-{
+namespace SettingsProtocol {
 class HandlerServer;
 }

-}
+}  // namespace DevDriver

-namespace pal
-{
+namespace pal {
 class Settings;
 class Device;
 class VirtualGPU;
@@ -77,8 +71,7 @@ class HSAILKernel;

 // ================================================================================================
 // RgpSqttMarkerIdentifier - Identifiers for RGP SQ thread-tracing markers (Table 1)
-enum RgpSqttMarkerIdentifier : uint32_t
-{
+enum RgpSqttMarkerIdentifier : uint32_t {
  RgpSqttMarkerIdentifierEvent = 0x0,
  RgpSqttMarkerIdentifierCbStart = 0x1,
  RgpSqttMarkerIdentifierCbEnd = 0x2,
@@ -98,8 +91,7 @@ enum RgpSqttMarkerIdentifier : uint32_t
 };

 // ================================================================================================
-enum class RgpSqttMarkerEventType : uint32_t
-{
+enum class RgpSqttMarkerEventType : uint32_t {
  CmdNDRangeKernel = 0,
  CmdScheduler = 1,
  CmdCopyBuffer = 2,
@@ -114,8 +106,7 @@ enum class RgpSqttMarkerEventType : uint32_t
 };

 // ================================================================================================
-enum class RgpSqqtBarrierReason : uint32_t
-{
+enum class RgpSqqtBarrierReason : uint32_t {
  Invalid = 0,
  MemDependency = 0xC0000000,
  ProfilingControl = 0xC0000001,
@@ -125,129 +116,116 @@ enum class RgpSqqtBarrierReason : uint32_t
 };

 // ================================================================================================
-// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.  
+// RgpSqttMarkerEvent - "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker.
 // These are generated ahead of draws or dispatches for commands that trigger generation of waves
 //  i.e. draws/dispatches (Table 4).
-struct RgpSqttMarkerEvent
-{
-  union
-  {
-    struct
-    {
-      uint32_t identifier : 4;    // Identifier for this marker
-      uint32_t extDwords : 3;     // Number of extra dwords following this marker
-      uint32_t apiType : 24;      // The API type for this command
-      uint32_t hasThreadDims : 1; // Whether thread dimensions are included
+struct RgpSqttMarkerEvent {
+  union {
+    struct {
+      uint32_t identifier : 4;     // Identifier for this marker
+      uint32_t extDwords : 3;      // Number of extra dwords following this marker
+      uint32_t apiType : 24;       // The API type for this command
+      uint32_t hasThreadDims : 1;  // Whether thread dimensions are included
    };

-    uint32_t     dword01;            // The first dword
+    uint32_t dword01;  // The first dword
  };

-  union
-  {
-    // Some information about the vertex/instance/draw register indices.  These values are not 
+  union {
+    // Some information about the vertex/instance/draw register indices.  These values are not
    // always valid because they are not available for one reason or another:
    //
    // - If vertex offset index or instance offset index are not (together) valid, they are both
    //  equal to 0
    // - If draw index is not valid, it is equal to the vertex offset index
-    struct
-    {
-      uint32_t cbID : 20; // Command buffer ID for this marker
+    struct {
+      uint32_t cbID : 20;               // Command buffer ID for this marker
      uint32_t vertexOffsetRegIdx : 4;  // SPI userdata register index for the first vertex offset
-      uint32_t instanceOffsetRegIdx : 4;  // SPI userdata register index for the first instance offset
-      uint32_t drawIndexRegIdx : 4;  // SPI userdata register index for the draw index (multi draw indirect)
+      uint32_t
+          instanceOffsetRegIdx : 4;  // SPI userdata register index for the first instance offset
+      uint32_t drawIndexRegIdx : 4;  // SPI userdata register index for the draw index (multi draw
+                                     // indirect)
    };
-    uint32_t     dword02; // The second dword
+    uint32_t dword02;  // The second dword
  };

-  union
-  {
-    uint32_t cmdID;      // Command index within the command buffer
-    uint32_t dword03;    // The third dword
+  union {
+    uint32_t cmdID;    // Command index within the command buffer
+    uint32_t dword03;  // The third dword
  };
 };

 // ================================================================================================
 // RgpSqttMarkerEventWithDims - Per-dispatch specific marker where workgroup dims are included
-struct RgpSqttMarkerEventWithDims
-{
-  RgpSqttMarkerEvent event;   // Per-draw/dispatch marker.  API type should be Dispatch, threadDim = 1
-  uint32_t           threadX; // Work group count in X
-  uint32_t           threadY; // Work group count in Y
-  uint32_t           threadZ; // Work group count in Z
+struct RgpSqttMarkerEventWithDims {
+  RgpSqttMarkerEvent
+      event;         // Per-draw/dispatch marker.  API type should be Dispatch, threadDim = 1
+  uint32_t threadX;  // Work group count in X
+  uint32_t threadY;  // Work group count in Y
+  uint32_t threadZ;  // Work group count in Z
 };

 // ================================================================================================
 // RgpSqttMarkerBarrierStart - "Barrier Start" RGP SQTT instrumentation marker (Table 5)
-struct RgpSqttMarkerBarrierStart
-{
-  union
-  {
-    struct
-    {
+struct RgpSqttMarkerBarrierStart {
+  union {
+    struct {
      uint32_t identifier : 4;  // Identifier for this marker
      uint32_t extDwords : 3;   // Number of extra dwords following this marker
      uint32_t cbId : 20;       // Command buffer ID within queue
      uint32_t reserved : 5;    // Reserved
    };

-    uint32_t     dword01;            // The first dword
+    uint32_t dword01;  // The first dword
  };

-  union
-  {
-    struct
-    {
+  union {
+    struct {
      uint32_t driverReason : 31;
-      uint32_t internal: 1;
+      uint32_t internal : 1;
    };

-    uint32_t     dword02;            // The second dword
+    uint32_t dword02;  // The second dword
  };
 };

 // ================================================================================================
 // RgpSqttMarkerBarrierEnd - "Barrier End" RGP SQTT instrumentation marker (Table 6)
-struct RgpSqttMarkerBarrierEnd
-{
-  union
-  {
-    struct
-    {
-      uint32_t identifier : 4;  // Identifier for this marker
-      uint32_t extDwords : 3;   // Number of extra dwords following this marker
-      uint32_t cbId : 20;       // Command buffer ID within queue
-      uint32_t waitOnEopTs : 1; // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that timestamp
-                                // to be written.  Quintessential full pipeline stall.
+struct RgpSqttMarkerBarrierEnd {
+  union {
+    struct {
+      uint32_t identifier : 4;   // Identifier for this marker
+      uint32_t extDwords : 3;    // Number of extra dwords following this marker
+      uint32_t cbId : 20;        // Command buffer ID within queue
+      uint32_t waitOnEopTs : 1;  // Issued EOP_TS VGT event followed by a WAIT_REG_MEM for that
+                                 // timestamp to be written.  Quintessential full pipeline stall.
      uint32_t vsPartialFlush : 1;  // Stall at ME waiting for all prior VS waves to complete.
      uint32_t psPartialFlush : 1;  // Stall at ME waiting for all prior PS waves to complete.
      uint32_t csPartialFlush : 1;  // Stall at ME waiting for all prior CS waves to complete.
-      uint32_t pfpSyncMe : 1;   // Stall PFP until ME is at same point in command stream.
+      uint32_t pfpSyncMe : 1;       // Stall PFP until ME is at same point in command stream.
    };

-    uint32_t     dword01;             // The first dword
+    uint32_t dword01;  // The first dword
  };

-  union
-  {
-    struct
-    {
-      uint32_t syncCpDma : 1;  // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed.
+  union {
+    struct {
+      uint32_t
+          syncCpDma : 1;  // Issue dummy CP-DMA command to confirm all prior CP-DMAs have completed.
      uint32_t invalTcp : 1;  // Invalidate the L1 vector caches.
      uint32_t invalSqI : 1;  // Invalidate the SQ instruction caches
      uint32_t invalSqK : 1;  // Invalidate the SQ constant caches (i.e. L1 scalar caches)
      uint32_t flushTcc : 1;  // Flush L2
      uint32_t invalTcc : 1;  // Invalidate L2
-      uint32_t flushCb : 1;  // Flush CB caches (including DCC, cmask, fmask)
-      uint32_t invalCb : 1;  // Invalidate CB caches (including DCC, cmask, fmask)
-      uint32_t flushDb : 1;  // Flush DB caches (including htile)
-      uint32_t invalDb : 1;  // Invalidate DB caches (including htile)
-      uint32_t numLayoutTransitions : 16; // Number of layout transitions following this packet
-      uint32_t reserved : 6;  // Reserved for future expansion.  Always 0
+      uint32_t flushCb : 1;   // Flush CB caches (including DCC, cmask, fmask)
+      uint32_t invalCb : 1;   // Invalidate CB caches (including DCC, cmask, fmask)
+      uint32_t flushDb : 1;   // Flush DB caches (including htile)
+      uint32_t invalDb : 1;   // Invalidate DB caches (including htile)
+      uint32_t numLayoutTransitions : 16;  // Number of layout transitions following this packet
+      uint32_t reserved : 6;               // Reserved for future expansion.  Always 0
    };

-    uint32_t  dword02;                // The second dword
+    uint32_t dword02;  // The second dword
  };
 };

@@ -255,33 +233,31 @@ struct RgpSqttMarkerBarrierEnd
 constexpr uint32_t RgpSqttInstrumentationSpecVersion = 1;

 // RGP SQTT Instrumentation Specification version for Vulkan-specific tables
-constexpr uint32_t RgpSqttInstrumentationApiVersion  = 0;
+constexpr uint32_t RgpSqttInstrumentationApiVersion = 0;

-// RgpSqttMarkeUserEventDataType - Data types used in RGP SQ thread-tracing markers for an user event
-enum RgpSqttMarkerUserEventType : uint32_t
-{
-    RgpSqttMarkerUserEventTrigger = 0x0,
-    RgpSqttMarkerUserEventPop = 0x1,
-    RgpSqttMarkerUserEventPush = 0x2,
-    RgpSqttMarkerUserEventObjectName = 0x3,
-    RgpSqttMarkerUserEventReserved1 = 0x4,
-    RgpSqttMarkerUserEventReserved2 = 0x5,
-    RgpSqttMarkerUserEventReserved3 = 0x6,
-    RgpSqttMarkerUserEventReserved4 = 0x7,
+// RgpSqttMarkerUserEventType - Data types used in RGP SQ thread-tracing markers for a user
+// event
+enum RgpSqttMarkerUserEventType : uint32_t {
+  RgpSqttMarkerUserEventTrigger = 0x0,
+  RgpSqttMarkerUserEventPop = 0x1,
+  RgpSqttMarkerUserEventPush = 0x2,
+  RgpSqttMarkerUserEventObjectName = 0x3,
+  RgpSqttMarkerUserEventReserved1 = 0x4,
+  RgpSqttMarkerUserEventReserved2 = 0x5,
+  RgpSqttMarkerUserEventReserved3 = 0x6,
+  RgpSqttMarkerUserEventReserved4 = 0x7,
 };

 // RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event.
-union RgpSqttMarkerUserEvent
-{
-    struct
-    {
-        uint32_t identifier : 4;  // Identifier for this marker
-        uint32_t extDwords : 8;  // Number of extra dwords following this marker
-        uint32_t dataType : 8;  // The type for this marker
-        uint32_t reserved : 12; // reserved
-    };
+union RgpSqttMarkerUserEvent {
+  struct {
+    uint32_t identifier : 4;  // Identifier for this marker
+    uint32_t extDwords : 8;   // Number of extra dwords following this marker
+    uint32_t dataType : 8;    // The type for this marker
+    uint32_t reserved : 12;   // reserved
+  };

-    uint32_t dword01;                               // The first dword
+  uint32_t dword01;  // The first dword
 };

 constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1;
@@ -289,21 +265,20 @@ constexpr uint32_t RgpSqttMarkerUserEventWordCount = 1;
 // The max lengths of frame marker strings
 static constexpr size_t RgpSqttMaxUserEventStringLengthInDwords = 1024;

-// RgpSqttMarkerUserEvent - RGP SQ thread-tracing marker for an user event with a string (push and trigger data types)
-struct RgpSqttMarkerUserEventWithString
-{
-    RgpSqttMarkerUserEvent header;
+// RgpSqttMarkerUserEventWithString - RGP SQ thread-tracing marker for a user event with a string
+// (push and trigger data types)
+struct RgpSqttMarkerUserEventWithString {
+  RgpSqttMarkerUserEvent header;

-    uint32_t stringLength;                                        // Length of the string (in characters)
-    uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords]; // String data in UTF-8 format
+  uint32_t stringLength;  // Length of the string (in characters)
+  uint32_t stringData[RgpSqttMaxUserEventStringLengthInDwords];  // String data in UTF-8 format
 };

 // ================================================================================================
 // This class provides functionality to interact with the GPU Open Developer Mode message passing
 // service and the rest of the driver.
-class RgpCaptureMgr
-{
-public:
+class RgpCaptureMgr {
+ public:
  ~RgpCaptureMgr();

  static RgpCaptureMgr* Create(Pal::IPlatform* platform, const Device& device);
@@ -321,45 +296,42 @@ public:

  bool IsQueueTimingActive() const;

-  void WriteBarrierStartMarker(
-    const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const;
-  void WriteBarrierEndMarker(
-    const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const;
+  void WriteBarrierStartMarker(const VirtualGPU* gpu,
+                               const Pal::Developer::BarrierData& data) const;
+  void WriteBarrierEndMarker(const VirtualGPU* gpu, const Pal::Developer::BarrierData& data) const;
  bool RegisterTimedQueue(uint32_t queue_id, Pal::IQueue* iQueue, bool* debug_vmid) const;
-  Pal::Result TimedQueueSubmit(
-    Pal::IQueue* queue, uint64_t cmdId, const Pal::SubmitInfo& submitInfo) const;
+  Pal::Result TimedQueueSubmit(Pal::IQueue* queue, uint64_t cmdId,
+                               const Pal::SubmitInfo& submitInfo) const;

-private:
+ private:
  // Steps that an RGP trace goes through
-  enum class TraceStatus
-  {
-      Idle = 0,           // No active trace and none requested
-      Preparing,          // A trace has been requested but is not active yet because we are
-                          // currently sampling timing information over some number of lead frames.
-      Running,            // SQTT and queue timing is currently active for all command buffer submits.
-      WaitingForSqtt,
-      WaitingForResults   // Tracing is no longer active, but all results are not yet ready.
+  enum class TraceStatus {
+    Idle = 0,   // No active trace and none requested
+    Preparing,  // A trace has been requested but is not active yet because we are
+                // currently sampling timing information over some number of lead frames.
+    Running,    // SQTT and queue timing is currently active for all command buffer submits.
+    WaitingForSqtt,
+    WaitingForResults  // Tracing is no longer active, but all results are not yet ready.
  };

  // All per-device state to support RGP tracing
-  struct TraceState
-  {
-    TraceStatus   status_;              // Current trace status (idle, running, etc.)
+  struct TraceState {
+    TraceStatus status_;  // Current trace status (idle, running, etc.)

-    GpuEvent      begin_sqtt_event_;    // Event that is signaled when a trace-end cmdbuf retires
-    GpuEvent      end_sqtt_event_;      // Event that is signaled when a trace-end cmdbuf retires
-    GpuEvent      end_event_;           // Event that is signaled when a trace-end cmdbuf retires
+    GpuEvent begin_sqtt_event_;  // NOTE(review): presumably signaled when SQTT begins - confirm
+    GpuEvent end_sqtt_event_;    // Event that is signaled when a trace-end cmdbuf retires
+    GpuEvent end_event_;         // Event that is signaled when a trace-end cmdbuf retires

-    VirtualGPU*   prepare_queue_;       // The queue that triggered the full start of a trace
-    VirtualGPU*   begin_queue_;         // The queue that triggered starting SQTT
+    VirtualGPU* prepare_queue_;  // The queue that triggered the full start of a trace
+    VirtualGPU* begin_queue_;    // The queue that triggered starting SQTT

-    GpuUtil::GpaSession*  gpa_session_; // GPA session helper object for building RGP data
-    uint32_t      gpa_sample_id_;       // Sample ID associated with the current trace
-    bool          queue_timing_;        // Queue timing is enabled
+    GpuUtil::GpaSession* gpa_session_;  // GPA session helper object for building RGP data
+    uint32_t gpa_sample_id_;            // Sample ID associated with the current trace
+    bool queue_timing_;                 // Queue timing is enabled

-    uint32_t      prepared_disp_count_; // Number of dispatches counted while preparing for a trace
-    uint32_t      sqtt_disp_count_;     // Number of dispatches counted while SQTT tracing is active
-    mutable uint32_t current_event_id_; // Current event ID
+    uint32_t prepared_disp_count_;  // Number of dispatches counted while preparing for a trace
+    uint32_t sqtt_disp_count_;      // Number of dispatches counted while SQTT tracing is active
+    mutable uint32_t current_event_id_;  // Current event ID
  };

  RgpCaptureMgr(Pal::IPlatform* platform, const Device& device);
@@ -374,25 +346,25 @@ private:
  static bool GpuSupportsTracing(const Pal::DeviceProperties& props, const Settings& settings);
  RgpSqttMarkerEvent BuildEventMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType api_type) const;
  void WriteMarker(const VirtualGPU* gpu, const void* data, size_t data_size) const;
-  void WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType,
-    uint32_t x, uint32_t y, uint32_t z) const;
+  void WriteEventWithDimsMarker(const VirtualGPU* gpu, RgpSqttMarkerEventType apiType, uint32_t x,
+                                uint32_t y, uint32_t z) const;
  void WriteUserEventMarker(const VirtualGPU* gpu, RgpSqttMarkerUserEventType eventType,
-    const std::string& name) const;
+                            const std::string& name) const;

-  const Device&               device_;
+  const Device& device_;
  DevDriver::DevDriverServer* dev_driver_server_;
  DevDriver::RGPProtocol::RGPServer* rgp_server_;
-  mutable amd::Monitor        trace_mutex_;
-  TraceState                  trace_;
+  mutable amd::Monitor trace_mutex_;
+  TraceState trace_;
  RgpSqttMarkerUserEventWithString* user_event_;

-  uint32_t  num_prep_disp_;
-  uint32_t  max_sqtt_disp_;       // Maximum number of the dispatches allowed in the trace
-  uint32_t  trace_gpu_mem_limit_;
-  uint32_t  global_disp_count_;
+  uint32_t num_prep_disp_;
+  uint32_t max_sqtt_disp_;  // Maximum number of the dispatches allowed in the trace
+  uint32_t trace_gpu_mem_limit_;
+  uint32_t global_disp_count_;

-  bool  trace_enabled_;         // True if tracing is currently enabled (master flag)
-  bool  inst_tracing_enabled_;  // Enable instruction-level SQTT tokens
+  bool trace_enabled_;         // True if tracing is currently enabled (master flag)
+  bool inst_tracing_enabled_;  // Enable instruction-level SQTT tokens

  PAL_DISALLOW_DEFAULT_CTOR(RgpCaptureMgr);
  PAL_DISALLOW_COPY_AND_ASSIGN(RgpCaptureMgr);
@@ -400,11 +372,9 @@ private:

 // ================================================================================================
 // Returns true if queue operations are currently being timed by RGP traces.
-inline bool RgpCaptureMgr::IsQueueTimingActive() const
-{
+inline bool RgpCaptureMgr::IsQueueTimingActive() const {
  return (trace_.queue_timing_ &&
-          (trace_.status_ == TraceStatus::Running ||
-           trace_.status_ == TraceStatus::Preparing ||
+          (trace_.status_ == TraceStatus::Running || trace_.status_ == TraceStatus::Preparing ||
           trace_.status_ == TraceStatus::WaitingForSqtt));
 }
-};
+};  // namespace pal
@@ -27,11 +27,9 @@ typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
 namespace pal {

 void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize,
-                                   const uint32_t groupSegmentSize,
-                                   const uint16_t numSGPRs,
+                                   const uint32_t groupSegmentSize, const uint16_t numSGPRs,
                                   const uint16_t numVGPRs) {
-  workGroupInfo_.scratchRegs_ =
-      amd::alignUp(privateSegmentSize, 16) / sizeof(uint);
+  workGroupInfo_.scratchRegs_ = amd::alignUp(privateSegmentSize, 16) / sizeof(uint);
  workGroupInfo_.privateMemSize_ = privateSegmentSize;
  workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize;
  workGroupInfo_.usedSGPRs_ = numSGPRs;
@@ -63,13 +61,13 @@ bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t
  }

  // Copy code object of this kernel from the program CPU segment
-  memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t));
+  memcpy(akc, reinterpret_cast<void*>(prog().findHostKernelAddress(code_)),
+         sizeof(amd_kernel_code_t));

  return true;
 }

 bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
-
  amd_kernel_code_t* akc = &akc_;

  if (!setKernelCode(sym, akc)) {
@@ -77,18 +75,16 @@ bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) {
  }

  if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE,
-    reinterpret_cast<void*>(&codeSize_))) {
+                    reinterpret_cast<void*>(&codeSize_))) {
    return false;
  }

-    // Setup the the workgroup info
-  setWorkGroupInfo(akc->workitem_private_segment_byte_size,
-                   akc->workgroup_group_segment_byte_size,
-                   akc->wavefront_sgpr_count,
-                   akc->workitem_vgpr_count);
+  // Set up the workgroup info
+  setWorkGroupInfo(akc->workitem_private_segment_byte_size, akc->workgroup_group_segment_byte_size,
+                   akc->wavefront_sgpr_count, akc->workitem_vgpr_count);

  workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_;
-  kernargSegmentByteSize_ =  akc->kernarg_segment_byte_size;
+  kernargSegmentByteSize_ = akc->kernarg_segment_byte_size;
  spillSegmentByteSize_ = amd::alignUp(workGroupInfo_.privateMemSize_, sizeof(uint32_t));

  return true;
@@ -102,16 +98,14 @@ HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compi
      codeSize_(0),
      workgroupGroupSegmentByteSize_(0),
      kernargSegmentByteSize_(0),
-      spillSegmentByteSize_(0)
- {
+      spillSegmentByteSize_(0) {
  flags_.hsa_ = true;
 }

-HSAILKernel::~HSAILKernel() {
-}
+HSAILKernel::~HSAILKernel() {}

 bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
-#if  defined(WITH_COMPILER_LIB)
+#if defined(WITH_COMPILER_LIB)
  acl_error error = ACL_SUCCESS;
  std::string openClKernelName = openclMangledName(name());
  flags_.internalKernel_ =
@@ -274,12 +268,14 @@ const HSAILProgram& HSAILKernel::prog() const {
  return reinterpret_cast<const HSAILProgram&>(prog_);
 }

-hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
-    VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
-    const_address parameters, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const {
+hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel,
+                                                         const amd::NDRangeContainer& sizes,
+                                                         const_address parameters,
+                                                         size_t ldsAddress, uint64_t vmDefQueue,
+                                                         uint64_t* vmParentWrap) const {
  uint64_t argList;
  address aqlArgBuf = gpu.managedBuffer().reserve(
-    argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
+      argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList);
  gpu.addVmMemory(gpu.managedBuffer().activeMemory());

  if (dynamicParallelism()) {
@@ -307,8 +303,8 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
        break;
      case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
        if (sizes.dimensions() >= 2) {
-            offset = sizes.offset()[1];
-            WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
+          offset = sizes.offset()[1];
+          WriteAqlArgAt(const_cast<address>(parameters), &offset, it.size_, it.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
@@ -322,8 +318,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
            // and printf buffer was allocated
            (gpu.printfDbgHSA().dbgBuffer() != nullptr)) {
          // and set the fourth argument as the printf_buffer pointer
-          size_t bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().
-            dbgBuffer()->vmAddress());
+          size_t bufferPtr = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
          gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer());
          WriteAqlArgAt(const_cast<address>(parameters), &bufferPtr, it.size_, it.offset_);
        }
@@ -346,11 +341,11 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
  // Note: In a case of structs the size won't match,
  // since HSAIL compiler expects a reference...
  assert(argsBufferSize() <= signature.paramsSize() &&
-    "A mismatch of sizes of arguments between compiler and runtime!");
+         "A mismatch of sizes of arguments between compiler and runtime!");

-  //hsa_kernel_dispatch_packet_t disp;
-  hsa_kernel_dispatch_packet_t* hsaDisp = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
-    gpu.cb(0)->SysMemCopy());
+  // hsa_kernel_dispatch_packet_t disp;
+  hsa_kernel_dispatch_packet_t* hsaDisp =
+      reinterpret_cast<hsa_kernel_dispatch_packet_t*>(gpu.cb(0)->SysMemCopy());

  amd::NDRange local(sizes.local());
  const amd::NDRange& global = sizes.global();
@@ -359,10 +354,10 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
  FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);

  constexpr uint16_t kDispatchPacketHeader =
-    (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
-    (1 << HSA_PACKET_HEADER_BARRIER) |
-    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
-    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+      (1 << HSA_PACKET_HEADER_BARRIER) |
+      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+      (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

  hsaDisp->header = kDispatchPacketHeader;
  hsaDisp->setup = sizes.dimensions();
@@ -387,7 +382,7 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
  memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t));

  if (AMD_HSA_BITS_GET(akc_.kernel_code_properties,
-      AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
+                       AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
    gpu.addVmMemory(gpu.hsaQueueMem());
  }

@@ -407,7 +402,7 @@ static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const s
  }
  return nullptr;
 }
-#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+#endif  // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)

 #if defined(USE_COMGR_LIBRARY)
 bool LightningKernel::init() {
@@ -419,7 +414,7 @@ bool LightningKernel::init() {
    return false;
  }

-  KernelMD  kernelMD;
+  KernelMD kernelMD;
  if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) {
    return false;
  }
@@ -427,8 +422,8 @@ bool LightningKernel::init() {
  symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName;

  workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize;
-  spillSegmentByteSize_ = amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
-                                       sizeof(uint32_t));
+  spillSegmentByteSize_ =
+      amd::alignUp(kernelMD.mCodeProps.mPrivateSegmentFixedSize, sizeof(uint32_t));
  kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize;

  // Copy codeobject of this kernel from the program CPU segment
@@ -451,7 +446,7 @@ bool LightningKernel::init() {

    // Get the runtime handle symbol GPU address
    rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD.mAttrs.mRuntimeHandle.c_str()),
-                                const_cast<hsa_agent_t*>(&agent));
+                                  const_cast<hsa_agent_t*>(&agent));
    uint64_t symbol_address;
    rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);

@@ -461,19 +456,14 @@ bool LightningKernel::init() {
    uint64_t kernel_object = gpuAqlCode();
    VirtualGPU* gpu = codeSegGpu.dev().xferQueue();

-    const struct RuntimeHandle runtime_handle = {
-        gpuAqlCode(),
-        spillSegSize(),
-        ldsSize()
-    };
+    const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()};

    codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
  }

  // Setup the the workgroup info
  setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize,
-                   kernelMD.mCodeProps.mGroupSegmentFixedSize,
-                   kernelMD.mCodeProps.mNumSGPRs,
+                   kernelMD.mCodeProps.mGroupSegmentFixedSize, kernelMD.mCodeProps.mNumSGPRs,
                   kernelMD.mCodeProps.mNumVGPRs);

  // Copy wavefront size
@@ -499,10 +489,10 @@ bool LightningKernel::init() {

  return true;
 }
-#endif // defined(USE_COMGR_LIBRARY)
+#endif  // defined(USE_COMGR_LIBRARY)

 bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
-#if defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
+#if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
  flags_.internalKernel_ =
      (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;

@@ -545,7 +535,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {

    // Get the runtime handle symbol GPU address
    rth_symbol = prog().GetSymbol(const_cast<char*>(kernelMD->mAttrs.mRuntimeHandle.c_str()),
-                                const_cast<hsa_agent_t*>(&agent));
+                                  const_cast<hsa_agent_t*>(&agent));
    uint64_t symbol_address;
    rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address);

@@ -554,11 +544,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {
    uint64_t offset = symbol_address - codeSegGpu.vmAddress();
    VirtualGPU* gpu = codeSegGpu.dev().xferQueue();

-    const struct RuntimeHandle runtime_handle = {
-        gpuAqlCode(),
-        spillSegSize(),
-        ldsSize()
-    };
+    const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()};

    codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true);
  }
@@ -584,7 +570,7 @@ bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) {

  waveLimiter_.enable();
  */
-#endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY)
+#endif  // defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY)
  return true;
 }

@@ -20,14 +20,14 @@ namespace amd {
 namespace hsa {
 namespace loader {
 class Symbol;
-}  // loader
+}  // namespace loader
 namespace code {
 namespace Kernel {
 class Metadata;
-}  // Kernel
-}  // code
-}  // hsa
-}  // amd
+}  // namespace Kernel
+}  // namespace code
+}  // namespace hsa
+}  // namespace amd

 //! \namespace pal PAL Device Implementation
 namespace pal {
@@ -43,7 +43,6 @@ class LightningProgram;
 */
 class HSAILKernel : public device::Kernel {
 public:
-
  HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions);

  virtual ~HSAILKernel();
@@ -106,21 +105,19 @@ class HSAILKernel : public device::Kernel {
  bool setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc);

  //! Set up the workgroup info based on the kernel metadata
-  void setWorkGroupInfo(const uint32_t privateSegmentSize,
-                        const uint32_t groupSegmentSize,
-                        const uint16_t numSGPRs,
-                        const uint16_t numVGPRs);
+  void setWorkGroupInfo(const uint32_t privateSegmentSize, const uint32_t groupSegmentSize,
+                        const uint16_t numSGPRs, const uint16_t numVGPRs);

-  std::string compileOptions_;    //!< compile used for finalizing this kernel
-  amd_kernel_code_t akc_;         //!< AQL kernel code on CPU
-  uint index_;                    //!< Kernel index in the program
+  std::string compileOptions_;  //!< compile used for finalizing this kernel
+  amd_kernel_code_t akc_;       //!< AQL kernel code on CPU
+  uint index_;                  //!< Kernel index in the program

-  uint64_t code_;     //!< GPU memory pointer to the kernel
-  size_t codeSize_;   //!< Size of ISA code
+  uint64_t code_;    //!< GPU memory pointer to the kernel
+  size_t codeSize_;  //!< Size of ISA code

-  uint32_t workgroupGroupSegmentByteSize_;    //!< LDS size used in the kernel
-  uint32_t kernargSegmentByteSize_;           //!< Size of kernel argument buffer
-  uint32_t spillSegmentByteSize_;             //!< Spill reg size per workitem
+  uint32_t workgroupGroupSegmentByteSize_;  //!< LDS size used in the kernel
+  uint32_t kernargSegmentByteSize_;         //!< Size of kernel argument buffer
+  uint32_t spillSegmentByteSize_;           //!< Spill reg size per workitem
 };

 class LightningKernel : public HSAILKernel {
@@ -140,4 +137,5 @@ class LightningKernel : public HSAILKernel {
 #endif
 };

-/*@}*/} // namespace pal
+/*@}*/
+}  // namespace pal
@@ -23,27 +23,21 @@
 namespace pal {

 Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t size)
-  : device::Memory(owner), Resource(gpuDev, size)
-  , pinnedMemory_(nullptr)
-  , parent_(nullptr) {
-
+    : device::Memory(owner), Resource(gpuDev, size), pinnedMemory_(nullptr), parent_(nullptr) {
  if (owner.parent() != nullptr) {
    flags_ |= SubMemoryObject;
  }
 }

 Memory::Memory(const Device& gpuDev, size_t size)
-  : device::Memory(size), Resource(gpuDev, size)
-  , pinnedMemory_(nullptr)
-  , parent_(nullptr) {
-}
+    : device::Memory(size), Resource(gpuDev, size), pinnedMemory_(nullptr), parent_(nullptr) {}

 Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, size_t height, size_t depth,
               cl_image_format format, cl_mem_object_type imageType, uint mipLevels)
-    : device::Memory(owner), Resource(gpuDev, width, height, depth, format, imageType, mipLevels)
-    , pinnedMemory_(nullptr)
-    , parent_(nullptr) {
-
+    : device::Memory(owner),
+      Resource(gpuDev, width, height, depth, format, imageType, mipLevels),
+      pinnedMemory_(nullptr),
+      parent_(nullptr) {
  if (owner.parent() != nullptr) {
    flags_ |= SubMemoryObject;
  }
@@ -51,10 +45,10 @@ Memory::Memory(const Device& gpuDev, amd::Memory& owner, size_t width, size_t he

 Memory::Memory(const Device& gpuDev, size_t size, size_t width, size_t height, size_t depth,
               cl_image_format format, cl_mem_object_type imageType, uint mipLevels)
-  : device::Memory(size), Resource(gpuDev, width, height, depth, format, imageType, mipLevels)
-  , pinnedMemory_(nullptr)
-  , parent_(nullptr) {
-}
+    : device::Memory(size),
+      Resource(gpuDev, width, height, depth, format, imageType, mipLevels),
+      pinnedMemory_(nullptr),
+      parent_(nullptr) {}

 #ifdef _WIN32
 static HANDLE getSharedHandle(IUnknown* pIface) {
@@ -130,7 +124,7 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params
        break;
      case Resource::Remote:
      case Resource::RemoteUSWC:
-          if ((!desc().tiled_) && (desc().dimSize_ != 3)) {
+        if ((!desc().tiled_) && (desc().dimSize_ != 3)) {
          // Marks memory object for direct GPU access to the host memory
          flags_ |= HostMemoryDirectAccess;
        }
@@ -402,7 +396,7 @@ Memory::~Memory() {
      (memoryType() != Resource::ExternalPhysical)) {
    // Unmap memory if direct access was requested
    // Note: runtime will perform unmap on the actual resource destruction
-    //unmap(nullptr);
+    // unmap(nullptr);
  }
 }

@@ -32,12 +32,12 @@ class Memory : public device::Memory, public Resource {
  Memory(const Device& gpuDev,  //!< GPU device object
         amd::Memory& owner,    //!< Abstraction layer memory object
         size_t size            //!< Memory size for allocation
-         );
+  );

  //! Constructor (nonfat version for local scratch mem use without heap block)
  Memory(const Device& gpuDev,  //!< GPU device object
         size_t size            //!< Memory size for allocation
-         );
+  );

  //! Constructor memory for images (without global heap allocation)
  Memory(const Device& gpuDev,          //!< GPU device object
@@ -48,7 +48,7 @@ class Memory : public device::Memory, public Resource {
         cl_image_format format,        //!< Memory format
         cl_mem_object_type imageType,  //!< CL image type
         uint mipLevels                 //!< The number of mip levels
-         );
+  );

  //! Constructor memory for images (without global heap allocation)
  Memory(const Device& gpuDev,          //!< GPU device object
@@ -59,7 +59,7 @@ class Memory : public device::Memory, public Resource {
         cl_image_format format,        //!< Memory format
         cl_mem_object_type imageType,  //!< CL image type
         uint mipLevels                 //!< The number of mip levels
-         );
+  );

  //! Default destructor
  ~Memory();
@@ -70,7 +70,7 @@ class Memory : public device::Memory, public Resource {
  //! Overloads the resource create method
  virtual bool create(Resource::MemoryType memType,          //!< Memory type
                      Resource::CreateParams* params = NULL  //!< Prameters for create
-                      );
+  );

  //! Allocate memory for API-level maps
  virtual void* allocMapTarget(const amd::Coord3D& origin,  //!< The map location in memory
@@ -78,12 +78,12 @@ class Memory : public device::Memory, public Resource {
                               uint mapFlags,               //!< Map flags
                               size_t* rowPitch = NULL,     //!< Row pitch for the mapped memory
                               size_t* slicePitch = NULL    //!< Slice for the mapped memory
-                               );
+  );

  //! Pins system memory associated with this memory object
  virtual bool pinSystemMemory(void* hostPtr,  //!< System memory address
                               size_t size     //!< Size of allocated system memory
-                               );
+  );

  //! Releases indirect map surface
  virtual void releaseIndirectMap() { decIndMapCount(); }
@@ -96,15 +96,15 @@ class Memory : public device::Memory, public Resource {
                       uint numLayers = 0,        //!< End layer for multilayer map
                       size_t* rowPitch = NULL,   //!< Row pitch for the device memory
                       size_t* slicePitch = NULL  //!< Slice pitch for the device memory
-                       );
+  );

  //! Unmap the device memory
  virtual void cpuUnmap(device::VirtualDevice& vDev  //!< Virtual device for unmap operaiton
-                        );
+  );

  //! Updates device memory from the owner's host allocation
  void syncCacheFromHost(VirtualGPU& gpu,  //!< Virtual GPU device object
-                         //! Synchronization flags
+                                           //! Synchronization flags
                         device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags());

  //! Updates the owner's host allocation from device memory
@@ -115,11 +115,13 @@ class Memory : public device::Memory, public Resource {
  //! Creates a view from current resource
  virtual Memory* createBufferView(
      amd::Memory& subBufferOwner  //!< The abstraction layer subbuf owner
-      );
+  );

  virtual uint64_t virtualAddress() const override { return vmAddress(); }

-  virtual const address cpuSrd() const { return reinterpret_cast<const address>(const_cast<void*>(hwState())); }
+  virtual const address cpuSrd() const {
+    return reinterpret_cast<const address>(const_cast<void*>(hwState()));
+  }

  //! Allocates host memory for synchronization with MGPU context
  void mgpuCacheWriteBack();
@@ -161,8 +163,8 @@ class Memory : public device::Memory, public Resource {
  //! Disable operator=
  Memory& operator=(const Memory&);

-  Memory* pinnedMemory_;     //!< Memory used as pinned system memory
-  const Memory* parent_;     //!< Parent memory object
+  Memory* pinnedMemory_;  //!< Memory used as pinned system memory
+  const Memory* parent_;  //!< Parent memory object
 };

 class Buffer : public pal::Memory {
@@ -219,7 +221,7 @@ class Image : public pal::Memory {
                               uint mapFlags,               //!< Map flags
                               size_t* rowPitch = NULL,     //!< Row pitch for the mapped memory
                               size_t* slicePitch = NULL    //!< Slice for the mapped memory
-                               );
+  );

  virtual uint64_t virtualAddress() const override { return hwSrd(); }

@@ -11,7 +11,7 @@
 #ifndef isinf
 #ifdef _MSC_VER
 #define isinf(X) (!_finite(X) && !_isnan(X))
-#else   //!_MSC_VER
+#else  //!_MSC_VER
 #define isinf(X) (std::isinf(X))
 #endif  //!_MSC_VER
 #endif  // isinf
@@ -19,7 +19,7 @@
 #ifndef isnan
 #ifdef _MSC_VER
 #define isnan(X) (_isnan(X))
-#else   //!_MSC_VER
+#else  //!_MSC_VER
 #define isnan(X) (std::isnan(X))
 #endif  //!_MSC_VER
 #endif  // isnan
@@ -55,14 +55,14 @@ class PrintfDbg : public amd::HeapObject {
  bool init(VirtualGPU& gpu,          //!< Virtual GPU object
            bool printfEnabled,       //!< checks for printf
            const amd::NDRange& size  //!< Kernel's workload
-            );
+  );

  //! Prints the kernel's debug informaiton from the buffer
-  bool output(VirtualGPU& gpu,                           //!< Virtual GPU object
-              bool printfEnabled,                        //!< checks for printf
-              const amd::NDRange& size,                  //!< Kernel's workload
+  bool output(VirtualGPU& gpu,                                   //!< Virtual GPU object
+              bool printfEnabled,                                //!< checks for printf
+              const amd::NDRange& size,                          //!< Kernel's workload
              const std::vector<device::PrintfInfo>& printfInfo  //!< printf info
-              );
+  );

  //! Debug buffer size per workitem
  size_t wiDbgSize() const { return wiDbgSize_; }
@@ -81,7 +81,7 @@ class PrintfDbg : public amd::HeapObject {

  //! Allocates the debug buffer
  bool allocate(bool realloc = false  //!< If TRUE then reallocate the debug memory
-                );
+  );

  //! Returns TRUE if a float value has to be printed
  bool checkFloat(const std::string& fmt  //!< Format string
@@ -105,9 +105,9 @@ class PrintfDbg : public amd::HeapObject {
                        ) const;

  //! Displays the PrintfDbg
-  void outputDbgBuffer(const device::PrintfInfo& info,//!< printf info
-                       const uint32_t* workitemData,  //!< The PrintfDbg dump buffer
-                       size_t& i                      //!< index to the data in the buffer
+  void outputDbgBuffer(const device::PrintfInfo& info,  //!< printf info
+                       const uint32_t* workitemData,    //!< The PrintfDbg dump buffer
+                       size_t& i                        //!< index to the data in the buffer
                       ) const;

 private:
@@ -127,7 +127,7 @@ class PrintfDbg : public amd::HeapObject {
  uint32_t* mapWorkitem(VirtualGPU& gpu,  //!< Virtual GPU object
                        size_t idx,       //!< Workitem global index
                        bool* realloc     //!< Returns TRUE if workitem reached the buffer limit
-                        );
+  );

  //! Unamp the staged buffer
  void unmapWorkitem(VirtualGPU& gpu,              //!< Virtual GPU object
@@ -145,13 +145,13 @@ class PrintfDbgHSA : public PrintfDbg {
  //! Initializes the debug buffer before kernel's execution
  bool init(VirtualGPU& gpu,    //!< Virtual GPU object
            bool printfEnabled  //!< checks for printf
-            );
+  );

  //! Prints the kernel's debug informaiton from the buffer
-  bool output(VirtualGPU& gpu,                           //!< Virtual GPU object
-              bool printfEnabled,                        //!< checks for printf
+  bool output(VirtualGPU& gpu,                                   //!< Virtual GPU object
+              bool printfEnabled,                                //!< checks for printf
              const std::vector<device::PrintfInfo>& printfInfo  //!< printf info
-              );
+  );

 private:
  //! Disable copy constructor
@@ -161,4 +161,5 @@ class PrintfDbgHSA : public PrintfDbg {
  PrintfDbgHSA& operator=(const PrintfDbgHSA&);
 };

-/*@}*/} // namespace pal
+/*@}*/  // namespace pal
+}  // namespace pal
@@ -65,10 +65,10 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t
  align = amd::alignUp(align, sizeof(uint32_t));

  amd::Memory* amd_mem_obj = new (prog.dev().context())
-    amd::Buffer(prog.dev().context(), 0, amd::alignUp(size, align),
-    // HIP requires SVM allocation for segment code due to possible global variable access and
-    // global variables are a part of code segment with the latest loader
-    amd::IS_HIP ? reinterpret_cast<void*>(1) : nullptr);
+      amd::Buffer(prog.dev().context(), 0, amd::alignUp(size, align),
+                  // HIP requires SVM allocation for segment code due to possible global variable
+                  // access and global variables are a part of code segment with the latest loader
+                  amd::IS_HIP ? reinterpret_cast<void*>(1) : nullptr);

  if (amd_mem_obj == nullptr) {
    LogError("[OCL] failed to create a mem object!");
@@ -103,9 +103,9 @@ bool Segment::alloc(HSAILProgram& prog, amdgpu_hsa_elf_segment_t segment, size_t

  if (zero && !prog.isInternal()) {
    uint64_t pattern = 0;
-    size_t   patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1;
-    prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize,
-        amd::Coord3D(0), amd::Coord3D(size));
+    size_t patternSize = ((size % sizeof(pattern)) == 0) ? sizeof(pattern) : 1;
+    prog.dev().xferMgr().fillBuffer(*gpuAccess_, &pattern, patternSize, amd::Coord3D(0),
+                                    amd::Coord3D(size));
  }

  switch (segment) {
@@ -237,7 +237,7 @@ inline static std::vector<std::string> splitSpaceSeparatedString(char* str) {
 }

 bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_t binSize) {
-#if  defined(WITH_COMPILER_LIB)
+#if defined(WITH_COMPILER_LIB)
  // ACL_TYPE_CG stage is not performed for offline compilation
  hsa_agent_t agent;
  agent.handle = 1;
@@ -262,8 +262,8 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
  }

  size_t kernelNamesSize = 0;
-  acl_error errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES,
-    nullptr, nullptr, &kernelNamesSize);
+  acl_error errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr,
+                                     nullptr, &kernelNamesSize);
  if (errorCode != ACL_SUCCESS) {
    buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
    return false;
@@ -274,11 +274,11 @@ bool HSAILProgram::setKernels(amd::option::Options* options, void* binary, size_
                             &kernelNamesSize);
    if (errorCode != ACL_SUCCESS) {
      buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
-      delete [] kernelNames;
+      delete[] kernelNames;
      return false;
    }
    std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames);
-    delete [] kernelNames;
+    delete[] kernelNames;
    bool dynamicParallelism = false;
    for (const auto& it : vKernels) {
      std::string kernelName(it);
@@ -338,12 +338,10 @@ bool HSAILProgram::allocKernelTable() {
  return true;
 }

-void HSAILProgram::fillResListWithKernels(VirtualGPU& gpu) const {
-  gpu.addVmMemory(&codeSegGpu());
-}
+void HSAILProgram::fillResListWithKernels(VirtualGPU& gpu) const { gpu.addVmMemory(&codeSegGpu()); }

 const aclTargetInfo& HSAILProgram::info(const char* str) {
-#if  defined(WITH_COMPILER_LIB)
+#if defined(WITH_COMPILER_LIB)
  acl_error err;
  std::string arch = "hsail";
  if (dev().settings().use64BitPtr_) {
@@ -359,7 +357,7 @@ const aclTargetInfo& HSAILProgram::info(const char* str) {
 }

 bool HSAILProgram::saveBinaryAndSetType(type_t type) {
-#if  defined(WITH_COMPILER_LIB)
+#if defined(WITH_COMPILER_LIB)
  // Write binary to memory
  if (rawBinary_ != nullptr) {
    // Free memory containing rawBinary
@@ -378,8 +376,8 @@ bool HSAILProgram::saveBinaryAndSetType(type_t type) {
  return true;
 }

-bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr,
-                                      size_t* bytes, const char* global_name) const {
+bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_pptr, size_t* bytes,
+                                      const char* global_name) const {
  uint32_t length = 0;
  size_t offset = 0;
  uint32_t flags = 0;
@@ -456,7 +454,7 @@ bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_p
  }

  /* Retrieve the Offset from global pal::Memory created @ segment::alloc */
-  if(!codeSegment_->gpuAddressOffset(reinterpret_cast<uint64_t>(*device_pptr), &offset)) {
+  if (!codeSegment_->gpuAddressOffset(reinterpret_cast<uint64_t>(*device_pptr), &offset)) {
    buildLog_ += "Error: Cannot Retrieve the Address Offset";
    buildLog_ += "\n";
    return false;
@@ -484,13 +482,12 @@ bool HSAILProgram::createGlobalVarObj(amd::Memory** amd_mem_obj, void** device_p

 hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) {
  hsa_isa_t isa = {0};
-  uint32_t gfxip  = 0;
+  uint32_t gfxip = 0;
  std::string gfx_target(name);
  if (gfx_target.find("amdgcn-") == 0) {
    std::string gfxip_version_str = gfx_target.substr(gfx_target.find("gfx") + 3);
    gfxip = std::atoi(gfxip_version_str.c_str());
-  }
-  else {
+  } else {
    // FIXME: Old way. To be remove.
    uint32_t shift = 1;
    size_t last = gfx_target.length();
@@ -508,9 +505,9 @@ hsa_isa_t PALHSALoaderContext::IsaFromName(const char* name) {
 }

 bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
-  uint32_t gfxipVersion = program_->dev().settings().useLightning_ ?
-    program_->dev().hwInfo()->gfxipVersionLC_ :
-    program_->dev().hwInfo()->gfxipVersion_;
+  uint32_t gfxipVersion = program_->dev().settings().useLightning_
+      ? program_->dev().hwInfo()->gfxipVersionLC_
+      : program_->dev().hwInfo()->gfxipVersion_;
  uint32_t majorSrc = gfxipVersion / 10;
  uint32_t minorSrc = gfxipVersion % 10;

@@ -519,11 +516,9 @@ bool PALHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa)

  if (majorSrc != majorTrg) {
    return false;
-  }
-  else if (minorTrg == minorSrc) {
+  } else if (minorTrg == minorSrc) {
    return true;
-  }
-  else if (minorTrg < minorSrc) {
+  } else if (minorTrg < minorSrc) {
    LogWarning("ISA downgrade for execution!");
    return true;
  }
@@ -708,7 +703,7 @@ static hsa_status_t GetKernelNamesCallback(hsa_executable_t hExec, hsa_executabl
  return HSA_STATUS_SUCCESS;
 }

-#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+#endif  // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)

 bool LightningProgram::createBinary(amd::option::Options* options) {
 #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
@@ -716,7 +711,7 @@ bool LightningProgram::createBinary(amd::option::Options* options) {
    LogError("Failed to create ELF binary image!");
    return false;
  }
-#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+#endif  // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
  return true;
 }

@@ -752,10 +747,10 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
  }

 #if defined(USE_COMGR_LIBRARY)
-  for (const auto &kernelMeta : kernelMetadataMap_) {
+  for (const auto& kernelMeta : kernelMetadataMap_) {
    auto kernelName = kernelMeta.first;
-    auto kernel = new LightningKernel(kernelName, this,
-                                      options->origOptionStr + ProcessOptions(options));
+    auto kernel =
+        new LightningKernel(kernelName, this, options->origOptionStr + ProcessOptions(options));
    kernels()[kernelName] = kernel;

    if (!kernel->init()) {
@@ -804,9 +799,9 @@ bool LightningProgram::setKernels(amd::option::Options* options, void* binary, s
    maxScratchRegs_ =
        std::max(static_cast<uint>(kernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
  }
-#endif // defined(USE_COMGR_LIBRARY)
+#endif  // defined(USE_COMGR_LIBRARY)
  DestroySegmentCpuAccess();
-#endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
+#endif  // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY)
  return true;
 }

@@ -9,15 +9,15 @@
 namespace amd {
 namespace option {
 class Options;
-}  // option
+}  // namespace option
 namespace hsa {
 namespace loader {
 class Loader;
 class Executable;
 class Context;
-}  // loader
-}  // hsa
-}  // amd
+}  // namespace loader
+}  // namespace hsa
+}  // namespace amd

 //! \namespace pal PAL Device Implementation
 namespace pal {
@@ -50,15 +50,16 @@ class Segment : public amd::HeapObject {
  bool gpuAddressOffset(uint64_t offAddr, size_t* offset);

  //! Returns address for CPU access in the segment
-  void* cpuAddress(size_t offset) const
-    { return ((cpuAccess_ != nullptr) ? cpuAccess_->data() : cpuMem_) + offset; }
+  void* cpuAddress(size_t offset) const {
+    return ((cpuAccess_ != nullptr) ? cpuAccess_->data() : cpuMem_) + offset;
+  }

  void DestroyCpuAccess();

 private:
-  Memory* gpuAccess_;   //!< GPU memory for segment access
-  Memory* cpuAccess_;   //!< CPU memory for segment (backing store)
-  address cpuMem_;      //!< CPU memory for segment without GPU direct access (backing store)
+  Memory* gpuAccess_;  //!< GPU memory for segment access
+  Memory* cpuAccess_;  //!< CPU memory for segment (backing store)
+  address cpuMem_;     //!< CPU memory for segment without GPU direct access (backing store)
 };

 class PALHSALoaderContext final : public Context {
@@ -166,7 +167,7 @@ class HSAILProgram : public device::Program {
  }

  //! Get symbol by name
-  amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t *agent) const {
+  amd::hsa::loader::Symbol* GetSymbol(const char* symbol_name, const hsa_agent_t* agent) const {
    return executable_->GetSymbol(symbol_name, agent);
  }

@@ -180,11 +181,14 @@ class HSAILProgram : public device::Program {
  virtual bool setKernels(amd::option::Options* options, void* binary, size_t binSize) override;

  //! Destroys CPU allocations in the code segment
-  void DestroySegmentCpuAccess() const
-    { if (codeSegment_ != nullptr) { codeSegment_->DestroyCpuAccess(); } }
+  void DestroySegmentCpuAccess() const {
+    if (codeSegment_ != nullptr) {
+      codeSegment_->DestroyCpuAccess();
+    }
+  }

-  virtual bool createGlobalVarObj(amd::Memory** amd_mem_obj, void** dptr,
-                                  size_t* bytes, const char* globalName) const;
+  virtual bool createGlobalVarObj(amd::Memory** amd_mem_obj, void** dptr, size_t* bytes,
+                                  const char* globalName) const;

 private:
  //! Disable default copy constructor
@@ -201,7 +205,7 @@ class HSAILProgram : public device::Program {
  std::vector<Memory*> globalStores_;  //!< Global memory for the program
  Memory* kernels_;                    //!< Table with kernel object pointers
  Memory* codeSegGpu_;                 //!< GPU memory with code objects
-  Segment*  codeSegment_;              //!< Pointer to the code segment for this program
+  Segment* codeSegment_;               //!< Pointer to the code segment for this program
  uint
      maxScratchRegs_;  //!< Maximum number of scratch regs used in the program by individual kernel
  std::list<Sampler*> staticSamplers_;  //!< List od internal static samplers
@@ -214,19 +218,17 @@ class HSAILProgram : public device::Program {
 //! \class Lightning Compiler Program
 class LightningProgram : public HSAILProgram {
 public:
-  LightningProgram(NullDevice& device)
-    : HSAILProgram(device) {
-      isLC_ = true;
-      xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
-      machineTarget_ = dev().hwInfo()->machineTargetLC_;
-    }
+  LightningProgram(NullDevice& device) : HSAILProgram(device) {
+    isLC_ = true;
+    xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
+    machineTarget_ = dev().hwInfo()->machineTargetLC_;
+  }

-  LightningProgram(Device& device)
-    : HSAILProgram(device) {
-      isLC_ = true;
-      xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
-      machineTarget_ = dev().hwInfo()->machineTargetLC_;
-    }
+  LightningProgram(Device& device) : HSAILProgram(device) {
+    isLC_ = true;
+    xnackEnabled_ = dev().hwInfo()->xnackEnabled_;
+    machineTarget_ = dev().hwInfo()->machineTargetLC_;
+  }
  virtual ~LightningProgram() {}

 protected:
@@ -235,4 +237,5 @@ class LightningProgram : public HSAILProgram {
  virtual bool createBinary(amd::option::Options* options) override;
 };

-/*@}*/} // namespace pal
+/*@}*/  // namespace pal
+}  // namespace pal
@@ -41,8 +41,8 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,
  if (memRef != nullptr) {
    result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
    if ((result != Pal::Result::Success) &&
-         // Free cache if PAL failed allocation
-         dev.resourceCache().free()) {
+        // Free cache if PAL failed allocation
+        dev.resourceCache().free()) {
      // If cache was freed, then try to allocate again
      result = dev.iDev()->CreateGpuMemory(createInfo, &memRef[1], &memRef->gpuMem_);
    }
@@ -154,8 +154,7 @@ GpuMemoryReference* GpuMemoryReference::Create(const Device& dev,

 // ================================================================================================
 GpuMemoryReference::GpuMemoryReference(const Device& dev)
-  : gpuMem_(nullptr), cpuAddress_(nullptr), device_(dev), gpu_(nullptr)
-{}
+    : gpuMem_(nullptr), cpuAddress_(nullptr), device_(dev), gpu_(nullptr) {}

 // ================================================================================================
 GpuMemoryReference::~GpuMemoryReference() {
@@ -181,8 +180,7 @@ GpuMemoryReference::~GpuMemoryReference() {
    iMem()->Unmap();
  }
  if (0 != iMem()) {
-    if (!(iMem()->Desc().flags.isShared ||
-          iMem()->Desc().flags.isExternal ||
+    if (!(iMem()->Desc().flags.isShared || iMem()->Desc().flags.isExternal ||
          iMem()->Desc().flags.isExternPhys)) {
      // Update free memory size counters
      device_.updateAllocedMemory(iMem()->Desc().preferredHeap, iMem()->Desc().size, true);
@@ -368,7 +366,7 @@ void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) {
    case Persistent:
      createInfo->heapCount = 2;
      createInfo->heaps[0] = Pal::GpuHeapLocal;
-      createInfo->heaps[1] = Pal:: GpuHeapGartUswc;
+      createInfo->heaps[1] = Pal::GpuHeapGartUswc;
 #ifdef ATI_OS_LINUX
      // Note: SSG in Linux requires DGMA heap
      if (dev().properties().gpuMemoryProperties.busAddressableMemSize > 0) {
@@ -401,11 +399,10 @@ void Resource::memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo) {
 }

 // ================================================================================================
-bool Resource::CreateImage(CreateParams* params)
-{
+bool Resource::CreateImage(CreateParams* params) {
  Pal::Result result;
-  Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 };
-  Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 };
+  Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0};
+  Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1};
  Pal::ChannelMapping channels;
  Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels);

@@ -417,8 +414,7 @@ bool Resource::CreateImage(CreateParams* params)
      memRef_->retain();
      desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
      offset_ += viewOwner_->offset_;
-    }
-    else {
+    } else {
      Pal::GpuMemoryCreateInfo createInfo = {};
      createInfo.size = desc().width_ * elementSize();
      createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
@@ -427,8 +423,8 @@ bool Resource::CreateImage(CreateParams* params)
      createInfo.priority = Pal::GpuMemPriority::Normal;
      memTypeToHeap(&createInfo);
      // createInfo.priority;
-      memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
-        createInfo.alignment, nullptr, &subOffset_);
+      memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
+                                                    nullptr, &subOffset_);
      if (nullptr == memRef_) {
        memRef_ = GpuMemoryReference::Create(dev(), createInfo);
        if (nullptr == memRef_) {
@@ -477,16 +473,16 @@ bool Resource::CreateImage(CreateParams* params)
  imgCreateInfo.arraySize = 1;

  switch (desc_.topology_) {
-  case CL_MEM_OBJECT_IMAGE3D:
-    imgCreateInfo.imageType = Pal::ImageType::Tex3d;
-    viewInfo.viewType = Pal::ImageViewType::Tex3d;
-    break;
-  case CL_MEM_OBJECT_IMAGE1D:
-  case CL_MEM_OBJECT_IMAGE1D_ARRAY:
-  case CL_MEM_OBJECT_IMAGE1D_BUFFER:
-    imgCreateInfo.imageType = Pal::ImageType::Tex1d;
-    viewInfo.viewType = Pal::ImageViewType::Tex1d;
-    break;
+    case CL_MEM_OBJECT_IMAGE3D:
+      imgCreateInfo.imageType = Pal::ImageType::Tex3d;
+      viewInfo.viewType = Pal::ImageViewType::Tex3d;
+      break;
+    case CL_MEM_OBJECT_IMAGE1D:
+    case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+    case CL_MEM_OBJECT_IMAGE1D_BUFFER:
+      imgCreateInfo.imageType = Pal::ImageType::Tex1d;
+      viewInfo.viewType = Pal::ImageViewType::Tex1d;
+      break;
  }
  if (desc_.topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
    ImgSubresRange.numSlices = imgCreateInfo.arraySize = desc_.height_;
@@ -504,8 +500,7 @@ bool Resource::CreateImage(CreateParams* params)
    ImgSubresRange.startSubres.arraySlice = imageView->layer_;
    viewOwner_ = imageView->resource_;
    image_ = viewOwner_->image_;
-  }
-  else if (memoryType() == ImageBuffer) {
+  } else if (memoryType() == ImageBuffer) {
    ImageBufferParams* imageBuffer = reinterpret_cast<ImageBufferParams*>(params);
    viewOwner_ = imageBuffer->resource_;
  }
@@ -515,11 +510,11 @@ bool Resource::CreateImage(CreateParams* params)
  ImgSubresRange.numMips = desc().mipLevels_;

  if ((memoryType() != ImageView) ||
-    //! @todo PAL doesn't allow an SRD view creation with different pixel size
-    (elementSize() != viewOwner_->elementSize())) {
+      //! @todo PAL doesn't allow an SRD view creation with different pixel size
+      (elementSize() != viewOwner_->elementSize())) {
    imgCreateInfo.usageFlags.shaderRead = true;
    imgCreateInfo.usageFlags.shaderWrite =
-      (format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true;
+        (format == Pal::ChNumFormat::X8Y8Z8W8_Srgb) ? false : true;
    imgCreateInfo.swizzledFormat.format = format;
    imgCreateInfo.swizzledFormat.swizzle = channels;
    imgCreateInfo.mipLevels = (desc_.mipLevels_) ? desc_.mipLevels_ : 1;
@@ -529,10 +524,9 @@ bool Resource::CreateImage(CreateParams* params)
    uint32_t rowPitch = 0;

    if (((memoryType() == Persistent) && dev().settings().linearPersistentImage_) ||
-      (memoryType() == ImageBuffer)) {
+        (memoryType() == ImageBuffer)) {
      tiling = Pal::ImageTiling::Linear;
-    }
-    else if (memoryType() == ImageView) {
+    } else if (memoryType() == ImageView) {
      tiling = viewOwner_->image_->GetImageCreateInfo().tiling;
      // Find the new pitch in pixels for the new format
      rowPitch = viewOwner_->desc().pitch_ * viewOwner_->elementSize() / elementSize();
@@ -540,10 +534,9 @@ bool Resource::CreateImage(CreateParams* params)

    if (memoryType() == ImageBuffer) {
      if ((params->owner_ != NULL) && params->owner_->asImage() &&
-        (params->owner_->asImage()->getRowPitch() != 0)) {
+          (params->owner_->asImage()->getRowPitch() != 0)) {
        rowPitch = params->owner_->asImage()->getRowPitch() / elementSize();
-      }
-      else {
+      } else {
        rowPitch = desc().width_;
      }
    }
@@ -579,8 +572,8 @@ bool Resource::CreateImage(CreateParams* params)
    createInfo.priority = Pal::GpuMemPriority::Normal;
    memTypeToHeap(&createInfo);

-    memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
-      createInfo.alignment, nullptr, &subOffset_);
+    memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
+                                                  nullptr, &subOffset_);
    if (nullptr == memRef_) {
      memRef_ = GpuMemoryReference::Create(dev(), createInfo);
      if (nullptr == memRef_) {
@@ -589,8 +582,7 @@ bool Resource::CreateImage(CreateParams* params)
      }
    }
    offset_ += static_cast<size_t>(subOffset_);
-  }
-  else {
+  } else {
    memRef_ = viewOwner_->memRef_;
    memRef_->retain();
    desc_.cardMemory_ = viewOwner_->desc().cardMemory_;
@@ -627,11 +619,10 @@ bool Resource::CreateImage(CreateParams* params)
 }

 // ================================================================================================
-bool Resource::CreateInterop(CreateParams* params)
-{
+bool Resource::CreateInterop(CreateParams* params) {
  Pal::Result result;
-  Pal::SubresId ImgSubresId = { Pal::ImageAspect::Color, 0, 0 };
-  Pal::SubresRange ImgSubresRange = { ImgSubresId, 1, 1 };
+  Pal::SubresId ImgSubresId = {Pal::ImageAspect::Color, 0, 0};
+  Pal::SubresRange ImgSubresRange = {ImgSubresId, 1, 1};
  Pal::ChannelMapping channels;
  Pal::ChNumFormat format = dev().getPalFormat(desc().format_, &channels);
  Pal::ExternalGpuMemoryOpenInfo gpuMemOpenInfo = {};
@@ -645,21 +636,21 @@ bool Resource::CreateInterop(CreateParams* params)
    OGLInteropParams* oglRes = reinterpret_cast<OGLInteropParams*>(params);
    assert(oglRes->glPlatformContext_ && "We don't have OGL context!");
    switch (oglRes->type_) {
-    case InteropVertexBuffer:
-      glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD;
-      break;
-    case InteropRenderBuffer:
-      glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD;
-      break;
-    case InteropTexture:
-    case InteropTextureViewLevel:
-    case InteropTextureViewCube:
-      glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD;
-      break;
-    default:
-      LogError("Unknown OGL interop type!");
-      return false;
-      break;
+      case InteropVertexBuffer:
+        glType_ = GL_RESOURCE_ATTACH_VERTEXBUFFER_AMD;
+        break;
+      case InteropRenderBuffer:
+        glType_ = GL_RESOURCE_ATTACH_RENDERBUFFER_AMD;
+        break;
+      case InteropTexture:
+      case InteropTextureViewLevel:
+      case InteropTextureViewCube:
+        glType_ = GL_RESOURCE_ATTACH_TEXTURE_AMD;
+        break;
+      default:
+        LogError("Unknown OGL interop type!");
+        return false;
+        break;
    }
    glPlatformContext_ = oglRes->glPlatformContext_;
    layer = oglRes->layer_;
@@ -667,17 +658,18 @@ bool Resource::CreateInterop(CreateParams* params)
    mipLevel = oglRes->mipLevel_;

    if (!dev().resGLAssociate(oglRes->glPlatformContext_, oglRes->handle_, glType_,
-      &openInfo.hExternalResource, &glInteropMbRes_, &offset_, desc_.format_
+                              &openInfo.hExternalResource, &glInteropMbRes_, &offset_, desc_.format_
 #ifdef ATI_OS_WIN
-      , openInfo.doppDesktopInfo
+                              ,
+                              openInfo.doppDesktopInfo
 #endif
-    )) {
+                              )) {
      return false;
    }
    desc_.isDoppTexture_ = (openInfo.doppDesktopInfo.gpuVirtAddr != 0);
    format = dev().getPalFormat(desc().format_, &channels);
  }
-#ifdef ATI_OS_WIN	
+#ifdef ATI_OS_WIN
  else {
    D3DInteropParams* d3dRes = reinterpret_cast<D3DInteropParams*>(params);
    openInfo.hExternalResource = d3dRes->handle_;
@@ -713,8 +705,8 @@ bool Resource::CreateInterop(CreateParams* params)
      size_t gpuMemSize;

      if (Pal::Result::Success !=
-        dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize,
-          &imgCreateInfo)) {
+          dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize,
+                                                    &imgCreateInfo)) {
        return false;
      }

@@ -736,51 +728,51 @@ bool Resource::CreateInterop(CreateParams* params)
      imgCreateInfo.depthPitch = desc().height_ * imgCreateInfo.rowPitch;

      switch (misc) {
-      case 1:  // NV12 or P010 formats
-        switch (layer) {
-        case -1:
-        case 0:
+        case 1:  // NV12 or P010 formats
+          switch (layer) {
+            case -1:
+            case 0:
+              break;
+            case 1:
+              // Y - plane size to the offset
+              // NV12 format. UV is 2 times smaller plane Y
+              viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
+              imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
+              break;
+            default:
+              LogError("Unknown Interop View Type");
+              return false;
+          }
          break;
-        case 1:
-          // Y - plane size to the offset
-          // NV12 format. UV is 2 times smaller plane Y
-          viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
+        case 2:  // YV12 format
+          switch (layer) {
+            case -1:
+            case 0:
+              break;
+            case 1:
+              // Y - plane size to the offset
+              // YV12 format. U is 4 times smaller plane than Y
+              viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
+              imgCreateInfo.rowPitch >>= 1;
+              break;
+            case 2:
+              // Y + U plane sizes to the offest.
+              // U plane is 4 times smaller than Y and U == V
+              viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2;
+              imgCreateInfo.rowPitch >>= 1;
+              break;
+            default:
+              LogError("Unknown Interop View Type");
+              return false;
+          }
+          imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
+          break;
+        case 3:  // YUY2 format
          imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
          break;
        default:
          LogError("Unknown Interop View Type");
          return false;
-        }
-        break;
-      case 2:  // YV12 format
-        switch (layer) {
-        case -1:
-        case 0:
-          break;
-        case 1:
-          // Y - plane size to the offset
-          // YV12 format. U is 4 times smaller plane than Y
-          viewOffset = 2 * imgCreateInfo.rowPitch * desc().height_;
-          imgCreateInfo.rowPitch >>= 1;
-          break;
-        case 2:
-          // Y + U plane sizes to the offest.
-          // U plane is 4 times smaller than Y and U == V
-          viewOffset = 5 * imgCreateInfo.rowPitch * desc().height_ / 2;
-          imgCreateInfo.rowPitch >>= 1;
-          break;
-        default:
-          LogError("Unknown Interop View Type");
-          return false;
-        }
-        imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
-        break;
-      case 3:  // YUY2 format
-        imgCreateInfo.depthPitch = imgCreateInfo.rowPitch * desc().height_;
-        break;
-      default:
-        LogError("Unknown Interop View Type");
-        return false;
      }

      imageSize = dev().iDev()->GetImageSize(imgCreateInfo, &result);
@@ -820,8 +812,7 @@ bool Resource::CreateInterop(CreateParams* params)
      hwState_[10] = static_cast<uint32_t>(desc().width_);
      hwState_[11] = 0;  // one extra reserved field in the argument
    }
-  }
-  else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+  } else if (desc().topology_ == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
    memRef_ = GpuMemoryReference::Create(dev(), gpuMemOpenInfo);
    if (nullptr == memRef_) {
      return false;
@@ -842,8 +833,7 @@ bool Resource::CreateInterop(CreateParams* params)
    hwState_[9] = GetHSAILImageOrderType(desc().format_);
    hwState_[10] = static_cast<uint32_t>(desc().width_);
    hwState_[11] = 0;  // one extra reserved field in the argument
-  }
-  else {
+  } else {
    Pal::ExternalImageOpenInfo imgOpenInfo = {};
    Pal::ImageCreateInfo imgCreateInfo = {};
    imgOpenInfo.resourceInfo = openInfo;
@@ -865,14 +855,14 @@ bool Resource::CreateInterop(CreateParams* params)
    viewInfo.possibleLayouts.usages = Pal::LayoutShaderWrite;
    viewInfo.viewType = Pal::ImageViewType::Tex2d;
    switch (imgCreateInfo.imageType) {
-    case Pal::ImageType::Tex3d:
-      viewInfo.viewType = Pal::ImageViewType::Tex3d;
-      break;
-    case Pal::ImageType::Tex1d:
-      viewInfo.viewType = Pal::ImageViewType::Tex1d;
-      break;
-    default:
-      break;
+      case Pal::ImageType::Tex3d:
+        viewInfo.viewType = Pal::ImageViewType::Tex3d;
+        break;
+      case Pal::ImageType::Tex1d:
+        viewInfo.viewType = Pal::ImageViewType::Tex1d;
+        break;
+      default:
+        break;
    }
    viewInfo.pImage = image_;
    viewInfo.swizzledFormat.format = format;
@@ -897,14 +887,13 @@ bool Resource::CreateInterop(CreateParams* params)
    //! It's a workaround for D24S8 format, since PAL doesn't support this format
    //! and GSL decompresses 24bit DEPTH into D24S8 for OGL compatibility
    if ((desc().format_.image_channel_order == CL_DEPTH_STENCIL) &&
-      (desc().format_.image_channel_data_type == CL_UNORM_INT24)) {
-        if (dev().settings().gfx10Plus_) {
-          hwState_[1] = (hwState_[1] & ~0x1ff00000) | 0x08d00000;
-        }
-        else {
-          hwState_[1] &= ~0x3c000000;
-          hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000;
-        }
+        (desc().format_.image_channel_data_type == CL_UNORM_INT24)) {
+      if (dev().settings().gfx10Plus_) {
+        hwState_[1] = (hwState_[1] & ~0x1ff00000) | 0x08d00000;
+      } else {
+        hwState_[1] &= ~0x3c000000;
+        hwState_[1] = (hwState_[1] & ~0x3f00000) | 0x1400000;
+      }
    }
    hwState_[8] = GetHSAILImageFormatType(desc().format_);
    hwState_[9] = GetHSAILImageOrderType(desc().format_);
@@ -915,8 +904,7 @@ bool Resource::CreateInterop(CreateParams* params)
 }

 // ================================================================================================
-bool Resource::CreatePinned(CreateParams* params)
-{
+bool Resource::CreatePinned(CreateParams* params) {
  PinnedParams* pinned = reinterpret_cast<PinnedParams*>(params);
  size_t allocSize = pinned->size_;
  const amd::HostMemoryReference* hostMemRef = pinned->hostMemRef_;
@@ -926,7 +914,7 @@ bool Resource::CreatePinned(CreateParams* params)
  if (desc().topology_ == CL_MEM_OBJECT_BUFFER) {
    // Allign offset to 4K boundary (Vista/Win7 limitation)
    char* tmpHost = const_cast<char*>(
-      amd::alignDown(reinterpret_cast<const char*>(address_), PinnedMemoryAlignment));
+        amd::alignDown(reinterpret_cast<const char*>(address_), PinnedMemoryAlignment));

    // Find the partial size for unaligned copy
    hostMemOffset = static_cast<uint>(reinterpret_cast<const char*>(address_) - tmpHost);
@@ -940,18 +928,16 @@ bool Resource::CreatePinned(CreateParams* params)
    }
    allocSize = amd::alignUp(allocSize, PinnedMemoryAlignment);
    //            hostMemOffset &= ~(0xff);
-  }
-  else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) {
+  } else if (desc().topology_ == CL_MEM_OBJECT_IMAGE2D) {
    //! @todo: Width has to be aligned for 3D.
    //! Need to be replaced with a compute copy
    // Width aligned by 8 texels
    if (((desc().width_ % 0x8) != 0) ||
-      // Pitch aligned by 64 bytes
-      (((desc().width_ * elementSize()) % 0x40) != 0)) {
+        // Pitch aligned by 64 bytes
+        (((desc().width_ * elementSize()) % 0x40) != 0)) {
      return false;
    }
-  }
-  else {
+  } else {
    //! @todo GSL doesn't support pinning with resAlloc_
    return false;
  }
@@ -978,8 +964,7 @@ bool Resource::CreatePinned(CreateParams* params)
 }

 // ================================================================================================
-bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
-{
+bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr) {
  const bool isFineGrain = (memoryType() == RemoteUSWC) || (memoryType() == Remote);
  size_t allocSize = amd::alignUp(desc().width_ * elementSize_,
                                  dev().properties().gpuMemoryProperties.fragmentSize);
@@ -991,20 +976,18 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
    if (svmPtr != 0) {
      createInfo.flags.useReservedGpuVa = true;
      createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
-    }
-    else {
+    } else {
      createInfo.flags.useReservedGpuVa = false;
      createInfo.pReservedGpuVaOwner = nullptr;
    }
    if (!dev().settings().svmFineGrainSystem_) {
-      memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
-        createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
+      memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
+                                                    createInfo.pReservedGpuVaOwner, &subOffset_);
    }
    if (memRef_ == nullptr) {
      memRef_ = GpuMemoryReference::Create(dev(), createInfo);
    }
-  }
-  else {
+  } else {
    Pal::GpuMemoryCreateInfo createInfo = {};
    createInfo.size = allocSize;
    createInfo.alignment = MaxGpuAlignment;
@@ -1015,8 +998,8 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
      createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
    }
    memTypeToHeap(&createInfo);
-    memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
-      createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
+    memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
+                                                  createInfo.pReservedGpuVaOwner, &subOffset_);
    if (memRef_ == nullptr) {
      createInfo.alignment = dev().properties().gpuMemoryProperties.fragmentSize;
      memRef_ = GpuMemoryReference::Create(dev(), createInfo);
@@ -1028,9 +1011,9 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
  }
  desc_.cardMemory_ = false;
  if ((nullptr != params) && (nullptr != params->owner_) &&
-    (nullptr != params->owner_->getSvmPtr())) {
+      (nullptr != params->owner_->getSvmPtr())) {
    params->owner_->setSvmPtr(
-      reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_));
+        reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_));
    offset_ += static_cast<size_t>(subOffset_);
  }
  return true;
@@ -1126,18 +1109,18 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
  Pal::gpusize svmPtr = 0;
  if ((nullptr != params) && (nullptr != params->owner_) &&
      (nullptr != params->owner_->getSvmPtr())) {
-      svmPtr = reinterpret_cast<Pal::gpusize>(params->owner_->getSvmPtr());
-      desc_.SVMRes_ = true;
-      svmPtr = (svmPtr == 1) ? 0 : svmPtr;
+    svmPtr = reinterpret_cast<Pal::gpusize>(params->owner_->getSvmPtr());
+    desc_.SVMRes_ = true;
+    svmPtr = (svmPtr == 1) ? 0 : svmPtr;
  }
  if (desc_.SVMRes_) {
-      return CreateSvm(params, svmPtr);
+    return CreateSvm(params, svmPtr);
  }

  Pal::GpuMemoryCreateInfo createInfo = {};
  createInfo.size = desc().width_ * elementSize_;
  createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
-  createInfo.alignment = desc().scratch_ ? 64*Ki : MaxGpuAlignment;
+  createInfo.alignment = desc().scratch_ ? 64 * Ki : MaxGpuAlignment;
  createInfo.vaRange = Pal::VaRange::Default;
  createInfo.priority = Pal::GpuMemPriority::Normal;

@@ -1152,8 +1135,8 @@ bool Resource::create(MemoryType memType, CreateParams* params) {

  memTypeToHeap(&createInfo);
  // createInfo.priority;
-  memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
-    createInfo.alignment, nullptr, &subOffset_);
+  memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size, createInfo.alignment,
+                                                nullptr, &subOffset_);
  if (nullptr == memRef_) {
    memRef_ = GpuMemoryReference::Create(dev(), createInfo);
    if (nullptr == memRef_) {
@@ -1172,14 +1155,13 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
 }

 // ================================================================================================
-void Resource::free()
-{
+void Resource::free() {
  if (memRef_ == nullptr) {
    return;
  }

  const bool wait =
-    (memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View);
+      (memoryType() != ImageView) && (memoryType() != ImageBuffer) && (memoryType() != View);

  // OCL has to wait, even if resource is placed in the cache, since reallocation can occur
  // and resource can be reused on another async queue without a wait on a busy operation
@@ -1190,8 +1172,7 @@ void Resource::free()
      for (uint idx = 1; idx < dev().vgpus().size(); ++idx) {
        dev().vgpus()[idx]->waitForEvent(&events_[idx]);
      }
-    }
-    else {
+    } else {
      amd::ScopedLock l(memRef_->gpu_->execution());
      memRef_->gpu_->waitForEvent(&events_[memRef_->gpu_->index()]);
    }
@@ -1232,8 +1213,7 @@ void Resource::free()

 // ================================================================================================
 void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const void* data,
-                            bool waitForEvent) const
-{
+                            bool waitForEvent) const {
  GpuEvent event;

  // Write data size bytes to surface
@@ -1242,7 +1222,7 @@ void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const v
  gpu.eventBegin(MainEngine);
  gpu.queue(MainEngine).addCmdMemRef(memRef());
  gpu.iCmd()->CmdUpdateMemory(*iMem(), offset_ + offset, size,
-    reinterpret_cast<const uint32_t*>(data));
+                              reinterpret_cast<const uint32_t*>(data));
  gpu.eventEnd(MainEngine, event);

  if (waitForEvent) {
@@ -1259,8 +1239,7 @@ void Resource::writeRawData(VirtualGPU& gpu, size_t offset, size_t size, const v
 }

 // ================================================================================================
-static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement)
-{
+static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement) {
  if (bytesPerElement == 16) {
    return Pal::ChNumFormat::X32Y32Z32W32_Uint;
  } else if (bytesPerElement == 8) {
@@ -1292,8 +1271,7 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
  if (desc().buffer_ && !dstResource.desc().buffer_) {
    imageOffsetx = dstOrigin[0] % dstResource.elementSize();
    gpuMemoryOffset = srcOrigin[0] + offset();
-    gpuMemoryRowPitch =
-        (srcOrigin[1]) ? srcOrigin[1] : size[0] * dstResource.elementSize();
+    gpuMemoryRowPitch = (srcOrigin[1]) ? srcOrigin[1] : size[0] * dstResource.elementSize();
    img1Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE1D_ARRAY);
    img2Darray = (dstResource.desc().topology_ == CL_MEM_OBJECT_IMAGE2D_ARRAY);
  } else if (!desc().buffer_ && dstResource.desc().buffer_) {
@@ -1374,7 +1352,8 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
    }
    copyRegion.gpuMemoryOffset = gpuMemoryOffset;
    copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch;
-    copyRegion.gpuMemoryDepthPitch = (dstOrigin[2]) ? dstOrigin[2]
+    copyRegion.gpuMemoryDepthPitch = (dstOrigin[2])
+        ? dstOrigin[2]
        : copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
    gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, *dstResource.iMem(), 1, &copyRegion);
  } else {
@@ -1819,17 +1798,14 @@ void Resource::unmap(VirtualGPU* gpu) {
 }

 // ================================================================================================
-void Resource::unmapLayers(VirtualGPU* gpu) {
-  Unimplemented();
-}
+void Resource::unmapLayers(VirtualGPU* gpu) { Unimplemented(); }

 // ================================================================================================
 bool MemorySubAllocator::InitAllocator(GpuMemoryReference* mem_ref) {
-  MemBuddyAllocator* allocator = new MemBuddyAllocator(
-    device_, device_->settings().subAllocationChunkSize_,
-    device_->settings().subAllocationMinSize_);
-  if (!((allocator != nullptr) &&
-        (allocator->Init() == Pal::Result::Success) &&
+  MemBuddyAllocator* allocator =
+      new MemBuddyAllocator(device_, device_->settings().subAllocationChunkSize_,
+                            device_->settings().subAllocationMinSize_);
+  if (!((allocator != nullptr) && (allocator->Init() == Pal::Result::Success) &&
        heaps_.insert({mem_ref, allocator}).second)) {
    mem_ref->release();
    delete allocator;
@@ -1890,8 +1866,7 @@ bool FineMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
 }

 // ================================================================================================
-MemorySubAllocator::~MemorySubAllocator()
-{
+MemorySubAllocator::~MemorySubAllocator() {
  // Release memory heap for suballocations
  for (const auto& it : heaps_) {
    it.first->release();
@@ -1901,8 +1876,8 @@ MemorySubAllocator::~MemorySubAllocator()

 // ================================================================================================
 GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize alignment,
-  const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset)
-{
+                                                 const Pal::IGpuMemory* reserved_va,
+                                                 Pal::gpusize* offset) {
  GpuMemoryReference* mem_ref = nullptr;
  MemBuddyAllocator* allocator = nullptr;
  // Check if the resource size and alignment are allowed for suballocation
@@ -1927,7 +1902,7 @@ GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize
      }
      // We didn't find a valid chunk, so create a new one
      if (!CreateChunk(reserved_va)) {
-          return nullptr;
+        return nullptr;
      }
      i++;
    } while (i < 2);
@@ -1936,8 +1911,7 @@ GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize
 }

 // ================================================================================================
-bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset)
-{
+bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset) {
  bool release_mem = false;
  {
    amd::ScopedLock l(monitor);
@@ -1966,9 +1940,8 @@ ResourceCache::~ResourceCache() { free(); }

 // ================================================================================================
 //! \note the cache works in FILO mode
-bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,
-  GpuMemoryReference* ref, Pal::gpusize offset)
-{
+bool ResourceCache::addGpuMemory(Resource::Descriptor* desc, GpuMemoryReference* ref,
+                                 Pal::gpusize offset) {
  bool result = false;
  size_t size = ref->iMem()->Desc().size;

@@ -2017,7 +1990,9 @@ bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,

 // ================================================================================================
 GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal::gpusize size,
-  Pal::gpusize alignment, const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset) {
+                                                 Pal::gpusize alignment,
+                                                 const Pal::IGpuMemory* reserved_va,
+                                                 Pal::gpusize* offset) {
  amd::ScopedLock l(&lockCacheOps_);
  GpuMemoryReference* ref = nullptr;

@@ -2051,7 +2026,7 @@ GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal
      ref = it.second;
      cacheSize_ -= sizeRes;
      if (entry->type_ == Resource::Local) {
-          lclCacheSize_ -= sizeRes;
+        lclCacheSize_ -= sizeRes;
      }
      delete it.first;
      // Remove the found etry from the cache
@@ -2078,8 +2053,7 @@ bool ResourceCache::free(size_t minCacheEntries) {
 }

 // ================================================================================================
-void ResourceCache::removeLast()
-{
+void ResourceCache::removeLast() {
  std::pair<Resource::Descriptor*, GpuMemoryReference*> entry;
  {
    // Protect access to the global data
@@ -41,11 +41,11 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
  //! Get PAL memory object
  Pal::IGpuMemory* iMem() const { return gpuMem_; }

-  Pal::IGpuMemory* gpuMem_;   //!< PAL GPU memory object
-  void* cpuAddress_;          //!< CPU address of this memory
-  const Device& device_;      //!< GPU device
+  Pal::IGpuMemory* gpuMem_;  //!< PAL GPU memory object
+  void* cpuAddress_;         //!< CPU address of this memory
+  const Device& device_;     //!< GPU device
  //! @note: This field is necessary for the thread safe release only
-  VirtualGPU* gpu_;           //!< Resource will be used only on this queue
+  VirtualGPU* gpu_;  //!< Resource will be used only on this queue

 protected:
  //! Default destructor
@@ -186,7 +186,7 @@ class Resource : public amd::HeapObject {
  //! Constructor of 1D Resource object
  Resource(const Device& gpuDev,  //!< GPU device object
           size_t size            //!< Resource size
-           );
+  );

  //! Constructor of Image Resource object
  Resource(const Device& gpuDev,          //!< GPU device object
@@ -196,7 +196,7 @@ class Resource : public amd::HeapObject {
           cl_image_format format,        //!< resource format
           cl_mem_object_type imageType,  //!< CL image type
           uint mipLevels = 1             //!< Number of mip levels
-           );
+  );

  //! Destructor of the resource
  virtual ~Resource();
@@ -207,7 +207,7 @@ class Resource : public amd::HeapObject {
   */
  virtual bool create(MemoryType memType,       //!< memory type
                      CreateParams* params = 0  //!< special parameters for resource allocation
-                      );
+  );

  /*! \brief Copies a subregion of memory from one resource to another
   *
@@ -253,14 +253,13 @@ class Resource : public amd::HeapObject {
  Pal::IGpuMemory* iMem() const { return memRef_->iMem(); }

  //! Returns a pointer to the memory reference
-  GpuMemoryReference* memRef() const {return memRef_; }
+  GpuMemoryReference* memRef() const { return memRef_; }

  //! Returns global memory offset
  uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; }

  //! Returns global memory offset
-  uint64_t vmSize() const
-    { return desc_.width_ * desc_.height_ * desc_.depth_ * elementSize(); }
+  uint64_t vmSize() const { return desc_.width_ * desc_.height_ * desc_.depth_ * elementSize(); }

  //! Returns global memory offset
  bool mipMapped() const { return (desc().mipLevels_ > 1) ? true : false; }
@@ -279,11 +278,11 @@ class Resource : public amd::HeapObject {
            // Optimization for multilayer map/unmap
            uint startLayer = 0,  //!< Start layer for multilayer map
            uint numLayers = 0    //!< End layer for multilayer map
-            );
+  );

  //! Unlocks the resource if it was locked
  void unmap(VirtualGPU* gpu  //!< Virtual GPU device object
-             );
+  );

  //! Marks the resource as busy
  void setBusy(VirtualGPU& gpu,   //!< Virtual GPU device object
@@ -303,7 +302,7 @@ class Resource : public amd::HeapObject {
                 uint flags = 0,              //!< Map flags
                 size_t rowPitch = 0,         //!< Raw data row pitch
                 size_t slicePitch = 0        //!< Raw data slice pitch
-                 );
+  );

  //! Performs host read from the resource GPU memory
  bool hostRead(VirtualGPU* gpu,             //!< Virtual GPU device object
@@ -312,7 +311,7 @@ class Resource : public amd::HeapObject {
                const amd::Coord3D& size,    //!< The number of bytes to write
                size_t rowPitch = 0,         //!< Raw data row pitch
                size_t slicePitch = 0        //!< Raw data slice pitch
-                );
+  );

  //! Gets the resource element size
  uint elementSize() const { return elementSize_; }
@@ -377,7 +376,7 @@ class Resource : public amd::HeapObject {
      memRef_ = viewOwner_->memRef_;
      memRef_->retain();
      desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) /
-        Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
+          Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
      setBusy(*memRef()->gpu_, GpuEvent::InvalidID);
    }
  }
@@ -390,33 +389,32 @@ class Resource : public amd::HeapObject {

 protected:
  /*! \brief Creates a PAL iamge object, associated with the resource
-  *
-  *  \return True if we succesfully created a PAL resource
-  */
-  bool CreateImage(CreateParams* params //!< special parameters for resource allocation
-                   );
+   *
+   *  \return True if we succesfully created a PAL resource
+   */
+  bool CreateImage(CreateParams* params  //!< special parameters for resource allocation
+  );

  /*! \brief Creates a PAL interop object, associated with the resource
-  *
-  *  \return True if we succesfully created a PAL interop resource
-  */
-  bool CreateInterop(CreateParams* params //!< special parameters for resource allocation
-                     );
+   *
+   *  \return True if we succesfully created a PAL interop resource
+   */
+  bool CreateInterop(CreateParams* params  //!< special parameters for resource allocation
+  );

  /*! \brief Creates a PAL pinned object, associated with the resource
-  *
-  *  \return True if we succesfully created a PAL pinned resource
-  */
-  bool CreatePinned(CreateParams* params //!< special parameters for resource allocation
-                    );
+   *
+   *  \return True if we succesfully created a PAL pinned resource
+   */
+  bool CreatePinned(CreateParams* params  //!< special parameters for resource allocation
+  );

  /*! \brief Creates a PAL SVM object, associated with the resource
-  *
-  *  \return True if we succesfully created a PAL SVM resource
-  */
+   *
+   *  \return True if we succesfully created a PAL SVM resource
+   */
  bool CreateSvm(CreateParams* params,  //!< special parameters for resource allocation
-                 Pal::gpusize svmPtr
-                 );
+                 Pal::gpusize svmPtr);

  uint elementSize_;  //!< Size of a single element in bytes

@@ -433,11 +431,11 @@ class Resource : public amd::HeapObject {
   */
  void* mapLayers(VirtualGPU* gpu,  //!< Virtual GPU device object
                  uint flags = 0    //!< flags for the map operation
-                  );
+  );

  //! Unlocks the resource with layers if it was locked
  void unmapLayers(VirtualGPU* gpu  //!< Virtual GPU device object
-                   );
+  );

  //! Calls PAL to map a resource
  void* gpuMemoryMap(size_t* pitch,             //!< Pitch value for the image
@@ -454,7 +452,7 @@ class Resource : public amd::HeapObject {

  //! Converts Resource memory type to the PAL heaps
  void memTypeToHeap(Pal::GpuMemoryCreateInfo* createInfo  //!< Memory create info
-                     );
+  );

  const Device& gpuDevice_;     //!< GPU device
  Descriptor desc_;             //!< Descriptor for this resource
@@ -462,7 +460,7 @@ class Resource : public amd::HeapObject {
  void* address_;               //!< Physical address of this resource
  size_t offset_;               //!< Resource offset
  GpuMemoryReference* memRef_;  //!< PAL resource reference
-  Pal::gpusize  subOffset_;     //!< GPU memory offset in the oririnal resource
+  Pal::gpusize subOffset_;      //!< GPU memory offset in the oririnal resource
  const Resource* viewOwner_;   //!< GPU resource, which owns this view
  void* glInteropMbRes_;        //!< Mb Res handle
  uint32_t glType_;             //!< GL interop type
@@ -485,41 +483,35 @@ class Resource : public amd::HeapObject {
 typedef Util::BuddyAllocator<Device> MemBuddyAllocator;

 class MemorySubAllocator : public amd::HeapObject {
-public:
+ public:
  MemorySubAllocator(Device* device) : device_(device) {}

  ~MemorySubAllocator();

  //! Create suballocation
-  GpuMemoryReference* Allocate(Pal::gpusize size,
-                               Pal::gpusize alignment,
-                               const Pal::IGpuMemory* reserved_va,
-                               Pal::gpusize* offset
-                               );
+  GpuMemoryReference* Allocate(Pal::gpusize size, Pal::gpusize alignment,
+                               const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset);
  //! Free suballocation
-  bool Free(amd::Monitor* monitor,
-            GpuMemoryReference* mem_ref,
-            Pal::gpusize offset
-            );
+  bool Free(amd::Monitor* monitor, GpuMemoryReference* mem_ref, Pal::gpusize offset);

-protected:
+ protected:
  //! Allocate new chunk of memory
  virtual bool CreateChunk(const Pal::IGpuMemory* reserved_va);
  bool InitAllocator(GpuMemoryReference* mem_ref);

  Device* device_;
-  std::unordered_map<GpuMemoryReference*, MemBuddyAllocator*>  heaps_;
+  std::unordered_map<GpuMemoryReference*, MemBuddyAllocator*> heaps_;
 };

 class CoarseMemorySubAllocator : public MemorySubAllocator {
-public:
+ public:
  CoarseMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}

  bool CreateChunk(const Pal::IGpuMemory* reservedVa) override;
 };

 class FineMemorySubAllocator : public MemorySubAllocator {
-public:
+ public:
  FineMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}

  bool CreateChunk(const Pal::IGpuMemory* reserved_va) override;
@@ -529,29 +521,28 @@ class ResourceCache : public amd::HeapObject {
 public:
  //! Default constructor
  ResourceCache(Device* device, size_t cacheSizeLimit)
-      : lockCacheOps_("PAL resource cache", true)
-      , cacheSize_(0)
-      , lclCacheSize_(0)
-      , cacheSizeLimit_(cacheSizeLimit)
-      , mem_sub_alloc_local_(device)
-      , mem_sub_alloc_coarse_ (device)
-      , mem_sub_alloc_fine_ (device) {}
+      : lockCacheOps_("PAL resource cache", true),
+        cacheSize_(0),
+        lclCacheSize_(0),
+        cacheSizeLimit_(cacheSizeLimit),
+        mem_sub_alloc_local_(device),
+        mem_sub_alloc_coarse_(device),
+        mem_sub_alloc_fine_(device) {}

  //! Default destructor
  ~ResourceCache();

  //! Adds a PAL resource to the cache
-  bool addGpuMemory(Resource::Descriptor* desc,   //!< Resource descriptor - cache key
-                    GpuMemoryReference*   ref,    //!< Resource reference
-                    Pal::gpusize          offset  //!< Original resource offset
-                    );
+  bool addGpuMemory(Resource::Descriptor* desc,  //!< Resource descriptor - cache key
+                    GpuMemoryReference* ref,     //!< Resource reference
+                    Pal::gpusize offset          //!< Original resource offset
+  );

  //! Finds a PAL resource from the cache
  GpuMemoryReference* findGpuMemory(
      Resource::Descriptor* desc,  //!< Resource descriptor - cache key
-      Pal::gpusize size,
-      Pal::gpusize alignment,
-      const Pal::IGpuMemory* reserved_va, //!< Reserved VA for SVM suballocations
+      Pal::gpusize size, Pal::gpusize alignment,
+      const Pal::IGpuMemory* reserved_va,  //!< Reserved VA for SVM suballocations
      Pal::gpusize* offset);

  //! Destroys cache
@@ -576,16 +567,17 @@ class ResourceCache : public amd::HeapObject {

  amd::Monitor lockCacheOps_;  //!< Lock to serialise cache access

-  size_t cacheSize_;            //!< Current cache size in bytes
-  size_t lclCacheSize_;         //!< Local memory stored in the cache
-  const size_t cacheSizeLimit_; //!< Cache size limit in bytes
+  size_t cacheSize_;             //!< Current cache size in bytes
+  size_t lclCacheSize_;          //!< Local memory stored in the cache
+  const size_t cacheSizeLimit_;  //!< Cache size limit in bytes

  //! PAL resource cache
  std::list<std::pair<Resource::Descriptor*, GpuMemoryReference*> > resCache_;

-  MemorySubAllocator  mem_sub_alloc_local_;  //!< Allocator for suballocations in Local
-  CoarseMemorySubAllocator mem_sub_alloc_coarse_; //!< Allocator for suballocations in Coarse SVM
-  FineMemorySubAllocator mem_sub_alloc_fine_; //!< Allocator for suballocations in Fine SVM
+  MemorySubAllocator mem_sub_alloc_local_;         //!< Allocator for suballocations in Local
+  CoarseMemorySubAllocator mem_sub_alloc_coarse_;  //!< Allocator for suballocations in Coarse SVM
+  FineMemorySubAllocator mem_sub_alloc_fine_;      //!< Allocator for suballocations in Fine SVM
 };

-/*@}*/} // namespace pal
+/*@}*/  // namespace pal
+}  // namespace pal
@@ -136,7 +136,7 @@ Settings::Settings() {
  subAllocationMinSize_ = 4 * Ki;
  subAllocationChunkSize_ = 64 * Mi;
  subAllocationMaxSize_ =
-    std::min(static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);
+      std::min(static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);

  maxCmdBuffers_ = 12;
  useLightning_ = GPU_ENABLE_LC;
@@ -148,8 +148,7 @@ Settings::Settings() {

 bool Settings::create(const Pal::DeviceProperties& palProp,
                      const Pal::GpuMemoryHeapProperties* heaps, const Pal::WorkStationCaps& wscaps,
-                      bool reportAsOCL12Device)
-{
+                      bool reportAsOCL12Device) {
  uint32_t osVer = 0x0;

  // Disable thread trace by default for all devices
@@ -198,8 +197,9 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
    case Pal::AsicRevision::Navi10Lite:
      gfx10Plus_ = true;
      useLightning_ = (!flagIsDefault(GPU_ENABLE_LC)) ? GPU_ENABLE_LC : true;
-      hsailExplicitXnack_ = static_cast<uint>(palProp.gpuMemoryProperties.flags.pageMigrationEnabled
-        || palProp.gpuMemoryProperties.flags.iommuv2Support);
+      hsailExplicitXnack_ =
+          static_cast<uint>(palProp.gpuMemoryProperties.flags.pageMigrationEnabled ||
+                            palProp.gpuMemoryProperties.flags.iommuv2Support);
      enableWgpMode_ = GPU_ENABLE_WGP_MODE;
      if (useLightning_) {
        enableWave32Mode_ = true;
@@ -346,7 +346,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
  if (VerifyVersionInfo(&versionInfo, VER_MAJORVERSION | VER_MINORVERSION, conditionMask)) {
    splitSizeForWin7_ = true;  // Update flag of DMA flush split size for Win 7
    if (modifyMaxWorkload.time > 0) {
-      maxWorkloadTime_ = modifyMaxWorkload.time; // Update max workload time
+      maxWorkloadTime_ = modifyMaxWorkload.time;  // Update max workload time
    }
  }
 #endif  // defined(_WIN32)
@@ -39,63 +39,63 @@ class Settings : public device::Settings {

  union {
    struct {
-      uint remoteAlloc_ : 1;             //!< Allocate remote memory for the heap
-      uint stagedXferRead_ : 1;          //!< Uses a staged buffer read
-      uint stagedXferWrite_ : 1;         //!< Uses a staged buffer write
-      uint disablePersistent_ : 1;       //!< Disables using persistent memory for staging
-      uint imageSupport_ : 1;            //!< Report images support
-      uint doublePrecision_ : 1;         //!< Enables double precision support
-      uint use64BitPtr_ : 1;             //!< Use 64bit pointers on GPU
-      uint force32BitOcl20_ : 1;         //!< Force 32bit apps to take CLANG/HSAIL path on GPU
-      uint imageDMA_ : 1;                //!< Enable direct image DMA transfers
-      uint viPlus_ : 1;                  //!< VI and post VI features
-      uint aiPlus_ : 1;                  //!< AI and post AI features
-      uint gfx10Plus_ : 1;               //!< gfx10 and post gfx10 features
-      uint threadTraceEnable_ : 1;       //!< Thread trace enable
-      uint linearPersistentImage_ : 1;   //!< Allocates linear images in persistent
-      uint useSingleScratch_ : 1;        //!< Allocates single scratch per device
-      uint svmAtomics_ : 1;              //!< SVM device atomics
-      uint svmFineGrainSystem_ : 1;      //!< SVM fine grain system support
-      uint useDeviceQueue_ : 1;          //!< Submit to separate device queue
-      uint sdamPageFaultWar_ : 1;        //!< SDMA page fault workaround
-      uint rgpSqttWaitIdle_: 1;          //!< Wait for idle after SQTT trace
-      uint rgpSqttForceDisable_: 1;      //!< Disables SQTT
-      uint splitSizeForWin7_: 1;         //!< DMA flush split size for Win 7
+      uint remoteAlloc_ : 1;            //!< Allocate remote memory for the heap
+      uint stagedXferRead_ : 1;         //!< Uses a staged buffer read
+      uint stagedXferWrite_ : 1;        //!< Uses a staged buffer write
+      uint disablePersistent_ : 1;      //!< Disables using persistent memory for staging
+      uint imageSupport_ : 1;           //!< Report images support
+      uint doublePrecision_ : 1;        //!< Enables double precision support
+      uint use64BitPtr_ : 1;            //!< Use 64bit pointers on GPU
+      uint force32BitOcl20_ : 1;        //!< Force 32bit apps to take CLANG/HSAIL path on GPU
+      uint imageDMA_ : 1;               //!< Enable direct image DMA transfers
+      uint viPlus_ : 1;                 //!< VI and post VI features
+      uint aiPlus_ : 1;                 //!< AI and post AI features
+      uint gfx10Plus_ : 1;              //!< gfx10 and post gfx10 features
+      uint threadTraceEnable_ : 1;      //!< Thread trace enable
+      uint linearPersistentImage_ : 1;  //!< Allocates linear images in persistent
+      uint useSingleScratch_ : 1;       //!< Allocates single scratch per device
+      uint svmAtomics_ : 1;             //!< SVM device atomics
+      uint svmFineGrainSystem_ : 1;     //!< SVM fine grain system support
+      uint useDeviceQueue_ : 1;         //!< Submit to separate device queue
+      uint sdamPageFaultWar_ : 1;       //!< SDMA page fault workaround
+      uint rgpSqttWaitIdle_ : 1;        //!< Wait for idle after SQTT trace
+      uint rgpSqttForceDisable_ : 1;    //!< Disables SQTT
+      uint splitSizeForWin7_ : 1;       //!< DMA flush split size for Win 7
      uint reserved_ : 11;
    };
    uint value_;
  };

-  uint oclVersion_;            //!< Reported OpenCL version support
-  uint debugFlags_;            //!< Debug GPU flags
-  uint hwLDSSize_;             //!< HW local data store size
-  uint maxWorkGroupSize_;      //!< Requested workgroup size for this device
-  uint preferredWorkGroupSize_;//!< Requested preferred workgroup size for this device
-  uint workloadSplitSize_;     //!< Workload split size
-  uint minWorkloadTime_;       //!< Minimal workload time in 0.1 ms
-  uint maxWorkloadTime_;       //!< Maximum workload time in 0.1 ms
-  uint blitEngine_;            //!< Blit engine type
-  uint cacheLineSize_;         //!< Cache line size in bytes
-  uint cacheSize_;             //!< L1 cache size in bytes
-  uint numComputeRings_;       //!< 0 - disabled, 1 , 2,.. - the number of compute rings
-  uint numDeviceEvents_;       //!< The number of device events
-  uint numWaitEvents_;         //!< The number of wait events for device enqueue
-  uint hostMemDirectAccess_;   //!< Enables direct access to the host memory
-  uint numScratchWavesPerCu_;  //!< Maximum number of waves when scratch is enabled
-  size_t xferBufSize_;         //!< Transfer buffer size for image copy optimization
-  size_t stagedXferSize_;      //!< Staged buffer size
-  size_t pinnedXferSize_;      //!< Pinned buffer size for transfer
-  size_t pinnedMinXferSize_;   //!< Minimal buffer size for pinned transfer
-  size_t resourceCacheSize_;   //!< Resource cache size in MB
-  size_t numMemDependencies_;  //!< The array size for memory dependencies tracking
-  uint64_t maxAllocSize_;      //!< Maximum single allocation size
-  uint rgpSqttDispCount_;      //!< The number of dispatches captured in SQTT
-  uint maxCmdBuffers_;         //!< Maximum number of command buffers allocated per queue
+  uint oclVersion_;              //!< Reported OpenCL version support
+  uint debugFlags_;              //!< Debug GPU flags
+  uint hwLDSSize_;               //!< HW local data store size
+  uint maxWorkGroupSize_;        //!< Requested workgroup size for this device
+  uint preferredWorkGroupSize_;  //!< Requested preferred workgroup size for this device
+  uint workloadSplitSize_;       //!< Workload split size
+  uint minWorkloadTime_;         //!< Minimal workload time in 0.1 ms
+  uint maxWorkloadTime_;         //!< Maximum workload time in 0.1 ms
+  uint blitEngine_;              //!< Blit engine type
+  uint cacheLineSize_;           //!< Cache line size in bytes
+  uint cacheSize_;               //!< L1 cache size in bytes
+  uint numComputeRings_;         //!< 0 - disabled, 1 , 2,.. - the number of compute rings
+  uint numDeviceEvents_;         //!< The number of device events
+  uint numWaitEvents_;           //!< The number of wait events for device enqueue
+  uint hostMemDirectAccess_;     //!< Enables direct access to the host memory
+  uint numScratchWavesPerCu_;    //!< Maximum number of waves when scratch is enabled
+  size_t xferBufSize_;           //!< Transfer buffer size for image copy optimization
+  size_t stagedXferSize_;        //!< Staged buffer size
+  size_t pinnedXferSize_;        //!< Pinned buffer size for transfer
+  size_t pinnedMinXferSize_;     //!< Minimal buffer size for pinned transfer
+  size_t resourceCacheSize_;     //!< Resource cache size in MB
+  size_t numMemDependencies_;    //!< The array size for memory dependencies tracking
+  uint64_t maxAllocSize_;        //!< Maximum single allocation size
+  uint rgpSqttDispCount_;        //!< The number of dispatches captured in SQTT
+  uint maxCmdBuffers_;           //!< Maximum number of command buffers allocated per queue
+
+  uint64_t subAllocationMinSize_;    //!< Minimum size allowed for suballocations
+  uint64_t subAllocationMaxSize_;    //!< Maximum size allowed with suballocations
+  uint64_t subAllocationChunkSize_;  //!< Chunk size for suballocations

-  uint64_t subAllocationMinSize_;   //!< Minimum size allowed for suballocations
-  uint64_t subAllocationMaxSize_;   //!< Maximum size allowed with suballocations
-  uint64_t subAllocationChunkSize_; //!< Chunk size for suballocaitons
-  
  amd::LibrarySelector libSelector_;  //!< Select linking libraries for compiler

  //! Default constructor
@@ -106,7 +106,7 @@ class Settings : public device::Settings {
              const Pal::GpuMemoryHeapProperties* heaps,  //!< PAL heap settings
              const Pal::WorkStationCaps& wscaps,         //!< PAL  workstation settings
              bool reportAsOCL12Device = false            //!< Report As OpenCL1.2 Device
-              );
+  );

 private:
  //! Disable copy constructor
@@ -119,4 +119,5 @@ class Settings : public device::Settings {
  void override();
 };

-/*@}*/} // namespace pal
+/*@}*/  // namespace pal
+}  // namespace pal
@@ -40,7 +40,7 @@ class TimeStamp : public amd::HeapObject {
            Pal::IGpuMemory* iMem,  //!< Buffer with the timer values
            uint memOffset,         //!< Offset in the buffer for the current TS
            address cpuAddr         //!< CPU pointer for the values in memory
-            );
+  );

  //! Default destructor
  ~TimeStamp();
@@ -114,4 +114,5 @@ class TimeStampCache : public amd::HeapObject {
  uint tsOffset_;                    //!< Active offset in the current mem object
 };

-/*@}*/} // namespace pal
+/*@}*/  // namespace pal
+}  // namespace pal
@@ -70,8 +70,7 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueTy
  if (qCreateInfo.engineType == Pal::EngineTypeExclusiveCompute) {
    if (it != gpu.dev().exclusiveComputeEnginesId().end()) {
      qCreateInfo.engineIndex = it->second;
-    }
-    else {
+    } else {
      return nullptr;
    }
  }
@@ -97,8 +96,8 @@ VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueTy
  }

  size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
-  VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(gpu, palDev,
-    residency_limit, max_command_buffers);
+  VirtualGPU::Queue* queue =
+      new (allocSize) VirtualGPU::Queue(gpu, palDev, residency_limit, max_command_buffers);
  if (queue != nullptr) {
    address addrQ = reinterpret_cast<address>(&queue[1]);
    // Create PAL queue object
@@ -163,16 +162,16 @@ VirtualGPU::Queue::~Queue() {
  }
 }

-Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile()
-{
-    std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName();
+Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile() {
+  std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName();

-    const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str();
-    // Find the last occurance of the '\\' character and extract the name of the application as wide char.
-    const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\');
-    const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName;
+  const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str();
+  // Find the last occurrence of the '\\' character and extract the name of the application as wide
+  // char.
+  const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\');
+  const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName;

-    return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName);
+  return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName);
 }

 void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
@@ -188,8 +187,7 @@ void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
    memRef.pGpuMemory = iMem;
    palMemRefs_.push_back(memRef);
    // Check SDI memory object
-    if (iMem->Desc().flags.isExternPhys &&
-        (sdiReferences_.find(iMem) == sdiReferences_.end())) {
+    if (iMem->Desc().flags.isExternPhys && (sdiReferences_.find(iMem) == sdiReferences_.end())) {
      sdiReferences_.insert(iMem);
      palSdiRefs_.push_back(iMem);
    }
@@ -268,8 +266,7 @@ bool VirtualGPU::Queue::flush() {
  // Submit command buffer to OS
  Pal::Result result;
  if (gpu_.rgpCaptureEna()) {
-    result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(
-      iQueue_, cmdBufIdCurrent_, submitInfo);
+    result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo);
  } else {
    result = iQueue_->Submit(submitInfo);
  }
@@ -383,28 +380,28 @@ void VirtualGPU::Queue::DumpMemoryReferences() const {
  if (dump.is_open()) {
    dump << start << " Queue: ";
    switch (iQueue_->Type()) {
-    case Pal::QueueTypeCompute:
-      dump << "Compute";
-      break;
-    case Pal::QueueTypeDma:
-      dump << "SDMA";
-      break;
-    default:
-      dump << "unknown";
-      break;
+      case Pal::QueueTypeCompute:
+        dump << "Compute";
+        break;
+      case Pal::QueueTypeDma:
+        dump << "SDMA";
+        break;
+      default:
+        dump << "unknown";
+        break;
    }
    dump << "\n"
-        << "Resident memory resources:\n";
+         << "Resident memory resources:\n";
    uint idx = 0;
    for (auto it : memReferences_) {
      dump << " " << idx << "\t[";
      dump.setf(std::ios::hex, std::ios::basefield);
      dump.setf(std::ios::showbase);
      dump << (it.first)->iMem()->Desc().gpuVirtAddr << ", "
-          << (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
+           << (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
      dump.setf(std::ios::dec);
-      dump << "] CbId:" << it.second <<
-          ", Heap: " << (it.first)->iMem()->Desc().preferredHeap << "\n";
+      dump << "] CbId:" << it.second << ", Heap: " << (it.first)->iMem()->Desc().preferredHeap
+           << "\n";
      idx++;
    }

@@ -414,8 +411,7 @@ void VirtualGPU::Queue::DumpMemoryReferences() const {
      for (size_t i = 0; i < signature.numParameters(); ++i) {
        const amd::KernelParameterDescriptor& desc = signature.at(i);
        // Find if the current argument is a memory object
-        if ((desc.type_ == T_POINTER) &&
-            (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
+        if ((desc.type_ == T_POINTER) && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
          dump << " " << desc.name_ << ": " << std::endl;
        }
      }
@@ -519,7 +515,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
      // note: The array growth shouldn't occur under the normal conditions,
      // but in a case when SVM path sends the amount of SVM ptrs over
      // the max size of kernel arguments
-      MemoryState* ptr  = new MemoryState[maxMemObjectsInQueue_ << 1];
+      MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1];
      if (nullptr == ptr) {
        numMemObjectsInQueue_ = 0;
        return;
@@ -527,7 +523,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
      maxMemObjectsInQueue_ <<= 1;
      memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_);
      delete[] memObjectsInQueue_;
-      memObjectsInQueue_= ptr;
+      memObjectsInQueue_ = ptr;
    }

    // Adjust the number of active objects
@@ -748,7 +744,6 @@ VirtualGPU::VirtualGPU(Device& device)
      maskGroups_(1),
      hsaQueueMem_(nullptr),
      cmdAllocator_(nullptr) {
-
  // Note: Virtual GPU device creation must be a thread safe operation
  index_ = gpuDevice_.numOfVgpus_++;
  gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus());
@@ -780,8 +775,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
  createInfo.flags.autoMemoryReuse = false;
  createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
  createInfo.allocInfo[Pal::CommandDataAlloc].allocSize =
-  createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
-    VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0));
+      createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
+          VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0));

  createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
  createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 64 * Ki;
@@ -803,8 +798,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,

  const uint firstQueue = (dev().numComputeEngines() > 2) ? 1 : 0;
  uint idx = index() % (dev().numComputeEngines() - firstQueue);
-  uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs ? 0 :
-    (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
+  uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs
+      ? 0
+      : (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
  uint max_cmd_buffers = dev().settings().maxCmdBuffers_;

  if (dev().numComputeEngines()) {
@@ -815,9 +811,9 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
    // hwRing_ should be set 0 if forced to have single scratch buffer
    hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;

-    queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue,
-                                        cmdAllocator_, rtCUs, priority,
-                                        residency_limit, max_cmd_buffers);
+    queues_[MainEngine] =
+        Queue::Create(*this, Pal::QueueTypeCompute, idx + firstQueue, cmdAllocator_, rtCUs,
+                      priority, residency_limit, max_cmd_buffers);
    if (nullptr == queues_[MainEngine]) {
      return false;
    }
@@ -832,20 +828,19 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
        sdma = 1;
      }

-      queues_[SdmaEngine] =
-          Queue::Create(*this, Pal::QueueTypeDma, sdma, cmdAllocator_,
-                        amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal,
-                        residency_limit, max_cmd_buffers);
+      queues_[SdmaEngine] = Queue::Create(
+          *this, Pal::QueueTypeDma, sdma, cmdAllocator_, amd::CommandQueue::RealTimeDisabled,
+          amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
      if (nullptr == queues_[SdmaEngine]) {
        return false;
      }
    } else {
-        queues_[SdmaEngine] = Queue::Create(*this, Pal::QueueTypeCompute,
-            idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal,
-            residency_limit, max_cmd_buffers);
-        if (nullptr == queues_[SdmaEngine]) {
-            return false;
-        }
+      queues_[SdmaEngine] =
+          Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs,
+                        amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
+      if (nullptr == queues_[SdmaEngine]) {
+        return false;
+      }
    }
  } else {
    Unimplemented();
@@ -921,7 +916,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
    bool dbg_vmid = false;
    state_.rgpCaptureEnabled_ = true;
    dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index(), queue(MainEngine).iQueue_, &dbg_vmid);
-    dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_, &dbg_vmid);
+    dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_,
+                                              &dbg_vmid);
  }

  return true;
@@ -1511,99 +1507,99 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) {
 void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) {
  bool unmapMip = false;
  amd::Image* amdImage;
-{
-  // Make sure VirtualGPU has an exclusive access to the resources
-  amd::ScopedLock lock(execution());
+  {
+    // Make sure VirtualGPU has an exclusive access to the resources
+    amd::ScopedLock lock(execution());

-  pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
-  amd::Memory* owner = memory->owner();
-  const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
-  if (nullptr == writeMapInfo) {
-    LogError("Unmap without map call");
-    return;
-  }
-  profilingBegin(vcmd, true);
-
-  // Check if image is a mipmap and assign a saved view
-  amdImage = owner->asImage();
-  if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
-      (writeMapInfo->baseMip_ != nullptr)) {
-    // Assign mip level view
-    amdImage = writeMapInfo->baseMip_;
-    // Clear unmap flags from the parent image
-    memory->clearUnmapInfo(vcmd.mapPtr());
-    memory = dev().getGpuMemory(amdImage);
-    unmapMip = true;
-    writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
-  }
-
-  // We used host memory
-  if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
-    if (writeMapInfo->isUnmapWrite()) {
-      // Target is the backing store, so sync
-      owner->signalWrite(nullptr);
-      memory->syncCacheFromHost(*this);
+    pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
+    amd::Memory* owner = memory->owner();
+    const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
+    if (nullptr == writeMapInfo) {
+      LogError("Unmap without map call");
+      return;
    }
-    // Remove memory from VA cache
-    dev().removeVACache(memory);
-  }
-  // data check was added for persistent memory that failed to get aperture
-  // and therefore are treated like a remote resource
-  else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) {
-    memory->unmap(this);
-  } else if (memory->mapMemory() != nullptr) {
-    if (writeMapInfo->isUnmapWrite()) {
-      amd::Coord3D srcOrigin(0, 0, 0);
-      // Target is a remote resource, so copy
-      assert(memory->mapMemory() != nullptr);
-      if (memory->desc().buffer_) {
-        if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
-                                  writeMapInfo->origin_, writeMapInfo->region_,
-                                  writeMapInfo->isEntire())) {
-          LogError("submitUnmapMemory() - copy failed");
-          vcmd.setStatus(CL_OUT_OF_RESOURCES);
-        }
-      } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
-        Memory* memoryBuf = memory;
-        amd::Coord3D origin(writeMapInfo->origin_[0]);
-        amd::Coord3D size(writeMapInfo->region_[0]);
-        size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
-        origin.c[0] *= elemSize;
-        size.c[0] *= elemSize;
+    profilingBegin(vcmd, true);

-        amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
-        if (nullptr == bufferFromImage) {
-          LogError("We should not fail buffer creation from image_buffer!");
+    // Check if image is a mipmap and assign a saved view
+    amdImage = owner->asImage();
+    if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
+        (writeMapInfo->baseMip_ != nullptr)) {
+      // Assign mip level view
+      amdImage = writeMapInfo->baseMip_;
+      // Clear unmap flags from the parent image
+      memory->clearUnmapInfo(vcmd.mapPtr());
+      memory = dev().getGpuMemory(amdImage);
+      unmapMip = true;
+      writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
+    }
+
+    // We used host memory
+    if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
+      if (writeMapInfo->isUnmapWrite()) {
+        // Target is the backing store, so sync
+        owner->signalWrite(nullptr);
+        memory->syncCacheFromHost(*this);
+      }
+      // Remove memory from VA cache
+      dev().removeVACache(memory);
+    }
+    // data check was added for persistent memory that failed to get aperture
+    // and therefore are treated like a remote resource
+    else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) {
+      memory->unmap(this);
+    } else if (memory->mapMemory() != nullptr) {
+      if (writeMapInfo->isUnmapWrite()) {
+        amd::Coord3D srcOrigin(0, 0, 0);
+        // Target is a remote resource, so copy
+        assert(memory->mapMemory() != nullptr);
+        if (memory->desc().buffer_) {
+          if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
+                                    writeMapInfo->origin_, writeMapInfo->region_,
+                                    writeMapInfo->isEntire())) {
+            LogError("submitUnmapMemory() - copy failed");
+            vcmd.setStatus(CL_OUT_OF_RESOURCES);
+          }
+        } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
+          Memory* memoryBuf = memory;
+          amd::Coord3D origin(writeMapInfo->origin_[0]);
+          amd::Coord3D size(writeMapInfo->region_[0]);
+          size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
+          origin.c[0] *= elemSize;
+          size.c[0] *= elemSize;
+
+          amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
+          if (nullptr == bufferFromImage) {
+            LogError("We should not fail buffer creation from image_buffer!");
+          } else {
+            memoryBuf = dev().getGpuMemory(bufferFromImage);
+          }
+          if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size,
+                                    writeMapInfo->isEntire())) {
+            LogError("submitUnmapMemory() - copy failed");
+            vcmd.setStatus(CL_OUT_OF_RESOURCES);
+          }
+          if (nullptr != bufferFromImage) {
+            bufferFromImage->release();
+          }
        } else {
-          memoryBuf = dev().getGpuMemory(bufferFromImage);
-        }
-        if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size,
-                                  writeMapInfo->isEntire())) {
-          LogError("submitUnmapMemory() - copy failed");
-          vcmd.setStatus(CL_OUT_OF_RESOURCES);
-        }
-        if (nullptr != bufferFromImage) {
-          bufferFromImage->release();
-        }
-      } else {
-        if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin,
-                                         writeMapInfo->origin_, writeMapInfo->region_,
-                                         writeMapInfo->isEntire())) {
-          LogError("submitUnmapMemory() - copy failed");
-          vcmd.setStatus(CL_OUT_OF_RESOURCES);
+          if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin,
+                                           writeMapInfo->origin_, writeMapInfo->region_,
+                                           writeMapInfo->isEntire())) {
+            LogError("submitUnmapMemory() - copy failed");
+            vcmd.setStatus(CL_OUT_OF_RESOURCES);
+          }
        }
      }
+    } else {
+      LogError("Unhandled unmap!");
+      vcmd.setStatus(CL_INVALID_VALUE);
    }
-  } else {
-    LogError("Unhandled unmap!");
-    vcmd.setStatus(CL_INVALID_VALUE);
+
+    // Clear unmap flags
+    memory->clearUnmapInfo(vcmd.mapPtr());
+
+    profilingEnd(vcmd);
  }
-
-  // Clear unmap flags
-  memory->clearUnmapInfo(vcmd.mapPtr());
-
-  profilingEnd(vcmd);
-}
  // Release a view for a mipmap map
  if (unmapMip) {
    // Memory release should be outside of the execution lock,
@@ -1700,9 +1696,9 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
  profilingBegin(cmd);

  Memory* srcDevMem = static_cast<pal::Memory*>(
-    cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0]));
+      cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0]));
  Memory* dstDevMem = static_cast<pal::Memory*>(
-    cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0]));
+      cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0]));

  bool p2pAllowed = false;
 #if 0
@@ -1728,16 +1724,15 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
      amd::Coord3D dstOrigin(cmd.dstOrigin()[0]);

      if (p2pAllowed) {
-        result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin,
-                                      size, cmd.isEntireMemory());
-      }
-      else {
+        result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size,
+                                      cmd.isEntireMemory());
+      } else {
        amd::ScopedLock lock(dev().P2PStageOps());
        Memory* dstStgMem = static_cast<pal::Memory*>(
-          dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
+            dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
        Memory* srcStgMem = static_cast<pal::Memory*>(
-          dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));
-          
+            dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));
+
        size_t copy_size = Device::kP2PStagingSize;
        size_t left_size = size[0];
        amd::Coord3D stageOffset(0);
@@ -1750,11 +1745,11 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
          amd::Coord3D cpSize(copy_size);

          // Perform 2 step transfer with staging buffer
-          result &= dev().xferMgr().copyBuffer(
-            *srcDevMem, *dstStgMem, srcOrigin, stageOffset, cpSize);
+          result &=
+              dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin, stageOffset, cpSize);
          srcOrigin.c[0] += copy_size;
-          result &= dstDevMem->dev().xferMgr().copyBuffer(
-            *srcStgMem, *dstDevMem, stageOffset, dstOrigin, cpSize);
+          result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset,
+                                                          dstOrigin, cpSize);
          dstOrigin.c[0] += copy_size;
        } while (left_size > 0);
      }
@@ -1940,10 +1935,8 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) {
 }

 // ================================================================================================
-void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQueue)
-{
-  AmdAqlWrap* wraps =
-      (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
+void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQueue) {
+  AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
  uint p = 0;
  for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
    if (wraps[i].state != 0) {
@@ -1963,11 +1956,9 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
      print << "\twait_list: " << wraps[i].wait_list << "\n";
      print << "\twait_num: " << wraps[i].wait_num << "\n";
      uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress();
-      size_t* events =
-          reinterpret_cast<size_t*>(gpuDefQueue->virtualQueue_->data() + offsEvents);
+      size_t* events = reinterpret_cast<size_t*>(gpuDefQueue->virtualQueue_->data() + offsEvents);
      for (j = 0; j < wraps[i].wait_num; ++j) {
-        uint offs =
-            static_cast<uint64_t>(events[j]) - gpuDefQueue->virtualQueue_->vmAddress();
+        uint offs = static_cast<uint64_t>(events[j]) - gpuDefQueue->virtualQueue_->vmAddress();
        AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
        print << "Wait Event#: " << j << "\n";
        print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n";
@@ -1980,8 +1971,8 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
      print << wraps[i].aql.grid_size_z << "]\n";

      HSAILKernel* child = nullptr;
-      for (auto it = hsaKernel.prog().kernels().begin();
-        it != hsaKernel.prog().kernels().end(); ++it) {
+      for (auto it = hsaKernel.prog().kernels().begin(); it != hsaKernel.prog().kernels().end();
+           ++it) {
        if (wraps[i].aql.kernel_object == static_cast<HSAILKernel*>(it->second)->gpuAqlCode()) {
          child = static_cast<HSAILKernel*>(it->second);
        }
@@ -1995,7 +1986,7 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
      uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress();
      address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
      print << "Kernel: " << child->name() << "\n";
-      const amd::KernelSignature&  signature = child->signature();
+      const amd::KernelSignature& signature = child->signature();

      // Check if runtime has to setup hidden arguments
      for (const auto it : signature.parameters()) {
@@ -2033,7 +2024,7 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
          continue;
        }
        print << "\t" << it.name_ << ": ";
-        for (int s = it.size_- 1; s >= 0; --s) {
+        for (int s = it.size_ - 1; s >= 0; --s) {
          print.width(2);
          print.fill('0');
          print << static_cast<uint32_t>(argum[s]);
@@ -2047,26 +2038,20 @@ void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQ
 }

 // ================================================================================================
-bool VirtualGPU::PreDeviceEnqueue(
-    const amd::Kernel& kernel,
-    const HSAILKernel& hsaKernel,
-    VirtualGPU** gpuDefQueue,
-    uint64_t* vmDefQueue)
-{
+bool VirtualGPU::PreDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& hsaKernel,
+                                  VirtualGPU** gpuDefQueue, uint64_t* vmDefQueue) {
  amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
  if (nullptr == defQueue) {
    LogError("Default device queue wasn't allocated");
    return false;
-  }
-  else {
+  } else {
    if (dev().settings().useDeviceQueue_) {
      *gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
      if ((*gpuDefQueue)->hwRing() == hwRing()) {
        LogError("Can't submit the child kernels to the same HW ring as the host queue!");
        return false;
      }
-    }
-    else {
+    } else {
      createVirtualQueue(defQueue->size());
      *gpuDefQueue = this;
    }
@@ -2086,15 +2071,10 @@ bool VirtualGPU::PreDeviceEnqueue(
 }

 // ================================================================================================
-void VirtualGPU::PostDeviceEnqueue(
-    const amd::Kernel& kernel,
-    const HSAILKernel& hsaKernel,
-    VirtualGPU* gpuDefQueue,
-    uint64_t vmDefQueue,
-    uint64_t vmParentWrap,
-    GpuEvent* gpuEvent)
-{
-  uint32_t id  = gpuEvent->id_;
+void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& hsaKernel,
+                                   VirtualGPU* gpuDefQueue, uint64_t vmDefQueue,
+                                   uint64_t vmParentWrap, GpuEvent* gpuEvent) {
+  uint32_t id = gpuEvent->id_;
  amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());

  // Make sure exculsive access to the device queue
@@ -2110,16 +2090,16 @@ void VirtualGPU::PostDeviceEnqueue(
    // Add the termination handshake to the host queue
    eventBegin(MainEngine);
    iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
-      vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0,
-      dev().settings().useDeviceQueue_);
+                                     vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0,
+                                     dev().settings().useDeviceQueue_);
    eventEnd(MainEngine, *gpuEvent);
  }

  // Get the global loop start before the scheduler
  Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
  static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr())
-    .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
-      gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
+      .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
+                    gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
  const static bool FlushL2 = true;
  gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2);

@@ -2127,8 +2107,7 @@ void VirtualGPU::PostDeviceEnqueue(
  //! @note DMA flush must not occur between patch and the scheduler
  Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
  // Program parameters for the scheduler
-  SchedulerParam* param = reinterpret_cast<SchedulerParam*>(
-    gpuDefQueue->schedParams_->data());
+  SchedulerParam* param = reinterpret_cast<SchedulerParam*>(gpuDefQueue->schedParams_->data());
  param->signal = 1;
  // Scale clock to 1024 to avoid 64 bit div in the scheduler
  param->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
@@ -2147,8 +2126,7 @@ void VirtualGPU::PostDeviceEnqueue(
    param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
    param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
    addVmMemory(scratchBuf);
-  }
-  else {
+  } else {
    param->numMaxWaves = 0;
    param->scratchSize = 0;
    param->scratch = 0;
@@ -2162,8 +2140,8 @@ void VirtualGPU::PostDeviceEnqueue(
  Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress();
  gpuDefQueue->eventBegin(MainEngine);
  gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
-    signalAddr, loopStart,
-    gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
+      signalAddr, loopStart,
+      gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
  // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
  // Thus TS command for profiling has to follow in the next CB.
  constexpr bool ForceSubmitFirst = true;
@@ -2173,10 +2151,10 @@ void VirtualGPU::PostDeviceEnqueue(
    // Add the termination handshake to the host queue
    eventBegin(MainEngine);
    iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
-      vmParentWrap + offsetof(AmdAqlWrap, child_counter),
-      signalAddr, dev().settings().useDeviceQueue_);
+                                     vmParentWrap + offsetof(AmdAqlWrap, child_counter), signalAddr,
+                                     dev().settings().useDeviceQueue_);
    if (id != gpuEvent->id_) {
-        LogError("Something is wrong. ID mismatch!\n");
+      LogError("Something is wrong. ID mismatch!\n");
    }
    eventEnd(MainEngine, *gpuEvent);
  }
@@ -2193,7 +2171,8 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
  profilingBegin(vcmd);

  // Submit kernel to HW
-  if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(), vcmd.sharedMemBytes())) {
+  if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(),
+                            vcmd.sharedMemBytes())) {
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

@@ -2203,10 +2182,9 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
 // ================================================================================================
 bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
                                      const_address parameters, bool nativeMem,
-                                      amd::Event* enqueueEvent, uint32_t sharedMemBytes)
-{
-  size_t newOffset[3] = { 0, 0, 0 };
-  size_t newGlobalSize[3] = { 0, 0, 0 };
+                                      amd::Event* enqueueEvent, uint32_t sharedMemBytes) {
+  size_t newOffset[3] = {0, 0, 0};
+  size_t newGlobalSize[3] = {0, 0, 0};

  int dim = -1;
  int iteration = 1;
@@ -2221,17 +2199,17 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const

  // If RGP capturing is enabled, then start SQTT trace
  if (rgpCaptureEna()) {
-    size_t newLocalSize[3] = { 1, 1, 1 };
+    size_t newLocalSize[3] = {1, 1, 1};
    for (uint i = 0; i < sizes.dimensions(); i++) {
      if (sizes.local()[i] != 0) {
        newLocalSize[i] = sizes.local()[i];
      }
    }
-    dev().rgpCaptureMgr()->PreDispatch(this, hsaKernel,
-      // Report global size in workgroups, since that's the RGP trace semantics
-      newGlobalSize[0] / newLocalSize[0],
-      newGlobalSize[1] / newLocalSize[1],
-      newGlobalSize[2] / newLocalSize[2]);
+    dev().rgpCaptureMgr()->PreDispatch(
+        this, hsaKernel,
+        // Report global size in workgroups, since that's the RGP trace semantics
+        newGlobalSize[0] / newLocalSize[0], newGlobalSize[1] / newLocalSize[1],
+        newGlobalSize[2] / newLocalSize[2]);
  }

  bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true : false;
@@ -2257,8 +2235,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const

  // Check memory dependency and SVM objects
  if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize)) {
-      LogError("Wrong memory objects!");
-      return false;
+    LogError("Wrong memory objects!");
+    return false;
  }
  bool needFlush = false;
  // Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
@@ -2305,15 +2283,14 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
      // an extra loop is required.
      const amd::KernelParameters& kernelParams = kernel.parameters();
      amd::Memory* const* memories =
-        reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
+          reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
      for (uint32_t i = 0; i < kernel.signature().numMemories(); ++i) {
        if (nativeMem) {
          Memory* gpuMem = reinterpret_cast<Memory* const*>(memories)[i];
          if (gpuMem != nullptr) {
            gpuMem->setBusy(*this, gpuEvent);
          }
-        }
-        else {
+        } else {
          amd::Memory* mem = memories[i];
          if (mem != nullptr) {
            dev().getGpuMemory(mem)->setBusy(*this, gpuEvent);
@@ -2325,7 +2302,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
    uint64_t vmParentWrap = 0;
    // Program the kernel arguments for the GPU execution
    hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
-      *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
+        *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
    if (nullptr == aqlPkt) {
      LogError("Couldn't load kernel arguments");
      return false;
@@ -2348,8 +2325,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
    }
    dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
    dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
-    dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ?
-      enqueueEvent->profilingInfo().waves_ : 0;
+    dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
    dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
    dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
    dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
@@ -2660,7 +2636,6 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
    eventEnd(MainEngine, gpuEvent);

  } else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) {
-
    EngineType activeEngineID = engineID_;
    engineID_ = static_cast<EngineType>(pGpuMemory->getGpuEvent(*this)->engineId_);

@@ -2669,8 +2644,8 @@ void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
    addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2);
    // Workarounds: We had systems where an extra delay was necessary.
    {
-        // Flush CB associated with the DGMA buffer
-        isDone(pGpuMemory->getGpuEvent(*this));
+      // Flush CB associated with the DGMA buffer
+      isDone(pGpuMemory->getGpuEvent(*this));
    }

    eventBegin(engineID_);
@@ -2711,10 +2686,11 @@ void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd
    pGpuMems[i] = pGpuMemory->iMem();
  }

-  dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_, Pal::GpuMemoryRefCantTrim);
+  dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_,
+                                       Pal::GpuMemoryRefCantTrim);
  dev().iDev()->InitBusAddressableGpuMemory(queues_[MainEngine]->iQueue_, numObjects, pGpuMems);
  if (numObjects != 0) {
-      dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_);
+    dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_);
  }

  for (uint i = 0; i < numObjects; i++) {
@@ -3104,8 +3080,8 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
        break;
    }
    // get svm non arugment information
-    void* const* svmPtrArray = reinterpret_cast<void* const*>(
-      params + kernelParams.getExecInfoOffset());
+    void* const* svmPtrArray =
+        reinterpret_cast<void* const*>(params + kernelParams.getExecInfoOffset());
    for (size_t i = 0; i < count; i++) {
      amd::Memory* memory = amd::MemObjMap::FindMemObj(svmPtrArray[i]);
      if (nullptr == memory) {
@@ -3149,8 +3125,7 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
  bool srdResource = false;
  amd::Memory* const* memories =
      reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());
-  const HSAILKernel& hsaKernel =
-      static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
+  const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
  const amd::KernelSignature& signature = kernel.signature();
  ldsAddress = hsaKernel.ldsSize();

@@ -3225,10 +3200,10 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
          addVmMemory(gpuMem);
          const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
          LogPrintfInfo("!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index,
-            desc.typeName_.c_str(), desc.name_.c_str(),
-            globalAddress, reinterpret_cast<void*>(gpuMem->vmAddress()),
-            reinterpret_cast<void*>(gpuMem->vmAddress() + gpuMem->size()),
-            std::this_thread::get_id());
+                        desc.typeName_.c_str(), desc.name_.c_str(), globalAddress,
+                        reinterpret_cast<void*>(gpuMem->vmAddress()),
+                        reinterpret_cast<void*>(gpuMem->vmAddress() + gpuMem->size()),
+                        std::this_thread::get_id());

          //! Check if compiler expects read/write.
          //! Note: SVM with subbuffers has an issue with tracking.
@@ -3255,30 +3230,28 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
            }
            if (gpuMem->desc().isDoppTexture_) {
              addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(),
-                kernel.parameters().getExecPfpaVcop());
+                         kernel.parameters().getExecPfpaVcop());
            }
          }
        }
      }
-    }
-    else if (desc.type_ == T_VOID) {
+    } else if (desc.type_ == T_VOID) {
      if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
        // Copy the current structure into CB1
-        size_t gpuPtr = static_cast<size_t>(cb(1)->UploadDataToHw(
-          params + desc.offset_, desc.size_));
+        size_t gpuPtr =
+            static_cast<size_t>(cb(1)->UploadDataToHw(params + desc.offset_, desc.size_));
        // Then use a pointer in aqlArgBuffer to CB1
        const auto it = hsaKernel.patch().find(desc.offset_);
        // Patch the GPU VA address in the original arguments
        WriteAqlArgAt(const_cast<address>(params), &gpuPtr, sizeof(size_t), it->second);
        addVmMemory(cb(1)->ActiveMemory());
      }
-    }
-    else if (desc.type_ == T_SAMPLER) {
+    } else if (desc.type_ == T_SAMPLER) {
      srdResource = true;
    } else if (desc.type_ == T_QUEUE) {
      uint32_t index = desc.info_.arrayIndex_;
-      const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
-        params + kernelParams.queueObjOffset())[index];
+      const amd::DeviceQueue* queue =
+          reinterpret_cast<amd::DeviceQueue* const*>(params + kernelParams.queueObjOffset())[index];
      VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
      uint64_t vmQueue;
      if (dev().settings().useDeviceQueue_) {
@@ -51,17 +51,18 @@ class VirtualGPU : public device::VirtualDevice {
    Queue(const Queue&) = delete;
    Queue& operator=(const Queue&) = delete;

-    static Queue* Create(const VirtualGPU& gpu,                //!< OCL virtual GPU object
-                         Pal::QueueType queueType,             //!< PAL queue type
-                         uint engineIdx,                       //!< Select particular engine index
-                         Pal::ICmdAllocator* cmdAlloc,         //!< PAL CMD buffer allocator
-                         uint rtCU,                            //!< The number of reserved CUs
-                         amd::CommandQueue::Priority priority, //!< Queue priority
-                         uint64_t residency_limit,             //!< Enables residency limit
-                         uint max_command_buffers              //!< Number of allocated command buffers
-                         );
+    static Queue* Create(const VirtualGPU& gpu,                 //!< OCL virtual GPU object
+                         Pal::QueueType queueType,              //!< PAL queue type
+                         uint engineIdx,                        //!< Select particular engine index
+                         Pal::ICmdAllocator* cmdAlloc,          //!< PAL CMD buffer allocator
+                         uint rtCU,                             //!< The number of reserved CUs
+                         amd::CommandQueue::Priority priority,  //!< Queue priority
+                         uint64_t residency_limit,              //!< Enables residency limit
+                         uint max_command_buffers  //!< Number of allocated command buffers
+    );

-    Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit, uint max_command_buffers)
+    Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
+          uint max_command_buffers)
        : iQueue_(nullptr),
          iCmdBuffs_(max_command_buffers, nullptr),
          iCmdFences_(max_command_buffers, nullptr),
@@ -75,8 +76,7 @@ class VirtualGPU : public device::VirtualDevice {
          vlAlloc_(64 * Ki),
          residency_size_(0),
          residency_limit_(residency_limit),
-          max_command_buffers_(max_command_buffers)
-    {
+          max_command_buffers_(max_command_buffers) {
      vlAlloc_.Init();
    }

@@ -100,8 +100,7 @@ class VirtualGPU : public device::VirtualDevice {
    Pal::Result UpdateAppPowerProfile();

    // ibReuse forces event wait without polling, to make sure event occured
-    template <bool ibReuse>
-    bool waifForFence(uint cbId) const {
+    template <bool ibReuse> bool waifForFence(uint cbId) const {
      Pal::Result result = Pal::Result::Success;
      uint64_t start;
      uint64_t end;
@@ -138,8 +137,7 @@ class VirtualGPU : public device::VirtualDevice {

    //! Flushes the current command buffer to HW
    //! Returns ID associated with the submission
-    template <bool avoidBarrierSubmit = false>
-    uint submit(bool forceFlush);
+    template <bool avoidBarrierSubmit = false> uint submit(bool forceFlush);

    bool flush();

@@ -151,28 +149,28 @@ class VirtualGPU : public device::VirtualDevice {

    uint cmdBufId() const { return cmdBufIdCurrent_; }

-    Pal::IQueue* iQueue_;                        //!< PAL queue object
-    std::vector<Pal::ICmdBuffer*> iCmdBuffs_;    //!< PAL command buffers
-    std::vector<Pal::IFence*> iCmdFences_;       //!< PAL fences, associated with CMD
-    const amd::Kernel* last_kernel_;             //!< Last submitted kernel
+    Pal::IQueue* iQueue_;                      //!< PAL queue object
+    std::vector<Pal::ICmdBuffer*> iCmdBuffs_;  //!< PAL command buffers
+    std::vector<Pal::IFence*> iCmdFences_;     //!< PAL fences, associated with CMD
+    const amd::Kernel* last_kernel_;           //!< Last submitted kernel

-  private:
+   private:
    void DumpMemoryReferences() const;
-    const VirtualGPU& gpu_; //!< OCL virtual GPU object
-    Pal::IDevice* iDev_;    //!< PAL device
-    uint cmdBufIdSlot_;     //!< Command buffer ID slot for submissions
-    uint cmdBufIdCurrent_;  //!< Current global command buffer ID
-    uint cmbBufIdRetired_;  //!< The last retired command buffer ID
-    uint cmdCnt_;           //!< Counter of commands
+    const VirtualGPU& gpu_;  //!< OCL virtual GPU object
+    Pal::IDevice* iDev_;     //!< PAL device
+    uint cmdBufIdSlot_;      //!< Command buffer ID slot for submissions
+    uint cmdBufIdCurrent_;   //!< Current global command buffer ID
+    uint cmbBufIdRetired_;   //!< The last retired command buffer ID
+    uint cmdCnt_;            //!< Counter of commands
    std::unordered_map<GpuMemoryReference*, uint> memReferences_;
-    Util::VirtualLinearAllocator    vlAlloc_;
-    std::vector<Pal::GpuMemoryRef>  palMemRefs_;
-    std::vector<Pal::IGpuMemory*>   palMems_;
-    std::vector<Pal::DoppRef>       palDoppRefs_;
-    std::set<Pal::IGpuMemory*>      sdiReferences_;
-    std::vector<const Pal::IGpuMemory*>   palSdiRefs_;
-    uint64_t  residency_size_;  //!< Resource residency size
-    uint64_t  residency_limit_; //!< Enables residency limit
+    Util::VirtualLinearAllocator vlAlloc_;
+    std::vector<Pal::GpuMemoryRef> palMemRefs_;
+    std::vector<Pal::IGpuMemory*> palMems_;
+    std::vector<Pal::DoppRef> palDoppRefs_;
+    std::set<Pal::IGpuMemory*> sdiReferences_;
+    std::vector<const Pal::IGpuMemory*> palSdiRefs_;
+    uint64_t residency_size_;   //!< Resource residency size
+    uint64_t residency_limit_;  //!< Enables residency limit
    uint max_command_buffers_;
  };

@@ -185,14 +183,14 @@ class VirtualGPU : public device::VirtualDevice {
    CommandBatch(amd::Command* head,      //!< Command batch head
                 const GpuEvent* events,  //!< HW events on all engines
                 TimeStamp* lastTS        //!< Last TS in command batch
-                 ) {
+    ) {
      init(head, events, lastTS);
    }

    void init(amd::Command* head,      //!< Command batch head
              const GpuEvent* events,  //!< HW events on all engines
              TimeStamp* lastTS        //!< Last TS in command batch
-              ) {
+    ) {
      head_ = head;
      lastTS_ = lastTS;
      memcpy(&events_, events, AllEngines * sizeof(GpuEvent));
@@ -202,11 +200,11 @@ class VirtualGPU : public device::VirtualDevice {
  //! The virtual GPU states
  union State {
    struct {
-      uint profiling_          : 1;     //!< Profiling is enabled
-      uint forceWait_          : 1;     //!< Forces wait in flush()
-      uint profileEnabled_     : 1;     //!< Profiling is enabled for WaveLimiter
-      uint perfCounterEnabled_ : 1;     //!< PerfCounter is enabled
-      uint rgpCaptureEnabled_  : 1;     //!< RGP capture is enabled in the runtime
+      uint profiling_ : 1;           //!< Profiling is enabled
+      uint forceWait_ : 1;           //!< Forces wait in flush()
+      uint profileEnabled_ : 1;      //!< Profiling is enabled for WaveLimiter
+      uint perfCounterEnabled_ : 1;  //!< PerfCounter is enabled
+      uint rgpCaptureEnabled_ : 1;   //!< RGP capture is enabled in the runtime
    };
    uint value_;
    State() : value_(0) {}
@@ -259,13 +257,13 @@ class VirtualGPU : public device::VirtualDevice {
    void findSplitSize(const Device& dev,  //!< GPU device object
                       uint64_t threads,   //!< Total number of execution threads
                       uint instructions   //!< Number of ALU instructions
-                       );
+    );

    // Returns TRUE if DMA command buffer is ready for a flush
    bool isCbReady(VirtualGPU& gpu,   //!< Virtual GPU object
                   uint64_t threads,  //!< Total number of execution threads
                   uint instructions  //!< Number of ALU instructions
-                   );
+    );

    // Returns dispatch split size
    uint dispatchSplitSize() const { return dispatchSplitSize_; }
@@ -301,7 +299,7 @@ class VirtualGPU : public device::VirtualDevice {
      bool nativeMem = true,               //!< Native memory objects
      amd::Event* enqueueEvent = nullptr,  //!< Event provided in the enqueue kernel command
      uint32_t sharedMemBytes = 0          //!< Shared memory size
-      );
+  );
  void submitNativeFn(amd::NativeFnCommand& vcmd);
  void submitFillMemory(amd::FillMemoryCommand& vcmd);
  void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
@@ -331,20 +329,20 @@ class VirtualGPU : public device::VirtualDevice {
  //! Set the last known GPU event
  void setGpuEvent(GpuEvent gpuEvent,  //!< GPU event for tracking
                   bool flush = false  //!< TRUE if flush is required
-                   );
+  );

  //! Flush DMA buffer on the specified engine
  void flushDMA(uint engineID  //!< Engine ID for DMA flush
-                );
+  );

  //! Wait for all engines on this Virtual GPU
  //! Returns TRUE if CPU didn't wait for GPU
  bool waitAllEngines(CommandBatch* cb = nullptr  //!< Command batch
-                      );
+  );

  //! Waits for the latest GPU event with a lock to prevent multiple entries
  void waitEventLock(CommandBatch* cb  //!< Command batch
-                     );
+  );

  //! Returns a resource associated with the constant buffer
  const ConstantBuffer* cb(uint idx) const { return constBufs_[idx]; }
@@ -355,7 +353,7 @@ class VirtualGPU : public device::VirtualDevice {
  //! Start the command profiling
  void profilingBegin(amd::Command& command,     //!< Command queue object
                      bool drmProfiling = false  //!< Measure DRM time
-                      );
+  );

  //! End the command profiling
  void profilingEnd(amd::Command& command);
@@ -363,11 +361,11 @@ class VirtualGPU : public device::VirtualDevice {
  //! Collect the profiling results
  bool profilingCollectResults(CommandBatch* cb,               //!< Command batch
                               const amd::Event* waitingEvent  //!< Waiting event
-                               );
+  );

  //! Adds a memory handle into the GSL memory array for Virtual Heap
  inline void addVmMemory(const Memory* memory  //!< GPU memory object
-                          );
+  );

  //! Adds the last submitted kernel to the queue for tracking a possible hang
  inline void AddKernel(const amd::Kernel& kernel  //!< AMD kernel object
@@ -377,7 +375,7 @@ class VirtualGPU : public device::VirtualDevice {
  void addDoppRef(const Memory* memory,  //!< GPU memory object
                  bool lastDoopCmd,      //!< is the last submission for the pre-present primary
                  bool pfpaDoppCmd       //!< is a submission for the pre-present primary
-                  );
+  );

  //! Return xfer buffer for staging operations
  XferBuffer& xferWrite() { return writeBuffer_; }
@@ -429,7 +427,7 @@ class VirtualGPU : public device::VirtualDevice {

  //! Returns TRUE if virtual queue was successfully allocatted
  bool createVirtualQueue(uint deviceQueueSize  //!< Device queue size
-                          );
+  );

  EngineType engineID_;  //!< Engine ID for this VirtualGPU

@@ -447,7 +445,8 @@ class VirtualGPU : public device::VirtualDevice {
  //! Returns queue, associated with VirtualGPU
  Queue& queue(EngineType id) const { return *queues_[id]; }

-  void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown, bool flushL2 = false) const {
+  void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown,
+                  bool flushL2 = false) const {
    Pal::BarrierInfo barrier = {};
    barrier.pipePointWaitCount = 1;
    Pal::HwPipePoint point = Pal::HwPipePostCs;
@@ -508,7 +507,7 @@ class VirtualGPU : public device::VirtualDevice {
  //! Returns TRUE if SDMA requires overlap synchronizaiton
  bool validateSdmaOverlap(const Resource& src,  //!< Source resource for SDMA transfer
                           const Resource& dst   //!< Destination resource for SDMA transfer
-                           );
+  );

  //! Checks if RGP capture is enabled
  bool rgpCaptureEna() const { return state_.rgpCaptureEnabled_; }
@@ -519,7 +518,7 @@ class VirtualGPU : public device::VirtualDevice {
  //! Creates buffer object from image
  amd::Memory* createBufferFromImage(
      amd::Memory& amdImage  //! The parent image object(untiled images only)
-      );
+  );

 private:
  struct MemoryRange {
@@ -537,14 +536,14 @@ class VirtualGPU : public device::VirtualDevice {
  //! Awaits a command batch with a waiting event
  bool awaitCompletion(CommandBatch* cb,                         //!< Command batch for to wait
                       const amd::Event* waitingEvent = nullptr  //!< A waiting event
-                       );
+  );

  //! Detects memory dependency for HSAIL kernels and flushes caches
  bool processMemObjectsHSA(const amd::Kernel& kernel,  //!< AMD kernel object for execution
                            const_address params,       //!< Pointer to the param's store
                            bool nativeMem,             //!< Native memory objects
-                            size_t& ldsAddess         //!< Returns LDS size, used in the kernel
-                            );
+                            size_t& ldsAddess           //!< Returns LDS size, used in the kernel
+  );

  //! Common function for fill memory used by both svm Fill and non-svm fill
  bool fillMemory(cl_command_type type,        //!< the command type
@@ -553,7 +552,7 @@ class VirtualGPU : public device::VirtualDevice {
                  size_t patternSize,          //!< pattern size
                  const amd::Coord3D& origin,  //!< memory origin
                  const amd::Coord3D& size     //!< memory size for filling
-                  );
+  );

  bool copyMemory(cl_command_type type,            //!< the command type
                  amd::Memory& srcMem,             //!< source memory object
@@ -564,35 +563,36 @@ class VirtualGPU : public device::VirtualDevice {
                  const amd::Coord3D& size,        //!< copy size
                  const amd::BufferRect& srcRect,  //!< region of source for copy
                  const amd::BufferRect& dstRect   //!< region of destination for copy
-                  );
+  );

  void buildKernelInfo(const HSAILKernel& hsaKernel,          //!< hsa kernel
                       hsa_kernel_dispatch_packet_t* aqlPkt,  //!< aql packet for dispatch
                       HwDbgKernelInfo& kernelInfo,           //!< kernel info for the dispatch
                       amd::Event* enqueueEvent  //!< Event provided in the enqueue kernel command
-                       );
+  );

  void assignDebugTrapHandler(const DebugToolInfo& dbgSetting,  //!< debug settings
                              HwDbgKernelInfo& kernelInfo       //!< kernel info for the dispatch
-                              );
+  );

  void PrintChildren(const HSAILKernel& hsaKernel,  //!< The parent HSAIL kernel
                     VirtualGPU* gpuDefQueue        //!< Device queue for children execution
-                     );
+  );

-  bool PreDeviceEnqueue(const amd::Kernel& kernel,    //!< Parent amd kernel object
-                        const HSAILKernel& hsaKernel, //!< Parent HSAIL object
-                        VirtualGPU** gpuDefQueue,     //!< [Return] GPU default queue
-                        uint64_t* vmDefQueue          //!< [Return] VM handle to the virtual queue
-                        );
+  bool PreDeviceEnqueue(const amd::Kernel& kernel,     //!< Parent amd kernel object
+                        const HSAILKernel& hsaKernel,  //!< Parent HSAIL object
+                        VirtualGPU** gpuDefQueue,      //!< [Return] GPU default queue
+                        uint64_t* vmDefQueue           //!< [Return] VM handle to the virtual queue
+  );

-  void PostDeviceEnqueue(const amd::Kernel& kernel,    //!< Parent amd kernel object
-                         const HSAILKernel& hsaKernel, //!< Parent HSAIL object
-                         VirtualGPU* gpuDefQueue,      //!< GPU default queue
-                         uint64_t vmDefQueue,          //!< VM handle to the virtual queue
-                         uint64_t vmParentWrap,        //!< VM handle to the wrapped AQL packet location
-                         GpuEvent* gpuEvent            //!< [Return] GPU event associated with the device enqueue
-                         );
+  void PostDeviceEnqueue(
+      const amd::Kernel& kernel,     //!< Parent amd kernel object
+      const HSAILKernel& hsaKernel,  //!< Parent HSAIL object
+      VirtualGPU* gpuDefQueue,       //!< GPU default queue
+      uint64_t vmDefQueue,           //!< VM handle to the virtual queue
+      uint64_t vmParentWrap,         //!< VM handle to the wrapped AQL packet location
+      GpuEvent* gpuEvent             //!< [Return] GPU event associated with the device enqueue
+  );

  Device& gpuDevice_;       //!< physical GPU device
  amd::Monitor execution_;  //!< Lock to serialise access to all device objects
@@ -605,11 +605,11 @@ class VirtualGPU : public device::VirtualDevice {

  DmaFlushMgmt dmaFlushMgmt_;  //!< DMA flush management

-  std::vector<amd::Memory*> pinnedMems_;   //!< Pinned memory list
+  std::vector<amd::Memory*> pinnedMems_;  //!< Pinned memory list

-  ManagedBuffer managedBuffer_; //!< Managed write buffer
-  constbufs_t   constBufs_;     //!< constant buffers
-  XferBuffer    writeBuffer_;   //!< Transfer/staging buffer for uploads
+  ManagedBuffer managedBuffer_;  //!< Managed write buffer
+  constbufs_t constBufs_;        //!< constant buffers
+  XferBuffer writeBuffer_;       //!< Transfer/staging buffer for uploads

  typedef std::queue<CommandBatch*> CommandBatchQueue;
  CommandBatchQueue cbQueue_;      //!< Queue of command batches
@@ -617,12 +617,12 @@ class VirtualGPU : public device::VirtualDevice {

  uint hwRing_;  //!< HW ring used on this virtual device

-  State state_;          //!< virtual GPU current state
+  State state_;                  //!< virtual GPU current state
  GpuEvent events_[AllEngines];  //!< Last known GPU events

-  uint64_t readjustTimeGPU_;   //!< Readjust time between GPU and CPU timestamps
-  TimeStamp* lastTS_;          //!< Last timestamp executed on Virtual GPU
-  TimeStamp* profileTs_;       //!< current profiling timestamp for command
+  uint64_t readjustTimeGPU_;  //!< Readjust time between GPU and CPU timestamps
+  TimeStamp* lastTS_;         //!< Last timestamp executed on Virtual GPU
+  TimeStamp* profileTs_;      //!< current profiling timestamp for command

  AmdVQueueHeader* vqHeader_;  //!< Sysmem copy for virtual queue header
  Memory* virtualQueue_;       //!< Virtual device queue
@@ -645,8 +645,7 @@ inline void VirtualGPU::AddKernel(const amd::Kernel& kernel) const {
  queues_[MainEngine]->last_kernel_ = &kernel;
 }

-template <bool avoidBarrierSubmit>
-uint VirtualGPU::Queue::submit(bool forceFlush) {
+template <bool avoidBarrierSubmit> uint VirtualGPU::Queue::submit(bool forceFlush) {
  cmdCnt_++;
  uint id = cmdBufIdCurrent_;
  bool flushCmd = ((cmdCnt_ > MaxCommands) || forceFlush) && !avoidBarrierSubmit;
@@ -659,32 +658,30 @@ uint VirtualGPU::Queue::submit(bool forceFlush) {
 }

 template <typename T>
-inline void WriteAqlArgAt(
-  unsigned char* dst,   //!< The write pointer to the buffer
-  const T* src,         //!< The source pointer
-  uint size,            //!< The size in bytes to copy
-  size_t offset         //!< The alignment to follow while writing to the buffer
+inline void WriteAqlArgAt(unsigned char* dst,  //!< The write pointer to the buffer
+                          const T* src,        //!< The source pointer
+                          uint size,           //!< The size in bytes to copy
+                          size_t offset        //!< The byte offset into dst at which to write
 ) {
  memcpy(dst + offset, src, size);
 }

 template <>
-inline void WriteAqlArgAt(
-  unsigned char* dst,   //!< The write pointer to the buffer
-  const uint32_t* src,  //!< The source pointer
-  uint size,            //!< The size in bytes to copy
-  size_t offset         //!< The alignment to follow while writing to the buffer
+inline void WriteAqlArgAt(unsigned char* dst,   //!< The write pointer to the buffer
+                          const uint32_t* src,  //!< The source pointer
+                          uint size,            //!< The size in bytes to copy
+                          size_t offset         //!< The byte offset into dst at which to write
 ) {
  *(reinterpret_cast<uint32_t*>(dst + offset)) = *src;
 }

 template <>
-inline void WriteAqlArgAt(
-  unsigned char* dst,   //!< The write pointer to the buffer
-  const uint64_t* src,  //!< The source pointer
-  uint size,            //!< The size in bytes to copy
-  size_t offset         //!< The alignment to follow while writing to the buffer
+inline void WriteAqlArgAt(unsigned char* dst,   //!< The write pointer to the buffer
+                          const uint64_t* src,  //!< The source pointer
+                          uint size,            //!< The size in bytes to copy
+                          size_t offset         //!< The byte offset into dst at which to write
 ) {
  *(reinterpret_cast<uint64_t*>(dst + offset)) = *src;
 }
-/*@}*/} // namespace pal
+/*@}*/
+}  // namespace pal