rocm-systems/rocclr/runtime/device/pal/palvirtual.cpp

//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "platform/perfctr.hpp"
#include "platform/threadtrace.hpp"
#include "platform/kernel.hpp"
#include "platform/commandqueue.hpp"
#include "device/pal/palconstbuf.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/palkernel.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palcounters.hpp"
#include "device/pal/palthreadtrace.hpp"
#include "device/pal/paltimestamp.hpp"
#include "device/pal/palblit.hpp"
#include "device/pal/paldebugger.hpp"
#include "device/appprofile.hpp"
#include "hsa.h"
#include "amd_hsa_kernel_code.h"
#include "amd_hsa_queue.h"
#include <fstream>
#include <sstream>
#include <algorithm>
#include <thread>
#include "palQueue.h"
#include "palFence.h"

#ifdef _WIN32
#include <d3d10_1.h>
#include "amdocl/cl_d3d9_amd.hpp"
#include "amdocl/cl_d3d10_amd.hpp"
#include "amdocl/cl_d3d11_amd.hpp"
#endif  // _WIN32

namespace pal {

uint32_t VirtualGPU::Queue::AllocedQueues(const VirtualGPU& gpu, Pal::EngineType type) {
  uint32_t allocedQueues = 0;
  for (const auto& queue : gpu.dev().QueuePool()) {
    allocedQueues += (queue.second->engineType_ == type) ? 1 : 0;
  }
  return allocedQueues;
}

VirtualGPU::Queue* VirtualGPU::Queue::Create(const VirtualGPU& gpu, Pal::QueueType queueType,
                                             uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
                                             uint rtCU, amd::CommandQueue::Priority priority,
                                             uint64_t residency_limit, uint max_command_buffers) {
  Pal::IDevice* palDev = gpu.dev().iDev();
  Pal::Result result;
  Pal::CmdBufferCreateInfo cmdCreateInfo = {};
  Pal::QueueCreateInfo qCreateInfo = {};
  qCreateInfo.engineIndex =
    (queueType == Pal::QueueTypeCompute) ? gpu.dev().computeEnginesId()[engineIdx] : engineIdx;
  qCreateInfo.aqlQueue = true;
  qCreateInfo.queueType = queueType;
  qCreateInfo.priority = Pal::QueuePriority::Normal;

  if (queueType == Pal::QueueTypeDma) {
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeDma;
  } else {
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
  }
  std::map<ExclusiveQueueType, uint32_t>::const_iterator it;
  if ((priority == amd::CommandQueue::Priority::Medium) &&
      (amd::CommandQueue::RealTimeDisabled == rtCU)) {
    it = gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::Medium);
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
    qCreateInfo.priority = Pal::QueuePriority::Medium;
  } else if (amd::CommandQueue::RealTimeDisabled != rtCU) {
    qCreateInfo.numReservedCu = rtCU;
    if (priority == amd::CommandQueue::Priority::Medium) {
      it = gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1);
    } else {
      it = gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime0);
    }
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
    cmdCreateInfo.flags.realtimeComputeUnits = true;
    qCreateInfo.priority = Pal::QueuePriority::Realtime;

    // If the app creates an exclusive compute, then find the engine id
    if (qCreateInfo.engineType == Pal::EngineTypeCompute) {
      if (it != gpu.dev().exclusiveComputeEnginesId().end()) {
        qCreateInfo.engineIndex = it->second;
      } else {
        return nullptr;
      }
    }
  }

  // Find queue object size
  size_t qSize = palDev->GetQueueSize(qCreateInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  cmdCreateInfo.pCmdAllocator = cmdAllocator;
  cmdCreateInfo.queueType = queueType;

  // Find command buffer object size
  size_t cmdSize = palDev->GetCmdBufferSize(cmdCreateInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  // Find fence object size
  size_t fSize = palDev->GetFenceSize(&result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
  VirtualGPU::Queue* queue =
      new (allocSize) VirtualGPU::Queue(gpu, palDev, residency_limit, max_command_buffers);
  if (queue != nullptr) {
    address addrQ = nullptr;
    if ((qCreateInfo.engineType == Pal::EngineTypeCompute) ||
        (qCreateInfo.engineType == Pal::EngineTypeDma)) {
      uint32_t index = AllocedQueues(gpu, qCreateInfo.engineType);
      // Create PAL queue object
      if (index < GPU_MAX_HW_QUEUES) {
        Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo();
        addrQ = reinterpret_cast<address>(&info[1]);
        result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
        if (result == Pal::Result::Success) {
          const_cast<Device&>(gpu.dev()).QueuePool().insert({queue->iQueue_, info});
          info->engineType_ = qCreateInfo.engineType;
          // Save uniqueue index for scratch buffer access
          info->index_ = index;
        }
      } else {
        int usage = std::numeric_limits<int>::max();
        uint indexBase = std::numeric_limits<uint32_t>::max();
        // Loop through all allocated queues and find the lowest usage
        for (const auto& it : gpu.dev().QueuePool()) {
          if ((qCreateInfo.engineType == it.second->engineType_) &&
              (it.second->counter_ <= usage)) {
            if ((it.second->counter_ < usage) ||
                // Preserve the order of allocations, because SDMA engines
                // should be used in round-robin manner
                ((it.second->counter_ == usage) && (it.second->index_ < indexBase))) {
              queue->iQueue_ = it.first;
              usage = it.second->counter_;
              indexBase = it.second->index_;
            }
          }
        }
        // Increment the usage of the current queue
        gpu.dev().QueuePool().find(queue->iQueue_)->second->counter_++;
      }
      Device::QueueRecycleInfo* info = gpu.dev().QueuePool().find(queue->iQueue_)->second;
      queue->lock_ = &info->queue_lock_;
      addrQ = reinterpret_cast<address>(&queue[1]);
    } else {
      // Exclusive compute path
      addrQ = reinterpret_cast<address>(&queue[1]);
      result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
    }
    if (result != Pal::Result::Success) {
      delete queue;
      return nullptr;
    }
    queue->UpdateAppPowerProfile();
    address addrCmd = addrQ + qSize;
    address addrF = addrCmd + max_command_buffers * cmdSize;
    Pal::CmdBufferBuildInfo cmdBuildInfo = {};

    for (uint i = 0; i < max_command_buffers; ++i) {
      result = palDev->CreateCmdBuffer(cmdCreateInfo, &addrCmd[i * cmdSize], &queue->iCmdBuffs_[i]);
      if (result != Pal::Result::Success) {
        delete queue;
        return nullptr;
      }

      Pal::FenceCreateInfo fenceCreateinfo = {};
      fenceCreateinfo.flags.signaled = false;
      result = palDev->CreateFence(fenceCreateinfo, &addrF[i * fSize], &queue->iCmdFences_[i]);
      if (result != Pal::Result::Success) {
        delete queue;
        return nullptr;
      }
      if (i == StartCmdBufIdx) {
        result = queue->iCmdBuffs_[i]->Begin(cmdBuildInfo);
        if (result != Pal::Result::Success) {
          delete queue;
          return nullptr;
        }
      }
    }
  }
  return queue;
}

VirtualGPU::Queue::~Queue() {
  if (nullptr != iQueue_) {
    // Make sure the queues are idle
    // It's unclear why PAL could still have a busy queue
    amd::ScopedLock l(lock_);
    iQueue_->WaitIdle();
  }

  // Remove all memory references
  std::vector<Pal::IGpuMemory*> memRef;
  for (auto it : memReferences_) {
    memRef.push_back(it.first->iMem());
  }
  if (memRef.size() != 0) {
    iDev_->RemoveGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_);
  }
  memReferences_.clear();

  for (uint i = 0; i < max_command_buffers_; ++i) {
    if (nullptr != iCmdBuffs_[i]) {
      iCmdBuffs_[i]->Destroy();
    }
    if (nullptr != iCmdFences_[i]) {
      iCmdFences_[i]->Destroy();
    }
  }

  if (nullptr != iQueue_) {
    // Find if this queue was used in recycling
    if (lock_ != nullptr) {
      // Release the queue if the counter is 0
      if (--gpu_.dev().QueuePool().find(iQueue_)->second->counter_ == 0) {
        iQueue_->Destroy();
        const auto& info = gpu_.dev().QueuePool().find(iQueue_);
        // Readjust HW queue index for scratch buffer access
        for (auto& queue : gpu_.dev().QueuePool()) {
          if ((queue.second->engineType_ == info->second->engineType_) &&
              (queue.second->index_ > info->second->index_)) {
            queue.second->index_--;
          }
        }
        delete gpu_.dev().QueuePool().find(iQueue_)->second;
        const_cast<Device&>(gpu_.dev()).QueuePool().erase(iQueue_);
      }
    } else {
      iQueue_->Destroy();
    }
  }
}

Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile() {
  std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName();

  const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str();
  // Find the last occurance of the '\\' character and extract the name of the application as wide
  // char.
  const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\');
  const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName;

  return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName);
}

void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
  if (gpu_.dev().settings().alwaysResident_) {
    return;
  }
  Pal::IGpuMemory* iMem = mem->iMem();
  auto it = memReferences_.find(mem);
  if (it != memReferences_.end()) {
    it->second = cmdBufIdSlot_;
  } else {
    // Update runtime tracking with TS
    memReferences_[mem] = cmdBufIdSlot_;
    // Update PAL list with the new entry
    Pal::GpuMemoryRef memRef = {};
    memRef.pGpuMemory = iMem;
    palMemRefs_.push_back(memRef);
    // Check SDI memory object
    if (iMem->Desc().flags.isExternPhys && (sdiReferences_.find(iMem) == sdiReferences_.end())) {
      sdiReferences_.insert(iMem);
      palSdiRefs_.push_back(iMem);
    }
    residency_size_ += iMem->Desc().size;
  }
}

void VirtualGPU::Queue::removeCmdMemRef(GpuMemoryReference* mem) {
  Pal::IGpuMemory* iMem = mem->iMem();
  if (0 != memReferences_.erase(mem)) {
    iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
    residency_size_ -= iMem->Desc().size;
  }
}

void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, bool pfpaDoppCmd) {
  for (size_t i = 0; i < palDoppRefs_.size(); i++) {
    if (palDoppRefs_[i].pGpuMemory == iMem) {
      // If both LAST_DOPP_SUBMISSION and PFPA_DOPP_SUBMISSION VCOPs are requested,
      // the LAST_DOPP_SUBMISSION is send as requsted by KMD
      //
      if (palDoppRefs_[i].flags.lastPfpaCmd == 1) {
        return;  // no need to override the last submission command
      }

      if (lastDoppCmd) {
        palDoppRefs_[i].flags.lastPfpaCmd = 1;
        palDoppRefs_[i].flags.pfpa = 0;
      } else if (pfpaDoppCmd) {
        palDoppRefs_[i].flags.pfpa = 1;
      }
      return;
    }
  }

  //  this is the first reference of the DOPP desktop texture, add it in the vector
  Pal::DoppRef doppRef = {};
  doppRef.flags.pfpa = pfpaDoppCmd ? 1 : 0;
  doppRef.flags.lastPfpaCmd = lastDoppCmd ? 1 : 0;
  doppRef.pGpuMemory = iMem;
  palDoppRefs_.push_back(doppRef);
}

bool VirtualGPU::Queue::flush() {
  // Stop commands building
  if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->End()) {
    LogError("PAL failed to finalize a command buffer!");
    return false;
  }

  if (palMemRefs_.size() != 0) {
    if (Pal::Result::Success !=
        iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
                                      Pal::GpuMemoryRefCantTrim)) {
      LogError("PAL failed to make resident resources!");
      return false;
    }
    palMemRefs_.clear();
  }

  // Reset the fence. PAL will reset OS event
  if (Pal::Result::Success != iDev_->ResetFences(1, &iCmdFences_[cmdBufIdSlot_])) {
    LogError("PAL failed to reset a fence!");
    return false;
  }

  Pal::SubmitInfo submitInfo = {};
  submitInfo.cmdBufferCount = 1;
  submitInfo.ppCmdBuffers = &iCmdBuffs_[cmdBufIdSlot_];
  submitInfo.pFence = iCmdFences_[cmdBufIdSlot_];
  submitInfo.doppRefCount = palDoppRefs_.size();
  submitInfo.pDoppRefs = palDoppRefs_.data();
  submitInfo.externPhysMemCount = palSdiRefs_.size();
  submitInfo.ppExternPhysMem = palSdiRefs_.data();

  // Submit command buffer to OS
  Pal::Result result;
  if (gpu_.rgpCaptureEna()) {
    amd::ScopedLock l(lock_);
    result = gpu_.dev().rgpCaptureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo);
  } else {
    amd::ScopedLock l(lock_);
    result = iQueue_->Submit(submitInfo);
  }
  if (Pal::Result::Success != result) {
    LogError("PAL failed to submit CMD!");
    return false;
  }
  // Make sure the slot isn't busy
  constexpr bool IbReuse = true;
  if (GPU_FLUSH_ON_EXECUTION) {
    waifForFence<!IbReuse>(cmdBufIdSlot_);
  }

  // Reset the counter of commands
  cmdCnt_ = 0;

  // Find the next command buffer
  cmdBufIdCurrent_++;

  if (cmdBufIdCurrent_ == GpuEvent::InvalidID) {
    // Wait for the last one
    waifForFence<!IbReuse>(cmdBufIdSlot_);
    cmdBufIdCurrent_ = 1;
    cmbBufIdRetired_ = 0;
  }

  // Wrap current slot
  cmdBufIdSlot_ = cmdBufIdCurrent_ % max_command_buffers_;

  waifForFence<IbReuse>(cmdBufIdSlot_);

  // Progress retired TS
  if ((cmdBufIdCurrent_ > max_command_buffers_) &&
      (cmbBufIdRetired_ < (cmdBufIdCurrent_ - max_command_buffers_))) {
    cmbBufIdRetired_ = cmdBufIdCurrent_ - max_command_buffers_;
  }

  // Reset command buffer, so CB chunks could be reused
  if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) {
    LogError("PAL failed CB reset!");
    return false;
  }
  // Start command buffer building
  Pal::CmdBufferBuildInfo cmdBuildInfo = {};
  cmdBuildInfo.pMemAllocator = &vlAlloc_;
  if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) {
    LogError("PAL failed CB building initialization!");
    return false;
  }

  // Clear dopp references
  palDoppRefs_.clear();
  palSdiRefs_.clear();

  // Remove old memory references
  if ((memReferences_.size() > 2048) || (residency_size_ > residency_limit_)) {
    for (auto it = memReferences_.begin(); it != memReferences_.end();) {
      if (it->second == cmdBufIdSlot_) {
        palMems_.push_back(it->first->iMem());
        residency_size_ -= it->first->iMem()->Desc().size;
        it = memReferences_.erase(it);
      } else {
        ++it;
      }
    }
  }
  if (palMems_.size() != 0) {
    iDev_->RemoveGpuMemoryReferences(palMems_.size(), &palMems_[0], iQueue_);
    palMems_.clear();
  }

  return true;
}

bool VirtualGPU::Queue::waitForEvent(uint id) {
  if (isDone(id)) {
    return true;
  }

  uint slotId = id % max_command_buffers_;
  constexpr bool IbReuse = true;
  bool result = waifForFence<!IbReuse>(slotId);
  cmbBufIdRetired_ = id;
  return result;
}

bool VirtualGPU::Queue::isDone(uint id) {
  if ((id <= cmbBufIdRetired_) || (id > cmdBufIdCurrent_)) {
    return true;
  }

  if (id == cmdBufIdCurrent_) {
    // Flush the current command buffer
    flush();
  }

  if (Pal::Result::Success != iCmdFences_[id % max_command_buffers_]->GetStatus()) {
    return false;
  }
  cmbBufIdRetired_ = id;
  return true;
}

void VirtualGPU::Queue::DumpMemoryReferences() const {
  std::fstream dump;
  std::stringstream file_name("ocl_hang_dump.txt");
  uint64_t start = amd::Os::timeNanos() / 1e9;

  dump.open(file_name.str().c_str(), (std::fstream::out | std::fstream::app));
  // Check if we have OpenCL program
  if (dump.is_open()) {
    dump << start << " Queue: ";
    switch (iQueue_->Type()) {
      case Pal::QueueTypeCompute:
        dump << "Compute";
        break;
      case Pal::QueueTypeDma:
        dump << "SDMA";
        break;
      default:
        dump << "unknown";
        break;
    }
    dump << "\n"
         << "Resident memory resources:\n";
    uint idx = 0;
    for (auto it : memReferences_) {
      dump << " " << idx << "\t[";
      dump.setf(std::ios::hex, std::ios::basefield);
      dump.setf(std::ios::showbase);
      dump << (it.first)->iMem()->Desc().gpuVirtAddr << ", "
           << (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
      dump.setf(std::ios::dec);
      dump << "] CbId:" << it.second << ", Heap: " << (it.first)->iMem()->Desc().preferredHeap
           << "\n";
      idx++;
    }

    if (last_kernel_ != nullptr) {
      const amd::KernelSignature& signature = last_kernel_->signature();
      dump << last_kernel_->name() << std::endl;
      for (size_t i = 0; i < signature.numParameters(); ++i) {
        const amd::KernelParameterDescriptor& desc = signature.at(i);
        // Find if the current argument is a memory object
        if ((desc.type_ == T_POINTER) && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
          dump << " " << desc.name_ << ": " << std::endl;
        }
      }
    }
    dump.close();
  }
}

bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
  if (numMemObj > 0) {
    // Allocate the array of memory objects for dependency tracking
    memObjectsInQueue_ = new MemoryState[numMemObj];
    if (nullptr == memObjectsInQueue_) {
      return false;
    }
    memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj);
    maxMemObjectsInQueue_ = numMemObj;
  }

  return true;
}

void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) {
  bool flushL1Cache = false;

  if (maxMemObjectsInQueue_ == 0) {
    // Flush cache
    gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
    return;
  }

  uint64_t curStart = memory->vmAddress();
  uint64_t curEnd = curStart + memory->size();

  if (memory->isModified(gpu) || !readOnly) {
    // Mark resource as modified
    memory->setModified(gpu, !readOnly);

    // Loop through all memory objects in the queue and find dependency
    // @note don't include objects from the current kernel
    for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
      // Check if the queue already contains this mem object and
      // GPU operations aren't readonly
      uint64_t busyStart = memObjectsInQueue_[j].start_;
      uint64_t busyEnd = memObjectsInQueue_[j].end_;

      // Check if the start inside the busy region
      if ((((curStart >= busyStart) && (curStart < busyEnd)) ||
           // Check if the end inside the busy region
           ((curEnd > busyStart) && (curEnd <= busyEnd)) ||
           // Check if the start/end cover the busy region
           ((curStart <= busyStart) && (curEnd >= busyEnd))) &&
          // If the buys region was written or the current one is for write
          (!memObjectsInQueue_[j].readOnly_ || !readOnly)) {
        flushL1Cache = true;
        break;
      }
    }
  }

  // Did we reach the limit?
  if (maxMemObjectsInQueue_ <= numMemObjectsInQueue_) {
    flushL1Cache = true;
  }

  if (flushL1Cache) {
    // Flush cache
    if (!gpu.profiling()) {
      gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
    }
    // Clear memory dependency state
    const static bool All = true;
    clear(!All);
  }

  // Insert current memory object into the queue always,
  // since runtime calls flush before kernel execution and it has to keep
  // current kernel in tracking
  memObjectsInQueue_[numMemObjectsInQueue_].start_ = curStart;
  memObjectsInQueue_[numMemObjectsInQueue_].end_ = curEnd;
  memObjectsInQueue_[numMemObjectsInQueue_].readOnly_ = readOnly;
  numMemObjectsInQueue_++;
}

void VirtualGPU::MemoryDependency::clear(bool all) {
  if (numMemObjectsInQueue_ > 0) {
    size_t i, j;
    if (all) {
      endMemObjectsInQueue_ = numMemObjectsInQueue_;
    }

    // If the current launch didn't start from the beginning, then move the data
    if (0 != endMemObjectsInQueue_) {
      // Preserve all objects from the current kernel
      for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) {
        memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_;
        memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_;
        memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_;
      }
    } else if (numMemObjectsInQueue_ >= maxMemObjectsInQueue_) {
      // note: The array growth shouldn't occur under the normal conditions,
      // but in a case when SVM path sends the amount of SVM ptrs over
      // the max size of kernel arguments
      MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1];
      if (nullptr == ptr) {
        numMemObjectsInQueue_ = 0;
        return;
      }
      maxMemObjectsInQueue_ <<= 1;
      memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_);
      delete[] memObjectsInQueue_;
      memObjectsInQueue_ = ptr;
    }

    // Adjust the number of active objects
    numMemObjectsInQueue_ -= endMemObjectsInQueue_;
    endMemObjectsInQueue_ = 0;
  }
}

VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev) : cbWorkload_(0), dispatchSplitSize_(0) {
  aluCnt_ = dev.properties().gfxipProperties.shaderCore.numSimdsPerCu * dev.info().simdWidth_ *
      dev.info().maxComputeUnits_;
  maxDispatchWorkload_ = static_cast<uint64_t>(dev.info().maxEngineClockFrequency_) *
      // find time in us
      dev.settings().maxWorkloadTime_ * aluCnt_;
  resetCbWorkload(dev);
}

void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) {
  cbWorkload_ = 0;
  maxCbWorkload_ = static_cast<uint64_t>(dev.info().maxEngineClockFrequency_) *
      // find time in us
      dev.settings().minWorkloadTime_ * aluCnt_;
}

void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads,
                                             uint instructions) {
  if (!dev.settings().splitSizeForWin7_) {
    dispatchSplitSize_ = 0;
    return;
  }

  uint64_t workload = threads * instructions;
  if (maxDispatchWorkload_ < workload) {
    dispatchSplitSize_ = static_cast<uint>(maxDispatchWorkload_ / instructions);
    uint fullLoad = dev.info().maxComputeUnits_ * dev.info().preferredWorkGroupSize_;
    if ((dispatchSplitSize_ % fullLoad) != 0) {
      dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad;
    }
  } else {
    dispatchSplitSize_ =
        (threads > dev.settings().workloadSplitSize_) ? dev.settings().workloadSplitSize_ : 0;
  }
}

bool VirtualGPU::DmaFlushMgmt::isCbReady(VirtualGPU& gpu, uint64_t threads, uint instructions) {
  bool cbReady = false;
  uint64_t workload = amd::alignUp(threads, 4 * aluCnt_) * instructions;
  // Add current workload to the overall workload in the current DMA
  cbWorkload_ += workload;
  // Did it exceed maximum?
  if (cbWorkload_ > maxCbWorkload_) {
    // Reset DMA workload
    cbWorkload_ = 0;
    // Increase workload of the next DMA buffer by 50%
    maxCbWorkload_ = maxCbWorkload_ * 3 / 2;
    if (maxCbWorkload_ > maxDispatchWorkload_) {
      maxCbWorkload_ = maxDispatchWorkload_;
    }
    cbReady = true;
  }
  return cbReady;
}

void VirtualGPU::addPinnedMem(amd::Memory* mem) {
  if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
    if (pinnedMems_.size() > 7) {
      pinnedMems_.front()->release();
      pinnedMems_.erase(pinnedMems_.begin());
    }

    // Start operation, since we should release mem object
    flushDMA(dev().getGpuMemory(mem)->getGpuEvent(*this)->engineId_);

    // Delay destruction
    pinnedMems_.push_back(mem);
  }
}

void VirtualGPU::releasePinnedMem() {
  for (auto& amdMemory : pinnedMems_) {
    amdMemory->release();
  }
  pinnedMems_.clear();
}

amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
  for (auto& amdMemory : pinnedMems_) {
    if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) {
      return amdMemory;
    }
  }
  return nullptr;
}

bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
  uint MinDeviceQueueSize = 16 * 1024;
  deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize);

  maskGroups_ = deviceQueueSize / (512 * Ki);
  maskGroups_ = (maskGroups_ == 0) ? 1 : maskGroups_;

  // Align the queue size for the multiple dispatch scheduler.
  // Each thread works with 32 entries * maskGroups
  uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_);
  if (extra != 0) {
    deviceQueueSize += (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_) - extra;
  }

  if (deviceQueueSize_ == deviceQueueSize) {
    return true;
  } else {
    delete vqHeader_;
    delete virtualQueue_;
    vqHeader_ = nullptr;
    virtualQueue_ = nullptr;
    deviceQueueSize_ = 0;
  }
  uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
  uint allocSize = deviceQueueSize;

  // Add the virtual queue header
  allocSize += sizeof(AmdVQueueHeader);
  allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap));

  uint argOffs = allocSize;

  // Add the kernel arguments and wait events
  uint singleArgSize = amd::alignUp(
      dev().info().maxParameterSize_ + 64 + dev().settings().numWaitEvents_ * sizeof(uint64_t),
      sizeof(AmdAqlWrap));
  allocSize += singleArgSize * numSlots;

  uint eventsOffs = allocSize;
  // Add the device events
  allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent);

  uint eventMaskOffs = allocSize;
  // Add mask array for events
  allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8;

  uint slotMaskOffs = allocSize;
  // Add mask array for AmdAqlWrap slots
  allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;

  virtualQueue_ = new Memory(dev(), allocSize);
  Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? Resource::Local : Resource::Remote;
  if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
    return false;
  }

  if (GPU_PRINT_CHILD_KERNEL != 0) {
    address ptr = reinterpret_cast<address>(virtualQueue_->map(this, Resource::WriteOnly));
    if (nullptr == ptr) {
      return false;
    }
  }

  uint64_t pattern = 0;
  amd::Coord3D origin(0, 0, 0);
  amd::Coord3D region(virtualQueue_->size(), 0, 0);
  if (!dev().xferMgr().fillBuffer(*virtualQueue_, &pattern, sizeof(pattern), origin, region)) {
    return false;
  }

  uint64_t vaBase = virtualQueue_->vmAddress();
  AmdVQueueHeader header = {};
  // Initialize the virtual queue header
  header.aql_slot_num = numSlots;
  header.event_slot_num = dev().settings().numDeviceEvents_;
  header.event_slot_mask = vaBase + eventMaskOffs;
  header.event_slots = vaBase + eventsOffs;
  header.aql_slot_mask = vaBase + slotMaskOffs;
  header.wait_size = dev().settings().numWaitEvents_;
  header.arg_size = dev().info().maxParameterSize_ + 64;
  header.mask_groups = maskGroups_;

  vqHeader_ = new AmdVQueueHeader;
  if (nullptr == vqHeader_) {
    return false;
  }
  *vqHeader_ = header;

  virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false);

  // Go over all slots and perform initialization
  AmdAqlWrap slot = {};
  size_t offset = sizeof(AmdVQueueHeader);
  for (uint i = 0; i < numSlots; ++i) {
    uint64_t argStart = vaBase + argOffs + i * singleArgSize;
    slot.aql.kernarg_address = reinterpret_cast<void*>(argStart);
    slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
    virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false);
    offset += sizeof(AmdAqlWrap);
  }

  deviceQueueSize_ = deviceQueueSize;

  return true;
}

VirtualGPU::VirtualGPU(Device& device)
    : device::VirtualDevice(device),
      engineID_(MainEngine),
      gpuDevice_(static_cast<Device&>(device)),
      printfDbgHSA_(nullptr),
      tsCache_(nullptr),
      dmaFlushMgmt_(device),
      managedBuffer_(*this, device.settings().stagedXferSize_ + 32 * Ki),
      writeBuffer_(device, managedBuffer_, device.settings().stagedXferSize_),
      hwRing_(0),
      readjustTimeGPU_(0),
      lastTS_(nullptr),
      profileTs_(nullptr),
      vqHeader_(nullptr),
      virtualQueue_(nullptr),
      schedParams_(nullptr),
      deviceQueueSize_(0),
      maskGroups_(1),
      hsaQueueMem_(nullptr),
      cmdAllocator_(nullptr) {
  // Note: Virtual GPU device creation must be a thread safe operation
  index_ = gpuDevice_.numOfVgpus_++;
  gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus());
  gpuDevice_.vgpus_[index()] = this;

  queues_[MainEngine] = nullptr;
  queues_[SdmaEngine] = nullptr;
}

bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
                        amd::CommandQueue::Priority priority) {
  device::BlitManager::Setup blitSetup;

  // Resize the list of device resources always,
  // because destructor calls eraseResourceList() even if create() failed
  dev().resizeResoureList(index());

  if (index() >= GPU_MAX_COMMAND_QUEUES) {
    // Cap the maximum number of concurrent Virtual GPUs
    return false;
  }

  // Virtual GPU will have profiling enabled
  state_.profiling_ = profiling;

  Pal::CmdAllocatorCreateInfo createInfo = {};
  createInfo.flags.threadSafe = true;
  // \todo forces PAL to reuse CBs, but requires postamble
  createInfo.flags.autoMemoryReuse = false;
  createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
  createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
      VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0));
  createInfo.allocInfo[Pal::CommandDataAlloc].allocSize =
      dev().settings().maxCmdBuffers_ * createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize;

  createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
  createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 256 * Ki;
  createInfo.allocInfo[Pal::EmbeddedDataAlloc].suballocSize = 64 * Ki;

  createInfo.allocInfo[Pal::GpuScratchMemAlloc].allocHeap = Pal::GpuHeapInvisible;
  createInfo.allocInfo[Pal::GpuScratchMemAlloc].allocSize = 64 * Ki;
  createInfo.allocInfo[Pal::GpuScratchMemAlloc].suballocSize = 4 * Ki;

  Pal::Result result;
  size_t cmdAllocSize = dev().iDev()->GetCmdAllocatorSize(createInfo, &result);
  if (Pal::Result::Success != result) {
    return false;
  }
  char* addr = new char[cmdAllocSize];
  if (Pal::Result::Success != dev().iDev()->CreateCmdAllocator(createInfo, addr, &cmdAllocator_)) {
    return false;
  }

  uint idx = index() % dev().numComputeEngines();
  uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs
      ? 0
      : (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
  uint max_cmd_buffers = dev().settings().maxCmdBuffers_;

  if (dev().numComputeEngines()) {
    queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs,
                                        priority, residency_limit, max_cmd_buffers);
    if (nullptr == queues_[MainEngine]) {
      return false;
    }
    const auto& info = dev().QueuePool().find(queues_[MainEngine]->iQueue_);
    hwRing_ = (info != dev().QueuePool().end())
        ? info->second->index_
        : (index() % dev().numExclusiveComputeEngines()) + GPU_MAX_HW_QUEUES;

    // Check if device has SDMA engines
    if (dev().numDMAEngines() != 0 && !dev().settings().disableSdma_) {
      uint sdma;
      // If only 1 SDMA engine is available then use that one, otherwise it's a round-robin manner
      if ((dev().numDMAEngines() < 2) || ((idx + 1) & 0x1)) {
        sdma = 0;
      } else {
        sdma = 1;
      }
      queues_[SdmaEngine] = Queue::Create(
          *this, Pal::QueueTypeDma, sdma, cmdAllocator_, amd::CommandQueue::RealTimeDisabled,
          amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
      if (nullptr == queues_[SdmaEngine]) {
        return false;
      }
    }
  } else {
    LogError("Runtme couldn't find compute queues!");
    return false;
  }

  if (!managedBuffer_.create(Resource::RemoteUSWC)) {
    return false;
  }

  // Diable double copy optimization,
  // since UAV read from nonlocal is fast enough
  blitSetup.disableCopyBufferToImageOpt_ = true;
  if (!allocConstantBuffers()) {
    return false;
  }

  // Create HSAILPrintf class
  printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_);
  if (nullptr == printfDbgHSA_) {
    delete printfDbgHSA_;
    LogError("Could not create PrintfDbgHSA class!");
    return false;
  }

  tsCache_ = new TimeStampCache(*this);
  if (nullptr == tsCache_) {
    LogError("Could not create TimeStamp cache!");
    return false;
  }

  if (!memoryDependency().create(dev().settings().numMemDependencies_)) {
    LogError("Could not create the array of memory objects!");
    return false;
  }

  if (!allocHsaQueueMem()) {
    LogError("Could not create hsaQueueMem object!");
    return false;
  }

  // Check if the app requested a device queue creation
  if (dev().settings().useDeviceQueue_ && (0 != deviceQueueSize) &&
      !createVirtualQueue(deviceQueueSize)) {
    LogError("Could not create a virtual queue!");
    return false;
  }

  // Choose the appropriate class for blit engine
  switch (dev().settings().blitEngine_) {
    default:
    // Fall through ...
    case Settings::BlitEngineHost:
      blitSetup.disableAll();
    // Fall through ...
    case Settings::BlitEngineCAL:
    case Settings::BlitEngineKernel:
      // use host blit for HW debug
      if (dev().settings().enableHwDebug_) {
        blitSetup.disableCopyImageToBuffer_ = true;
        blitSetup.disableCopyBufferToImage_ = true;
      }
      blitMgr_ = new KernelBlitManager(*this, blitSetup);
      break;
  }
  if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
    LogError("Could not create BlitManager!");
    return false;
  }

  // If the developer mode manager is available and it's not a device queue,
  // then enable RGP capturing
  if ((index() != 0) && dev().rgpCaptureMgr() != nullptr) {
    bool dbg_vmid = false;
    state_.rgpCaptureEnabled_ = true;
    dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index(), queue(MainEngine).iQueue_, &dbg_vmid);
    dev().rgpCaptureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_,
                                              &dbg_vmid);
  }

  return true;
}

bool VirtualGPU::allocHsaQueueMem() {
  // Allocate a dummy HSA queue
  hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t));
  if ((hsaQueueMem_ == nullptr) || (!hsaQueueMem_->create(Resource::Local))) {
    delete hsaQueueMem_;
    return false;
  }
  amd_queue_t hsa_queue = {};

  // Provide private and local heap addresses
  constexpr uint addressShift = LP64_SWITCH(0, 32);
  hsa_queue.private_segment_aperture_base_hi = static_cast<uint32_t>(
      dev().properties().gpuMemoryProperties.privateApertureBase >> addressShift);
  hsa_queue.group_segment_aperture_base_hi = static_cast<uint32_t>(
      dev().properties().gpuMemoryProperties.sharedApertureBase >> addressShift);

  hsaQueueMem_->writeRawData(*this, 0, sizeof(amd_queue_t), &hsa_queue, true);

  return true;
}

VirtualGPU::~VirtualGPU() {
  // Destroy RGP trace
  if (rgpCaptureEna()) {
    dev().rgpCaptureMgr()->FinishRGPTrace(this, true);
  }

  // Not safe to remove a queue. So lock the device
  amd::ScopedLock k(dev().lockAsyncOps());
  amd::ScopedLock lock(dev().vgpusAccess());

  while (!freeCbQueue_.empty()) {
    auto cb = freeCbQueue_.front();
    delete cb;
    freeCbQueue_.pop();
  }

  // Destroy printfHSA object
  delete printfDbgHSA_;

  // Destroy TimeStamp cache
  delete tsCache_;

  // Destroy resource list with the constant buffers
  for (uint i = 0; i < constBufs_.size(); ++i) {
    delete constBufs_[i];
  }

  managedBuffer_.release();

  delete vqHeader_;
  delete virtualQueue_;
  delete hsaQueueMem_;

  // Release scratch buffer memory to reduce memory pressure
  //!@note OCLtst uses single device with multiple tests
  //! Release memory only if it's the last command queue.
  //! The first queue is reserved for the transfers on device
  if (static_cast<int>(gpuDevice_.numOfVgpus_ - 1) <= 1) {
    gpuDevice_.destroyScratchBuffers();
  }

  // Destroy BlitManager object
  delete blitMgr_;

  {
    // Destroy queues
    if (nullptr != queues_[MainEngine]) {
      delete queues_[MainEngine];
    }

    if (nullptr != queues_[SdmaEngine]) {
      delete queues_[SdmaEngine];
    }

    if (nullptr != cmdAllocator_) {
      cmdAllocator_->Destroy();
      delete[] reinterpret_cast<char*>(cmdAllocator_);
    }
  }

  {
    // Find all available virtual GPUs and lock them
    // from the execution of commands, since the queue index and resource list
    // Will be adjusted
    for (auto it : dev().vgpus()) {
      if (it != this) {
        it->execution().lock();
      }
    }

    // Not safe to add a resource if create/destroy queue is in process, since
    // the size of the TS array can change
    amd::ScopedLock r(dev().lockResources());
    gpuDevice_.numOfVgpus_--;
    gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index());
    for (uint idx = index(); idx < dev().vgpus().size(); ++idx) {
      dev().vgpus()[idx]->index_--;
    }
    dev().eraseResoureList(index());

    // Find all available virtual GPUs and unlock them
    // for the execution of commands
    for (auto it : dev().vgpus()) {
      it->execution().unlock();
    }
  }
}

void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  // Translate memory references and ensure cache up-to-date
  pal::Memory* memory = dev().getGpuMemory(&vcmd.source());

  size_t offset = 0;
  // Find if virtual address is a CL allocation
  device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset);

  profilingBegin(vcmd, true);

  memory->syncCacheFromHost(*this);
  cl_command_type type = vcmd.type();
  bool result = false;
  amd::Memory* bufferFromImage = nullptr;

  // Force buffer read for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_READ_IMAGE) &&
      (vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImage = createBufferFromImage(vcmd.source());
    if (nullptr == bufferFromImage) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_READ_BUFFER;
      memory = dev().getGpuMemory(bufferFromImage);
    }
  }

  // Process different write commands
  switch (type) {
    case CL_COMMAND_READ_BUFFER: {
      amd::Coord3D origin(vcmd.origin()[0]);
      amd::Coord3D size(vcmd.size()[0]);
      if (nullptr != bufferFromImage) {
        size_t elemSize = vcmd.source().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;
      }
      if (hostMemory != nullptr) {
        // Accelerated transfer without pinning
        amd::Coord3D dstOrigin(offset);
        result = blitMgr().copyBuffer(*memory, *hostMemory, origin, dstOrigin, size,
                                      vcmd.isEntireMemory());
      } else {
        // The logic below will perform 2 step copy to make sure memory pinning doesn't
        // occur on the first unaligned page, because in Windows memory manager can
        // have CPU access to the allocation header in another thread
        // and a race condition is possible.
        char* tmpHost = amd::alignUp(
            reinterpret_cast<char*>(vcmd.destination()), PinnedMemoryAlignment);

        // Find the partial size for unaligned copy
        size_t partial = tmpHost - reinterpret_cast<char*>(vcmd.destination());
        result = true;
        // Check if it's staging copy, then ignore unaligned address
        if (size[0] <= dev().settings().pinnedMinXferSize_) {
          partial = size[0];
        }
        // Make first step transfer
        if (partial > 0) {
          result = blitMgr().readBuffer(*memory, vcmd.destination(), origin, partial);
        }
        // Second step transfer if something left to copy
        if (partial < size[0]) {
          result &= blitMgr().readBuffer(*memory, tmpHost, origin[0] + partial, size[0] - partial);
        }
      }
      if (nullptr != bufferFromImage) {
        bufferFromImage->release();
      }
    } break;
    case CL_COMMAND_READ_BUFFER_RECT: {
      amd::BufferRect hostbufferRect;
      amd::Coord3D region(0);
      amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset);
      hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_,
                            vcmd.hostRect().slicePitch_);
      if (hostMemory != nullptr) {
        result = blitMgr().copyBufferRect(*memory, *hostMemory, vcmd.bufRect(), hostbufferRect,
                                          vcmd.size(), vcmd.isEntireMemory());
      } else {
        result = blitMgr().readBufferRect(*memory, vcmd.destination(), vcmd.bufRect(),
                                          vcmd.hostRect(), vcmd.size(), vcmd.isEntireMemory());
      }
    } break;
    case CL_COMMAND_READ_IMAGE:
      if (memory->memoryType() == Resource::ImageBuffer) {
        Image* imageBuffer = static_cast<Image*>(memory);
        // Check if synchronization has to be performed
        if (nullptr != imageBuffer->CopyImageBuffer()) {
          memory = imageBuffer->CopyImageBuffer();
          if (nullptr == imageBuffer->owner()->getLastWriter()) {
            Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
            amd::Image* image = imageBuffer->owner()->asImage();
            amd::Coord3D offs(0);
            // Copy memory from the original image buffer into the backing store image
            result = blitMgr().copyBufferToImage(*buffer, *imageBuffer->CopyImageBuffer(), offs,
                                                 offs, image->getRegion(), true,
                                                 image->getRowPitch(), image->getSlicePitch());
          }
        }
      }
      if (hostMemory != nullptr) {
        // Accelerated image to buffer transfer without pinning
        amd::Coord3D dstOrigin(offset);
        result =
            blitMgr().copyImageToBuffer(*memory, *hostMemory, vcmd.origin(), dstOrigin, vcmd.size(),
                                        vcmd.isEntireMemory(), vcmd.rowPitch(), vcmd.slicePitch());
      } else {
        result = blitMgr().readImage(*memory, vcmd.destination(), vcmd.origin(), vcmd.size(),
                                     vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory());
      }
      break;
    default:
      LogError("Unsupported type for the read command");
      break;
  }

  if (!result) {
    LogError("submitReadMemory failed!");
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  // Translate memory references and ensure cache up to date
  pal::Memory* memory = dev().getGpuMemory(&vcmd.destination());
  size_t offset = 0;
  // Find if virtual address is a CL allocation
  device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset);

  profilingBegin(vcmd, true);

  bool entire = vcmd.isEntireMemory();

  // Synchronize memory from host if necessary
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = entire;
  memory->syncCacheFromHost(*this, syncFlags);

  cl_command_type type = vcmd.type();
  bool result = false;
  amd::Memory* bufferFromImage = nullptr;

  // Force buffer write for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_WRITE_IMAGE) &&
      (vcmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImage = createBufferFromImage(vcmd.destination());
    if (nullptr == bufferFromImage) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_WRITE_BUFFER;
      memory = dev().getGpuMemory(bufferFromImage);
    }
  }

  // Process different write commands
  switch (type) {
    case CL_COMMAND_WRITE_BUFFER: {
      amd::Coord3D origin(vcmd.origin()[0]);
      amd::Coord3D size(vcmd.size()[0]);
      if (nullptr != bufferFromImage) {
        size_t elemSize = vcmd.destination().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;
      }
      if (hostMemory != nullptr) {
        // Accelerated transfer without pinning
        amd::Coord3D srcOrigin(offset);
        result = blitMgr().copyBuffer(*hostMemory, *memory, srcOrigin, origin, size,
                                      vcmd.isEntireMemory());
      } else {
        // The logic below will perform 2 step copy to make sure memory pinning doesn't
        // occur on the first unaligned page, because in Windows memory manager can
        // have CPU access to the allocation header in another thread
        // and a race condition is possible.
        const char* tmpHost =
            amd::alignUp(reinterpret_cast<const char*>(vcmd.source()), PinnedMemoryAlignment);

        // Find the partial size for unaligned copy
        size_t partial = tmpHost - reinterpret_cast<const char*>(vcmd.source());
        result = true;
        // Check if it's staging copy, then ignore unaligned address
        if (size[0] <= dev().settings().pinnedMinXferSize_) {
          partial = size[0];
        }
        // Make first step transfer
        if (partial > 0) {
          result = blitMgr().writeBuffer(vcmd.source(), *memory, origin, partial);
        }
        // Second step transfer if something left to copy
        if (partial < size[0]) {
          result &= blitMgr().writeBuffer(tmpHost, *memory, origin[0] + partial, size[0] - partial);
        }
      }
      if (nullptr != bufferFromImage) {
        bufferFromImage->release();
      }
    } break;
    case CL_COMMAND_WRITE_BUFFER_RECT: {
      amd::BufferRect hostbufferRect;
      amd::Coord3D region(0);
      amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset);
      hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_,
                            vcmd.hostRect().slicePitch_);
      if (hostMemory != nullptr) {
        result = blitMgr().copyBufferRect(*hostMemory, *memory, hostbufferRect, vcmd.bufRect(),
                                          vcmd.size(), vcmd.isEntireMemory());
      } else {
        result = blitMgr().writeBufferRect(vcmd.source(), *memory, vcmd.hostRect(), vcmd.bufRect(),
                                           vcmd.size(), vcmd.isEntireMemory());
      }
    } break;
    case CL_COMMAND_WRITE_IMAGE:
      if (hostMemory != nullptr) {
        // Accelerated buffer to image transfer without pinning
        amd::Coord3D srcOrigin(offset);
        result =
            blitMgr().copyBufferToImage(*hostMemory, *memory, srcOrigin, vcmd.origin(), vcmd.size(),
                                        vcmd.isEntireMemory(), vcmd.rowPitch(), vcmd.slicePitch());
      } else {
        result = blitMgr().writeImage(vcmd.source(), *memory, vcmd.origin(), vcmd.size(),
                                      vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory());
      }
      break;
    default:
      LogError("Unsupported type for the write command");
      break;
  }

  if (!result) {
    LogError("submitWriteMemory failed!");
    vcmd.setStatus(CL_INVALID_OPERATION);
  } else {
    // Mark this as the most-recently written cache of the destination
    vcmd.destination().signalWrite(&gpuDevice_);
  }
  profilingEnd(vcmd);
}

bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memory& dstMem,
                            bool entire, const amd::Coord3D& srcOrigin,
                            const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
                            const amd::BufferRect& srcRect, const amd::BufferRect& dstRect) {
  // Translate memory references and ensure cache up-to-date
  pal::Memory* dstMemory = dev().getGpuMemory(&dstMem);
  pal::Memory* srcMemory = dev().getGpuMemory(&srcMem);

  // Synchronize source and destination memory
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = entire;
  dstMemory->syncCacheFromHost(*this, syncFlags);
  srcMemory->syncCacheFromHost(*this);

  amd::Memory* bufferFromImageSrc = nullptr;
  amd::Memory* bufferFromImageDst = nullptr;

  // Force buffer read for IMAGE1D_BUFFER
  if ((srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImageSrc = createBufferFromImage(srcMem);
    if (nullptr == bufferFromImageSrc) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_COPY_BUFFER;
      srcMemory = dev().getGpuMemory(bufferFromImageSrc);
    }
  }
  // Force buffer write for IMAGE1D_BUFFER
  if ((dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImageDst = createBufferFromImage(dstMem);
    if (nullptr == bufferFromImageDst) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_COPY_BUFFER;
      dstMemory = dev().getGpuMemory(bufferFromImageDst);
    }
  }

  bool result = false;

  // Check if HW can be used for memory copy
  switch (type) {
    case CL_COMMAND_MAKE_BUFFERS_RESIDENT_AMD:
    case CL_COMMAND_SVM_MEMCPY:
    case CL_COMMAND_COPY_BUFFER: {
      amd::Coord3D realSrcOrigin(srcOrigin[0]);
      amd::Coord3D realDstOrigin(dstOrigin[0]);
      amd::Coord3D realSize(size.c[0], size.c[1], size.c[2]);

      if (nullptr != bufferFromImageSrc) {
        size_t elemSize = srcMem.asImage()->getImageFormat().getElementSize();
        realSrcOrigin.c[0] *= elemSize;
        if (nullptr != bufferFromImageDst) {
          realDstOrigin.c[0] *= elemSize;
        }
        realSize.c[0] *= elemSize;
      } else if (nullptr != bufferFromImageDst) {
        size_t elemSize = dstMem.asImage()->getImageFormat().getElementSize();
        realDstOrigin.c[0] *= elemSize;
        realSize.c[0] *= elemSize;
      }

      result = blitMgr().copyBuffer(*srcMemory, *dstMemory, realSrcOrigin, realDstOrigin, realSize,
                                    entire);

      if (nullptr != bufferFromImageSrc) {
        bufferFromImageSrc->release();
      }
      if (nullptr != bufferFromImageDst) {
        bufferFromImageDst->release();
      }
    } break;
    case CL_COMMAND_COPY_BUFFER_RECT:
      result = blitMgr().copyBufferRect(*srcMemory, *dstMemory, srcRect, dstRect, size, entire);
      break;
    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
      result =
          blitMgr().copyImageToBuffer(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire);
      break;
    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
      result =
          blitMgr().copyBufferToImage(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire);
      break;
    case CL_COMMAND_COPY_IMAGE:
      result = blitMgr().copyImage(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire);
      break;
    default:
      LogError("Unsupported command type for memory copy!");
      break;
  }

  if (!result) {
    LogError("submitCopyMemory failed!");
    return false;
  } else {
    // Mark this as the most-recently written cache of the destination
    dstMem.signalWrite(&gpuDevice_);
  }
  return true;
}

void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  cl_command_type type = vcmd.type();
  bool entire = vcmd.isEntireMemory();

  if (!copyMemory(type, vcmd.source(), vcmd.destination(), entire, vcmd.srcOrigin(),
                  vcmd.dstOrigin(), vcmd.size(), vcmd.srcRect(), vcmd.dstRect())) {
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);

  cl_command_type type = vcmd.type();
  // no op for FGS supported device
  if (!dev().isFineGrainedSystem()) {
    amd::Coord3D srcOrigin(0, 0, 0);
    amd::Coord3D dstOrigin(0, 0, 0);
    amd::Coord3D size(vcmd.srcSize(), 1, 1);
    amd::BufferRect srcRect;
    amd::BufferRect dstRect;

    bool result = false;
    amd::Memory* srcMem = amd::MemObjMap::FindMemObj(vcmd.src());
    amd::Memory* dstMem = amd::MemObjMap::FindMemObj(vcmd.dst());

    device::Memory::SyncFlags syncFlags;
    if (nullptr != srcMem) {
      srcMem->commitSvmMemory();
      srcOrigin.c[0] =
          static_cast<const_address>(vcmd.src()) - static_cast<address>(srcMem->getSvmPtr());
      if (!(srcMem->validateRegion(srcOrigin, size))) {
        vcmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
    }
    if (nullptr != dstMem) {
      dstMem->commitSvmMemory();
      dstOrigin.c[0] =
          static_cast<const_address>(vcmd.dst()) - static_cast<address>(dstMem->getSvmPtr());
      if (!(dstMem->validateRegion(dstOrigin, size))) {
        vcmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
    }

    if (nullptr == srcMem && nullptr == dstMem) {  // both not in svm space
      amd::Os::fastMemcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize());
      result = true;
    } else if (nullptr == srcMem && nullptr != dstMem) {  // src not in svm space
      Memory* memory = dev().getGpuMemory(dstMem);
      // Synchronize source and destination memory
      syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size);
      memory->syncCacheFromHost(*this, syncFlags);

      result = blitMgr().writeBuffer(vcmd.src(), *memory, dstOrigin, size,
                                     dstMem->isEntirelyCovered(dstOrigin, size));
      // Mark this as the most-recently written cache of the destination
      dstMem->signalWrite(&gpuDevice_);
    } else if (nullptr != srcMem && nullptr == dstMem) {  // dst not in svm space
      Memory* memory = dev().getGpuMemory(srcMem);
      // Synchronize source and destination memory
      memory->syncCacheFromHost(*this);

      result = blitMgr().readBuffer(*memory, vcmd.dst(), srcOrigin, size,
                                    srcMem->isEntirelyCovered(srcOrigin, size));
    } else if (nullptr != srcMem && nullptr != dstMem) {  // both in svm space
      bool entire =
          srcMem->isEntirelyCovered(srcOrigin, size) && dstMem->isEntirelyCovered(dstOrigin, size);
      result =
          copyMemory(type, *srcMem, *dstMem, entire, srcOrigin, dstOrigin, size, srcRect, dstRect);
    }

    if (!result) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
  } else {
    // direct memcpy for FGS enabled system
    amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1);
  }
  profilingEnd(vcmd);
}

void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd, true);

  pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());

  // Save map info for unmap operation
  memory->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(),
                      vcmd.isEntireMemory());

  // If we have host memory, use it
  if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) {
    if (!memory->isHostMemDirectAccess()) {
      // Make sure GPU finished operation before
      // synchronization with the backing store
      memory->wait(*this);
    }

    // Target is the backing store, so just ensure that owner is up-to-date
    memory->owner()->cacheWriteBack();

    // Add memory to VA cache, so rutnime can detect direct access to VA
    dev().addVACache(memory);
  } else if (memory->isPersistentDirectMap()) {
    // Nothing to do here
  } else if (memory->mapMemory() != nullptr) {
    // Target is a remote resource, so copy
    assert(memory->mapMemory() != nullptr);
    if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) {
      amd::Coord3D dstOrigin(0, 0, 0);
      if (memory->desc().buffer_) {
        if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(),
                                  vcmd.size(), vcmd.isEntireMemory())) {
          LogError("submitMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
      } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
        Memory* memoryBuf = memory;
        amd::Coord3D origin(vcmd.origin()[0]);
        amd::Coord3D size(vcmd.size()[0]);
        size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;

        amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
        if (nullptr == bufferFromImage) {
          LogError("We should not fail buffer creation from image_buffer!");
        } else {
          memoryBuf = dev().getGpuMemory(bufferFromImage);
        }
        if (!blitMgr().copyBuffer(*memoryBuf, *memory->mapMemory(), origin, dstOrigin, size,
                                  vcmd.isEntireMemory())) {
          LogError("submitMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
        if (nullptr != bufferFromImage) {
          bufferFromImage->release();
        }
      } else {
        // Validate if it's a view for a map of mip level
        if (vcmd.memory().parent() != nullptr) {
          amd::Image* amdImage = vcmd.memory().parent()->asImage();
          if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1)) {
            // Save map write info in the parent object
            dev().getGpuMemory(amdImage)->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(),
                                                      vcmd.mapFlags(), vcmd.isEntireMemory(),
                                                      vcmd.memory().asImage());
          }
        }
        if (!blitMgr().copyImageToBuffer(*memory, *memory->mapMemory(), vcmd.origin(), dstOrigin,
                                         vcmd.size(), vcmd.isEntireMemory())) {
          LogError("submitMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
      }
    }
  } else {
    LogError("Unhandled map!");
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) {
  bool unmapMip = false;
  amd::Image* amdImage;
  {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());

    pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
    amd::Memory* owner = memory->owner();
    const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
    if (nullptr == writeMapInfo) {
      LogError("Unmap without map call");
      return;
    }
    profilingBegin(vcmd, true);

    // Check if image is a mipmap and assign a saved view
    amdImage = owner->asImage();
    if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
        (writeMapInfo->baseMip_ != nullptr)) {
      // Assign mip level view
      amdImage = writeMapInfo->baseMip_;
      // Clear unmap flags from the parent image
      memory->clearUnmapInfo(vcmd.mapPtr());
      memory = dev().getGpuMemory(amdImage);
      unmapMip = true;
      writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
    }

    // We used host memory
    if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
      if (writeMapInfo->isUnmapWrite()) {
        // Target is the backing store, so sync
        owner->signalWrite(nullptr);
        memory->syncCacheFromHost(*this);
      }
      // Remove memory from VA cache
      dev().removeVACache(memory);
    }
    // data check was added for persistent memory that failed to get aperture
    // and therefore are treated like a remote resource
    else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) {
      memory->unmap(this);
    } else if (memory->mapMemory() != nullptr) {
      if (writeMapInfo->isUnmapWrite()) {
        amd::Coord3D srcOrigin(0, 0, 0);
        // Target is a remote resource, so copy
        assert(memory->mapMemory() != nullptr);
        if (memory->desc().buffer_) {
          if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
                                    writeMapInfo->origin_, writeMapInfo->region_,
                                    writeMapInfo->isEntire())) {
            LogError("submitUnmapMemory() - copy failed");
            vcmd.setStatus(CL_OUT_OF_RESOURCES);
          }
        } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
          Memory* memoryBuf = memory;
          amd::Coord3D origin(writeMapInfo->origin_[0]);
          amd::Coord3D size(writeMapInfo->region_[0]);
          size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
          origin.c[0] *= elemSize;
          size.c[0] *= elemSize;

          amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
          if (nullptr == bufferFromImage) {
            LogError("We should not fail buffer creation from image_buffer!");
          } else {
            memoryBuf = dev().getGpuMemory(bufferFromImage);
          }
          if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size,
                                    writeMapInfo->isEntire())) {
            LogError("submitUnmapMemory() - copy failed");
            vcmd.setStatus(CL_OUT_OF_RESOURCES);
          }
          if (nullptr != bufferFromImage) {
            bufferFromImage->release();
          }
        } else {
          if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin,
                                           writeMapInfo->origin_, writeMapInfo->region_,
                                           writeMapInfo->isEntire())) {
            LogError("submitUnmapMemory() - copy failed");
            vcmd.setStatus(CL_OUT_OF_RESOURCES);
          }
        }
      }
    } else {
      LogError("Unhandled unmap!");
      vcmd.setStatus(CL_INVALID_VALUE);
    }

    // Clear unmap flags
    memory->clearUnmapInfo(vcmd.mapPtr());

    profilingEnd(vcmd);
  }
  // Release a view for a mipmap map
  if (unmapMip) {
    // Memory release should be outside of the execution lock,
    // because mapMemory_ isn't marked for a specifc GPU
    amdImage->release();
  }
}

bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern,
                            size_t patternSize, const amd::Coord3D& origin,
                            const amd::Coord3D& size) {
  pal::Memory* memory = dev().getGpuMemory(amdMemory);
  bool entire = amdMemory->isEntirelyCovered(origin, size);

  // Synchronize memory from host if necessary
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = entire;
  memory->syncCacheFromHost(*this, syncFlags);

  bool result = false;
  amd::Memory* bufferFromImage = nullptr;
  float fillValue[4];

  // Force fill buffer for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_FILL_IMAGE) && (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImage = createBufferFromImage(*amdMemory);
    if (nullptr == bufferFromImage) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_FILL_BUFFER;
      memory = dev().getGpuMemory(bufferFromImage);
    }
  }

  // Find the the right fill operation
  switch (type) {
    case CL_COMMAND_FILL_BUFFER:
    case CL_COMMAND_SVM_MEMFILL: {
      amd::Coord3D realOrigin(origin[0]);
      amd::Coord3D realSize(size[0]);
      // Reprogram fill parameters if it's an IMAGE1D_BUFFER object
      if (nullptr != bufferFromImage) {
        size_t elemSize = amdMemory->asImage()->getImageFormat().getElementSize();
        realOrigin.c[0] *= elemSize;
        realSize.c[0] *= elemSize;
        memset(fillValue, 0, sizeof(fillValue));
        amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue);
        pattern = fillValue;
        patternSize = elemSize;
      }
      result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize,
                                    amdMemory->isEntirelyCovered(origin, size));
      if (nullptr != bufferFromImage) {
        bufferFromImage->release();
      }
    } break;
    case CL_COMMAND_FILL_IMAGE:
      result = blitMgr().fillImage(*memory, pattern, origin, size,
                                   amdMemory->isEntirelyCovered(origin, size));
      break;
    default:
      LogError("Unsupported command type for FillMemory!");
      break;
  }

  if (!result) {
    LogError("fillMemory failed!");
    return false;
  }

  // Mark this as the most-recently written cache of the destination
  amdMemory->signalWrite(&gpuDevice_);
  return true;
}

void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd, true);

  if (!fillMemory(vcmd.type(), &vcmd.memory(), vcmd.pattern(), vcmd.patternSize(), vcmd.origin(),
                  vcmd.size())) {
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  // Get the device memory objects for the current device
  Memory* srcDevMem = dev().getGpuMemory(&cmd.source());
  Memory* dstDevMem = dev().getGpuMemory(&cmd.destination());

  bool p2pAllowed = true;

  // If any device object is null, then no HW P2P and runtime has to use staging
  if (srcDevMem == nullptr) {
    srcDevMem = static_cast<pal::Memory*>(
        cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0]));
    p2pAllowed = false;
  } else if (dstDevMem == nullptr) {
    dstDevMem = static_cast<pal::Memory*>(
        cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0]));
    p2pAllowed = false;
  }

  // Synchronize source and destination memory
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = cmd.isEntireMemory();
  amd::Coord3D size = cmd.size();

  bool result = false;
  switch (cmd.type()) {
    case CL_COMMAND_COPY_BUFFER: {
      amd::Coord3D srcOrigin(cmd.srcOrigin()[0]);
      amd::Coord3D dstOrigin(cmd.dstOrigin()[0]);

      if (p2pAllowed) {
        result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size,
                                      cmd.isEntireMemory());
      } else {
        amd::ScopedLock lock(dev().P2PStageOps());
        Memory* dstStgMem = static_cast<pal::Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
        Memory* srcStgMem = static_cast<pal::Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));

        size_t copy_size = Device::kP2PStagingSize;
        size_t left_size = size[0];
        amd::Coord3D stageOffset(0);
        result = true;
        do {
          if (left_size <= copy_size) {
            copy_size = left_size;
          }
          left_size -= copy_size;
          amd::Coord3D cpSize(copy_size);

          // Perform 2 step transfer with staging buffer
          result &= srcDevMem->dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin,
                                                          stageOffset, cpSize);
          srcOrigin.c[0] += copy_size;
          result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset,
                                                          dstOrigin, cpSize);
          dstOrigin.c[0] += copy_size;
        } while (left_size > 0);
      }
      break;
    }
    case CL_COMMAND_COPY_BUFFER_RECT:
    case CL_COMMAND_COPY_IMAGE:
    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
      LogError("Unsupported P2P type!");
      break;
    default:
      ShouldNotReachHere();
      break;
  }

  if (!result) {
    LogError("submitCopyMemoryP2P failed!");
    cmd.setStatus(CL_OUT_OF_RESOURCES);
  }

  cmd.destination().signalWrite(&dstDevMem->dev());

  profilingEnd(cmd);
}

void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd, true);

  // no op for FGS supported device
  if (!dev().isFineGrainedSystem()) {
    // Make sure we have memory for the command execution
    pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem());

    memory->saveMapInfo(vcmd.svmPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(),
                        vcmd.isEntireMemory());

    if (memory->mapMemory() != nullptr) {
      if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) {
        assert(memory->desc().buffer_ && "SVM memory can't be an image");
        if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(),
                                  vcmd.size(), vcmd.isEntireMemory())) {
          LogError("submitSVMMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
      }
    } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) {
      if (!memory->isHostMemDirectAccess()) {
        // Make sure GPU finished operation before
        // synchronization with the backing store
        memory->wait(*this);
      }

      // Target is the backing store, so just ensure that owner is up-to-date
      memory->owner()->cacheWriteBack();
    } else {
      LogError("Unhandled svm map!");
    }
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd, true);

  // no op for FGS supported device
  if (!dev().isFineGrainedSystem()) {
    pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem());
    const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.svmPtr());

    if (memory->mapMemory() != nullptr) {
      if (writeMapInfo->isUnmapWrite()) {
        amd::Coord3D srcOrigin(0, 0, 0);
        // Target is a remote resource, so copy
        assert(memory->desc().buffer_ && "SVM memory can't be an image");
        if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
                                  writeMapInfo->origin_, writeMapInfo->region_,
                                  writeMapInfo->isEntire())) {
          LogError("submitSvmUnmapMemory() - copy failed");
          vcmd.setStatus(CL_OUT_OF_RESOURCES);
        }
      }
    } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) {
      if (writeMapInfo->isUnmapWrite()) {
        // Target is the backing store, so sync
        memory->owner()->signalWrite(nullptr);
        memory->syncCacheFromHost(*this);
      }
    }
    memory->clearUnmapInfo(vcmd.svmPtr());
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd, true);

  if (!dev().isFineGrainedSystem()) {
    size_t patternSize = vcmd.patternSize();
    size_t fillSize = patternSize * vcmd.times();
    amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(vcmd.dst());
    assert(dstMemory && "No svm Buffer to fill with!");
    size_t offset = reinterpret_cast<uintptr_t>(vcmd.dst()) -
        reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());

    pal::Memory* memory = dev().getGpuMemory(dstMemory);

    amd::Coord3D origin(offset, 0, 0);
    amd::Coord3D size(fillSize, 1, 1);

    assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!");
    // Synchronize memory from host if necessary
    device::Memory::SyncFlags syncFlags;
    syncFlags.skipEntire_ = dstMemory->isEntirelyCovered(origin, size);
    memory->syncCacheFromHost(*this, syncFlags);

    if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(), vcmd.patternSize(), origin, size)) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
    // Mark this as the most-recently written cache of the destination
    dstMemory->signalWrite(&gpuDevice_);
  } else {
    // for FGS capable device, fill CPU memory directly
    amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times());
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd, true);

  for (const auto& it : vcmd.memObjects()) {
    // Find device memory
    pal::Memory* memory = dev().getGpuMemory(it);

    if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) {
      memory->mgpuCacheWriteBack();
    } else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
      // Synchronize memory from host if necessary.
      // The sync function will perform memory migration from
      // another device if necessary
      device::Memory::SyncFlags syncFlags;
      memory->syncCacheFromHost(*this, syncFlags);
    } else {
      LogWarning("Unknown operation for memory migration!");
    }
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) {
  // in-order semantics: previous commands need to be done before we start
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);
  std::vector<void*>& svmPointers = vcmd.svmPointers();
  if (vcmd.pfnFreeFunc() == nullptr) {
    // pointers allocated using clSVMAlloc
    for (cl_uint i = 0; i < svmPointers.size(); ++i) {
      dev().svmFree(svmPointers[i]);
    }
  } else {
    vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), svmPointers.size(),
                       static_cast<void**>(&(svmPointers[0])), vcmd.userData());
  }
  profilingEnd(vcmd);
}

// ================================================================================================
void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQueue) {
  AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
  uint p = 0;
  for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
    if (wraps[i].state != 0) {
      uint j;
      if (p == GPU_PRINT_CHILD_KERNEL) {
        break;
      }
      p++;
      std::stringstream print;
      print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase);
      print << "Slot#: " << i << "\n";
      print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n";
      print << "\tcommand_id: " << wraps[i].command_id << "\n";
      print << "\tchild_counter: " << wraps[i].child_counter << "\n";
      print << "\tcompletion: " << wraps[i].completion << "\n";
      print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n";
      print << "\twait_list: " << wraps[i].wait_list << "\n";
      print << "\twait_num: " << wraps[i].wait_num << "\n";
      uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress();
      size_t* events = reinterpret_cast<size_t*>(gpuDefQueue->virtualQueue_->data() + offsEvents);
      for (j = 0; j < wraps[i].wait_num; ++j) {
        uint offs = static_cast<uint64_t>(events[j]) - gpuDefQueue->virtualQueue_->vmAddress();
        AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
        print << "Wait Event#: " << j << "\n";
        print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n";
      }
      print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
      print << wraps[i].aql.workgroup_size_y << ", ";
      print << wraps[i].aql.workgroup_size_z << "]\n";
      print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
      print << wraps[i].aql.grid_size_y << ", ";
      print << wraps[i].aql.grid_size_z << "]\n";

      HSAILKernel* child = nullptr;
      for (auto it = hsaKernel.prog().kernels().begin(); it != hsaKernel.prog().kernels().end();
           ++it) {
        if (wraps[i].aql.kernel_object == static_cast<HSAILKernel*>(it->second)->gpuAqlCode()) {
          child = static_cast<HSAILKernel*>(it->second);
        }
      }
      if (child == nullptr) {
        printf("Error: couldn't find child kernel!\n");
        continue;
      }
      const uint64_t kernarg_address =
          static_cast<uint64_t>(reinterpret_cast<uintptr_t>(wraps[i].aql.kernarg_address));
      uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress();
      address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
      print << "Kernel: " << child->name() << "\n";
      const amd::KernelSignature& signature = child->signature();

      // Check if runtime has to setup hidden arguments
      for (const auto it : signature.parameters()) {
        const char* extraArgName = nullptr;
        switch (it.info_.oclObject_) {
          case amd::KernelParameterDescriptor::HiddenNone:
            // void* zero = 0;
            // WriteAqlArgAt(const_cast<address>(parameters), &zero, it.size_, it.offset_);
            break;
          case amd::KernelParameterDescriptor::HiddenGlobalOffsetX:
            extraArgName = "Offset0: ";
            break;
          case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
            extraArgName = "Offset1: ";
            break;
          case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
            extraArgName = "Offset2: ";
            break;
          case amd::KernelParameterDescriptor::HiddenPrintfBuffer:
            extraArgName = "PrintfBuf: ";
            break;
          case amd::KernelParameterDescriptor::HiddenDefaultQueue:
            extraArgName = "VqueuePtr: ";
            break;
          case amd::KernelParameterDescriptor::HiddenCompletionAction:
            extraArgName = "AqlWrap: ";
            break;
          default:
            break;
        }
        if (extraArgName) {
          print << "\t" << extraArgName << *reinterpret_cast<size_t*>(argum);
          print << "\n";
          argum += sizeof(size_t);
          continue;
        }
        print << "\t" << it.name_ << ": ";
        for (int s = it.size_ - 1; s >= 0; --s) {
          print.width(2);
          print.fill('0');
          print << static_cast<uint32_t>(argum[s]);
        }
        argum += it.offset_;
        print << "\n";
      }
      printf("%s", print.str().c_str());
    }
  }
}

// ================================================================================================
bool VirtualGPU::PreDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& hsaKernel,
                                  VirtualGPU** gpuDefQueue, uint64_t* vmDefQueue) {
  amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
  if (nullptr == defQueue) {
    LogError("Default device queue wasn't allocated");
    return false;
  } else {
    if (dev().settings().useDeviceQueue_) {
      *gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
      if ((*gpuDefQueue)->hwRing() == hwRing()) {
        LogError("Can't submit the child kernels to the same HW ring as the host queue!");
        return false;
      }
    } else {
      createVirtualQueue(defQueue->size());
      *gpuDefQueue = this;
    }
  }
  *vmDefQueue = (*gpuDefQueue)->virtualQueue_->vmAddress();

  (*gpuDefQueue)->writeVQueueHeader(*this, hsaKernel.prog().kernelTable());

  // Acquire USWC memory for the scheduler parameters
  (*gpuDefQueue)->schedParams_ = &xferWrite().Acquire(sizeof(SchedulerParam));

  // Add memory handles before the actual dispatch
  addVmMemory((*gpuDefQueue)->virtualQueue_);
  addVmMemory((*gpuDefQueue)->schedParams_);

  return true;
}

// ================================================================================================
void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const HSAILKernel& hsaKernel,
                                   VirtualGPU* gpuDefQueue, uint64_t vmDefQueue,
                                   uint64_t vmParentWrap, GpuEvent* gpuEvent) {
  uint32_t id = gpuEvent->id_;
  amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());

  // Make sure exculsive access to the device queue
  amd::ScopedLock(defQueue->lock());
  Memory& schedParams = xferWrite().Acquire(sizeof(SchedulerParam));

  if (GPU_PRINT_CHILD_KERNEL != 0) {
    waitForEvent(gpuEvent);
    PrintChildren(hsaKernel, gpuDefQueue);
  }

  if (!dev().settings().useDeviceQueue_) {
    // Add the termination handshake to the host queue
    eventBegin(MainEngine);
    iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
                                     vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0,
                                     dev().settings().useDeviceQueue_);
    eventEnd(MainEngine, *gpuEvent);
  }

  // Get the global loop start before the scheduler
  Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
  static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr())
      .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
                    gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
  const static bool FlushL2 = true;
  gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, FlushL2);

  // Get the address of PM4 template and add write it to params
  //! @note DMA flush must not occur between patch and the scheduler
  Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
  // Program parameters for the scheduler
  SchedulerParam* param = reinterpret_cast<SchedulerParam*>(gpuDefQueue->schedParams_->data());
  param->signal = 1;
  // Scale clock to 1024 to avoid 64 bit div in the scheduler
  param->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
  param->hw_queue = patchStart + sizeof(uint32_t) /* Rewind packet*/;
  param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
  param->releaseHostCP = 0;
  param->parentAQL = vmParentWrap;
  param->dedicatedQueue = dev().settings().useDeviceQueue_;
  param->useATC = dev().settings().svmFineGrainSystem_;

  // Fill the scratch buffer information
  if (hsaKernel.prog().maxScratchRegs() > 0) {
    pal::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_;
    param->scratchSize = scratchBuf->size();
    param->scratch = scratchBuf->vmAddress();
    param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
    param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
    addVmMemory(scratchBuf);
  } else {
    param->numMaxWaves = 0;
    param->scratchSize = 0;
    param->scratch = 0;
    param->scratchOffset = 0;
  }

  // Add all kernels in the program to the mem list.
  //! \note Runtime doesn't know which one will be called
  hsaKernel.prog().fillResListWithKernels(*this);

  Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress();
  gpuDefQueue->eventBegin(MainEngine);
  gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
      signalAddr, loopStart,
      gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
  // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
  // Thus TS command for profiling has to follow in the next CB.
  constexpr bool ForceSubmitFirst = true;
  gpuDefQueue->eventEnd(MainEngine, *gpuEvent, ForceSubmitFirst);

  if (dev().settings().useDeviceQueue_) {
    // Add the termination handshake to the host queue
    eventBegin(MainEngine);
    iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
                                     vmParentWrap + offsetof(AmdAqlWrap, child_counter), signalAddr,
                                     dev().settings().useDeviceQueue_);
    if (id != gpuEvent->id_) {
      LogError("Something is wrong. ID mismatch!\n");
    }
    eventEnd(MainEngine, *gpuEvent);
  }

  xferWrite().Release(*gpuDefQueue->schedParams_);
  gpuDefQueue->schedParams_ = nullptr;
}

// ================================================================================================
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
  if (vcmd.cooperativeGroups()) {
    uint32_t workgroups = 0;
    for (uint i = 0; i < vcmd.sizes().dimensions(); i++) {
      if ((vcmd.sizes().local()[i] != 0) && (vcmd.sizes().global()[i] != 1)) {
        workgroups += (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]);
      }
    }

    bool test = true;
    VirtualGPU* queue = (test) ? this : dev().xferQueue();

    // Wait for the execution on the current queue, since the coop groups will use the device queue
    waitAllEngines();

    amd::ScopedLock lock(queue->blitMgr().lockXfer());

    queue->profilingBegin(vcmd);

    static_cast<KernelBlitManager&>(queue->blitMgr()).RunGwsInit(workgroups);
    queue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue);

    // Submit kernel to HW
    if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false,
                                     &vcmd.event(), vcmd.sharedMemBytes(),
                                     vcmd.cooperativeGroups())) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }

    queue->profilingEnd(vcmd);

    // Wait for the execution on the device queue. Keep the current queue in-order
    queue->waitAllEngines();
  } else {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());

    profilingBegin(vcmd);

    // Submit kernel to HW
    if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event(),
                              vcmd.sharedMemBytes(), vcmd.cooperativeGroups())) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }

    profilingEnd(vcmd);
  }
}

// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
                                      const_address parameters, bool nativeMem,
                                      amd::Event* enqueueEvent, uint32_t sharedMemBytes,
                                      bool cooperativeGroup) {
  size_t newOffset[3] = {0, 0, 0};
  size_t newGlobalSize[3] = {0, 0, 0};

  int dim = -1;
  int iteration = 1;
  size_t globalStep = 0;
  for (uint i = 0; i < sizes.dimensions(); i++) {
    newGlobalSize[i] = sizes.global()[i];
    newOffset[i] = sizes.offset()[i];
  }

  // Get the HSA kernel object
  const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));

  // If RGP capturing is enabled, then start SQTT trace
  if (rgpCaptureEna()) {
    size_t newLocalSize[3] = {1, 1, 1};
    for (uint i = 0; i < sizes.dimensions(); i++) {
      if (sizes.local()[i] != 0) {
        newLocalSize[i] = sizes.local()[i];
      }
    }
    dev().rgpCaptureMgr()->PreDispatch(
        this, hsaKernel,
        // Report global size in workgroups, since that's the RGP trace semantics
        newGlobalSize[0] / newLocalSize[0], newGlobalSize[1] / newLocalSize[1],
        newGlobalSize[2] / newLocalSize[2]);
  }

  bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true : false;
  if (printfEnabled && !printfDbgHSA().init(*this, printfEnabled)) {
    LogError("Printf debug buffer initialization failed!");
    return false;
  }

  // Add ISA memory object to the resource tracking list
  AddKernel(kernel);

  uint64_t vmDefQueue = 0;
  VirtualGPU* gpuDefQueue = nullptr;
  if (hsaKernel.dynamicParallelism()) {
    // Initialize GPU device queue for execution (gpuDefQueue)
    if (!PreDeviceEnqueue(kernel, hsaKernel, &gpuDefQueue, &vmDefQueue)) {
      return false;
    }
  }
  size_t ldsSize;

  ClPrint(amd::LOG_INFO, amd::LOG_KERN, "!\tShaderName : %s\n", hsaKernel.name().c_str());

  // Check memory dependency and SVM objects
  if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize)) {
    LogError("Wrong memory objects!");
    return false;
  }

  bool needFlush = false;
  // Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
  // are in the same cmdBuffer
  if (!state_.perfCounterEnabled_) {
    dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize());
    if (dmaFlushMgmt().dispatchSplitSize() != 0) {
      needFlush = true;
    }
  }

  // Check if it is blit kernel. If it is, then check if split is needed.
  if (hsaKernel.isInternalKernel()) {
    // Calculate new group size for each submission
    for (uint i = 0; i < sizes.dimensions(); i++) {
      if (sizes.global()[i] > static_cast<size_t>(0xffffffff)) {
        dim = i;
        iteration = sizes.global()[i] / 0xC0000000 + ((sizes.global()[i] % 0xC0000000) ? 1 : 0);
        globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration * sizes.local()[dim];
        break;
      }
    }
  }

  for (int iter = 0; iter < iteration; ++iter) {
    GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
    uint32_t id = gpuEvent.id_;
    // Reset global size for dimension dim if split is needed
    if (dim != -1) {
      newOffset[dim] = sizes.offset()[dim] + globalStep * iter;
      if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && (iter != (iteration - 1))) {
        newGlobalSize[dim] = globalStep;
      } else {
        newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim];
      }
    }

    amd::NDRangeContainer tmpSizes(sizes.dimensions(), &newOffset[0], &newGlobalSize[0],
                                   &(const_cast<amd::NDRangeContainer&>(sizes).local()[0]));

    if (iter > 0) {
      // Updates the timestamp values, since a CB flush could occur.
      // Resource processing was  moved from loadArguments() and
      // an extra loop is required.
      const amd::KernelParameters& kernelParams = kernel.parameters();
      amd::Memory* const* memories =
          reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
      for (uint32_t i = 0; i < kernel.signature().numMemories(); ++i) {
        if (nativeMem) {
          Memory* gpuMem = reinterpret_cast<Memory* const*>(memories)[i];
          if (gpuMem != nullptr) {
            gpuMem->setBusy(*this, gpuEvent);
          }
        } else {
          amd::Memory* mem = memories[i];
          if (mem != nullptr) {
            dev().getGpuMemory(mem)->setBusy(*this, gpuEvent);
          }
        }
      }
    }

    uint64_t vmParentWrap = 0;
    // Program the kernel arguments for the GPU execution
    hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
        *this, kernel, tmpSizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap);
    if (nullptr == aqlPkt) {
      LogError("Couldn't load kernel arguments");
      return false;
    }

    // Set up the dispatch information
    Pal::DispatchAqlParams dispatchParam = {};
    dispatchParam.pAqlPacket = aqlPkt;
    if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
      const Device::ScratchBuffer* scratch = nullptr;
      scratch = dev().scratch(hwRing());
      dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
      dispatchParam.scratchSize = scratch->size_;
      dispatchParam.scratchOffset = scratch->offset_;
      dispatchParam.workitemPrivateSegmentSize = hsaKernel.spillSegSize();
    }
    dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
    dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
    dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ? enqueueEvent->profilingInfo().waves_ : 0;
    dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
    dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
    // Run AQL dispatch in HW
    eventBegin(MainEngine);
    iCmd()->CmdDispatchAql(dispatchParam);
    // Note: This a workaround for incorrect results reported with release_mem packet,
    // when the packet can be processed later after this dispatch and including extra time
    if (profiling() || state_.profileEnabled_) {
      addBarrier(RgpSqqtBarrierReason::ProfilingControl);
      // Clear memory dependency to avoid the second L1 invalidation
      memoryDependency().clear();
    }
    if (id != gpuEvent.id_) {
      LogError("Something is wrong. ID mismatch!\n");
    }
    eventEnd(MainEngine, gpuEvent);

    // Execute scheduler for device enqueue
    if (hsaKernel.dynamicParallelism()) {
      PostDeviceEnqueue(kernel, hsaKernel, gpuDefQueue, vmDefQueue, vmParentWrap, &gpuEvent);
    }

    // Update the global GPU event
    setGpuEvent(gpuEvent, needFlush);

    if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
      LogError("Couldn't read printf data from the buffer!\n");
      return false;
    }
  }

  // Check if image buffer write back is required
  if (state_.imageBufferWrtBack_) {
    // Avoid recursive write back
    state_.imageBufferWrtBack_ = false;
    // Make sure the original kernel execution is done
    addBarrier(RgpSqqtBarrierReason::MemDependency);
    for (const auto imageBuffer : wrtBackImageBuffer_) {
      Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
      amd::Image* image = imageBuffer->owner()->asImage();
      amd::Coord3D offs(0);
      // Copy memory from the the backing store image into original buffer
      bool result = blitMgr().copyImageToBuffer(*imageBuffer->CopyImageBuffer(), *buffer, offs,
                                                offs, image->getRegion(), true,
                                                image->getRowPitch(), image->getSlicePitch());
    }
    wrtBackImageBuffer_.clear();
  }

  // Perform post dispatch logic for RGP traces
  if (rgpCaptureEna()) {
    dev().rgpCaptureMgr()->PostDispatch(this);
  }

  return true;
}

void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  Unimplemented();  //!< @todo: Unimplemented
}

void VirtualGPU::submitMarker(amd::Marker& vcmd) {
  //!@note runtime doesn't need to lock this command on execution

  if (vcmd.waitingEvent() != nullptr) {
    bool foundEvent = false;

    // Loop through all outstanding command batches
    while (!cbQueue_.empty()) {
      auto cb = cbQueue_.front();
      // Wait for completion
      foundEvent = awaitCompletion(cb, vcmd.waitingEvent());
      // Release a command batch
      freeCbQueue_.push(cb);
      // Remove command batch from the list
      cbQueue_.pop();
      // Early exit if we found a command
      if (foundEvent) break;
    }

    // Event should be in the current command batch
    if (!foundEvent) {
      state_.forceWait_ = true;
    }
    // If we don't have any more batches, then assume GPU is idle
    else if (cbQueue_.empty()) {
      dmaFlushMgmt_.resetCbWorkload(dev());
    }
  }
}

void VirtualGPU::releaseMemory(GpuMemoryReference* mem) {
  queues_[MainEngine]->removeCmdMemRef(mem);
  if (!dev().settings().disableSdma_) {
    queues_[SdmaEngine]->removeCmdMemRef(mem);
  }
}

void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  const amd::PerfCounterCommand::PerfCounterList counters = vcmd.getCounters();

  PalCounterReference* palRef = PalCounterReference::Create(*this);
  if (palRef == nullptr) {
    LogError("We failed to allocate memory for the GPU perfcounter");
    vcmd.setStatus(CL_INVALID_OPERATION);
    return;
  }

  bool newExperiment = false;

  for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
    amd::PerfCounter* amdCounter = static_cast<amd::PerfCounter*>(counters[i]);
    const PerfCounter* counter = static_cast<const PerfCounter*>(amdCounter->getDeviceCounter());

    // Make sure we have a valid gpu performance counter
    if (nullptr == counter) {
      amd::PerfCounter::Properties prop = amdCounter->properties();
      PerfCounter* gpuCounter = new PerfCounter(
          gpuDevice_, palRef, prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX],
          prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX], prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]);
      if (nullptr == gpuCounter) {
        LogError("We failed to allocate memory for the GPU perfcounter");
        vcmd.setStatus(CL_INVALID_OPERATION);
        return;
      } else if (gpuCounter->create()) {
        newExperiment = true;
      } else {
        LogPrintfError(
            "We failed to allocate a perfcounter in PAL.\
                    Block: %d, counter: #d, event: %d",
            gpuCounter->info()->blockIndex_, gpuCounter->info()->counterIndex_,
            gpuCounter->info()->eventIndex_);
      }
      amdCounter->setDeviceCounter(gpuCounter);
    }
  }

  if (newExperiment) {
    palRef->finalize();
  }

  palRef->release();

  Pal::IPerfExperiment* palPerf = nullptr;
  for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
    amd::PerfCounter* amdCounter = static_cast<amd::PerfCounter*>(counters[i]);
    const PerfCounter* counter = static_cast<const PerfCounter*>(amdCounter->getDeviceCounter());

    if (palPerf != counter->iPerf()) {
      palPerf = counter->iPerf();
      // Find the state and sends the command to PAL
      if (vcmd.getState() == amd::PerfCounterCommand::Begin) {
        state_.perfCounterEnabled_ = true;
        GpuEvent event;
        eventBegin(MainEngine);
        iCmd()->CmdBeginPerfExperiment(palPerf);
        eventEnd(MainEngine, event);
        setGpuEvent(event);
      } else if (vcmd.getState() == amd::PerfCounterCommand::End) {
        GpuEvent event;
        eventBegin(MainEngine);
        iCmd()->CmdEndPerfExperiment(palPerf);
        eventEnd(MainEngine, event);
        setGpuEvent(event);
        state_.perfCounterEnabled_ = false;
      } else {
        LogError("Unsupported performance counter state");
        vcmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
    }
  }
}

void VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  switch (cmd.type()) {
    case CL_COMMAND_THREAD_TRACE_MEM: {
      amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace();
      ThreadTrace* threadTrace = static_cast<ThreadTrace*>(amdThreadTrace->getDeviceThreadTrace());

      if (threadTrace == nullptr) {
        PalThreadTraceReference* palRef = PalThreadTraceReference::Create(*this);
        if (palRef == nullptr) {
          LogError("Failure in memory allocation for the GPU threadtrace");
          cmd.setStatus(CL_INVALID_OPERATION);
          return;
        }

        size_t numSe = amdThreadTrace->deviceSeNumThreadTrace();

        ThreadTrace* gpuThreadTrace = new ThreadTrace(gpuDevice_, palRef, cmd.getMemList(), numSe);
        if (nullptr == gpuThreadTrace) {
          LogError("Failure in memory allocation for the GPU threadtrace");
          cmd.setStatus(CL_INVALID_OPERATION);
          return;
        }

        if (gpuThreadTrace->create()) {
          amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace);
        } else {
          LogError("Failure in memory allocation for the GPU threadtrace");
          delete gpuThreadTrace;
          cmd.setStatus(CL_INVALID_OPERATION);
          return;
        }

        palRef->finalize();
        palRef->release();
      }

      break;
    }
    default:
      LogError("Unsupported command type for ThreadTraceMemObjects!");
      break;
  }

  profilingEnd(cmd);
}

void VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  switch (cmd.type()) {
    case CL_COMMAND_THREAD_TRACE: {
      amd::ThreadTrace* amdThreadTrace = static_cast<amd::ThreadTrace*>(&cmd.getThreadTrace());
      ThreadTrace* threadTrace = static_cast<ThreadTrace*>(amdThreadTrace->getDeviceThreadTrace());

      // gpu thread trace object had to be generated prior to begin/end/pause/resume due
      // to ThreadTraceMemObjectsCommand execution
      if (threadTrace == nullptr) {
        return;
      } else {
        Pal::IPerfExperiment* palPerf = threadTrace->iPerf();
        if (cmd.getState() == amd::ThreadTraceCommand::Begin) {
          amd::ThreadTrace::ThreadTraceConfig* traceCfg =
              static_cast<amd::ThreadTrace::ThreadTraceConfig*>(cmd.threadTraceConfig());
          iCmd()->CmdBeginPerfExperiment(palPerf);
        } else if (cmd.getState() == amd::ThreadTraceCommand::End) {
          GpuEvent event;
          eventBegin(MainEngine);
          iCmd()->CmdEndPerfExperiment(palPerf);
          threadTrace->populateUserMemory();
          eventEnd(MainEngine, event);
          setGpuEvent(event);
        } else if (cmd.getState() == amd::ThreadTraceCommand::Pause) {
          // There's no Pause from the PerfExperiment interface
        } else if (cmd.getState() == amd::ThreadTraceCommand::Resume) {
          // There's no Resume from the PerfExperiment interface
        }
      }
      break;
    }
    default:
      LogError("Unsupported command type for ThreadTrace!");
      break;
  }

  profilingEnd(cmd);
}

void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  for (const auto& it : vcmd.getMemList()) {
    // amd::Memory object should never be nullptr
    assert(it && "Memory object for interop is nullptr");
    pal::Memory* memory = dev().getGpuMemory(it);

    // If resource is a shared copy of original resource, then
    // runtime needs to copy data from original resource
    it->getInteropObj()->copyOrigToShared();
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  for (const auto& it : vcmd.getMemList()) {
    // amd::Memory object should never be nullptr
    assert(it && "Memory object for interop is nullptr");
    pal::Memory* memory = dev().getGpuMemory(it);

    // If resource is a shared copy of original resource, then
    // runtime needs to copy data back to original resource
    it->getInteropObj()->copySharedToOrig();
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);
  pal::Memory* pGpuMemory = dev().getGpuMemory(&vcmd.memory());

  GpuEvent gpuEvent;
  uint32_t value = vcmd.markerValue();

  if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) {
    eventBegin(MainEngine);
    addVmMemory(pGpuMemory);

    iCmd()->CmdWaitBusAddressableMemoryMarker(*(pGpuMemory->iMem()), value, 0xFFFFFFFF,
                                              Pal::CompareFunc::GreaterEqual);
    eventEnd(MainEngine, gpuEvent);

  } else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) {
    EngineType activeEngineID = engineID_;
    engineID_ = static_cast<EngineType>(pGpuMemory->getGpuEvent(*this)->engineId_);

    // Make sure GPU finished operation and data reached memory before the marker write
    static constexpr bool FlushL2 = true;
    addBarrier(RgpSqqtBarrierReason::SignalSubmit, FlushL2);
    // Workarounds: We had systems where an extra delay was necessary.
    {
      // Flush CB associated with the DGMA buffer
      isDone(pGpuMemory->getGpuEvent(*this));
    }

    eventBegin(engineID_);
    queues_[engineID_]->addCmdMemRef(pGpuMemory->memRef());

    queues_[engineID_]->iCmd()->
#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION < 396)
        CmdUpdateBusAddressableMemoryMarker(*(pGpuMemory->iMem()), value);
#else
        CmdUpdateBusAddressableMemoryMarker(*(pGpuMemory->iMem()), vcmd.markerOffset(), value);
#endif
    eventEnd(engineID_, gpuEvent);

    // Restore the original engine
    engineID_ = activeEngineID;
  }

  // Update the global GPU event
  setGpuEvent(gpuEvent);

  profilingEnd(vcmd);
}

void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd) {
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);

  std::vector<amd::Memory*> memObjects = vcmd.memObjects();
  uint32_t numObjects = memObjects.size();
  Pal::GpuMemoryRef* pGpuMemRef = new Pal::GpuMemoryRef[numObjects];
  Pal::IGpuMemory** pGpuMems = new Pal::IGpuMemory*[numObjects];

  for (uint i = 0; i < numObjects; i++) {
    pal::Memory* pGpuMemory = dev().getGpuMemory(memObjects[i]);
    pGpuMemory->syncCacheFromHost(*this);

    pGpuMemRef[i].pGpuMemory = pGpuMemory->iMem();
    pGpuMems[i] = pGpuMemory->iMem();
  }

  dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_,
                                       Pal::GpuMemoryRefCantTrim);
  {
    amd::ScopedLock l(queues_[MainEngine]->lock_);
    dev().iDev()->InitBusAddressableGpuMemory(queues_[MainEngine]->iQueue_, numObjects, pGpuMems);
  }

  if (numObjects != 0) {
    dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_);
  }

  for (uint i = 0; i < numObjects; i++) {
    vcmd.busAddress()[i].surface_bus_address = pGpuMems[i]->Desc().surfaceBusAddr;
    vcmd.busAddress()[i].marker_bus_address = pGpuMems[i]->Desc().markerBusAddr;
  }
  profilingEnd(vcmd);
}


bool VirtualGPU::awaitCompletion(CommandBatch* cb, const amd::Event* waitingEvent) {
  bool found = false;
  amd::Command* current;
  amd::Command* head = cb->head_;

  // Make sure that profiling is enabled
  if (state_.profileEnabled_) {
    return profilingCollectResults(cb, waitingEvent);
  }
  // Mark the first command in the batch as running
  if (head != nullptr) {
    head->setStatus(CL_RUNNING);
  } else {
    return found;
  }

  // Wait for the last known GPU event
  waitEventLock(cb);

  while (nullptr != head) {
    current = head->getNext();
    if (head->status() == CL_SUBMITTED) {
      head->setStatus(CL_RUNNING);
      head->setStatus(CL_COMPLETE);
    } else if (head->status() == CL_RUNNING) {
      head->setStatus(CL_COMPLETE);
    } else if ((head->status() != CL_COMPLETE) && (current != nullptr)) {
      LogPrintfError("Unexpected command status - %d!", head->status());
    }

    // Check if it's a waiting command
    if (head == waitingEvent) {
      found = true;
    }

    head->release();
    head = current;
  }

  return found;
}

void VirtualGPU::flush(amd::Command* list, bool wait) {
  CommandBatch* cb = nullptr;
  bool gpuCommand = false;

  for (uint i = 0; i < AllEngines; ++i) {
    if (events_[i].isValid()) {
      gpuCommand = true;
    }
  }

  // If the batch doesn't have any GPU command and the list is empty
  if (!gpuCommand && cbQueue_.empty()) {
    state_.forceWait_ = true;
  }

  // Insert the current batch into a list
  if (nullptr != list) {
    if (!freeCbQueue_.empty()) {
      cb = freeCbQueue_.front();
    }

    if (nullptr == cb) {
      cb = new CommandBatch(list, events_, lastTS_);
    } else {
      freeCbQueue_.pop();
      cb->init(list, events_, lastTS_);
    }
  }

  {
    //! @todo: Check if really need a lock
    amd::ScopedLock lock(execution());
    for (uint i = 0; i < AllEngines; ++i) {
      flushDMA(i);
      // Reset event so we won't try to wait again,
      // if runtime didn't submit any commands
      //! @note: it's safe to invalidate events, since
      //! we already saved them with the batch creation step above
      events_[i].invalidate();
    }
  }

  // Mark last TS as nullptr, so runtime won't process empty batches with the old TS
  lastTS_ = nullptr;
  if (nullptr != cb) {
    cbQueue_.push(cb);
  }

  wait |= state_.forceWait_;
  // Loop through all outstanding command batches
  while (!cbQueue_.empty()) {
    cb = cbQueue_.front();
    // Check if command batch finished without a wait
    bool finished = true;
    for (uint i = 0; i < AllEngines; ++i) {
      finished &= isDone(&cb->events_[i]);
    }
    if (finished || wait) {
      // Wait for completion
      awaitCompletion(cb);
      // Release a command batch
      freeCbQueue_.push(cb);
      // Remove command batch from the list
      cbQueue_.pop();
    } else {
      // Early exit if no finished
      break;
    }
  }
  state_.forceWait_ = false;
}

void VirtualGPU::enableSyncedBlit() const { return blitMgr_->enableSynchronization(); }

void VirtualGPU::setGpuEvent(GpuEvent gpuEvent, bool flush) {
  events_[engineID_] = gpuEvent;

  // Flush current DMA buffer if requested
  if (flush) {
    flushDMA(engineID_);
  }
}

void VirtualGPU::flushDMA(uint engineID) {
  if (engineID == MainEngine) {
    // Clear memory dependency state, since runtime flushes compute
    // memoryDependency().clear();
    //!@todo Keep memory dependency alive even if we flush DMA,
    //! since only L2 cache is flushed in KMD frame,
    //! but L1 still has to be invalidated.
  }

  isDone(&events_[engineID]);
}

bool VirtualGPU::waitAllEngines(CommandBatch* cb) {
  uint i;
  GpuEvent* events;  //!< GPU events for the batch

  // If command batch is nullptr then wait for the current
  if (nullptr == cb) {
    events = events_;
  } else {
    events = cb->events_;
  }

  bool earlyDone = true;
  // The first loop is to flush all engines and/or check if
  // engines are idle already
  for (i = 0; i < AllEngines; ++i) {
    earlyDone &= isDone(&events[i]);
  }

  // Rlease all pinned memory
  releasePinnedMem();

  // The second loop is to wait all engines
  for (i = 0; i < AllEngines; ++i) {
    waitForEvent(&events[i]);
  }

  return earlyDone;
}

void VirtualGPU::waitEventLock(CommandBatch* cb) {
  bool earlyDone = false;
  {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());
    earlyDone = waitAllEngines(cb);
  }

  // Free resource cache if we have too many entries
  //! \note we do it here, when all engines are idle,
  // because Vista/Win7 idles GPU on a resource destruction
  static const size_t MinCacheEntries = 4096;
  dev().resourceCache().free(MinCacheEntries);

  // Find the timestamp object of the last command in the batch
  if (cb->lastTS_ != nullptr) {
    // If earlyDone is TRUE, then CPU didn't wait for GPU.
    // Thus the sync point between CPU and GPU is unclear and runtime
    // will use an older adjustment value to maintain the same timeline
    if (!earlyDone ||
        //! \note Workaround for APU(s).
        //! GPU-CPU timelines may go off too much, thus always
        //! force calibration with the last batch in the list
        (cbQueue_.size() <= 1) || (readjustTimeGPU_ == 0)) {
      uint64_t startTimeStampGPU = 0;
      uint64_t endTimeStampGPU = 0;

      // Get the timestamp value of the last command in the batch
      cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);

      uint64_t endTimeStampCPU = amd::Os::timeNanos();
      // Make sure the command batch has a valid GPU TS
      if (!GPU_RAW_TIMESTAMP) {
        // Adjust the base time by the execution time
        readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
      }
    }
  }
}

bool VirtualGPU::allocConstantBuffers() {
  // Allocate constant buffers.
  // Use double size, reported to the app to account for internal arguments
  const uint32_t MinCbSize = 2 * dev().info().maxParameterSize_;
  uint i;

  // Create/reallocate constant buffer resources
  for (i = 0; i < MaxConstBuffersArguments; ++i) {
    ConstantBuffer* constBuf = new ConstantBuffer(managedBuffer_, MinCbSize);

    if ((constBuf != nullptr) && constBuf->Create()) {
      addConstBuffer(constBuf);
    } else {
      // We failed to create a constant buffer
      delete constBuf;
      return false;
    }
  }

  return true;
}

void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
  // Is profiling enabled?
  if (command.profilingInfo().enabled_) {
    // Allocate a timestamp object from the cache
    TimeStamp* ts = tsCache_->allocTimeStamp();
    if (nullptr == ts) {
      return;
    }
    // Save the TimeStamp object in the current OCL event
    command.setData(ts);
    profileTs_ = ts;
    state_.profileEnabled_ = true;
  }
}

void VirtualGPU::profilingEnd(amd::Command& command) {
  // Get the TimeStamp object associated witht the current command
  TimeStamp* ts = reinterpret_cast<TimeStamp*>(command.data());
  if (ts != nullptr) {
    // Check if the command actually did any GPU submission
    if (ts->isValid()) {
      lastTS_ = ts;
    } else {
      // Destroy the TimeStamp object
      tsCache_->freeTimeStamp(ts);
      command.setData(nullptr);
    }
  }
}

bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent) {
  bool found = false;
  amd::Command* current;
  amd::Command* first = cb->head_;

  // If the command list is, empty then exit
  if (nullptr == first) {
    return found;
  }

  // Wait for the last known GPU events on all engines
  waitEventLock(cb);

  // Find the CPU base time of the entire command batch execution
  uint64_t endTimeStamp = amd::Os::timeNanos();
  uint64_t startTimeStamp = endTimeStamp;

  // First step, walk the command list to find the first valid command
  //! \note The batch may have empty markers at the beginning.
  //! So the start/end of the empty commands is equal to
  //! the start of the first valid command in the batch.
  first = cb->head_;
  while (nullptr != first) {
    // Get the TimeStamp object associated witht the current command
    TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());

    if (ts != nullptr) {
      ts->value(&startTimeStamp, &endTimeStamp);
      endTimeStamp -= readjustTimeGPU_;
      startTimeStamp -= readjustTimeGPU_;
      // Assign to endTimeStamp the start of the first valid command
      endTimeStamp = startTimeStamp;
      break;
    }
    first = first->getNext();
  }

  // Second step, walk the command list to construct the time line
  first = cb->head_;
  while (nullptr != first) {
    // Get the TimeStamp object associated witht the current command
    TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());

    current = first->getNext();

    if (ts != nullptr) {
      ts->value(&startTimeStamp, &endTimeStamp);
      endTimeStamp -= readjustTimeGPU_;
      startTimeStamp -= readjustTimeGPU_;
      // Destroy the TimeStamp object
      tsCache_->freeTimeStamp(ts);
      first->setData(nullptr);
    } else {
      // For empty commands start/end is equal to
      // the end of the last valid command
      startTimeStamp = endTimeStamp;
    }

    // Update the command status with the proper timestamps
    if (first->status() == CL_SUBMITTED) {
      first->setStatus(CL_RUNNING, startTimeStamp);
      first->setStatus(CL_COMPLETE, endTimeStamp);
    } else if (first->status() == CL_RUNNING) {
      first->setStatus(CL_COMPLETE, endTimeStamp);
    } else if ((first->status() != CL_COMPLETE) && (current != nullptr)) {
      LogPrintfError("Unexpected command status - %d!", first->status());
    }

    // Do we wait this event?
    if (first == waitingEvent) {
      found = true;
    }

    first->release();
    first = current;
  }

  return found;
}

void VirtualGPU::addDoppRef(const Memory* memory, bool lastDoppCmd, bool pfpaDoppCmd) {
  queues_[MainEngine]->addCmdDoppRef(memory->iMem(), lastDoppCmd, pfpaDoppCmd);
}

void VirtualGPU::profileEvent(EngineType engine, bool type) const {
  if (nullptr == profileTs_) {
    return;
  }
  if (type) {
    profileTs_->begin();
  } else {
    profileTs_->end();
  }
}

bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params,
                                      bool nativeMem, size_t& ldsAddress) {
  const amd::KernelParameters& kernelParams = kernel.parameters();

  // Mark the tracker with a new kernel,
  // so we can avoid checks of the aliased objects
  memoryDependency().newKernel();

  size_t count = kernelParams.getNumberOfSvmPtr();
  if (count > 0) {
    bool supportFineGrainedSystem = dev().isFineGrainedSystem(true);
    FGSStatus status = kernelParams.getSvmSystemPointersSupport();
    switch (status) {
      case FGS_YES:
        if (!supportFineGrainedSystem) {
          return false;
        }
        break;
      case FGS_NO:
        supportFineGrainedSystem = false;
        break;
      case FGS_DEFAULT:
      default:
        break;
    }
    // get svm non arugment information
    void* const* svmPtrArray =
        reinterpret_cast<void* const*>(params + kernelParams.getExecInfoOffset());
    for (size_t i = 0; i < count; i++) {
      amd::Memory* memory = amd::MemObjMap::FindMemObj(svmPtrArray[i]);
      if (nullptr == memory) {
        if (!supportFineGrainedSystem) {
          return false;
        } else {
          addBarrier(RgpSqqtBarrierReason::MemDependency);
          // Clear memory dependency state
          const static bool All = true;
          memoryDependency().clear(!All);
          continue;
        }
      } else {
        Memory* gpuMemory = dev().getGpuMemory(memory);
        if (nullptr != gpuMemory) {
          // Synchronize data with other memory instances if necessary
          gpuMemory->syncCacheFromHost(*this);

          const static bool IsReadOnly = false;
          // Validate SVM passed in the non argument list
          memoryDependency().validate(*this, gpuMemory, IsReadOnly);

          // Wait for resource if it was used on an inactive engine
          //! \note syncCache may call DRM transfer
          constexpr bool WaitOnBusyEngine = true;
          gpuMemory->wait(*this, WaitOnBusyEngine);

          // Mark signal write for cache coherency,
          // since this object isn't a part of kernel arg setup
          if ((memory->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
            memory->signalWrite(&dev());
          }
          addVmMemory(gpuMemory);
        } else {
          return false;
        }
      }
    }
  }

  bool srdResource = false;
  amd::Memory* const* memories =
      reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());
  const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
  const amd::KernelSignature& signature = kernel.signature();
  ldsAddress = hsaKernel.ldsSize();

  if (!nativeMem) {
    // Process cache coherency first, since the extra transfers may affect
    // other mem dependency tracking logic: TS and signalWrite()
    for (uint i = 0; i < signature.numMemories(); ++i) {
      amd::Memory* mem = memories[i];
      if (mem != nullptr) {
        // Synchronize data with other memory instances if necessary
        dev().getGpuMemory(mem)->syncCacheFromHost(*this);
      }
    }
  }

  // Check all parameters for the current kernel
  for (size_t i = 0; i < signature.numParameters(); ++i) {
    const amd::KernelParameterDescriptor& desc = signature.at(i);
    const amd::KernelParameterDescriptor::InfoData& info = desc.info_;

    // Find if current argument is a buffer
    if (desc.type_ == T_POINTER) {
      // If it is a local pointer
      if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
        ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
        if (desc.size_ == 8) {
          // Save the original LDS size
          uint64_t ldsSize = *reinterpret_cast<const uint64_t*>(params + desc.offset_);
          // Patch the LDS address in the original arguments with an LDS address(offset)
          WriteAqlArgAt(const_cast<address>(params), &ldsAddress, desc.size_, desc.offset_);
          // Add the original size
          ldsAddress += ldsSize;
        } else {
          // Save the original LDS size
          uint32_t ldsSize = *reinterpret_cast<const uint32_t*>(params + desc.offset_);
          // Patch the LDS address in the original arguments with an LDS address(offset)
          uint32_t ldsAddr = ldsAddress;
          WriteAqlArgAt(const_cast<address>(params), &ldsAddr, desc.size_, desc.offset_);
          // Add the original size
          ldsAddress += ldsSize;
        }
      } else {
        Memory* gpuMem = nullptr;
        amd::Memory* mem = nullptr;
        uint32_t index = info.arrayIndex_;
        if (nativeMem) {
          gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
          if (nullptr != gpuMem) {
            mem = gpuMem->owner();
          }
        } else {
          mem = memories[index];
          if (mem != nullptr) {
            gpuMem = dev().getGpuMemory(mem);
          }
        }
        //! This condition is for SVM fine-grain
        if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
          addBarrier(RgpSqqtBarrierReason::MemDependency);
          // Clear memory dependency state
          const static bool All = true;
          memoryDependency().clear(!All);
          continue;
        } else if (gpuMem != nullptr) {
          // Validate memory for a dependency in the queue
          memoryDependency().validate(*this, gpuMem, info.readOnly_);
          // Wait for resource if it was used on an inactive engine
          //! \note syncCache may call DRM transfer
          constexpr bool WaitOnBusyEngine = true;
          gpuMem->wait(*this, WaitOnBusyEngine);

          addVmMemory(gpuMem);
          const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
          ClPrint(amd::LOG_INFO, amd::LOG_KERN, "!\targ%d: %s %s = ptr:%p obj:[%p-%p] threadId : %zx\n", index,
                  desc.typeName_.c_str(), desc.name_.c_str(), globalAddress,
                  reinterpret_cast<void*>(gpuMem->vmAddress()),
                  reinterpret_cast<void*>(gpuMem->vmAddress() + gpuMem->size()),
                  std::this_thread::get_id());

          //! Check if compiler expects read/write.
          //! Note: SVM with subbuffers has an issue with tracking.
          //! Conformance can send read only subbuffer, but update the region
          //! in the kernel.
          if ((mem != nullptr) &&
              ((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) ||
               ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) {
            mem->signalWrite(&dev());
          }
          if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
            if (gpuMem->memoryType() == Resource::ImageBuffer) {
              Image* imageBuffer = static_cast<Image*>(gpuMem);
              // Check if synchronization has to be performed
              if (imageBuffer->CopyImageBuffer() != nullptr) {
                Memory* buffer = dev().getGpuMemory(mem->parent());
                amd::Image* image = mem->asImage();
                amd::Coord3D offs(0);
                // Copy memory from the original image buffer into the backing store image
                bool result = blitMgr().copyBufferToImage(
                    *buffer, *imageBuffer->CopyImageBuffer(), offs, offs, image->getRegion(), true,
                    image->getRowPitch(), image->getSlicePitch());
                // Make sure the copy operation is done
                addBarrier(RgpSqqtBarrierReason::MemDependency);
                // Use backing store SRD as the replacment
                uint64_t srd = imageBuffer->CopyImageBuffer()->hwSrd();
                WriteAqlArgAt(const_cast<address>(params), &srd, sizeof(srd), desc.offset_);
                // Add backing store image to the list of memory handles
                addVmMemory(imageBuffer->CopyImageBuffer());
                // If it's not a read only resource, then runtime has to write back
                if (!info.readOnly_) {
                  wrtBackImageBuffer_.push_back(imageBuffer);
                  state_.imageBufferWrtBack_ = true;
                }
              }
            }

            //! \note Special case for the image views.
            //! Copy SRD to CB1, so blit manager will be able to release
            //! this view without a wait for SRD resource.
            if (gpuMem->memoryType() == Resource::ImageView) {
              // Copy the current image SRD into CB1
              uint64_t srd = cb(1)->UploadDataToHw(gpuMem->hwState(), HsaImageObjectSize);
              // Then use a pointer in aqlArgBuffer to CB1
              // Patch the GPU VA address in the original arguments
              WriteAqlArgAt(const_cast<address>(params), &srd, sizeof(srd), desc.offset_);
              addVmMemory(cb(1)->ActiveMemory());
            } else {
              srdResource = true;
            }
            if (gpuMem->desc().isDoppTexture_) {
              addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(),
                         kernel.parameters().getExecPfpaVcop());
            }
          }
        }
      }
    } else if (desc.type_ == T_VOID) {
      if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
        // Copy the current structure into CB1
        size_t gpuPtr =
            static_cast<size_t>(cb(1)->UploadDataToHw(params + desc.offset_, desc.size_));
        // Then use a pointer in aqlArgBuffer to CB1
        const auto it = hsaKernel.patch().find(desc.offset_);
        // Patch the GPU VA address in the original arguments
        WriteAqlArgAt(const_cast<address>(params), &gpuPtr, sizeof(size_t), it->second);
        addVmMemory(cb(1)->ActiveMemory());
      }
    } else if (desc.type_ == T_SAMPLER) {
      srdResource = true;
    } else if (desc.type_ == T_QUEUE) {
      uint32_t index = desc.info_.arrayIndex_;
      const amd::DeviceQueue* queue =
          reinterpret_cast<amd::DeviceQueue* const*>(params + kernelParams.queueObjOffset())[index];
      VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
      uint64_t vmQueue;
      if (dev().settings().useDeviceQueue_) {
        vmQueue = gpuQueue->vQueue()->vmAddress();
      } else {
        if (!createVirtualQueue(queue->size())) {
          LogError("Virtual queue creation failed!");
          return false;
        }
        vmQueue = vQueue()->vmAddress();
      }
      // Patch the GPU VA address in the original arguments
      WriteAqlArgAt(const_cast<address>(params), &vmQueue, sizeof(vmQueue), desc.offset_);
      break;
    }
  }

  if (ldsAddress > dev().info().localMemSize_) {
    LogError("No local memory available\n");
    return false;
  }

  if (srdResource || hsaKernel.prog().isStaticSampler()) {
    dev().srds().fillResourceList(*this);
  }

  const static bool IsReadOnly = false;
  for (const pal::Memory* mem : hsaKernel.prog().globalStores()) {
    // Validate global store for a dependency in the queue
    memoryDependency().validate(*this, mem, IsReadOnly);
    addVmMemory(mem);
  }
  if (hsaKernel.prog().hasGlobalStores()) {
    // Validate code object for a dependency in the queue
    memoryDependency().validate(*this, &hsaKernel.prog().codeSegGpu(), IsReadOnly);
    std::vector<amd::Memory*> undefined = hsaKernel.prog().getUndefMemObj();
    for (auto it = undefined.begin(); it != undefined.end(); ++it) {
      addVmMemory(dev().getGpuMemory(*it));
    }
  }

  addVmMemory(&hsaKernel.prog().codeSegGpu());

  if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
    const Device::ScratchBuffer* scratch = dev().scratch(hwRing());
    // Validate scratch buffer to force sync mode, because
    // the current scratch logic is optimized for size and performance
    memoryDependency().validate(*this, scratch->memObj_, IsReadOnly);
    addVmMemory(scratch->memObj_);
  }

  return true;
}

amd::Memory* VirtualGPU::createBufferFromImage(amd::Memory& amdImage) {
  amd::Memory* mem = new (amdImage.getContext()) amd::Buffer(amdImage, 0, 0, amdImage.getSize());
  mem->setVirtualDevice(this);
  if ((mem != nullptr) && !mem->create()) {
    mem->release();
  }

  return mem;
}

void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, const Memory* kernelTable) {
  if (nullptr == kernelTable) {
    vqHeader_->kernel_table = 0;
  } else {
    vqHeader_->kernel_table = kernelTable->vmAddress();
    addVmMemory(kernelTable);
  }

  virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, true);
}

void VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, hsa_kernel_dispatch_packet_t* aqlPkt,
                                 HwDbgKernelInfo& kernelInfo, amd::Event* enqueueEvent) {
  amd::HwDebugManager* dbgManager = dev().hwDebugMgr();
  assert(dbgManager && "No HW Debug Manager!");

  // Initialize structure with default values

  if (hsaKernel.prog().maxScratchRegs() > 0) {
    pal::Memory* scratchBuf = dev().scratch(hwRing())->memObj_;
    kernelInfo.scratchBufAddr = scratchBuf->vmAddress();
    kernelInfo.scratchBufferSizeInBytes = scratchBuf->size();

    // Get the address of the scratch buffer and its size for CPU access
    address scratchRingAddr = nullptr;
    scratchRingAddr = static_cast<address>(scratchBuf->map(nullptr, 0));
    dbgManager->setScratchRing(scratchRingAddr, scratchBuf->size());
    scratchBuf->unmap(nullptr);
  } else {
    kernelInfo.scratchBufAddr = 0;
    kernelInfo.scratchBufferSizeInBytes = 0;
    dbgManager->setScratchRing(nullptr, 0);
  }

  //! @todo:  need to verify what is wanted for the global memory
  Unimplemented();
  kernelInfo.heapBufAddr = 0;

  kernelInfo.pAqlDispatchPacket = aqlPkt;
  kernelInfo.pAqlQueuePtr = reinterpret_cast<void*>(hsaQueueMem_->vmAddress());

  // Get the address of the kernel code and its size for CPU access
  /*    pal::Memory* aqlCode = hsaKernel.gpuAqlCode();
      if (nullptr != aqlCode) {
          address aqlCodeAddr = static_cast<address>(aqlCode->map(nullptr, 0));
          dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize());
          aqlCode->unmap(nullptr);
      }
      else {
          dbgManager->setKernelCodeInfo(nullptr, 0);
      }
  */
  kernelInfo.trapPresent = false;
  kernelInfo.trapHandler = nullptr;
  kernelInfo.trapHandlerBuffer = nullptr;

  kernelInfo.excpEn = 0;
  kernelInfo.cacheDisableMask = 0;
  kernelInfo.sqDebugMode = 0;

  kernelInfo.mgmtSe0Mask = 0xFFFFFFFF;
  kernelInfo.mgmtSe1Mask = 0xFFFFFFFF;

  // set kernel info for HW debug and call the callback function
  if (nullptr != dbgManager->preDispatchCallBackFunc()) {
    DebugToolInfo dbgSetting = {0};
    dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr;
    dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes;
    dbgSetting.globalAddress_ = kernelInfo.heapBufAddr;
    dbgSetting.aclBinary_ = hsaKernel.prog().binaryElf();
    dbgSetting.event_ = enqueueEvent;

    // Call the predispatch callback function & set the trap info
    AqlCodeInfo aqlCodeInfo;
    aqlCodeInfo.aqlCode_ = (amd_kernel_code_t*)hsaKernel.cpuAqlCode();
    aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize();

    // Execute the pre-dispatch call back function
    dbgManager->executePreDispatchCallBack(reinterpret_cast<void*>(aqlPkt), &dbgSetting);

    // assign the debug TMA and TBA for kernel dispatch
    if (nullptr != dbgSetting.trapHandler_ && nullptr != dbgSetting.trapBuffer_) {
      assignDebugTrapHandler(dbgSetting, kernelInfo);
    }

    kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false;

    // Execption policy
    kernelInfo.excpEn = dbgSetting.exceptionMask_;
    kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_;
    kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_;

    // Compute the mask for reserved CUs. These two dwords correspond to
    // two registers used for reserving CUs for display. In the current
    // implementation, the number of CUs reserved can be 0 to 7, and it
    // is set by debugger users.
    if (dbgSetting.monitorMode_) {
      uint32_t i = dbgSetting.reservedCuNum_ / 2;
      kernelInfo.mgmtSe0Mask <<= i;
      i = dbgSetting.reservedCuNum_ - i;
      kernelInfo.mgmtSe1Mask <<= i;
    }
    Unimplemented();
    /*
            // flush/invalidate the instruction, data, L1 and L2 caches
            InvalidateSqCaches();
    */
  }
}

void VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting,
                                        HwDbgKernelInfo& kernelInfo) {
  // setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching
  //
  Memory* rtTrapHandlerMem = static_cast<Memory*>(dev().hwDebugMgr()->runtimeTBA());
  Memory* rtTrapBufferMem = static_cast<Memory*>(dev().hwDebugMgr()->runtimeTMA());

  kernelInfo.trapHandler = reinterpret_cast<void*>(rtTrapHandlerMem->vmAddress() + TbaStartOffset);
  // With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero.
  // However, by setting the runtime trap buffer (TMA) correct, the runtime trap hander
  // without the workaround can still function correctly.
  kernelInfo.trapHandlerBuffer = reinterpret_cast<void*>(rtTrapBufferMem->vmAddress());

  address rtTrapBufferAddress = static_cast<address>(rtTrapBufferMem->map(this));

  Memory* trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_);
  Memory* trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_);

  // Address of the trap handler code/buffer should be 256-byte aligned
  uint64_t tbaAddress = trapHandlerMem->vmAddress();
  uint64_t tmaAddress = trapBufferMem->vmAddress();
  if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) {
    assert(false && "Trap handler/buffer is not 256-byte aligned");
  }

  // The addresses of the debug trap handler code (TBA) and buffer (TMA) are
  // stored in the runtime trap handler buffer with offset location of 0x18-19
  // and 0x20-21, respectively.
  uint64_t* rtTmaPtr = reinterpret_cast<uint64_t*>(rtTrapBufferAddress + 0x18);
  rtTmaPtr[0] = tbaAddress;
  rtTmaPtr[1] = tmaAddress;

  rtTrapBufferMem->unmap(nullptr);
  // Add GPU mem handles to the memory list for VidMM
  addVmMemory(trapHandlerMem);
  addVmMemory(trapBufferMem);
  addVmMemory(rtTrapHandlerMem);
  addVmMemory(rtTrapBufferMem);
}

bool VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst) {
  uint64_t srcVmEnd = src.vmAddress() + src.vmSize();
  if (((src.vmAddress() >= sdmaRange_.start_) && (src.vmAddress() <= sdmaRange_.end_)) ||
      ((srcVmEnd >= sdmaRange_.start_) && (srcVmEnd <= sdmaRange_.end_)) ||
      ((src.vmAddress() <= sdmaRange_.start_) && (srcVmEnd >= sdmaRange_.end_))) {
    sdmaRange_.start_ = dst.vmAddress();
    sdmaRange_.end_ = dst.vmAddress() + dst.vmSize();
    return true;
  }

  sdmaRange_.start_ = std::min(sdmaRange_.start_, dst.vmAddress());
  sdmaRange_.end_ = std::max(sdmaRange_.end_, dst.vmAddress() + dst.vmSize());
  return false;
}

void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) {
  size_t copySize = cmd.size()[0];
  size_t fileOffset = cmd.fileOffset();
  Memory* mem = dev().getGpuMemory(&cmd.memory());
  uint idx = 0;

  assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) ||
         (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD));
  const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD);

  if (writeBuffer) {
    size_t dstOffset = cmd.origin()[0];
    while (copySize > 0) {
      Memory* staging = dev().getGpuMemory(&cmd.staging(idx));
      size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize;
      dstSize = std::min(dstSize, copySize);
      void* dstBuffer = staging->cpuMap(*this);
      if (!cmd.file()->transferBlock(writeBuffer, dstBuffer, staging->size(), fileOffset, 0,
                                     dstSize)) {
        cmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
      staging->cpuUnmap(*this);

      blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false);
      flushDMA(staging->getGpuEvent(*this)->engineId_);
      fileOffset += dstSize;
      dstOffset += dstSize;
      copySize -= dstSize;
    }
  } else {
    size_t srcOffset = cmd.origin()[0];
    while (copySize > 0) {
      Memory* staging = dev().getGpuMemory(&cmd.staging(idx));
      size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize;
      srcSize = std::min(srcSize, copySize);
      blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false);

      void* srcBuffer = staging->cpuMap(*this);
      if (!cmd.file()->transferBlock(writeBuffer, srcBuffer, staging->size(), fileOffset, 0,
                                     srcSize)) {
        cmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
      staging->cpuUnmap(*this);

      fileOffset += srcSize;
      srcOffset += srcSize;
      copySize -= srcSize;
    }
  }
}
}  // namespace pal