rocm-systems/rocclr/runtime/device/pal/palvirtual.cpp

//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "platform/perfctr.hpp"
#include "platform/threadtrace.hpp"
#include "platform/kernel.hpp"
#include "platform/commandqueue.hpp"
#include "device/pal/palconstbuf.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/palkernel.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palcounters.hpp"
#include "device/pal/palthreadtrace.hpp"
#include "device/pal/paltimestamp.hpp"
#include "device/pal/palblit.hpp"
#include "device/pal/paldebugger.hpp"
#include "device/appprofile.hpp"
#include "hsa.h"
#include "amd_hsa_kernel_code.h"
#include "amd_hsa_queue.h"
#include <fstream>
#include <sstream>
#include <algorithm>
#include "palQueue.h"
#include "palFence.h"

#ifdef _WIN32
#include <d3d10_1.h>
#include "amdocl/cl_d3d9_amd.hpp"
#include "amdocl/cl_d3d10_amd.hpp"
#include "amdocl/cl_d3d11_amd.hpp"
#endif  // _WIN32

namespace pal {

VirtualGPU::Queue* VirtualGPU::Queue::Create(Pal::IDevice* palDev, Pal::QueueType queueType,
                                             uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
                                             uint rtCU, amd::CommandQueue::Priority priority,
                                             uint64_t residency_limit, uint max_command_buffers) {
  Pal::Result result;
  Pal::CmdBufferCreateInfo cmdCreateInfo = {};
  Pal::QueueCreateInfo qCreateInfo = {};
  qCreateInfo.engineIndex = engineIdx;
  qCreateInfo.aqlQueue = true;
  qCreateInfo.queueType = queueType;

  if (queueType == Pal::QueueTypeDma) {
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeDma;
  } else {
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
  }

  if (priority == amd::CommandQueue::Priority::Medium) {
    qCreateInfo.engineIndex = 0x1;
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute;
  } else if (amd::CommandQueue::RealTimeDisabled != rtCU) {
    qCreateInfo.numReservedCu = rtCU;
    qCreateInfo.engineIndex = 0x0;
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute;
    cmdCreateInfo.flags.realtimeComputeUnits = true;
  }

  // Find queue object size
  size_t qSize = palDev->GetQueueSize(qCreateInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  cmdCreateInfo.pCmdAllocator = cmdAllocator;
  cmdCreateInfo.queueType = queueType;

  // Find command buffer object size
  size_t cmdSize = palDev->GetCmdBufferSize(cmdCreateInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  // Find fence object size
  size_t fSize = palDev->GetFenceSize(&result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
  VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev, residency_limit, max_command_buffers);
  if (queue != nullptr) {
    address addrQ = reinterpret_cast<address>(&queue[1]);
    // Create PAL queue object
    result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
    if (result != Pal::Result::Success) {
      delete queue;
      return nullptr;
    }
    queue->UpdateAppPowerProfile();
    address addrCmd = addrQ + qSize;
    address addrF = addrCmd + max_command_buffers * cmdSize;
    Pal::CmdBufferBuildInfo cmdBuildInfo = {};

    for (uint i = 0; i < max_command_buffers; ++i) {
      result = palDev->CreateCmdBuffer(cmdCreateInfo, &addrCmd[i * cmdSize], &queue->iCmdBuffs_[i]);
      if (result != Pal::Result::Success) {
        delete queue;
        return nullptr;
      }

      Pal::FenceCreateInfo fenceCreateinfo = {};
      fenceCreateinfo.flags.signaled = false;
      result = palDev->CreateFence(fenceCreateinfo, &addrF[i * fSize], &queue->iCmdFences_[i]);
      if (result != Pal::Result::Success) {
        delete queue;
        return nullptr;
      }
      if (i == StartCmdBufIdx) {
        result = queue->iCmdBuffs_[i]->Begin(cmdBuildInfo);
        if (result != Pal::Result::Success) {
          delete queue;
          return nullptr;
        }
      }
    }
  }
  return queue;
}

VirtualGPU::Queue::~Queue() {
  // Remove all memory references
  std::vector<Pal::IGpuMemory*> memRef;
  for (auto it : memReferences_) {
    memRef.push_back(it.first->iMem());
  }
  if (memRef.size() != 0) {
    iDev_->RemoveGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_);
  }
  memReferences_.clear();

  for (uint i = 0; i < max_command_buffers_; ++i) {
    if (nullptr != iCmdBuffs_[i]) {
      iCmdBuffs_[i]->Destroy();
    }
    if (nullptr != iCmdFences_[i]) {
      iCmdFences_[i]->Destroy();
    }
  }

  if (nullptr != iQueue_) {
    iQueue_->Destroy();
  }
}

Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile()
{
    std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName();

    const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str();
    // Find the last occurance of the '\\' character and extract the name of the application as wide char.
    const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\');
    const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName;

    return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName);
}

void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
  Pal::IGpuMemory* iMem = mem->iMem();
  auto it = memReferences_.find(mem);
  if (it != memReferences_.end()) {
    it->second = cmdBufIdSlot_;
  } else {
    // Update runtime tracking with TS
    memReferences_[mem] = cmdBufIdSlot_;
    // Update PAL list with the new entry
    Pal::GpuMemoryRef memRef = {};
    memRef.pGpuMemory = iMem;
    palMemRefs_.push_back(memRef);
    // Check SDI memory object
    if (iMem->Desc().flags.isExternPhys &&
        (sdiReferences_.find(iMem) == sdiReferences_.end())) {
      sdiReferences_.insert(iMem);
      palSdiRefs_.push_back(iMem);
    }
    residency_size_ += iMem->Desc().size;
  }
}

void VirtualGPU::Queue::removeCmdMemRef(GpuMemoryReference* mem) {
  Pal::IGpuMemory* iMem = mem->iMem();
  if (0 != memReferences_.erase(mem)) {
    iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
    residency_size_ -= iMem->Desc().size;
  }
}

void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, bool pfpaDoppCmd) {
  for (size_t i = 0; i < palDoppRefs_.size(); i++) {
    if (palDoppRefs_[i].pGpuMemory == iMem) {
      // If both LAST_DOPP_SUBMISSION and PFPA_DOPP_SUBMISSION VCOPs are requested,
      // the LAST_DOPP_SUBMISSION is send as requsted by KMD
      //
      if (palDoppRefs_[i].flags.lastPfpaCmd == 1) {
        return;  // no need to override the last submission command
      }

      if (lastDoppCmd) {
        palDoppRefs_[i].flags.lastPfpaCmd = 1;
        palDoppRefs_[i].flags.pfpa = 0;
      } else if (pfpaDoppCmd) {
        palDoppRefs_[i].flags.pfpa = 1;
      }
      return;
    }
  }

  //  this is the first reference of the DOPP desktop texture, add it in the vector
  Pal::DoppRef doppRef = {};
  doppRef.flags.pfpa = pfpaDoppCmd ? 1 : 0;
  doppRef.flags.lastPfpaCmd = lastDoppCmd ? 1 : 0;
  doppRef.pGpuMemory = iMem;
  palDoppRefs_.push_back(doppRef);
}

bool VirtualGPU::Queue::flush() {
  // Stop commands building
  if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->End()) {
    LogError("PAL failed to finalize a command buffer!");
    return false;
  }

  if (palMemRefs_.size() != 0) {
    if (Pal::Result::Success !=
        iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
                                      Pal::GpuMemoryRefCantTrim)) {
      LogError("PAL failed to make resident resources!");
      return false;
    }
    palMemRefs_.resize(0);
  }

  // Reset the fence. PAL will reset OS event
  if (Pal::Result::Success != iDev_->ResetFences(1, &iCmdFences_[cmdBufIdSlot_])) {
    LogError("PAL failed to reset a fence!");
    return false;
  }

  Pal::SubmitInfo submitInfo = {};
  submitInfo.cmdBufferCount = 1;
  submitInfo.ppCmdBuffers = &iCmdBuffs_[cmdBufIdSlot_];
  submitInfo.pFence = iCmdFences_[cmdBufIdSlot_];
  submitInfo.doppRefCount = palDoppRefs_.size();
  submitInfo.pDoppRefs = palDoppRefs_.data();
  submitInfo.externPhysMemCount = palSdiRefs_.size();
  submitInfo.ppExternPhysMem = palSdiRefs_.data();

  // Submit command buffer to OS
  if (Pal::Result::Success != iQueue_->Submit(submitInfo)) {
    LogError("PAL failed to submit CMD!");
    return false;
  }
  // Make sure the slot isn't busy
  constexpr bool IbReuse = true;
  if (GPU_FLUSH_ON_EXECUTION) {
    waifForFence<!IbReuse>(cmdBufIdSlot_);
  }

  // Reset the counter of commands
  cmdCnt_ = 0;

  // Find the next command buffer
  cmdBufIdCurrent_++;

  if (cmdBufIdCurrent_ == GpuEvent::InvalidID) {
    // Wait for the last one
    waifForFence<!IbReuse>(cmdBufIdSlot_);
    cmdBufIdCurrent_ = 1;
    cmbBufIdRetired_ = 0;
  }

  // Wrap current slot
  cmdBufIdSlot_ = cmdBufIdCurrent_ % max_command_buffers_;

  waifForFence<IbReuse>(cmdBufIdSlot_);

  // Progress retired TS
  if ((cmdBufIdCurrent_ > max_command_buffers_) &&
      (cmbBufIdRetired_ < (cmdBufIdCurrent_ - max_command_buffers_))) {
    cmbBufIdRetired_ = cmdBufIdCurrent_ - max_command_buffers_;
  }

  // Reset command buffer, so CB chunks could be reused
  if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false)) {
    LogError("PAL failed CB reset!");
    return false;
  }
  // Start command buffer building
  Pal::CmdBufferBuildInfo cmdBuildInfo = {};
  cmdBuildInfo.pMemAllocator = &vlAlloc_;
  if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) {
    LogError("PAL failed CB building initialization!");
    return false;
  }

  // Clear dopp references
  palDoppRefs_.resize(0);
  palSdiRefs_.resize(0);

  // Remove old memory references
  if ((memReferences_.size() > 2048) || (residency_size_ > residency_limit_)) {
    for (auto it = memReferences_.begin(); it != memReferences_.end();) {
      if (it->second == cmdBufIdSlot_) {
        palMems_.push_back(it->first->iMem());
        residency_size_ -= it->first->iMem()->Desc().size;
        it = memReferences_.erase(it);
      } else {
        ++it;
      }
    }
  }
  if (palMems_.size() != 0) {
    iDev_->RemoveGpuMemoryReferences(palMems_.size(), &palMems_[0], iQueue_);
    palMems_.resize(0);
  }

  return true;
}

bool VirtualGPU::Queue::waitForEvent(uint id) {
  if (isDone(id)) {
    return true;
  }

  uint slotId = id % max_command_buffers_;
  constexpr bool IbReuse = true;
  bool result = waifForFence<!IbReuse>(slotId);
  cmbBufIdRetired_ = id;
  return result;
}

bool VirtualGPU::Queue::isDone(uint id) {
  if ((id <= cmbBufIdRetired_) || (id > cmdBufIdCurrent_)) {
    return true;
  }

  if (id == cmdBufIdCurrent_) {
    // Flush the current command buffer
    flush();
  }

  if (Pal::Result::Success != iCmdFences_[id % max_command_buffers_]->GetStatus()) {
    return false;
  }
  cmbBufIdRetired_ = id;
  return true;
}

void VirtualGPU::Queue::DumpMemoryReferences() const {
  std::fstream dump;
  std::stringstream file_name("ocl_hang_dump.txt");
  uint64_t start = amd::Os::timeNanos() / 1e9;

  dump.open(file_name.str().c_str(), (std::fstream::out | std::fstream::app));
  // Check if we have OpenCL program
  if (dump.is_open()) {
    dump << start << " Queue: ";
    switch (iQueue_->Type()) {
    case Pal::QueueTypeCompute:
      dump << "Compute";
      break;
    case Pal::QueueTypeDma:
      dump << "SDMA";
      break;
    default:
      dump << "unknown";
      break;
    }
    dump << "\n"
        << "Resident memory resources:\n";
    uint idx = 0;
    for (auto it : memReferences_) {
      dump << " " << idx << "\t[";
      dump.setf(std::ios::hex, std::ios::basefield);
      dump.setf(std::ios::showbase);
      dump << (it.first)->iMem()->Desc().gpuVirtAddr << ", "
          << (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
      dump.setf(std::ios::dec);
      dump << "] CbId:" << it.second <<
          ", Heap: " << (it.first)->iMem()->Desc().preferredHeap << "\n";
      idx++;
    }

    if (last_kernel_ != nullptr) {
      const amd::KernelSignature& signature = last_kernel_->signature();
      dump << last_kernel_->name() << std::endl;
      for (size_t i = 0; i < signature.numParameters(); ++i) {
        const amd::KernelParameterDescriptor& desc = signature.at(i);
        // Find if the current argument is a memory object
        if ((desc.type_ == T_POINTER) &&
            (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
          dump << " " << desc.name_ << ": " << std::endl;
        }
      }
    }
    dump.close();
  }
}

bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
  if (numMemObj > 0) {
    // Allocate the array of memory objects for dependency tracking
    memObjectsInQueue_ = new MemoryState[numMemObj];
    if (nullptr == memObjectsInQueue_) {
      return false;
    }
    memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj);
    maxMemObjectsInQueue_ = numMemObj;
  }

  return true;
}

void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) {
  bool flushL1Cache = false;

  if (maxMemObjectsInQueue_ == 0) {
    // Flush cache
    gpu.addBarrier();
    return;
  }

  uint64_t curStart = memory->vmAddress();
  uint64_t curEnd = curStart + memory->size();

  if (memory->isModified(gpu) || !readOnly) {
    // Mark resource as modified
    memory->setModified(gpu, !readOnly);

    // Loop through all memory objects in the queue and find dependency
    // @note don't include objects from the current kernel
    for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
      // Check if the queue already contains this mem object and
      // GPU operations aren't readonly
      uint64_t busyStart = memObjectsInQueue_[j].start_;
      uint64_t busyEnd = memObjectsInQueue_[j].end_;

      // Check if the start inside the busy region
      if ((((curStart >= busyStart) && (curStart < busyEnd)) ||
           // Check if the end inside the busy region
           ((curEnd > busyStart) && (curEnd <= busyEnd)) ||
           // Check if the start/end cover the busy region
           ((curStart <= busyStart) && (curEnd >= busyEnd))) &&
          // If the buys region was written or the current one is for write
          (!memObjectsInQueue_[j].readOnly_ || !readOnly)) {
        flushL1Cache = true;
        break;
      }
    }
  }

  // Did we reach the limit?
  if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) {
    flushL1Cache = true;
  }

  if (flushL1Cache) {
    // Flush cache
    if (!gpu.profiling()) {
        gpu.addBarrier();
    }

    // Clear memory dependency state
    const static bool All = true;
    clear(!All);
  }

  // Insert current memory object into the queue always,
  // since runtime calls flush before kernel execution and it has to keep
  // current kernel in tracking
  memObjectsInQueue_[numMemObjectsInQueue_].start_ = curStart;
  memObjectsInQueue_[numMemObjectsInQueue_].end_ = curEnd;
  memObjectsInQueue_[numMemObjectsInQueue_].readOnly_ = readOnly;
  numMemObjectsInQueue_++;
}

void VirtualGPU::MemoryDependency::clear(bool all) {
  if (numMemObjectsInQueue_ > 0) {
    size_t i, j;
    if (all) {
      endMemObjectsInQueue_ = numMemObjectsInQueue_;
    }

    // Preserve all objects from the current kernel
    for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) {
      memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_;
      memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_;
      memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_;
    }
    // Adjust the number of active objects
    numMemObjectsInQueue_ -= endMemObjectsInQueue_;
    endMemObjectsInQueue_ = 0;
  }
}

VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev) : cbWorkload_(0), dispatchSplitSize_(0) {
  aluCnt_ = dev.info().simdPerCU_ * dev.info().simdWidth_ * dev.info().maxComputeUnits_;
  maxDispatchWorkload_ = static_cast<uint64_t>(dev.info().maxEngineClockFrequency_) *
      // find time in us
      dev.settings().maxWorkloadTime_ * aluCnt_;
  resetCbWorkload(dev);
}

void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) {
  cbWorkload_ = 0;
  maxCbWorkload_ = static_cast<uint64_t>(dev.info().maxEngineClockFrequency_) *
      // find time in us
      dev.settings().minWorkloadTime_ * aluCnt_;
}

void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads,
                                             uint instructions) {
  uint64_t workload = threads * instructions;
  if (maxDispatchWorkload_ < workload) {
    dispatchSplitSize_ = static_cast<uint>(maxDispatchWorkload_ / instructions);
    uint fullLoad = dev.info().maxComputeUnits_ * dev.info().preferredWorkGroupSize_;
    if ((dispatchSplitSize_ % fullLoad) != 0) {
      dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad;
    }
  } else {
    dispatchSplitSize_ =
        (threads > dev.settings().workloadSplitSize_) ? dev.settings().workloadSplitSize_ : 0;
  }
}

bool VirtualGPU::DmaFlushMgmt::isCbReady(VirtualGPU& gpu, uint64_t threads, uint instructions) {
  bool cbReady = false;
  uint64_t workload = amd::alignUp(threads, 4 * aluCnt_) * instructions;
  // Add current workload to the overall workload in the current DMA
  cbWorkload_ += workload;
  // Did it exceed maximum?
  if (cbWorkload_ > maxCbWorkload_) {
    // Reset DMA workload
    cbWorkload_ = 0;
    // Increase workload of the next DMA buffer by 50%
    maxCbWorkload_ = maxCbWorkload_ * 3 / 2;
    if (maxCbWorkload_ > maxDispatchWorkload_) {
      maxCbWorkload_ = maxDispatchWorkload_;
    }
    cbReady = true;
  }
  return cbReady;
}

void VirtualGPU::addPinnedMem(amd::Memory* mem) {
  if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
    if (pinnedMems_.size() > 7) {
      pinnedMems_.front()->release();
      pinnedMems_.erase(pinnedMems_.begin());
    }

    // Start operation, since we should release mem object
    flushDMA(dev().getGpuMemory(mem)->getGpuEvent(*this)->engineId_);

    // Delay destruction
    pinnedMems_.push_back(mem);
  }
}

void VirtualGPU::releasePinnedMem() {
  for (auto& amdMemory : pinnedMems_) {
    amdMemory->release();
  }
  pinnedMems_.resize(0);
}

amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
  for (auto& amdMemory : pinnedMems_) {
    if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) {
      return amdMemory;
    }
  }
  return nullptr;
}

bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
  uint MinDeviceQueueSize = 16 * 1024;
  deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize);

  maskGroups_ = deviceQueueSize / (512 * Ki);
  maskGroups_ = (maskGroups_ == 0) ? 1 : maskGroups_;

  // Align the queue size for the multiple dispatch scheduler.
  // Each thread works with 32 entries * maskGroups
  uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_);
  if (extra != 0) {
    deviceQueueSize += (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_) - extra;
  }

  if (deviceQueueSize_ == deviceQueueSize) {
    return true;
  } else {
    delete vqHeader_;
    delete virtualQueue_;
    vqHeader_ = nullptr;
    virtualQueue_ = nullptr;
    deviceQueueSize_ = 0;
  }
  uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
  uint allocSize = deviceQueueSize;

  // Add the virtual queue header
  allocSize += sizeof(AmdVQueueHeader);
  allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap));

  uint argOffs = allocSize;

  // Add the kernel arguments and wait events
  uint singleArgSize = amd::alignUp(
      dev().info().maxParameterSize_ + 64 + dev().settings().numWaitEvents_ * sizeof(uint64_t),
      sizeof(AmdAqlWrap));
  allocSize += singleArgSize * numSlots;

  uint eventsOffs = allocSize;
  // Add the device events
  allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent);

  uint eventMaskOffs = allocSize;
  // Add mask array for events
  allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8;

  uint slotMaskOffs = allocSize;
  // Add mask array for AmdAqlWrap slots
  allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;

  virtualQueue_ = new Memory(dev(), allocSize);
  Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? Resource::Local : Resource::Remote;
  if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
    return false;
  }

  if (GPU_PRINT_CHILD_KERNEL != 0) {
    address ptr = reinterpret_cast<address>(virtualQueue_->map(this, Resource::WriteOnly));
    if (nullptr == ptr) {
      return false;
    }
  }

  uint64_t pattern = 0;
  amd::Coord3D origin(0, 0, 0);
  amd::Coord3D region(virtualQueue_->size(), 0, 0);
  if (!dev().xferMgr().fillBuffer(*virtualQueue_, &pattern, sizeof(pattern), origin, region)) {
    return false;
  }

  uint64_t vaBase = virtualQueue_->vmAddress();
  AmdVQueueHeader header = {};
  // Initialize the virtual queue header
  header.aql_slot_num = numSlots;
  header.event_slot_num = dev().settings().numDeviceEvents_;
  header.event_slot_mask = vaBase + eventMaskOffs;
  header.event_slots = vaBase + eventsOffs;
  header.aql_slot_mask = vaBase + slotMaskOffs;
  header.wait_size = dev().settings().numWaitEvents_;
  header.arg_size = dev().info().maxParameterSize_ + 64;
  header.mask_groups = maskGroups_;

  vqHeader_ = new AmdVQueueHeader;
  if (nullptr == vqHeader_) {
    return false;
  }
  *vqHeader_ = header;

  virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false);

  // Go over all slots and perform initialization
  AmdAqlWrap slot = {};
  size_t offset = sizeof(AmdVQueueHeader);
  for (uint i = 0; i < numSlots; ++i) {
    uint64_t argStart = vaBase + argOffs + i * singleArgSize;
    slot.aql.kernarg_address = reinterpret_cast<void*>(argStart);
    slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
    virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false);
    offset += sizeof(AmdAqlWrap);
  }

  deviceQueueSize_ = deviceQueueSize;

  return true;
}

VirtualGPU::VirtualGPU(Device& device)
    : device::VirtualDevice(device),
      engineID_(MainEngine),
      gpuDevice_(static_cast<Device&>(device)),
      execution_("Virtual GPU execution lock", true),
      printfDbgHSA_(nullptr),
      tsCache_(nullptr),
      dmaFlushMgmt_(device),
      managedBuffer_(*this, device.settings().stagedXferSize_ + 32 * Ki),
      writeBuffer_(device, managedBuffer_, device.settings().stagedXferSize_),
      hwRing_(0),
      readjustTimeGPU_(0),
      lastTS_(nullptr),
      profileTs_(nullptr),
      vqHeader_(nullptr),
      virtualQueue_(nullptr),
      schedParams_(nullptr),
      deviceQueueSize_(0),
      maskGroups_(1),
      hsaQueueMem_(nullptr),
      cmdAllocator_(nullptr) {

  // Note: Virtual GPU device creation must be a thread safe operation
  index_ = gpuDevice_.numOfVgpus_++;
  gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus());
  gpuDevice_.vgpus_[index()] = this;

  queues_[MainEngine] = nullptr;
  queues_[SdmaEngine] = nullptr;
}

bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
                        amd::CommandQueue::Priority priority) {
  device::BlitManager::Setup blitSetup;

  // Resize the list of device resources always,
  // because destructor calls eraseResourceList() even if create() failed
  dev().resizeResoureList(index());

  if (index() >= GPU_MAX_COMMAND_QUEUES) {
    // Cap the maximum number of concurrent Virtual GPUs
    return false;
  }

  // Virtual GPU will have profiling enabled
  state_.profiling_ = profiling;

  Pal::CmdAllocatorCreateInfo createInfo = {};
  createInfo.flags.threadSafe = true;
  // \todo forces PAL to reuse CBs, but requires postamble
  createInfo.flags.autoMemoryReuse = false;
  createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
  createInfo.allocInfo[Pal::CommandDataAlloc].allocSize =
  createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
    VirtualGPU::Queue::MaxCommands * (320 + ((profiling) ? 96 : 0));

  createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
  createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 64 * Ki;
  createInfo.allocInfo[Pal::EmbeddedDataAlloc].suballocSize = 64 * Ki;

  createInfo.allocInfo[Pal::GpuScratchMemAlloc].allocHeap = Pal::GpuHeapInvisible;
  createInfo.allocInfo[Pal::GpuScratchMemAlloc].allocSize = 64 * Ki;
  createInfo.allocInfo[Pal::GpuScratchMemAlloc].suballocSize = 64 * Ki;

  Pal::Result result;
  size_t cmdAllocSize = dev().iDev()->GetCmdAllocatorSize(createInfo, &result);
  if (Pal::Result::Success != result) {
    return false;
  }
  char* addr = new char[cmdAllocSize];
  if (Pal::Result::Success != dev().iDev()->CreateCmdAllocator(createInfo, addr, &cmdAllocator_)) {
    return false;
  }

  const uint firstQueue = (dev().numComputeEngines() > 2) ? 1 : 0;
  uint idx = index() % (dev().numComputeEngines() - firstQueue);
  uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs ? 0 :
    (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
  uint max_cmd_buffers = dev().settings().maxCmdBuffers_;

  if (dev().numComputeEngines()) {
    //! @todo There is a hang with a mix of user and non user queues.
    //! Currently there is no simple way to detect which queue is what.
    //! Disable first for now.

    // hwRing_ should be set 0 if forced to have single scratch buffer
    hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;

    queues_[MainEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue,
                                        cmdAllocator_, rtCUs, priority,
                                        residency_limit, max_cmd_buffers);
    if (nullptr == queues_[MainEngine]) {
      return false;
    }

    // Check if device has SDMA engines
    if (dev().numDMAEngines() != 0 && !PAL_DISABLE_SDMA) {
      uint sdma;
      // If only 1 DMA engine is available then use that one
      if ((dev().numDMAEngines() < 2) || ((idx & 0x1) && !dev().settings().svmFineGrainSystem_)) {
        sdma = 0;
      } else {
        sdma = 1;
      }

      queues_[SdmaEngine] =
          Queue::Create(dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_,
                        amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal,
                        residency_limit, max_cmd_buffers);
      if (nullptr == queues_[SdmaEngine]) {
        return false;
      }
    } else {
        queues_[SdmaEngine] = Queue::Create(dev().iDev(), Pal::QueueTypeCompute,
            idx, cmdAllocator_, rtCUs, amd::CommandQueue::Priority::Normal,
            residency_limit, max_cmd_buffers);
        if (nullptr == queues_[SdmaEngine]) {
            return false;
        }
    }
  } else {
    Unimplemented();
  }

  if (!managedBuffer_.create(Resource::RemoteUSWC)) {
    return false;
  }

  // Diable double copy optimization,
  // since UAV read from nonlocal is fast enough
  blitSetup.disableCopyBufferToImageOpt_ = true;
  if (!allocConstantBuffers()) {
    return false;
  }

  // Create HSAILPrintf class
  printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_);
  if (nullptr == printfDbgHSA_) {
    delete printfDbgHSA_;
    LogError("Could not create PrintfDbgHSA class!");
    return false;
  }

  tsCache_ = new TimeStampCache(*this);
  if (nullptr == tsCache_) {
    LogError("Could not create TimeStamp cache!");
    return false;
  }

  if (!memoryDependency().create(dev().settings().numMemDependencies_)) {
    LogError("Could not create the array of memory objects!");
    return false;
  }

  if (!allocHsaQueueMem()) {
    LogError("Could not create hsaQueueMem object!");
    return false;
  }

  // Check if the app requested a device queue creation
  if (dev().settings().useDeviceQueue_ && (0 != deviceQueueSize) &&
      !createVirtualQueue(deviceQueueSize)) {
    LogError("Could not create a virtual queue!");
    return false;
  }

  // Choose the appropriate class for blit engine
  switch (dev().settings().blitEngine_) {
    default:
    // Fall through ...
    case Settings::BlitEngineHost:
      blitSetup.disableAll();
    // Fall through ...
    case Settings::BlitEngineCAL:
    case Settings::BlitEngineKernel:
      // use host blit for HW debug
      if (dev().settings().enableHwDebug_) {
        blitSetup.disableCopyImageToBuffer_ = true;
        blitSetup.disableCopyBufferToImage_ = true;
      }
      blitMgr_ = new KernelBlitManager(*this, blitSetup);
      break;
  }
  if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
    LogError("Could not create BlitManager!");
    return false;
  }

  // If the developer mode manager is available and it's not a device queue,
  // then enable RGP capturing
  if ((index() != 0) && dev().rgpCaptureMgr() != nullptr) {
    state_.rgpCaptureEnabled_ = true;
  }

  return true;
}

bool VirtualGPU::allocHsaQueueMem() {
  // Allocate a dummy HSA queue
  hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t));
  if ((hsaQueueMem_ == nullptr) || (!hsaQueueMem_->create(Resource::Local))) {
    delete hsaQueueMem_;
    return false;
  }
  amd_queue_t hsa_queue = {};

  // Provide private and local heap addresses
  constexpr uint addressShift = LP64_SWITCH(0, 32);
  hsa_queue.private_segment_aperture_base_hi = static_cast<uint32_t>(
      dev().properties().gpuMemoryProperties.privateApertureBase >> addressShift);
  hsa_queue.group_segment_aperture_base_hi = static_cast<uint32_t>(
      dev().properties().gpuMemoryProperties.sharedApertureBase >> addressShift);

  hsaQueueMem_->writeRawData(*this, 0, sizeof(amd_queue_t), &hsa_queue, true);

  return true;
}

VirtualGPU::~VirtualGPU() {
  // Destroy RGP trace
  if (rgpCaptureEna()) {
    dev().rgpCaptureMgr()->FinishRGPTrace(true);
  }

  // Not safe to remove a queue. So lock the device
  amd::ScopedLock k(dev().lockAsyncOps());
  amd::ScopedLock lock(dev().vgpusAccess());

  while (!freeCbQueue_.empty()) {
    auto cb = freeCbQueue_.front();
    delete cb;
    freeCbQueue_.pop();
  }

  // Destroy printfHSA object
  delete printfDbgHSA_;

  // Destroy TimeStamp cache
  delete tsCache_;

  // Destroy resource list with the constant buffers
  for (uint i = 0; i < constBufs_.size(); ++i) {
    delete constBufs_[i];
  }

  managedBuffer_.release();

  delete vqHeader_;
  delete virtualQueue_;
  delete hsaQueueMem_;

  // Release scratch buffer memory to reduce memory pressure
  //!@note OCLtst uses single device with multiple tests
  //! Release memory only if it's the last command queue.
  //! The first queue is reserved for the transfers on device
  if (static_cast<int>(gpuDevice_.numOfVgpus_ - 1) <= 1) {
    gpuDevice_.destroyScratchBuffers();
  }

  // Destroy BlitManager object
  delete blitMgr_;

  {
    // Destroy queues
    if (nullptr != queues_[MainEngine]) {
      // Make sure the queues are idle
      // It's unclear why PAL could still have a busy queue
      queues_[MainEngine]->iQueue_->WaitIdle();
      delete queues_[MainEngine];
    }

    if (nullptr != queues_[SdmaEngine]) {
      queues_[SdmaEngine]->iQueue_->WaitIdle();
      delete queues_[SdmaEngine];
    }

    if (nullptr != cmdAllocator_) {
      cmdAllocator_->Destroy();
      delete[] reinterpret_cast<char*>(cmdAllocator_);
    }
  }

  {
    // Find all available virtual GPUs and lock them
    // from the execution of commands, since the queue index and resource list
    // Will be adjusted
    for (auto it : dev().vgpus()) {
      if (it != this) {
        it->execution().lock();
      }
    }

    // Not safe to add a resource if create/destroy queue is in process, since
    // the size of the TS array can change
    amd::ScopedLock r(dev().lockResources());
    gpuDevice_.numOfVgpus_--;
    gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index());
    for (uint idx = index(); idx < dev().vgpus().size(); ++idx) {
      dev().vgpus()[idx]->index_--;
    }
    dev().eraseResoureList(index());

    // Find all available virtual GPUs and unlock them
    // for the execution of commands
    for (auto it : dev().vgpus()) {
      it->execution().unlock();
    }
  }
}

void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  // Translate memory references and ensure cache up-to-date
  pal::Memory* memory = dev().getGpuMemory(&vcmd.source());

  size_t offset = 0;
  // Find if virtual address is a CL allocation
  device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset);

  profilingBegin(vcmd, true);

  memory->syncCacheFromHost(*this);
  cl_command_type type = vcmd.type();
  bool result = false;
  amd::Memory* bufferFromImage = nullptr;

  // Force buffer read for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_READ_IMAGE) &&
      (vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImage = createBufferFromImage(vcmd.source());
    if (nullptr == bufferFromImage) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_READ_BUFFER;
      memory = dev().getGpuMemory(bufferFromImage);
    }
  }

  // Process different write commands
  switch (type) {
    case CL_COMMAND_READ_BUFFER: {
      amd::Coord3D origin(vcmd.origin()[0]);
      amd::Coord3D size(vcmd.size()[0]);
      if (nullptr != bufferFromImage) {
        size_t elemSize = vcmd.source().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;
      }
      if (hostMemory != nullptr) {
        // Accelerated transfer without pinning
        amd::Coord3D dstOrigin(offset);
        result = blitMgr().copyBuffer(*memory, *hostMemory, origin, dstOrigin, size,
                                      vcmd.isEntireMemory());
      } else {
        result =
            blitMgr().readBuffer(*memory, vcmd.destination(), origin, size, vcmd.isEntireMemory());
      }
      if (nullptr != bufferFromImage) {
        bufferFromImage->release();
      }
    } break;
    case CL_COMMAND_READ_BUFFER_RECT: {
      amd::BufferRect hostbufferRect;
      amd::Coord3D region(0);
      amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset);
      hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_,
                            vcmd.hostRect().slicePitch_);
      if (hostMemory != nullptr) {
        result = blitMgr().copyBufferRect(*memory, *hostMemory, vcmd.bufRect(), hostbufferRect,
                                          vcmd.size(), vcmd.isEntireMemory());
      } else {
        result = blitMgr().readBufferRect(*memory, vcmd.destination(), vcmd.bufRect(),
                                          vcmd.hostRect(), vcmd.size(), vcmd.isEntireMemory());
      }
    } break;
    case CL_COMMAND_READ_IMAGE:
      if (hostMemory != nullptr) {
        // Accelerated image to buffer transfer without pinning
        amd::Coord3D dstOrigin(offset);
        result =
            blitMgr().copyImageToBuffer(*memory, *hostMemory, vcmd.origin(), dstOrigin, vcmd.size(),
                                        vcmd.isEntireMemory(), vcmd.rowPitch(), vcmd.slicePitch());
      } else {
        result = blitMgr().readImage(*memory, vcmd.destination(), vcmd.origin(), vcmd.size(),
                                     vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory());
      }
      break;
    default:
      LogError("Unsupported type for the read command");
      break;
  }

  if (!result) {
    LogError("submitReadMemory failed!");
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  // Translate memory references and ensure cache up to date
  pal::Memory* memory = dev().getGpuMemory(&vcmd.destination());
  size_t offset = 0;
  // Find if virtual address is a CL allocation
  device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset);

  profilingBegin(vcmd, true);

  bool entire = vcmd.isEntireMemory();

  // Synchronize memory from host if necessary
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = entire;
  memory->syncCacheFromHost(*this, syncFlags);

  cl_command_type type = vcmd.type();
  bool result = false;
  amd::Memory* bufferFromImage = nullptr;

  // Force buffer write for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_WRITE_IMAGE) &&
      (vcmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImage = createBufferFromImage(vcmd.destination());
    if (nullptr == bufferFromImage) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_WRITE_BUFFER;
      memory = dev().getGpuMemory(bufferFromImage);
    }
  }

  // Process different write commands
  switch (type) {
    case CL_COMMAND_WRITE_BUFFER: {
      amd::Coord3D origin(vcmd.origin()[0]);
      amd::Coord3D size(vcmd.size()[0]);
      if (nullptr != bufferFromImage) {
        size_t elemSize = vcmd.destination().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;
      }
      if (hostMemory != nullptr) {
        // Accelerated transfer without pinning
        amd::Coord3D srcOrigin(offset);
        result = blitMgr().copyBuffer(*hostMemory, *memory, srcOrigin, origin, size,
                                      vcmd.isEntireMemory());
      } else {
        result = blitMgr().writeBuffer(vcmd.source(), *memory, origin, size, vcmd.isEntireMemory());
      }
      if (nullptr != bufferFromImage) {
        bufferFromImage->release();
      }
    } break;
    case CL_COMMAND_WRITE_BUFFER_RECT: {
      amd::BufferRect hostbufferRect;
      amd::Coord3D region(0);
      amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset);
      hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_,
                            vcmd.hostRect().slicePitch_);
      if (hostMemory != nullptr) {
        result = blitMgr().copyBufferRect(*hostMemory, *memory, hostbufferRect, vcmd.bufRect(),
                                          vcmd.size(), vcmd.isEntireMemory());
      } else {
        result = blitMgr().writeBufferRect(vcmd.source(), *memory, vcmd.hostRect(), vcmd.bufRect(),
                                           vcmd.size(), vcmd.isEntireMemory());
      }
    } break;
    case CL_COMMAND_WRITE_IMAGE:
      if (hostMemory != nullptr) {
        // Accelerated buffer to image transfer without pinning
        amd::Coord3D srcOrigin(offset);
        result =
            blitMgr().copyBufferToImage(*hostMemory, *memory, srcOrigin, vcmd.origin(), vcmd.size(),
                                        vcmd.isEntireMemory(), vcmd.rowPitch(), vcmd.slicePitch());
      } else {
        result = blitMgr().writeImage(vcmd.source(), *memory, vcmd.origin(), vcmd.size(),
                                      vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory());
      }
      break;
    default:
      LogError("Unsupported type for the write command");
      break;
  }

  if (!result) {
    LogError("submitWriteMemory failed!");
    vcmd.setStatus(CL_INVALID_OPERATION);
  } else {
    // Mark this as the most-recently written cache of the destination
    vcmd.destination().signalWrite(&gpuDevice_);
  }
  profilingEnd(vcmd);
}

bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memory& dstMem,
                            bool entire, const amd::Coord3D& srcOrigin,
                            const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
                            const amd::BufferRect& srcRect, const amd::BufferRect& dstRect) {
  // Translate memory references and ensure cache up-to-date
  pal::Memory* dstMemory = dev().getGpuMemory(&dstMem);
  pal::Memory* srcMemory = dev().getGpuMemory(&srcMem);

  // Synchronize source and destination memory
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = entire;
  dstMemory->syncCacheFromHost(*this, syncFlags);
  srcMemory->syncCacheFromHost(*this);

  amd::Memory* bufferFromImageSrc = nullptr;
  amd::Memory* bufferFromImageDst = nullptr;

  // Force buffer read for IMAGE1D_BUFFER
  if ((srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImageSrc = createBufferFromImage(srcMem);
    if (nullptr == bufferFromImageSrc) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_COPY_BUFFER;
      srcMemory = dev().getGpuMemory(bufferFromImageSrc);
    }
  }
  // Force buffer write for IMAGE1D_BUFFER
  if ((dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImageDst = createBufferFromImage(dstMem);
    if (nullptr == bufferFromImageDst) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_COPY_BUFFER;
      dstMemory = dev().getGpuMemory(bufferFromImageDst);
    }
  }

  bool result = false;

  // Check if HW can be used for memory copy
  switch (type) {
    case CL_COMMAND_MAKE_BUFFERS_RESIDENT_AMD:
    case CL_COMMAND_SVM_MEMCPY:
    case CL_COMMAND_COPY_BUFFER: {
      amd::Coord3D realSrcOrigin(srcOrigin[0]);
      amd::Coord3D realDstOrigin(dstOrigin[0]);
      amd::Coord3D realSize(size.c[0], size.c[1], size.c[2]);

      if (nullptr != bufferFromImageSrc) {
        size_t elemSize = srcMem.asImage()->getImageFormat().getElementSize();
        realSrcOrigin.c[0] *= elemSize;
        if (nullptr != bufferFromImageDst) {
          realDstOrigin.c[0] *= elemSize;
        }
        realSize.c[0] *= elemSize;
      } else if (nullptr != bufferFromImageDst) {
        size_t elemSize = dstMem.asImage()->getImageFormat().getElementSize();
        realDstOrigin.c[0] *= elemSize;
        realSize.c[0] *= elemSize;
      }

      result = blitMgr().copyBuffer(*srcMemory, *dstMemory, realSrcOrigin, realDstOrigin, realSize,
                                    entire);

      if (nullptr != bufferFromImageSrc) {
        bufferFromImageSrc->release();
      }
      if (nullptr != bufferFromImageDst) {
        bufferFromImageDst->release();
      }
    } break;
    case CL_COMMAND_COPY_BUFFER_RECT:
      result = blitMgr().copyBufferRect(*srcMemory, *dstMemory, srcRect, dstRect, size, entire);
      break;
    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
      result =
          blitMgr().copyImageToBuffer(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire);
      break;
    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
      result =
          blitMgr().copyBufferToImage(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire);
      break;
    case CL_COMMAND_COPY_IMAGE:
      result = blitMgr().copyImage(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire);
      break;
    default:
      LogError("Unsupported command type for memory copy!");
      break;
  }

  if (!result) {
    LogError("submitCopyMemory failed!");
    return false;
  } else {
    // Mark this as the most-recently written cache of the destination
    dstMem.signalWrite(&gpuDevice_);
  }
  return true;
}

void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  cl_command_type type = vcmd.type();
  bool entire = vcmd.isEntireMemory();

  if (!copyMemory(type, vcmd.source(), vcmd.destination(), entire, vcmd.srcOrigin(),
                  vcmd.dstOrigin(), vcmd.size(), vcmd.srcRect(), vcmd.dstRect())) {
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);

  cl_command_type type = vcmd.type();
  // no op for FGS supported device
  if (!dev().isFineGrainedSystem()) {
    amd::Coord3D srcOrigin(0, 0, 0);
    amd::Coord3D dstOrigin(0, 0, 0);
    amd::Coord3D size(vcmd.srcSize(), 1, 1);
    amd::BufferRect srcRect;
    amd::BufferRect dstRect;

    bool result = false;
    amd::Memory* srcMem = amd::SvmManager::FindSvmBuffer(vcmd.src());
    amd::Memory* dstMem = amd::SvmManager::FindSvmBuffer(vcmd.dst());

    device::Memory::SyncFlags syncFlags;
    if (nullptr != srcMem) {
      srcMem->commitSvmMemory();
      srcOrigin.c[0] =
          static_cast<const_address>(vcmd.src()) - static_cast<address>(srcMem->getSvmPtr());
      if (!(srcMem->validateRegion(srcOrigin, size))) {
        vcmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
    }
    if (nullptr != dstMem) {
      dstMem->commitSvmMemory();
      dstOrigin.c[0] =
          static_cast<const_address>(vcmd.dst()) - static_cast<address>(dstMem->getSvmPtr());
      if (!(dstMem->validateRegion(dstOrigin, size))) {
        vcmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
    }

    if (nullptr == srcMem && nullptr != dstMem) {  // src not in svm space
      Memory* memory = dev().getGpuMemory(dstMem);
      // Synchronize source and destination memory
      syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size);
      memory->syncCacheFromHost(*this, syncFlags);

      result = blitMgr().writeBuffer(vcmd.src(), *memory, dstOrigin, size,
                                     dstMem->isEntirelyCovered(dstOrigin, size));
      // Mark this as the most-recently written cache of the destination
      dstMem->signalWrite(&gpuDevice_);
    } else if (nullptr != srcMem && nullptr == dstMem) {  // dst not in svm space
      Memory* memory = dev().getGpuMemory(srcMem);
      // Synchronize source and destination memory
      memory->syncCacheFromHost(*this);

      result = blitMgr().readBuffer(*memory, vcmd.dst(), srcOrigin, size,
                                    srcMem->isEntirelyCovered(srcOrigin, size));
    } else if (nullptr != srcMem && nullptr != dstMem) {  // both not in svm space
      bool entire =
          srcMem->isEntirelyCovered(srcOrigin, size) && dstMem->isEntirelyCovered(dstOrigin, size);
      result =
          copyMemory(type, *srcMem, *dstMem, entire, srcOrigin, dstOrigin, size, srcRect, dstRect);
    }

    if (!result) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
  } else {
    // direct memcpy for FGS enabled system
    amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1);
  }
  profilingEnd(vcmd);
}

void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd, true);

  pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());

  // Save map info for unmap operation
  memory->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(),
                      vcmd.isEntireMemory());

  // If we have host memory, use it
  if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) {
    if (!memory->isHostMemDirectAccess()) {
      // Make sure GPU finished operation before
      // synchronization with the backing store
      memory->wait(*this);
    }

    // Target is the backing store, so just ensure that owner is up-to-date
    memory->owner()->cacheWriteBack();

    // Add memory to VA cache, so rutnime can detect direct access to VA
    dev().addVACache(memory);
  } else if (memory->isPersistentDirectMap()) {
    // Nothing to do here
  } else if (memory->mapMemory() != nullptr) {
    // Target is a remote resource, so copy
    assert(memory->mapMemory() != nullptr);
    if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) {
      amd::Coord3D dstOrigin(0, 0, 0);
      if (memory->desc().buffer_) {
        if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(),
                                  vcmd.size(), vcmd.isEntireMemory())) {
          LogError("submitMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
      } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
        Memory* memoryBuf = memory;
        amd::Coord3D origin(vcmd.origin()[0]);
        amd::Coord3D size(vcmd.size()[0]);
        size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;

        amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
        if (nullptr == bufferFromImage) {
          LogError("We should not fail buffer creation from image_buffer!");
        } else {
          memoryBuf = dev().getGpuMemory(bufferFromImage);
        }
        if (!blitMgr().copyBuffer(*memoryBuf, *memory->mapMemory(), origin, dstOrigin, size,
                                  vcmd.isEntireMemory())) {
          LogError("submitMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
        if (nullptr != bufferFromImage) {
          bufferFromImage->release();
        }
      } else {
        // Validate if it's a view for a map of mip level
        if (vcmd.memory().parent() != nullptr) {
          amd::Image* amdImage = vcmd.memory().parent()->asImage();
          if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1)) {
            // Save map write info in the parent object
            dev().getGpuMemory(amdImage)->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(),
                                                      vcmd.mapFlags(), vcmd.isEntireMemory(),
                                                      vcmd.memory().asImage());
          }
        }
        if (!blitMgr().copyImageToBuffer(*memory, *memory->mapMemory(), vcmd.origin(), dstOrigin,
                                         vcmd.size(), vcmd.isEntireMemory())) {
          LogError("submitMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
      }
    }
  } else {
    LogError("Unhandled map!");
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) {
  bool unmapMip = false;
  amd::Image* amdImage;
{
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
  amd::Memory* owner = memory->owner();
  const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
  if (nullptr == writeMapInfo) {
    LogError("Unmap without map call");
    return;
  }
  profilingBegin(vcmd, true);

  // Check if image is a mipmap and assign a saved view
  amdImage = owner->asImage();
  if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
      (writeMapInfo->baseMip_ != nullptr)) {
    // Assign mip level view
    amdImage = writeMapInfo->baseMip_;
    // Clear unmap flags from the parent image
    memory->clearUnmapInfo(vcmd.mapPtr());
    memory = dev().getGpuMemory(amdImage);
    unmapMip = true;
    writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
  }

  // We used host memory
  if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
    if (writeMapInfo->isUnmapWrite()) {
      // Target is the backing store, so sync
      owner->signalWrite(nullptr);
      memory->syncCacheFromHost(*this);
    }
    // Remove memory from VA cache
    dev().removeVACache(memory);
  }
  // data check was added for persistent memory that failed to get aperture
  // and therefore are treated like a remote resource
  else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) {
    memory->unmap(this);
  } else if (memory->mapMemory() != nullptr) {
    if (writeMapInfo->isUnmapWrite()) {
      amd::Coord3D srcOrigin(0, 0, 0);
      // Target is a remote resource, so copy
      assert(memory->mapMemory() != nullptr);
      if (memory->desc().buffer_) {
        if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
                                  writeMapInfo->origin_, writeMapInfo->region_,
                                  writeMapInfo->isEntire())) {
          LogError("submitUnmapMemory() - copy failed");
          vcmd.setStatus(CL_OUT_OF_RESOURCES);
        }
      } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
        Memory* memoryBuf = memory;
        amd::Coord3D origin(writeMapInfo->origin_[0]);
        amd::Coord3D size(writeMapInfo->region_[0]);
        size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;

        amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
        if (nullptr == bufferFromImage) {
          LogError("We should not fail buffer creation from image_buffer!");
        } else {
          memoryBuf = dev().getGpuMemory(bufferFromImage);
        }
        if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size,
                                  writeMapInfo->isEntire())) {
          LogError("submitUnmapMemory() - copy failed");
          vcmd.setStatus(CL_OUT_OF_RESOURCES);
        }
        if (nullptr != bufferFromImage) {
          bufferFromImage->release();
        }
      } else {
        if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin,
                                         writeMapInfo->origin_, writeMapInfo->region_,
                                         writeMapInfo->isEntire())) {
          LogError("submitUnmapMemory() - copy failed");
          vcmd.setStatus(CL_OUT_OF_RESOURCES);
        }
      }
    }
  } else {
    LogError("Unhandled unmap!");
    vcmd.setStatus(CL_INVALID_VALUE);
  }

  // Clear unmap flags
  memory->clearUnmapInfo(vcmd.mapPtr());

  profilingEnd(vcmd);
}
  // Release a view for a mipmap map
  if (unmapMip) {
    // Memory release should be outside of the execution lock,
    // because mapMemory_ isn't marked for a specifc GPU
    amdImage->release();
  }
}

bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern,
                            size_t patternSize, const amd::Coord3D& origin,
                            const amd::Coord3D& size) {
  pal::Memory* memory = dev().getGpuMemory(amdMemory);
  bool entire = amdMemory->isEntirelyCovered(origin, size);

  // Synchronize memory from host if necessary
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = entire;
  memory->syncCacheFromHost(*this, syncFlags);

  bool result = false;
  amd::Memory* bufferFromImage = nullptr;
  float fillValue[4];

  // Force fill buffer for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_FILL_IMAGE) && (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImage = createBufferFromImage(*amdMemory);
    if (nullptr == bufferFromImage) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_FILL_BUFFER;
      memory = dev().getGpuMemory(bufferFromImage);
    }
  }

  // Find the the right fill operation
  switch (type) {
    case CL_COMMAND_FILL_BUFFER:
    case CL_COMMAND_SVM_MEMFILL: {
      amd::Coord3D realOrigin(origin[0]);
      amd::Coord3D realSize(size[0]);
      // Reprogram fill parameters if it's an IMAGE1D_BUFFER object
      if (nullptr != bufferFromImage) {
        size_t elemSize = amdMemory->asImage()->getImageFormat().getElementSize();
        realOrigin.c[0] *= elemSize;
        realSize.c[0] *= elemSize;
        memset(fillValue, 0, sizeof(fillValue));
        amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue);
        pattern = fillValue;
        patternSize = elemSize;
      }
      result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize,
                                    amdMemory->isEntirelyCovered(origin, size));
      if (nullptr != bufferFromImage) {
        bufferFromImage->release();
      }
    } break;
    case CL_COMMAND_FILL_IMAGE:
      result = blitMgr().fillImage(*memory, pattern, origin, size,
                                   amdMemory->isEntirelyCovered(origin, size));
      break;
    default:
      LogError("Unsupported command type for FillMemory!");
      break;
  }

  if (!result) {
    LogError("fillMemory failed!");
    return false;
  }

  // Mark this as the most-recently written cache of the destination
  amdMemory->signalWrite(&gpuDevice_);
  return true;
}

void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd, true);

  if (!fillMemory(vcmd.type(), &vcmd.memory(), vcmd.pattern(), vcmd.patternSize(), vcmd.origin(),
                  vcmd.size())) {
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd, true);

  // no op for FGS supported device
  if (!dev().isFineGrainedSystem()) {
    // Make sure we have memory for the command execution
    pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem());

    memory->saveMapInfo(vcmd.svmPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(),
                        vcmd.isEntireMemory());

    if (memory->mapMemory() != nullptr) {
      if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) {
        assert(memory->desc().buffer_ && "SVM memory can't be an image");
        if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(),
                                  vcmd.size(), vcmd.isEntireMemory())) {
          LogError("submitSVMMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
      }
    } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) {
      if (!memory->isHostMemDirectAccess()) {
        // Make sure GPU finished operation before
        // synchronization with the backing store
        memory->wait(*this);
      }

      // Target is the backing store, so just ensure that owner is up-to-date
      memory->owner()->cacheWriteBack();
    } else {
      LogError("Unhandled svm map!");
    }
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd, true);

  // no op for FGS supported device
  if (!dev().isFineGrainedSystem()) {
    pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem());
    const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.svmPtr());

    if (memory->mapMemory() != nullptr) {
      if (writeMapInfo->isUnmapWrite()) {
        amd::Coord3D srcOrigin(0, 0, 0);
        // Target is a remote resource, so copy
        assert(memory->desc().buffer_ && "SVM memory can't be an image");
        if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
                                  writeMapInfo->origin_, writeMapInfo->region_,
                                  writeMapInfo->isEntire())) {
          LogError("submitSvmUnmapMemory() - copy failed");
          vcmd.setStatus(CL_OUT_OF_RESOURCES);
        }
      }
    } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) {
      if (writeMapInfo->isUnmapWrite()) {
        // Target is the backing store, so sync
        memory->owner()->signalWrite(nullptr);
        memory->syncCacheFromHost(*this);
      }
    }
    memory->clearUnmapInfo(vcmd.svmPtr());
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd, true);

  if (!dev().isFineGrainedSystem()) {
    size_t patternSize = vcmd.patternSize();
    size_t fillSize = patternSize * vcmd.times();
    amd::Memory* dstMemory = amd::SvmManager::FindSvmBuffer(vcmd.dst());
    assert(dstMemory && "No svm Buffer to fill with!");
    size_t offset = reinterpret_cast<uintptr_t>(vcmd.dst()) -
        reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());

    pal::Memory* memory = dev().getGpuMemory(dstMemory);

    amd::Coord3D origin(offset, 0, 0);
    amd::Coord3D size(fillSize, 1, 1);

    assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!");
    // Synchronize memory from host if necessary
    device::Memory::SyncFlags syncFlags;
    syncFlags.skipEntire_ = dstMemory->isEntirelyCovered(origin, size);
    memory->syncCacheFromHost(*this, syncFlags);

    if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(), vcmd.patternSize(), origin, size)) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
    // Mark this as the most-recently written cache of the destination
    dstMemory->signalWrite(&gpuDevice_);
  } else {
    // for FGS capable device, fill CPU memory directly
    amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times());
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd, true);

  for (const auto& it : vcmd.memObjects()) {
    // Find device memory
    pal::Memory* memory = dev().getGpuMemory(it);

    if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) {
      memory->mgpuCacheWriteBack();
    } else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
      // Synchronize memory from host if necessary.
      // The sync function will perform memory migration from
      // another device if necessary
      device::Memory::SyncFlags syncFlags;
      memory->syncCacheFromHost(*this, syncFlags);
    } else {
      LogWarning("Unknown operation for memory migration!");
    }
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) {
  // in-order semantics: previous commands need to be done before we start
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);
  std::vector<void*>& svmPointers = vcmd.svmPointers();
  if (vcmd.pfnFreeFunc() == nullptr) {
    // pointers allocated using clSVMAlloc
    for (cl_uint i = 0; i < svmPointers.size(); ++i) {
      dev().svmFree(svmPointers[i]);
    }
  } else {
    vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), svmPointers.size(),
                       static_cast<void**>(&(svmPointers[0])), vcmd.userData());
  }
  profilingEnd(vcmd);
}

// ================================================================================================
void VirtualGPU::PrintChildren(const HSAILKernel& hsaKernel, VirtualGPU* gpuDefQueue)
{
  AmdAqlWrap* wraps =
      (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
  uint p = 0;
  for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
    if (wraps[i].state != 0) {
      uint j;
      if (p == GPU_PRINT_CHILD_KERNEL) {
        break;
      }
      p++;
      std::stringstream print;
      print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase);
      print << "Slot#: " << i << "\n";
      print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n";
      print << "\tcommand_id: " << wraps[i].command_id << "\n";
      print << "\tchild_counter: " << wraps[i].child_counter << "\n";
      print << "\tcompletion: " << wraps[i].completion << "\n";
      print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n";
      print << "\twait_list: " << wraps[i].wait_list << "\n";
      print << "\twait_num: " << wraps[i].wait_num << "\n";
      uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress();
      size_t* events =
          reinterpret_cast<size_t*>(gpuDefQueue->virtualQueue_->data() + offsEvents);
      for (j = 0; j < wraps[i].wait_num; ++j) {
        uint offs =
            static_cast<uint64_t>(events[j]) - gpuDefQueue->virtualQueue_->vmAddress();
        AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
        print << "Wait Event#: " << j << "\n";
        print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n";
      }
      print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
      print << wraps[i].aql.workgroup_size_y << ", ";
      print << wraps[i].aql.workgroup_size_z << "]\n";
      print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
      print << wraps[i].aql.grid_size_y << ", ";
      print << wraps[i].aql.grid_size_z << "]\n";

      uint64_t* kernels =
        (uint64_t*)(const_cast<Memory*>(hsaKernel.prog().kernelTable())->map(this));
      for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) {
        if (kernels[j] == wraps[i].aql.kernel_object) {
          break;
        }
      }
      const_cast<Memory*>(hsaKernel.prog().kernelTable())->unmap(this);
      HSAILKernel* child = nullptr;
      for (auto it = hsaKernel.prog().kernels().begin();
        it != hsaKernel.prog().kernels().end(); ++it) {
        if (j == static_cast<HSAILKernel*>(it->second)->index()) {
          child = static_cast<HSAILKernel*>(it->second);
        }
      }
      if (child == nullptr) {
        printf("Error: couldn't find child kernel!\n");
        continue;
      }
      const uint64_t kernarg_address =
          static_cast<uint64_t>(reinterpret_cast<uintptr_t>(wraps[i].aql.kernarg_address));
      uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress();
      address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
      print << "Kernel: " << child->name() << "\n";
      for (auto arg : child->arguments()) {
        const char* extraArgName = nullptr;
        switch (arg->type_) {
        case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X:
            extraArgName = "Offset0: ";
            break;
        case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y:
            extraArgName = "Offset1: ";
            break;
        case HSAIL_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z:
            extraArgName = "Offset2: ";
            break;
        case HSAIL_ARGTYPE_HIDDEN_PRINTF_BUFFER:
            extraArgName = "PrintfBuf: ";
            break;
        case HSAIL_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
            extraArgName = "VqueuePtr: ";
            break;
        case HSAIL_ARGTYPE_HIDDEN_COMPLETION_ACTION:
            extraArgName = "AqlWrap: ";
            break;
        case HSAIL_ARGTYPE_HIDDEN_NONE:
            extraArgName = "Unknown: ";
            break;
        default:
            break;
        }
        if (extraArgName) {
            print << "\t" << extraArgName << *(size_t*)argum;
            print << "\n";
            argum += sizeof(size_t);
            continue;
        }
        print << "\t" << arg->name_ << ": ";
        for (int s = arg->size_ - 1; s >= 0; --s) {
            print.width(2);
            print.fill('0');
            print << (uint32_t)(argum[s]);
        }
        argum += arg->size_;
        print << "\n";
      }
      printf("%s", print.str().c_str());
    }
  }
}

// ================================================================================================
bool VirtualGPU::PreDeviceEnqueue(
    const amd::Kernel& kernel,
    const HSAILKernel& hsaKernel,
    VirtualGPU** gpuDefQueue,
    uint64_t* vmDefQueue)
{
  amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
  if (nullptr == defQueue) {
    LogError("Default device queue wasn't allocated");
    return false;
  }
  else {
    if (dev().settings().useDeviceQueue_) {
      *gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
      if ((*gpuDefQueue)->hwRing() == hwRing()) {
        LogError("Can't submit the child kernels to the same HW ring as the host queue!");
        return false;
      }
    }
    else {
      createVirtualQueue(defQueue->size());
      *gpuDefQueue = this;
    }
  }
  *vmDefQueue = (*gpuDefQueue)->virtualQueue_->vmAddress();

  (*gpuDefQueue)->writeVQueueHeader(*this, hsaKernel.prog().kernelTable()->vmAddress());
  // Acquire USWC memory for the scheduler parameters
  (*gpuDefQueue)->schedParams_ = &xferWrite().Acquire(sizeof(SchedulerParam));

  // Add memory handles before the actual dispatch
  addVmMemory((*gpuDefQueue)->virtualQueue_);
  addVmMemory((*gpuDefQueue)->schedParams_);
  addVmMemory(hsaKernel.prog().kernelTable());
  return true;
}

// ================================================================================================
void VirtualGPU::PostDeviceEnqueue(
    const amd::Kernel& kernel,
    const HSAILKernel& hsaKernel,
    VirtualGPU* gpuDefQueue,
    uint64_t vmDefQueue,
    uint64_t vmParentWrap,
    GpuEvent* gpuEvent)
{
  uint32_t id  = gpuEvent->id_;
  amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());

  // Make sure exculsive access to the device queue
  amd::ScopedLock(defQueue->lock());
  Memory& schedParams = xferWrite().Acquire(sizeof(SchedulerParam));

  if (GPU_PRINT_CHILD_KERNEL != 0) {
    waitForEvent(gpuEvent);
    PrintChildren(hsaKernel, gpuDefQueue);
  }

  if (!dev().settings().useDeviceQueue_) {
    // Add the termination handshake to the host queue
    eventBegin(MainEngine);
    iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
      vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0,
      dev().settings().useDeviceQueue_);
    eventEnd(MainEngine, *gpuEvent);
  }

  // Get the global loop start before the scheduler
  Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
  static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr())
    .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
      gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
  const static bool FlushL2 = true;
  gpuDefQueue->addBarrier(FlushL2);

  // Get the address of PM4 template and add write it to params
  //! @note DMA flush must not occur between patch and the scheduler
  Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
  // Program parameters for the scheduler
  SchedulerParam* param = reinterpret_cast<SchedulerParam*>(
    gpuDefQueue->schedParams_->data());
  param->signal = 1;
  // Scale clock to 1024 to avoid 64 bit div in the scheduler
  param->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
  param->hw_queue = patchStart + sizeof(uint32_t) /* Rewind packet*/;
  param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
  param->releaseHostCP = 0;
  param->parentAQL = vmParentWrap;
  param->dedicatedQueue = dev().settings().useDeviceQueue_;
  param->useATC = dev().settings().svmFineGrainSystem_;

  // Fill the scratch buffer information
  if (hsaKernel.prog().maxScratchRegs() > 0) {
    pal::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_;
    param->scratchSize = scratchBuf->size();
    param->scratch = scratchBuf->vmAddress();
    param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
    param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
    addVmMemory(scratchBuf);
  }
  else {
    param->numMaxWaves = 0;
    param->scratchSize = 0;
    param->scratch = 0;
    param->scratchOffset = 0;
  }

  // Add all kernels in the program to the mem list.
  //! \note Runtime doesn't know which one will be called
  hsaKernel.prog().fillResListWithKernels(*this);

  Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress();
  gpuDefQueue->eventBegin(MainEngine);
  gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
    signalAddr, loopStart,
    gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
  // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
  // Thus TS command for profiling has to follow in the next CB.
  constexpr bool ForceSubmitFirst = true;
  gpuDefQueue->eventEnd(MainEngine, *gpuEvent, ForceSubmitFirst);

  if (dev().settings().useDeviceQueue_) {
    // Add the termination handshake to the host queue
    eventBegin(MainEngine);
    iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
      vmParentWrap + offsetof(AmdAqlWrap, child_counter),
      signalAddr, dev().settings().useDeviceQueue_);
    if (id != gpuEvent->id_) {
        LogError("Something is wrong. ID mismatch!\n");
    }
    eventEnd(MainEngine, *gpuEvent);
  }

  xferWrite().Release(*gpuDefQueue->schedParams_);
  gpuDefQueue->schedParams_ = nullptr;
}

// ================================================================================================
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  // Submit kernel to HW
  if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event())) {
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(vcmd);
}

// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
                                      const_address parameters, bool nativeMem,
                                      amd::Event* enqueueEvent)
{
  // If RGP capturing is enabled, then start SQTT trace
  if (rgpCaptureEna()) {
    dev().rgpCaptureMgr()->PreDispatch(this);
  }

  // Get the HSA kernel object
  const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));

  bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true : false;
  if (printfEnabled && !printfDbgHSA().init(*this, printfEnabled)) {
    LogError("Printf debug buffer initialization failed!");
    return false;
  }

  // Add ISA memory object to the resource tracking list
  AddKernel(kernel);

  uint64_t vmDefQueue = 0;
  VirtualGPU* gpuDefQueue = nullptr;
  if (hsaKernel.dynamicParallelism()) {
    // Initialize GPU device queue for execution (gpuDefQueue)
    if (!PreDeviceEnqueue(kernel, hsaKernel, &gpuDefQueue, &vmDefQueue)) {
      return false;
    }
  }

  // Check memory dependency and SVM objects
  if (!processMemObjectsHSA(kernel, parameters, nativeMem)) {
      LogError("Wrong memory objects!");
      return false;
  }

  bool needFlush = false;
  // Avoid flushing when PerfCounter is enabled, to make sure PerfStart/dispatch/PerfEnd
  // are in the same cmdBuffer
  if (!state_.perfCounterEnabled_) {
    dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize());
    if (dmaFlushMgmt().dispatchSplitSize() != 0) {
      needFlush = true;
    }
  }

  size_t newOffset[3] = {0, 0, 0};
  size_t newGlobalSize[3] = {0, 0, 0};

  int dim = -1;
  int iteration = 1;
  size_t globalStep = 0;
  for (uint i = 0; i < sizes.dimensions(); i++) {
    newGlobalSize[i] = sizes.global()[i];
    newOffset[i] = sizes.offset()[i];
  }
  // Check if it is blit kernel. If it is, then check if split is needed.
  if (hsaKernel.isInternalKernel()) {
    // Calculate new group size for each submission
    for (uint i = 0; i < sizes.dimensions(); i++) {
      if (sizes.global()[i] > static_cast<size_t>(0xffffffff)) {
        dim = i;
        iteration = sizes.global()[i] / 0xC0000000 + ((sizes.global()[i] % 0xC0000000) ? 1 : 0);
        globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration * sizes.local()[dim];
        break;
      }
    }
  }

  for (int iter = 0; iter < iteration; ++iter) {
    GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
    uint32_t id = gpuEvent.id_;
    // Reset global size for dimension dim if split is needed
    if (dim != -1) {
      newOffset[dim] = sizes.offset()[dim] + globalStep * iter;
      if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && (iter != (iteration - 1))) {
        newGlobalSize[dim] = globalStep;
      } else {
        newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim];
      }
    }

    amd::NDRangeContainer tmpSizes(sizes.dimensions(), &newOffset[0], &newGlobalSize[0],
                                   &(const_cast<amd::NDRangeContainer&>(sizes).local()[0]));

    if (iter > 0) {
      // Updates the timestamp values, since a CB flush could occur.
      // Resource processing was  moved from loadArguments() and
      // an extra loop is required.
      const amd::KernelParameters& kernelParams = kernel.parameters();
      amd::Memory* const* memories =
        reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
      for (uint32_t i = 0; i < kernel.signature().numMemories(); ++i) {
        if (nativeMem) {
          Memory* gpuMem = reinterpret_cast<Memory* const*>(memories)[i];
          if (gpuMem != nullptr) {
            gpuMem->setBusy(*this, gpuEvent);
          }
        }
        else {
          amd::Memory* mem = memories[i];
          if (mem != nullptr) {
            dev().getGpuMemory(mem)->setBusy(*this, gpuEvent);
          }
        }
      }
    }

    uint64_t vmParentWrap = 0;
    // Program the kernel arguments for the GPU execution
    hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(
      *this, kernel, tmpSizes, parameters, nativeMem, vmDefQueue, &vmParentWrap);
    if (nullptr == aqlPkt) {
      LogError("Couldn't load kernel arguments");
      return false;
    }

    const Device::ScratchBuffer* scratch = nullptr;
    // Check if the device allocated more registers than the old setup
    if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
      scratch = dev().scratch(hwRing());
      addVmMemory(scratch->memObj_);
    }

    // Set up the dispatch information
    Pal::DispatchAqlParams dispatchParam = {};
    dispatchParam.pAqlPacket = aqlPkt;
    if (nullptr != scratch) {
      dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
      dispatchParam.scratchSize = scratch->size_;
      dispatchParam.scratchOffset = scratch->offset_;
    }
    dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode();
    dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
    dispatchParam.wavesPerSh = (enqueueEvent != nullptr) ?
      enqueueEvent->profilingInfo().waves_ : 0;
    dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
    // Run AQL dispatch in HW
    eventBegin(MainEngine);
    iCmd()->CmdDispatchAql(dispatchParam);
    // Note: This a workaround for incorrect results reported with release_mem packet,
    // when the packet can be processed later after this dispatch and including extra time
    if (profiling() || state_.profileEnabled_) {
      addBarrier();
    }
    if (id != gpuEvent.id_) {
      LogError("Something is wrong. ID mismatch!\n");
    }
    eventEnd(MainEngine, gpuEvent);

    // Execute scheduler for device enqueue
    if (hsaKernel.dynamicParallelism()) {
      PostDeviceEnqueue(kernel, hsaKernel, gpuDefQueue, vmDefQueue, vmParentWrap, &gpuEvent);
    }

    // Update the global GPU event
    setGpuEvent(gpuEvent, needFlush);

    if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
      LogError("Couldn't read printf data from the buffer!\n");
      return false;
    }
  }
  // Perform post dispatch logic for RGP traces
  if (rgpCaptureEna()) {
    dev().rgpCaptureMgr()->PostDispatch(this);
  }

  return true;
}

void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  Unimplemented();  //!< @todo: Unimplemented
}

void VirtualGPU::submitMarker(amd::Marker& vcmd) {
  //!@note runtime doesn't need to lock this command on execution

  if (vcmd.waitingEvent() != nullptr) {
    bool foundEvent = false;

    // Loop through all outstanding command batches
    while (!cbQueue_.empty()) {
      auto cb = cbQueue_.front();
      // Wait for completion
      foundEvent = awaitCompletion(cb, vcmd.waitingEvent());
      // Release a command batch
      freeCbQueue_.push(cb);
      // Remove command batch from the list
      cbQueue_.pop();
      // Early exit if we found a command
      if (foundEvent) break;
    }

    // Event should be in the current command batch
    if (!foundEvent) {
      state_.forceWait_ = true;
    }
    // If we don't have any more batches, then assume GPU is idle
    else if (cbQueue_.empty()) {
      dmaFlushMgmt_.resetCbWorkload(dev());
    }
  }
}

void VirtualGPU::releaseMemory(GpuMemoryReference* mem) {
  queues_[MainEngine]->removeCmdMemRef(mem);
  queues_[SdmaEngine]->removeCmdMemRef(mem);
}

void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  const amd::PerfCounterCommand::PerfCounterList counters = vcmd.getCounters();

  PalCounterReference* palRef = PalCounterReference::Create(*this);
  if (palRef == nullptr) {
    LogError("We failed to allocate memory for the GPU perfcounter");
    vcmd.setStatus(CL_INVALID_OPERATION);
    return;
  }

  bool newExperiment = false;

  for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
    amd::PerfCounter* amdCounter = static_cast<amd::PerfCounter*>(counters[i]);
    const PerfCounter* counter = static_cast<const PerfCounter*>(amdCounter->getDeviceCounter());

    // Make sure we have a valid gpu performance counter
    if (nullptr == counter) {
      amd::PerfCounter::Properties prop = amdCounter->properties();
      PerfCounter* gpuCounter = new PerfCounter(
          gpuDevice_, palRef, prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX],
          prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX], prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]);
      if (nullptr == gpuCounter) {
        LogError("We failed to allocate memory for the GPU perfcounter");
        vcmd.setStatus(CL_INVALID_OPERATION);
        return;
      } else if (gpuCounter->create()) {
        newExperiment = true;
      } else {
        LogPrintfError(
            "We failed to allocate a perfcounter in PAL.\
                    Block: %d, counter: #d, event: %d",
            gpuCounter->info()->blockIndex_, gpuCounter->info()->counterIndex_,
            gpuCounter->info()->eventIndex_);
      }
      amdCounter->setDeviceCounter(gpuCounter);
    }
  }

  if (newExperiment) {
    palRef->finalize();
  }

  palRef->release();

  Pal::IPerfExperiment* palPerf = nullptr;
  for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
    amd::PerfCounter* amdCounter = static_cast<amd::PerfCounter*>(counters[i]);
    const PerfCounter* counter = static_cast<const PerfCounter*>(amdCounter->getDeviceCounter());

    if (palPerf != counter->iPerf()) {
      palPerf = counter->iPerf();
      // Find the state and sends the command to PAL
      if (vcmd.getState() == amd::PerfCounterCommand::Begin) {
        Pal::SetClockModeInput input;
        Pal::SetClockModeOutput output = {};
        state_.perfCounterEnabled_ = true;
        input.clockMode = Pal::DeviceClockMode::Profiling;
        dev().iDev()->SetClockMode(input, &output);
        GpuEvent event;
        eventBegin(MainEngine);
        iCmd()->CmdBeginPerfExperiment(palPerf);
        eventEnd(MainEngine, event);
        setGpuEvent(event);
      } else if (vcmd.getState() == amd::PerfCounterCommand::End) {
        GpuEvent event;
        eventBegin(MainEngine);
        iCmd()->CmdEndPerfExperiment(palPerf);
        eventEnd(MainEngine, event);
        setGpuEvent(event);
        state_.perfCounterEnabled_ = false;
      } else {
        LogError("Unsupported performance counter state");
        vcmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
    }
  }
}

void VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  switch (cmd.type()) {
    case CL_COMMAND_THREAD_TRACE_MEM: {
      amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace();
      ThreadTrace* threadTrace = static_cast<ThreadTrace*>(amdThreadTrace->getDeviceThreadTrace());

      if (threadTrace == nullptr) {
        PalThreadTraceReference* palRef = PalThreadTraceReference::Create(*this);
        if (palRef == nullptr) {
          LogError("Failure in memory allocation for the GPU threadtrace");
          cmd.setStatus(CL_INVALID_OPERATION);
          return;
        }

        size_t numSe = amdThreadTrace->deviceSeNumThreadTrace();

        ThreadTrace* gpuThreadTrace = new ThreadTrace(gpuDevice_, palRef, cmd.getMemList(), numSe);
        if (nullptr == gpuThreadTrace) {
          LogError("Failure in memory allocation for the GPU threadtrace");
          cmd.setStatus(CL_INVALID_OPERATION);
          return;
        }

        if (gpuThreadTrace->create()) {
          amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace);
        } else {
          LogError("Failure in memory allocation for the GPU threadtrace");
          delete gpuThreadTrace;
          cmd.setStatus(CL_INVALID_OPERATION);
          return;
        }

        palRef->finalize();
        palRef->release();
      }

      break;
    }
    default:
      LogError("Unsupported command type for ThreadTraceMemObjects!");
      break;
  }

  profilingEnd(cmd);
}

void VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  switch (cmd.type()) {
    case CL_COMMAND_THREAD_TRACE: {
      amd::ThreadTrace* amdThreadTrace = static_cast<amd::ThreadTrace*>(&cmd.getThreadTrace());
      ThreadTrace* threadTrace = static_cast<ThreadTrace*>(amdThreadTrace->getDeviceThreadTrace());

      // gpu thread trace object had to be generated prior to begin/end/pause/resume due
      // to ThreadTraceMemObjectsCommand execution
      if (threadTrace == nullptr) {
        return;
      } else {
        Pal::IPerfExperiment* palPerf = threadTrace->iPerf();
        if (cmd.getState() == amd::ThreadTraceCommand::Begin) {
          amd::ThreadTrace::ThreadTraceConfig* traceCfg =
              static_cast<amd::ThreadTrace::ThreadTraceConfig*>(cmd.threadTraceConfig());
          iCmd()->CmdBeginPerfExperiment(palPerf);
        } else if (cmd.getState() == amd::ThreadTraceCommand::End) {
          GpuEvent event;
          eventBegin(MainEngine);
          iCmd()->CmdEndPerfExperiment(palPerf);
          threadTrace->populateUserMemory();
          eventEnd(MainEngine, event);
          setGpuEvent(event);
        } else if (cmd.getState() == amd::ThreadTraceCommand::Pause) {
          // There's no Pause from the PerfExperiment interface
        } else if (cmd.getState() == amd::ThreadTraceCommand::Resume) {
          // There's no Resume from the PerfExperiment interface
        }
      }
      break;
    }
    default:
      LogError("Unsupported command type for ThreadTrace!");
      break;
  }

  profilingEnd(cmd);
}

void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  for (const auto& it : vcmd.getMemList()) {
    // amd::Memory object should never be nullptr
    assert(it && "Memory object for interop is nullptr");
    pal::Memory* memory = dev().getGpuMemory(it);

    // If resource is a shared copy of original resource, then
    // runtime needs to copy data from original resource
    it->getInteropObj()->copyOrigToShared();
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  for (const auto& it : vcmd.getMemList()) {
    // amd::Memory object should never be nullptr
    assert(it && "Memory object for interop is nullptr");
    pal::Memory* memory = dev().getGpuMemory(it);

    // If resource is a shared copy of original resource, then
    // runtime needs to copy data back to original resource
    it->getInteropObj()->copySharedToOrig();
  }

  profilingEnd(vcmd);
}

void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);
  pal::Memory* pGpuMemory = dev().getGpuMemory(&vcmd.memory());

  GpuEvent gpuEvent;
  uint32_t value = vcmd.markerValue();

  if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) {
    eventBegin(MainEngine);
    addVmMemory(pGpuMemory);

    iCmd()->CmdWaitBusAddressableMemoryMarker(*(pGpuMemory->iMem()), value, 0xFFFFFFFF,
                                              Pal::CompareFunc::GreaterEqual);
    eventEnd(MainEngine, gpuEvent);

  } else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) {

    EngineType activeEngineID = engineID_;
    engineID_ = static_cast<EngineType>(pGpuMemory->getGpuEvent(*this)->engineId_);

    // Make sure GPU finished operation and data reached memory before the marker write
    static constexpr bool FlushL2 = true;
    addBarrier(FlushL2);
    // Workarounds: We had systems where an extra delay was necessary.
    {
        // Flush CB associated with the DGMA buffer
        isDone(pGpuMemory->getGpuEvent(*this));
    }

    eventBegin(engineID_);
    queues_[engineID_]->addCmdMemRef(pGpuMemory->memRef());

    queues_[engineID_]->iCmd()->
#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION < 396)
        CmdUpdateBusAddressableMemoryMarker(*(pGpuMemory->iMem()), value);
#else
        CmdUpdateBusAddressableMemoryMarker(*(pGpuMemory->iMem()), vcmd.markerOffset(), value);
#endif
    eventEnd(engineID_, gpuEvent);

    // Restore the original engine
    engineID_ = activeEngineID;
  }

  // Update the global GPU event
  setGpuEvent(gpuEvent);

  profilingEnd(vcmd);
}

void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd) {
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);

  std::vector<amd::Memory*> memObjects = vcmd.memObjects();
  uint32_t numObjects = memObjects.size();
  Pal::GpuMemoryRef* pGpuMemRef = new Pal::GpuMemoryRef[numObjects];
  Pal::IGpuMemory** pGpuMems = new Pal::IGpuMemory*[numObjects];

  for (uint i = 0; i < numObjects; i++) {
    pal::Memory* pGpuMemory = dev().getGpuMemory(memObjects[i]);
    pGpuMemory->syncCacheFromHost(*this);

    pGpuMemRef[i].pGpuMemory = pGpuMemory->iMem();
    pGpuMems[i] = pGpuMemory->iMem();
  }

  dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_, Pal::GpuMemoryRefCantTrim);
  dev().iDev()->InitBusAddressableGpuMemory(queues_[MainEngine]->iQueue_, numObjects, pGpuMems);
  if (numObjects != 0) {
      dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_);
  }

  for (uint i = 0; i < numObjects; i++) {
    vcmd.busAddress()[i].surface_bus_address = pGpuMems[i]->Desc().surfaceBusAddr;
    vcmd.busAddress()[i].marker_bus_address = pGpuMems[i]->Desc().markerBusAddr;
  }
  profilingEnd(vcmd);
}


bool VirtualGPU::awaitCompletion(CommandBatch* cb, const amd::Event* waitingEvent) {
  bool found = false;
  amd::Command* current;
  amd::Command* head = cb->head_;

  // Make sure that profiling is enabled
  if (state_.profileEnabled_) {
    return profilingCollectResults(cb, waitingEvent);
  }
  // Mark the first command in the batch as running
  if (head != nullptr) {
    head->setStatus(CL_RUNNING);
  } else {
    return found;
  }

  // Wait for the last known GPU event
  waitEventLock(cb);

  while (nullptr != head) {
    current = head->getNext();
    if (head->status() == CL_SUBMITTED) {
      head->setStatus(CL_RUNNING);
      head->setStatus(CL_COMPLETE);
    } else if (head->status() == CL_RUNNING) {
      head->setStatus(CL_COMPLETE);
    } else if ((head->status() != CL_COMPLETE) && (current != nullptr)) {
      LogPrintfError("Unexpected command status - %d!", head->status());
    }

    // Check if it's a waiting command
    if (head == waitingEvent) {
      found = true;
    }

    head->release();
    head = current;
  }

  return found;
}

void VirtualGPU::flush(amd::Command* list, bool wait) {
  CommandBatch* cb = nullptr;
  bool gpuCommand = false;

  for (uint i = 0; i < AllEngines; ++i) {
    if (events_[i].isValid()) {
      gpuCommand = true;
    }
  }

  // If the batch doesn't have any GPU command and the list is empty
  if (!gpuCommand && cbQueue_.empty()) {
    state_.forceWait_ = true;
  }

  // Insert the current batch into a list
  if (nullptr != list) {
    if (!freeCbQueue_.empty()) {
      cb = freeCbQueue_.front();
    }

    if (nullptr == cb) {
      cb = new CommandBatch(list, events_, lastTS_);
    } else {
      freeCbQueue_.pop();
      cb->init(list, events_, lastTS_);
    }
  }

  {
    //! @todo: Check if really need a lock
    amd::ScopedLock lock(execution());
    for (uint i = 0; i < AllEngines; ++i) {
      flushDMA(i);
      // Reset event so we won't try to wait again,
      // if runtime didn't submit any commands
      //! @note: it's safe to invalidate events, since
      //! we already saved them with the batch creation step above
      events_[i].invalidate();
    }
  }

  // Mark last TS as nullptr, so runtime won't process empty batches with the old TS
  lastTS_ = nullptr;
  if (nullptr != cb) {
    cbQueue_.push(cb);
  }

  wait |= state_.forceWait_;
  // Loop through all outstanding command batches
  while (!cbQueue_.empty()) {
    cb = cbQueue_.front();
    // Check if command batch finished without a wait
    bool finished = true;
    for (uint i = 0; i < AllEngines; ++i) {
      finished &= isDone(&cb->events_[i]);
    }
    if (finished || wait) {
      // Wait for completion
      awaitCompletion(cb);
      // Release a command batch
      freeCbQueue_.push(cb);
      // Remove command batch from the list
      cbQueue_.pop();
    } else {
      // Early exit if no finished
      break;
    }
  }
  state_.forceWait_ = false;
}

void VirtualGPU::enableSyncedBlit() const { return blitMgr_->enableSynchronization(); }

void VirtualGPU::setGpuEvent(GpuEvent gpuEvent, bool flush) {
  events_[engineID_] = gpuEvent;

  // Flush current DMA buffer if requested
  if (flush) {
    flushDMA(engineID_);
  }
}

void VirtualGPU::flushDMA(uint engineID) {
  if (engineID == MainEngine) {
    // Clear memory dependency state, since runtime flushes compute
    // memoryDependency().clear();
    //!@todo Keep memory dependency alive even if we flush DMA,
    //! since only L2 cache is flushed in KMD frame,
    //! but L1 still has to be invalidated.
  }

  isDone(&events_[engineID]);
}

bool VirtualGPU::waitAllEngines(CommandBatch* cb) {
  uint i;
  GpuEvent* events;  //!< GPU events for the batch

  // If command batch is nullptr then wait for the current
  if (nullptr == cb) {
    events = events_;
  } else {
    events = cb->events_;
  }

  bool earlyDone = true;
  // The first loop is to flush all engines and/or check if
  // engines are idle already
  for (i = 0; i < AllEngines; ++i) {
    earlyDone &= isDone(&events[i]);
  }

  // Rlease all pinned memory
  releasePinnedMem();

  // The second loop is to wait all engines
  for (i = 0; i < AllEngines; ++i) {
    waitForEvent(&events[i]);
  }

  return earlyDone;
}

void VirtualGPU::waitEventLock(CommandBatch* cb) {
  bool earlyDone = false;
  {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());
    earlyDone = waitAllEngines(cb);
  }

  // Free resource cache if we have too many entries
  //! \note we do it here, when all engines are idle,
  // because Vista/Win7 idles GPU on a resource destruction
  static const size_t MinCacheEntries = 4096;
  dev().resourceCache().free(MinCacheEntries);

  // Find the timestamp object of the last command in the batch
  if (cb->lastTS_ != nullptr) {
    // If earlyDone is TRUE, then CPU didn't wait for GPU.
    // Thus the sync point between CPU and GPU is unclear and runtime
    // will use an older adjustment value to maintain the same timeline
    if (!earlyDone ||
        //! \note Workaround for APU(s).
        //! GPU-CPU timelines may go off too much, thus always
        //! force calibration with the last batch in the list
        (cbQueue_.size() <= 1) || (readjustTimeGPU_ == 0)) {
      uint64_t startTimeStampGPU = 0;
      uint64_t endTimeStampGPU = 0;

      // Get the timestamp value of the last command in the batch
      cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);

      uint64_t endTimeStampCPU = amd::Os::timeNanos();
      // Make sure the command batch has a valid GPU TS
      if (!GPU_RAW_TIMESTAMP) {
        // Adjust the base time by the execution time
        readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
      }
    }
  }
}

bool VirtualGPU::allocConstantBuffers() {
  // Allocate constant buffers.
  // Use double size, reported to the app to account for internal arguments
  const uint32_t MinCbSize = 2 * dev().info().maxParameterSize_;
  uint i;

  // Create/reallocate constant buffer resources
  for (i = 0; i < MaxConstBuffersArguments; ++i) {
    ConstantBuffer* constBuf = new ConstantBuffer(managedBuffer_, MinCbSize);

    if ((constBuf != nullptr) && constBuf->Create()) {
      addConstBuffer(constBuf);
    } else {
      // We failed to create a constant buffer
      delete constBuf;
      return false;
    }
  }

  return true;
}

void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
  // Is profiling enabled?
  if (command.profilingInfo().enabled_) {
    // Allocate a timestamp object from the cache
    TimeStamp* ts = tsCache_->allocTimeStamp();
    if (nullptr == ts) {
      return;
    }
    // Save the TimeStamp object in the current OCL event
    command.setData(ts);
    profileTs_ = ts;
    state_.profileEnabled_ = true;
  }
}

void VirtualGPU::profilingEnd(amd::Command& command) {
  // Get the TimeStamp object associated witht the current command
  TimeStamp* ts = reinterpret_cast<TimeStamp*>(command.data());
  if (ts != nullptr) {
    // Check if the command actually did any GPU submission
    if (ts->isValid()) {
      lastTS_ = ts;
    } else {
      // Destroy the TimeStamp object
      tsCache_->freeTimeStamp(ts);
      command.setData(nullptr);
    }
  }
}

bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent) {
  bool found = false;
  amd::Command* current;
  amd::Command* first = cb->head_;

  // If the command list is, empty then exit
  if (nullptr == first) {
    return found;
  }

  // Wait for the last known GPU events on all engines
  waitEventLock(cb);

  // Find the CPU base time of the entire command batch execution
  uint64_t endTimeStamp = amd::Os::timeNanos();
  uint64_t startTimeStamp = endTimeStamp;

  // First step, walk the command list to find the first valid command
  //! \note The batch may have empty markers at the beginning.
  //! So the start/end of the empty commands is equal to
  //! the start of the first valid command in the batch.
  first = cb->head_;
  while (nullptr != first) {
    // Get the TimeStamp object associated witht the current command
    TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());

    if (ts != nullptr) {
      ts->value(&startTimeStamp, &endTimeStamp);
      endTimeStamp -= readjustTimeGPU_;
      startTimeStamp -= readjustTimeGPU_;
      // Assign to endTimeStamp the start of the first valid command
      endTimeStamp = startTimeStamp;
      break;
    }
    first = first->getNext();
  }

  // Second step, walk the command list to construct the time line
  first = cb->head_;
  while (nullptr != first) {
    // Get the TimeStamp object associated witht the current command
    TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());

    current = first->getNext();

    if (ts != nullptr) {
      ts->value(&startTimeStamp, &endTimeStamp);
      endTimeStamp -= readjustTimeGPU_;
      startTimeStamp -= readjustTimeGPU_;
      // Destroy the TimeStamp object
      tsCache_->freeTimeStamp(ts);
      first->setData(nullptr);
    } else {
      // For empty commands start/end is equal to
      // the end of the last valid command
      startTimeStamp = endTimeStamp;
    }

    // Update the command status with the proper timestamps
    if (first->status() == CL_SUBMITTED) {
      first->setStatus(CL_RUNNING, startTimeStamp);
      first->setStatus(CL_COMPLETE, endTimeStamp);
    } else if (first->status() == CL_RUNNING) {
      first->setStatus(CL_COMPLETE, endTimeStamp);
    } else if ((first->status() != CL_COMPLETE) && (current != nullptr)) {
      LogPrintfError("Unexpected command status - %d!", first->status());
    }

    // Do we wait this event?
    if (first == waitingEvent) {
      found = true;
    }

    first->release();
    first = current;
  }

  return found;
}

void VirtualGPU::addDoppRef(const Memory* memory, bool lastDoppCmd, bool pfpaDoppCmd) {
  queues_[MainEngine]->addCmdDoppRef(memory->iMem(), lastDoppCmd, pfpaDoppCmd);
}

void VirtualGPU::profileEvent(EngineType engine, bool type) const {
  if (nullptr == profileTs_) {
    return;
  }
  if (type) {
    profileTs_->begin((engine == SdmaEngine) ? true : false);
  } else {
    profileTs_->end((engine == SdmaEngine) ? true : false);
  }
}

bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params,
                                      bool nativeMem) {
  const amd::KernelParameters& kernelParams = kernel.parameters();

  // Mark the tracker with a new kernel,
  // so we can avoid checks of the aliased objects
  memoryDependency().newKernel();

  size_t count = kernelParams.getNumberOfSvmPtr();
  if (count > 0) {
    bool supportFineGrainedSystem = dev().isFineGrainedSystem(true);
    FGSStatus status = kernelParams.getSvmSystemPointersSupport();
    switch (status) {
      case FGS_YES:
        if (!supportFineGrainedSystem) {
          return false;
        }
        break;
      case FGS_NO:
        supportFineGrainedSystem = false;
        break;
      case FGS_DEFAULT:
      default:
        break;
    }
    // get svm non arugment information
    void* const* svmPtrArray = reinterpret_cast<void* const*>(
      params + kernelParams.getExecInfoOffset());
    for (size_t i = 0; i < count; i++) {
      amd::Memory* memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
      if (nullptr == memory) {
        if (!supportFineGrainedSystem) {
          return false;
        } else {
          addBarrier();
          // Clear memory dependency state
          const static bool All = true;
          memoryDependency().clear(!All);
          continue;
        }
      } else {
        Memory* gpuMemory = dev().getGpuMemory(memory);
        if (nullptr != gpuMemory) {
          // Synchronize data with other memory instances if necessary
          gpuMemory->syncCacheFromHost(*this);

          const static bool IsReadOnly = false;
          // Validate SVM passed in the non argument list
          memoryDependency().validate(*this, gpuMemory, IsReadOnly);

          // Wait for resource if it was used on an inactive engine
          //! \note syncCache may call DRM transfer
          constexpr bool WaitOnBusyEngine = true;
          gpuMemory->wait(*this, WaitOnBusyEngine);

          // Mark signal write for cache coherency,
          // since this object isn't a part of kernel arg setup
          if ((memory->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
            memory->signalWrite(&dev());
          }
          addVmMemory(gpuMemory);
        } else {
          return false;
        }
      }
    }
  }

  amd::Memory* const* memories =
      reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());
  const HSAILKernel& hsaKernel =
      static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
  const amd::KernelSignature& signature = kernel.signature();

  // Check all parameters for the current kernel
  for (size_t i = 0; i < signature.numParameters(); ++i) {
    const amd::KernelParameterDescriptor& desc = signature.at(i);
    const HSAILKernel::Argument* arg = hsaKernel.argumentAt(i);

    // Find if current argument is a buffer
    if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) {
      Memory* gpuMem = nullptr;
      amd::Memory* mem = nullptr;
      uint32_t index = desc.info_.arrayIndex_;
      if (nativeMem) {
        gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
        if (nullptr != gpuMem) {
          mem = gpuMem->owner();
        }
      } else {
        mem = memories[index];
        if (mem != nullptr) {
          gpuMem = dev().getGpuMemory(mem);
          // Synchronize data with other memory instances if necessary
          gpuMem->syncCacheFromHost(*this);
        }
      }
      //! This condition is for SVM fine-grain
      if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
        addBarrier();
        // Clear memory dependency state
        const static bool All = true;
        memoryDependency().clear(!All);
        continue;
      } else if (gpuMem != nullptr) {
        // Check image
        bool readOnly = (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
        // Check buffer
        readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
        // Validate memory for a dependency in the queue
        memoryDependency().validate(*this, gpuMem, readOnly);

        // Wait for resource if it was used on an inactive engine
        //! \note syncCache may call DRM transfer
        constexpr bool WaitOnBusyEngine = true;
        gpuMem->wait(*this, WaitOnBusyEngine);

        //! Check if compiler expects read/write
        if ((mem != nullptr) && !desc.info_.readOnly_) {
          mem->signalWrite(&dev());
        }
        addVmMemory(gpuMem);
      }
    }
  }

  for (pal::Memory* mem : hsaKernel.prog().globalStores()) {
    const static bool IsReadOnly = false;
    // Validate global store for a dependency in the queue
    memoryDependency().validate(*this, mem, IsReadOnly);
  }

  return true;
}

amd::Memory* VirtualGPU::createBufferFromImage(amd::Memory& amdImage) {
  amd::Memory* mem = new (amdImage.getContext()) amd::Buffer(amdImage, 0, 0, amdImage.getSize());
  mem->setVirtualDevice(this);
  if ((mem != nullptr) && !mem->create()) {
    mem->release();
  }

  return mem;
}

void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) {
  const static bool Wait = true;
  vqHeader_->kernel_table = kernelTable;
  virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, Wait);
}

void VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, hsa_kernel_dispatch_packet_t* aqlPkt,
                                 HwDbgKernelInfo& kernelInfo, amd::Event* enqueueEvent) {
  amd::HwDebugManager* dbgManager = dev().hwDebugMgr();
  assert(dbgManager && "No HW Debug Manager!");

  // Initialize structure with default values

  if (hsaKernel.prog().maxScratchRegs() > 0) {
    pal::Memory* scratchBuf = dev().scratch(hwRing())->memObj_;
    kernelInfo.scratchBufAddr = scratchBuf->vmAddress();
    kernelInfo.scratchBufferSizeInBytes = scratchBuf->size();

    // Get the address of the scratch buffer and its size for CPU access
    address scratchRingAddr = nullptr;
    scratchRingAddr = static_cast<address>(scratchBuf->map(nullptr, 0));
    dbgManager->setScratchRing(scratchRingAddr, scratchBuf->size());
    scratchBuf->unmap(nullptr);
  } else {
    kernelInfo.scratchBufAddr = 0;
    kernelInfo.scratchBufferSizeInBytes = 0;
    dbgManager->setScratchRing(nullptr, 0);
  }

  //! @todo:  need to verify what is wanted for the global memory
  Unimplemented();
  kernelInfo.heapBufAddr = 0;

  kernelInfo.pAqlDispatchPacket = aqlPkt;
  kernelInfo.pAqlQueuePtr = reinterpret_cast<void*>(hsaQueueMem_->vmAddress());

  // Get the address of the kernel code and its size for CPU access
  /*    pal::Memory* aqlCode = hsaKernel.gpuAqlCode();
      if (nullptr != aqlCode) {
          address aqlCodeAddr = static_cast<address>(aqlCode->map(nullptr, 0));
          dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize());
          aqlCode->unmap(nullptr);
      }
      else {
          dbgManager->setKernelCodeInfo(nullptr, 0);
      }
  */
  kernelInfo.trapPresent = false;
  kernelInfo.trapHandler = nullptr;
  kernelInfo.trapHandlerBuffer = nullptr;

  kernelInfo.excpEn = 0;
  kernelInfo.cacheDisableMask = 0;
  kernelInfo.sqDebugMode = 0;

  kernelInfo.mgmtSe0Mask = 0xFFFFFFFF;
  kernelInfo.mgmtSe1Mask = 0xFFFFFFFF;

  // set kernel info for HW debug and call the callback function
  if (nullptr != dbgManager->preDispatchCallBackFunc()) {
    DebugToolInfo dbgSetting = {0};
    dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr;
    dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes;
    dbgSetting.globalAddress_ = kernelInfo.heapBufAddr;
    dbgSetting.aclBinary_ = hsaKernel.prog().binaryElf();
    dbgSetting.event_ = enqueueEvent;

    // Call the predispatch callback function & set the trap info
    AqlCodeInfo aqlCodeInfo;
    aqlCodeInfo.aqlCode_ = (amd_kernel_code_t*)hsaKernel.cpuAqlCode();
    aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize();

    // Execute the pre-dispatch call back function
    dbgManager->executePreDispatchCallBack(reinterpret_cast<void*>(aqlPkt), &dbgSetting);

    // assign the debug TMA and TBA for kernel dispatch
    if (nullptr != dbgSetting.trapHandler_ && nullptr != dbgSetting.trapBuffer_) {
      assignDebugTrapHandler(dbgSetting, kernelInfo);
    }

    kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false;

    // Execption policy
    kernelInfo.excpEn = dbgSetting.exceptionMask_;
    kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_;
    kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_;

    // Compute the mask for reserved CUs. These two dwords correspond to
    // two registers used for reserving CUs for display. In the current
    // implementation, the number of CUs reserved can be 0 to 7, and it
    // is set by debugger users.
    if (dbgSetting.monitorMode_) {
      uint32_t i = dbgSetting.reservedCuNum_ / 2;
      kernelInfo.mgmtSe0Mask <<= i;
      i = dbgSetting.reservedCuNum_ - i;
      kernelInfo.mgmtSe1Mask <<= i;
    }
    Unimplemented();
    /*
            // flush/invalidate the instruction, data, L1 and L2 caches
            InvalidateSqCaches();
    */
  }
}

void VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting,
                                        HwDbgKernelInfo& kernelInfo) {
  // setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching
  //
  Memory* rtTrapHandlerMem = static_cast<Memory*>(dev().hwDebugMgr()->runtimeTBA());
  Memory* rtTrapBufferMem = static_cast<Memory*>(dev().hwDebugMgr()->runtimeTMA());

  kernelInfo.trapHandler = reinterpret_cast<void*>(rtTrapHandlerMem->vmAddress() + TbaStartOffset);
  // With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero.
  // However, by setting the runtime trap buffer (TMA) correct, the runtime trap hander
  // without the workaround can still function correctly.
  kernelInfo.trapHandlerBuffer = reinterpret_cast<void*>(rtTrapBufferMem->vmAddress());

  address rtTrapBufferAddress = static_cast<address>(rtTrapBufferMem->map(this));

  Memory* trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_);
  Memory* trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_);

  // Address of the trap handler code/buffer should be 256-byte aligned
  uint64_t tbaAddress = trapHandlerMem->vmAddress();
  uint64_t tmaAddress = trapBufferMem->vmAddress();
  if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) {
    assert(false && "Trap handler/buffer is not 256-byte aligned");
  }

  // The addresses of the debug trap handler code (TBA) and buffer (TMA) are
  // stored in the runtime trap handler buffer with offset location of 0x18-19
  // and 0x20-21, respectively.
  uint64_t* rtTmaPtr = reinterpret_cast<uint64_t*>(rtTrapBufferAddress + 0x18);
  rtTmaPtr[0] = tbaAddress;
  rtTmaPtr[1] = tmaAddress;

  rtTrapBufferMem->unmap(nullptr);
  // Add GPU mem handles to the memory list for VidMM
  addVmMemory(trapHandlerMem);
  addVmMemory(trapBufferMem);
  addVmMemory(rtTrapHandlerMem);
  addVmMemory(rtTrapBufferMem);
}

bool VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst) {
  uint64_t srcVmEnd = src.vmAddress() + src.vmSize();
  if (((src.vmAddress() >= sdmaRange_.start_) && (src.vmAddress() <= sdmaRange_.end_)) ||
      ((srcVmEnd >= sdmaRange_.start_) && (srcVmEnd <= sdmaRange_.end_)) ||
      ((src.vmAddress() <= sdmaRange_.start_) && (srcVmEnd >= sdmaRange_.end_))) {
    sdmaRange_.start_ = dst.vmAddress();
    sdmaRange_.end_ = dst.vmAddress() + dst.vmSize();
    return true;
  }

  sdmaRange_.start_ = std::min(sdmaRange_.start_, dst.vmAddress());
  sdmaRange_.end_ = std::max(sdmaRange_.end_, dst.vmAddress() + dst.vmSize());
  return false;
}

void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) {
  size_t copySize = cmd.size()[0];
  size_t fileOffset = cmd.fileOffset();
  Memory* mem = dev().getGpuMemory(&cmd.memory());
  uint idx = 0;

  assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) ||
         (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD));
  const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD);

  if (writeBuffer) {
    size_t dstOffset = cmd.origin()[0];
    while (copySize > 0) {
      Memory* staging = dev().getGpuMemory(&cmd.staging(idx));
      size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize;
      dstSize = std::min(dstSize, copySize);
      void* dstBuffer = staging->cpuMap(*this);
      if (!cmd.file()->transferBlock(writeBuffer, dstBuffer, staging->size(), fileOffset, 0,
                                     dstSize)) {
        cmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
      staging->cpuUnmap(*this);

      blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false);
      flushDMA(staging->getGpuEvent(*this)->engineId_);
      fileOffset += dstSize;
      dstOffset += dstSize;
      copySize -= dstSize;
    }
  } else {
    size_t srcOffset = cmd.origin()[0];
    while (copySize > 0) {
      Memory* staging = dev().getGpuMemory(&cmd.staging(idx));
      size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize;
      srcSize = std::min(srcSize, copySize);
      blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false);

      void* srcBuffer = staging->cpuMap(*this);
      if (!cmd.file()->transferBlock(writeBuffer, srcBuffer, staging->size(), fileOffset, 0,
                                     srcSize)) {
        cmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
      staging->cpuUnmap(*this);

      fileOffset += srcSize;
      srcOffset += srcSize;
      copySize -= srcSize;
    }
  }
}
}  // namespace pal