rocm-systems/projects/clr/rocclr/device/pal/palvirtual.cpp

/* Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */

#include "platform/perfctr.hpp"
#include "platform/threadtrace.hpp"
#include "platform/kernel.hpp"
#include "platform/commandqueue.hpp"
#include "device/pal/palconstbuf.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/palkernel.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palcounters.hpp"
#include "device/pal/palthreadtrace.hpp"
#include "device/pal/paltimestamp.hpp"
#include "device/pal/palblit.hpp"
#include "device/appprofile.hpp"
#include "device/devhostcall.hpp"
#include "hsa.h"
#include "amd_hsa_kernel_code.h"
#include "amd_hsa_queue.h"
#include <fstream>
#include <sstream>
#include <algorithm>
#include <thread>
#include "palQueue.h"
#include "palFence.h"
#include "palQueueSemaphore.h"

#ifdef _WIN32
#include <d3d10_1.h>
#include "platform/interop_d3d9.hpp"
#include "platform/interop_d3d10.hpp"
#include "platform/interop_d3d11.hpp"
#endif  // _WIN32

namespace amd::pal {

uint32_t VirtualGPU::Queue::AllocedQueues(const VirtualGPU& gpu, Pal::EngineType type) {
  uint32_t allocedQueues = 0;
  for (const auto& queue : gpu.dev().QueuePool()) {
    allocedQueues += (queue.second->engineType_ == type) ? 1 : 0;
  }
  return allocedQueues;
}

// ================================================================================================
// Creates a VirtualGPU::Queue together with its PAL command buffers and fences.
//
// The Queue object is over-allocated (see `new (allocSize)`): the storage that follows the
// object is sized for one PAL queue, then `max_command_buffers` command buffers, then
// `max_command_buffers` fences. Command buffers and fences are always placed in that storage.
// The PAL queue itself lives either there (real-time priority path) or in the storage that
// follows a pooled Device::QueueRecycleInfo (compute/DMA queues that are not real-time).
//
// \param gpu                  Owning virtual GPU; its device provides the PAL device and pool.
// \param queueType            PAL queue type (DMA selects the DMA engine, otherwise compute).
// \param engineIdx            Engine index; for compute it indexes computeEnginesId().
// \param cmdAllocator         PAL command allocator used by all command buffers of the queue.
// \param rtCU                 Requested number of real-time CUs, or
//                             amd::CommandQueue::RealTimeDisabled.
// \param priority             Runtime queue priority.
// \param residency_limit      Forwarded to the Queue constructor.
// \param max_command_buffers  Number of command buffer/fence pairs to create.
// \return The new queue, or nullptr on any failure. Everything allocated so far is released
//         before nullptr is returned.
VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType queueType,
                                             uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
                                             uint rtCU, amd::CommandQueue::Priority priority,
                                             uint64_t residency_limit, uint max_command_buffers) {
  Pal::IDevice* palDev = gpu.dev().iDev();
  Pal::Result result;
  Pal::CmdBufferCreateInfo cmdCreateInfo = {};
  Pal::QueueCreateInfo qCreateInfo = {};
  qCreateInfo.engineIndex =
      (queueType == Pal::QueueTypeCompute) ? gpu.dev().computeEnginesId()[engineIdx] : engineIdx;
  qCreateInfo.aqlQueue = true;
  qCreateInfo.queueType = queueType;
  qCreateInfo.priority = Pal::QueuePriority::Normal;

  if (queueType == Pal::QueueTypeDma) {
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeDma;
  } else {
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
  }
  std::map<ExclusiveQueueType, uint32_t>::const_iterator it;
  if ((priority == amd::CommandQueue::Priority::Medium) &&
      (amd::CommandQueue::RealTimeDisabled == rtCU)) {
    // NOTE(review): the lookup result is not consumed on this path in the visible code;
    // only the PAL priority is raised. Confirm whether the engine index should be updated.
    it = gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::Medium);
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
    qCreateInfo.priority = Pal::QueuePriority::Medium;
  } else if (amd::CommandQueue::RealTimeDisabled != rtCU) {
    // In WGP mode the requested CU count is doubled before alignment
    if (gpu.dev().settings().enableWgpMode_) {
      rtCU = rtCU * 2;
    }
    qCreateInfo.numReservedCu = amd::alignDown(
        rtCU,
        gpu.dev().properties().engineProperties[Pal::EngineTypeCompute].dedicatedCuGranularity);
    if (qCreateInfo.numReservedCu == 0) {
      return nullptr;
    }
    if ((priority == amd::CommandQueue::Priority::Medium) &&
        // If Windows HWS is enabled, then both real time queues are allocated
        // on the same engine
        (gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1) !=
         gpu.dev().exclusiveComputeEnginesId().end())) {
      it = gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1);
    } else {
      it = gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime0);
    }
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
    cmdCreateInfo.flags.realtimeComputeUnits = true;
    qCreateInfo.priority = Pal::QueuePriority::Realtime;

    // If the app creates an exclusive compute, then find the engine id
    if (qCreateInfo.engineType == Pal::EngineTypeCompute) {
      if (it != gpu.dev().exclusiveComputeEnginesId().end()) {
        qCreateInfo.engineIndex = it->second;
      } else {
        return nullptr;
      }
    }
  }

  // Find queue object size
  size_t qSize = palDev->GetQueueSize(qCreateInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  cmdCreateInfo.pCmdAllocator = cmdAllocator;
  cmdCreateInfo.queueType = queueType;

  // Find command buffer object size
  size_t cmdSize = palDev->GetCmdBufferSize(cmdCreateInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  // Find fence object size
  size_t fSize = palDev->GetFenceSize(&result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  // Extra storage placed right after the Queue object: PAL queue + command buffers + fences
  size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
  VirtualGPU::Queue* queue =
      new (allocSize) VirtualGPU::Queue(gpu, palDev, residency_limit, max_command_buffers);
  if (queue != nullptr) {
    address addrQ = nullptr;
    if (((qCreateInfo.engineType == Pal::EngineTypeCompute) ||
         (qCreateInfo.engineType == Pal::EngineTypeDma)) &&
        (qCreateInfo.priority != Pal::QueuePriority::Realtime)) {
      // Pooled path: PAL queues are shared between virtual GPUs through the device queue pool
      uint32_t index = AllocedQueues(gpu, qCreateInfo.engineType);
      // Create PAL queue object
      if (index < GPU_MAX_HW_QUEUES) {
        Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo();
        if (info == nullptr) {
          LogError("Could not create QueueRecycleInfo!");
          // Don't leak the queue wrapper allocated above
          delete queue;
          return nullptr;
        }
        // The PAL queue object is constructed in the storage that follows the recycle info
        addrQ = reinterpret_cast<address>(&info[1]);
        qCreateInfo.aqlPacketList = info->AqlPacketList();
        result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
        if (result == Pal::Result::Success) {
          const_cast<Device&>(gpu.dev()).QueuePool().insert({queue->iQueue_, info});
          info->engineType_ = qCreateInfo.engineType;
          // Save unique queue index for scratch buffer access
          info->index_ = index;
        } else {
          // The recycle info was never published in the pool, so it has to be released here.
          // The wrapper goes first, because its destructor may still touch the PAL queue
          // pointer that refers to the storage owned by the recycle info.
          delete queue;
          delete info;
          return nullptr;
        }
      } else {
        // The HW queue limit is reached: share the least used pooled queue of this engine type
        int usage = std::numeric_limits<int>::max();
        uint indexBase = std::numeric_limits<uint32_t>::max();
        // Loop through all allocated queues and find the lowest usage
        for (const auto& it : gpu.dev().QueuePool()) {
          if ((qCreateInfo.engineType == it.second->engineType_) &&
              (it.second->counter_ <= usage)) {
            if ((it.second->counter_ < usage) ||
                // Preserve the order of allocations, because SDMA engines
                // should be used in round-robin manner
                ((it.second->counter_ == usage) && (it.second->index_ < indexBase))) {
              queue->iQueue_ = it.first;
              usage = it.second->counter_;
              indexBase = it.second->index_;
            }
          }
        }
        // Increment the usage of the current queue
        gpu.dev().QueuePool().find(queue->iQueue_)->second->counter_++;
      }
      Device::QueueRecycleInfo* info = gpu.dev().QueuePool().find(queue->iQueue_)->second;
      queue->aql_mgmt_ = &info->aql_packet_mgmt_;
      queue->lock_ = &info->queue_lock_;
      // Command buffers and fences still live in the storage that follows the Queue object
      addrQ = reinterpret_cast<address>(&queue[1]);
    } else {
      Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo();
      if (info == nullptr) {
        LogError("Could not create QueueRecycleInfo!");
        // Don't leak the queue wrapper allocated above
        delete queue;
        return nullptr;
      }
      // The queue owns this info object; the destructor releases it through info_
      queue->info_ = info;
      queue->aql_mgmt_ = &info->aql_packet_mgmt_;
      // Exclusive compute path
      addrQ = reinterpret_cast<address>(&queue[1]);
      qCreateInfo.aqlPacketList = info->AqlPacketList();
      result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
    }
    if (result != Pal::Result::Success) {
      delete queue;
      return nullptr;
    }
    queue->UpdateAppPowerProfile();
    // Layout after the queue slot: all command buffers first, then all fences
    address addrCmd = addrQ + qSize;
    address addrF = addrCmd + max_command_buffers * cmdSize;
    Pal::CmdBufferBuildInfo cmdBuildInfo = {};

    for (uint i = 0; i < max_command_buffers; ++i) {
      result = palDev->CreateCmdBuffer(cmdCreateInfo, &addrCmd[i * cmdSize], &queue->iCmdBuffs_[i]);
      if (result != Pal::Result::Success) {
        delete queue;
        return nullptr;
      }

      Pal::FenceCreateInfo fenceCreateinfo = {};
      fenceCreateinfo.flags.signaled = false;
      result = palDev->CreateFence(fenceCreateinfo, &addrF[i * fSize], &queue->iCmdFences_[i]);
      if (result != Pal::Result::Success) {
        delete queue;
        return nullptr;
      }
      // Only the starting command buffer is opened for recording right away
      if (i == StartCmdBufIdx) {
        result = queue->iCmdBuffs_[i]->Begin(cmdBuildInfo);
        if (result != Pal::Result::Success) {
          delete queue;
          return nullptr;
        }
      }
    }
  }
  return queue;
}

// Releases everything the queue owns: the private recycle info (if any), the residency
// references registered with PAL, all command buffers and fences and finally the PAL queue.
// A pooled PAL queue is destroyed only when its usage counter drops to zero.
VirtualGPU::Queue::~Queue() {
  delete reinterpret_cast<Device::QueueRecycleInfo*>(info_);

  if (iQueue_ != nullptr) {
    // Make sure the queues are idle
    // It's unclear why PAL could still have a busy queue
    amd::ScopedLock l(lock_);
    iQueue_->WaitIdle();
  }

  // Drop every memory reference this queue still holds in PAL
  std::vector<Pal::IGpuMemory*> residentMems;
  for (const auto& ref : memReferences_) {
    residentMems.push_back(ref.first->iMem());
  }
  if (!residentMems.empty()) {
    iDev_->RemoveGpuMemoryReferences(residentMems.size(), residentMems.data(), iQueue_);
  }
  memReferences_.clear();

  // Destroy the PAL command buffers and their fences
  for (uint slot = 0; slot < max_command_buffers_; ++slot) {
    if (iCmdBuffs_[slot] != nullptr) {
      iCmdBuffs_[slot]->Destroy();
    }
    if (iCmdFences_[slot] != nullptr) {
      iCmdFences_[slot]->Destroy();
    }
  }

  if (iQueue_ == nullptr) {
    return;
  }

  // A queue without a shared lock wasn't taken from the pool and is destroyed directly
  if (lock_ == nullptr) {
    iQueue_->Destroy();
    return;
  }

  // Pooled queue: release it only when this was the last user
  const auto& pool = gpu_.dev().QueuePool();
  Device::QueueRecycleInfo* shared = pool.find(iQueue_)->second;
  if (--shared->counter_ != 0) {
    return;
  }
  iQueue_->Destroy();
  // Readjust HW queue index for scratch buffer access
  for (const auto& other : pool) {
    if ((other.second->engineType_ == shared->engineType_) &&
        (other.second->index_ > shared->index_)) {
      other.second->index_--;
    }
  }
  delete shared;
  const_cast<Device&>(gpu_.dev()).QueuePool().erase(iQueue_);
}

// Forwards the application's file name and full path (as wide strings) to the PAL queue.
// The file name is the part of the path that follows the last '\\' character; when the path
// has no '\\' the whole path is used as the name.
Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile() {
  const std::wstring fullPath = Device::appProfile()->wsAppPathAndFileName();
  const wchar_t* pathPtr = fullPath.c_str();

  // Default to the whole string, then skip past the last separator if there is one
  const wchar_t* namePtr = pathPtr;
  const wchar_t* lastSeparator = wcsrchr(pathPtr, '\\');
  if (lastSeparator != nullptr) {
    namePtr = lastSeparator + 1;
  }

  return iQueue_->UpdateAppPowerProfile(namePtr, pathPtr);
}

// Records that `mem` is used by the command buffer slot currently being built.
// A known reference only gets its slot refreshed. A new one is additionally queued for the
// next PAL residency update and accounted in residency_size_. Does nothing when the device
// keeps all memory always resident.
void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
  if (gpu_.dev().settings().alwaysResident_) {
    return;
  }
  Pal::IGpuMemory* iMem = mem->iMem();

  const auto known = memReferences_.find(mem);
  if (known != memReferences_.end()) {
    // Already tracked, just refresh the slot of the last use
    known->second = cmdBufIdSlot_;
    return;
  }

  // Update runtime tracking with TS
  memReferences_[mem] = cmdBufIdSlot_;

  // Update PAL list with the new entry
  Pal::GpuMemoryRef palRef = {};
  palRef.pGpuMemory = iMem;
  palMemRefs_.push_back(palRef);

  // Check SDI memory object; each one is reported to PAL only once
  if (iMem->Desc().flags.isExternPhys && sdiReferences_.insert(iMem).second) {
    palSdiRefs_.push_back(iMem);
  }
  residency_size_ += iMem->Desc().size;
}

// Stops tracking `mem`. If the queue held a reference to it, the reference is also removed
// from PAL and the tracked residency size shrinks by the size of the allocation.
void VirtualGPU::Queue::removeCmdMemRef(GpuMemoryReference* mem) {
  Pal::IGpuMemory* iMem = mem->iMem();
  if (memReferences_.erase(mem) == 0) {
    // The memory object wasn't tracked by this queue
    return;
  }
  iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
  residency_size_ -= iMem->Desc().size;
}

// Registers a DOPP reference to `iMem` for the next submission, or updates the flags of the
// reference that is already registered for the same memory object.
void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, bool pfpaDoppCmd) {
  for (auto& known : palDoppRefs_) {
    if (known.pGpuMemory != iMem) {
      continue;
    }
    // If both LAST_DOPP_SUBMISSION and PFPA_DOPP_SUBMISSION VCOPs are requested,
    // the LAST_DOPP_SUBMISSION is sent as requested by KMD.
    // A reference already marked as the last submission is never overridden.
    if (known.flags.lastPfpaCmd != 1) {
      if (lastDoppCmd) {
        known.flags.lastPfpaCmd = 1;
        known.flags.pfpa = 0;
      } else if (pfpaDoppCmd) {
        known.flags.pfpa = 1;
      }
    }
    return;
  }

  // This is the first reference of the DOPP desktop texture, add it to the vector
  Pal::DoppRef fresh = {};
  fresh.flags.pfpa = pfpaDoppCmd ? 1 : 0;
  fresh.flags.lastPfpaCmd = lastDoppCmd ? 1 : 0;
  fresh.pGpuMemory = iMem;
  palDoppRefs_.push_back(fresh);
}

// ================================================================================================
// Submits the command buffer that is currently being built and opens the next one.
//
// Steps, in order:
//  1. Pass the pending memory references (palMemRefs_) to PAL, unless the device keeps
//     memory always resident.
//  2. End the current command buffer, reset its fence and submit it together with the
//     collected DOPP and SDI references.
//  3. Advance cmdBufIdCurrent_ and select the next slot, waiting for that slot's fence first.
//  4. Reset and begin the command buffer of the new slot.
//  5. If too many references are tracked (count or residency size), drop the ones that were
//     last used by the slot that is about to be reused.
//
// The whole operation runs under lock_.
// \return true on success; false as soon as any PAL call fails (the failure is logged).
bool VirtualGPU::Queue::flush() {
  amd::ScopedLock l(lock_);
  const Settings& settings = gpu_.dev().settings();

  // Make the newly referenced memory objects resident before the submission
  if (!settings.alwaysResident_ && palMemRefs_.size() != 0) {
    Pal::Result result = iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
                                                       Pal::GpuMemoryRefCantTrim);
    if (Pal::Result::Success != result) {
      LogPrintfError("PAL failed to make resident resources! result: %d", result);
      return false;
    }
    palMemRefs_.clear();
  }

  // Stop commands building
  Pal::Result result;
  result = iCmdBuffs_[cmdBufIdSlot_]->End();
  if (Pal::Result::Success != result) {
    LogPrintfError("PAL failed to finalize a command buffer! result: %d", result);
    return false;
  }

  // Reset the fence. PAL will reset OS event
  result = iDev_->ResetFences(1, &iCmdFences_[cmdBufIdSlot_]);
  if (Pal::Result::Success != result) {
    LogPrintfError("PAL failed to reset a fence! result:%d", result);
    return false;
  }

  // Describe the submission: a single command buffer on a single sub-queue
  Pal::PerSubQueueSubmitInfo perSubQueueSubmitInfo = {};
  perSubQueueSubmitInfo.cmdBufferCount = 1;
  perSubQueueSubmitInfo.ppCmdBuffers = &iCmdBuffs_[cmdBufIdSlot_];

  Pal::MultiSubmitInfo submitInfo = {};
  submitInfo.perSubQueueInfoCount = 1;
  submitInfo.pPerSubQueueInfo = &perSubQueueSubmitInfo;

  // DOPP references collected by addCmdDoppRef()
  submitInfo.doppRefCount = palDoppRefs_.size();
  submitInfo.pDoppRefs = palDoppRefs_.data();

  // SDI (external physical) memory objects collected by addCmdMemRef()
  submitInfo.externPhysMemCount = palSdiRefs_.size();
  submitInfo.ppExternPhysMem = palSdiRefs_.data();

  // The fence of the current slot is signaled when this submission completes
  submitInfo.fenceCount = 1;
  submitInfo.ppFences = &iCmdFences_[cmdBufIdSlot_];

  if (iQueue_->Type() == Pal::QueueTypeCompute) {
    if (gpu_.dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) {
      // If runtime uses device memory for kernel arguments, then perform a CPU read back on
      // submission. That will make sure NBIO pushes all previous CPU write requests through PCIE
      gpu_.managedBuffer().CpuReadBack();
    }
    if (amd::IS_HIP) {
      // HIP disables per resource tracking, because the app may embed SVM ptr into other buffers.
      // Force CPU sync if there are pending operations on SDMA, until OS fences will be added
      gpu_.WaitForIdleSdma();
    }
  }
  // Submit command buffer to OS
  if (gpu_.rgpCaptureEna()) {
    // With RGP capture enabled the submission goes through the capture manager
    result = gpu_.dev().captureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo);
  } else {
    result = iQueue_->Submit(submitInfo);
  }
  if (Pal::Result::Success != result) {
    LogPrintfError("PAL failed to submit CMD! result:%d", result);
    if (GPU_ANALYZE_HANG) {
      DumpMemoryReferences();
    }
    return false;
  }
  // Make sure the slot isn't busy
  constexpr bool IbReuse = true;
  if (GPU_FLUSH_ON_EXECUTION) {
    // Wait for the submission that was just issued
    waitForFence<!IbReuse>(cmdBufIdSlot_);
  }

  // Reset the counter of commands
  cmdCnt_ = 0;

  // Find the next command buffer
  cmdBufIdCurrent_++;

  // The id counter reached the invalid value: drain the last submission and restart the ids
  if (cmdBufIdCurrent_ == GpuEvent::InvalidID) {
    // Wait for the last one
    waitForFence<!IbReuse>(cmdBufIdSlot_);
    cmdBufIdCurrent_ = 1;
    cmbBufIdRetired_ = 0;
  }

  // Wrap current slot
  cmdBufIdSlot_ = cmdBufIdCurrent_ % max_command_buffers_;

  // The slot is about to be reused, so its previous submission has to be finished
  waitForFence<IbReuse>(cmdBufIdSlot_);

  // Progress retired TS
  // Everything older than max_command_buffers_ submissions is considered retired
  if ((cmdBufIdCurrent_ > max_command_buffers_) &&
      (cmbBufIdRetired_ < (cmdBufIdCurrent_ - max_command_buffers_))) {
    cmbBufIdRetired_ = cmdBufIdCurrent_ - max_command_buffers_;
  }

  // Reset command buffer, so CB chunks could be reused
  result = iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false);
  if (Pal::Result::Success != result) {
    LogPrintfError("PAL failed CB reset! result:%d", result);
    return false;
  }
  // Start command buffer building
  Pal::CmdBufferBuildInfo cmdBuildInfo = {};
  cmdBuildInfo.pMemAllocator = &vlAlloc_;
  result = iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo);
  if (Pal::Result::Success != result) {
    LogPrintfError("PAL failed CB building initialization! result:%d", result);
    return false;
  }

  // Clear DOPP and SDI references, they apply to a single submission only
  palDoppRefs_.clear();
  palSdiRefs_.clear();

  // Remove old memory references
  // Trimming starts only when more than 2048 references are tracked or the residency limit
  // is exceeded. Only references last used by the slot that is being reused are dropped.
  if ((memReferences_.size() > 2048) || (residency_size_ > residency_limit_)) {
    for (auto it = memReferences_.begin(); it != memReferences_.end();) {
      if (it->second == cmdBufIdSlot_) {
        palMems_.push_back(it->first->iMem());
        residency_size_ -= it->first->iMem()->Desc().size;
        it = memReferences_.erase(it);
      } else {
        ++it;
      }
    }
  }
  // Tell PAL about the dropped references
  if (!settings.alwaysResident_ && palMems_.size() != 0) {
    iDev_->RemoveGpuMemoryReferences(palMems_.size(), &palMems_[0], iQueue_);
    palMems_.clear();
  }

  return true;
}

// ================================================================================================
// Blocks until the submission with the given id has finished.
// \return true if the submission is done, false if the wait failed or the id still refers to
//         the command buffer being built (which means the flush inside isDone() failed).
bool VirtualGPU::Queue::waitForEvent(uint id) {
  amd::ScopedLock l(lock_);

  // Cheap check first; it also flushes the current command buffer if `id` refers to it
  if (isDone(id)) {
    return true;
  }

  // There is an error in the flush() and wait is bogus
  if (id == cmdBufIdCurrent_) {
    return false;
  }

  constexpr bool IbReuse = true;
  const bool signaled = waitForFence<!IbReuse>(id % max_command_buffers_);
  cmbBufIdRetired_ = id;
  return signaled;
}

// ================================================================================================
// Non-blocking completion query for the submission with the given id.
// Ids at or below the retired id, and ids above the current id, are reported as done.
// If `id` refers to the command buffer being built, it is flushed first.
// \return true if the submission is done, false if it is still pending or the flush failed.
bool VirtualGPU::Queue::isDone(uint id) {
  amd::ScopedLock l(lock_);

  const bool alreadyRetired = (id <= cmbBufIdRetired_);
  const bool aboveCurrent = (id > cmdBufIdCurrent_);
  if (alreadyRetired || aboveCurrent) {
    return true;
  }

  // Flush the current command buffer; if flush failed, then exit earlier...
  if ((id == cmdBufIdCurrent_) && !flush()) {
    return false;
  }

  // Ask the fence of the slot without waiting
  const auto slot = id % max_command_buffers_;
  if (iCmdFences_[slot]->GetStatus() != Pal::Result::Success) {
    return false;
  }
  cmbBufIdRetired_ = id;
  return true;
}

// ================================================================================================
void VirtualGPU::Queue::DumpMemoryReferences() const {
  std::fstream dump;
  std::stringstream file_name("ocl_hang_dump.txt");
  uint64_t start = amd::Os::timeNanos() / 1e9;

  dump.open(file_name.str().c_str(), (std::fstream::out | std::fstream::app));
  // Check if we have OpenCL program
  if (dump.is_open()) {
    dump << start << " Queue: ";
    switch (iQueue_->Type()) {
      case Pal::QueueTypeCompute:
        dump << "Compute";
        break;
      case Pal::QueueTypeDma:
        dump << "SDMA";
        break;
      default:
        dump << "unknown";
        break;
    }
    dump << "\n"
         << "Resident memory resources:\n";
    uint idx = 0;
    for (auto it : memReferences_) {
      dump << " " << idx << "\t[";
      dump.setf(std::ios::hex, std::ios::basefield);
      dump.setf(std::ios::showbase);
      dump << (it.first)->iMem()->Desc().gpuVirtAddr << ", "
           << (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
      dump.setf(std::ios::dec);
      dump << "] CbId:" << it.second << ", Heap: " << (it.first)->iMem()->Desc().heaps[0] << "\n";
      idx++;
    }

    if (last_kernel_ != nullptr) {
      const amd::KernelSignature& signature = last_kernel_->signature();
      dump << last_kernel_->name() << std::endl;
      for (size_t i = 0; i < signature.numParameters(); ++i) {
        const amd::KernelParameterDescriptor& desc = signature.at(i);
        // Find if the current argument is a memory object
        if ((desc.type_ == T_POINTER) && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
          dump << " " << desc.name_ << ": " << std::endl;
        }
      }
    }
    dump.close();
  }
}

// Allocates and zero-fills the tracking array for `numMemObj` memory objects.
// A count of zero allocates nothing, which leaves dependency tracking disabled.
// \return false only if the allocation fails.
bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
  if (numMemObj == 0) {
    return true;
  }

  // Allocate the array of memory objects for dependency tracking
  memObjectsInQueue_ = new MemoryState[numMemObj];
  if (memObjectsInQueue_ == nullptr) {
    return false;
  }
  memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj);
  maxMemObjectsInQueue_ = numMemObj;
  return true;
}

// Checks the address range of `memory` against the ranges recorded for earlier operations
// and inserts a barrier when they overlap and at least one side is a write. The barrier is
// also inserted when the tracking array is full. The range of `memory` is always recorded
// afterwards. Does nothing when tracking is disabled (no array was created).
void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) {
  // Return earlier if tracking is disabled
  if (maxMemObjectsInQueue_ == 0) {
    return;
  }

  const uint64_t curStart = memory->vmAddress();
  const uint64_t curEnd = curStart + memory->size();
  bool needBarrier = false;

  if (memory->isModified(gpu) || !readOnly) {
    // Mark resource as modified
    memory->setModified(gpu, !readOnly);

    // Loop through all memory objects in the queue and find dependency
    // @note don't include objects from the current kernel
    for (size_t j = 0; (j < endMemObjectsInQueue_) && !needBarrier; ++j) {
      const auto& busy = memObjectsInQueue_[j];
      const uint64_t busyStart = busy.start_;
      const uint64_t busyEnd = busy.end_;

      // The start is inside the busy region
      const bool startInside = (curStart >= busyStart) && (curStart < busyEnd);
      // The end is inside the busy region
      const bool endInside = (curEnd > busyStart) && (curEnd <= busyEnd);
      // The start/end cover the busy region
      const bool coversBusy = (curStart <= busyStart) && (curEnd >= busyEnd);
      // A hazard exists if the busy region was written or the current one is for write
      const bool anyWrite = !busy.readOnly_ || !readOnly;

      needBarrier = (startInside || endInside || coversBusy) && anyWrite;
    }
  }

  // Did we reach the limit?
  if (numMemObjectsInQueue_ >= maxMemObjectsInQueue_) {
    needBarrier = true;
  }

  if (needBarrier) {
    // Flush cache
    gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);

    // Clear memory dependency state, but keep the objects of the current kernel
    constexpr bool All = true;
    clear(!All);
  }

  // Insert current memory object into the queue always,
  // since runtime calls flush before kernel execution and it has to keep
  // current kernel in tracking
  auto& slot = memObjectsInQueue_[numMemObjectsInQueue_];
  slot.start_ = curStart;
  slot.end_ = curEnd;
  slot.readOnly_ = readOnly;
  numMemObjectsInQueue_++;
}

void VirtualGPU::MemoryDependency::clear(bool all) {
  if (numMemObjectsInQueue_ > 0) {
    if (all) {
      endMemObjectsInQueue_ = numMemObjectsInQueue_;
    }

    // If the current launch didn't start from the beginning, then move the data
    if (0 != endMemObjectsInQueue_) {
      size_t i, j;
      // Preserve all objects from the current kernel
      for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) {
        memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_;
        memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_;
        memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_;
      }
    } else if (numMemObjectsInQueue_ >= maxMemObjectsInQueue_) {
      // note: The array growth shouldn't occur under the normal conditions,
      // but in a case when SVM path sends the amount of SVM ptrs over
      // the max size of kernel arguments
      MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1];
      if (nullptr == ptr) {
        numMemObjectsInQueue_ = 0;
        return;
      }
      maxMemObjectsInQueue_ <<= 1;
      memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_);
      delete[] memObjectsInQueue_;
      memObjectsInQueue_ = ptr;
    }

    // Adjust the number of active objects
    numMemObjectsInQueue_ -= endMemObjectsInQueue_;
    endMemObjectsInQueue_ = 0;
  }
}

// Keeps `mem` in the list of pinned memory objects so that its destruction is delayed.
// Nothing happens if a pinned object that covers the same host range is already kept.
// When more than 7 objects are kept, the oldest one is released first.
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
  if (findPinnedMem(mem->getHostMem(), mem->getSize()) != nullptr) {
    return;
  }

  if (pinnedMems_.size() > 7) {
    // Evict the oldest entry
    pinnedMems_.front()->release();
    pinnedMems_.erase(pinnedMems_.begin());
  }

  // Start operation, since we should release mem object
  flushDMA(dev().getGpuMemory(mem)->getGpuEvent(*this)->engineId_);

  // Delay destruction
  pinnedMems_.push_back(mem);
}

// Releases every pinned memory object that is still kept and empties the list.
void VirtualGPU::releasePinnedMem() {
  for (auto it = pinnedMems_.begin(); it != pinnedMems_.end(); ++it) {
    (*it)->release();
  }
  pinnedMems_.clear();
}

// Looks for a kept pinned memory object that starts at host address `addr` and is at least
// `size` bytes big.
// \return The first matching object, or nullptr if there is none.
amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
  const auto match =
      std::find_if(pinnedMems_.begin(), pinnedMems_.end(), [addr, size](const auto& pinned) {
        return (pinned->getHostMem() == addr) && (size <= pinned->getSize());
      });
  return (match != pinnedMems_.end()) ? *match : nullptr;
}

// Creates (or recreates) the device-side virtual queue buffer and initializes its content.
//
// Buffer layout, in the order of the offsets computed below:
//   AmdVQueueHeader | AmdAqlWrap slots | per-slot kernel arguments and wait events |
//   device events | event mask | slot mask
//
// \param deviceQueueSize  Requested queue size in bytes. It is raised to at least 16 KiB and
//                         rounded up to a multiple of
//                         sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_.
// \return true if the queue of the (adjusted) size already exists or was created
//         successfully, false on any allocation, mapping or fill failure.
bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
  uint MinDeviceQueueSize = 16 * 1024;
  deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize);

  // One mask group per 512 Ki bytes of the queue, but at least one
  maskGroups_ = deviceQueueSize / (512 * Ki);
  maskGroups_ = (maskGroups_ == 0) ? 1 : maskGroups_;

  // Align the queue size for the multiple dispatch scheduler.
  // Each thread works with 32 entries * maskGroups
  uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_);
  if (extra != 0) {
    deviceQueueSize += (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_) - extra;
  }

  if (deviceQueueSize_ == deviceQueueSize) {
    // The existing queue already has the requested size
    return true;
  } else {
    // Release the old queue; deviceQueueSize_ stays 0 until the new one is fully initialized
    delete vqHeader_;
    delete virtualQueue_;
    vqHeader_ = nullptr;
    virtualQueue_ = nullptr;
    deviceQueueSize_ = 0;
  }
  uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
  uint allocSize = deviceQueueSize;

  // Add the virtual queue header
  allocSize += sizeof(AmdVQueueHeader);
  allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap));

  // The argument area starts right after the header and the slots
  uint argOffs = allocSize;

  // Add the kernel arguments and wait events
  // Every slot gets the max parameter size + 64 bytes, followed by its wait event list
  uint singleArgSize = amd::alignUp(
      dev().info().maxParameterSize_ + 64 + dev().settings().numWaitEvents_ * sizeof(uint64_t),
      sizeof(AmdAqlWrap));
  allocSize += singleArgSize * numSlots;

  uint eventsOffs = allocSize;
  // Add the device events
  allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent);

  uint eventMaskOffs = allocSize;
  // Add mask array for events
  // One bit per event, hence the division by 8 to get bytes
  allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8;

  uint slotMaskOffs = allocSize;
  // Add mask array for AmdAqlWrap slots
  allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;

  // Align size to 64 bytes for more efficient fill operation
  allocSize = amd::alignUp(allocSize, 8 * sizeof(uint64_t));

  virtualQueue_ = new Memory(dev(), allocSize);
  // The buffer is created as Resource::Remote instead of Resource::Local when
  // GPU_PRINT_CHILD_KERNEL is set
  Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? Resource::Local : Resource::Remote;
  if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
    return false;
  }

  if (GPU_PRINT_CHILD_KERNEL != 0) {
    // The mapping is only checked for success here; the pointer itself isn't used
    address ptr = reinterpret_cast<address>(virtualQueue_->map(this, Resource::WriteOnly));
    if (nullptr == ptr) {
      return false;
    }
  }

  // Zero-fill the whole buffer
  uint64_t pattern = 0;
  amd::Coord3D origin(0, 0, 0);
  amd::Coord3D region(virtualQueue_->size(), 0, 0);
  if (!dev().xferMgr().fillBuffer(*virtualQueue_, &pattern, sizeof(pattern), region, origin,
                                  region)) {
    return false;
  }

  uint64_t vaBase = virtualQueue_->vmAddress();
  AmdVQueueHeader header = {};
  // Initialize the virtual queue header
  // All addresses in the header are GPU virtual addresses inside the buffer
  header.aql_slot_num = numSlots;
  header.event_slot_num = dev().settings().numDeviceEvents_;
  header.event_slot_mask = vaBase + eventMaskOffs;
  header.event_slots = vaBase + eventsOffs;
  header.aql_slot_mask = vaBase + slotMaskOffs;
  header.wait_size = dev().settings().numWaitEvents_;
  header.arg_size = dev().info().maxParameterSize_ + 64;
  header.mask_groups = maskGroups_;

  // Keep a host copy of the header
  vqHeader_ = new AmdVQueueHeader;
  if (nullptr == vqHeader_) {
    return false;
  }
  *vqHeader_ = header;

  // The header is stored at the very beginning of the buffer
  virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false);

  // Go over all slots and perform initialization
  AmdAqlWrap slot = {};
  size_t offset = sizeof(AmdVQueueHeader);
  for (uint i = 0; i < numSlots; ++i) {
    // Each slot points to its own argument area; the wait list follows the arguments
    uint64_t argStart = vaBase + argOffs + i * singleArgSize;
    slot.aql.kernarg_address = reinterpret_cast<void*>(argStart);
    slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
    virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false);
    offset += sizeof(AmdAqlWrap);
  }

  // Publish the new size only after the queue is completely initialized
  deviceQueueSize_ = deviceQueueSize;

  return true;
}

// ================================================================================================
// Constructs a virtual GPU in its default state and registers it in the device's list of
// virtual GPUs. No PAL objects are created here: both engine queues, the virtual queue and
// the hostcall buffer start out as nullptr.
// \param device  The device this virtual GPU belongs to.
VirtualGPU::VirtualGPU(Device& device)
    : device::VirtualDevice(device),
      engineID_(MainEngine),
      gpuDevice_(static_cast<Device&>(device)),
      printfDbgHSA_(nullptr),
      tsCache_(nullptr),
      // The managed buffer is sized as the staged transfer size plus 32 Ki
      managedBuffer_(*this, device.settings().stagedXferSize_ + 32 * Ki),
      writeBuffer_(device, managedBuffer_, device.settings().stagedXferSize_),
      hwRing_(0),
      readjustTimeGPU_(0),
      lastTS_(nullptr),
      profileTs_(nullptr),
      vqHeader_(nullptr),
      virtualQueue_(nullptr),
      schedParams_(nullptr),
      deviceQueueSize_(0),
      maskGroups_(1),
      hsaQueueMem_(nullptr),
      cmdAllocator_(nullptr) {
  // Note: Virtual GPU device creation must be a thread safe operation
  // Take the next free index and store this object in the device's list at that index
  index_ = gpuDevice_.numOfVgpus_++;
  gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus());
  gpuDevice_.vgpus_[index()] = this;

  queues_[MainEngine] = nullptr;
  queues_[SdmaEngine] = nullptr;

  // The hostcall buffer for this vqueue is initialized on demand.
  hostcallBuffer_ = nullptr;
}

// ================================================================================================
bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
                        amd::CommandQueue::Priority priority) {
  device::BlitManager::Setup blitSetup;

  // Resize the list of device resources always,
  // because destructor calls eraseResourceList() even if create() failed
  dev().resizeResoureList(index());

  // Virtual GPU will have profiling enabled
  state_.profiling_ = profiling;

  Pal::CmdAllocatorCreateInfo createInfo = {};
  createInfo.flags.threadSafe = true;
  // \todo forces PAL to reuse CBs, but requires postamble
  createInfo.flags.autoMemoryReuse = false;
  createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
  createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
      VirtualGPU::Queue::MaxCommands *
      (320 + ((profiling) ? 96 : 0) + ((dev().captureMgr() != nullptr) ? 512 : 0));
  createInfo.allocInfo[Pal::CommandDataAlloc].allocSize =
      dev().settings().maxCmdBuffers_ * createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize;

  createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
  createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 256 * Ki;
  createInfo.allocInfo[Pal::EmbeddedDataAlloc].suballocSize = 64 * Ki;

  createInfo.allocInfo[Pal::LargeEmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
  createInfo.allocInfo[Pal::LargeEmbeddedDataAlloc].allocSize = 64 * Ki;
  createInfo.allocInfo[Pal::LargeEmbeddedDataAlloc].suballocSize = 32 * Ki;

  createInfo.allocInfo[Pal::GpuScratchMemAlloc].allocHeap = Pal::GpuHeapInvisible;
  createInfo.allocInfo[Pal::GpuScratchMemAlloc].allocSize = 64 * Ki;
  createInfo.allocInfo[Pal::GpuScratchMemAlloc].suballocSize = 4 * Ki;

  Pal::Result result;
  size_t cmdAllocSize = dev().iDev()->GetCmdAllocatorSize(createInfo, &result);
  if (Pal::Result::Success != result) {
    return false;
  }
  char* addr = new char[cmdAllocSize];
  if (Pal::Result::Success != dev().iDev()->CreateCmdAllocator(createInfo, addr, &cmdAllocator_)) {
    return false;
  }

  uint idx = index() % dev().numComputeEngines();
  uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs
                                 ? 0
                                 : (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
  uint max_cmd_buffers = dev().settings().maxCmdBuffers_;

  if (dev().numComputeEngines()) {
    queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs,
                                        priority, residency_limit, max_cmd_buffers);
    if (nullptr == queues_[MainEngine]) {
      return false;
    }
    const auto& info = dev().QueuePool().find(queues_[MainEngine]->iQueue_);
    hwRing_ = (info != dev().QueuePool().end())
                  ? info->second->index_
                  : (index() % dev().numExclusiveComputeEngines()) + GPU_MAX_HW_QUEUES;

    // Check if device has SDMA engines
    if (dev().numDMAEngines() != 0 && !dev().settings().disableSdma_) {
      uint sdma;
      // If only 1 SDMA engine is available then use that one, otherwise it's a round-robin manner
      if ((dev().numDMAEngines() < 2) || ((idx + 1) & 0x1)) {
        sdma = 0;
      } else {
        sdma = 1;
      }
      queues_[SdmaEngine] = Queue::Create(
          *this, Pal::QueueTypeDma, sdma, cmdAllocator_, amd::CommandQueue::RealTimeDisabled,
          amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
      if (nullptr == queues_[SdmaEngine]) {
        return false;
      }
    }
  } else {
    LogError("Runtme couldn't find compute queues!");
    return false;
  }

  // Create buffers for kernel arg management
  if (!managedBuffer_.create(dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs
                                 ? Resource::Persistent
                                 : Resource::RemoteUSWC)) {
    // Try just USWC if persistent memory failed
    if (dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) {
      if (!managedBuffer_.create(Resource::RemoteUSWC)) {
        return false;
      }
    } else {
      return false;
    }
  }

  // Diable double copy optimization,
  // since UAV read from nonlocal is fast enough
  blitSetup.disableCopyBufferToImageOpt_ = true;
  if (!allocConstantBuffers()) {
    return false;
  }

  // Create HSAILPrintf class
  printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_);
  if (nullptr == printfDbgHSA_) {
    LogError("Could not create PrintfDbgHSA class!");
    return false;
  }

  tsCache_ = new TimeStampCache(*this);
  if (nullptr == tsCache_) {
    LogError("Could not create TimeStamp cache!");
    return false;
  }

  if (!memoryDependency().create(dev().settings().numMemDependencies_)) {
    LogError("Could not create the array of memory objects!");
    return false;
  }

  if (!allocHsaQueueMem()) {
    LogError("Could not create hsaQueueMem object!");
    return false;
  }

  // Check if the app requested a device queue creation
  if (dev().settings().useDeviceQueue_ && (0 != deviceQueueSize) &&
      !createVirtualQueue(deviceQueueSize)) {
    LogError("Could not create a virtual queue!");
    return false;
  }

  // Choose the appropriate class for blit engine
  switch (dev().settings().blitEngine_) {
    default:
    // Fall through ...
    case Settings::BlitEngineHost:
      blitSetup.disableAll();
    // Fall through ...
    case Settings::BlitEngineCAL:
    case Settings::BlitEngineKernel:
      blitMgr_ = new KernelBlitManager(*this, blitSetup);
      break;
  }
  if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
    LogError("Could not create BlitManager!");
    return false;
  }

  // If the developer mode manager is available and it's not a device queue,
  // then enable RGP capturing
  if ((index() != 0) && dev().captureMgr() != nullptr) {
    bool dbg_vmid = false;
    state_.rgpCaptureEnabled_ = true;
    dev().captureMgr()->RegisterTimedQueue(2 * index(), queue(MainEngine).iQueue_, &dbg_vmid);
    dev().captureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_, &dbg_vmid);
  }

  return true;
}

// ================================================================================================
// Allocates the dummy HSA queue structure (amd_queue_t) in local memory and uploads
// a zero-initialized copy with the private/group aperture bases filled in.
// Returns false if the memory object couldn't be created; hsaQueueMem_ is null in that case.
bool VirtualGPU::allocHsaQueueMem() {
  // Allocate a dummy HSA queue
  hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t));
  if ((hsaQueueMem_ == nullptr) || (!hsaQueueMem_->create(Resource::Local))) {
    delete hsaQueueMem_;
    // The destructor deletes hsaQueueMem_ unconditionally,
    // so don't leave a dangling pointer behind
    hsaQueueMem_ = nullptr;
    return false;
  }
  amd_queue_t hsa_queue = {};

  // Provide private and local heap addresses.
  // Only the upper part of the aperture base is stored in 64 bit builds
  constexpr uint addressShift = LP64_SWITCH(0, 32);
  hsa_queue.private_segment_aperture_base_hi = static_cast<uint32_t>(
      dev().properties().gpuMemoryProperties.privateApertureBase >> addressShift);
  hsa_queue.group_segment_aperture_base_hi = static_cast<uint32_t>(
      dev().properties().gpuMemoryProperties.sharedApertureBase >> addressShift);

  hsaQueueMem_->writeRawData(*this, 0, sizeof(amd_queue_t), &hsa_queue, true);

  return true;
}

// Destroys the virtual GPU: releases all objects created by create(), removes the queue
// from the device's list of virtual GPUs (shifting the indices of the queues that follow)
// and frees the hostcall buffer if one was allocated.
// The whole teardown runs under the device's async-ops and vgpus-access locks.
VirtualGPU::~VirtualGPU() {
  // Not safe to remove a queue. So lock the device
  amd::ScopedLock k(dev().lockAsyncOps());
  amd::ScopedLock lock(dev().vgpusAccess());

  if (queues_[MainEngine] != nullptr) {
    // Clear all timestamps, associated with this virtual GPU
    auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
    for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
      // Only the slots owned by this virtual GPU are reset
      if (mgmt.aql_vgpus_[i] == this) {
        mgmt.aql_vgpus_[i] = nullptr;
        mgmt.aql_events_[i].invalidate();
      }
    }
  }

  // Destroy RGP trace
  if (rgpCaptureEna()) {
    dev().captureMgr()->FinishRGPTrace(this, true);
  }

  // Drain the queue of cached callback objects and delete each of them
  while (!freeCbQueue_.empty()) {
    auto cb = freeCbQueue_.front();
    delete cb;
    freeCbQueue_.pop();
  }

  // Destroy printfHSA object
  delete printfDbgHSA_;

  // Destroy TimeStamp cache
  delete tsCache_;

  // Destroy resource list with the constant buffers
  for (uint i = 0; i < constBufs_.size(); ++i) {
    delete constBufs_[i];
  }

  managedBuffer_.release();

  // Device queue state and the dummy HSA queue (all null if they were never created)
  delete vqHeader_;
  delete virtualQueue_;
  delete hsaQueueMem_;

  // Release scratch buffer memory to reduce memory pressure
  //!@note OCLtst uses single device with multiple tests
  //! Release memory only if it's the last command queue.
  //! The first queue is reserved for the transfers on device
  if (static_cast<int>(gpuDevice_.numOfVgpus_ - 1) <= 1) {
    gpuDevice_.destroyScratchBuffers();
  }

  // Destroy BlitManager object
  delete blitMgr_;

  {
    // Destroy queues
    delete queues_[MainEngine];
    delete queues_[SdmaEngine];

    // The allocator lives in a char array allocated in create(), hence
    // the explicit Destroy() call followed by the array delete
    if (nullptr != cmdAllocator_) {
      cmdAllocator_->Destroy();
      delete[] reinterpret_cast<char*>(cmdAllocator_);
    }
  }

  {
    // Find all available virtual GPUs and lock them
    // from the execution of commands, since the queue index and resource list
    // Will be adjusted
    for (auto it : dev().vgpus()) {
      if (it != this) {
        it->execution().lock();
      }
    }

    // Not safe to add a resource if create/destroy queue is in process, since
    // the size of the TS array can change
    amd::ScopedLock r(dev().lockResources());
    gpuDevice_.numOfVgpus_--;
    gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index());
    // Every queue located after the erased one moves one slot down
    for (uint idx = index(); idx < dev().vgpus().size(); ++idx) {
      dev().vgpus()[idx]->index_--;
    }
    dev().eraseResoureList(index());

    // Find all available virtual GPUs and unlock them
    // for the execution of commands
    // (this object is already removed from the list, so only the other queues are unlocked)
    for (auto it : dev().vgpus()) {
      it->execution().unlock();
    }
  }

  // The hostcall buffer is allocated on demand, so it may not exist
  if (hostcallBuffer_ != nullptr) {
    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for virtual queue %p",
            hostcallBuffer_, this);
    amd::disableHostcalls(hostcallBuffer_);
    dev().svmFree(hostcallBuffer_);
  }
}

// Executes a read command: transfers data from a device memory object (buffer, buffer rect
// or image) into host memory at vcmd.destination().
// If the destination pointer belongs to a known allocation (findMemoryFromVA), the transfer
// is done as a device copy into that allocation; otherwise the blit manager's read path
// is used. On failure the command status is set to CL_INVALID_OPERATION.
void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  // Translate memory references and ensure cache up-to-date
  pal::Memory* memory = dev().getGpuMemory(&vcmd.source());

  size_t offset = 0;
  // Find if virtual address is a CL allocation
  device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset);

  profilingBegin(vcmd);

  memory->syncCacheFromHost(*this);
  cl_command_type type = vcmd.type();
  bool result = false;
  amd::Memory* bufferFromImage = nullptr;

  // Force buffer read for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_READ_IMAGE) &&
      (vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImage = createBufferFromImage(vcmd.source());
    if (nullptr == bufferFromImage) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_READ_BUFFER;
      memory = dev().getGpuMemory(bufferFromImage);
    }
  }

  // Process different read commands
  switch (type) {
    case CL_COMMAND_READ_BUFFER: {
      amd::Coord3D origin(vcmd.origin()[0]);
      amd::Coord3D size(vcmd.size()[0]);
      if (nullptr != bufferFromImage) {
        // Image coordinates are in elements, the buffer view works in bytes
        size_t elemSize = vcmd.source().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;
      }
      if (hostMemory != nullptr) {
        // Accelerated transfer without pinning
        amd::Coord3D dstOrigin(offset);
        result = blitMgr().copyBuffer(*memory, *hostMemory, origin, dstOrigin, size,
                                      vcmd.isEntireMemory(), vcmd.copyMetadata());
      } else {
        // The logic below will perform 2 step copy to make sure memory pinning doesn't
        // occur on the first unaligned page, because in Windows memory manager can
        // have CPU access to the allocation header in another thread
        // and a race condition is possible.
        char* tmpHost =
            amd::alignUp(reinterpret_cast<char*>(vcmd.destination()), PinnedMemoryAlignment);

        // Find the partial size for unaligned copy
        size_t partial = tmpHost - reinterpret_cast<char*>(vcmd.destination());
        result = true;
        // Check if it's staging copy, then ignore unaligned address.
        // Also never let the unaligned head exceed the requested size, otherwise
        // the first step would read past the end of the host allocation
        if ((size[0] <= dev().settings().pinnedMinXferSize_) || (partial > size[0])) {
          partial = size[0];
        }
        // Make first step transfer
        if (partial > 0) {
          result = blitMgr().readBuffer(*memory, vcmd.destination(), origin, partial, false,
                                        vcmd.copyMetadata());
        }
        // Second step transfer if something left to copy
        if (partial < size[0]) {
          result &= blitMgr().readBuffer(*memory, tmpHost, origin[0] + partial, size[0] - partial,
                                         false, vcmd.copyMetadata());
        }
      }
      if (nullptr != bufferFromImage) {
        bufferFromImage->release();
      }
    } break;
    case CL_COMMAND_READ_BUFFER_RECT: {
      // Rebase the host rectangle onto the found allocation (offset is 0 if there is none)
      amd::BufferRect hostbufferRect;
      amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset);
      hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_,
                            vcmd.hostRect().slicePitch_);
      if (hostMemory != nullptr) {
        result = blitMgr().copyBufferRect(*memory, *hostMemory, vcmd.bufRect(), hostbufferRect,
                                          vcmd.size(), vcmd.isEntireMemory(), vcmd.copyMetadata());
      } else {
        result =
            blitMgr().readBufferRect(*memory, vcmd.destination(), vcmd.bufRect(), vcmd.hostRect(),
                                     vcmd.size(), vcmd.isEntireMemory(), vcmd.copyMetadata());
      }
    } break;
    case CL_COMMAND_READ_IMAGE:
      if (memory->memoryType() == Resource::ImageBuffer) {
        Image* imageBuffer = static_cast<Image*>(memory);
        // Check if synchronization has to be performed
        if (nullptr != imageBuffer->CopyImageBuffer()) {
          memory = imageBuffer->CopyImageBuffer();
          Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
          amd::Image* image = imageBuffer->owner()->asImage();
          amd::Coord3D offs(0);
          // Copy memory from the original image buffer into the backing store image.
          // NOTE(review): the status of this copy is overwritten by the read below
          result = blitMgr().copyBufferToImage(*buffer, *imageBuffer->CopyImageBuffer(), offs, offs,
                                               image->getRegion(), true, image->getRowPitch(),
                                               image->getSlicePitch(), vcmd.copyMetadata());
        }
      }
      if (hostMemory != nullptr) {
        // Accelerated image to buffer transfer without pinning
        amd::Coord3D dstOrigin(offset);
        result = blitMgr().copyImageToBuffer(*memory, *hostMemory, vcmd.origin(), dstOrigin,
                                             vcmd.size(), vcmd.isEntireMemory(), vcmd.rowPitch(),
                                             vcmd.slicePitch(), vcmd.copyMetadata());
      } else {
        result = blitMgr().readImage(*memory, vcmd.destination(), vcmd.origin(), vcmd.size(),
                                     vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory(),
                                     vcmd.copyMetadata());
      }
      break;
    default:
      LogError("Unsupported type for the read command");
      break;
  }

  if (!result) {
    LogError("submitReadMemory failed!");
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(vcmd);
}

// Executes a write command: transfers data from host memory at vcmd.source() into a device
// memory object (buffer, buffer rect or image).
// If the source pointer belongs to a known allocation (findMemoryFromVA), the transfer is
// done as a device copy from that allocation; otherwise the blit manager's write path is used.
// On success the destination is marked as most recently written by this device, on failure
// the command status is set to CL_INVALID_OPERATION.
void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  // Translate memory references and ensure cache up to date
  pal::Memory* memory = dev().getGpuMemory(&vcmd.destination());
  size_t offset = 0;
  // Find if virtual address is a CL allocation
  device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset);

  profilingBegin(vcmd);

  bool entire = vcmd.isEntireMemory();

  // Synchronize memory from host if necessary.
  // The sync can be skipped if the whole object is going to be overwritten anyway
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = entire;
  memory->syncCacheFromHost(*this, syncFlags);

  cl_command_type type = vcmd.type();
  bool result = false;
  amd::Memory* bufferFromImage = nullptr;

  // Force buffer write for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_WRITE_IMAGE) &&
      (vcmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImage = createBufferFromImage(vcmd.destination());
    if (nullptr == bufferFromImage) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_WRITE_BUFFER;
      memory = dev().getGpuMemory(bufferFromImage);
    }
  }

  // Process different write commands
  switch (type) {
    case CL_COMMAND_WRITE_BUFFER: {
      amd::Coord3D origin(vcmd.origin()[0]);
      amd::Coord3D size(vcmd.size()[0]);
      if (nullptr != bufferFromImage) {
        // Image coordinates are in elements, the buffer view works in bytes
        size_t elemSize = vcmd.destination().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;
      }
      if ((hostMemory != nullptr) && (vcmd.size()[0] > dev().settings().prepinnedMinSize_)) {
        // Accelerated transfer without pinning
        amd::Coord3D srcOrigin(offset);
        result = blitMgr().copyBuffer(*hostMemory, *memory, srcOrigin, origin, size,
                                      vcmd.isEntireMemory(), vcmd.copyMetadata());
      } else {
        // The logic below will perform 2 step copy to make sure memory pinning doesn't
        // occur on the first unaligned page, because in Windows memory manager can
        // have CPU access to the allocation header in another thread
        // and a race condition is possible.
        const char* tmpHost =
            amd::alignUp(reinterpret_cast<const char*>(vcmd.source()), PinnedMemoryAlignment);

        // Find the partial size for unaligned copy
        size_t partial = tmpHost - reinterpret_cast<const char*>(vcmd.source());
        result = true;
        // Check if it's staging copy, then ignore unaligned address.
        // Also never let the unaligned head exceed the requested size, otherwise
        // the first step would write more data than the caller provided
        if ((size[0] <= dev().settings().pinnedMinXferSize_) || (partial > size[0])) {
          partial = size[0];
        }
        // Make first step transfer
        if (partial > 0) {
          result = blitMgr().writeBuffer(vcmd.source(), *memory, origin, partial, false,
                                         vcmd.copyMetadata());
        }
        // Second step transfer if something left to copy
        if (partial < size[0]) {
          result &= blitMgr().writeBuffer(tmpHost, *memory, origin[0] + partial, size[0] - partial,
                                          false, vcmd.copyMetadata());
        }
      }
      if (nullptr != bufferFromImage) {
        bufferFromImage->release();
      }
    } break;
    case CL_COMMAND_WRITE_BUFFER_RECT: {
      // Rebase the host rectangle onto the found allocation (offset is 0 if there is none)
      amd::BufferRect hostbufferRect;
      amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset);
      hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_,
                            vcmd.hostRect().slicePitch_);
      if (hostMemory != nullptr) {
        result = blitMgr().copyBufferRect(*hostMemory, *memory, hostbufferRect, vcmd.bufRect(),
                                          vcmd.size(), vcmd.isEntireMemory(), vcmd.copyMetadata());
      } else {
        result = blitMgr().writeBufferRect(vcmd.source(), *memory, vcmd.hostRect(), vcmd.bufRect(),
                                           vcmd.size(), vcmd.isEntireMemory(), vcmd.copyMetadata());
      }
    } break;
    case CL_COMMAND_WRITE_IMAGE:
      if (hostMemory != nullptr) {
        // Accelerated buffer to image transfer without pinning
        amd::Coord3D srcOrigin(offset);
        result = blitMgr().copyBufferToImage(*hostMemory, *memory, srcOrigin, vcmd.origin(),
                                             vcmd.size(), vcmd.isEntireMemory(), vcmd.rowPitch(),
                                             vcmd.slicePitch(), vcmd.copyMetadata());
      } else {
        result = blitMgr().writeImage(vcmd.source(), *memory, vcmd.origin(), vcmd.size(),
                                      vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory(),
                                      vcmd.copyMetadata());
      }
      break;
    default:
      LogError("Unsupported type for the write command");
      break;
  }

  if (!result) {
    LogError("submitWriteMemory failed!");
    vcmd.setStatus(CL_INVALID_OPERATION);
  } else {
    // Mark this as the most-recently written cache of the destination
    vcmd.destination().signalWrite(&gpuDevice_);
  }
  profilingEnd(vcmd);
}

// Performs a device-side copy between two memory objects.
// The requested command type is refined with getCopyCommandType() from the types of both
// objects, and IMAGE1D_BUFFER objects are accessed through a temporary buffer view.
// Returns true if the blit succeeded; the destination is then marked as most recently
// written by this device. Returns false (after logging) otherwise.
bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memory& dstMem,
                            bool entire, const amd::Coord3D& srcOrigin,
                            const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
                            const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
                            amd::CopyMetadata copyMetadata) {
  // Resolve the device objects behind both amd::Memory handles
  pal::Memory* dstMemory = dev().getGpuMemory(&dstMem);
  pal::Memory* srcMemory = dev().getGpuMemory(&srcMem);

  if ((nullptr == srcMemory) || (nullptr == dstMemory)) {
    LogError("submitcopyMemory Failed!");
    return false;
  }

  // Bring both device caches up to date. The destination sync may be skipped
  // if the entire object is overwritten
  device::Memory::SyncFlags dstSync;
  dstSync.skipEntire_ = entire;
  dstMemory->syncCacheFromHost(*this, dstSync);
  srcMemory->syncCacheFromHost(*this);

  // Temporary buffer views for IMAGE1D_BUFFER operands (null if not needed or failed)
  amd::Memory* srcView = nullptr;
  amd::Memory* dstView = nullptr;

  // Source: read IMAGE1D_BUFFER through a buffer
  if (CL_MEM_OBJECT_IMAGE1D_BUFFER == srcMem.getType()) {
    srcView = createBufferFromImage(srcMem);
    if (srcView != nullptr) {
      srcMemory = dev().getGpuMemory(srcView);
    } else {
      LogError("We should not fail buffer creation from image_buffer!");
    }
  }
  // Destination: write IMAGE1D_BUFFER through a buffer
  if (CL_MEM_OBJECT_IMAGE1D_BUFFER == dstMem.getType()) {
    dstView = createBufferFromImage(dstMem);
    if (dstView != nullptr) {
      dstMemory = dev().getGpuMemory(dstView);
    } else {
      LogError("We should not fail buffer creation from image_buffer!");
    }
  }

  type = getCopyCommandType(type, srcMem.getType(), dstMem.getType());

  bool copied = false;

  // Dispatch on the final command type
  switch (type) {
    case CL_COMMAND_MAKE_BUFFERS_RESIDENT_AMD:
    case CL_COMMAND_SVM_MEMCPY:
    case CL_COMMAND_COPY_BUFFER: {
      amd::Coord3D bufSrcOrigin(srcOrigin[0]);
      amd::Coord3D bufDstOrigin(dstOrigin[0]);
      amd::Coord3D bufSize(size.c[0], size.c[1], size.c[2]);

      // With a buffer view in play the element based coordinates become byte based.
      // The element size is taken from the source image if it has a view,
      // from the destination image otherwise
      if ((srcView != nullptr) || (dstView != nullptr)) {
        amd::Memory& formatOwner = (srcView != nullptr) ? srcMem : dstMem;
        const size_t elemSize = formatOwner.asImage()->getImageFormat().getElementSize();
        if (srcView != nullptr) {
          bufSrcOrigin.c[0] *= elemSize;
        }
        if (dstView != nullptr) {
          bufDstOrigin.c[0] *= elemSize;
        }
        bufSize.c[0] *= elemSize;
      }

      copied = blitMgr().copyBuffer(*srcMemory, *dstMemory, bufSrcOrigin, bufDstOrigin, bufSize,
                                    entire, copyMetadata);
      break;
    }
    case CL_COMMAND_COPY_BUFFER_RECT: {
      copied = blitMgr().copyBufferRect(*srcMemory, *dstMemory, srcRect, dstRect, size, entire,
                                        copyMetadata);
      break;
    }
    case CL_COMMAND_COPY_IMAGE_TO_BUFFER: {
      // Only the destination offset needs scaling if it's a buffer view of an image
      amd::Coord3D bufDstOrigin(dstOrigin);
      if (dstView != nullptr) {
        bufDstOrigin.c[0] *= dstMem.asImage()->getImageFormat().getElementSize();
      }
      copied =
          blitMgr().copyImageToBuffer(*srcMemory, *dstMemory, srcOrigin, bufDstOrigin, size, entire,
                                      dstRect.rowPitch_, dstRect.slicePitch_, copyMetadata);
      break;
    }
    case CL_COMMAND_COPY_BUFFER_TO_IMAGE: {
      // Only the source offset needs scaling if it's a buffer view of an image
      amd::Coord3D bufSrcOrigin(srcOrigin);
      if (srcView != nullptr) {
        bufSrcOrigin.c[0] *= srcMem.asImage()->getImageFormat().getElementSize();
      }
      copied =
          blitMgr().copyBufferToImage(*srcMemory, *dstMemory, bufSrcOrigin, dstOrigin, size, entire,
                                      srcRect.rowPitch_, srcRect.slicePitch_, copyMetadata);
      break;
    }
    case CL_COMMAND_COPY_IMAGE: {
      copied = blitMgr().copyImage(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire,
                                   copyMetadata);
      break;
    }
    default: {
      LogError("Unsupported command type for memory copy!");
      break;
    }
  }

  // The temporary views are not needed after the blit was issued
  if (srcView != nullptr) {
    srcView->release();
  }
  if (dstView != nullptr) {
    dstView->release();
  }

  if (copied) {
    // Mark this as the most-recently written cache of the destination
    dstMem.signalWrite(&gpuDevice_);
    return true;
  }

  LogError("submitCopyMemory failed!");
  return false;
}

// Executes a copy command by forwarding all its parameters to copyMemory().
// A failed copy is reported through the command status (CL_INVALID_OPERATION).
void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) {
  // Serialize with any other work on this virtual GPU
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  const cl_command_type cmdType = vcmd.type();
  const bool wholeObject = vcmd.isEntireMemory();

  const bool copied =
      copyMemory(cmdType, vcmd.source(), vcmd.destination(), wholeObject, vcmd.srcOrigin(),
                 vcmd.dstOrigin(), vcmd.size(), vcmd.srcRect(), vcmd.dstRect(),
                 vcmd.copyMetadata());
  if (!copied) {
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(vcmd);
}

// Executes an SVM memcpy command.
// On a fine-grained system the copy is done directly on the host. Otherwise each pointer
// is looked up in the memory object map and the copy is performed as a host memcpy,
// a device write, a device read or a device-to-device copy, depending on which of the two
// pointers belong to SVM allocations.
// Any failure, including an out-of-range region, sets the status to CL_INVALID_OPERATION.
void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);

  cl_command_type type = vcmd.type();
  // Device transfers are required only if the system isn't fine-grained
  if (!dev().isFineGrainedSystem()) {
    amd::Coord3D srcOrigin(0, 0, 0);
    amd::Coord3D dstOrigin(0, 0, 0);
    amd::Coord3D size(vcmd.srcSize(), 1, 1);
    amd::BufferRect srcRect;
    amd::BufferRect dstRect;

    bool result = false;
    // Cleared if any of the SVM regions doesn't fit into its memory object
    bool regionsValid = true;
    amd::Memory* srcMem = amd::MemObjMap::FindMemObj(vcmd.src());
    amd::Memory* dstMem = amd::MemObjMap::FindMemObj(vcmd.dst());

    device::Memory::SyncFlags syncFlags;
    if (nullptr != srcMem) {
      srcMem->commitSvmMemory();
      // Offset of the source pointer inside of its SVM allocation
      srcOrigin.c[0] =
          static_cast<const_address>(vcmd.src()) - static_cast<address>(srcMem->getSvmPtr());
      regionsValid = srcMem->validateRegion(srcOrigin, size);
    }
    if (regionsValid && (nullptr != dstMem)) {
      dstMem->commitSvmMemory();
      // Offset of the destination pointer inside of its SVM allocation
      dstOrigin.c[0] =
          static_cast<const_address>(vcmd.dst()) - static_cast<address>(dstMem->getSvmPtr());
      regionsValid = dstMem->validateRegion(dstOrigin, size);
    }

    if (!regionsValid) {
      // Nothing is copied and result stays false. Don't return early,
      // so profilingEnd() is always paired with profilingBegin() above
    } else if (nullptr == srcMem && nullptr == dstMem) {  // both not in svm space
      std::memcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize());
      result = true;
    } else if (nullptr == srcMem && nullptr != dstMem) {  // src not in svm space
      Memory* memory = dev().getGpuMemory(dstMem);
      // Synchronize source and destination memory
      syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size);
      memory->syncCacheFromHost(*this, syncFlags);

      result = blitMgr().writeBuffer(vcmd.src(), *memory, dstOrigin, size,
                                     dstMem->isEntirelyCovered(dstOrigin, size));
      if (result) {
        // Mark this as the most-recently written cache of the destination
        dstMem->signalWrite(&gpuDevice_);
      }
    } else if (nullptr != srcMem && nullptr == dstMem) {  // dst not in svm space
      Memory* memory = dev().getGpuMemory(srcMem);
      // Synchronize source and destination memory
      memory->syncCacheFromHost(*this);

      result = blitMgr().readBuffer(*memory, vcmd.dst(), srcOrigin, size,
                                    srcMem->isEntirelyCovered(srcOrigin, size));
    } else {  // both in svm space
      bool entire =
          srcMem->isEntirelyCovered(srcOrigin, size) && dstMem->isEntirelyCovered(dstOrigin, size);
      result =
          copyMemory(type, *srcMem, *dstMem, entire, srcOrigin, dstOrigin, size, srcRect, dstRect);
    }

    if (!result) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
  } else {
    // direct memcpy for FGS enabled system
    amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1);
  }
  profilingEnd(vcmd);
}

// Executes a map command. The map parameters are saved in the memory object for the later
// unmap, then the mapped region is made visible to the host in one of three ways:
//  - direct map with host memory: the owner's backing store is brought up to date;
//  - persistently mapped memory: nothing to do;
//  - separate map memory: the region is copied from the device object into the map memory
//    if the map flags request read or write access.
// Copy failures set the command status to CL_MAP_FAILURE.
void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());

  // Save map info for unmap operation
  memory->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(),
                      vcmd.isEntireMemory());

  // If we have host memory, use it
  if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) {
    if (!memory->isHostMemDirectAccess()) {
      // Make sure GPU finished operation before
      // synchronization with the backing store
      memory->wait(*this);
    }

    // Target is the backing store, so just ensure that owner is up-to-date
    memory->owner()->cacheWriteBack(this);

    // Add memory to VA cache, so runtime can detect direct access to VA
    dev().addVACache(memory);
  } else if (memory->isPersistentMapped()) {
    // Nothing to do here
  } else if (memory->mapMemory() != nullptr) {
    // Target is a remote resource, so copy
    assert(memory->mapMemory() != nullptr);
    // The device data is copied only if the map requests read or write access
    if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) {
      amd::Coord3D dstOrigin(0, 0, 0);
      if (memory->desc().buffer_) {
        // Buffers: the same origin is used in the device object and in the map memory
        if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(),
                                  vcmd.size(), vcmd.isEntireMemory())) {
          LogError("submitMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
      } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
        // IMAGE1D_BUFFER: copy through a temporary buffer view, with the origin and
        // size converted from elements to bytes
        Memory* memoryBuf = memory;
        amd::Coord3D origin(vcmd.origin()[0]);
        amd::Coord3D size(vcmd.size()[0]);
        size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;

        amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
        if (nullptr == bufferFromImage) {
          // The copy below falls back to the original image object in this case
          LogError("We should not fail buffer creation from image_buffer!");
        } else {
          memoryBuf = dev().getGpuMemory(bufferFromImage);
        }
        if (!blitMgr().copyBuffer(*memoryBuf, *memory->mapMemory(), origin, dstOrigin, size,
                                  vcmd.isEntireMemory())) {
          LogError("submitMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
        if (nullptr != bufferFromImage) {
          bufferFromImage->release();
        }
      } else {
        // Images: the region is copied into the start of the map memory
        // Validate if it's a view for a map of mip level
        if (vcmd.memory().parent() != nullptr) {
          amd::Image* amdImage = vcmd.memory().parent()->asImage();
          if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1)) {
            // Save map write info in the parent object
            dev().getGpuMemory(amdImage)->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(),
                                                      vcmd.mapFlags(), vcmd.isEntireMemory(),
                                                      vcmd.memory().asImage());
          }
        }
        if (!blitMgr().copyImageToBuffer(*memory, *memory->mapMemory(), vcmd.origin(), dstOrigin,
                                         vcmd.size(), vcmd.isEntireMemory())) {
          LogError("submitMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
      }
    }
  } else {
    // None of the map paths applies; only an error is logged, the status isn't changed
    LogError("Unhandled map!");
  }

  profilingEnd(vcmd);
}

// Executes an unmap command for a buffer or image.
//
// Looks up the write-map info saved for vcmd.mapPtr() and, when that info says the mapping
// was written, pushes the data back to the device resource through one of three paths:
// direct host memory (cache sync), persistently mapped memory (plain unmap), or a separate
// map memory object (blit copy back). The saved map info is cleared afterwards. For a
// mip level map the saved mip view is released once the execution lock has been dropped.
void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) {
  // Set when the unmap is redirected to a saved mip level view, which is released at the end
  bool unmapMip = false;
  // Assigned inside the locked scope; after that scope it is only read when unmapMip is true
  amd::Image* amdImage;
  {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());

    pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
    amd::Memory* owner = memory->owner();
    const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
    if (nullptr == writeMapInfo) {
      // Returns before profilingBegin() and without setting a status on the command
      LogError("Unmap without map call");
      return;
    }
    profilingBegin(vcmd);

    // Check if image is a mipmap and assign a saved view
    amdImage = owner->asImage();
    if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
        (writeMapInfo->baseMip_ != nullptr)) {
      // Assign mip level view
      amdImage = writeMapInfo->baseMip_;
      // Clear unmap flags from the parent image
      memory->clearUnmapInfo(vcmd.mapPtr());
      // From here on 'memory' is the device memory of the mip view, while 'owner' still
      // refers to the owner of the original object
      memory = dev().getGpuMemory(amdImage);
      unmapMip = true;
      // NOTE(review): the view's map info is dereferenced below without a nullptr check -
      // confirm the view always carries map info for this map pointer
      writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
    }

    // We used host memory
    if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
      if (writeMapInfo->isUnmapWrite()) {
        // Target is the backing store, so sync
        owner->signalWrite(nullptr);
        memory->syncCacheFromHost(*this);
      }
      // Remove memory from VA cache
      dev().removeVACache(memory);
    }
    // data check was added for persistent memory that failed to get aperture
    // and therefore are treated like a remote resource
    else if (memory->isPersistentMapped()) {
      // Map/unmap must be serialized
      amd::ScopedLock lock(owner->lockMemoryOps());
      memory->unmap(this);
      // Drop the persistent-map flag once the map count reaches zero
      if (memory->getMapCount() == 0) {
        memory->setPersistentMapFlag(false);
      }
    } else if (memory->mapMemory() != nullptr) {
      // Nothing is copied back unless the saved map info is flagged as an unmap write
      if (writeMapInfo->isUnmapWrite()) {
        // Source offset inside the map memory for the two image paths below
        amd::Coord3D srcOrigin(0, 0, 0);
        // Target is a remote resource, so copy
        assert(memory->mapMemory() != nullptr);
        if (memory->desc().buffer_) {
          // Buffer: the same origin is used in the map memory and in the destination
          if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
                                    writeMapInfo->origin_, writeMapInfo->region_,
                                    writeMapInfo->isEntire())) {
            LogError("submitUnmapMemory() - copy failed");
            vcmd.setStatus(CL_OUT_OF_RESOURCES);
          }
        } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
          // IMAGE1D_BUFFER: copy through a buffer created from the image, with the
          // origin and size scaled by the image element size
          Memory* memoryBuf = memory;
          amd::Coord3D origin(writeMapInfo->origin_[0]);
          amd::Coord3D size(writeMapInfo->region_[0]);
          size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
          origin.c[0] *= elemSize;
          size.c[0] *= elemSize;

          amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
          if (nullptr == bufferFromImage) {
            // The copy below then still targets 'memory' itself
            LogError("We should not fail buffer creation from image_buffer!");
          } else {
            memoryBuf = dev().getGpuMemory(bufferFromImage);
          }
          if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size,
                                    writeMapInfo->isEntire())) {
            LogError("submitUnmapMemory() - copy failed");
            vcmd.setStatus(CL_OUT_OF_RESOURCES);
          }
          if (nullptr != bufferFromImage) {
            bufferFromImage->release();
          }
        } else {
          // Any other image: copy the map memory back into the image region
          if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin,
                                           writeMapInfo->origin_, writeMapInfo->region_,
                                           writeMapInfo->isEntire())) {
            LogError("submitUnmapMemory() - copy failed");
            vcmd.setStatus(CL_OUT_OF_RESOURCES);
          }
        }
      }
    } else {
      LogError("Unhandled unmap!");
      vcmd.setStatus(CL_INVALID_VALUE);
    }

    // Clear unmap flags
    memory->clearUnmapInfo(vcmd.mapPtr());

    profilingEnd(vcmd);
  }
  // Release a view for a mipmap map
  if (unmapMip) {
    // Memory release should be outside of the execution lock,
    // because mapMemory_ isn't marked for a specific GPU
    amdImage->release();
  }
}

// Fills a region of a memory object with a pattern.
//
// type selects the operation: CL_COMMAND_FILL_BUFFER and CL_COMMAND_SVM_MEMFILL use the
// buffer fill, CL_COMMAND_FILL_IMAGE uses the image fill. A CL_COMMAND_FILL_IMAGE on a
// CL_MEM_OBJECT_IMAGE1D_BUFFER object is redirected to the buffer fill through a buffer
// created from the image. Returns true on success; on success the object is also marked
// as written by this device.
bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern,
                            size_t patternSize, const amd::Coord3D& origin,
                            const amd::Coord3D& size, bool forceBlit) {
  pal::Memory* gpuMem = dev().getGpuMemory(amdMemory);
  const bool coversAll = amdMemory->isEntirelyCovered(origin, size);

  // Synchronize memory from host if necessary
  device::Memory::SyncFlags hostSync;
  hostSync.skipEntire_ = coversAll;
  gpuMem->syncCacheFromHost(*this, hostSync);

  // Force fill buffer for IMAGE1D_BUFFER
  amd::Memory* imageAsBuffer = nullptr;
  const bool image1dBufferFill =
      (type == CL_COMMAND_FILL_IMAGE) && (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER);
  if (image1dBufferFill) {
    imageAsBuffer = createBufferFromImage(*amdMemory);
    if (imageAsBuffer != nullptr) {
      type = CL_COMMAND_FILL_BUFFER;
      gpuMem = dev().getGpuMemory(imageAsBuffer);
    } else {
      // The operation stays an image fill on the original object
      LogError("We should not fail buffer creation from image_buffer!");
    }
  }

  // Find the right fill operation
  bool succeeded = false;
  if ((type == CL_COMMAND_FILL_BUFFER) || (type == CL_COMMAND_SVM_MEMFILL)) {
    amd::Coord3D fillOrigin(origin[0]);
    amd::Coord3D fillSize(size[0]);
    // Holds the formatted pattern for the IMAGE1D_BUFFER case until the fill has been issued
    float formattedColor[4];

    // Reprogram fill parameters if it's an IMAGE1D_BUFFER object
    if (imageAsBuffer != nullptr) {
      const size_t elemSize = amdMemory->asImage()->getImageFormat().getElementSize();
      fillOrigin.c[0] *= elemSize;
      fillSize.c[0] *= elemSize;
      memset(formattedColor, 0, sizeof(formattedColor));
      amdMemory->asImage()->getImageFormat().formatColor(pattern, formattedColor);
      pattern = formattedColor;
      patternSize = elemSize;
    }

    succeeded =
        blitMgr().fillBuffer(*gpuMem, pattern, patternSize, fillSize, fillOrigin, fillSize,
                             amdMemory->isEntirelyCovered(origin, size), forceBlit);

    if (imageAsBuffer != nullptr) {
      imageAsBuffer->release();
    }
  } else if (type == CL_COMMAND_FILL_IMAGE) {
    succeeded = blitMgr().fillImage(*gpuMem, pattern, origin, size,
                                    amdMemory->isEntirelyCovered(origin, size));
  } else {
    LogError("Unsupported command type for FillMemory!");
  }

  if (succeeded) {
    // Mark this as the most-recently written cache of the destination
    amdMemory->signalWrite(&gpuDevice_);
    return true;
  }

  LogError("fillMemory failed!");
  return false;
}

// Executes a fill command under the execution lock.
//
// An image fill is forwarded to fillMemory() in a single call. Any other fill is issued
// one row at a time: cmd.size() gives the row width, row count and slice count, while
// cmd.surface() gives the pitch and region used to compute each row's offset. A failing
// row sets CL_INVALID_OPERATION but the remaining rows are still issued.
void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);
  if (cmd.type() != CL_COMMAND_FILL_IMAGE) {
    const size_t rowWidth = cmd.size().c[0];
    const size_t rowCount = cmd.size().c[1];
    const size_t sliceCount = cmd.size().c[2];
    const size_t rowPitch = cmd.surface().c[0];

    // Rectangle describing where every row of the fill starts
    amd::Coord3D rectOrigin = cmd.origin();
    amd::Coord3D rectRegion{cmd.surface().c[1], cmd.surface().c[2], sliceCount};
    amd::BufferRect fillRect;
    fillRect.create(static_cast<size_t*>(rectOrigin), static_cast<size_t*>(rectRegion), rowPitch,
                    0);

    // In case of HMM, use blit kernel instead of CPU memcpy
    constexpr uint32_t kManagedAlloc = (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR);
    const bool useBlitKernel =
        amd::IS_HIP && ((cmd.memory().getMemFlags() & kManagedAlloc) == kManagedAlloc);

    for (size_t z = 0; z != sliceCount; ++z) {
      for (size_t y = 0; y != rowCount; ++y) {
        const size_t rowStart = fillRect.offset(0, y, z);
        const bool rowFilled =
            fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(),
                       amd::Coord3D{rowStart, 0, 0}, amd::Coord3D{rowWidth, 1, 1}, useBlitKernel);
        if (!rowFilled) {
          cmd.setStatus(CL_INVALID_OPERATION);
        }
      }
    }
  } else {
    const bool imageFilled = fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(),
                                        cmd.patternSize(), cmd.origin(), cmd.size());
    if (!imageFilled) {
      cmd.setStatus(CL_INVALID_OPERATION);
    }
  }
  profilingEnd(cmd);
}

// Executes a P2P copy command (buffer or buffer rect) between the command's source and
// destination memory objects.
//
// If this device can resolve a device memory for both objects, the copy is issued directly
// through the blit manager. Otherwise the data is bounced through the device's P2P staging
// memory in two steps (source -> staging, staging -> destination), each step issued on the
// transfer manager/queue of the device that owns the respective side. Image copy types are
// rejected. Any failure sets CL_OUT_OF_RESOURCES on the command.
void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  // Get the device memory objects for the current device
  Memory* srcDevMem = dev().getGpuMemory(&cmd.source());
  Memory* dstDevMem = dev().getGpuMemory(&cmd.destination());

  bool p2pAllowed = true;

  // If any device object is null, then no HW P2P and runtime has to use staging
  // NOTE(review): only one side is patched here; if both lookups returned nullptr,
  // dstDevMem would stay nullptr and be dereferenced below - confirm that cannot happen
  if (srcDevMem == nullptr) {
    // Fall back to the device memory on the first device of the source's context
    srcDevMem = static_cast<pal::Memory*>(
        cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0]));
    p2pAllowed = false;
  } else if (dstDevMem == nullptr) {
    // Fall back to the device memory on the first device of the destination's context
    dstDevMem = static_cast<pal::Memory*>(
        cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0]));
    p2pAllowed = false;
  }

  // Synchronize source and destination memory
  // NOTE(review): syncFlags is initialized here but not referenced again in this function
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = cmd.isEntireMemory();
  amd::Coord3D size = cmd.size();

  bool result = false;
  switch (cmd.type()) {
    case CL_COMMAND_COPY_BUFFER: {
      amd::Coord3D srcOrigin(cmd.srcOrigin()[0]);
      amd::Coord3D dstOrigin(cmd.dstOrigin()[0]);

      if (p2pAllowed) {
        result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size,
                                      cmd.isEntireMemory());
      } else {
        // The staging memory is guarded by its own lock for the whole transfer
        amd::ScopedLock lock(dev().P2PStageOps());
        // View of the staging memory on the source's device: destination of step 1
        Memory* dstStgMem = static_cast<pal::Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
        // View of the staging memory on the destination's device: source of step 2
        Memory* srcStgMem = static_cast<pal::Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));

        // Copy in chunks of at most Device::kP2PStagingSize; the last chunk takes the rest
        size_t copy_size = Device::kP2PStagingSize;
        size_t left_size = size[0];
        amd::Coord3D stageOffset(0);
        result = true;
        do {
          if (left_size <= copy_size) {
            copy_size = left_size;
          }
          left_size -= copy_size;
          amd::Coord3D cpSize(copy_size);

          // Perform 2 step transfer with staging buffer
          result &= srcDevMem->dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin,
                                                          stageOffset, cpSize);
          srcOrigin.c[0] += copy_size;
          result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset,
                                                          dstOrigin, cpSize);
          dstOrigin.c[0] += copy_size;
        } while (left_size > 0);
      }
      break;
    }
    case CL_COMMAND_COPY_BUFFER_RECT: {
      if (p2pAllowed) {
        result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(),
                                          size, cmd.isEntireMemory(), cmd.copyMetadata());
      } else {
        // The staging memory is guarded by its own lock for the whole transfer
        amd::ScopedLock lock(dev().P2PStageOps());
        // View of the staging memory on the source's device: destination of step 1
        Memory* dstStgMem = static_cast<pal::Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
        // View of the staging memory on the destination's device: source of step 2
        Memory* srcStgMem = static_cast<pal::Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));

        // Whole rect fits into the staging memory (judged by source slice pitch * depth):
        // stage it with the source rect layout and then scatter it with the destination rect
        if ((cmd.srcRect().slicePitch_ * size[2]) <= Device::kP2PStagingSize) {
          result = true;
          // Perform 2 step transfer with staging buffer
          result &= srcDevMem->dev().xferMgr().copyBufferRect(*srcDevMem, *dstStgMem, cmd.srcRect(),
                                                              cmd.srcRect(), size, false,
                                                              cmd.copyMetadata());

          result &= dstDevMem->dev().xferMgr().copyBufferRect(*srcStgMem, *dstDevMem, cmd.srcRect(),
                                                              cmd.dstRect(), size, false,
                                                              cmd.copyMetadata());
        } else {
          // Too large for one pass: copy row by row, each row in staging-sized chunks
          size_t srcOffset;
          size_t dstOffset;
          result = true;

          for (size_t z = 0; z < size[2]; ++z) {
            for (size_t y = 0; y < size[1]; ++y) {
              // Linear offsets of the current row in the source and the destination
              srcOffset = cmd.srcRect().offset(0, y, z);
              dstOffset = cmd.dstRect().offset(0, y, z);

              amd::Coord3D srcOrigin(srcOffset);
              amd::Coord3D dstOrigin(dstOffset);
              size_t copy_size = Device::kP2PStagingSize;
              size_t left_size = size[0];
              amd::Coord3D stageOffset(0);
              do {
                if (left_size <= copy_size) {
                  copy_size = left_size;
                }
                left_size -= copy_size;

                // Perform 2 step transfer with staging buffer
                // Each step waits on its transfer queue before the next one is issued
                result &= srcDevMem->partialMemCopyTo(*(srcDevMem->dev().xferQueue()), srcOrigin,
                                                      stageOffset, copy_size, *dstStgMem);
                srcDevMem->dev().xferQueue()->waitAllEngines();

                result &= srcStgMem->partialMemCopyTo(*(dstDevMem->dev().xferQueue()), stageOffset,
                                                      dstOrigin, copy_size, *dstDevMem);
                srcStgMem->dev().xferQueue()->waitAllEngines();

                srcOrigin.c[0] += copy_size;
                dstOrigin.c[0] += copy_size;
              } while (left_size > 0);
            }
          }
        }
      }
      break;
    }
    case CL_COMMAND_COPY_IMAGE:
    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
      // result stays false, so the command is failed below
      LogError("Unsupported P2P type!");
      break;
    default:
      ShouldNotReachHere();
      break;
  }

  if (!result) {
    LogError("submitCopyMemoryP2P failed!");
    cmd.setStatus(CL_OUT_OF_RESOURCES);
  }

  // The destination is marked as written by its device even when the copy failed
  cmd.destination().signalWrite(&dstDevMem->dev());

  profilingEnd(cmd);
}

// Executes an SVM map command.
//
// Nothing is done on a fine-grained system device. Otherwise the map range and flags are
// saved on the device memory and the host-visible copy is brought up to date: through a
// blit into the map memory when one exists, or through a cache write-back of the owner
// when the memory is a direct map of host memory.
void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  // no op for FGS supported device
  const bool needsMapWork = !dev().isFineGrainedSystem();
  if (needsMapWork) {
    // Make sure we have memory for the command execution
    pal::Memory* svmMem = dev().getGpuMemory(vcmd.getSvmMem());

    svmMem->saveMapInfo(vcmd.svmPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(),
                        vcmd.isEntireMemory());

    if (svmMem->mapMemory() == nullptr) {
      const bool directHostMap =
          (svmMem->owner()->getHostMem() != nullptr) && svmMem->isDirectMap();
      if (directHostMap) {
        // Make sure GPU finished operation before
        // synchronization with the backing store
        if (!svmMem->isHostMemDirectAccess()) {
          svmMem->wait(*this);
        }

        // Target is the backing store, so just ensure that owner is up-to-date
        svmMem->owner()->cacheWriteBack(this);
      } else {
        LogError("Unhandled svm map!");
      }
    } else if ((vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) != 0) {
      // Map memory exists and the mapping is a read or write map: copy the region into it
      assert(svmMem->desc().buffer_ && "SVM memory can't be an image");
      const bool copied = blitMgr().copyBuffer(*svmMem, *svmMem->mapMemory(), vcmd.origin(),
                                               vcmd.origin(), vcmd.size(), vcmd.isEntireMemory());
      if (!copied) {
        LogError("submitSVMMapMemory() - copy failed");
        vcmd.setStatus(CL_MAP_FAILURE);
      }
    }
  }

  profilingEnd(vcmd);
}

// Executes an SVM unmap command.
//
// Nothing is done on a fine-grained system device. Otherwise the write-map info saved for
// vcmd.svmPtr() decides whether data has to be pushed back to the device: through a blit
// from the map memory when one exists, or through a cache sync when the memory is a direct
// map of host memory. The saved map info is cleared afterwards. An unmap without saved map
// info is logged and otherwise ignored.
void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);

  // no op for FGS supported device
  if (!dev().isFineGrainedSystem()) {
    pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem());
    const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.svmPtr());

    if (nullptr == writeMapInfo) {
      // Same guard as submitUnmapMemory(): writeMapInfo() can report that no map info
      // exists, so it must not be dereferenced. The copy and the clear are skipped, but
      // execution falls through so that profilingEnd() still pairs with profilingBegin().
      LogError("Unmap without map call");
    } else {
      if (memory->mapMemory() != nullptr) {
        if (writeMapInfo->isUnmapWrite()) {
          // Target is a remote resource, so copy
          assert(memory->desc().buffer_ && "SVM memory can't be an image");
          if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
                                    writeMapInfo->origin_, writeMapInfo->region_,
                                    writeMapInfo->isEntire())) {
            LogError("submitSvmUnmapMemory() - copy failed");
            vcmd.setStatus(CL_OUT_OF_RESOURCES);
          }
        }
      } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) {
        if (writeMapInfo->isUnmapWrite()) {
          // Target is the backing store, so sync
          memory->owner()->signalWrite(nullptr);
          memory->syncCacheFromHost(*this);
        }
      }
      // Clear unmap flags
      memory->clearUnmapInfo(vcmd.svmPtr());
    }
  }

  profilingEnd(vcmd);
}

// Executes an SVM fill command.
//
// On a fine-grained system device the fill is done directly on CPU memory. Otherwise the
// SVM memory object containing vcmd.dst() is looked up and patternSize() * times() bytes
// are filled through fillMemory(), starting at the offset of vcmd.dst() inside that object.
// The command is failed with CL_INVALID_OPERATION if no memory object is found or the
// fill fails.
void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  if (!dev().isFineGrainedSystem()) {
    const size_t patternSize = vcmd.patternSize();
    const size_t fillSize = patternSize * vcmd.times();
    amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(vcmd.dst());
    assert(dstMemory && "No svm Buffer to fill with!");

    if (dstMemory == nullptr) {
      // The assert above is compiled out in release builds; fail the command instead of
      // dereferencing a null memory object
      LogError("submitSvmFillMemory() - no SVM buffer found for the destination pointer");
      vcmd.setStatus(CL_INVALID_OPERATION);
    } else {
      // Offset of the destination pointer from the start of the SVM allocation
      const size_t offset = reinterpret_cast<uintptr_t>(vcmd.dst()) -
                            reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());

      amd::Coord3D origin(offset, 0, 0);
      amd::Coord3D size(fillSize, 1, 1);

      assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!");

      if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(), patternSize, origin, size)) {
        vcmd.setStatus(CL_INVALID_OPERATION);
      }
    }
  } else {
    // for FGS capable device, fill CPU memory directly
    amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times());
  }

  profilingEnd(vcmd);
}

// Executes a migrate command for every memory object listed in the command.
//
// With CL_MIGRATE_MEM_OBJECT_HOST set the object is written back through
// mgpuCacheWriteBack(); otherwise, with CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED set, it is
// synchronized from the host. Any other flag combination only produces a warning.
void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  for (const auto& memObj : vcmd.memObjects()) {
    // Find device memory
    pal::Memory* gpuMem = dev().getGpuMemory(memObj);
    const auto flags = vcmd.migrationFlags();

    const bool toHost = (flags & CL_MIGRATE_MEM_OBJECT_HOST) != 0;
    const bool contentUndefined = (flags & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) != 0;

    if (toHost) {
      gpuMem->mgpuCacheWriteBack(*this);
      continue;
    }

    if (!contentUndefined) {
      LogWarning("Unknown operation for memory migration!");
      continue;
    }

    // Synchronize memory from host if necessary.
    // The sync function will perform memory migration from
    // another device if necessary
    device::Memory::SyncFlags hostSync;
    gpuMem->syncCacheFromHost(*this, hostSync);
  }

  profilingEnd(vcmd);
}

// Executes an SVM free command.
//
// Without a user free callback every pointer in the command is released through the
// device's svmFree(). With a callback, the callback is invoked once with the queue, the
// pointer count, the pointer array and the user data; the runtime frees nothing itself.
void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) {
  // in-order semantics: previous commands need to be done before we start
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);
  std::vector<void*>& svmPointers = vcmd.svmPointers();
  if (vcmd.pfnFreeFunc() == nullptr) {
    // pointers allocated using clSVMAlloc
    for (auto& svmPtr : svmPointers) {
      dev().svmFree(svmPtr);
    }
  } else {
    // data() is well defined for an empty vector, unlike taking the address of element 0
    vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), svmPointers.size(),
                       svmPointers.data(), vcmd.userData());
  }
  profilingEnd(vcmd);
}

// Executes a stream wait-value or write-value command on the command's memory object.
//
// ROCCLR_COMMAND_STREAM_WAIT_VALUE is issued through the kernel blit manager's
// streamOpsWait() and ROCCLR_COMMAND_STREAM_WRITE_VALUE through streamOpsWrite(). A failed
// operation is only logged; no status is set on the command.
void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(cmd);

  const cl_command_type type = cmd.type();
  const uint64_t value = cmd.value();
  const uint64_t mask = cmd.mask();
  const unsigned int flags = cmd.flags();
  const size_t sizeBytes = cmd.sizeBytes();
  const size_t offset = cmd.offset();

  amd::Memory* amdMemory = &cmd.memory();
  Memory* memory = dev().getGpuMemory(amdMemory);

  if (type == ROCCLR_COMMAND_STREAM_WAIT_VALUE) {
    // Use a blit kernel to perform the wait operation
    // mask is applied on value before performing
    // the comparison defined by 'condition'
    bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWait(*memory, value, offset,
                                                                           sizeBytes, flags, mask);
    // The 64-bit values are printed as unsigned long long and the 32-bit flags with a plain
    // %x, so every conversion matches its argument type on both LP64 and LLP64 targets
    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
            "Waiting for value: 0x%llx."
            " Flags: 0x%x mask: 0x%llx",
            static_cast<unsigned long long>(value), flags,
            static_cast<unsigned long long>(mask));
    if (!result) {
      LogError("submitStreamOperation: Wait failed!");
    }
  } else if (type == ROCCLR_COMMAND_STREAM_WRITE_VALUE) {
    bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWrite(*memory, value, offset,
                                                                            sizeBytes);
    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%llx",
            static_cast<unsigned long long>(value));
    if (!result) {
      LogError("submitStreamOperation: Write failed!");
    }
  } else {
    ShouldNotReachHere();
  }
  profilingEnd(cmd);
}

// ================================================================================================
// Maps physical memory into, or unmaps it from, a virtual address range.
//
// vcmd.ptr() must belong to a memory object flagged CL_MEM_VA_RANGE_AMD, otherwise the
// command is a no-op. A non-null vcmd.memory() requests a map: a sub object is created for
// the range, the pages are remapped onto the physical memory and, on success, the sub
// object is registered in MemObjMap and cross-linked with the physical object. A null
// vcmd.memory() requests an unmap: the queue is idled, the pages are remapped to no
// physical memory and, on success, the registration and the cross-links are removed.
void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);
  amd::Memory* phys_mem_obj = vcmd.memory();
  amd::Memory* vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
  if (vaddr_base_obj == nullptr || !(vaddr_base_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
    profilingEnd(vcmd);
    return;
  }

  // Create a view, since original base obj will map the whole memory and multimap cases wont work.
  amd::Memory* vaddr_sub_obj = nullptr;
  size_t vaddr_offset = 0;
  if (phys_mem_obj != nullptr) {
    constexpr bool kParent = false;
    vaddr_sub_obj = phys_mem_obj->getContext().devices()[0]->CreateVirtualBuffer(
        phys_mem_obj->getContext(), const_cast<void*>(vcmd.ptr()), vcmd.size(),
        phys_mem_obj->getUserData().deviceId, phys_mem_obj->getUserData().locationType, kParent);
    if (vaddr_sub_obj == nullptr) {
      // Without the view there is nothing to remap or register; fail the command instead
      // of dereferencing a null object below
      LogError("submitVirtualMap() - failed to create a view for the virtual address range");
      vcmd.setStatus(CL_OUT_OF_RESOURCES);
      profilingEnd(vcmd);
      return;
    }

    // Calculate the offset from the original pointer.
    vaddr_offset = (reinterpret_cast<address>(vaddr_sub_obj->getSvmPtr()) -
                    reinterpret_cast<address>(vaddr_base_obj->getSvmPtr()));
  }

  // The imem() in the backend is shared between base and sub/view object.
  pal::Memory* vaddr_pal_mem = dev().getGpuMemory(vaddr_base_obj);
  Pal::IGpuMemory* phymem_igpu_mem =
      (phys_mem_obj == nullptr) ? nullptr : dev().getGpuMemory(phys_mem_obj)->iMem();

  Pal::VirtualMemoryRemapRange range{vaddr_pal_mem->iMem(), vaddr_offset,
                                     phymem_igpu_mem,       0,
                                     vcmd.size(),           Pal::VirtualGpuMemAccessMode::NoAccess};

  // Wait for previous operations before unmap
  if (phys_mem_obj == nullptr) {
    // @note: Need to verify if compute requires a wait or IB flush is enough
    WaitForIdleCompute();
    WaitForIdleSdma();
  }

  eventBegin(MainEngine);
  auto result = queue(MainEngine).iQueue_->RemapVirtualMemoryPages(1, &range, false, nullptr);
  // Capture GPU event for the paging operation
  GpuEvent event;
  eventEnd(MainEngine, event);
  setGpuEvent(event);
  if (result == Pal::Result::Success) {
    if (phys_mem_obj != nullptr) {
      // assert the vaddr_mem_obj wasn't mapped already
      assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr);
      amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_sub_obj);
      vaddr_sub_obj->getUserData().phys_mem_obj = phys_mem_obj;
      phys_mem_obj->getUserData().vaddr_mem_obj = vaddr_sub_obj;
    } else {
      // assert the vaddr_mem_obj is mapped and needs to be removed
      amd::Memory* mapped_sub_obj = amd::MemObjMap::FindMemObj(vcmd.ptr());
      assert(mapped_sub_obj != nullptr);
      assert(vcmd.ptr() == mapped_sub_obj->getSvmPtr());

      amd::MemObjMap::RemoveMemObj(vcmd.ptr());
      // The asserts are compiled out in release builds, so guard the dereference as well
      if ((mapped_sub_obj != nullptr) &&
          (mapped_sub_obj->getUserData().phys_mem_obj != nullptr)) {
        mapped_sub_obj->getUserData().phys_mem_obj->getUserData().vaddr_mem_obj = nullptr;
        mapped_sub_obj->getUserData().phys_mem_obj = nullptr;
      }
    }
  } else {
    // The bookkeeping above is skipped on failure; report it rather than staying silent
    LogError("submitVirtualMap() - RemapVirtualMemoryPages failed");
  }
  profilingEnd(vcmd);
}

// ================================================================================================
// Debug helper: prints the occupied AQL wrap slots of the device queue to stdout.
//
// Walks the wrap slots stored after the queue header in gpuDefQueue's virtual queue and,
// for every slot with a non-zero state, dumps the wrap fields, its wait events, the
// dispatch dimensions and the kernel arguments of the child kernel found in hsaKernel's
// program. At most GPU_PRINT_CHILD_KERNEL slots are printed.
void VirtualGPU::PrintChildren(const pal::Kernel& hsaKernel, VirtualGPU* gpuDefQueue) {
  // The wrap array starts right behind the queue header
  AmdAqlWrap* wraps = reinterpret_cast<AmdAqlWrap*>(
      &reinterpret_cast<AmdVQueueHeader*>(gpuDefQueue->virtualQueue_->data())[1]);
  // Number of slots printed so far
  uint p = 0;
  for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
    if (wraps[i].state != 0) {
      if (p == GPU_PRINT_CHILD_KERNEL) {
        break;
      }
      p++;
      std::stringstream print;
      print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase);
      print << "Slot#: " << i << "\n";
      print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n";
      print << "\tcommand_id: " << wraps[i].command_id << "\n";
      print << "\tchild_counter: " << wraps[i].child_counter << "\n";
      print << "\tcompletion: " << wraps[i].completion << "\n";
      print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n";
      print << "\twait_list: " << wraps[i].wait_list << "\n";
      print << "\twait_num: " << wraps[i].wait_num << "\n";
      // Addresses stored in the queue are translated into offsets from vmAddress() and then
      // applied to data() to read the same location from the host
      uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress();
      size_t* events = reinterpret_cast<size_t*>(gpuDefQueue->virtualQueue_->data() + offsEvents);
      for (uint j = 0; j < wraps[i].wait_num; ++j) {
        uint offs = static_cast<uint64_t>(events[j]) - gpuDefQueue->virtualQueue_->vmAddress();
        AmdEvent* eventD =
            reinterpret_cast<AmdEvent*>(gpuDefQueue->virtualQueue_->data() + offs);
        print << "Wait Event#: " << j << "\n";
        print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n";
      }
      print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
      print << wraps[i].aql.workgroup_size_y << ", ";
      print << wraps[i].aql.workgroup_size_z << "]\n";
      print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
      print << wraps[i].aql.grid_size_y << ", ";
      print << wraps[i].aql.grid_size_z << "]\n";

      // Find the kernel of the program whose AQL code matches the slot's kernel object
      pal::Kernel* child = nullptr;
      for (auto it = hsaKernel.prog().kernels().begin(); it != hsaKernel.prog().kernels().end();
           ++it) {
        if (wraps[i].aql.kernel_object == static_cast<pal::Kernel*>(it->second)->gpuAqlCode()) {
          child = static_cast<pal::Kernel*>(it->second);
        }
      }
      if (child == nullptr) {
        // The text collected for this slot so far is dropped
        printf("Error: couldn't find child kernel!\n");
        continue;
      }
      const uint64_t kernarg_address =
          static_cast<uint64_t>(reinterpret_cast<uintptr_t>(wraps[i].aql.kernarg_address));
      uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress();
      address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
      print << "Kernel: " << child->name() << "\n";
      const amd::KernelSignature& signature = child->signature();

      // Check if runtime has to setup hidden arguments
      // Iterate by reference: the descriptors are only read, so no per-argument copy is needed
      for (const auto& it : signature.parameters()) {
        const char* extraArgName = nullptr;
        switch (it.info_.oclObject_) {
          case amd::KernelParameterDescriptor::HiddenNone:
            // void* zero = 0;
            // WriteAqlArgAt(const_cast<address>(parameters), zero, it.size_, it.offset_);
            break;
          case amd::KernelParameterDescriptor::HiddenGlobalOffsetX:
            extraArgName = "Offset0: ";
            break;
          case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
            extraArgName = "Offset1: ";
            break;
          case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
            extraArgName = "Offset2: ";
            break;
          case amd::KernelParameterDescriptor::HiddenPrintfBuffer:
            extraArgName = "PrintfBuf: ";
            break;
          case amd::KernelParameterDescriptor::HiddenDefaultQueue:
            extraArgName = "VqueuePtr: ";
            break;
          case amd::KernelParameterDescriptor::HiddenCompletionAction:
            extraArgName = "AqlWrap: ";
            break;
          default:
            break;
        }
        if (extraArgName) {
          // Named hidden arguments are printed as one size_t value
          print << "\t" << extraArgName << *reinterpret_cast<size_t*>(argum);
          print << "\n";
          argum += sizeof(size_t);
          continue;
        }
        // Everything else is dumped byte by byte, highest byte first
        print << "\t" << it.name_ << ": ";
        for (int s = it.size_ - 1; s >= 0; --s) {
          print.width(2);
          print.fill('0');
          print << static_cast<uint32_t>(argum[s]);
        }
        // NOTE(review): the cursor advances by offset_ rather than size_ - confirm this is
        // the intended stride for walking the argument buffer
        argum += it.offset_;
        print << "\n";
      }
      printf("%s", print.str().c_str());
    }
  }
}

// ================================================================================================
// Prepares the device queue before a kernel that enqueues child kernels is dispatched.
//
// Selects the queue that will run the children: the context's default device queue when
// the useDeviceQueue_ setting is on (it must not share the HW ring with this queue), or
// this queue with a freshly created virtual queue otherwise. The selected queue and its
// virtual queue address are returned through gpuDefQueue and vmDefQueue. Returns false
// if there is no default device queue or the HW rings collide.
bool VirtualGPU::PreDeviceEnqueue(const amd::Kernel& kernel, const pal::Kernel& hsaKernel,
                                  VirtualGPU** gpuDefQueue, uint64_t* vmDefQueue) {
  amd::DeviceQueue* deviceQueue = kernel.program().context().defDeviceQueue(dev());
  if (deviceQueue == nullptr) {
    LogError("Default device queue wasn't allocated");
    return false;
  }

  if (!dev().settings().useDeviceQueue_) {
    // No dedicated device queue: the children run on this queue
    createVirtualQueue(deviceQueue->size());
    *gpuDefQueue = this;
  } else {
    *gpuDefQueue = static_cast<VirtualGPU*>(deviceQueue->vDev());
    if ((*gpuDefQueue)->hwRing() == hwRing()) {
      LogError("Can't submit the child kernels to the same HW ring as the host queue!");
      return false;
    }
  }

  VirtualGPU* const childQueue = *gpuDefQueue;
  *vmDefQueue = childQueue->virtualQueue_->vmAddress();

  childQueue->writeVQueueHeader(*this, hsaKernel.prog().kernelTable());

  // Acquire USWC memory for the scheduler parameters
  childQueue->schedParams_ = &xferWrite().Acquire(sizeof(SchedulerParam));

  // Add memory handles before the actual dispatch
  addVmMemory(childQueue->virtualQueue_);
  addVmMemory(childQueue->schedParams_);

  return true;
}

// ================================================================================================
//! Submits the device-enqueue scheduler after the parent kernel was dispatched and
//! adds the termination handshake for the parent AQL wrap.
//!
//! \param kernel        Parent kernel; used to look up the context's default device queue
//! \param hsaKernel     Device kernel object of the parent kernel
//! \param gpuDefQueue   VirtualGPU selected by PreDeviceEnqueue() to run the scheduler
//! \param vmDefQueue    VM address of the virtual queue (not used in this function)
//! \param vmParentWrap  VM address of the parent AmdAqlWrap structure
//! \param gpuEvent      [in,out] GPU event of the dispatch; updated by the submissions below
void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const pal::Kernel& hsaKernel,
                                   VirtualGPU* gpuDefQueue, uint64_t vmDefQueue,
                                   uint64_t vmParentWrap, GpuEvent* gpuEvent) {
  uint32_t id = gpuEvent->id_;
  amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());

  // Make sure of exclusive access to the device queue.
  // The guard must be a named object: an unnamed temporary would be destroyed at the
  // end of the statement and the lock would not be held for the rest of the function.
  amd::ScopedLock deviceQueueLock(defQueue->lock());

  //! @note The scheduler parameters buffer was already acquired in PreDeviceEnqueue()
  //! and is released at the end of this function, so no extra Acquire() happens here.

  if (GPU_PRINT_CHILD_KERNEL != 0) {
    waitForEvent(gpuEvent);
    PrintChildren(hsaKernel, gpuDefQueue);
  }

  if (!dev().settings().useDeviceQueue_) {
    // Add the termination handshake to the host queue
    eventBegin(MainEngine);
    iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
                                     vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0,
                                     dev().settings().useDeviceQueue_);
    eventEnd(MainEngine, *gpuEvent);
  }

  // Get the global loop start before the scheduler
  Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
  static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr())
      .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
                    gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
  gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, BarrierType::FlushL2);

  // Get the address of the PM4 template and write it to the scheduler parameters
  //! @note DMA flush must not occur between patch and the scheduler
  Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
  // Program parameters for the scheduler
  SchedulerParam* param = reinterpret_cast<SchedulerParam*>(gpuDefQueue->schedParams_->data());
  param->signal = 1;
  // Scale clock to 1024 to avoid 64 bit div in the scheduler
  param->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
  param->hw_queue = patchStart + sizeof(uint32_t) /* Rewind packet*/;
  param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
  param->releaseHostCP = 0;
  param->parentAQL = vmParentWrap;
  param->dedicatedQueue = dev().settings().useDeviceQueue_;

  // Fill the scratch buffer information
  if (hsaKernel.prog().maxScratchRegs() > 0) {
    pal::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_;
    param->scratchSize = scratchBuf->size();
    param->scratch = scratchBuf->vmAddress();
    param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
    param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
    addVmMemory(scratchBuf);
  } else {
    param->numMaxWaves = 0;
    param->scratchSize = 0;
    param->scratch = 0;
    param->scratchOffset = 0;
  }

  // Add all kernels in the program to the mem list.
  //! \note Runtime doesn't know which one will be called
  hsaKernel.prog().fillResListWithKernels(*this);

  Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress();
  gpuDefQueue->eventBegin(MainEngine);
  gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
      signalAddr, loopStart,
      gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
  // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
  // Thus TS command for profiling has to follow in the next CB.
  constexpr bool ForceSubmitFirst = true;
  gpuDefQueue->eventEnd(MainEngine, *gpuEvent, ForceSubmitFirst);

  if (dev().settings().useDeviceQueue_) {
    // Add the termination handshake to the host queue
    eventBegin(MainEngine);
    iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
                                     vmParentWrap + offsetof(AmdAqlWrap, child_counter), signalAddr,
                                     dev().settings().useDeviceQueue_);
    // Sanity check: the event is expected to still carry the ID captured on entry
    if (id != gpuEvent->id_) {
      LogError("Something is wrong. ID mismatch!\n");
    }
    eventEnd(MainEngine, *gpuEvent);
  }

  // Return the scheduler parameters buffer acquired in PreDeviceEnqueue()
  xferWrite().Release(*gpuDefQueue->schedParams_);
  gpuDefQueue->schedParams_ = nullptr;
}

// ================================================================================================
//! Submits an ND-range kernel command. Cooperative-group launches go through a
//! dedicated path that initializes GWS first; all other launches are dispatched
//! directly under the execution lock. A failed dispatch marks the command with
//! CL_INVALID_OPERATION.
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
  if (!vcmd.cooperativeGroups()) {
    // Regular launch: VirtualGPU needs exclusive access to the resources
    amd::ScopedLock lock(execution());

    profilingBegin(vcmd);

    // Submit kernel to HW
    const bool dispatched =
        submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false,
                             vcmd.sharedMemBytes(), vcmd.getAnyOrderLaunchFlag());
    if (!dispatched) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }

    profilingEnd(vcmd);
    return;
  }

  // Cooperative launch: count the workgroups over all dimensions with a local size
  uint32_t numGroups = 1;
  for (uint dim = 0; dim < vcmd.sizes().dimensions(); dim++) {
    if (vcmd.sizes().local()[dim] == 0) {
      continue;
    }
    numGroups *= (vcmd.sizes().global()[dim] / vcmd.sizes().local()[dim]);
  }

  // The launch currently always runs on this queue; the transfer queue is the alternative
  constexpr bool kUseCurrentQueue = true;
  VirtualGPU* coopQueue = kUseCurrentQueue ? this : dev().xferQueue();

  // Wait for the execution on the current queue, since the coop groups will use the device queue
  waitAllEngines();

  amd::ScopedLock lock(coopQueue->blitMgr().lockXfer());

  coopQueue->profilingBegin(vcmd);

  static_cast<KernelBlitManager&>(coopQueue->blitMgr()).RunGwsInit(numGroups);
  coopQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue);

  // Submit kernel to HW
  const bool dispatched = coopQueue->submitKernelInternal(
      vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, vcmd.sharedMemBytes());
  if (!dispatched) {
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  coopQueue->profilingEnd(vcmd);

  // Wait for the execution on the device queue. Keep the current queue in-order
  coopQueue->waitAllEngines();
}

// ================================================================================================
//! Builds and submits an AQL dispatch for the kernel on the main engine.
//!
//! \param sizes           ND-range (global/local sizes) of the launch
//! \param kernel          Abstraction-layer kernel object
//! \param parameters      Pointer to the kernel arguments
//! \param nativeMem       Forwarded to processMemObjectsHSA()
//! \param sharedMemBytes  Extra LDS bytes added to the size reported by processMemObjectsHSA()
//! \param anyOrder        Stored into state_.anyOrder_ for this submission
//! \return false if printf setup/readback, device-enqueue setup, memory object processing,
//!         kernel argument loading or the scratch size validation fails; true otherwise
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
                                      const_address parameters, bool nativeMem,
                                      uint32_t sharedMemBytes, bool anyOrder) {
  state_.anyOrder_ = anyOrder;

  // Get the HSA kernel object
  const pal::Kernel& hsaKernel = static_cast<const pal::Kernel&>(*(kernel.getDeviceKernel(dev())));

  // If RGP capturing is enabled, then start SQTT trace
  if (rgpCaptureEna()) {
    size_t newLocalSize[3] = {1, 1, 1};
    size_t newGlobalSize[3] = {0, 0, 0};
    for (uint i = 0; i < sizes.dimensions(); i++) {
      newGlobalSize[i] = sizes.global()[i];
      if (sizes.local()[i] != 0) {
        newLocalSize[i] = sizes.local()[i];
      }
    }
    dev().captureMgr()->PreDispatch(
        this, hsaKernel,
        // Report global size in workgroups, since that's the RGP trace semantics
        newGlobalSize[0] / newLocalSize[0], newGlobalSize[1] / newLocalSize[1],
        newGlobalSize[2] / newLocalSize[2]);
  }

  bool printfEnabled = hsaKernel.printfInfo().size() > 0;
  if (printfEnabled && !printfDbgHSA().init(*this, printfEnabled)) {
    LogError("Printf debug buffer initialization failed!");
    return false;
  }

  uint64_t vmDefQueue = 0;
  VirtualGPU* gpuDefQueue = nullptr;
  if (hsaKernel.dynamicParallelism()) {
    // Initialize GPU device queue for execution (gpuDefQueue)
    if (!PreDeviceEnqueue(kernel, hsaKernel, &gpuDefQueue, &vmDefQueue)) {
      return false;
    }
  }
  size_t ldsSize;

  ClPrint(amd::LOG_INFO, amd::LOG_KERN, "!\tkernel : %s\n", hsaKernel.name().c_str());

  if (PAL_EMBED_KERNEL_MD) {
    char buf[256];
    // Kernel names can be arbitrarily long, so the write must be bounded by the
    // buffer size; an over-long comment is truncated instead of overrunning buf.
    snprintf(buf, sizeof(buf), "kernel: %s\n private mem size: %x\n group mem size: %x\n",
             hsaKernel.name().c_str(), static_cast<unsigned int>(hsaKernel.spillSegSize()),
             static_cast<unsigned int>(hsaKernel.ldsSize()));
    iCmd()->CmdCommentString(buf);
  }

  bool imageBufferWrtBack = false;         // Image buffer write back is required
  std::vector<Image*> wrtBackImageBuffer;  // Array of images for write back

  // Check memory dependency and SVM objects
  if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize, imageBufferWrtBack,
                            wrtBackImageBuffer)) {
    LogError("Wrong memory objects!");
    return false;
  }

  // Add ISA memory object to the resource tracking list
  AddKernel(kernel);

  GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
  uint32_t id = gpuEvent.id_;
  uint64_t vmParentWrap = 0;
  uint32_t aql_index = 0;
  // Program the kernel arguments for the GPU execution
  hsa_kernel_dispatch_packet_t* aqlPkt =
      hsaKernel.loadArguments(*this, kernel, sizes, parameters, ldsSize + sharedMemBytes,
                              vmDefQueue, &vmParentWrap, &aql_index);
  assert((nullptr != aqlPkt) && "Couldn't load kernel arguments");
  // The assert is compiled out in release builds; never hand a null packet to the dispatch
  if (nullptr == aqlPkt) {
    LogError("Couldn't load kernel arguments");
    return false;
  }

  // Dynamic call stack size is considered to calculate private segment size and scratch regs
  // in pal::Kernel::postLoad(). As it is not called during hipModuleLaunchKernel unlike
  // hipLaunchKernel/hipLaunchKernelGGL, Updated value is passed to dispatch packet.
  size_t privateMemSize = hsaKernel.spillSegSize();
  if ((hsaKernel.workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
    privateMemSize = std::max<uint32_t>(static_cast<uint32_t>(device().StackSize()),
                                        hsaKernel.workGroupInfo()->scratchRegs_ * sizeof(uint32_t));
    // Reject the launch if privateMemSize is more than the max allowed.
    size_t maxStackSize = device().MaxStackSize();
    if (privateMemSize > maxStackSize) {
      ClPrint(amd::LOG_INFO, amd::LOG_KERN,
              "Scratch size (%zu) exceeds max allowed (%zu) for kernel : %s", privateMemSize,
              maxStackSize, hsaKernel.name().c_str());
      LogError("Scratch size exceeds max allowed.");
      return false;
    }
  }

  // Set up the dispatch information
  Pal::DispatchAqlParams dispatchParam = {};
  dispatchParam.pAqlPacket = aqlPkt;
  if (privateMemSize > 0) {
    const Device::ScratchBuffer* scratch = dev().scratch(hwRing());
    dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
    dispatchParam.scratchSize = scratch->size_;
    dispatchParam.scratchOffset = scratch->offset_;
    dispatchParam.workitemPrivateSegmentSize = privateMemSize;
  }
  dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlKd();
  dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
  dispatchParam.wavesPerSh = 0;
  dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
  dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
  dispatchParam.aqlPacketIndex = aql_index;
  // Run AQL dispatch in HW
  eventBegin(MainEngine);
  iCmd()->CmdDispatchAql(dispatchParam);

  // Sanity check: the event is expected to still carry the ID captured before the dispatch
  if (id != gpuEvent.id_) {
    LogError("Something is wrong. ID mismatch!\n");
  }
  eventEnd(MainEngine, gpuEvent);
  AqlPacketUpdateTs(aql_index, gpuEvent);

  // Execute scheduler for device enqueue
  if (hsaKernel.dynamicParallelism()) {
    PostDeviceEnqueue(kernel, hsaKernel, gpuDefQueue, vmDefQueue, vmParentWrap, &gpuEvent);
  }

  // Update the global GPU event
  constexpr bool kNeedFLush = false;
  setGpuEvent(gpuEvent, kNeedFLush);

  if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
    LogError("Couldn't read printf data from the buffer!\n");
    return false;
  }

  // Check if image buffer write back is required
  if (imageBufferWrtBack) {
    // Make sure the original kernel execution is done
    addBarrier(RgpSqqtBarrierReason::MemDependency);
    for (const auto imageBuffer : wrtBackImageBuffer) {
      Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
      amd::Image* image = imageBuffer->owner()->asImage();
      amd::Coord3D offs(0);
      // Copy memory from the backing store image into the original buffer.
      // A failed copy is reported, but it doesn't fail the already submitted dispatch.
      if (!blitMgr().copyImageToBuffer(*imageBuffer->CopyImageBuffer(), *buffer, offs, offs,
                                       image->getRegion(), true, image->getRowPitch(),
                                       image->getSlicePitch())) {
        LogError("Image buffer write back failed!");
      }
    }
  }

  // Perform post dispatch logic for RGP traces
  if (rgpCaptureEna()) {
    dev().captureMgr()->PostDispatch(this);
  }

  return true;
}

// ================================================================================================
//! Native function commands are not supported: the call only takes the execution
//! lock and reports the path as unimplemented.
void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) {
  // Serialize against other users of this VirtualGPU
  amd::ScopedLock guard(execution());

  //! @todo: Unimplemented
  Unimplemented();
}

// ================================================================================================
//! Processes a marker command. With a waiting event, outstanding command batches are
//! drained in order until the batch containing that event has completed. Without one,
//! HIP builds submit an empty GPU event so the marker gets GPU based timing.
void VirtualGPU::submitMarker(amd::Marker& vcmd) {
  //!@note runtime doesn't need to lock this command on execution

  if (vcmd.waitingEvent() == nullptr) {
    if (amd::IS_HIP) {
      // Use GPU based timing for HIP events

      // Make sure VirtualGPU has an exclusive access to the resources
      amd::ScopedLock lock(execution());
      GpuEvent event;
      profilingBegin(vcmd);
      eventBegin(MainEngine);
      eventEnd(MainEngine, event);
      setGpuEvent(event);
      profilingEnd(vcmd);
    }
    return;
  }

  // Drain outstanding batches, oldest first, and stop once the event was found
  bool located = false;
  while (!located && !cbQueue_.empty()) {
    CommandBatch* const batch = cbQueue_.front();
    // Wait for completion
    located = awaitCompletion(batch, vcmd.waitingEvent());
    // Recycle the batch object and drop it from the outstanding list
    freeCbQueue_.push(batch);
    cbQueue_.pop();
  }

  // Not found in any outstanding batch: the event should be in the current
  // command batch, so force a wait on the next flush
  if (!located) {
    state_.forceWait_ = true;
  }
}

// ================================================================================================
//! Accumulate commands require no work in this implementation; the body is
//! intentionally empty.
void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {}

// ================================================================================================
//! Signals or waits on an external queue semaphore using the main engine queue.
//! A signal request flushes the main engine first; any other request is treated
//! as a wait. Failures are only logged.
void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {
  // The PAL calls take a mutable semaphore pointer, while the command exposes a const one
  Pal::IQueueSemaphore* const semaphore = const_cast<Pal::IQueueSemaphore*>(
      reinterpret_cast<const Pal::IQueueSemaphore*>(cmd.sem_ptr()));

  if (cmd.semaphoreCmd() != amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE) {
    const Pal::Result waitResult =
        queues_[MainEngine]->iQueue_->WaitQueueSemaphore(semaphore, cmd.fence());
    if (waitResult != Pal::Result::Success) {
      LogError("Failed to wait on external semaphore");
    }
    return;
  }

  // Submit pending main engine work before the signal
  flushDMA(MainEngine);
  const Pal::Result signalResult =
      queues_[MainEngine]->iQueue_->SignalQueueSemaphore(semaphore, cmd.fence());
  if (signalResult != Pal::Result::Success) {
    LogError("Failed to signal external semaphore");
  }
}

//! Removes the memory reference from the command tracking of the main engine queue
//! and, unless SDMA is disabled in the device settings, of the SDMA queue as well.
void VirtualGPU::releaseMemory(GpuMemoryReference* mem) {
  queues_[MainEngine]->removeCmdMemRef(mem);

  if (dev().settings().disableSdma_) {
    return;
  }
  queues_[SdmaEngine]->removeCmdMemRef(mem);
}

//! Creates the device performance counters on first use and begins or ends the
//! associated PAL perf experiment(s), depending on the command state.
//! Allocation failures and unsupported states set CL_INVALID_OPERATION on the command.
void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  // Bind by reference: the list is only read, so there is no need to copy it
  const amd::PerfCounterCommand::PerfCounterList& counters = vcmd.getCounters();

  PalCounterReference* palRef = PalCounterReference::Create(*this);
  if (palRef == nullptr) {
    LogError("We failed to allocate memory for the GPU perfcounter");
    vcmd.setStatus(CL_INVALID_OPERATION);
    return;
  }

  bool newExperiment = false;

  // First pass: create a device counter for every counter that doesn't have one yet
  for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
    amd::PerfCounter* amdCounter = static_cast<amd::PerfCounter*>(counters[i]);
    const PerfCounter* counter = static_cast<const PerfCounter*>(amdCounter->getDeviceCounter());

    // Make sure we have a valid gpu performance counter
    if (nullptr == counter) {
      amd::PerfCounter::Properties prop = amdCounter->properties();
      PerfCounter* gpuCounter = new PerfCounter(
          gpuDevice_, palRef, prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX],
          prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX], prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]);
      if (nullptr == gpuCounter) {
        LogError("We failed to allocate memory for the GPU perfcounter");
        // Drop the reference obtained from Create(), the same way the regular path
        // does after the loop, so the early exit doesn't leave it behind
        palRef->release();
        vcmd.setStatus(CL_INVALID_OPERATION);
        return;
      } else if (gpuCounter->create()) {
        newExperiment = true;
      } else {
        // All three values need a conversion specifier, otherwise the arguments
        // are printed under the wrong labels
        LogPrintfError(
            "We failed to allocate a perfcounter in PAL. "
            "Block: %d, counter: %d, event: %d",
            gpuCounter->info()->blockIndex_, gpuCounter->info()->counterIndex_,
            gpuCounter->info()->eventIndex_);
      }
      amdCounter->setDeviceCounter(gpuCounter);
    }
  }

  if (newExperiment) {
    palRef->finalize();
  }

  palRef->release();

  // Second pass: issue one begin/end per distinct PAL perf experiment.
  // Consecutive counters that share an experiment are handled once.
  Pal::IPerfExperiment* palPerf = nullptr;
  for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
    amd::PerfCounter* amdCounter = static_cast<amd::PerfCounter*>(counters[i]);
    const PerfCounter* counter = static_cast<const PerfCounter*>(amdCounter->getDeviceCounter());

    if (palPerf != counter->iPerf()) {
      palPerf = counter->iPerf();
      // Find the state and sends the command to PAL
      if (vcmd.getState() == amd::PerfCounterCommand::Begin) {
        state_.perfCounterEnabled_ = true;
        GpuEvent event;
        eventBegin(MainEngine);
        iCmd()->CmdBeginPerfExperiment(palPerf);
        eventEnd(MainEngine, event);
        setGpuEvent(event);
      } else if (vcmd.getState() == amd::PerfCounterCommand::End) {
        GpuEvent event;
        eventBegin(MainEngine);
        iCmd()->CmdEndPerfExperiment(palPerf);
        eventEnd(MainEngine, event);
        setGpuEvent(event);
        state_.perfCounterEnabled_ = false;
      } else {
        LogError("Unsupported performance counter state");
        vcmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
    }
  }
}

//! Creates the device thread trace object for the command's thread trace, if it
//! doesn't exist yet, binding the memory objects supplied with the command.
//! Any failure sets CL_INVALID_OPERATION on the command. Every path runs
//! profilingEnd(), so a timestamp taken by profilingBegin() is always processed.
void VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  switch (cmd.type()) {
    case CL_COMMAND_THREAD_TRACE_MEM: {
      amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace();
      ThreadTrace* threadTrace = static_cast<ThreadTrace*>(amdThreadTrace->getDeviceThreadTrace());

      // Nothing to do if the device object was already created
      if (threadTrace != nullptr) {
        break;
      }

      PalThreadTraceReference* palRef = PalThreadTraceReference::Create(*this);
      if (palRef == nullptr) {
        LogError("Failure in memory allocation for the GPU threadtrace");
        cmd.setStatus(CL_INVALID_OPERATION);
        break;
      }

      size_t numSe = amdThreadTrace->deviceSeNumThreadTrace();

      ThreadTrace* gpuThreadTrace = new ThreadTrace(gpuDevice_, palRef, cmd.getMemList(), numSe);
      if (nullptr == gpuThreadTrace) {
        LogError("Failure in memory allocation for the GPU threadtrace");
        palRef->release();
        cmd.setStatus(CL_INVALID_OPERATION);
        break;
      }

      if (gpuThreadTrace->create()) {
        amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace);
        palRef->finalize();
      } else {
        LogError("Failure in memory allocation for the GPU threadtrace");
        delete gpuThreadTrace;
        cmd.setStatus(CL_INVALID_OPERATION);
      }

      // Drop the reference obtained from Create() on success and on failure alike.
      // NOTE(review): assumes ThreadTrace keeps its own reference to palRef for as long
      // as it lives, which the release right after a successful create() implies.
      palRef->release();
      break;
    }
    default:
      LogError("Unsupported command type for ThreadTraceMemObjects!");
      break;
  }

  profilingEnd(cmd);
}

//! Begins or ends the PAL perf experiment of an existing device thread trace object.
//! Pause and Resume are accepted but do nothing. If the device object doesn't exist,
//! no GPU work is submitted. Every path runs profilingEnd(), so a timestamp taken by
//! profilingBegin() is always processed.
void VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  switch (cmd.type()) {
    case CL_COMMAND_THREAD_TRACE: {
      amd::ThreadTrace* amdThreadTrace = static_cast<amd::ThreadTrace*>(&cmd.getThreadTrace());
      ThreadTrace* threadTrace = static_cast<ThreadTrace*>(amdThreadTrace->getDeviceThreadTrace());

      // gpu thread trace object had to be generated prior to begin/end/pause/resume due
      // to ThreadTraceMemObjectsCommand execution
      if (threadTrace == nullptr) {
        break;
      }

      Pal::IPerfExperiment* palPerf = threadTrace->iPerf();
      if (cmd.getState() == amd::ThreadTraceCommand::Begin) {
        iCmd()->CmdBeginPerfExperiment(palPerf);
      } else if (cmd.getState() == amd::ThreadTraceCommand::End) {
        GpuEvent event;
        eventBegin(MainEngine);
        iCmd()->CmdEndPerfExperiment(palPerf);
        threadTrace->populateUserMemory();
        eventEnd(MainEngine, event);
        setGpuEvent(event);
      } else if (cmd.getState() == amd::ThreadTraceCommand::Pause) {
        // There's no Pause from the PerfExperiment interface
      } else if (cmd.getState() == amd::ThreadTraceCommand::Resume) {
        // There's no Resume from the PerfExperiment interface
      }
      break;
    }
    default:
      LogError("Unsupported command type for ThreadTrace!");
      break;
  }

  profilingEnd(cmd);
}

//! Acquires interop objects for use by the runtime: for every memory object in the
//! command, the interop object copies data from the original resource to its shared copy.
void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
  // Serialize against other users of this VirtualGPU
  amd::ScopedLock guard(execution());

  profilingBegin(vcmd);

  for (const auto& memObj : vcmd.getMemList()) {
    // amd::Memory object should never be nullptr
    assert(memObj && "Memory object for interop is nullptr");

    // The device memory lookup is kept although its result isn't used here
    // NOTE(review): presumably the lookup is wanted for a side effect - confirm
    static_cast<void>(dev().getGpuMemory(memObj));

    // If resource is a shared copy of original resource, then
    // runtime needs to copy data from original resource
    memObj->getInteropObj()->copyOrigToShared();
  }

  profilingEnd(vcmd);
}

//! Releases interop objects back to their owner: for every memory object in the
//! command, the interop object copies data from its shared copy to the original resource.
void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
  // Serialize against other users of this VirtualGPU
  amd::ScopedLock guard(execution());

  profilingBegin(vcmd);

  for (const auto& memObj : vcmd.getMemList()) {
    // amd::Memory object should never be nullptr
    assert(memObj && "Memory object for interop is nullptr");

    // The device memory lookup is kept although its result isn't used here
    // NOTE(review): presumably the lookup is wanted for a side effect - confirm
    static_cast<void>(dev().getGpuMemory(memObj));

    // If resource is a shared copy of original resource, then
    // runtime needs to copy data back to original resource
    memObj->getInteropObj()->copySharedToOrig();
  }

  profilingEnd(vcmd);
}

//! Handles bus addressable memory signals: a wait-signal command makes the main engine
//! wait on the memory marker, a write-signal command updates the marker on the engine
//! recorded in the memory object's GPU event. Other command types submit nothing.
void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
  amd::ScopedLock guard(execution());
  profilingBegin(vcmd);

  pal::Memory* signalMem = dev().getGpuMemory(&vcmd.memory());
  GpuEvent gpuEvent;
  const uint32_t markerValue = vcmd.markerValue();

  if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) {
    eventBegin(MainEngine);
    addVmMemory(signalMem);

    // Wait until the marker is greater or equal to the requested value
    iCmd()->CmdWaitBusAddressableMemoryMarker(*(signalMem->iMem()), markerValue, 0xFFFFFFFF,
                                              Pal::CompareFunc::GreaterEqual);
    eventEnd(MainEngine, gpuEvent);
  } else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) {
    // Temporarily switch to the engine recorded in the memory object's GPU event
    const EngineType savedEngine = engineID_;
    engineID_ = static_cast<EngineType>(signalMem->getGpuEvent(*this)->engineId_);

    // Make sure GPU finished operation and data reached memory before the marker write
    addBarrier(RgpSqqtBarrierReason::SignalSubmit, BarrierType::FlushL2);

    // Workarounds: We had systems where an extra delay was necessary.
    // Flush CB associated with the DGMA buffer
    isDone(signalMem->getGpuEvent(*this));

    eventBegin(engineID_);
    queues_[engineID_]->addCmdMemRef(signalMem->memRef());

#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION < 396)
    queues_[engineID_]->iCmd()->CmdUpdateBusAddressableMemoryMarker(*(signalMem->iMem()),
                                                                    markerValue);
#else
    queues_[engineID_]->iCmd()->CmdUpdateBusAddressableMemoryMarker(
        *(signalMem->iMem()), vcmd.markerOffset(), markerValue);
#endif
    eventEnd(engineID_, gpuEvent);

    // Restore the original engine
    engineID_ = savedEngine;
  }

  // Update the global GPU event
  setGpuEvent(gpuEvent);

  profilingEnd(vcmd);
}

//! Initializes the command's memory objects as bus addressable GPU memory and reports
//! the surface and marker bus addresses back through the command.
void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd) {
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);

  // The list is only read, so bind it by reference instead of copying it
  const std::vector<amd::Memory*>& memObjects = vcmd.memObjects();
  const uint32_t numObjects = static_cast<uint32_t>(memObjects.size());

  // Vectors own the temporary arrays, so they are freed on return. The entries are
  // value-initialized: every GpuMemoryRef field that isn't set below starts as zero.
  std::vector<Pal::GpuMemoryRef> gpuMemRefs(numObjects);
  std::vector<Pal::IGpuMemory*> gpuMems(numObjects);

  for (uint i = 0; i < numObjects; i++) {
    pal::Memory* pGpuMemory = dev().getGpuMemory(memObjects[i]);
    pGpuMemory->syncCacheFromHost(*this);

    gpuMemRefs[i].pGpuMemory = pGpuMemory->iMem();
    gpuMems[i] = pGpuMemory->iMem();
  }

  // The memory references are added for the duration of the initialization only
  dev().iDev()->AddGpuMemoryReferences(numObjects, gpuMemRefs.data(),
                                       queues_[MainEngine]->iQueue_, Pal::GpuMemoryRefCantTrim);
  {
    amd::ScopedLock l(queues_[MainEngine]->lock_);
    dev().iDev()->InitBusAddressableGpuMemory(queues_[MainEngine]->iQueue_, numObjects,
                                              gpuMems.data());
  }

  if (numObjects != 0) {
    dev().iDev()->RemoveGpuMemoryReferences(numObjects, gpuMems.data(),
                                            queues_[MainEngine]->iQueue_);
  }

  // Report the bus addresses to the caller
  for (uint i = 0; i < numObjects; i++) {
    vcmd.busAddress()[i].surface_bus_address = gpuMems[i]->Desc().surfaceBusAddr;
    vcmd.busAddress()[i].marker_bus_address = gpuMems[i]->Desc().markerBusAddr;
  }
  profilingEnd(vcmd);
}


//! Waits for a command batch to finish on the GPU, then marks every command in the
//! batch as complete and releases it.
//!
//! \param cb            Command batch to wait for
//! \param waitingEvent  Event the caller is looking for
//! \return true if \a waitingEvent was one of the commands in the batch. With profiling
//!         enabled, the result of profilingCollectResults() is returned instead.
bool VirtualGPU::awaitCompletion(CommandBatch* cb, const amd::Event* waitingEvent) {
  amd::Command* cmd = cb->head_;

  // With profiling enabled the batch is processed by the profiling path
  if (state_.profileEnabled_) {
    return profilingCollectResults(cb, waitingEvent);
  }

  // An empty batch has nothing to wait for
  if (cmd == nullptr) {
    return false;
  }
  // Mark the first command in the batch as running
  cmd->setStatus(CL_RUNNING);

  // Wait for the last known GPU event
  waitEventLock(cb);

  bool matched = false;
  for (amd::Command* next = nullptr; cmd != nullptr; cmd = next) {
    // Fetch the successor first, since the command is released at the end of the step
    next = cmd->getNext();

    if (cmd->status() == CL_SUBMITTED) {
      cmd->setStatus(CL_RUNNING);
      cmd->setStatus(CL_COMPLETE);
    } else if (cmd->status() == CL_RUNNING) {
      cmd->setStatus(CL_COMPLETE);
    } else if ((cmd->status() != CL_COMPLETE) && (next != nullptr)) {
      LogPrintfError("Unexpected command status - %d!", cmd->status());
    }

    // Check if it's a waiting command
    if (cmd == waitingEvent) {
      matched = true;
    }

    cmd->release();
  }

  return matched;
}

//! Flushes all engines, queues the given command list as a new command batch and
//! retires outstanding batches.
//!
//! \param list  Head of the command list that forms the new batch; may be nullptr
//! \param wait  If true, wait for every outstanding batch; otherwise only batches
//!              that already finished are retired
void VirtualGPU::flush(amd::Command* list, bool wait) {
  // Find out if any engine has a valid GPU event for the current batch
  bool hasGpuWork = false;
  for (uint engine = 0; engine < AllEngines; ++engine) {
    if (events_[engine].isValid()) {
      hasGpuWork = true;
    }
  }

  // If the batch doesn't have any GPU command and the list is empty
  if (cbQueue_.empty() && !hasGpuWork) {
    state_.forceWait_ = true;
  }

  // Wrap the command list into a batch, reusing a retired batch object if available
  CommandBatch* batch = nullptr;
  if (list != nullptr) {
    CommandBatch* const recycled = freeCbQueue_.empty() ? nullptr : freeCbQueue_.front();
    if (recycled != nullptr) {
      freeCbQueue_.pop();
      recycled->init(list, events_, lastTS_);
      batch = recycled;
    } else {
      batch = new CommandBatch(list, events_, lastTS_);
    }
  }

  {
    //! @todo: Check if really need a lock
    amd::ScopedLock guard(execution());
    for (uint engine = 0; engine < AllEngines; ++engine) {
      flushDMA(engine);
      // Reset event so we won't try to wait again,
      // if runtime didn't submit any commands
      //! @note: it's safe to invalidate events, since
      //! we already saved them with the batch creation step above
      events_[engine].invalidate();
    }
  }

  // Mark last TS as nullptr, so runtime won't process empty batches with the old TS
  lastTS_ = nullptr;
  if (batch != nullptr) {
    cbQueue_.push(batch);
  }

  wait |= state_.forceWait_;
  // Retire outstanding batches in order, oldest first
  while (!cbQueue_.empty()) {
    CommandBatch* const pending = cbQueue_.front();

    // Check if command batch finished without a wait (every engine is queried)
    bool finished = true;
    for (uint engine = 0; engine < AllEngines; ++engine) {
      finished &= isDone(&pending->events_[engine]);
    }

    // Early exit at the first batch that is still running, unless a wait is requested
    if (!finished && !wait) {
      break;
    }

    // Wait for completion
    awaitCompletion(pending);
    // Recycle the batch object and drop it from the outstanding list
    freeCbQueue_.push(pending);
    cbQueue_.pop();
  }
  state_.forceWait_ = false;
}

//! Forwards the request to enable synchronization to the blit manager.
void VirtualGPU::enableSyncedBlit() const {
  blitMgr_->enableSynchronization();
}

//! Records the event as the latest GPU event of the currently active engine.
//!
//! \param gpuEvent  Event to store
//! \param flush     If true, the DMA buffer of the active engine is flushed as well
void VirtualGPU::setGpuEvent(GpuEvent gpuEvent, bool flush) {
  events_[engineID_] = gpuEvent;

  // Nothing else to do unless a flush was requested
  if (!flush) {
    return;
  }
  flushDMA(engineID_);
}

//! Flushes the DMA buffer of the given engine by querying the state of its last event.
void VirtualGPU::flushDMA(uint engineID) {
  //!@todo Keep memory dependency alive even if we flush DMA on the main engine,
  //! since only L2 cache is flushed in KMD frame,
  //! but L1 still has to be invalidated.
  //! For that reason memoryDependency().clear() is not called here.

  isDone(&events_[engineID]);
}

//! Waits until all engines finished the events of a command batch.
//!
//! \param cb  Command batch to wait for; nullptr selects the current events of the queue
//! \return true if every engine was already done before the wait started
bool VirtualGPU::waitAllEngines(CommandBatch* cb) {
  // Pick the events of the batch, or the current ones if no batch was given
  GpuEvent* batchEvents = nullptr;
  if (cb != nullptr) {
    batchEvents = cb->events_;
  } else {
    batchEvents = events_;
  }

  // First pass: flush all engines and/or check if engines are idle already.
  // Every engine is queried, even after one reported that it's still busy.
  bool alreadyIdle = true;
  for (uint engine = 0; engine < AllEngines; ++engine) {
    alreadyIdle &= isDone(&batchEvents[engine]);
  }

  // Release all pinned memory
  releasePinnedMem();

  // Second pass: wait for all engines
  for (uint engine = 0; engine < AllEngines; ++engine) {
    waitForEvent(&batchEvents[engine]);
  }

  return alreadyIdle;
}

//! Waits for a command batch under the execution lock, trims the resource cache and
//! recalibrates the offset between the GPU and CPU timelines if the batch has a timestamp.
void VirtualGPU::waitEventLock(CommandBatch* cb) {
  bool idleBeforeWait = false;
  {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock guard(execution());
    idleBeforeWait = waitAllEngines(cb);
  }
  // Take the CPU time right after the wait, in case readjustTimeGPU_ needs to be updated
  const uint64_t cpuTimeAfterWait = amd::Os::timeNanos();

  // Free resource cache if we have too many entries
  //! \note we do it here, when all engines are idle,
  // because Vista/Win7 idles GPU on a resource destruction
  static const size_t kMinCacheEntries = 4096;
  dev().resourceCache().free(kMinCacheEntries);

  // Without a timestamp for the last command there is nothing to calibrate against
  if (cb->lastTS_ == nullptr) {
    return;
  }

  // If the engines were idle already, then CPU didn't wait for GPU.
  // Thus the sync point between CPU and GPU is unclear and runtime
  // keeps the older adjustment value to maintain the same timeline.
  //! \note Workaround for APU(s).
  //! GPU-CPU timelines may go off too much, thus calibration is always
  //! forced with the last batch in the list, or if no adjustment exists yet
  const bool keepOldAdjustment =
      idleBeforeWait && (cbQueue_.size() > 1) && (readjustTimeGPU_ != 0);
  if (keepOldAdjustment) {
    return;
  }

  // Get the timestamp value of the last command in the batch
  uint64_t gpuStart = 0;
  uint64_t gpuEnd = 0;
  cb->lastTS_->value(&gpuStart, &gpuEnd);

  // Adjust the base time by the execution time
  readjustTimeGPU_ = gpuEnd - cpuTimeAfterWait;
}

bool VirtualGPU::allocConstantBuffers() {
  // Allocate constant buffers.
  // Use double size, reported to the app to account for internal arguments
  const uint32_t MinCbSize = 2 * dev().info().maxParameterSize_;
  uint i;

  // Create/reallocate constant buffer resources
  for (i = 0; i < MaxConstBuffersArguments; ++i) {
    ConstantBuffer* constBuf = new ConstantBuffer(managedBuffer_, MinCbSize);

    if ((constBuf != nullptr) && constBuf->Create()) {
      addConstBuffer(constBuf);
    } else {
      // We failed to create a constant buffer
      delete constBuf;
      return false;
    }
  }

  return true;
}

//! Starts profiling of a command: if profiling is enabled for it, a timestamp object
//! is allocated from the cache and attached to the command's data.
void VirtualGPU::profilingBegin(amd::Command& command) {
  // Nothing to do if profiling isn't enabled for the command
  if (!command.profilingInfo().enabled_) {
    return;
  }

  // Allocate a timestamp object from the cache; give up silently on failure
  TimeStamp* stamp = tsCache_->allocTimeStamp();
  if (stamp == nullptr) {
    return;
  }

  // Save the TimeStamp object in the current OCL event
  command.data().emplace_back(stamp);
  profileTs_ = stamp;
  state_.profileEnabled_ = true;
}

//! Finishes profiling of \p command: a valid timestamp becomes lastTS_,
//! an invalid one is returned to the cache and detached from the command.
void VirtualGPU::profilingEnd(amd::Command& command) {
  // The TimeStamp object, if any, is the last entry of the command's data
  if (command.data().empty()) {
    return;
  }
  TimeStamp* stamp = reinterpret_cast<TimeStamp*>(command.data().back());
  if (stamp == nullptr) {
    return;
  }

  // A valid timestamp means the command actually did a GPU submission
  if (stamp->isValid()) {
    lastTS_ = stamp;
    return;
  }

  // No GPU work was recorded, so give the TimeStamp object back to the cache
  tsCache_->freeTimeStamp(stamp);
  command.data().clear();
}

//! Walks all commands of the batch \p cb, converts their GPU timestamps into
//! the CPU timeline (by subtracting readjustTimeGPU_), updates the command
//! statuses and releases every command.
//!
//! \param cb            Command batch to process
//! \param waitingEvent  Event the caller is interested in
//! \return true if \p waitingEvent was one of the commands in the batch
bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent) {
  bool found = false;

  // Nothing to collect for an empty command list
  if (cb->head_ == nullptr) {
    return found;
  }

  // Wait for the last known GPU events on all engines
  waitEventLock(cb);

  // The CPU time right after the wait is the default base time of the batch
  uint64_t endTimeStamp = amd::Os::timeNanos();
  uint64_t startTimeStamp = endTimeStamp;

  // First pass: locate the first command that carries a timestamp.
  //! \note The batch may have empty markers at the beginning.
  //! So the start/end of the empty commands is equal to
  //! the start of the first valid command in the batch.
  for (amd::Command* cmd = cb->head_; cmd != nullptr; cmd = cmd->getNext()) {
    if (cmd->data().empty()) {
      continue;
    }
    TimeStamp* stamp = reinterpret_cast<TimeStamp*>(cmd->data().back());
    if (stamp == nullptr) {
      continue;
    }

    stamp->value(&startTimeStamp, &endTimeStamp);
    startTimeStamp -= readjustTimeGPU_;
    // Both values start from the beginning of the first valid command
    endTimeStamp = startTimeStamp;
    break;
  }

  // Second pass: construct the time line and retire the commands
  amd::Command* cmd = cb->head_;
  while (cmd != nullptr) {
    // Fetch the successor up front, because the command is released below
    amd::Command* const next = cmd->getNext();

    TimeStamp* stamp = nullptr;
    if (!cmd->data().empty()) {
      stamp = reinterpret_cast<TimeStamp*>(cmd->data().back());
    }

    if (stamp == nullptr) {
      // An empty command starts and ends at the end of the last valid command
      startTimeStamp = endTimeStamp;
    } else {
      stamp->value(&startTimeStamp, &endTimeStamp);
      endTimeStamp -= readjustTimeGPU_;
      startTimeStamp -= readjustTimeGPU_;
      // The TimeStamp object isn't needed anymore
      tsCache_->freeTimeStamp(stamp);
      cmd->data().clear();
    }

    // Move the command to the completed state with the proper timestamps
    if (cmd->status() == CL_SUBMITTED) {
      cmd->setStatus(CL_RUNNING, startTimeStamp);
      cmd->setStatus(CL_COMPLETE, endTimeStamp);
    } else if (cmd->status() == CL_RUNNING) {
      cmd->setStatus(CL_COMPLETE, endTimeStamp);
    } else if ((cmd->status() != CL_COMPLETE) && (next != nullptr)) {
      LogPrintfError("Unexpected command status - %d!", cmd->status());
    }

    // Is this the event the caller waits for?
    if (cmd == waitingEvent) {
      found = true;
    }

    cmd->release();
    cmd = next;
  }

  return found;
}

//! Forwards a DOPP reference for \p memory to the queue of the main engine.
void VirtualGPU::addDoppRef(const Memory* memory, bool lastDoppCmd, bool pfpaDoppCmd) {
  auto& mainQueue = queues_[MainEngine];
  mainQueue->addCmdDoppRef(memory->iMem(), lastDoppCmd, pfpaDoppCmd);
}

//! Marks the begin (\p type == true) or the end (\p type == false) on the
//! active profiling timestamp. Does nothing if no timestamp is active.
//! \note \p engine isn't used by this implementation.
void VirtualGPU::profileEvent(EngineType engine, bool type) const {
  if (profileTs_ != nullptr) {
    if (!type) {
      profileTs_->end();
    } else {
      profileTs_->begin();
    }
  }
}

//! Validates and prepares every memory object referenced by a kernel dispatch.
//!
//! The method processes, in this order: SVM pointers passed through the exec info
//! (non-argument list), host cache synchronization of the kernel memory objects,
//! every kernel parameter (local pointers, buffers/images, by-reference structures,
//! samplers and device queues), the program's global stores, the code segment and
//! the scratch buffer. Resources are validated against the memory dependency tracker
//! and added to the VM memory list of the queue. The argument buffer \p params is
//! patched in place (through const_cast) with LDS offsets, SRD/CB1 addresses and the
//! virtual queue address.
//!
//! \param kernel               Kernel to be dispatched
//! \param params               Argument buffer of the kernel, patched in place
//! \param nativeMem            If true, the memory object array in \p params holds device
//!                             memory objects and the host cache synchronization is skipped
//! \param ldsAddress           Set to the kernel's LDS size and then advanced by the aligned
//!                             size of every local pointer argument
//! \param imageBufferWrtBack   Set to true if at least one writable image buffer with
//!                             a backing store image is used (never reset to false here)
//! \param wrtBackImageBuffer   Receives the image buffers that need a write back
//!
//! \return false if an SVM pointer can't be used on this device, a memory object fails
//!         the access validation or has no device object, a by-reference argument has no
//!         patch location, the virtual queue can't be created, or the required LDS size
//!         exceeds the local memory size of the device. True otherwise.
bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params,
                                      bool nativeMem, size_t& ldsAddress, bool& imageBufferWrtBack,
                                      std::vector<Image*>& wrtBackImageBuffer) {
  const amd::KernelParameters& kernelParams = kernel.parameters();

  // Mark the tracker with a new kernel,
  // so we can avoid checks of the aliased objects
  memoryDependency().newKernel();

  size_t count = kernelParams.getNumberOfSvmPtr();
  if (count > 0) {
    bool supportFineGrainedSystem = dev().isFineGrainedSystem(true);
    FGSStatus status = kernelParams.getSvmSystemPointersSupport();
    switch (status) {
      case FGS_YES:
        if (!supportFineGrainedSystem) {
          return false;
        }
        break;
      case FGS_NO:
        supportFineGrainedSystem = false;
        break;
      case FGS_DEFAULT:
      default:
        break;
    }
    // Get SVM non-argument information
    void* const* svmPtrArray =
        reinterpret_cast<void* const*>(params + kernelParams.getExecInfoOffset());
    for (size_t i = 0; i < count; i++) {
      amd::Memory* memory = amd::MemObjMap::FindMemObj(svmPtrArray[i]);
      if (nullptr == memory) {
        if (!supportFineGrainedSystem) {
          return false;
        } else {
          addBarrier(RgpSqqtBarrierReason::MemDependency);
          // Clear memory dependency state
          const static bool All = true;
          memoryDependency().clear(!All);
          continue;
        }
      } else {
        // Validate Mem Access in case of VMM Memory
        if (!memory->ValidateMemAccess(dev(), true)) {
          return false;
        }

        Memory* gpuMemory = dev().getGpuMemory(memory);
        if (nullptr != gpuMemory) {
          // Synchronize data with other memory instances if necessary
          gpuMemory->syncCacheFromHost(*this);

          const static bool IsReadOnly = false;
          // Validate SVM passed in the non argument list
          memoryDependency().validate(*this, gpuMemory, IsReadOnly);

          // Wait for resource if it was used on an inactive engine
          //! \note syncCache may call DRM transfer
          constexpr bool WaitOnBusyEngine = true;
          gpuMemory->wait(*this, WaitOnBusyEngine);

          // Mark signal write for cache coherency,
          // since this object isn't a part of kernel arg setup
          if ((memory->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
            memory->signalWrite(&dev());
          }
          addVmMemory(gpuMemory);
        } else {
          return false;
        }
      }
    }
  }

  bool srdResource = false;
  amd::Memory* const* memories =
      reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());
  const pal::Kernel& hsaKernel = static_cast<const pal::Kernel&>(*(kernel.getDeviceKernel(dev())));
  const amd::KernelSignature& signature = kernel.signature();
  ldsAddress = hsaKernel.ldsSize();

  if (!nativeMem) {
    // Process cache coherency first, since the extra transfers may affect
    // other mem dependency tracking logic: TS and signalWrite()
    for (uint i = 0; i < signature.numMemories(); ++i) {
      amd::Memory* mem = memories[i];
      if (mem != nullptr) {
        // Synchronize data with other memory instances if necessary.
        // A missing device object is skipped here, the same way the parameter
        // loop below skips it, instead of dereferencing a null pointer
        Memory* syncMem = dev().getGpuMemory(mem);
        if (syncMem != nullptr) {
          syncMem->syncCacheFromHost(*this);
        }
      }
    }
  }

  // Check all parameters for the current kernel
  for (size_t i = 0; i < signature.numParameters(); ++i) {
    const amd::KernelParameterDescriptor& desc = signature.at(i);
    const amd::KernelParameterDescriptor::InfoData& info = desc.info_;

    // Find if current argument is a buffer
    if (desc.type_ == T_POINTER) {
      // If it is a local pointer
      if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
        // For local pointers arrayIndex_ is used as the alignment of the LDS offset
        ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
        if (desc.size_ == 8) {
          // Save the original LDS size
          uint64_t ldsSize = *reinterpret_cast<const uint64_t*>(params + desc.offset_);
          // Patch the LDS address in the original arguments with an LDS address(offset)
          WriteAqlArgAt(const_cast<address>(params), ldsAddress, desc.size_, desc.offset_);
          // Add the original size
          ldsAddress += ldsSize;
        } else {
          // Save the original LDS size
          uint32_t ldsSize = *reinterpret_cast<const uint32_t*>(params + desc.offset_);
          // Patch the LDS address in the original arguments with an LDS address(offset)
          uint32_t ldsAddr = ldsAddress;
          WriteAqlArgAt(const_cast<address>(params), ldsAddr, desc.size_, desc.offset_);
          // Add the original size
          ldsAddress += ldsSize;
        }
      } else {
        Memory* gpuMem = nullptr;
        amd::Memory* mem = nullptr;
        uint32_t index = info.arrayIndex_;
        if (nativeMem) {
          // The array holds device memory objects directly
          gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
          if (nullptr != gpuMem) {
            mem = gpuMem->owner();
          }
        } else {
          mem = memories[index];
          if (mem != nullptr) {
            gpuMem = dev().getGpuMemory(mem);
          }
        }
        //! This condition is for SVM fine-grain
        if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
          addBarrier(RgpSqqtBarrierReason::MemDependency);
          // Clear memory dependency state
          const static bool All = true;
          memoryDependency().clear(!All);
          continue;
        } else if (gpuMem != nullptr) {
          // Validate memory for a dependency in the queue
          memoryDependency().validate(*this, gpuMem, info.readOnly_);
          // Wait for resource if it was used on an inactive engine
          //! \note syncCache may call DRM transfer
          constexpr bool WaitOnBusyEngine = true;
          gpuMem->wait(*this, WaitOnBusyEngine);

          addVmMemory(gpuMem);
          logVmMemory(desc.name_, gpuMem);

          //! Check if compiler expects read/write.
          //! Note: SVM with subbuffers has an issue with tracking.
          //! Conformance can send read only subbuffer, but update the region
          //! in the kernel.
          if ((mem != nullptr) && ((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) ||
                                   ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) {
            mem->signalWrite(&dev());
          }
          if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
            if (gpuMem->memoryType() == Resource::ImageBuffer) {
              Image* imageBuffer = static_cast<Image*>(gpuMem);
              // Check if synchronization has to be performed
              if (imageBuffer->CopyImageBuffer() != nullptr) {
                Memory* buffer = dev().getGpuMemory(mem->parent());
                amd::Image* image = mem->asImage();
                amd::Coord3D offs(0);
                // Copy memory from the original image buffer into the backing store image
                bool result = blitMgr().copyBufferToImage(
                    *buffer, *imageBuffer->CopyImageBuffer(), offs, offs, image->getRegion(), true,
                    image->getRowPitch(), image->getSlicePitch());
                if (!result) {
                  // Keep the original best-effort behavior, but don't hide the failure
                  LogError("Copy from image buffer into the backing store image failed!");
                }
                // Make sure the copy operation is done
                addBarrier(RgpSqqtBarrierReason::MemDependency);
                // Use backing store SRD as the replacement
                uint64_t srd = imageBuffer->CopyImageBuffer()->hwSrd();
                WriteAqlArgAt(const_cast<address>(params), srd, sizeof(srd), desc.offset_);
                // Add backing store image to the list of memory handles
                addVmMemory(imageBuffer->CopyImageBuffer());
                // If it's not a read only resource, then runtime has to write back
                if (!info.readOnly_) {
                  wrtBackImageBuffer.push_back(imageBuffer);
                  imageBufferWrtBack = true;
                }
              }
            }

            //! \note Special case for the image views.
            //! Copy SRD to CB1, so blit manager will be able to release
            //! this view without a wait for SRD resource.
            if (gpuMem->memoryType() == Resource::ImageView) {
              // Copy the current image SRD into CB1
              uint64_t srd = cb(1)->UploadDataToHw(gpuMem->hwState(), HsaImageObjectSize);
              // Then use a pointer in aqlArgBuffer to CB1
              // Patch the GPU VA address in the original arguments
              WriteAqlArgAt(const_cast<address>(params), srd, sizeof(srd), desc.offset_);
              addVmMemory(cb(1)->ActiveMemory());
            } else {
              srdResource = true;
            }
            if (gpuMem->desc().isDoppTexture_) {
              addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(),
                         kernel.parameters().getExecPfpaVcop());
            }
          }
        }
      }
    } else if (desc.type_ == T_VOID) {
      if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
        // Find the location in aqlArgBuffer that receives the pointer to CB1.
        // Dereferencing a failed lookup would be undefined behavior, so fail the dispatch
        const auto& patchMap = hsaKernel.patch();
        const auto it = patchMap.find(desc.offset_);
        if (it == patchMap.end()) {
          LogError("Missing patch location for a reference argument!");
          return false;
        }
        // Copy the current structure into CB1
        size_t gpuPtr =
            static_cast<size_t>(cb(1)->UploadDataToHw(params + desc.offset_, desc.size_));
        // Patch the GPU VA address in the original arguments
        WriteAqlArgAt(const_cast<address>(params), gpuPtr, sizeof(size_t), it->second);
        addVmMemory(cb(1)->ActiveMemory());
      }
    } else if (desc.type_ == T_SAMPLER) {
      srdResource = true;
    } else if (desc.type_ == T_QUEUE) {
      uint32_t index = desc.info_.arrayIndex_;
      const amd::DeviceQueue* queue =
          reinterpret_cast<amd::DeviceQueue* const*>(params + kernelParams.queueObjOffset())[index];
      VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
      uint64_t vmQueue;
      if (dev().settings().useDeviceQueue_) {
        vmQueue = gpuQueue->vQueue()->vmAddress();
      } else {
        if (!createVirtualQueue(queue->size())) {
          LogError("Virtual queue creation failed!");
          return false;
        }
        vmQueue = vQueue()->vmAddress();
      }
      // Patch the GPU VA address in the original arguments
      WriteAqlArgAt(const_cast<address>(params), vmQueue, sizeof(vmQueue), desc.offset_);
      //! \note No 'break' here: it would leave the parameter loop and skip
      //! the processing of all arguments declared after the queue
    }
  }

  if (ldsAddress > dev().info().localMemSize_) {
    LogError("No local memory available\n");
    return false;
  }

  if (srdResource || hsaKernel.prog().isStaticSampler()) {
    dev().srds().fillResourceList(*this);
  }

  const static bool IsReadOnly = false;
  for (const pal::Memory* mem : hsaKernel.prog().globalStores()) {
    // Validate global store for a dependency in the queue
    memoryDependency().validate(*this, mem, IsReadOnly);
    addVmMemory(mem);
  }
  if (hsaKernel.prog().hasGlobalStores()) {
    // Validate code object for a dependency in the queue
    memoryDependency().validate(*this, &hsaKernel.prog().codeSegGpu(), IsReadOnly);
  }

  addVmMemory(&hsaKernel.prog().codeSegGpu());

  if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
    const Device::ScratchBuffer* scratch = dev().scratch(hwRing());
    // Validate scratch buffer to force sync mode, because
    // the current scratch logic is optimized for size and performance
    // Note: runtime can skip sync if the same kernel is used,
    // since the number of scratch regs remains the same
    if (!IsSameKernel(kernel)) {
      memoryDependency().validate(*this, scratch->memObj_, IsReadOnly);
    }
    addVmMemory(scratch->memObj_);
    logVmMemory("scratch", scratch->memObj_);
  }

  // Synchronize dispatches unconditionally in case memory tracking is disabled
  memoryDependency().sync(*this);

  return true;
}

//! Updates the kernel table address in the virtual queue header and writes
//! the header to the beginning of the virtual queue through \p hostQ.
//! A null \p kernelTable clears the address; otherwise the table is also
//! added to the VM memory list.
void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, const Memory* kernelTable) {
  const bool hasTable = (kernelTable != nullptr);

  if (hasTable) {
    vqHeader_->kernel_table = kernelTable->vmAddress();
    addVmMemory(kernelTable);
  } else {
    vqHeader_->kernel_table = 0;
  }

  // Upload the whole header at offset 0 of the virtual queue
  virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, true);
}

//! Checks if the VA range of \p src touches the tracked SDMA range (sdmaRange_)
//! and updates the tracked range with the VA range of \p dst.
//!
//! \return true if \p src overlaps the tracked range; the tracked range is then
//!         replaced with the range of \p dst. Otherwise false, and the tracked
//!         range is extended to include the range of \p dst.
bool VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst) {
  const auto srcBegin = src.vmAddress();
  const uint64_t srcEnd = src.vmAddress() + src.vmSize();

  // Source begins inside the tracked range
  const bool beginsInside = (srcBegin >= sdmaRange_.start_) && (srcBegin <= sdmaRange_.end_);
  // Source ends inside the tracked range
  const bool endsInside = (srcEnd >= sdmaRange_.start_) && (srcEnd <= sdmaRange_.end_);
  // Source covers the tracked range completely
  const bool coversRange = (srcBegin <= sdmaRange_.start_) && (srcEnd >= sdmaRange_.end_);

  const bool overlap = beginsInside || endsInside || coversRange;

  const auto dstBegin = dst.vmAddress();
  const auto dstEnd = dst.vmAddress() + dst.vmSize();

  if (overlap) {
    // Restart the tracking from the destination range
    sdmaRange_.start_ = dstBegin;
    sdmaRange_.end_ = dstEnd;
  } else {
    // Grow the tracked range to include the destination
    sdmaRange_.start_ = std::min(sdmaRange_.start_, dstBegin);
    sdmaRange_.end_ = std::max(sdmaRange_.end_, dstEnd);
  }

  return overlap;
}

// ================================================================================================
//! Returns the hostcall buffer of this queue. The buffer is allocated as fine-grain
//! SVM with atomics and registered with the hostcall listener on the first call;
//! later calls return the cached pointer.
//!
//! \return The hostcall buffer, or nullptr if the allocation or the registration failed.
//!         On failure nothing is cached, so a later call retries from scratch.
void* VirtualGPU::getOrCreateHostcallBuffer() {
  if (hostcallBuffer_ != nullptr) {
    return hostcallBuffer_;
  }

  // The number of packets required in each buffer is at least equal to the
  // maximum number of waves supported by the device.
  auto wavesPerCu = dev().info().maxThreadsPerCU_ / dev().info().wavefrontWidth_;
  auto numPackets = dev().info().maxComputeUnits_ * wavesPerCu;

  auto size = amd::getHostcallBufferSize(numPackets);
  auto align = amd::getHostcallBufferAlignment();

  // Keep the pointer local until the buffer is fully registered, so a failure
  // can't leave an unusable buffer behind in hostcallBuffer_
  void* buffer = dev().svmAlloc(dev().context(), size, align,
                                CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, nullptr);
  if (buffer == nullptr) {
    ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, "Failed to create hostcall buffer");
    return nullptr;
  }

  // The casts make the arguments match the format specifiers exactly
  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
          "Created hostcall buffer %p (numPackets == %u, size == %zu, align == %u) for virtual "
          "queue %p\n",
          buffer, static_cast<uint32_t>(numPackets), static_cast<size_t>(size),
          static_cast<uint32_t>(align), static_cast<void*>(this));

  if (!amd::enableHostcalls(dev(), buffer, numPackets)) {
    ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, "Failed to register hostcall buffer %p with listener",
            buffer);
    // Release the unregistered buffer instead of leaking it
    dev().svmFree(buffer);
    return nullptr;
  }

  hostcallBuffer_ = buffer;
  return hostcallBuffer_;
}

}  // namespace amd::pal