rocm-systems/projects/clr/rocclr/device/rocm/rocvirtual.cpp

/* Copyright (c) 2013 - 2025 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */

#include "device/devhostcall.hpp"
#include "device/rocm/rocdevice.hpp"
#include "device/rocm/rocvirtual.hpp"
#include "device/rocm/rockernel.hpp"
#include "device/rocm/rocmemory.hpp"
#include "device/rocm/rocblit.hpp"
#include "device/rocm/roccounters.hpp"
#include "platform/activity.hpp"
#include "platform/kernel.hpp"
#include "platform/context.hpp"
#include "platform/command.hpp"
#include "platform/command_utils.hpp"
#include "platform/memory.hpp"
#include "platform/sampler.hpp"
#include "utils/debug.hpp"
#include "os/os.hpp"

#include <fstream>
#include <limits>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include <atomic>
#include <cinttypes>

#if defined(__AVX__)
#if defined(__MINGW64__)
#include <intrin.h>
#else
#include <immintrin.h>
#endif
#endif

/**
 * HSA image object size in bytes (see HSA spec)
 */
#define HSA_IMAGE_OBJECT_SIZE 48

/**
 * HSA image object alignment in bytes (see HSA spec)
 */
#define HSA_IMAGE_OBJECT_ALIGNMENT 16

/**
 * HSA sampler object size in bytes (see HSA spec)
 */
#define HSA_SAMPLER_OBJECT_SIZE 32

/**
 * HSA sampler object alignment in bytes (see HSA spec)
 */
#define HSA_SAMPLER_OBJECT_ALIGNMENT 16

namespace amd::roc {
// (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) invalidates I, K and L1
// (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE) invalidates L1, L2 and flushes
// L2

static constexpr uint16_t kInvalidAql = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE);

static constexpr uint16_t kBarrierPacketHeader =
    (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) |
    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

static constexpr uint16_t kNopPacketHeader =
    (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) |
    (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

static constexpr uint16_t kBarrierPacketAcquireHeader =
    (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) |
    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

static constexpr uint16_t kBarrierPacketReleaseHeader =
    (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) |
    (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

static constexpr uint16_t kBarrierVendorPacketHeader =
    (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) |
    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

static constexpr uint16_t kBarrierVendorPacketNopScopeHeader =
    (HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) |
    (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

static constexpr hsa_barrier_and_packet_t kBarrierAcquirePacket = {
    kBarrierPacketAcquireHeader, 0, 0, {{0}}, 0, {0}};

static constexpr hsa_barrier_and_packet_t kBarrierReleasePacket = {
    kBarrierPacketReleaseHeader, 0, 0, {{0}}, 0, {0}};

double Timestamp::ticksToTime_ = 0;

static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
  return (v >> pos) & ((1 << width) - 1);
};

// ================================================================================================
void Timestamp::checkGpuTime() {
  amd::ScopedLock s(lock_);
  if (HwProfiling()) {
    uint64_t start = std::numeric_limits<uint64_t>::max();
    uint64_t end = 0;
    uint64_t sdmaStart = std::numeric_limits<uint64_t>::max();
    uint64_t sdmaEnd = 0;


    for (auto it : signals_) {
      amd::ScopedLock lock(it->LockSignalOps());
      // Ignore the wait if runtime processes API callback, because the signal value is bigger
      // than expected and the value reset will occur after API callback is done
      if (GetCallbackSignal().handle == 0 || GetBlocking() == false) {
        WaitForSignal(it->signal_);
      }
      // Avoid profiling data for the sync barrier, in tiny performance tests the first call
      // to ROCr is very slow and that also affects the overall performance of the callback thread
      if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_ ||
          command().type() == CL_COMMAND_TASK) {
        hsa_amd_profiling_dispatch_time_t time = {};
        hsa_amd_profiling_async_copy_time_t timeSdma = {};
        amd_signal_t* amdSignal = reinterpret_cast<amd_signal_t*>(it->signal_.handle);

        if (it->engine_ == HwQueueEngine::SdmaInter || it->engine_ == HwQueueEngine::SdmaRead ||
            it->engine_ == HwQueueEngine::SdmaWrite || it->engine_ == HwQueueEngine::SdmaIntra) {
          Hsa::profiling_get_async_copy_time(it->signal_, &timeSdma);
          sdmaStart = std::min(timeSdma.start, sdmaStart);
          sdmaEnd = std::max(timeSdma.end, sdmaEnd);
          // set dispatch time to be used in logging.
          time.start = timeSdma.start;
          time.end = timeSdma.end;
        } else {
          Hsa::profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time);
          start = std::min(time.start, start);
          end = std::max(time.end, end);
        }

        if ((command().type() == CL_COMMAND_TASK) && (it->flags_.isPacketDispatch_ == true)) {
          static_cast<amd::AccumulateCommand&>(command()).addTimestamps(time.start, time.end);
        }

        ClPrint(amd::LOG_INFO, amd::LOG_TS,
                "Signal = (0x%lx), Translated start/end = %ld / %ld, Elapsed = %ld ns, "
                "ticks start/end = %ld / %ld, Ticks elapsed = %ld, Engine = %u",
                it->signal_.handle, time.start, time.end, time.end - time.start,
                amdSignal->start_ts, amdSignal->end_ts, amdSignal->end_ts - amdSignal->start_ts,
                it->engine_);
      }
      it->flags_.done_ = true;
    }
    signals_.clear();
    if (end != 0 || sdmaEnd != 0) {
      // Check if it's the first execution and update start time
      if (!accum_ena_) {
        start_ = ((sdmaEnd != 0) ? sdmaStart : start) * ticksToTime_;
        accum_ena_ = true;
      }
      // Progress the end time always
      end_ = ((sdmaEnd != 0) ? sdmaEnd : end) * ticksToTime_;
    }
  }
}

// ================================================================================================
bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
  Timestamp* ts = reinterpret_cast<Timestamp*>(arg);

  if (amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) {
    amd::Command* head = ts->getParsedCommand();
    if (head == nullptr) {
      head = ts->command().GetBatchHead();
    }
    while (head != nullptr) {
      if (!head->data().empty()) {
        for (auto i = 0; i < head->data().size(); i++) {
          Timestamp* headTs = reinterpret_cast<Timestamp*>(head->data()[i]);
          ts->setParsedCommand(head);
          for (auto it : headTs->Signals()) {
            hsa_signal_value_t complete_val = (headTs->GetCallbackSignal().handle != 0) ? 1 : 0;
            if (int64_t val = Hsa::signal_load_relaxed(it->signal_) > complete_val) {
              hsa_status_t result = Hsa::signal_async_handler(
                  headTs->Signals()[0]->signal_, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
                  &HsaAmdSignalHandler, ts);
              if (HSA_STATUS_SUCCESS != result) {
                LogError("hsa_amd_signal_async_handler() failed to requeue the handler!");
              } else {
                ClPrint(amd::LOG_INFO, amd::LOG_SIG,
                        "Requeue handler : value(%d), timestamp(%p),"
                        "handle(0x%lx)",
                        static_cast<uint32_t>(val), headTs,
                        headTs->HwProfiling() ? headTs->Signals()[0]->signal_.handle : 0);
              }
              return false;
            }
          }
        }
      }
      head = head->getNext();
    }
  }
  ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(0x%lx)",
          static_cast<uint32_t>(value), arg,
          ts->HwProfiling() ? ts->Signals()[0]->signal_.handle : 0);

  // Save callback signal
  hsa_signal_t callback_signal = ts->GetCallbackSignal();

  auto gpu = ts->gpu();
  gpu->QueuedAsyncHandlers()--;

  bool isBlocking = ts->GetBlocking();

  // Update the batch, since signal is complete
  gpu->updateCommandsState(ts->command().GetBatchHead());

  // Reset API callback signal. It will release AQL queue and start commands processing
  if (callback_signal.handle != 0 && isBlocking) {
    Hsa::signal_subtract_relaxed(callback_signal, 1);
  }

  // Return false, so the callback will not be called again for this signal
  gpu->release();
  return false;
}

// ================================================================================================
bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
  if (numMemObj > 0) {
    // Allocate the array of memory objects for dependency tracking
    memObjectsInQueue_ = new MemoryState[numMemObj];
    if (nullptr == memObjectsInQueue_) {
      return false;
    }
    memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj);
    maxMemObjectsInQueue_ = numMemObj;
  }

  return true;
}

// ================================================================================================
void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) {
  bool flushL1Cache = false;

  if (maxMemObjectsInQueue_ == 0) {
    // Sync AQL packets
    gpu.setAqlHeader(gpu.dispatchPacketHeader_);
    return;
  }

  uint64_t curStart = reinterpret_cast<uint64_t>(memory->getDeviceMemory());
  uint64_t curEnd = curStart + memory->size();

  // Loop through all memory objects in the queue and find dependency
  // @note don't include objects from the current kernel
  for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
    // Check if the queue already contains this mem object and
    // GPU operations aren't readonly
    uint64_t busyStart = memObjectsInQueue_[j].start_;
    uint64_t busyEnd = memObjectsInQueue_[j].end_;

    // Check if the start inside the busy region
    if ((((curStart >= busyStart) && (curStart < busyEnd)) ||
         // Check if the end inside the busy region
         ((curEnd > busyStart) && (curEnd <= busyEnd)) ||
         // Check if the start/end cover the busy region
         ((curStart <= busyStart) && (curEnd >= busyEnd))) &&
        // If the buys region was written or the current one is for write
        (!memObjectsInQueue_[j].readOnly_ || !readOnly)) {
      flushL1Cache = true;
      break;
    }
  }

  // Did we reach the limit?
  if (maxMemObjectsInQueue_ <= numMemObjectsInQueue_) {
    flushL1Cache = true;
  }

  if (flushL1Cache) {
    // Sync AQL packets
    gpu.setAqlHeader(gpu.dispatchPacketHeader_);

    // Clear memory dependency state
    const static bool All = true;
    clear(!All);
  }

  // Insert current memory object into the queue always,
  // since runtime calls flush before kernel execution and it has to keep
  // current kernel in tracking
  memObjectsInQueue_[numMemObjectsInQueue_].start_ = curStart;
  memObjectsInQueue_[numMemObjectsInQueue_].end_ = curEnd;
  memObjectsInQueue_[numMemObjectsInQueue_].readOnly_ = readOnly;
  numMemObjectsInQueue_++;
}

// ================================================================================================
void VirtualGPU::MemoryDependency::clear(bool all) {
  if (numMemObjectsInQueue_ > 0) {
    if (all) {
      endMemObjectsInQueue_ = numMemObjectsInQueue_;
    }

    // If the current launch didn't start from the beginning, then move the data
    if (0 != endMemObjectsInQueue_) {
      // Preserve all objects from the current kernel
      size_t i, j;
      for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) {
        memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_;
        memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_;
        memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_;
      }
    } else if (numMemObjectsInQueue_ >= maxMemObjectsInQueue_) {
      // note: The array growth shouldn't occur under the normal conditions,
      // but in a case when SVM path sends the amount of SVM ptrs over
      // the max size of kernel arguments
      MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1];
      if (nullptr == ptr) {
        numMemObjectsInQueue_ = 0;
        return;
      }
      maxMemObjectsInQueue_ <<= 1;
      memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_);
      delete[] memObjectsInQueue_;
      memObjectsInQueue_ = ptr;
    }

    numMemObjectsInQueue_ -= endMemObjectsInQueue_;
    endMemObjectsInQueue_ = 0;
  }
}

// ================================================================================================
VirtualGPU::HwQueueTracker::~HwQueueTracker() {
  for (auto& signal : signal_list_) {
    CpuWaitForSignal(signal);
    signal->release();
  }
  // Destroy all extra signals. Note: these signals must be idle already
  while (signal_pool_.size() != 0) {
    signal_pool_.top()->release();
    signal_pool_.pop();
  }
  while (signal_pool_irq_.size() != 0) {
    signal_pool_irq_.top()->release();
    signal_pool_irq_.pop();
  }
}

// ================================================================================================
bool VirtualGPU::HwQueueTracker::CreateSignal(ProfilingSignal* signal, bool interrupt) const {
  const Settings& settings = gpu_.dev().settings();
  // MT path will still have interrupts to avoid extra polling in the queue thread.
  // Also runtime will still use interrupts if active wait was disabled
  interrupt |= !AMD_DIRECT_DISPATCH || !gpu_.dev().ActiveWait();
  // Check if the interrupt was requested for the signal
  if (interrupt && settings.system_scope_signal_) {
    if (HSA_STATUS_SUCCESS != Hsa::signal_create(0, 0, nullptr, &signal->signal_)) {
      return false;
    }
  } else {
    if (HSA_STATUS_SUCCESS !=
        Hsa::signal_create(0, 0, nullptr, HSA_AMD_SIGNAL_AMD_GPU_ONLY, &signal->signal_)) {
      return false;
    }
  }
  signal->flags_.interrupt_ = interrupt;
  return true;
}

// ================================================================================================
bool VirtualGPU::HwQueueTracker::Create() {
  const uint kSignalListSize = ROC_SIGNAL_POOL_SIZE;
  signal_list_.resize(kSignalListSize);
  for (uint i = 0; i < kSignalListSize; ++i) {
    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
    if ((signal == nullptr) || !CreateSignal(signal.get())) {
      return false;
    }
    signal_list_[i] = signal.release();
  }
  // Add extra signals with the interrupts for the callbacks
  if (AMD_DIRECT_DISPATCH && gpu_.dev().ActiveWait()) {
    for (uint32_t i = 0; i < 5; ++i) {
      std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
      constexpr bool kEnableInterrupt = true;
      if ((signal == nullptr) || !CreateSignal(signal.get(), kEnableInterrupt)) {
        return false;
      }
      signal_pool_irq_.push(signal.release());
    }
  }
  return true;
}

// ================================================================================================
hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_val, Timestamp* ts,
                                                      bool attach_signal) {
  amd::Command* cmd = gpu_.command();
  // If no signal is needed, decrement the refcount and clear the hw_event of current command
  if (!attach_signal) {
    if (nullptr != cmd) {
      if (cmd->HwEvent() != nullptr) {
        reinterpret_cast<ProfilingSignal*>(cmd->HwEvent())->release();
      }
      cmd->SetHwEvent(nullptr);
    }
    return hsa_signal_t{0};
  }

  bool new_signal = false;

  // Peep signal +2 ahead to see if its done
  auto temp_id = (current_id_ + 2) % signal_list_.size();

  // If GPU is still busy with processing, then add more signals to avoid more frequent stalls
  if (Hsa::signal_load_relaxed(signal_list_[temp_id]->signal_) > 0) {
    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
    if ((signal != nullptr) && CreateSignal(signal.get())) {
      // Find valid new index
      ++current_id_ %= signal_list_.size();
      // Insert the new signal into the current slot and ignore any wait
      signal_list_.insert(signal_list_.begin() + current_id_, signal.release());
      new_signal = true;
    }
  }

  // If it's the new signal, then the wait can be avoided.
  // That will allow to grow the list of signals without stalls
  if (!new_signal) {
    // Find valid index
    ++current_id_ %= signal_list_.size();
    // Make sure the previous operation on the current signal is done
    WaitCurrent();

    // Have to wait the next signal in the queue to avoid a race condition between
    // a GPU waiter(which may be not triggered yet) and CPU signal reset below
    WaitNext();
  }

  if (signal_list_[current_id_]->referenceCount() > 1) {
    // The signal was assigned to the global marker's event, hence runtime can't reuse it
    // and needs a new signal
    std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
    if ((signal != nullptr) && CreateSignal(signal.get())) {
      signal_list_[current_id_]->release();
      signal_list_[current_id_] = signal.release();
    } else {
      assert(!"ProfilingSignal reallocation failed! Marker has a conflict with signal reuse!");
    }
  }

  bool enqueHandler = false;
  if (AMD_DIRECT_DISPATCH) {
    if (ts != nullptr) {
      enqueHandler =
          (ts->command().Callback() != nullptr || ts->command().GetBatchHead() != nullptr) &&
          !ts->command().CpuWaitRequested();
    }
    // Check if the signal doesn't match the requested one.
    // Note: runtime needs the interrupts for the callbacks in DD mode
    if ((signal_list_[current_id_]->flags_.interrupt_ != enqueHandler) && gpu_.dev().ActiveWait()) {
      // Use different stacks if an interrupt is required or not.
      // @note: if runtime needs an interrupt, then the tracking list replaces the original signal
      // with the interrupt signal and saves the signal without interrupt, or vise versa
      auto& pool_get = (enqueHandler) ? signal_pool_irq_ : signal_pool_;
      auto& pool_save = (enqueHandler) ? signal_pool_ : signal_pool_irq_;

      // Check if a free signal in the pop stack isn't available
      if (pool_get.empty()) {
        std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
        if ((signal != nullptr) && CreateSignal(signal.get(), enqueHandler)) {
          pool_get.push(signal.release());
        }
      }
      // Make sure a free signal exists and replace it in the current slot
      if (!pool_get.empty()) {
        pool_save.push(signal_list_[current_id_]);
        signal_list_[current_id_] = pool_get.top();
        pool_get.pop();
      }
    }
  }
  ProfilingSignal* prof_signal = signal_list_[current_id_];
  // Reset the signal and return
  Hsa::signal_silent_store_relaxed(prof_signal->signal_, init_val);
  prof_signal->flags_.done_ = false;
  prof_signal->engine_ = engine_;
  prof_signal->flags_.isPacketDispatch_ = false;


  if (nullptr != cmd) {
    // Release any existing HwEvent before setting new one for the same command
    if (cmd->HwEvent() != nullptr) {
      reinterpret_cast<ProfilingSignal*>(cmd->HwEvent())->release();
    }
    cmd->SetHwEvent(prof_signal);
    prof_signal->retain();
  }

  if (ts != nullptr) {
    // Save HSA signal earlier to make sure the possible callback will have a valid
    // value for processing
    ts->retain();
    prof_signal->ts_ = ts;
    ts->AddProfilingSignal(prof_signal);
    if (AMD_DIRECT_DISPATCH) {
      // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
      // requires the batch update upon HSA signal completion
      if (enqueHandler) {
        uint32_t init_value = kInitSignalValueOne;
        // If API callback is enabled, then use a blocking signal for AQL queue.
        // HSA signal will be acquired in SW and released after HSA signal callback
        if (ts->command().Callback() != nullptr) {
          bool blocking = ts->command().Callback()->blocking_;
          ts->SetCallbackSignal(prof_signal->signal_, blocking);
          // Blocks AQL queue from further processing
          if (blocking) {
            Hsa::signal_add_relaxed(prof_signal->signal_, 1);
            init_value += 1;
          }
        }
        gpu_.QueuedAsyncHandlers()++;
        ts->gpu()->retain();
        hsa_status_t result = Hsa::signal_async_handler(
            prof_signal->signal_, HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts);
        if (HSA_STATUS_SUCCESS != result) {
          LogError("hsa_amd_signal_async_handler() failed to set the handler!");
        } else {
          ClPrint(amd::LOG_INFO, amd::LOG_SIG,
                  "Set Handler: handle(0x%lx), timestamp(%p), blocking CB=%d",
                  prof_signal->signal_.handle, prof_signal,
                  ts->command().Callback() != nullptr && ts->GetBlocking());
        }
      }
    }
  }
  return prof_signal->signal_;
}

// ================================================================================================
std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
  bool explicit_wait = false;
  // Reset all current waiting signals
  waiting_signals_.clear();

  // Does runtime switch the active engine?
  if (engine != engine_) {
    // Yes, return the signal from the previous operation for a wait
    engine_ = engine;
    explicit_wait = true;
  } else {
    // Unknown engine in use, hence return a wait signal always
    if (engine == HwQueueEngine::Unknown) {
      explicit_wait = true;
    } else {
      // Check if skip wait optimization is enabled. It will try to predict the same engine in ROCr
      // and ignore the signal wait, relying on in-order engine execution
      const Settings& settings = gpu_.dev().settings();
      if (engine != HwQueueEngine::Compute) {
        explicit_wait = true;
      }
    }
  }

  // Check if a wait is required
  if (explicit_wait) {
    bool skip_internal_signal = false;

    for (uint32_t i = 0; i < external_signals_.size(); ++i) {
      // If external signal matches internal one, then skip it
      if (external_signals_[i]->signal_.handle == signal_list_[current_id_]->signal_.handle) {
        skip_internal_signal = true;
      }
    }
    // Add the oldest signal into the tracking for a wait
    if (!skip_internal_signal) {
      external_signals_.push_back(signal_list_[current_id_]);
    }
  }

  // Validate all signals for the wait and skip already completed
  for (uint32_t i = 0; i < external_signals_.size(); ++i) {
    // Early signal status check
    if (Hsa::signal_load_relaxed(external_signals_[i]->signal_) > 0) {
      const Settings& settings = gpu_.dev().settings();
      if (settings.cpu_wait_for_signal_) {
        // Wait on CPU for completion if requested
        CpuWaitForSignal(external_signals_[i]);
      } else {
        // Add HSA signal for tracking on GPU
        waiting_signals_.push_back(external_signals_[i]->signal_);
      }
    }
  }
  external_signals_.clear();

  // Return the array of waiting HSA signals
  return waiting_signals_;
}

// ================================================================================================
bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
  // Wait for the current signal
  if (signal->ts_ != nullptr) {
    // Update timestamp values if requested
    auto ts = signal->ts_;
    ts->checkGpuTime();
    ts->release();
    signal->ts_ = nullptr;
  } else if (Hsa::signal_load_relaxed(signal->signal_) > 0) {
    amd::ScopedLock lock(signal->LockSignalOps());
    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Host wait on completion_signal=0x%zx",
            signal->signal_.handle);
    if (!WaitForSignal(signal->signal_, gpu_.ActiveWait())) {
      LogPrintfError("Failed signal [0x%lx] wait", signal->signal_);
      return false;
    }
    signal->flags_.done_ = true;
  }
  return true;
}

// ================================================================================================
void VirtualGPU::HwQueueTracker::ResetCurrentSignal() {
  // Reset the signal and return
  Hsa::signal_silent_store_relaxed(signal_list_[current_id_]->signal_, 0);
  // Fallback to the previous signal
  current_id_ = (current_id_ == 0) ? (signal_list_.size() - 1) : (current_id_ - 1);
}

// ================================================================================================
bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address params,
                                   size_t& ldsAddress, bool cooperativeGroups,
                                   bool& imageBufferWrtBack,
                                   std::vector<device::Memory*>& wrtBackImageBuffer) {
  Kernel& hsaKernel =
      const_cast<Kernel&>(static_cast<const Kernel&>(*(kernel.getDeviceKernel(dev()))));
  const amd::KernelSignature& signature = kernel.signature();
  const amd::KernelParameters& kernelParams = kernel.parameters();

  if (!cooperativeGroups && memoryDependency().maxMemObjectsInQueue() != 0) {
    // AQL packets
    setAqlHeader(dispatchPacketHeaderNoSync_);
  }

  amd::Memory* const* memories =
      reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());

  // HIP shouldn't use cache coherency layer at any time
  if (!amd::IS_HIP) {
    // Process cache coherency first, since the extra transfers may affect
    // other mem dependency tracking logic: TS and signalWrite()
    for (uint i = 0; i < signature.numMemories(); ++i) {
      amd::Memory* mem = memories[i];
      if (mem != nullptr) {
        roc::Memory* gpuMem = dev().getGpuMemory(mem);
        // Don't sync for internal objects, since they are not shared between devices
        if (gpuMem->owner()->getVirtualDevice() == nullptr) {
          // Synchronize data with other memory instances if necessary
          gpuMem->syncCacheFromHost(*this);
        }
      }
    }
  }

  // Mark the tracker with a new kernel, so it can avoid checks of the aliased objects
  memoryDependency().newKernel();

  bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true);
  bool supportFineGrainedSystem = deviceSupportFGS;
  FGSStatus status = kernelParams.getSvmSystemPointersSupport();
  switch (status) {
    case FGS_YES:
      if (!deviceSupportFGS) {
        return false;
      }
      supportFineGrainedSystem = true;
      break;
    case FGS_NO:
      supportFineGrainedSystem = false;
      break;
    case FGS_DEFAULT:
    default:
      break;
  }

  size_t count = kernelParams.getNumberOfSvmPtr();
  size_t execInfoOffset = kernelParams.getTotalSize();
  bool sync = true;

  amd::Memory* memory = nullptr;
  // get svm non arugment information
  void* const* svmPtrArray = reinterpret_cast<void* const*>(params + execInfoOffset);
  for (size_t i = 0; i < count; i++) {
    memory = amd::MemObjMap::FindMemObj(svmPtrArray[i]);
    if (nullptr == memory) {
      if (!supportFineGrainedSystem) {
        return false;
      } else if (sync) {
        // Sync AQL packets
        setAqlHeader(dispatchPacketHeader_);
        // Clear memory dependency state
        const static bool All = true;
        memoryDependency().clear(!All);
        continue;
      }
    } else {
      Memory* rocMemory = static_cast<Memory*>(memory->getDeviceMemory(dev()));
      if (nullptr != rocMemory) {
        // Synchronize data with other memory instances if necessary
        rocMemory->syncCacheFromHost(*this);

        const static bool IsReadOnly = false;
        // Validate SVM passed in the non argument list
        memoryDependency().validate(*this, rocMemory, IsReadOnly);
      } else {
        return false;
      }
    }
  }

  // Check all parameters for the current kernel
  for (size_t i = 0; i < signature.numParameters(); ++i) {
    const amd::KernelParameterDescriptor& desc = signature.at(i);
    Memory* gpuMem = nullptr;
    amd::Memory* mem = nullptr;

    // Find if current argument is a buffer
    if (desc.type_ == T_POINTER) {
      if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
        // Align the LDS on the alignment requirement of type pointed to
        ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
        if (desc.size_ == 8) {
          // Save the original LDS size
          uint64_t ldsSize = *reinterpret_cast<const uint64_t*>(params + desc.offset_);
          // Patch the LDS address in the original arguments with an LDS address(offset)
          WriteAqlArgAt(const_cast<address>(params), ldsAddress, desc.size_, desc.offset_);
          // Add the original size
          ldsAddress += ldsSize;
        } else {
          // Save the original LDS size
          uint32_t ldsSize = *reinterpret_cast<const uint32_t*>(params + desc.offset_);
          // Patch the LDS address in the original arguments with an LDS address(offset)
          uint32_t ldsAddr = ldsAddress;
          WriteAqlArgAt(const_cast<address>(params), ldsAddr, desc.size_, desc.offset_);
          // Add the original size
          ldsAddress += ldsSize;
        }
      } else {
        uint32_t index = desc.info_.arrayIndex_;
        mem = memories[index];
        const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
        if (mem == nullptr) {
          ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p ", i, desc.typeName_.c_str(),
                  desc.name_.c_str(), globalAddress);
          //! This condition is for SVM fine-grain
          if (dev().isFineGrainedSystem(true)) {
            // Sync AQL packets
            setAqlHeader(dispatchPacketHeader_);
            // Clear memory dependency state
            const static bool All = true;
            memoryDependency().clear(!All);
          }
        } else {
          gpuMem = static_cast<Memory*>(mem->getDeviceMemory(dev()));

          const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
          ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = ptr:%p obj:[%p-%p]", i,
                  desc.typeName_.c_str(), desc.name_.c_str(), globalAddress,
                  gpuMem->getDeviceMemory(),
                  reinterpret_cast<address>(gpuMem->getDeviceMemory()) + mem->getSize());

          // Validate memory for a dependency in the queue
          memoryDependency().validate(*this, gpuMem, (desc.info_.readOnly_ == 1));

          assert((desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_GLOBAL ||
                  desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_CONSTANT) &&
                 "Unsupported address qualifier");

          const bool readOnly = (desc.typeQualifier_ == CL_KERNEL_ARG_TYPE_CONST) ||
                                ((mem->getMemFlags() & CL_MEM_READ_ONLY) != 0);

          if (!readOnly) {
            mem->signalWrite(&dev());
          }

          if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
            Image* image = static_cast<Image*>(mem->getDeviceMemory(dev()));

            const uint64_t image_srd = image->getHsaImageObject().handle;
            assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
            WriteAqlArgAt(const_cast<address>(params), image_srd, sizeof(image_srd), desc.offset_);

            // Check if synchronization has to be performed
            if (image->CopyImageBuffer() != nullptr) {
              Memory* devBuf = dev().getGpuMemory(mem->parent());
              amd::Coord3D offs(0);
              Image* devCpImg = static_cast<Image*>(dev().getGpuMemory(image->CopyImageBuffer()));
              amd::Image* img = mem->asImage();

              // Copy memory from the original image buffer into the backing store image
              bool result =
                  blitMgr().copyBufferToImage(*devBuf, *devCpImg, offs, offs, img->getRegion(),
                                              true, img->getRowPitch(), img->getSlicePitch());
              // Make sure the copy operation is done
              setAqlHeader(dispatchPacketHeader_);
              // Use backing store SRD as the replacment
              const uint64_t srd = devCpImg->getHsaImageObject().handle;
              WriteAqlArgAt(const_cast<address>(params), srd, sizeof(srd), desc.offset_);

              // If it's not a read only resource, then runtime has to write back
              if (!desc.info_.readOnly_) {
                wrtBackImageBuffer.push_back(mem->getDeviceMemory(dev()));
                imageBufferWrtBack = true;
              }
            }
          }
        }
      }
    } else if (desc.type_ == T_QUEUE) {
      uint32_t index = desc.info_.arrayIndex_;
      const amd::DeviceQueue* queue =
          reinterpret_cast<amd::DeviceQueue* const*>(params + kernelParams.queueObjOffset())[index];

      if (!createVirtualQueue(queue->size()) || !createSchedulerParam()) {
        return false;
      }
      uint64_t vqVA = getVQVirtualAddress();
      WriteAqlArgAt(const_cast<address>(params), vqVA, sizeof(vqVA), desc.offset_);
    } else if (desc.type_ == T_VOID) {
      const_address srcArgPtr = params + desc.offset_;
      if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
        void* mem = allocKernArg(desc.size_, 128);
        memcpy(mem, srcArgPtr, desc.size_);
        const auto it = hsaKernel.patch().find(desc.offset_);
        WriteAqlArgAt(const_cast<address>(params), mem, sizeof(void*), it->second);
      }

      if (IsLogEnabled(amd::LOG_INFO, amd::LOG_KERN)) {
        if (desc.size_ > 8) {
          std::string bytes = "0x";
          constexpr size_t kMaxBytes = 64;
          for (size_t j = 0; j < std::min(desc.size_, kMaxBytes); j++) {
            char byteStr[4];
            snprintf(byteStr, sizeof(byteStr), "%02x ",
                     reinterpret_cast<const uint8_t*>(srcArgPtr)[j]);
            bytes += byteStr;
          }
          if (desc.size_ > kMaxBytes) {
            bytes += "...";
          }
          ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = %s (size:0x%x)", i,
                  desc.typeName_.c_str(), desc.name_.c_str(), bytes.c_str(), desc.size_);
        } else {
          ClPrint(amd::LOG_DEBUG, amd::LOG_KERN, "Arg%d: %s %s = val:0x%lx (size:0x%x)", i,
                  desc.typeName_.c_str(), desc.name_.c_str(),
                  (desc.size_ == 1)   ? *reinterpret_cast<const uint8_t*>(srcArgPtr)
                  : (desc.size_ == 2) ? *reinterpret_cast<const uint16_t*>(srcArgPtr)
                  : (desc.size_ == 4) ? *reinterpret_cast<const uint32_t*>(srcArgPtr)
                  : (desc.size_ == 8) ? *reinterpret_cast<const uint64_t*>(srcArgPtr)
                                      : 0LL,
                  desc.size_);
        }
      }
    } else if (desc.type_ == T_SAMPLER) {
      uint32_t index = desc.info_.arrayIndex_;
      const amd::Sampler* sampler =
          reinterpret_cast<amd::Sampler* const*>(params + kernelParams.samplerObjOffset())[index];

      device::Sampler* devSampler = sampler->getDeviceSampler(dev());

      uint64_t sampler_srd = devSampler->hwSrd();
      WriteAqlArgAt(const_cast<address>(params), sampler_srd, sizeof(sampler_srd), desc.offset_);
    }
  }

  if (hsaKernel.program()->hasGlobalStores()) {
    // Sync AQL packets
    setAqlHeader(dispatchPacketHeader_);
    // Clear memory dependency state
    const static bool All = true;
    memoryDependency().clear(!All);
  }

  return true;
}

// ================================================================================================
uint64_t VirtualGPU::getQueueID() {
  amd::ScopedLock lock(execution());
  if (gpu_queue_ == nullptr) {
    gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
  }
  return gpu_queue_->id;
}

// ================================================================================================
static inline void packet_store_release(uint32_t* packet, uint16_t header, uint16_t rest) {
#if IS_WINDOWS
  std::atomic_ref<uint32_t> atomic_header(*packet);
  atomic_header.store(header | (rest << 16), std::memory_order_release);
#else
  __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
#endif
}

// ================================================================================================
void VirtualGPU::AnalyzeAqlQueue() const {
  const uint32_t queueSize = gpu_queue_->size;
  const uint32_t queueMask = queueSize - 1;
  const uint32_t sw_queue_size = queueMask;
  uint64_t index = Hsa::queue_load_write_index_relaxed(gpu_queue_);
  uint64_t read = Hsa::queue_load_read_index_relaxed(gpu_queue_);
  if (index > read) {
    int valid_packet_idx = 0;
    constexpr int kAqlSearchWindow = 32;
    while (valid_packet_idx < kAqlSearchWindow) {
      // Read AQL packet header and check if it's invalid, which means it's done
      auto aql_loc = &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
          gpu_queue_->base_address))[(read + valid_packet_idx) & queueMask];
      // If the packet is invalid, then continue search
      if (extractAqlBits((*aql_loc).header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE) ==
          HSA_PACKET_TYPE_INVALID) {
        valid_packet_idx++;
      } else {
        break;
      }
    }
    if (valid_packet_idx == kAqlSearchWindow) {
      printf("VGPU(%p) Queue(%p). Couldn't find the hang AQL packet!\n", this, gpu_queue_);
      return;
    }
    // Read AQL packet and check if it's a kernel dispatch
    auto aql_loc = &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
        gpu_queue_->base_address))[(read + valid_packet_idx) & queueMask];
    auto packet = *aql_loc;
    auto header = packet.header;
    if (extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE) ==
        HSA_PACKET_TYPE_KERNEL_DISPATCH) {
      auto it = dev().KernelMap().find(packet.kernel_object);
      if (it != dev().KernelMap().end()) {
        // @note: It's possible to demangle the name with comgr
        printf("Kernel Name: %s\n", it->second.name().c_str());
      } else {
        printf("VGPU(%p) Queue(%p). Couldn't find kernel\n", this, gpu_queue_);
      }
      printf("VGPU=%p SWq=%p, HWq=%p, id=%" PRIu64
             "\n\tDispatch Header ="
             "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
             "setup=%d\n\tgrid=[%u, %u, %u], workgroup=[%u, %u, %u]\n\tprivate_seg_size=%u, "
             "group_seg_size=%u\n\tkernel_obj=0x%" PRIx64
             ", "
             "kernarg_address=0x%p\n\tcompletion_signal=0x%" PRIx64
             ", "
             "correlation_id=%" PRIu64 "\n\trptr=%" PRIu64 ", wptr=%" PRIu64 "\n ",
             this, gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header,
             extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE),
             extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
             extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
                            HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
             extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                            HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
             0, packet.grid_size_x, packet.grid_size_y, packet.grid_size_z, packet.workgroup_size_x,
             packet.workgroup_size_y, packet.workgroup_size_z, packet.private_segment_size,
             packet.group_segment_size, packet.kernel_object, packet.kernarg_address,
             packet.completion_signal.handle, packet.reserved2, read, index);
    } else {
      printf("VGPU(%p) Queue(%p) rptr=%" PRIu64 ", wptr=%" PRIu64
             ". A barrier packet in the queue!\n",
             this, gpu_queue_, read, index);
    }
  } else {
    printf("VGPU(%p) Queue(%p) is idle\n", this, gpu_queue_);
  }
}

// ================================================================================================
template <typename AqlPacket>
bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, uint16_t rest,
                                          bool blocking, bool attach_signal) {
  const uint32_t queueSize = gpu_queue_->size;
  const uint32_t queueMask = queueSize - 1;
  const uint32_t sw_queue_size = queueMask;

  // Check for queue full and wait if needed.
  uint64_t index = Hsa::queue_add_write_index_screlease(gpu_queue_, 1);
  setFenceDirty(true);

  if (addSystemScope_) {
    header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
                HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
    header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
               HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
    addSystemScope_ = false;
  }

  auto expected_fence_state = extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                                             HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);

  // Reset fence_dirty_ flag if we submit a packet with system scopes
  if (expected_fence_state == amd::Device::kCacheStateSystem) {
    setFenceDirty(false);
  }

  // Dirty optimization to save on consequent dispatch packets which have requested flushes
  if (fence_state_ == amd::Device::kCacheStateSystem &&
      expected_fence_state == amd::Device::kCacheStateSystem) {
    header = dispatchPacketHeader_;
    setFenceDirty(true);
  }

  fence_state_ = static_cast<Device::CacheState>(expected_fence_state);

  bool attachSignal = timestamp_ != nullptr || attach_signal;
  // Get active signal for current dispatch if profiling is necessary
  packet->completion_signal =
      Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, attachSignal);

  if (std::is_same<decltype(packet), hsa_kernel_dispatch_packet_t*>::value &&
      timestamp_ != nullptr) {
    // If profiling is enabled, store the correlation ID in the dispatch packet. The profiler can
    // retrieve this correlation ID to attribute waves to specific dispatch locations.
    if (amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) {
      auto dispatchPacket = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet);
      dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_;
    }

    ProfilingSignal* current_signal = Barriers().GetLastSignal();
    current_signal->flags_.isPacketDispatch_ = true;
  }

  // Make sure the slot is free for usage
  while ((index - Hsa::queue_load_read_index_scacquire(gpu_queue_)) >= sw_queue_size) {
    amd::Os::yield();
  }

  // Add blocking command if the original value of read index was behind of the queue size.
  // Note: direct dispatch relies on the slot stall above to keep the forward progress
  // of the app if a dispatched kernel requires some CPU input for completion
  if (blocking) {
    if (packet->completion_signal.handle == 0) {
      packet->completion_signal = Barriers().ActiveSignal();
    }
    blocking = true;
  }

  TrackQueueProgress(*packet, index);

  AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask];
  *aql_loc = *packet;
  if (header != 0) {
    packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), header, rest);
  }
  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
          "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
          "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
          "setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], private_seg_size=%u, "
          "group_seg_size=%u, kernel_obj=0x%zx, kernarg_address=0x%zx, completion_signal=0x%zx, "
          "correlation_id=%zu, rptr=%u, wptr=%u",
          gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header,
          extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE),
          extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
          extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
                         HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
          extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
          rest, reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
          Hsa::queue_load_read_index_scacquire(gpu_queue_), index);

  Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, index);

  // Mark the flag indicating if a dispatch is outstanding.
  // We are not waiting after every dispatch.
  hasPendingDispatch_ = true;

  // Wait on signal ?
  if (blocking) {
    LogInfo("Runtime reached the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!");
    if (!Barriers().WaitCurrent()) {
      LogPrintfError("Failed blocking queue wait with signal [0x%lx]",
                     packet->completion_signal.handle);
      return false;
    }
  }

  return true;
}

// ================================================================================================
void VirtualGPU::dispatchBlockingWait() {
  auto wait_signals = Barriers().WaitingSignal();
  // AQL dispatch doesn't support dependent signals and extra barrier packet must be generated
  for (uint32_t i = 0; i < wait_signals.size(); ++i) {
    uint32_t j = i % 5;
    barrier_packet_.dep_signal[j] = wait_signals[i];
    constexpr bool kSkipSignal = true;
    // If runtime reached the packet limit or the count limit, then flush the barrier
    if ((j == 4) || ((i + 1) == wait_signals.size())) {
      dispatchBarrierPacket(kNopPacketHeader, kSkipSignal);
    }
  }
}
// ================================================================================================
bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header,
                                   uint16_t rest, bool blocking, bool capturing,
                                   const uint8_t* aqlPacket, bool attach_signal) {
  if (capturing == true) {
    packet->header = header;
    packet->setup = rest;
    std::memcpy(const_cast<uint8_t*>(aqlPacket), packet, sizeof(hsa_kernel_dispatch_packet_t));
    return true;
  } else {
    dispatchBlockingWait();
    return dispatchGenericAqlPacket(packet, header, rest, blocking, attach_signal);
  }
}
// ================================================================================================
bool VirtualGPU::dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest,
                                   bool blocking, bool attach_signal) {
  return dispatchGenericAqlPacket(packet, header, rest, blocking, attach_signal);
}

// ================================================================================================
template <typename AqlPacket>
bool VirtualGPU::dispatchGenericAqlPacketBatch(const std::vector<AqlPacket*>& packets,
                                               bool blocking, bool attach_signal,
                                               const std::vector<std::string>* kernelNames) {
  if (packets.empty()) {
    return false;
  }

  const uint32_t queueSize = gpu_queue_->size;
  const uint32_t queueMask = queueSize - 1;
  const uint32_t sw_queue_size = queueMask;
  const size_t numPackets = packets.size();
  size_t kMaxBatchSize = DEBUG_HIP_GRAPH_BATCH_SIZE;
  const size_t kGpuLagPackets = 16;

  // Staggered copy pattern: powers of 2 (1, 2, 4, 8.. to DEBUG_HIP_GRAPH_BATCH_SIZE
  size_t processedPackets = 0;
  size_t batchSize = 1;

  while (processedPackets < numPackets) {
    uint64_t currentReadIndex = Hsa::queue_load_read_index_scacquire(gpu_queue_);
    uint64_t currentWriteIndex = Hsa::queue_load_write_index_relaxed(gpu_queue_);

    if (currentWriteIndex - currentReadIndex >= kGpuLagPackets) {
      //GPU is busy, so we can copy more packets
      batchSize = DEBUG_HIP_GRAPH_BATCH_SIZE;
    }

    // Process all remaining packets in one batch
    if (processedPackets + batchSize > numPackets) {
      batchSize = numPackets - processedPackets;
    }

    // Check if we have enough space in the queue for this batch
    // If queue is full, reset batch size to 1 and wait
    if (currentWriteIndex + batchSize - currentReadIndex >= sw_queue_size) {
      batchSize = 1;
    }

    // Now reserve space for the batch
    uint64_t startIndex = Hsa::queue_add_write_index_screlease(gpu_queue_, batchSize);

    // Make sure the slot is free for usage
    while ((startIndex - Hsa::queue_load_read_index_scacquire(gpu_queue_)) >= sw_queue_size) {
      amd::Os::yield();
    }

    setFenceDirty(true);

    // Save header of first packet in this batch
    AqlPacket* firstPacket = packets[processedPackets];
    uint16_t firstPacketHeader = firstPacket->header;
    uint16_t firstPacketRest = firstPacket->setup;
    // Separate header for doorbell ring that can be modified
    uint16_t doorbellHeader = firstPacketHeader;

    // Save header of last packet in this batch (if different from first)
    AqlPacket* lastPacket = packets[processedPackets + batchSize - 1];
    uint16_t lastPacketHeader = lastPacket->header;

    // Process batchSize packets
    for (size_t i = 0; i < batchSize; ++i) {
      size_t packetIndex = processedPackets + i;
      uint64_t index = startIndex + i;

      AqlPacket* packet = packets[packetIndex];

      bool attachSignal = timestamp_ != nullptr || attach_signal;

      packet->completion_signal =
          Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, attachSignal);

      if (std::is_same<decltype(packet), hsa_kernel_dispatch_packet_t*>::value &&
          timestamp_ != nullptr) {
        // If profiling is enabled, store the correlation ID in the dispatch packet
        if (amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) {
          auto dispatchPacket = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet);
          dispatchPacket->reserved2 = timestamp_->command().profilingInfo().correlation_id_;
        }

        ProfilingSignal* current_signal = Barriers().GetLastSignal();
        current_signal->flags_.isPacketDispatch_ = true;
      }

      //Add blocking command if needed (only for the last packet)
      if (blocking && (packetIndex == numPackets - 1)) {
        if (packet->completion_signal.handle == 0) {
          packet->completion_signal = Barriers().ActiveSignal();
        }
      }

      AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask];

      // For first packet in batch, invalidate header before writing
      bool isFirstPacket = (i == 0);
      bool isLastPacket = (i == batchSize - 1);

      if (isFirstPacket) {
        if (addSystemScope_) {
          // Add system scope on the acq on first packet (modify doorbell header)
          doorbellHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
          doorbellHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE);
        }
        // Invalidate the header of the first packet in the batch
        packet->header = (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE);
      }

      // For the end packet in batch set flags
      if (isLastPacket) {
        if (addSystemScope_) {
          // If batch has only 1 packet, update doorbell header for release scope
          // (packet->header is already invalid, so don't modify it)
          if (batchSize == 1) {
            doorbellHeader &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
            doorbellHeader |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
          } else {
            // Add system scope on the release on last packet (different from first)
            packet->header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
            packet->header |= (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
          }
          addSystemScope_ = false;
        }
        // Use doorbellHeader for single packet batch (packet->header is invalid),
        // else use packet->header
        uint16_t headerForFenceState = (batchSize == 1) ? doorbellHeader : packet->header;
        auto expected_fence_state =
            extractAqlBits(headerForFenceState, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                           HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
        // Reset fence_dirty_ flag if we submit a packet with system scopes
        if (expected_fence_state == amd::Device::kCacheStateSystem) {
          setFenceDirty(false);
        }
        fence_state_ = static_cast<Device::CacheState>(expected_fence_state);
      }

      // Copy the packet to the queue
      *aql_loc = *packet;

      // Print kernel name for kernel dispatch packets
      if (kernelNames && packetIndex < kernelNames->size()) {
        // Use doorbellHeader for first packet (packet->header is invalid), else use packet->header
        uint16_t headerForPrint = isFirstPacket ? doorbellHeader : packet->header;
        uint8_t packetType =
            extractAqlBits(headerForPrint, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE);
        if (packetType == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
          ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN2, "Graph ShaderName : %s, device id : %u",
                  (*kernelNames)[packetIndex].c_str(), dev().index());

          ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
                  "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
                  "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
                  "setup=%d, grid=[%u, %u, %u], workgroup=[%u, %u, %u], "
                  "private_seg_size=%u, group_seg_size=%u, kernel_obj=0x%zx, "
                  "kernarg_address=0x%zx, completion_signal=0x%zx, correlation_id=%zu, "
                  "rptr=%u, wptr=%u",
                  gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, headerForPrint, packetType,
                  extractAqlBits(headerForPrint, HSA_PACKET_HEADER_BARRIER,
                                 HSA_PACKET_HEADER_WIDTH_BARRIER),
                  extractAqlBits(headerForPrint, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
                                 HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
                  extractAqlBits(headerForPrint, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                                 HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
                  packet->setup,
                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal,
                  reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->reserved2,
                  Hsa::queue_load_read_index_scacquire(gpu_queue_), index);
        }
      }

      // Restore the header of the first packet
      if (isFirstPacket) {
        packet->header = firstPacketHeader;
      }

      // Restore the header of the last packet (if different from first)
      if (isLastPacket && batchSize > 1) {
        packet->header = lastPacketHeader;
      }
    }

    // Write valid header for the first packet in the batch
    AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[startIndex & queueMask];
    packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), doorbellHeader, firstPacketRest);

    // Ring doorbell for this batch
    Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, startIndex);

    processedPackets += batchSize;

    TrackQueueProgress(*packets[processedPackets - 1], startIndex + batchSize - 1);
    // Double the batch size for next iteration, cap at DEBUG_HIP_GRAPH_BATCH_SIZE
    if (batchSize < kMaxBatchSize) {
      batchSize *= 2;
    }
  }

  // Mark the flag indicating if a dispatch is outstanding
  hasPendingDispatch_ = true;

  // Wait on signal for the last packet if blocking
  if (blocking) {
    LogInfo("Runtime reached the AQL queue limit. SW is much ahead of HW. Blocking AQL queue!");
    if (!Barriers().WaitCurrent()) {
      LogPrintfError("Failed blocking queue wait with signal [0x%lx]",
                     packets.back()->completion_signal.handle);
      return false;
    }
  }

  return true;
}

// ================================================================================================
bool VirtualGPU::dispatchAqlPacketBatch(const std::vector<uint8_t*>& packets,
                                        const std::vector<std::string>& kernelNames,
                                        amd::AccumulateCommand* vcmd) {
  if (vcmd == nullptr || packets.empty() || packets.size() != kernelNames.size()) {
    return false;
  }

  amd::ScopedLock lock(execution());
  profilingBegin(*vcmd);

  dispatchBlockingWait();

  vcmd->setKernelNamesRef(&kernelNames);

  // Dispatch all packets with a single doorbell ring
  // Cast packets vector to AQL packets vector on the fly
  const auto& aqlPackets =
      reinterpret_cast<const std::vector<hsa_kernel_dispatch_packet_t*>&>(packets);
  bool result = dispatchGenericAqlPacketBatch(aqlPackets, false, false, &kernelNames);

  profilingEnd();

  return result;
}

// ================================================================================================
bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
                                          const uint32_t gfxVersion, bool blocking,
                                          const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi) {
  // PM4 IB packet submission is different between GFX8 and GFX9:
  //  In GFX8 the PM4 IB packet blob is writing directly to AQL queue
  //  In GFX9 the PM4 IB is submitting by AQL Vendor Specific packet and
  switch (gfxVersion) {
    case PerfCounter::ROC_GFX9:
    case PerfCounter::ROC_GFX10: {
      packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
      return dispatchGenericAqlPacket(packet, 0, 0, blocking);
    } break;
  }

  return false;
}

// ================================================================================================
void VirtualGPU::WaitCompleteSignal(hsa_signal_t signal) {
  barrier_packet_.dep_signal[0] = signal;
  dispatchBarrierPacket(kBarrierPacketHeader, false);
}

// ================================================================================================
void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
                                       hsa_signal_t signal) {
  const uint32_t queueSize = gpu_queue_->size;
  const uint32_t queueMask = queueSize - 1;

  if (!skipSignal) {
    // Make sure the wait is issued before queue index reservation
    auto wait_signals = Barriers().WaitingSignal();
    for (uint32_t i = 0; i < wait_signals.size(); ++i) {
      uint32_t j = i % 5;
      barrier_packet_.dep_signal[j] = wait_signals[i];
      constexpr bool kSkipSignal = true;
      // If runtime reached the packet limit and signals left, then flush the barrier
      if ((j == 4) && ((i + 1) < wait_signals.size())) {
        dispatchBarrierPacket(kNopPacketHeader, kSkipSignal);
      }
    }
  }

  uint64_t index = Hsa::queue_add_write_index_screlease(gpu_queue_, 1);
  uint64_t read = Hsa::queue_load_read_index_relaxed(gpu_queue_);

  setFenceDirty(true);
  auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                                    HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
  if (!skipSignal && (signal.handle == 0)) {
    // Get active signal for current dispatch if profiling is necessary
    barrier_packet_.completion_signal = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
  } else {
    // Attach external signal to the packet
    barrier_packet_.completion_signal = signal;
  }

  TrackQueueProgress(barrier_packet_, index);

  // Reset fence_dirty_ flag if we submit a barrier with system scopes
  if (cache_state == amd::Device::kCacheStateSystem) {
    setFenceDirty(false);
  }

  while ((index - Hsa::queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
  hsa_barrier_and_packet_t* aql_loc =
      &(reinterpret_cast<hsa_barrier_and_packet_t*>(gpu_queue_->base_address))[index & queueMask];
  *aql_loc = barrier_packet_;
  packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), packetHeader, 0);

  Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, index);
  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
          "SWq=0x%zx, HWq=0x%zx, id=%d, BarrierAND Header = 0x%x (type=%d, barrier=%d, acquire=%d,"
          " release=%d), "
          "dep_signal=[0x%zx, 0x%zx, 0x%zx, 0x%zx, 0x%zx], completion_signal=0x%zx, "
          "rptr=%u, wptr=%u",
          gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, packetHeader,
          extractAqlBits(packetHeader, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE),
          extractAqlBits(packetHeader, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
          extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
                         HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
          extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
          barrier_packet_.dep_signal[0], barrier_packet_.dep_signal[1],
          barrier_packet_.dep_signal[2], barrier_packet_.dep_signal[3],
          barrier_packet_.dep_signal[4], barrier_packet_.completion_signal, read, index);

  // Clear dependent signals for the next packet
  barrier_packet_.dep_signal[0] = hsa_signal_t{};
  barrier_packet_.dep_signal[1] = hsa_signal_t{};
  barrier_packet_.dep_signal[2] = hsa_signal_t{};
  barrier_packet_.dep_signal[3] = hsa_signal_t{};
  barrier_packet_.dep_signal[4] = hsa_signal_t{};
}

// ================================================================================================
void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveDepSignal,
                                            hsa_signal_t signal, hsa_signal_value_t value,
                                            hsa_signal_value_t mask, hsa_signal_condition32_t cond,
                                            bool skipTs, hsa_signal_t completionSignal) {
  uint16_t rest = HSA_AMD_PACKET_TYPE_BARRIER_VALUE;
  const uint32_t queueSize = gpu_queue_->size;
  const uint32_t queueMask = queueSize - 1;

  barrier_value_packet_.signal = signal;
  barrier_value_packet_.value = value;
  barrier_value_packet_.mask = mask;
  barrier_value_packet_.cond = cond;

  // Dependent signal and external signal cant be true at the same time
  assert(resolveDepSignal & (signal.handle != 0) == 0);
  if (resolveDepSignal) {
    auto wait_signals = Barriers().WaitingSignal();
    if (wait_signals.size() > 0) {
      barrier_value_packet_.signal = wait_signals[0];
      barrier_value_packet_.value = kInitSignalValueOne;
      barrier_value_packet_.mask = std::numeric_limits<int64_t>::max();
      barrier_value_packet_.cond = HSA_SIGNAL_CONDITION_LT;
      for (uint32_t i = 1; i < wait_signals.size(); ++i) {
        uint32_t j = (i - 1) % 5;
        barrier_packet_.dep_signal[j] = wait_signals[i];
        constexpr bool kSkipSignal = true;
        // If runtime reached the packet limit or the count limit, then flush the barrier
        if ((j == 4) || ((i + 1) == wait_signals.size())) {
          dispatchBarrierPacket(kNopPacketHeader, kSkipSignal);
        }
      }
    }
  }

  setFenceDirty(true);
  auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                                    HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);

  if (completionSignal.handle == 0) {
    // Get active signal for current dispatch if profiling is necessary
    barrier_value_packet_.completion_signal =
        Barriers().ActiveSignal(kInitSignalValueOne, skipTs ? nullptr : timestamp_);
  } else {
    // Attach external signal to the packet
    barrier_value_packet_.completion_signal = completionSignal;
  }

  // Reset fence_dirty_ flag if we submit a barrier
  if (cache_state == amd::Device::kCacheStateSystem) {
    setFenceDirty(false);
  }

  uint64_t index = Hsa::queue_add_write_index_screlease(gpu_queue_, 1);
  uint64_t read = Hsa::queue_load_read_index_relaxed(gpu_queue_);

  TrackQueueProgress(barrier_value_packet_, index);

  while ((index - Hsa::queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
  hsa_amd_barrier_value_packet_t* aql_loc = &(reinterpret_cast<hsa_amd_barrier_value_packet_t*>(
      gpu_queue_->base_address))[index & queueMask];
  *aql_loc = barrier_value_packet_;
  packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), packetHeader, rest);

  Hsa::signal_store_screlease(gpu_queue_->doorbell_signal, index);

  ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_AQL,
          "SWq=0x%zx, HWq=0x%zx, id=%d, BarrierValue Header = 0x%x AmdFormat = 0x%x "
          "(type=%d, barrier=%d, acquire=%d, release=%d), "
          "signal=0x%zx, value = 0x%llx mask = 0x%llx cond: %s, completion_signal=0x%zx, "
          "rptr=%u, wptr=%u",
          gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, packetHeader, rest,
          extractAqlBits(packetHeader, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE),
          extractAqlBits(packetHeader, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
          extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
                         HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
          cache_state, barrier_value_packet_.signal, barrier_value_packet_.value,
          barrier_value_packet_.mask,
          barrier_value_packet_.cond == 0   ? "EQ"
          : barrier_value_packet_.cond == 1 ? "NE"
          : barrier_value_packet_.cond == 2 ? "LT"
                                            : "GTE",
          barrier_value_packet_.completion_signal, read, index);
  // Clear dependent signals for the next packet
  barrier_value_packet_.signal = hsa_signal_t{};
}

// ================================================================================================
void VirtualGPU::ResetQueueStates() {
  // Release all memory dependencies
  memoryDependency().clear();

  // Release the pool, since runtime just completed a barrier
  // @note: Runtime can reset kernel arg pool only if the barrier with L2 invalidation was issued
  resetKernArgPool();
}

// ================================================================================================
bool VirtualGPU::releaseGpuMemoryFence(bool skip_cpu_wait) {
  if (hasPendingDispatch_ || isFenceDirty() || !Barriers().IsExternalSignalListEmpty()) {
    // Dispatch barrier packet into the queue
    dispatchBarrierPacket(kBarrierPacketHeader);
    hasPendingDispatch_ = false;
    retainExternalSignals_ = false;
  }

  // Check if runtime could skip CPU wait
  if (!skip_cpu_wait) {
    Barriers().WaitCurrent();

    ResetQueueStates();
  }
  return true;
}

// ================================================================================================
VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
                       const std::vector<uint32_t>& cuMask, amd::CommandQueue::Priority priority)
    : device::VirtualDevice(device),
      state_(0),
      gpu_queue_(nullptr),
      roc_device_(device),
      virtualQueue_(nullptr),
      deviceQueueSize_(0),
      maskGroups_(0),
      schedulerThreads_(0),
      schedulerQueue_(nullptr),
      barriers_(*this),
      managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
      managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_),
      cuMask_(cuMask),
      priority_(priority),
     copy_command_type_(0),
     fence_state_(Device::CacheState::kCacheStateInvalid),
     fence_dirty_(false) {
  index_ = device.numOfVgpus_++;
  gpu_device_ = device.getBackendDevice();
  printfdbg_ = nullptr;

  // Initialize the last signal and dispatch flags
  timestamp_ = nullptr;
  command_ = nullptr;
  hasPendingDispatch_ = false;
  profiling_ = profiling;
  cooperative_ = cooperative;

  // Initialize barrier and barrier value packets
  barrier_packet_.header = kInvalidAql;
  barrier_value_packet_.header.header = kInvalidAql;

  constexpr uint16_t kernelDispatchHBits =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE);
  constexpr uint16_t barrierHBits = (1 << HSA_PACKET_HEADER_BARRIER);
  constexpr uint16_t agentScopeHBits =
      (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  constexpr uint16_t systemScopeHBits =
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  // acquire=SYSTEM, release=AGENT (needed for GFX12)
  constexpr uint16_t sysAcquireAgentReleaseHBits =
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);

  if (device.settings().fenceScopeAgent_) {
    const auto& isa = device.isa();
    const bool isGfx12 = (isa.versionMajor() == 12) && (isa.versionMinor() == 0) &&
                         (isa.versionStepping() == 0 || isa.versionStepping() == 1);

    dispatchPacketHeaderNoSync_ =
        (kernelDispatchHBits | (isGfx12 ? sysAcquireAgentReleaseHBits : agentScopeHBits));
    dispatchPacketHeader_ = (kernelDispatchHBits | barrierHBits |
                             (isGfx12 ? sysAcquireAgentReleaseHBits : agentScopeHBits));
  } else {
    dispatchPacketHeaderNoSync_ = (kernelDispatchHBits | systemScopeHBits);
    dispatchPacketHeader_ = (kernelDispatchHBits | barrierHBits | systemScopeHBits);
  }

  aqlHeader_ = dispatchPacketHeader_;

  // Note: Virtual GPU device creation must be a thread safe operation
  roc_device_.vgpus_.resize(roc_device_.numOfVgpus_);
  roc_device_.vgpus_[index()] = this;
}

// ================================================================================================
VirtualGPU::~VirtualGPU() {
  delete blitMgr_;

  if (tracking_created_) {
    amd::ScopedLock l(execution());
    if (gpu_queue_ == nullptr) {
      gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
    }
    // Release the resources of signal
    releaseGpuMemoryFence();
  }

  releasePinnedMem();

  if (timestamp_ != nullptr) {
    timestamp_->release();
    timestamp_ = nullptr;
    LogError("There was a timestamp that was not used; deleting.");
  }

  delete printfdbg_;

  if (nullptr != schedulerQueue_) {
    Hsa::queue_destroy(schedulerQueue_);
  }

  if (nullptr != virtualQueue_) {
    virtualQueue_->release();
  }

  // Lock the device to make the following thread safe
  amd::ScopedLock lock(roc_device_.vgpusAccess());

  --roc_device_.numOfVgpus_;  // Virtual gpu unique index decrementing
  roc_device_.vgpus_.erase(roc_device_.vgpus_.begin() + index());
  for (uint idx = index(); idx < roc_device_.vgpus().size(); ++idx) {
    roc_device_.vgpus()[idx]->index_--;
  }

  if (gpu_queue_ != nullptr) {
    roc_device_.releaseQueue(gpu_queue_, cuMask_, cooperative_);
  }
}

// ================================================================================================
bool VirtualGPU::create() {
  // Pick a reasonable queue size
  uint32_t queue_size = ROC_AQL_QUEUE_SIZE;
  gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_, priority_);
  if (!gpu_queue_) return false;

  if (!managed_kernarg_buffer_.Create(Device::MemorySegment::kKernArg)) {
    LogError("Couldn't allocate arguments/signals for the queue");
    return false;
  }

  device::BlitManager::Setup blitSetup;
  blitMgr_ = new KernelBlitManager(*this, blitSetup);
  if ((nullptr == blitMgr_) || !blitMgr_->create(roc_device_)) {
    LogError("Could not create BlitManager!");
    return false;
  }

  // Create a object of PrintfDbg
  printfdbg_ = new PrintfDbg(roc_device_);
  if (nullptr == printfdbg_) {
    LogError("\nCould not create printfDbg Object!");
    return false;
  }

  // Initialize timestamp conversion factor
  if (Timestamp::getGpuTicksToTime() == 0) {
    uint64_t frequency;
    Hsa::system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency);
    Timestamp::setGpuTicksToTime(1e9 / double(frequency));
  }

  if (!memoryDependency().create(GPU_NUM_MEM_DEPENDENCY)) {
    LogError("Could not create the array of memory objects!");
    return false;
  }

  // Allocate signal tracker for ROCr copy queue
  tracking_created_ = Barriers().Create();
  if (!tracking_created_) {
    LogError("Could not create signal for copy queue!");
    return false;
  }
  // Create managed buffer for staging copies
  if (!managed_buffer_.Create(Device::MemorySegment::kNoAtomics)) {
    LogError("Could not create managed buffer for this queue!");
    return false;
  }
  // Release HW queue until the first usage
  ReleaseHwQueue();
  return true;
}

// ================================================================================================
VirtualGPU::ManagedBuffer::~ManagedBuffer() {
  for (auto& it : pool_signal_) {
    if (it.handle != 0) {
      Hsa::signal_destroy(it);
    }
  }
  if (pool_base_ != nullptr) {
    gpu_.dev().hostFree(pool_base_, pool_size_);
  }
}

// ================================================================================================
bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) {
  pool_chunk_end_ = pool_size_ / kPoolNumSignals;
  active_chunk_ = 0;
  // Allocate memory for managed buffer
  if (mem_segment == Device::MemorySegment::kKernArg &&
      (gpu_.dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) &&
      gpu_.dev().info().largeBar_) {
    amd::Device::AllocationFlags flags = {};
    flags.executable_ = true;
    pool_base_ = reinterpret_cast<address>(gpu_.dev().deviceLocalAlloc(pool_size_, flags));
    if (pool_base_ != nullptr) {
      // @note Workaround first access penalty.
      // KFD may update CPU page tables on the first CPU access
      *pool_base_ = 0;
    }
  } else {
    pool_base_ = reinterpret_cast<address>(gpu_.dev().hostAlloc(pool_size_, 0, mem_segment));
  }
  if (pool_base_ == nullptr) {
    return false;
  }
  hsa_agent_t agent = gpu_.dev().getBackendDevice();
  for (auto& it : pool_signal_) {
    if (HSA_STATUS_SUCCESS != Hsa::signal_create(0, 1, &agent, HSA_AMD_SIGNAL_AMD_GPU_ONLY, &it)) {
      return false;
    }
  }
  return true;
}

// ================================================================================================
address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) {
  return Acquire(size, gpu_.dev().info().globalMemCacheLineSize_);
}

// ================================================================================================
address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
  assert(alignment != 0);
  address result = nullptr;
  result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
  const size_t pool_new_usage = (result + size) - pool_base_;
  if (pool_new_usage <= pool_chunk_end_) {
    pool_cur_offset_ = pool_new_usage;
    return result;
  } else {
    // Reset the signal for the barrier packet
    Hsa::signal_silent_store_relaxed(pool_signal_[active_chunk_], kInitSignalValueOne);
    ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN, "Issue barrier to flush chunk %d",
            active_chunk_);
    // Currently don't skip wait signal check, because SDMA engine cna be used in staging copy
    constexpr bool kSkipSignal = false;
    // Dispatch a barrier packet into the queue
    gpu_.dispatchBarrierPacket(kBarrierPacketHeader, kSkipSignal, pool_signal_[active_chunk_]);
    // Get the next chunk
    active_chunk_ = ++active_chunk_ % kPoolNumSignals;
    // Make sure the new active chunk is free
    bool test = WaitForSignal(pool_signal_[active_chunk_], gpu_.ActiveWait());
    assert(test && "Runtime can't fail a wait for chunk!");
    // Make sure the current offset matches the new chunk to avoid possible overlaps
    // between chunks and issues during recycle
    pool_cur_offset_ = (active_chunk_ == 0) ? 0 : pool_chunk_end_;
    pool_chunk_end_ = pool_cur_offset_ + pool_size_ / kPoolNumSignals;
    result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
    pool_cur_offset_ = (result + size) - pool_base_;
  }

  return result;
}

// ================================================================================================
void VirtualGPU::ManagedBuffer::ResetPool() {
  pool_cur_offset_ = 0;
  pool_chunk_end_ = pool_size_ / kPoolNumSignals;
  active_chunk_ = 0;
}

// ================================================================================================
void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
  return managed_kernarg_buffer_.Acquire(size, alignment);
}

// ================================================================================================
address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
  if (ROC_SKIP_KERNEL_ARG_COPY) {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());
    return reinterpret_cast<address>(allocKernArg(size, alignment));
  } else {
    return nullptr;
  }
}

// ================================================================================================
void VirtualGPU::ReleaseAllHwQueues() {
  if (roc_device_.settings().dynamic_queues_ &&
      (roc_device_.NumNormalQueues() > GPU_MAX_HW_QUEUES)) {
    // Lock the device to make the following thread safe
    amd::ScopedLock lock(roc_device_.vgpusAccess());
    for (uint idx = 0; idx < roc_device_.vgpus().size(); ++idx) {
      roc_device_.vgpus()[idx]->ReleaseHwQueue();
    }
  }
}

// ================================================================================================
void VirtualGPU::ReleaseHwQueue() {
  // Try to release normal queue to the pool of active queues
  if (roc_device_.settings().dynamic_queues_ &&
      (priority_ == amd::CommandQueue::Priority::Normal) && !cooperative_ &&
      (cuMask_.size() == 0)) {
    amd::ScopedLock lock(execution());
    if (gpu_queue_ != nullptr) {
      if (IsQueueIdle()) {
        if (roc_device_.ReleaseActiveNormalQueue(gpu_queue_)) {
          gpu_queue_ = nullptr;
        }
      }
    }
  }
}

// ================================================================================================
/* profilingBegin, when profiling is enabled, creates a timestamp to save in
 * virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
 * and then calls start() to get the current host timestamp.
 */
void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
  if (gpu_queue_ == nullptr) {
    gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
  }
  // Track the current command
  command_ = &command;

  // Disable profiling when command is being captured to prevent memory leak from created timestamp_
  // which won't get freed, since the command is not being executed until graph launch
  if (!command.getPktCapturingState() && command.profilingInfo().enabled_) {
    if (timestamp_ != nullptr) {
      LogWarning(
          "Trying to create a second timestamp in VirtualGPU. \
                  This could have unintended consequences.");
      return;
    }

    // Without barrier profiling will wait for each individual signal
    timestamp_ = new Timestamp(this, command);
    command.data().emplace_back(timestamp_);
    timestamp_->start();

    // Enable SDMA profiling on the first access if profiling is set
    // Its not per command basis
    if (sdmaProfiling && !Barriers().GetSDMAProfiling()) {
      Barriers().SetSDMAProfiling(true);
    }
  }

  if (AMD_DIRECT_DISPATCH) {
    if (!retainExternalSignals_) {
      Barriers().ClearExternalSignals();
    }
    for (auto it = command.eventWaitList().begin(); it < command.eventWaitList().end(); ++it) {
      void* hw_event =
          ((*it)->NotifyEvent() != nullptr) ? (*it)->NotifyEvent()->HwEvent() : (*it)->HwEvent();
      if (hw_event != nullptr) {
        Barriers().AddExternalSignal(reinterpret_cast<ProfilingSignal*>(hw_event));
      } else if (static_cast<amd::Command*>(*it)->queue() != command.queue() &&
                 ((*it)->status() != CL_COMPLETE)) {
        LogPrintfError("Waiting event(%p) doesn't have a HSA signal!\n", *it);
      } else {
        // Assume serialization on the same queue...
      }

      // Check if the waiting event's queue has a dirty fence and propagate it
      if (!isFenceDirty()) {
        amd::Command* wait_cmd = static_cast<amd::Command*>(*it);
        if (wait_cmd->queue() != nullptr && wait_cmd->queue() != command.queue()) {
          device::VirtualDevice* wait_vdev = wait_cmd->queue()->vdev();
          if (wait_vdev != nullptr && wait_vdev->isFenceDirty()) {
            setFenceDirty(true);
          }
        }
      }
    }
  }
}

// ================================================================================================
/* profilingEnd, when profiling is enabled, checks to see if a signal was
 * created for whatever command we are running and calls end() to get the
 * current host timestamp if no signal is available.
 */
void VirtualGPU::profilingEnd(bool clearHwEvent) {
  if (!command_->getPktCapturingState() && command_->profilingInfo().enabled_) {
    if (timestamp_->HwProfiling() == false) {
      timestamp_->end();
    }
    timestamp_ = nullptr;
  }
  if (AMD_DIRECT_DISPATCH) {
    assert(retainExternalSignals_ || Barriers().IsExternalSignalListEmpty());
  }

  // Certain commands like map/unmap memory may not need hw_events as its not a
  // queue operation. In such cases clear already set events which may have been for sync
  // before some memory map/unmap operation
  if (clearHwEvent) {
    if (command_->HwEvent() != nullptr) {
      reinterpret_cast<ProfilingSignal*>(command_->HwEvent())->release();
      command_->SetHwEvent(nullptr);
    }
  }

  // Clear the command tracking
  command_ = nullptr;
}

// ================================================================================================
void VirtualGPU::updateCommandsState(amd::Command* list) const {
  Timestamp* ts = nullptr;

  amd::Command* current = list;
  amd::Command* next = nullptr;

  if (current == nullptr) {
    return;
  }

  uint64_t endTimeStamp = 0;
  uint64_t startTimeStamp = endTimeStamp;

  if (current->profilingInfo().enabled_) {
    // TODO: use GPU timestamp when available.
    endTimeStamp = amd::Os::timeNanos();
    startTimeStamp = endTimeStamp;

    // This block gets the first valid timestamp from the first command
    // that has one. This timestamp is used below to mark any command that
    // came before it to start and end with this first valid start time.
    current = list;
    while (current != nullptr) {
      if (!current->data().empty()) {
        ts = reinterpret_cast<Timestamp*>(current->data().back());
        ts->getTime(&startTimeStamp, &endTimeStamp);
        break;
      }
      current = current->getNext();
    }
  }

  // Iterate through the list of commands, and set timestamps as appropriate
  // Note, if a command does not have a timestamp, it does one of two things:
  // - if the command (without a timestamp), A, precedes another command, C,
  // that _does_ contain a valid timestamp, command A will set RUNNING and
  // COMPLETE with the RUNNING (start) timestamp from command C. This would
  // also be true for command B, which is between A and C. These timestamps
  // are actually retrieved in the block above (startTimeStamp, endTimeStamp).
  // - if the command (without a timestamp), C, follows another command, A,
  // that has a valid timestamp, command C will be set RUNNING and COMPLETE
  // with the COMPLETE (end) timestamp of the previous command, A. This is
  // also true for any command B, which falls between A and C.
  current = list;
  while (current != nullptr) {
    if (current->profilingInfo().enabled_) {
      if (!current->data().empty()) {
        for (auto i = 0; i < current->data().size(); i++) {
          // Since this is a valid command to get a timestamp, we use the
          // timestamp provided by the runtime (saved in the data())
          ts = reinterpret_cast<Timestamp*>(current->data()[i]);
          ts->getTime(&startTimeStamp, &endTimeStamp);
          ts->release();
        }
        current->data().clear();
      } else {
        // If we don't have a command that contains a valid timestamp,
        // we simply use the end timestamp of the previous command.
        // Note, if this is a command before the first valid timestamp,
        // this will be equal to the start timestamp of the first valid
        // timestamp at this point.
        startTimeStamp = endTimeStamp;
      }
    }

    if (current->status() == CL_SUBMITTED) {
      current->setStatus(CL_RUNNING, startTimeStamp);
      current->setStatus(CL_COMPLETE, endTimeStamp);
    } else if (current->status() != CL_COMPLETE) {
      LogPrintfError("Unexpected command status - %d.", current->status());
    }

    next = current->getNext();
    current->release();
    current = next;
  }
}

// ================================================================================================

void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd, true);

  size_t offset = 0;
  // Find if virtual address is a CL allocation
  device::Memory* hostMemory = dev().findMemoryFromVA(cmd.destination(), &offset);

  Memory* devMem = dev().getRocMemory(&cmd.source());
  // Synchronize data with other memory instances if necessary
  devMem->syncCacheFromHost(*this);

  void* dst = cmd.destination();
  amd::Coord3D size = cmd.size();

  //! @todo: add multi-devices synchronization when supported.

  cl_command_type type = cmd.type();
  bool result = false;
  bool imageBuffer = false;

  // Force buffer read for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_READ_IMAGE) && (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    type = CL_COMMAND_READ_BUFFER;
    imageBuffer = true;
  }

  switch (type) {
    case CL_COMMAND_READ_BUFFER: {
      amd::Coord3D origin(cmd.origin()[0]);
      if (imageBuffer) {
        size_t elemSize = cmd.source().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;
      }
      if (hostMemory != nullptr) {
        // Accelerated transfer without pinning
        amd::Coord3D dstOrigin(offset);
        result = blitMgr().copyBuffer(*devMem, *hostMemory, origin, dstOrigin, size,
                                      cmd.isEntireMemory(), cmd.copyMetadata());
      } else {
        result = blitMgr().readBuffer(*devMem, dst, origin, size, cmd.isEntireMemory(),
                                      cmd.copyMetadata());
      }
      break;
    }
    case CL_COMMAND_READ_BUFFER_RECT: {
      amd::BufferRect hostbufferRect;
      amd::Coord3D region(0);
      amd::Coord3D hostOrigin(cmd.hostRect().start_ + offset);
      hostbufferRect.create(hostOrigin.c, size.c, cmd.hostRect().rowPitch_,
                            cmd.hostRect().slicePitch_);
      if (hostMemory != nullptr) {
        result = blitMgr().copyBufferRect(*devMem, *hostMemory, cmd.bufRect(), hostbufferRect, size,
                                          cmd.isEntireMemory(), cmd.copyMetadata());
      } else {
        result = blitMgr().readBufferRect(*devMem, dst, cmd.bufRect(), cmd.hostRect(), size,
                                          cmd.isEntireMemory(), cmd.copyMetadata());
      }
      break;
    }
    case CL_COMMAND_READ_IMAGE: {
      if ((cmd.source().parent() != nullptr) &&
          (cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) {
        Image* imageBuffer = static_cast<Image*>(devMem);
        // Check if synchronization has to be performed
        if (nullptr != imageBuffer->CopyImageBuffer()) {
          amd::Memory* memory = imageBuffer->CopyImageBuffer();
          devMem = dev().getGpuMemory(memory);
          Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
          amd::Image* image = imageBuffer->owner()->asImage();
          amd::Coord3D offs(0);
          // Copy memory from the original image buffer into the backing store image
          result = blitMgr().copyBufferToImage(*buffer, *devMem, offs, offs, image->getRegion(),
                                               true, image->getRowPitch(), image->getSlicePitch());
        }
      }
      if (hostMemory != nullptr) {
        // Accelerated image to buffer transfer without pinning
        amd::Coord3D dstOrigin(offset);
        result = blitMgr().copyImageToBuffer(*devMem, *hostMemory, cmd.origin(), dstOrigin, size,
                                             cmd.isEntireMemory(), cmd.rowPitch(), cmd.slicePitch(),
                                             cmd.copyMetadata());
      } else {
        result = blitMgr().readImage(*devMem, dst, cmd.origin(), size, cmd.rowPitch(),
                                     cmd.slicePitch(), cmd.isEntireMemory(), cmd.copyMetadata());
      }
      break;
    }
    default:
      ShouldNotReachHere();
      break;
  }

  if (!result) {
    LogError("submitReadMemory failed!");
    cmd.setStatus(CL_OUT_OF_RESOURCES);
  }

  profilingEnd();
}

void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd, true);

  size_t offset = 0;
  // Find if virtual address is a CL allocation
  device::Memory* hostMemory = dev().findMemoryFromVA(cmd.source(), &offset);

  Memory* devMem = dev().getRocMemory(&cmd.destination());

  // Synchronize memory from host if necessary
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = cmd.isEntireMemory();
  devMem->syncCacheFromHost(*this, syncFlags);

  const char* src = static_cast<const char*>(cmd.source());
  amd::Coord3D size = cmd.size();

  //! @todo add multi-devices synchronization when supported.

  cl_command_type type = cmd.type();
  bool result = false;
  bool imageBuffer = false;

  // Force buffer write for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_WRITE_IMAGE) &&
      (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    type = CL_COMMAND_WRITE_BUFFER;
    imageBuffer = true;
  }

  switch (type) {
    case CL_COMMAND_WRITE_BUFFER: {
      amd::Coord3D origin(cmd.origin()[0]);
      if (imageBuffer) {
        size_t elemSize = cmd.destination().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;
      }
      if (hostMemory != nullptr) {
        // Accelerated transfer without pinning
        amd::Coord3D srcOrigin(offset);
        result = blitMgr().copyBuffer(*hostMemory, *devMem, srcOrigin, origin, size,
                                      cmd.isEntireMemory(), cmd.copyMetadata());
      } else {
        result = blitMgr().writeBuffer(src, *devMem, origin, size, cmd.isEntireMemory(),
                                       cmd.copyMetadata());
      }
      break;
    }
    case CL_COMMAND_WRITE_BUFFER_RECT: {
      amd::BufferRect hostbufferRect;
      amd::Coord3D region(0);
      amd::Coord3D hostOrigin(cmd.hostRect().start_ + offset);
      hostbufferRect.create(hostOrigin.c, size.c, cmd.hostRect().rowPitch_,
                            cmd.hostRect().slicePitch_);
      if (hostMemory != nullptr) {
        result = blitMgr().copyBufferRect(*hostMemory, *devMem, hostbufferRect, cmd.bufRect(), size,
                                          cmd.isEntireMemory(), cmd.copyMetadata());
      } else {
        result = blitMgr().writeBufferRect(src, *devMem, cmd.hostRect(), cmd.bufRect(), size,
                                           cmd.isEntireMemory(), cmd.copyMetadata());
      }
      break;
    }
    case CL_COMMAND_WRITE_IMAGE: {
      if (hostMemory != nullptr) {
        // Accelerated buffer to image transfer without pinning
        amd::Coord3D srcOrigin(offset);
        result = blitMgr().copyBufferToImage(*hostMemory, *devMem, srcOrigin, cmd.origin(), size,
                                             cmd.isEntireMemory(), cmd.rowPitch(), cmd.slicePitch(),
                                             cmd.copyMetadata());
      } else {
        result = blitMgr().writeImage(src, *devMem, cmd.origin(), size, cmd.rowPitch(),
                                      cmd.slicePitch(), cmd.isEntireMemory(), cmd.copyMetadata());
      }
      break;
    }
    default:
      ShouldNotReachHere();
      break;
  }

  if (!result) {
    LogError("submitWriteMemory failed!");
    cmd.setStatus(CL_OUT_OF_RESOURCES);
  } else {
    cmd.destination().signalWrite(&dev());
  }

  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  // in-order semantics: previous commands need to be done before we start
  releaseGpuMemoryFence();

  profilingBegin(cmd);
  const std::vector<void*>& svmPointers = cmd.svmPointers();
  if (cmd.pfnFreeFunc() == nullptr) {
    // pointers allocated using clSVMAlloc
    for (uint32_t i = 0; i < svmPointers.size(); i++) {
      amd::SvmBuffer::free(cmd.context(), svmPointers[i]);
    }
  } else {
    cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(),
                      (void**)(&(svmPointers[0])), cmd.userData());
  }
  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(cmd);

  if (dev().info().hmmSupported_) {
    // Initialize signal for the barrier
    auto wait_events = Barriers().WaitingSignal(HwQueueEngine::Unknown);
    hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);

    // Find the requested agent for the transfer
    hsa_agent_t agent =
        (cmd.cpu_access() || (dev().settings().hmmFlags_ & Settings::Hmm::EnableSystemMemory))
            ? dev().getCpuAgent(cmd.numa_id())
            : (static_cast<const roc::Device*>(cmd.device()))->getBackendDevice();

    // Initiate a prefetch command
    hsa_status_t status =
        Hsa::svm_prefetch_async(const_cast<void*>(cmd.dev_ptr()), cmd.count(), agent,
                                wait_events.size(), wait_events.data(), active);
    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
            "HSA prefetch async dev_ptr=0x%zx, count=%d, wait_event=0x%zx, "
            "completion_signal=0x%zx",
            const_cast<void*>(cmd.dev_ptr()), cmd.count(),
            (wait_events.size() != 0) ? wait_events[0].handle : 0, active.handle);

    if ((status != HSA_STATUS_SUCCESS)) {
      Barriers().ResetCurrentSignal();
      LogError("hsa_amd_svm_prefetch_async failed");
      cmd.setStatus(CL_INVALID_OPERATION);
    }

    // Add system scope, since the prefetch scope is unclear
    addSystemScope();
  } else {
    LogWarning("hsa_amd_svm_prefetch_async is ignored, because no HMM support");
  }
  profilingEnd();
}

// ================================================================================================
bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memory& dstMem,
                            bool entire, const amd::Coord3D& srcOrigin,
                            const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
                            const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
                            amd::CopyMetadata copyMetadata) {
  Memory* srcDevMem = dev().getRocMemory(&srcMem);
  Memory* dstDevMem = dev().getRocMemory(&dstMem);
  if (srcDevMem == nullptr || dstDevMem == nullptr) {
    LogError("submitCopyMemory failed!");
    return false;
  }
  // Synchronize source and destination memory
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = entire;
  dstDevMem->syncCacheFromHost(*this, syncFlags);
  srcDevMem->syncCacheFromHost(*this);

  bool result = false;
  bool srcImageBuffer = false;
  bool dstImageBuffer = false;

  // Force buffer copy for IMAGE1D_BUFFER
  if (srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
    srcImageBuffer = true;
    type = CL_COMMAND_COPY_BUFFER;
  }
  if (dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
    dstImageBuffer = true;
    type = CL_COMMAND_COPY_BUFFER;
  }

  switch (type) {
    case CL_COMMAND_SVM_MEMCPY:
    case CL_COMMAND_COPY_BUFFER: {
      amd::Coord3D realSrcOrigin(srcOrigin[0]);
      amd::Coord3D realDstOrigin(dstOrigin[0]);
      amd::Coord3D realSize(size.c[0], size.c[1], size.c[2]);

      if (srcImageBuffer) {
        const size_t elemSize = srcMem.asImage()->getImageFormat().getElementSize();
        realSrcOrigin.c[0] *= elemSize;
        if (dstImageBuffer) {
          realDstOrigin.c[0] *= elemSize;
        }
        realSize.c[0] *= elemSize;
      } else if (dstImageBuffer) {
        const size_t elemSize = dstMem.asImage()->getImageFormat().getElementSize();
        realDstOrigin.c[0] *= elemSize;
        realSize.c[0] *= elemSize;
      }

      result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, realSrcOrigin, realDstOrigin, realSize,
                                    entire, copyMetadata);
      break;
    }
    case CL_COMMAND_COPY_BUFFER_RECT: {
      result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, srcRect, dstRect, size, entire,
                                        copyMetadata);
      break;
    }
    case CL_COMMAND_COPY_IMAGE: {
      result = blitMgr().copyImage(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, entire,
                                   copyMetadata);
      break;
    }
    case CL_COMMAND_COPY_IMAGE_TO_BUFFER: {
      result =
          blitMgr().copyImageToBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, entire,
                                      dstRect.rowPitch_, dstRect.slicePitch_, copyMetadata);
      break;
    }
    case CL_COMMAND_COPY_BUFFER_TO_IMAGE: {
      result =
          blitMgr().copyBufferToImage(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, entire,
                                      srcRect.rowPitch_, srcRect.slicePitch_, copyMetadata);
      break;
    }
    default:
      ShouldNotReachHere();
      break;
  }

  if (!result) {
    LogError("submitCopyMemory failed!");
    return false;
  }

  // Mark this as the most-recently written cache of the destination
  dstMem.signalWrite(&dev());
  return true;
}

// ================================================================================================
void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd, true);

  cl_command_type type = cmd.type();
  bool entire = cmd.isEntireMemory();

  if (!copyMemory(type, cmd.source(), cmd.destination(), entire, cmd.srcOrigin(), cmd.dstOrigin(),
                  cmd.size(), cmd.srcRect(), cmd.dstRect(), cmd.copyMetadata())) {
    cmd.setStatus(CL_INVALID_OPERATION);
  }

  // Runtime may change the command type to report a more accurate info in ROC profiler
  if (copy_command_type_ != 0) {
    cmd.OverrrideCommandType(copy_command_type_);
    copy_command_type_ = 0;
  }
  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd, true);
  // no op for FGS supported device
  if (!dev().isFineGrainedSystem(true)) {
    amd::Coord3D srcOrigin(0, 0, 0);
    amd::Coord3D dstOrigin(0, 0, 0);
    amd::Coord3D size(cmd.srcSize(), 1, 1);
    amd::BufferRect srcRect;
    amd::BufferRect dstRect;

    bool result = false;
    amd::Memory* srcMem = amd::MemObjMap::FindMemObj(cmd.src());
    amd::Memory* dstMem = amd::MemObjMap::FindMemObj(cmd.dst());

    device::Memory::SyncFlags syncFlags;
    if (nullptr != srcMem) {
      srcOrigin.c[0] =
          static_cast<const_address>(cmd.src()) - static_cast<address>(srcMem->getSvmPtr());
      if (!(srcMem->validateRegion(srcOrigin, size))) {
        cmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
    }
    if (nullptr != dstMem) {
      dstOrigin.c[0] =
          static_cast<const_address>(cmd.dst()) - static_cast<address>(dstMem->getSvmPtr());
      if (!(dstMem->validateRegion(dstOrigin, size))) {
        cmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
    }

    if ((nullptr == srcMem && nullptr == dstMem) ||  // both not in svm space
        (nullptr != srcMem && dev().forceFineGrain(srcMem)) ||
        (nullptr != dstMem && dev().forceFineGrain(dstMem))) {
      // Wait on a kernel if one is outstanding
      releaseGpuMemoryFence();

      // If these are from different contexts, then one of them could be in the device memory
      // This is fine, since spec doesn't allow for copies with pointers from different contexts
      std::memcpy(cmd.dst(), cmd.src(), cmd.srcSize());
      result = true;
    } else if (nullptr == srcMem && nullptr != dstMem) {  // src not in svm space
      Memory* memory = dev().getRocMemory(dstMem);
      // Synchronize source and destination memory
      syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size);
      memory->syncCacheFromHost(*this, syncFlags);

      result = blitMgr().writeBuffer(cmd.src(), *memory, dstOrigin, size,
                                     dstMem->isEntirelyCovered(dstOrigin, size));
      // Mark this as the most-recently written cache of the destination
      dstMem->signalWrite(&dev());
    } else if (nullptr != srcMem && nullptr == dstMem) {  // dst not in svm space
      Memory* memory = dev().getRocMemory(srcMem);
      // Synchronize source and destination memory
      memory->syncCacheFromHost(*this);

      result = blitMgr().readBuffer(*memory, cmd.dst(), srcOrigin, size,
                                    srcMem->isEntirelyCovered(srcOrigin, size));
    } else if (nullptr != srcMem && nullptr != dstMem) {  // both in svm space
      bool entire =
          srcMem->isEntirelyCovered(srcOrigin, size) && dstMem->isEntirelyCovered(dstOrigin, size);
      result = copyMemory(cmd.type(), *srcMem, *dstMem, entire, srcOrigin, dstOrigin, size, srcRect,
                          dstRect);
    }

    if (!result) {
      cmd.setStatus(CL_INVALID_OPERATION);
    }
  } else {
    // Stall GPU for CPU access to memory
    releaseGpuMemoryFence();
    // direct memcpy for FGS enabled system
    amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1);
  }
  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd, true);

  Memory* srcDevMem = static_cast<roc::Memory*>(
      cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0]));
  Memory* dstDevMem = static_cast<roc::Memory*>(
      cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0]));

  bool p2pAllowed = false;
  // Loop through all available P2P devices for the destination buffer
  for (auto agent : dstDevMem->dev().p2pAgents()) {
    // Find the device, which is matching the current
    if (agent.handle == dev().getBackendDevice().handle) {
      p2pAllowed = true;
      break;
    }

    for (auto agent : srcDevMem->dev().p2pAgents()) {
      if (agent.handle == dev().getBackendDevice().handle) {
        p2pAllowed = true;
        break;
      }
    }
  }

  // Synchronize source and destination memory
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = cmd.isEntireMemory();
  amd::Coord3D size = cmd.size();

  bool result = false;
  switch (cmd.type()) {
    case CL_COMMAND_COPY_BUFFER: {
      amd::Coord3D srcOrigin(cmd.srcOrigin()[0]);
      amd::Coord3D dstOrigin(cmd.dstOrigin()[0]);

      if (p2pAllowed) {
        result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size,
                                      cmd.isEntireMemory());
      } else {
        // Sync the current queue, since P2P staging uses the device queues for transfer
        releaseGpuMemoryFence();

        amd::ScopedLock lock(dev().P2PStageOps());
        Memory* dstStgMem = static_cast<Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
        Memory* srcStgMem = static_cast<Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));

        size_t copy_size = Device::kP2PStagingSize;
        size_t left_size = size[0];
        result = true;
        do {
          if (left_size <= copy_size) {
            copy_size = left_size;
          }
          left_size -= copy_size;
          amd::Coord3D stageOffset(0);
          amd::Coord3D cpSize(copy_size);

          // Perform 2 step transfer with staging buffer
          result &= srcDevMem->dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin,
                                                          stageOffset, cpSize);
          srcOrigin.c[0] += copy_size;
          result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset,
                                                          dstOrigin, cpSize);
          dstOrigin.c[0] += copy_size;
        } while (left_size > 0);
      }
      break;
    }
    case CL_COMMAND_COPY_BUFFER_RECT: {
      if (p2pAllowed) {
        result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(),
                                          size, cmd.isEntireMemory(), cmd.copyMetadata());
      } else {
        // Sync the current queue, since P2P staging uses the device queues for transfer
        releaseGpuMemoryFence();

        amd::ScopedLock lock(dev().P2PStageOps());
        Memory* dstStgMem = static_cast<Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
        Memory* srcStgMem = static_cast<Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));

        if ((cmd.srcRect().slicePitch_ * size[2]) <= Device::kP2PStagingSize) {
          result = true;
          // Perform 2 step transfer with staging buffer
          result &= srcDevMem->dev().xferMgr().copyBufferRect(*srcDevMem, *dstStgMem, cmd.srcRect(),
                                                              cmd.srcRect(), size, false,
                                                              cmd.copyMetadata());

          result &= dstDevMem->dev().xferMgr().copyBufferRect(*srcStgMem, *dstDevMem, cmd.srcRect(),
                                                              cmd.dstRect(), size, false,
                                                              cmd.copyMetadata());
        } else {
          size_t srcOffset;
          size_t dstOffset;
          result = true;

          for (size_t z = 0; z < size[2]; ++z) {
            for (size_t y = 0; y < size[1]; ++y) {
              srcOffset = cmd.srcRect().offset(0, y, z);
              dstOffset = cmd.dstRect().offset(0, y, z);

              amd::Coord3D srcOrigin(srcOffset);
              amd::Coord3D dstOrigin(dstOffset);
              size_t copy_size = Device::kP2PStagingSize;
              size_t left_size = size[0];
              amd::Coord3D stageOffset(0);
              do {
                if (left_size <= copy_size) {
                  copy_size = left_size;
                }
                left_size -= copy_size;

                // Perform 2 step transfer with staging buffer
                result &= srcDevMem->dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin,
                                                                stageOffset, copy_size);

                result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset,
                                                                dstOrigin, copy_size);

                srcOrigin.c[0] += copy_size;
                dstOrigin.c[0] += copy_size;
              } while (left_size > 0);
            }
          }
        }
      }
      break;
    }
    case CL_COMMAND_COPY_IMAGE:
    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
      LogError("Unsupported P2P type!");
      break;
    default:
      ShouldNotReachHere();
      break;
  }

  if (!result) {
    LogError("submitCopyMemoryP2P failed!");
    cmd.setStatus(CL_OUT_OF_RESOURCES);
  }

  cmd.destination().signalWrite(&dstDevMem->dev());

  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd, true);

  // no op for FGS supported device
  if (!dev().isFineGrainedSystem(true) && !dev().forceFineGrain(cmd.getSvmMem())) {
    // Make sure we have memory for the command execution
    Memory* memory = dev().getRocMemory(cmd.getSvmMem());

    memory->saveMapInfo(cmd.svmPtr(), cmd.origin(), cmd.size(), cmd.mapFlags(),
                        cmd.isEntireMemory());

    if (memory->mapMemory() != nullptr) {
      if (cmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) {
        Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory());

        if (!blitMgr().copyBuffer(*memory, *hsaMapMemory, cmd.origin(), cmd.origin(), cmd.size(),
                                  cmd.isEntireMemory())) {
          LogError("submitSVMMapMemory() - copy failed");
          cmd.setStatus(CL_MAP_FAILURE);
        }
        // Wait on a kernel if one is outstanding
        releaseGpuMemoryFence();
        const void* mappedPtr = hsaMapMemory->owner()->getHostMem();
        std::memcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]);
      }
    } else {
      LogError("Unhandled svm map!");
    }
  }

  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd, true);

  // no op for FGS supported device
  if (!dev().isFineGrainedSystem(true) && !dev().forceFineGrain(cmd.getSvmMem())) {
    Memory* memory = dev().getRocMemory(cmd.getSvmMem());
    const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(cmd.svmPtr());

    if (memory->mapMemory() != nullptr) {
      if (writeMapInfo->isUnmapWrite()) {
        // Wait on a kernel if one is outstanding
        releaseGpuMemoryFence();
        amd::Coord3D srcOrigin(0, 0, 0);
        Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory());

        void* mappedPtr = hsaMapMemory->owner()->getHostMem();
        std::memcpy(mappedPtr, cmd.svmPtr(), writeMapInfo->region_[0]);
        // Target is a remote resource, so copy
        if (!blitMgr().copyBuffer(*hsaMapMemory, *memory, writeMapInfo->origin_,
                                  writeMapInfo->origin_, writeMapInfo->region_,
                                  writeMapInfo->isEntire())) {
          LogError("submitSvmUnmapMemory() - copy failed");
          cmd.setStatus(CL_OUT_OF_RESOURCES);
        }
      }
    } else {
      LogError("Unhandled svm map!");
    }

    memory->clearUnmapInfo(cmd.svmPtr());
  }

  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd, true);

  //! @todo add multi-devices synchronization when supported.

  roc::Memory* devMemory =
      reinterpret_cast<roc::Memory*>(cmd.memory().getDeviceMemory(dev(), false));

  cl_command_type type = cmd.type();
  bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER);
  if (imageBuffer) {
    type = CL_COMMAND_MAP_BUFFER;
  }

  // Save map requirement.
  cl_map_flags mapFlag = cmd.mapFlags();

  // Treat no map flag as read-write.
  if (mapFlag == 0) {
    mapFlag = CL_MAP_READ | CL_MAP_WRITE;
  }

  devMemory->saveMapInfo(cmd.mapPtr(), cmd.origin(), cmd.size(), mapFlag, cmd.isEntireMemory());

  // Sync to the map target.
  // If we have host memory, use it
  if ((devMemory->owner()->getHostMem() != nullptr) &&
      (devMemory->owner()->getSvmPtr() == nullptr)) {
    if (!AMD_DIRECT_DISPATCH && !devMemory->isHostMemDirectAccess()) {
      // Make sure GPU finished operation before synchronization with the backing store
      releaseGpuMemoryFence();
    }
    // Target is the backing store, so just ensure that owner is up-to-date
    devMemory->owner()->cacheWriteBack(this);

    if (devMemory->isHostMemDirectAccess()) {
      // Add memory to VA cache, so rutnime can detect direct access to VA
      dev().addVACache(devMemory);
    }
  } else if (devMemory->IsPersistentDirectMap()) {
    // Persistent memory - NOP map
  } else if (mapFlag & (CL_MAP_READ | CL_MAP_WRITE)) {
    bool result = false;
    roc::Memory* hsaMemory = static_cast<roc::Memory*>(devMemory);

    amd::Memory* mapMemory = hsaMemory->mapMemory();
    void* hostPtr =
        mapMemory == nullptr ? hsaMemory->owner()->getHostMem() : mapMemory->getHostMem();

    if (type == CL_COMMAND_MAP_BUFFER) {
      amd::Coord3D origin(cmd.origin()[0]);
      amd::Coord3D size(cmd.size()[0]);
      amd::Coord3D dstOrigin(cmd.origin()[0], 0, 0);
      if (imageBuffer) {
        size_t elemSize = cmd.memory().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;
      }

      if (mapMemory != nullptr) {
        roc::Memory* hsaMapMemory =
            static_cast<roc::Memory*>(mapMemory->getDeviceMemory(dev(), false));
        result = blitMgr().copyBuffer(*hsaMemory, *hsaMapMemory, origin, dstOrigin, size,
                                      cmd.isEntireMemory());
        void* svmPtr = devMemory->owner()->getSvmPtr();
        if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
          // Wait on a kernel if one is outstanding
          releaseGpuMemoryFence();
          std::memcpy(svmPtr, hostPtr, size[0]);
        }
      } else {
        result = blitMgr().readBuffer(*hsaMemory, static_cast<char*>(hostPtr) + origin[0], origin,
                                      size, cmd.isEntireMemory());
      }
    } else if (type == CL_COMMAND_MAP_IMAGE) {
      amd::Image* image = cmd.memory().asImage();
      if (mapMemory != nullptr) {
        roc::Memory* hsaMapMemory =
            static_cast<roc::Memory*>(mapMemory->getDeviceMemory(dev(), false));
        result =
            blitMgr().copyImageToBuffer(*hsaMemory, *hsaMapMemory, cmd.origin(),
                                        amd::Coord3D(0, 0, 0), cmd.size(), cmd.isEntireMemory());
      } else {
        result = blitMgr().readImage(*hsaMemory, hostPtr, amd::Coord3D(0), image->getRegion(),
                                     image->getRowPitch(), image->getSlicePitch(), true);
      }
    } else {
      ShouldNotReachHere();
    }

    if (!result) {
      LogError("submitMapMemory failed!");
      cmd.setStatus(CL_OUT_OF_RESOURCES);
    }
  }

  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  roc::Memory* devMemory = static_cast<roc::Memory*>(cmd.memory().getDeviceMemory(dev(), false));

  const device::Memory::WriteMapInfo* mapInfo = devMemory->writeMapInfo(cmd.mapPtr());
  if (nullptr == mapInfo) {
    LogError("Unmap without map call");
    return;
  }

  profilingBegin(cmd, true);

  // Force buffer write for IMAGE1D_BUFFER
  bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER);

  // We used host memory
  if ((devMemory->owner()->getHostMem() != nullptr) &&
      (devMemory->owner()->getSvmPtr() == nullptr)) {
    if (mapInfo->isUnmapWrite()) {
      // Target is the backing store, so sync
      devMemory->owner()->signalWrite(nullptr);
      devMemory->syncCacheFromHost(*this);
    }
    if (devMemory->isHostMemDirectAccess()) {
      // Remove memory from VA cache
      dev().removeVACache(devMemory);
    }
  } else if (devMemory->IsPersistentDirectMap()) {
    // Persistent memory - NOP unmap
  } else if (mapInfo->isUnmapWrite()) {
    // Commit the changes made by the user.
    if (!devMemory->isHostMemDirectAccess()) {
      bool result = false;

      amd::Memory* mapMemory = devMemory->mapMemory();
      if (cmd.memory().asImage() && !imageBuffer) {
        amd::Image* image = cmd.memory().asImage();
        if (mapMemory != nullptr) {
          roc::Memory* hsaMapMemory =
              static_cast<roc::Memory*>(mapMemory->getDeviceMemory(dev(), false));
          result =
              blitMgr().copyBufferToImage(*hsaMapMemory, *devMemory, amd::Coord3D(0, 0, 0),
                                          mapInfo->origin_, mapInfo->region_, mapInfo->isEntire());
        } else {
          void* hostPtr = devMemory->owner()->getHostMem();

          result = blitMgr().writeImage(hostPtr, *devMemory, amd::Coord3D(0), image->getRegion(),
                                        image->getRowPitch(), image->getSlicePitch(), true);
        }
      } else {
        amd::Coord3D origin(mapInfo->origin_[0]);
        amd::Coord3D size(mapInfo->region_[0]);
        amd::Coord3D dstOrigin(mapInfo->origin_[0]);
        if (imageBuffer) {
          size_t elemSize = cmd.memory().asImage()->getImageFormat().getElementSize();
          origin.c[0] *= elemSize;
          size.c[0] *= elemSize;
        }
        if (mapMemory != nullptr) {
          roc::Memory* hsaMapMemory =
              static_cast<roc::Memory*>(mapMemory->getDeviceMemory(dev(), false));

          const void* svmPtr = devMemory->owner()->getSvmPtr();
          void* hostPtr = mapMemory->getHostMem();
          if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
            // Wait on a kernel if one is outstanding
            releaseGpuMemoryFence();
            std::memcpy(hostPtr, svmPtr, size[0]);
          }
          result = blitMgr().copyBuffer(*hsaMapMemory, *devMemory, origin, dstOrigin, size,
                                        mapInfo->isEntire());
        } else {
          result = blitMgr().writeBuffer(cmd.mapPtr(), *devMemory, origin, size);
        }
      }
      if (!result) {
        LogError("submitMapMemory failed!");
        cmd.setStatus(CL_OUT_OF_RESOURCES);
      }
    }

    cmd.memory().signalWrite(&dev());
  }

  devMemory->clearUnmapInfo(cmd.mapPtr());

  profilingEnd();
}

// ================================================================================================
bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern,
                            size_t patternSize, const amd::Coord3D& surface,
                            const amd::Coord3D& origin, const amd::Coord3D& size, bool forceBlit) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  Memory* memory = dev().getRocMemory(amdMemory);

  bool entire = amdMemory->isEntirelyCovered(origin, size);
  // Synchronize memory from host if necessary
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = entire;
  memory->syncCacheFromHost(*this, syncFlags);

  bool result = false;
  bool imageBuffer = false;
  float fillValue[4];

  // Force fill buffer for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_FILL_IMAGE) && (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    type = CL_COMMAND_FILL_BUFFER;
    imageBuffer = true;
  }

  // Find the the right fill operation
  switch (type) {
    case CL_COMMAND_SVM_MEMFILL:
    case CL_COMMAND_FILL_BUFFER: {
      amd::Coord3D realSurf(surface[0], surface[1], surface[2]);
      amd::Coord3D realOrigin(origin[0], origin[1], origin[2]);
      amd::Coord3D realSize(size[0], size[1], size[2]);
      // Reprogram fill parameters if it's an IMAGE1D_BUFFER object
      if (imageBuffer) {
        size_t elemSize = amdMemory->asImage()->getImageFormat().getElementSize();
        realOrigin.c[0] *= elemSize;
        realSize.c[0] *= elemSize;
        memset(fillValue, 0, sizeof(fillValue));
        amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue);
        pattern = fillValue;
        patternSize = elemSize;
      }
      result = blitMgr().fillBuffer(*memory, pattern, patternSize, realSurf, realOrigin, realSize,
                                    entire, forceBlit);
      break;
    }
    case CL_COMMAND_FILL_IMAGE: {
      result = blitMgr().fillImage(*memory, pattern, origin, size, entire);
      break;
    }
    default:
      ShouldNotReachHere();
      break;
  }

  if (!result) {
    LogError("submitFillMemory failed!");
  }

  amdMemory->signalWrite(&dev());
  return true;
}

// ================================================================================================
void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  bool force_blit = false;
  if (amd::IS_HIP) {
    // Always use blit for memset for HIP.
    force_blit = true;
  }

  if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), cmd.surface(),
                  cmd.origin(), cmd.size(), force_blit)) {
    cmd.setStatus(CL_INVALID_OPERATION);
  }
  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(cmd);

  const cl_command_type type = cmd.type();
  const uint64_t value = cmd.value();
  const uint64_t mask = cmd.mask();
  const unsigned int flags = cmd.flags();
  const size_t sizeBytes = cmd.sizeBytes();
  const size_t offset = cmd.offset();

  amd::Memory* amdMemory = &cmd.memory();
  Memory* memory = dev().getRocMemory(amdMemory);

  if (type == ROCCLR_COMMAND_STREAM_WAIT_VALUE) {
    if (GPU_STREAMOPS_CP_WAIT) {
      uint16_t header = kBarrierVendorPacketHeader;
      Buffer* buff = static_cast<Buffer*>(memory);
      hsa_signal_t signal = buff->getSignal();

      // mask is always applied on value at signal before performing
      // the comparision defiend by 'condition'
      switch (flags) {
        case ROCCLR_STREAM_WAIT_VALUE_GTE: {
          dispatchBarrierValuePacket(header, false, signal, value, mask, HSA_SIGNAL_CONDITION_GTE,
                                     true);
          break;
        }
        case ROCCLR_STREAM_WAIT_VALUE_EQ: {
          dispatchBarrierValuePacket(header, false, signal, value, mask, HSA_SIGNAL_CONDITION_EQ,
                                     true);
          break;
        }
        case ROCCLR_STREAM_WAIT_VALUE_AND: {
          dispatchBarrierValuePacket(header, false, signal, 0, (value & mask),
                                     HSA_SIGNAL_CONDITION_NE, true);
          break;
        }
        case ROCCLR_STREAM_WAIT_VALUE_NOR: {
          uint64_t norValue = ~value & mask;
          dispatchBarrierValuePacket(header, false, signal, norValue, norValue,
                                     HSA_SIGNAL_CONDITION_NE, true);
          break;
        }
        default:
          ShouldNotReachHere();
          break;
      }
    }
    // Use a blit kernel to perform the wait operation
    else {
      // mask is applied on value before performing
      // the comparision defined by 'condition'
      bool result = blitMgr().streamOpsWait(*memory, value, offset, sizeBytes, flags, mask);
      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
              "Waiting for value: 0x%lx."
              " Flags: 0x%lx mask: 0x%lx",
              value, flags, mask);
      if (!result) {
        LogError("submitStreamOperation: Wait failed!");
      }
    }
  } else if (type == ROCCLR_COMMAND_STREAM_WRITE_VALUE) {
    amd::Coord3D origin(offset);
    amd::Coord3D size(sizeBytes);
    // Ensure memory ordering preceding the write
    dispatchBarrierPacket(kBarrierPacketReleaseHeader);

    bool result = blitMgr().streamOpsWrite(*memory, value, offset, sizeBytes);
    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value);
    if (!result) {
      LogError("submitStreamOperation: Write failed!");
    }
  } else {
    ShouldNotReachHere();
  }
  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitBatchMemoryOperation(amd::BatchMemoryOperationCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(cmd);

  bool result = blitMgr().batchMemOps(cmd.getParamPtr(), cmd.paramSize(), cmd.count());
  if (!result) {
    LogError("submitBatchMemoryOperation failed!");
  }
  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  // Find the amd::Memory object for virtual ptr. vcmd.ptr() is vaddr.
  amd::Memory* vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
  if (vaddr_base_obj == nullptr || !(vaddr_base_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
    profilingEnd();
    return;
  }

  // Get the amd::Memory object for the physical address
  amd::Memory* phys_mem_obj = vcmd.memory();
  hsa_status_t hsa_status = HSA_STATUS_SUCCESS;

  // If Physical address is not set, then it is map command. If set, it is unmap command.
  if (phys_mem_obj != nullptr) {
    constexpr bool kParent = false;
    amd::Memory* vaddr_sub_obj = phys_mem_obj->getContext().devices()[0]->CreateVirtualBuffer(
        phys_mem_obj->getContext(), const_cast<void*>(vcmd.ptr()), vcmd.size(),
        phys_mem_obj->getUserData().deviceId, phys_mem_obj->getUserData().locationType, kParent);
    // Map the physical to virtual address the hsa api
    hsa_amd_vmem_alloc_handle_t opaque_hsa_handle;
    opaque_hsa_handle.handle = phys_mem_obj->getUserData().hsa_handle;
    if ((hsa_status = Hsa::vmem_map(vaddr_sub_obj->getSvmPtr(), vcmd.size(),
                                       vaddr_sub_obj->getOffset(), opaque_hsa_handle, 0)) ==
        HSA_STATUS_SUCCESS) {
      assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr);
      amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_sub_obj);
      vaddr_sub_obj->getUserData().phys_mem_obj = phys_mem_obj;
      phys_mem_obj->getUserData().vaddr_mem_obj = vaddr_sub_obj;
    } else {
      LogError("HSA Command: hsa_amd_vmem_map failed!");
    }
  } else {
    dispatchBarrierPacket(kBarrierPacketHeader, false);
    Barriers().WaitCurrent();

    amd::Memory* vaddr_sub_obj = amd::MemObjMap::FindMemObj(vcmd.ptr());
    assert(vaddr_sub_obj != nullptr);

    // Unmap the object, since the physical addr is set.
    if ((hsa_status = Hsa::vmem_unmap(vaddr_sub_obj->getSvmPtr(), vcmd.size())) ==
        HSA_STATUS_SUCCESS) {
      // assert the va is mapped and needs to be removed
      vaddr_sub_obj->getContext().devices()[0]->DestroyVirtualBuffer(vaddr_sub_obj);
      amd::MemObjMap::RemoveMemObj(vcmd.ptr());
      if (vaddr_sub_obj->getUserData().phys_mem_obj != nullptr) {
        vaddr_sub_obj->getUserData().phys_mem_obj->getUserData().vaddr_mem_obj = nullptr;
        vaddr_sub_obj->getUserData().phys_mem_obj = nullptr;
      }
    } else {
      LogError("HSA Command: hsa_amd_vmem_unmap failed");
    }
  }

  // Since this is a memory operation, the HW event set for barrier packet
  // may not encapsulate what the command wants to do. Hence clear the hw_event
  constexpr bool kClearHwEvent = true;
  profilingEnd(kClearHwEvent);
}

// ================================================================================================
void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(cmd.dst());

  if (!dev().isFineGrainedSystem(true) ||
      ((dstMemory != nullptr) && !dev().forceFineGrain(dstMemory))) {
    size_t patternSize = cmd.patternSize();
    size_t fillSize = patternSize * cmd.times();

    size_t offset = reinterpret_cast<uintptr_t>(cmd.dst()) -
                    reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());

    Memory* memory = dev().getRocMemory(dstMemory);

    amd::Coord3D origin(offset, 0, 0);
    amd::Coord3D size(fillSize, 1, 1);

    assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!");

    if (!fillMemory(cmd.type(), dstMemory, cmd.pattern(), cmd.patternSize(), size, origin, size,
                    true)) {
      cmd.setStatus(CL_INVALID_OPERATION);
    }
  } else {
    // Stall GPU for CPU access to memory
    releaseGpuMemoryFence();
    // for FGS capable device, fill CPU memory directly
    amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times());
  }

  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  for (auto itr : vcmd.memObjects()) {
    // Find device memory
    Memory* memory = dev().getRocMemory(&(*itr));

    if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) {
      if (!memory->isHostMemDirectAccess()) {
        // Make sure GPU finished operation before synchronization with the backing store
        releaseGpuMemoryFence();
      }
      memory->mgpuCacheWriteBack(*this);
    } else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
      // Synchronize memory from host if necessary.
      // The sync function will perform memory migration from
      // another device if necessary
      device::Memory::SyncFlags syncFlags;
      memory->syncCacheFromHost(*this, syncFlags);
    } else {
      LogWarning("Unknown operation for memory migration!");
    }
  }

  profilingEnd();
}

// ================================================================================================
bool VirtualGPU::createSchedulerParam() {
  if (nullptr != schedulerQueue_) {
    return true;
  }

  while (true) {
    // The queue is written by multiple threads of the scheduler kernel
    if (HSA_STATUS_SUCCESS !=
        Hsa::queue_create(gpu_device(), 2048, HSA_QUEUE_TYPE_MULTI, callbackQueue, &roc_device_,
                          std::numeric_limits<uint>::max(), std::numeric_limits<uint>::max(),
                          &schedulerQueue_)) {
      break;
    }

    return true;
  }

  if (nullptr != schedulerQueue_) {
    Hsa::queue_destroy(schedulerQueue_);
    schedulerQueue_ = nullptr;
  }

  return false;
}

// ================================================================================================
uint64_t VirtualGPU::getVQVirtualAddress() {
  Memory* vqMem = dev().getRocMemory(virtualQueue_);
  return reinterpret_cast<uint64_t>(vqMem->getDeviceMemory());
}

// ================================================================================================
bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
  uint MinDeviceQueueSize = 16 * 1024;
  deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize);

  maskGroups_ = deviceQueueSize / (512 * Ki);
  maskGroups_ = (maskGroups_ == 0) ? 1 : maskGroups_;

  // Align the queue size for the multiple dispatch scheduler.
  // Each thread works with 32 entries * maskGroups
  uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_);
  if (extra != 0) {
    deviceQueueSize += (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_) - extra;
  }

  if (deviceQueueSize_ == deviceQueueSize) {
    return true;
  } else {
    if (0 != deviceQueueSize_) {
      virtualQueue_->release();
      virtualQueue_ = nullptr;
      deviceQueueSize_ = 0;
      schedulerThreads_ = 0;
    }
  }

  uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
  uint allocSize = deviceQueueSize;

  // Add the virtual queue header
  allocSize += sizeof(AmdVQueueHeader);
  allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap));

  uint argOffs = allocSize;

  // Add the kernel arguments and wait events
  uint singleArgSize = amd::alignUp(
      dev().info().maxParameterSize_ + 64 + dev().settings().numWaitEvents_ * sizeof(uint64_t),
      sizeof(AmdAqlWrap));
  allocSize += singleArgSize * numSlots;

  uint eventsOffs = allocSize;
  // Add the device events
  allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent);

  uint eventMaskOffs = allocSize;
  // Add mask array for events
  allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8;

  uint slotMaskOffs = allocSize;
  // Add mask array for AmdAqlWrap slots
  allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;

  // Align size to 64 bytes for more efficient fill operation
  allocSize = amd::alignUp(allocSize, 8 * sizeof(uint64_t));

  // CL_MEM_ALLOC_HOST_PTR/CL_MEM_READ_WRITE
  virtualQueue_ = new (dev().context()) amd::Buffer(dev().context(), CL_MEM_READ_WRITE, allocSize);

  if ((nullptr != virtualQueue_) && !virtualQueue_->create(nullptr)) {
    virtualQueue_->release();
    return false;
  }

  Memory* vqMem = dev().getRocMemory(virtualQueue_);

  if (nullptr == vqMem) {
    return false;
  }

  uint64_t vqVA = reinterpret_cast<uint64_t>(vqMem->getDeviceMemory());

  // Use shadow to prepare the data structure in host.
  auto shadow = std::make_unique<uint8_t[]>(allocSize);

  std::memset(&shadow[0], 0, allocSize);

  AmdVQueueHeader* header = reinterpret_cast<AmdVQueueHeader*>(&shadow[0]);
  // Initialize the virtual queue header
  header->aql_slot_num = numSlots;
  header->event_slot_num = dev().settings().numDeviceEvents_;
  header->event_slot_mask = vqVA + eventMaskOffs;
  header->event_slots = vqVA + eventsOffs;
  header->aql_slot_mask = vqVA + slotMaskOffs;
  header->wait_size = dev().settings().numWaitEvents_;
  header->arg_size = dev().info().maxParameterSize_ + 64;
  header->mask_groups = maskGroups_;

  // Go over all slots and perform initialization
  size_t offset = sizeof(AmdVQueueHeader);
  for (uint i = 0; i < numSlots; ++i) {
    AmdAqlWrap* slot = reinterpret_cast<AmdAqlWrap*>(&shadow[0] + offset);
    uint64_t argStart = vqVA + argOffs + i * singleArgSize;

    slot->aql.kernarg_address = reinterpret_cast<void*>(argStart);
    slot->wait_list = argStart + dev().info().maxParameterSize_ + 64;

    offset += sizeof(AmdAqlWrap);
  }

  amd::Coord3D origin(0, 0, 0);
  amd::Coord3D region(allocSize, 1, 1);

  // copy the data structure from host to GPU
  if (!dev().xferMgr().writeBuffer(&shadow[0], *vqMem, origin, region)) {
    return false;
  }

  deviceQueueSize_ = deviceQueueSize;
  schedulerThreads_ = numSlots / (DeviceQueueMaskSize * maskGroups_);

  return true;
}

// ================================================================================================
#if IS_LINUX
__attribute__((optimize("unroll-all-loops"), always_inline)) static inline void nontemporalMemcpy(
    void* __restrict dst, const void* __restrict src, size_t size) {
#if defined(ATI_ARCH_X86)
#if defined(__AVX512F__)
  for (auto i = 0u; i != size / sizeof(__m512i); ++i) {
    _mm512_stream_si512(reinterpret_cast<__m512i* __restrict&>(dst)++,
                        *reinterpret_cast<const __m512i* __restrict&>(src)++);
  }
  size = size % sizeof(__m512i);
#endif

#if defined(__AVX__)
  for (auto i = 0u; i != size / sizeof(__m256i); ++i) {
    _mm256_stream_si256(reinterpret_cast<__m256i* __restrict&>(dst)++,
                        *reinterpret_cast<const __m256i* __restrict&>(src)++);
  }
  size = size % sizeof(__m256i);
#endif

  for (auto i = 0u; i != size / sizeof(__m128i); ++i) {
    _mm_stream_si128(reinterpret_cast<__m128i* __restrict&>(dst)++,
                     *(reinterpret_cast<const __m128i* __restrict&>(src)++));
  }
  size = size % sizeof(__m128i);

  for (auto i = 0u; i != size / sizeof(long long); ++i) {
    _mm_stream_si64(reinterpret_cast<long long* __restrict&>(dst)++,
                    *reinterpret_cast<const long long* __restrict&>(src)++);
  }
  size = size % sizeof(long long);

  for (auto i = 0u; i != size / sizeof(int); ++i) {
    _mm_stream_si32(reinterpret_cast<int* __restrict&>(dst)++,
                    *reinterpret_cast<const int* __restrict&>(src)++);
  }

  size = size % sizeof(int);
  // Copy remaining bytes for unaligned size
  std::memcpy(dst, src, size);

  // Add memory fence
  _mm_sfence();
#else
  std::memcpy(dst, src, size);
#endif
}
#else
static inline void nontemporalMemcpy(void* __restrict dst, const void* __restrict src,
                                     size_t size) {
  std::memcpy(dst, src, size);
}
#endif

void VirtualGPU::HiddenHeapInit() { const_cast<Device&>(dev()).HiddenHeapInit(*this); }

// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
                                      const_address parameters, void* event_handle,
                                      uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd,
                                      hsa_kernel_dispatch_packet_t* aql_packet,
                                      bool attach_signal) {
  device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
  Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
  size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
  bool imageBufferWrtBack = false;                  // Image buffer write back is required
  std::vector<device::Memory*> wrtBackImageBuffer;  // Array of images for write back

  // Check memory dependency and SVM objects
  bool coopGroups = (vcmd != nullptr) ? vcmd->cooperativeGroups() : false;
  if (!processMemObjects(kernel, parameters, ldsUsage, coopGroups, imageBufferWrtBack,
                         wrtBackImageBuffer)) {
    LogError("Wrong memory objects!");
    return false;
  }

  // Init PrintfDbg object if printf is enabled.
  bool printfEnabled = (gpuKernel.printfInfo().size() > 0) ? true : false;
  if (!printfDbg()->init(printfEnabled)) {
    LogError("\nPrintfDbg object initialization failed!");
    return false;
  }

  const amd::KernelSignature& signature = kernel.signature();
  const amd::KernelParameters& kernelParams = kernel.parameters();

  amd::Memory* const* memories =
      reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
  bool isGraphCapture = command_ != nullptr && command_->getPktCapturingState();

  ClPrint(amd::LOG_INFO, amd::LOG_KERN2, "ShaderName : %s", gpuKernel.getDemangledName().c_str());

  amd::NDRange local_size(sizes.local());
  address hidden_arguments = const_cast<address>(parameters);
  // Calculate local size if it wasn't provided
  devKernel->FindLocalWorkSize(sizes.dimensions(), sizes.global(), local_size);

  uint16_t local[3] = {1, 1, 1};
  uint32_t global[3] = {1, 1, 1};
  for (uint i = 0; i < sizes.dimensions(); i++) {
    global[i] = static_cast<uint32_t>(sizes.global()[i]);
    local[i] = static_cast<uint16_t>(local_size[i]);
  }
  uint64_t spVA = 0;
  // Check if runtime has to setup hidden arguments
  for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) {
    const auto& it = signature.at(i);
    switch (it.info_.oclObject_) {
      case amd::KernelParameterDescriptor::HiddenNone:
        break;
      case amd::KernelParameterDescriptor::HiddenGlobalOffsetX: {
        WriteAqlArgAt(hidden_arguments, sizes.offset()[0], it.size_, it.offset_);
        break;
      }
      case amd::KernelParameterDescriptor::HiddenGlobalOffsetY: {
        if (sizes.dimensions() >= 2) {
          WriteAqlArgAt(hidden_arguments, sizes.offset()[1], it.size_, it.offset_);
        }
        break;
      }
      case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ: {
        if (sizes.dimensions() >= 3) {
          WriteAqlArgAt(hidden_arguments, sizes.offset()[2], it.size_, it.offset_);
        }
        break;
      }
      case amd::KernelParameterDescriptor::HiddenPrintfBuffer: {
        uintptr_t bufferPtr = reinterpret_cast<uintptr_t>(printfDbg()->dbgBuffer());
        if (printfEnabled && bufferPtr) {
          WriteAqlArgAt(hidden_arguments, bufferPtr, it.size_, it.offset_);
        }
        break;
      }
      case amd::KernelParameterDescriptor::HiddenHostcallBuffer: {
        if (amd::IS_HIP) {
          if (dev().info().pcie_atomics_) {
            uintptr_t buffer = reinterpret_cast<uintptr_t>(
                roc_device_.getOrCreateHostcallBuffer(gpu_queue_, coopGroups, cuMask_));
            if (!buffer) {
              LogError("Kernel expects a hostcall buffer, but none found");
              return false;
            }
            WriteAqlArgAt(hidden_arguments, buffer, it.size_, it.offset_);
          } else {
            LogError("Pcie atomics not enabled, hostcall not supported");
            return false;
          }
        }
        break;
      }
      case amd::KernelParameterDescriptor::HiddenDefaultQueue: {
        uint64_t vqVA = 0;
        amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
        if (nullptr != defQueue && devKernel->dynamicParallelism()) {
          if (!createVirtualQueue(defQueue->size()) || !createSchedulerParam()) {
            return false;
          }
          vqVA = getVQVirtualAddress();
        }
        WriteAqlArgAt(hidden_arguments, vqVA, it.size_, it.offset_);
        break;
      }
      case amd::KernelParameterDescriptor::HiddenCompletionAction: {
        if (devKernel->dynamicParallelism()) {
          auto params = allocKernArg(sizeof(AmdAqlWrap), 64);
          AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(params);
          memset(wrap, 0, sizeof(AmdAqlWrap));
          wrap->state = AQL_WRAP_DONE;
          spVA = reinterpret_cast<uint64_t>(wrap);
        }
        WriteAqlArgAt(hidden_arguments, spVA, it.size_, it.offset_);
        break;
      }
      case amd::KernelParameterDescriptor::HiddenMultiGridSync: {
        bool multiGridSync = (vcmd != nullptr) ? vcmd->cooperativeMultiDeviceGroups() : false;
        bool singleGridSync = (vcmd != nullptr) ? vcmd->cooperativeGroups() : false;
        Device::MGSyncInfo* syncInfo = nullptr;
        if (multiGridSync) {
          // Find CPU pointer to the right sync info structure. It should be after MGSyncData
          syncInfo = reinterpret_cast<Device::MGSyncInfo*>(
              dev().MGSync() + Device::kMGInfoSizePerDevice * dev().index() +
              Device::kMGSyncDataSize);
          // Update sync data address. Use the offset adjustment to the right location
          syncInfo->mgs = reinterpret_cast<Device::MGSyncData*>(
              dev().MGSync() + Device::kMGInfoSizePerDevice * vcmd->firstDevice());
        } else if (singleGridSync) {
          syncInfo = reinterpret_cast<Device::MGSyncInfo*>(allocKernArg(Device::kSGInfoSize, 64));
          syncInfo->mgs = nullptr;
        }
        if (multiGridSync || singleGridSync) {
          // Update sync data address.
          syncInfo->sgs = {0};
          // Fill rest of sync info fields
          syncInfo->grid_id = vcmd->gridId();
          syncInfo->num_grids = vcmd->numGrids();
          syncInfo->prev_sum = vcmd->prevGridSum();
          syncInfo->all_sum = vcmd->allGridSum();
          syncInfo->num_wg = vcmd->numWorkgroups();
        }
        // Update GPU address for grid sync info. Use the offset adjustment for the right
        // location
        WriteAqlArgAt(hidden_arguments, reinterpret_cast<uint64_t>(syncInfo), it.size_, it.offset_);
        break;
      }
      case amd::KernelParameterDescriptor::HiddenHeap:
        // Allocate hidden heap for HIP applications only
        if ((amd::IS_HIP) && (dev().HeapBuffer() == nullptr)) {
          const_cast<Device&>(dev()).HiddenHeapAlloc(*this);
        }
        if (dev().HeapBuffer() != nullptr) {
          // Initialize hidden heap buffer
          if (!isGraphCapture) {
            const_cast<Device&>(dev()).HiddenHeapInit(*this);
          }
          // Add heap pointer to the code
          size_t heap_ptr = static_cast<size_t>(dev().HeapBuffer()->virtualAddress());
          WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenBlockCountX:
        WriteAqlArgAt(hidden_arguments, global[0] / local[0], it.size_, it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenBlockCountY:
        WriteAqlArgAt(hidden_arguments, global[1] / local[1], it.size_, it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenBlockCountZ:
        WriteAqlArgAt(hidden_arguments, global[2] / local[2], it.size_, it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenGroupSizeX:
        WriteAqlArgAt(hidden_arguments, local[0], it.size_, it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenGroupSizeY:
        WriteAqlArgAt(hidden_arguments, local[1], it.size_, it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenGroupSizeZ:
        WriteAqlArgAt(hidden_arguments, local[2], it.size_, it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenRemainderX:
        WriteAqlArgAt(hidden_arguments, static_cast<uint16_t>(global[0] % local[0]), it.size_,
                      it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenRemainderY:
        if (sizes.dimensions() >= 2) {
          WriteAqlArgAt(hidden_arguments, static_cast<uint16_t>(global[1] % local[1]), it.size_,
                        it.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenRemainderZ:
        if (sizes.dimensions() >= 3) {
          WriteAqlArgAt(hidden_arguments, static_cast<uint16_t>(global[2] % local[2]), it.size_,
                        it.offset_);
        }
        break;
      case amd::KernelParameterDescriptor::HiddenGridDims:
        WriteAqlArgAt(hidden_arguments, static_cast<uint16_t>(sizes.dimensions()), it.size_,
                      it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenPrivateBase:
        WriteAqlArgAt(hidden_arguments,
                      reinterpret_cast<amd_queue_t*>(gpu_queue_)->private_segment_aperture_base_hi,
                      it.size_, it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenSharedBase:
        WriteAqlArgAt(hidden_arguments,
                      reinterpret_cast<amd_queue_t*>(gpu_queue_)->group_segment_aperture_base_hi,
                      it.size_, it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenQueuePtr:
        WriteAqlArgAt(hidden_arguments, gpu_queue_, it.size_, it.offset_);
        break;
      case amd::KernelParameterDescriptor::HiddenDynamicLdsSize:
        WriteAqlArgAt(hidden_arguments, sharedMemBytes, it.size_, it.offset_);
        break;
    }
  }
  address argBuffer = hidden_arguments;
  size_t argSize = std::min(gpuKernel.KernargSegmentByteSize(), signature.paramsSize());

  // Find all parameters for the current kernel
  if (!kernel.parameters().deviceKernelArgs() || gpuKernel.isInternalKernel()) {
    // Allocate buffer to hold kernel arguments
    if (isGraphCapture) {
      argBuffer = command_->getGraphKernArg(gpuKernel.KernargSegmentByteSize(),
                                            gpuKernel.KernargSegmentAlignment(), dev().index());
      command_->SetKernelName(gpuKernel.getDemangledName());
    } else {
      ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_KERN,
              "KernargSegmentByteSize = %lu "
              "KernargSegmentAlignment = %lu",
              gpuKernel.KernargSegmentByteSize(), gpuKernel.KernargSegmentAlignment());
      argBuffer = reinterpret_cast<address>(
          allocKernArg(gpuKernel.KernargSegmentByteSize(), gpuKernel.KernargSegmentAlignment()));
    }

    nontemporalMemcpy(argBuffer, parameters, argSize);

    if (roc_device_.info().largeBar_ && !isGraphCapture) {
      const auto kernArgImpl = dev().settings().kernel_arg_impl_;
      if (kernArgImpl == KernelArgImpl::DeviceKernelArgsHDP) {
        *dev().info().hdpMemFlushCntl = 1u;
        auto kSentinel = *reinterpret_cast<volatile int*>(dev().info().hdpMemFlushCntl);
      } else if (kernArgImpl == KernelArgImpl::DeviceKernelArgsReadback && argSize != 0) {
        _mm_sfence();
        *(argBuffer + argSize - 1) = *(parameters + argSize - 1);
        _mm_mfence();
        auto kSentinel = *reinterpret_cast<volatile unsigned char*>(argBuffer + argSize - 1);
      }
    }
  }

  // Check for group memory overflow
  //! @todo Check should be in HSA - here we should have at most an assert
  assert(dev().info().localMemSizePerCU_ > 0);
  if (ldsUsage > dev().info().localMemSizePerCU_) {
    LogError("No local memory available\n");
    return false;
  }

  // Initialize the dispatch Packet
  hsa_kernel_dispatch_packet_t dispatchPacket{};

  dispatchPacket.header = kInvalidAql;
  dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();

  dispatchPacket.grid_size_x = global[0];
  dispatchPacket.grid_size_y = global[1];
  dispatchPacket.grid_size_z = global[2];

  dispatchPacket.workgroup_size_x = local[0];
  dispatchPacket.workgroup_size_y = local[1];
  dispatchPacket.workgroup_size_z = local[2];

  dispatchPacket.kernarg_address = argBuffer;
  dispatchPacket.group_segment_size = ldsUsage + sharedMemBytes;
  dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
  if ((devKernel->workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
    dispatchPacket.private_segment_size =
        std::max<uint64_t>(dev().StackSize(), dispatchPacket.private_segment_size);
    // Validate privateMemSize is more than max allowed.
    size_t maxStackSize = dev().MaxStackSize();
    if (dispatchPacket.private_segment_size > maxStackSize) {
      ClPrint(amd::LOG_ERROR, amd::LOG_KERN,
              "Scratch size (%u) exceeds max allowed (%zu) for kernel : %s",
              dispatchPacket.private_segment_size, maxStackSize,
              gpuKernel.getDemangledName().c_str());
      return false;
    }
  }

  // Pass the header accordingly
  auto aqlHeaderWithOrder = aqlHeader_;
  if (vcmd != nullptr) {
    if (vcmd->getAnyOrderLaunchFlag()) {
      constexpr uint32_t kAqlHeaderMask = ~(1 << HSA_PACKET_HEADER_BARRIER);
      aqlHeaderWithOrder &= kAqlHeaderMask;
    }
    if (vcmd->getCommandEntryScope() == amd::Device::kCacheStateSystem) {
      addSystemScope_ = true;
    }
  }

  // Copy scheduler's AQL packet for possible relaunch from the scheduler itself
  if (aql_packet != nullptr) {
    *aql_packet = dispatchPacket;
    aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
                         (1 << HSA_PACKET_HEADER_BARRIER) |
                         (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
                         (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
    aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  }

  if (isGraphCapture) {
    // Dispatch the packet
    if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
                           (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
                           GPU_FLUSH_ON_EXECUTION, command_->getPktCapturingState(),
                           command_->getAqlPacket())) {
      return false;
    }
  } else {
    if (!dispatchAqlPacket(&dispatchPacket, aqlHeaderWithOrder,
                           (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
                           GPU_FLUSH_ON_EXECUTION, false, nullptr, attach_signal)) {
      return false;
    }
  }

  // Output printf buffer
  if (!printfDbg()->output(*this, printfEnabled, gpuKernel.printfInfo())) {
    LogError("\nCould not print data from the printf buffer!");
    return false;
  }

  if (gpuKernel.dynamicParallelism()) {
    dispatchBarrierPacket(kBarrierPacketHeader, true);
    if (virtualQueue_ != nullptr) {
      static_cast<KernelBlitManager&>(blitMgr()).runScheduler(
          getVQVirtualAddress(), schedulerQueue_, schedulerThreads_, spVA);
    }
  }

  // Check if image buffer write back is required
  if (imageBufferWrtBack) {
    // Make sure the original kernel execution is done
    releaseGpuMemoryFence();
    for (const auto imageBuffer : wrtBackImageBuffer) {
      Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
      amd::Image* image = imageBuffer->owner()->asImage();
      Image* devImage = static_cast<Image*>(dev().getGpuMemory(imageBuffer->owner()));
      Memory* cpyImage = dev().getGpuMemory(devImage->CopyImageBuffer());
      amd::Coord3D offs(0);
      // Copy memory from the the backing store image into original buffer
      bool result = blitMgr().copyImageToBuffer(*cpyImage, *buffer, offs, offs, image->getRegion(),
                                                true, image->getRowPitch(), image->getSlicePitch());
    }
  }
  return true;
}

/**
 * @brief Api to dispatch a kernel for execution. The implementation
 * parses the input object, an instance of virtual command to obtain
 * the parameters of global size, work group size, offsets of work
 * items, enable/disable profiling, etc.
 *
 * It also parses the kernel arguments buffer to inject into Hsa Runtime
 * the list of kernel parameters.
 */
// ================================================================================================
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
  if (vcmd.cooperativeGroups()) {
    // Wait for the execution on the current queue, since the coop groups will use the device queue
    releaseGpuMemoryFence(kSkipCpuWait);

    // Get device queue for exclusive GPU access
    VirtualGPU* queue = dev().xferQueue();
    if (!queue) {
      LogError("Runtime failed to acquire a cooperative queue!");
      vcmd.setStatus(CL_INVALID_OPERATION);
      return;
    }

    // Lock the queue, using the blit manager lock
    amd::ScopedLock lock(queue->blitMgr().lockXfer());

    queue->profilingBegin(vcmd);

    // Add a dependency into the device queue on the current queue
    queue->Barriers().AddExternalSignal(Barriers().GetLastSignal());

    if (dev().settings().gwsInitSupported_ == true) {
      uint32_t workgroups = vcmd.numWorkgroups();
      static_cast<KernelBlitManager&>(queue->blitMgr()).RunGwsInit(workgroups - 1);
    }

    // Sync AQL packets
    queue->setAqlHeader(dispatchPacketHeader_);

    // Submit kernel to HW
    if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
                                     static_cast<void*>(as_cl(&vcmd.event())),
                                     vcmd.sharedMemBytes(), &vcmd)) {
      LogError("AQL dispatch failed!");
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
    // Wait for the execution on the device queue. Keep the current queue in-order
    queue->releaseGpuMemoryFence(kSkipCpuWait);

    // Add a dependency into the current queue on the coop queue
    Barriers().AddExternalSignal(queue->Barriers().GetLastSignal());
    hasPendingDispatch_ = true;
    retainExternalSignals_ = true;

    queue->profilingEnd();
  } else {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());

    profilingBegin(vcmd);

    // Submit kernel to HW
    if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
                              static_cast<void*>(as_cl(&vcmd.event())), vcmd.sharedMemBytes(),
                              &vcmd)) {
      LogError("AQL dispatch failed!");
      vcmd.setStatus(CL_INVALID_OPERATION);
    }

    profilingEnd();
  }
}

// ================================================================================================
void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {}

// ================================================================================================
void VirtualGPU::submitMarker(amd::Marker& vcmd) {
  if (AMD_DIRECT_DISPATCH || vcmd.profilingInfo().marker_ts_) {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());
    if (vcmd.CpuWaitRequested()) {
      // It should be safe to call flush directly if there are not pending dispatches without
      // HSA signal callback
      if (gpu_queue_ == nullptr) {
        gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
      }
      flush(vcmd.GetBatchHead());
    } else {
      profilingBegin(vcmd);
      if (timestamp_ != nullptr) {
        const Settings& settings = dev().settings();
        int32_t releaseFlags = vcmd.getCommandEntryScope();

        if (releaseFlags == Device::CacheState::kCacheStateIgnore) {
          if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) {
            dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
          } else {
            dispatchBarrierPacket(kNopPacketHeader, false);
          }
        } else {
          // Submit a barrier with a cache flushes.
          if (settings.barrier_value_packet_ && vcmd.profilingInfo().marker_ts_) {
            dispatchBarrierValuePacket(kBarrierVendorPacketHeader, true);
          } else {
            dispatchBarrierPacket(kBarrierPacketHeader, false);
          }
          hasPendingDispatch_ = false;
        }
      }
      profilingEnd();
    }
  }
}

// ================================================================================================
void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);

  const Settings& settings = dev().settings();
  if (settings.barrier_value_packet_) {
    dispatchBarrierValuePacket(kBarrierVendorPacketNopScopeHeader, true);
  } else {
    dispatchBarrierPacket(kNopPacketHeader, false);
  }

  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);
  addSystemScope();
  profilingEnd();
}

// ================================================================================================
void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);
  profilingEnd();
}

// ================================================================================================
void VirtualGPU::flush(amd::Command* list, bool wait) {
  // If barrier is requested, then wait for everything, otherwise
  // a per disaptch wait will occur later in updateCommandsState()
  releaseGpuMemoryFence();
  updateCommandsState(list);

  // Release all pinned memory
  releasePinnedMem();
}

// ================================================================================================
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
  if (!AMD_DIRECT_DISPATCH) {
    //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
    //!        unconditionally, before it can release pinned memory
    releaseGpuMemoryFence();
    if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
      if (pinnedMems_.size() > 7) {
        pinnedMems_.front()->release();
        pinnedMems_.erase(pinnedMems_.begin());
      }

      // Delay destruction
      pinnedMems_.push_back(mem);
    }
  } else {
    if (command_ != nullptr) {
      command_->AddPinnedMemory(mem);
    } else {
      //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
      //!        unconditionally, before it can release pinned memory
      releaseGpuMemoryFence();
      mem->release();
    }
  }
}

// ================================================================================================
void VirtualGPU::releasePinnedMem() {
  for (auto& amdMemory : pinnedMems_) {
    amdMemory->release();
  }
  pinnedMems_.resize(0);
}

// ================================================================================================
amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
  for (auto& amdMemory : pinnedMems_) {
    if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) {
      return amdMemory;
    }
  }
  return nullptr;
}

// ================================================================================================
void VirtualGPU::enableSyncBlit() const { blitMgr_->enableSynchronization(); }

// ================================================================================================
void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  const amd::PerfCounterCommand::PerfCounterList counters = vcmd.getCounters();

  if (vcmd.getState() == amd::PerfCounterCommand::Begin) {
    // Create a profile for the profiling AQL packet
    PerfCounterProfile* profileRef = new PerfCounterProfile(roc_device_);
    if (profileRef == nullptr || !profileRef->Create()) {
      LogError("Failed to create performance counter profile");
      vcmd.setStatus(CL_INVALID_OPERATION);
      return;
    }

    // Make sure all performance counter objects to use the same profile
    PerfCounter* counter = nullptr;
    for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
      amd::PerfCounter* amdCounter = static_cast<amd::PerfCounter*>(counters[i]);
      counter = static_cast<PerfCounter*>(amdCounter->getDeviceCounter());

      if (nullptr == counter) {
        amd::PerfCounter::Properties prop = amdCounter->properties();
        PerfCounter* rocCounter = new PerfCounter(roc_device_, prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX],
                                                  prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX],
                                                  prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]);

        if (nullptr == rocCounter || rocCounter->gfxVersion() == PerfCounter::ROC_UNSUPPORTED) {
          LogError("Failed to create the performance counter");
          vcmd.setStatus(CL_INVALID_OPERATION);
          delete rocCounter;
          return;
        }

        amdCounter->setDeviceCounter(rocCounter);
        counter = rocCounter;
      }

      counter->setProfile(profileRef);
    }

    if (!profileRef->initialize()) {
      LogError("Failed to initialize performance counter");
      vcmd.setStatus(CL_INVALID_OPERATION);
    } else if (profileRef->createStartPacket() == nullptr) {
      LogError("Failed to create AQL packet for start profiling");
      vcmd.setStatus(CL_INVALID_OPERATION);
    } else {
      dispatchCounterAqlPacket(profileRef->prePacket(), counter->gfxVersion(), false,
                               profileRef->api());
    }

    profileRef->release();
  } else if (vcmd.getState() == amd::PerfCounterCommand::End) {
    // Since all performance counters should use the same profile, use the 1st
    // one to get the profile object
    amd::PerfCounter* amdCounter = static_cast<amd::PerfCounter*>(counters[0]);
    PerfCounter* counter = static_cast<PerfCounter*>(amdCounter->getDeviceCounter());
    if (counter == nullptr) {
      LogError("Invalid Performance Counter");
      vcmd.setStatus(CL_INVALID_OPERATION);
      return;
    }
    PerfCounterProfile* profileRef = counter->profileRef();

    // create the AQL packet for stop profiling
    if (profileRef->createStopPacket() == nullptr) {
      LogError("Failed to create AQL packet for stop profiling");
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
    dispatchCounterAqlPacket(profileRef->postPacket(), counter->gfxVersion(), true,
                             profileRef->api());
  } else {
    LogError("Unsupported performance counter state");
    vcmd.setStatus(CL_INVALID_OPERATION);
  }
}

}  // namespace amd::roc