rocm-systems/rocclr/runtime/device/rocm/rocvirtual.cpp

//
// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved.
//

#include "device/rocm/rocdevice.hpp"
#include "device/rocm/rocvirtual.hpp"
#include "device/rocm/rockernel.hpp"
#include "device/rocm/rocmemory.hpp"
#include "device/rocm/rocblit.hpp"
#include "../../platform/kernel.hpp"
#include "../../platform/context.hpp"
#include "../../platform/command.hpp"
#include "../../platform/memory.hpp"
#include "../../platform/sampler.hpp"
#include "../../utils/debug.hpp"
#include "os/os.hpp"
#include "SCHSAInterface.h"
#include "amd_hsa_kernel_code.h"

#include <fstream>
#include <vector>
#include <string>

/**
* HSA image object size in bytes (see HSAIL spec)
*/
#define HSA_IMAGE_OBJECT_SIZE 48

/**
* HSA image object alignment in bytes (see HSAIL spec)
*/
#define HSA_IMAGE_OBJECT_ALIGNMENT  16

/**
* HSA sampler object size in bytes (see HSAIL spec)
*/
#define HSA_SAMPLER_OBJECT_SIZE 32

/**
* HSA sampler object alignment in bytes (see HSAIL spec)
*/
#define HSA_SAMPLER_OBJECT_ALIGNMENT  16

namespace roc {
// (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) invalidates I, K and L1
// (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE) invalidates L1, L2 and flushes L2

static const uint16_t kDispatchPacketHeaderNoSync =
    (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
    (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

static const uint16_t kDispatchPacketHeader =
    (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
    (1 << HSA_PACKET_HEADER_BARRIER) |
    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

static const uint16_t kBarrierPacketHeader =
    (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
    (1 << HSA_PACKET_HEADER_BARRIER) |
    (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

static const uint16_t kBarrierPacketAcquireHeader =
    (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
    (1 << HSA_PACKET_HEADER_BARRIER) |
    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

static const uint16_t kBarrierPacketReleaseHeader =
    (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
    (1 << HSA_PACKET_HEADER_BARRIER) |
    (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
    (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);

static const hsa_barrier_and_packet_t kBarrierAcquirePacket =
    {kBarrierPacketAcquireHeader,0,0,0,0,0,0,0,0,0};

static const hsa_barrier_and_packet_t kBarrierReleasePacket =
    {kBarrierPacketReleaseHeader,0,0,0,0,0,0,0,0,0};

double Timestamp::ticksToTime_=0;

/**
* Set the ocl correlation handle (essentially the cl_event handle)
* to correlate the cl kernel launch and HSA kernel dispatch
*/
typedef hsa_status_t
  (*hsa_ext_tools_set_correlation_handle)(const hsa_agent_t agent,
                                        void *correlation_handle);
static void SetOclCorrelationHandle(void *tools_lib, const hsa_agent_t agent, void *handle) {
  hsa_ext_tools_set_correlation_handle func =
    (hsa_ext_tools_set_correlation_handle)amd::Os::getSymbol(tools_lib, "hsa_ext_tools_set_correlation_handler");
  if (func) {
    func(agent, handle);
  }

  return;
}

bool
VirtualGPU::MemoryDependency::create(size_t numMemObj)
{
    if (numMemObj > 0) {
        // Allocate the array of memory objects for dependency tracking
        memObjectsInQueue_ = new MemoryState[numMemObj];
        if (NULL == memObjectsInQueue_) {
            return false;
        }
        memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj);
        maxMemObjectsInQueue_ = numMemObj;
    }

    return true;
}

void
VirtualGPU::MemoryDependency::validate(
    VirtualGPU&     gpu,
    const Memory*   memory,
    bool            readOnly)
{
    bool    flushL1Cache = false;

    if (maxMemObjectsInQueue_ == 0) {
        // Sync AQL packets
        gpu.setAqlHeader(kDispatchPacketHeader);
        return;
    }

    uint64_t curStart = reinterpret_cast<uint64_t>(memory->getDeviceMemory());
    uint64_t curEnd = curStart + memory->size();

    // Loop through all memory objects in the queue and find dependency
    // @note don't include objects from the current kernel
    for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
        // Check if the queue already contains this mem object and
        // GPU operations aren't readonly
        uint64_t busyStart = memObjectsInQueue_[j].start_;
        uint64_t busyEnd = memObjectsInQueue_[j].end_;

        // Check if the start inside the busy region
        if ((((curStart >= busyStart) && (curStart < busyEnd)) ||
            // Check if the end inside the busy region
             ((curEnd > busyStart) && (curEnd <= busyEnd)) ||
            // Check if the start/end cover the busy region
             ((curStart <= busyStart) && (curEnd >= busyEnd))) &&
            // If the buys region was written or the current one is for write
            (!memObjectsInQueue_[j].readOnly_ || !readOnly)) {
            flushL1Cache = true;
            break;
        }
    }

    // Did we reach the limit?
    if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) {
        flushL1Cache = true;
    }

    if (flushL1Cache) {
        // Sync AQL packets
        gpu.setAqlHeader(kDispatchPacketHeader);

        // Clear memory dependency state
        const static bool All = true;
        clear(!All);
    }

    // Insert current memory object into the queue always,
    // since runtime calls flush before kernel execution and it has to keep
    // current kernel in tracking
    memObjectsInQueue_
        [numMemObjectsInQueue_].start_ = curStart;
    memObjectsInQueue_
        [numMemObjectsInQueue_].end_ = curEnd;
    memObjectsInQueue_
        [numMemObjectsInQueue_].readOnly_ = readOnly;
    numMemObjectsInQueue_++;
}

void
VirtualGPU::MemoryDependency::clear(bool all)
{
    if (numMemObjectsInQueue_ > 0) {
        size_t  i, j;
        if (all) {
            endMemObjectsInQueue_ = numMemObjectsInQueue_;
        }

        // Preserve all objects from the current kernel
        for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) {
            memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_;
            memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_;
            memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_;
        }
        // Clear all objects except current kernel
        memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_);
        numMemObjectsInQueue_ -= endMemObjectsInQueue_;
        endMemObjectsInQueue_ = 0;
    }
}

bool
VirtualGPU::processMemObjects(
    const amd::Kernel&  kernel,
    const_address       params)
{
    static const bool NoAlias = true;
    const Kernel& hsaKernel = static_cast<const Kernel&>
        (*(kernel.getDeviceKernel(dev(), NoAlias)));
    const amd::KernelSignature& signature = kernel.signature();
    const amd::KernelParameters& kernelParams = kernel.parameters();

    // AQL packets
    setAqlHeader(kDispatchPacketHeaderNoSync);

    // Mark the tracker with a new kernel,
    // so we can avoid checks of the aliased objects
    memoryDependency().newKernel();

    bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true);
    bool supportFineGrainedSystem = deviceSupportFGS;
    FGSStatus status = kernelParams.getSvmSystemPointersSupport();
    switch (status) {
        case FGS_YES:
            if (!deviceSupportFGS) {
                return false;
            }
            supportFineGrainedSystem = true;
            break;
        case FGS_NO:
            supportFineGrainedSystem = false;
            break;
        case FGS_DEFAULT:
        default:
            break;
    }

    size_t count = kernelParams.getNumberOfSvmPtr();
    size_t execInfoOffset = kernelParams.getExecInfoOffset();
    bool sync = true;

    amd::Memory* memory = NULL;
    //get svm non arugment information
    void* const* svmPtrArray =
        reinterpret_cast<void* const*>(params + execInfoOffset);
    for (size_t i = 0; i < count; i++) {
        memory =  amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
        if (NULL == memory) {
            if (!supportFineGrainedSystem) {
                return false;
            }
            else if (sync) {
                // Sync AQL packets
                setAqlHeader(kDispatchPacketHeader);
                // Clear memory dependency state
                const static bool All = true;
                memoryDependency().clear(!All);
                continue;
            }
        }
        else {
            Memory* gpuMemory = static_cast<Memory*>(memory->getDeviceMemory(dev()));
            if (NULL != gpuMemory) {
                const static bool IsReadOnly = false;
                // Validate SVM passed in the non argument list
                memoryDependency().validate(*this, gpuMemory, IsReadOnly);
            }
            else {
                return false;
            }
        }
    }

    // Check all parameters for the current kernel
    for (size_t i = 0; i < signature.numParameters(); ++i) {
        const amd::KernelParameterDescriptor& desc = signature.at(i);
        const HsailKernelArg*  arg = hsaKernel.hsailArgAt(i);
        Memory* memory = NULL;
        bool    readOnly = false;
        amd::Memory* svmMem = NULL;

        // Find if current argument is a buffer
        if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) {
            if (kernelParams.boundToSvmPointer(dev(), params, i)) {
                svmMem = amd::SvmManager::FindSvmBuffer(
                    *reinterpret_cast<void* const*>(params + desc.offset_));
                if (!svmMem) {
                    // Sync AQL packets
                    setAqlHeader(kDispatchPacketHeader);
                    // Clear memory dependency state
                    const static bool All = true;
                    memoryDependency().clear(!All);
                    continue;
                }
            }

            if (*reinterpret_cast<amd::Memory* const*>
                    (params + desc.offset_) != NULL) {
                if (NULL == svmMem) {
                    memory = static_cast<Memory*>((*reinterpret_cast<amd::Memory* const*>
                            (params + desc.offset_))->getDeviceMemory(dev()));
                }
                else {
                    memory = static_cast<Memory*>(svmMem->getDeviceMemory(dev()));
                }
            }

            if (memory != NULL) {
                // Check image
                readOnly = (desc.accessQualifier_ ==
                    CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
                // Check buffer
                readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
                // Validate memory for a dependency in the queue
                memoryDependency().validate(*this, memory, readOnly);
            }
        }
    }

    return true;
}

template<typename AqlPacket>
bool
VirtualGPU::dispatchGenericAqlPacket(
    AqlPacket* packet,
    bool blocking)
{
    const uint32_t queueSize = gpu_queue_->size;
    const uint32_t queueMask = queueSize - 1;

    //Check for queue full and wait if needed.
    uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_);
    uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
    hsa_signal_t signal;

    // TODO: placeholder to setup the kernel to populate start and end timestamp.
    if (timestamp_ != nullptr) {
        // Find signal slot
        ProfilingSignal* profilingSignal = &signal_pool_[index & queueMask];
        // Make sure we save the old results in the TS structure
        if (profilingSignal->ts_ != nullptr) {
            profilingSignal->ts_->checkGpuTime();
        }
        // Update the new TS with the signal info
        timestamp_->setProfilingSignal(profilingSignal);
        packet->completion_signal = profilingSignal->signal_;
        profilingSignal->ts_ = timestamp_;
        timestamp_->setAgent(gpu_device_);
    }

    if ((index - read) == queueMask) {
        if (packet->completion_signal.handle == 0) {
            packet->completion_signal = barrier_signal_;
        }
        signal = packet->completion_signal;
        // Initialize signal for a wait
        hsa_signal_store_relaxed(signal, InitSignalValue);
        blocking = true;
    }

    //Insert packet
    ((AqlPacket*)(
        gpu_queue_->base_address))[index & queueMask] = *packet;
    hsa_queue_store_write_index_release(gpu_queue_, index + 1);
    hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index);

    //Wait on signal ?
    if (blocking) {
        if (hsa_signal_wait_acquire(
            signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1),
            HSA_WAIT_STATE_BLOCKED) != 0) {
            LogPrintfError("Failed signal [0x%lx] wait", signal.handle);
            return false;
        }

        // Release the pool, since runtime just drained the entire queue
        resetKernArgPool();
    }

    return true;
}

bool
VirtualGPU::dispatchAqlPacket(
    hsa_kernel_dispatch_packet_t* packet,
    bool blocking)
{
  return dispatchGenericAqlPacket(packet, blocking);
}

bool
VirtualGPU::dispatchAqlPacket(
    hsa_barrier_and_packet_t* packet,
    bool blocking)
{
  return dispatchGenericAqlPacket(packet, blocking);
}

void
VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet)
{
    assert(packet->completion_signal.handle != 0);
    const uint32_t queueSize = gpu_queue_->size;
    const uint32_t queueMask = queueSize - 1;

    uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_);
    ((hsa_barrier_and_packet_t*)(
        gpu_queue_->base_address))[index&queueMask] = *packet;

    hsa_queue_store_write_index_relaxed(gpu_queue_, index + 1);

    hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index);
}

/**
 * @brief Waits on an outstanding kernel without regard to how
 * it was dispatched - with or without a signal
 *
 * @return bool true if Wait returned successfully, false
 * otherwise
 */
bool VirtualGPU::releaseGpuMemoryFence() {
    // Return if there is no pending dispatch
    if (!hasPendingDispatch_) {
      return false;
    }

    // Initialize signal for the barrier packet.
    hsa_signal_store_relaxed(barrier_signal_, InitSignalValue);

    // Dispatch barrier packet into the queue and wait till it finishes.
    dispatchBarrierPacket(&barrier_packet_);
    if (hsa_signal_wait_acquire(
        barrier_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1),
        HSA_WAIT_STATE_BLOCKED) != 0) {
        LogError("Barrier packet submission failed");
        return false;
    }

    hasPendingDispatch_ = false;

    // Release all memory dependencies
    memoryDependency().clear();

    // Release the pool, since runtime just completed a barrier
    resetKernArgPool();

    return true;
}

VirtualGPU::VirtualGPU(Device &device)
    : device::VirtualDevice(device)
    , roc_device_(device)
{
    gpu_device_ = device.getBackendDevice();
    // Initialize the last signal and dispatch flags
    timestamp_ = NULL;
    hasPendingDispatch_ = false;
    tools_lib_ = NULL;

    kernarg_pool_base_ = NULL;
    kernarg_pool_size_ = 0;
    kernarg_pool_cur_offset_ = 0;
    aqlHeader_ = kDispatchPacketHeaderNoSync;
    barrier_signal_.handle = 0;
}

VirtualGPU::~VirtualGPU()
{
    if (timestamp_ != NULL) {
        delete timestamp_;
        timestamp_ = NULL;
        LogError("There was a timestamp that was not used; deleting.");
    }
    if (printfdbg_ != NULL){
        delete printfdbg_;
        printfdbg_ = NULL;
    }

    tools_lib_ = NULL;
}

bool
VirtualGPU::create(bool profilingEna)
{
    // Set the event handle to the tools lib if the env var
    // Load the library using its advertised "soname"
    std::string lib_name = amd::Os::getEnvironment("HSA_TOOLS_LIB");
    if (lib_name != "") {
#if defined(_WIN32) || defined(__CYGWIN__)
        const char *tools_lib_name = "hsa-runtime-tools" LP64_SWITCH("", "64") ".dll";
#else
        const char *tools_lib_name = "libhsa-runtime-tools" LP64_SWITCH("", "64") ".so.1";
#endif
        tools_lib_ = amd::Os::loadLibrary(tools_lib_name);
    }

    uint32_t queue_max_packets = 0;
    if (HSA_STATUS_SUCCESS !=
        hsa_agent_get_info(
        gpu_device_, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_max_packets)) {
        return false;
    }

    //Pick a reasonable queue size
    uint32_t queue_size = 1024;
    queue_size = (queue_max_packets < queue_size) ? queue_max_packets : queue_size;
    while (hsa_queue_create(gpu_device_,
        queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT_MAX, UINT_MAX,
        &gpu_queue_) != HSA_STATUS_SUCCESS) {
        queue_size >>= 1;
        if (queue_size < 64) {
            return false;
        }
    }

    if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) {
        LogError("Couldn't allocate arguments/signals for the queue");
        return false;
    }

    device::BlitManager::Setup  blitSetup;
    blitMgr_ = new KernelBlitManager(*this, blitSetup);
    if ((NULL == blitMgr_) || !blitMgr_->create(roc_device_)) {
        LogError("Could not create BlitManager!");
        return false;
    }

    // Create signal for the barrier packet.
    hsa_signal_t signal = { 0 };
    if (HSA_STATUS_SUCCESS !=
        hsa_signal_create(InitSignalValue, 0, NULL, &signal)) {
        return false;
    }
    barrier_signal_ = signal;

    // Initialize barrier packet.
    memset(&barrier_packet_, 0, sizeof(barrier_packet_));
    barrier_packet_.header = kBarrierPacketHeader;
    barrier_packet_.completion_signal = barrier_signal_;

    // Create a object of PrintfDbg
    printfdbg_ = new PrintfDbg(roc_device_);
    if (NULL == printfdbg_) {
        LogError("\nCould not create printfDbg Object!");
        return false;
    }

    // Initialize timestamp conversion factor
    if (Timestamp::getGpuTicksToTime() == 0) {
        uint64_t frequency;
        hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency);
        Timestamp::setGpuTicksToTime(1e9/double(frequency));
    }

    if (!memoryDependency().create(GPU_NUM_MEM_DEPENDENCY)) {
        LogError("Could not create the array of memory objects!");
        return false;
    }

    return true;
}

bool
VirtualGPU::terminate()
{
    delete blitMgr_;

    // Release the resources of signal
    releaseGpuMemoryFence();
    hsa_status_t err = hsa_queue_destroy(gpu_queue_);
    if (err !=  HSA_STATUS_SUCCESS) {
        return false;
    }

    if (barrier_signal_.handle != 0) {
        hsa_signal_destroy(barrier_signal_);
    }

    if (tools_lib_) {
        amd::Os::unloadLibrary(tools_lib_);
        tools_lib_ = NULL;
    }

    destroyPool();

    return true;
}

bool
VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count)
{
    kernarg_pool_size_ = kernarg_pool_size;
    kernarg_pool_base_ = reinterpret_cast<char *>(
        roc_device_.hostAlloc(kernarg_pool_size_, 1, true));
    if (kernarg_pool_base_ == nullptr) {
        return false;
    }

    if (signal_pool_count != 0) {
        signal_pool_.resize(signal_pool_count);
        for (uint i = 0; i < signal_pool_count; ++i) {
            ProfilingSignal profilingSignal;
            if (HSA_STATUS_SUCCESS != hsa_signal_create(
                0, 0, nullptr, &profilingSignal.signal_)) {
                return false;
            }
            signal_pool_[i] = profilingSignal;
        }
    }

    return true;
}

void
VirtualGPU::destroyPool() {
    if (kernarg_pool_base_ != nullptr) {
        roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_);
    }

    if (signal_pool_.size() > 0) {
        for (uint i = 0; i < signal_pool_.size(); ++i) {
            hsa_signal_destroy(signal_pool_[i].signal_);
        }
    }
}

void*
VirtualGPU::allocKernArg(size_t size, size_t alignment)
{
    char* result = nullptr;
    do {
        result = amd::alignUp(kernarg_pool_base_ + kernarg_pool_cur_offset_, alignment);
        const size_t pool_new_usage = (result + size) - kernarg_pool_base_;
        if (pool_new_usage <= kernarg_pool_size_) {
            kernarg_pool_cur_offset_ = pool_new_usage;
            return result;
        }
        else {
            //! We run out of the arguments space!
            //! That means the app didn't call clFlush/clFinish for very long time.
            //! We can issue a barrier to avoid expensive extra memory allocations.

            // Initialize signal for the barrier packet.
            hsa_signal_store_relaxed(barrier_signal_, InitSignalValue);

            // Dispatch barrier packet into the queue and wait till it finishes.
            dispatchBarrierPacket(&barrier_packet_);
            if (hsa_signal_wait_acquire(
                barrier_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1),
                HSA_WAIT_STATE_BLOCKED) != 0) {
                LogError("Kernel arguments reset failed");
            }

            resetKernArgPool();
        }
    } while (true);

    return result;
}

/* profilingBegin, when profiling is enabled, creates a timestamp to save in
* virtualgpu's timestamp_, and calls start() to get the current host
* timestamp.
*/
void VirtualGPU::profilingBegin(amd::Command &command, bool drmProfiling)
{
    if (command.profilingInfo().enabled_) {
        if (timestamp_ != NULL) {
            LogWarning("Trying to create a second timestamp in VirtualGPU. \
                        This could have unintended consequences.");
            return;
        }
        timestamp_ = new Timestamp;
        timestamp_->start();
    }
}

/* profilingEnd, when profiling is enabled, checks to see if a signal was
* created for whatever command we are running and calls end() to get the
* current host timestamp if no signal is available. It then saves the pointer
* timestamp_ to the command's data.
*/
void VirtualGPU::profilingEnd(amd::Command &command)
{
    if (command.profilingInfo().enabled_) {
        if (timestamp_->getProfilingSignal() == nullptr) {
            timestamp_->end();
        }
        command.setData(reinterpret_cast<void*>(timestamp_));
        timestamp_ = NULL;
    }
}

struct DestroySampler : public std::binary_function<hsa_ext_sampler_t,
                                                    hsa_agent_t, bool> {
    bool operator() (hsa_ext_sampler_t &sampler,
                     hsa_agent_t agent) const {
        hsa_status_t status = hsa_ext_sampler_destroy(agent, sampler);
        return status == HSA_STATUS_SUCCESS;
    }
};

void VirtualGPU::updateCommandsState(amd::Command *list)
{
    Timestamp *ts = NULL;

    amd::Command* current = list;
    amd::Command* next = NULL;

    if (current == NULL) {
        return;
    }

    uint64_t endTimeStamp = 0;
    uint64_t startTimeStamp = endTimeStamp;

    if (current->profilingInfo().enabled_) {
        // TODO: use GPU timestamp when available.
        endTimeStamp = amd::Os::timeNanos();
        startTimeStamp = endTimeStamp;

        // This block gets the first valid timestamp from the first command
        // that has one. This timestamp is used below to mark any command that
        // came before it to start and end with this first valid start time.
        current = list;
        while (current != NULL) {
            if (current->data() != NULL) {
                ts = reinterpret_cast<Timestamp*>(current->data());
                    startTimeStamp = ts->getStart();
                    endTimeStamp = ts->getStart();
                break;
            }
            current = current->getNext();
        }
    }

    // Iterate through the list of commands, and set timestamps as appropriate
    // Note, if a command does not have a timestamp, it does one of two things:
    // - if the command (without a timestamp), A, precedes another command, C,
    // that _does_ contain a valid timestamp, command A will set RUNNING and
    // COMPLETE with the RUNNING (start) timestamp from command C. This would
    // also be true for command B, which is between A and C. These timestamps
    // are actually retrieved in the block above (startTimeStamp, endTimeStamp).
    // - if the command (without a timestamp), C, follows another command, A,
    // that has a valid timestamp, command C will be set RUNNING and COMPLETE
    // with the COMPLETE (end) timestamp of the previous command, A. This is
    // also true for any command B, which falls between A and C.
    current = list;
    while (current != NULL) {
        if (current->profilingInfo().enabled_) {
            if (current->data() != NULL) {
                // Since this is a valid command to get a timestamp, we use the
                // timestamp provided by the runtime (saved in the data())
                ts = reinterpret_cast<Timestamp*>(current->data());
                    startTimeStamp = ts->getStart();
                    endTimeStamp = ts->getEnd();
                delete ts;
                current->setData(NULL);
            }
            else {
                // If we don't have a command that contains a valid timestamp,
                // we simply use the end timestamp of the previous command.
                // Note, if this is a command before the first valid timestamp,
                // this will be equal to the start timestamp of the first valid
                // timestamp at this point.
                startTimeStamp = endTimeStamp;
            }
        }

        if (current->status() == CL_SUBMITTED) {
            current->setStatus(CL_RUNNING, startTimeStamp);
            current->setStatus(CL_COMPLETE, endTimeStamp);
        }
        else if (current->status() != CL_COMPLETE) {
            LogPrintfError("Unexpected command status - %d.", current->status());
        }

        next = current->getNext();
        current->release();
        current = next;
    }

    // Release the sampler handles allocated for the various
    // on one or more kernel submissions
    std::for_each(samplerList_.begin(),
      samplerList_.end(),
      std::bind2nd<DestroySampler>(DestroySampler(), gpu_device_));
    samplerList_.clear();

    return;
}

void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand &cmd)
{
    // Wait on a kernel if one is outstanding
    releaseGpuMemoryFence();

    profilingBegin(cmd);

    size_t offset = 0;
    // Find if virtual address is a CL allocation
    device::Memory* hostMemory = dev().findMemoryFromVA(cmd.destination(), &offset);

    device::Memory *devMem = cmd.source().getDeviceMemory(dev());
    void *dst = cmd.destination();
    amd::Coord3D size = cmd.size();

    //! @todo: add multi-devices synchronization when supported.

    cl_command_type type = cmd.type();
    bool result = false;
    bool imageBuffer = false;

    // Force buffer read for IMAGE1D_BUFFER
    if ((type == CL_COMMAND_READ_IMAGE) &&
        (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
        type = CL_COMMAND_READ_BUFFER;
        imageBuffer = true;
    }

    switch (type) {
        case CL_COMMAND_READ_BUFFER: {
            amd::Coord3D    origin(cmd.origin()[0]);
            if (imageBuffer) {
                size_t  elemSize =
                  cmd.source().asImage()->getImageFormat().getElementSize();
                origin.c[0] *= elemSize;
                size.c[0] *= elemSize;
            }
            if (hostMemory != nullptr) {
                // Accelerated transfer without pinning
                amd::Coord3D dstOrigin(offset);
                result = blitMgr().copyBuffer(*devMem, *hostMemory,
                    origin, dstOrigin, size, cmd.isEntireMemory());
            }
            else {
                result = blitMgr().readBuffer(
                        *devMem, dst, origin, size,
                        cmd.isEntireMemory());
            }
            break;
        }
        case CL_COMMAND_READ_BUFFER_RECT: {
            result = blitMgr().readBufferRect(
                        *devMem, dst, cmd.bufRect(), cmd.hostRect(), size,
                        cmd.isEntireMemory());
            break;
        }
        case CL_COMMAND_READ_IMAGE: {
            result = blitMgr().readImage(
              *devMem, dst, cmd.origin(), size, cmd.rowPitch(),
              cmd.slicePitch(), cmd.isEntireMemory());
            break;
        }
        default:
            ShouldNotReachHere();
            break;
    }

    if (!result) {
        LogError("submitReadMemory failed!");
        cmd.setStatus(CL_OUT_OF_RESOURCES);
    }

    profilingEnd(cmd);
}

void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand &cmd)
{
    // Wait on a kernel if one is outstanding
    releaseGpuMemoryFence();

    profilingBegin(cmd);

    size_t offset = 0;
    // Find if virtual address is a CL allocation
    device::Memory* hostMemory = dev().findMemoryFromVA(cmd.source(), &offset);

    device::Memory *devMem = cmd.destination().getDeviceMemory(dev());
    const char *src = static_cast<const char *>(cmd.source());
    amd::Coord3D size = cmd.size();

    //! @todo add multi-devices synchronization when supported.

    cl_command_type type = cmd.type();
    bool result = false;
    bool imageBuffer = false;

    // Force buffer write for IMAGE1D_BUFFER
    if ((type == CL_COMMAND_WRITE_IMAGE) &&
        (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
        type = CL_COMMAND_WRITE_BUFFER;
        imageBuffer = true;
    }

    switch (type) {
        case CL_COMMAND_WRITE_BUFFER: {
            amd::Coord3D    origin(cmd.origin()[0]);
            if (imageBuffer) {
                size_t  elemSize =
                  cmd.destination().asImage()->getImageFormat().getElementSize();
                origin.c[0] *= elemSize;
                size.c[0] *= elemSize;
            }
            if (hostMemory != nullptr) {
                // Accelerated transfer without pinning
                amd::Coord3D srcOrigin(offset);
                result = blitMgr().copyBuffer(*hostMemory, *devMem,
                    srcOrigin, origin, size, cmd.isEntireMemory());
            }
            else {
                result = blitMgr().writeBuffer(
                        src, *devMem , origin, size,
                        cmd.isEntireMemory());
            }
            break;
        }
        case CL_COMMAND_WRITE_BUFFER_RECT: {
            result = blitMgr().writeBufferRect(
                        src, *devMem, cmd.hostRect(), cmd.bufRect(), size,
                        cmd.isEntireMemory());
            break;
        }
        case CL_COMMAND_WRITE_IMAGE: {
            result = blitMgr().writeImage(
              src, *devMem, cmd.origin(), size, cmd.rowPitch(),
              cmd.slicePitch(), cmd.isEntireMemory());
            break;
        }
        default:
            ShouldNotReachHere();
            break;
    }

    if (!result) {
        LogError("submitWriteMemory failed!");
        cmd.setStatus(CL_OUT_OF_RESOURCES);
    }
    else {
        cmd.destination().signalWrite(&dev());
    }

    profilingEnd(cmd);
}

void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd)
{
    // in-order semantics: previous commands need to be done before we start
    releaseGpuMemoryFence();

    profilingBegin(cmd);
    const std::vector<void*>& svmPointers = cmd.svmPointers();
    if (cmd.pfnFreeFunc() == NULL) {
        // pointers allocated using clSVMAlloc
        for (cl_uint i = 0; i < svmPointers.size(); i++) {
            amd::SvmBuffer::free(cmd.context(), svmPointers[i]);
        }
    }
    else {
        cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(),
                (void**) (&(svmPointers[0])), cmd.userData());
    }
    profilingEnd(cmd);
}

void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd)
{
    // in-order semantics: previous commands need to be done before we start
    releaseGpuMemoryFence();
    profilingBegin(cmd);
    amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1);
    profilingEnd(cmd);
}

void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd)
{
    // in-order semantics: previous commands need to be done before we start
    releaseGpuMemoryFence();
    profilingBegin(cmd);
    amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times());
    profilingEnd(cmd);
}

void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd)
{
    // Wait on a kernel if one is outstanding
    releaseGpuMemoryFence();

    profilingBegin(cmd);

    device::Memory *srcDevMem = cmd.source().getDeviceMemory(dev());
    device::Memory *destDevMem = cmd.destination().getDeviceMemory(dev());
    amd::Coord3D size = cmd.size();

    //! @todo add multi-devices synchronization when supported.

    cl_command_type type = cmd.type();
    bool result = false;
    bool srcImageBuffer = false;
    bool dstImageBuffer = false;

    // Force buffer copy for IMAGE1D_BUFFER
    if (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
        srcImageBuffer = true;
        type = CL_COMMAND_COPY_BUFFER;
    }
    if (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
        dstImageBuffer = true;
        type = CL_COMMAND_COPY_BUFFER;
    }

    switch (cmd.type()) {
        case CL_COMMAND_COPY_BUFFER: {
            amd::Coord3D    srcOrigin(cmd.srcOrigin()[0]);
            amd::Coord3D    dstOrigin(cmd.dstOrigin()[0]);

            if (srcImageBuffer) {
                const size_t elemSize =
                    cmd.source().asImage()->getImageFormat().getElementSize();
                srcOrigin.c[0] *= elemSize;
                if (dstImageBuffer) {
                    dstOrigin.c[0] *= elemSize;
                }
                size.c[0] *= elemSize;
            }
            else if (dstImageBuffer) {
                const size_t elemSize =
                    cmd.destination().asImage()->getImageFormat().getElementSize();
                dstOrigin.c[0] *= elemSize;
                size.c[0] *= elemSize;
            }

            result = blitMgr().copyBuffer(
                        *srcDevMem, *destDevMem, srcOrigin,
                        dstOrigin, size, cmd.isEntireMemory());
            break;
        }
        case CL_COMMAND_COPY_BUFFER_RECT: {
            result = blitMgr().copyBufferRect(
                        *srcDevMem, *destDevMem, cmd.srcRect(),
                        cmd.dstRect(), size, cmd.isEntireMemory());
            break;
        }
        case CL_COMMAND_COPY_IMAGE: {
            result = blitMgr().copyImage(
              *srcDevMem, *destDevMem, cmd.srcOrigin(),
              cmd.dstOrigin(), size, cmd.isEntireMemory());
            break;
        }
        case CL_COMMAND_COPY_IMAGE_TO_BUFFER: {
            result = blitMgr().copyImageToBuffer(
              *srcDevMem, *destDevMem, cmd.srcOrigin(),
              cmd.dstOrigin(), size, cmd.isEntireMemory());
            break;
        }
        case CL_COMMAND_COPY_BUFFER_TO_IMAGE: {
            result = blitMgr().copyBufferToImage(
              *srcDevMem, *destDevMem, cmd.srcOrigin(),
              cmd.dstOrigin(), size, cmd.isEntireMemory());
            break;
        }
        default:
            ShouldNotReachHere();
            break;
    }

    if (!result) {
        LogError("submitCopyMemory failed!");
        cmd.setStatus(CL_OUT_OF_RESOURCES);
    }

    cmd.destination().signalWrite(&dev());

    profilingEnd(cmd);
}

void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd)
{
    // No fence is needed since this is a no-op: the
    // command will be completed only after all the
    // previous commands are complete
    profilingBegin(cmd);
    profilingEnd(cmd);
}

void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd)
{
    // No fence is needed since this is a no-op: the
    // command will be completed only after all the
    // previous commands are complete
    profilingBegin(cmd);
    profilingEnd(cmd);
}

void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)
{
    // Wait on a kernel if one is outstanding
    releaseGpuMemoryFence();

    profilingBegin(cmd);

    //! @todo add multi-devices synchronization when supported.

    roc::Memory *devMemory = reinterpret_cast<roc::Memory *>(
        cmd.memory().getDeviceMemory(dev(), false));

    cl_command_type type = cmd.type();
    bool imageBuffer = false;

    // Save map requirement.
    cl_map_flags mapFlag = cmd.mapFlags();

    // Treat no map flag as read-write.
    if (mapFlag == 0) {
        mapFlag = CL_MAP_READ | CL_MAP_WRITE;
    }

    devMemory->saveMapInfo(cmd.mapPtr(), cmd.origin(), cmd.size(),
        mapFlag, cmd.isEntireMemory());

    // Sync to the map target.
    if (devMemory->isHostMemDirectAccess()) {
        // Add memory to VA cache, so rutnime can detect direct access to VA
        dev().addVACache(devMemory);
    }
    if ((!devMemory->isHostMemDirectAccess()) &&
        (mapFlag & (CL_MAP_READ | CL_MAP_WRITE))) {
        bool result = false;
        roc::Memory *hsaMemory = static_cast<roc::Memory *>(devMemory);

        amd::Memory* mapMemory = hsaMemory->mapMemory();
        void *hostPtr = mapMemory == NULL ?
            hsaMemory->owner()->getHostMem() :
            mapMemory->getHostMem();

        if (type == CL_COMMAND_MAP_BUFFER) {
            amd::Coord3D    origin(cmd.origin()[0]);
            amd::Coord3D    size(cmd.size()[0]);
            amd::Coord3D    dstOrigin(cmd.origin()[0], 0, 0);
            if (imageBuffer) {
                size_t  elemSize =
                  cmd.memory().asImage()->getImageFormat().getElementSize();
                origin.c[0] *= elemSize;
                size.c[0] *= elemSize;
            }

            if (mapMemory != nullptr) {
                roc::Memory *hsaMapMemory = static_cast<roc::Memory *>(
                    mapMemory->getDeviceMemory(dev(), false));
                result = blitMgr().copyBuffer(*hsaMemory,
                    *hsaMapMemory, origin, dstOrigin,
                    size, cmd.isEntireMemory());
            }
            else {
                result = blitMgr().readBuffer(
                    *hsaMemory, static_cast<char *>(hostPtr)+origin[0],
                    origin, size, cmd.isEntireMemory());
            }

        }
        else if (type == CL_COMMAND_MAP_IMAGE) {
            amd::Image* image = cmd.memory().asImage();
            result = blitMgr().readImage(
                *hsaMemory, hostPtr, amd::Coord3D(0),
                image->getRegion(), image->getRowPitch(),
                image->getSlicePitch(), true);
        }
        else {
            ShouldNotReachHere();
        }

        if (!result) {
            LogError("submitMapMemory failed!");
            cmd.setStatus(CL_OUT_OF_RESOURCES);
        }
    }

    profilingEnd(cmd);
}

void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand &cmd)
{
    roc::Memory* devMemory = static_cast<roc::Memory*>(
        cmd.memory().getDeviceMemory(dev(), false));

    const device::Memory::WriteMapInfo* mapInfo =
        devMemory->writeMapInfo(cmd.mapPtr());
    if (nullptr == mapInfo) {
        LogError("Unmap without map call");
        return;
    }
    // Wait on a kernel if one is outstanding
    releaseGpuMemoryFence();
    profilingBegin(cmd);

    // Force buffer write for IMAGE1D_BUFFER
    bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER);

    if (devMemory->isHostMemDirectAccess()) {
        // Remove memory from VA cache
        dev().removeVACache(devMemory);
    }
    if (mapInfo->isUnmapWrite()) {
        // Commit the changes made by the user.
        if (devMemory->owner()->getHostMem() == nullptr) {
            bool result = false;

            if (cmd.memory().asImage() && !imageBuffer) {
                amd::Image *image = cmd.memory().asImage();
                result = blitMgr().writeImage(
                    cmd.mapPtr(), *devMemory,
                    mapInfo->origin_,
                    mapInfo->region_,
                    image->getRowPitch(), image->getSlicePitch());
            }
            else {
                amd::Coord3D origin(mapInfo->origin_[0]);
                amd::Coord3D size(mapInfo->region_[0]);
                if (imageBuffer) {
                    size_t  elemSize =
                        cmd.memory().asImage()->getImageFormat().getElementSize();
                    origin.c[0] *= elemSize;
                    size.c[0] *= elemSize;
                }
                if (devMemory->mapMemory() != nullptr) {
                    roc::Memory *mapMemory = static_cast<roc::Memory *>(
                        devMemory->mapMemory()->getDeviceMemory(dev(), false));

                    result = blitMgr().copyBuffer(
                        *mapMemory, *devMemory,
                        mapInfo->origin_,
                        mapInfo->origin_,
                        mapInfo->region_,
                        mapInfo->isEntire());
                }
                else {
                    result = blitMgr().writeBuffer(
                        cmd.mapPtr(), *devMemory, origin, size);
                }
            }
            if (!result) {
                LogError("submitMapMemory failed!");
                cmd.setStatus(CL_OUT_OF_RESOURCES);
            }
        }

        cmd.memory().signalWrite(&dev());
    }

    devMemory->clearUnmapInfo(cmd.mapPtr());

    profilingEnd(cmd);
}

void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd)
{
    // Wait on a kernel if one is outstanding
    releaseGpuMemoryFence();

    profilingBegin(cmd);

    device::Memory *devMemory = cmd.memory().getDeviceMemory(dev(), false);

    //! @todo add multi-devices synchronization when supported.

    cl_command_type type = cmd.type();
    bool result = false;
    bool imageBuffer = false;
    float fillValue[4];

    // Force fill buffer for IMAGE1D_BUFFER
    if ((type == CL_COMMAND_FILL_IMAGE) &&
        (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
        type = CL_COMMAND_FILL_BUFFER;
        imageBuffer = true;
    }

    // Find the the right fill operation
    switch (type) {
        case CL_COMMAND_FILL_BUFFER: {
            const void* pattern = cmd.pattern();
            size_t  patternSize = cmd.patternSize();
            amd::Coord3D    origin(cmd.origin()[0]);
            amd::Coord3D    size(cmd.size()[0]);
            // Reprogram fill parameters if it's an IMAGE1D_BUFFER object
            if (imageBuffer) {
                size_t  elemSize =
                    cmd.memory().asImage()->getImageFormat().getElementSize();
                origin.c[0] *= elemSize;
                size.c[0] *= elemSize;
                memset(fillValue, 0, sizeof(fillValue));
                cmd.memory().asImage()->getImageFormat().formatColor(pattern,
                                                                     fillValue);
                pattern = fillValue;
                patternSize = elemSize;
            }
            result = blitMgr().fillBuffer(
                        *devMemory, pattern, patternSize, origin, size,
                        cmd.isEntireMemory());
            break;
        }
        case CL_COMMAND_FILL_IMAGE: {
            result = blitMgr().fillImage(
              *devMemory, cmd.pattern(), cmd.origin(), cmd.size(),
              cmd.isEntireMemory());
            break;
        }
        default:
            ShouldNotReachHere();
            break;
    }

    if (!result) {
        LogError("submitFillMemory failed!");
        cmd.setStatus(CL_OUT_OF_RESOURCES);
    }

    cmd.memory().signalWrite(&dev());

    profilingEnd(cmd);
}

void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand &vcmd)
{
    // Wait on a kernel if one is outstanding
    releaseGpuMemoryFence();

    profilingBegin(vcmd);

    std::vector<amd::Memory *>::const_iterator itr;

    for (itr = vcmd.memObjects().begin();
         itr != vcmd.memObjects().end();
         itr++) {
        // Find device memory
        device::Memory *m = (*itr)->getDeviceMemory(dev());
        roc::Memory *memory = static_cast<roc::Memory *>(m);

        if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) {
            //! @todo revisit this when multi devices is supported.
        } else if (vcmd.migrationFlags() &
                   CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
            //! @todo revisit this when multi devices is supported.
        } else {
            LogWarning("Unknown operation for memory migration!");
        }
    }

    profilingEnd(vcmd);
}

/*! \brief Writes to the buffer and incrememts the write pointer to the
 *         buffer. Also, ensures that the argument is written to an
 *         aligned memory as specified
 *
 * @param dst The write pointer to the buffer
 * @param src The source pointer
 * @param size The size in bytes to copy
 * @param alignment The alignment to follow while writing to the buffer
 */
static void
addArg(unsigned char** dst, const void* src,
                               size_t size, uint32_t alignment)
{
    *dst = amd::alignUp(*dst, alignment);
    memcpy(*dst, src, size);
    *dst += size;
}

static inline void
addArg(unsigned char** dst, const void* src, size_t size)
{
    assert(size < UINT32_MAX);
    addArg(dst, src, size, size);
}

 //Over rides the workgroup size fields in the packet with runtime/compiler set sizes
 void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
     amd::NDRangeContainer sizes,
     const size_t* compile_size,
     const roc::Device &dev){
     //Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid
    // else this error check is incorrect
    if (compile_size[0] || compile_size[1] || compile_size[2]) {
        dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? compile_size[0] : 1;
        dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? compile_size[1] : 1;
        dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? compile_size[2] : 1;
    }
    else {
        //Runtime must set the group size
        dispatchPacket.workgroup_size_x = 1;
        dispatchPacket.workgroup_size_y = 1;
        dispatchPacket.workgroup_size_z = 1;

       if (sizes.dimensions() == 1) {
            dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize_;
        }
        else if (sizes.dimensions() == 2) {
            dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize2DX_;
            dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize2DY_;
        }
        else if (sizes.dimensions() == 3) {
            dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize3DX_;
            dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize3DY_;
            dispatchPacket.workgroup_size_z = dev.settings().maxWorkGroupSize3DZ_;
        }
    }
}

 static void
fillSampleDescriptor(
    hsa_ext_sampler_descriptor_t& samplerDescriptor,
   const amd::Sampler& sampler)
 {
     samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST ?
     HSA_EXT_SAMPLER_FILTER_MODE_NEAREST : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
     samplerDescriptor.coordinate_mode = sampler.normalizedCoords() ?
        HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED :
        HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
     switch (sampler.addressingMode()) {
     case CL_ADDRESS_CLAMP_TO_EDGE:
         samplerDescriptor.address_mode =
            HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
         break;
     case CL_ADDRESS_REPEAT:
         samplerDescriptor.address_mode =
            HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT;
         break;
     case CL_ADDRESS_CLAMP:
         samplerDescriptor.address_mode =
            HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
         break;
     case CL_ADDRESS_MIRRORED_REPEAT:
         samplerDescriptor.address_mode =
            HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT;
         break;
     case CL_ADDRESS_NONE:
         samplerDescriptor.address_mode =
            HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED;
         break;
     default:
        return;
     }
   }

bool
VirtualGPU::submitKernelInternal(
    const amd::NDRangeContainer& sizes,
    const amd::Kernel& kernel,
    const_address parameters,
    void *eventHandle)
{
    if (tools_lib_) {
        SetOclCorrelationHandle(tools_lib_, this->gpu_device_, eventHandle);
    }

    device::Kernel *devKernel = const_cast<device::Kernel *>
                          (kernel.getDeviceKernel(dev()));
    Kernel &gpuKernel = static_cast<Kernel &>(*devKernel);
    const size_t compilerLdsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
    size_t ldsUsage = compilerLdsUsage;

    // Check memory dependency and SVM objects
    if (!processMemObjects(kernel, parameters)) {
        LogError("Wrong memory objects!");
        return false;
    }

    // Init PrintfDbg object if printf is enabled.
    bool printfEnabled = (gpuKernel.printfInfo().size() > 0) ? true : false;
    if (!printfDbg()->init(printfEnabled)){
        LogError("\nPrintfDbg object initialization failed!");
        return false;
    }

    // Allocate buffer to hold kernel arguments
    address argBuffer =
        (address)allocKernArg(gpuKernel.KernargSegmentByteSize(),
                              gpuKernel.KernargSegmentAlignment());

    if (argBuffer == NULL) {
        LogError("Out of memory");
        return false;
    }

    address argPtr = argBuffer;

    // The HLC generates Kernenv arguments, first 3 are global offsets.
    const uint extraAargs = ((roc::Kernel*)devKernel)->extraArgumentsNum();
    for (uint j = 0; j < extraAargs; ++j) {
        // The 4th parameter is the pointer to print buffer
        if (3 == j) {
            address bufferPtr = printfDbg()->dbgBuffer();
            addArg(&argPtr, &bufferPtr, sizeof(void*));
        }else {
            const size_t offset = j < sizes.dimensions() ? sizes.offset()[j] : 0;
            addArg(&argPtr, &offset, sizeof(void*)); //Should be uint32_t for small model and uint64_t for large!
        }
    }

    const amd::KernelSignature& signature = kernel.signature();
    const amd::KernelParameters& kernelParams = kernel.parameters();

    // Find all parameters for the current kernel
    for (uint i = 0; i != signature.numParameters(); ++i) {
        const HsailKernelArg* arg = gpuKernel.hsailArgAt(i);
        const_address srcArgPtr = parameters + signature.at(i).offset_;

        if (arg->type_ == HSAIL_ARGTYPE_POINTER ) {
            const size_t size = sizeof(void*);
            if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) {
                ldsUsage = amd::alignUp(ldsUsage, arg->alignment_);  //!< do we need this?
                addArg(&argPtr, &ldsUsage, size);
                ldsUsage += *reinterpret_cast<const size_t *>(srcArgPtr);
                continue;
            }
            assert((arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) &&
                   "Unsupported address qualifier");
            if (kernelParams.boundToSvmPointer(dev(), parameters, i)) {
                addArg(&argPtr, srcArgPtr, size);
                continue;
            }
            amd::Memory* mem = *reinterpret_cast<amd::Memory* const*>(srcArgPtr);
            if (mem == NULL) {
                addArg(&argPtr, srcArgPtr, size);
                continue;
            }

            Memory *devMem = static_cast<Memory *>(mem->getDeviceMemory(dev()));
            //! @todo add multi-devices synchronization when supported.
            void* globalAddress = devMem->getDeviceMemory();
            addArg(&argPtr, &globalAddress, size);

            //! @todo Compiler has to return read/write attributes
            const cl_mem_flags flags = mem->getMemFlags();
            if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
                mem->signalWrite(&dev());
            }
        }
        else if (arg->type_ == HSAIL_ARGTYPE_VALUE) {
            if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
                void *mem = allocKernArg(arg->size_, arg->alignment_);
                if (mem == NULL) {
                    LogError("Out of memory");
                    return false;
                }
                memcpy(mem, srcArgPtr, arg->size_);
                addArg(&argPtr, &mem, sizeof(void*));
                continue;
            }
            for (uint e = 0; e < arg->numElem_; ++e) {
                addArg(&argPtr, srcArgPtr, arg->size_);
                srcArgPtr += arg->size_;
            }
        }
        else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) {
          amd::Memory* mem = *reinterpret_cast<amd::Memory* const*>(srcArgPtr);
          Image* image = static_cast<Image *>(mem->getDeviceMemory(dev()));
          if (image == NULL) {
            LogError("Kernel image argument is not an image object");
            return false;
          }

          if (dev().settings().enableImageHandle_) {
            const uint64_t image_srd = image->getHsaImageObject().handle;
            assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
            addArg(&argPtr, &image_srd, sizeof(image_srd));
          }
          else {
            // Image arguments are of size 48 bytes and are aligned to 16 bytes
            addArg(&argPtr, (void *)image->getHsaImageObject().handle,
              HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
          }

          //! @todo Compiler has to return read/write attributes
          const cl_mem_flags flags = mem->getMemFlags();
          if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
            mem->signalWrite(&dev());
          }
        }
        else if (arg->type_ == HSAIL_ARGTYPE_SAMPLER) {
          amd::Sampler* sampler = *reinterpret_cast<amd::Sampler* const*>(srcArgPtr);
          if (sampler == NULL) {
            LogError("Kernel sampler argument is not an sampler object");
            return false;
          }

          hsa_ext_sampler_descriptor_t samplerDescriptor;
          fillSampleDescriptor(samplerDescriptor, *sampler);

          hsa_ext_sampler_t hsa_sampler;
          hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(),
            &samplerDescriptor, &hsa_sampler);
          if (status != HSA_STATUS_SUCCESS) {
            LogError("Error creating device sampler object!");
            return false;
          }

          if (dev().settings().enableImageHandle_) {
            uint64_t sampler_srd = hsa_sampler.handle;
            addArg(&argPtr, &sampler_srd, sizeof(sampler_srd));
            samplerList_.push_back(hsa_sampler);
            // TODO: destroy sampler.
          }
          else {
            argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);

            memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
            argPtr += HSA_SAMPLER_OBJECT_SIZE;
            hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
          }
        }
    }

    // Check there is no arguments' buffer overflow
    assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());

    // Check for group memory overflow
    //! @todo Check should be in HSA - here we should have at most an assert
    assert(roc_device_.info().localMemSizePerCU_ > 0);
    if (ldsUsage > roc_device_.info().localMemSizePerCU_) {
        LogError("No local memory available\n");
        return false;
    }

    //Initialize the dispatch Packet
    hsa_kernel_dispatch_packet_t dispatchPacket;
    memset(&dispatchPacket, 0, sizeof(dispatchPacket));

    dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();

    dispatchPacket.header = aqlHeader_;
    dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    dispatchPacket.grid_size_x = sizes.dimensions()>0 ? sizes.global()[0] : 1;
    dispatchPacket.grid_size_y = sizes.dimensions()>1 ? sizes.global()[1] : 1;
    dispatchPacket.grid_size_z = sizes.dimensions()>2 ? sizes.global()[2] : 1;

    const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
    if (sizes.local().product() != 0) {
        dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1;
        dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? sizes.local()[1] : 1;
        dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1;
    } else {
        setRuntimeCompilerLocalSize(dispatchPacket, sizes, compile_size, dev());
    }
    dispatchPacket.kernarg_address = argBuffer;
    dispatchPacket.group_segment_size = ldsUsage;
    dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;

    //Dispatch the packet
    if (!dispatchAqlPacket(&dispatchPacket, false)){
        return false;
    }

    // Mark the flag indicating if a dispatch is outstanding.
    // We are not waiting after every dispatch.
    hasPendingDispatch_ = true;

    // Output printf buffer
    if(!printfDbg()->output(*this, printfEnabled, gpuKernel.printfInfo())){
        LogError("\nCould not print data from the printf buffer!");
        return false;
    }
    return true;
}
/**
 * @brief Api to dispatch a kernel for execution. The implementation
 * parses the input object, an instance of virtual command to obtain
 * the parameters of global size, work group size, offsets of work
 * items, enable/disable profiling, etc.
 *
 * It also parses the kernel arguments buffer to inject into Hsa Runtime
 * the list of kernel parameters.
 */
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand &vcmd) {
    profilingBegin(vcmd);

    // Submit kernel to HW
    if (!submitKernelInternal(
            vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
            static_cast<void *>(as_cl(&vcmd.event())))) {
        LogError("AQL dispatch failed!");
        vcmd.setStatus(CL_INVALID_OPERATION);
    }

    profilingEnd(vcmd);
}

void VirtualGPU::submitNativeFn(amd::NativeFnCommand &cmd) {
  // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
}

void VirtualGPU::submitMarker(amd::Marker &cmd) {
  // std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
}

void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd)
{
  profilingBegin(vcmd);
  auto fence=kBarrierAcquirePacket;
  dispatchAqlPacket(&fence, false);
  profilingEnd(vcmd);
}

void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd)
{
  profilingBegin(vcmd);
  auto fence=kBarrierReleasePacket;
  dispatchAqlPacket(&fence, false);
  profilingEnd(vcmd);
}

void VirtualGPU::flush(amd::Command *list, bool wait) {
    releaseGpuMemoryFence();
    updateCommandsState(list);
}
}  // End of roc namespace