rocm-systems/rocclr/runtime/device/gpu/gpukernel.cpp

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//

#include "device/gpu/gpukernel.hpp"
#include "device/gpu/gpuprogram.hpp"
#include "device/gpu/gpublit.hpp"
#include "device/gpu/gpuconstbuf.hpp"
#include "device/gpu/gpusched.hpp"
#include "platform/commandqueue.hpp"
#include "utils/options.hpp"

#include "acl.h"
#include "SCShadersR678XXCommon.h"

#include <string>
#include <memory>
#include <fstream>
#include <sstream>
#include <iostream>
#include <ctime>

namespace gpu {

const MetaDataConst ArgState[ArgStateTotal] =
{
// Note: the order is important
    // Name                 Type                    Properties
// Kernel description (special properties)
{ "memory:compilerwrite",   KernelArg::PrivateFixed,    { 0, 0, 0, 0, 0, 0, 0 } },
    { "uniqueid:",          KernelArg::None,            { 0, 0, 0, 0, 0, 0, 0 } },
    { "memory:private:",    KernelArg::PrivateSize,     { 0, 0, 0, 0, 0, 0, 0 } },
    { "memory:local:",      KernelArg::LocalSize,       { 0, 0, 0, 0, 0, 0, 0 } },
    { "memory:hwprivate:",  KernelArg::HwPrivateSize,   { 0, 0, 0, 0, 0, 0, 0 } },
    { "memory:uavprivate:", KernelArg::HwPrivateSize,   { 0, 0, 0, 0, 0, 0, 0 } },
    { "memory:hwlocal:",    KernelArg::HwLocalSize,     { 0, 0, 0, 0, 0, 0, 0 } },
    { "memory:64bitABI",    KernelArg::ABI64Bit,        { 0, 0, 0, 0, 0, 0, 0 } },
    { "limitgroupsize",     KernelArg::Wavefront,       { 0, 0, 0, 0, 0, 0, 0 } },
    { "function:",          KernelArg::None,            { 1, 1, 0, 0, 0, 0, 0 } },
    { "intrinsic:",         KernelArg::None,            { 1, 0, 0, 0, 0, 0, 0 } },
    { "error:",             KernelArg::ErrorMessage,    { 0, 0, 0, 0, 0, 0, 0 } },
    { "warning:",           KernelArg::WarningMessage,  { 0, 0, 0, 0, 0, 0, 0 } },
    { "printf_fmt:",        KernelArg::PrintfFormatStr, { 0, 0, 0, 0, 0, 0, 0 } },
    { "version:",           KernelArg::MetadataVersion, { 0, 0, 0, 0, 0, 0, 0 } },
// Kernel basic types
    { "pointer:",           KernelArg::PointerGlobal,   { 1, 1, 1, 1, 1, 1, 0 } },
    { "value:",             KernelArg::Value,           { 1, 1, 1, 1, 1, 0, 0 } },
    { "image:",             KernelArg::Image,           { 1, 1, 1, 1, 1, 0, 0 } },
    { "sampler:",           KernelArg::Sampler,         { 0, 1, 0, 0, 0, 0, 0 } },
    { "counter:",           KernelArg::Counter,         { 1, 1, 0, 1, 1, 0, 0 } },
    { "cws:",               KernelArg::Grouping,        { 0, 0, 0, 0, 0, 0, 0 } },
    { "lws:",               KernelArg::WrkgrpSize,      { 0, 0, 0, 0, 0, 0, 0 } },
    { "uavid:",             KernelArg::UavId,           { 0, 0, 0, 0, 0, 0, 0 } },
    { "reflection:",        KernelArg::Reflection,      { 0, 0, 0, 0, 0, 0, 0 } },
    { "constarg:",          KernelArg::ConstArg,        { 0, 0, 0, 0, 0, 0, 0 } },
    { "cbid:",              KernelArg::ConstBufId,      { 0, 0, 0, 0, 0, 0, 0 } },
    { "printfid:",          KernelArg::PrintfBufId,     { 0, 0, 0, 0, 0, 0, 0 } },
    { "wsh:",               KernelArg::GroupingHint,    { 0, 0, 0, 0, 0, 0, 0 } },
    { "vth:",               KernelArg::VecTypeHint,     { 0, 0, 0, 0, 0, 0, 0 } },
};

const DataTypeConst DataType[] =
{
    { "i8:",        KernelArg::Char,    },
    { "i16:",       KernelArg::Short,   },
    { "i32:",       KernelArg::Int,     },
    { "i64:",       KernelArg::Long,    },
    { "u8:",        KernelArg::UChar,   },
    { "u16:",       KernelArg::UShort,  },
    { "u32:",       KernelArg::UInt,    },
    { "u64:",       KernelArg::ULong,   },
    { "float:",     KernelArg::Float,   },
    { "double:",    KernelArg::Double,  },
    { "struct:",    KernelArg::Struct,  },
    { "union:",     KernelArg::Union,   },
    { "1D:",        KernelArg::Image1D, },
    { "2D:",        KernelArg::Image2D, },
    { "3D:",        KernelArg::Image3D, },
    { "1DB:",       KernelArg::Image1DB, },
    { "1DA:",       KernelArg::Image1DA, },
    { "2DA:",       KernelArg::Image2DA, },
    { "opaque:",    KernelArg::Opaque,  },
    { "event:",     KernelArg::Event,   },
    { "sampler:",   KernelArg::Sampler, },
    { "half:",      KernelArg::Half, },
};

const uint DataTypeTotal = sizeof(DataType) / sizeof(DataTypeConst);

struct BufDataConst
{
    const char*     tagName_;       //!< buffer's name
    KernelArg::ArgumentType type_;  //!< type of argument
    struct
    {
        uint    number_     :  1;   //!< buffer's number
        uint    alignment_  :  1;   //!< buffer's alignment
        uint    attribute_  :  1;   //!< buffer's read/write attribute
        uint    reserved    : 29;   //!< reserved
    };
};

static const BufDataConst BufType[] =
{
    { "g",      KernelArg::PointerGlobal,   { 1, 0, 0, 0 } },
    { "p",      KernelArg::PointerPrivate,  { 1, 1, 1, 0 } },
    { "l",      KernelArg::PointerLocal,    { 1, 1, 1, 0 } },
    { "uav",    KernelArg::PointerGlobal,   { 1, 1, 1, 0 } },
    { "c",      KernelArg::PointerConst,    { 1, 1, 1, 0 } },
    { "hl",     KernelArg::PointerHwLocal,  { 1, 1, 1, 0 } },
    { "hp",     KernelArg::PointerHwPrivate,{ 1, 1, 1, 0 } },
    { "hc",     KernelArg::PointerHwConst,  { 1, 1, 1, 0 } }
};
static const uint BufTypeTotal = sizeof(BufType) / sizeof(BufDataConst);

//! The mathlib constants for each kernel execution
static const float  MathLibConst[4] = { 0.0f, 0.5f, 1.0f, 2.0f };

bool
expect(const std::string& str, size_t* pos, const std::string& sym)
{
    bool    result = true;
    uint    i;

    if (*pos == std::string::npos) {
        return false;
    }

    // Check if we have expected symbols
    for (i = 0; i < sym.size(); ++i) {
        char    deb = str[*pos + i];
        if (deb != sym[i]) {
            result = false;
            break;
        }
    }

    if (result) *pos += i;

    return result;
}

bool
getword(const std::string& str, size_t* pos, char* sym)
{
    if (*pos == std::string::npos) {
        return false;
    }

    *pos = str.find_first_not_of(" \n\r", *pos);
    size_t  posEnd  = str.find_first_of(": \n\r;", *pos);
    size_t  count   = posEnd - *pos;

    if (count != 0) {
        if (!str.copy(sym, count, *pos)) {
            return false;
        }
    }
    sym[count] = 0;
    *pos = posEnd + 1;
    return true;
}

bool
getstring(const std::string& str, size_t* pos, std::string* out)
{
    if (*pos == std::string::npos) {
        return false;
    }

    *pos = str.find_first_not_of(" \n\r", *pos);
    size_t  posEnd  = str.find_first_of(":\n\r;", *pos);
    size_t  count   = posEnd - *pos;

    char* sym = new char[count + 1];
    if (count != 0) {
        if (!str.copy(sym, count, *pos)) {
            return false;
        }
    }
    sym[count] = 0;
    *out = sym;
    delete [] sym;
    *pos = posEnd + 1;
    return true;
}

bool
getuint(const std::string&  str, size_t* pos, uint* val)
{
    if (*pos == std::string::npos) {
        return false;
    }

    char    sym[16];
    *pos = str.find_first_not_of(" \n\r", *pos);
    size_t  posEnd = str.find_first_of(": \n\r;)", *pos);

    if (!str.copy(sym, posEnd - *pos, *pos)) {
        return false;
    }
    *val = 0;
    for (size_t i = 0; i < (posEnd - *pos); ++i) {
        *val = (*val * 10) + (sym[i] - 0x30);
    }
    *pos = posEnd + 1;
    return true;
}

bool
getuintHex(const std::string&  str, size_t* pos, uint* val)
{
    if (*pos == std::string::npos) {
        return false;
    }

    char    sym[16];
    *pos = str.find_first_not_of(" \n\r", *pos);
    size_t  posEnd = str.find_first_of(": \n\r;)", *pos);

    if (!str.copy(sym, posEnd - *pos, *pos)) {
        return false;
    }
    *val = 0;
    for (size_t i = 0; i < (posEnd - *pos); ++i) {
        if (sym[i] >= '0' && sym[i] <= 'F') {
            *val = (*val * 16) + (sym[i] - '0');
        }
        else if (sym[i] >= 'a' && sym[i] <= 'f') {
            *val = (*val * 16) + (sym[i] - 'a' + 10);
        }
        else {
            return false;
        }
    }
    *pos = posEnd + 1;
    return true;
}

bool
getuint64Hex(const std::string&  str, size_t* pos, uint64_t* val)
{
    if (*pos == std::string::npos) {
        return false;
    }

    char    sym[16];
    *pos = str.find_first_not_of(" \n\r", *pos);
    size_t  posEnd = str.find_first_of(": \n\r;)", *pos);

    if (!str.copy(sym, posEnd - *pos, *pos)) {
        return false;
    }
    *val = 0;
    for (size_t i = 0; i < (posEnd - *pos); ++i) {
        if (sym[i] >= '0' && sym[i] <= 'F') {
            *val = (*val * 16) + (sym[i] - '0');
        }
        else if (sym[i] >= 'a' && sym[i] <= 'f') {
            *val = (*val * 16) + (sym[i] - 'a' + 10);
        }
        else {
            return false;
        }
    }
    *pos = posEnd + 1;
    return true;
}

void
intToStr(size_t value, char* str, size_t size)
{
    static const uint MaxDigits32bit = 10;
    char    result[MaxDigits32bit];
    uint    idx = MaxDigits32bit;

    do {
        idx--;
        result[idx] = static_cast<char>((value % 10) + '0');
        value /= 10;
    } while ((value != 0) && (idx > 0));
    size_t len = MaxDigits32bit - idx;
    size_t n = std::min<size_t>(len, size-1);
    memcpy(str, &result[idx], n);
    str[n] = '\0';
}

//! Default destructor
CalImageReference::~CalImageReference()
{
    // Free CAL image
    free(image_);
}

KernelArg::KernelArg()
    : type_(KernelArg::None)
    , size_(0)
    , cbIdx_(0)
    , cbPos_(0)
    , index_(0)
    , alignment_(1)
    , dataType_(KernelArg::None)
{
    name_   = "";
    buf_    = "";
    memory_.value_ = 0;
    typeQualifier_ = CL_KERNEL_ARG_TYPE_NONE;
}

KernelArg::KernelArg(const KernelArg& data)
{
    // Fill the new object
    *this = data;
}

KernelArg&
KernelArg::operator=(const KernelArg& data)
{
    // Fill the fields of the current object
    name_   = data.name_;
    typeName_ = data.typeName_;
    typeQualifier_ = data.typeQualifier_;
    type_   = data.type_;
    size_   = data.size_;
    cbIdx_  = data.cbIdx_;
    cbPos_  = data.cbPos_;
    buf_    = data.buf_;
    index_  = data.index_;
    alignment_  = data.alignment_;
    dataType_   = data.dataType_;
    memory_.value_  = data.memory_.value_;
    return *this;
}

bool
KernelArg::isCbNeeded() const
{
    //! \note not a safe way
    bool result = ((type_ > None) && (type_ < Sampler)) ? true : false;
    if ((type_ == Sampler) && (location_ == 0)) {
        // Sampler is defined outside the kernel
        result = true;
    }
    return result;
}

size_t
KernelArg::size(bool gpuLayer)const
{
    switch (type_) {
    case None:
        return 0;
    case PointerConst:
    case PointerHwConst:
    case PointerGlobal:
        return (gpuLayer) ? sizeof(uint32_t) * size_ :  sizeof(cl_mem);
    case Image1D:
    case Image2D:
    case Image3D:
    case Image1DB:
    case Image1DA:
    case Image2DA:
        return (gpuLayer) ? sizeof(ImageConstants) :  sizeof(cl_mem);
    case Sampler:
        return (gpuLayer) ? 2 * sizeof(uint32_t) :  sizeof(cl_sampler);
    case Counter:
        return (gpuLayer) ? 0 : sizeof(cl_mem);
    case PointerLocal:
    case PointerHwLocal:
        return (gpuLayer) ? sizeof(uint32_t) * size_ :  0;
    case PointerPrivate:
    case PointerHwPrivate:
        return (gpuLayer) ? sizeof(uint32_t) * size_ :  0;
    case Float:
        return sizeof(cl_float) * amd::nextPowerOfTwo(size_);
    case Double:
        return sizeof(cl_double) * amd::nextPowerOfTwo(size_);
    case Char:
    case UChar:
        return sizeof(cl_char) * amd::nextPowerOfTwo(size_);
    case Short:
    case UShort:
        return sizeof(cl_short) * amd::nextPowerOfTwo(size_);
    case Int:
    case UInt:
        return sizeof(cl_uint) * amd::nextPowerOfTwo(size_);
    case Long:
    case ULong:
        return sizeof(cl_ulong) * amd::nextPowerOfTwo(size_);
    case Struct:
    case Union:
        return (gpuLayer) ? amd::alignUp(size_, 16) : size_;
    default:
        return 0;
    }
}

cl_kernel_arg_address_qualifier
KernelArg::addressQualifier() const
{
    switch (type_) {
    case PointerGlobal:
    case Image1D:
    case Image2D:
    case Image3D:
    case Image1DB:
    case Image1DA:
    case Image2DA:
        return CL_KERNEL_ARG_ADDRESS_GLOBAL;
    case PointerLocal:
    case PointerHwLocal:
        return CL_KERNEL_ARG_ADDRESS_LOCAL;
    case PointerConst:
    case PointerHwConst:
        return CL_KERNEL_ARG_ADDRESS_CONSTANT;
    default:
        return CL_KERNEL_ARG_ADDRESS_PRIVATE;
    }
}

cl_kernel_arg_access_qualifier
KernelArg::accessQualifier() const
{
    switch (type_) {
    case Image1D:
    case Image2D:
    case Image3D:
    case Image1DB:
    case Image1DA:
    case Image2DA:
        if (memory_.readOnly_) {
            return CL_KERNEL_ARG_ACCESS_READ_ONLY;
        }
        else if (memory_.writeOnly_) {
            return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
        }
        else if (memory_.readWrite_) {
            return CL_KERNEL_ARG_ACCESS_READ_WRITE;
        }
        // Fall through ...
    default:
        return CL_KERNEL_ARG_ACCESS_NONE;
    }
}

//! temporary solution for the vectors handling in compiler
size_t
KernelArg::specialVector() const
{
    if (size_ > VectorSizeLimit) {
        switch (type_) {
        case Char:
        case UChar:
            return sizeof(cl_char);
        case Short:
        case UShort:
            return sizeof(cl_short);
        default:
            return 0;
        }
    }
    return 0;
}
clk_value_type_t
KernelArg::type()const
{
    switch (type_) {
    case PointerGlobal:
    case PointerLocal:
    case PointerHwLocal:
    case PointerConst:
    case PointerHwConst:
    case Image1D:
    case Image2D:
    case Image3D:
    case Image1DB:
    case Image1DA:
    case Image2DA:
    case Counter:
        return T_POINTER;
    case Float:
        return T_FLOAT;
    case Double:
        return T_DOUBLE;
    case Char:
    case UChar:
        return T_CHAR;
    case Short:
    case UShort:
        return T_SHORT;
    case Int:
        return T_INT;
    case UInt:
        //! \note No UINT type
        return T_INT;
    case Long:
        return T_LONG;
    case ULong:
        //! \note No ULONG type
        return T_LONG;
    case Struct:
    case Union:
        //! @todo What should we report?
        return T_CHAR;
    case Sampler:
        return T_SAMPLER;
    case PointerPrivate:
    case PointerHwPrivate:
    case None:
    default:
        return T_VOID;
    }
}

NullKernel::NullKernel(
    const std::string&  name,
    const NullDevice&   gpuNullDev,
    const NullProgram&  nullprog)
    : device::Kernel(name)
    , buildError_(CL_BUILD_PROGRAM_FAILURE)
    , gpuDev_(gpuNullDev)
    , prog_(nullprog)
    , calRef_(NULL)
    , internal_(false)
    , flags_(0)
    , cbSizes_(NULL)
    , numCb_(0)
    , rwAttributes_(false)
    , instructionCnt_(4)
{
    // UAV raw index will be detected
    uavRaw_ = UavIdUndefined;
    // Initialize UAV arena index(should be 8)
    uavArena_ = VirtualGPU::UavArena;
    // CB index will be detected
    cbId_ = UavIdUndefined;
    // Printf index will be detected
    printfId_ = UavIdUndefined;
}

NullKernel::~NullKernel()
{
    uint idx;

    if (calRef_ == NULL) {
        return;
    }
    calRef_->release();

    // Destroy all kernel arguments
    for (idx = 0; idx < arguments_.size(); ++idx) {
        delete arguments_[idx];
    }
    arguments_.clear();

    // Destroy all sampler kernel arguments
    for (idx = 0; idx < intSamplers_.size(); ++idx) {
        delete intSamplers_[idx];
    }
    intSamplers_.clear();
}


static int
scComponentToArrayIndex(E_SC_COMPONENT dstComp)
{
    switch (dstComp) {
        case SC_COMPONENT_X:
            return 0;
        case SC_COMPONENT_Y:
            return 1;
        case SC_COMPONENT_Z:
            return 2;
        case SC_COMPONENT_W:
            return 3;
    }

    return 0;
}

static void
addLoopConst(const SC_HWSHADER* shader, AMUabiAddEncoding& encoding)
{
    uint count = shader->dep.NumIntrlIConstants;
    encoding.litConstsCount = shader->dep.NumIntrlIConstants;

    // only suppport loop consts (int consts)
    if (count) {
        AMUabiLiteralConst* allocatedconsts = encoding.litConsts;
        memset(allocatedconsts, 0, count * sizeof(AMUabiLiteralConst));
        uint usedConsts = 0;
        for (uint i = 0; i < count; ++i) {
            uint    currentConst;
            for (currentConst = 0; currentConst < usedConsts; ++currentConst) {
                if (allocatedconsts[currentConst].addr ==
                    HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber) {
                    break;
                }
            }
            if (currentConst == usedConsts) {
                usedConsts++;
                assert(usedConsts <= count);
            }
            allocatedconsts[currentConst].addr = HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber;
            allocatedconsts[currentConst].type = AMU_ABI_INT32;
            allocatedconsts[currentConst].value.
                int32[scComponentToArrayIndex(HWSHADER_Get(shader, dep.IntrlIConstants)[i].eDstComp)] =
                HWSHADER_Get(shader, dep.IntrlIConstants)[i].iValue;
        }
        encoding.litConstsCount = usedConsts;
    }
}

bool
NullKernel::create(
    const std::string& code,
    const std::string&  metadata,
    const void* binaryCode,
    size_t      binarySize)
{
    std::auto_ptr<uint> uavRefCount (new uint[MaxUavArguments]);
    if (NULL == uavRefCount.get()) {
        return false;
    }

    // Set all ref counts to 0
    memset(uavRefCount.get(), 0, sizeof(uavRefCount.get()[0]) * MaxUavArguments);

    // parse the metadata fields
    if (!parseArguments(metadata, uavRefCount.get())) {
        return false;
    }

    CALimage    calImage;
    // Save source if DEBUG build
#if DEBUG
    ilSource_ = code;
#endif // DEBUG

    amd::option::Options *options = nullProg().getCompilerOptions();
    internal_ = options->oVariables->clInternalKernel;

    if ((binaryCode == NULL) && (binarySize == 0) && !code.empty()) {
        acl_error err;
        std::string arch = GPU_TARGET_INFO_ARCH;
        if (nullDev().settings().use64BitPtr_) {
            arch += "64";
        }
        aclTargetInfo info = aclGetTargetInfo(
            arch.c_str(), nullDev().hwInfo()->targetName_, &err);
        if (err != ACL_SUCCESS) {
            LogWarning("aclGetTargetInfo failed");
            return false;
        }

        aclBinaryOptions binOpts = {0};
        binOpts.struct_size = sizeof(binOpts);
        binOpts.elfclass = info.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32;
        binOpts.bitness = ELFDATA2LSB;
        binOpts.alloc = &::malloc;
        binOpts.dealloc = &::free;

        aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &info, &binOpts, &err);
        if (err != ACL_SUCCESS) {
            LogWarning("aclBinaryInit failed");
            return false;
        }

        if (ACL_SUCCESS != aclInsertSection(nullDev().compiler(), bin,
                code.data(), code.size(), aclSOURCE)) {
            LogWarning("aclInsertSection failed");
            aclBinaryFini(bin);
            return false;
        }

        amd::option::Options* Opts = (amd::option::Options*)bin->options;
        // temporary solution to synchronize buildNo between runtime and complib
        // until we move runtime inside complib
        Opts->setBuildNo(options->getBuildNo());

        // pass kernel name to compiler
        Opts->setCurrKernelName(name().c_str());

        err = aclCompile(nullDev().compiler(), bin, options->origOptionStr.c_str(),
            ACL_TYPE_AMDIL_TEXT, ACL_TYPE_ISA, NULL);

        buildLog_ += aclGetCompilerLog(nullDev().compiler());

        if (err != ACL_SUCCESS) {
            LogWarning("aclCompile failed");
            aclBinaryFini(bin);
            return false;
        }
        if (!options->oVariables->BinEXE) {
            // Early exit if binary doesn't contain EXE
            aclBinaryFini(bin);
            return true;
        }
        size_t len;
        const void* isa = aclExtractSection(nullDev().compiler(), bin,
            &len, aclTEXT, &err);
        if (err != ACL_SUCCESS) {
            LogWarning("aclExtractSection failed");
            aclBinaryFini(bin);
            return false;
        }

        uint    calImageSize;
        if (!createMultiBinary(
            &calImageSize, reinterpret_cast<void**>(&calImage), isa)) {
            LogWarning("initSrcEncoding failed");
            aclBinaryFini(bin);
            return false;
        }

        aclBinaryFini(bin);
    }
    else if ((binaryCode != NULL) && (binarySize != 0)) {
        uint size = 0;
        if (!amuABIMultiBinaryGetSize(&size, const_cast<void*>(binaryCode))
                || size > binarySize) {
            buildLog_ += "Invalid binary image";
            LogError("amuABIMultiBinaryGetSize failed!");
            return false;
        }

        calImage = static_cast<CALimage>(malloc(size));
        ::memcpy(calImage, binaryCode, size);
    }
    else {
        LogError("Incorrect initialization parameters!");
        return false;
    }

    calRef_ = new CalImageReference(calImage);
    if (calRef_ == NULL) {
        LogError("Memory allocation failure!");
        // Free CAL image
        free(calImage);
        return false;
    }

    CALfuncInfo     calFuncInfo;

    // Get kernel compiled information
    getFuncInfoFromImage(calImage, &calFuncInfo);
    if (calFuncInfo.maxScratchRegsNeeded > 0) {
        LogPrintfInfo("%s kernel has register spilling."
            "Lower performance is expected.", name().c_str());
    }

    workGroupInfo_.scratchRegs_         = calFuncInfo.maxScratchRegsNeeded;
    workGroupInfo_.wavefrontPerSIMD_    = calFuncInfo.numWavefrontPerSIMD;
    workGroupInfo_.wavefrontSize_       = calFuncInfo.wavefrontSize;
    workGroupInfo_.availableGPRs_       = calFuncInfo.numGPRsAvailable;
    workGroupInfo_.usedGPRs_            = calFuncInfo.numGPRsUsed;
    workGroupInfo_.availableSGPRs_      = calFuncInfo.numSGPRsAvailable;
    workGroupInfo_.usedSGPRs_           = calFuncInfo.numSGPRsUsed;
    workGroupInfo_.availableVGPRs_      = calFuncInfo.numVGPRsAvailable;
    workGroupInfo_.usedVGPRs_           = calFuncInfo.numVGPRsUsed;
    workGroupInfo_.availableLDSSize_    = calFuncInfo.LDSSizeAvailable;
    workGroupInfo_.usedLDSSize_         = calFuncInfo.LDSSizeUsed;
    workGroupInfo_.availableStackSize_  = calFuncInfo.stackSizeAvailable;
    workGroupInfo_.usedStackSize_       = calFuncInfo.stackSizeUsed;

    device::Kernel::parameters_t params;
    if (!createSignature(params)) {
        return false;
    }

    return true;
}

size_t
NullKernel::getCalBinarySize() const
{
    CALuint imageSize;
    if (!amuABIMultiBinaryGetSize(&imageSize, calImage())) {
        LogError("Failed to get the image size!");
        return 0;
    }
    return static_cast<size_t>(imageSize);
}

bool
NullKernel::getCalBinary(void*  binary, size_t  size) const
{
    uint calImageSize = 0;
    if (!amuABIMultiBinaryGetSize(&calImageSize, calImage())
            || size < calImageSize) {
        LogError("CAL failed to save the kernel binary!");
        return false;
    }
    ::memcpy(binary, calImage(), calImageSize);

    return true;
}

bool
Kernel::create(
    const std::string& code,
    const std::string&  metadata,
    const void* binaryCode,
    size_t      binarySize)
{
    setPreferredSizeMultiple(dev().getAttribs().wavefrontSize);

    if (!NullKernel::create(code, metadata, binaryCode, binarySize)) {
        return false;
    }

    // initialize constant buffer sizes
    if (!initConstBuffers()) {
        return false;
    }

    // Initialize the kernel parameters
    bool    result = initParameters();

    if (!dev().heap()->isVirtual()) {
        amd::option::Options *options = nullProg().getCompilerOptions();
        // @todo Remove this. This is a hack for no VM mode
        if (!options->oVariables->EnableDumpKernel) {
            if (!name().compare(BlitName[KernelBlitManager::BlitCopyImageToBuffer]) ||
                !name().compare(BlitName[KernelBlitManager::BlitCopyBufferToImage])) {
                blitKernelHack_ = true;
            }
        }
    }

    if (result) {
        buildError_ = CL_SUCCESS;
    }
    else {
        result = false;
    }

    return result;
}

Kernel::Kernel(
    const std::string&  name,
    const Device&       gpuDev,
    const Program&      prog,
    const InitData*     initData)
    : NullKernel(name, gpuDev, prog)
    , blitKernelHack_(false)
{
    hwPrivateSize_ = 0;
    if (NULL != initData) {
        flags_          = initData->flags_;
        hwPrivateSize_  = initData->hwPrivateSize_;
        hwLocalSize_    = initData->hwLocalSize_;
    }
    // Workgroup info private memory size
    workGroupInfo_.privateMemSize_ = hwPrivateSize_;
    hsa_ = false;
}

Kernel::~Kernel()
{
    if (calRef_ == NULL) {
        return;
    }

    {
        Device::ScopedLockVgpus lock(dev());

        // Release all virtual image objects on all virtual GPUs
        for (uint idx = 0; idx < dev().vgpus().size(); ++idx) {
            dev().vgpus()[idx]->releaseKernel(calImage());
        }
    }

    if (0 != numCb_) {
        delete [] cbSizes_;
    }
}

const Device&
Kernel::dev() const
{
    return reinterpret_cast<const Device&>(gpuDev_);
}

const Program&
Kernel::prog() const
{
    return reinterpret_cast<const Program&>(prog_);
}

bool
NullKernel::createMultiBinary(uint* imageSize, void** image, const void* isa)
{
    const SC_HWSHADER* shader = reinterpret_cast<const SC_HWSHADER*>(isa);

    bool result = false;
    AMUabiAddEncoding   encoding;
    memset(&encoding, 0, sizeof(AMUabiAddEncoding));

    size_t  allocSize =
        sizeof(uint) * MaxReadImage +
        sizeof(CALUavEntry) * MaxUavArguments +
        sizeof(CALSamplerMapEntry) * MaxSamplers +
        sizeof(CALConstantBufferMask) * MaxConstBuffers +
        sizeof(AMUabiLiteralConst) * shader->dep.NumIntrlIConstants;
    char* tmpMem = new char[allocSize];
    if (tmpMem == NULL) {
        LogError("Error allocating memory");
        return false;
    }

    CalcPtr(encoding.inputs, tmpMem, 0, 0);
    CalcPtr(encoding.uav, encoding.inputs, sizeof(uint), MaxReadImage);
    CalcPtr(encoding.inputSamplerMaps, encoding.uav, sizeof(CALUavEntry), MaxUavArguments);
    CalcPtr(encoding.constBuffers, encoding.inputSamplerMaps, sizeof(CALSamplerMapEntry), MaxSamplers);
    if (shader->dep.NumIntrlIConstants != 0) {
        CalcPtr(encoding.litConsts, encoding.constBuffers, sizeof(CALConstantBufferMask), MaxConstBuffers);
    }
    AMUabiMultiBinary amuBinary;
    amuABIMultiBinaryCreate(&amuBinary);

    if (nullDev().settings().siPlus_) {
        result = siCreateHwInfo(shader, encoding);
    }
    else {
        result = r800CreateHwInfo(shader, encoding);
    }
    if (!result) {
        delete [] tmpMem;
        LogWarning("Error Creating program info");
        return false;
    }

    addLoopConst(shader, encoding);

    unsigned int outputCount=0, condOut=0, earlyExit=0, globalCount=0, persistentCount=0;
    unsigned int symbolCount=0;
    CALOutputEntry* outputs=0;
    unsigned int* globalBuffers=0;
    unsigned int* persistentBuffers=0;
    AMUabiUserSymbol* symbols=0;

    CALSamplerMapEntry*     inputSamplers = encoding.inputSamplerMaps;
    CALConstantBufferMask*  constBuffers = encoding.constBuffers;
    uint*                   inputResources = encoding.inputs;
    CALUavEntry*            uav = encoding.uav;

    uint inputSamplerCount = samplerSize();
    for (uint i = 0; i < inputSamplerCount; ++i) {
        inputSamplers[i].resource = 0;
        inputSamplers[i].sampler = sampler(i)->index_;
    }

    uint constBufferCount = 2;

    constBuffers[0].index = 0;
    constBuffers[1].index = 1;

    uint inputResourceCount = 0;

    uint uavCount = 0;
    bool globalBound = false;
    bool cbBound = false;
    bool printfBound = false;
    for (uint i = 0; i < arguments_.size(); ++i) {
        const KernelArg*  arg = argument(i);
        switch (arg->type_) {
        case KernelArg::PointerConst:
        case KernelArg::PointerHwConst:
            constBuffers[constBufferCount++].index = arg->index_;
            break;
        case KernelArg::PointerGlobal:
            if (nullDev().settings().useAliases_) {
                if (!globalBound) {
                    uav[uavCount].offset = uavRaw_;
                    uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
                    uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
                    uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
                    uavCount++;
                    if (uavArena_ != 0) {
                        uav[uavCount].offset = uavArena_;
                        uav[uavCount].type = AMU_ABI_UAV_TYPE_ARENA;
                        uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
                        uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
                        uavCount++;
                    }
                }
                globalBound = true;
            }
            else {
                uav[uavCount].offset = arg->index_;
                uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
                uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
                uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
                uavCount++;
            }
            break;
        case KernelArg::ConstBufId:
            if (!cbBound) {
                uav[uavCount].offset = cbId_;
                uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
                uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
                uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
                uavCount++;
            }
            cbBound = true;
            break;
        case KernelArg::PrintfBufId:
            if (!printfBound) {
                uav[uavCount].offset = printfId_;
                uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
                uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
                uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
                uavCount++;
            }
            printfBound = true;
            break;
        case KernelArg::UavId:
            if (!nullDev().settings().useAliases_ &&
                (UavIdUndefined != uavRaw_) &&
                !(flags() & PrintfOutput)) {
                uav[uavCount].offset = arg->index_;
                uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
                uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
                uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
                uavCount++;
            }
            else {
                if (UavIdUndefined != uavRaw_) {
                    uav[uavCount].offset = uavRaw_;
                    uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
                    uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
                    uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
                    uavCount++;
                }
                if (uavArena_ != 0) {
                    uav[uavCount].offset = uavArena_;
                    uav[uavCount].type = AMU_ABI_UAV_TYPE_ARENA;
                    uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
                    uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
                    uavCount++;
                }
            }
            break;
        case KernelArg::Sampler:
            inputSamplers[inputSamplerCount].resource = 0;
            inputSamplers[inputSamplerCount].sampler = arg->index_;
            inputSamplerCount++;
            break;
        case KernelArg::Image1D:
        case KernelArg::Image2D:
        case KernelArg::Image3D:
        case KernelArg::Image1DB:
        case KernelArg::Image1DA:
        case KernelArg::Image2DA:
            if (arg->memory_.readOnly_) {
                inputResources[inputResourceCount++] = arg->index_;
            }
            else {
                uav[uavCount].offset = arg->index_;
                uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPED;
                uav[uavCount].dimension = AMU_ABI_DIM_2D;
                uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
                uavCount++;
            }
            break;
        default:
            break;
        }
    }

    for (uint i = 0; i < nullProg().glbCb().size(); ++i) {
        constBuffers[constBufferCount++].index = nullProg().glbCb()[i];
    }

    encoding.machine                   = nullDev().hwInfo()->machine_;
    encoding.type                      = ED_ATI_CAL_TYPE_COMPUTE;
    encoding.inputCount                = inputResourceCount;
    encoding.outputCount               = outputCount;
    encoding.outputs                   = outputs;
    encoding.condOut                   = condOut;
    encoding.earlyExit                 = earlyExit;
    encoding.globalBuffersCount        = globalCount;
    encoding.globalBuffers             = globalBuffers;
    encoding.persistentBuffersCount    = persistentCount;
    encoding.persistentBuffers         = persistentBuffers;
    encoding.constBuffersCount         = constBufferCount;
    encoding.inputSamplerMapCount      = inputSamplerCount;
    encoding.symbolsCount              = symbolCount;
    encoding.symbols                   = symbols;
    encoding.uavCount                  = uavCount;

    amuABIMultiBinaryAddEncoding(amuBinary, &encoding);

    uint success = amuABIMultiBinaryPack(imageSize, image, amuBinary);

    amuABIMultiBinaryDestroy(amuBinary);

    delete [] tmpMem;
    delete [] encoding.progInfos;

    return (success == 0) ? false : true;
}

void
Kernel::findLocalWorkSize(
    size_t              workDim,
    const amd::NDRange& gblWorkSize,
    amd::NDRange& lclWorkSize) const
{
    // Initialize the default workgoup info
    // Check if the kernel has the compiled sizes
    if (workGroupInfo()->compileSize_[0] == 0) {
        // Find the default local workgroup size, if it wasn't specified
        if (lclWorkSize[0] == 0) {
            size_t  thrPerGrp;
            bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
            bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
                                  !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
            bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
                                  !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
                                  !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);

            bool overrideSet = ((workDim == 1) && b1DOverrideSet) ||
                               ((workDim == 2) && b2DOverrideSet) ||
                               ((workDim == 3) && b3DOverrideSet);
            if (!overrideSet) {
                // Find threads per group
                thrPerGrp = workGroupInfo()->size_;

                // Check if kernel uses images
                if ((flags() & ImageEnable) &&
                    // and thread group is a multiple value of wavefronts
                    ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
                    // and it's 2 or 3-dimensional workload
                    (workDim > 1) &&
                     ((dev().settings().partialDispatch_) ||
                       (((gblWorkSize[0] % 16) == 0) &&
                        ((gblWorkSize[1] % 16) == 0)))) {
                    // Use 8x8 workgroup size if kernel has image writes
                    if ((flags() & ImageWrite) ||
                        (thrPerGrp != nullDev().info().maxWorkGroupSize_)) {
                        lclWorkSize[0] = 8;
                        lclWorkSize[1] = 8;
                    }
                    else {
                        lclWorkSize[0] = 16;
                        lclWorkSize[1] = 16;
                    }
                    if (workDim == 3) {
                        lclWorkSize[2] = 1;
                    }
                }
                else {
                    size_t  tmp = thrPerGrp;
                    // Split the local workgroup into the most efficient way
                    for (uint d = 0; d < workDim; ++d) {
                        size_t  div = tmp;
                        for (; (gblWorkSize[d] % div) != 0; div--);
                        lclWorkSize[d] = div;
                        tmp /= div;
                    }

                    // Check if partial dispatch is enabled and
                    if (dev().settings().partialDispatch_ &&
                         // we couldn't find optimal workload
                        (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
                        size_t  maxSize = 0;
                        size_t  maxDim = 0;
                        for (uint d = 0; d < workDim; ++d) {
                            if (maxSize < gblWorkSize[d]) {
                                maxSize = gblWorkSize[d];
                                maxDim = d;
                            }
                        }
                        // Check if a local workgroup has the most optimal size
                        if (thrPerGrp > maxSize) {
                            thrPerGrp = maxSize;
                        }
                        lclWorkSize[maxDim] = thrPerGrp;
                        for (uint d = 0; d < workDim; ++d) {
                            if (d != maxDim) {
                                lclWorkSize[d] = 1;
                            }
                        }
                    }
                }
            }
            else {
                // Use overrides when app doesn't provide workgroup dimensions
                if (workDim == 1) {
                        lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
                }
                else if (workDim == 2) {
                        lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
                        lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
                }
                else if (workDim == 3) {
                        lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
                        lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
                        lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
                }
                else
                {
                    assert(0 && "Invalid workDim!");
                }
            }
        }
    }
    else {
        for (uint d = 0; d < workDim; ++d) {
            lclWorkSize[d] = workGroupInfo()->compileSize_[d];
        }
    }
}

void
Kernel::setupProgramGrid(
    VirtualGPU&         gpu,
    size_t              workDim,
    const amd::NDRange& glbWorkOffset,
    const amd::NDRange& gblWorkSize,
    amd::NDRange&       lclWorkSize,
    const amd::NDRange& groupOffset,
    const amd::NDRange& glbWorkOffsetOrg,
    const amd::NDRange& glbWorkSizeOrg
    ) const
{
    // ABI is always in CB0
    address cbBuf = gpu.cb(0)->sysMemCopy();
    uint*   pGlobalSize = reinterpret_cast<uint*>
        (cbBuf + GlobalWorkitemOffset * ConstBuffer::VectorSize);
    uint*   pLocalSize = reinterpret_cast<uint*>
        (cbBuf + LocalWorkitemOffset * ConstBuffer::VectorSize);
    uint*   pNumGroups = reinterpret_cast<uint*>
        (cbBuf + GroupsOffset * ConstBuffer::VectorSize);
    uint*   pGlobalOffset = reinterpret_cast<uint*>
        (cbBuf + GlobalWorkOffsetOffset * ConstBuffer::VectorSize);
    uint*   pGroupOffset = reinterpret_cast<uint*>
        (cbBuf + GroupWorkOffsetOffset * ConstBuffer::VectorSize);
    uint32_t*   debugInfo = reinterpret_cast<uint*>
        (cbBuf + DebugOffset * ConstBuffer::VectorSize);
    uint*   pNDRangeGlobalOffset = reinterpret_cast<uint*>
        (cbBuf + NDRangeGlobalWorkOffsetOffset * ConstBuffer::VectorSize);

    // Check for 64-bit metadata
    uint glbABIShift = (abi64Bit()) ? 1 : 0;

    ProgramGrid* progGrid = &gpu.cal_.progGrid_;

    // Finds local workgroup size
    findLocalWorkSize(workDim, gblWorkSize, lclWorkSize);

    // Initialize the execution grid block and size/offset
    pGlobalSize[0] = pGlobalSize[1] = pGlobalSize[2] = 1;
    pGlobalSize[3] = static_cast<uint>(workDim);

    pLocalSize[0] = pLocalSize[1] = pLocalSize[2] = 1;
    pLocalSize[3] = 0;

    pNumGroups[0] = pNumGroups[1] = pNumGroups[2] = 1;
    pNumGroups[3] = 0;

    pGlobalOffset[2] = pGlobalOffset[1] = pGlobalOffset[0] = 0;
    pGroupOffset[2] = pGroupOffset[1] = pGroupOffset[0] = 0;

    progGrid->gridBlock.width  =
    progGrid->gridBlock.height =
    progGrid->gridBlock.depth  = 1;

    progGrid->gridSize.width   =
    progGrid->gridSize.height  =
    progGrid->gridSize.depth   = 1;

    progGrid->partialGridBlock.width    =
    progGrid->partialGridBlock.height   =
    progGrid->partialGridBlock.depth    = 1;

    bool partialGrid = false;

    // Fill the right values, based on the application request
    switch (workDim) {
    case 3:
        pLocalSize[2] =
        progGrid->gridBlock.depth = static_cast<CALuint>(lclWorkSize[2]);

        pGlobalSize[2] = static_cast<CALuint>(glbWorkSizeOrg[2]);
        progGrid->gridSize.depth  = static_cast<CALuint>(gblWorkSize[2]);
        progGrid->gridSize.depth  /= progGrid->gridBlock.depth;
        pNumGroups[2] = pGlobalSize[2] / progGrid->gridBlock.depth;

        pGlobalOffset[2] = glbWorkOffset[2];
        pGroupOffset[2] = groupOffset[2];
        pNDRangeGlobalOffset[2 + glbABIShift] = glbWorkOffsetOrg[2];

        if (dev().settings().partialDispatch_) {
            // Check if partial workgroup dispatch is required
            progGrid->partialGridBlock.depth = gblWorkSize[2] % lclWorkSize[2];
            if (progGrid->partialGridBlock.depth != 0) {
                partialGrid = true;
                // Increment the number of groups
                progGrid->gridSize.depth++;
                pNumGroups[2]++;
            }
            else {
                progGrid->partialGridBlock.depth = lclWorkSize[2];
            }
        }
        // Fall through to fill 2D and 1D dimensions...
    case 2:
        pLocalSize[1] =
        progGrid->gridBlock.height = static_cast<CALuint>(lclWorkSize[1]);

        pGlobalSize[1] = static_cast<CALuint>(glbWorkSizeOrg[1]);
        progGrid->gridSize.height  = static_cast<CALuint>(gblWorkSize[1]);
        progGrid->gridSize.height  /= progGrid->gridBlock.height;
        pNumGroups[1] = pGlobalSize[1] / progGrid->gridBlock.height;

        pGlobalOffset[1] = glbWorkOffset[1];
        pGroupOffset[1] = groupOffset[1];
        pNDRangeGlobalOffset[1 + glbABIShift] = glbWorkOffsetOrg[1];

        if (dev().settings().partialDispatch_) {
            // Check if partial workgroup dispatch is required
            progGrid->partialGridBlock.height = gblWorkSize[1] % lclWorkSize[1];
            if (progGrid->partialGridBlock.height != 0) {
                partialGrid = true;
                // Increment the number of groups
                progGrid->gridSize.height++;
                pNumGroups[1]++;
            }
            else {
                progGrid->partialGridBlock.height = lclWorkSize[1];
            }
        }
        // Fall through to fill 1D dimension...
    case 1:
        pLocalSize[0] =
        progGrid->gridBlock.width = static_cast<CALuint>(lclWorkSize[0]);

        pGlobalSize[0] = static_cast<CALuint>(glbWorkSizeOrg[0]);
        progGrid->gridSize.width   = static_cast<CALuint>(gblWorkSize[0]);
        progGrid->gridSize.width   /= progGrid->gridBlock.width;
        pNumGroups[0] = pGlobalSize[0] / progGrid->gridBlock.width;

        pGlobalOffset[0] = glbWorkOffset[0];
        pGroupOffset[0] = groupOffset[0];
        pNDRangeGlobalOffset[0 + glbABIShift] = glbWorkOffsetOrg[0];

        if (dev().settings().partialDispatch_) {
            // Check if partial workgroup dispatch is required
            progGrid->partialGridBlock.width = gblWorkSize[0] % lclWorkSize[0];
            if (progGrid->partialGridBlock.width != 0) {
                partialGrid = true;
                // Increment the number of groups
                progGrid->gridSize.width++;
                pNumGroups[0]++;
            }
            else {
                progGrid->partialGridBlock.width = lclWorkSize[0];
            }
        }
        break;
    default:
        LogWarning("Wrong dimensions. Force to 1x1x1!");
        break;
    }

    if (!partialGrid) {
        progGrid->partialGridBlock.width =
        progGrid->partialGridBlock.height =
        progGrid->partialGridBlock.depth = 0;
    }

    // Calculate the total number of workitems and workgroups
    pGlobalOffset[3] = pGroupOffset[3] = 1;
    for (uint i = 0; i < workDim; ++i) {
        pGlobalOffset[3] *= pGlobalOffset[i];
        pGroupOffset[3] *= pGroupOffset[i];
    }

    // Setup debug output buffer (if printf is active)
    if (flags() & PrintfOutput) {
        if (abi64Bit()) {
            // Setup the debug info in constant buffer
            reinterpret_cast<uint64_t*>(debugInfo)[1] =
                gpu.printfDbg().bufOffset();
            // Size in DWORDs
            debugInfo[4] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
            debugInfo[4] /= sizeof(uint32_t);
        }
        else {
            // Setup the debug info in constant buffer
            debugInfo[1] = static_cast<uint32_t>(gpu.printfDbg().bufOffset());
            // Size in DWORDs
            debugInfo[2] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
            debugInfo[2] /= sizeof(uint32_t);
        }
    }
}

bool
Kernel::initParameters()
{
    size_t  offset = 0;
    device::Kernel::parameters_t params;
    amd::KernelParameterDescriptor  desc;

    for (uint i = 0; i < arguments_.size(); ++i) {
        const KernelArg*  arg = argument(i);

        // Initialize the arguments for the abstraction layer
        if (arg->isCbNeeded()) {
            desc.name_      = arg->name_.data();
            desc.type_      = arg->type();
            desc.size_      = arg->size(false);
            desc.addressQualifier_ = arg->addressQualifier();
            desc.accessQualifier_ = arg->accessQualifier();
            desc.typeName_ = arg->typeName();
            desc.typeQualifier_ = arg->typeQualifier();

            // Make offset alignment to match CPU metadata, since
            // in multidevice config abstraction layer has a single signature
            // and CPU sends the paramaters as they are allocated in memory
            size_t  size = desc.size_;
            if (size == 0) {
                // Local memory for CPU
                size = sizeof(cl_mem);
            }
            offset  = amd::alignUp(offset, std::min(size, size_t(16)));
            desc.offset_    = offset;
            offset          += amd::alignUp(size, sizeof(uint32_t));
            params.push_back(desc);
        }
    }

    // Report the allocated local memory size (emulated and hw)
    if (hwLocalSize_ != 0) {
        CondLog((dev().info().localMemSize_ < hwLocalSize_),
            "Requested local size is bigger than reported");
        workGroupInfo_.localMemSize_ = hwLocalSize_;
    }

    if (!createSignature(params)) {
        return false;
    }

    return true;
}

bool
Kernel::bindGlobalHwCb(
    VirtualGPU& gpu,
    VirtualGPU::GslKernelDesc* desc) const
{
    bool result  = true;

    // Bind HW constant buffers used for the global data store
    const Program::HwConstBuffers& gds = prog().glbHwCb();
    for (Program::HwConstBuffers::const_iterator it = gds.begin();
        (it != gds.end() && result); ++it) {
        uint    idx = it->first;
        result = bindResource(gpu, *(it->second), idx, ConstantBuffer, idx);
    }

    return result;
}

bool
Kernel::bindConstantBuffers(VirtualGPU& gpu) const
{
    bool result = true;

    assert((numCb_ <= MaxConstBuffersArguments) &&
        "Runtime doesn't support more CBs for arguments!");

    // Upload the parameters to HW and bind all constant buffers
    for (uint i = 0; i < numCb_; i++) {
        ConstBuffer* cb = gpu.constBufs_[i];
        result &= cb->uploadDataToHw(cbSizes_[i]) &&
            bindResource(gpu, *cb, i, ConstantBuffer, i, NULL, cb->wrtOffset());
    }

    return result;
}

bool
Kernel::processMemObjects(
    VirtualGPU&         gpu,
    const amd::Kernel&  kernel,
    const_address       params,
    bool                nativeMem) const
{
    bool    aliases = false;
    VirtualGPU::MemoryDependency& dependecy = gpu.memoryDependency();
    bool readCache = !internal_ && rwAttributes_ && !dev().settings().assumeAliases_;

    // Mark the tracker with a new kernel,
    // so we can avoid checks of the aliased objects
    gpu.memoryDependency().newKernel();

    // Check all parameters for the current kernel
    const amd::KernelSignature& signature = kernel.signature();
    for (size_t i = 0; i < signature.numParameters(); ++i) {
        const amd::KernelParameterDescriptor& desc = signature.at(i);
        const KernelArg*  arg = argument(i);
        Memory* memory = NULL;
        bool    readOnly = false;

        // Find if current argument is a buffer
        if ((desc.type_ == T_POINTER) &&
            (arg->type_ != KernelArg::PointerLocal) &&
            (arg->type_ != KernelArg::PointerHwLocal)) {
            if (nativeMem) {
                memory = *reinterpret_cast<Memory* const*>(params + desc.offset_);
            }
            else if (*reinterpret_cast<amd::Memory* const*>
                    (params + desc.offset_) != NULL) {
                memory = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>
                    (params + desc.offset_));
                // Synchronize data with other memory instances if necessary
                memory->syncCacheFromHost(gpu);
            }

            if (memory != NULL) {
                readOnly = arg->memory_.readOnly_;

                // Check if read cache optimization is possible
                if (readCache) {
                    // Find if the same buffer was sent to other arguments (aliases)
                    for (size_t j = i + 1; (j < signature.numParameters()); ++j) {
                        const amd::KernelParameterDescriptor& descJ = signature.at(j);
                        const KernelArg*    argJ = argument(j);
                        if (argJ->type_ == KernelArg::PointerGlobal) {
                            bool readOnlyJ  = argJ->memory_.readOnly_;
                            Memory* memory2 = NULL;
                            if (nativeMem) {
                                memory2 = *reinterpret_cast<Memory* const*>
                                    (params + descJ.offset_);
                            }
                            else if (*reinterpret_cast<amd::Memory* const*>
                                    (params + descJ.offset_) != NULL) {
                                memory2 = dev().getGpuMemory(
                                    *reinterpret_cast<amd::Memory* const*>
                                    (params + descJ.offset_));
                            }
                            if (memory == memory2) {
                                if (!readOnly || !readOnlyJ) {
                                    aliases = true;
                                    break;
                                }
                            }
                        }
                    }
                }

                // Validate memory for a dependency in the queue
                gpu.memoryDependency().validate(gpu, memory, readOnly);
            }
        }
    }

    return aliases & readCache;
}

bool
Kernel::loadParameters(
    VirtualGPU&         gpu,
    const amd::Kernel&  kernel,
    const_address       params,
    bool                nativeMem) const
{
    bool    result = true;
    uint    i;

    // Initialize local private ranges
    if (!initLocalPrivateRanges(gpu)) {
        return false;
    }

    if (!dev().settings().useAliases_ &&
        (UavIdUndefined != uavRaw_) &&
        (!(flags() & PrintfOutput) || (printfId_ != UavIdUndefined))) {
        Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
        // Bind a buffer for a dummy read
        result = bindResource(gpu, *gpuMemory, 0,
            ArgumentUavID, uavRaw_);
    }

    // Find all parameters for the current kernel
    const amd::KernelSignature& signature = kernel.signature();
    for (i = 0; i != signature.numParameters(); ++i) {
        const amd::KernelParameterDescriptor& desc = signature.at(i);
        // Set current argument
        if (!setArgument(gpu, i, params + desc.offset_, desc.size_, nativeMem)) {
            result = false;
            break;
        }
    }

    if (result) {
        // Update the ring ranges and math constant
        setLocalPrivateRanges(gpu);

        result = bindConstantBuffers(gpu);

        if (dev().settings().useAliases_) {
            result &= bindResource(gpu, dev().globalMem(), 0, GlobalBufferArena, uavArena_);
        }
        else if (flags() & PrivateFixed) {
            result &= bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_);
        }

        // Setup debug output buffer (if printf is active)
        if (flags() & PrintfOutput) {
            gpu.addVmMemory(gpu.printfDbg().dbgBuffer());
        }
    }

    return result;
}

bool
Kernel::run(VirtualGPU& gpu, GpuEvent* calEvent, bool lastRun) const
{
    // 8xx workaround for the number of groups limit in HW
    if (gpu.gslKernelDesc()->funcInfo_.setBufferForNumGroup) {
        const ProgramGrid* programGrid = &gpu.cal()->progGrid_;
        ConstBuffer* cb = gpu.numGrpCb();
        assert((cb != NULL) && "Runtime must have the constant buffer");

        uint32_t*   memPtr = reinterpret_cast<uint32_t*>(cb->sysMemCopy());
        memPtr[0] = programGrid->gridSize.width;
        memPtr[1] = programGrid->gridSize.height;
        memPtr[2] = programGrid->gridSize.depth;
        memPtr[3] = 0;

        memPtr[4] = programGrid->gridBlock.width;
        memPtr[5] = programGrid->gridBlock.height;
        memPtr[6] = programGrid->gridBlock.depth;
        memPtr[7] = 0;

        bool    result = cb->uploadDataToHw(8 * sizeof(uint32_t));
        if (result) {
            gpu.setConstantBuffer(SC_INFO_CONSTANTBUFFER,
                cb->gslResource(), static_cast<CALuint>(cb->wrtOffset()), cb->hbSize());
        }
        else {
            assert(!"Runtime didn't upload data for NumGroup workaround");
            return false;
        }
    }

    if (!gpu.runProgramGrid(*calEvent,
        const_cast<ProgramGrid*>(&gpu.cal()->progGrid_), gpu.vmMems(), gpu.cal_.memCount_)) {
        LogError("Failed to execute the program!");
        return false;
    }

    // Unbind all resources
    unbindResources(gpu, *calEvent, lastRun);

    return true;
}

static size_t counter = 0;
void
Kernel::debug(VirtualGPU& gpu) const
{
    std::fstream    stubWrite;
    address         src = NULL;
    if (!dev().heap()->isVirtual()) {
        src  = reinterpret_cast<address>
            (const_cast<Resource&>(dev().globalMem()).map(&gpu));
    }

    std::cerr << "--- " << name_ << " ---" << std::endl;
    for (uint i = 0; i < arguments_.size(); ++i) {
        const KernelArg*  arg = argument(i);
        Memory*  gpuMem = gpu.slots_[i].memory_;
        std::stringstream fileName;
        bool bufferObj =
            ((arg->type_ == KernelArg::PointerGlobal) ||
             (arg->type_ == KernelArg::PointerConst) ||
             (arg->type_ == KernelArg::PointerHwConst));

        if ((src != NULL) && arg->isCbNeeded() && bufferObj) {
            address memory  = gpu.cb(arg->cbIdx_)->sysMemCopy();
            std::cerr.setf(std::ios::hex);
            uint* location  = reinterpret_cast<uint*>
                (src + *reinterpret_cast<uint*>(memory + arg->cbPos_));
            std::cerr << " > " << arg->name_ << ": 0x" << location << std::endl;

            // Dump the data
            fileName << counter << "_kernel_" << name() <<
                    "_" << arg->name_ << "_" << location << ".bin";
            stubWrite.open(fileName.str().c_str(),
                (std::fstream::out | std::fstream::binary));

            // Write data to a file
            if (stubWrite.is_open()) {
                stubWrite.write(
                    reinterpret_cast<char*>(location), gpuMem->size());
                stubWrite.close();
            }
        }
        if (((arg->type_ >= KernelArg::Image1D) &&
             (arg->type_ <= KernelArg::Image3D)) ||
            ((src == NULL) && bufferObj)) {
            Memory*  resource = gpu.slots_[i].memory_;
            void* memory = resource->map(&gpu);
            uint* location  = reinterpret_cast<uint*>(memory);
            std::cerr << " > " << arg->name_ << (bufferObj ? ": buffer" : ": image") << std::endl;
            // Dump the data
            fileName << counter << "_kernel_" << name() <<
                    "_" << arg->name_ << "_" << location << ".bin";
            stubWrite.open(fileName.str().c_str(),
                (std::fstream::out | std::fstream::binary));

            // Write data to a file
            if (stubWrite.is_open()) {
                stubWrite.write(
                    reinterpret_cast<char*>(location), gpuMem->size());
                stubWrite.close();
            }
            resource->unmap(&gpu);
        }
    }

    for (uint i = 0; i < gpu.constBufs_.size(); ++i) {
        std::stringstream fileName;
        fileName << counter++ << "_kernel_" << name() << "_const" << i << ".bin";
        stubWrite.open(fileName.str().c_str(),
            (std::fstream::out | std::fstream::binary));
        if (stubWrite.is_open()) {
            address memory = reinterpret_cast<address>(gpu.constBufs_[i]->map(&gpu, Resource::ReadOnly));
            // Check if we have OpenCL program
            stubWrite.write(reinterpret_cast<char*>(memory+gpu.cb(i)->wrtOffset()), gpu.cb(i)->lastWrtSize());
            gpu.constBufs_[i]->unmap(&gpu);
            stubWrite.close();
        }
    }
    const Program::HwConstBuffers& gds = prog().glbHwCb();
    for (Program::HwConstBuffers::const_iterator it = gds.begin(); it != gds.end(); ++it) {
        uint    idx = it->first;
        std::stringstream fileName;
        fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin";
        stubWrite.open(fileName.str().c_str(),
            (std::fstream::out | std::fstream::binary));
        if (stubWrite.is_open()) {
            address memory = reinterpret_cast<address>((it->second)->map(&gpu, Resource::ReadOnly));
            // Check if we have OpenCL program
            stubWrite.write(reinterpret_cast<char*>(memory), (it->second)->size());
            (it->second)->unmap(&gpu);
            stubWrite.close();
        }
    }
    if (!dev().heap()->isVirtual()) {
        const_cast<Resource&>(dev().globalMem()).unmap(&gpu);
    }
}

bool
Kernel::initConstBuffers()
{
    bool    result = true;
    size_t  i;

    assert((numCb_ != 0) && "We have 0 constant buffers!");

    // Allocate an array for CB sizes
    cbSizes_ = new size_t[numCb_];
    if (cbSizes_ == NULL) {
        return false;
    }
    memset(cbSizes_, 0, sizeof(size_t) * numCb_);

    // CB0 is reserved for ABI data
    cbSizes_[0] = TotalABIVectors * ConstBuffer::VectorSize;

    // Find sizes of all constant buffers
    for (i = 0; i < arguments_.size(); ++i) {
        const KernelArg*    arg = argument(i);
        size_t              size = arg->cbPos_ + arg->size(true);
        size_t              specVec = arg->specialVector();
        if (specVec != 0) {
            size = arg->cbPos_ + (arg->size_ / KernelArg::VectorSizeLimit) *
                ConstBuffer::VectorSize;
        }
        // Do we need a CB?
        if (arg->isCbNeeded() && (cbSizes_[arg->cbIdx_] < size)) {
            cbSizes_[arg->cbIdx_] = size;
        }
    }

    return result;
}

bool
Kernel::setInternalSamplers(VirtualGPU& gpu) const
{
    for (uint i = 0; i < samplerSize(); ++i) {
        const KernelArg*  arg = sampler(i);
        uint    state = arg->cbPos_;
        uint    idx = arg->index_;

        if (gpu.cal()->samplersState_[idx] != state) {
            setSampler(gpu, state, idx);
            gpu.cal_.samplersState_[idx] = state;
        }
    }

    return true;
}

bool
Kernel::setArgument(
    VirtualGPU&     gpu,
    uint            idx,
    const void*     param,
    size_t          size,
    bool            nativeMem) const
{
    bool        result = true;
    const KernelArg*  arg;
    address     memory;
    size_t      argSize;
    static const bool waitOnBusyEngine = true;

    assert((idx < arguments_.size()) && "Param index is out of range!");

    arg         = argument(idx);
    assert((arg->cbIdx_ == 1) && "Runtime supports CB1 only for the arguments buffer!");
    memory      = gpu.cb(1)->sysMemCopy();
    argSize     = arg->size(true);

    // Bind the global heap for emulation mode
    switch (arg->type_) {
    case KernelArg::PointerLocal:
    case KernelArg::PointerPrivate:
        if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
            return false;
        }
        // Fall through ...
    default:
        break;
    }

    switch (arg->type_) {
    case KernelArg::PointerConst:
    case KernelArg::PointerHwConst:
    case KernelArg::PointerGlobal:
        {
            gpu::Memory*    gpuMem = NULL;
            if (nativeMem) {
                gpuMem = *reinterpret_cast<Memory* const*>(param);
            }
            else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
                gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
            }
            bool    forceZeroOffset = false;

            if (gpuMem == NULL) {
                forceZeroOffset = true;
                gpuMem = dev().getGpuMemory(dev().dummyPage());
            }
            uint64_t    offset = gpuMem->pinOffset();

            // Make sure the passed argument is a buffer object
            if (!gpuMem->cal()->buffer_) {
                LogError("The kernel buffer argument isn't a buffer object!");
                return false;
            }

            if (arg->type_ == KernelArg::PointerHwConst) {
                 // Bind current memory object with the kernel
                if (!bindResource(gpu, *gpuMem, idx,
                        ArgumentConstBuffer, arg->index_, gpuMem)) {
                    return false;
                }
                assert((offset == 0) && "No offset for HW CB");
                // Add a fake offset to make sure (ptr != NULL) is TRUE
                offset = 1;
            }
            else {
                ResourceType    type = ArgumentHeapBuffer;

                // Check if kernel expects UAV binding
                if (arg->memory_.uavBuf_) {
                    type = ArgumentBuffer;
                }
                else {
                    if (blitKernelHack_) {
                        // Bind global buffer to UAV this buffer is bound to
                        if (!bindResource(gpu, *gpuMem, 0, GlobalBuffer, uavRaw_)) {
                            return false;
                        }
                    }
                    else {
                        // Bind global buffer to UAV this buffer is bound to
                        if (!bindResource(gpu, dev().globalMem(), 0,
                            GlobalBuffer, uavRaw_)) {
                            return false;
                        }
                    }
                }

                // Bind current memory object with the kernel
                // Note: it's a fake binding, if the buffer is part of
                // the global heap
                if (!bindResource(gpu, *gpuMem, idx, type, arg->index_, gpuMem)) {
                    return false;
                }

                // Update offset only if we bind HeapBuffer or
                // it's global address space in UAV setup on SI+
                if ((type == ArgumentHeapBuffer) || dev().settings().siPlus_) {
                    if (!blitKernelHack_) {
                        offset += gpuMem->hbOffset();
                        if (!forceZeroOffset) {
                            assert((offset != 0) && "Offset 0 with a real allocation!");
                        }
                    }
                    gpu.addVmMemory(gpuMem);
                }
            }

            // Wait for resource if it was used on an inactive engine
            //! \note syncCache may call DRM transfer
            gpuMem->wait(gpu, waitOnBusyEngine);

            if (forceZeroOffset) {
                offset = 0;
            }

            // Copy memory offset into the constant buffer
            if (abi64Bit()) {
                *(reinterpret_cast<uint64_t*>(memory + arg->cbPos_)) = offset;
            }
            else {
                *(reinterpret_cast<uint*>(memory + arg->cbPos_)) =
                    static_cast<uint>(offset);
            }
        }
        break;
    case KernelArg::Image1D:
    case KernelArg::Image2D:
    case KernelArg::Image3D:
    case KernelArg::Image1DB:
    case KernelArg::Image1DA:
    case KernelArg::Image2DA:
        {
            gpu::Memory*    gpuMem = NULL;
            if (nativeMem) {
                gpuMem = *reinterpret_cast<Memory* const*>(param);
            }
            else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
                gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
            }

            if (gpuMem == NULL) {
                return false;
            }
            // Make sure the passed argument is an image object
            if (gpuMem->cal()->buffer_) {
                LogError("The kernel image argument isn't an image object!");
                return false;
            }

            ResourceType    resType = arg->memory_.readOnly_ ?
                ArgumentImageRead : ArgumentImageWrite;

            // Bind current memory object with the shader.
            if (!bindResource(gpu, *gpuMem, idx,
                    resType, arg->index_, gpuMem)) {
                return false;
            }

            // Wait for resource if it was used on an inactive engine
            //! \note syncCache may call DRM transfer
            gpuMem->wait(gpu, waitOnBusyEngine);

            // Copy image constants into the constant buffer
            if (gpuMem->owner() != NULL) {
                copyImageConstants(gpuMem->owner()->asImage(),
                    reinterpret_cast<ImageConstants*>(memory + arg->cbPos_));
            }
        }
        break;
    case KernelArg::Sampler:
        {
            amd::Sampler* amdSampler =
                *reinterpret_cast<amd::Sampler* const*>(param);
            uint    idx = arg->index_;
            uint32_t    state = amdSampler->state();

            if (state != gpu.cal()->samplersState_[idx]) {
                setSampler(gpu, state, idx);
                gpu.cal_.samplersState_[idx] = state;
            }

            // Copy sampler state into the constant buffer
            *(reinterpret_cast<uint32_t*>(memory + arg->cbPos_)) = state;
        }
        break;
    case KernelArg::Counter:
        {
            gpu::Memory*    gpuMem = NULL;
            if (nativeMem) {
                gpuMem = *reinterpret_cast<Memory* const*>(param);
            }
            else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
                gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
            }

            // Wait for resource if it was used on an inactive engine
            //! \note syncCache may call DRM transfer
            gpuMem->wait(gpu, waitOnBusyEngine);

            // Bind current memory object with the shader.
            if (!bindResource(gpu, *gpuMem, idx,
                ArgumentCounter, idx, gpuMem)) {
                return false;
            }
        }
        break;
    case KernelArg::PointerHwLocal:
        {
            // Calculate current offset in the local ring
            uint    offset = gpu.cal_.progGrid_.localSize;
            uint    extra  = amd::alignUp(offset, arg->alignment_) - offset;

            offset = amd::alignUp(offset, arg->alignment_);
            size_t  memSize = *static_cast<const uintptr_t*>(param);

            // Allocate new memory from the local ring
            gpu.cal_.progGrid_.localSize += static_cast<uint>(memSize) + extra;
            // Copy current local argument's offset into the CB
            *(reinterpret_cast<uint*>(memory + arg->cbPos_)) = offset;

            CondLog((gpu.cal_.progGrid_.localSize > dev().info().localMemSize_),
                "Requested local size is bigger than reported!");
        }
        break;
    case KernelArg::Float:
    case KernelArg::Double:
    case KernelArg::Char:
    case KernelArg::UChar:
    case KernelArg::Short:
    case KernelArg::UShort:
    case KernelArg::Int:
    case KernelArg::UInt:
    case KernelArg::Long:
    case KernelArg::ULong:
        if (size != argSize) {
            LogWarning("Parameter's sizes are unmatched!");
        }
        // Fall through ...
    case KernelArg::Struct:
    case KernelArg::Union: {
        size_t  specVec = arg->specialVector();
        if (specVec != 0) {
            uint iter = (arg->size_ / KernelArg::VectorSizeLimit);
            for (uint i = 0; i < iter; ++i) {
                amd::Os::fastMemcpy((memory + arg->cbPos_ +
                    i * ConstBuffer::VectorSize),
                    reinterpret_cast<const char*>(param) +
                    i * KernelArg::VectorSizeLimit * specVec,
                    specVec * KernelArg::VectorSizeLimit);
            }
        }
        else {
            // Copy data into the CB
            amd::Os::fastMemcpy((memory + arg->cbPos_), param, size);
        }
    }
        break;
    default:
        LogError("Unhandled argument's type!");
        break;
    }

    return result;
}

bool
Kernel::initLocalPrivateRanges(VirtualGPU& gpu) const
{
    // Initialize HW local
    gpu.cal_.progGrid_.localSize = hwLocalSize_;

    // Bind the global buffer if emulated local or private memory
    // was allocated by the kernel
    if ((flags() & PrintfOutput && (printfId_ == UavIdUndefined)) &&
        (uavRaw_ != UavIdUndefined)) {
        if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
            return false;
        }
    }

    // Bind the global buffer if emulated constant buffers are enabled
    if (cbId_ != UavIdUndefined) {
        if (!bindResource(gpu, dev().globalMem(), 0, ArgumentCbID, cbId_)) {
            return false;
        }
    }

    // Bind the printf buffer
    if (printfId_ != UavIdUndefined) {
        if (!bindResource(gpu, dev().globalMem(), 0, ArgumentPrintfID, printfId_)) {
            return false;
        }
    }
    // Initialize the iterations count
    gpu.cal_.iterations_ = 1;

    return true;
}

void
Kernel::setLocalPrivateRanges(VirtualGPU& gpu) const
{
    address cbBuf = gpu.cb(0)->sysMemCopy();
    uint*   data;
    uint    gridSize =
        gpu.cal()->progGrid_.gridSize.width *
        gpu.cal()->progGrid_.gridSize.height *
        gpu.cal()->progGrid_.gridSize.depth;
    uint    blockSize =
        gpu.cal()->progGrid_.gridBlock.width *
        gpu.cal()->progGrid_.gridBlock.height *
        gpu.cal()->progGrid_.gridBlock.depth;

    //! \todo validate if the compiler still generates PrivateFixed
    if (flags() & PrivateFixed) {
        // Update private ring
        data = reinterpret_cast<uint*>
            (cbBuf + PrivateRingOffset * ConstBuffer::VectorSize);
        Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());

        if (abi64Bit()) {
            reinterpret_cast<uint64_t*>(data)[0] = gpuMemory->hbOffset();
            data[2] = 0;
            data[3] = 0;
        }
        else {
            data[0] = static_cast<uint>(gpuMemory->hbOffset());
            data[1] = 0;
            data[2] = data[3] = 0;
        }
        gpu.addVmMemory(gpuMemory);
    }

    // Copy the math lib constants
    amd::Os::fastMemcpy(
        (cbBuf + MathLibOffset * ConstBuffer::VectorSize),
        MathLibConst, sizeof(MathLibConst));

    // Update the offset to the global data
    if (prog().glbData() != NULL) {
        gpu.addVmMemory(prog().glbData());
        uint64_t    glbDataOffset = prog().glbData()->hbOffset();
        if (abi64Bit()) {
            *reinterpret_cast<uint64_t*>(cbBuf + GlobalDataStoreOffset *
                ConstBuffer::VectorSize) = glbDataOffset;
        }
        else {
            *reinterpret_cast<uint*>(cbBuf + GlobalDataStoreOffset *
                ConstBuffer::VectorSize) = static_cast<uint>(glbDataOffset);
        }
    }

    // Split workload if it was requested
    if ((gpu.cal_.iterations_ < 2) &&
        gpu.dmaFlushMgmt().dispatchSplitSize() != 0) {
        uint    totalSize = gridSize * blockSize;
        if (totalSize > gpu.dmaFlushMgmt().dispatchSplitSize()) {
            gpu.cal_.iterations_ = std::max(gpu.cal_.iterations_,
                (totalSize / gpu.dmaFlushMgmt().dispatchSplitSize()));
        }
    }

    // Initialize the number of iterations to the grid size
    if (flags() & PrintfOutput) {
        gpu.cal_.iterations_ = gridSize;
    }
}

void
Kernel::setSampler(
    VirtualGPU& gpu,
    uint32_t    state,
    uint        physUnit
) const
{
    // All CAL sampler's parameters are in floats
    float   gslAddress = GSL_CLAMP_TO_BORDER;
    float   gslMinFilter = GSL_MIN_NEAREST;
    float   gslMagFilter = GSL_MAG_NEAREST;

    state &= ~amd::Sampler::StateNormalizedCoordsMask;

    // Program the sampler address mode
    switch (state & amd::Sampler::StateAddressMask) {
        case amd::Sampler::StateAddressRepeat:
            gslAddress = GSL_REPEAT;
            break;
        case amd::Sampler::StateAddressClampToEdge:
            gslAddress = GSL_CLAMP_TO_EDGE;
            break;
        case amd::Sampler::StateAddressMirroredRepeat:
            gslAddress = GSL_MIRRORED_REPEAT;
            break;
        case amd::Sampler::StateAddressClamp:
        case amd::Sampler::StateAddressNone:
        default:
            break;
    }
    state &= ~amd::Sampler::StateAddressMask;

    gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_S, &gslAddress);
    gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_T, &gslAddress);
    gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_R, &gslAddress);

    // Program texture filter mode
    if (state == amd::Sampler::StateFilterLinear) {
        gslMinFilter = GSL_MIN_LINEAR;
        gslMagFilter = GSL_MAG_LINEAR;
    }

    gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MIN_FILTER, &gslMinFilter);
    gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MAG_FILTER, &gslMagFilter);
}

bool
Kernel::bindResource(
    VirtualGPU&     gpu,
    const Resource& resource,
    uint            paramIdx,
    ResourceType    type,
    uint            physUnit,
    Memory*         memory,
    size_t          offset) const
{
    gslUAVType  uavType = GSL_UAV_TYPE_UNKNOWN;

    // Find the original resource name from the IL program
    switch (type) {
    case GlobalBuffer:
        if (gpu.state_.boundGlobal_) {
            return true;
        }
        gpu.state_.boundGlobal_ = true;
        physUnit = uavRaw_;
        uavType = GSL_UAV_TYPE_TYPELESS;
        break;
    case GlobalBufferArena:
        if (gpu.state_.boundGlobal_) {
            return true;
        }
        gpu.state_.boundGlobal_ = true;
        physUnit = uavArena_;
        uavType = GSL_UAV_TYPE_TYPELESS;
        break;
    case ArgumentCbID:
        if (gpu.state_.boundCb_) {
            return true;
        }
        gpu.state_.boundCb_ = true;
        physUnit = cbId_;
        uavType = GSL_UAV_TYPE_TYPELESS;
        break;
    case ArgumentPrintfID:
        if (gpu.state_.boundPrintf_) {
            return true;
        }
        gpu.state_.boundPrintf_ = true;
        physUnit = printfId_;
        uavType = GSL_UAV_TYPE_TYPELESS;
        break;
    case ArgumentHeapBuffer:
    case ArgumentBuffer:
    case ArgumentImageRead:
    case ArgumentImageWrite:
    case ArgumentConstBuffer:
    case ArgumentCounter:
        // Early exit if resource is bound already
        if (gpu.slots_[paramIdx].state_.bound_) {
            return true;
        }

        // Associate resource with the slot
        gpu.slots_[paramIdx].memory_    = memory;

        // Mark resource as bound
        gpu.slots_[paramIdx].state_.bound_ = true;

        if (type == ArgumentCounter) {
            GpuEvent  calEvent;

            // Bind memory with atomic counter
            gpu.bindAtomicCounter(argument(paramIdx)->index_,
                memory->gslResource());

            // Copy the counter value into GDS
            gpu.syncAtomicCounter(calEvent, argument(paramIdx)->index_, false);

            // Mark resource as busy
            memory->setBusy(gpu, calEvent);
            return true;
        }
        else if (type == ArgumentHeapBuffer) {
            // We return here, since we just have to bind the global heap
            return true;
        }
        else if (type == ArgumentConstBuffer) {
            gpu.slots_[paramIdx].state_.constant_ = true;
        }
        break;
    case ArgumentUavID:
    case ConstantBuffer:
        break;
    default:
        LogPrintfError("Unspecified argument type ()!", type);
        return false;
    }

    gslMemObject gslMem = NULL;
    // Use global address space on SI+ for UAV setup
    if (dev().settings().siPlus_ &&
        ((type == ArgumentBuffer) || (type == ArgumentCbID) ||
         (type == ArgumentUavID) || (type == ArgumentPrintfID)) &&
        !blitKernelHack_) {
        gslMem = dev().heap()->resource().gslResource();
    }
    else {
        gslMem = resource.gslResource();
    }

    // Associate memory with the physical unit, the actual binding
    bool result = true;
    switch (type) {
    case GlobalBuffer:
    case GlobalBufferArena:
    case ArgumentBuffer:
    case ArgumentImageWrite:
    case ArgumentUavID:
    case ArgumentCbID:
    case ArgumentPrintfID:
        if (type == ArgumentImageWrite) {
            uavType = GSL_UAV_TYPE_TYPED;
        }
        else if ((type == ArgumentBuffer) || (type == ArgumentUavID)) {
            uavType = GSL_UAV_TYPE_TYPELESS;
        }
        if (gpu.cal_.uavs_[physUnit] != gslMem) {
            result = gpu.setUAVBuffer(physUnit, gslMem, uavType);
            gpu.setUAVChannelOrder(physUnit, gslMem);
            gpu.cal_.uavs_[physUnit] = gslMem;
        }
        else if (!dev().settings().siPlus_)
        {
            gpu.setUAVChannelOrder(physUnit, gslMem);
        }
        break;
    case ConstantBuffer:
    case ArgumentConstBuffer:
        if ((gpu.cal_.constBuffers_[physUnit] != gslMem) || (offset != 0)) {
            result = gpu.setConstantBuffer(physUnit,
                gslMem, offset, resource.hbSize());
            gpu.cal_.constBuffers_[physUnit] = gslMem;
        }
        break;
    case ArgumentImageRead:
        if (gpu.cal_.readImages_[physUnit] != gslMem) {
            result = gpu.setInput(physUnit, gslMem);
            gpu.cal_.readImages_[physUnit] = gslMem;
        }
        break;
    default:
        result = false;
        assert(false);
        break;
    }
    if (!result) {
        LogPrintfError("setMem failed unit:%d mem:0x%08x!", physUnit, gslMem);
        return false;
    }

    if ((type == GlobalBuffer) && dev().settings().useAliases_) {
        if (uavArena_ != 0) {
            if (!setupArenaAliases(gpu, resource)) {
                return false;
            }
            if ((uavArena_ != physUnit) &&
                (gpu.cal_.uavs_[uavArena_] != gslMem)) {
                gpu.cal_.uavs_[uavArena_] = gslMem;
                // Associate memory with the name
                if (!gpu.setUAVBuffer(uavArena_, gslMem,
                    GSL_UAV_TYPE_TYPELESS)) {
                    LogError("calCtxSetMem failed!");
                    return false;
                }
                gpu.setUAVChannelOrder(uavArena_, gslMem);
            }
        }
    }

    return true;
}

void
Kernel::unbindResources(
    VirtualGPU& gpu,
    GpuEvent  calEvent,
    bool        lastRun) const
{
    // Make sure unbind occurs on the last run, in case the execution had a split
    if (lastRun) {
        for (uint i = 0; i < arguments_.size(); ++i) {
            if (gpu.slots_[i].state_.bound_) {
                GpuEvent  calEventTmp = calEvent;

                if (KernelArg::Counter == argument(i)->type_) {
                    // Copy the counter value from GDS
                    gpu.syncAtomicCounter(calEventTmp, argument(i)->index_, true);
                }
                else if (!(gpu.slots_[i].state_.constant_ ||
                           argument(i)->memory_.readOnly_)) {
                    // Signal the abstraction layer that GPU memory is dirty
                    if (gpu.slots_[i].memory_->owner() != NULL) {
                        gpu.slots_[i].memory_->owner()->signalWrite(&gpu.dev());
                    }
                }
                // Mark resource as busy
                gpu.slots_[i].memory_->setBusy(gpu, calEventTmp);

                gpu.slots_[i].state_.value_ = 0;
            }
        }

        // Unbind the global buffer
        gpu.state_.boundGlobal_ = false;

        // Unbind the constant buffer
        gpu.state_.boundCb_ = false;

        // Unbind the pritnf buffer
        gpu.state_.boundPrintf_ = false;
    }

    // Mark CB busy
    for (uint i = 0; i < numCb_; ++i) {
        gpu.constBufs_[i]->setBusy(gpu, calEvent);
    }

    // 8xx workaround for the number of groups limit in HW
    if (gpu.gslKernelDesc()->funcInfo_.setBufferForNumGroup) {
        ConstBuffer* cb = gpu.numGrpCb();
        assert((cb != NULL) && "Runtime must have the constant buffer");
        cb->setBusy(gpu, calEvent);
    }

    // Set the event object for the scratch buffer
    if (workGroupInfo()->scratchRegs_ > 0) {
        for (uint i = 0; i < dev().scratch(gpu.hwRing())->memObjs_.size(); ++i) {
            dev().scratch(gpu.hwRing())->memObjs_[i]->setBusy(gpu, calEvent);
        }
    }
}

bool
Kernel::setupArenaAliases(VirtualGPU& gpu, const Resource& resource) const
{
    const static uint ScArenaUavShortId = 9;
    const static uint ScArenaUavByteId  = 10;

    Resource* buf = &(const_cast<Resource&>(resource));
    Resource* alias;
    gslMemObject gslMem = NULL;
    //
    // byte view
    //
    alias = buf->getAliasUAVBuffer(CM_SURF_FMT_R8I);
    if (NULL == alias) {
        return false;
    }
    gslMem = alias->gslResource();
    if (gpu.cal_.uavs_[ScArenaUavByteId] != gslMem) {
        if (!gpu.setUAVBuffer(ScArenaUavByteId,
            gslMem, GSL_UAV_TYPE_TYPELESS)) {
            return false;
        }
        gpu.cal_.uavs_[ScArenaUavByteId] = gslMem;
    }

    //
    // short view
    //
    alias = buf->getAliasUAVBuffer(CM_SURF_FMT_R16I);
    if (NULL == alias) {
        return false;
    }
    bool result = true;
    gslMem = alias->gslResource();
    if (gpu.cal_.uavs_[ScArenaUavShortId] != gslMem) {
        result = gpu.setUAVBuffer(ScArenaUavShortId,
            gslMem, GSL_UAV_TYPE_TYPELESS);
        gpu.cal_.uavs_[ScArenaUavShortId] = gslMem;
    }

    return result;
}

void
Kernel::copyImageConstants(
    const amd::Image*   amdImage,
    ImageConstants*     imageData
    ) const
{
    imageData->width_   = static_cast<uint32_t>(amdImage->getWidth());
    imageData->height_  = static_cast<uint32_t>(amdImage->getHeight());
    imageData->depth_   = static_cast<uint32_t>(amdImage->getDepth());
    imageData->dataType_    =
        static_cast<uint32_t>(amdImage->getImageFormat().image_channel_data_type);

    imageData->widthFloat_  = 1.f / static_cast<float>(amdImage->getWidth());
    imageData->heightFloat_ = 1.f / static_cast<float>(amdImage->getHeight());
    imageData->depthFloat_  = 1.f / static_cast<float>(amdImage->getDepth());
    imageData->channelOrder_ =
        static_cast<uint32_t>(amdImage->getImageFormat().image_channel_order);
}

union MetadataVersion {
    struct {
        uint64_t    revision_: 16;      //!< LLVM metadata revision
        uint64_t    minorVersion_: 16;  //!< LLVM metadata minor verison
        uint64_t    majorVersion_: 16;  //!< LLVM metadata major version
    };
    uint64_t    value_;
    MetadataVersion(uint mj, uint mi, uint rev): value_(0)
    {
        revision_       = rev;
        minorVersion_   = mi;
        majorVersion_   = mj;
    }
    MetadataVersion(): value_(0) {}
};

//! Version of metadata with buffer attributes
const MetadataVersion MetadataBufferAttributes = MetadataVersion(2, 0, 88);

//! Version of metadata with type qualifiers
const MetadataVersion MetadataTypeQualifiers = MetadataVersion(3, 1, 103);

bool
NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount)
{
    // Initialize workgroup info
    workGroupInfo_.size_ = nullDev().info().maxWorkGroupSize_;
    MetadataVersion     mdVersion;

    // Find first tag
    size_t  pos = metaData.find(";");

    // Loop through all provided program arguments
    while (pos != std::string::npos) {
        KernelArg  arg;

        if (!expect(metaData, &pos, ";")) {
            break;
        }

        arg.type_ = KernelArg::None;

        // Loop through all available metadata types
        for (uint i = 0; i < ArgStateTotal; ++i) {
            uint tmpValue;
            // Find the name tag
            if (expect(metaData, &pos, ArgState[i].typeName_)) {
                switch (ArgState[i].type_) {
                case KernelArg::None:
                    // Process next ...
                    continue;
                case KernelArg::Reflection: {
                    uint    argIdx;
                    // Read the argument's index
                    if (!getuint(metaData, &pos, &argIdx)) {
                        LogWarning("Couldn't get the argument index!");
                        return false;
                    }
                    KernelArg*  tmpArg = arguments_[argIdx];
                    if (!getstring(metaData, &pos, &tmpArg->typeName_)) {
                        LogWarning("Couldn't get the argument type!");
                        return false;
                    }
                }
                    continue;
                case KernelArg::ConstArg: {
                    uint    argIdx;
                    // Read the argument's index
                    if (!getuint(metaData, &pos, &argIdx)) {
                        LogWarning("Couldn't get the argument index!");
                        return false;
                    }
                    KernelArg*  tmpArg = arguments_[argIdx];
                    tmpArg->typeQualifier_ |= CL_KERNEL_ARG_TYPE_CONST;
                }
                    continue;
                case KernelArg::Grouping:
                    for (uint j = 0; j < 3; ++j) {
                        uint    temp;
                        // Read the compile workgroup size
                        if (!getuint(metaData, &pos, &temp)) {
                            LogWarning("Couldn't get the compile workgroup size!");
                            return false;
                        }
                        workGroupInfo_.compileSize_[j] = temp;
                    }
                    // Process next ...
                    continue;
                case KernelArg::WrkgrpSize: {
                        uint    temp;
                        // Read the workgroup size
                        if (!getuint(metaData, &pos, &temp)) {
                            LogWarning("Couldn't get the workgroup size!");
                            return false;
                        }
                        workGroupInfo_.size_ = temp;
                    }
                    // Process next ...
                    continue;
                case KernelArg::Wavefront:
                    // Process next ...
                    continue;
                case KernelArg::UavId:
                    // Read index
                    if (!getuint(metaData, &pos, &arg.index_)) {
                        return false;
                    }
                    break;
                case KernelArg::ConstBufId:
                    // Read index
                    if (!getuint(metaData, &pos, &cbId_)) {
                        return false;
                    }
                    continue;
                case KernelArg::PrintfBufId:
                    // Read index
                    if (!getuint(metaData, &pos, &printfId_)) {
                        return false;
                    }
                    continue;
                case KernelArg::MetadataVersion:
                    // Read metadata version
                    if (!getuint(metaData, &pos, &tmpValue)) {
                        return false;
                    }
                    mdVersion.majorVersion_ = tmpValue;
                    if (!getuint(metaData, &pos, &tmpValue)) {
                        return false;
                    }
                    mdVersion.minorVersion_ = tmpValue;
                    if (!getuint(metaData, &pos, &tmpValue)) {
                        return false;
                    }
                    mdVersion.revision_ = tmpValue;
                    // Process next ...
                    continue;
                case KernelArg::GroupingHint:
                    for (uint j = 0; j < 3; ++j) {
                        uint    temp;
                        // Read the compile workgroup size hint
                        if (!getuint(metaData, &pos, &temp)) {
                            LogWarning("Couldn't get the compile workgroup size hint!");
                            return false;
                        }
                        workGroupInfo_.compileSizeHint_[j] = temp;
                    }
                    // Process next ...
                    continue;
                case KernelArg::VecTypeHint:
                    {
                        std::string temp;
                        // Read the compile vector type hint
                        if (!getstring(metaData, &pos, &temp)) {
                          LogWarning("Couldn't get the compile vector type hint!");
                          return false;
                        }
                        workGroupInfo_.compileVecTypeHint_ = temp;
                    }
                    // Process next ...
                    continue;
                default:
                    break;
                }

                char    argName[256];
                // Save the argument type
                arg.type_ = ArgState[i].type_;

                // Check if we should expect the name
                if (ArgState[i].name_) {
                    // Read the parameter's name
                    if (!getword(metaData, &pos, argName)) {
                        LogWarning("Couldn't get a kernel argument!");
                        return false;
                    }
                    arg.name_ = argName;
                }

                if (arg.type_ == KernelArg::Sampler) {
                    if (!getuint(metaData, &pos, &arg.index_)) {
                        LogWarning("Couldn't get a kernel argument!");
                        return false;
                    }
                    if (!getuint(metaData, &pos, &arg.location_)) {
                        LogWarning("Couldn't get a kernel argument!");
                        return false;
                    }
                    if (!getuint(metaData, &pos, &arg.cbPos_)) {
                        LogWarning("Couldn't get a kernel argument!");
                        return false;
                    }
                }

                // Check if we should expect the resource data type
                if (ArgState[i].resType_) {
                    uint    k;
                    // Search for the data type
                    for (k = 0; k < DataTypeTotal; k++) {
                        if (expect(metaData, &pos, DataType[k].tagName_)) {
                            arg.dataType_ = DataType[k].type_;
                            if (arg.type_ == KernelArg::Image) {
                                flags_ |= ImageEnable;
                                if (expect(metaData, &pos, "RO:")) {
                                    arg.memory_.readOnly_ = 1;
                                }
                                else if (expect(metaData, &pos, "RW:")) {
                                    arg.memory_.readWrite_  = 1;
                                    flags_ |= ImageWrite;
                                }
                                else if (expect(metaData, &pos, "WO:")) {
                                    arg.memory_.writeOnly_ = 1;
                                    flags_ |= ImageWrite;
                                }
                            }
                            else if (arg.type_ == KernelArg::Value) {
                                arg.type_ = DataType[k].type_;
                            }
                            break;
                        }
                    }
                    if (k == DataTypeTotal) {
                        LogWarning("We couldn't find the argument's type.");
                        if ((arg.type_ == KernelArg::Value) ||
                            !getword(metaData, &pos, argName)) {
                            LogWarning("Couldn't get a kernel argument!");
                            return false;
                        }
                    }
                    //! @todo temporary condition
                    if ((arg.type_ == KernelArg::Opaque) ||
                        (arg.type_ == KernelArg::Sampler)) {
                        assert(false);
                        continue;
                    }
                }

                // Check if we should expect the data size
                if (ArgState[i].size_) {
                    uint    tmpData;
                    // Read the data size
                    if (!getuint(metaData, &pos, &tmpData)) {
                        LogWarning("Couldn't get a kernel argument!");
                        return false;
                    }
                    if (arg.type_ == KernelArg::Image) {
                        arg.type_ = arg.dataType_;
                        arg.index_ = tmpData;
                    }
                    else {
                        arg.size_ = tmpData;
                    }
                }

                if (arg.type_ == KernelArg::Counter) {
                    // Read a counter index
                    if (!getuint(metaData, &pos, &arg.index_)) {
                        LogWarning("Couldn't get a counter index!");
                        return false;
                    }
                }

                // Check if we should expect a resource index
                if (ArgState[i].cbIdx_) {
                    // Read resource index
                    if (!getuint(metaData, &pos, &arg.cbIdx_)) {
                        LogWarning("Couldn't get a kernel argument!");
                        return false;
                    }

                    if (arg.isCbNeeded() && (numCb_ < arg.cbIdx_)) {
                        numCb_ = arg.cbIdx_;
                    }
                }
                // Check if we should expect the CB offset
                if (ArgState[i].cbPos_) {
                    // Read position in the constant buffer
                    if (!getuint(metaData, &pos, &arg.cbPos_)) {
                        LogWarning("Couldn't get a kernel argument!");
                        return false;
                    }
                }
                // Check if we should expect the buffer type
                if (ArgState[i].buf_) {
                    // Read the buffer type
                    if (!getword(metaData, &pos, argName)) {
                        LogWarning("Couldn't get a kernel argument!");
                        return false;
                    }
                    arg.buf_ = argName;

                    for (uint k = 0; k < BufTypeTotal; ++k) {
                        if (0 == arg.buf_.compare(BufType[k].tagName_)) {
                            // Update the parameter type
                            arg.type_ = BufType[k].type_;
                            // Check if we should expect a buffer index
                            if (BufType[k].number_) {
                                // Read a buffer index
                                if (!getuint(metaData, &pos, &arg.index_)) {
                                    LogWarning("Couldn't get a kernel argument!");
                                    return false;
                                }
                            }
                            // Check for the required alignment
                            if (BufType[k].alignment_) {
                                // Read data alignment
                                if (!getuint(metaData, &pos, &arg.alignment_)) {
                                    LogWarning("Couldn't get a kernel argument!");
                                    return false;
                                }
                            }
                            // Check for the buffer's attribute
                            if ((mdVersion.value_ >= MetadataBufferAttributes.value_) &&
                                BufType[k].attribute_) {
                                if (expect(metaData, &pos, "RO")) {
                                    arg.memory_.readOnly_ = 1;
                                }
                                else if (expect(metaData, &pos, "RW")) {
                                    arg.memory_.readWrite_ = 1;
                                }
                                else if (expect(metaData, &pos, "WO")) {
                                    arg.memory_.writeOnly_ = 1;
                                }
                            }
                            // Check for the type qualifier
                            if ((mdVersion.value_ >= MetadataTypeQualifiers.value_) &&
                                BufType[k].attribute_) {
                                uint tmp;
                                pos += 1;
                                if (!getuint(metaData, &pos, &tmp)) {
                                    LogWarning("Couldn't get volatile type!");
                                    return false;
                                }
                                if (tmp == 1) {
                                    arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_VOLATILE;
                                }
                                if (!getuint(metaData, &pos, &tmp)) {
                                    LogWarning("Couldn't get restrict type!");
                                    return false;
                                }
                                if (tmp == 1) {
                                    arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_RESTRICT;
                                }
                            }
                        }
                    }
                }
                // Find multiple UAV references
                switch (arg.type_) {
                case KernelArg::PointerGlobal:
                case KernelArg::PointerConst:
                case KernelArg::PointerLocal:
                case KernelArg::PointerPrivate:
                case KernelArg::UavId:
                    uavRefCount[arg.index_]++;
                    break;
                default:
                    break;
                }
                // Check if this argument will be passed in constant buffer
                if (arg.isCbNeeded() || (arg.type_ == KernelArg::UavId)) {
                    if (arg.type_ == KernelArg::Sampler) {
                        // Serach for the passed by value sampler
                        for (uint i = 0; i < argSize(); ++i) {
                            KernelArg*  value = arguments_[i];
                            if (0 == value->name_.compare(arg.name_)) {
                                value->type_ = arg.type_;
                                value->index_ = arg.index_;
                                value->location_ = 0;
                                break;
                            }
                        }
                    }
                    else {
                        KernelArg*  argument = new KernelArg(arg);
                        if (argument != NULL) {
                            addArgument(argument);
                        }
                        else {
                            LogError("Couldn't allocate memory!");
                            return false;
                        }
                    }
                }
                // Check if we have a pre-defined sampler
                else if (arg.type_ == KernelArg::Sampler) {
                    KernelArg*  sampler = new KernelArg(arg);
                    if (sampler != NULL) {
                        addSampler(sampler);
                    }
                    else {
                        LogError("Couldn't allocate memory!");
                        return false;
                    }
                }
                break;
            }
        }

        // Next argument
        pos = metaData.find(";", pos);
    }

    // Find arguments that will require a reallocation
    for (uint i = 0; i < arguments_.size(); ++i) {
        KernelArg* arg = arguments_[i];
        switch (arg->type_) {
        case KernelArg::PointerGlobal:
        case KernelArg::PointerConst:
        case KernelArg::PointerLocal:
        case KernelArg::PointerPrivate:
            // Check if can't use a dedicated UAV,
            // so realloc memory in the heap
            arg->memory_.realloc_ = isRealloc();

            if (nullDev().settings().useAliases_) {
                if (uavRefCount[arg->index_] > 1) {
                    // Multiple accesses, assume this is the heap
                    uavRaw_ = arg->index_;
                }
                // Mark argument as an UAV buffer if no aliases or it's not arena
                else if (arg->index_ != VirtualGPU::UavArena) {
                    arg->memory_.uavBuf_ = true;
                }
            }
            else {
                arg->memory_.uavBuf_ = true;
            }
            break;
        case KernelArg::PointerHwConst:
            arg->memory_.realloc_ = true;
            break;
        case KernelArg::UavId:
            if (!nullDev().settings().useAliases_ ||
                (arg->index_ != VirtualGPU::UavArena)) {
                uavRaw_ = arg->index_;
            }
            break;
        default:
            break;
        }
        // If argument marked with the const qualifier, then overwrite
        // Read-Write attributes, since compiler doesn't mark it properly
        if (arg->typeQualifier() & CL_KERNEL_ARG_TYPE_CONST) {
            arg->memory_.readOnly_ = 1;
            arg->memory_.readWrite_ = 0;
            arg->memory_.writeOnly_ = 0;
        }
    }

    if (!nullDev().settings().useAliases_ &&
        (uavRaw_ != UavIdUndefined) &&
        !(flags() & PrintfOutput)) {
        // Find if default UAV is already assigned to an argument
        for (uint i = 0; i < arguments_.size(); ++i) {
            KernelArg* arg = arguments_[i];
            switch (arg->type_) {
            case KernelArg::PointerGlobal:
            case KernelArg::PointerConst:
            case KernelArg::PointerLocal:
            case KernelArg::PointerPrivate:
                if (uavRaw_ == arg->index_) {
                    uavRaw_ = UavIdUndefined;
                }
                break;
            default:
                break;

            }
        }
    }

    // There is always 1 constant buffer, associated with the kernel
    numCb_++;
    assert((numCb_ <= MaxConstBuffersArguments) &&
        "Runtime doesn't support more than max CBs for arguments!");

    // Limit workgroup size if requested
    if ((flags() & LimitWorkgroup) && (GPU_MAX_WORKGROUP_SIZE == 0)) {
        size_t  temp = 1;
        workGroupInfo_.size_ = workGroupInfo()->wavefrontSize_;
        for (uint j = 0; j < 3; ++j) {
            if (workGroupInfo()->compileSize_[j] != 0) {
                temp *= workGroupInfo_.compileSize_[j];
            }
        }
        // Report a compilation error if requested compile size doesn't
        // match the required workgroup size
        if (workGroupInfo()->size_ < temp) {
            char    str[8];
            intToStr(workGroupInfo_.size_, str, 8);
            buildError_ = CL_OUT_OF_RESOURCES;
            buildLog_ +=
                "Error: Requested compile size is bigger than the required workgroup size of ";
            buildLog_ += str;
            buildLog_ += " elements\n";
            LogError(buildLog().c_str());
            return false;
        }
    }

    // Read/Write attributes are provided in metadata
    if (mdVersion.value_ >= MetadataBufferAttributes.value_) {
        rwAttributes_ = true;
    }

    return true;
}

bool
Kernel::validateMemory(uint idx, amd::Memory* amdMem) const
{
    // Check if memory doesn't require reallocation
    bool    noRealloc = (!argument(idx)->memory_.realloc_ ||
        amdMem->reallocedDeviceMemory(&dev()));

    return noRealloc;
}

inline static HSAIL_ARG_TYPE
GetHSAILArgType(const aclArgData* argInfo)
{
    switch (argInfo->type) {
        case ARG_TYPE_POINTER:
            if (argInfo->arg.pointer.memory == PTR_MT_SCRATCH_EMU) {
                return HSAIL_ARGTYPE_QUEUE;
            }
            return HSAIL_ARGTYPE_POINTER;
        case ARG_TYPE_VALUE:
            return HSAIL_ARGTYPE_VALUE;
        case ARG_TYPE_IMAGE:
            return HSAIL_ARGTYPE_IMAGE;
        case ARG_TYPE_SAMPLER:
            return HSAIL_ARGTYPE_SAMPLER;
        case ARG_TYPE_ERROR:
        default:
            return HSAIL_ARGTYPE_ERROR;
    }
}

inline static size_t
GetHSAILArgAlignment(const aclArgData* argInfo)
{
    switch (argInfo->type) {
        case ARG_TYPE_POINTER:
            return argInfo->arg.pointer.align;
        default:
            return 1;
    }
}

inline static HSAIL_ADDRESS_QUALIFIER
GetHSAILAddrQual(const aclArgData* argInfo)
{
    if (argInfo->type == ARG_TYPE_POINTER) {
        switch (argInfo->arg.pointer.memory) {
            case PTR_MT_CONSTANT_EMU:
            case PTR_MT_CONSTANT:
            case PTR_MT_UAV:
            case PTR_MT_GLOBAL:
                return HSAIL_ADDRESS_GLOBAL;
            case PTR_MT_LDS_EMU:
            case PTR_MT_LDS:
                return HSAIL_ADDRESS_LOCAL;
            case PTR_MT_SCRATCH_EMU:
                return HSAIL_ADDRESS_GLOBAL;
            case PTR_MT_ERROR:
            default:
                LogError("Unsupported address type");
                return HSAIL_ADDRESS_ERROR;
        }
    }
    else if ((argInfo->type == ARG_TYPE_IMAGE) ||
             (argInfo->type == ARG_TYPE_SAMPLER)) {
        return HSAIL_ADDRESS_GLOBAL;
    }
    return HSAIL_ADDRESS_ERROR;
}

/* f16 returns f32 - workaround due to comp lib */
inline static HSAIL_DATA_TYPE
GetHSAILDataType(const aclArgData* argInfo)
{
    aclArgDataType dataType;

    if (argInfo->type == ARG_TYPE_POINTER) {
        dataType = argInfo->arg.pointer.data;
    }
    else if (argInfo->type == ARG_TYPE_VALUE) {
        dataType = argInfo->arg.value.data;
    }
    else {
        return HSAIL_DATATYPE_ERROR;
    }
    switch (dataType) {
        case DATATYPE_i1:
            return HSAIL_DATATYPE_B1;
        case DATATYPE_i8:
            return HSAIL_DATATYPE_S8;
        case DATATYPE_i16:
            return HSAIL_DATATYPE_S16;
        case DATATYPE_i32:
            return HSAIL_DATATYPE_S32;
        case DATATYPE_i64:
            return HSAIL_DATATYPE_S64;
        case DATATYPE_u8:
            return HSAIL_DATATYPE_U8;
        case DATATYPE_u16:
            return HSAIL_DATATYPE_U16;
        case DATATYPE_u32:
            return HSAIL_DATATYPE_U32;
        case DATATYPE_u64:
            return HSAIL_DATATYPE_U64;
        case DATATYPE_f16:
            return HSAIL_DATATYPE_F32;
        case DATATYPE_f32:
            return HSAIL_DATATYPE_F32;
        case DATATYPE_f64:
            return HSAIL_DATATYPE_F64;
        case DATATYPE_struct:
            return HSAIL_DATATYPE_STRUCT;
        case DATATYPE_opaque:
            return HSAIL_DATATYPE_OPAQUE;
        case DATATYPE_ERROR:
        default:
            return HSAIL_DATATYPE_ERROR;
    }
}

inline static int
GetHSAILArgSize(const aclArgData *argInfo)
{
    switch (argInfo->type) {
        case ARG_TYPE_VALUE:
            switch (GetHSAILDataType(argInfo)) {
                case HSAIL_DATATYPE_B1:
                    return 1;
                case HSAIL_DATATYPE_B8:
                case HSAIL_DATATYPE_S8:
                case HSAIL_DATATYPE_U8:
                    return 1;
                case HSAIL_DATATYPE_B16:
                case HSAIL_DATATYPE_U16:
                case HSAIL_DATATYPE_S16:
                case HSAIL_DATATYPE_F16:
                    return 2;
                case HSAIL_DATATYPE_B32:
                case HSAIL_DATATYPE_U32:
                case HSAIL_DATATYPE_S32:
                case HSAIL_DATATYPE_F32:
                    return 4;
                case HSAIL_DATATYPE_B64:
                case HSAIL_DATATYPE_U64:
                case HSAIL_DATATYPE_S64:
                case HSAIL_DATATYPE_F64:
                    return 8;
                case HSAIL_DATATYPE_STRUCT:
                    return argInfo->arg.value.numElements;
                default:
                    return -1;
            }
        case ARG_TYPE_POINTER:
        case ARG_TYPE_IMAGE:
        case ARG_TYPE_SAMPLER:
            return sizeof(void*);
        default:
            return -1;
    }
}

inline static clk_value_type_t
GetOclType(const aclArgData* argInfo)
{
    static const clk_value_type_t   ClkValueMapType[6][6] = {
        { T_CHAR,   T_CHAR2,    T_CHAR3,    T_CHAR4,    T_CHAR8,    T_CHAR16   },
        { T_SHORT,  T_SHORT2,   T_SHORT3,   T_SHORT4,   T_SHORT8,   T_SHORT16  },
        { T_INT,    T_INT2,     T_INT3,     T_INT4,     T_INT8,     T_INT16    },
        { T_LONG,   T_LONG2,    T_LONG3,    T_LONG4,    T_LONG8,    T_LONG16   },
        { T_FLOAT,  T_FLOAT2,   T_FLOAT3,   T_FLOAT4,   T_FLOAT8,   T_FLOAT16  },
        { T_DOUBLE, T_DOUBLE2,  T_DOUBLE3,  T_DOUBLE4,  T_DOUBLE8,  T_DOUBLE16 },
    };

    uint sizeType;
    if ((argInfo->type == ARG_TYPE_POINTER) &&
        (argInfo->arg.pointer.memory == PTR_MT_SCRATCH_EMU)) {
        return T_QUEUE;
    }
    if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) {
        return T_POINTER;
    }
    else if (argInfo->type == ARG_TYPE_VALUE) {
        switch (argInfo->arg.value.data) {
            case DATATYPE_i8:
            case DATATYPE_u8:
                sizeType = 0;
                break;
            case DATATYPE_i16:
            case DATATYPE_u16:
                sizeType = 1;
                break;
            case DATATYPE_i32:
            case DATATYPE_u32:
                sizeType = 2;
                break;
            case DATATYPE_i64:
            case DATATYPE_u64:
                sizeType = 3;
                break;
            case DATATYPE_f16:
            case DATATYPE_f32:
                sizeType = 4;
                break;
            case DATATYPE_f64:
                sizeType = 5;
                break;
            default:
                return T_VOID;
        }
        switch (argInfo->arg.value.numElements) {
            case 1: return ClkValueMapType[sizeType][0];
            case 2: return ClkValueMapType[sizeType][1];
            case 3: return ClkValueMapType[sizeType][2];
            case 4: return ClkValueMapType[sizeType][3];
            case 8: return ClkValueMapType[sizeType][4];
            case 16: return ClkValueMapType[sizeType][5];
            default: return T_VOID;
        }
    }
    else if (argInfo->type == ARG_TYPE_SAMPLER) {
        return T_SAMPLER;
    }
    else {
        return T_VOID;
    }
}

inline static cl_kernel_arg_address_qualifier
GetOclAddrQual(const aclArgData* argInfo)
{
    if (argInfo->type == ARG_TYPE_POINTER) {
        switch (argInfo->arg.pointer.memory) {
        case PTR_MT_UAV:
        case PTR_MT_GLOBAL:
            return CL_KERNEL_ARG_ADDRESS_GLOBAL;
        case PTR_MT_CONSTANT:
        case PTR_MT_UAV_CONSTANT:
        case PTR_MT_CONSTANT_EMU:
            return CL_KERNEL_ARG_ADDRESS_CONSTANT;
        case PTR_MT_LDS_EMU:
        case PTR_MT_LDS:
            return CL_KERNEL_ARG_ADDRESS_LOCAL;
        default:
            return CL_KERNEL_ARG_ADDRESS_PRIVATE;
        }
    }
    else if (argInfo->type == ARG_TYPE_IMAGE) {
        return CL_KERNEL_ARG_ADDRESS_GLOBAL;
    }
    //default for all other cases
    return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}

inline static cl_kernel_arg_access_qualifier
GetOclAccessQual(const aclArgData* argInfo)
{
    if (argInfo->type == ARG_TYPE_IMAGE) {
        switch (argInfo->arg.image.type) {
        case ACCESS_TYPE_RO:
            return CL_KERNEL_ARG_ACCESS_READ_ONLY;
        case ACCESS_TYPE_WO:
             return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
        case ACCESS_TYPE_RW:
            return CL_KERNEL_ARG_ACCESS_READ_WRITE;
        default:
            return CL_KERNEL_ARG_ACCESS_NONE;
        }
    }
    return CL_KERNEL_ARG_ACCESS_NONE;
}

inline static cl_kernel_arg_type_qualifier
GetOclTypeQual(const aclArgData* argInfo)
{
    cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
    if (argInfo->type == ARG_TYPE_POINTER) {
        if (argInfo->arg.pointer.isVolatile) {
            rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
        }
        if (argInfo->arg.pointer.isRestrict) {
            rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
        }
        if (argInfo->arg.pointer.isPipe) {
            rv |= CL_KERNEL_ARG_TYPE_PIPE;
        }
        if (argInfo->isConst) {
            rv |= CL_KERNEL_ARG_TYPE_CONST;
        }
        switch (argInfo->arg.pointer.memory) {
        case PTR_MT_CONSTANT:
        case PTR_MT_UAV_CONSTANT:
        case PTR_MT_CONSTANT_EMU:
            rv |= CL_KERNEL_ARG_TYPE_CONST;
            break;
        default:
            break;
        }
    }
    return rv;
}

static int
GetOclSize(const aclArgData* argInfo)
{
    switch (argInfo->type) {
        case ARG_TYPE_POINTER: return sizeof(void *);
        case ARG_TYPE_VALUE:
            //! \note OCL 6.1.5. For 3-component vector data types,
            //! the size of the data type is 4 * sizeof(component).
            switch (argInfo->arg.value.data) {
                case DATATYPE_struct:
                    return 1 * argInfo->arg.value.numElements;
                case DATATYPE_i8:
                case DATATYPE_u8:
                    return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
                case DATATYPE_u16:
                case DATATYPE_i16:
                case DATATYPE_f16:
                    return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
                case DATATYPE_u32:
                case DATATYPE_i32:
                case DATATYPE_f32:
                    return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
                case DATATYPE_i64:
                case DATATYPE_u64:
                case DATATYPE_f64:
                    return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
                case DATATYPE_ERROR:
                default: return -1;
            }
        case ARG_TYPE_IMAGE: return sizeof(cl_mem);
        case ARG_TYPE_SAMPLER: return sizeof(cl_sampler);
        default: return -1;
    }
}

void
HSAILKernel::initArgList(const aclArgData* aclArg)
{
    // Initialize the hsail argument list too
    initHsailArgs(aclArg);

    // Iterate through the arguments and insert into parameterList
    device::Kernel::parameters_t params;
    amd::KernelParameterDescriptor desc;
    size_t offset = 0;

    // Reserved arguments for HSAIL launch
    aclArg += ExtraArguments;
    for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
        desc.name_ = arguments_[i]->name_.c_str();
        desc.type_ = GetOclType(aclArg);
        desc.addressQualifier_ = GetOclAddrQual(aclArg);
        desc.accessQualifier_ = GetOclAccessQual(aclArg);
        desc.typeQualifier_ = GetOclTypeQual(aclArg);
        desc.typeName_ = arguments_[i]->typeName_.c_str();

        // Make a check if it is local or global
        if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
            desc.size_ = 0;
        }
        else {
            desc.size_ = GetOclSize(aclArg);
        }

        // Make offset alignment to match CPU metadata, since
        // in multidevice config abstraction layer has a single signature
        // and CPU sends the paramaters as they are allocated in memory
        size_t  size = desc.size_;
        if (size == 0) {
            // Local memory for CPU
            size = sizeof(cl_mem);
        }
        offset  = amd::alignUp(offset, std::min(size, size_t(16)));
        desc.offset_    = offset;
        offset          += amd::alignUp(size, sizeof(uint32_t));
        params.push_back(desc);

        if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
            flags_.imageEna_ = true;
            if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
                flags_.imageWriteEna_ = true;
            }
        }
    }

    createSignature(params);
}

void
HSAILKernel::initHsailArgs(const aclArgData* aclArg)
{
    int offset = 0;

    // Reserved arguments for HSAIL launch
    aclArg += ExtraArguments;

    // Iterate through the each kernel argument
    for (; aclArg->struct_size != 0; aclArg++) {
        Argument* arg = new Argument;
        // Initialize HSAIL kernel argument
        arg->name_      = aclArg->argStr;
        arg->typeName_  = aclArg->typeStr;
        arg->size_      = GetHSAILArgSize(aclArg);
        arg->offset_    = offset;
        arg->type_      = GetHSAILArgType(aclArg);
        arg->addrQual_  = GetHSAILAddrQual(aclArg);
        arg->dataType_  = GetHSAILDataType(aclArg);
        // If vector of args we add additional arguments to flatten it out
        arg->numElem_   = ((aclArg->type == ARG_TYPE_VALUE) &&
             (aclArg->arg.value.data != DATATYPE_struct)) ?
             aclArg->arg.value.numElements : 1;
        arg->alignment_ = GetHSAILArgAlignment(aclArg);
        offset += GetHSAILArgSize(aclArg);
        arguments_.push_back(arg);
    }
}

void
HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf)
{
    PrintfInfo  info;
    uint index = 0;
    for (; aclPrintf->struct_size != 0; aclPrintf++) {
        index = aclPrintf->ID;
        if (printf_.size() <= index) {
            printf_.resize(index + 1);
        }
        info.fmtString_ = aclPrintf->fmtStr;
        info.fmtString_ += "\n";
        uint32_t *tmp_ptr = const_cast<uint32_t*>(aclPrintf->argSizes);
        for (uint i = 0; i < aclPrintf->numSizes; i++ , tmp_ptr++) {
            info.arguments_.push_back(*tmp_ptr);
        }
        printf_[index] = info;
        info.arguments_.clear();
    }
}

HSAILKernel::HSAILKernel(std::string name,
    HSAILProgram* prog,
    std::string compileOptions)
    : device::Kernel(name)
    , compileOptions_(compileOptions)
    , dev_(prog->dev())
    , prog_(*prog)
    , index_(0)
    , code_(NULL)
    , hwMetaData_(NULL)
{
    hsa_ = true;
}

HSAILKernel::~HSAILKernel()
{
    while (!arguments_.empty()) {
        Argument* arg = arguments_.back();
        delete arg;
        arguments_.pop_back();
    }

    delete [] hwMetaData_;

    delete code_;
}

bool
HSAILKernel::init()
{
    acl_error error;
    //compile kernel down to ISA
    std::string openClKernelName("&__OpenCL_" + name() + "_kernel");

    std::string options(compileOptions_.c_str());
    options.append(" -just-kernel=");
    options.append(openClKernelName.c_str());
    // Get the ISA out
    size_t  size_isa;
    void*   shader_isa = NULL;
    error = aclCompile(dev().hsaCompiler(), prog().binaryElf(),
        options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL);
    if (error != ACL_SUCCESS) {
        LogError("Failed to finalize");
        return false;
    }

    shader_isa = const_cast<void *>(aclGetDeviceBinary(dev().hsaCompiler(),
        prog().binaryElf(), openClKernelName.c_str(), &size_isa, &error));
    if (shader_isa == NULL) {
        LogError("Failed find the ISA");
        return false;
    }

    aqlCreateHWInfo(shader_isa, size_isa);

    // Pull out metadata from the ELF
    size_t sizeOfArgList;
    error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
        RT_ARGUMENT_ARRAY, openClKernelName.c_str(), NULL, &sizeOfArgList);
    if (error != ACL_SUCCESS) {
        return false;
    }

    char* aclArgList = new char[sizeOfArgList];
    if (NULL == aclArgList) {
        return false;
    }
    error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
        RT_ARGUMENT_ARRAY, openClKernelName.c_str(), aclArgList, &sizeOfArgList);
    if (error != ACL_SUCCESS) {
        return false;
    }
    // Set the argList
    initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
    delete [] aclArgList;

    size_t sizeOfWorkGroupSize;
    error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
        RT_WORK_GROUP_SIZE, openClKernelName.c_str(), NULL, &sizeOfWorkGroupSize);
    if (error != ACL_SUCCESS) {
        return false;
    }
    error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
        RT_WORK_GROUP_SIZE, openClKernelName.c_str(),
        workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
    if (error != ACL_SUCCESS) {
        return false;
    }

    // Copy wavefront size
    workGroupInfo_.wavefrontSize_ = dev().getAttribs().wavefrontSize;
    // Find total workgroup size
    if (workGroupInfo_.compileSize_[0] != 0) {
        workGroupInfo_.size_ =
            workGroupInfo_.compileSize_[0] *
            workGroupInfo_.compileSize_[1] *
            workGroupInfo_.compileSize_[2];
    }
    else {
        workGroupInfo_.size_ = dev().info().maxWorkGroupSize_;
    }

    // Pull out printf metadata from the ELF
    size_t sizeOfPrintfList;
    error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
        RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), NULL, &sizeOfPrintfList);
    if (error != ACL_SUCCESS) {
        return false;
    }

    // Make sure kernel has any printf info
    if (0 != sizeOfPrintfList) {
        char* aclPrintfList = new char[sizeOfPrintfList];
        if (NULL == aclPrintfList) {
            return false;
        }
        error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
            RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList,
             &sizeOfPrintfList);
        if (error != ACL_SUCCESS) {
            return false;
        }

        // Set the PrintfList
        initPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
        delete [] aclPrintfList;
    }

    size_t sizeOfDevice;
    bool hasKernelEnqueue = false;
    error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
        RT_DEVICE_ENQUEUE, openClKernelName.c_str(),
        &hasKernelEnqueue, &sizeOfDevice);
    if (error != ACL_SUCCESS) {
        return false;
    }
    flags_.dynamicParallelism_ = hasKernelEnqueue;

    int index = -1;
    error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
        RT_KERNEL_INDEX, openClKernelName.c_str(),
        &index, &sizeOfDevice);
    if (error != ACL_SUCCESS) {
        return false;
    }
    index_ = static_cast<uint>(index);

    return true;
}

bool
HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const
{
    // Check if memory doesn't require reallocation
    bool    noRealloc = true;
        //amdMem->reallocedDeviceMemory(&dev()));

    return noRealloc;
}

const Device&
HSAILKernel::dev() const
{
    return reinterpret_cast<const Device&>(dev_);
}

const HSAILProgram&
HSAILKernel::prog() const
{
    return reinterpret_cast<const HSAILProgram&>(prog_);
}

void
HSAILKernel::findLocalWorkSize(
    size_t              workDim,
    const amd::NDRange& gblWorkSize,
    amd::NDRange& lclWorkSize) const
{
    // Initialize the default workgoup info
    // Check if the kernel has the compiled sizes
    if (workGroupInfo()->compileSize_[0] == 0) {
        // Find the default local workgroup size, if it wasn't specified
        if (lclWorkSize[0] == 0) {
            size_t  thrPerGrp;
            bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
            bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
                                  !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
            bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
                                  !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
                                  !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);

            bool overrideSet = ((workDim == 1) && b1DOverrideSet) ||
                               ((workDim == 2) && b2DOverrideSet) ||
                               ((workDim == 3) && b3DOverrideSet);
            if (!overrideSet) {
                // Find threads per group
                thrPerGrp = workGroupInfo()->size_;

                // Check if kernel uses images
                if (flags_.imageEna_ &&
                    // and thread group is a multiple value of wavefronts
                    ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
                    // and it's 2 or 3-dimensional workload
                    (workDim > 1) &&
                     ((dev().settings().partialDispatch_) ||
                       (((gblWorkSize[0] % 16) == 0) &&
                        ((gblWorkSize[1] % 16) == 0)))) {
                    // Use 8x8 workgroup size if kernel has image writes
                    if (flags_.imageWriteEna_ ||
                        (thrPerGrp != dev().info().maxWorkGroupSize_)) {
                        lclWorkSize[0] = 8;
                        lclWorkSize[1] = 8;
                    }
                    else {
                        lclWorkSize[0] = 16;
                        lclWorkSize[1] = 16;
                    }
                    if (workDim == 3) {
                        lclWorkSize[2] = 1;
                    }
                }
                else {
                    size_t  tmp = thrPerGrp;
                    // Split the local workgroup into the most efficient way
                    for (uint d = 0; d < workDim; ++d) {
                        size_t  div = tmp;
                        for (; (gblWorkSize[d] % div) != 0; div--);
                        lclWorkSize[d] = div;
                        tmp /= div;
                    }

                    // Check if partial dispatch is enabled and
                    if (dev().settings().partialDispatch_ &&
                         // we couldn't find optimal workload
                        (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
                        size_t  maxSize = 0;
                        size_t  maxDim = 0;
                        for (uint d = 0; d < workDim; ++d) {
                            if (maxSize < gblWorkSize[d]) {
                                maxSize = gblWorkSize[d];
                                maxDim = d;
                            }
                        }
                        // Check if a local workgroup has the most optimal size
                        if (thrPerGrp > maxSize) {
                            thrPerGrp = maxSize;
                        }
                        lclWorkSize[maxDim] = thrPerGrp;
                        for (uint d = 0; d < workDim; ++d) {
                            if (d != maxDim) {
                                lclWorkSize[d] = 1;
                            }
                        }
                    }
                }
            }
            else {
                // Use overrides when app doesn't provide workgroup dimensions
                if (workDim == 1) {
                        lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
                }
                else if (workDim == 2) {
                        lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
                        lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
                }
                else if (workDim == 3) {
                        lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
                        lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
                        lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
                }
                else
                {
                    assert(0 && "Invalid workDim!");
                }
            }
        }
    }
    else {
        for (uint d = 0; d < workDim; ++d) {
            lclWorkSize[d] = workGroupInfo()->compileSize_[d];
        }
    }
}

inline static void
WriteAqlArg(
    unsigned char** dst,//!< The write pointer to the buffer
    const void* src,    //!< The source pointer
    uint size,          //!< The size in bytes to copy
    uint alignment = 0  //!< The alignment to follow while writing to the buffer
    )
{
    if (alignment == 0) {
        *dst = amd::alignUp(*dst, size);
    }
    else {
        *dst = amd::alignUp(*dst, alignment);
    }
    memcpy(*dst, src, size);
    *dst += size;
}

HsaAqlDispatchPacket*
HSAILKernel::loadArguments(
    VirtualGPU&                     gpu,
    const amd::Kernel&              kernel,
    const amd::NDRangeContainer&    sizes,
    const_address                   parameters,
    bool                            nativeMem,
    uint64_t                        vmDefQueue,
    uint64_t*                       vmParentWrap,
    std::vector<const Resource*>&   memList) const
{
    static const bool WaitOnBusyEngine = true;
    uint64_t    ldsAddress = ldsSize();
    address     aqlArgBuf = gpu.cb(0)->sysMemCopy();
    address     aqlStruct = gpu.cb(1)->sysMemCopy();
    bool        srdResource = false;

    // The HLC generates 3 additional arguments for the global offsets
    //and fourth argument is the printf_buffer pointer
    size_t offsetSize[HSAILKernel::ExtraArguments] = { 0, 0, 0, 0, 0, 0 };
    for (uint i = 0; i < sizes.dimensions(); ++i) {
        offsetSize[i] = sizes.offset()[i];
    }

    if (dynamicParallelism()) {
        // Provide the host parent AQL wrap object to the kernel
        AmdAqlWrap*    wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
        memset(wrap, 0, sizeof(AmdAqlWrap));
        wrap->state = AQL_WRAP_BUSY;
        ConstBuffer* cb = gpu.constBufs_[1];
        cb->uploadDataToHw(sizeof(AmdAqlWrap));
        *vmParentWrap = cb->vmAddress() + cb->wrtOffset();
        offsetSize[4] = vmDefQueue;
        offsetSize[5] = *vmParentWrap;
        memList.push_back(cb);
    }

    // Check if the kernel may have printf output
    if ((printfInfo().size() > 0) &&
        // and printf buffer was allocated
        (gpu.printfDbgHSA().dbgBuffer() != NULL)) {
        offsetSize[3] = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
        memList.push_back(gpu.printfDbgHSA().dbgBuffer());
    }
    WriteAqlArg(&aqlArgBuf, offsetSize, sizeof(offsetSize), sizeof(size_t));

    const amd::KernelSignature& signature = kernel.signature();
    const amd::KernelParameters& kernelParams = kernel.parameters();

    // Find all parameters for the current kernel
    for (uint i = 0; i != signature.numParameters(); ++i) {
        const HSAILKernel::Argument* arg = argument(i);
        const amd::KernelParameterDescriptor& desc = signature.at(i);
        const_address paramaddr = parameters + desc.offset_;

        switch (arg->type_) {
        case HSAIL_ARGTYPE_POINTER:
            // If it is a global pointer
            if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) {

                Memory* gpuMem = NULL;
                amd::Memory* mem = NULL;

                if (kernelParams.boundToSvmPointer(dev(), parameters, i)) {
                    WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr));
                    mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast<void* const*>(paramaddr));
                    if (mem != NULL) {
                        gpuMem = dev().getGpuMemory(mem);
                        gpuMem->wait(gpu, WaitOnBusyEngine);
                        memList.push_back(gpuMem);
                    }
                    else {
                        return NULL;
                    }
                    break;
                }
                if (nativeMem) {
                    gpuMem = *reinterpret_cast<Memory* const*>(paramaddr);
                }
                else {
                        mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
                        if (mem != NULL) {
                             gpuMem = dev().getGpuMemory(mem);
                        }
                }
                if (gpuMem == NULL) {
                    WriteAqlArg(&aqlArgBuf, &gpuMem, sizeof(void*));
                    break;
                }

                //! @todo 64 bit isn't supported with 32 bit binary
                uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset();
                WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*));

                // Wait for resource if it was used on an inactive engine
                //! \note syncCache may call DRM transfer
                gpuMem->wait(gpu, WaitOnBusyEngine);

                //! @todo Compiler has to return read/write attributes
                if ((NULL != mem) &&
                    ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
                    mem->signalWrite(&dev());
                }
                memList.push_back(gpuMem);
            }
            // If it is a local pointer
            else {
                assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) &&
                    "Unsupported address type");
                ldsAddress = amd::alignUp(ldsAddress, arg->alignment_);
                WriteAqlArg(&aqlArgBuf, &ldsAddress, sizeof(size_t));
                ldsAddress += *reinterpret_cast<const size_t *>(paramaddr);
            }
            break;
        case HSAIL_ARGTYPE_VALUE:
            // Special case for structrues
            if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
                // Copy the current structre into CB1
                memcpy(aqlStruct, paramaddr, arg->size_);
                ConstBuffer* cb = gpu.constBufs_[1];
                cb->uploadDataToHw(arg->size_);
                // Then use a pointer in aqlArgBuffer to CB1
                uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset();
                WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*));
                memList.push_back(cb);
            }
            else {
                WriteAqlArg(&aqlArgBuf, paramaddr,
                    arg->numElem_ * arg->size_, arg->size_);
            }
            break;
        case HSAIL_ARGTYPE_IMAGE: {
            Image* image = NULL;
            amd::Memory* mem = NULL;
            if (nativeMem) {
                image = static_cast<Image*>(*reinterpret_cast<Memory* const*>(paramaddr));
            }
            else {
                mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
                if (mem == NULL) {
                    LogError( "The kernel image argument isn't an image object!");
                    return false;
                }
                image = static_cast<Image*>(dev().getGpuMemory(mem));
            }

            // Wait for resource if it was used on an inactive engine
            //! \note syncCache may call DRM transfer
            image->wait(gpu, WaitOnBusyEngine);

            if (dev().settings().hsailDirectSRD_) {
                // Image arguments are of size 48 bytes and aligned to 16 bytes
                WriteAqlArg(&aqlArgBuf, image->hwState(),
                    HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
            }
            else {
                uint64_t srd = image->hwSrd();
                WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
                srdResource = true;
            }

            //! @todo Compiler has to return read/write attributes
            if ((NULL != mem) &&
                ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
                mem->signalWrite(&dev());
            }

            memList.push_back(image);
            break;
        }
        case HSAIL_ARGTYPE_SAMPLER: {
            const amd::Sampler* sampler =
                *reinterpret_cast<amd::Sampler* const*>(paramaddr);
            const Sampler* gpuSampler = static_cast<Sampler*>
                    (sampler->getDeviceSampler(dev()));
            if (dev().settings().hsailDirectSRD_) {
                WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
                    HSA_SAMPLER_OBJECT_SIZE, HSA_SAMPLER_OBJECT_ALIGNMENT);
            }
            else {
                uint64_t srd = gpuSampler->hwSrd();
                WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
                srdResource = true;
            }
            break;
        }
        case HSAIL_ARGTYPE_QUEUE: {
            const amd::DeviceQueue* queue =
                *reinterpret_cast<amd::DeviceQueue* const*>(paramaddr);
            VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
            uint64_t vmQueue = gpuQueue->vQueue()->vmAddress();
            WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
            break;
        }
        default:
            LogError(" Unsupported address type ");
            return NULL;
        }
    }

    if (ldsAddress > dev().info().localMemSize_) {
        LogError("No local memory available\n");
        return NULL;
    }

    assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
        "Size and the number of arguments don't match!");
    uint  argBufSize = amd::alignUp(
        static_cast<uint>(argsBufferSize()), sizeof(uint32_t));
    HsaAqlDispatchPacket*   aqlPkt = reinterpret_cast<HsaAqlDispatchPacket*>(
        gpu.cb(0)->sysMemCopy() + argBufSize);

    amd::NDRange    local(sizes.local());
    amd::NDRange    global(sizes.global());

    // Initialize the Global, Local and Offset values
    aqlPkt->dimensions = sizes.dimensions();
    // Initialize the work grid parameter
    for (uint idx = 0; idx < 3; idx++) {
        aqlPkt->grid_size[idx] = 1;
        aqlPkt->workgroup_size[idx] = 1;
    }

    // Check if runtime has to find local workgroup size
    findLocalWorkSize(aqlPkt->dimensions, global, local);
    for (uint idx = 0; idx < aqlPkt->dimensions; idx++) {
        aqlPkt->grid_size[idx] = global[idx];
        aqlPkt->workgroup_size[idx] = local[idx];
    }

    // Initialize if dispatch should enable profiling
    aqlPkt->reserved2 = 0;  //config->profile ? 1:0;

    // Initialize kernel ISA and execution buffer requirements
    aqlPkt->kernel_object_address = gpuAqlCode()->vmAddress();
    aqlPkt->group_segment_size_bytes = ldsAddress - ldsSize();
    aqlPkt->private_segment_size_bytes = spillSegSize();

    // Initialize cache flush configuration for the dispatch
    //! @todo Currently not used in emulation
    aqlPkt->barrier = 1;
    aqlPkt->release_fence_scope = 1;
    aqlPkt->acquire_fence_scope = 2;
    aqlPkt->invalidate_instruction_cache = 1;

    ConstBuffer* cb = gpu.constBufs_[0];
    cb->uploadDataToHw(argBufSize + sizeof(HsaAqlDispatchPacket));
    uint64_t argList = cb->vmAddress() + cb->wrtOffset();
    aqlPkt->kernel_arg_address = argList;
    memList.push_back(cb);
    memList.push_back(gpuAqlCode());
    if (NULL != prog().globalStore()) {
        memList.push_back(prog().globalStore());
    }
    if (cpuAqlCode_->enable_sgpr_queue_ptr) {
        memList.push_back(gpu.hsaQueueMem());
    }

    if (srdResource) {
        dev().srds().fillResourceList(memList);
    }

    return aqlPkt;
}

} // namespace gpu