rocm-systems/rocclr/runtime/device/gpu/gpukernel.hpp

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//

#ifndef GPUKERNEL_HPP_
#define GPUKERNEL_HPP_

#include "device/device.hpp"
#include "utils/macros.hpp"
#include "platform/command.hpp"
#include "platform/program.hpp"
#include "platform/kernel.hpp"
#include "platform/sampler.hpp"
#include "device/gpu/gpudevice.hpp"
#include "device/gpu/gpuvirtual.hpp"
#include "amd_hsa_kernel_code.h"
#include "device/gpu/gpuprintf.hpp"
#include "device/gpu/gpuwavelimiter.hpp"
#include "hsa.h"

// Forward declaration of the loader symbol type. In this header it is only
// used through pointers (HSAILKernel::init and HSAILKernel::aqlCreateHWInfo),
// so the full definition is not required here.
namespace amd {
namespace hsa {
namespace loader {
class Symbol;
} // loader
} // hsa
} // amd

//! \namespace gpu GPU Device Implementation
namespace gpu {

// Forward declarations of gpu namespace classes referenced by the
// declarations that follow in this header.
class VirtualGPU;
class Device;
class NullDevice;
class HSAILProgram;

//! Helper for resolving an offset, stored in a pointer-typed value, against a base address
struct HWSHADER_Helper
{
    /*! \brief Adds the numeric value of \a offset to the address \a base
     *
     *  \return The resulting address, reinterpreted as the type of \a offset
     */
    template <typename S, typename T>
    static T Get(S base, T offset)
    {
        const intptr_t  baseAddress = reinterpret_cast<intptr_t>(base);
        const size_t    byteOffset  = reinterpret_cast<size_t>(offset);
        return reinterpret_cast<T>(baseAddress + byteOffset);
    }
};

//! Passes (shader) and the value of (shader)->field to HWSHADER_Helper::Get,
//! i.e. treats the field's value as an offset relative to the shader address
#define HWSHADER_Get(shader, field) \
    HWSHADER_Helper::Get((shader), (shader)->field)

/*! \brief Computes the address located (structSize * size) address units
 *  past \a src and stores it, reinterpreted as type D, into \a dst
 */
template <typename D, typename S>
static void CalcPtr(D& dst, const S src, size_t structSize, size_t size)
{
    const size_t    byteOffset = structSize * size;
    const intptr_t  srcAddress = reinterpret_cast<const intptr_t>(src);
    dst = reinterpret_cast<D>(srcAddress + byteOffset);
}

/*! \addtogroup GPU GPU Device Implementation
 *  @{
 */

/*! \brief Helper function for the std::string processing.
 *  Looks for the expected symbol in the std::string
 *
 *  NOTE(review): \a pos is passed by pointer, so it is presumably updated by
 *  the call - confirm against the definition, which is not in this header.
 *
 *  \return True if we found the entry of the symbols
 */
bool expect(
    const std::string&  str,    //!< The original std::string
    size_t*             pos,    //!< Position to start
    const std::string&  sym     //!< The symbols to expect
    );

/*! \brief Helper function for the std::string processing.
 *  Gets a word from the std::string
 *
 *  NOTE(review): no buffer size is passed for \a sym, so the caller must
 *  provide a buffer large enough for the word - the required size is not
 *  visible from this declaration.
 *
 *  \return True if we successfully received a word
 */
bool getword(
    const std::string&  str,    //!< The original std::string
    size_t*             pos,    //!< Position to start
    char*               sym     //!< Returned word
    );

/*! \brief Helper function for the std::string processing.
 *  Loads an unsigned integer from the metadata
 *
 *  \return True if we loaded a number
 */
bool getuint(
    const std::string&  str,    //!< The original std::string
    size_t*             pos,    //!< Position to start
    uint*               val     //!< Returned number
    );

/*! \brief Helper function for the std::string processing.
 *  Loads an unsigned integer from the metadata in HEX format
 *
 *  \return True if we loaded a number
 */
bool getuintHex(
    const std::string&  str,    //!< The original std::string
    size_t*             pos,    //!< Position to start
    uint*               val     //!< Returned number
    );

/*! \brief Helper function for the std::string processing.
 *  Loads a 64-bit unsigned integer from the metadata in HEX format
 *
 *  \return True if we loaded a number
 */
bool getuint64Hex(
    const std::string&  str,    //!< The original std::string
    size_t*             pos,    //!< Position to start
    uint64_t*           val     //!< Returned 64-bit number
    );

/*! \brief Helper function for the std::string processing.
 *  Converts an unsigned integer to a string written into the caller's buffer
 *
 *  \return None
 */
void intToStr(
    size_t  value,          //!< Value for conversion
    char*   str,            //!< Pointer to the converted string
    size_t  size            //!< String size
    );

//! Image constant data from ABI specification.
//! The dimensions are stored twice: as 32-bit unsigned integers and as floats.
//! Kernel::copyImageConstants() takes a pointer to this structure as its
//! destination in the constant buffer.
struct ImageConstants : public amd::EmbeddedObject
{
    uint32_t    width_;         //!< Image surface width
    uint32_t    height_;        //!< Image surface height
    uint32_t    depth_;         //!< Image surface depth (1 for 2D images)
    uint32_t    dataType_;      //!< Image surface data type
    float       widthFloat_;    //!< Image surface width as a float
    float       heightFloat_;   //!< Image surface height as a float
    float       depthFloat_;    //!< Image surface depth as a float (1 for 2D images)
    uint32_t    channelOrder_;  //!< Image surface texels channel order
};

//! Kernel arguments.
//! Holds the compiler metadata fields of a single entry together with
//! accessors that translate them to the abstraction layer (cl_*) values.
struct KernelArg : public amd::HeapObject
{
public:
    //! \enum Kernel argument type
    //! The enumerators are implicitly numbered from None = 0, so their order
    //! defines their values; TotalTypes is the number of enumerators before it.
    enum ArgumentType
    {
        None    = 0,
        PointerGlobal,
        Value,
        Image,
        PointerLocal,
        PointerHwLocal,
        PointerPrivate,
        PointerHwPrivate,
        PointerConst,
        PointerHwConst,
        Float,
        Double,
        Half,
        Char,
        UChar,
        Short,
        UShort,
        Int,
        UInt,
        Long,
        ULong,
        Struct,
        Union,
        Opaque,
        Event,
        Image1D,    //!< first image
        Image2D,
        Image1DB,
        Image1DA,
        Image2DA,
        Image3D,    //!< last image
        Counter,
        Sampler,
        PrivateSize,
        LocalSize,
        HwPrivateSize,
        HwLocalSize,
        Grouping,
        WrkgrpSize,
        Wavefront,
        PrivateFixed,
        ErrorMessage,
        WarningMessage,
        PrintfFormatStr,
        MetadataVersion,
        UavId,
        ABI64Bit,
        GWS,
        SWGWS,
        Reflection,
        ConstArg,
        ConstBufId,
        PrintfBufId,
        GroupingHint,
        VecTypeHint,
        LimitWave,
        TotalTypes
    };

    // The compiler metadata fields
    std::string     name_;      //!< parameters name
    ArgumentType    type_;      //!< type of argument
    // size_ and location_ share the same storage; which one is meaningful
    // depends on the kind of argument, as described on each member.
    union {
        uint        size_;      //!< number of arguments (for values and pointers only)
        uint        location_;  //!< sampler's location (for samplers only)
    };
    uint            cbIdx_;     //!< constant buffer index
    uint            cbPos_;     //!< dword address in CB for the argument
    std::string     buf_;       //!< buffer tag
    uint            index_;     //!< buffer/image/sampler index
    uint            alignment_; //!< the required argument's alignment
    ArgumentType    dataType_;  //!< data type of the argument
    // Memory attributes of the argument; value_ aliases all of the flag bits
    // so that they can be read or written as a single integer.
    union {
        struct {
            uint    uavBuf_    : 1; //!< UAV memory, no global heap
            uint    realloc_   : 1; //!< argument has to be reallocated in the global heap
            uint    readOnly_  : 1; //!< Read only memory object
            uint    writeOnly_ : 1; //!< Write only memory object
            uint    readWrite_ : 1; //!< Read/Write memory object
        };
        uint    value_;             //!< All memory flags as one value
    } memory_;

    std::string     typeName_;      //!< argument's type name
    uint            typeQualifier_; //!< argument's type qualifier

    //! Default constructor for the kernel argument
    KernelArg();

    //! Copy constructor for the kernel argument
    KernelArg(const KernelArg& data);

    //! Overloads operator=
    KernelArg& operator=(const KernelArg& data);

    //! Destructor of the kernel argument; explicitly clears name_
    ~KernelArg() { name_.clear(); }

    /*! \brief Checks if this arguments requires a place in constant buffer
     *
     *  \return True if we need CB
     */
    bool isCbNeeded() const;

    /*! \brief Retrieves the argument's size
     *
     *  \return Size of the current argument
     */
    size_t size(
        bool gpuLayer   //!< True if we want the argument's size for the GPU layer
        ) const;

    /*! \brief Retrieves the argument's type for the abstraction layer
     *
     *  \return The argument's type in the abstraction layer format
     */
    clk_value_type_t type() const;

    /*! \brief Retrieves the argument's address qualifier for the abstraction layer
     *
     *  \return The argument's address qualifier in the abstraction layer format
     */
    cl_kernel_arg_address_qualifier addressQualifier() const;

    /*! \brief Retrieves the argument's access qualifier for the abstraction layer
     *
     *  \return The argument's access qualifier in the abstraction layer format
     */
    cl_kernel_arg_access_qualifier accessQualifier() const;

    /*! \brief Retrieves the argument's type name for the abstraction layer
     *
     *  \return The argument's type name; the pointer refers to the storage of
     *  typeName_ and is therefore only valid while that string is unchanged
     */
    const char* typeName() const { return typeName_.c_str(); }

    /*! \brief Retrieves the argument's type qualifier for the abstraction layer
     *
     *  For PointerConst and PointerHwConst arguments CL_KERNEL_ARG_TYPE_CONST
     *  is OR-ed into the stored qualifier; all other argument types return the
     *  stored qualifier unchanged.
     *
     *  \return The argument's type qualifier
     */
    cl_kernel_arg_type_qualifier typeQualifier() const
    {
        switch (type_) {
        case PointerConst:
        case PointerHwConst:
            return static_cast<cl_kernel_arg_type_qualifier>(typeQualifier_ |
                CL_KERNEL_ARG_TYPE_CONST);
        default:
            return static_cast<cl_kernel_arg_type_qualifier>(typeQualifier_);
        }
    }

    //! Special case for vectors with component size <= 16bit
    const static uint VectorSizeLimit = 4;
    size_t specialVector() const;
};

//! Pairs a data type tag name with its KernelArg::ArgumentType value;
//! this is the element type of the DataType table declared in this header
struct DataTypeConst
{
    const char*     tagName_;       //!< data type's name
    KernelArg::ArgumentType type_;  //!< data type
};

//!  Metadata description for parsing.
//!  This is the element type of the ArgState table. The anonymous struct packs
//!  six 1-bit fields plus 26 reserved bits (32 bits in total).
//!  NOTE(review): each 1-bit field presumably tells the parser whether the
//!  corresponding KernelArg field is present for this entry - confirm against
//!  the parser implementation.
struct MetaDataConst
{
    const char*     typeName_;      //!< parameters name
    KernelArg::ArgumentType type_;  //!< type of argument
    struct
    {
        uint    size_   :  1;   //!< number of arguments
        uint    name_   :  1;   //!< argument's name
        uint    resType_:  1;   //!< argument's type
        uint    cbIdx_  :  1;   //!< resource index CB, sampler or image
        uint    cbPos_  :  1;   //!< dword address in CB for the argument
        uint    buf_    :  1;   //!< buffer tag
        uint    reserved: 26;   //!< reserved
    };
};

// Element counts for the ArgState table: ArgStateTotal is the sum of the two
// partial counts and is the declared size of the ArgState array below.
// NOTE(review): DescTotal and BasicTypeTotal presumably count the descriptor
// entries and the basic data type entries respectively - confirm against the
// table definition, which is not in this header.
const uint DescTotal        = 15;
const uint BasicTypeTotal   = 15;
const uint ArgStateTotal    = DescTotal + BasicTypeTotal;

//! The constant array that describes different metadata properties
extern const MetaDataConst ArgState[ArgStateTotal];

//! Table of data type tag names and their argument types (defined elsewhere)
extern const DataTypeConst DataType[];

//! Presumably the number of entries in DataType - confirm against the definition
extern const uint DataTypeTotal;

// Forward declaration
class Program;
class NullProgram;

//! Reference counted holder of a CAL kernel image.
//! The destructor is protected and copying is disabled, so instances cannot
//! be destroyed directly or copied by outside code.
class CalImageReference : public amd::ReferenceCountedObject
{
public:
    //! Constructs the reference object around the provided CAL image
    CalImageReference(CALimage calImage): image_(calImage) {}

    //! Get CAL image
    CALimage calImage() const { return image_; }

protected:
    //! Destructor; protected, so it is not callable from outside code
    ~CalImageReference();

private:
    //! Disable copy constructor
    CalImageReference(const CalImageReference&);

    //! Disable operator=
    CalImageReference& operator=(const CalImageReference&);

    CALimage    image_; //!< CAL kernel image
};

//! \class GPU NullKernel - Kernel for offline device
//! Holds the parsed kernel arguments, internal samplers, kernel flags and the
//! CAL image reference. gpu::Kernel derives from this class.
class NullKernel : public device::Kernel
{
public:
    typedef std::vector<KernelArg*> arguments_t;

    const static uint UavIdUndefined = 0xffff;

    //! \enum Kernel flags; each enumerator is a single bit stored in flags_
    enum Flags {
        LimitWorkgroup  = 1 << 0,   //!< Limits the workgroup size
        PrintfOutput    = 1 << 1,   //!< Kernel has printf output
        PrivateFixed    = 1 << 2,   //!< Presumably: kernel uses a fixed private size (TODO confirm)
        ABI64bit        = 1 << 3,   //!< Kernel has 64 bit ABI
        Unused0         = 1 << 4,   //!< Unused
        Unused1         = 1 << 5,   //!< Unused
        ImageEnable     = 1 << 6,   //!< Kernel uses images
        ImageWrite      = 1 << 7,   //!< Kernel writes images
    };

    //! \enum Resource type for binding
    enum ResourceType
    {
        Undefined           = 0x00000000,   //!< resource type will be detected
        ConstantBuffer      = 0x00000001,   //!< resource is a constant buffer
        GlobalBuffer        = 0x00000002,   //!< resource is a global buffer
        GlobalBufferArena   = 0x00000003,   //!< resource is a global buffer
        ArgumentHeapBuffer  = 0x00000004,   //!< resource is an argument buffer
        ArgumentBuffer      = 0x00000005,   //!< resource is an argument buffer
        ArgumentImageRead   = 0x00000006,   //!< resource is an argument image read
        ArgumentImageWrite  = 0x00000007,   //!< resource is an argument image write
        ArgumentConstBuffer = 0x00000008,   //!< resource is an argument const buffer
        ArgumentCounter     = 0x00000009,   //!< resource is a global counter
        ArgumentUavID       = 0x0000000a,   //!< resource is a dummy ID read
        ArgumentCbID        = 0x0000000b,   //!< resource is a constant buffer
        ArgumentPrintfID    = 0x0000000c,   //!< resource is a printf buffer
    };

    //! GPU kernel constructor
    NullKernel(
        const std::string&  name,           //!< The kernel's name
        const NullDevice&   gpuNullDev,     //!< GPU device object
        const NullProgram&  nullProg        //!< Reference to the program
        );

    virtual ~NullKernel();

    /*! \brief Creates a GPU kernel in CAL
     *
     *  \return True if we successfully created a kernel in CAL
     */
    bool create(
        const std::string&  code,       //!< IL source code
        const std::string&  metadata,   //!< the kernel metadata structure
        const void* binaryCode = NULL,  //!< binary machine code for CAL
        size_t      binarySize = 0      //!< the machine code size
        );

    //! Returns CAL function descriptor; calRef_ is dereferenced without a
    //! null check, so it must be valid when this is called
    CALimage calImage() const { return calRef_->calImage(); }

    //! Returns TRUE if we successfully retrieved the binary from CAL
    bool getCalBinary(
        void*   binary,     //!< ISA binary code
        size_t  size        //!< ISA binary size
        ) const;

    //! Returns CAL image size
    size_t getCalBinarySize() const;

    //! Returns GPU device object, associated with this kernel
    const NullDevice& nullDev() const { return gpuDev_; }

    //! Returns the parent program object, associated with this kernel
    const NullProgram& nullProg() const { return prog_; }

    //! Returns the kernel's build error
    const cl_int buildError() const { return buildError_; }

    //! Returns the kernel's flags
    uint flags() const { return flags_; }

    //! Returns TRUE if ABI is for 64 bits
    bool abi64Bit() const { return (flags_ & ABI64bit) ? true : false; }

    //! Returns the total number of all arguments
    size_t argSize() const { return arguments_.size(); }

    //! Returns instruction count of the current kernel
    uint  instructionCnt() const { return instructionCnt_; }

protected:
    //! Returns TRUE if memory should be reallocated, returns FALSE always for NullDevice
    virtual bool isRealloc() const { return false; }

    /*! \brief Parses the metadata structure for the kernel,
     *   provided by the OpenCL compiler
     *
     *  \return True if we successfully parsed all arguments
     */
    bool parseArguments(
        const std::string&  metaData,   //!< the program for parsing
        uint*               uavRefCount //!< an array of reference counters for used UAVs
        );

    //! Returns the argument for the specified index (the index is not range checked)
    const KernelArg* argument(uint idx) const { return arguments_[idx]; }

    //! Adds the kernel argument into the list
    void addArgument(KernelArg* arg) { arguments_.push_back(arg); }

    //! Returns the argument for the specified sampler's index (the index is not range checked)
    const KernelArg* sampler(uint idx) const { return intSamplers_[idx]; }

    //! Returns the total number of all internal samplers
    size_t samplerSize() const { return intSamplers_.size(); }

    //! Adds the kernel sampler into the sampler's list
    void addSampler(KernelArg* arg) { intSamplers_.push_back(arg); }

    //! Returns UAV raw index for this kernel
    uint uavRaw() const { return uavRaw_; }

    //! Returns UAV arena index for this kernel
    uint uavArena() const { return uavArena_; }

    cl_int          buildError_;    //!< Kernel's build error
    std::string     ilSource_;      //!< IL source code of this kernel

    const NullDevice&   gpuDev_;    //!< GPU device object
    const NullProgram&  prog_;      //!< Reference to the parent program

    CalImageReference*  calRef_;    //!< CAL image reference for this kernel
    bool            internal_;      //!< Runtime internal kernel

    uint            flags_;         //!< kernel object flags
    arguments_t     arguments_;     //!< kernel arguments for the execution
    arguments_t     intSamplers_;   //!< predefined internal kernel samplers

    size_t*         cbSizes_;       //!< real constant buffer sizes for this kernel
    uint            numCb_;         //!< total number of constant buffers

    uint            uavRaw_;        //!< UAV used for RAW access
    uint            uavArena_;      //!< UAV used for arena access

    bool            rwAttributes_;  //!< backend provides RW attributes for arguments

    uint            instructionCnt_;//!< Instruction count

    uint            cbId_;          //!< UAV used for constant buffer access
    uint            printfId_;      //!< UAV used for printf buffer access

private:
    //! Disable copy constructor
    NullKernel(const NullKernel&);

    //! Disable operator=
    NullKernel& operator=(const NullKernel&);

    //! Creates a filename for ISA/IL dumps
    std::string mkDumpName(
        const char* extension       //!< File extension to append
        ) const;

    //! Creates a multibinary image; the image and its size are returned
    //! through the pointer parameters
    bool createMultiBinary(
        uint*       imageSize,  //!< Multibinary image size
        void**      image,      //!< Multibinary image
        const void* isa         //!< Kernel HW info
        );

    //! SI HW specific setup for kernels
    bool siCreateHwInfo(
        const void*         shader,     //!< HW info shader
        AMUabiAddEncoding&  encoding    //!< ABI encoding structure
        );

    //! r800 HW specific setup for kernels
    bool r800CreateHwInfo(
        const void*         shader,     //!< HW info shader
        AMUabiAddEncoding&  encoding    //!< ABI encoding structure
        );
};

//! \class GPU kernel
//! Extends NullKernel with the operations that need a VirtualGPU object:
//! argument loading, resource binding and kernel execution.
class Kernel : public NullKernel
{
public:
    //! Initialization data passed to the kernel constructor
    struct InitData {
        uint    privateSize_;       //!< Private ring initial size
        uint    localSize_;         //!< Local ring initial size
        uint    hwPrivateSize_;     //!< HW private ring initial size
        uint    hwLocalSize_;       //!< HW local ring initial size
        uint    flags_;             //!< Kernel initialization flags
    };

    //! GPU kernel constructor
    Kernel(
        const std::string&  name,       //!< The kernel's name
        const Device&       gpuDev,     //!< GPU device object
        const Program&      prog,       //!< Reference to the program
        const InitData*     initData_   //!< Initialization data
        );

    //! GPU kernel destructor
    virtual ~Kernel();

    /*! \brief Creates a GPU kernel in CAL
     *
     *  Has the same signature as NullKernel::create(), which is not virtual,
     *  so this declaration hides the base class version.
     *
     *  \return True if we successfully created a kernel in CAL
     */
    bool create(
        const std::string&  code,       //!< IL source code
        const std::string&  metadata,   //!< the kernel metadata structure
        const void* binaryCode = NULL,  //!< binary machine code for CAL
        size_t      binarySize = 0      //!< the machine code size
        );

    //! Validates memory argument
    virtual bool validateMemory(
        uint    idx,            //!< Argument's index
        amd::Memory* amdMem     //!< AMD memory object for validation
        ) const ;

    //! Initializes the CAL program grid for the kernel execution
    void setupProgramGrid(
        VirtualGPU&         gpu,            //!< virtual GPU device object
        size_t              workDim,        //!< work dimension
        const amd::NDRange& glbWorkOffset,  //!< global work offset
        const amd::NDRange& gblWorkSize,    //!< global work size
        amd::NDRange&       lclWorkSize,    //!< local work size
        const amd::NDRange& groupOffset,    //!< group offsets
        const amd::NDRange& glbWorkOffsetOrg,   //!< presumably the original global work offset (TODO confirm)
        const amd::NDRange& glbWorkSizeOrg  //!< original global work size
        ) const;

    /*! \brief Detects if runtime has to disable cache optimization and
     *   recompiles the kernel
     *
     *  \return True if aliases were detected in the kernel arguments
     */
    bool processMemObjects(
        VirtualGPU&         gpu,        //!< Virtual GPU objects - queue
        const amd::Kernel&  kernel,     //!< AMD kernel object for execution
        const_address       params,     //!< pointer to the param's store
        bool                nativeMem   //!< Native memory objects
        ) const;

    /*! \brief Loads all kernel arguments, so we could run the kernel in HW.
     *  This includes CB update and resource binding
     *
     *  \return True if we successfully loaded the arguments
     */
    bool loadParameters(
        VirtualGPU&         gpu,    //!< virtual GPU device object
        const amd::Kernel&  kernel, //!< AMD kernel object for execution
        const_address       params, //!< pointer to the param's store
        bool            nativeMem   //!< Native memory objects
        ) const;

    //! Binds the constant buffers associated with the kernel
    bool bindConstantBuffers(VirtualGPU& gpu) const;

    /*! \brief Runs the kernel on HW
     *
     *  \return True if we successfully executed the kernel
     */
    bool run(
        VirtualGPU& gpu,        //!< virtual GPU device object
        GpuEvent*   gpuEvent,   //!< Pointer to the GPU event
        bool        lastRun     //!< Last run in the split execution
        ) const;

    //! Help function to debug the kernel output
    void debug(
        VirtualGPU& gpu     //!< virtual GPU device object
        ) const;

    //! Programs internal samplers defined inside the kernel
    bool setInternalSamplers(
        VirtualGPU&   gpu   //!< Virtual GPU device object
        ) const;

    //! Returns TRUE if we successfully retrieved the binary from CAL
    bool getCalBinary(
        void*   binary,     //!< ISA binary code
        size_t  size        //!< ISA binary size
        ) const;

    //! Returns CAL image size
    size_t getCalBinarySize() const;

    //! Returns GPU device object, associated with this kernel
    const Device& dev() const;

    //! Returns the program object, associated with this kernel
    const Program& prog() const;

    //! Binds global HW constant buffers
    bool bindGlobalHwCb(
        VirtualGPU& gpu,                    //!< Virtual GPU device object
        VirtualGPU::GslKernelDesc*  desc    //!< Kernel descriptor
        ) const;

    //! Get profiling callback object; the request is forwarded to waveLimiter_
    virtual amd::ProfilingCallback* getProfilingCallback(
        const device::VirtualDevice *vdev){
        return waveLimiter_.getProfilingCallback(vdev);
    }

protected:
    //! Initializes the kernel parameters for the abstraction layer
    bool initParameters();

    /*! \brief Creates constant buffer resources, associated with the kernel
     *
     *  \return TRUE if we successfully created constant buffers
     */
    bool initConstBuffers();

    //! Returns TRUE if memory should be reallocated, which is the case when
    //! the device heap is not virtual
    virtual bool isRealloc() const { return !dev().heap()->isVirtual(); }

private:
    //! Disable copy constructor
    Kernel(const Kernel&);

    //! Disable operator=
    Kernel& operator=(const Kernel&);

    //! \enum Fixed Metadata offsets
    enum MetadataOffsets
    {
        GlobalWorkitemOffset            = 0,
        LocalWorkitemOffset             = 1,
        GroupsOffset                    = 2,
        PrivateRingOffset               = 3,
        LocalRingOffset                 = 4,
        MathLibOffset                   = 5,
        GlobalWorkOffsetOffset          = 6,
        GroupWorkOffsetOffset           = 7,
        // NOTE(review): the next two enumerators share the value 8 - confirm
        // that the overlap is intentional.
        GlobalDataStoreOffset           = 8,
        DebugOffset                     = 8,
        NDRangeGlobalWorkOffsetOffset   = 9,

        // The total number of constants reserved for ABI
        // (implicitly one more than the preceding enumerator, i.e. 10)
        TotalABIVectors
    };

    /*! \brief Sets the kernel argument
     *
     *  \return True if we successfully updated the arguments
     */
    bool setArgument(
        VirtualGPU& gpu,    //!< Virtual GPU device object
        uint        idx,    //!< the argument index
        const void* param,  //!< the arguments data
        size_t      size,   //!< size of the provided data
        bool    nativeMem   //!< Native memory objects
        ) const;

    /*! \brief Initializes local and private buffer ranges
     *
     *  \return True if we successfully initialized the ranges
     */
    bool initLocalPrivateRanges(
        VirtualGPU& gpu     //!< Virtual GPU device object
        ) const;

    //! Sets local and private buffer ranges
    void setLocalPrivateRanges(
        VirtualGPU& gpu     //!< Virtual GPU device object
        ) const;

    //! Sets the sampler's parameters for the image look-up
    void setSampler(
        VirtualGPU& gpu,        //!< virtual GPU device object
        uint32_t    state,      //!< sampler state
        uint        physUnit    //!< sampler's number
        ) const;

    /*! \brief Binds resource
     *
     *  \return Presumably true if the resource was bound successfully
     *  (TODO confirm against the definition)
     */
    bool bindResource(
        VirtualGPU&     gpu,        //!< virtual GPU device object
        const Resource& resource,   //!< resource for binding
        uint            paramIdx,   //!< index of the parameter
        ResourceType    type,       //!< resource type
        uint            physUnit,   //!< PhysUnit
        Memory*         memory = NULL,  //!< GPU layer memory object
        size_t          offset = 0
        ) const;

    //! Unbinds all resources for the kernel
    void unbindResources(
        VirtualGPU& gpu,        //!< virtual GPU device object
        GpuEvent    gpuEvent,   //!< GPU event that will be associated with the resources
        bool        lastRun     //!< last run in the split execution
        ) const;

    //! Returns true if arena setup was successful
    bool setupArenaAliases(
        VirtualGPU& gpu,            //!< Virtual GPU device object
        const Resource& resource    //!< Resource for aliases setup
        ) const;

    //! Copies image constants to the constant buffer
    void copyImageConstants(
        const amd::Image*   amdImage,   //!< Abstraction layer image object
        ImageConstants*     imageData   //!< Pointer in CB to the image constants
        ) const;

    //! Finds local workgroup size
    void findLocalWorkSize(
        size_t      workDim,            //!< Work dimension
        const amd::NDRange& gblWorkSize,//!< Global work size
        amd::NDRange& lclWorkSize       //!< Local work size
        ) const;

    uint    hwPrivateSize_;     //!< initial HW private size
    uint    hwLocalSize_;       //!< initial HW local size

    //! @todo remove the blit kernel hack
    bool    blitKernelHack_;    //!< No VM hack for kernel blit

    WaveLimiterManager waveLimiter_; //!< adaptively control number of waves
};

//! Address qualifier of an HSAIL kernel argument
enum HSAIL_ADDRESS_QUALIFIER
{
    HSAIL_ADDRESS_ERROR     = 0,
    HSAIL_ADDRESS_GLOBAL    = 1,
    HSAIL_ADDRESS_LOCAL     = 2,
    HSAIL_MAX_ADDRESS_QUALIFIERS    //!< Number of enumerators listed above
};

//! Type of an HSAIL kernel argument
enum HSAIL_ARG_TYPE
{
    HSAIL_ARGTYPE_ERROR     = 0,
    HSAIL_ARGTYPE_POINTER   = 1,
    HSAIL_ARGTYPE_VALUE     = 2,
    HSAIL_ARGTYPE_IMAGE     = 3,
    HSAIL_ARGTYPE_SAMPLER   = 4,
    HSAIL_ARGTYPE_QUEUE     = 5,
    HSAIL_ARGMAX_ARG_TYPES          //!< Number of enumerators listed above
};

//! Data type of an HSAIL kernel argument
enum HSAIL_DATA_TYPE
{
    HSAIL_DATATYPE_ERROR    = 0,
    HSAIL_DATATYPE_B1       = 1,
    HSAIL_DATATYPE_B8       = 2,
    HSAIL_DATATYPE_B16      = 3,
    HSAIL_DATATYPE_B32      = 4,
    HSAIL_DATATYPE_B64      = 5,
    HSAIL_DATATYPE_S8       = 6,
    HSAIL_DATATYPE_S16      = 7,
    HSAIL_DATATYPE_S32      = 8,
    HSAIL_DATATYPE_S64      = 9,
    HSAIL_DATATYPE_U8       = 10,
    HSAIL_DATATYPE_U16      = 11,
    HSAIL_DATATYPE_U32      = 12,
    HSAIL_DATATYPE_U64      = 13,
    HSAIL_DATATYPE_F16      = 14,
    HSAIL_DATATYPE_F32      = 15,
    HSAIL_DATATYPE_F64      = 16,
    HSAIL_DATATYPE_STRUCT   = 17,
    HSAIL_DATATYPE_OPAQUE   = 18,
    HSAIL_DATATYPE_MAX_TYPES        //!< Number of enumerators listed above
};


//! \class HSAIL kernel
//! Holds the HSAIL argument list, the AQL kernel code (CPU pointer and GPU
//! memory object), printf info and kernel flags for a kernel that belongs to
//! an HSAILProgram.
class HSAILKernel : public device::Kernel
{
public:
    //! Description of a single HSAIL kernel argument
    struct Argument
    {
        std::string name_;          //!< Argument's name
        std::string typeName_;      //!< Argument's type name
        uint        size_;          //!< Size in bytes
        uint        offset_;        //!< Argument's offset
        uint        alignment_;     //!< Argument's alignment
        HSAIL_ARG_TYPE type_;       //!< Type of the argument
        HSAIL_ADDRESS_QUALIFIER addrQual_;  //!< Address qualifier of the argument
        HSAIL_DATA_TYPE dataType_;  //!< The type of data
        uint        numElem_;       //!< Number of elements
    };

    // Global offsets located in the first 3 elements
    static const uint ExtraArguments = 6;

    //! HSAIL kernel constructor
    HSAILKernel(std::string name,
        HSAILProgram* prog,
        std::string compileOptions);

    virtual ~HSAILKernel();

    //! Initializes the metadata required for this kernel,
    //! finalizes the kernel if needed
    bool init(amd::hsa::loader::Symbol *sym, bool finalize = false);

    //! Returns true if memory is valid for execution
    virtual bool validateMemory(uint idx, amd::Memory* amdMem) const;

    //! Returns a pointer to the hsail argument (the index is not range checked)
    const Argument* argument(size_t i) const { return arguments_[i]; }

    //! Returns the number of hsail arguments
    size_t numArguments() const { return arguments_.size(); }

    //! Returns GPU device object, associated with this kernel
    const Device& dev() const;

    //! Returns HSA program associated with this kernel
    const HSAILProgram& prog() const;

    //! Returns LDS size used in this kernel
    //! (reads cpuAqlCode_ without a null check)
    uint32_t ldsSize() const
        { return cpuAqlCode_->workgroup_group_segment_byte_size; }

    //! Returns pointer on CPU to AQL code info
    const void* cpuAqlCode() const { return cpuAqlCode_; }

    //! Returns memory object with AQL code
    gpu::Memory* gpuAqlCode() const { return code_; }

    //! Returns size of AQL code
    size_t aqlCodeSize() const { return codeSize_; }

    //! Returns the size of argument buffer
    //! (reads cpuAqlCode_ without a null check)
    size_t argsBufferSize() const
        { return cpuAqlCode_->kernarg_segment_byte_size; }

    //! Returns spill reg size per workitem
    //! (reads cpuAqlCode_ without a null check)
    int spillSegSize() const
        { return cpuAqlCode_->workitem_private_segment_byte_size; }

    //! Returns TRUE if kernel uses dynamic parallelism
    bool dynamicParallelism() const
        { return (flags_.dynamicParallelism_) ? true : false; }

    //! Returns TRUE if kernel is internal kernel
    bool isInternalKernel() const
        { return (flags_.internalKernel_) ? true : false; }

    //! Finds local workgroup size
    void findLocalWorkSize(
        size_t      workDim,            //!< Work dimension
        const amd::NDRange& gblWorkSize,//!< Global work size
        amd::NDRange& lclWorkSize       //!< Local work size
        ) const;

    //! Returns AQL packet in CPU memory
    //! if the kernel arguments were successfully loaded, otherwise NULL
    hsa_kernel_dispatch_packet_t* loadArguments(
        VirtualGPU&                     gpu,        //!< Running GPU context
        const amd::Kernel&              kernel,     //!< AMD kernel object
        const amd::NDRangeContainer&    sizes,      //!< NDrange container
        const_address               parameters,     //!< Application arguments for the kernel
        bool                        nativeMem,      //!< Native memory objects are passed
        uint64_t                    vmDefQueue,     //!< GPU VM default queue pointer
        uint64_t*                   vmParentWrap,   //!< GPU VM parent aql wrap object
        std::vector<const Resource*>&   memList     //!< Memory list for GSL/VidMM handles
        ) const;

    //! Returns printf info array
    const std::vector<PrintfInfo>& printfInfo() const { return printf_; }

    //! Returns the kernel index in the program
    uint index() const { return index_; }

private:
    //! Disable copy constructor
    HSAILKernel(const HSAILKernel&);

    //! Disable operator=
    HSAILKernel& operator=(const HSAILKernel&);

    //! Creates AQL kernel HW info
    bool aqlCreateHWInfo(amd::hsa::loader::Symbol *sym);

    //! Initializes arguments_ and the abstraction layer kernel parameters
    void initArgList(
        const aclArgData* aclArg    //!< List of ACL arguments
        );

    //! Initializes Hsail Argument metadata and info
    void initHsailArgs(
        const aclArgData* aclArg    //!< List of ACL arguments
        );

    //! Initializes Hsail Printf metadata and info
    void initPrintf(
        const aclPrintfFmt* aclPrintf   //!< List of ACL printfs
        );

    std::vector<Argument*> arguments_;  //!< Vector list of HSAIL Arguments
    std::string compileOptions_;        //!< compile options used for finalizing this kernel
    amd_kernel_code_t*  cpuAqlCode_;    //!< AQL kernel code on CPU
    const NullDevice&   dev_;           //!< GPU device object
    const HSAILProgram& prog_;          //!< Reference to the parent program
    std::vector<PrintfInfo> printf_;    //!< Format strings for GPU printf support
    uint    index_;                     //!< Kernel index in the program

    gpu::Memory*    code_;      //!< Memory object with ISA code
    size_t          codeSize_;  //!< Size of ISA code

    char*       hwMetaData_;    //!< SI metadata

    //! Kernel flags; value_ aliases all of the flag bits and the constructor
    //! clears them by setting value_ to 0
    union Flags {
        struct {
            uint    imageEna_: 1;           //!< Kernel uses images
            uint    imageWriteEna_: 1;      //!< Kernel uses image writes
            uint    dynamicParallelism_: 1; //!< Dynamic parallelism enabled
            uint    internalKernel_: 1;     //!< True: internal kernel
        };
        uint    value_;
        Flags(): value_(0) {}
    } flags_;
};

/*@}*/} // namespace gpu

#endif /*GPUKERNEL_HPP_*/