rocm-systems/rocclr/device/gpu/gpukernel.hpp

/* Copyright (c) 2008-present Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */

#ifndef GPUKERNEL_HPP_
#define GPUKERNEL_HPP_

#include "device/device.hpp"
#include "utils/macros.hpp"
#include "platform/command.hpp"
#include "platform/program.hpp"
#include "platform/kernel.hpp"
#include "platform/sampler.hpp"
#include "device/gpu/gpudevice.hpp"
#include "device/gpu/gpuvirtual.hpp"
#include "amd_hsa_kernel_code.h"
#include "device/gpu/gpuprintf.hpp"
#include "device/devwavelimiter.hpp"
#include "hsa.h"

namespace amd {
namespace hsa {
namespace loader {
class Symbol;
}  // loader
}  // hsa
}  // amd

//! \namespace gpu GPU Device Implementation
namespace gpu {

class VirtualGPU;
class Device;
class NullDevice;
class HSAILProgram;

struct HWSHADER_Helper {
  template <typename S, typename T> static T Get(S base, T offset) {
    return reinterpret_cast<T>(reinterpret_cast<intptr_t>(base) + reinterpret_cast<size_t>(offset));
  }
};

#define HWSHADER_Get(shader, field) HWSHADER_Helper::Get((shader), (shader)->field)

template <typename D, typename S>
static void CalcPtr(D& dst, const S src, size_t structSize, size_t size) {
  dst = reinterpret_cast<D>(reinterpret_cast<const intptr_t>(src) + structSize * size);
}

/*! \addtogroup GPU GPU Device Implementation
 *  @{
 */

/*! \brief Helper function for the std::string processing.
 *  Finds the name in the std::string
 *
 *  \return True if we found the entry of the symbols
 */
bool expect(const std::string& str,  //!< The original std::string
            size_t* pos,             //!< Position to start
            const std::string& sym   //!< The sympols to expect
            );

/*! \brief Helper function for the std::string processing.
 *  Gets a word from the std::string
 *
 *  \return True if we successfully received a word
 */
bool getword(const std::string& str,  //!< The original std::string
             size_t* pos,             //!< Position to start
             std::string& sym         //!< Returned word
             );

/*! \brief Helper function for the std::string processing.
 *  Loads numbers from the metadata
 *
 *  \return True if we loaded a number
 */
bool getuint(const std::string& str,  //!< The original std::string
             size_t* pos,             //!< Position to start
             uint* val                //!< Returned number
             );

/*! \brief Helper function for the std::string processing.
 *  Loads numbers from the metadata in HEX format
 *
 *  \return True if we loaded a number
 */
bool getuintHex(const std::string& str,  //!< The original std::string
                size_t* pos,             //!< Position to start
                uint* val                //!< Returned number
                );

/*! \brief Helper function for the std::string processing.
 *  Loads numbers from the metadata in HEX format
 *
 *  \return True if we loaded a number
 */
bool getuint64Hex(const std::string& str,  //!< The original std::string
                  size_t* pos,             //!< Position to start
                  uint64_t* val            //!< Returned number
                  );

/*! \brief Helper function for the std::string processing.
 *  Converts unsigned integer to string
 *
 *  \return None
 */
void intToStr(size_t value,  //!< Value for conversion
              char* str,     //!< Pointer to the converted string
              size_t size    //!< String size
              );

//! Image constant data from ABI specification
struct ImageConstants : public amd::EmbeddedObject {
  uint32_t width_;         //!< Image surface width
  uint32_t height_;        //!< Image surface height
  uint32_t depth_;         //!< Image surface depth (1 for 2D images)
  uint32_t dataType_;      //!< Image surface data type
  float widthFloat_;       //!< Image surface width
  float heightFloat_;      //!< Image surface height
  float depthFloat_;       //!< Image surface depth (1 for 2D images)
  uint32_t channelOrder_;  //!< Image surface texels channel order
};

//! Kernel arguments
struct KernelArg : public amd::HeapObject {
 public:
  //! \enum Kernel argument type
  enum ArgumentType {
    NoType = 0,
    PointerGlobal,
    Value,
    Image,
    PointerLocal,
    PointerHwLocal,
    PointerPrivate,
    PointerHwPrivate,
    PointerConst,
    PointerHwConst,
    Float,
    Double,
    Half,
    Char,
    UChar,
    Short,
    UShort,
    Int,
    UInt,
    Long,
    ULong,
    Struct,
    Union,
    Opaque,
    Event,
    Image1D,  //!< first image
    Image2D,
    Image1DB,
    Image1DA,
    Image2DA,
    Image3D,  //!< last image
    Counter,
    Sampler,
    PrivateSize,
    LocalSize,
    HwPrivateSize,
    HwLocalSize,
    Grouping,
    WrkgrpSize,
    Wavefront,
    PrivateFixed,
    ErrorMessage,
    WarningMessage,
    PrintfFormatStr,
    MetadataVersion,
    UavId,
    ABI64Bit,
    GWS,
    SWGWS,
    Reflection,
    ConstArg,
    ConstBufId,
    PrintfBufId,
    GroupingHint,
    VecTypeHint,
    WavesPerSimdHint,
    TotalTypes
  };

  // The compiler metadata fields
  std::string name_;   //!< parameters name
  ArgumentType type_;  //!< type of argument
  union {
    uint size_;      //!< number of arguments (for values and pointers only)
    uint location_;  //!< sampler's location (for samplers only)
  };
  uint cbIdx_;             //!< constant buffer index
  uint cbPos_;             //!< dword address in CB for the argument
  std::string buf_;        //!< buffer tag
  uint index_;             //!< buffer/image/sampler index
  uint alignment_;         //!< the required argument's alignment
  ArgumentType dataType_;  //!< data type of the argument
  union {
    struct {
      uint uavBuf_ : 1;     //!< UAV memory, no global heap
      uint realloc_ : 1;    //!< argument has to be reallocatedin the global heap
      uint readOnly_ : 1;   //!< Read only memory object
      uint writeOnly_ : 1;  //!< Write only memory object
      uint readWrite_ : 1;  //!< Read/Write memory object
    };
    uint value_;
  } memory_;

  std::string typeName_;  //!< argument's type name
  uint typeQualifier_;    //!< argument's type qualifier

  //! Default constructor for the kernel argument
  KernelArg();

  //! Copy constructor for the kernel argument
  KernelArg(const KernelArg& data);

  //! Overloads operator=
  KernelArg& operator=(const KernelArg& data);

  //! Destructor of the kernel argument
  ~KernelArg() { name_.clear(); }

  /*! \brief Checks if this arguments requires a place in constant buffer
   *
   *  \return True if we need CB
   */
  bool isCbNeeded() const;

  /*! \brief Retrieves the argument's size
   *
   *  \return Size of the current argument
   */
  size_t size(bool gpuLayer  //!< True if we want the argument's size for the GPU layer
              ) const;

  /*! \brief Retrieves the argument's type for the abstraction layer
   *
   *  \return The argument's type in the abstraction layer format
   */
  clk_value_type_t type() const;

  /*! \brief Retrieves the argument's address qualifier for the abstraction layer
   *
   *  \return The argument's address qualifier in the abstraction layer format
   */
  cl_kernel_arg_address_qualifier addressQualifier() const;

  /*! \brief Retrieves the argument's access qualifier for the abstraction layer
   *
   *  \return The argument's access qualifier in the abstraction layer format
   */
  cl_kernel_arg_access_qualifier accessQualifier() const;

  /*! \brief Retrieves the argument's type name for the abstraction layer
   *
   *  \return The argument's type name
   */
  const char* typeName() const { return typeName_.c_str(); }

  /*! \brief Retrieves the argument's type qualifier for the abstraction layer
   *
   *  \return The argument's type qualifier
   */
  cl_kernel_arg_type_qualifier typeQualifier() const {
    switch (type_) {
      case PointerConst:
      case PointerHwConst:
        return static_cast<cl_kernel_arg_type_qualifier>(typeQualifier_ | CL_KERNEL_ARG_TYPE_CONST);
      default:
        return static_cast<cl_kernel_arg_type_qualifier>(typeQualifier_);
    }
  }

  //! Special case for vectors with component size <= 16bit
  static constexpr uint VectorSizeLimit = 4;
  size_t specialVector() const;
};

struct DataTypeConst {
  const char* tagName_;           //!< data type's name
  KernelArg::ArgumentType type_;  //!< data type
};

//!  Metadata description for parsing
struct MetaDataConst {
  const char* typeName_;          //!< parameters name
  KernelArg::ArgumentType type_;  //!< type of argument
  struct {
    uint size_ : 1;      //!< number of arguments
    uint name_ : 1;      //!< argument's name
    uint resType_ : 1;   //!< argument's type
    uint cbIdx_ : 1;     //!< resource index CB, sampler or image
    uint cbPos_ : 1;     //!< dword address in CB for the argument
    uint buf_ : 1;       //!< buffer tag
    uint reserved : 26;  //!< reserved
  };
};

const uint DescTotal = 15;
const uint BasicTypeTotal = 15;
const uint ArgStateTotal = DescTotal + BasicTypeTotal;

//! The constant array that describes different metadata properties
extern const MetaDataConst ArgState[ArgStateTotal];

extern const DataTypeConst DataType[];

extern const uint DataTypeTotal;

// Forward declaration
class Program;
class NullProgram;

class CalImageReference : public amd::ReferenceCountedObject {
 public:
  //! Default constructor
  CalImageReference(CALimage calImage) : image_(calImage) {}

  //! Get CAL image
  CALimage calImage() const { return image_; }

 protected:
  //! Default destructor
  ~CalImageReference();

 private:
  //! Disable copy constructor
  CalImageReference(const CalImageReference&);

  //! Disable operator=
  CalImageReference& operator=(const CalImageReference&);

  CALimage image_;  //!< CAL kernel image
};

//! \class GPU NullKernel - Kernel for offline device
class NullKernel : public device::Kernel {
 public:
  typedef std::vector<KernelArg*> arguments_t;

  static constexpr uint UavIdUndefined = 0xffff;

  enum Flags {
    LimitWorkgroup = 1 << 0,  //!< Limits the workgroup size
    PrintfOutput = 1 << 1,    //!< Kernel has printf output
    PrivateFixed = 1 << 2,    //!< Kernel has printf output
    ABI64bit = 1 << 3,        //!< Kernel has 64 bit ABI
    Unused0 = 1 << 4,         //!< Unused
    Unused1 = 1 << 5,         //!< Unused
    ImageEnable = 1 << 6,     //!< Kernel uses images
    ImageWrite = 1 << 7,      //!< Kernel writes images
  };

  //! \enum Resource type for binding
  enum ResourceType {
    Undefined = 0x00000000,            //!< resource type will be detected
    ConstantBuffer = 0x00000001,       //!< resource is a constant buffer
    GlobalBuffer = 0x00000002,         //!< resource is a global buffer
    ArgumentHeapBuffer = 0x00000004,   //!< resource is an argument buffer
    ArgumentBuffer = 0x00000005,       //!< resource is an argument buffer
    ArgumentImageRead = 0x00000006,    //!< resource is an argument image read
    ArgumentImageWrite = 0x00000007,   //!< resource is an argument image write
    ArgumentConstBuffer = 0x00000008,  //!< resource is an argument const buffer
    ArgumentCounter = 0x00000009,      //!< resource is a global counter
    ArgumentUavID = 0x0000000a,        //!< resource is a dummy ID read
    ArgumentCbID = 0x0000000b,         //!< resource is a constant buffer
    ArgumentPrintfID = 0x0000000c,     //!< resource is a printf buffer
  };

  //! GPU kernel constructor
  NullKernel(const std::string& name,       //!< The kernel's name
             const NullDevice& gpuNullDev,  //!< GPU device object
             const NullProgram& nullProg    //!< Reference to the program
             );

  virtual ~NullKernel();

  /*! \brief Creates a GPU kernel in CAL
   *
   *  \return True if we successfully created a kernel in CAL
   */
  bool create(const std::string& code,        //!< IL source code
              const std::string& metadata,    //!< the kernel metadata structure
              const void* binaryCode = NULL,  //!< binary machine code for CAL
              size_t binarySize = 0           //!< the machine code size
              );

  //! Returns CAL function descriptor
  CALimage calImage() const { return calRef_->calImage(); }

  //! Returns TRUE if we successfully retrieved the binary from CAL
  bool getCalBinary(void* binary,  //!< ISA binary code
                    size_t size    //!< ISA binary size
                    ) const;

  //! Returns CAL image size
  size_t getCalBinarySize() const;

  //! Returns GPU device object, associated with this kernel
  const NullDevice& nullDev() const { return gpuDev_; }

  //! Returns GPU device object, associated with this kernel
  const NullProgram& nullProg() const { return reinterpret_cast<const NullProgram&>(prog_); }

  //! Returns the kernel's build error
  const int32_t buildError() const { return buildError_; }

  //! Returns the kernel's flags
  uint flags() const { return flags_; }

  //! Returns TRUE if ABI is for 64 bits
  bool abi64Bit() const { return (flags_ & ABI64bit) ? true : false; }

  //! Returns the total number of all arguments
  size_t argSize() const { return arguments_.size(); }

  //! Returns instruction count of the current kernel
  uint instructionCnt() const { return instructionCnt_; }

 protected:
  /*! \brief Parses the metadata structure for the kernel,
   *   provided by the OpenCL compiler
   *
   *  \return True if we succefully parsed all arguments
   */
  bool parseArguments(const std::string& metaData,  //!< the program for parsing
                      uint* uavRefCount  //!< an array of reference counters for used UAVs
                      );

  //! Returns the argument for the specified index
  const KernelArg* argument(uint idx) const { return arguments_[idx]; }

  //! Adds the kernel argument into the list
  void addArgument(KernelArg* arg) { arguments_.push_back(arg); }

  //! Returns the argument for the specified sampler's index
  const KernelArg* sampler(uint idx) const { return intSamplers_[idx]; }

  //! Returns the total number of all internal samplers
  size_t samplerSize() const { return intSamplers_.size(); }

  //! Adds the kernel sampler into the sampler's list
  void addSampler(KernelArg* arg) { intSamplers_.push_back(arg); }

  //! Returns UAV raw index for this kernel
  uint uavRaw() const { return uavRaw_; }

  int32_t buildError_;     //!< Kernel's build error
  std::string ilSource_;  //!< IL source code of this kernel

  const NullDevice& gpuDev_;  //!< GPU device object

  CalImageReference* calRef_;  //!< CAL image reference for this kernel
  bool internal_;              //!< Runtime internal ker

  uint flags_;               //!< kernel object flags
  arguments_t arguments_;    //!< kernel arguments for the execution
  arguments_t intSamplers_;  //!< predefined intenal kernel samplers

  size_t* cbSizes_;  //!< real constant buffer sizes for this kernel
  uint numCb_;       //!< total number of constant buffers

  uint uavRaw_;  //!< UAV used for RAW access

  bool rwAttributes_;  //!< backend provides RW attributes for arguments

  uint instructionCnt_;  //!< Instruction count

  uint cbId_;      //!< UAV used for constant buffer access
  uint printfId_;  //!< UAV used for printf buffer access

 private:
  //! Disable copy constructor
  NullKernel(const NullKernel&);

  //! Disable operator=
  NullKernel& operator=(const NullKernel&);

  //! Creates a filename for ISA/IL dumps
  std::string mkDumpName(const char* extension  //!< File extension to append
                         ) const;

  bool createMultiBinary(uint* imageSize,  //!< Multibinary image size
                         void** image,     //!< Multibinary image
                         const void* isa   //!< Kernel HW info
                         );

  //! SI HW specific setup for kernels
  bool siCreateHwInfo(const void* shader,          //!< HW info shader
                      AMUabiAddEncoding& encoding  //!< ABI encoding structure
                      );

  //! r800 HW specific setup for kernels
  bool r800CreateHwInfo(const void* shader,          //!< HW info shader
                        AMUabiAddEncoding& encoding  //!< ABI encoding structure
                        );
};

//! \class GPU kernel
class Kernel : public NullKernel {
 public:
  struct InitData {
    uint privateSize_;    //!< Private ring initial size
    uint localSize_;      //!< Local ring initial size
    uint hwPrivateSize_;  //!< HW private ring initial size
    uint hwLocalSize_;    //!< HW local ring initial size
    uint flags_;          //!< Kernel initialization flags
  };

  //! GPU kernel constructor
  Kernel(const std::string& name,   //!< The kernel's name
         const Device& gpuDev,      //!< GPU device object
         const Program& prog,       //!< Reference to the program
         const InitData* initData_  //!< Initialization data
         );

  //! GPU kernel destructor
  virtual ~Kernel();

  /*! \brief Creates a GPU kernel in CAL
   *
   *  \return True if we successfully created a kernel in CAL
   */
  bool create(const std::string& code,        //!< IL source code
              const std::string& metadata,    //!< the kernel metadata structure
              const void* binaryCode = NULL,  //!< binary machine code for CAL
              size_t binarySize = 0           //!< the machine code size
              );

  //! Initializes the CAL program grid for the kernel execution
  void setupProgramGrid(VirtualGPU& gpu,                    //!< virtual GPU device object
                        size_t workDim,                     //!< work dimension
                        const amd::NDRange& glbWorkOffset,  //!< global work offset
                        const amd::NDRange& gblWorkSize,    //!< global work size
                        amd::NDRange& lclWorkSize,          //!< local work size
                        const amd::NDRange& groupOffset,    //!< group offsets
                        const amd::NDRange& glbWorkOffsetOrg,
                        const amd::NDRange& glbWorkSizeOrg  //!< original global work size
                        ) const;

  /*! \brief Detects if runtime has to disable cache optimization and
   *   recompiles the kernel
   *
   *  \return True if aliases were detected in the kernel arguments
   */
  void processMemObjects(VirtualGPU& gpu,            //!< Virtual GPU objects - queue
                         const amd::Kernel& kernel,  //!< AMD kernel object for execution
                         const_address params,       //!< pointer to the param's store
                         bool nativeMem              //!< Native memory objects
                         ) const;

  /*! \brief Loads all kernel arguments, so we could run the kernel in HW.
   *  This includes CB update and resource binding
   *
   *  \return True if we succefully loaded the arguments
   */
  bool loadParameters(VirtualGPU& gpu,            //!< virtual GPU device object
                      const amd::Kernel& kernel,  //!< AMD kernel object for execution
                      const_address params,       //!< pointer to the param's store
                      bool nativeMem              //!< Native memory objects
                      ) const;

  //! Binds the constant buffers associated with the kernel
  bool bindConstantBuffers(VirtualGPU& gpu) const;

  /*! \brief Runs the kernel on HW
   *
   *  \return True if we succefully executed the kernel
   */
  bool run(VirtualGPU& gpu,     //!< virtual GPU device object
           GpuEvent* gpuEvent,  //!< Pointer to the GPU event
           bool lastRun,        //!< Last run in the split execution
           bool lastDoppCmd,    //!< for last dopp submission kernel dispatch
           bool pfpaDoppCmd     //!< for PFPA dopp submission kernel dispatch
           ) const;

  //! Help function to debug the kernel output
  void debug(VirtualGPU& gpu  //!< virtual GPU device object
             ) const;

  //! Programs internal samplers defined inside the kernel
  bool setInternalSamplers(VirtualGPU& gpu  //!< Virtual GPU device object
                           ) const;

  //! Returns TRUE if we successfully retrieved the binary from CAL
  bool getCalBinary(void* binary,  //!< ISA binary code
                    size_t size    //!< ISA binary size
                    ) const;

  //! Returns CAL image size
  size_t getCalBinarySize() const;

  //! Returns GPU device object, associated with this kernel
  const Device& dev() const;

  //! Returns GPU device object, associated with this kernel
  const Program& prog() const;

  //! Binds global HW constant buffers
  bool bindGlobalHwCb(VirtualGPU& gpu,                 //!< Virtual GPU device object
                      VirtualGPU::GslKernelDesc* desc  //!< Kernel descriptor
                      ) const;

 protected:
  //! Initializes the kernel parameters for the abstraction layer
  bool initParameters();

  /*! \brief Creates constant buffer resources, associated with the kernel
   *
   *  \return TRUE if we succefully created constant buffers
   */
  bool initConstBuffers();

 private:
  //! Disable copy constructor
  Kernel(const Kernel&);

  //! Disable operator=
  Kernel& operator=(const Kernel&);

  //! \enum Fixed Metadata offsets
  enum MetadataOffsets {
    GlobalWorkitemOffset = 0,
    LocalWorkitemOffset = 1,
    GroupsOffset = 2,
    PrivateRingOffset = 3,
    LocalRingOffset = 4,
    MathLibOffset = 5,
    GlobalWorkOffsetOffset = 6,
    GroupWorkOffsetOffset = 7,
    GlobalDataStoreOffset = 8,
    DebugOffset = 8,
    NDRangeGlobalWorkOffsetOffset = 9,

    // The total number of constants reserved for ABI
    TotalABIVectors
  };

  /*! \brief Sets the kernel argument
   *
   *  \return True if we succefully updated the arguments
   */
  bool setArgument(VirtualGPU& gpu,     //!< Virtual GPU device object
                   const amd::Kernel& kernel, //!< AMD kernel object
                   uint idx,            //!< the argument index
                   const_address params,//!< the arguments data
                   const amd::KernelParameterDescriptor& desc, //!< Argument's descriptor
                   bool nativeMem      //!< Native memory objects
                   ) const;

  /*! \brief Initializes local and private buffer ranges
   *
   *  \return True if we succefully initialized the ranges
   */
  bool initLocalPrivateRanges(VirtualGPU& gpu  //!< Virtual GPU device object
                              ) const;

  //! Sets local and private buffer ranges
  void setLocalPrivateRanges(VirtualGPU& gpu  //!< Virtual GPU device object
                             ) const;

  //! Sets the sampler's parameters for the image look-up
  void setSampler(VirtualGPU& gpu,  //!< virtual GPU device object
                  uint32_t state,   //!< sampler state
                  uint physUnit     //!< sampler's number
                  ) const;

  /*! \brief Binds resource
   *
   *  \return True if we succefully created constant buffers
   */
  bool bindResource(VirtualGPU& gpu,       //!< virtual GPU device object
                    const Memory& memory,  //!< memory for binding
                    uint paramIdx,         //!< index of the parameter
                    ResourceType type,     //!< resource type
                    uint physUnit,         //!< PhysUnit
                    size_t offset = 0) const;

  //! Unbinds all resources for the kernel
  void unbindResources(VirtualGPU& gpu,    //!< virtual GPU device object
                       GpuEvent gpuEvent,  //!< GPU event that will be associated with the resources
                       bool lastRun        //!< last run in the split execution
                       ) const;

  //! Copies image constants to the constant buffer
  void copyImageConstants(const amd::Image* amdImage,  //!< Abstraction layer image object
                          ImageConstants* imageData    //!< Pointer in CB to the image constants
                          ) const;

  //! Finds local workgroup size
  void findLocalWorkSize(size_t workDim,                   //!< Work dimension
                         const amd::NDRange& gblWorkSize,  //!< Global work size
                         amd::NDRange& lclWorkSize         //!< Local work size
                         ) const;

  uint hwPrivateSize_;  //!< initial HW private size
  uint hwLocalSize_;    //!< initial HW local size
};

enum HSAIL_ADDRESS_QUALIFIER {
  HSAIL_ADDRESS_ERROR = 0,
  HSAIL_ADDRESS_GLOBAL,
  HSAIL_ADDRESS_LOCAL,
  HSAIL_MAX_ADDRESS_QUALIFIERS
};

enum HSAIL_ARG_TYPE {
  HSAIL_ARGTYPE_ERROR = 0,
  HSAIL_ARGTYPE_POINTER,
  HSAIL_ARGTYPE_VALUE,
  HSAIL_ARGTYPE_IMAGE,
  HSAIL_ARGTYPE_SAMPLER,
  HSAIL_ARGTYPE_QUEUE,
  HSAIL_ARGMAX_ARG_TYPES
};

enum HSAIL_DATA_TYPE {
  HSAIL_DATATYPE_ERROR = 0,
  HSAIL_DATATYPE_B1,
  HSAIL_DATATYPE_B8,
  HSAIL_DATATYPE_B16,
  HSAIL_DATATYPE_B32,
  HSAIL_DATATYPE_B64,
  HSAIL_DATATYPE_S8,
  HSAIL_DATATYPE_S16,
  HSAIL_DATATYPE_S32,
  HSAIL_DATATYPE_S64,
  HSAIL_DATATYPE_U8,
  HSAIL_DATATYPE_U16,
  HSAIL_DATATYPE_U32,
  HSAIL_DATATYPE_U64,
  HSAIL_DATATYPE_F16,
  HSAIL_DATATYPE_F32,
  HSAIL_DATATYPE_F64,
  HSAIL_DATATYPE_STRUCT,
  HSAIL_DATATYPE_OPAQUE,
  HSAIL_DATATYPE_MAX_TYPES
};

enum HSAIL_ACCESS_TYPE {
  HSAIL_ACCESS_TYPE_NONE = 0,
  HSAIL_ACCESS_TYPE_RO,
  HSAIL_ACCESS_TYPE_WO,
  HSAIL_ACCESS_TYPE_RW
};

class HSAILKernel : public device::Kernel {
 public:
  struct Argument {
    std::string name_;                  //!< Argument's name
    std::string typeName_;              //!< Argument's type name
    uint size_;                         //!< Size in bytes
    uint offset_;                       //!< Argument's offset
    uint alignment_;                    //!< Argument's alignment
    HSAIL_ARG_TYPE type_;               //!< Type of the argument
    HSAIL_ADDRESS_QUALIFIER addrQual_;  //!< Address qualifier of the argument
    HSAIL_DATA_TYPE dataType_;          //!< The type of data
    uint numElem_;                      //!< Number of elements
    HSAIL_ACCESS_TYPE access_;          //!< Access type for the argument
  };

  // Max number of possible extra (hidden) kernel arguments
  static constexpr uint MaxExtraArgumentsNum = 6;

  HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions, uint extraArgsNum);

  virtual ~HSAILKernel();

  //! Initializes the metadata required for this kernel,
  //! finalizes the kernel if needed
  bool init(amd::hsa::loader::Symbol* sym, bool finalize = false);

  //! Returns a pointer to the hsail argument
  const Argument* argument(size_t i) const { return arguments_[i]; }

  //! Returns the number of hsail arguments
  size_t numArguments() const { return arguments_.size(); }

  //! Returns GPU device object, associated with this kernel
  const Device& dev() const;

  //! Returns HSA program associated with this kernel
  const HSAILProgram& prog() const;

  //! Returns LDS size used in this kernel
  uint32_t ldsSize() const { return cpuAqlCode_->workgroup_group_segment_byte_size; }

  //! Returns pointer on CPU to AQL code info
  const void* cpuAqlCode() const { return cpuAqlCode_; }

  //! Returns memory object with AQL code
  gpu::Memory* gpuAqlCode() const { return code_; }

  //! Returns size of AQL code
  size_t aqlCodeSize() const { return codeSize_; }

  //! Returns the size of argument buffer
  size_t argsBufferSize() const { return cpuAqlCode_->kernarg_segment_byte_size; }

  //! Returns spill reg size per workitem
  int spillSegSize() const { return cpuAqlCode_->workitem_private_segment_byte_size; }

  //! Returns AQL packet in CPU memory
  //! if the kerenl arguments were successfully loaded, otherwise NULL
  hsa_kernel_dispatch_packet_t* loadArguments(
      VirtualGPU& gpu,                     //!< Running GPU context
      const amd::Kernel& kernel,           //!< AMD kernel object
      const amd::NDRangeContainer& sizes,  //!< NDrange container
      const_address parameters,            //!< Application arguments for the kernel
      bool nativeMem,                      //!< Native memory objectes are passed
      uint64_t vmDefQueue,                 //!< GPU VM default queue pointer
      uint64_t* vmParentWrap,              //!< GPU VM parent aql wrap object
      std::vector<const Memory*>& memList  //!< Memory list for GSL/VidMM handles
      ) const;

  //! Returns the kernel index in the program
  uint index() const { return index_; }

  //! Returns kernel's extra argument count
  uint extraArgumentsNum() const { return extraArgumentsNum_; }

 private:
  //! Disable copy constructor
  HSAILKernel(const HSAILKernel&);

  //! Disable operator=
  HSAILKernel& operator=(const HSAILKernel&);

  //! Creates AQL kernel HW info
  bool aqlCreateHWInfo(amd::hsa::loader::Symbol* sym);

  //! Initializes arguments_ and the abstraction layer kernel parameters
  void initArgList(const aclArgData* aclArg  //!< List of ACL arguments
                   );

  //! Initializes Hsail Argument metadata and info
  void initHsailArgs(const aclArgData* aclArg  //!< List of ACL arguments
                     );

  std::vector<Argument*> arguments_;  //!< Vector list of HSAIL Arguments
  std::string compileOptions_;        //!< compile used for finalizing this kernel
  amd_kernel_code_t* cpuAqlCode_;     //!< AQL kernel code on CPU
  uint index_;                        //!< Kernel index in the program

  gpu::Memory* code_;  //!< Memory object with ISA code
  size_t codeSize_;    //!< Size of ISA code

  char* hwMetaData_;  //!< SI metadata

  uint extraArgumentsNum_;  //! Number of extra (hidden) kernel arguments
};

/*@}*/} // namespace gpu

#endif /*GPUKERNEL_HPP_*/