rocm-systems/rocclr/runtime/device/gpu/gpudevice.hpp

//
// Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved.
//

#ifndef GPU_HPP_
#define GPU_HPP_

#include "top.hpp"
#include "device/device.hpp"
#include "platform/command.hpp"
#include "platform/program.hpp"
#include "platform/perfctr.hpp"
#include "platform/threadtrace.hpp"
#include "platform/memory.hpp"
#include "utils/concurrent.hpp"
#include "thread/thread.hpp"
#include "thread/monitor.hpp"
#include "device/gpu/gpuvirtual.hpp"
#include "device/gpu/gpumemory.hpp"
#include "device/gpu/gpudefs.hpp"
#include "device/gpu/gpusettings.hpp"
#include "device/gpu/gpuappprofile.hpp"


#include "acl.h"
#include "vaminterface.h"

/*! \addtogroup GPU
 *  @{
 */

//! GPU Device Implementation
namespace gpu {

//! A nil device object
class NullDevice : public amd::Device {
 protected:
  static aclCompiler* compiler_;
  static aclCompiler* hsaCompiler_;

 public:
  aclCompiler* amdilCompiler() const { return compiler_; }
  aclCompiler* hsaCompiler() const { return hsaCompiler_; }
  aclCompiler* compiler() const { return hsaCompiler_; }
  Compiler* binCompiler() const { return amdilCompiler(); }

  static bool init(void);

  //! Construct a new identifier
  NullDevice();

  //! Creates an offline device with the specified target
  bool create(CALtarget target  //!< GPU device identifier
              );

  //! Instantiate a new virtual device
  virtual device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = NULL) {
    return NULL;
  }

  //! Create the device program.
  virtual device::Program* createProgram(amd::option::Options* options = NULL);

  //! Just returns NULL for the dummy device
  virtual device::Memory* createMemory(amd::Memory& owner) const { return NULL; }

  //! Sampler object allocation
  virtual bool createSampler(const amd::Sampler& owner,  //!< abstraction layer sampler object
                             device::Sampler** sampler   //!< device sampler object
                             ) const {
    ShouldNotReachHere();
    return true;
  }

  //! Just returns NULL for the dummy device
  virtual device::Memory* createView(
      amd::Memory& owner,           //!< Owner memory object
      const device::Memory& parent  //!< Parent device memory object for the view
      ) const {
    return NULL;
  }

  //! Acquire external graphics API object in the host thread
  //! Needed for OpenGL objects on CPU device

  virtual bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext,
                                  bool validateOnly) {
    return true;
  }

  virtual bool unbindExternalDevice(uint flags, void* const pDevice[], void* pContext,
                                    bool validateOnly) {
    return true;
  }

  //! Releases non-blocking map target memory
  virtual void freeMapTarget(amd::Memory& mem, void* target) {}

  CALtarget calTarget() const { return calTarget_; }

  const AMDDeviceInfo* hwInfo() const { return hwInfo_; }

  //! Empty implementation on Null device
  virtual bool globalFreeMemory(size_t* freeMemory) const { return false; }

  //! Get GPU device settings
  const gpu::Settings& settings() const { return reinterpret_cast<gpu::Settings&>(*settings_); }
  virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
                         cl_svm_mem_flags flags, void* svmPtr) const {
    return NULL;
  }
  virtual void svmFree(void* ptr) const { return; }

  virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; }

 protected:
  bool usePal() const {
    return (calTarget_ == CAL_TARGET_GREENLAND || calTarget_ == CAL_TARGET_RAVEN ||
            calTarget_ == CAL_TARGET_RAVEN2 || calTarget_ >= CAL_TARGET_VEGA12);
  }

  //! Answer the question: "Should HSAIL Program be created?",
  //! based on the given options.
  bool isHsailProgram(amd::option::Options* options = NULL);

  //! Fills OpenCL device info structure
  void fillDeviceInfo(const CALdeviceattribs& calAttr,  //!< CAL device attributes info
                      const gslMemInfo& memInfo,        //!< GSL mem info
                      size_t maxTextureSize,            //!< Maximum texture size supported in HW
                      uint numComputeRings,             //!< Number of compute rings
                      uint numComputeRingsRT            //!< Number of RT compute rings
                      );

  CALtarget calTarget_;          //!< GPU device identifier
  const AMDDeviceInfo* hwInfo_;  //!< Device HW info structure
};

//! Forward declarations
class Command;
class Device;
class GpuCommand;
class Heap;
class HeapBlock;
class Program;
class Kernel;
class Memory;
class Resource;
class VirtualDevice;
class PrintfDbg;
class ThreadTrace;

#ifndef CL_FILTER_NONE
#define CL_FILTER_NONE 0x1142
#endif

class Sampler : public device::Sampler {
 public:
  //! Constructor
  Sampler(const Device& dev) : dev_(dev) {}

  //! Default destructor for the device memory object
  virtual ~Sampler();

  //! Creates a device sampler from the OCL sampler state
  bool create(uint32_t oclSamplerState  //!< OCL sampler state
              );

  //! Creates a device sampler from the OCL sampler state
  bool create(const amd::Sampler& owner  //!< AMD sampler object
              );

  const void* hwState() const { return hwState_; }

 private:
  //! Disable default copy constructor
  Sampler& operator=(const Sampler&);

  //! Disable operator=
  Sampler(const Sampler&);

  const Device& dev_;  //!< Device object associated with the sampler
  address hwState_;    //!< GPU HW state (\todo legacy path)
};

//! A GPU device ordinal (physical GPU device)
class Device : public NullDevice, public CALGSLDevice {
 public:
  class Heap : public amd::EmbeddedObject {
   public:
    //! The size of a heap element in bytes
    static const size_t ElementSize = 4;

    //! The type of a heap element in bytes
    static const cmSurfFmt ElementType = CM_SURF_FMT_R32I;

    Heap() : resource_(NULL), baseAddress_(0) {}

    bool create(Device& device  //!< GPU device object
                );

    //! Gets the GPU resource associated with the global heap
    const Memory& resource() const { return *resource_; }

    //! Returns the base virtual address of the heap
    uint64_t baseAddress() const { return baseAddress_; }

   protected:
    Memory* resource_;      //!< GPU resource referencing the heap memory
    uint64_t baseAddress_;  //!< Virtual heap base address
  };

  //! Locks any access to the virtual GPUs
  class ScopedLockVgpus : public amd::StackObject {
   public:
    //! Default constructor
    ScopedLockVgpus(const Device& dev);

    //! Destructor
    ~ScopedLockVgpus();

   private:
    const Device& dev_;  //! Device object
  };

  //! Interop emulation flags
  enum InteropEmulationFlags {
    D3D10Device = 0x00000001,
    GLContext = 0x00000002,
  };

  class Engines : public amd::EmbeddedObject {
   public:
    //! Default constructor
    Engines() : numComputeRings_(0), numComputeRingsRT_(0), numDmaEngines_(0) {
      memset(desc_, 0xff, sizeof(desc_));
    }

    //! Creates engine descriptor for this class
    void create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings);

    //! Gets engine type mask
    uint getMask(gslEngineID id) const { return (1 << id); }

    //! Gets a descriptor for the requested engines
    uint getRequested(uint engines, gslEngineDescriptor* desc) const;

    //! Returns the number of available compute rings
    uint numComputeRings() const { return numComputeRings_; }

    //! Returns the number of available real time compute rings
    uint numComputeRingsRT() const { return numComputeRingsRT_; }

    //! Returns the number of available DMA engines
    uint numDMAEngines() const { return numDmaEngines_; }

   private:
    uint numComputeRings_;
    uint numComputeRingsRT_;
    uint numDmaEngines_;
    gslEngineDescriptor desc_[GSL_ENGINEID_MAX];  //!< Engine descriptor
  };

  //! Transfer buffers
  class XferBuffers : public amd::HeapObject {
   public:
    static const size_t MaxXferBufListSize = 8;

    //! Default constructor
    XferBuffers(const Device& device, Resource::MemoryType type, size_t bufSize)
        : type_(type), bufSize_(bufSize), acquiredCnt_(0), gpuDevice_(device) {}

    //! Default destructor
    ~XferBuffers();

    //! Creates the xfer buffers object
    bool create();

    //! Acquires an instance of the transfer buffers
    Memory& acquire();

    //! Releases transfer buffer
    void release(VirtualGPU& gpu,  //!< Virual GPU object used with the buffer
                 Memory& buffer    //!< Transfer buffer for release
                 );

    //! Returns the buffer's size for transfer
    size_t bufSize() const { return bufSize_; }

   private:
    //! Disable copy constructor
    XferBuffers(const XferBuffers&);

    //! Disable assignment operator
    XferBuffers& operator=(const XferBuffers&);

    //! Get device object
    const Device& dev() const { return gpuDevice_; }

    Resource::MemoryType type_;       //!< The buffer's type
    size_t bufSize_;                  //!< Staged buffer size
    std::list<Memory*> freeBuffers_;  //!< The list of free buffers
    amd::Atomic<uint> acquiredCnt_;   //!< The total number of acquired buffers
    amd::Monitor lock_;               //!< Staged buffer acquire/release lock
    const Device& gpuDevice_;         //!< GPU device object
  };

  struct ScratchBuffer : public amd::HeapObject {
    uint regNum_;      //!< The number of used scratch registers
    Memory* memObj_;   //!< Memory objects for scratch buffers
    uint64_t offset_;  //!< Offset from the global scratch store
    uint64_t size_;    //!< Scratch buffer size on this queue

    //! Default constructor
    ScratchBuffer() : regNum_(0), memObj_(NULL), offset_(0), size_(0) {}

    //! Default constructor
    ~ScratchBuffer();

    //! Destroys memory objects
    void destroyMemory();
  };


  class SrdManager : public amd::HeapObject {
   public:
    SrdManager(const Device& dev, uint srdSize, uint bufSize)
        : dev_(dev),
          numFlags_(bufSize / (srdSize * MaskBits)),
          srdSize_(srdSize),
          bufSize_(bufSize) {}
    ~SrdManager();

    //! Allocates a new SRD slot for a resource
    uint64_t allocSrdSlot(address* cpuAddr);

    //! Frees a SRD slot
    void freeSrdSlot(uint64_t addr);

    // Fills the memory list for VidMM KMD
    void fillResourceList(std::vector<const Memory*>& memList);

   private:
    //! Disable copy constructor
    SrdManager(const SrdManager&);

    //! Disable assignment operator
    SrdManager& operator=(const SrdManager&);

    struct Chunk {
      Memory* buf_;
      uint* flags_;
      Chunk() : buf_(NULL), flags_(NULL) {}
    };

    static const uint MaskBits = 32;
    const Device& dev_;        //!< GPU device for the chunk manager
    amd::Monitor ml_;          //!< Global lock for the SRD manager
    std::vector<Chunk> pool_;  //!< Pool of SRD buffers
    uint numFlags_;            //!< Total number of flags in array
    uint srdSize_;             //!< SRD size
    uint bufSize_;             //!< Buffer size that holds SRDs
  };

  //! Initialise the whole GPU device subsystem (CAL init, device enumeration, etc).
  static bool init();

  //! Shutdown the whole GPU device subsystem (CAL shutdown).
  static void tearDown();

  //! Construct a new physical GPU device
  Device();

  //! Initialise a device (i.e. all parts of the constructor that could
  //! potentially fail)
  bool create(CALuint ordinal,      //!< GPU device ordinal index. Starts from 0
              CALuint numOfDevices  //!< number of GPU devices in the system
              );

  //! Destructor for the physical GPU device
  virtual ~Device();

  //! Instantiate a new virtual device
  device::VirtualDevice* createVirtualDevice(amd::CommandQueue* queue = NULL);

  //! Memory allocation
  virtual device::Memory* createMemory(amd::Memory& owner  //!< abstraction layer memory object
                                       ) const;

  //! Sampler object allocation
  virtual bool createSampler(const amd::Sampler& owner,  //!< abstraction layer sampler object
                             device::Sampler** sampler   //!< device sampler object
                             ) const;

  //! Allocates a view object from the device memory
  virtual device::Memory* createView(
      amd::Memory& owner,           //!< Owner memory object
      const device::Memory& parent  //!< Parent device memory object for the view
      ) const;

  //! Create the device program.
  virtual device::Program* createProgram(amd::option::Options* options = NULL);

  //! Attempt to bind with external graphics API's device/context
  virtual bool bindExternalDevice(uint flags, void* const pDevice[], void* pContext,
                                  bool validateOnly);

  //! Attempt to unbind with external graphics API's device/context
  virtual bool unbindExternalDevice(uint flags, void* const pDevice[], void* pContext,
                                    bool validateOnly);

  //! Validates kernel before execution
  virtual bool validateKernel(const amd::Kernel& kernel,  //!< AMD kernel object
                              const device::VirtualDevice* vdev);

  virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput);

  //! Retrieves information about free memory on a GPU device
  virtual bool globalFreeMemory(size_t* freeMemory) const;

  //! Returns a GPU memory object from AMD memory object
  gpu::Memory* getGpuMemory(amd::Memory* mem  //!< Pointer to AMD memory object
                            ) const;

  //! Gets the GPU resource associated with the global heap
  const Memory& globalMem() const { return heap_.resource(); }

  //! Gets the device context object
  amd::Context& context() const { return *context_; }

  //! Gets the global heap object
  const Heap& heap() const { return heap_; }

  //! Gets the memory object for the dummy page
  amd::Memory* dummyPage() const { return dummyPage_; }

  amd::Monitor& lockAsyncOps() const { return *lockAsyncOps_; }

  //! Returns the lock object for the virtual gpus list
  amd::Monitor* vgpusAccess() const { return vgpusAccess_; }

  //! Returns the number of virtual GPUs allocated on this device
  uint numOfVgpus() const { return numOfVgpus_; }
  uint numOfVgpus_;  //!< The number of virtual GPUs (lock protected)

  typedef std::vector<VirtualGPU*> VirtualGPUs;

  //! Returns the list of all virtual GPUs running on this device
  const VirtualGPUs& vgpus() const { return vgpus_; }
  VirtualGPUs vgpus_;  //!< The list of all running virtual gpus (lock protected)

  //! Scratch buffer allocation
  gpu::Memory* createScratchBuffer(size_t size  //!< Size of buffer
                                   ) const;

  //! Returns transfer buffer object
  XferBuffers& xferWrite() const { return *xferWrite_; }

  //! Returns transfer buffer object
  XferBuffers& xferRead() const { return *xferRead_; }

  //! Finds an appropriate map target
  amd::Memory* findMapTarget(size_t size) const;

  //! Adds a map target to the cache
  bool addMapTarget(amd::Memory* memory) const;

  //! Returns resource cache object
  ResourceCache& resourceCache() const { return *resourceCache_; }

  //! Returns engines object
  const Engines& engines() const { return engines_; }

  //! Returns engines object
  const device::BlitManager& xferMgr() const;

  VirtualGPU* xferQueue() const { return xferQueue_; }

  //! Retrieves the internal format from the OCL format
  CalFormat getCalFormat(const amd::Image::Format& format  //! OCL image format
                         ) const;

  //! Retrieves the OCL format from the internal image format
  amd::Image::Format getOclFormat(const CalFormat& format  //! Internal image format
                                  ) const;

  const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; }

  //! Returns the global scratch buffer
  Memory* globalScratchBuf() const { return globalScratchBuf_; };

  //! Destroys scratch buffer memory
  void destroyScratchBuffers();

  //! Initialize heap resources if uninitialized
  bool initializeHeapResources();

  //! Set GSL sampler to the specified state
  void fillHwSampler(uint32_t state,                       //!< Sampler's OpenCL state
                     void* hwState,                        //!< Sampler's HW state
                     uint32_t hwStateSize,                 //!< Size of sampler's HW state
                     uint32_t mipFilter = CL_FILTER_NONE,  //!< Mip filter
                     float minLod = 0.f,                   //!< Min level of detail
                     float maxLod = CL_MAXFLOAT            //!< Max level of detail
                     ) const;

  //! host memory alloc
  virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;

  //! SVM allocation
  virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
                         cl_svm_mem_flags flags, void* svmPtr) const;

  //! Free host SVM memory
  void hostFree(void* ptr, size_t size) const;

  //! SVM free
  virtual void svmFree(void* ptr) const;

  //! Returns SRD manger object
  SrdManager& srds() const { return *srdManager_; }

  //! Initial the Hardware Debug Manager
  cl_int hwDebugManagerInit(amd::Context* context, uintptr_t messageStorage);

 private:
  //! Disable copy constructor
  Device(const Device&);

  //! Disable assignment
  Device& operator=(const Device&);

  //! Sends the stall command to all queues
  bool stallQueues();

  //! Buffer allocation
  gpu::Memory* createBuffer(amd::Memory& owner,  //!< Abstraction layer memory object
                            bool directAccess    //!< Use direct host memory access
                            ) const;

  //! Image allocation
  gpu::Memory* createImage(amd::Memory& owner,  //!< Abstraction layer memory object
                           bool directAccess    //!< Use direct host memory access
                           ) const;

  //! Allocates/reallocates the scratch buffer, according to the usage
  bool allocScratch(uint regNum,            //!< Number of the scratch registers
                    const VirtualGPU* vgpu  //!< Virtual GPU for the allocation
                    );

  amd::Context* context_;   //!< A dummy context for internal allocations
  Heap heap_;               //!< GPU global heap
  amd::Memory* dummyPage_;  //!< A dummy page for NULL pointer

  amd::Monitor* lockAsyncOps_;             //!< Lock to serialise all async ops on this device
  amd::Monitor* lockAsyncOpsForInitHeap_;  //!< Lock to serialise all async ops on initialization
                                           //!heap operation
  amd::Monitor* vgpusAccess_;              //!< Lock to serialise virtual gpu list access
  amd::Monitor* scratchAlloc_;             //!< Lock to serialise scratch allocation
  amd::Monitor* mapCacheOps_;              //!< Lock to serialise cache for the map resources

  XferBuffers* xferRead_;   //!< Transfer buffers read
  XferBuffers* xferWrite_;  //!< Transfer buffers write

  std::vector<amd::Memory*>* mapCache_;  //!< Map cache info structure
  ResourceCache* resourceCache_;         //!< Resource cache
  Engines engines_;                      //!< Available engines on device
  bool heapInitComplete_;                //!< Keep track of initialization status of heap resources
  VirtualGPU* xferQueue_;                //!< Transfer queue
  std::vector<ScratchBuffer*> scratch_;  //!< Scratch buffers for kernels
  Memory* globalScratchBuf_;             //!< Global scratch buffer
  SrdManager* srdManager_;               //!< SRD manager object

  static AppProfile appProfile_;  //!< application profile
};

/*@}*/} // namespace gpu

#endif /*GPU_HPP_*/