rocm-systems/rocclr/runtime/device/gpu/gpuvirtual.hpp

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//

#ifndef GPUVIRTUAL_HPP_
#define GPUVIRTUAL_HPP_

#include "device/gpu/gpudefs.hpp"
#include "device/gpu/gpuconstbuf.hpp"
#include "device/gpu/gpuprintf.hpp"
#include "device/gpu/gputimestamp.hpp"
#include "device/gpu/gpusched.hpp"
#include "platform/commandqueue.hpp"
#include "device/blit.hpp"

#include "device/gpu/gpudebugger.hpp"


/*! \addtogroup GPU GPU Resource Implementation
 *  @{
 */

//! GPU Device Implementation
namespace gpu {

class Device;
class Kernel;
class Memory;
class CalCounterReference;
class VirtualGPU;
class Program;
class BlitManager;
class ThreadTrace;
class HSAILKernel;

//! Virtual GPU
class VirtualGPU : public device::VirtualDevice, public CALGSLContext {
 public:
  struct CommandBatch : public amd::HeapObject {
    amd::Command* head_;           //!< Command batch head
    GpuEvent events_[AllEngines];  //!< Last known GPU events
    TimeStamp* lastTS_;            //!< TS associated with command batch

    //! Constructor
    CommandBatch(amd::Command* head,      //!< Command batch head
                 const GpuEvent* events,  //!< HW events on all engines
                 TimeStamp* lastTS        //!< Last TS in command batch
                 )
        : head_(head), lastTS_(lastTS) {
      memcpy(&events_, events, AllEngines * sizeof(GpuEvent));
    }
  };

  //! The virtual GPU states
  union State {
    struct {
      uint boundGlobal_ : 1;  //!< Global buffer was bound
      uint profiling_ : 1;    //!< Profiling is enabled
      uint forceWait_ : 1;    //!< Forces wait in flush()
      uint boundCb_ : 1;      //!< Constant buffer was bound
      uint boundPrintf_ : 1;  //!< Printf buffer was bound
      uint hsailKernel_ : 1;  //!< True if HSAIL kernel was used
    };
    uint value_;
    State() : value_(0) {}
  };

  //! CAL descriptor for the GPU virtual device
  struct CalVirtualDesc : public amd::EmbeddedObject {
    gslDomain3D gridBlock;                        //!< size of a block of data
    gslDomain3D gridSize;                         //!< size of 'blocks' to execute
    gslDomain3D partialGridBlock;                 //!< Partial grid block
    CALuint localSize;                            //!< size of OpenCL Local Memory in bytes
    uint memCount_;                               //!< Memory objects count
    GpuEvent events_[AllEngines];                 //!< Last known GPU events
    uint iterations_;                             //!< Number of iterations for the execution
    TimeStamp* lastTS_;                           //!< Last timestamp executed on Virtual GPU
    gslMemObject constBuffers_[MaxConstBuffers];  //!< Constant buffer names
    gslMemObject uavs_[MaxUavArguments];          //!< UAV bindings
    gslMemObject readImages_[MaxReadImage];       //!< Read images
    uint32_t samplersState_[MaxSamplers];         //!< State of all samplers
  };

  typedef std::vector<ConstBuffer*> constbufs_t;

  //! GSL descriptor for the GPU kernel, specific to the virtual device
  struct GslKernelDesc : public amd::HeapObject {
    CALimage image_;         //!< CAL image for the program
    gslProgramObject func_;  //!< GSL program object
    gslMemObject intCb_;     //!< Internal constant buffer
  };

  struct ResourceSlot {
    union State {
      struct {
        uint bound_ : 1;     //!< Resource is bound
        uint constant_ : 1;  //!< Resource is a constant
      };
      uint value_;
      State() : value_(0) {}
    };

    State state_;           //!< slot's state
    const Memory* memory_;  //!< GPU memory object

    ResourceSlot() : memory_(NULL) {}

    //! Copy constructor for the kernel argument
    ResourceSlot(const ResourceSlot& data) { *this = data; }

    //! Overloads operator=
    ResourceSlot& operator=(const ResourceSlot& data) {
      state_.value_ = data.state_.value_;
      memory_ = data.memory_;
      return *this;
    }
  };

  class MemoryDependency : public amd::EmbeddedObject {
   public:
    //! Default constructor
    MemoryDependency()
        : memObjectsInQueue_(NULL), endMemObjectsInQueue_(0), numMemObjectsInQueue_(0), maxMemObjectsInQueue_(0) {}

    ~MemoryDependency() { delete[] memObjectsInQueue_; }

    //! Creates memory dependecy structure
    bool create(size_t numMemObj);

    //! Notify the tracker about new kernel
    void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; }

    //! Validates memory object on dependency
    void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly);

    //! Clear memory dependency
    void clear(bool all = true);

   private:
    struct MemoryState {
      uint64_t start_;  //! Busy memory start address
      uint64_t end_;    //! Busy memory end address
      bool readOnly_;   //! Current GPU state in the queue
    };

    MemoryState* memObjectsInQueue_;  //!< Memory object state in the queue
    size_t endMemObjectsInQueue_;     //!< End of mem objects in the queue
    size_t numMemObjectsInQueue_;     //!< Number of mem objects in the queue
    size_t maxMemObjectsInQueue_;     //!< Maximum number of mem objects in the queue
  };


  class DmaFlushMgmt : public amd::EmbeddedObject {
   public:
    DmaFlushMgmt(const Device& dev);

    // Resets DMA command buffer workload
    void resetCbWorkload(const Device& dev);

    // Finds split size for the current dispatch
    void findSplitSize(const Device& dev,  //!< GPU device object
                       uint64_t threads,   //!< Total number of execution threads
                       uint instructions   //!< Number of ALU instructions
                       );

    // Returns TRUE if DMA command buffer is ready for a flush
    bool isCbReady(VirtualGPU& gpu,   //!< Virtual GPU object
                   uint64_t threads,  //!< Total number of execution threads
                   uint instructions  //!< Number of ALU instructions
                   );

    // Returns dispatch split size
    uint dispatchSplitSize() const { return dispatchSplitSize_; }

   private:
    uint64_t maxDispatchWorkload_;  //!< Maximum number of operations for a single dispatch
    uint64_t maxCbWorkload_;        //!< Maximum number of operations for DMA command buffer
    uint64_t cbWorkload_;           //!< Current number of operations in DMA command buffer
    uint aluCnt_;                   //!< All ALUs on the chip
    uint dispatchSplitSize_;        //!< Dispath split size in elements
  };

  typedef std::vector<ResourceSlot> ResourceSlots;

 public:
  explicit VirtualGPU(Device& device);
  bool create(bool profiling, uint rtCUs = amd::CommandQueue::RealTimeDisabled,
              uint deviceQueueSize = 0,
              amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
  ~VirtualGPU();

  void submitReadMemory(amd::ReadMemoryCommand& vcmd);
  void submitWriteMemory(amd::WriteMemoryCommand& vcmd);
  void submitCopyMemory(amd::CopyMemoryCommand& vcmd);
  void submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& vcmd) {}
  void submitMapMemory(amd::MapMemoryCommand& vcmd);
  void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd);
  void submitKernel(amd::NDRangeKernelCommand& vcmd);
  bool submitKernelInternal(
      const amd::NDRangeContainer& sizes,  //!< Workload sizes
      const amd::Kernel& kernel,           //!< Kernel for execution
      const_address parameters,            //!< Parameters for the kernel
      bool nativeMem = true,               //!< Native memory objects
      amd::Event* enqueueEvent = NULL      //!< Event provided in the enqueue kernel command
      );
  bool submitKernelInternalHSA(
      const amd::NDRangeContainer& sizes,  //!< Workload sizes
      const amd::Kernel& kernel,           //!< Kernel for execution
      const_address parameters,            //!< Parameters for the kernel
      bool nativeMem = true,               //!< Native memory objects
      amd::Event* enqueueEvent = NULL      //!< Event provided in the enqueue kernel command
      );
  void submitNativeFn(amd::NativeFnCommand& vcmd);
  void submitFillMemory(amd::FillMemoryCommand& vcmd);
  void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
  void submitMarker(amd::Marker& vcmd);
  void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
  void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
  void submitPerfCounter(amd::PerfCounterCommand& vcmd);
  void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd);
  void submitThreadTrace(amd::ThreadTraceCommand& vcmd);
  void submitSignal(amd::SignalCommand& vcmd);
  void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd);
  virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
  virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
  virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
  virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
  virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
  virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);

  void releaseMemory(gslMemObject gslResource, bool wait = true);
  void releaseKernel(CALimage calImage);

  void flush(amd::Command* list = NULL, bool wait = false);
  bool terminate() { return true; }

  //! Returns GPU device object associated with this kernel
  const Device& dev() const { return gpuDevice_; }

  //! Returns CAL descriptor of the virtual device
  const CalVirtualDesc* cal() const { return &cal_; }

  //! Returns active kernel descriptor for this virtual device
  const GslKernelDesc* gslKernelDesc() const { return activeKernelDesc_; }

  //! Returns a GPU event, associated with GPU memory
  GpuEvent* getGpuEvent(const gslMemObject gslMem  //!< GSL mem object
                        ) {
    return &gpuEvents_[gslMem];
  }

  //! Assigns a GPU event, associated with GPU memory
  void assignGpuEvent(const gslMemObject gslMem,  //!< GSL mem object
                      GpuEvent gpuEvent) {
    gpuEvents_[gslMem] = gpuEvent;
  }

  //! Set the kernel as active
  bool setActiveKernelDesc(const amd::NDRangeContainer& sizes,  //!< kernel execution work sizes
                           const Kernel* kernel                 //!< GPU kernel object
                           );

  //! Set the last known GPU event
  void setGpuEvent(GpuEvent gpuEvent,  //!< GPU event for tracking
                   bool flush = false  //!< TRUE if flush is required
                   );

  //! Flush DMA buffer on the specified engine
  void flushDMA(uint engineID  //!< Engine ID for DMA flush
                );

  //! Wait for all engines on this Virtual GPU
  //! Returns TRUE if CPU didn't wait for GPU
  bool waitAllEngines(CommandBatch* cb = NULL  //!< Command batch
                      );

  //! Waits for the latest GPU event with a lock to prevent multiple entries
  void waitEventLock(CommandBatch* cb  //!< Command batch
                     );

  //! Returns a resource associated with the constant buffer
  const ConstBuffer* cb(uint idx) const { return constBufs_[idx]; }

  //! Adds CAL objects into the constant buffer vector
  void addConstBuffer(ConstBuffer* cb) { constBufs_.push_back(cb); }

  constbufs_t constBufs_;  //!< constant buffers

  //! Start the command profiling
  void profilingBegin(amd::Command& command,     //!< Command queue object
                      bool drmProfiling = false  //!< Measure DRM time
                      );

  //! End the command profiling
  void profilingEnd(amd::Command& command);

  //! Collect the profiling results
  bool profilingCollectResults(CommandBatch* cb,               //!< Command batch
                               const amd::Event* waitingEvent  //!< Waiting event
                               );

  //! Adds a memory handle into the GSL memory array for Virtual Heap
  bool addVmMemory(const Memory* memory  //!< GPU memory object
                   );

  //! Adds a stage write buffer into a list
  void addXferWrite(Memory& memory);

  //! Adds a pinned memory object into a map
  void addPinnedMem(amd::Memory* mem);

  //! Release pinned memory objects
  void releasePinnedMem();

  //! Finds if pinned memory is cached
  amd::Memory* findPinnedMem(void* addr, size_t size);

  //! Returns gsl memory object for VM
  const gslMemObject* vmMems() const { return vmMems_; }

  //! Returns the monitor object for execution access by VirtualGPU
  amd::Monitor& execution() { return execution_; }

  //! Returns the virtual gpu unique index
  uint index() const { return index_; }

  //! Get the PrintfDbg object
  PrintfDbg& printfDbg() const { return *printfDbg_; }

  //! Get the PrintfDbgHSA object
  PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; }

  //! Enables synchronized transfers
  void enableSyncedBlit() const;

  //! Checks if profiling is enabled
  bool profiling() const { return state_.profiling_; }

  //! Returns memory dependency class
  MemoryDependency& memoryDependency() { return memoryDependency_; }

  //! Returns hsaQueueMem_
  const Memory* hsaQueueMem() const { return hsaQueueMem_; }

  //! Returns DMA flush management structure
  const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; }

  //! Releases GSL memory objects allocated on this queue
  void releaseMemObjects(bool scratch = true);

  //! Returns the HW ring used on this virtual device
  uint hwRing() const { return hwRing_; }

  //! Returns current timestamp object for profiling
  TimeStamp* currTs() const { return cal_.lastTS_; }

  //! Returns virtual queue object for device enqueuing
  Memory* vQueue() const { return virtualQueue_; }

  //! Update virtual queue header
  void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);

  //! Returns TRUE if virtual queue was successfully allocatted
  bool createVirtualQueue(uint deviceQueueSize  //!< Device queue size
                          );

  EngineType engineID_;  //!< Engine ID for this VirtualGPU
  ResourceSlots slots_;  //!< Resource slots for kernel arguments
  State state_;          //!< virtual GPU current state
  CalVirtualDesc cal_;   //!< CAL virtual device descriptor

  void flushCuCaches(HwDbgGpuCacheMask cache_mask);  //!< flush/invalidate SQ cache

 protected:
  virtual void profileEvent(EngineType engine, bool type) const;

  //! Creates buffer object from image
  amd::Memory* createBufferFromImage(
      amd::Memory& amdImage  //! The parent image object(untiled images only)
      );

 private:
  typedef std::unordered_map<CALimage, GslKernelDesc*> GslKernels;
  typedef std::unordered_map<gslMemObject, GpuEvent> GpuEvents;

  //! Finds total amount of necessary iterations
  inline void findIterations(const amd::NDRangeContainer& sizes,  //!< Original workload sizes
                             const amd::NDRange& local,           //!< Local workgroup size
                             amd::NDRange& groups,                //!< Calculated workgroup sizes
                             amd::NDRange& remainder,             //!< Calculated remainder sizes
                             size_t& extra  //!< Amount of extra executions for remainder
                             );

  //! Setups workloads for the current iteration
  inline void setupIteration(
      uint iteration,                      //!< Current iteration
      const amd::NDRangeContainer& sizes,  //!< Original workload sizes
      Kernel& gpuKernel,                   //!< GPU kernel
      amd::NDRange& global,                //!< Global size for the current iteration
      amd::NDRange& offsets,               //!< Offsets for the current iteration
      amd::NDRange& local,                 //!< Local sizes for the current iteration
      amd::NDRange& groups,                //!< Group sizes for the current iteration
      amd::NDRange& groupOffset,           //!< Group offsets for the current iteration
      amd::NDRange& divider,               //!< Group divider
      amd::NDRange& remainder,             //!< Remain workload
      size_t extra                         //!< Extra groups
      );

  //! Allocates constant buffers
  bool allocConstantBuffers();

  //! Allocates CAL kernel descriptor of the virtual device
  GslKernelDesc* allocKernelDesc(const Kernel* kernel,  //!< Kernel object
                                 CALimage calImage);    //!< CAL image

  //! Frees CAL kernel descriptor of the virtual device
  void freeKernelDesc(GslKernelDesc* desc);

  bool gslOpen(uint nEngines, gslEngineDescriptor* engines, uint32_t rtCUs);
  void gslDestroy();

  //! Releases stage write buffers
  void releaseXferWrite();

  //! Allocate hsaQueueMem_
  bool allocHsaQueueMem();

  //! Awaits a command batch with a waiting event
  bool awaitCompletion(CommandBatch* cb,                      //!< Command batch for to wait
                       const amd::Event* waitingEvent = NULL  //!< A waiting event
                       );

  //! Validates the scratch buffer memory for a specified kernel
  void validateScratchBuffer(const Kernel* kernel  //!< Kernel for validaiton
                             );

  //! Detects memory dependency for HSAIL kernels and flushes caches
  bool processMemObjectsHSA(const amd::Kernel& kernel,  //!< AMD kernel object for execution
                            const_address params,       //!< Pointer to the param's store
                            bool nativeMem,             //!< Native memory objects
                            std::vector<const Memory*>* memList  //!< Memory list for KMD tracking
                            );

  //! Common function for fill memory used by both svm Fill and non-svm fill
  bool fillMemory(cl_command_type type,        //!< the command type
                  amd::Memory* amdMemory,      //!< memory object to fill
                  const void* pattern,         //!< pattern to fill the memory
                  size_t patternSize,          //!< pattern size
                  const amd::Coord3D& origin,  //!< memory origin
                  const amd::Coord3D& size     //!< memory size for filling
                  );

  bool copyMemory(cl_command_type type,            //!< the command type
                  amd::Memory& srcMem,             //!< source memory object
                  amd::Memory& dstMem,             //!< destination memory object
                  bool entire,                     //!< flag of entire memory copy
                  const amd::Coord3D& srcOrigin,   //!< source memory origin
                  const amd::Coord3D& dstOrigin,   //!< destination memory object
                  const amd::Coord3D& size,        //!< copy size
                  const amd::BufferRect& srcRect,  //!< region of source for copy
                  const amd::BufferRect& dstRect   //!< region of destination for copy
                  );

  void buildKernelInfo(const HSAILKernel& hsaKernel,          //!< hsa kernel
                       hsa_kernel_dispatch_packet_t* aqlPkt,  //!< aql packet for dispatch
                       HwDbgKernelInfo& kernelInfo,           //!< kernel info for the dispatch
                       amd::Event* enqueueEvent  //!< Event provided in the enqueue kernel command
                       );

  void assignDebugTrapHandler(const DebugToolInfo& dbgSetting,  //!< debug settings
                              HwDbgKernelInfo& kernelInfo       //!< kernel info for the dispatch
                              );

  GslKernels gslKernels_;            //!< GSL kernel descriptors
  GslKernelDesc* activeKernelDesc_;  //!< active GSL kernel descriptors
  GpuEvents gpuEvents_;              //!< GPU events

  Device& gpuDevice_;       //!< physical GPU device
  amd::Monitor execution_;  //!< Lock to serialise access to all device objects
  uint index_;              //!< The virtual device unique index

  PrintfDbg* printfDbg_;        //!< GPU printf implemenation
  PrintfDbgHSA* printfDbgHSA_;  //!< HSAIL printf implemenation

  TimeStampCache* tsCache_;            //!< TimeStamp cache
  MemoryDependency memoryDependency_;  //!< Memory dependency class

  gslMemObject* vmMems_;  //!< Array of GSL memories for VM mode
  uint numVmMems_;        //!< Number of entries in VM mem array

  DmaFlushMgmt dmaFlushMgmt_;  //!< DMA flush management

  std::list<Memory*> xferWriteBuffers_;  //!< Stage write buffers
  std::list<amd::Memory*> pinnedMems_;   //!< Pinned memory list

  typedef std::list<CommandBatch*> CommandBatchList;
  CommandBatchList cbList_;  //!< List of command batches

  uint hwRing_;  //!< HW ring used on this virtual device

  uint64_t readjustTimeGPU_;  //!< Readjust time between GPU and CPU timestamps
  TimeStamp* currTs_;         //!< current timestamp for command

  AmdVQueueHeader* vqHeader_;  //!< Sysmem copy for virtual queue header
  Memory* virtualQueue_;       //!< Virtual device queue
  Memory* schedParams_;        //!< The scheduler parameters
  uint schedParamIdx_;         //!< Index in the scheduler parameters buffer
  uint deviceQueueSize_;       //!< Device queue size
  uint maskGroups_;  //!< The number of mask groups processed in the scheduler by one thread

  Memory* hsaQueueMem_;  //!< Memory for the amd_queue_t object
  bool profileEnabled_;  //!< Profiling is enabled
};

/*@}*/} // namespace gpu

#endif /*GPUVIRTUAL_HPP_*/