Files
rocm-systems/rocclr/device/gpu/gpuvirtual.hpp
T
Laurent Morichetti b4c6143a2f Update copyright info
Change-Id: Ia4f9ff0f5f873b4223a8cca154188bb0d2f1abba
2020-02-04 09:26:14 -08:00

526 lines
22 KiB
C++

/* Copyright (c) 2008-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef GPUVIRTUAL_HPP_
#define GPUVIRTUAL_HPP_
#include "device/gpu/gpudefs.hpp"
#include "device/gpu/gpuconstbuf.hpp"
#include "device/gpu/gpuprintf.hpp"
#include "device/gpu/gputimestamp.hpp"
#include "device/gpu/gpusched.hpp"
#include "platform/commandqueue.hpp"
#include "device/blit.hpp"
#include "device/gpu/gpudebugger.hpp"
/*! \addtogroup GPU GPU Resource Implementation
* @{
*/
//! GPU Device Implementation
namespace gpu {
// Forward declarations of GPU backend classes (alphabetical order)
class BlitManager;
class CalCounterReference;
class Device;
class HSAILKernel;
class Kernel;
class Memory;
class Program;
class ThreadTrace;
class VirtualGPU;
//! Virtual GPU
class VirtualGPU : public device::VirtualDevice, public CALGSLContext {
public:
struct CommandBatch : public amd::HeapObject {
  amd::Command* head_;           //!< First command of the batch
  GpuEvent events_[AllEngines];  //!< Copy of the last known GPU events
  TimeStamp* lastTS_;            //!< Timestamp object tied to this batch

  //! Builds a batch and copies AllEngines events out of \p events
  CommandBatch(amd::Command* head,      //!< Command batch head
               const GpuEvent* events,  //!< HW events on all engines
               TimeStamp* lastTS        //!< Last TS in command batch
               )
      : head_(head), lastTS_(lastTS) {
    // events_ holds exactly AllEngines entries, so sizeof() covers the whole array
    memcpy(events_, events, sizeof(events_));
  }
};
//! State flags of the virtual GPU, also addressable as one word through value_
union State {
  struct {
    uint boundGlobal_ : 1;  //!< Global buffer was bound
    uint profiling_ : 1;    //!< Profiling is enabled
    uint forceWait_ : 1;    //!< Forces wait in flush()
    uint boundCb_ : 1;      //!< Constant buffer was bound
    uint boundPrintf_ : 1;  //!< Printf buffer was bound
    uint hsailKernel_ : 1;  //!< True if HSAIL kernel was used
  };
  uint value_;  //!< Raw view of all the flags above

  //! Starts with every flag cleared
  State() { value_ = 0; }
};
//! CAL descriptor for the GPU virtual device.
//! Groups the dispatch geometry, the last known events/timestamp and the
//! arrays of constant buffer, UAV, read image and sampler entries.
struct CalVirtualDesc : public amd::EmbeddedObject {
gslDomain3D gridBlock; //!< size of a block of data
gslDomain3D gridSize; //!< size of 'blocks' to execute
gslDomain3D partialGridBlock; //!< Partial grid block
CALuint localSize; //!< size of OpenCL Local Memory in bytes
uint memCount_; //!< Memory objects count
GpuEvent events_[AllEngines]; //!< Last known GPU events (array of AllEngines entries)
uint iterations_; //!< Number of iterations for the execution
TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU (returned by VirtualGPU::currTs())
gslMemObject constBuffers_[MaxConstBuffers]; //!< Constant buffer names
gslMemObject uavs_[MaxUavArguments]; //!< UAV bindings
gslMemObject readImages_[MaxReadImage]; //!< Read images
uint32_t samplersState_[MaxSamplers]; //!< State of all samplers
};
using constbufs_t = std::vector<ConstBuffer*>;  //!< Vector of constant buffer pointers
//! GSL descriptor for the GPU kernel, specific to the virtual device.
//! VirtualGPU keeps pointers to these in its GslKernels map, keyed by CALimage.
struct GslKernelDesc : public amd::HeapObject {
CALimage image_; //!< CAL image for the program
gslProgramObject func_; //!< GSL program object
gslMemObject intCb_; //!< Internal constant buffer
};
//! A resource slot: a GPU memory object pointer together with its binding flags
struct ResourceSlot {
  //! Flags of the slot, also addressable as one word through value_
  union State {
    struct {
      uint bound_ : 1;     //!< Resource is bound
      uint constant_ : 1;  //!< Resource is a constant
    };
    uint value_;  //!< Raw view of the flags above

    //! Starts with every flag cleared
    State() { value_ = 0; }
  };

  State state_;           //!< slot's state
  const Memory* memory_;  //!< GPU memory object

  //! Creates an empty slot: cleared flags and no memory object
  ResourceSlot() : state_(), memory_(NULL) {}

  //! Copy constructor for the kernel argument
  ResourceSlot(const ResourceSlot& data) : state_(), memory_(data.memory_) {
    state_.value_ = data.state_.value_;
  }

  //! Copy assignment: duplicates the flag word and the memory pointer
  ResourceSlot& operator=(const ResourceSlot& data) {
    memory_ = data.memory_;
    state_.value_ = data.state_.value_;
    return *this;
  }
};
class MemoryDependency : public amd::EmbeddedObject {
public:
//! Default constructor
MemoryDependency()
: memObjectsInQueue_(NULL), endMemObjectsInQueue_(0), numMemObjectsInQueue_(0), maxMemObjectsInQueue_(0) {}
~MemoryDependency() { delete[] memObjectsInQueue_; }
//! Creates memory dependecy structure
bool create(size_t numMemObj);
//! Notify the tracker about new kernel
void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; }
//! Validates memory object on dependency
void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly);
//! Clear memory dependency
void clear(bool all = true);
private:
struct MemoryState {
uint64_t start_; //! Busy memory start address
uint64_t end_; //! Busy memory end address
bool readOnly_; //! Current GPU state in the queue
};
MemoryState* memObjectsInQueue_; //!< Memory object state in the queue
size_t endMemObjectsInQueue_; //!< End of mem objects in the queue
size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
};
//! Workload bookkeeping for DMA command buffer flushes and dispatch splitting.
//! Only dispatchSplitSize() is defined inline; the other members are
//! implemented outside this header.
class DmaFlushMgmt : public amd::EmbeddedObject {
public:
//! Constructs the object for the given GPU device
DmaFlushMgmt(const Device& dev);
//! Resets DMA command buffer workload
void resetCbWorkload(const Device& dev);
//! Finds split size for the current dispatch
void findSplitSize(const Device& dev, //!< GPU device object
uint64_t threads, //!< Total number of execution threads
uint instructions //!< Number of ALU instructions
);
//! Returns TRUE if DMA command buffer is ready for a flush
bool isCbReady(VirtualGPU& gpu, //!< Virtual GPU object
uint64_t threads, //!< Total number of execution threads
uint instructions //!< Number of ALU instructions
);
//! Returns dispatch split size
uint dispatchSplitSize() const { return dispatchSplitSize_; }
private:
uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch
uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer
uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer
uint aluCnt_; //!< All ALUs on the chip
uint dispatchSplitSize_; //!< Dispatch split size in elements
};
using ResourceSlots = std::vector<ResourceSlot>;  //!< Vector of resource slots
public:
//! Constructs the virtual GPU for the given GPU device
explicit VirtualGPU(Device& device);
//! Second-stage initialization, implemented outside this header.
//! \param profiling        Profiling flag (presumably enables command profiling - confirm in the .cpp)
//! \param rtCUs            Defaults to amd::CommandQueue::RealTimeDisabled
//! \param deviceQueueSize  Device queue size, defaults to 0
//! \param priority         Queue priority, defaults to Priority::Normal
//! \return bool status reported by the implementation
bool create(bool profiling, uint rtCUs = amd::CommandQueue::RealTimeDisabled,
uint deviceQueueSize = 0,
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
//! Destructor, implemented outside this header
~VirtualGPU();
// Command submission entry points: one method per amd:: command class, each
// taking the command object by reference.
void submitReadMemory(amd::ReadMemoryCommand& vcmd);
void submitWriteMemory(amd::WriteMemoryCommand& vcmd);
void submitCopyMemory(amd::CopyMemoryCommand& vcmd);
//! Defined inline with an empty body: the P2P copy command does nothing here
void submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& vcmd) {}
void submitMapMemory(amd::MapMemoryCommand& vcmd);
void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd);
void submitKernel(amd::NDRangeKernelCommand& vcmd);
//! Kernel submission taking the workload sizes, kernel and parameters directly
//! instead of a command object
bool submitKernelInternal(
const amd::NDRangeContainer& sizes, //!< Workload sizes
const amd::Kernel& kernel, //!< Kernel for execution
const_address parameters, //!< Parameters for the kernel
bool nativeMem = true, //!< Native memory objects
amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command
);
//! Same parameter list as submitKernelInternal()
//! NOTE(review): presumably the HSAIL kernel path - confirm in the .cpp
bool submitKernelInternalHSA(
const amd::NDRangeContainer& sizes, //!< Workload sizes
const amd::Kernel& kernel, //!< Kernel for execution
const_address parameters, //!< Parameters for the kernel
bool nativeMem = true, //!< Native memory objects
amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command
);
void submitNativeFn(amd::NativeFnCommand& vcmd);
void submitFillMemory(amd::FillMemoryCommand& vcmd);
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
void submitMarker(amd::Marker& vcmd);
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
void submitPerfCounter(amd::PerfCounterCommand& vcmd);
void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd);
void submitThreadTrace(amd::ThreadTraceCommand& vcmd);
void submitSignal(amd::SignalCommand& vcmd);
void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd);
// The SVM and file-transfer submissions below are the ones declared virtual
// in this class.
virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);
//! Releases a GSL memory object; \p wait defaults to true
void releaseMemory(gslMemObject gslResource, bool wait = true);
//! Releases the kernel identified by its CAL image
void releaseKernel(CALimage calImage);
//! Flushes the queue; \p list defaults to NULL and \p wait to false
void flush(amd::Command* list = NULL, bool wait = false);
//! Always returns true
bool terminate() { return true; }
//! Returns the GPU device object referenced by this virtual GPU
const Device& dev() const {
  return gpuDevice_;
}
//! Returns a read-only pointer to the CAL descriptor of the virtual device
const CalVirtualDesc* cal() const {
  return &cal_;
}
//! Returns the kernel descriptor currently active on this virtual device
const GslKernelDesc* gslKernelDesc() const {
  return activeKernelDesc_;
}
//! Returns a GPU event, associated with GPU memory
GpuEvent* getGpuEvent(const gslMemObject gslMem //!< GSL mem object
) {
return &gpuEvents_[gslMem];
}
//! Assigns a GPU event, associated with GPU memory
void assignGpuEvent(const gslMemObject gslMem, //!< GSL mem object
GpuEvent gpuEvent) {
gpuEvents_[gslMem] = gpuEvent;
}
//! Set the kernel as active
//! \return bool status reported by the implementation
bool setActiveKernelDesc(const amd::NDRangeContainer& sizes, //!< kernel execution work sizes
const Kernel* kernel //!< GPU kernel object
);
//! Set the last known GPU event
void setGpuEvent(GpuEvent gpuEvent, //!< GPU event for tracking
bool flush = false //!< TRUE if flush is required
);
//! Flush DMA buffer on the specified engine
void flushDMA(uint engineID //!< Engine ID for DMA flush
);
//! Wait for all engines on this Virtual GPU
//! \return TRUE if CPU didn't wait for GPU
bool waitAllEngines(CommandBatch* cb = NULL //!< Command batch (optional, defaults to NULL)
);
//! Waits for the latest GPU event with a lock to prevent multiple entries
void waitEventLock(CommandBatch* cb //!< Command batch
);
//! Returns the constant buffer stored at index \p idx (the index is not range-checked)
const ConstBuffer* cb(uint idx) const {
  const ConstBuffer* buffer = constBufs_[idx];
  return buffer;
}
//! Appends a constant buffer to the constant buffer vector
void addConstBuffer(ConstBuffer* cb) {
  constBufs_.push_back(cb);
}
constbufs_t constBufs_;  //!< constant buffers
//! Start the command profiling
void profilingBegin(amd::Command& command, //!< Command object to profile
bool drmProfiling = false //!< Measure DRM time
);
//! End the command profiling
void profilingEnd(amd::Command& command);
//! Collect the profiling results
//! \return bool status reported by the implementation
bool profilingCollectResults(CommandBatch* cb, //!< Command batch
const amd::Event* waitingEvent //!< Waiting event
);
//! Adds a memory handle into the GSL memory array for Virtual Heap
//! \return bool status reported by the implementation
bool addVmMemory(const Memory* memory //!< GPU memory object
);
//! Adds a stage write buffer into a list
void addXferWrite(Memory& memory);
//! Adds a pinned memory object into the pinned memory list
void addPinnedMem(amd::Memory* mem);
//! Release pinned memory objects
void releasePinnedMem();
//! Finds if pinned memory is cached
//! \return amd::Memory pointer (presumably NULL when nothing matches - confirm in the .cpp)
amd::Memory* findPinnedMem(void* addr, size_t size);
//! Returns the array of GSL memory objects for VM
const gslMemObject* vmMems() const {
  return vmMems_;
}
//! Returns the PrintfDbg object by reference (printfDbg_ is dereferenced unchecked)
PrintfDbg& printfDbg() const {
  return *printfDbg_;
}
//! Returns the PrintfDbgHSA object by reference (printfDbgHSA_ is dereferenced unchecked)
PrintfDbgHSA& printfDbgHSA() const {
  return *printfDbgHSA_;
}
//! Enables synchronized transfers
void enableSyncedBlit() const;
//! Tells whether the profiling flag of the state word is set
bool profiling() const {
  return state_.profiling_ != 0;
}
//! Returns the memory dependency tracker
MemoryDependency& memoryDependency() {
  return memoryDependency_;
}
//! Returns hsaQueueMem_
const Memory* hsaQueueMem() const {
  return hsaQueueMem_;
}
//! Returns the DMA flush management structure
const DmaFlushMgmt& dmaFlushMgmt() const {
  return dmaFlushMgmt_;
}
//! Releases GSL memory objects allocated on this queue
void releaseMemObjects(bool scratch = true);
//! Returns the HW ring used on this virtual device
uint hwRing() const {
  return hwRing_;
}
//! Returns current timestamp object for profiling (the lastTS_ field of the CAL descriptor)
TimeStamp* currTs() const {
  return cal_.lastTS_;
}
//! Returns the virtual queue object for device enqueuing
Memory* vQueue() const {
  return virtualQueue_;
}
//! Update virtual queue header
void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);
//! Returns TRUE if virtual queue was successfully allocated
bool createVirtualQueue(uint deviceQueueSize //!< Device queue size
);
// Public data members
EngineType engineID_; //!< Engine ID for this VirtualGPU
ResourceSlots slots_; //!< Resource slots for kernel arguments
State state_; //!< virtual GPU current state
CalVirtualDesc cal_; //!< CAL virtual device descriptor
//! Flush/invalidate SQ cache, selected by \p cache_mask
void flushCuCaches(HwDbgGpuCacheMask cache_mask);
protected:
//! Profiling hook for an engine event
//! NOTE(review): the meaning of \p type is not visible in this header - confirm in the .cpp
virtual void profileEvent(EngineType engine, bool type) const;
//! Creates buffer object from image
amd::Memory* createBufferFromImage(
amd::Memory& amdImage //!< The parent image object (untiled images only)
);
private:
//! Maps a CAL image to a pointer to its GSL kernel descriptor
using GslKernels = std::unordered_map<CALimage, GslKernelDesc*>;
//! Maps a GSL memory object to the GPU event stored for it
using GpuEvents = std::unordered_map<gslMemObject, GpuEvent>;
//! Finds total amount of necessary iterations.
//! Results are written through the non-const reference parameters
//! (\p groups, \p remainder, \p extra).
inline void findIterations(const amd::NDRangeContainer& sizes, //!< Original workload sizes
const amd::NDRange& local, //!< Local workgroup size
amd::NDRange& groups, //!< Calculated workgroup sizes
amd::NDRange& remainder, //!< Calculated remainder sizes
size_t& extra //!< Amount of extra executions for remainder
);
//! Sets up workloads for the current iteration.
//! The NDRange parameters are taken by non-const reference.
inline void setupIteration(
uint iteration, //!< Current iteration
const amd::NDRangeContainer& sizes, //!< Original workload sizes
Kernel& gpuKernel, //!< GPU kernel
amd::NDRange& global, //!< Global size for the current iteration
amd::NDRange& offsets, //!< Offsets for the current iteration
amd::NDRange& local, //!< Local sizes for the current iteration
amd::NDRange& groups, //!< Group sizes for the current iteration
amd::NDRange& groupOffset, //!< Group offsets for the current iteration
amd::NDRange& divider, //!< Group divider
amd::NDRange& remainder, //!< Remaining workload
size_t extra //!< Extra groups
);
//! Allocates constant buffers
bool allocConstantBuffers();
//! Allocates CAL kernel descriptor of the virtual device
GslKernelDesc* allocKernelDesc(const Kernel* kernel, //!< Kernel object
CALimage calImage); //!< CAL image
//! Frees CAL kernel descriptor of the virtual device
void freeKernelDesc(GslKernelDesc* desc);
//! Opens the GSL side of the virtual GPU for the given engines
//! NOTE(review): details are not visible in this header - confirm in the .cpp
bool gslOpen(uint nEngines, gslEngineDescriptor* engines, uint32_t rtCUs);
//! Counterpart of gslOpen()
void gslDestroy();
//! Releases stage write buffers
void releaseXferWrite();
//! Allocate hsaQueueMem_
bool allocHsaQueueMem();
//! Awaits a command batch with a waiting event
bool awaitCompletion(CommandBatch* cb, //!< Command batch to wait for
const amd::Event* waitingEvent = NULL //!< A waiting event
);
//! Validates the scratch buffer memory for a specified kernel
void validateScratchBuffer(const Kernel* kernel //!< Kernel for validation
);
//! Detects memory dependency for HSAIL kernels and flushes caches
bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution
const_address params, //!< Pointer to the param's store
bool nativeMem, //!< Native memory objects
std::vector<const Memory*>* memList //!< Memory list for KMD tracking
);
//! Common function for fill memory used by both svm Fill and non-svm fill
bool fillMemory(cl_command_type type, //!< the command type
amd::Memory* amdMemory, //!< memory object to fill
const void* pattern, //!< pattern to fill the memory
size_t patternSize, //!< pattern size
const amd::Coord3D& origin, //!< memory origin
const amd::Coord3D& size //!< memory size for filling
);
//! Memory copy helper taking the command type and both memory objects
//! NOTE(review): presumably shared by the SVM and non-SVM copy paths like
//! fillMemory() - confirm in the .cpp
bool copyMemory(cl_command_type type, //!< the command type
amd::Memory& srcMem, //!< source memory object
amd::Memory& dstMem, //!< destination memory object
bool entire, //!< flag of entire memory copy
const amd::Coord3D& srcOrigin, //!< source memory origin
const amd::Coord3D& dstOrigin, //!< destination memory origin
const amd::Coord3D& size, //!< copy size
const amd::BufferRect& srcRect, //!< region of source for copy
const amd::BufferRect& dstRect //!< region of destination for copy
);
//! Builds the debugger kernel info for a dispatch
void buildKernelInfo(const HSAILKernel& hsaKernel, //!< hsa kernel
hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch
HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch
amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command
);
//! Assigns the debug trap handler according to the debug settings
void assignDebugTrapHandler(const DebugToolInfo& dbgSetting, //!< debug settings
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
);
// Private data members
GslKernels gslKernels_; //!< GSL kernel descriptors
GslKernelDesc* activeKernelDesc_; //!< active GSL kernel descriptors
GpuEvents gpuEvents_; //!< GPU events, keyed by GSL memory object
Device& gpuDevice_; //!< physical GPU device
PrintfDbg* printfDbg_; //!< GPU printf implementation
PrintfDbgHSA* printfDbgHSA_; //!< HSAIL printf implementation
TimeStampCache* tsCache_; //!< TimeStamp cache
MemoryDependency memoryDependency_; //!< Memory dependency class
gslMemObject* vmMems_; //!< Array of GSL memories for VM mode
uint numVmMems_; //!< Number of entries in VM mem array
DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management
std::list<Memory*> xferWriteBuffers_; //!< Stage write buffers
std::list<amd::Memory*> pinnedMems_; //!< Pinned memory list
typedef std::list<CommandBatch*> CommandBatchList;
CommandBatchList cbList_; //!< List of command batches
uint hwRing_; //!< HW ring used on this virtual device
uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
// NOTE(review): the currTs() accessor returns cal_.lastTS_, not this member - confirm that is intended
TimeStamp* currTs_; //!< current timestamp for command
AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header
Memory* virtualQueue_; //!< Virtual device queue
Memory* schedParams_; //!< The scheduler parameters
uint schedParamIdx_; //!< Index in the scheduler parameters buffer
uint deviceQueueSize_; //!< Device queue size
uint maskGroups_; //!< The number of mask groups processed in the scheduler by one thread
Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object
// NOTE(review): the profiling() accessor reads state_.profiling_, not this flag - confirm both are kept in sync
bool profileEnabled_; //!< Profiling is enabled
};
/*@}*/} // namespace gpu
#endif /*GPUVIRTUAL_HPP_*/