/*
 * File
 * rocm-systems/rocclr/runtime/device/gpu/gpuvirtual.hpp
 * T
 * foreman c11b2d52b7 P4 to Git Change 1552704 by gandryey@gera-w8 on 2018/05/09 15:11:23
 * SWDEV-79445 - OCL generic changes and code clean-up
 * 	- Following CL#1552596. Make sure virtual GPU is set for the internal allocations before the create() call, since the deferred alloc is disabled.
 *
 * Affected files ...
 *
 * ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#128 edit
 * ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#416 edit
 * ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#144 edit
 * ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#22 edit
 * ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#96 edit
 * ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#51 edit
 * ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#21 edit
 * 2018-05-09 15:16:40 -04:00
 *
 * 518 lines
 * 21 KiB
 * C++
 */

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef GPUVIRTUAL_HPP_
#define GPUVIRTUAL_HPP_
#include "device/gpu/gpudefs.hpp"
#include "device/gpu/gpuconstbuf.hpp"
#include "device/gpu/gpuprintf.hpp"
#include "device/gpu/gputimestamp.hpp"
#include "device/gpu/gpusched.hpp"
#include "platform/commandqueue.hpp"
#include "device/blit.hpp"
#include "device/gpu/gpudebugger.hpp"
/*! \addtogroup GPU GPU Resource Implementation
* @{
*/
//! GPU Device Implementation
namespace gpu {
//! Forward declarations of types in namespace gpu.
//! Not all of them are referenced by the declarations visible in this header.
class Device;
class Kernel;
class Memory;
class CalCounterReference;
class VirtualGPU;
class Program;
class BlitManager;
class ThreadTrace;
class HSAILKernel;
//! Virtual GPU
class VirtualGPU : public device::VirtualDevice, public CALGSLContext {
public:
//! Record of one command batch: its first command, a snapshot of the GPU
//! events on every engine and the timestamp object tied to the batch
struct CommandBatch : public amd::HeapObject {
  amd::Command* head_;           //!< First command of the batch
  GpuEvent events_[AllEngines];  //!< Copy of the GPU events, one entry per engine
  TimeStamp* lastTS_;            //!< Timestamp object associated with the batch

  /*! \brief Builds the batch record
   *  \param head    first command of the batch
   *  \param events  array of AllEngines GPU events; copied bytewise into events_
   *  \param lastTS  last timestamp in the batch
   */
  CommandBatch(amd::Command* head, const GpuEvent* events, TimeStamp* lastTS)
      : head_(head), lastTS_(lastTS) {
    // events_ is GpuEvent[AllEngines], so sizeof(events_) equals
    // AllEngines * sizeof(GpuEvent)
    memcpy(events_, events, sizeof(events_));
  }
};
//! State flags of the virtual GPU, accessible either as individual one-bit
//! fields or as a single word through value_
union State {
  struct {
    uint boundGlobal_ : 1;  //!< A global buffer has been bound
    uint profiling_ : 1;    //!< Profiling is turned on
    uint forceWait_ : 1;    //!< flush() is forced to wait
    uint boundCb_ : 1;      //!< A constant buffer has been bound
    uint boundPrintf_ : 1;  //!< The printf buffer has been bound
    uint hsailKernel_ : 1;  //!< Set when an HSAIL kernel was used
  };
  uint value_;  //!< All flags viewed as one word

  //! Starts with every flag cleared
  State() { value_ = 0; }
};
//! CAL descriptor for the GPU virtual device
//! Data-only struct: no constructor is declared here, so this struct does not
//! initialize its members itself.
//! NOTE(review): confirm where the fields are zeroed (owner or amd::EmbeddedObject).
struct CalVirtualDesc : public amd::EmbeddedObject {
gslDomain3D gridBlock; //!< size of a block of data
gslDomain3D gridSize; //!< size of 'blocks' to execute
gslDomain3D partialGridBlock; //!< Partial grid block
CALuint localSize; //!< size of OpenCL Local Memory in bytes
uint memCount_; //!< Memory objects count
GpuEvent events_[AllEngines]; //!< Last known GPU events
uint iterations_; //!< Number of iterations for the execution
TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU
gslMemObject constBuffers_[MaxConstBuffers]; //!< Constant buffer names
gslMemObject uavs_[MaxUavArguments]; //!< UAV bindings
gslMemObject readImages_[MaxReadImage]; //!< Read images
uint32_t samplersState_[MaxSamplers]; //!< State of all samplers
};
//! Container type used for the constant buffers of the virtual GPU
typedef std::vector<ConstBuffer*> constbufs_t;
//! GSL descriptor for the GPU kernel, specific to the virtual device
//! Data-only struct; no constructor is declared here.
struct GslKernelDesc : public amd::HeapObject {
CALimage image_; //!< CAL image for the program
gslProgramObject func_; //!< GSL program object
gslMemObject intCb_; //!< Internal constant buffer
};
//! One resource slot: a small set of state flags plus the GPU memory object
//! currently attached to the slot
struct ResourceSlot {
  //! Slot flags, readable bit by bit or as one word through value_
  union State {
    struct {
      uint bound_ : 1;     //!< The resource is bound
      uint constant_ : 1;  //!< The resource is a constant
    };
    uint value_;
    //! Starts with every flag cleared
    State() { value_ = 0; }
  };

  State state_;           //!< State flags of the slot
  const Memory* memory_;  //!< GPU memory object

  //! Creates an empty slot: cleared flags and no memory object
  ResourceSlot() : state_(), memory_(NULL) {}

  //! Copy constructor: takes over the flags word and the memory pointer
  ResourceSlot(const ResourceSlot& data) : state_(), memory_(data.memory_) {
    state_.value_ = data.state_.value_;
  }

  //! Assignment: copies the flags word and the memory pointer
  ResourceSlot& operator=(const ResourceSlot& data) {
    memory_ = data.memory_;
    state_.value_ = data.state_.value_;
    return *this;
  }
};
class MemoryDependency : public amd::EmbeddedObject {
public:
//! Default constructor
MemoryDependency()
: memObjectsInQueue_(NULL), endMemObjectsInQueue_(0), numMemObjectsInQueue_(0), maxMemObjectsInQueue_(0) {}
~MemoryDependency() { delete[] memObjectsInQueue_; }
//! Creates memory dependecy structure
bool create(size_t numMemObj);
//! Notify the tracker about new kernel
void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; }
//! Validates memory object on dependency
void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly);
//! Clear memory dependency
void clear(bool all = true);
private:
struct MemoryState {
uint64_t start_; //! Busy memory start address
uint64_t end_; //! Busy memory end address
bool readOnly_; //! Current GPU state in the queue
};
MemoryState* memObjectsInQueue_; //!< Memory object state in the queue
size_t endMemObjectsInQueue_; //!< End of mem objects in the queue
size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
};
//! DMA flush management: keeps workload counters for the DMA command buffer
//! and the dispatch split size (see the data members below).
//! Only dispatchSplitSize() is defined inline; the other members are defined elsewhere.
class DmaFlushMgmt : public amd::EmbeddedObject {
public:
//! Constructor. NOTE(review): single-argument and not explicit, so a Device
//! converts implicitly to DmaFlushMgmt -- confirm this is intended.
DmaFlushMgmt(const Device& dev);
//! Resets DMA command buffer workload
void resetCbWorkload(const Device& dev);
//! Finds split size for the current dispatch
void findSplitSize(const Device& dev, //!< GPU device object
uint64_t threads, //!< Total number of execution threads
uint instructions //!< Number of ALU instructions
);
//! Returns TRUE if DMA command buffer is ready for a flush
bool isCbReady(VirtualGPU& gpu, //!< Virtual GPU object
uint64_t threads, //!< Total number of execution threads
uint instructions //!< Number of ALU instructions
);
//! Returns dispatch split size
uint dispatchSplitSize() const { return dispatchSplitSize_; }
private:
uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch
uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer
uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer
uint aluCnt_; //!< All ALUs on the chip
uint dispatchSplitSize_; //!< Dispatch split size in elements
};
//! Container type for the resource slots
typedef std::vector<ResourceSlot> ResourceSlots;
public:
//! Constructs a virtual GPU for \p device (definition not visible in this header)
explicit VirtualGPU(Device& device);
//! Creates the virtual GPU with the given queue properties
//! \param profiling        profiling flag
//! \param rtCUs            defaults to amd::CommandQueue::RealTimeDisabled
//! \param deviceQueueSize  device queue size, defaults to 0
//! \param priority         queue priority, defaults to Priority::Normal
//! \return bool -- presumably true on success; confirm against the definition
bool create(bool profiling, uint rtCUs = amd::CommandQueue::RealTimeDisabled,
uint deviceQueueSize = 0,
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
~VirtualGPU();
//! Command submission entry points, one per amd::*Command type.
//! All are defined out of line except submitCopyMemoryP2P().
void submitReadMemory(amd::ReadMemoryCommand& vcmd);
void submitWriteMemory(amd::WriteMemoryCommand& vcmd);
void submitCopyMemory(amd::CopyMemoryCommand& vcmd);
//! Empty body: this class does nothing for a P2P copy command
void submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& vcmd) {}
void submitMapMemory(amd::MapMemoryCommand& vcmd);
void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd);
void submitKernel(amd::NDRangeKernelCommand& vcmd);
//! Kernel submission with explicit workload sizes and parameters
bool submitKernelInternal(
const amd::NDRangeContainer& sizes, //!< Workload sizes
const amd::Kernel& kernel, //!< Kernel for execution
const_address parameters, //!< Parameters for the kernel
bool nativeMem = true, //!< Native memory objects
amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command
);
//! Same parameter list as submitKernelInternal(); the HSA variant
bool submitKernelInternalHSA(
const amd::NDRangeContainer& sizes, //!< Workload sizes
const amd::Kernel& kernel, //!< Kernel for execution
const_address parameters, //!< Parameters for the kernel
bool nativeMem = true, //!< Native memory objects
amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command
);
void submitNativeFn(amd::NativeFnCommand& vcmd);
void submitFillMemory(amd::FillMemoryCommand& vcmd);
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
void submitMarker(amd::Marker& vcmd);
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
void submitPerfCounter(amd::PerfCounterCommand& vcmd);
void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd);
void submitThreadTrace(amd::ThreadTraceCommand& vcmd);
void submitSignal(amd::SignalCommand& vcmd);
void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd);
virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);
//! Releases a GSL memory object; \p wait defaults to true
void releaseMemory(gslMemObject gslResource, bool wait = true);
//! Releases the kernel identified by its CAL image
void releaseKernel(CALimage calImage);
//! Flushes the queue; \p list defaults to NULL and \p wait to false
void flush(amd::Command* list = NULL, bool wait = false);
//! Always returns true
bool terminate() { return true; }
//! Returns GPU device object associated with this virtual GPU
const Device& dev() const { return gpuDevice_; }
//! Returns CAL descriptor of the virtual device
const CalVirtualDesc* cal() const { return &cal_; }
//! Returns active kernel descriptor for this virtual device
const GslKernelDesc* gslKernelDesc() const { return activeKernelDesc_; }
//! Returns a GPU event, associated with GPU memory
//! gpuEvents_ is a std::unordered_map, so operator[] inserts a
//! default-constructed GpuEvent when \p gslMem has no entry yet.
GpuEvent* getGpuEvent(const gslMemObject gslMem //!< GSL mem object
) {
return &gpuEvents_[gslMem];
}
//! Assigns a GPU event, associated with GPU memory
//! (creates the map entry if it does not exist, otherwise overwrites it)
void assignGpuEvent(const gslMemObject gslMem, //!< GSL mem object
GpuEvent gpuEvent) {
gpuEvents_[gslMem] = gpuEvent;
}
//! Set the kernel as active
bool setActiveKernelDesc(const amd::NDRangeContainer& sizes, //!< kernel execution work sizes
const Kernel* kernel //!< GPU kernel object
);
//! Set the last known GPU event
void setGpuEvent(GpuEvent gpuEvent, //!< GPU event for tracking
bool flush = false //!< TRUE if flush is required
);
//! Flush DMA buffer on the specified engine
void flushDMA(uint engineID //!< Engine ID for DMA flush
);
//! Wait for all engines on this Virtual GPU
//! Returns TRUE if CPU didn't wait for GPU
bool waitAllEngines(CommandBatch* cb = NULL //!< Command batch
);
//! Waits for the latest GPU event with a lock to prevent multiple entries
void waitEventLock(CommandBatch* cb //!< Command batch
);
//! Returns a resource associated with the constant buffer
//! No bounds check is performed: \p idx must be a valid index into constBufs_
const ConstBuffer* cb(uint idx) const { return constBufs_[idx]; }
//! Adds CAL objects into the constant buffer vector
void addConstBuffer(ConstBuffer* cb) { constBufs_.push_back(cb); }
constbufs_t constBufs_; //!< constant buffers
//! Start the command profiling
void profilingBegin(amd::Command& command, //!< Command queue object
bool drmProfiling = false //!< Measure DRM time
);
//! End the command profiling
void profilingEnd(amd::Command& command);
//! Collect the profiling results
bool profilingCollectResults(CommandBatch* cb, //!< Command batch
const amd::Event* waitingEvent //!< Waiting event
);
//! Adds a memory handle into the GSL memory array for Virtual Heap
bool addVmMemory(const Memory* memory //!< GPU memory object
);
//! Adds a stage write buffer into a list
void addXferWrite(Memory& memory);
//! Adds a pinned memory object into a map
void addPinnedMem(amd::Memory* mem);
//! Release pinned memory objects
void releasePinnedMem();
//! Finds if pinned memory is cached
amd::Memory* findPinnedMem(void* addr, size_t size);
//! Returns gsl memory object for VM
const gslMemObject* vmMems() const { return vmMems_; }
//! Returns the monitor object for execution access by VirtualGPU
amd::Monitor& execution() { return execution_; }
//! Returns the virtual gpu unique index
uint index() const { return index_; }
//! Get the PrintfDbg object
//! Dereferences printfDbg_ without a null check
PrintfDbg& printfDbg() const { return *printfDbg_; }
//! Get the PrintfDbgHSA object
//! Dereferences printfDbgHSA_ without a null check
PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; }
//! Enables synchronized transfers
void enableSyncedBlit() const;
//! Checks if profiling is enabled
bool profiling() const { return state_.profiling_; }
//! Returns memory dependency class
MemoryDependency& memoryDependency() { return memoryDependency_; }
//! Returns hsaQueueMem_
const Memory* hsaQueueMem() const { return hsaQueueMem_; }
//! Returns DMA flush management structure
const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; }
//! Releases GSL memory objects allocated on this queue
void releaseMemObjects(bool scratch = true);
//! Returns the HW ring used on this virtual device
uint hwRing() const { return hwRing_; }
//! Returns current timestamp object for profiling
//! Note: the value returned is cal_.lastTS_, not the currTs_ data member
TimeStamp* currTs() const { return cal_.lastTS_; }
//! Returns virtual queue object for device enqueuing
Memory* vQueue() const { return virtualQueue_; }
//! Update virtual queue header
void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);
//! Returns TRUE if virtual queue was successfully allocated
bool createVirtualQueue(uint deviceQueueSize //!< Device queue size
);
EngineType engineID_; //!< Engine ID for this VirtualGPU
ResourceSlots slots_; //!< Resource slots for kernel arguments
State state_; //!< virtual GPU current state
CalVirtualDesc cal_; //!< CAL virtual device descriptor
void flushCuCaches(HwDbgGpuCacheMask cache_mask); //!< flush/invalidate SQ cache
protected:
//! Profiling event hook for the given engine (virtual; definition not visible in this header)
virtual void profileEvent(EngineType engine, bool type) const;
//! Creates buffer object from image
amd::Memory* createBufferFromImage(
amd::Memory& amdImage //!< The parent image object (untiled images only)
);
private:
//! Map from a CAL image to the kernel descriptor of this virtual device
typedef std::unordered_map<CALimage, GslKernelDesc*> GslKernels;
//! Map from a GSL memory object to its GPU event
typedef std::unordered_map<gslMemObject, GpuEvent> GpuEvents;
//! Finds total amount of necessary iterations
inline void findIterations(const amd::NDRangeContainer& sizes, //!< Original workload sizes
const amd::NDRange& local, //!< Local workgroup size
amd::NDRange& groups, //!< Calculated workgroup sizes
amd::NDRange& remainder, //!< Calculated remainder sizes
size_t& extra //!< Amount of extra executions for remainder
);
//! Sets up workloads for the current iteration
inline void setupIteration(
uint iteration, //!< Current iteration
const amd::NDRangeContainer& sizes, //!< Original workload sizes
Kernel& gpuKernel, //!< GPU kernel
amd::NDRange& global, //!< Global size for the current iteration
amd::NDRange& offsets, //!< Offsets for the current iteration
amd::NDRange& local, //!< Local sizes for the current iteration
amd::NDRange& groups, //!< Group sizes for the current iteration
amd::NDRange& groupOffset, //!< Group offsets for the current iteration
amd::NDRange& divider, //!< Group divider
amd::NDRange& remainder, //!< Remain workload
size_t extra //!< Extra groups
);
//! Allocates constant buffers
bool allocConstantBuffers();
//! Allocates CAL kernel descriptor of the virtual device
GslKernelDesc* allocKernelDesc(const Kernel* kernel, //!< Kernel object
CALimage calImage); //!< CAL image
//! Frees CAL kernel descriptor of the virtual device
void freeKernelDesc(GslKernelDesc* desc);
//! Opens GSL with \p nEngines engine descriptors (definition not visible in this header)
bool gslOpen(uint nEngines, gslEngineDescriptor* engines, uint32_t rtCUs);
//! Counterpart of gslOpen() -- presumably tears down what it created; confirm
void gslDestroy();
//! Releases stage write buffers
void releaseXferWrite();
//! Allocate hsaQueueMem_
bool allocHsaQueueMem();
//! Awaits a command batch with a waiting event
bool awaitCompletion(CommandBatch* cb, //!< Command batch to wait for
const amd::Event* waitingEvent = NULL //!< A waiting event
);
//! Validates the scratch buffer memory for a specified kernel
void validateScratchBuffer(const Kernel* kernel //!< Kernel for validation
);
//! Detects memory dependency for HSAIL kernels and flushes caches
bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution
const_address params, //!< Pointer to the param's store
bool nativeMem, //!< Native memory objects
std::vector<const Memory*>* memList //!< Memory list for KMD tracking
);
//! Common function for fill memory used by both svm Fill and non-svm fill
bool fillMemory(cl_command_type type, //!< the command type
amd::Memory* amdMemory, //!< memory object to fill
const void* pattern, //!< pattern to fill the memory
size_t patternSize, //!< pattern size
const amd::Coord3D& origin, //!< memory origin
const amd::Coord3D& size //!< memory size for filling
);
//! Copy helper taking the command type explicitly
//! (presumably shared by several copy commands, like fillMemory() -- confirm)
bool copyMemory(cl_command_type type, //!< the command type
amd::Memory& srcMem, //!< source memory object
amd::Memory& dstMem, //!< destination memory object
bool entire, //!< flag of entire memory copy
const amd::Coord3D& srcOrigin, //!< source memory origin
const amd::Coord3D& dstOrigin, //!< destination memory origin
const amd::Coord3D& size, //!< copy size
const amd::BufferRect& srcRect, //!< region of source for copy
const amd::BufferRect& dstRect //!< region of destination for copy
);
//! Fills \p kernelInfo for a dispatch (definition not visible in this header)
void buildKernelInfo(const HSAILKernel& hsaKernel, //!< hsa kernel
hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch
HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch
amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command
);
//! Assigns the debug trap handler using the given debug settings
void assignDebugTrapHandler(const DebugToolInfo& dbgSetting, //!< debug settings
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
);
GslKernels gslKernels_; //!< GSL kernel descriptors
GslKernelDesc* activeKernelDesc_; //!< active GSL kernel descriptors
GpuEvents gpuEvents_; //!< GPU events
Device& gpuDevice_; //!< physical GPU device
amd::Monitor execution_; //!< Lock to serialise access to all device objects
uint index_; //!< The virtual device unique index
PrintfDbg* printfDbg_; //!< GPU printf implementation
PrintfDbgHSA* printfDbgHSA_; //!< HSAIL printf implementation
TimeStampCache* tsCache_; //!< TimeStamp cache
MemoryDependency memoryDependency_; //!< Memory dependency class
gslMemObject* vmMems_; //!< Array of GSL memories for VM mode
uint numVmMems_; //!< Number of entries in VM mem array
DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management
std::list<Memory*> xferWriteBuffers_; //!< Stage write buffers
std::list<amd::Memory*> pinnedMems_; //!< Pinned memory list
//! List type for the command batches
typedef std::list<CommandBatch*> CommandBatchList;
CommandBatchList cbList_; //!< List of command batches
uint hwRing_; //!< HW ring used on this virtual device
uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
TimeStamp* currTs_; //!< current timestamp for command (not what currTs() returns)
AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header
Memory* virtualQueue_; //!< Virtual device queue
Memory* schedParams_; //!< The scheduler parameters
uint schedParamIdx_; //!< Index in the scheduler parameters buffer
uint deviceQueueSize_; //!< Device queue size
uint maskGroups_; //!< The number of mask groups processed in the scheduler by one thread
Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object
bool profileEnabled_; //!< Profiling is enabled
};
/*@}*/} // namespace gpu
#endif /*GPUVIRTUAL_HPP_*/