Files
rocm-systems/rocclr/device/pal/palvirtual.hpp
T
German Andryeyev 481cecec78 SWDEV-79445 - HIP improvement
- Make sure only one GPU barrier is issued per dispatch
when memory tracking is disabled

Change-Id: I974569ab42a8835304a2930eef87b561a3750327
2020-04-03 12:21:03 -04:00

727 γραμμές
28 KiB
C++

/* Copyright (c) 2015-present Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#pragma once
#include <queue>
#include "device/pal/paldefs.hpp"
#include "device/pal/palconstbuf.hpp"
#include "device/pal/palprintf.hpp"
#include "device/pal/paltimestamp.hpp"
#include "device/pal/palsched.hpp"
#include "device/pal/paldebugger.hpp"
#include "device/pal/palgpuopen.hpp"
#include "platform/commandqueue.hpp"
#include "device/blit.hpp"
#include "palUtil.h"
#include "palCmdBuffer.h"
#include "palCmdAllocator.h"
#include "palQueue.h"
#include "palFence.h"
#include "palLinearAllocator.h"
/*! \addtogroup PAL PAL Resource Implementation
* @{
*/
//! PAL Device Implementation
namespace pal {
class Device;
class Kernel;
class Memory;
class CalCounterReference;
class VirtualGPU;
class Program;
class BlitManager;
class ThreadTrace;
class HSAILKernel;
//! Virtual GPU
class VirtualGPU : public device::VirtualDevice {
public:
class Queue : public amd::HeapObject {
public:
static const uint MaxCommands = 256;
static const uint StartCmdBufIdx = 1;
static const uint FirstMemoryReference = 0x80000000;
static const uint64_t WaitTimeoutInNsec = 6000000000;
static const uint64_t PollIntervalInNsec = 200000;
Queue(const Queue&) = delete;
Queue& operator=(const Queue&) = delete;
static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
Pal::QueueType queueType, //!< PAL queue type
uint engineIdx, //!< Select particular engine index
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
uint rtCU, //!< The number of reserved CUs
amd::CommandQueue::Priority priority, //!< Queue priority
uint64_t residency_limit, //!< Enables residency limit
uint max_command_buffers //!< Number of allocated command buffers
);
Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
uint max_command_buffers)
: lock_(nullptr),
iQueue_(nullptr),
iCmdBuffs_(max_command_buffers, nullptr),
iCmdFences_(max_command_buffers, nullptr),
last_kernel_(nullptr),
gpu_(gpu),
iDev_(iDev),
cmdBufIdSlot_(StartCmdBufIdx),
cmdBufIdCurrent_(StartCmdBufIdx),
cmbBufIdRetired_(0),
cmdCnt_(0),
vlAlloc_(64 * Ki),
residency_size_(0),
residency_limit_(residency_limit),
max_command_buffers_(max_command_buffers) {
vlAlloc_.Init();
}
~Queue();
void addCmdMemRef(GpuMemoryReference* mem);
void removeCmdMemRef(GpuMemoryReference* mem);
void addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, bool pfpaDoppCmd);
void addMemRef(Pal::IGpuMemory* iMem) const {
Pal::GpuMemoryRef memRef = {};
memRef.pGpuMemory = iMem;
iDev_->AddGpuMemoryReferences(1, &memRef, nullptr, Pal::GpuMemoryRefCantTrim);
}
void removeMemRef(Pal::IGpuMemory* iMem) const {
iDev_->RemoveGpuMemoryReferences(1, &iMem, nullptr);
}
// Notice KMD to update applicaiton profile
Pal::Result UpdateAppPowerProfile();
// ibReuse forces event wait without polling, to make sure event occured
template <bool ibReuse> bool waifForFence(uint cbId) const {
Pal::Result result = Pal::Result::Success;
uint64_t start;
uint64_t end;
if (!ibReuse) {
start = amd::Os::timeNanos();
}
while ((Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus())) || ibReuse) {
if (result == Pal::Result::ErrorFenceNeverSubmitted) {
result = Pal::Result::Success;
break;
}
if (!ibReuse) {
end = amd::Os::timeNanos();
}
if (!ibReuse && ((end - start) < PollIntervalInNsec)) {
amd::Os::yield();
continue;
}
result = iDev_->WaitForFences(1, &iCmdFences_[cbId], true, WaitTimeoutInNsec);
if (Pal::Result::Success == result) {
break;
} else if ((Pal::Result::NotReady == result) || (Pal::Result::Timeout == result)) {
LogWarning("PAL fence isn't ready!");
if (GPU_ANALYZE_HANG) {
DumpMemoryReferences();
}
} else {
LogError("PAL wait for a fence failed!");
break;
}
}
return (result == Pal::Result::Success) ? true : false;
}
//! Flushes the current command buffer to HW
//! Returns ID associated with the submission
template <bool avoidBarrierSubmit = false> uint submit(bool forceFlush);
bool flush();
bool waitForEvent(uint id);
bool isDone(uint id);
Pal::ICmdBuffer* iCmd() const { return iCmdBuffs_[cmdBufIdSlot_]; }
uint cmdBufId() const { return cmdBufIdCurrent_; }
static uint32_t AllocedQueues(const VirtualGPU& gpu, Pal::EngineType type);
amd::Monitor* lock_; //!< Lock PAL queue for access
Pal::IQueue* iQueue_; //!< PAL queue object
std::vector<Pal::ICmdBuffer*> iCmdBuffs_; //!< PAL command buffers
std::vector<Pal::IFence*> iCmdFences_; //!< PAL fences, associated with CMD
const amd::Kernel* last_kernel_; //!< Last submitted kernel
private:
void DumpMemoryReferences() const;
const VirtualGPU& gpu_; //!< OCL virtual GPU object
Pal::IDevice* iDev_; //!< PAL device
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
uint cmdBufIdCurrent_; //!< Current global command buffer ID
uint cmbBufIdRetired_; //!< The last retired command buffer ID
uint cmdCnt_; //!< Counter of commands
std::unordered_map<GpuMemoryReference*, uint> memReferences_;
Util::VirtualLinearAllocator vlAlloc_;
std::vector<Pal::GpuMemoryRef> palMemRefs_;
std::vector<Pal::IGpuMemory*> palMems_;
std::vector<Pal::DoppRef> palDoppRefs_;
std::set<Pal::IGpuMemory*> sdiReferences_;
std::vector<const Pal::IGpuMemory*> palSdiRefs_;
uint64_t residency_size_; //!< Resource residency size
uint64_t residency_limit_; //!< Enables residency limit
uint max_command_buffers_;
};
struct CommandBatch : public amd::HeapObject {
amd::Command* head_; //!< Command batch head
GpuEvent events_[AllEngines]; //!< Last known GPU events
TimeStamp* lastTS_; //!< TS associated with command batch
//! Constructor
CommandBatch(amd::Command* head, //!< Command batch head
const GpuEvent* events, //!< HW events on all engines
TimeStamp* lastTS //!< Last TS in command batch
) {
init(head, events, lastTS);
}
void init(amd::Command* head, //!< Command batch head
const GpuEvent* events, //!< HW events on all engines
TimeStamp* lastTS //!< Last TS in command batch
) {
head_ = head;
lastTS_ = lastTS;
memcpy(&events_, events, AllEngines * sizeof(GpuEvent));
}
};
//! The virtual GPU states
union State {
struct {
uint profiling_ : 1; //!< Profiling is enabled
uint forceWait_ : 1; //!< Forces wait in flush()
uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter
uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled
uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime
uint imageBufferWrtBack_ : 1; //!< Enable image buffer write back
};
uint value_;
State() : value_(0) {}
};
typedef std::vector<ConstantBuffer*> constbufs_t;
class MemoryDependency : public amd::EmbeddedObject {
public:
//! Default constructor
MemoryDependency()
: memObjectsInQueue_(nullptr), numMemObjectsInQueue_(0), maxMemObjectsInQueue_(0) {}
~MemoryDependency() { delete[] memObjectsInQueue_; }
//! Creates memory dependecy structure
bool create(size_t numMemObj);
//! Notify the tracker about new kernel
void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; }
//! Validates memory object on dependency
void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly);
//! Invalidates GPU caches if memory dependency tracking is disabled
void sync(VirtualGPU& gpu) const {
if (maxMemObjectsInQueue_ == 0) {
gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
}
}
//! Clear memory dependency
void clear(bool all = true);
private:
struct MemoryState {
uint64_t start_; //! Busy memory start address
uint64_t end_; //! Busy memory end address
bool readOnly_; //! Current GPU state in the queue
};
MemoryState* memObjectsInQueue_; //!< Memory object state in the queue
size_t endMemObjectsInQueue_; //!< End of mem objects in the queue
size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
};
class DmaFlushMgmt : public amd::EmbeddedObject {
public:
DmaFlushMgmt(const Device& dev);
// Resets DMA command buffer workload
void resetCbWorkload(const Device& dev);
// Finds split size for the current dispatch
void findSplitSize(const Device& dev, //!< GPU device object
uint64_t threads, //!< Total number of execution threads
uint instructions //!< Number of ALU instructions
);
// Returns TRUE if DMA command buffer is ready for a flush
bool isCbReady(VirtualGPU& gpu, //!< Virtual GPU object
uint64_t threads, //!< Total number of execution threads
uint instructions //!< Number of ALU instructions
);
// Returns dispatch split size
uint dispatchSplitSize() const { return dispatchSplitSize_; }
private:
uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch
uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer
uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer
uint aluCnt_; //!< All ALUs on the chip
uint dispatchSplitSize_; //!< Dispath split size in elements
};
public:
VirtualGPU(Device& device);
//! Creates virtual gpu object
bool create(bool profiling, //!< Enables profilng on the queue
uint deviceQueueSize = 0, //!< Device queue size, 0 if host queue
uint rtCUs = amd::CommandQueue::RealTimeDisabled,
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
~VirtualGPU();
void submitReadMemory(amd::ReadMemoryCommand& vcmd);
void submitWriteMemory(amd::WriteMemoryCommand& vcmd);
void submitCopyMemory(amd::CopyMemoryCommand& vcmd);
void submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& vcmd);
void submitMapMemory(amd::MapMemoryCommand& vcmd);
void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd);
void submitKernel(amd::NDRangeKernelCommand& vcmd);
bool submitKernelInternal(
const amd::NDRangeContainer& sizes, //!< Workload sizes
const amd::Kernel& kernel, //!< Kernel for execution
const_address parameters, //!< Parameters for the kernel
bool nativeMem = true, //!< Native memory objects
amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command
uint32_t sharedMemBytes = 0, //!< Shared memory size
bool cooperativeGroups = false //!< TRUE if cooperative groups mode is required
);
void submitNativeFn(amd::NativeFnCommand& vcmd);
void submitFillMemory(amd::FillMemoryCommand& vcmd);
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
void submitMarker(amd::Marker& vcmd);
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
void submitPerfCounter(amd::PerfCounterCommand& vcmd);
void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd);
void submitThreadTrace(amd::ThreadTraceCommand& vcmd);
void submitSignal(amd::SignalCommand& vcmd);
void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd);
virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);
void releaseMemory(GpuMemoryReference* mem);
void flush(amd::Command* list = nullptr, bool wait = false);
bool terminate() { return true; }
//! Returns GPU device object associated with this kernel
const Device& dev() const { return gpuDevice_; }
//! Set the last known GPU event
void setGpuEvent(GpuEvent gpuEvent, //!< GPU event for tracking
bool flush = false //!< TRUE if flush is required
);
//! Flush DMA buffer on the specified engine
void flushDMA(uint engineID //!< Engine ID for DMA flush
);
//! Wait for all engines on this Virtual GPU
//! Returns TRUE if CPU didn't wait for GPU
bool waitAllEngines(CommandBatch* cb = nullptr //!< Command batch
);
//! Waits for the latest GPU event with a lock to prevent multiple entries
void waitEventLock(CommandBatch* cb //!< Command batch
);
//! Returns a resource associated with the constant buffer
const ConstantBuffer* cb(uint idx) const { return constBufs_[idx]; }
//! Adds CAL objects into the constant buffer vector
void addConstBuffer(ConstantBuffer* cb) { constBufs_.push_back(cb); }
//! Start the command profiling
void profilingBegin(amd::Command& command, //!< Command queue object
bool drmProfiling = false //!< Measure DRM time
);
//! End the command profiling
void profilingEnd(amd::Command& command);
//! Collect the profiling results
bool profilingCollectResults(CommandBatch* cb, //!< Command batch
const amd::Event* waitingEvent //!< Waiting event
);
//! Adds a memory handle into the GSL memory array for Virtual Heap
inline void addVmMemory(const Memory* memory //!< GPU memory object
);
//! Adds the last submitted kernel to the queue for tracking a possible hang
inline void AddKernel(const amd::Kernel& kernel //!< AMD kernel object
) const;
//! Checks if runtime dispatches the same kernel as previously
inline bool IsSameKernel(const amd::Kernel& kernel //!< AMD kernel object
) const;
//! Adds a dopp desktop texture reference
void addDoppRef(const Memory* memory, //!< GPU memory object
bool lastDoopCmd, //!< is the last submission for the pre-present primary
bool pfpaDoppCmd //!< is a submission for the pre-present primary
);
//! Return xfer buffer for staging operations
XferBuffer& xferWrite() { return writeBuffer_; }
//! Return managed buffer for staging operations
ManagedBuffer& managedBuffer() { return managedBuffer_; }
//! Adds a pinned memory object into a map
void addPinnedMem(amd::Memory* mem);
//! Release pinned memory objects
void releasePinnedMem();
//! Finds if pinned memory is cached
amd::Memory* findPinnedMem(void* addr, size_t size);
//! Get the PrintfDbgHSA object
PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; }
//! Enables synchronized transfers
void enableSyncedBlit() const;
//! Checks if profiling is enabled
bool profiling() const { return state_.profiling_; }
//! Returns memory dependency class
MemoryDependency& memoryDependency() { return memoryDependency_; }
//! Returns hsaQueueMem_
const Memory* hsaQueueMem() const { return hsaQueueMem_; }
//! Returns DMA flush management structure
const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; }
//! Returns the HW ring used on this virtual device
uint hwRing() const { return hwRing_; }
//! Returns virtual queue object for device enqueuing
Memory* vQueue() const { return virtualQueue_; }
//! Update virtual queue header
void writeVQueueHeader(VirtualGPU& hostQ, const Memory* kernelTable);
//! Returns TRUE if virtual queue was successfully allocatted
bool createVirtualQueue(uint deviceQueueSize //!< Device queue size
);
EngineType engineID_; //!< Engine ID for this VirtualGPU
//! Returns PAL command buffer interface
Pal::ICmdBuffer* iCmd() const {
Queue* queue = queues_[engineID_];
return queue->iCmd();
}
//! Returns true if the provided command buffer is the active one
bool isActiveCmd(Pal::ICmdBuffer* iCmd) const {
return (queues_[engineID_] != nullptr) && (iCmd == queues_[engineID_]->iCmd()) ? true : false;
}
//! Returns queue, associated with VirtualGPU
Queue& queue(EngineType id) const { return *queues_[id]; }
void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown,
bool flushL2 = false) const {
Pal::BarrierInfo barrier = {};
barrier.pipePointWaitCount = 1;
Pal::HwPipePoint point = Pal::HwPipePostCs;
barrier.pPipePoints = &point;
barrier.transitionCount = 1;
uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader;
Pal::BarrierTransition trans = {cacheMask,
cacheMask,
{nullptr,
{{Pal::ImageAspect::Color, 0, 0}, 0, 0},
Pal::LayoutShaderRead,
Pal::LayoutShaderRead}};
barrier.pTransitions = &trans;
barrier.waitPoint = Pal::HwPipePreCs;
barrier.reason = static_cast<uint32_t>(reason);
iCmd()->CmdBarrier(barrier);
queues_[engineID_]->submit<true>(false);
}
void eventBegin(EngineType engId) const {
const static bool Begin = true;
profileEvent(engId, Begin);
}
void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const {
constexpr bool End = false;
if (forceExec) {
constexpr bool ForceFlush = true;
event.id_ = queues_[engId]->submit(ForceFlush);
profileEvent(engId, End);
} else {
profileEvent(engId, End);
event.id_ = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION);
}
event.engineId_ = engId;
}
void waitForEvent(GpuEvent* event) const {
if (event->isValid()) {
assert(event->engineId_ < AllEngines);
queues_[event->engineId_]->waitForEvent(event->id_);
event->invalidate();
}
}
bool isDone(GpuEvent* event) {
if (event->isValid()) {
assert(event->engineId_ < AllEngines);
if (queues_[event->engineId_]->isDone(event->id_)) {
event->invalidate();
return true;
}
return false;
}
return true;
}
//! Returns TRUE if SDMA requires overlap synchronizaiton
bool validateSdmaOverlap(const Resource& src, //!< Source resource for SDMA transfer
const Resource& dst //!< Destination resource for SDMA transfer
);
//! Checks if RGP capture is enabled
bool rgpCaptureEna() const { return state_.rgpCaptureEnabled_; }
//! Waits for idle on compute engine
void WaitForIdleCompute() {
if (events_[MainEngine].isValid()) {
queues_[events_[MainEngine].engineId_]->waitForEvent(events_[MainEngine].id_);
events_[MainEngine].invalidate();
}
}
protected:
void profileEvent(EngineType engine, bool type) const;
//! Creates buffer object from image
amd::Memory* createBufferFromImage(
amd::Memory& amdImage //! The parent image object(untiled images only)
);
private:
struct MemoryRange {
uint64_t start_; //!< Memory range start address
uint64_t end_; //!< Memory range end address
MemoryRange() : start_(0), end_(0) {}
};
//! Allocates constant buffers
bool allocConstantBuffers();
//! Allocate hsaQueueMem_
bool allocHsaQueueMem();
//! Awaits a command batch with a waiting event
bool awaitCompletion(CommandBatch* cb, //!< Command batch for to wait
const amd::Event* waitingEvent = nullptr //!< A waiting event
);
//! Detects memory dependency for HSAIL kernels and flushes caches
bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution
const_address params, //!< Pointer to the param's store
bool nativeMem, //!< Native memory objects
size_t& ldsAddess //!< Returns LDS size, used in the kernel
);
//! Common function for fill memory used by both svm Fill and non-svm fill
bool fillMemory(cl_command_type type, //!< the command type
amd::Memory* amdMemory, //!< memory object to fill
const void* pattern, //!< pattern to fill the memory
size_t patternSize, //!< pattern size
const amd::Coord3D& origin, //!< memory origin
const amd::Coord3D& size //!< memory size for filling
);
bool copyMemory(cl_command_type type, //!< the command type
amd::Memory& srcMem, //!< source memory object
amd::Memory& dstMem, //!< destination memory object
bool entire, //!< flag of entire memory copy
const amd::Coord3D& srcOrigin, //!< source memory origin
const amd::Coord3D& dstOrigin, //!< destination memory object
const amd::Coord3D& size, //!< copy size
const amd::BufferRect& srcRect, //!< region of source for copy
const amd::BufferRect& dstRect //!< region of destination for copy
);
void buildKernelInfo(const HSAILKernel& hsaKernel, //!< hsa kernel
hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch
HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch
amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command
);
void assignDebugTrapHandler(const DebugToolInfo& dbgSetting, //!< debug settings
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
);
void PrintChildren(const HSAILKernel& hsaKernel, //!< The parent HSAIL kernel
VirtualGPU* gpuDefQueue //!< Device queue for children execution
);
bool PreDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object
const HSAILKernel& hsaKernel, //!< Parent HSAIL object
VirtualGPU** gpuDefQueue, //!< [Return] GPU default queue
uint64_t* vmDefQueue //!< [Return] VM handle to the virtual queue
);
void PostDeviceEnqueue(
const amd::Kernel& kernel, //!< Parent amd kernel object
const HSAILKernel& hsaKernel, //!< Parent HSAIL object
VirtualGPU* gpuDefQueue, //!< GPU default queue
uint64_t vmDefQueue, //!< VM handle to the virtual queue
uint64_t vmParentWrap, //!< VM handle to the wrapped AQL packet location
GpuEvent* gpuEvent //!< [Return] GPU event associated with the device enqueue
);
Device& gpuDevice_; //!< physical GPU device
PrintfDbgHSA* printfDbgHSA_; //!< HSAIL printf implemenation
TimeStampCache* tsCache_; //!< TimeStamp cache
MemoryDependency memoryDependency_; //!< Memory dependency class
DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
ManagedBuffer managedBuffer_; //!< Managed write buffer
constbufs_t constBufs_; //!< constant buffers
XferBuffer writeBuffer_; //!< Transfer/staging buffer for uploads
typedef std::queue<CommandBatch*> CommandBatchQueue;
CommandBatchQueue cbQueue_; //!< Queue of command batches
CommandBatchQueue freeCbQueue_; //!< Queue of free command batches
uint hwRing_; //!< HW ring used on this virtual device
State state_; //!< virtual GPU current state
GpuEvent events_[AllEngines]; //!< Last known GPU events
uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU
TimeStamp* profileTs_; //!< current profiling timestamp for command
AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header
Memory* virtualQueue_; //!< Virtual device queue
Memory* schedParams_; //!< The scheduler parameters
uint deviceQueueSize_; //!< Device queue size
uint maskGroups_; //!< The number of mask groups processed in the scheduler by one thread
Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object
Pal::ICmdAllocator* cmdAllocator_; //!< Command buffer allocator
Queue* queues_[AllEngines]; //!< HW queues for all engines
MemoryRange sdmaRange_; //!< SDMA memory range for write access
std::vector<Image*> wrtBackImageBuffer_; //!< Array of images for write back
};
inline void VirtualGPU::addVmMemory(const Memory* memory) {
queues_[MainEngine]->addCmdMemRef(memory->memRef());
memory->setBusy(*this, queues_[MainEngine]->cmdBufId());
}
inline void VirtualGPU::AddKernel(const amd::Kernel& kernel) const {
queues_[MainEngine]->last_kernel_ = &kernel;
}
inline bool VirtualGPU::IsSameKernel(const amd::Kernel& kernel) const {
return (queues_[MainEngine]->last_kernel_ == &kernel) ? true : false;
}
template <bool avoidBarrierSubmit> uint VirtualGPU::Queue::submit(bool forceFlush) {
cmdCnt_++;
uint id = cmdBufIdCurrent_;
bool flushCmd = ((cmdCnt_ > MaxCommands) || forceFlush) && !avoidBarrierSubmit;
if (flushCmd) {
if (!flush()) {
return GpuEvent::InvalidID;
}
}
return id;
}
template <typename T>
inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
const T* src, //!< The source pointer
uint size, //!< The size in bytes to copy
size_t offset //!< The alignment to follow while writing to the buffer
) {
memcpy(dst + offset, src, size);
}
template <>
inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
const uint32_t* src, //!< The source pointer
uint size, //!< The size in bytes to copy
size_t offset //!< The alignment to follow while writing to the buffer
) {
*(reinterpret_cast<uint32_t*>(dst + offset)) = *src;
}
template <>
inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
const uint64_t* src, //!< The source pointer
uint size, //!< The size in bytes to copy
size_t offset //!< The alignment to follow while writing to the buffer
) {
*(reinterpret_cast<uint64_t*>(dst + offset)) = *src;
}
/*@}*/ // namespace pal
} // namespace pal