481cecec78
- Make sure only one GPU barrier is issued per dispatch when memory tracking is disabled Change-Id: I974569ab42a8835304a2930eef87b561a3750327
727 γραμμές
28 KiB
C++
727 γραμμές
28 KiB
C++
/* Copyright (c) 2015-present Advanced Micro Devices, Inc.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE. */
|
|
|
|
#pragma once
|
|
|
|
#include <queue>
|
|
#include "device/pal/paldefs.hpp"
|
|
#include "device/pal/palconstbuf.hpp"
|
|
#include "device/pal/palprintf.hpp"
|
|
#include "device/pal/paltimestamp.hpp"
|
|
#include "device/pal/palsched.hpp"
|
|
#include "device/pal/paldebugger.hpp"
|
|
#include "device/pal/palgpuopen.hpp"
|
|
#include "platform/commandqueue.hpp"
|
|
#include "device/blit.hpp"
|
|
#include "palUtil.h"
|
|
#include "palCmdBuffer.h"
|
|
#include "palCmdAllocator.h"
|
|
#include "palQueue.h"
|
|
#include "palFence.h"
|
|
#include "palLinearAllocator.h"
|
|
|
|
/*! \addtogroup PAL PAL Resource Implementation
|
|
* @{
|
|
*/
|
|
|
|
//! PAL Device Implementation
|
|
namespace pal {
|
|
|
|
class Device;
|
|
class Kernel;
|
|
class Memory;
|
|
class CalCounterReference;
|
|
class VirtualGPU;
|
|
class Program;
|
|
class BlitManager;
|
|
class ThreadTrace;
|
|
class HSAILKernel;
|
|
|
|
//! Virtual GPU
|
|
class VirtualGPU : public device::VirtualDevice {
|
|
public:
|
|
class Queue : public amd::HeapObject {
|
|
public:
|
|
static const uint MaxCommands = 256;
|
|
static const uint StartCmdBufIdx = 1;
|
|
static const uint FirstMemoryReference = 0x80000000;
|
|
static const uint64_t WaitTimeoutInNsec = 6000000000;
|
|
static const uint64_t PollIntervalInNsec = 200000;
|
|
|
|
Queue(const Queue&) = delete;
|
|
Queue& operator=(const Queue&) = delete;
|
|
|
|
static Queue* Create(const VirtualGPU& gpu, //!< OCL virtual GPU object
|
|
Pal::QueueType queueType, //!< PAL queue type
|
|
uint engineIdx, //!< Select particular engine index
|
|
Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator
|
|
uint rtCU, //!< The number of reserved CUs
|
|
amd::CommandQueue::Priority priority, //!< Queue priority
|
|
uint64_t residency_limit, //!< Enables residency limit
|
|
uint max_command_buffers //!< Number of allocated command buffers
|
|
);
|
|
|
|
Queue(const VirtualGPU& gpu, Pal::IDevice* iDev, uint64_t residency_limit,
|
|
uint max_command_buffers)
|
|
: lock_(nullptr),
|
|
iQueue_(nullptr),
|
|
iCmdBuffs_(max_command_buffers, nullptr),
|
|
iCmdFences_(max_command_buffers, nullptr),
|
|
last_kernel_(nullptr),
|
|
gpu_(gpu),
|
|
iDev_(iDev),
|
|
cmdBufIdSlot_(StartCmdBufIdx),
|
|
cmdBufIdCurrent_(StartCmdBufIdx),
|
|
cmbBufIdRetired_(0),
|
|
cmdCnt_(0),
|
|
vlAlloc_(64 * Ki),
|
|
residency_size_(0),
|
|
residency_limit_(residency_limit),
|
|
max_command_buffers_(max_command_buffers) {
|
|
vlAlloc_.Init();
|
|
}
|
|
|
|
~Queue();
|
|
|
|
void addCmdMemRef(GpuMemoryReference* mem);
|
|
void removeCmdMemRef(GpuMemoryReference* mem);
|
|
|
|
void addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, bool pfpaDoppCmd);
|
|
|
|
void addMemRef(Pal::IGpuMemory* iMem) const {
|
|
Pal::GpuMemoryRef memRef = {};
|
|
memRef.pGpuMemory = iMem;
|
|
iDev_->AddGpuMemoryReferences(1, &memRef, nullptr, Pal::GpuMemoryRefCantTrim);
|
|
}
|
|
void removeMemRef(Pal::IGpuMemory* iMem) const {
|
|
iDev_->RemoveGpuMemoryReferences(1, &iMem, nullptr);
|
|
}
|
|
|
|
// Notice KMD to update applicaiton profile
|
|
Pal::Result UpdateAppPowerProfile();
|
|
|
|
// ibReuse forces event wait without polling, to make sure event occured
|
|
template <bool ibReuse> bool waifForFence(uint cbId) const {
|
|
Pal::Result result = Pal::Result::Success;
|
|
uint64_t start;
|
|
uint64_t end;
|
|
if (!ibReuse) {
|
|
start = amd::Os::timeNanos();
|
|
}
|
|
while ((Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus())) || ibReuse) {
|
|
if (result == Pal::Result::ErrorFenceNeverSubmitted) {
|
|
result = Pal::Result::Success;
|
|
break;
|
|
}
|
|
if (!ibReuse) {
|
|
end = amd::Os::timeNanos();
|
|
}
|
|
if (!ibReuse && ((end - start) < PollIntervalInNsec)) {
|
|
amd::Os::yield();
|
|
continue;
|
|
}
|
|
result = iDev_->WaitForFences(1, &iCmdFences_[cbId], true, WaitTimeoutInNsec);
|
|
if (Pal::Result::Success == result) {
|
|
break;
|
|
} else if ((Pal::Result::NotReady == result) || (Pal::Result::Timeout == result)) {
|
|
LogWarning("PAL fence isn't ready!");
|
|
if (GPU_ANALYZE_HANG) {
|
|
DumpMemoryReferences();
|
|
}
|
|
} else {
|
|
LogError("PAL wait for a fence failed!");
|
|
break;
|
|
}
|
|
}
|
|
return (result == Pal::Result::Success) ? true : false;
|
|
}
|
|
|
|
//! Flushes the current command buffer to HW
|
|
//! Returns ID associated with the submission
|
|
template <bool avoidBarrierSubmit = false> uint submit(bool forceFlush);
|
|
|
|
bool flush();
|
|
|
|
bool waitForEvent(uint id);
|
|
|
|
bool isDone(uint id);
|
|
|
|
Pal::ICmdBuffer* iCmd() const { return iCmdBuffs_[cmdBufIdSlot_]; }
|
|
|
|
uint cmdBufId() const { return cmdBufIdCurrent_; }
|
|
|
|
static uint32_t AllocedQueues(const VirtualGPU& gpu, Pal::EngineType type);
|
|
|
|
amd::Monitor* lock_; //!< Lock PAL queue for access
|
|
Pal::IQueue* iQueue_; //!< PAL queue object
|
|
std::vector<Pal::ICmdBuffer*> iCmdBuffs_; //!< PAL command buffers
|
|
std::vector<Pal::IFence*> iCmdFences_; //!< PAL fences, associated with CMD
|
|
const amd::Kernel* last_kernel_; //!< Last submitted kernel
|
|
|
|
private:
|
|
void DumpMemoryReferences() const;
|
|
const VirtualGPU& gpu_; //!< OCL virtual GPU object
|
|
Pal::IDevice* iDev_; //!< PAL device
|
|
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
|
|
uint cmdBufIdCurrent_; //!< Current global command buffer ID
|
|
uint cmbBufIdRetired_; //!< The last retired command buffer ID
|
|
uint cmdCnt_; //!< Counter of commands
|
|
std::unordered_map<GpuMemoryReference*, uint> memReferences_;
|
|
Util::VirtualLinearAllocator vlAlloc_;
|
|
std::vector<Pal::GpuMemoryRef> palMemRefs_;
|
|
std::vector<Pal::IGpuMemory*> palMems_;
|
|
std::vector<Pal::DoppRef> palDoppRefs_;
|
|
std::set<Pal::IGpuMemory*> sdiReferences_;
|
|
std::vector<const Pal::IGpuMemory*> palSdiRefs_;
|
|
uint64_t residency_size_; //!< Resource residency size
|
|
uint64_t residency_limit_; //!< Enables residency limit
|
|
uint max_command_buffers_;
|
|
};
|
|
|
|
struct CommandBatch : public amd::HeapObject {
|
|
amd::Command* head_; //!< Command batch head
|
|
GpuEvent events_[AllEngines]; //!< Last known GPU events
|
|
TimeStamp* lastTS_; //!< TS associated with command batch
|
|
|
|
//! Constructor
|
|
CommandBatch(amd::Command* head, //!< Command batch head
|
|
const GpuEvent* events, //!< HW events on all engines
|
|
TimeStamp* lastTS //!< Last TS in command batch
|
|
) {
|
|
init(head, events, lastTS);
|
|
}
|
|
|
|
void init(amd::Command* head, //!< Command batch head
|
|
const GpuEvent* events, //!< HW events on all engines
|
|
TimeStamp* lastTS //!< Last TS in command batch
|
|
) {
|
|
head_ = head;
|
|
lastTS_ = lastTS;
|
|
memcpy(&events_, events, AllEngines * sizeof(GpuEvent));
|
|
}
|
|
};
|
|
|
|
//! The virtual GPU states
|
|
union State {
|
|
struct {
|
|
uint profiling_ : 1; //!< Profiling is enabled
|
|
uint forceWait_ : 1; //!< Forces wait in flush()
|
|
uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter
|
|
uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled
|
|
uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime
|
|
uint imageBufferWrtBack_ : 1; //!< Enable image buffer write back
|
|
};
|
|
uint value_;
|
|
State() : value_(0) {}
|
|
};
|
|
|
|
typedef std::vector<ConstantBuffer*> constbufs_t;
|
|
|
|
class MemoryDependency : public amd::EmbeddedObject {
|
|
public:
|
|
//! Default constructor
|
|
MemoryDependency()
|
|
: memObjectsInQueue_(nullptr), numMemObjectsInQueue_(0), maxMemObjectsInQueue_(0) {}
|
|
|
|
~MemoryDependency() { delete[] memObjectsInQueue_; }
|
|
|
|
//! Creates memory dependecy structure
|
|
bool create(size_t numMemObj);
|
|
|
|
//! Notify the tracker about new kernel
|
|
void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; }
|
|
|
|
//! Validates memory object on dependency
|
|
void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly);
|
|
|
|
//! Invalidates GPU caches if memory dependency tracking is disabled
|
|
void sync(VirtualGPU& gpu) const {
|
|
if (maxMemObjectsInQueue_ == 0) {
|
|
gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
|
|
}
|
|
}
|
|
|
|
//! Clear memory dependency
|
|
void clear(bool all = true);
|
|
|
|
private:
|
|
struct MemoryState {
|
|
uint64_t start_; //! Busy memory start address
|
|
uint64_t end_; //! Busy memory end address
|
|
bool readOnly_; //! Current GPU state in the queue
|
|
};
|
|
|
|
MemoryState* memObjectsInQueue_; //!< Memory object state in the queue
|
|
size_t endMemObjectsInQueue_; //!< End of mem objects in the queue
|
|
size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue
|
|
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
|
|
};
|
|
|
|
|
|
class DmaFlushMgmt : public amd::EmbeddedObject {
|
|
public:
|
|
DmaFlushMgmt(const Device& dev);
|
|
|
|
// Resets DMA command buffer workload
|
|
void resetCbWorkload(const Device& dev);
|
|
|
|
// Finds split size for the current dispatch
|
|
void findSplitSize(const Device& dev, //!< GPU device object
|
|
uint64_t threads, //!< Total number of execution threads
|
|
uint instructions //!< Number of ALU instructions
|
|
);
|
|
|
|
// Returns TRUE if DMA command buffer is ready for a flush
|
|
bool isCbReady(VirtualGPU& gpu, //!< Virtual GPU object
|
|
uint64_t threads, //!< Total number of execution threads
|
|
uint instructions //!< Number of ALU instructions
|
|
);
|
|
|
|
// Returns dispatch split size
|
|
uint dispatchSplitSize() const { return dispatchSplitSize_; }
|
|
|
|
private:
|
|
uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch
|
|
uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer
|
|
uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer
|
|
uint aluCnt_; //!< All ALUs on the chip
|
|
uint dispatchSplitSize_; //!< Dispath split size in elements
|
|
};
|
|
|
|
public:
|
|
VirtualGPU(Device& device);
|
|
//! Creates virtual gpu object
|
|
bool create(bool profiling, //!< Enables profilng on the queue
|
|
uint deviceQueueSize = 0, //!< Device queue size, 0 if host queue
|
|
uint rtCUs = amd::CommandQueue::RealTimeDisabled,
|
|
amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
|
|
~VirtualGPU();
|
|
|
|
void submitReadMemory(amd::ReadMemoryCommand& vcmd);
|
|
void submitWriteMemory(amd::WriteMemoryCommand& vcmd);
|
|
void submitCopyMemory(amd::CopyMemoryCommand& vcmd);
|
|
void submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& vcmd);
|
|
void submitMapMemory(amd::MapMemoryCommand& vcmd);
|
|
void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd);
|
|
void submitKernel(amd::NDRangeKernelCommand& vcmd);
|
|
bool submitKernelInternal(
|
|
const amd::NDRangeContainer& sizes, //!< Workload sizes
|
|
const amd::Kernel& kernel, //!< Kernel for execution
|
|
const_address parameters, //!< Parameters for the kernel
|
|
bool nativeMem = true, //!< Native memory objects
|
|
amd::Event* enqueueEvent = nullptr, //!< Event provided in the enqueue kernel command
|
|
uint32_t sharedMemBytes = 0, //!< Shared memory size
|
|
bool cooperativeGroups = false //!< TRUE if cooperative groups mode is required
|
|
);
|
|
void submitNativeFn(amd::NativeFnCommand& vcmd);
|
|
void submitFillMemory(amd::FillMemoryCommand& vcmd);
|
|
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
|
|
void submitMarker(amd::Marker& vcmd);
|
|
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
|
|
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
|
|
void submitPerfCounter(amd::PerfCounterCommand& vcmd);
|
|
void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd);
|
|
void submitThreadTrace(amd::ThreadTraceCommand& vcmd);
|
|
void submitSignal(amd::SignalCommand& vcmd);
|
|
void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd);
|
|
virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
|
|
virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
|
|
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
|
|
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
|
|
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
|
|
virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);
|
|
|
|
void releaseMemory(GpuMemoryReference* mem);
|
|
|
|
void flush(amd::Command* list = nullptr, bool wait = false);
|
|
bool terminate() { return true; }
|
|
|
|
//! Returns GPU device object associated with this kernel
|
|
const Device& dev() const { return gpuDevice_; }
|
|
|
|
//! Set the last known GPU event
|
|
void setGpuEvent(GpuEvent gpuEvent, //!< GPU event for tracking
|
|
bool flush = false //!< TRUE if flush is required
|
|
);
|
|
|
|
//! Flush DMA buffer on the specified engine
|
|
void flushDMA(uint engineID //!< Engine ID for DMA flush
|
|
);
|
|
|
|
//! Wait for all engines on this Virtual GPU
|
|
//! Returns TRUE if CPU didn't wait for GPU
|
|
bool waitAllEngines(CommandBatch* cb = nullptr //!< Command batch
|
|
);
|
|
|
|
//! Waits for the latest GPU event with a lock to prevent multiple entries
|
|
void waitEventLock(CommandBatch* cb //!< Command batch
|
|
);
|
|
|
|
//! Returns a resource associated with the constant buffer
|
|
const ConstantBuffer* cb(uint idx) const { return constBufs_[idx]; }
|
|
|
|
//! Adds CAL objects into the constant buffer vector
|
|
void addConstBuffer(ConstantBuffer* cb) { constBufs_.push_back(cb); }
|
|
|
|
//! Start the command profiling
|
|
void profilingBegin(amd::Command& command, //!< Command queue object
|
|
bool drmProfiling = false //!< Measure DRM time
|
|
);
|
|
|
|
//! End the command profiling
|
|
void profilingEnd(amd::Command& command);
|
|
|
|
//! Collect the profiling results
|
|
bool profilingCollectResults(CommandBatch* cb, //!< Command batch
|
|
const amd::Event* waitingEvent //!< Waiting event
|
|
);
|
|
|
|
//! Adds a memory handle into the GSL memory array for Virtual Heap
|
|
inline void addVmMemory(const Memory* memory //!< GPU memory object
|
|
);
|
|
|
|
//! Adds the last submitted kernel to the queue for tracking a possible hang
|
|
inline void AddKernel(const amd::Kernel& kernel //!< AMD kernel object
|
|
) const;
|
|
|
|
//! Checks if runtime dispatches the same kernel as previously
|
|
inline bool IsSameKernel(const amd::Kernel& kernel //!< AMD kernel object
|
|
) const;
|
|
|
|
//! Adds a dopp desktop texture reference
|
|
void addDoppRef(const Memory* memory, //!< GPU memory object
|
|
bool lastDoopCmd, //!< is the last submission for the pre-present primary
|
|
bool pfpaDoppCmd //!< is a submission for the pre-present primary
|
|
);
|
|
|
|
//! Return xfer buffer for staging operations
|
|
XferBuffer& xferWrite() { return writeBuffer_; }
|
|
|
|
//! Return managed buffer for staging operations
|
|
ManagedBuffer& managedBuffer() { return managedBuffer_; }
|
|
|
|
//! Adds a pinned memory object into a map
|
|
void addPinnedMem(amd::Memory* mem);
|
|
|
|
//! Release pinned memory objects
|
|
void releasePinnedMem();
|
|
|
|
//! Finds if pinned memory is cached
|
|
amd::Memory* findPinnedMem(void* addr, size_t size);
|
|
|
|
//! Get the PrintfDbgHSA object
|
|
PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; }
|
|
|
|
//! Enables synchronized transfers
|
|
void enableSyncedBlit() const;
|
|
|
|
//! Checks if profiling is enabled
|
|
bool profiling() const { return state_.profiling_; }
|
|
|
|
//! Returns memory dependency class
|
|
MemoryDependency& memoryDependency() { return memoryDependency_; }
|
|
|
|
//! Returns hsaQueueMem_
|
|
const Memory* hsaQueueMem() const { return hsaQueueMem_; }
|
|
|
|
//! Returns DMA flush management structure
|
|
const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; }
|
|
|
|
//! Returns the HW ring used on this virtual device
|
|
uint hwRing() const { return hwRing_; }
|
|
|
|
//! Returns virtual queue object for device enqueuing
|
|
Memory* vQueue() const { return virtualQueue_; }
|
|
|
|
//! Update virtual queue header
|
|
void writeVQueueHeader(VirtualGPU& hostQ, const Memory* kernelTable);
|
|
|
|
//! Returns TRUE if virtual queue was successfully allocatted
|
|
bool createVirtualQueue(uint deviceQueueSize //!< Device queue size
|
|
);
|
|
|
|
EngineType engineID_; //!< Engine ID for this VirtualGPU
|
|
|
|
//! Returns PAL command buffer interface
|
|
Pal::ICmdBuffer* iCmd() const {
|
|
Queue* queue = queues_[engineID_];
|
|
return queue->iCmd();
|
|
}
|
|
|
|
//! Returns true if the provided command buffer is the active one
|
|
bool isActiveCmd(Pal::ICmdBuffer* iCmd) const {
|
|
return (queues_[engineID_] != nullptr) && (iCmd == queues_[engineID_]->iCmd()) ? true : false;
|
|
}
|
|
|
|
//! Returns queue, associated with VirtualGPU
|
|
Queue& queue(EngineType id) const { return *queues_[id]; }
|
|
|
|
void addBarrier(RgpSqqtBarrierReason reason = RgpSqqtBarrierReason::Unknown,
|
|
bool flushL2 = false) const {
|
|
Pal::BarrierInfo barrier = {};
|
|
barrier.pipePointWaitCount = 1;
|
|
Pal::HwPipePoint point = Pal::HwPipePostCs;
|
|
barrier.pPipePoints = &point;
|
|
barrier.transitionCount = 1;
|
|
uint32_t cacheMask = (flushL2) ? Pal::CoherCopy : Pal::CoherShader;
|
|
Pal::BarrierTransition trans = {cacheMask,
|
|
cacheMask,
|
|
{nullptr,
|
|
{{Pal::ImageAspect::Color, 0, 0}, 0, 0},
|
|
Pal::LayoutShaderRead,
|
|
Pal::LayoutShaderRead}};
|
|
barrier.pTransitions = &trans;
|
|
barrier.waitPoint = Pal::HwPipePreCs;
|
|
barrier.reason = static_cast<uint32_t>(reason);
|
|
iCmd()->CmdBarrier(barrier);
|
|
queues_[engineID_]->submit<true>(false);
|
|
}
|
|
|
|
void eventBegin(EngineType engId) const {
|
|
const static bool Begin = true;
|
|
profileEvent(engId, Begin);
|
|
}
|
|
|
|
void eventEnd(EngineType engId, GpuEvent& event, bool forceExec = false) const {
|
|
constexpr bool End = false;
|
|
if (forceExec) {
|
|
constexpr bool ForceFlush = true;
|
|
event.id_ = queues_[engId]->submit(ForceFlush);
|
|
profileEvent(engId, End);
|
|
} else {
|
|
profileEvent(engId, End);
|
|
event.id_ = queues_[engId]->submit(GPU_FLUSH_ON_EXECUTION);
|
|
}
|
|
event.engineId_ = engId;
|
|
}
|
|
|
|
void waitForEvent(GpuEvent* event) const {
|
|
if (event->isValid()) {
|
|
assert(event->engineId_ < AllEngines);
|
|
queues_[event->engineId_]->waitForEvent(event->id_);
|
|
event->invalidate();
|
|
}
|
|
}
|
|
|
|
bool isDone(GpuEvent* event) {
|
|
if (event->isValid()) {
|
|
assert(event->engineId_ < AllEngines);
|
|
if (queues_[event->engineId_]->isDone(event->id_)) {
|
|
event->invalidate();
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
//! Returns TRUE if SDMA requires overlap synchronizaiton
|
|
bool validateSdmaOverlap(const Resource& src, //!< Source resource for SDMA transfer
|
|
const Resource& dst //!< Destination resource for SDMA transfer
|
|
);
|
|
|
|
//! Checks if RGP capture is enabled
|
|
bool rgpCaptureEna() const { return state_.rgpCaptureEnabled_; }
|
|
|
|
//! Waits for idle on compute engine
|
|
void WaitForIdleCompute() {
|
|
if (events_[MainEngine].isValid()) {
|
|
queues_[events_[MainEngine].engineId_]->waitForEvent(events_[MainEngine].id_);
|
|
events_[MainEngine].invalidate();
|
|
}
|
|
}
|
|
|
|
protected:
|
|
void profileEvent(EngineType engine, bool type) const;
|
|
|
|
//! Creates buffer object from image
|
|
amd::Memory* createBufferFromImage(
|
|
amd::Memory& amdImage //! The parent image object(untiled images only)
|
|
);
|
|
|
|
private:
|
|
struct MemoryRange {
|
|
uint64_t start_; //!< Memory range start address
|
|
uint64_t end_; //!< Memory range end address
|
|
MemoryRange() : start_(0), end_(0) {}
|
|
};
|
|
|
|
//! Allocates constant buffers
|
|
bool allocConstantBuffers();
|
|
|
|
//! Allocate hsaQueueMem_
|
|
bool allocHsaQueueMem();
|
|
|
|
//! Awaits a command batch with a waiting event
|
|
bool awaitCompletion(CommandBatch* cb, //!< Command batch for to wait
|
|
const amd::Event* waitingEvent = nullptr //!< A waiting event
|
|
);
|
|
|
|
//! Detects memory dependency for HSAIL kernels and flushes caches
|
|
bool processMemObjectsHSA(const amd::Kernel& kernel, //!< AMD kernel object for execution
|
|
const_address params, //!< Pointer to the param's store
|
|
bool nativeMem, //!< Native memory objects
|
|
size_t& ldsAddess //!< Returns LDS size, used in the kernel
|
|
);
|
|
|
|
//! Common function for fill memory used by both svm Fill and non-svm fill
|
|
bool fillMemory(cl_command_type type, //!< the command type
|
|
amd::Memory* amdMemory, //!< memory object to fill
|
|
const void* pattern, //!< pattern to fill the memory
|
|
size_t patternSize, //!< pattern size
|
|
const amd::Coord3D& origin, //!< memory origin
|
|
const amd::Coord3D& size //!< memory size for filling
|
|
);
|
|
|
|
bool copyMemory(cl_command_type type, //!< the command type
|
|
amd::Memory& srcMem, //!< source memory object
|
|
amd::Memory& dstMem, //!< destination memory object
|
|
bool entire, //!< flag of entire memory copy
|
|
const amd::Coord3D& srcOrigin, //!< source memory origin
|
|
const amd::Coord3D& dstOrigin, //!< destination memory object
|
|
const amd::Coord3D& size, //!< copy size
|
|
const amd::BufferRect& srcRect, //!< region of source for copy
|
|
const amd::BufferRect& dstRect //!< region of destination for copy
|
|
);
|
|
|
|
void buildKernelInfo(const HSAILKernel& hsaKernel, //!< hsa kernel
|
|
hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch
|
|
HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch
|
|
amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command
|
|
);
|
|
|
|
void assignDebugTrapHandler(const DebugToolInfo& dbgSetting, //!< debug settings
|
|
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
|
|
);
|
|
|
|
void PrintChildren(const HSAILKernel& hsaKernel, //!< The parent HSAIL kernel
|
|
VirtualGPU* gpuDefQueue //!< Device queue for children execution
|
|
);
|
|
|
|
bool PreDeviceEnqueue(const amd::Kernel& kernel, //!< Parent amd kernel object
|
|
const HSAILKernel& hsaKernel, //!< Parent HSAIL object
|
|
VirtualGPU** gpuDefQueue, //!< [Return] GPU default queue
|
|
uint64_t* vmDefQueue //!< [Return] VM handle to the virtual queue
|
|
);
|
|
|
|
void PostDeviceEnqueue(
|
|
const amd::Kernel& kernel, //!< Parent amd kernel object
|
|
const HSAILKernel& hsaKernel, //!< Parent HSAIL object
|
|
VirtualGPU* gpuDefQueue, //!< GPU default queue
|
|
uint64_t vmDefQueue, //!< VM handle to the virtual queue
|
|
uint64_t vmParentWrap, //!< VM handle to the wrapped AQL packet location
|
|
GpuEvent* gpuEvent //!< [Return] GPU event associated with the device enqueue
|
|
);
|
|
|
|
Device& gpuDevice_; //!< physical GPU device
|
|
|
|
PrintfDbgHSA* printfDbgHSA_; //!< HSAIL printf implemenation
|
|
|
|
TimeStampCache* tsCache_; //!< TimeStamp cache
|
|
MemoryDependency memoryDependency_; //!< Memory dependency class
|
|
|
|
DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management
|
|
|
|
std::vector<amd::Memory*> pinnedMems_; //!< Pinned memory list
|
|
|
|
ManagedBuffer managedBuffer_; //!< Managed write buffer
|
|
constbufs_t constBufs_; //!< constant buffers
|
|
XferBuffer writeBuffer_; //!< Transfer/staging buffer for uploads
|
|
|
|
typedef std::queue<CommandBatch*> CommandBatchQueue;
|
|
CommandBatchQueue cbQueue_; //!< Queue of command batches
|
|
CommandBatchQueue freeCbQueue_; //!< Queue of free command batches
|
|
|
|
uint hwRing_; //!< HW ring used on this virtual device
|
|
|
|
State state_; //!< virtual GPU current state
|
|
GpuEvent events_[AllEngines]; //!< Last known GPU events
|
|
|
|
uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
|
|
TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU
|
|
TimeStamp* profileTs_; //!< current profiling timestamp for command
|
|
|
|
AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header
|
|
Memory* virtualQueue_; //!< Virtual device queue
|
|
Memory* schedParams_; //!< The scheduler parameters
|
|
uint deviceQueueSize_; //!< Device queue size
|
|
uint maskGroups_; //!< The number of mask groups processed in the scheduler by one thread
|
|
|
|
Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object
|
|
Pal::ICmdAllocator* cmdAllocator_; //!< Command buffer allocator
|
|
Queue* queues_[AllEngines]; //!< HW queues for all engines
|
|
MemoryRange sdmaRange_; //!< SDMA memory range for write access
|
|
std::vector<Image*> wrtBackImageBuffer_; //!< Array of images for write back
|
|
};
|
|
|
|
inline void VirtualGPU::addVmMemory(const Memory* memory) {
|
|
queues_[MainEngine]->addCmdMemRef(memory->memRef());
|
|
memory->setBusy(*this, queues_[MainEngine]->cmdBufId());
|
|
}
|
|
|
|
inline void VirtualGPU::AddKernel(const amd::Kernel& kernel) const {
|
|
queues_[MainEngine]->last_kernel_ = &kernel;
|
|
}
|
|
|
|
inline bool VirtualGPU::IsSameKernel(const amd::Kernel& kernel) const {
|
|
return (queues_[MainEngine]->last_kernel_ == &kernel) ? true : false;
|
|
}
|
|
|
|
template <bool avoidBarrierSubmit> uint VirtualGPU::Queue::submit(bool forceFlush) {
|
|
cmdCnt_++;
|
|
uint id = cmdBufIdCurrent_;
|
|
bool flushCmd = ((cmdCnt_ > MaxCommands) || forceFlush) && !avoidBarrierSubmit;
|
|
if (flushCmd) {
|
|
if (!flush()) {
|
|
return GpuEvent::InvalidID;
|
|
}
|
|
}
|
|
return id;
|
|
}
|
|
|
|
template <typename T>
|
|
inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
|
|
const T* src, //!< The source pointer
|
|
uint size, //!< The size in bytes to copy
|
|
size_t offset //!< The alignment to follow while writing to the buffer
|
|
) {
|
|
memcpy(dst + offset, src, size);
|
|
}
|
|
|
|
template <>
|
|
inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
|
|
const uint32_t* src, //!< The source pointer
|
|
uint size, //!< The size in bytes to copy
|
|
size_t offset //!< The alignment to follow while writing to the buffer
|
|
) {
|
|
*(reinterpret_cast<uint32_t*>(dst + offset)) = *src;
|
|
}
|
|
|
|
template <>
|
|
inline void WriteAqlArgAt(unsigned char* dst, //!< The write pointer to the buffer
|
|
const uint64_t* src, //!< The source pointer
|
|
uint size, //!< The size in bytes to copy
|
|
size_t offset //!< The alignment to follow while writing to the buffer
|
|
) {
|
|
*(reinterpret_cast<uint64_t*>(dst + offset)) = *src;
|
|
}
|
|
/*@}*/ // namespace pal
|
|
} // namespace pal
|