ファイル
rocm-systems/rocclr/runtime/device/pal/palvirtual.cpp
T
foreman 162c07fcb4 P4 to Git Change 1288095 by jsjodin@jsjodin-git2p4-llvm on 2016/07/06 18:08:24
SWDEV-3 - IR: Set TargetPrefix for some X86 and AArch64 intrinsics where it was missing

	git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274390 91177308-0d34-0410-b5e6-96231b3b80d8

	GitHash: f0a4c116041f7c2aef7796c8b067f0947b69602d

Affected files ...

... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/include/llvm/IR/IntrinsicsAArch64.td#2 edit
... //depot/stg/opencl/drivers/opencl/compiler/llvm.git/include/llvm/IR/IntrinsicsX86.td#2 edit
2016-07-07 02:37:58 -04:00

3471 行
118 KiB
C++

//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "platform/perfctr.hpp"
#include "platform/threadtrace.hpp"
#include "platform/kernel.hpp"
#include "platform/commandqueue.hpp"
#include "device/pal/palconstbuf.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/palkernel.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palcounters.hpp"
#include "device/pal/palthreadtrace.hpp"
#include "device/pal/paltimestamp.hpp"
#include "device/pal/palblit.hpp"
#include "device/pal/paldebugger.hpp"
#include "hsa.h"
#include "amd_hsa_kernel_code.h"
#include "amd_hsa_queue.h"
#include <fstream>
#include <sstream>
#include <algorithm>
#include "palQueue.h"
#include "palFence.h"
#ifdef _WIN32
#include <d3d10_1.h>
#include "amdocl/cl_d3d9_amd.hpp"
#include "amdocl/cl_d3d10_amd.hpp"
#include "amdocl/cl_d3d11_amd.hpp"
#endif // _WIN32
namespace pal {
VirtualGPU::Queue*
VirtualGPU::Queue::Create(
Pal::IDevice* palDev,
Pal::QueueType queueType,
uint engineIdx,
Pal::ICmdAllocator* cmdAllocator)
{
Pal::Result result;
Pal::QueueCreateInfo qCreateInfo = {};
qCreateInfo.engineType = queueType;
qCreateInfo.engineIndex = engineIdx;
qCreateInfo.aqlQueue = true;
// Find queue object size
size_t qSize = palDev->GetQueueSize(qCreateInfo, &result);
if (result != Pal::Result::Success) {
return nullptr;
}
Pal::CmdBufferCreateInfo cmdCreateInfo = {};
cmdCreateInfo.pCmdAllocator = cmdAllocator;
cmdCreateInfo.queueType = queueType;
// Find command buffer object size
size_t cmdSize = palDev->GetCmdBufferSize(cmdCreateInfo, &result);
if (result != Pal::Result::Success) {
return nullptr;
}
// Find fence object size
size_t fSize = palDev->GetFenceSize(&result);
if (result != Pal::Result::Success) {
return nullptr;
}
size_t allocSize = qSize + MaxCmdBuffers * (cmdSize + fSize);
VirtualGPU::Queue* queue = new (allocSize) VirtualGPU::Queue(palDev);
if (queue != nullptr) {
address addrQ = reinterpret_cast<address>(&queue[1]);
// Create PAL queue object
result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
if (result != Pal::Result::Success) {
delete queue;
return nullptr;
}
address addrCmd = addrQ + qSize;
address addrF = addrCmd + MaxCmdBuffers * cmdSize;
Pal::CmdBufferBuildInfo cmdBuildInfo = {};
for (uint i = 0; i < MaxCmdBuffers; ++i) {
result = palDev->CreateCmdBuffer(cmdCreateInfo,
&addrCmd[i*cmdSize], &queue->iCmdBuffs_[i]);
if (result != Pal::Result::Success) {
delete queue;
return nullptr;
}
static const bool InitiallySignaled = false;
result = palDev->CreateFence(InitiallySignaled, &addrF[i*fSize],
&queue->iCmdFences_[i]);
if (result != Pal::Result::Success) {
delete queue;
return nullptr;
}
if (i == StartCmdBufIdx) {
result = queue->iCmdBuffs_[i]->Begin(cmdBuildInfo);
if (result != Pal::Result::Success) {
delete queue;
return nullptr;
}
}
}
}
return queue;
}
VirtualGPU::Queue::~Queue()
{
std::vector<Pal::IGpuMemory*> memRef;
// Remove all memory references
for (auto it: memReferences_) {
memRef.push_back(it.first);
}
if (memRef.size() != 0) {
iDev_->RemoveGpuMemoryReferences(memRef.size(), &memRef[0], NULL);
}
memReferences_.clear();
for (uint i = 0; i < MaxCmdBuffers; ++i) {
if (nullptr != iCmdBuffs_[i]) {
iCmdBuffs_[i]->Destroy();
}
if (nullptr != iCmdFences_[i]) {
iCmdFences_[i]->Destroy();
}
}
if (nullptr != iQueue_) {
iQueue_->Destroy();
}
}
void
VirtualGPU::Queue::addCmdMemRef(Pal::IGpuMemory* iMem)
{
auto it = memReferences_.find(iMem);
if (it != memReferences_.end()) {
it->second = (it->second & FirstMemoryReference) | cmdBufIdSlot_;
}
else {
memReferences_[iMem] = FirstMemoryReference | cmdBufIdSlot_;
}
}
void
VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem)
{
if (0 != memReferences_.erase(iMem)) {
iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
}
}
uint
VirtualGPU::Queue::submit()
{
cmdCnt_++;
uint id = cmdBufIdCurrent_;
if ((cmdCnt_ > MaxCommands) || GPU_FLUSH_ON_EXECUTION) {
if (!flush()) {
return GpuEvent::InvalidID;
}
}
return id;
}
bool
VirtualGPU::Queue::flush()
{
std::vector<Pal::IGpuMemory*> memRef;
// Stop commands building
if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->End()) {
LogError("PAL failed to finalize a command buffer!");
return false;
}
// Add memory references
for (auto it = memReferences_.begin(); it != memReferences_.end(); ++it) {
if (it->second & FirstMemoryReference) {
it->second &= ~FirstMemoryReference;
memRef.push_back(it->first);
}
}
if (memRef.size() != 0) {
iDev_->AddGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_,
Pal::GpuMemoryRefCantTrim);
}
// Submit command buffer to OS
if (Pal::Result::Success != iQueue_->Submit(
1, &iCmdBuffs_[cmdBufIdSlot_], nullptr, 0, nullptr, iCmdFences_[cmdBufIdSlot_])) {
LogError("PAL failed to submit CMD!");
return false;
}
if (GPU_FLUSH_ON_EXECUTION) {
if (Pal::Result::Success !=
iDev_->WaitForFences(1, &iCmdFences_[cmdBufIdSlot_], true, WaitTimeoutInNsec)) {
LogError("PAL wait for a fence failed!");
return false;
}
}
// Reset the counter of commands
cmdCnt_ = 0;
// Find the next command buffer
cmdBufIdCurrent_++;
if (cmdBufIdCurrent_ == GpuEvent::InvalidID) {
///@todo handle wrapping
cmdBufIdCurrent_ = 1;
cmbBufIdRetired_ = 0;
}
// Wrap current slot
cmdBufIdSlot_ = cmdBufIdCurrent_ % MaxCmdBuffers;
// Make sure the slot isn't busy
if (Pal::Result::NotReady == iCmdFences_[cmdBufIdSlot_]->GetStatus()) {
if (Pal::Result::Success !=
iDev_->WaitForFences(1, &iCmdFences_[cmdBufIdSlot_], true, WaitTimeoutInNsec)) {
LogError("PAL wait for a fence failed!");
return false;
}
}
// Progress retired TS
if ((cmdBufIdCurrent_ > MaxCmdBuffers) &&
(cmbBufIdRetired_ < (cmdBufIdCurrent_ - MaxCmdBuffers))) {
cmbBufIdRetired_ = cmdBufIdCurrent_ - MaxCmdBuffers;
}
if (Pal::Result::Success !=
iDev_->ResetFences(1, &iCmdFences_[cmdBufIdSlot_])) {
LogError("PAL failed to reset a fence!");
return false;
}
// Start command buffer building
Pal::CmdBufferBuildInfo cmdBuildInfo = {};
if (Pal::Result::Success != iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo)) {
LogError("PAL failed CB building initialization!");
return false;
}
memRef.clear();
// Remove old memory references
for (auto it = memReferences_.begin(); it != memReferences_.end();) {
if (it->second == cmdBufIdSlot_) {
memRef.push_back(it->first);
it = memReferences_.erase(it);
}
else {
++it;
}
}
if (memRef.size() != 0) {
iDev_->RemoveGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_);
}
return true;
}
bool
VirtualGPU::Queue::waitForEvent(uint id)
{
if (isDone(id)) {
return true;
}
uint slotId = id % MaxCmdBuffers;
// Wait for the specified fence
if (Pal::Result::Success != iCmdFences_[slotId]->GetStatus()) {
if (Pal::Result::Success !=
iDev_->WaitForFences(1, &iCmdFences_[slotId], true, WaitTimeoutInNsec)) {
LogError("PAL wait for a fence failed!");
return false;
}
}
cmbBufIdRetired_ = id;
return true;
}
bool
VirtualGPU::Queue::isDone(uint id)
{
if ((id <= cmbBufIdRetired_) || (id > cmdBufIdCurrent_)) {
return true;
}
if (id == cmdBufIdCurrent_) {
// Flush the current command buffer
flush();
}
if (Pal::Result::Success != iCmdFences_[id % MaxCmdBuffers]->GetStatus()) {
return false;
}
cmbBufIdRetired_ = id;
return true;
}
bool
VirtualGPU::MemoryDependency::create(size_t numMemObj)
{
if (numMemObj > 0) {
// Allocate the array of memory objects for dependency tracking
memObjectsInQueue_ = new MemoryState[numMemObj];
if (nullptr == memObjectsInQueue_) {
return false;
}
memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj);
maxMemObjectsInQueue_ = numMemObj;
}
return true;
}
void
VirtualGPU::MemoryDependency::validate(
VirtualGPU& gpu,
const Memory* memory,
bool readOnly)
{
bool flushL1Cache = false;
if (maxMemObjectsInQueue_ == 0) {
// Flush cache
gpu.flushCUCaches();
return;
}
uint64_t curStart = memory->vmAddress();
uint64_t curEnd = curStart + memory->vmSize();
// Loop through all memory objects in the queue and find dependency
// @note don't include objects from the current kernel
for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
// Check if the queue already contains this mem object and
// GPU operations aren't readonly
uint64_t busyStart = memObjectsInQueue_[j].start_;
uint64_t busyEnd = memObjectsInQueue_[j].end_;
// Check if the start inside the busy region
if ((((curStart >= busyStart) && (curStart < busyEnd)) ||
// Check if the end inside the busy region
((curEnd > busyStart) && (curEnd <= busyEnd)) ||
// Check if the start/end cover the busy region
((curStart <= busyStart) && (curEnd >= busyEnd))) &&
// If the buys region was written or the current one is for write
(!memObjectsInQueue_[j].readOnly_ || !readOnly)) {
flushL1Cache = true;
break;
}
}
// Did we reach the limit?
if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) {
flushL1Cache = true;
}
if (flushL1Cache) {
// Flush cache
gpu.flushCUCaches();
// Clear memory dependency state
const static bool All = true;
clear(!All);
}
// Insert current memory object into the queue always,
// since runtime calls flush before kernel execution and it has to keep
// current kernel in tracking
memObjectsInQueue_
[numMemObjectsInQueue_].start_ = curStart;
memObjectsInQueue_
[numMemObjectsInQueue_].end_ = curEnd;
memObjectsInQueue_
[numMemObjectsInQueue_].readOnly_ = readOnly;
numMemObjectsInQueue_++;
}
void
VirtualGPU::MemoryDependency::clear(bool all)
{
if (numMemObjectsInQueue_ > 0) {
size_t i, j;
if (all) {
endMemObjectsInQueue_ = numMemObjectsInQueue_;
}
// Preserve all objects from the current kernel
for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) {
memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_;
memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_;
memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_;
}
// Clear all objects except current kernel
memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_);
numMemObjectsInQueue_ -= endMemObjectsInQueue_;
endMemObjectsInQueue_ = 0;
}
}
VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev)
: cbWorkload_(0)
, dispatchSplitSize_(0)
{
aluCnt_ = dev.info().simdPerCU_ * dev.info().simdWidth_ * dev.info().maxComputeUnits_;
maxDispatchWorkload_ = static_cast<uint64_t>(dev.info().maxClockFrequency_) *
// find time in us
100 * dev.settings().maxWorkloadTime_ *
aluCnt_;
resetCbWorkload(dev);
}
void
VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev)
{
cbWorkload_ = 0;
maxCbWorkload_ = static_cast<uint64_t>(dev.info().maxClockFrequency_) *
// find time in us
100 * dev.settings().minWorkloadTime_ * aluCnt_;
}
void
VirtualGPU::DmaFlushMgmt::findSplitSize(
const Device& dev, uint64_t threads, uint instructions)
{
uint64_t workload = threads * instructions;
if (maxDispatchWorkload_ < workload) {
dispatchSplitSize_ = static_cast<uint>(maxDispatchWorkload_ / instructions);
uint fullLoad = dev.info().maxComputeUnits_ * dev.info().maxWorkGroupSize_;
if ((dispatchSplitSize_ % fullLoad) != 0) {
dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad;
}
}
else {
dispatchSplitSize_ = (threads > dev.settings().workloadSplitSize_) ?
dev.settings().workloadSplitSize_ : 0;
}
}
bool
VirtualGPU::DmaFlushMgmt::isCbReady(
VirtualGPU& gpu, uint64_t threads, uint instructions)
{
bool cbReady = false;
uint64_t workload = amd::alignUp(threads, 4 * aluCnt_) * instructions;
// Add current workload to the overall workload in the current DMA
cbWorkload_ += workload;
// Did it exceed maximum?
if (cbWorkload_ > maxCbWorkload_) {
// Reset DMA workload
cbWorkload_ = 0;
// Increase workload of the next DMA buffer by 50%
maxCbWorkload_ = maxCbWorkload_ * 3 / 2;
if (maxCbWorkload_ > maxDispatchWorkload_) {
maxCbWorkload_ = maxDispatchWorkload_;
}
cbReady = true;
}
return cbReady;
}
void
VirtualGPU::addXferWrite(Memory& memory)
{
if (xferWriteBuffers_.size() > 7) {
dev().xferWrite().release(*this, *xferWriteBuffers_.front());
xferWriteBuffers_.pop_front();
}
// Delay destruction
xferWriteBuffers_.push_back(&memory);
}
void
VirtualGPU::releaseXferWrite()
{
for (auto& memory : xferWriteBuffers_) {
dev().xferWrite().release(*this, *memory);
}
xferWriteBuffers_.clear();
}
void
VirtualGPU::addPinnedMem(amd::Memory* mem)
{
if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
if (pinnedMems_.size() > 7) {
pinnedMems_.front()->release();
pinnedMems_.pop_front();
}
// Start operation, since we should release mem object
flushDMA(getGpuEvent(dev().getGpuMemory(mem)->iMem())->engineId_);
// Delay destruction
pinnedMems_.push_back(mem);
}
}
void
VirtualGPU::releasePinnedMem()
{
for (auto& amdMemory : pinnedMems_) {
amdMemory->release();
}
pinnedMems_.clear();
}
amd::Memory*
VirtualGPU::findPinnedMem(void* addr, size_t size)
{
for (auto& amdMemory : pinnedMems_) {
if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) {
return amdMemory;
}
}
return nullptr;
}
bool
VirtualGPU::createVirtualQueue(uint deviceQueueSize)
{
uint MinDeviceQueueSize = 16 * 1024;
deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize);
maskGroups_ = deviceQueueSize / (512 * Ki);
maskGroups_ = (maskGroups_== 0) ? 1 : maskGroups_;
// Align the queue size for the multiple dispatch scheduler.
// Each thread works with 32 entries * maskGroups
uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) *
DeviceQueueMaskSize * maskGroups_);
if (extra != 0) {
deviceQueueSize += (sizeof(AmdAqlWrap) *
DeviceQueueMaskSize * maskGroups_) - extra;
}
if (deviceQueueSize_ == deviceQueueSize) {
return true;
}
else {
//! @todo Temporarily keep the buffer mapped for debug purpose
if (nullptr != schedParams_) {
schedParams_->unmap(this);
}
delete vqHeader_;
delete virtualQueue_;
delete schedParams_;
vqHeader_ = nullptr;
virtualQueue_ = nullptr;
schedParams_ = nullptr;
schedParamIdx_ = 0;
deviceQueueSize_ = 0;
}
uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
uint allocSize = deviceQueueSize;
// Add the virtual queue header
allocSize += sizeof(AmdVQueueHeader);
allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap));
uint argOffs = allocSize;
// Add the kernel arguments and wait events
uint singleArgSize = amd::alignUp(dev().info().maxParameterSize_ + 64 +
dev().settings().numWaitEvents_ * sizeof(uint64_t), sizeof(AmdAqlWrap));
allocSize += singleArgSize * numSlots;
uint eventsOffs = allocSize;
// Add the device events
allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent);
uint eventMaskOffs = allocSize;
// Add mask array for events
allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8;
uint slotMaskOffs = allocSize;
// Add mask array for AmdAqlWrap slots
allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;
virtualQueue_ = new Memory(dev(), allocSize);
Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ?
Resource::Local : Resource::Remote;
if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
return false;
}
address ptr = reinterpret_cast<address>(
virtualQueue_->map(this, Resource::WriteOnly));
if (nullptr == ptr) {
return false;
}
// Clear memory
memset(ptr, 0, allocSize);
uint64_t vaBase = virtualQueue_->vmAddress();
AmdVQueueHeader* header = reinterpret_cast<AmdVQueueHeader*>(ptr);
// Initialize the virtual queue header
header->aql_slot_num = numSlots;
header->event_slot_num = dev().settings().numDeviceEvents_;
header->event_slot_mask = vaBase + eventMaskOffs;
header->event_slots = vaBase + eventsOffs;
header->aql_slot_mask = vaBase + slotMaskOffs;
header->wait_size = dev().settings().numWaitEvents_;
header->arg_size = dev().info().maxParameterSize_ + 64;
header->mask_groups = maskGroups_;
vqHeader_ = new AmdVQueueHeader;
if (nullptr == vqHeader_) {
return false;
}
*vqHeader_ = *header;
// Go over all slots and perform initialization
AmdAqlWrap* slots = reinterpret_cast<AmdAqlWrap*>(&header[1]);
for (uint i = 0; i < numSlots; ++i) {
uint64_t argStart = vaBase + argOffs + i * singleArgSize;
slots[i].aql.kernarg_address = reinterpret_cast<void*>(argStart);
slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
}
// Upload data back to local memory
if (GPU_PRINT_CHILD_KERNEL == 0) {
virtualQueue_->unmap(this);
}
schedParams_ = new Memory(dev(), 64 * Ki);
if ((schedParams_ == nullptr) || !schedParams_->create(Resource::RemoteUSWC)) {
return false;
}
ptr = reinterpret_cast<address>(schedParams_->map(this));
deviceQueueSize_ = deviceQueueSize;
return true;
}
VirtualGPU::VirtualGPU(
Device& device)
: device::VirtualDevice(device)
, engineID_(MainEngine)
, gpuDevice_(static_cast<Device&>(device))
, execution_("Virtual GPU execution lock", true)
, printfDbg_(nullptr)
, printfDbgHSA_(nullptr)
, tsCache_(nullptr)
, dmaFlushMgmt_(device)
, hwRing_(0)
, readjustTimeGPU_(0)
, currTs_(nullptr)
, vqHeader_(nullptr)
, virtualQueue_(nullptr)
, schedParams_(nullptr)
, schedParamIdx_(0)
, deviceQueueSize_(0)
, maskGroups_(1)
, hsaQueueMem_(nullptr)
, cmdAllocator_(nullptr)
{
memset(&cal_, 0, sizeof(CalVirtualDesc));
for (uint i = 0; i < AllEngines; ++i) {
cal_.events_[i].invalidate();
}
// Note: Virtual GPU device creation must be a thread safe operation
index_ = gpuDevice_.numOfVgpus_++;
gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus());
gpuDevice_.vgpus_[index()] = this;
queues_[MainEngine] = nullptr;
queues_[SdmaEngine] = nullptr;
}
bool
VirtualGPU::create(bool profiling, uint deviceQueueSize)
{
device::BlitManager::Setup blitSetup;
if (index() >= GPU_MAX_COMMAND_QUEUES) {
// Cap the maximum number of concurrent Virtual GPUs
return false;
}
// Virtual GPU will have profiling enabled
state_.profiling_ = profiling;
Pal::CmdAllocatorCreateInfo createInfo = {};
// \todo forces PAL to reuse CBs, but requires postamble
createInfo.flags.autoMemoryReuse = true;
createInfo.flags.threadSafe = false;
createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap =
Pal::GpuHeapGartCacheable;
createInfo.allocInfo[Pal::CommandDataAlloc].allocSize = 128 * Ki;
createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize = 128 * Ki;
createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap =
Pal::GpuHeapGartCacheable;
createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 64 * Ki;
createInfo.allocInfo[Pal::EmbeddedDataAlloc].suballocSize = 64 * Ki;
Pal::Result result;
size_t cmdAllocSize = dev().iDev()->GetCmdAllocatorSize(createInfo, &result);
if (Pal::Result::Success != result) {
return false;
}
char* addr = new char [cmdAllocSize];
if (Pal::Result::Success !=
dev().iDev()->CreateCmdAllocator(createInfo, addr, &cmdAllocator_)) {
return false;
}
if (dev().numComputeEngines()) {
uint idx = index() % dev().numComputeEngines();
// hwRing_ should be set 0 if forced to have single scratch buffer
hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
queues_[MainEngine] = Queue::Create(
dev().iDev(), Pal::QueueTypeCompute, idx, cmdAllocator_);
if (nullptr == queues_[MainEngine]) {
return false;
}
// Check if device has SDMA engines
if (dev().numDMAEngines() != 0) {
queues_[SdmaEngine] = Queue::Create(
dev().iDev(), Pal::QueueTypeDma,
idx % dev().numDMAEngines(), cmdAllocator_);
if (nullptr == queues_[SdmaEngine]) {
return false;
}
}
else {
Unimplemented();
}
}
else {
Unimplemented();
}
// Diable double copy optimization,
// since UAV read from nonlocal is fast enough
blitSetup.disableCopyBufferToImageOpt_ = true;
if (!allocConstantBuffers()) {
return false;
}
// Create Printf class
printfDbg_ = new PrintfDbg(gpuDevice_);
if ((nullptr == printfDbg_) || !printfDbg_->create()) {
delete printfDbg_;
LogError("Could not allocate debug buffer for printf()!");
return false;
}
// Create HSAILPrintf class
printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_);
if (nullptr == printfDbgHSA_) {
delete printfDbgHSA_;
LogError("Could not create PrintfDbgHSA class!");
return false;
}
// Choose the appropriate class for blit engine
switch (dev().settings().blitEngine_) {
default:
// Fall through ...
case Settings::BlitEngineHost:
blitSetup.disableAll();
// Fall through ...
case Settings::BlitEngineCAL:
case Settings::BlitEngineKernel:
// use host blit for HW debug
if (dev().settings().enableHwDebug_) {
blitSetup.disableCopyImageToBuffer_ = true;
blitSetup.disableCopyBufferToImage_ = true;
}
blitMgr_ = new KernelBlitManager(*this, blitSetup);
break;
}
if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
LogError("Could not create BlitManager!");
return false;
}
tsCache_ = new TimeStampCache(*this);
if (nullptr == tsCache_) {
LogError("Could not create TimeStamp cache!");
return false;
}
if (!memoryDependency().create(dev().settings().numMemDependencies_)) {
LogError("Could not create the array of memory objects!");
return false;
}
if(!allocHsaQueueMem()) {
LogError("Could not create hsaQueueMem object!");
return false;
}
// Check if the app requested a device queue creation
if (dev().settings().useDeviceQueue_ &&
(0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) {
LogError("Could not create a virtual queue!");
return false;
}
return true;
}
bool
VirtualGPU::allocHsaQueueMem()
{
// Allocate a dummy HSA queue
hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t));
if ((hsaQueueMem_ == nullptr) ||
(!hsaQueueMem_->create(Resource::RemoteUSWC))) {
delete hsaQueueMem_;
return false;
}
amd_queue_t* queue = reinterpret_cast<amd_queue_t*>
(hsaQueueMem_->map(nullptr, Resource::WriteOnly));
if (nullptr == queue) {
delete hsaQueueMem_;
return false;
}
memset(queue, 0, sizeof(amd_queue_t));
// Provide private and local heap addresses
const static uint addressShift = LP64_SWITCH(0, 32);
queue->private_segment_aperture_base_hi = static_cast<uint32_t>(
dev().properties().gpuMemoryProperties.privateApertureBase >> addressShift);
queue->group_segment_aperture_base_hi = static_cast<uint32_t>(
dev().properties().gpuMemoryProperties.sharedApertureBase >> addressShift);
hsaQueueMem_->unmap(nullptr);
return true;
}
VirtualGPU::~VirtualGPU()
{
// Not safe to remove a queue. So lock the device
amd::ScopedLock k(dev().lockAsyncOps());
amd::ScopedLock lock(dev().vgpusAccess());
// Destroy all memories
static const bool SkipScratch = false;
releaseMemObjects(SkipScratch);
// Destroy printf object
delete printfDbg_;
// Destroy printfHSA object
delete printfDbgHSA_;
// Destroy BlitManager object
delete blitMgr_;
// Destroy TimeStamp cache
delete tsCache_;
// Destroy resource list with the constant buffers
for (uint i = 0; i < constBufs_.size(); ++i) {
delete constBufs_[i];
}
// Destroy queues
if (nullptr != queues_[MainEngine]) {
// Make sure the queues are idle
// It's unclear why PAL could still have a busy queue
queues_[MainEngine]->iQueue_->WaitIdle();
delete queues_[MainEngine];
}
if (nullptr != queues_[SdmaEngine]) {
queues_[SdmaEngine]->iQueue_->WaitIdle();
delete queues_[SdmaEngine];
}
if (nullptr != cmdAllocator_) {
cmdAllocator_->Destroy();
delete [] reinterpret_cast<char*>(cmdAllocator_);
}
gpuDevice_.numOfVgpus_--;
gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index());
for (uint idx = index(); idx < dev().vgpus().size(); ++idx) {
dev().vgpus()[idx]->index_--;
}
// Release scratch buffer memory to reduce memory pressure
//!@note OCLtst uses single device with multiple tests
//! Release memory only if it's the last command queue.
//! The first queue is reserved for the transfers on device
if (gpuDevice_.numOfVgpus_ <= 1) {
gpuDevice_.destroyScratchBuffers();
}
//! @todo Temporarily keep the buffer mapped for debug purpose
if (nullptr != schedParams_) {
schedParams_->unmap(this);
}
delete vqHeader_;
delete virtualQueue_;
delete schedParams_;
delete hsaQueueMem_;
}
void
VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// Translate memory references and ensure cache up-to-date
pal::Memory* memory = dev().getGpuMemory(&vcmd.source());
size_t offset = 0;
// Find if virtual address is a CL allocation
device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset);
profilingBegin(vcmd, true);
memory->syncCacheFromHost(*this);
cl_command_type type = vcmd.type();
bool result = false;
amd::Memory* bufferFromImage = nullptr;
// Force buffer read for IMAGE1D_BUFFER
if ((type == CL_COMMAND_READ_IMAGE) &&
(vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
bufferFromImage = createBufferFromImage(vcmd.source());
if (nullptr == bufferFromImage) {
LogError("We should not fail buffer creation from image_buffer!");
}
else {
type = CL_COMMAND_READ_BUFFER;
bufferFromImage->setVirtualDevice(this);
memory = dev().getGpuMemory(bufferFromImage);
}
}
// Process different write commands
switch (type) {
case CL_COMMAND_READ_BUFFER: {
amd::Coord3D origin(vcmd.origin()[0]);
amd::Coord3D size(vcmd.size()[0]);
if (nullptr != bufferFromImage) {
size_t elemSize =
vcmd.source().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
}
if (hostMemory != nullptr) {
// Accelerated transfer without pinning
amd::Coord3D dstOrigin(offset);
result = blitMgr().copyBuffer(*memory, *hostMemory,
origin, dstOrigin, size, vcmd.isEntireMemory());
}
else {
result = blitMgr().readBuffer(
*memory, vcmd.destination(),
origin, size, vcmd.isEntireMemory());
}
if (nullptr != bufferFromImage) {
bufferFromImage->release();
}
}
break;
case CL_COMMAND_READ_BUFFER_RECT: {
amd::BufferRect hostbufferRect;
amd::Coord3D region(0);
amd::Coord3D hostOrigin(vcmd.hostRect().start_+ offset);
hostbufferRect.create(hostOrigin.c, vcmd.size().c , vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_);
if (hostMemory != nullptr) {
result = blitMgr().copyBufferRect(*memory, *hostMemory,
vcmd.bufRect(), hostbufferRect, vcmd.size(),
vcmd.isEntireMemory());
}
else {
result = blitMgr().readBufferRect(*memory,
vcmd.destination(), vcmd.bufRect(), vcmd.hostRect(), vcmd.size(),
vcmd.isEntireMemory());
}
}
break;
case CL_COMMAND_READ_IMAGE:
if (hostMemory != nullptr) {
// Accelerated image to buffer transfer without pinning
amd::Coord3D dstOrigin(offset);
result = blitMgr().copyImageToBuffer(*memory, *hostMemory,
vcmd.origin(), dstOrigin, vcmd.size(),
vcmd.isEntireMemory(),
vcmd.rowPitch(), vcmd.slicePitch());
}
else {
result = blitMgr().readImage(*memory, vcmd.destination(),
vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(),
vcmd.isEntireMemory());
}
break;
default:
LogError("Unsupported type for the read command");
break;
}
if (!result) {
LogError("submitReadMemory failed!");
vcmd.setStatus(CL_INVALID_OPERATION);
}
profilingEnd(vcmd);
}
void
VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// Translate memory references and ensure cache up to date
pal::Memory* memory = dev().getGpuMemory(&vcmd.destination());
size_t offset = 0;
// Find if virtual address is a CL allocation
device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset);
profilingBegin(vcmd, true);
bool entire = vcmd.isEntireMemory();
// Synchronize memory from host if necessary
device::Memory::SyncFlags syncFlags;
syncFlags.skipEntire_ = entire;
memory->syncCacheFromHost(*this, syncFlags);
cl_command_type type = vcmd.type();
bool result = false;
amd::Memory* bufferFromImage = nullptr;
// Force buffer write for IMAGE1D_BUFFER
if ((type == CL_COMMAND_WRITE_IMAGE) &&
(vcmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
bufferFromImage = createBufferFromImage(vcmd.destination());
if (nullptr == bufferFromImage) {
LogError("We should not fail buffer creation from image_buffer!");
}
else {
type = CL_COMMAND_WRITE_BUFFER;
bufferFromImage->setVirtualDevice(this);
memory = dev().getGpuMemory(bufferFromImage);
}
}
// Process different write commands
switch (type) {
case CL_COMMAND_WRITE_BUFFER: {
amd::Coord3D origin(vcmd.origin()[0]);
amd::Coord3D size(vcmd.size()[0]);
if (nullptr != bufferFromImage) {
size_t elemSize =
vcmd.destination().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
}
if (hostMemory != nullptr) {
// Accelerated transfer without pinning
amd::Coord3D srcOrigin(offset);
result = blitMgr().copyBuffer(*hostMemory, *memory,
srcOrigin, origin, size, vcmd.isEntireMemory());
}
else {
result = blitMgr().writeBuffer(vcmd.source(), *memory,
origin, size, vcmd.isEntireMemory());
}
if (nullptr != bufferFromImage) {
bufferFromImage->release();
}
}
break;
case CL_COMMAND_WRITE_BUFFER_RECT: {
amd::BufferRect hostbufferRect;
amd::Coord3D region(0);
amd::Coord3D hostOrigin(vcmd.hostRect().start_+ offset);
hostbufferRect.create(hostOrigin.c, vcmd.size().c , vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_);
if (hostMemory != nullptr) {
result = blitMgr().copyBufferRect(*hostMemory, *memory,
hostbufferRect, vcmd.bufRect(), vcmd.size(),
vcmd.isEntireMemory());
}
else {
result = blitMgr().writeBufferRect(vcmd.source(), *memory,
vcmd.hostRect(), vcmd.bufRect(), vcmd.size(),
vcmd.isEntireMemory());
}
}
break;
case CL_COMMAND_WRITE_IMAGE:
if (hostMemory != nullptr) {
// Accelerated buffer to image transfer without pinning
amd::Coord3D srcOrigin(offset);
result = blitMgr().copyBufferToImage(*hostMemory, *memory,
srcOrigin, vcmd.origin(), vcmd.size(),
vcmd.isEntireMemory(),
vcmd.rowPitch(), vcmd.slicePitch());
}
else {
result = blitMgr().writeImage(vcmd.source(), *memory,
vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(),
vcmd.isEntireMemory());
}
break;
default:
LogError("Unsupported type for the write command");
break;
}
if (!result) {
LogError("submitWriteMemory failed!");
vcmd.setStatus(CL_INVALID_OPERATION);
}
else {
// Mark this as the most-recently written cache of the destination
vcmd.destination().signalWrite(&gpuDevice_);
}
profilingEnd(vcmd);
}
bool
VirtualGPU::copyMemory(cl_command_type type
, amd::Memory& srcMem
, amd::Memory& dstMem
, bool entire
, const amd::Coord3D& srcOrigin
, const amd::Coord3D& dstOrigin
, const amd::Coord3D& size
, const amd::BufferRect& srcRect
, const amd::BufferRect& dstRect
)
{
// Translate memory references and ensure cache up-to-date
pal::Memory* dstMemory = dev().getGpuMemory(&dstMem);
pal::Memory* srcMemory = dev().getGpuMemory(&srcMem);
// Synchronize source and destination memory
device::Memory::SyncFlags syncFlags;
syncFlags.skipEntire_ = entire;
dstMemory->syncCacheFromHost(*this, syncFlags);
srcMemory->syncCacheFromHost(*this);
amd::Memory* bufferFromImageSrc = nullptr;
amd::Memory* bufferFromImageDst = nullptr;
// Force buffer read for IMAGE1D_BUFFER
if ((srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
bufferFromImageSrc = createBufferFromImage(srcMem);
if (nullptr == bufferFromImageSrc) {
LogError("We should not fail buffer creation from image_buffer!");
}
else {
type = CL_COMMAND_COPY_BUFFER;
bufferFromImageSrc->setVirtualDevice(this);
srcMemory = dev().getGpuMemory(bufferFromImageSrc);
}
}
// Force buffer write for IMAGE1D_BUFFER
if ((dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
bufferFromImageDst = createBufferFromImage(dstMem);
if (nullptr == bufferFromImageDst) {
LogError("We should not fail buffer creation from image_buffer!");
}
else {
type = CL_COMMAND_COPY_BUFFER;
bufferFromImageDst->setVirtualDevice(this);
dstMemory = dev().getGpuMemory(bufferFromImageDst);
}
}
bool result = false;
// Check if HW can be used for memory copy
switch (type) {
case CL_COMMAND_SVM_MEMCPY:
case CL_COMMAND_COPY_BUFFER: {
amd::Coord3D realSrcOrigin(srcOrigin[0]);
amd::Coord3D realDstOrigin(dstOrigin[0]);
amd::Coord3D realSize(size.c[0],size.c[1],size.c[2]);
if (nullptr != bufferFromImageSrc) {
size_t elemSize =
srcMem.asImage()->getImageFormat().getElementSize();
realSrcOrigin.c[0] *= elemSize;
if (nullptr != bufferFromImageDst) {
realDstOrigin.c[0] *= elemSize;
}
realSize.c[0] *= elemSize;
}
else if (nullptr != bufferFromImageDst) {
size_t elemSize =
dstMem.asImage()->getImageFormat().getElementSize();
realDstOrigin.c[0] *= elemSize;
realSize.c[0] *= elemSize;
}
result = blitMgr().copyBuffer(*srcMemory, *dstMemory,
realSrcOrigin, realDstOrigin, realSize, entire);
if (nullptr != bufferFromImageSrc) {
bufferFromImageSrc->release();
}
if (nullptr != bufferFromImageDst) {
bufferFromImageDst->release();
}
}
break;
case CL_COMMAND_COPY_BUFFER_RECT:
result = blitMgr().copyBufferRect(*srcMemory, *dstMemory,
srcRect, dstRect, size, entire);
break;
case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
result = blitMgr().copyImageToBuffer(*srcMemory, *dstMemory,
srcOrigin, dstOrigin, size, entire);
break;
case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
result = blitMgr().copyBufferToImage(*srcMemory, *dstMemory,
srcOrigin, dstOrigin, size, entire);
break;
case CL_COMMAND_COPY_IMAGE:
result = blitMgr().copyImage(*srcMemory, *dstMemory,
srcOrigin, dstOrigin, size, entire);
break;
default:
LogError("Unsupported command type for memory copy!");
break;
}
if (!result) {
LogError("submitCopyMemory failed!");
return false;
}
else {
// Mark this as the most-recently written cache of the destination
dstMem.signalWrite(&gpuDevice_);
}
return true;
}
void
VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd);
cl_command_type type = vcmd.type();
bool entire = vcmd.isEntireMemory();
if (!copyMemory(type, vcmd.source(), vcmd.destination(), entire,
vcmd.srcOrigin(), vcmd.dstOrigin(), vcmd.size(), vcmd.srcRect(),
vcmd.dstRect())) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
profilingEnd(vcmd);
}
void
VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd);
cl_command_type type = vcmd.type();
//no op for FGS supported device
if (!dev().isFineGrainedSystem()) {
amd::Memory* srcMem = amd::SvmManager::FindSvmBuffer(vcmd.src());
amd::Memory* dstMem = amd::SvmManager::FindSvmBuffer(vcmd.dst());
if (nullptr == srcMem || nullptr == dstMem) {
vcmd.setStatus(CL_INVALID_OPERATION);
return;
}
amd::Coord3D srcOrigin(0, 0, 0);
amd::Coord3D dstOrigin(0, 0, 0);
amd::Coord3D size(vcmd.srcSize(), 1, 1);
amd::BufferRect srcRect;
amd::BufferRect dstRect;
srcOrigin.c[0] = static_cast<const_address>(vcmd.src()) - static_cast<address>(srcMem->getSvmPtr());
dstOrigin.c[0] = static_cast<const_address>(vcmd.dst()) - static_cast<address>(dstMem->getSvmPtr());
if (!(srcMem->validateRegion(srcOrigin, size)) || !(dstMem->validateRegion(dstOrigin, size))) {
vcmd.setStatus(CL_INVALID_OPERATION);
return;
}
bool entire = srcMem->isEntirelyCovered(srcOrigin, size) &&
dstMem->isEntirelyCovered(dstOrigin, size);
if (!copyMemory(type, *srcMem, *dstMem, entire,
srcOrigin, dstOrigin, size, srcRect, dstRect)) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
}
else {
//direct memcpy for FGS enabled system
amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1);
}
profilingEnd(vcmd);
}
void
VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true);
pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
// Save map info for unmap operation
memory->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(),
vcmd.mapFlags(), vcmd.isEntireMemory());
// If we have host memory, use it
if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) {
if (!memory->isHostMemDirectAccess()) {
// Make sure GPU finished operation before
// synchronization with the backing store
memory->wait(*this);
}
// Target is the backing store, so just ensure that owner is up-to-date
memory->owner()->cacheWriteBack();
// Add memory to VA cache, so rutnime can detect direct access to VA
dev().addVACache(memory);
}
else if (memory->isPersistentDirectMap()) {
// Nothing to do here
}
else if (memory->mapMemory() != nullptr) {
// Target is a remote resource, so copy
assert(memory->mapMemory() != nullptr);
if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) {
amd::Coord3D dstOrigin(0, 0, 0);
if (memory->desc().buffer_) {
if (!blitMgr().copyBuffer(*memory,
*memory->mapMemory(), vcmd.origin(), vcmd.origin(),
vcmd.size(), vcmd.isEntireMemory())) {
LogError("submitMapMemory() - copy failed");
vcmd.setStatus(CL_MAP_FAILURE);
}
}
else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
amd::Memory* bufferFromImage = nullptr;
Memory* memoryBuf = memory;
amd::Coord3D origin(vcmd.origin()[0]);
amd::Coord3D size(vcmd.size()[0]);
size_t elemSize =
vcmd.memory().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
bufferFromImage = createBufferFromImage(vcmd.memory());
if (nullptr == bufferFromImage) {
LogError("We should not fail buffer creation from image_buffer!");
}
else {
bufferFromImage->setVirtualDevice(this);
memoryBuf = dev().getGpuMemory(bufferFromImage);
}
if (!blitMgr().copyBuffer(*memoryBuf,
*memory->mapMemory(), origin, dstOrigin,
size, vcmd.isEntireMemory())) {
LogError("submitMapMemory() - copy failed");
vcmd.setStatus(CL_MAP_FAILURE);
}
if (nullptr != bufferFromImage) {
bufferFromImage->release();
}
}
else {
// Validate if it's a view for a map of mip level
if (vcmd.memory().parent() != nullptr) {
amd::Image* amdImage = vcmd.memory().parent()->asImage();
if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1)) {
// Save map write info in the parent object
dev().getGpuMemory(amdImage)->saveMapInfo(vcmd.mapPtr(),
vcmd.origin(), vcmd.size(),
vcmd.mapFlags(), vcmd.isEntireMemory(),
vcmd.memory().asImage());
}
}
if (!blitMgr().copyImageToBuffer(*memory,
*memory->mapMemory(), vcmd.origin(), dstOrigin,
vcmd.size(), vcmd.isEntireMemory())) {
LogError("submitMapMemory() - copy failed");
vcmd.setStatus(CL_MAP_FAILURE);
}
}
}
}
else {
LogError("Unhandled map!");
}
profilingEnd(vcmd);
}
void
VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
amd::Memory* owner = memory->owner();
bool unmapMip = false;
const device::Memory::WriteMapInfo* writeMapInfo =
memory->writeMapInfo(vcmd.mapPtr());
if (nullptr == writeMapInfo) {
LogError("Unmap without map call");
return;
}
profilingBegin(vcmd, true);
// Check if image is a mipmap and assign a saved view
amd::Image* amdImage = owner->asImage();
if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
(writeMapInfo->baseMip_ != nullptr)) {
// Assign mip level view
amdImage = writeMapInfo->baseMip_;
// Clear unmap flags from the parent image
memory->clearUnmapInfo(vcmd.mapPtr());
memory = dev().getGpuMemory(amdImage);
unmapMip = true;
writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
}
// We used host memory
if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
if (writeMapInfo->isUnmapWrite() && !owner->usesSvmPointer()) {
// Target is the backing store, so sync
owner->signalWrite(nullptr);
memory->syncCacheFromHost(*this);
}
// Remove memory from VA cache
dev().removeVACache(memory);
}
// data check was added for persistent memory that failed to get aperture
// and therefore are treated like a remote resource
else if (memory->isPersistentDirectMap() && (memory->data() != nullptr)) {
memory->unmap(this);
}
else if (memory->mapMemory() != nullptr) {
if (writeMapInfo->isUnmapWrite()) {
amd::Coord3D srcOrigin(0, 0, 0);
// Target is a remote resource, so copy
assert(memory->mapMemory() != nullptr);
if (memory->desc().buffer_) {
if (!blitMgr().copyBuffer(
*memory->mapMemory(), *memory,
writeMapInfo->origin_,
writeMapInfo->origin_,
writeMapInfo->region_,
writeMapInfo->isEntire())) {
LogError("submitUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
}
}
else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
amd::Memory* bufferFromImage = nullptr;
Memory* memoryBuf = memory;
amd::Coord3D origin(writeMapInfo->origin_[0]);
amd::Coord3D size(writeMapInfo->region_[0]);
size_t elemSize =
vcmd.memory().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
bufferFromImage = createBufferFromImage(vcmd.memory());
if (nullptr == bufferFromImage) {
LogError("We should not fail buffer creation from image_buffer!");
}
else {
bufferFromImage->setVirtualDevice(this);
memoryBuf = dev().getGpuMemory(bufferFromImage);
}
if (!blitMgr().copyBuffer(
*memory->mapMemory(), *memoryBuf,
srcOrigin, origin, size,
writeMapInfo->isEntire())) {
LogError("submitUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
}
if (nullptr != bufferFromImage) {
bufferFromImage->release();
}
}
else {
if (!blitMgr().copyBufferToImage(
*memory->mapMemory(), *memory,
srcOrigin,
writeMapInfo->origin_,
writeMapInfo->region_,
writeMapInfo->isEntire())) {
LogError("submitUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
}
}
}
}
else {
LogError("Unhandled unmap!");
vcmd.setStatus(CL_INVALID_VALUE);
}
// Clear unmap flags
memory->clearUnmapInfo(vcmd.mapPtr());
// Release a view for a mipmap map
if (unmapMip) {
amdImage->release();
}
profilingEnd(vcmd);
}
bool
VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern,
size_t patternSize, const amd::Coord3D& origin, const amd::Coord3D& size)
{
pal::Memory* memory = dev().getGpuMemory(amdMemory);
bool entire = amdMemory->isEntirelyCovered(origin, size);
// Synchronize memory from host if necessary
device::Memory::SyncFlags syncFlags;
syncFlags.skipEntire_ = entire;
memory->syncCacheFromHost(*this, syncFlags);
bool result = false;
amd::Memory* bufferFromImage = nullptr;
float fillValue[4];
// Force fill buffer for IMAGE1D_BUFFER
if ((type == CL_COMMAND_FILL_IMAGE) &&
(amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
bufferFromImage = createBufferFromImage(*amdMemory);
if (nullptr == bufferFromImage) {
LogError("We should not fail buffer creation from image_buffer!");
}
else {
type = CL_COMMAND_FILL_BUFFER;
bufferFromImage->setVirtualDevice(this);
memory = dev().getGpuMemory(bufferFromImage);
}
}
// Find the the right fill operation
switch (type) {
case CL_COMMAND_FILL_BUFFER :
case CL_COMMAND_SVM_MEMFILL : {
amd::Coord3D realOrigin(origin[0]);
amd::Coord3D realSize(size[0]);
// Reprogram fill parameters if it's an IMAGE1D_BUFFER object
if (nullptr != bufferFromImage) {
size_t elemSize =
amdMemory->asImage()->getImageFormat().getElementSize();
realOrigin.c[0] *= elemSize;
realSize.c[0] *= elemSize;
memset(fillValue, 0, sizeof(fillValue));
amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue);
pattern = fillValue;
patternSize = elemSize;
}
result = blitMgr().fillBuffer(*memory, pattern,
patternSize, realOrigin, realSize, amdMemory->isEntirelyCovered(origin, size));
if (nullptr != bufferFromImage) {
bufferFromImage->release();
}
}
break;
case CL_COMMAND_FILL_IMAGE:
result = blitMgr().fillImage(*memory, pattern,
origin, size, amdMemory->isEntirelyCovered(origin, size));
break;
default:
LogError("Unsupported command type for FillMemory!");
break;
}
if (!result) {
LogError("fillMemory failed!");
return false;
}
// Mark this as the most-recently written cache of the destination
amdMemory->signalWrite(&gpuDevice_);
return true;
}
void
VirtualGPU::submitFillMemory(amd::FillMemoryCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true);
if (!fillMemory(vcmd.type(), &vcmd.memory(),vcmd.pattern(),
vcmd.patternSize(), vcmd.origin(), vcmd.size())) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
profilingEnd(vcmd);
}
void
VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true);
//no op for FGS supported device
if (!dev().isFineGrainedSystem()) {
// Make sure we have memory for the command execution
pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem());
memory->saveMapInfo(vcmd.svmPtr(), vcmd.origin(), vcmd.size(),
vcmd.mapFlags(), vcmd.isEntireMemory());
if (memory->mapMemory() != nullptr) {
if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) {
assert(memory->desc().buffer_ && "SVM memory can't be an image");
if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(),
vcmd.origin(), vcmd.origin(), vcmd.size(), vcmd.isEntireMemory())) {
LogError("submitSVMMapMemory() - copy failed");
vcmd.setStatus(CL_MAP_FAILURE);
}
}
}
else {
LogError("Unhandled svm map!");
}
}
profilingEnd(vcmd);
}
void
VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true);
//no op for FGS supported device
if (!dev().isFineGrainedSystem()) {
pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem());
const device::Memory::WriteMapInfo* writeMapInfo =
memory->writeMapInfo(vcmd.svmPtr());
if (memory->mapMemory() != nullptr) {
if (writeMapInfo->isUnmapWrite()) {
amd::Coord3D srcOrigin(0, 0, 0);
// Target is a remote resource, so copy
assert(memory->desc().buffer_ && "SVM memory can't be an image");
if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory,
writeMapInfo->origin_, writeMapInfo->origin_,
writeMapInfo->region_, writeMapInfo->isEntire())) {
LogError("submitSvmUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
}
}
}
memory->clearUnmapInfo(vcmd.svmPtr());
}
profilingEnd(vcmd);
}
void
VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true);
if (!dev().isFineGrainedSystem()) {
size_t patternSize = vcmd.patternSize();
size_t fillSize = patternSize * vcmd.times();
size_t offset = 0;
amd::Memory* dstMemory = amd::SvmManager::FindSvmBuffer(vcmd.dst());
assert(dstMemory&&"No svm Buffer to fill with!");
offset = reinterpret_cast<uintptr_t>(vcmd.dst())
- reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());
assert((offset >= 0) && "wrong svm ptr to fill with!");
pal::Memory* memory = dev().getGpuMemory(dstMemory);
amd::Coord3D origin(offset, 0, 0);
amd::Coord3D size(fillSize, 1, 1);
assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!");
if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(),
vcmd.patternSize(), origin, size)) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
}
else {
// for FGS capable device, fill CPU memory directly
amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times());
}
profilingEnd(vcmd);
}
void
VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd, true);
std::vector<amd::Memory*>::const_iterator itr;
for (itr = vcmd.memObjects().begin(); itr != vcmd.memObjects().end(); itr++) {
// Find device memory
pal::Memory* memory = dev().getGpuMemory(*itr);
if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) {
memory->mgpuCacheWriteBack();
}
else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
// Synchronize memory from host if necessary.
// The sync function will perform memory migration from
// another device if necessary
device::Memory::SyncFlags syncFlags;
memory->syncCacheFromHost(*this, syncFlags);
}
else {
LogWarning("Unknown operation for memory migration!");
}
}
profilingEnd(vcmd);
}
void
VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd)
{
// in-order semantics: previous commands need to be done before we start
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd);
std::vector<void*>& svmPointers = vcmd.svmPointers();
if (vcmd.pfnFreeFunc() == nullptr) {
// pointers allocated using clSVMAlloc
for (cl_uint i = 0; i < svmPointers.size(); i++) {
dev().svmFree(svmPointers[i]);
}
}
else {
vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), svmPointers.size(),
static_cast<void**>(&(svmPointers[0])), vcmd.userData());
}
profilingEnd(vcmd);
}
void
VirtualGPU::findIterations(
const amd::NDRangeContainer& sizes,
const amd::NDRange& local,
amd::NDRange& groups,
amd::NDRange& remainder,
size_t& extra)
{
size_t dimensions = sizes.dimensions();
if (cal()->iterations_ > 1) {
size_t iterations = cal()->iterations_;
cal_.iterations_ = 1;
// Find the total amount of all groups
groups = sizes.global() / local;
if (dev().settings().partialDispatch_) {
for (uint j = 0; j < dimensions; ++j) {
if ((sizes.global()[j] % local[j]) != 0) {
groups[j]++;
}
}
}
// Calculate the real number of required iterations and
// the workgroup size of each iteration
for (int j = (dimensions - 1); j >= 0; --j) {
// Find possible size of each iteration
size_t tmp = (groups[j] / iterations);
// Make sure the group size is more than 1
if (tmp > 0) {
remainder = groups;
remainder[j] = (groups[j] % tmp);
extra = ((groups[j] / tmp) +
// Check for the remainder
((remainder[j] != 0) ? 1 : 0));
// Recalculate the number of iterations
cal_.iterations_ *= extra;
if (remainder[j] == 0) {
extra = 0;
}
groups[j] = tmp;
break;
}
else {
iterations = ((iterations / groups[j]) +
(((iterations % groups[j]) != 0) ? 1 : 0));
cal_.iterations_ *= groups[j];
groups[j] = 1;
}
}
}
}
void
VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd);
// Submit kernel to HW
if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false,
&vcmd.event())) {
vcmd.setStatus(CL_INVALID_OPERATION);
}
profilingEnd(vcmd);
}
bool
VirtualGPU::submitKernelInternal(
const amd::NDRangeContainer& sizes,
const amd::Kernel& kernel,
const_address parameters,
bool nativeMem,
amd::Event* enqueueEvent)
{
uint64_t vmParentWrap = 0;
uint64_t vmDefQueue = 0;
amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
VirtualGPU* gpuDefQueue = nullptr;
amd::HwDebugManager * dbgManager = dev().hwDebugMgr();
// Get the HSA kernel object
const HSAILKernel& hsaKernel =
static_cast<const HSAILKernel&>(*(kernel.getDeviceKernel(dev())));
std::vector<const Memory*> memList;
bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true:false;
if (!printfDbgHSA().init(*this, printfEnabled )) {
LogError( "Printf debug buffer initialization failed!");
return false;
}
// Check memory dependency and SVM objects
if (!processMemObjectsHSA(kernel, parameters, nativeMem, &memList)) {
LogError("Wrong memory objects!");
return false;
}
if (hsaKernel.dynamicParallelism()) {
if (nullptr == defQueue) {
LogError("Default device queue wasn't allocated");
return false;
}
else {
if (dev().settings().useDeviceQueue_) {
gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
if (gpuDefQueue->hwRing() == hwRing()) {
LogError("Can't submit the child kernels to the same HW ring as the host queue!");
return false;
}
}
else {
createVirtualQueue(defQueue->size());
gpuDefQueue = this;
}
}
vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress();
// Add memory handles before the actual dispatch
memList.push_back(gpuDefQueue->virtualQueue_);
memList.push_back(gpuDefQueue->schedParams_);
memList.push_back(hsaKernel.prog().kernelTable());
gpuDefQueue->writeVQueueHeader(*this,
hsaKernel.prog().kernelTable()->vmAddress());
}
// setup the storage for the memory pointers of the kernel parameters
uint numParams = kernel.signature().numParameters();
if (dbgManager) {
dbgManager->allocParamMemList(numParams);
}
size_t newOffset[3] = {0, 0, 0};
size_t newGlobalSize[3] = {0, 0, 0};
int dim = -1;
int iteration = 1;
size_t globalStep = 0;
for (uint i = 0; i < sizes.dimensions(); i++) {
newGlobalSize[i] = sizes.global()[i];
newOffset[i] = sizes.offset()[i];
}
// Check if it is blit kernel. If it is, then check if split is needed.
if (hsaKernel.isInternalKernel()) {
// Calculate new group size for each submission
for (uint i = 0; i < sizes.dimensions(); i++) {
if (sizes.global()[i] > static_cast<size_t>(0xffffffff)) {
dim = i;
iteration = sizes.global()[i] / 0xC0000000
+ ((sizes.global()[i] % 0xC0000000) ? 1: 0);
globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration
* sizes.local()[dim];
break;
}
}
}
for (int j = 0; j < iteration; j++) {
// Reset global size for dimension dim if split is needed
if (dim != -1) {
newOffset[dim] = sizes.offset()[dim] + globalStep * j;
if (((newOffset[dim] + globalStep) < sizes.global()[dim]) &&
(j != (iteration - 1))) {
newGlobalSize[dim] = globalStep;
}
else {
newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim];
}
}
amd::NDRangeContainer tmpSizes(sizes.dimensions(),
&newOffset[0], &newGlobalSize[0],
&(const_cast<amd::NDRangeContainer&>(sizes).local()[0]));
// Program the kernel arguments for the GPU execution
hsa_kernel_dispatch_packet_t* aqlPkt =
hsaKernel.loadArguments(*this, kernel, tmpSizes, parameters, nativeMem,
vmDefQueue, &vmParentWrap, memList);
if (nullptr == aqlPkt) {
LogError("Couldn't load kernel arguments");
return false;
}
const Device::ScratchBuffer* scratch = nullptr;
// Check if the device allocated more registers than the old setup
if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
scratch = dev().scratch(hwRing());
memList.push_back(scratch->memObj_);
}
// Add GSL handle to the memory list for VidMM
for (uint i = 0; i < memList.size(); ++i) {
addVmMemory(memList[i]);
}
// HW Debug for the kernel?
HwDbgKernelInfo kernelInfo;
HwDbgKernelInfo *pKernelInfo = nullptr;
if (dbgManager) {
buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent);
pKernelInfo = &kernelInfo;
}
GpuEvent gpuEvent;
// Run AQL dispatch in HW
eventBegin(MainEngine);
if (nullptr == scratch) {
iCmd()->CmdDispatchAql(aqlPkt, 0, 0, 0,
hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), hsaKernel.getWavesPerSH(this));
}
else {
iCmd()->CmdDispatchAql(aqlPkt, scratch->memObj_->vmAddress(),
scratch->size_, scratch->offset_,
hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), hsaKernel.getWavesPerSH(this));
}
eventEnd(MainEngine, gpuEvent);
if (dbgManager && (nullptr != dbgManager->postDispatchCallBackFunc())) {
dbgManager->executePostDispatchCallBack();
}
if (hsaKernel.dynamicParallelism()) {
// Make sure exculsive access to the device queue
amd::ScopedLock(defQueue->lock());
if (GPU_PRINT_CHILD_KERNEL != 0) {
waitForEvent(&gpuEvent);
AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]);
uint p = 0;
for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
if (wraps[i].state != 0) {
uint j;
if (p == GPU_PRINT_CHILD_KERNEL) {
break;
}
p++;
std::stringstream print;
print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase);
print << "Slot#: " << i << "\n";
print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n";
print << "\tcommand_id: " << wraps[i].command_id << "\n";
print << "\tchild_counter: " << wraps[i].child_counter << "\n";
print << "\tcompletion: " << wraps[i].completion << "\n";
print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n";
print << "\twait_list: " << wraps[i].wait_list << "\n";
print << "\twait_num: " << wraps[i].wait_num << "\n";
uint offsEvents = wraps[i].wait_list -
gpuDefQueue->virtualQueue_->vmAddress();
size_t* events = reinterpret_cast<size_t*>(
gpuDefQueue->virtualQueue_->data() + offsEvents);
for (j = 0; j < wraps[i].wait_num; ++j) {
uint offs = static_cast<uint64_t>(events[j]) -
gpuDefQueue->virtualQueue_->vmAddress();
AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs);
print << "Wait Event#: " << j << "\n";
print << "\tState: " << eventD->state <<
"; Counter: " << eventD->counter << "\n";
}
print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
print << wraps[i].aql.workgroup_size_y << ", ";
print << wraps[i].aql.workgroup_size_z << "]\n";
print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
print << wraps[i].aql.grid_size_y << ", ";
print << wraps[i].aql.grid_size_z << "]\n";
uint64_t* kernels = (uint64_t*)(
const_cast<Memory*>(hsaKernel.prog().kernelTable())->map(this));
for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) {
if (kernels[j] == wraps[i].aql.kernel_object) {
break;
}
}
const_cast<Memory*>(hsaKernel.prog().kernelTable())->unmap(this);
HSAILKernel* child = nullptr;
for (auto it = hsaKernel.prog().kernels().begin();
it != hsaKernel.prog().kernels().end(); ++it) {
if (j == static_cast<HSAILKernel*>(it->second)->index()) {
child = static_cast<HSAILKernel*>(it->second);
}
}
if (child == nullptr) {
printf("Error: couldn't find child kernel!\n");
continue;
}
const uint64_t kernarg_address =
static_cast<uint64_t>(reinterpret_cast<uintptr_t>(wraps[i].aql.kernarg_address));
uint offsArg = kernarg_address -
gpuDefQueue->virtualQueue_->vmAddress();
address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
print << "Kernel: " << child->name() << "\n";
static const char* Names[HSAILKernel::MaxExtraArgumentsNum] = {
"Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "};
for (j = 0; j < child->extraArgumentsNum(); ++j) {
print << "\t" << Names[j] << *(size_t*)argum;
print << "\n";
argum += sizeof(size_t);
}
for (j = 0; j < child->numArguments(); ++j) {
print << "\t" << child->argument(j)->name_ << ": ";
for (int s = child->argument(j)->size_ - 1; s >= 0; --s) {
print.width(2);
print.fill('0');
print << (uint32_t)(argum[s]);
}
argum += child->argument(j)->size_;
print << "\n";
}
printf("%s", print.str().c_str());
}
}
}
if (!dev().settings().useDeviceQueue_) {
Unimplemented();
/*
// Add the termination handshake to the host queue
eventBegin(MainEngine);
cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
0, dev().settings().useDeviceQueue_);
eventEnd(MainEngine, gpuEvent);
*/
}
// Get the global loop start before the scheduler
Unimplemented();
/*
mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
*gpuDefQueue->virtualQueue_,
*gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_,
gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
const static bool FlushL2 = true;
gpuDefQueue->flushCUCaches(FlushL2);
// Get the address of PM4 template and add write it to params
//! @note DMA flush must not occur between patch and the scheduler
mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart();
*/
Pal::gpusize patchStart = 0;
// Program parameters for the scheduler
SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
(gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
param->signal = 1;
// Scale clock to 1024 to avoid 64 bit div in the scheduler
param->eng_clk = (1000 * 1024) / dev().info().maxClockFrequency_;
param->hw_queue = patchStart + sizeof(uint32_t)/* Rewind packet*/;
param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
param->releaseHostCP = 0;
param->parentAQL = vmParentWrap;
param->dedicatedQueue = dev().settings().useDeviceQueue_;
param->useATC = dev().settings().svmFineGrainSystem_;
// Fill the scratch buffer information
if (hsaKernel.prog().maxScratchRegs() > 0) {
pal::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_;
param->scratchSize = scratchBuf->size();
param->scratch = scratchBuf->vmAddress();
param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
memList.push_back(scratchBuf);
}
else {
param->numMaxWaves = 0;
param->scratchSize = 0;
param->scratch = 0;
param->scratchOffset = 0;
}
// Add all kernels in the program to the mem list.
//! \note Runtime doesn't know which one will be called
hsaKernel.prog().fillResListWithKernels(memList);
// Add GPU memory handle to the memory list for VidMM
for (uint i = 0; i < memList.size(); ++i) {
gpuDefQueue->addVmMemory(memList[i]);
}
Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress() +
gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam);
Unimplemented();
/*
gpuDefQueue->eventBegin(MainEngine);
gpuDefQueue->cs()->VirtualQueueDispatcherEnd(
gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num /
(DeviceQueueMaskSize * maskGroups_));
gpuDefQueue->eventEnd(MainEngine, gpuEvent);
*/
// Set GPU event for the used resources
for (uint i = 0; i < memList.size(); ++i) {
memList[i]->setBusy(*gpuDefQueue, gpuEvent);
}
if (dev().settings().useDeviceQueue_) {
Unimplemented();
/*
// Add the termination handshake to the host queue
eventBegin(MainEngine);
cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->iMem(),
vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
vmParentWrap + offsetof(AmdAqlWrap, child_counter),
signalAddr, dev().settings().useDeviceQueue_);
eventEnd(MainEngine, gpuEvent);
*/
}
++gpuDefQueue->schedParamIdx_ %=
gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam);
//! \todo optimize the wrap around
if (gpuDefQueue->schedParamIdx_ == 0) {
gpuDefQueue->schedParams_->wait(*gpuDefQueue);
}
}
// Set GPU event for the used resources
for (uint i = 0; i < memList.size(); ++i) {
memList[i]->setBusy(*this, gpuEvent);
}
// Update the global GPU event
setGpuEvent(gpuEvent);
if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
LogError("Couldn't read printf data from the buffer!\n");
return false;
}
}
return true;
}
void
VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
Unimplemented(); //!< @todo: Unimplemented
}
void
VirtualGPU::submitMarker(amd::Marker& vcmd)
{
//!@note runtime doesn't need to lock this command on execution
if (vcmd.waitingEvent() != nullptr) {
bool foundEvent = false;
// Loop through all outstanding command batches
while (!cbList_.empty()) {
CommandBatchList::const_iterator it = cbList_.begin();
// Wait for completion
foundEvent = awaitCompletion(*it, vcmd.waitingEvent());
// Release a command batch
delete *it;
// Remove command batch from the list
cbList_.pop_front();
// Early exit if we found a command
if (foundEvent) break;
}
// Event should be in the current command batch
if (!foundEvent) {
state_.forceWait_ = true;
}
// If we don't have any more batches, then assume GPU is idle
else if (cbList_.empty()) {
dmaFlushMgmt_.resetCbWorkload(dev());
}
}
}
GpuEvent*
VirtualGPU::getGpuEvent(Pal::IGpuMemory* iMem)
{
return &gpuEvents_[iMem];
}
void
VirtualGPU::assignGpuEvent(Pal::IGpuMemory* iMem, GpuEvent gpuEvent)
{
auto it = gpuEvents_.find(iMem);
if (it != gpuEvents_.end()) {
it->second = gpuEvent;
}
else {
gpuEvents_[iMem] = gpuEvent;
}
}
void
VirtualGPU::releaseMemory(Pal::IGpuMemory* iMem, bool wait)
{
auto it = gpuEvents_.find(iMem);
//! @note if there is no wait, then it's a view release
if (wait && (it != gpuEvents_.end())) {
waitForEvent(&it->second);
queues_[MainEngine]->removeCmdMemRef(iMem);
queues_[SdmaEngine]->removeCmdMemRef(iMem);
gpuEvents_.erase(it);
}
}
void
VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
const amd::PerfCounterCommand::PerfCounterList counters = vcmd.getCounters();
// Create performance experiment
Pal::PerfExperimentCreateInfo createInfo = {};
createInfo.optionValues.sqShaderMask = Pal::PerfShaderMaskCs;
PalCounterReference* palRef = PalCounterReference::Create(*this, createInfo);
if (palRef == nullptr) {
LogError("We failed to allocate memory for the GPU perfcounter");
vcmd.setStatus(CL_INVALID_OPERATION);
return;
}
bool newExperiment = false;
for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
amd::PerfCounter* amdCounter =
static_cast<amd::PerfCounter*>(counters[i]);
const PerfCounter* counter =
static_cast<const PerfCounter*>(amdCounter->getDeviceCounter());
// Make sure we have a valid gpu performance counter
if (nullptr == counter) {
amd::PerfCounter::Properties prop = amdCounter->properties();
PerfCounter* gpuCounter = new PerfCounter(
gpuDevice_,
*this,
prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX],
prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX],
prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]);
if (nullptr == gpuCounter) {
LogError("We failed to allocate memory for the GPU perfcounter");
vcmd.setStatus(CL_INVALID_OPERATION);
return;
}
else if (gpuCounter->create(palRef)) {
amdCounter->setDeviceCounter(gpuCounter);
newExperiment = true;
}
else {
LogPrintfError("We failed to allocate a perfcounter in CAL.\
Block: %d, counter: #d, event: %d",
gpuCounter->info()->blockIndex_,
gpuCounter->info()->counterIndex_,
gpuCounter->info()->eventIndex_);
delete gpuCounter;
vcmd.setStatus(CL_INVALID_OPERATION);
return;
}
counter = gpuCounter;
}
}
if (newExperiment) {
palRef->finalize();
}
palRef->release();
Pal::IPerfExperiment* palPerf = nullptr;
for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
amd::PerfCounter* amdCounter =
static_cast<amd::PerfCounter*>(counters[i]);
const PerfCounter* counter =
static_cast<const PerfCounter*>(amdCounter->getDeviceCounter());
if (palPerf != counter->iPerf()) {
palPerf = counter->iPerf();
// Find the state and sends the command to PAL
if (vcmd.getState() == amd::PerfCounterCommand::Begin) {
iCmd()->CmdBeginPerfExperiment(palPerf);
}
else if (vcmd.getState() == amd::PerfCounterCommand::End) {
GpuEvent event;
eventBegin(MainEngine);
iCmd()->CmdEndPerfExperiment(palPerf);
eventEnd(MainEngine, event);
setGpuEvent(event);
}
else {
LogError("Unsupported performance counter state");
vcmd.setStatus(CL_INVALID_OPERATION);
return;
}
}
}
}
void
VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(cmd);
switch(cmd.type()) {
case CL_COMMAND_THREAD_TRACE_MEM:
{
amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace();
ThreadTrace* threadTrace =
static_cast<ThreadTrace*>(amdThreadTrace->getDeviceThreadTrace());
Unimplemented();
/*
if (threadTrace == nullptr) {
gslQueryObject gslThreadTrace;
// Create a HW thread trace query object
gslThreadTrace = cs()->createQuery(GSL_SHADER_TRACE_BYTES_WRITTEN);
if (0 == gslThreadTrace) {
LogError("Failure in memory allocation for the GPU threadtrace");
cmd.setStatus(CL_INVALID_OPERATION);
return;
}
CalThreadTraceReference* palRef = new CalThreadTraceReference(*this,gslThreadTrace);
if (palRef == nullptr) {
LogError("Failure in memory allocation for the GPU threadtrace");
cmd.setStatus(CL_INVALID_OPERATION);
return;
}
size_t seNum = amdThreadTrace->deviceSeNumThreadTrace();
ThreadTrace* gpuThreadTrace = new ThreadTrace(
gpuDevice_,
*this,
seNum);
if (nullptr == gpuThreadTrace) {
LogError("Failure in memory allocation for the GPU threadtrace");
cmd.setStatus(CL_INVALID_OPERATION);
return;
}
if (gpuThreadTrace->create(palRef)) {
amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace);
}
else {
LogError("Failure in memory allocation for the GPU threadtrace");
delete gpuThreadTrace;
cmd.setStatus(CL_INVALID_OPERATION);
return;
}
threadTrace = gpuThreadTrace;
palRef->release();
}
gslShaderTraceBufferObject* threadTraceBufferObjects = threadTrace->getThreadTraceBufferObjects();
const size_t memObjSize = cmd.getMemoryObjectSize();
const std::vector<amd::Memory*>& memObj = cmd.getMemList();
size_t se = 0;
for (std::vector<amd::Memory*>::const_iterator itMemObj = memObj.begin();itMemObj != memObj.end();++itMemObj,++se) {
// Find GSL Mem Object
Pal::IGpuMemory* gslMemObj = dev().getGpuMemory(*itMemObj)->iMem();
// Bind GSL MemObject to the appropriate SE Thread Trace Buffer Object
threadTraceBufferObjects[se]->attachMemObject(cs(), gslMemObj, 0, 0, memObjSize, se);
}
*/
break;
}
default:
LogError("Unsupported command type for ThreadTraceMemObjects!");
break;
}
}
void
VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(cmd);
switch(cmd.type()) {
case CL_COMMAND_THREAD_TRACE:
{
amd::ThreadTrace* amdThreadTrace =
static_cast<amd::ThreadTrace*>(&cmd.getThreadTrace());
ThreadTrace* threadTrace =
static_cast<ThreadTrace*>(amdThreadTrace->getDeviceThreadTrace());
// gpu thread trace object had to be generated prior to begin/end/pause/resume due
// to ThreadTraceMemObjectsCommand execution
if (threadTrace == nullptr) {
return;
}
else {
Unimplemented();
/*
gslQueryObject gslThreadTrace;
gslThreadTrace = threadTrace->gslThreadTrace();
uint32_t seNum = amdThreadTrace->deviceSeNumThreadTrace();
// Find the state and sends the commands to GSL
if (cmd.getState() == amd::ThreadTraceCommand::Begin) {
amd::ThreadTrace::ThreadTraceConfig* traceCfg =
static_cast<amd::ThreadTrace::ThreadTraceConfig*>(cmd.threadTraceConfig());
const gslErrorCode ec = gslThreadTrace->BeginQuery(cs(),
GSL_SHADER_TRACE_BYTES_WRITTEN, 0);
assert(ec == GSL_NO_ERROR);
for (uint32_t idx = 0; idx < seNum; ++idx) {
rs()->enableShaderTrace(cs(), idx, true);
rs()->setShaderTraceComputeUnit (idx, traceCfg->cu_);
rs()->setShaderTraceShaderArray (idx, traceCfg->sh_);
rs()->setShaderTraceSIMDMask (idx, traceCfg->simdMask_);
rs()->setShaderTraceVmIdMask (idx, traceCfg->vmIdMask_);
rs()->setShaderTraceTokenMask (idx, traceCfg->tokenMask_);
rs()->setShaderTraceRegisterMask(idx, traceCfg->regMask_);
rs()->setShaderTraceIssueMask (idx, traceCfg->instMask_);
rs()->setShaderTraceRandomSeed (idx, traceCfg->randomSeed_);
rs()->setShaderTraceCaptureMode (idx, traceCfg->captureMode_);
rs()->setShaderTraceWrap (idx, traceCfg->isWrapped_);
rs()->setShaderTraceUserData (idx,
(traceCfg->isUserData_) ? traceCfg->userData_ : 0);
}
}
else if (cmd.getState() == amd::ThreadTraceCommand::End) {
for (uint32_t idx = 0; idx < seNum; ++idx) {
rs()->enableShaderTrace(cs(), idx, false);
}
gslThreadTrace->EndQuery(cs(), 0);
}
else if (cmd.getState() == amd::ThreadTraceCommand::Pause) {
for (uint32_t idx = 0; idx < seNum; ++idx) {
rs()->setShaderTraceIsPaused(cs(), idx, true);
}
}
else if (cmd.getState() == amd::ThreadTraceCommand::Resume) {
for (uint32_t idx = 0; idx < seNum; ++idx) {
rs()->setShaderTraceIsPaused(cs(), idx, false);
}
}
*/
}
break;
}
default:
LogError("Unsupported command type for ThreadTrace!");
break;
}
}
void
VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd);
for (std::vector<amd::Memory*>::const_iterator it = vcmd.getMemList().begin();
it != vcmd.getMemList().end(); it++) {
// amd::Memory object should never be nullptr
assert(*it && "Memory object for interop is nullptr");
pal::Memory* memory = dev().getGpuMemory(*it);
// If resource is a shared copy of original resource, then
// runtime needs to copy data from original resource
(*it)->getInteropObj()->copyOrigToShared();
// Check if OpenCL has direct access to the interop memory
if (memory->interopType() == Memory::InteropDirectAccess) {
continue;
}
// Does interop use HW emulation?
if (memory->interopType() == Memory::InteropHwEmulation) {
static const bool Entire = true;
amd::Coord3D origin(0, 0, 0);
amd::Coord3D region(memory->size());
// Synchronize the object
if (!blitMgr().copyBuffer(*memory->interop(),
*memory, origin, origin, region, Entire)) {
LogError("submitAcquireExtObjects - Interop synchronization failed!");
vcmd.setStatus(CL_INVALID_OPERATION);
return;
}
}
}
profilingEnd(vcmd);
}
void
VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd);
for (std::vector<amd::Memory*>::const_iterator it = vcmd.getMemList().begin();
it != vcmd.getMemList().end(); it++) {
// amd::Memory object should never be nullptr
assert(*it && "Memory object for interop is nullptr");
pal::Memory* memory = dev().getGpuMemory(*it);
// Check if we can use HW interop
if (memory->interopType() == Memory::InteropHwEmulation) {
static const bool Entire = true;
amd::Coord3D origin(0, 0, 0);
amd::Coord3D region(memory->size());
// Synchronize the object
if (!blitMgr().copyBuffer(*memory, *memory->interop(),
origin, origin, region, Entire)) {
LogError("submitReleaseExtObjects interop synchronization failed!");
vcmd.setStatus(CL_INVALID_OPERATION);
return;
}
}
else {
if (memory->interopType() != Memory::InteropDirectAccess) {
LogError("None interop release!");
}
}
// If resource is a shared copy of original resource, then
// runtime needs to copy data back to original resource
(*it)->getInteropObj()->copySharedToOrig();
}
profilingEnd(vcmd);
}
void
VirtualGPU::submitSignal(amd::SignalCommand & vcmd)
{
amd::ScopedLock lock(execution());
profilingBegin(vcmd);
pal::Memory* gpuMemory = dev().getGpuMemory(&vcmd.memory());
Unimplemented();
/*
if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) {
uint64_t surfAddr = gpuMemory->iMem()->getPhysicalAddress(cs());
uint64_t markerAddr = gpuMemory->iMem()->getMarkerAddress(cs());
uint64_t markerOffset = markerAddr - surfAddr;
cs()->p2pMarkerOp(gpuMemory->iMem(), vcmd.markerValue(),
markerOffset, false);
}
else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) {
GpuEvent gpuEvent;
eventBegin(MainEngine);
cs()->p2pMarkerOp(gpuMemory->iMem(), vcmd.markerValue(), vcmd.markerOffset(), true);
//! @todo We don't need flush if an event is tracked.
cs()->Flush();
eventEnd(MainEngine, gpuEvent);
gpuMemory->setBusy(*this, gpuEvent);
// Update the global GPU event
setGpuEvent(gpuEvent);
}
*/
profilingEnd(vcmd);
}
void
VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd)
{
amd::ScopedLock lock(execution());
profilingBegin(vcmd);
std::vector<amd::Memory*> memObjects = vcmd.memObjects();
cl_uint numObjects = memObjects.size();
Pal::IGpuMemory** pGpuMemObjects = new Pal::IGpuMemory*[numObjects];
for(cl_uint i = 0; i < numObjects; ++i)
{
pal::Memory* gpuMemory = dev().getGpuMemory(memObjects[i]);
pGpuMemObjects[i] = gpuMemory->iMem();
gpuMemory->syncCacheFromHost(*this);
}
uint64_t* surfBusAddr = new uint64_t[numObjects];
uint64_t* markerBusAddr = new uint64_t[numObjects];
Unimplemented();
/*
gslErrorCode res = cs()->makeBuffersResident(numObjects, pGpuMemObjects,
surfBusAddr, markerBusAddr);
if(res != GSL_NO_ERROR) {
LogError("MakeBuffersResident failed");
vcmd.setStatus(CL_INVALID_OPERATION);
}
else {
cl_bus_address_amd* busAddr = vcmd.busAddress();
for(cl_uint i = 0; i < numObjects; ++i)
{
busAddr[i].surface_bus_address = surfBusAddr[i];
busAddr[i].marker_bus_address = markerBusAddr[i];
}
}
*/
delete[] pGpuMemObjects;
delete[] surfBusAddr;
delete[] markerBusAddr;
profilingEnd(vcmd);
}
bool
VirtualGPU::awaitCompletion(CommandBatch* cb, const amd::Event* waitingEvent)
{
bool found = false;
amd::Command* current;
amd::Command* head = cb->head_;
// Make sure that profiling is enabled
if (state_.profileEnabled_) {
return profilingCollectResults(cb, waitingEvent);
}
// Mark the first command in the batch as running
if (head != nullptr) {
head->setStatus(CL_RUNNING);
}
else {
return found;
}
// Wait for the last known GPU event
waitEventLock(cb);
while (nullptr != head) {
current = head->getNext();
if (head->status() == CL_SUBMITTED) {
head->setStatus(CL_RUNNING);
head->setStatus(CL_COMPLETE);
}
else if (head->status() == CL_RUNNING) {
head->setStatus(CL_COMPLETE);
}
else if ((head->status() != CL_COMPLETE) && (current != nullptr)) {
LogPrintfError("Unexpected command status - %d!", head->status());
}
// Check if it's a waiting command
if (head == waitingEvent) {
found = true;
}
head->release();
head = current;
}
return found;
}
void
VirtualGPU::flush(amd::Command* list, bool wait)
{
CommandBatch* cb = nullptr;
bool gpuCommand = false;
for (uint i = 0; i < AllEngines; ++i) {
if (cal_.events_[i].isValid()) {
gpuCommand = true;
}
}
// If the batch doesn't have any GPU command and the list is empty
if (!gpuCommand && cbList_.empty()) {
state_.forceWait_ = true;
}
// Insert the current batch into a list
if (nullptr != list) {
cb = new CommandBatch(list, cal()->events_, cal()->lastTS_);
}
{
//! @todo: Check if really need a lock
amd::ScopedLock lock(execution());
for (uint i = 0; i < AllEngines; ++i) {
flushDMA(i);
// Reset event so we won't try to wait again,
// if runtime didn't submit any commands
//! @note: it's safe to invalidate events, since
//! we already saved them with the batch creation step above
cal_.events_[i].invalidate();
}
}
// Mark last TS as nullptr, so runtime won't process empty batches with the old TS
cal_.lastTS_ = nullptr;
if (nullptr != cb) {
cbList_.push_back(cb);
}
wait |= state_.forceWait_;
// Loop through all outstanding command batches
while (!cbList_.empty()) {
CommandBatchList::const_iterator it = cbList_.begin();
// Check if command batch finished without a wait
bool finished = true;
for (uint i = 0; i < AllEngines; ++i) {
finished &= isDone(&(*it)->events_[i]);
}
if (finished || wait) {
// Wait for completion
awaitCompletion(*it);
// Release a command batch
delete *it;
// Remove command batch from the list
cbList_.pop_front();
}
else {
// Early exit if no finished
break;
}
}
state_.forceWait_ = false;
}
void
VirtualGPU::enableSyncedBlit() const
{
return blitMgr_->enableSynchronization();
}
void
VirtualGPU::releaseMemObjects(bool scratch)
{
for (GpuEvents::const_iterator it = gpuEvents_.begin();
it != gpuEvents_.end(); ++it) {
GpuEvent event = it->second;
waitForEvent(&event);
queues_[MainEngine]->removeCmdMemRef(const_cast<Pal::IGpuMemory*>(it->first));
queues_[SdmaEngine]->removeCmdMemRef(const_cast<Pal::IGpuMemory*>(it->first));
}
gpuEvents_.clear();
}
void
VirtualGPU::setGpuEvent(
GpuEvent gpuEvent,
bool flush)
{
cal_.events_[engineID_] = gpuEvent;
// Flush current DMA buffer if requested
if (flush) {
flushDMA(engineID_);
}
}
void
VirtualGPU::flushDMA(uint engineID)
{
if (engineID == MainEngine) {
// Clear memory dependency state, since runtime flushes compute
// memoryDependency().clear();
//!@todo Keep memory dependency alive even if we flush DMA,
//! since only L2 cache is flushed in KMD frame,
//! but L1 still has to be invalidated.
}
isDone(&cal_.events_[engineID]);
}
bool
VirtualGPU::waitAllEngines(CommandBatch* cb)
{
uint i;
GpuEvent* events; //!< GPU events for the batch
// If command batch is nullptr then wait for the current
if (nullptr == cb) {
events = cal_.events_;
}
else {
events = cb->events_;
}
bool earlyDone = true;
// The first loop is to flush all engines and/or check if
// engines are idle already
for (i = 0; i < AllEngines; ++i) {
earlyDone &= isDone(&events[i]);
}
// Release all transfer buffers on this command queue
releaseXferWrite();
// Rlease all pinned memory
releasePinnedMem();
// The second loop is to wait all engines
for (i = 0; i < AllEngines; ++i) {
waitForEvent(&events[i]);
}
return earlyDone;
}
void
VirtualGPU::waitEventLock(CommandBatch* cb)
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
bool earlyDone = waitAllEngines(cb);
// Free resource cache if we have too many entries
//! \note we do it here, when all engines are idle,
// because Vista/Win7 idles GPU on a resource destruction
static const size_t MinCacheEntries = 4096;
dev().resourceCache().free(MinCacheEntries);
// Find the timestamp object of the last command in the batch
if (cb->lastTS_ != nullptr) {
// If earlyDone is TRUE, then CPU didn't wait for GPU.
// Thus the sync point between CPU and GPU is unclear and runtime
// will use an older adjustment value to maintain the same timeline
if (!earlyDone ||
//! \note Workaround for APU(s).
//! GPU-CPU timelines may go off too much, thus always
//! force calibration with the last batch in the list
(cbList_.size() <= 1) ||
(readjustTimeGPU_ == 0)) {
uint64_t startTimeStampGPU = 0;
uint64_t endTimeStampGPU = 0;
// Get the timestamp value of the last command in the batch
cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);
uint64_t endTimeStampCPU = amd::Os::timeNanos();
// Make sure the command batch has a valid GPU TS
if (!GPU_RAW_TIMESTAMP) {
// Adjust the base time by the execution time
readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
}
}
}
}
bool
VirtualGPU::allocConstantBuffers()
{
// Allocate/reallocate constant buffers
size_t minCbSize;
// GCN doesn't really have a limit
minCbSize = 256 * Ki;
uint i;
// Create/reallocate constant buffer resources
for (i = 0; i < MaxConstBuffersArguments; ++i) {
ConstBuffer* constBuf = new ConstBuffer(*this, ((minCbSize +
ConstBuffer::VectorSize - 1) / ConstBuffer::VectorSize));
if ((constBuf != nullptr) && constBuf->create()) {
addConstBuffer(constBuf);
}
else {
// We failed to create a constant buffer
delete constBuf;
return false;
}
}
return true;
}
void
VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling)
{
// Is profiling enabled?
if (command.profilingInfo().enabled_) {
// Allocate a timestamp object from the cache
TimeStamp* ts = tsCache_->allocTimeStamp();
if (nullptr == ts) {
return;
}
// Save the TimeStamp object in the current OCL event
command.setData(ts);
currTs_ = ts;
state_.profileEnabled_ = true;
}
}
void
VirtualGPU::profilingEnd(amd::Command& command)
{
// Get the TimeStamp object associated witht the current command
TimeStamp* ts = reinterpret_cast<TimeStamp*>(command.data());
if (ts != nullptr) {
// Check if the command actually did any GPU submission
if (ts->isValid()) {
cal_.lastTS_ = ts;
}
else {
// Destroy the TimeStamp object
tsCache_->freeTimeStamp(ts);
command.setData(nullptr);
}
}
}
bool
VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent)
{
bool found = false;
amd::Command* current;
amd::Command* first = cb->head_;
// If the command list is, empty then exit
if (nullptr == first) {
return found;
}
// Wait for the last known GPU events on all engines
waitEventLock(cb);
// Find the CPU base time of the entire command batch execution
uint64_t endTimeStamp = amd::Os::timeNanos();
uint64_t startTimeStamp = endTimeStamp;
// First step, walk the command list to find the first valid command
//! \note The batch may have empty markers at the beginning.
//! So the start/end of the empty commands is equal to
//! the start of the first valid command in the batch.
first = cb->head_;
while (nullptr != first) {
// Get the TimeStamp object associated witht the current command
TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());
if (ts != nullptr) {
ts->value(&startTimeStamp, &endTimeStamp);
endTimeStamp -= readjustTimeGPU_;
startTimeStamp -= readjustTimeGPU_;
// Assign to endTimeStamp the start of the first valid command
endTimeStamp = startTimeStamp;
break;
}
first = first->getNext();
}
// Second step, walk the command list to construct the time line
first = cb->head_;
while (nullptr != first) {
// Get the TimeStamp object associated witht the current command
TimeStamp* ts = reinterpret_cast<TimeStamp*>(first->data());
current = first->getNext();
if (ts != nullptr) {
ts->value(&startTimeStamp, &endTimeStamp);
endTimeStamp -= readjustTimeGPU_;
startTimeStamp -= readjustTimeGPU_;
// Destroy the TimeStamp object
tsCache_->freeTimeStamp(ts);
first->setData(nullptr);
}
else {
// For empty commands start/end is equal to
// the end of the last valid command
startTimeStamp = endTimeStamp;
}
// Update the command status with the proper timestamps
if (first->status() == CL_SUBMITTED) {
first->setStatus(CL_RUNNING, startTimeStamp);
first->setStatus(CL_COMPLETE, endTimeStamp);
}
else if (first->status() == CL_RUNNING) {
first->setStatus(CL_COMPLETE, endTimeStamp);
}
else if ((first->status() != CL_COMPLETE) && (current != nullptr)) {
LogPrintfError("Unexpected command status - %d!", first->status());
}
// Do we wait this event?
if (first == waitingEvent) {
found = true;
}
first->release();
first = current;
}
return found;
}
bool
VirtualGPU::addVmMemory(const Memory* memory)
{
queues_[MainEngine]->addCmdMemRef(memory->iMem());
return true;
}
void
VirtualGPU::profileEvent(EngineType engine, bool type) const
{
if (nullptr == currTs_) {
return;
}
if (type) {
currTs_->begin((engine == SdmaEngine) ? true : false);
}
else {
currTs_->end((engine == SdmaEngine) ? true : false);
}
}
bool
VirtualGPU::processMemObjectsHSA(
const amd::Kernel& kernel,
const_address params,
bool nativeMem,
std::vector<const Memory*>* memList)
{
static const bool NoAlias = true;
const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>
(*(kernel.getDeviceKernel(dev(), NoAlias)));
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
// Mark the tracker with a new kernel,
// so we can avoid checks of the aliased objects
memoryDependency().newKernel();
bool deviceSupportFGS = 0 != (dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM);
bool supportFineGrainedSystem = deviceSupportFGS;
FGSStatus status = kernelParams.getSvmSystemPointersSupport();
switch (status) {
case FGS_YES:
if (!deviceSupportFGS) {
return false;
}
supportFineGrainedSystem = true;
break;
case FGS_NO:
supportFineGrainedSystem = false;
break;
case FGS_DEFAULT:
default:
break;
}
size_t count = kernelParams.getNumberOfSvmPtr();
size_t execInfoOffset = kernelParams.getExecInfoOffset();
bool sync = true;
amd::Memory* memory = nullptr;
//get svm non arugment information
void* const* svmPtrArray =
reinterpret_cast<void* const*>(params + execInfoOffset);
for (size_t i = 0; i < count; i++) {
memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
if (nullptr == memory) {
if (!supportFineGrainedSystem) {
return false;
}
else if (sync) {
Unimplemented();
//flushCUCaches();
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
}
}
else {
Memory* gpuMemory = dev().getGpuMemory(memory);
if (nullptr != gpuMemory) {
// Synchronize data with other memory instances if necessary
gpuMemory->syncCacheFromHost(*this);
const static bool IsReadOnly = false;
// Validate SVM passed in the non argument list
memoryDependency().validate(*this, gpuMemory, IsReadOnly);
memList->push_back(gpuMemory);
}
else {
return false;
}
}
}
// Check all parameters for the current kernel
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
const HSAILKernel::Argument* arg = hsaKernel.argument(i);
Memory* memory = nullptr;
bool readOnly = false;
amd::Memory* svmMem = nullptr;
// Find if current argument is a buffer
if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) {
if (kernelParams.boundToSvmPointer(dev(), params, i)) {
svmMem = amd::SvmManager::FindSvmBuffer(
*reinterpret_cast<void* const*>(params + desc.offset_));
if (!svmMem) {
Unimplemented();
//flushCUCaches();
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
}
}
if (nativeMem) {
memory = *reinterpret_cast<Memory* const*>(params + desc.offset_);
}
else if (*reinterpret_cast<amd::Memory* const*>
(params + desc.offset_) != nullptr) {
if (nullptr == svmMem) {
memory = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>
(params + desc.offset_));
}
else {
memory = dev().getGpuMemory(svmMem);
}
// Synchronize data with other memory instances if necessary
memory->syncCacheFromHost(*this);
}
if (memory != nullptr) {
// Check image
readOnly = (desc.accessQualifier_ ==
CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
// Check buffer
readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
// Validate memory for a dependency in the queue
memoryDependency().validate(*this, memory, readOnly);
}
}
}
for (pal::Memory* mem : hsaKernel.prog().globalStores()) {
const static bool IsReadOnly = false;
// Validate global store for a dependency in the queue
memoryDependency().validate(*this, mem, IsReadOnly);
}
return true;
}
amd::Memory*
VirtualGPU::createBufferFromImage(amd::Memory& amdImage) const
{
amd::Memory* mem = new(amdImage.getContext())
amd::Buffer(amdImage, 0, 0, amdImage.getSize());
if ((mem != nullptr) && !mem->create()) {
mem->release();
}
return mem;
}
void
VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable)
{
const static bool Wait = true;
vqHeader_->kernel_table = kernelTable;
virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait);
}
void
VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask)
{
Unimplemented();
/*
//! @todo: fix issue of no event available for the flush/invalidate cache command
InvalidateSqCaches(cache_mask.sqICache_,
cache_mask.sqKCache_,
cache_mask.tcL1_,
cache_mask.tcL2_);
*/
flushDMA(engineID_);
return;
}
void
VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
hsa_kernel_dispatch_packet_t* aqlPkt,
HwDbgKernelInfo& kernelInfo,
amd::Event* enqueueEvent)
{
amd::HwDebugManager * dbgManager = dev().hwDebugMgr();
assert (dbgManager && "No HW Debug Manager!");
// Initialize structure with default values
if (hsaKernel.prog().maxScratchRegs() > 0) {
pal::Memory* scratchBuf = dev().scratch(hwRing())->memObj_;
kernelInfo.scratchBufAddr = scratchBuf->vmAddress();
kernelInfo.scratchBufferSizeInBytes = scratchBuf->size();
// Get the address of the scratch buffer and its size for CPU access
address scratchRingAddr = nullptr;
scratchRingAddr = static_cast<address>(scratchBuf->map(nullptr, 0));
dbgManager->setScratchRing(scratchRingAddr,scratchBuf->size());
scratchBuf->unmap(nullptr);
}
else {
kernelInfo.scratchBufAddr = 0;
kernelInfo.scratchBufferSizeInBytes = 0;
dbgManager->setScratchRing(nullptr, 0);
}
//! @todo: need to verify what is wanted for the global memory
Unimplemented();
kernelInfo.heapBufAddr = 0;
kernelInfo.pAqlDispatchPacket = aqlPkt;
kernelInfo.pAqlQueuePtr = reinterpret_cast<void*>(hsaQueueMem_->vmAddress());
// Get the address of the kernel code and its size for CPU access
pal::Memory* aqlCode = hsaKernel.gpuAqlCode();
if (nullptr != aqlCode) {
address aqlCodeAddr = static_cast<address>(aqlCode->map(nullptr, 0));
dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize());
aqlCode->unmap(nullptr);
}
else {
dbgManager->setKernelCodeInfo(nullptr, 0);
}
kernelInfo.trapPresent = false;
kernelInfo.trapHandler = nullptr;
kernelInfo.trapHandlerBuffer = nullptr;
kernelInfo.excpEn = 0;
kernelInfo.cacheDisableMask = 0;
kernelInfo.sqDebugMode = 0;
kernelInfo.mgmtSe0Mask = 0xFFFFFFFF;
kernelInfo.mgmtSe1Mask = 0xFFFFFFFF;
// set kernel info for HW debug and call the callback function
if (nullptr != dbgManager->preDispatchCallBackFunc()) {
DebugToolInfo dbgSetting = {0};
dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr;
dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes;
dbgSetting.globalAddress_ = kernelInfo.heapBufAddr;
dbgSetting.aclBinary_ = hsaKernel.prog().binaryElf();
dbgSetting.event_ = enqueueEvent;
// Call the predispatch callback function & set the trap info
AqlCodeInfo aqlCodeInfo;
aqlCodeInfo.aqlCode_ = (amd_kernel_code_t *) hsaKernel.cpuAqlCode();
aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize();
// Execute the pre-dispatch call back function
dbgManager->executePreDispatchCallBack(reinterpret_cast<void*>(aqlPkt), &dbgSetting);
// assign the debug TMA and TBA for kernel dispatch
if (nullptr != dbgSetting.trapHandler_ && nullptr != dbgSetting.trapBuffer_) {
assignDebugTrapHandler(dbgSetting, kernelInfo);
}
kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false;
// Execption policy
kernelInfo.excpEn = dbgSetting.exceptionMask_;
kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_;
kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_;
// Compute the mask for reserved CUs. These two dwords correspond to
// two registers used for reserving CUs for display. In the current
// implementation, the number of CUs reserved can be 0 to 7, and it
// is set by debugger users.
if (dbgSetting.monitorMode_) {
uint32_t i = dbgSetting.reservedCuNum_ / 2;
kernelInfo.mgmtSe0Mask <<= i;
i = dbgSetting.reservedCuNum_ - i;
kernelInfo.mgmtSe1Mask <<= i;
}
Unimplemented();
/*
// flush/invalidate the instruction, data, L1 and L2 caches
InvalidateSqCaches();
*/
}
}
void
VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting,
HwDbgKernelInfo& kernelInfo)
{
// setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching
//
Memory* rtTrapHandlerMem = static_cast<Memory*>(dev().hwDebugMgr()->runtimeTBA());
Memory* rtTrapBufferMem = static_cast<Memory*>(dev().hwDebugMgr()->runtimeTMA());
kernelInfo.trapHandler = reinterpret_cast<void *>(rtTrapHandlerMem->vmAddress() + TbaStartOffset);
// With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero.
// However, by setting the runtime trap buffer (TMA) correct, the runtime trap hander
// without the workaround can still function correctly.
kernelInfo.trapHandlerBuffer = reinterpret_cast<void *>(rtTrapBufferMem->vmAddress());
address rtTrapBufferAddress = static_cast<address>(rtTrapBufferMem->map(this));
Memory* trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_);
Memory* trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_);
// Address of the trap handler code/buffer should be 256-byte aligned
uint64_t tbaAddress = trapHandlerMem->vmAddress();
uint64_t tmaAddress = trapBufferMem->vmAddress();
if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) {
assert(false && "Trap handler/buffer is not 256-byte aligned");
}
// The addresses of the debug trap handler code (TBA) and buffer (TMA) are
// stored in the runtime trap handler buffer with offset location of 0x18-19
// and 0x20-21, respectively.
uint64_t * rtTmaPtr = reinterpret_cast<uint64_t *>(rtTrapBufferAddress + 0x18);
rtTmaPtr[0] = tbaAddress;
rtTmaPtr[1] = tmaAddress;
rtTrapBufferMem->unmap(nullptr);
// Add GPU mem handles to the memory list for VidMM
addVmMemory(trapHandlerMem);
addVmMemory(trapBufferMem);
addVmMemory(rtTrapHandlerMem);
addVmMemory(rtTrapBufferMem);
}
bool
VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst)
{
uint64_t srcVmEnd = src.vmAddress() + src.vmSize();
if (((src.vmAddress() >= sdmaRange_.start_) &&
(src.vmAddress() <= sdmaRange_.end_)) ||
((srcVmEnd >= sdmaRange_.start_) &&
(srcVmEnd <= sdmaRange_.end_)) ||
((src.vmAddress() <= sdmaRange_.start_) &&
(srcVmEnd >= sdmaRange_.end_))) {
sdmaRange_.start_ = dst.vmAddress();
sdmaRange_.end_ = dst.vmAddress() + dst.vmSize();
return true;
}
sdmaRange_.start_ = std::min(sdmaRange_.start_, dst.vmAddress());
sdmaRange_.end_ = std::max(sdmaRange_.end_, dst.vmAddress() + dst.vmSize());
return false;
}
void
VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd)
{
size_t copySize = cmd.size()[0];
size_t fileOffset = cmd.fileOffset();
size_t srcDstOffset = cmd.origin()[0];
Memory* mem = dev().getGpuMemory(&cmd.memory());
uint idx = 0;
assert((cmd.type() == CL_COMMAND_WRITE_BUFFER_FROM_FILE_AMD) ||
(cmd.type() == CL_COMMAND_READ_BUFFER_FROM_FILE_AMD));
bool writeBuffer(cmd.type() == CL_COMMAND_WRITE_BUFFER_FROM_FILE_AMD);
while (copySize > 0) {
Memory* staging = dev().getGpuMemory(&cmd.staging(idx));
size_t srcDstSize = amd::TransferBufferFileCommand::StagingBufferSize;
srcDstSize = std::min(srcDstSize, copySize);
void* srcDstBuffer = staging->cpuMap(*this);
if (!cmd.file()->transferBlock(writeBuffer, srcDstBuffer, fileOffset, 0, srcDstSize)) {
return;
}
staging->cpuUnmap(*this);
bool result = blitMgr().copyBuffer(*staging, *mem,
fileOffset, srcDstOffset, srcDstSize, false);
flushDMA(getGpuEvent(staging->iMem())->engineId_);
srcDstOffset += srcDstSize;
copySize -= srcDstSize;
}
}
} // namespace pal