Files
rocm-systems/projects/clr/rocclr/device/pal/palvirtual.cpp
T
2025-10-23 11:21:49 -07:00

3848 lines
141 KiB
C++

/* Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "platform/perfctr.hpp"
#include "platform/threadtrace.hpp"
#include "platform/kernel.hpp"
#include "platform/commandqueue.hpp"
#include "device/pal/palconstbuf.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/palkernel.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palcounters.hpp"
#include "device/pal/palthreadtrace.hpp"
#include "device/pal/paltimestamp.hpp"
#include "device/pal/palblit.hpp"
#include "device/appprofile.hpp"
#include "device/devhostcall.hpp"
#include "hsa.h"
#include "amd_hsa_kernel_code.h"
#include "amd_hsa_queue.h"
#include <fstream>
#include <sstream>
#include <algorithm>
#include <thread>
#include "palQueue.h"
#include "palFence.h"
#include "palQueueSemaphore.h"
#ifdef _WIN32
#include <d3d10_1.h>
#include "platform/interop_d3d9.hpp"
#include "platform/interop_d3d10.hpp"
#include "platform/interop_d3d11.hpp"
#endif // _WIN32
namespace amd::pal {
// Returns how many entries of the device queue pool were created for the given engine type.
uint32_t VirtualGPU::Queue::AllocedQueues(const VirtualGPU& gpu, Pal::EngineType type) {
  uint32_t matching = 0;
  const auto& pool = gpu.dev().QueuePool();
  for (auto entry = pool.begin(); entry != pool.end(); ++entry) {
    if (entry->second->engineType_ == type) {
      ++matching;
    }
  }
  return matching;
}
// ================================================================================================
// Creates a Queue wrapper together with its PAL queue, command buffers and fences.
//
// The wrapper is allocated with extra trailing storage of
// qSize + max_command_buffers * (cmdSize + fSize) bytes. Command buffers and fences are always
// placed in that trailing storage. Normal priority compute/DMA queues go through the device
// queue pool: while fewer than GPU_MAX_HW_QUEUES queues of the engine type exist a new PAL queue
// is created inside a QueueRecycleInfo allocation, otherwise the least used pooled queue is
// shared. Realtime queues get a private PAL queue placed right after the wrapper.
//
// gpu                 - virtual GPU that will own the queue
// queueType           - PAL queue type (compute or DMA)
// engineIdx           - engine index; for compute it indexes dev().computeEnginesId()
// cmdAllocator        - PAL command allocator used by all command buffers of the queue
// rtCU                - number of reserved CUs, or amd::CommandQueue::RealTimeDisabled
// priority            - requested command queue priority
// residency_limit     - forwarded to the Queue constructor
// max_command_buffers - number of command buffer/fence pairs to create
//
// Returns the new queue or nullptr on any failure.
VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType queueType,
                                             uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
                                             uint rtCU, amd::CommandQueue::Priority priority,
                                             uint64_t residency_limit, uint max_command_buffers) {
  Pal::IDevice* palDev = gpu.dev().iDev();
  Pal::Result result;
  Pal::CmdBufferCreateInfo cmdCreateInfo = {};
  Pal::QueueCreateInfo qCreateInfo = {};
  qCreateInfo.engineIndex =
      (queueType == Pal::QueueTypeCompute) ? gpu.dev().computeEnginesId()[engineIdx] : engineIdx;
  qCreateInfo.aqlQueue = true;
  qCreateInfo.queueType = queueType;
  qCreateInfo.priority = Pal::QueuePriority::Normal;

  if (queueType == Pal::QueueTypeDma) {
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeDma;
  } else {
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
  }

  std::map<ExclusiveQueueType, uint32_t>::const_iterator it;
  if ((priority == amd::CommandQueue::Priority::Medium) &&
      (amd::CommandQueue::RealTimeDisabled == rtCU)) {
    it = gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::Medium);
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
    qCreateInfo.priority = Pal::QueuePriority::Medium;
  } else if (amd::CommandQueue::RealTimeDisabled != rtCU) {
    if (gpu.dev().settings().enableWgpMode_) {
      rtCU = rtCU * 2;
    }
    qCreateInfo.numReservedCu = amd::alignDown(
        rtCU,
        gpu.dev().properties().engineProperties[Pal::EngineTypeCompute].dedicatedCuGranularity);
    if (qCreateInfo.numReservedCu == 0) {
      return nullptr;
    }
    if ((priority == amd::CommandQueue::Priority::Medium) &&
        // If Windows HWS is enabled, then both real time queues are allocated
        // on the same engine
        (gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1) !=
         gpu.dev().exclusiveComputeEnginesId().end())) {
      it = gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1);
    } else {
      it = gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime0);
    }
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
    cmdCreateInfo.flags.realtimeComputeUnits = true;
    qCreateInfo.priority = Pal::QueuePriority::Realtime;
    // If the app creates an exclusive compute, then find the engine id
    if (qCreateInfo.engineType == Pal::EngineTypeCompute) {
      if (it != gpu.dev().exclusiveComputeEnginesId().end()) {
        qCreateInfo.engineIndex = it->second;
      } else {
        return nullptr;
      }
    }
  }

  // Find queue object size
  size_t qSize = palDev->GetQueueSize(qCreateInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  cmdCreateInfo.pCmdAllocator = cmdAllocator;
  cmdCreateInfo.queueType = queueType;

  // Find command buffer object size
  size_t cmdSize = palDev->GetCmdBufferSize(cmdCreateInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  // Find fence object size
  size_t fSize = palDev->GetFenceSize(&result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
  VirtualGPU::Queue* queue =
      new (allocSize) VirtualGPU::Queue(gpu, palDev, residency_limit, max_command_buffers);
  if (queue != nullptr) {
    address addrQ = nullptr;
    if (((qCreateInfo.engineType == Pal::EngineTypeCompute) ||
         (qCreateInfo.engineType == Pal::EngineTypeDma)) &&
        (qCreateInfo.priority != Pal::QueuePriority::Realtime)) {
      uint32_t index = AllocedQueues(gpu, qCreateInfo.engineType);
      // Create PAL queue object
      if (index < GPU_MAX_HW_QUEUES) {
        Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo();
        if (info == nullptr) {
          LogError("Could not create QueueRecycleInfo!");
          // Don't leak the wrapper that was allocated above
          delete queue;
          return nullptr;
        }
        // The PAL queue object lives in the storage that trails the recycle info
        addrQ = reinterpret_cast<address>(&info[1]);
        qCreateInfo.aqlPacketList = info->AqlPacketList();
        result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
        if (result == Pal::Result::Success) {
          const_cast<Device&>(gpu.dev()).QueuePool().insert({queue->iQueue_, info});
          info->engineType_ = qCreateInfo.engineType;
          // Save unique index for scratch buffer access
          info->index_ = index;
        } else {
          delete queue;
          // The recycle info was never published in the pool, so release it here
          delete info;
          return nullptr;
        }
      } else {
        int usage = std::numeric_limits<int>::max();
        uint indexBase = std::numeric_limits<uint32_t>::max();
        // Loop through all allocated queues and find the lowest usage
        for (const auto& it : gpu.dev().QueuePool()) {
          if ((qCreateInfo.engineType == it.second->engineType_) &&
              (it.second->counter_ <= usage)) {
            if ((it.second->counter_ < usage) ||
                // Preserve the order of allocations, because SDMA engines
                // should be used in round-robin manner
                ((it.second->counter_ == usage) && (it.second->index_ < indexBase))) {
              queue->iQueue_ = it.first;
              usage = it.second->counter_;
              indexBase = it.second->index_;
            }
          }
        }
        const auto reused = gpu.dev().QueuePool().find(queue->iQueue_);
        if (reused == gpu.dev().QueuePool().end()) {
          // No pooled queue of this engine type was selected, nothing can be shared
          LogError("Could not find a HW queue for recycling!");
          delete queue;
          return nullptr;
        }
        // Increment the usage of the current queue
        reused->second->counter_++;
      }
      Device::QueueRecycleInfo* info = gpu.dev().QueuePool().find(queue->iQueue_)->second;
      queue->aql_mgmt_ = &info->aql_packet_mgmt_;
      queue->lock_ = &info->queue_lock_;
      // Command buffers and fences are laid out relative to the wrapper's trailing storage
      addrQ = reinterpret_cast<address>(&queue[1]);
    } else {
      Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo();
      if (info == nullptr) {
        LogError("Could not create QueueRecycleInfo!");
        // Don't leak the wrapper that was allocated above
        delete queue;
        return nullptr;
      }
      // The wrapper owns this info and releases it in the destructor
      queue->info_ = info;
      queue->aql_mgmt_ = &info->aql_packet_mgmt_;
      // Exclusive compute path
      addrQ = reinterpret_cast<address>(&queue[1]);
      qCreateInfo.aqlPacketList = info->AqlPacketList();
      result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
    }
    if (result != Pal::Result::Success) {
      delete queue;
      return nullptr;
    }
    queue->UpdateAppPowerProfile();

    // Command buffers follow the queue storage, fences follow the command buffers
    address addrCmd = addrQ + qSize;
    address addrF = addrCmd + max_command_buffers * cmdSize;
    Pal::CmdBufferBuildInfo cmdBuildInfo = {};

    for (uint i = 0; i < max_command_buffers; ++i) {
      result = palDev->CreateCmdBuffer(cmdCreateInfo, &addrCmd[i * cmdSize], &queue->iCmdBuffs_[i]);
      if (result != Pal::Result::Success) {
        delete queue;
        return nullptr;
      }
      Pal::FenceCreateInfo fenceCreateinfo = {};
      fenceCreateinfo.flags.signaled = false;
      result = palDev->CreateFence(fenceCreateinfo, &addrF[i * fSize], &queue->iCmdFences_[i]);
      if (result != Pal::Result::Success) {
        delete queue;
        return nullptr;
      }
      // Only the first command buffer starts recording right away
      if (i == StartCmdBufIdx) {
        result = queue->iCmdBuffs_[i]->Begin(cmdBuildInfo);
        if (result != Pal::Result::Success) {
          delete queue;
          return nullptr;
        }
      }
    }
  }
  return queue;
}
// Releases the private recycle info, drains the PAL queue, drops all tracked memory references,
// destroys the command buffers and fences and finally destroys the PAL queue itself, or releases
// this wrapper's share of a pooled queue.
VirtualGPU::Queue::~Queue() {
  delete reinterpret_cast<Device::QueueRecycleInfo*>(info_);

  if (iQueue_ != nullptr) {
    // Make sure the queue is idle.
    // It's unclear why PAL could still have a busy queue
    amd::ScopedLock guard(lock_);
    iQueue_->WaitIdle();
  }

  // Remove all memory references
  std::vector<Pal::IGpuMemory*> residentMems;
  for (const auto& tracked : memReferences_) {
    residentMems.push_back(tracked.first->iMem());
  }
  if (!residentMems.empty()) {
    iDev_->RemoveGpuMemoryReferences(residentMems.size(), &residentMems[0], iQueue_);
  }
  memReferences_.clear();

  for (uint slot = 0; slot < max_command_buffers_; ++slot) {
    if (iCmdBuffs_[slot] != nullptr) {
      iCmdBuffs_[slot]->Destroy();
    }
    if (iCmdFences_[slot] != nullptr) {
      iCmdFences_[slot]->Destroy();
    }
  }

  if (iQueue_ == nullptr) {
    return;
  }

  // A queue without a lock was never part of the recycling pool
  if (lock_ == nullptr) {
    iQueue_->Destroy();
    return;
  }

  // Release the pooled queue only when the last user goes away
  if (--gpu_.dev().QueuePool().find(iQueue_)->second->counter_ != 0) {
    return;
  }
  iQueue_->Destroy();

  const auto released = gpu_.dev().QueuePool().find(iQueue_);
  // Readjust HW queue index for scratch buffer access
  for (auto& entry : gpu_.dev().QueuePool()) {
    const bool sameEngine = (entry.second->engineType_ == released->second->engineType_);
    if (sameEngine && (entry.second->index_ > released->second->index_)) {
      entry.second->index_--;
    }
  }
  delete released->second;
  const_cast<Device&>(gpu_.dev()).QueuePool().erase(iQueue_);
}
// Passes the application name and its full path to the PAL queue for power profile selection.
Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile() {
  std::wstring fullPath = Device::appProfile()->wsAppPathAndFileName();
  const wchar_t* pathStr = fullPath.c_str();

  // The application name starts right after the last '\\' character;
  // without a separator the whole string is used as the name
  const wchar_t* exeName = pathStr;
  const wchar_t* lastSeparator = wcsrchr(pathStr, '\\');
  if (lastSeparator) {
    exeName = lastSeparator + 1;
  }
  return iQueue_->UpdateAppPowerProfile(exeName, pathStr);
}
// Records that the current command buffer slot uses the memory object. A new object is also
// queued for the next PAL residency update and added to the residency size accounting.
void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
  // Nothing to track when all allocations are always resident
  if (gpu_.dev().settings().alwaysResident_) {
    return;
  }
  Pal::IGpuMemory* iMem = mem->iMem();

  auto tracked = memReferences_.find(mem);
  if (tracked != memReferences_.end()) {
    // Already known: just refresh the slot of the last use
    tracked->second = cmdBufIdSlot_;
    return;
  }

  // Update runtime tracking with TS
  memReferences_[mem] = cmdBufIdSlot_;

  // Update PAL list with the new entry
  Pal::GpuMemoryRef palRef = {};
  palRef.pGpuMemory = iMem;
  palMemRefs_.push_back(palRef);

  // Check SDI memory object
  const bool externPhys = iMem->Desc().flags.isExternPhys;
  if (externPhys && (sdiReferences_.find(iMem) == sdiReferences_.end())) {
    sdiReferences_.insert(iMem);
    palSdiRefs_.push_back(iMem);
  }
  residency_size_ += iMem->Desc().size;
}
// Stops tracking the memory object; if it was tracked, its PAL reference on this queue is
// removed and its size is subtracted from the residency accounting.
void VirtualGPU::Queue::removeCmdMemRef(GpuMemoryReference* mem) {
  Pal::IGpuMemory* palMem = mem->iMem();
  if (memReferences_.erase(mem) == 0) {
    // The object wasn't tracked by this queue
    return;
  }
  iDev_->RemoveGpuMemoryReferences(1, &palMem, iQueue_);
  residency_size_ -= palMem->Desc().size;
}
// Adds a DOPP reference for the memory object to the pending submission, or updates the flags
// of the reference that already exists for it.
void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, bool pfpaDoppCmd) {
  for (auto& ref : palDoppRefs_) {
    if (ref.pGpuMemory != iMem) {
      continue;
    }
    // If both LAST_DOPP_SUBMISSION and PFPA_DOPP_SUBMISSION VCOPs are requested,
    // the LAST_DOPP_SUBMISSION is sent as requested by KMD
    if (ref.flags.lastPfpaCmd != 1) {
      if (lastDoppCmd) {
        ref.flags.lastPfpaCmd = 1;
        ref.flags.pfpa = 0;
      } else if (pfpaDoppCmd) {
        ref.flags.pfpa = 1;
      }
    }
    // Otherwise the last submission command is already set and must not be overridden
    return;
  }

  // First reference of the DOPP desktop texture: append it to the vector
  Pal::DoppRef newRef = {};
  newRef.flags.pfpa = pfpaDoppCmd ? 1 : 0;
  newRef.flags.lastPfpaCmd = lastDoppCmd ? 1 : 0;
  newRef.pGpuMemory = iMem;
  palDoppRefs_.push_back(newRef);
}
// ================================================================================================
// Submits the command buffer of the current slot and prepares the next slot for recording.
// Steps, in order: make pending memory references resident, end the current command buffer,
// reset its fence, submit, advance the command buffer id/slot, wait until the new slot's fence
// allows reuse, reset and begin the new command buffer, then drop stale memory references.
// Returns false as soon as any PAL call in this sequence fails.
bool VirtualGPU::Queue::flush() {
// The whole sequence runs under the queue lock
amd::ScopedLock l(lock_);
const Settings& settings = gpu_.dev().settings();
// Hand the memory references collected since the last flush over to PAL
if (!settings.alwaysResident_ && palMemRefs_.size() != 0) {
Pal::Result result = iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0], iQueue_,
Pal::GpuMemoryRefCantTrim);
if (Pal::Result::Success != result) {
LogPrintfError("PAL failed to make resident resources! result: %d", result);
return false;
}
palMemRefs_.clear();
}
// Stop commands building
Pal::Result result;
result = iCmdBuffs_[cmdBufIdSlot_]->End();
if (Pal::Result::Success != result) {
LogPrintfError("PAL failed to finalize a command buffer! result: %d", result);
return false;
}
// Reset the fence. PAL will reset OS event
result = iDev_->ResetFences(1, &iCmdFences_[cmdBufIdSlot_]);
if (Pal::Result::Success != result) {
LogPrintfError("PAL failed to reset a fence! result:%d", result);
return false;
}
// Describe a single command buffer submission that signals the slot's fence
Pal::PerSubQueueSubmitInfo perSubQueueSubmitInfo = {};
perSubQueueSubmitInfo.cmdBufferCount = 1;
perSubQueueSubmitInfo.ppCmdBuffers = &iCmdBuffs_[cmdBufIdSlot_];
Pal::MultiSubmitInfo submitInfo = {};
submitInfo.perSubQueueInfoCount = 1;
submitInfo.pPerSubQueueInfo = &perSubQueueSubmitInfo;
submitInfo.doppRefCount = palDoppRefs_.size();
submitInfo.pDoppRefs = palDoppRefs_.data();
submitInfo.externPhysMemCount = palSdiRefs_.size();
submitInfo.ppExternPhysMem = palSdiRefs_.data();
submitInfo.fenceCount = 1;
submitInfo.ppFences = &iCmdFences_[cmdBufIdSlot_];
if (iQueue_->Type() == Pal::QueueTypeCompute) {
if (gpu_.dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) {
// If runtime uses device memory for kernel arguments, then perform a CPU read back on
// submission. That will make sure NBIO pushes all previous CPU write requests through PCIE
gpu_.managedBuffer().CpuReadBack();
}
if (amd::IS_HIP) {
// HIP disables per resource tracking, because the app may embed SVM ptr into other buffers.
// Force CPU sync if there are pending operations on SDMA, until OS fences will be added
gpu_.WaitForIdleSdma();
}
}
// Submit command buffer to OS
if (gpu_.rgpCaptureEna()) {
result = gpu_.dev().captureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo);
} else {
result = iQueue_->Submit(submitInfo);
}
if (Pal::Result::Success != result) {
LogPrintfError("PAL failed to submit CMD! result:%d", result);
if (GPU_ANALYZE_HANG) {
DumpMemoryReferences();
}
return false;
}
// Make sure the slot isn't busy
constexpr bool IbReuse = true;
if (GPU_FLUSH_ON_EXECUTION) {
waitForFence<!IbReuse>(cmdBufIdSlot_);
}
// Reset the counter of commands
cmdCnt_ = 0;
// Find the next command buffer
cmdBufIdCurrent_++;
if (cmdBufIdCurrent_ == GpuEvent::InvalidID) {
// The id counter reached the invalid value: wait for the last submission and restart the ids
// Wait for the last one
waitForFence<!IbReuse>(cmdBufIdSlot_);
cmdBufIdCurrent_ = 1;
cmbBufIdRetired_ = 0;
}
// Wrap current slot
cmdBufIdSlot_ = cmdBufIdCurrent_ % max_command_buffers_;
waitForFence<IbReuse>(cmdBufIdSlot_);
// Progress retired TS
if ((cmdBufIdCurrent_ > max_command_buffers_) &&
(cmbBufIdRetired_ < (cmdBufIdCurrent_ - max_command_buffers_))) {
cmbBufIdRetired_ = cmdBufIdCurrent_ - max_command_buffers_;
}
// Reset command buffer, so CB chunks could be reused
result = iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false);
if (Pal::Result::Success != result) {
LogPrintfError("PAL failed CB reset! result:%d", result);
return false;
}
// Start command buffer building
Pal::CmdBufferBuildInfo cmdBuildInfo = {};
cmdBuildInfo.pMemAllocator = &vlAlloc_;
result = iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo);
if (Pal::Result::Success != result) {
LogPrintfError("PAL failed CB building initialization! result:%d", result);
return false;
}
// Clear dopp references
palDoppRefs_.clear();
palSdiRefs_.clear();
// Remove old memory references
// Only done when more than 2048 references are tracked or the residency limit is exceeded;
// references last used by the slot that is about to be reused are the ones dropped
if ((memReferences_.size() > 2048) || (residency_size_ > residency_limit_)) {
for (auto it = memReferences_.begin(); it != memReferences_.end();) {
if (it->second == cmdBufIdSlot_) {
palMems_.push_back(it->first->iMem());
residency_size_ -= it->first->iMem()->Desc().size;
it = memReferences_.erase(it);
} else {
++it;
}
}
}
// Tell PAL about every reference collected for removal
if (!settings.alwaysResident_ && palMems_.size() != 0) {
iDev_->RemoveGpuMemoryReferences(palMems_.size(), &palMems_[0], iQueue_);
palMems_.clear();
}
return true;
}
// ================================================================================================
// Blocks until the command buffer with the given id has finished.
// Returns true if it is already done, false if the id still names the current command buffer
// after isDone() (flush failure), otherwise the result of the fence wait.
bool VirtualGPU::Queue::waitForEvent(uint id) {
  amd::ScopedLock guard(lock_);

  if (isDone(id)) {
    return true;
  }

  // There is an error in the flush() and wait is bogus
  if (id == cmdBufIdCurrent_) {
    return false;
  }

  constexpr bool IbReuse = true;
  const uint fenceSlot = id % max_command_buffers_;
  const bool signaled = waitForFence<!IbReuse>(fenceSlot);
  cmbBufIdRetired_ = id;
  return signaled;
}
// ================================================================================================
// Non-blocking completion check for the command buffer with the given id.
// Ids at or below the retired id, or above the current id, are reported as done. The current
// command buffer is flushed first, then the fence status of the id's slot decides the result.
bool VirtualGPU::Queue::isDone(uint id) {
  amd::ScopedLock guard(lock_);

  const bool alreadyRetired = (id <= cmbBufIdRetired_);
  const bool beyondCurrent = (id > cmdBufIdCurrent_);
  if (alreadyRetired || beyondCurrent) {
    return true;
  }

  // Flush the current command buffer; if flush failed, then exit earlier...
  if ((id == cmdBufIdCurrent_) && !flush()) {
    return false;
  }

  const uint fenceSlot = id % max_command_buffers_;
  if (iCmdFences_[fenceSlot]->GetStatus() != Pal::Result::Success) {
    return false;
  }

  cmbBufIdRetired_ = id;
  return true;
}
// ================================================================================================
// Appends a diagnostic record to "ocl_hang_dump.txt": a timestamp in seconds, the queue type,
// every tracked memory reference (virtual address range in hex, command buffer slot, first heap)
// and, when a last kernel is recorded, its name and the names of its non-local pointer arguments.
// Nothing is written if the file can't be opened.
void VirtualGPU::Queue::DumpMemoryReferences() const {
  // Whole seconds, computed with integer math to avoid a lossy round trip through double
  const uint64_t start = amd::Os::timeNanos() / 1000000000ull;

  std::fstream dump;
  dump.open("ocl_hang_dump.txt", (std::fstream::out | std::fstream::app));
  if (!dump.is_open()) {
    return;
  }

  dump << start << " Queue: ";
  switch (iQueue_->Type()) {
    case Pal::QueueTypeCompute:
      dump << "Compute";
      break;
    case Pal::QueueTypeDma:
      dump << "SDMA";
      break;
    default:
      dump << "unknown";
      break;
  }
  dump << "\n"
       << "Resident memory resources:\n";

  uint idx = 0;
  for (const auto& it : memReferences_) {
    dump << " " << idx << "\t[";
    // Addresses are printed in hex with the base prefix
    dump.setf(std::ios::hex, std::ios::basefield);
    dump.setf(std::ios::showbase);
    dump << (it.first)->iMem()->Desc().gpuVirtAddr << ", "
         << (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
    // Switch back to decimal; the mask is required so the hex bit is actually cleared
    dump.setf(std::ios::dec, std::ios::basefield);
    dump << "] CbId:" << it.second << ", Heap: " << (it.first)->iMem()->Desc().heaps[0] << "\n";
    idx++;
  }

  if (last_kernel_ != nullptr) {
    const amd::KernelSignature& signature = last_kernel_->signature();
    dump << last_kernel_->name() << std::endl;
    for (size_t i = 0; i < signature.numParameters(); ++i) {
      const amd::KernelParameterDescriptor& desc = signature.at(i);
      // Find if the current argument is a memory object
      if ((desc.type_ == T_POINTER) && (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
        dump << " " << desc.name_ << ": " << std::endl;
      }
    }
  }
  dump.close();
}
// Allocates and zeroes the tracking array for numMemObj memory objects.
// A count of zero allocates nothing and still reports success.
bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
  if (numMemObj == 0) {
    return true;
  }

  // Allocate the array of memory objects for dependency tracking
  memObjectsInQueue_ = new MemoryState[numMemObj];
  if (memObjectsInQueue_ == nullptr) {
    return false;
  }

  memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj);
  maxMemObjectsInQueue_ = numMemObj;
  return true;
}
// Checks the memory object against the ranges recorded for earlier work in the queue and inserts
// a barrier when an overlapping range was written or is about to be written, or when the
// tracking array is full. The object's range is then appended to the tracking array.
void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) {
  // Return earlier if tracking is disabled
  if (maxMemObjectsInQueue_ == 0) {
    return;
  }

  const uint64_t curStart = memory->vmAddress();
  const uint64_t curEnd = curStart + memory->size();
  bool needBarrier = false;

  if (memory->isModified(gpu) || !readOnly) {
    // Mark resource as modified
    memory->setModified(gpu, !readOnly);

    // Loop through all memory objects in the queue and find dependency
    // @note don't include objects from the current kernel
    for (size_t j = 0; (j < endMemObjectsInQueue_) && !needBarrier; ++j) {
      const uint64_t busyStart = memObjectsInQueue_[j].start_;
      const uint64_t busyEnd = memObjectsInQueue_[j].end_;

      // The start is inside the busy region
      const bool startInside = (curStart >= busyStart) && (curStart < busyEnd);
      // The end is inside the busy region
      const bool endInside = (curEnd > busyStart) && (curEnd <= busyEnd);
      // The start/end cover the busy region
      const bool coversBusy = (curStart <= busyStart) && (curEnd >= busyEnd);
      // The busy region was written or the current access is a write
      const bool hasWrite = !memObjectsInQueue_[j].readOnly_ || !readOnly;

      needBarrier = (startInside || endInside || coversBusy) && hasWrite;
    }
  }

  // Did we reach the limit?
  if (maxMemObjectsInQueue_ <= numMemObjectsInQueue_) {
    needBarrier = true;
  }

  if (needBarrier) {
    // Flush cache
    gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);
    // Clear memory dependency state
    const static bool All = true;
    clear(!All);
  }

  // Insert current memory object into the queue always,
  // since runtime calls flush before kernel execution and it has to keep
  // current kernel in tracking
  const size_t slot = numMemObjectsInQueue_;
  memObjectsInQueue_[slot].start_ = curStart;
  memObjectsInQueue_[slot].end_ = curEnd;
  memObjectsInQueue_[slot].readOnly_ = readOnly;
  numMemObjectsInQueue_++;
}
void VirtualGPU::MemoryDependency::clear(bool all) {
if (numMemObjectsInQueue_ > 0) {
if (all) {
endMemObjectsInQueue_ = numMemObjectsInQueue_;
}
// If the current launch didn't start from the beginning, then move the data
if (0 != endMemObjectsInQueue_) {
size_t i, j;
// Preserve all objects from the current kernel
for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) {
memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_;
memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_;
memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_;
}
} else if (numMemObjectsInQueue_ >= maxMemObjectsInQueue_) {
// note: The array growth shouldn't occur under the normal conditions,
// but in a case when SVM path sends the amount of SVM ptrs over
// the max size of kernel arguments
MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1];
if (nullptr == ptr) {
numMemObjectsInQueue_ = 0;
return;
}
maxMemObjectsInQueue_ <<= 1;
memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_);
delete[] memObjectsInQueue_;
memObjectsInQueue_ = ptr;
}
// Adjust the number of active objects
numMemObjectsInQueue_ -= endMemObjectsInQueue_;
endMemObjectsInQueue_ = 0;
}
}
// Keeps the pinned memory object alive in a small list so its destruction is delayed.
// When the list already holds more than 7 entries the oldest one is released first.
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
  // Nothing to do if a matching pinned object is already kept
  if (findPinnedMem(mem->getHostMem(), mem->getSize()) != nullptr) {
    return;
  }

  if (pinnedMems_.size() > 7) {
    pinnedMems_.front()->release();
    pinnedMems_.erase(pinnedMems_.begin());
  }

  // Start operation, since we should release mem object
  flushDMA(dev().getGpuMemory(mem)->getGpuEvent(*this)->engineId_);

  // Delay destruction
  pinnedMems_.push_back(mem);
}
// Releases every pinned memory object kept by this virtual GPU and empties the list.
void VirtualGPU::releasePinnedMem() {
  for (auto pinned = pinnedMems_.begin(); pinned != pinnedMems_.end(); ++pinned) {
    (*pinned)->release();
  }
  pinnedMems_.clear();
}
// Looks up a kept pinned memory object that starts at the host address and is at least `size`
// bytes large. Returns the first match or nullptr.
amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
  const auto match =
      std::find_if(pinnedMems_.begin(), pinnedMems_.end(), [addr, size](amd::Memory* pinned) {
        return (pinned->getHostMem() == addr) && (size <= pinned->getSize());
      });
  return (match != pinnedMems_.end()) ? *match : nullptr;
}
// Creates (or recreates) the device-side virtual queue buffer of at least deviceQueueSize bytes.
// Buffer layout, in order: AmdVQueueHeader and the AmdAqlWrap slots, per-slot kernel argument and
// wait event storage, device events, the event mask array and the slot mask array.
// Returns true if a queue of the resulting size already exists or was created successfully,
// false if any allocation, mapping or fill step fails.
bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
// Enforce a minimum queue size of 16KB
uint MinDeviceQueueSize = 16 * 1024;
deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize);
// One mask group per 512 * Ki bytes of queue, but at least one
maskGroups_ = deviceQueueSize / (512 * Ki);
maskGroups_ = (maskGroups_ == 0) ? 1 : maskGroups_;
// Align the queue size for the multiple dispatch scheduler.
// Each thread works with 32 entries * maskGroups
uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_);
if (extra != 0) {
deviceQueueSize += (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_) - extra;
}
// Reuse the existing queue if the size matches, otherwise drop the old one first
if (deviceQueueSize_ == deviceQueueSize) {
return true;
} else {
delete vqHeader_;
delete virtualQueue_;
vqHeader_ = nullptr;
virtualQueue_ = nullptr;
deviceQueueSize_ = 0;
}
uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
uint allocSize = deviceQueueSize;
// Add the virtual queue header
allocSize += sizeof(AmdVQueueHeader);
allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap));
uint argOffs = allocSize;
// Add the kernel arguments and wait events
uint singleArgSize = amd::alignUp(
dev().info().maxParameterSize_ + 64 + dev().settings().numWaitEvents_ * sizeof(uint64_t),
sizeof(AmdAqlWrap));
allocSize += singleArgSize * numSlots;
uint eventsOffs = allocSize;
// Add the device events
allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent);
uint eventMaskOffs = allocSize;
// Add mask array for events
allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8;
uint slotMaskOffs = allocSize;
// Add mask array for AmdAqlWrap slots
allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;
// Align size to 64 bytes for more efficient fill operation
allocSize = amd::alignUp(allocSize, 8 * sizeof(uint64_t));
virtualQueue_ = new Memory(dev(), allocSize);
// The buffer is created as Remote instead of Local when GPU_PRINT_CHILD_KERNEL is set
Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? Resource::Local : Resource::Remote;
if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
return false;
}
if (GPU_PRINT_CHILD_KERNEL != 0) {
// Only the success of the mapping is checked here; the pointer itself isn't used further
address ptr = reinterpret_cast<address>(virtualQueue_->map(this, Resource::WriteOnly));
if (nullptr == ptr) {
return false;
}
}
// Zero the whole buffer before the header and slots are written
uint64_t pattern = 0;
amd::Coord3D origin(0, 0, 0);
amd::Coord3D region(virtualQueue_->size(), 0, 0);
if (!dev().xferMgr().fillBuffer(*virtualQueue_, &pattern, sizeof(pattern), region, origin,
region)) {
return false;
}
uint64_t vaBase = virtualQueue_->vmAddress();
AmdVQueueHeader header = {};
// Initialize the virtual queue header
header.aql_slot_num = numSlots;
header.event_slot_num = dev().settings().numDeviceEvents_;
header.event_slot_mask = vaBase + eventMaskOffs;
header.event_slots = vaBase + eventsOffs;
header.aql_slot_mask = vaBase + slotMaskOffs;
header.wait_size = dev().settings().numWaitEvents_;
header.arg_size = dev().info().maxParameterSize_ + 64;
header.mask_groups = maskGroups_;
// Keep a host copy of the header and upload it to the start of the buffer
vqHeader_ = new AmdVQueueHeader;
if (nullptr == vqHeader_) {
return false;
}
*vqHeader_ = header;
virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false);
// Go over all slots and perform initialization
AmdAqlWrap slot = {};
size_t offset = sizeof(AmdVQueueHeader);
for (uint i = 0; i < numSlots; ++i) {
// Each slot points at its own argument area; the wait list follows the arguments
uint64_t argStart = vaBase + argOffs + i * singleArgSize;
slot.aql.kernarg_address = reinterpret_cast<void*>(argStart);
slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false);
offset += sizeof(AmdAqlWrap);
}
// Remember the size only after the queue was fully initialized
deviceQueueSize_ = deviceQueueSize;
return true;
}
// ================================================================================================
// Constructs the virtual GPU in an empty state and registers it in the device's list of
// virtual GPUs. No PAL queues or other resources are created here; both queue slots start as
// nullptr and the remaining pointer members are initialized to nullptr.
VirtualGPU::VirtualGPU(Device& device)
: device::VirtualDevice(device),
engineID_(MainEngine),
gpuDevice_(static_cast<Device&>(device)),
printfDbgHSA_(nullptr),
tsCache_(nullptr),
// The managed buffer is sized as the staged transfer size plus 32 * Ki
managedBuffer_(*this, device.settings().stagedXferSize_ + 32 * Ki),
writeBuffer_(device, managedBuffer_, device.settings().stagedXferSize_),
hwRing_(0),
readjustTimeGPU_(0),
lastTS_(nullptr),
profileTs_(nullptr),
vqHeader_(nullptr),
virtualQueue_(nullptr),
schedParams_(nullptr),
deviceQueueSize_(0),
maskGroups_(1),
hsaQueueMem_(nullptr),
cmdAllocator_(nullptr) {
// Note: Virtual GPU device creation must be a thread safe operation
// Take the next free index and store this object in the device's vgpu list at that index
index_ = gpuDevice_.numOfVgpus_++;
gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus());
gpuDevice_.vgpus_[index()] = this;
queues_[MainEngine] = nullptr;
queues_[SdmaEngine] = nullptr;
// The hostcall buffer for this vqueue is initialized on demand.
hostcallBuffer_ = nullptr;
}
// ================================================================================================
bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
amd::CommandQueue::Priority priority) {
device::BlitManager::Setup blitSetup;
// Resize the list of device resources always,
// because destructor calls eraseResourceList() even if create() failed
dev().resizeResoureList(index());
// Virtual GPU will have profiling enabled
state_.profiling_ = profiling;
Pal::CmdAllocatorCreateInfo createInfo = {};
createInfo.flags.threadSafe = true;
// \todo forces PAL to reuse CBs, but requires postamble
createInfo.flags.autoMemoryReuse = false;
createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
VirtualGPU::Queue::MaxCommands *
(320 + ((profiling) ? 96 : 0) + ((dev().captureMgr() != nullptr) ? 512 : 0));
createInfo.allocInfo[Pal::CommandDataAlloc].allocSize =
dev().settings().maxCmdBuffers_ * createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize;
createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 256 * Ki;
createInfo.allocInfo[Pal::EmbeddedDataAlloc].suballocSize = 64 * Ki;
createInfo.allocInfo[Pal::LargeEmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
createInfo.allocInfo[Pal::LargeEmbeddedDataAlloc].allocSize = 64 * Ki;
createInfo.allocInfo[Pal::LargeEmbeddedDataAlloc].suballocSize = 32 * Ki;
createInfo.allocInfo[Pal::GpuScratchMemAlloc].allocHeap = Pal::GpuHeapInvisible;
createInfo.allocInfo[Pal::GpuScratchMemAlloc].allocSize = 64 * Ki;
createInfo.allocInfo[Pal::GpuScratchMemAlloc].suballocSize = 4 * Ki;
Pal::Result result;
size_t cmdAllocSize = dev().iDev()->GetCmdAllocatorSize(createInfo, &result);
if (Pal::Result::Success != result) {
return false;
}
char* addr = new char[cmdAllocSize];
if (Pal::Result::Success != dev().iDev()->CreateCmdAllocator(createInfo, addr, &cmdAllocator_)) {
return false;
}
uint idx = index() % dev().numComputeEngines();
uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs
? 0
: (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
uint max_cmd_buffers = dev().settings().maxCmdBuffers_;
if (dev().numComputeEngines()) {
queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs,
priority, residency_limit, max_cmd_buffers);
if (nullptr == queues_[MainEngine]) {
return false;
}
const auto& info = dev().QueuePool().find(queues_[MainEngine]->iQueue_);
hwRing_ = (info != dev().QueuePool().end())
? info->second->index_
: (index() % dev().numExclusiveComputeEngines()) + GPU_MAX_HW_QUEUES;
// Check if device has SDMA engines
if (dev().numDMAEngines() != 0 && !dev().settings().disableSdma_) {
uint sdma;
// If only 1 SDMA engine is available then use that one, otherwise it's a round-robin manner
if ((dev().numDMAEngines() < 2) || ((idx + 1) & 0x1)) {
sdma = 0;
} else {
sdma = 1;
}
queues_[SdmaEngine] = Queue::Create(
*this, Pal::QueueTypeDma, sdma, cmdAllocator_, amd::CommandQueue::RealTimeDisabled,
amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
if (nullptr == queues_[SdmaEngine]) {
return false;
}
}
} else {
LogError("Runtme couldn't find compute queues!");
return false;
}
// Create buffers for kernel arg management
if (!managedBuffer_.create(dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs
? Resource::Persistent
: Resource::RemoteUSWC)) {
// Try just USWC if persistent memory failed
if (dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) {
if (!managedBuffer_.create(Resource::RemoteUSWC)) {
return false;
}
} else {
return false;
}
}
// Diable double copy optimization,
// since UAV read from nonlocal is fast enough
blitSetup.disableCopyBufferToImageOpt_ = true;
if (!allocConstantBuffers()) {
return false;
}
// Create HSAILPrintf class
printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_);
if (nullptr == printfDbgHSA_) {
LogError("Could not create PrintfDbgHSA class!");
return false;
}
tsCache_ = new TimeStampCache(*this);
if (nullptr == tsCache_) {
LogError("Could not create TimeStamp cache!");
return false;
}
if (!memoryDependency().create(dev().settings().numMemDependencies_)) {
LogError("Could not create the array of memory objects!");
return false;
}
if (!allocHsaQueueMem()) {
LogError("Could not create hsaQueueMem object!");
return false;
}
// Check if the app requested a device queue creation
if (dev().settings().useDeviceQueue_ && (0 != deviceQueueSize) &&
!createVirtualQueue(deviceQueueSize)) {
LogError("Could not create a virtual queue!");
return false;
}
// Choose the appropriate class for blit engine
switch (dev().settings().blitEngine_) {
default:
// Fall through ...
case Settings::BlitEngineHost:
blitSetup.disableAll();
// Fall through ...
case Settings::BlitEngineCAL:
case Settings::BlitEngineKernel:
blitMgr_ = new KernelBlitManager(*this, blitSetup);
break;
}
if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
LogError("Could not create BlitManager!");
return false;
}
// If the developer mode manager is available and it's not a device queue,
// then enable RGP capturing
if ((index() != 0) && dev().captureMgr() != nullptr) {
bool dbg_vmid = false;
state_.rgpCaptureEnabled_ = true;
dev().captureMgr()->RegisterTimedQueue(2 * index(), queue(MainEngine).iQueue_, &dbg_vmid);
dev().captureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_, &dbg_vmid);
}
return true;
}
// ================================================================================================
//! Allocates the dummy HSA queue descriptor (an amd_queue_t sized Resource::Local allocation)
//! and uploads a zero-initialized amd_queue_t with the private and group segment aperture
//! bases filled in.
//!
//! @return true if the allocation was created; false if creation failed, in which case
//!         hsaQueueMem_ is released and reset to nullptr.
bool VirtualGPU::allocHsaQueueMem() {
  // Allocate a dummy HSA queue
  hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t));
  if ((hsaQueueMem_ == nullptr) || (!hsaQueueMem_->create(Resource::Local))) {
    delete hsaQueueMem_;
    // The destructor deletes hsaQueueMem_ unconditionally, so the member must not be left
    // dangling here; otherwise the object would be freed twice
    hsaQueueMem_ = nullptr;
    return false;
  }

  amd_queue_t hsa_queue = {};

  // Provide private and local heap addresses
  constexpr uint addressShift = LP64_SWITCH(0, 32);
  hsa_queue.private_segment_aperture_base_hi = static_cast<uint32_t>(
      dev().properties().gpuMemoryProperties.privateApertureBase >> addressShift);
  hsa_queue.group_segment_aperture_base_hi = static_cast<uint32_t>(
      dev().properties().gpuMemoryProperties.sharedApertureBase >> addressShift);

  hsaQueueMem_->writeRawData(*this, 0, sizeof(amd_queue_t), &hsa_queue, true);
  return true;
}
//! Destroys the virtual GPU: invalidates its AQL bookkeeping on the main queue, finishes an
//! active RGP trace, frees the per-queue helper objects (printf, timestamp cache, constant
//! buffers, managed buffer, virtual queue, HSA queue memory, blit manager), destroys the HW
//! queues and the command allocator, removes this object from the device's list of virtual
//! GPUs and finally releases the hostcall buffer if one was created.
//!
//! The statement order below matters: the device-wide locks are taken first and the queues are
//! destroyed only after the objects that were created on top of them.
VirtualGPU::~VirtualGPU() {
// Not safe to remove a queue. So lock the device
amd::ScopedLock k(dev().lockAsyncOps());
amd::ScopedLock lock(dev().vgpusAccess());
if (queues_[MainEngine] != nullptr) {
// Clear all timestamps, associated with this virtual GPU
// Only the slots whose recorded owner is this object are reset
auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
if (mgmt.aql_vgpus_[i] == this) {
mgmt.aql_vgpus_[i] = nullptr;
mgmt.aql_events_[i].invalidate();
}
}
}
// Destroy RGP trace
if (rgpCaptureEna()) {
dev().captureMgr()->FinishRGPTrace(this, true);
}
// Free every callback object that is still parked in the free queue
while (!freeCbQueue_.empty()) {
auto cb = freeCbQueue_.front();
delete cb;
freeCbQueue_.pop();
}
// Destroy printfHSA object
delete printfDbgHSA_;
// Destroy TimeStamp cache
delete tsCache_;
// Destroy resource list with the constant buffers
for (uint i = 0; i < constBufs_.size(); ++i) {
delete constBufs_[i];
}
managedBuffer_.release();
// Deleting a null pointer is a no-op, so these members may legitimately be unset
delete vqHeader_;
delete virtualQueue_;
delete hsaQueueMem_;
// Release scratch buffer memory to reduce memory pressure
//!@note OCLtst uses single device with multiple tests
//! Release memory only if it's the last command queue.
//! The first queue is reserved for the transfers on device
if (static_cast<int>(gpuDevice_.numOfVgpus_ - 1) <= 1) {
gpuDevice_.destroyScratchBuffers();
}
// Destroy BlitManager object
delete blitMgr_;
{
// Destroy queues
delete queues_[MainEngine];
delete queues_[SdmaEngine];
if (nullptr != cmdAllocator_) {
cmdAllocator_->Destroy();
// NOTE(review): the allocator is freed as a char array, which presumably mirrors a
// placement allocation into a char buffer done at creation time - confirm in create()
delete[] reinterpret_cast<char*>(cmdAllocator_);
}
}
{
// Find all available virtual GPUs and lock them
// from the execution of commands, since the queue index and resource list
// Will be adjusted
for (auto it : dev().vgpus()) {
if (it != this) {
it->execution().lock();
}
}
// Not safe to add a resource if create/destroy queue is in process, since
// the size of the TS array can change
amd::ScopedLock r(dev().lockResources());
gpuDevice_.numOfVgpus_--;
// Remove the entry at this object's index and shift the index of every entry that followed it
gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index());
for (uint idx = index(); idx < dev().vgpus().size(); ++idx) {
dev().vgpus()[idx]->index_--;
}
dev().eraseResoureList(index());
// Find all available virtual GPUs and unlock them
// for the execution of commands
// (the erased entry is no longer in the list, so only the GPUs locked above are unlocked)
for (auto it : dev().vgpus()) {
it->execution().unlock();
}
}
// Disable and free the hostcall buffer, if this queue ever created one
if (hostcallBuffer_ != nullptr) {
ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for virtual queue %p",
hostcallBuffer_, this);
amd::disableHostcalls(hostcallBuffer_);
dev().svmFree(hostcallBuffer_);
}
}
//! Executes a read command: transfers data from the command's source memory object to the
//! destination host pointer.
//!
//! If the destination pointer resolves (through findMemoryFromVA) to a known allocation, a copy
//! between the two memory objects is issued (copyBuffer/copyBufferRect/copyImageToBuffer);
//! otherwise the blit manager's read path is used. A READ_IMAGE on a
//! CL_MEM_OBJECT_IMAGE1D_BUFFER object is redirected to a buffer read.
//! On failure the command status is set to CL_INVALID_OPERATION.
//!
//! @param vcmd The read command to execute.
void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  // Translate memory references and ensure cache up-to-date
  pal::Memory* memory = dev().getGpuMemory(&vcmd.source());

  size_t offset = 0;
  // Find if virtual address is a CL allocation
  device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset);

  profilingBegin(vcmd);

  memory->syncCacheFromHost(*this);
  cl_command_type type = vcmd.type();
  bool result = false;
  amd::Memory* bufferFromImage = nullptr;

  // Force buffer read for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_READ_IMAGE) &&
      (vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImage = createBufferFromImage(vcmd.source());
    if (nullptr == bufferFromImage) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_READ_BUFFER;
      memory = dev().getGpuMemory(bufferFromImage);
    }
  }

  // Process different read commands
  switch (type) {
    case CL_COMMAND_READ_BUFFER: {
      amd::Coord3D origin(vcmd.origin()[0]);
      amd::Coord3D size(vcmd.size()[0]);
      if (nullptr != bufferFromImage) {
        // The command is expressed in image elements; convert to bytes for the buffer path
        size_t elemSize = vcmd.source().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;
      }
      if (hostMemory != nullptr) {
        // Accelerated transfer without pinning
        amd::Coord3D dstOrigin(offset);
        result = blitMgr().copyBuffer(*memory, *hostMemory, origin, dstOrigin, size,
                                      vcmd.isEntireMemory(), vcmd.copyMetadata());
      } else {
        // The logic below will perform 2 step copy to make sure memory pinning doesn't
        // occur on the first unaligned page, because in Windows memory manager can
        // have CPU access to the allocation header in another thread
        // and a race condition is possible.
        char* tmpHost =
            amd::alignUp(reinterpret_cast<char*>(vcmd.destination()), PinnedMemoryAlignment);

        // Find the partial size for unaligned copy
        size_t partial = tmpHost - reinterpret_cast<char*>(vcmd.destination());

        result = true;
        // Check if it's staging copy, then ignore unaligned address.
        // The first step must also never exceed the requested size: without the clamp a request
        // shorter than the distance to the next aligned address would overrun the host buffer
        if ((size[0] <= dev().settings().pinnedMinXferSize_) || (partial > size[0])) {
          partial = size[0];
        }
        // Make first step transfer
        if (partial > 0) {
          result = blitMgr().readBuffer(*memory, vcmd.destination(), origin, partial, false,
                                        vcmd.copyMetadata());
        }
        // Second step transfer if something left to copy
        if (partial < size[0]) {
          result &= blitMgr().readBuffer(*memory, tmpHost, origin[0] + partial, size[0] - partial,
                                         false, vcmd.copyMetadata());
        }
      }
      if (nullptr != bufferFromImage) {
        bufferFromImage->release();
      }
    } break;
    case CL_COMMAND_READ_BUFFER_RECT: {
      amd::BufferRect hostbufferRect;
      // Shift the host rectangle by the offset of the pointer inside the found allocation
      amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset);
      hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_,
                            vcmd.hostRect().slicePitch_);
      if (hostMemory != nullptr) {
        result = blitMgr().copyBufferRect(*memory, *hostMemory, vcmd.bufRect(), hostbufferRect,
                                          vcmd.size(), vcmd.isEntireMemory(), vcmd.copyMetadata());
      } else {
        result =
            blitMgr().readBufferRect(*memory, vcmd.destination(), vcmd.bufRect(), vcmd.hostRect(),
                                     vcmd.size(), vcmd.isEntireMemory(), vcmd.copyMetadata());
      }
    } break;
    case CL_COMMAND_READ_IMAGE:
      if (memory->memoryType() == Resource::ImageBuffer) {
        Image* imageBuffer = static_cast<Image*>(memory);
        // Check if synchronization has to be performed
        if (nullptr != imageBuffer->CopyImageBuffer()) {
          memory = imageBuffer->CopyImageBuffer();
          Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
          amd::Image* image = imageBuffer->owner()->asImage();
          amd::Coord3D offs(0);
          // Copy memory from the original image buffer into the backing store image
          result = blitMgr().copyBufferToImage(*buffer, *imageBuffer->CopyImageBuffer(), offs, offs,
                                               image->getRegion(), true, image->getRowPitch(),
                                               image->getSlicePitch(), vcmd.copyMetadata());
        }
      }
      if (hostMemory != nullptr) {
        // Accelerated image to buffer transfer without pinning
        amd::Coord3D dstOrigin(offset);
        result = blitMgr().copyImageToBuffer(*memory, *hostMemory, vcmd.origin(), dstOrigin,
                                             vcmd.size(), vcmd.isEntireMemory(), vcmd.rowPitch(),
                                             vcmd.slicePitch(), vcmd.copyMetadata());
      } else {
        result = blitMgr().readImage(*memory, vcmd.destination(), vcmd.origin(), vcmd.size(),
                                     vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory(),
                                     vcmd.copyMetadata());
      }
      break;
    default:
      LogError("Unsupported type for the read command");
      break;
  }

  if (!result) {
    LogError("submitReadMemory failed!");
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(vcmd);
}
//! Executes a write command: transfers data from the command's host source pointer into the
//! destination memory object.
//!
//! If the source pointer resolves (through findMemoryFromVA) to a known allocation, a copy
//! between the two memory objects is issued; for plain buffer writes that shortcut is only taken
//! above settings().prepinnedMinSize_. Otherwise the blit manager's write path is used.
//! A WRITE_IMAGE on a CL_MEM_OBJECT_IMAGE1D_BUFFER object is redirected to a buffer write.
//! On success the destination is marked as written by this device; on failure the command
//! status is set to CL_INVALID_OPERATION.
//!
//! @param vcmd The write command to execute.
void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  // Translate memory references and ensure cache up to date
  pal::Memory* memory = dev().getGpuMemory(&vcmd.destination());
  size_t offset = 0;
  // Find if virtual address is a CL allocation
  device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset);

  profilingBegin(vcmd);

  bool entire = vcmd.isEntireMemory();

  // Synchronize memory from host if necessary
  device::Memory::SyncFlags syncFlags;
  syncFlags.skipEntire_ = entire;
  memory->syncCacheFromHost(*this, syncFlags);

  cl_command_type type = vcmd.type();
  bool result = false;
  amd::Memory* bufferFromImage = nullptr;

  // Force buffer write for IMAGE1D_BUFFER
  if ((type == CL_COMMAND_WRITE_IMAGE) &&
      (vcmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
    bufferFromImage = createBufferFromImage(vcmd.destination());
    if (nullptr == bufferFromImage) {
      LogError("We should not fail buffer creation from image_buffer!");
    } else {
      type = CL_COMMAND_WRITE_BUFFER;
      memory = dev().getGpuMemory(bufferFromImage);
    }
  }

  // Process different write commands
  switch (type) {
    case CL_COMMAND_WRITE_BUFFER: {
      amd::Coord3D origin(vcmd.origin()[0]);
      amd::Coord3D size(vcmd.size()[0]);
      if (nullptr != bufferFromImage) {
        // The command is expressed in image elements; convert to bytes for the buffer path
        size_t elemSize = vcmd.destination().asImage()->getImageFormat().getElementSize();
        origin.c[0] *= elemSize;
        size.c[0] *= elemSize;
      }
      if ((hostMemory != nullptr) && (vcmd.size()[0] > dev().settings().prepinnedMinSize_)) {
        // Accelerated transfer without pinning
        amd::Coord3D srcOrigin(offset);
        result = blitMgr().copyBuffer(*hostMemory, *memory, srcOrigin, origin, size,
                                      vcmd.isEntireMemory(), vcmd.copyMetadata());
      } else {
        // The logic below will perform 2 step copy to make sure memory pinning doesn't
        // occur on the first unaligned page, because in Windows memory manager can
        // have CPU access to the allocation header in another thread
        // and a race condition is possible.
        const char* tmpHost =
            amd::alignUp(reinterpret_cast<const char*>(vcmd.source()), PinnedMemoryAlignment);

        // Find the partial size for unaligned copy
        size_t partial = tmpHost - reinterpret_cast<const char*>(vcmd.source());

        result = true;
        // Check if it's staging copy, then ignore unaligned address.
        // The first step must also never exceed the requested size: without the clamp a request
        // shorter than the distance to the next aligned address would read past the host source
        if ((size[0] <= dev().settings().pinnedMinXferSize_) || (partial > size[0])) {
          partial = size[0];
        }
        // Make first step transfer
        if (partial > 0) {
          result = blitMgr().writeBuffer(vcmd.source(), *memory, origin, partial, false,
                                         vcmd.copyMetadata());
        }
        // Second step transfer if something left to copy
        if (partial < size[0]) {
          result &= blitMgr().writeBuffer(tmpHost, *memory, origin[0] + partial, size[0] - partial,
                                          false, vcmd.copyMetadata());
        }
      }
      if (nullptr != bufferFromImage) {
        bufferFromImage->release();
      }
    } break;
    case CL_COMMAND_WRITE_BUFFER_RECT: {
      amd::BufferRect hostbufferRect;
      // Shift the host rectangle by the offset of the pointer inside the found allocation
      amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset);
      hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_,
                            vcmd.hostRect().slicePitch_);
      if (hostMemory != nullptr) {
        result = blitMgr().copyBufferRect(*hostMemory, *memory, hostbufferRect, vcmd.bufRect(),
                                          vcmd.size(), vcmd.isEntireMemory(), vcmd.copyMetadata());
      } else {
        result = blitMgr().writeBufferRect(vcmd.source(), *memory, vcmd.hostRect(), vcmd.bufRect(),
                                           vcmd.size(), vcmd.isEntireMemory(), vcmd.copyMetadata());
      }
    } break;
    case CL_COMMAND_WRITE_IMAGE:
      if (hostMemory != nullptr) {
        // Accelerated buffer to image transfer without pinning
        amd::Coord3D srcOrigin(offset);
        result = blitMgr().copyBufferToImage(*hostMemory, *memory, srcOrigin, vcmd.origin(),
                                             vcmd.size(), vcmd.isEntireMemory(), vcmd.rowPitch(),
                                             vcmd.slicePitch(), vcmd.copyMetadata());
      } else {
        result = blitMgr().writeImage(vcmd.source(), *memory, vcmd.origin(), vcmd.size(),
                                      vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory(),
                                      vcmd.copyMetadata());
      }
      break;
    default:
      LogError("Unsupported type for the write command");
      break;
  }

  if (!result) {
    LogError("submitWriteMemory failed!");
    vcmd.setStatus(CL_INVALID_OPERATION);
  } else {
    // Mark this as the most-recently written cache of the destination
    vcmd.destination().signalWrite(&gpuDevice_);
  }
  profilingEnd(vcmd);
}
//! Copies data between two memory objects on this virtual GPU.
//!
//! The effective operation is selected by getCopyCommandType() from the requested command type
//! and the types of both objects. CL_MEM_OBJECT_IMAGE1D_BUFFER objects are accessed through a
//! temporary buffer object, with origins and sizes scaled by the image element size.
//!
//! @param type          Requested command type.
//! @param srcMem        Source memory object.
//! @param dstMem        Destination memory object.
//! @param entire        Passed to the blit manager and used to skip the destination sync.
//! @param srcOrigin     Origin inside the source.
//! @param dstOrigin     Origin inside the destination.
//! @param size          Size of the region to copy.
//! @param srcRect       Source rectangle (rect copies and buffer-to-image pitches).
//! @param dstRect       Destination rectangle (rect copies and image-to-buffer pitches).
//! @param copyMetadata  Forwarded unchanged to the blit manager.
//! @return true if the copy was issued successfully; false otherwise.
bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memory& dstMem,
                            bool entire, const amd::Coord3D& srcOrigin,
                            const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
                            const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
                            amd::CopyMetadata copyMetadata) {
  // Resolve the device objects for both sides of the copy
  pal::Memory* gpuDst = dev().getGpuMemory(&dstMem);
  pal::Memory* gpuSrc = dev().getGpuMemory(&srcMem);
  if ((nullptr == gpuDst) || (nullptr == gpuSrc)) {
    LogError("submitcopyMemory Failed!");
    return false;
  }

  // Bring both device copies up to date with the host
  device::Memory::SyncFlags dstSync;
  dstSync.skipEntire_ = entire;
  gpuDst->syncCacheFromHost(*this, dstSync);
  gpuSrc->syncCacheFromHost(*this);

  // IMAGE1D_BUFFER objects are read through a buffer object
  amd::Memory* srcAsBuffer = nullptr;
  if (CL_MEM_OBJECT_IMAGE1D_BUFFER == srcMem.getType()) {
    srcAsBuffer = createBufferFromImage(srcMem);
    if (nullptr != srcAsBuffer) {
      gpuSrc = dev().getGpuMemory(srcAsBuffer);
    } else {
      LogError("We should not fail buffer creation from image_buffer!");
    }
  }

  // ... and written through a buffer object as well
  amd::Memory* dstAsBuffer = nullptr;
  if (CL_MEM_OBJECT_IMAGE1D_BUFFER == dstMem.getType()) {
    dstAsBuffer = createBufferFromImage(dstMem);
    if (nullptr != dstAsBuffer) {
      gpuDst = dev().getGpuMemory(dstAsBuffer);
    } else {
      LogError("We should not fail buffer creation from image_buffer!");
    }
  }

  const cl_command_type copyType = getCopyCommandType(type, srcMem.getType(), dstMem.getType());
  bool copied = false;

  // Dispatch to the matching blit operation
  switch (copyType) {
    case CL_COMMAND_MAKE_BUFFERS_RESIDENT_AMD:
    case CL_COMMAND_SVM_MEMCPY:
    case CL_COMMAND_COPY_BUFFER: {
      amd::Coord3D from(srcOrigin[0]);
      amd::Coord3D to(dstOrigin[0]);
      amd::Coord3D extent(size.c[0], size.c[1], size.c[2]);
      if ((nullptr != srcAsBuffer) || (nullptr != dstAsBuffer)) {
        // The source image format takes precedence when both sides are image buffers
        const size_t elemSize = (nullptr != srcAsBuffer)
            ? srcMem.asImage()->getImageFormat().getElementSize()
            : dstMem.asImage()->getImageFormat().getElementSize();
        if (nullptr != srcAsBuffer) {
          from.c[0] *= elemSize;
        }
        if (nullptr != dstAsBuffer) {
          to.c[0] *= elemSize;
        }
        extent.c[0] *= elemSize;
      }
      copied = blitMgr().copyBuffer(*gpuSrc, *gpuDst, from, to, extent, entire, copyMetadata);
      break;
    }
    case CL_COMMAND_COPY_BUFFER_RECT: {
      copied = blitMgr().copyBufferRect(*gpuSrc, *gpuDst, srcRect, dstRect, size, entire,
                                        copyMetadata);
      break;
    }
    case CL_COMMAND_COPY_IMAGE_TO_BUFFER: {
      amd::Coord3D to(dstOrigin);
      if (nullptr != dstAsBuffer) {
        to.c[0] *= dstMem.asImage()->getImageFormat().getElementSize();
      }
      copied = blitMgr().copyImageToBuffer(*gpuSrc, *gpuDst, srcOrigin, to, size, entire,
                                           dstRect.rowPitch_, dstRect.slicePitch_, copyMetadata);
      break;
    }
    case CL_COMMAND_COPY_BUFFER_TO_IMAGE: {
      amd::Coord3D from(srcOrigin);
      if (nullptr != srcAsBuffer) {
        from.c[0] *= srcMem.asImage()->getImageFormat().getElementSize();
      }
      copied = blitMgr().copyBufferToImage(*gpuSrc, *gpuDst, from, dstOrigin, size, entire,
                                           srcRect.rowPitch_, srcRect.slicePitch_, copyMetadata);
      break;
    }
    case CL_COMMAND_COPY_IMAGE: {
      copied = blitMgr().copyImage(*gpuSrc, *gpuDst, srcOrigin, dstOrigin, size, entire,
                                   copyMetadata);
      break;
    }
    default: {
      LogError("Unsupported command type for memory copy!");
      break;
    }
  }

  // Drop the temporary buffer objects, source first
  if (nullptr != srcAsBuffer) {
    srcAsBuffer->release();
  }
  if (nullptr != dstAsBuffer) {
    dstAsBuffer->release();
  }

  if (copied) {
    // Mark this as the most-recently written cache of the destination
    dstMem.signalWrite(&gpuDevice_);
    return true;
  }

  LogError("submitCopyMemory failed!");
  return false;
}
//! Executes a copy command by forwarding its parameters to copyMemory().
//! The command status is set to CL_INVALID_OPERATION if the copy fails.
//!
//! @param vcmd The copy command to execute.
void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) {
  // Hold the execution lock for the whole operation
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  const cl_command_type cmdType = vcmd.type();
  const bool wholeObject = vcmd.isEntireMemory();
  const bool copied =
      copyMemory(cmdType, vcmd.source(), vcmd.destination(), wholeObject, vcmd.srcOrigin(),
                 vcmd.dstOrigin(), vcmd.size(), vcmd.srcRect(), vcmd.dstRect(),
                 vcmd.copyMetadata());
  if (!copied) {
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  profilingEnd(vcmd);
}
//! Executes an SVM memcpy command.
//!
//! On devices that report isFineGrainedSystem() the copy is delegated to
//! amd::SvmBuffer::memFill(). Otherwise both pointers are looked up in amd::MemObjMap and the
//! copy is performed as a host memcpy (neither pointer found), a buffer write (only the
//! destination found), a buffer read (only the source found) or a device copy (both found).
//! If a found object rejects the requested region, or the transfer fails, the command status is
//! set to CL_INVALID_OPERATION.
//!
//! profilingEnd() is always reached, so every profilingBegin() has a matching end even when
//! region validation fails.
//!
//! @param vcmd The SVM copy command to execute.
void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);

  cl_command_type type = vcmd.type();
  // no op for FGS supported device
  if (!dev().isFineGrainedSystem()) {
    amd::Coord3D srcOrigin(0, 0, 0);
    amd::Coord3D dstOrigin(0, 0, 0);
    amd::Coord3D size(vcmd.srcSize(), 1, 1);
    amd::BufferRect srcRect;
    amd::BufferRect dstRect;

    bool result = false;
    // Cleared as soon as one of the SVM objects rejects the requested region
    bool regionsValid = true;
    amd::Memory* srcMem = amd::MemObjMap::FindMemObj(vcmd.src());
    amd::Memory* dstMem = amd::MemObjMap::FindMemObj(vcmd.dst());

    device::Memory::SyncFlags syncFlags;
    if (nullptr != srcMem) {
      srcMem->commitSvmMemory();
      // Offset of the source pointer inside its SVM allocation
      srcOrigin.c[0] =
          static_cast<const_address>(vcmd.src()) - static_cast<address>(srcMem->getSvmPtr());
      if (!(srcMem->validateRegion(srcOrigin, size))) {
        regionsValid = false;
      }
    }
    // The destination is only touched when the source check passed
    if (regionsValid && (nullptr != dstMem)) {
      dstMem->commitSvmMemory();
      // Offset of the destination pointer inside its SVM allocation
      dstOrigin.c[0] =
          static_cast<const_address>(vcmd.dst()) - static_cast<address>(dstMem->getSvmPtr());
      if (!(dstMem->validateRegion(dstOrigin, size))) {
        regionsValid = false;
      }
    }

    if (regionsValid) {
      if (nullptr == srcMem && nullptr == dstMem) {  // both not in svm space
        std::memcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize());
        result = true;
      } else if (nullptr == srcMem && nullptr != dstMem) {  // src not in svm space
        Memory* memory = dev().getGpuMemory(dstMem);
        // Synchronize source and destination memory
        syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size);
        memory->syncCacheFromHost(*this, syncFlags);

        result = blitMgr().writeBuffer(vcmd.src(), *memory, dstOrigin, size,
                                       dstMem->isEntirelyCovered(dstOrigin, size));
        // Mark this as the most-recently written cache of the destination
        dstMem->signalWrite(&gpuDevice_);
      } else if (nullptr != srcMem && nullptr == dstMem) {  // dst not in svm space
        Memory* memory = dev().getGpuMemory(srcMem);
        // Synchronize source and destination memory
        memory->syncCacheFromHost(*this);

        result = blitMgr().readBuffer(*memory, vcmd.dst(), srcOrigin, size,
                                      srcMem->isEntirelyCovered(srcOrigin, size));
      } else if (nullptr != srcMem && nullptr != dstMem) {  // both in svm space
        bool entire =
            srcMem->isEntirelyCovered(srcOrigin, size) && dstMem->isEntirelyCovered(dstOrigin, size);
        result =
            copyMemory(type, *srcMem, *dstMem, entire, srcOrigin, dstOrigin, size, srcRect, dstRect);
      }
    }

    // Covers both a failed transfer and a rejected region (result is still false in that case)
    if (!result) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
  } else {
    // direct memcpy for FGS enabled system
    amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1);
  }
  profilingEnd(vcmd);
}
//! Executes a map command.
//!
//! The map parameters are recorded on the device memory object first, so the unmap can find
//! them. Then one of the following happens:
//!  - direct map of host memory: wait for the GPU if needed, write the owner's cache back and
//!    register the memory in the VA cache;
//!  - persistent mapping: nothing to do;
//!  - separate map target: for maps with CL_MAP_READ or CL_MAP_WRITE, copy the mapped region
//!    into the map target. A failed copy sets the command status to CL_MAP_FAILURE;
//!  - otherwise only an error is logged.
//!
//! @param vcmd The map command to execute.
void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) {
  // Hold the execution lock for the whole operation
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  pal::Memory* gpuMem = dev().getGpuMemory(&vcmd.memory());

  // Remember the map parameters for the matching unmap
  gpuMem->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(),
                      vcmd.isEntireMemory());

  if ((gpuMem->owner()->getHostMem() != nullptr) && gpuMem->isDirectMap()) {
    // Host memory is the map target
    if (!gpuMem->isHostMemDirectAccess()) {
      // The GPU has to be idle on this memory before the backing store is synchronized
      gpuMem->wait(*this);
    }

    // The backing store is the target, so only the owner has to be up-to-date
    gpuMem->owner()->cacheWriteBack(this);

    // Register the memory in the VA cache, so runtime can detect direct access to VA
    dev().addVACache(gpuMem);
  } else if (gpuMem->isPersistentMapped()) {
    // Nothing to do here
  } else if (gpuMem->mapMemory() == nullptr) {
    LogError("Unhandled map!");
  } else {
    // The target is a remote resource and requires a copy
    assert(gpuMem->mapMemory() != nullptr);
    const bool needsCopy = (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) != 0;
    if (needsCopy) {
      amd::Coord3D zeroOrigin(0, 0, 0);
      if (gpuMem->desc().buffer_) {
        // Buffers keep the same origin in the map target
        const bool copied =
            blitMgr().copyBuffer(*gpuMem, *gpuMem->mapMemory(), vcmd.origin(), vcmd.origin(),
                                 vcmd.size(), vcmd.isEntireMemory());
        if (!copied) {
          LogError("submitMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
      } else if (vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
        // IMAGE1D_BUFFER is copied as a buffer, with element units converted to bytes
        Memory* linearMem = gpuMem;
        amd::Coord3D byteOrigin(vcmd.origin()[0]);
        amd::Coord3D byteSize(vcmd.size()[0]);
        const size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
        byteOrigin.c[0] *= elemSize;
        byteSize.c[0] *= elemSize;

        amd::Memory* imageAsBuffer = createBufferFromImage(vcmd.memory());
        if (nullptr != imageAsBuffer) {
          linearMem = dev().getGpuMemory(imageAsBuffer);
        } else {
          LogError("We should not fail buffer creation from image_buffer!");
        }

        const bool copied = blitMgr().copyBuffer(*linearMem, *gpuMem->mapMemory(), byteOrigin,
                                                 zeroOrigin, byteSize, vcmd.isEntireMemory());
        if (!copied) {
          LogError("submitMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
        if (nullptr != imageAsBuffer) {
          imageAsBuffer->release();
        }
      } else {
        // A view with a parent may be a map of a single mip level
        if (vcmd.memory().parent() != nullptr) {
          amd::Image* parentImage = vcmd.memory().parent()->asImage();
          if ((parentImage != nullptr) && (parentImage->getMipLevels() > 1)) {
            // Record the map in the parent object as well, together with the view
            dev().getGpuMemory(parentImage)
                ->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(),
                              vcmd.isEntireMemory(), vcmd.memory().asImage());
          }
        }
        const bool copied =
            blitMgr().copyImageToBuffer(*gpuMem, *gpuMem->mapMemory(), vcmd.origin(), zeroOrigin,
                                        vcmd.size(), vcmd.isEntireMemory());
        if (!copied) {
          LogError("submitMapMemory() - copy failed");
          vcmd.setStatus(CL_MAP_FAILURE);
        }
      }
    }
  }

  profilingEnd(vcmd);
}
//! Executes an unmap command.
//!
//! Looks up the map info that was saved for vcmd.mapPtr() and, depending on how the memory was
//! mapped, either synchronizes the device copy from host memory, unmaps a persistent mapping, or
//! copies the map target back into the memory object (only for maps flagged as unmap-write).
//! If the map belonged to a mip level view of a mipmapped image, that view is released once the
//! execution lock has been dropped.
//!
//! @param vcmd The unmap command to execute.
void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) {
// Set when the map was made through a mip level view that has to be released at the end
bool unmapMip = false;
// Only read after the locked scope when unmapMip is true; it is assigned before that flag is set
amd::Image* amdImage;
{
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
pal::Memory* memory = dev().getGpuMemory(&vcmd.memory());
amd::Memory* owner = memory->owner();
const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
if (nullptr == writeMapInfo) {
// No map info for this pointer: bail out before profiling starts, the status is left untouched
LogError("Unmap without map call");
return;
}
profilingBegin(vcmd);
// Check if image is a mipmap and assign a saved view
amdImage = owner->asImage();
if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) &&
(writeMapInfo->baseMip_ != nullptr)) {
// Assign mip level view
amdImage = writeMapInfo->baseMip_;
// Clear unmap flags from the parent image
memory->clearUnmapInfo(vcmd.mapPtr());
// From here on operate on the view's device memory and its own map info
memory = dev().getGpuMemory(amdImage);
unmapMip = true;
writeMapInfo = memory->writeMapInfo(vcmd.mapPtr());
}
// We used host memory
if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) {
if (writeMapInfo->isUnmapWrite()) {
// Target is the backing store, so sync
owner->signalWrite(nullptr);
memory->syncCacheFromHost(*this);
}
// Remove memory from VA cache
dev().removeVACache(memory);
}
// data check was added for persistent memory that failed to get aperture
// and therefore are treated like a remote resource
else if (memory->isPersistentMapped()) {
// Map/unmap must be serialized
amd::ScopedLock lock(owner->lockMemoryOps());
memory->unmap(this);
// Drop the persistent flag once the last outstanding map is gone
if (memory->getMapCount() == 0) {
memory->setPersistentMapFlag(false);
}
} else if (memory->mapMemory() != nullptr) {
// The copy back is needed only if the map was flagged as unmap-write
if (writeMapInfo->isUnmapWrite()) {
amd::Coord3D srcOrigin(0, 0, 0);
// Target is a remote resource, so copy
assert(memory->mapMemory() != nullptr);
if (memory->desc().buffer_) {
// Buffers use the same origin in the map target and in the memory object
if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
writeMapInfo->origin_, writeMapInfo->region_,
writeMapInfo->isEntire())) {
LogError("submitUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
}
} else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
// IMAGE1D_BUFFER is copied as a buffer: scale origin and size by the element size
Memory* memoryBuf = memory;
amd::Coord3D origin(writeMapInfo->origin_[0]);
amd::Coord3D size(writeMapInfo->region_[0]);
size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory());
if (nullptr == bufferFromImage) {
// The copy below then falls back to the original device memory object
LogError("We should not fail buffer creation from image_buffer!");
} else {
memoryBuf = dev().getGpuMemory(bufferFromImage);
}
if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size,
writeMapInfo->isEntire())) {
LogError("submitUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
}
if (nullptr != bufferFromImage) {
bufferFromImage->release();
}
} else {
// Remaining image types: copy from the start of the map target into the mapped region
if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin,
writeMapInfo->origin_, writeMapInfo->region_,
writeMapInfo->isEntire())) {
LogError("submitUnmapMemory() - copy failed");
vcmd.setStatus(CL_OUT_OF_RESOURCES);
}
}
}
} else {
LogError("Unhandled unmap!");
vcmd.setStatus(CL_INVALID_VALUE);
}
// Clear unmap flags
memory->clearUnmapInfo(vcmd.mapPtr());
profilingEnd(vcmd);
}
// Release a view for a mipmap map
if (unmapMip) {
// Memory release should be outside of the execution lock,
// because mapMemory_ isn't marked for a specific GPU
amdImage->release();
}
}
//! Fills a region of a memory object with a pattern.
//!
//! A CL_COMMAND_FILL_IMAGE request on a CL_MEM_OBJECT_IMAGE1D_BUFFER object is executed as a
//! buffer fill: origin and size are scaled by the image element size and the pattern is
//! converted with the image format's formatColor().
//!
//! @param type         CL_COMMAND_FILL_BUFFER, CL_COMMAND_SVM_MEMFILL or CL_COMMAND_FILL_IMAGE.
//! @param amdMemory    Memory object to fill.
//! @param pattern      Fill pattern.
//! @param patternSize  Size of the pattern in bytes.
//! @param origin       Origin of the region.
//! @param size         Size of the region.
//! @param forceBlit    Forwarded to the blit manager's fillBuffer().
//! @return true if the fill was issued successfully; false otherwise.
bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern,
                            size_t patternSize, const amd::Coord3D& origin,
                            const amd::Coord3D& size, bool forceBlit) {
  pal::Memory* gpuMem = dev().getGpuMemory(amdMemory);

  // Bring the device copy up to date; the sync is told when the whole object gets overwritten
  const bool coversAll = amdMemory->isEntirelyCovered(origin, size);
  device::Memory::SyncFlags hostSync;
  hostSync.skipEntire_ = coversAll;
  gpuMem->syncCacheFromHost(*this, hostSync);

  bool filled = false;
  amd::Memory* imageAsBuffer = nullptr;
  float texel[4];

  // IMAGE1D_BUFFER objects are filled through a buffer object
  const bool isImageBufferFill =
      (type == CL_COMMAND_FILL_IMAGE) && (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER);
  if (isImageBufferFill) {
    imageAsBuffer = createBufferFromImage(*amdMemory);
    if (nullptr != imageAsBuffer) {
      type = CL_COMMAND_FILL_BUFFER;
      gpuMem = dev().getGpuMemory(imageAsBuffer);
    } else {
      LogError("We should not fail buffer creation from image_buffer!");
    }
  }

  // Pick the fill operation that matches the (possibly redirected) command type
  if ((type == CL_COMMAND_FILL_BUFFER) || (type == CL_COMMAND_SVM_MEMFILL)) {
    amd::Coord3D fillOrigin(origin[0]);
    amd::Coord3D fillSize(size[0]);
    if (nullptr != imageAsBuffer) {
      // Translate element units into bytes and the color into the image's storage format
      const size_t elemSize = amdMemory->asImage()->getImageFormat().getElementSize();
      fillOrigin.c[0] *= elemSize;
      fillSize.c[0] *= elemSize;
      memset(texel, 0, sizeof(texel));
      amdMemory->asImage()->getImageFormat().formatColor(pattern, texel);
      pattern = texel;
      patternSize = elemSize;
    }
    filled = blitMgr().fillBuffer(*gpuMem, pattern, patternSize, fillSize, fillOrigin, fillSize,
                                  amdMemory->isEntirelyCovered(origin, size), forceBlit);
    if (nullptr != imageAsBuffer) {
      imageAsBuffer->release();
    }
  } else if (type == CL_COMMAND_FILL_IMAGE) {
    filled = blitMgr().fillImage(*gpuMem, pattern, origin, size,
                                 amdMemory->isEntirelyCovered(origin, size));
  } else {
    LogError("Unsupported command type for FillMemory!");
  }

  if (filled) {
    // Mark this as the most-recently written cache of the destination
    amdMemory->signalWrite(&gpuDevice_);
    return true;
  }

  LogError("fillMemory failed!");
  return false;
}
//! Executes a fill command.
//!
//! Image fills are forwarded to fillMemory() in a single call. Every other fill is issued as one
//! fillMemory() call per row of the command's region, with the row offsets taken from a
//! BufferRect built from the command's origin and surface. Any failing call sets the command
//! status to CL_INVALID_OPERATION; the remaining rows are still processed.
//!
//! @param cmd The fill command to execute.
void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
  // Hold the execution lock for the whole operation
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  if (cmd.type() != CL_COMMAND_FILL_IMAGE) {
    const size_t extentX = cmd.size().c[0];
    const size_t extentY = cmd.size().c[1];
    const size_t extentZ = cmd.size().c[2];
    const size_t rowPitch = cmd.surface().c[0];

    amd::Coord3D rectOrigin = cmd.origin();
    amd::Coord3D rectRegion{cmd.surface().c[1], cmd.surface().c[2], extentZ};
    amd::BufferRect layout;
    layout.create(static_cast<size_t*>(rectOrigin), static_cast<size_t*>(rectRegion), rowPitch, 0);

    // In case of HMM, use blit kernel instead of CPU memcpy
    constexpr uint32_t kManagedAlloc = (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR);
    const bool useBlit =
        amd::IS_HIP && ((cmd.memory().getMemFlags() & kManagedAlloc) == kManagedAlloc);

    // One fill per row, slice by slice
    for (size_t z = 0; z < extentZ; ++z) {
      for (size_t y = 0; y < extentY; ++y) {
        const size_t byteOffset = layout.offset(0, y, z);
        const bool rowFilled =
            fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(),
                       amd::Coord3D{byteOffset, 0, 0}, amd::Coord3D{extentX, 1, 1}, useBlit);
        if (!rowFilled) {
          cmd.setStatus(CL_INVALID_OPERATION);
        }
      }
    }
  } else {
    const bool imageFilled = fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(),
                                        cmd.patternSize(), cmd.origin(), cmd.size());
    if (!imageFilled) {
      cmd.setStatus(CL_INVALID_OPERATION);
    }
  }

  profilingEnd(cmd);
}
// Executes a peer-to-peer copy command.
//
// If this device has a device object for both the source and the destination, the copy is
// issued directly through the blit manager. Otherwise the data is bounced through the P2P
// staging buffer, in pieces of at most Device::kP2PStagingSize bytes, using the transfer
// managers/queues of the devices that own the source and destination objects.
// Only CL_COMMAND_COPY_BUFFER and CL_COMMAND_COPY_BUFFER_RECT are handled; image copy types are
// logged as unsupported. Any failure sets CL_OUT_OF_RESOURCES on the command.
void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  // Get the device memory objects for the current device
  Memory* srcDevMem = dev().getGpuMemory(&cmd.source());
  Memory* dstDevMem = dev().getGpuMemory(&cmd.destination());
  bool p2pAllowed = true;

  // If any device object is null, then no HW P2P and runtime has to use staging.
  // The two objects are checked independently, so a missing source can't hide a missing
  // destination and leave dstDevMem null for the code below.
  if (srcDevMem == nullptr) {
    srcDevMem = static_cast<pal::Memory*>(
        cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0]));
    p2pAllowed = false;
  }
  if (dstDevMem == nullptr) {
    dstDevMem = static_cast<pal::Memory*>(
        cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0]));
    p2pAllowed = false;
  }

  amd::Coord3D size = cmd.size();

  bool result = false;
  switch (cmd.type()) {
    case CL_COMMAND_COPY_BUFFER: {
      amd::Coord3D srcOrigin(cmd.srcOrigin()[0]);
      amd::Coord3D dstOrigin(cmd.dstOrigin()[0]);

      if (p2pAllowed) {
        result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size,
                                      cmd.isEntireMemory());
      } else {
        // The staging buffer is guarded by its own lock
        amd::ScopedLock stageLock(dev().P2PStageOps());
        // Device objects of the staging buffer as seen from the source and destination contexts
        Memory* dstStgMem = static_cast<pal::Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
        Memory* srcStgMem = static_cast<pal::Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));

        size_t copy_size = Device::kP2PStagingSize;
        size_t left_size = size[0];
        amd::Coord3D stageOffset(0);
        result = true;
        do {
          // The last piece can be smaller than the staging buffer
          if (left_size <= copy_size) {
            copy_size = left_size;
          }
          left_size -= copy_size;
          amd::Coord3D cpSize(copy_size);

          // Perform 2 step transfer with staging buffer
          result &= srcDevMem->dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin,
                                                          stageOffset, cpSize);
          srcOrigin.c[0] += copy_size;
          result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset,
                                                          dstOrigin, cpSize);
          dstOrigin.c[0] += copy_size;
        } while (left_size > 0);
      }
      break;
    }
    case CL_COMMAND_COPY_BUFFER_RECT: {
      if (p2pAllowed) {
        result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(),
                                          size, cmd.isEntireMemory(), cmd.copyMetadata());
      } else {
        // The staging buffer is guarded by its own lock
        amd::ScopedLock stageLock(dev().P2PStageOps());
        Memory* dstStgMem = static_cast<pal::Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0]));
        Memory* srcStgMem = static_cast<pal::Memory*>(
            dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0]));

        if ((cmd.srcRect().slicePitch_ * size[2]) <= Device::kP2PStagingSize) {
          // The whole source rectangle fits into the staging buffer
          result = true;
          // Perform 2 step transfer with staging buffer
          result &= srcDevMem->dev().xferMgr().copyBufferRect(*srcDevMem, *dstStgMem,
                                                              cmd.srcRect(), cmd.srcRect(), size,
                                                              false, cmd.copyMetadata());
          result &= dstDevMem->dev().xferMgr().copyBufferRect(*srcStgMem, *dstDevMem,
                                                              cmd.srcRect(), cmd.dstRect(), size,
                                                              false, cmd.copyMetadata());
        } else {
          // Too big for a single staged transfer: copy row by row, each row in staged pieces
          size_t srcOffset;
          size_t dstOffset;
          result = true;

          for (size_t z = 0; z < size[2]; ++z) {
            for (size_t y = 0; y < size[1]; ++y) {
              srcOffset = cmd.srcRect().offset(0, y, z);
              dstOffset = cmd.dstRect().offset(0, y, z);

              amd::Coord3D srcOrigin(srcOffset);
              amd::Coord3D dstOrigin(dstOffset);
              size_t copy_size = Device::kP2PStagingSize;
              size_t left_size = size[0];
              amd::Coord3D stageOffset(0);

              do {
                if (left_size <= copy_size) {
                  copy_size = left_size;
                }
                left_size -= copy_size;

                // Perform 2 step transfer with staging buffer
                result &= srcDevMem->partialMemCopyTo(*(srcDevMem->dev().xferQueue()), srcOrigin,
                                                      stageOffset, copy_size, *dstStgMem);
                srcDevMem->dev().xferQueue()->waitAllEngines();
                result &= srcStgMem->partialMemCopyTo(*(dstDevMem->dev().xferQueue()),
                                                      stageOffset, dstOrigin, copy_size,
                                                      *dstDevMem);
                srcStgMem->dev().xferQueue()->waitAllEngines();
                srcOrigin.c[0] += copy_size;
                dstOrigin.c[0] += copy_size;
              } while (left_size > 0);
            }
          }
        }
      }
      break;
    }
    case CL_COMMAND_COPY_IMAGE:
    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
      // result stays false, so the command is reported as failed below
      LogError("Unsupported P2P type!");
      break;
    default:
      ShouldNotReachHere();
      break;
  }

  if (!result) {
    LogError("submitCopyMemoryP2P failed!");
    cmd.setStatus(CL_OUT_OF_RESOURCES);
  }

  cmd.destination().signalWrite(&dstDevMem->dev());

  profilingEnd(cmd);
}
// Handles an SVM map command. Nothing is done on fine-grained-system devices. Otherwise the map
// information is recorded on the device memory object and the host-visible copy is refreshed:
// either by copying into the separate map memory, or by writing the owner's cache back when the
// object is directly mapped onto host memory.
void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) {
  // Exclusive access to the queue resources for the duration of the command
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  // FGS capable devices don't need any work for a map
  if (dev().isFineGrainedSystem()) {
    profilingEnd(vcmd);
    return;
  }

  // Device memory that backs the SVM allocation
  pal::Memory* svmMem = dev().getGpuMemory(vcmd.getSvmMem());

  svmMem->saveMapInfo(vcmd.svmPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(),
                      vcmd.isEntireMemory());

  if (svmMem->mapMemory() != nullptr) {
    // A separate map resource exists, so it has to be filled from the device memory
    const bool needsCopy = (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) != 0;
    if (needsCopy) {
      assert(svmMem->desc().buffer_ && "SVM memory can't be an image");
      const bool copied = blitMgr().copyBuffer(*svmMem, *svmMem->mapMemory(), vcmd.origin(),
                                               vcmd.origin(), vcmd.size(),
                                               vcmd.isEntireMemory());
      if (!copied) {
        LogError("submitSVMMapMemory() - copy failed");
        vcmd.setStatus(CL_MAP_FAILURE);
      }
    }
  } else if ((svmMem->owner()->getHostMem() != nullptr) && svmMem->isDirectMap()) {
    // The GPU must be done with the memory before the backing store is synchronized,
    // unless the host memory is accessed directly
    if (!svmMem->isHostMemDirectAccess()) {
      svmMem->wait(*this);
    }

    // The backing store is the map target; bring the owner up-to-date
    svmMem->owner()->cacheWriteBack(this);
  } else {
    LogError("Unhandled svm map!");
  }

  profilingEnd(vcmd);
}
// Handles an SVM unmap command. Nothing is done on fine-grained-system devices. Otherwise, if the
// saved map info says the mapping was written, the data is propagated back to the device memory:
// by a copy from the separate map memory, or by a cache sync from host for directly mapped
// objects. The saved map info for the pointer is cleared afterwards.
void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  // no op for FGS supported device
  if (dev().isFineGrainedSystem()) {
    profilingEnd(vcmd);
    return;
  }

  pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem());
  // Map info saved for this pointer.
  // NOTE(review): the result is used without a null check - confirm writeMapInfo() can't
  // return nullptr for an unmap without a matching map.
  const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.svmPtr());

  if (memory->mapMemory() != nullptr) {
    if (writeMapInfo->isUnmapWrite()) {
      // Target is a remote resource, so copy
      assert(memory->desc().buffer_ && "SVM memory can't be an image");
      if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_,
                                writeMapInfo->origin_, writeMapInfo->region_,
                                writeMapInfo->isEntire())) {
        LogError("submitSvmUnmapMemory() - copy failed");
        vcmd.setStatus(CL_OUT_OF_RESOURCES);
      }
    }
  } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) {
    if (writeMapInfo->isUnmapWrite()) {
      // Target is the backing store, so sync
      memory->owner()->signalWrite(nullptr);
      memory->syncCacheFromHost(*this);
    }
  }

  memory->clearUnmapInfo(vcmd.svmPtr());

  profilingEnd(vcmd);
}
// Fills an SVM region with a repeated pattern. On fine-grained-system devices the fill is done
// directly on the CPU pointer; otherwise the owning memory object is looked up from the
// destination pointer and the fill is executed through fillMemory().
void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) {
  // Exclusive access to the queue resources for the duration of the command
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  if (dev().isFineGrainedSystem()) {
    // for FGS capable device, fill CPU memory directly
    amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times());
  } else {
    // Total amount of bytes to fill: the pattern repeated times() times
    const size_t bytesPerPattern = vcmd.patternSize();
    const size_t totalBytes = bytesPerPattern * vcmd.times();

    // Memory object that contains the destination pointer
    amd::Memory* target = amd::MemObjMap::FindMemObj(vcmd.dst());
    assert(target && "No svm Buffer to fill with!");

    // Byte offset of the destination pointer from the start of the SVM allocation
    const size_t byteOffset = reinterpret_cast<uintptr_t>(vcmd.dst()) -
        reinterpret_cast<uintptr_t>(target->getSvmPtr());

    // The returned object isn't used below; the call is kept as in the original code
    pal::Memory* memory = dev().getGpuMemory(target);

    amd::Coord3D fillOrigin(byteOffset, 0, 0);
    amd::Coord3D fillRegion(totalBytes, 1, 1);
    assert((target->validateRegion(fillOrigin, fillRegion)) && "The incorrect fill size!");

    const bool filled = fillMemory(vcmd.type(), target, vcmd.pattern(), vcmd.patternSize(),
                                   fillOrigin, fillRegion);
    if (!filled) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }
  }

  profilingEnd(vcmd);
}
// Processes a memory migration command for every memory object in the command's list.
// CL_MIGRATE_MEM_OBJECT_HOST triggers a cache write back, CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED
// triggers a cache sync from host; any other flag combination only produces a warning.
void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
  // Exclusive access to the queue resources for the duration of the command
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  // The flags are the same for all objects of the command
  const bool toHost = (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) != 0;
  const bool contentUndefined =
      (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) != 0;

  for (const auto& memObj : vcmd.memObjects()) {
    // Device memory of the current object
    pal::Memory* devMem = dev().getGpuMemory(memObj);

    if (toHost) {
      devMem->mgpuCacheWriteBack(*this);
      continue;
    }

    if (contentUndefined) {
      // Synchronize memory from host if necessary.
      // The sync function will perform memory migration from
      // another device if necessary
      device::Memory::SyncFlags syncFlags;
      devMem->syncCacheFromHost(*this, syncFlags);
      continue;
    }

    LogWarning("Unknown operation for memory migration!");
  }

  profilingEnd(vcmd);
}
// Frees the SVM pointers listed in the command. Without a user callback every pointer is released
// through the device's svmFree(); otherwise the user callback receives the whole pointer list
// together with the command's queue and user data.
void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) {
  // in-order semantics: previous commands need to be done before we start
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  std::vector<void*>& svmPointers = vcmd.svmPointers();
  if (vcmd.pfnFreeFunc() == nullptr) {
    // pointers allocated using clSVMAlloc
    for (void* svmPtr : svmPointers) {
      dev().svmFree(svmPtr);
    }
  } else {
    // data() is valid for an empty list as well, while indexing element 0 isn't
    vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), svmPointers.size(),
                       svmPointers.data(), vcmd.userData());
  }

  profilingEnd(vcmd);
}
// Executes a stream wait-value or write-value operation on the command's memory object.
// Both operations are delegated to the kernel blit manager; a failure is only logged.
void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  const cl_command_type type = cmd.type();
  const uint64_t value = cmd.value();
  const uint64_t mask = cmd.mask();
  const unsigned int flags = cmd.flags();
  const size_t sizeBytes = cmd.sizeBytes();
  const size_t offset = cmd.offset();

  amd::Memory* amdMemory = &cmd.memory();
  Memory* memory = dev().getGpuMemory(amdMemory);

  if (type == ROCCLR_COMMAND_STREAM_WAIT_VALUE) {
    // Use a blit kernel to perform the wait operation
    // mask is applied on value before performing
    // the comparision defined by 'condition'
    bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWait(
        *memory, value, offset, sizeBytes, flags, mask);
    // The 64-bit values are printed with %llx and an explicit cast, and the 32-bit flags with
    // %x, so the format matches the arguments regardless of the width of 'long'
    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
            "Waiting for value: 0x%llx."
            " Flags: 0x%x mask: 0x%llx",
            static_cast<unsigned long long>(value), flags,
            static_cast<unsigned long long>(mask));
    if (!result) {
      LogError("submitStreamOperation: Wait failed!");
    }
  } else if (type == ROCCLR_COMMAND_STREAM_WRITE_VALUE) {
    bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWrite(*memory, value,
                                                                            offset, sizeBytes);
    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%llx",
            static_cast<unsigned long long>(value));
    if (!result) {
      LogError("submitStreamOperation: Write failed!");
    }
  } else {
    ShouldNotReachHere();
  }

  profilingEnd(cmd);
}
// ================================================================================================
// Maps physical memory into, or unmaps it from, a previously reserved virtual address range.
// A command with a memory object is a map request: a view object is created for the range and
// registered in the memory object map after the page remap succeeds. A command without a memory
// object is an unmap request: the queue is idled first and the view is removed from the map
// after the page remap succeeds.
void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
  // Exclusive access to the queue resources for the duration of the command
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  amd::Memory* physObj = vcmd.memory();
  amd::Memory* vaBaseObj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());

  // Nothing to do unless the pointer belongs to a reserved VA range
  const bool isVaRange =
      (vaBaseObj != nullptr) && ((vaBaseObj->getMemFlags() & CL_MEM_VA_RANGE_AMD) != 0);
  if (!isVaRange) {
    profilingEnd(vcmd);
    return;
  }

  const bool isMapRequest = (physObj != nullptr);

  // Create a view, since original base obj will map the whole memory and multimap cases wont work.
  amd::Memory* vaViewObj = nullptr;
  size_t vaOffset = 0;
  if (isMapRequest) {
    constexpr bool kParent = false;
    vaViewObj = physObj->getContext().devices()[0]->CreateVirtualBuffer(
        physObj->getContext(), const_cast<void*>(vcmd.ptr()), vcmd.size(),
        physObj->getUserData().deviceId, physObj->getUserData().locationType, kParent);
    // Calculate the offset from the original pointer.
    vaOffset = (reinterpret_cast<address>(vaViewObj->getSvmPtr()) -
                reinterpret_cast<address>(vaBaseObj->getSvmPtr()));
  }

  // The imem() in the backend is shared between base and sub/view object.
  pal::Memory* vaPalMem = dev().getGpuMemory(vaBaseObj);
  Pal::IGpuMemory* physIMem = nullptr;
  if (isMapRequest) {
    physIMem = dev().getGpuMemory(physObj)->iMem();
  }

  Pal::VirtualMemoryRemapRange range{vaPalMem->iMem(), vaOffset,
                                     physIMem,         0,
                                     vcmd.size(),      Pal::VirtualGpuMemAccessMode::NoAccess};

  // Wait for previous operations before unmap
  if (!isMapRequest) {
    // @note: Need to verify if compute requires a wait or IB flush is enough
    WaitForIdleCompute();
    WaitForIdleSdma();
  }

  eventBegin(MainEngine);
  auto result = queue(MainEngine).iQueue_->RemapVirtualMemoryPages(1, &range, false, nullptr);
  // Capture GPU event for the paging operation
  GpuEvent event;
  eventEnd(MainEngine, event);
  setGpuEvent(event);

  // The bookkeeping below is only valid if the remap went through
  if (result != Pal::Result::Success) {
    profilingEnd(vcmd);
    return;
  }

  if (isMapRequest) {
    // assert the vaddr_mem_obj wasn't mapped already
    assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr);
    amd::MemObjMap::AddMemObj(vcmd.ptr(), vaViewObj);
    // Link the view and the physical object to each other
    vaViewObj->getUserData().phys_mem_obj = physObj;
    physObj->getUserData().vaddr_mem_obj = vaViewObj;
  } else {
    // assert the vaddr_mem_obj is mapped and needs to be removed
    amd::Memory* mappedView = amd::MemObjMap::FindMemObj(vcmd.ptr());
    assert(mappedView != nullptr);
    assert(vcmd.ptr() == mappedView->getSvmPtr());
    amd::MemObjMap::RemoveMemObj(vcmd.ptr());
    // Break the link between the view and the physical object, if there is one
    if (mappedView->getUserData().phys_mem_obj != nullptr) {
      mappedView->getUserData().phys_mem_obj->getUserData().vaddr_mem_obj = nullptr;
      mappedView->getUserData().phys_mem_obj = nullptr;
    }
  }

  profilingEnd(vcmd);
}
// ================================================================================================
// Debug helper: prints the active AQL wrap slots of the device queue to stdout.
// At most GPU_PRINT_CHILD_KERNEL slots with a non-zero state are printed. For every printed slot
// the wrap fields, its wait events, the dispatch dimensions, the matching child kernel from the
// program of hsaKernel and the kernel arguments are dumped in hex.
void VirtualGPU::PrintChildren(const pal::Kernel& hsaKernel, VirtualGPU* gpuDefQueue) {
  // The AQL wraps are located right after the virtual queue header
  AmdAqlWrap* wraps = reinterpret_cast<AmdAqlWrap*>(
      reinterpret_cast<AmdVQueueHeader*>(gpuDefQueue->virtualQueue_->data()) + 1);
  uint p = 0;
  for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) {
    if (wraps[i].state != 0) {
      // Limit the number of printed slots
      if (p == GPU_PRINT_CHILD_KERNEL) {
        break;
      }
      p++;
      std::stringstream print;
      print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase);
      print << "Slot#: " << i << "\n";
      print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n";
      print << "\tcommand_id: " << wraps[i].command_id << "\n";
      print << "\tchild_counter: " << wraps[i].child_counter << "\n";
      print << "\tcompletion: " << wraps[i].completion << "\n";
      print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n";
      print << "\twait_list: " << wraps[i].wait_list << "\n";
      print << "\twait_num: " << wraps[i].wait_num << "\n";
      // GPU addresses are translated to CPU pointers through the offset from the queue VA
      uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress();
      size_t* events = reinterpret_cast<size_t*>(gpuDefQueue->virtualQueue_->data() + offsEvents);
      for (uint j = 0; j < wraps[i].wait_num; ++j) {
        uint offs = static_cast<uint64_t>(events[j]) - gpuDefQueue->virtualQueue_->vmAddress();
        AmdEvent* eventD = reinterpret_cast<AmdEvent*>(gpuDefQueue->virtualQueue_->data() + offs);
        print << "Wait Event#: " << j << "\n";
        print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n";
      }
      print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
      print << wraps[i].aql.workgroup_size_y << ", ";
      print << wraps[i].aql.workgroup_size_z << "]\n";
      print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
      print << wraps[i].aql.grid_size_y << ", ";
      print << wraps[i].aql.grid_size_z << "]\n";

      // Find the kernel of the program whose code address matches the dispatched kernel object
      pal::Kernel* child = nullptr;
      for (const auto& entry : hsaKernel.prog().kernels()) {
        pal::Kernel* candidate = static_cast<pal::Kernel*>(entry.second);
        if (wraps[i].aql.kernel_object == candidate->gpuAqlCode()) {
          child = candidate;
        }
      }
      if (child == nullptr) {
        // The text collected for this slot is dropped
        printf("Error: couldn't find child kernel!\n");
        continue;
      }
      const uint64_t kernarg_address =
          static_cast<uint64_t>(reinterpret_cast<uintptr_t>(wraps[i].aql.kernarg_address));
      uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress();
      address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
      print << "Kernel: " << child->name() << "\n";
      const amd::KernelSignature& signature = child->signature();

      // Check if runtime has to setup hidden arguments.
      // The descriptors are taken by reference to avoid a copy per parameter.
      for (const auto& it : signature.parameters()) {
        const char* extraArgName = nullptr;
        switch (it.info_.oclObject_) {
          case amd::KernelParameterDescriptor::HiddenNone:
            // void* zero = 0;
            // WriteAqlArgAt(const_cast<address>(parameters), zero, it.size_, it.offset_);
            break;
          case amd::KernelParameterDescriptor::HiddenGlobalOffsetX:
            extraArgName = "Offset0: ";
            break;
          case amd::KernelParameterDescriptor::HiddenGlobalOffsetY:
            extraArgName = "Offset1: ";
            break;
          case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ:
            extraArgName = "Offset2: ";
            break;
          case amd::KernelParameterDescriptor::HiddenPrintfBuffer:
            extraArgName = "PrintfBuf: ";
            break;
          case amd::KernelParameterDescriptor::HiddenDefaultQueue:
            extraArgName = "VqueuePtr: ";
            break;
          case amd::KernelParameterDescriptor::HiddenCompletionAction:
            extraArgName = "AqlWrap: ";
            break;
          default:
            break;
        }
        if (extraArgName) {
          // Hidden arguments are printed as a single size_t value
          print << "\t" << extraArgName << *reinterpret_cast<size_t*>(argum);
          print << "\n";
          argum += sizeof(size_t);
          continue;
        }
        print << "\t" << it.name_ << ": ";
        // Dump the argument bytes starting from the highest address
        for (int s = it.size_ - 1; s >= 0; --s) {
          print.width(2);
          print.fill('0');
          print << static_cast<uint32_t>(argum[s]);
        }
        // NOTE(review): the pointer is advanced by offset_ and not by size_ - confirm
        argum += it.offset_;
        print << "\n";
      }
      printf("%s", print.str().c_str());
    }
  }
}
// ================================================================================================
// Prepares the device queue for a kernel that uses device enqueue.
// Selects the VirtualGPU that will run the child kernels (returned in gpuDefQueue), returns the
// GPU address of its virtual queue in vmDefQueue, writes the virtual queue header, acquires the
// scheduler parameter buffer and adds both to the memory list of the dispatch.
// Returns false if there is no default device queue or if the device queue shares the HW ring
// with this queue.
bool VirtualGPU::PreDeviceEnqueue(const amd::Kernel& kernel, const pal::Kernel& hsaKernel,
                                  VirtualGPU** gpuDefQueue, uint64_t* vmDefQueue) {
  amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());
  if (defQueue == nullptr) {
    LogError("Default device queue wasn't allocated");
    return false;
  }

  if (!dev().settings().useDeviceQueue_) {
    // No dedicated device queue: the virtual queue lives on this queue
    createVirtualQueue(defQueue->size());
    *gpuDefQueue = this;
  } else {
    *gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
    if ((*gpuDefQueue)->hwRing() == hwRing()) {
      LogError("Can't submit the child kernels to the same HW ring as the host queue!");
      return false;
    }
  }

  VirtualGPU* const childQueue = *gpuDefQueue;

  *vmDefQueue = childQueue->virtualQueue_->vmAddress();

  childQueue->writeVQueueHeader(*this, hsaKernel.prog().kernelTable());

  // Acquire USWC memory for the scheduler parameters
  childQueue->schedParams_ = &xferWrite().Acquire(sizeof(SchedulerParam));

  // Add memory handles before the actual dispatch
  addVmMemory(childQueue->virtualQueue_);
  addVmMemory(childQueue->schedParams_);

  return true;
}
// ================================================================================================
// Runs the device enqueue scheduler after the parent kernel dispatch.
// Submits the scheduler on gpuDefQueue, programs the scheduler parameters that were acquired in
// PreDeviceEnqueue(), adds the termination handshake to the host queue and releases the scheduler
// parameter buffer. gpuEvent is updated with the events of the submitted commands.
void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const pal::Kernel& hsaKernel,
                                   VirtualGPU* gpuDefQueue, uint64_t vmDefQueue,
                                   uint64_t vmParentWrap, GpuEvent* gpuEvent) {
  uint32_t id = gpuEvent->id_;
  amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());

  // Make sure exclusive access to the device queue.
  // The lock object must be named: an unnamed ScopedLock is a temporary that is destroyed at the
  // end of the statement, so it would release the lock immediately.
  // NOTE(review): the lock is now held until the function returns - confirm no callee below
  // acquires the same monitor in a non-recursive way.
  amd::ScopedLock defQueueLock(defQueue->lock());

  // NOTE(review): this buffer is acquired but neither used nor released in this function -
  // confirm whether the acquisition is still required.
  Memory& schedParams = xferWrite().Acquire(sizeof(SchedulerParam));

  if (GPU_PRINT_CHILD_KERNEL != 0) {
    // The parent kernel has to be done before the queue content can be dumped
    waitForEvent(gpuEvent);
    PrintChildren(hsaKernel, gpuDefQueue);
  }

  if (!dev().settings().useDeviceQueue_) {
    // Add the termination handshake to the host queue
    eventBegin(MainEngine);
    iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
                                     vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0,
                                     dev().settings().useDeviceQueue_);
    eventEnd(MainEngine, *gpuEvent);
  }

  // Get the global loop start before the scheduler
  Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();
  static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr())
      .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0,
                    gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
  gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, BarrierType::FlushL2);

  // Get the address of PM4 template and add write it to params
  //! @note DMA flush must not occur between patch and the scheduler
  Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart();

  // Program parameters for the scheduler
  SchedulerParam* param = reinterpret_cast<SchedulerParam*>(gpuDefQueue->schedParams_->data());
  param->signal = 1;
  // Scale clock to 1024 to avoid 64 bit div in the scheduler
  param->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_;
  param->hw_queue = patchStart + sizeof(uint32_t) /* Rewind packet*/;
  param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
  param->releaseHostCP = 0;
  param->parentAQL = vmParentWrap;
  param->dedicatedQueue = dev().settings().useDeviceQueue_;

  // Fill the scratch buffer information
  if (hsaKernel.prog().maxScratchRegs() > 0) {
    pal::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_;
    param->scratchSize = scratchBuf->size();
    param->scratch = scratchBuf->vmAddress();
    param->numMaxWaves = 32 * dev().info().maxComputeUnits_;
    param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_;
    addVmMemory(scratchBuf);
  } else {
    param->numMaxWaves = 0;
    param->scratchSize = 0;
    param->scratch = 0;
    param->scratchOffset = 0;
  }

  // Add all kernels in the program to the mem list.
  //! \note Runtime doesn't know which one will be called
  hsaKernel.prog().fillResListWithKernels(*this);

  // The signal is the first field the scheduler parameters are addressed by
  Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress();
  gpuDefQueue->eventBegin(MainEngine);
  gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd(
      signalAddr, loopStart,
      gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_));
  // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call.
  // Thus TS command for profiling has to follow in the next CB.
  constexpr bool ForceSubmitFirst = true;
  gpuDefQueue->eventEnd(MainEngine, *gpuEvent, ForceSubmitFirst);

  if (dev().settings().useDeviceQueue_) {
    // Add the termination handshake to the host queue
    eventBegin(MainEngine);
    iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
                                     vmParentWrap + offsetof(AmdAqlWrap, child_counter),
                                     signalAddr, dev().settings().useDeviceQueue_);
    if (id != gpuEvent->id_) {
      LogError("Something is wrong. ID mismatch!\n");
    }
    eventEnd(MainEngine, *gpuEvent);
  }

  // The scheduler parameters from PreDeviceEnqueue() aren't needed anymore
  xferWrite().Release(*gpuDefQueue->schedParams_);
  gpuDefQueue->schedParams_ = nullptr;
}
// ================================================================================================
// Submits an ND-range kernel command.
// Regular kernels are dispatched under the execution lock of this queue. Kernels that use
// cooperative groups first drain this queue, initialize GWS for the number of workgroups, then
// dispatch under the transfer lock of the blit manager and wait for completion.
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
  if (!vcmd.cooperativeGroups()) {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());

    profilingBegin(vcmd);

    // Submit kernel to HW
    const bool submitted =
        submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false,
                             vcmd.sharedMemBytes(), vcmd.getAnyOrderLaunchFlag());
    if (!submitted) {
      vcmd.setStatus(CL_INVALID_OPERATION);
    }

    profilingEnd(vcmd);
    return;
  }

  // Total number of workgroups over all dimensions with a non-zero local size
  uint32_t workgroups = 1;
  for (uint dim = 0; dim < vcmd.sizes().dimensions(); dim++) {
    if (vcmd.sizes().local()[dim] != 0) {
      workgroups *= (vcmd.sizes().global()[dim] / vcmd.sizes().local()[dim]);
    }
  }

  // The selector is hard-wired, so the current queue is always used
  bool test = true;
  VirtualGPU* queue = (test) ? this : dev().xferQueue();

  // Wait for the execution on the current queue, since the coop groups will use the device queue
  waitAllEngines();

  amd::ScopedLock lock(queue->blitMgr().lockXfer());
  queue->profilingBegin(vcmd);

  static_cast<KernelBlitManager&>(queue->blitMgr()).RunGwsInit(workgroups);
  queue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue);

  // Submit kernel to HW
  const bool submitted = queue->submitKernelInternal(
      vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, vcmd.sharedMemBytes());
  if (!submitted) {
    vcmd.setStatus(CL_INVALID_OPERATION);
  }

  queue->profilingEnd(vcmd);

  // Wait for the execution on the device queue. Keep the current queue in-order
  queue->waitAllEngines();
}
// ================================================================================================
// Dispatches a kernel to the HW as an AQL packet.
//
// sizes          - ND-range of the dispatch
// kernel         - abstraction layer kernel object
// parameters     - kernel arguments
// nativeMem      - forwarded to processMemObjectsHSA()
// sharedMemBytes - dynamic LDS size added on top of the LDS size found for the arguments
// anyOrder       - stored into the queue state before the dispatch
//
// Returns false if printf setup/readback, device queue setup, memory object processing or the
// scratch size validation fails; true otherwise.
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
                                      const_address parameters, bool nativeMem,
                                      uint32_t sharedMemBytes, bool anyOrder) {
  state_.anyOrder_ = anyOrder;

  // Get the HSA kernel object
  const pal::Kernel& hsaKernel = static_cast<const pal::Kernel&>(*(kernel.getDeviceKernel(dev())));

  // If RGP capturing is enabled, then start SQTT trace
  if (rgpCaptureEna()) {
    size_t newLocalSize[3] = {1, 1, 1};
    size_t newGlobalSize[3] = {0, 0, 0};
    for (uint i = 0; i < sizes.dimensions(); i++) {
      newGlobalSize[i] = sizes.global()[i];
      if (sizes.local()[i] != 0) {
        newLocalSize[i] = sizes.local()[i];
      }
    }
    dev().captureMgr()->PreDispatch(
        this, hsaKernel,
        // Report global size in workgroups, since that's the RGP trace semantics
        newGlobalSize[0] / newLocalSize[0], newGlobalSize[1] / newLocalSize[1],
        newGlobalSize[2] / newLocalSize[2]);
  }

  bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true : false;
  if (printfEnabled && !printfDbgHSA().init(*this, printfEnabled)) {
    LogError("Printf debug buffer initialization failed!");
    return false;
  }

  uint64_t vmDefQueue = 0;
  VirtualGPU* gpuDefQueue = nullptr;
  if (hsaKernel.dynamicParallelism()) {
    // Initialize GPU device queue for execution (gpuDefQueue)
    if (!PreDeviceEnqueue(kernel, hsaKernel, &gpuDefQueue, &vmDefQueue)) {
      return false;
    }
  }

  size_t ldsSize;

  ClPrint(amd::LOG_INFO, amd::LOG_KERN, "!\tkernel : %s\n", hsaKernel.name().c_str());

  if (PAL_EMBED_KERNEL_MD) {
    char buf[256];
    // Kernel names have no length limit, so the output must be bounded by the buffer size;
    // a longer comment string is truncated
    snprintf(buf, sizeof(buf), "kernel: %s\n private mem size: %x\n group mem size: %x\n",
             hsaKernel.name().c_str(), hsaKernel.spillSegSize(), hsaKernel.ldsSize());
    iCmd()->CmdCommentString(buf);
  }

  bool imageBufferWrtBack = false;         // Image buffer write back is required
  std::vector<Image*> wrtBackImageBuffer;  // Array of images for write back

  // Check memory dependency and SVM objects
  if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize, imageBufferWrtBack,
                            wrtBackImageBuffer)) {
    LogError("Wrong memory objects!");
    return false;
  }

  // Add ISA memory object to the resource tracking list
  AddKernel(kernel);

  GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId());
  uint32_t id = gpuEvent.id_;
  uint64_t vmParentWrap = 0;
  uint32_t aql_index = 0;
  // Program the kernel arguments for the GPU execution
  hsa_kernel_dispatch_packet_t* aqlPkt =
      hsaKernel.loadArguments(*this, kernel, sizes, parameters, ldsSize + sharedMemBytes,
                              vmDefQueue, &vmParentWrap, &aql_index);
  assert((nullptr != aqlPkt) && "Couldn't load kernel arguments");

  // Dynamic call stack size is considered to calculate private segment size and scratch regs
  // in pal::Kernel::postLoad(). As it is not called during hipModuleLaunchKernel unlike
  // hipLaunchKernel/hipLaunchKernelGGL, Updated value is passed to dispatch packet.
  size_t privateMemSize = hsaKernel.spillSegSize();
  if ((hsaKernel.workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
    privateMemSize =
        std::max<uint32_t>(static_cast<uint32_t>(device().StackSize()),
                           hsaKernel.workGroupInfo()->scratchRegs_ * sizeof(uint32_t));
    // Validate privateMemSize is more than max allowed.
    size_t maxStackSize = device().MaxStackSize();
    if (privateMemSize > maxStackSize) {
      ClPrint(amd::LOG_INFO, amd::LOG_KERN,
              "Scratch size (%zu) exceeds max allowed (%zu) for kernel : %s", privateMemSize,
              maxStackSize, hsaKernel.name().c_str());
      LogError("Scratch size exceeds max allowed.");
      return false;
    }
  }

  // Set up the dispatch information
  Pal::DispatchAqlParams dispatchParam = {};
  dispatchParam.pAqlPacket = aqlPkt;
  if (privateMemSize > 0) {
    // Scratch memory is only described when the kernel needs private memory
    const Device::ScratchBuffer* scratch = dev().scratch(hwRing());
    dispatchParam.scratchAddr = scratch->memObj_->vmAddress();
    dispatchParam.scratchSize = scratch->size_;
    dispatchParam.scratchOffset = scratch->offset_;
    dispatchParam.workitemPrivateSegmentSize = privateMemSize;
  }
  dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlKd();
  dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress();
  dispatchParam.wavesPerSh = 0;
  dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false;
  dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize();
  dispatchParam.aqlPacketIndex = aql_index;

  // Run AQL dispatch in HW
  eventBegin(MainEngine);
  iCmd()->CmdDispatchAql(dispatchParam);

  if (id != gpuEvent.id_) {
    LogError("Something is wrong. ID mismatch!\n");
  }
  eventEnd(MainEngine, gpuEvent);
  AqlPacketUpdateTs(aql_index, gpuEvent);

  // Execute scheduler for device enqueue
  if (hsaKernel.dynamicParallelism()) {
    PostDeviceEnqueue(kernel, hsaKernel, gpuDefQueue, vmDefQueue, vmParentWrap, &gpuEvent);
  }

  // Update the global GPU event
  constexpr bool kNeedFLush = false;
  setGpuEvent(gpuEvent, kNeedFLush);

  if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) {
    LogError("Couldn't read printf data from the buffer!\n");
    return false;
  }

  // Check if image buffer write back is required
  if (imageBufferWrtBack) {
    // Make sure the original kernel execution is done
    addBarrier(RgpSqqtBarrierReason::MemDependency);
    for (const auto imageBuffer : wrtBackImageBuffer) {
      Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
      amd::Image* image = imageBuffer->owner()->asImage();
      amd::Coord3D offs(0);
      // Copy memory from the the backing store image into original buffer.
      // A failure doesn't fail the dispatch, but it is no longer silent.
      if (!blitMgr().copyImageToBuffer(*imageBuffer->CopyImageBuffer(), *buffer, offs, offs,
                                       image->getRegion(), true, image->getRowPitch(),
                                       image->getSlicePitch())) {
        LogError("Image buffer write back failed!");
      }
    }
  }

  // Perform post dispatch logic for RGP traces
  if (rgpCaptureEna()) {
    dev().captureMgr()->PostDispatch(this);
  }

  return true;
}
// ================================================================================================
// Native function commands have no implementation here: the execution lock is taken and the
// call is reported through Unimplemented().
void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) {
  // Exclusive access to the queue resources for the duration of the command
  amd::ScopedLock lock(execution());

  Unimplemented();  //!< @todo: Unimplemented
}
// ================================================================================================
// Processes a marker command.
// A marker with a waiting event retires the outstanding command batches until the batch with
// that event has completed; if no batch contained it, a wait is forced through the queue state.
// A marker without a waiting event records a GPU event with profiling for HIP only.
void VirtualGPU::submitMarker(amd::Marker& vcmd) {
  //!@note runtime doesn't need to lock this command on execution
  if (vcmd.waitingEvent() == nullptr) {
    if (amd::IS_HIP) {
      // Use GPU based timing for HIP events
      // Make sure VirtualGPU has an exclusive access to the resources
      amd::ScopedLock lock(execution());

      GpuEvent event;
      profilingBegin(vcmd);
      eventBegin(MainEngine);
      eventEnd(MainEngine, event);
      setGpuEvent(event);
      profilingEnd(vcmd);
    }
    return;
  }

  // Go through the outstanding command batches until the event is found
  bool eventFound = false;
  while (!eventFound && !cbQueue_.empty()) {
    auto batch = cbQueue_.front();
    // Wait for completion
    eventFound = awaitCompletion(batch, vcmd.waitingEvent());
    // The batch is done: return it to the free list and drop it from the pending list
    freeCbQueue_.push(batch);
    cbQueue_.pop();
  }

  // Event should be in the current command batch
  if (!eventFound) {
    state_.forceWait_ = true;
  }
}
// ================================================================================================
void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
  // No work is performed for accumulate commands in this implementation
}
// ================================================================================================
// Signals or waits on an external queue semaphore on the main engine queue.
// A signal is preceded by a DMA flush of the main engine. Failures are only logged.
void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {
  // The PAL interface takes a non-const semaphore object
  Pal::IQueueSemaphore* palSem = const_cast<Pal::IQueueSemaphore*>(
      reinterpret_cast<const Pal::IQueueSemaphore*>(cmd.sem_ptr()));

  const bool isSignal =
      (cmd.semaphoreCmd() == amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE);

  if (isSignal) {
    flushDMA(MainEngine);
    const Pal::Result status =
        queues_[MainEngine]->iQueue_->SignalQueueSemaphore(palSem, cmd.fence());
    if (status != Pal::Result::Success) {
      LogError("Failed to signal external semaphore");
    }
    return;
  }

  // Any other command is treated as a wait
  const Pal::Result status = queues_[MainEngine]->iQueue_->WaitQueueSemaphore(palSem, cmd.fence());
  if (status != Pal::Result::Success) {
    LogError("Failed to wait on external semaphore");
  }
}
// Removes a GPU memory reference from the command memory tracking of the main engine queue and,
// unless SDMA is disabled in the device settings, of the SDMA queue as well.
void VirtualGPU::releaseMemory(GpuMemoryReference* mem) {
  queues_[MainEngine]->removeCmdMemRef(mem);

  // There is nothing to clean up on the SDMA queue when SDMA is disabled
  if (dev().settings().disableSdma_) {
    return;
  }
  queues_[SdmaEngine]->removeCmdMemRef(mem);
}
// Processes a performance counter command.
// First, a device counter object is created for every counter of the command that doesn't have
// one yet. Then the begin or end command is issued once for every distinct PAL perf experiment
// found while walking the counters in order. An unsupported state or an allocation failure sets
// CL_INVALID_OPERATION on the command.
void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  // The list is only read, so it is bound by reference instead of being copied
  const amd::PerfCounterCommand::PerfCounterList& counters = vcmd.getCounters();

  PalCounterReference* palRef = PalCounterReference::Create(*this);
  if (palRef == nullptr) {
    LogError("We failed to allocate memory for the GPU perfcounter");
    vcmd.setStatus(CL_INVALID_OPERATION);
    return;
  }
  bool newExperiment = false;

  for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
    amd::PerfCounter* amdCounter = static_cast<amd::PerfCounter*>(counters[i]);
    const PerfCounter* counter = static_cast<const PerfCounter*>(amdCounter->getDeviceCounter());

    // Make sure we have a valid gpu performance counter
    if (nullptr == counter) {
      amd::PerfCounter::Properties prop = amdCounter->properties();
      PerfCounter* gpuCounter = new PerfCounter(
          gpuDevice_, palRef, prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX],
          prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX], prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]);
      if (nullptr == gpuCounter) {
        LogError("We failed to allocate memory for the GPU perfcounter");
        vcmd.setStatus(CL_INVALID_OPERATION);
        return;
      } else if (gpuCounter->create()) {
        newExperiment = true;
      } else {
        // One conversion per argument; the message is built from adjacent literals instead of
        // a line continuation inside the string
        LogPrintfError(
            "We failed to allocate a perfcounter in PAL. "
            "Block: %d, counter: %d, event: %d",
            gpuCounter->info()->blockIndex_, gpuCounter->info()->counterIndex_,
            gpuCounter->info()->eventIndex_);
      }
      // The device counter is attached even if its creation in PAL failed
      amdCounter->setDeviceCounter(gpuCounter);
    }
  }

  // The experiment is finalized only if at least one counter was created
  if (newExperiment) {
    palRef->finalize();
  }

  palRef->release();

  Pal::IPerfExperiment* palPerf = nullptr;
  for (uint i = 0; i < vcmd.getNumCounters(); ++i) {
    amd::PerfCounter* amdCounter = static_cast<amd::PerfCounter*>(counters[i]);
    const PerfCounter* counter = static_cast<const PerfCounter*>(amdCounter->getDeviceCounter());

    // Issue the command only when the experiment differs from the previous counter's one
    if (palPerf != counter->iPerf()) {
      palPerf = counter->iPerf();
      // Find the state and sends the command to PAL
      if (vcmd.getState() == amd::PerfCounterCommand::Begin) {
        state_.perfCounterEnabled_ = true;
        GpuEvent event;
        eventBegin(MainEngine);
        iCmd()->CmdBeginPerfExperiment(palPerf);
        eventEnd(MainEngine, event);
        setGpuEvent(event);
      } else if (vcmd.getState() == amd::PerfCounterCommand::End) {
        GpuEvent event;
        eventBegin(MainEngine);
        iCmd()->CmdEndPerfExperiment(palPerf);
        eventEnd(MainEngine, event);
        setGpuEvent(event);
        state_.perfCounterEnabled_ = false;
      } else {
        LogError("Unsupported performance counter state");
        vcmd.setStatus(CL_INVALID_OPERATION);
        return;
      }
    }
  }
}
// Binds the memory objects of a thread trace command to a device thread trace.
//
// For CL_COMMAND_THREAD_TRACE_MEM, if the amd::ThreadTrace has no device
// object yet, a PalThreadTraceReference and a ThreadTrace are created from the
// command's memory list; on success the ThreadTrace is attached to the
// amd::ThreadTrace and the PAL reference is finalized. Any other command type
// is only logged as unsupported.
//
// On failure the command status is set to CL_INVALID_OPERATION and the
// function returns early.
// NOTE(review): the early returns skip profilingEnd(cmd) - confirm that this
// is intended.
void VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  switch (cmd.type()) {
    case CL_COMMAND_THREAD_TRACE_MEM: {
      amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace();
      ThreadTrace* threadTrace = static_cast<ThreadTrace*>(amdThreadTrace->getDeviceThreadTrace());

      if (threadTrace == nullptr) {
        PalThreadTraceReference* palRef = PalThreadTraceReference::Create(*this);
        if (palRef == nullptr) {
          LogError("Failure in memory allocation for the GPU threadtrace");
          cmd.setStatus(CL_INVALID_OPERATION);
          return;
        }

        size_t numSe = amdThreadTrace->deviceSeNumThreadTrace();
        ThreadTrace* gpuThreadTrace = new ThreadTrace(gpuDevice_, palRef, cmd.getMemList(), numSe);
        if (nullptr == gpuThreadTrace) {
          LogError("Failure in memory allocation for the GPU threadtrace");
          // Drop the local reference, exactly as the success path does below
          palRef->release();
          cmd.setStatus(CL_INVALID_OPERATION);
          return;
        }

        if (gpuThreadTrace->create()) {
          amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace);
        } else {
          LogError("Failure in memory allocation for the GPU threadtrace");
          delete gpuThreadTrace;
          // Drop the local reference, exactly as the success path does below
          palRef->release();
          cmd.setStatus(CL_INVALID_OPERATION);
          return;
        }

        palRef->finalize();
        palRef->release();
      }
      break;
    }
    default:
      LogError("Unsupported command type for ThreadTraceMemObjects!");
      break;
  }

  profilingEnd(cmd);
}
// Handles a thread trace state command (Begin/End/Pause/Resume).
//
// Only CL_COMMAND_THREAD_TRACE is supported; other command types are logged.
// The device thread trace must already exist (it is created by the
// ThreadTraceMemObjectsCommand); if it does not, the function returns
// immediately without calling profilingEnd().
void VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) {
  // Exclusive access to the VirtualGPU resources for the whole submission
  amd::ScopedLock lock(execution());

  profilingBegin(cmd);

  if (cmd.type() != CL_COMMAND_THREAD_TRACE) {
    LogError("Unsupported command type for ThreadTrace!");
  } else {
    amd::ThreadTrace* amdTrace = static_cast<amd::ThreadTrace*>(&cmd.getThreadTrace());
    ThreadTrace* deviceTrace = static_cast<ThreadTrace*>(amdTrace->getDeviceThreadTrace());

    // Nothing can be done without the device object
    if (deviceTrace == nullptr) {
      return;
    }

    Pal::IPerfExperiment* experiment = deviceTrace->iPerf();
    const auto traceState = cmd.getState();

    if (traceState == amd::ThreadTraceCommand::Begin) {
      // The configuration is fetched but not consumed by this code path
      amd::ThreadTrace::ThreadTraceConfig* traceCfg =
          static_cast<amd::ThreadTrace::ThreadTraceConfig*>(cmd.threadTraceConfig());
      iCmd()->CmdBeginPerfExperiment(experiment);
    } else if (traceState == amd::ThreadTraceCommand::End) {
      GpuEvent endEvent;
      eventBegin(MainEngine);
      iCmd()->CmdEndPerfExperiment(experiment);
      deviceTrace->populateUserMemory();
      eventEnd(MainEngine, endEvent);
      setGpuEvent(endEvent);
    }
    // Pause and Resume are accepted but ignored: the PerfExperiment interface
    // offers neither operation
  }

  profilingEnd(cmd);
}
// Acquires external (interop) objects for use by the runtime: for every
// memory object in the command, the interop object copies the original
// resource into its shared copy.
void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
  // Exclusive access to the VirtualGPU resources for the whole submission
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  for (const auto& interopMem : vcmd.getMemList()) {
    // A null amd::Memory object is never expected here
    assert(interopMem && "Memory object for interop is nullptr");

    // The returned device memory is not used; the lookup itself is kept
    // as in the original code
    pal::Memory* deviceMem = dev().getGpuMemory(interopMem);

    // When the resource is a shared copy of the original one, the data has
    // to be brought over from the original resource
    interopMem->getInteropObj()->copyOrigToShared();
  }

  profilingEnd(vcmd);
}
// Releases external (interop) objects back to their owner: for every memory
// object in the command, the interop object copies the shared copy back into
// the original resource.
void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
  // Exclusive access to the VirtualGPU resources for the whole submission
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  for (const auto& interopMem : vcmd.getMemList()) {
    // A null amd::Memory object is never expected here
    assert(interopMem && "Memory object for interop is nullptr");

    // The returned device memory is not used; the lookup itself is kept
    // as in the original code
    pal::Memory* deviceMem = dev().getGpuMemory(interopMem);

    // When the resource is a shared copy of the original one, the data has
    // to be written back to the original resource
    interopMem->getInteropObj()->copySharedToOrig();
  }

  profilingEnd(vcmd);
}
// Submits a bus-addressable memory marker command.
//
// CL_COMMAND_WAIT_SIGNAL_AMD  : records a wait on the marker of the command's
//                               memory (GreaterEqual compare against the
//                               marker value) on the main engine.
// CL_COMMAND_WRITE_SIGNAL_AMD : temporarily switches to the engine recorded in
//                               the memory's GPU event, inserts a barrier,
//                               and records the marker update on that engine.
// For any other type only the (default constructed) GPU event is stored.
void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) {
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);

  pal::Memory* signalMem = dev().getGpuMemory(&vcmd.memory());
  GpuEvent signalEvent;
  const uint32_t markerValue = vcmd.markerValue();
  const auto cmdType = vcmd.type();

  if (cmdType == CL_COMMAND_WAIT_SIGNAL_AMD) {
    eventBegin(MainEngine);
    addVmMemory(signalMem);
    iCmd()->CmdWaitBusAddressableMemoryMarker(*(signalMem->iMem()), markerValue, 0xFFFFFFFF,
                                              Pal::CompareFunc::GreaterEqual);
    eventEnd(MainEngine, signalEvent);
  } else if (cmdType == CL_COMMAND_WRITE_SIGNAL_AMD) {
    // Remember the active engine and switch to the one that last used the memory
    const EngineType savedEngine = engineID_;
    engineID_ = static_cast<EngineType>(signalMem->getGpuEvent(*this)->engineId_);

    // GPU work must be finished and the data must have reached memory
    // before the marker is written
    addBarrier(RgpSqqtBarrierReason::SignalSubmit, BarrierType::FlushL2);

    // Workaround: some systems needed an extra delay here.
    // Flush the CB associated with the DGMA buffer
    isDone(signalMem->getGpuEvent(*this));

    eventBegin(engineID_);
    queues_[engineID_]->addCmdMemRef(signalMem->memRef());
#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION < 396)
    queues_[engineID_]->iCmd()->CmdUpdateBusAddressableMemoryMarker(*(signalMem->iMem()),
                                                                    markerValue);
#else
    queues_[engineID_]->iCmd()->CmdUpdateBusAddressableMemoryMarker(
        *(signalMem->iMem()), vcmd.markerOffset(), markerValue);
#endif
    eventEnd(engineID_, signalEvent);

    // Back to the engine that was active on entry
    engineID_ = savedEngine;
  }

  // Publish the event as the current GPU event
  setGpuEvent(signalEvent);

  profilingEnd(vcmd);
}
// Makes the command's memory objects bus addressable and reports their bus
// addresses back through vcmd.busAddress().
//
// For every memory object the host cache is synchronized and its PAL memory
// is collected. The PAL memories are then added as GPU memory references on
// the main queue, initialized as bus addressable memory (under the main queue
// lock), removed from the reference list again, and finally the surface and
// marker bus addresses are copied into the command.
void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd) {
  amd::ScopedLock lock(execution());
  profilingBegin(vcmd);

  // The list is only read here, so bind it by reference instead of copying
  const std::vector<amd::Memory*>& memObjects = vcmd.memObjects();
  uint32_t numObjects = memObjects.size();

  // Value-initialize the reference array so that every field other than
  // pGpuMemory is zero instead of indeterminate
  Pal::GpuMemoryRef* pGpuMemRef = new Pal::GpuMemoryRef[numObjects]();
  Pal::IGpuMemory** pGpuMems = new Pal::IGpuMemory*[numObjects];

  for (uint i = 0; i < numObjects; i++) {
    pal::Memory* pGpuMemory = dev().getGpuMemory(memObjects[i]);
    pGpuMemory->syncCacheFromHost(*this);

    pGpuMemRef[i].pGpuMemory = pGpuMemory->iMem();
    pGpuMems[i] = pGpuMemory->iMem();
  }

  dev().iDev()->AddGpuMemoryReferences(numObjects, pGpuMemRef, queues_[MainEngine]->iQueue_,
                                       Pal::GpuMemoryRefCantTrim);
  {
    // The queue lock is held only for the duration of the PAL call
    amd::ScopedLock l(queues_[MainEngine]->lock_);
    dev().iDev()->InitBusAddressableGpuMemory(queues_[MainEngine]->iQueue_, numObjects, pGpuMems);
  }
  if (numObjects != 0) {
    dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_);
  }

  for (uint i = 0; i < numObjects; i++) {
    vcmd.busAddress()[i].surface_bus_address = pGpuMems[i]->Desc().surfaceBusAddr;
    vcmd.busAddress()[i].marker_bus_address = pGpuMems[i]->Desc().markerBusAddr;
  }

  // The temporary arrays are owned by this function and must not leak
  delete[] pGpuMemRef;
  delete[] pGpuMems;

  profilingEnd(vcmd);
}
// Waits for a command batch to finish and completes all of its commands.
//
// When profiling is enabled the work is delegated to
// profilingCollectResults(). Otherwise the first command is marked as running,
// the batch's GPU events are waited for, and then every command in the list is
// moved to CL_COMPLETE and released.
//
// Returns true if `waitingEvent` was one of the commands in the batch.
bool VirtualGPU::awaitCompletion(CommandBatch* cb, const amd::Event* waitingEvent) {
  // Profiled batches are handled by a dedicated path
  if (state_.profileEnabled_) {
    return profilingCollectResults(cb, waitingEvent);
  }

  amd::Command* cmd = cb->head_;
  if (cmd == nullptr) {
    // Empty batch, nothing to complete
    return false;
  }

  // The first command in the batch is considered running from now on
  cmd->setStatus(CL_RUNNING);

  // Block until the last known GPU event of the batch has signaled
  waitEventLock(cb);

  bool found = false;
  while (cmd != nullptr) {
    // Fetch the successor first, the command is released at the end of the pass
    amd::Command* next = cmd->getNext();

    const auto status = cmd->status();
    if (status == CL_SUBMITTED) {
      cmd->setStatus(CL_RUNNING);
      cmd->setStatus(CL_COMPLETE);
    } else if (status == CL_RUNNING) {
      cmd->setStatus(CL_COMPLETE);
    } else if ((status != CL_COMPLETE) && (next != nullptr)) {
      LogPrintfError("Unexpected command status - %d!", status);
    }

    // Is this the command the caller is waiting on?
    if (cmd == waitingEvent) {
      found = true;
    }

    cmd->release();
    cmd = next;
  }

  return found;
}
// Flushes the current work of this virtual GPU.
//
// `list` (may be nullptr) is wrapped into a CommandBatch - recycled from the
// free list when possible - together with the current per-engine events and
// the last timestamp. All engines are then flushed, the current events are
// invalidated and the batch is queued. Finally the outstanding batches are
// retired in order: each one is completed if it already finished or if a wait
// was requested (`wait` or the forced-wait state); the loop stops at the first
// batch that is still busy.
void VirtualGPU::flush(amd::Command* list, bool wait) {
  // Find out whether any engine carries a valid GPU event
  bool hasGpuWork = false;
  for (uint engine = 0; engine < AllEngines; ++engine) {
    if (events_[engine].isValid()) {
      hasGpuWork = true;
    }
  }

  // No GPU command in this batch and nothing outstanding: force a wait
  if (!hasGpuWork && cbQueue_.empty()) {
    state_.forceWait_ = true;
  }

  // Wrap the command list into a batch, reusing a retired batch if one exists
  CommandBatch* batch = nullptr;
  if (list != nullptr) {
    CommandBatch* recycled = freeCbQueue_.empty() ? nullptr : freeCbQueue_.front();
    if (recycled != nullptr) {
      freeCbQueue_.pop();
      recycled->init(list, events_, lastTS_);
      batch = recycled;
    } else {
      batch = new CommandBatch(list, events_, lastTS_);
    }
  }

  {
    //! @todo: Check if really need a lock
    amd::ScopedLock lock(execution());
    for (uint engine = 0; engine < AllEngines; ++engine) {
      flushDMA(engine);
      // Invalidate the event so nobody waits on it again when the runtime
      // didn't submit any commands.
      //! @note: this is safe, the events were already saved in the batch above
      events_[engine].invalidate();
    }
  }

  // Forget the last TS, so empty batches are not processed with a stale one
  lastTS_ = nullptr;

  if (batch != nullptr) {
    cbQueue_.push(batch);
  }

  wait |= state_.forceWait_;

  // Retire outstanding batches in submission order
  while (!cbQueue_.empty()) {
    CommandBatch* pending = cbQueue_.front();

    // Poll every engine; all of them are queried, none is skipped
    bool finished = true;
    for (uint engine = 0; engine < AllEngines; ++engine) {
      finished &= isDone(&pending->events_[engine]);
    }

    if (!finished && !wait) {
      // The oldest batch is still busy and no wait was requested
      break;
    }

    // Complete the commands, recycle the batch and drop it from the queue
    awaitCompletion(pending);
    freeCbQueue_.push(pending);
    cbQueue_.pop();
  }

  state_.forceWait_ = false;
}
// Turns on synchronization in the blit manager of this virtual GPU.
void VirtualGPU::enableSyncedBlit() const {
  blitMgr_->enableSynchronization();
}
// Stores `gpuEvent` as the current event of the active engine and, when
// `flush` is set, flushes the DMA buffer of that engine.
void VirtualGPU::setGpuEvent(GpuEvent gpuEvent, bool flush) {
  events_[engineID_] = gpuEvent;

  if (!flush) {
    return;
  }

  // A flush of the current DMA buffer was requested
  flushDMA(engineID_);
}
// Flushes the given engine by polling its current GPU event with isDone().
//
// Note for MainEngine: the memory dependency state is intentionally left
// untouched (a `memoryDependency().clear()` call is disabled here).
//!@todo Keep memory dependency alive even if we flush DMA,
//! since only L2 cache is flushed in KMD frame,
//! but L1 still has to be invalidated.
void VirtualGPU::flushDMA(uint engineID) {
  GpuEvent* engineEvent = &events_[engineID];
  isDone(engineEvent);
}
// Waits until every engine has finished the events of a batch.
//
// `cb` selects the events to wait for; with nullptr the current events of
// this virtual GPU are used. Returns true if all events were already done
// when polled, i.e. before any blocking wait.
bool VirtualGPU::waitAllEngines(CommandBatch* cb) {
  // Pick the event array: the batch's one, or the current one for nullptr
  GpuEvent* batchEvents = events_;
  if (cb != nullptr) {
    batchEvents = cb->events_;
  }

  // Pass 1: flush all engines and/or detect that they are idle already.
  // Every engine is polled, none is skipped.
  bool allDoneEarly = true;
  for (uint engine = 0; engine < AllEngines; ++engine) {
    allDoneEarly &= isDone(&batchEvents[engine]);
  }

  // Pinned memory is released between the two passes
  releasePinnedMem();

  // Pass 2: block on every engine
  for (uint engine = 0; engine < AllEngines; ++engine) {
    waitForEvent(&batchEvents[engine]);
  }

  return allDoneEarly;
}
// Waits (under the execution lock) for all engines to finish the events of
// `cb`, trims the device resource cache, and recalibrates the GPU/CPU time
// adjustment from the batch's last timestamp.
//
// `cb` may be nullptr: waitAllEngines() then waits for the current events and
// the timestamp calibration is skipped, since there is no batch to read a
// timestamp from.
void VirtualGPU::waitEventLock(CommandBatch* cb) {
  bool earlyDone = false;
  {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());
    earlyDone = waitAllEngines(cb);
  }

  // Get timestamp, in case readjustTimeGPU_ needs to be updated
  uint64_t endTimeStampCPU = amd::Os::timeNanos();

  // Free resource cache if we have too many entries
  //! \note we do it here, when all engines are idle,
  // because Vista/Win7 idles GPU on a resource destruction
  static const size_t MinCacheEntries = 4096;
  dev().resourceCache().free(MinCacheEntries);

  // Find the timestamp object of the last command in the batch.
  // The batch pointer itself is checked first, because a null batch is a
  // valid input for waitAllEngines() above.
  if ((cb != nullptr) && (cb->lastTS_ != nullptr)) {
    // If earlyDone is TRUE, then CPU didn't wait for GPU.
    // Thus the sync point between CPU and GPU is unclear and runtime
    // will use an older adjustment value to maintain the same timeline
    if (!earlyDone ||
        //! \note Workaround for APU(s).
        //! GPU-CPU timelines may go off too much, thus always
        //! force calibration with the last batch in the list
        (cbQueue_.size() <= 1) || (readjustTimeGPU_ == 0)) {
      uint64_t startTimeStampGPU = 0;
      uint64_t endTimeStampGPU = 0;

      // Get the timestamp value of the last command in the batch
      cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);

      // Adjust the base time by the execution time
      readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
    }
  }
}
bool VirtualGPU::allocConstantBuffers() {
// Allocate constant buffers.
// Use double size, reported to the app to account for internal arguments
const uint32_t MinCbSize = 2 * dev().info().maxParameterSize_;
uint i;
// Create/reallocate constant buffer resources
for (i = 0; i < MaxConstBuffersArguments; ++i) {
ConstantBuffer* constBuf = new ConstantBuffer(managedBuffer_, MinCbSize);
if ((constBuf != nullptr) && constBuf->Create()) {
addConstBuffer(constBuf);
} else {
// We failed to create a constant buffer
delete constBuf;
return false;
}
}
return true;
}
// Starts profiling for `command` if profiling is enabled on it: a TimeStamp is
// taken from the timestamp cache, appended to the command's data, remembered
// as the active profiling timestamp, and the profiling state is switched on.
// Nothing happens when profiling is disabled or no TimeStamp is available.
void VirtualGPU::profilingBegin(amd::Command& command) {
  if (!command.profilingInfo().enabled_) {
    return;
  }

  // Grab a timestamp object from the cache
  TimeStamp* stamp = tsCache_->allocTimeStamp();
  if (stamp == nullptr) {
    return;
  }

  // Attach the TimeStamp object to the command
  command.data().emplace_back(stamp);
  profileTs_ = stamp;
  state_.profileEnabled_ = true;
}
// Finishes profiling for `command`: the TimeStamp stored last in the command's
// data becomes the last known timestamp if it is valid; otherwise it is
// returned to the cache and the command's data is cleared.
void VirtualGPU::profilingEnd(amd::Command& command) {
  // No data means no TimeStamp was attached to this command
  if (command.data().empty()) {
    return;
  }

  TimeStamp* stamp = reinterpret_cast<TimeStamp*>(command.data().back());
  if (stamp == nullptr) {
    return;
  }

  if (stamp->isValid()) {
    // The command did submit GPU work
    lastTS_ = stamp;
    return;
  }

  // No GPU submission happened: give the TimeStamp object back
  tsCache_->freeTimeStamp(stamp);
  command.data().clear();
}
// Completes all commands of a profiled batch and assigns their timestamps.
//
// After waiting for the batch, the command list is walked twice:
//  1. to find the start time of the first command that carries a TimeStamp,
//     which is used as start/end for the commands preceding it;
//  2. to build the time line: commands with a TimeStamp get its (adjusted)
//     start/end, commands without one get the end of the previous command.
// Every command is moved to CL_COMPLETE and released.
//
// Returns true if `waitingEvent` was one of the commands in the batch.
bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent) {
  // An empty command list needs no processing
  if (cb->head_ == nullptr) {
    return false;
  }

  // Wait for the last known GPU events on all engines
  waitEventLock(cb);

  // The CPU time is the fallback base for the whole batch
  uint64_t endTimeStamp = amd::Os::timeNanos();
  uint64_t startTimeStamp = endTimeStamp;

  // The TimeStamp object of a command is the last entry of its data, if any
  const auto timeStampOf = [](amd::Command* command) -> TimeStamp* {
    if (command->data().empty()) {
      return nullptr;
    }
    return reinterpret_cast<TimeStamp*>(command->data().back());
  };

  // Pass 1: locate the first command with a TimeStamp.
  //! \note The batch may have empty markers at the beginning.
  //! So the start/end of the empty commands is equal to
  //! the start of the first valid command in the batch.
  for (amd::Command* probe = cb->head_; probe != nullptr; probe = probe->getNext()) {
    TimeStamp* stamp = timeStampOf(probe);
    if (stamp == nullptr) {
      continue;
    }
    stamp->value(&startTimeStamp, &endTimeStamp);
    startTimeStamp -= readjustTimeGPU_;
    // Both values become the start of the first valid command
    endTimeStamp = startTimeStamp;
    break;
  }

  // Pass 2: construct the time line and complete the commands
  bool found = false;
  amd::Command* command = cb->head_;
  while (command != nullptr) {
    TimeStamp* stamp = timeStampOf(command);
    amd::Command* next = command->getNext();

    if (stamp == nullptr) {
      // Empty command: it starts and ends where the last valid command ended
      startTimeStamp = endTimeStamp;
    } else {
      stamp->value(&startTimeStamp, &endTimeStamp);
      endTimeStamp -= readjustTimeGPU_;
      startTimeStamp -= readjustTimeGPU_;
      // The TimeStamp object is no longer needed
      tsCache_->freeTimeStamp(stamp);
      command->data().clear();
    }

    // Move the command to its final status with the proper timestamps
    const auto status = command->status();
    if (status == CL_SUBMITTED) {
      command->setStatus(CL_RUNNING, startTimeStamp);
      command->setStatus(CL_COMPLETE, endTimeStamp);
    } else if (status == CL_RUNNING) {
      command->setStatus(CL_COMPLETE, endTimeStamp);
    } else if ((status != CL_COMPLETE) && (next != nullptr)) {
      LogPrintfError("Unexpected command status - %d!", status);
    }

    // Is this the command the caller is waiting on?
    if (command == waitingEvent) {
      found = true;
    }

    command->release();
    command = next;
  }

  return found;
}
// Registers the PAL memory of `memory` as a DOPP reference on the main engine
// queue, forwarding the two DOPP command flags unchanged.
void VirtualGPU::addDoppRef(const Memory* memory, bool lastDoppCmd, bool pfpaDoppCmd) {
  queues_[MainEngine]->addCmdDoppRef(
      memory->iMem(),
      lastDoppCmd,
      pfpaDoppCmd);
}
// Marks the begin (`type` == true) or the end (`type` == false) on the active
// profiling timestamp. Does nothing when no timestamp is active.
// `engine` is not used by this implementation.
void VirtualGPU::profileEvent(EngineType engine, bool type) const {
  if (profileTs_ != nullptr) {
    if (!type) {
      profileTs_->end();
    } else {
      profileTs_->begin();
    }
  }
}
// Prepares all memory related state of a kernel before its dispatch.
//
// The function
//  - validates the SVM pointers passed outside of the argument list,
//  - synchronizes the caches of the kernel's memory objects (non native mode),
//  - walks every kernel parameter and, depending on its type, tracks memory
//    dependencies, waits for resources, and patches the argument buffer
//    `params` in place (LDS offsets, image SRDs, reference objects, queues),
//  - adds global stores, the code segment and the scratch buffer to the
//    VM memory list.
//
// \param kernel              Kernel to process
// \param params              Argument buffer; patched in place through
//                            const_cast
// \param nativeMem           If true, the memory array in `params` holds
//                            device Memory pointers instead of amd::Memory
// \param ldsAddress          [out] Starts at the kernel's LDS size and grows
//                            by every local pointer argument
// \param imageBufferWrtBack  [out] Set to true when a non read-only image
//                            buffer with a backing copy was found; never
//                            reset to false here
// \param wrtBackImageBuffer  [out] Receives those image buffers
// \return false on an unsupported or invalid SVM pointer, a failed memory
//         access validation, a missing GPU memory object, a failed virtual
//         queue creation or when the LDS usage exceeds the device limit;
//         true otherwise
bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params,
bool nativeMem, size_t& ldsAddress, bool& imageBufferWrtBack,
std::vector<Image*>& wrtBackImageBuffer) {
const amd::KernelParameters& kernelParams = kernel.parameters();
// Mark the tracker with a new kernel,
// so we can avoid checks of the aliased objects
memoryDependency().newKernel();
size_t count = kernelParams.getNumberOfSvmPtr();
if (count > 0) {
bool supportFineGrainedSystem = dev().isFineGrainedSystem(true);
FGSStatus status = kernelParams.getSvmSystemPointersSupport();
// The kernel may require (FGS_YES) or forbid (FGS_NO) fine-grained system pointers
switch (status) {
case FGS_YES:
if (!supportFineGrainedSystem) {
return false;
}
break;
case FGS_NO:
supportFineGrainedSystem = false;
break;
case FGS_DEFAULT:
default:
break;
}
// get svm non argument information
void* const* svmPtrArray =
reinterpret_cast<void* const*>(params + kernelParams.getExecInfoOffset());
for (size_t i = 0; i < count; i++) {
amd::Memory* memory = amd::MemObjMap::FindMemObj(svmPtrArray[i]);
if (nullptr == memory) {
// The pointer has no memory object: only acceptable as a system pointer
if (!supportFineGrainedSystem) {
return false;
} else {
addBarrier(RgpSqqtBarrierReason::MemDependency);
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
continue;
}
} else {
// Validate Mem Access in case of VMM Memory
if (!memory->ValidateMemAccess(dev(), true)) {
return false;
}
Memory* gpuMemory = dev().getGpuMemory(memory);
if (nullptr != gpuMemory) {
// Synchronize data with other memory instances if necessary
gpuMemory->syncCacheFromHost(*this);
const static bool IsReadOnly = false;
// Validate SVM passed in the non argument list
memoryDependency().validate(*this, gpuMemory, IsReadOnly);
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
constexpr bool WaitOnBusyEngine = true;
gpuMemory->wait(*this, WaitOnBusyEngine);
// Mark signal write for cache coherency,
// since this object isn't a part of kernel arg setup
if ((memory->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
memory->signalWrite(&dev());
}
addVmMemory(gpuMemory);
} else {
return false;
}
}
}
}
// Set to true when a parameter needs the SRD resource list (see the end of the loop)
bool srdResource = false;
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(params + kernelParams.memoryObjOffset());
const pal::Kernel& hsaKernel = static_cast<const pal::Kernel&>(*(kernel.getDeviceKernel(dev())));
const amd::KernelSignature& signature = kernel.signature();
// Dynamic LDS allocations are placed behind the LDS used by the kernel itself
ldsAddress = hsaKernel.ldsSize();
if (!nativeMem) {
// Process cache coherency first, since the extra transfers may affect
// other mem dependency tracking logic: TS and signalWrite()
for (uint i = 0; i < signature.numMemories(); ++i) {
amd::Memory* mem = memories[i];
if (mem != nullptr) {
// Synchronize data with other memory instances if necessary
dev().getGpuMemory(mem)->syncCacheFromHost(*this);
}
}
}
// Check all parameters for the current kernel
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
const amd::KernelParameterDescriptor::InfoData& info = desc.info_;
// Find if current argument is a buffer
if (desc.type_ == T_POINTER) {
// If it is a local pointer
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
// For local pointers arrayIndex_ is used as the alignment of the LDS offset
ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
if (desc.size_ == 8) {
// Save the original LDS size
uint64_t ldsSize = *reinterpret_cast<const uint64_t*>(params + desc.offset_);
// Patch the LDS address in the original arguments with an LDS address(offset)
WriteAqlArgAt(const_cast<address>(params), ldsAddress, desc.size_, desc.offset_);
// Add the original size
ldsAddress += ldsSize;
} else {
// Save the original LDS size
uint32_t ldsSize = *reinterpret_cast<const uint32_t*>(params + desc.offset_);
// Patch the LDS address in the original arguments with an LDS address(offset)
uint32_t ldsAddr = ldsAddress;
WriteAqlArgAt(const_cast<address>(params), ldsAddr, desc.size_, desc.offset_);
// Add the original size
ldsAddress += ldsSize;
}
} else {
Memory* gpuMem = nullptr;
amd::Memory* mem = nullptr;
uint32_t index = info.arrayIndex_;
// In native mode the array holds device memory objects, otherwise amd::Memory objects
if (nativeMem) {
gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
if (nullptr != gpuMem) {
mem = gpuMem->owner();
}
} else {
mem = memories[index];
if (mem != nullptr) {
gpuMem = dev().getGpuMemory(mem);
}
}
//! This condition is for SVM fine-grain
if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
addBarrier(RgpSqqtBarrierReason::MemDependency);
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
continue;
} else if (gpuMem != nullptr) {
// Validate memory for a dependency in the queue
memoryDependency().validate(*this, gpuMem, info.readOnly_);
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
constexpr bool WaitOnBusyEngine = true;
gpuMem->wait(*this, WaitOnBusyEngine);
addVmMemory(gpuMem);
// NOTE(review): globalAddress is not used anywhere below
const void* globalAddress = *reinterpret_cast<const void* const*>(params + desc.offset_);
logVmMemory(desc.name_, gpuMem);
//! Check if compiler expects read/write.
//! Note: SVM with subbuffers has an issue with tracking.
//! Conformance can send read only subbuffer, but update the region
//! in the kernel.
if ((mem != nullptr) && ((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) ||
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) {
mem->signalWrite(&dev());
}
if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
if (gpuMem->memoryType() == Resource::ImageBuffer) {
Image* imageBuffer = static_cast<Image*>(gpuMem);
// Check if synchronization has to be performed
if (imageBuffer->CopyImageBuffer() != nullptr) {
Memory* buffer = dev().getGpuMemory(mem->parent());
amd::Image* image = mem->asImage();
amd::Coord3D offs(0);
// Copy memory from the original image buffer into the backing store image
// NOTE(review): the result of the copy is not checked
bool result = blitMgr().copyBufferToImage(
*buffer, *imageBuffer->CopyImageBuffer(), offs, offs, image->getRegion(), true,
image->getRowPitch(), image->getSlicePitch());
// Make sure the copy operation is done
addBarrier(RgpSqqtBarrierReason::MemDependency);
// Use backing store SRD as the replacement
uint64_t srd = imageBuffer->CopyImageBuffer()->hwSrd();
WriteAqlArgAt(const_cast<address>(params), srd, sizeof(srd), desc.offset_);
// Add backing store image to the list of memory handles
addVmMemory(imageBuffer->CopyImageBuffer());
// If it's not a read only resource, then runtime has to write back
if (!info.readOnly_) {
wrtBackImageBuffer.push_back(imageBuffer);
imageBufferWrtBack = true;
}
}
}
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
if (gpuMem->memoryType() == Resource::ImageView) {
// Copy the current image SRD into CB1
uint64_t srd = cb(1)->UploadDataToHw(gpuMem->hwState(), HsaImageObjectSize);
// Then use a pointer in aqlArgBuffer to CB1
// Patch the GPU VA address in the original arguments
WriteAqlArgAt(const_cast<address>(params), srd, sizeof(srd), desc.offset_);
addVmMemory(cb(1)->ActiveMemory());
} else {
srdResource = true;
}
if (gpuMem->desc().isDoppTexture_) {
addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(),
kernel.parameters().getExecPfpaVcop());
}
}
}
}
} else if (desc.type_ == T_VOID) {
if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
// Copy the current structure into CB1
size_t gpuPtr =
static_cast<size_t>(cb(1)->UploadDataToHw(params + desc.offset_, desc.size_));
// Then use a pointer in aqlArgBuffer to CB1
// NOTE(review): the lookup result is dereferenced without a check against
// patch().end() - presumably every reference object has a patch entry; confirm
const auto it = hsaKernel.patch().find(desc.offset_);
// Patch the GPU VA address in the original arguments
WriteAqlArgAt(const_cast<address>(params), gpuPtr, sizeof(size_t), it->second);
addVmMemory(cb(1)->ActiveMemory());
}
} else if (desc.type_ == T_SAMPLER) {
srdResource = true;
} else if (desc.type_ == T_QUEUE) {
uint32_t index = desc.info_.arrayIndex_;
const amd::DeviceQueue* queue =
reinterpret_cast<amd::DeviceQueue* const*>(params + kernelParams.queueObjOffset())[index];
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
uint64_t vmQueue;
// Use the device queue's own virtual queue, or create one on this virtual GPU
if (dev().settings().useDeviceQueue_) {
vmQueue = gpuQueue->vQueue()->vmAddress();
} else {
if (!createVirtualQueue(queue->size())) {
LogError("Virtual queue creation failed!");
return false;
}
vmQueue = vQueue()->vmAddress();
}
// Patch the GPU VA address in the original arguments
WriteAqlArgAt(const_cast<address>(params), vmQueue, sizeof(vmQueue), desc.offset_);
// NOTE(review): this break leaves the parameter loop, so parameters after
// the first queue argument are not processed - confirm that this is intended
break;
}
}
// The accumulated LDS usage must fit into the local memory of the device
if (ldsAddress > dev().info().localMemSize_) {
LogError("No local memory available\n");
return false;
}
if (srdResource || hsaKernel.prog().isStaticSampler()) {
dev().srds().fillResourceList(*this);
}
const static bool IsReadOnly = false;
for (const pal::Memory* mem : hsaKernel.prog().globalStores()) {
// Validate global store for a dependency in the queue
memoryDependency().validate(*this, mem, IsReadOnly);
addVmMemory(mem);
}
if (hsaKernel.prog().hasGlobalStores()) {
// Validate code object for a dependency in the queue
memoryDependency().validate(*this, &hsaKernel.prog().codeSegGpu(), IsReadOnly);
}
addVmMemory(&hsaKernel.prog().codeSegGpu());
if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
const Device::ScratchBuffer* scratch = dev().scratch(hwRing());
// Validate scratch buffer to force sync mode, because
// the current scratch logic is optimized for size and performance
// Note: runtime can skip sync if the same kernel is used,
// since the number of scratch regs remains the same
if (!IsSameKernel(kernel)) {
memoryDependency().validate(*this, scratch->memObj_, IsReadOnly);
}
addVmMemory(scratch->memObj_);
logVmMemory("scratch", scratch->memObj_);
}
// Synchronize dispatches unconditionally in case memory tracking is disabled
memoryDependency().sync(*this);
return true;
}
// Updates the kernel table address in the virtual queue header and writes the
// whole header to the start of the virtual queue through `hostQ`.
// With a null `kernelTable` the address is set to 0; otherwise the table's VM
// address is stored and the table is added to the VM memory list.
void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, const Memory* kernelTable) {
  if (kernelTable != nullptr) {
    vqHeader_->kernel_table = kernelTable->vmAddress();
    addVmMemory(kernelTable);
  } else {
    vqHeader_->kernel_table = 0;
  }

  // Header goes to offset 0 of the virtual queue
  virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, true);
}
// Checks whether the VM range of `src` touches the tracked SDMA range.
//
// On an overlap the tracked range is replaced by the VM range of `dst` and
// true is returned. Otherwise the tracked range is widened to include the VM
// range of `dst` and false is returned.
bool VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst) {
  const uint64_t srcVmEnd = src.vmAddress() + src.vmSize();

  // The source starts inside the tracked range
  const bool startsInside =
      (src.vmAddress() >= sdmaRange_.start_) && (src.vmAddress() <= sdmaRange_.end_);
  // The source ends inside the tracked range
  const bool endsInside = (srcVmEnd >= sdmaRange_.start_) && (srcVmEnd <= sdmaRange_.end_);
  // The source fully covers the tracked range
  const bool coversRange =
      (src.vmAddress() <= sdmaRange_.start_) && (srcVmEnd >= sdmaRange_.end_);

  if (!(startsInside || endsInside || coversRange)) {
    // No overlap: grow the tracked range by the destination
    sdmaRange_.start_ = std::min(sdmaRange_.start_, dst.vmAddress());
    sdmaRange_.end_ = std::max(sdmaRange_.end_, dst.vmAddress() + dst.vmSize());
    return false;
  }

  // Overlap: restart tracking from the destination range
  sdmaRange_.start_ = dst.vmAddress();
  sdmaRange_.end_ = dst.vmAddress() + dst.vmSize();
  return true;
}
// ================================================================================================
// Returns the hostcall buffer of this virtual queue, creating it on first use.
//
// The buffer is allocated as fine-grained SVM with atomics, sized for one
// packet per wave the device can run, and registered via
// amd::enableHostcalls(). Returns nullptr if the allocation or the
// registration fails.
// NOTE(review): when the registration fails, hostcallBuffer_ stays assigned,
// so a later call returns that buffer - confirm that this is intended.
void* VirtualGPU::getOrCreateHostcallBuffer() {
  // Fast path: the buffer exists already
  if (hostcallBuffer_ != nullptr) {
    return hostcallBuffer_;
  }

  // The number of packets required in each buffer is at least equal to the
  // maximum number of waves supported by the device.
  auto wavesPerCu = dev().info().maxThreadsPerCU_ / dev().info().wavefrontWidth_;
  auto numPackets = dev().info().maxComputeUnits_ * wavesPerCu;

  auto size = amd::getHostcallBufferSize(numPackets);
  auto align = amd::getHostcallBufferAlignment();

  hostcallBuffer_ = dev().svmAlloc(dev().context(), size, align,
                                   CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, nullptr);
  if (hostcallBuffer_ == nullptr) {
    ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, "Failed to create hostcall buffer");
    return nullptr;
  }

  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
          "Created hostcall buffer %p (numPackets == %d, size == %d, align == %d) for virtual "
          "queue %p\n",
          hostcallBuffer_, numPackets, size, align, this);

  const bool registered = amd::enableHostcalls(dev(), hostcallBuffer_, numPackets);
  if (!registered) {
    ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, "Failed to register hostcall buffer %p with listener",
            hostcallBuffer_);
    return nullptr;
  }

  return hostcallBuffer_;
}
} // namespace amd::pal