/* Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
*/

#include "platform/perfctr.hpp"
#include "platform/threadtrace.hpp"
#include "platform/kernel.hpp"
#include "platform/commandqueue.hpp"
#include "device/pal/palconstbuf.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/palkernel.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palcounters.hpp"
#include "device/pal/palthreadtrace.hpp"
#include "device/pal/paltimestamp.hpp"
#include "device/pal/palblit.hpp"
#include "device/appprofile.hpp"
#include "device/devhostcall.hpp"
#include "hsa.h"
#include "amd_hsa_kernel_code.h"
#include "amd_hsa_queue.h"

// NOTE(review): the names of the system headers below were not readable in the
// reviewed text; they are reconstructed from the standard facilities this file
// uses. Confirm against the original include list before merging.
#include <fstream>
#include <sstream>
#include <algorithm>
#include <thread>
#include <cstring>
#include <cwchar>
#include <limits>

#include "palQueue.h"
#include "palFence.h"
#include "palQueueSemaphore.h"

#ifdef _WIN32
// NOTE(review): system header name reconstructed - confirm.
#include <d3d10_1.h>
#include "platform/interop_d3d9.hpp"
#include "platform/interop_d3d10.hpp"
#include "platform/interop_d3d11.hpp"
#endif  // _WIN32

namespace amd::pal {

// Counts the entries of the device queue pool whose engine type equals `type`.
uint32_t VirtualGPU::Queue::AllocedQueues(const VirtualGPU& gpu, Pal::EngineType type) {
  uint32_t allocedQueues = 0;
  for (const auto& queue : gpu.dev().QueuePool()) {
    allocedQueues += (queue.second->engineType_ == type) ? 1 : 0;
  }
  return allocedQueues;
}

// ================================================================================================
// Creates a Queue wrapper together with its PAL queue, command buffers and fences.
//
// Non-realtime compute/DMA queues go through the device queue pool: while fewer than
// GPU_MAX_HW_QUEUES queues of the engine type exist a new PAL queue is created and registered,
// otherwise the least used pooled PAL queue is shared. Realtime queues always get their own
// PAL queue and QueueRecycleInfo.
//
// Returns the new queue, or nullptr on any failure. On failure every object allocated by this
// call is released.
VirtualGPU::Queue* VirtualGPU::Queue::Create(VirtualGPU& gpu, Pal::QueueType queueType,
                                             uint engineIdx, Pal::ICmdAllocator* cmdAllocator,
                                             uint rtCU, amd::CommandQueue::Priority priority,
                                             uint64_t residency_limit,
                                             uint max_command_buffers) {
  Pal::IDevice* palDev = gpu.dev().iDev();
  Pal::Result result;
  Pal::CmdBufferCreateInfo cmdCreateInfo = {};
  Pal::QueueCreateInfo qCreateInfo = {};
  qCreateInfo.engineIndex = (queueType == Pal::QueueTypeCompute)
      ? gpu.dev().computeEnginesId()[engineIdx]
      : engineIdx;
  qCreateInfo.aqlQueue = true;
  qCreateInfo.queueType = queueType;
  qCreateInfo.priority = Pal::QueuePriority::Normal;

  if (queueType == Pal::QueueTypeDma) {
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeDma;
  } else {
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
  }

  // Iterator into the map of exclusive compute engines; starts at end() (nothing selected)
  auto it = gpu.dev().exclusiveComputeEnginesId().end();
  if ((priority == amd::CommandQueue::Priority::Medium) &&
      (amd::CommandQueue::RealTimeDisabled == rtCU)) {
    it = gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::Medium);
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
    qCreateInfo.priority = Pal::QueuePriority::Medium;
  } else if (amd::CommandQueue::RealTimeDisabled != rtCU) {
    if (gpu.dev().settings().enableWgpMode_) {
      rtCU = rtCU * 2;
    }
    qCreateInfo.numReservedCu = amd::alignDown(
        rtCU,
        gpu.dev().properties().engineProperties[Pal::EngineTypeCompute].dedicatedCuGranularity);
    if (qCreateInfo.numReservedCu == 0) {
      return nullptr;
    }
    if ((priority == amd::CommandQueue::Priority::Medium) &&
        // If Windows HWS is enabled, then the both real time queues are allocated
        // on the same engine
        (gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1) !=
         gpu.dev().exclusiveComputeEnginesId().end())) {
      it = gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1);
    } else {
      it = gpu.dev().exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime0);
    }
    cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute;
    cmdCreateInfo.flags.realtimeComputeUnits = true;
    qCreateInfo.priority = Pal::QueuePriority::Realtime;
    // If the app creates an exclusive compute, then find the engine id
    if (qCreateInfo.engineType == Pal::EngineTypeCompute) {
      if (it != gpu.dev().exclusiveComputeEnginesId().end()) {
        qCreateInfo.engineIndex = it->second;
      } else {
        return nullptr;
      }
    }
  }

  // Find queue object size
  size_t qSize = palDev->GetQueueSize(qCreateInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  cmdCreateInfo.pCmdAllocator = cmdAllocator;
  cmdCreateInfo.queueType = queueType;

  // Find command buffer object size
  size_t cmdSize = palDev->GetCmdBufferSize(cmdCreateInfo, &result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  // Find fence object size
  size_t fSize = palDev->GetFenceSize(&result);
  if (result != Pal::Result::Success) {
    return nullptr;
  }

  // The PAL objects are placed in the extra storage that follows the Queue object:
  // [queue][command buffers][fences]
  size_t allocSize = qSize + max_command_buffers * (cmdSize + fSize);
  VirtualGPU::Queue* queue =
      new (allocSize) VirtualGPU::Queue(gpu, palDev, residency_limit, max_command_buffers);
  if (queue != nullptr) {
    address addrQ = nullptr;
    if (((qCreateInfo.engineType == Pal::EngineTypeCompute) ||
         (qCreateInfo.engineType == Pal::EngineTypeDma)) &&
        (qCreateInfo.priority != Pal::QueuePriority::Realtime)) {
      uint32_t index = AllocedQueues(gpu, qCreateInfo.engineType);
      // Create PAL queue object
      if (index < GPU_MAX_HW_QUEUES) {
        Device::QueueRecycleInfo* info = new (qSize) Device::QueueRecycleInfo();
        if (info == nullptr) {
          LogError("Could not create QueueRecycleInfo!");
          // Don't leak the wrapper allocated above
          delete queue;
          return nullptr;
        }
        // The pooled PAL queue lives in the storage that follows its recycle info
        addrQ = reinterpret_cast<address>(&info[1]);
        qCreateInfo.aqlPacketList = info->AqlPacketList();
        result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
        if (result == Pal::Result::Success) {
          const_cast<Device&>(gpu.dev()).QueuePool().insert({queue->iQueue_, info});
          info->engineType_ = qCreateInfo.engineType;
          // Save unique index for scratch buffer access
          info->index_ = index;
        } else {
          // The info was never registered in the pool, so it has to be released here.
          // The wrapper goes first, because it may still reference the storage inside info.
          delete queue;
          delete info;
          return nullptr;
        }
      } else {
        int usage = std::numeric_limits<int>::max();
        uint indexBase = std::numeric_limits<uint>::max();
        // Loop through all allocated queues and find the lowest usage
        for (const auto& entry : gpu.dev().QueuePool()) {
          if ((qCreateInfo.engineType == entry.second->engineType_) &&
              (entry.second->counter_ <= usage)) {
            if ((entry.second->counter_ < usage) ||
                // Preserve the order of allocations, because SDMA engines
                // should be used in round-robin manner
                ((entry.second->counter_ == usage) && (entry.second->index_ < indexBase))) {
              queue->iQueue_ = entry.first;
              usage = entry.second->counter_;
              indexBase = entry.second->index_;
            }
          }
        }
        // Increment the usage of the current queue
        gpu.dev().QueuePool().find(queue->iQueue_)->second->counter_++;
      }
      Device::QueueRecycleInfo* info = gpu.dev().QueuePool().find(queue->iQueue_)->second;
      queue->aql_mgmt_ = &info->aql_packet_mgmt_;
      queue->lock_ = &info->queue_lock_;
      addrQ = reinterpret_cast<address>(&queue[1]);
    } else {
      Device::QueueRecycleInfo* info = new Device::QueueRecycleInfo();
      if (info == nullptr) {
        LogError("Could not create QueueRecycleInfo!");
        // Don't leak the wrapper allocated above
        delete queue;
        return nullptr;
      }
      queue->info_ = info;
      queue->aql_mgmt_ = &info->aql_packet_mgmt_;
      // Exclusive compute path
      addrQ = reinterpret_cast<address>(&queue[1]);
      qCreateInfo.aqlPacketList = info->AqlPacketList();
      result = palDev->CreateQueue(qCreateInfo, addrQ, &queue->iQueue_);
    }
    if (result != Pal::Result::Success) {
      delete queue;
      return nullptr;
    }
    queue->UpdateAppPowerProfile();

    address addrCmd = addrQ + qSize;
    address addrF = addrCmd + max_command_buffers * cmdSize;
    Pal::CmdBufferBuildInfo cmdBuildInfo = {};

    for (uint i = 0; i < max_command_buffers; ++i) {
      result = palDev->CreateCmdBuffer(cmdCreateInfo, &addrCmd[i * cmdSize],
                                       &queue->iCmdBuffs_[i]);
      if (result != Pal::Result::Success) {
        delete queue;
        return nullptr;
      }

      Pal::FenceCreateInfo fenceCreateinfo = {};
      fenceCreateinfo.flags.signaled = false;
      result = palDev->CreateFence(fenceCreateinfo, &addrF[i * fSize], &queue->iCmdFences_[i]);
      if (result != Pal::Result::Success) {
        delete queue;
        return nullptr;
      }

      // Only the first command buffer is opened for recording here
      if (i == StartCmdBufIdx) {
        result = queue->iCmdBuffs_[i]->Begin(cmdBuildInfo);
        if (result != Pal::Result::Success) {
          delete queue;
          return nullptr;
        }
      }
    }
  }
  return queue;
}

// Releases the PAL objects owned by the wrapper. A pooled PAL queue (lock_ != nullptr) is
// destroyed and unregistered only when its usage counter drops to zero.
VirtualGPU::Queue::~Queue() {
  delete reinterpret_cast<Device::QueueRecycleInfo*>(info_);

  if (nullptr != iQueue_) {
    // Make sure the queues are idle
    // It's unclear why PAL could still have a busy queue
    amd::ScopedLock l(lock_);
    iQueue_->WaitIdle();
  }

  // Remove all memory references
  std::vector<Pal::IGpuMemory*> memRef;
  memRef.reserve(memReferences_.size());
  for (const auto& it : memReferences_) {
    memRef.push_back(it.first->iMem());
  }
  if (memRef.size() != 0) {
    iDev_->RemoveGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_);
  }
  memReferences_.clear();

  for (uint i = 0; i < max_command_buffers_; ++i) {
    if (nullptr != iCmdBuffs_[i]) {
      iCmdBuffs_[i]->Destroy();
    }
    if (nullptr != iCmdFences_[i]) {
      iCmdFences_[i]->Destroy();
    }
  }

  if (nullptr != iQueue_) {
    // Find if this queue was used in recycling
    if (lock_ != nullptr) {
      // Single lookup; the iterator stays valid until the erase() below
      const auto info = gpu_.dev().QueuePool().find(iQueue_);
      // Release the queue if the counter is 0
      if (--info->second->counter_ == 0) {
        iQueue_->Destroy();
        // Readjust HW queue index for scratch buffer access
        for (auto& queue : gpu_.dev().QueuePool()) {
          if ((queue.second->engineType_ == info->second->engineType_) &&
              (queue.second->index_ > info->second->index_)) {
            queue.second->index_--;
          }
        }
        delete info->second;
        const_cast<Device&>(gpu_.dev()).QueuePool().erase(iQueue_);
      }
    } else {
      iQueue_->Destroy();
    }
  }
}

// Passes the application file name and full path to PAL for this queue.
Pal::Result VirtualGPU::Queue::UpdateAppPowerProfile() {
  std::wstring wsAppPathAndFileName = Device::appProfile()->wsAppPathAndFileName();

  const wchar_t* wAppPathAndName = wsAppPathAndFileName.c_str();
  // Find the last occurrence of the '\\' character and extract the name of the application as
  // wide char. Without a separator the whole string is used as the name.
  const wchar_t* wAppNamePtr = wcsrchr(wAppPathAndName, '\\');
  const wchar_t* wAppName = wAppNamePtr ? wAppNamePtr + 1 : wAppPathAndName;

  return iQueue_->UpdateAppPowerProfile(wAppName, wAppPathAndName);
}

// Records that `mem` is used by the command buffer in the current slot. A memory object seen
// for the first time is also queued for PAL residency on the next flush().
void VirtualGPU::Queue::addCmdMemRef(GpuMemoryReference* mem) {
  if (gpu_.dev().settings().alwaysResident_) {
    return;
  }
  Pal::IGpuMemory* iMem = mem->iMem();
  auto it = memReferences_.find(mem);
  if (it != memReferences_.end()) {
    // Already tracked, just refresh the slot of the last use
    it->second = cmdBufIdSlot_;
  } else {
    // Update runtime tracking with TS
    memReferences_[mem] = cmdBufIdSlot_;
    // Update PAL list with the new entry
    Pal::GpuMemoryRef memRef = {};
    memRef.pGpuMemory = iMem;
    palMemRefs_.push_back(memRef);
    // Check SDI memory object
    if (iMem->Desc().flags.isExternPhys && (sdiReferences_.find(iMem) == sdiReferences_.end())) {
      sdiReferences_.insert(iMem);
      palSdiRefs_.push_back(iMem);
    }
    residency_size_ += iMem->Desc().size;
  }
}

// Drops the tracking of `mem`. The PAL reference is removed only if the object was tracked.
void VirtualGPU::Queue::removeCmdMemRef(GpuMemoryReference* mem) {
  Pal::IGpuMemory* iMem = mem->iMem();
  if (0 != memReferences_.erase(mem)) {
    iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
    residency_size_ -= iMem->Desc().size;
  }
}

// Adds or updates the DOPP reference of `iMem` for the next submission.
void VirtualGPU::Queue::addCmdDoppRef(Pal::IGpuMemory* iMem, bool lastDoppCmd, bool pfpaDoppCmd) {
  for (size_t i = 0; i < palDoppRefs_.size(); i++) {
    if (palDoppRefs_[i].pGpuMemory == iMem) {
      // If both LAST_DOPP_SUBMISSION and PFPA_DOPP_SUBMISSION VCOPs are requested,
      // the LAST_DOPP_SUBMISSION is sent as requested by KMD
      //
      if (palDoppRefs_[i].flags.lastPfpaCmd == 1) {
        return;  // no need to override the last submission command
      }
      if (lastDoppCmd) {
        palDoppRefs_[i].flags.lastPfpaCmd = 1;
        palDoppRefs_[i].flags.pfpa = 0;
      } else if (pfpaDoppCmd) {
        palDoppRefs_[i].flags.pfpa = 1;
      }
      return;
    }
  }
  // this is the first reference of the DOPP desktop texture, add it in the vector
  Pal::DoppRef doppRef = {};
  doppRef.flags.pfpa = pfpaDoppCmd ? 1 : 0;
  doppRef.flags.lastPfpaCmd = lastDoppCmd ? 1 : 0;
  doppRef.pGpuMemory = iMem;
  palDoppRefs_.push_back(doppRef);
}

// ================================================================================================
// Finalizes and submits the current command buffer, then moves to the next slot and opens its
// command buffer for recording. Returns false if any PAL call fails.
bool VirtualGPU::Queue::flush() {
  amd::ScopedLock l(lock_);
  const Settings& settings = gpu_.dev().settings();

  if (!settings.alwaysResident_ && palMemRefs_.size() != 0) {
    Pal::Result result = iDev_->AddGpuMemoryReferences(palMemRefs_.size(), &palMemRefs_[0],
                                                       iQueue_, Pal::GpuMemoryRefCantTrim);
    if (Pal::Result::Success != result) {
      LogPrintfError("PAL failed to make resident resources! result: %d", result);
      return false;
    }
    palMemRefs_.clear();
  }

  // Stop commands building
  Pal::Result result;
  result = iCmdBuffs_[cmdBufIdSlot_]->End();
  if (Pal::Result::Success != result) {
    LogPrintfError("PAL failed to finalize a command buffer! result: %d", result);
    return false;
  }

  // Reset the fence. PAL will reset OS event
  result = iDev_->ResetFences(1, &iCmdFences_[cmdBufIdSlot_]);
  if (Pal::Result::Success != result) {
    LogPrintfError("PAL failed to reset a fence! result:%d", result);
    return false;
  }

  Pal::PerSubQueueSubmitInfo perSubQueueSubmitInfo = {};
  perSubQueueSubmitInfo.cmdBufferCount = 1;
  perSubQueueSubmitInfo.ppCmdBuffers = &iCmdBuffs_[cmdBufIdSlot_];

  Pal::MultiSubmitInfo submitInfo = {};
  submitInfo.perSubQueueInfoCount = 1;
  submitInfo.pPerSubQueueInfo = &perSubQueueSubmitInfo;
  submitInfo.doppRefCount = palDoppRefs_.size();
  submitInfo.pDoppRefs = palDoppRefs_.data();
  submitInfo.externPhysMemCount = palSdiRefs_.size();
  submitInfo.ppExternPhysMem = palSdiRefs_.data();
  submitInfo.fenceCount = 1;
  submitInfo.ppFences = &iCmdFences_[cmdBufIdSlot_];

  if (iQueue_->Type() == Pal::QueueTypeCompute) {
    if (gpu_.dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) {
      // If runtime uses device memory for kernel arguments, then perform a CPU read back on
      // submission. That will make sure NBIO pushes all previous CPU write requests through PCIE
      gpu_.managedBuffer().CpuReadBack();
    }
    if (amd::IS_HIP) {
      // HIP disables per resource tracking, because the app may embed SVM ptr into other buffers.
      // Force CPU sync if there are pending operations on SDMA, until OS fences will be added
      gpu_.WaitForIdleSdma();
    }
  }

  // Submit command buffer to OS
  if (gpu_.rgpCaptureEna()) {
    result = gpu_.dev().captureMgr()->TimedQueueSubmit(iQueue_, cmdBufIdCurrent_, submitInfo);
  } else {
    result = iQueue_->Submit(submitInfo);
  }
  if (Pal::Result::Success != result) {
    LogPrintfError("PAL failed to submit CMD! result:%d", result);
    if (GPU_ANALYZE_HANG) {
      DumpMemoryReferences();
    }
    return false;
  }

  // Optionally wait for the submission that was just made
  if (GPU_FLUSH_ON_EXECUTION) {
    waitForFence(cmdBufIdSlot_);
  }

  // Reset the counter of commands
  cmdCnt_ = 0;

  // Find the next command buffer
  cmdBufIdCurrent_++;
  if (cmdBufIdCurrent_ == GpuEvent::InvalidID) {
    // Wait for the last one
    waitForFence(cmdBufIdSlot_);
    cmdBufIdCurrent_ = 1;
    cmbBufIdRetired_ = 0;
  }

  // Wrap current slot and make sure it isn't busy
  cmdBufIdSlot_ = cmdBufIdCurrent_ % max_command_buffers_;
  waitForFence(cmdBufIdSlot_);

  // Progress retired TS
  if ((cmdBufIdCurrent_ > max_command_buffers_) &&
      (cmbBufIdRetired_ < (cmdBufIdCurrent_ - max_command_buffers_))) {
    cmbBufIdRetired_ = cmdBufIdCurrent_ - max_command_buffers_;
  }

  // Reset command buffer, so CB chunks could be reused
  result = iCmdBuffs_[cmdBufIdSlot_]->Reset(nullptr, false);
  if (Pal::Result::Success != result) {
    LogPrintfError("PAL failed CB reset! result:%d", result);
    return false;
  }

  // Start command buffer building
  Pal::CmdBufferBuildInfo cmdBuildInfo = {};
  cmdBuildInfo.pMemAllocator = &vlAlloc_;
  result = iCmdBuffs_[cmdBufIdSlot_]->Begin(cmdBuildInfo);
  if (Pal::Result::Success != result) {
    LogPrintfError("PAL failed CB building initialization! result:%d", result);
    return false;
  }

  // Clear dopp references
  palDoppRefs_.clear();
  palSdiRefs_.clear();

  // Remove old memory references
  if ((memReferences_.size() > 2048) || (residency_size_ > residency_limit_)) {
    for (auto it = memReferences_.begin(); it != memReferences_.end();) {
      // Entries last used in the slot that is being reused can be dropped
      if (it->second == cmdBufIdSlot_) {
        palMems_.push_back(it->first->iMem());
        residency_size_ -= it->first->iMem()->Desc().size;
        it = memReferences_.erase(it);
      } else {
        ++it;
      }
    }
  }
  if (!settings.alwaysResident_ && palMems_.size() != 0) {
    iDev_->RemoveGpuMemoryReferences(palMems_.size(), &palMems_[0], iQueue_);
    palMems_.clear();
  }

  return true;
}

// ================================================================================================
// Blocks until the command buffer with the given id has completed.
// Returns false if the wait failed or if `id` is still the current one after isDone().
bool VirtualGPU::Queue::waitForEvent(uint id) {
  amd::ScopedLock l(lock_);
  if (isDone(id)) {
    return true;
  }
  if (id == cmdBufIdCurrent_) {
    // There is an error in the flush() and wait is bogus
    return false;
  }

  uint slotId = id % max_command_buffers_;
  bool result = waitForFence(slotId);
  cmbBufIdRetired_ = id;
  return result;
}

// ================================================================================================
// Non-blocking completion check for the command buffer with the given id. If `id` is the
// command buffer being recorded, it is flushed first.
bool VirtualGPU::Queue::isDone(uint id) {
  amd::ScopedLock l(lock_);
  // Ids at or below the retired mark, and ids that were never issued, count as done
  if ((id <= cmbBufIdRetired_) || (id > cmdBufIdCurrent_)) {
    return true;
  }

  if (id == cmdBufIdCurrent_) {
    // Flush the current command buffer
    if (!flush()) {
      // If flush failed, then exit earlier...
      return false;
    }
  }

  if (Pal::Result::Success != iCmdFences_[id % max_command_buffers_]->GetStatus()) {
    return false;
  }
  cmbBufIdRetired_ = id;
  return true;
}

// ================================================================================================
// Appends the tracked memory references and the pointer arguments of the last kernel to
// "ocl_hang_dump.txt". Does nothing if the file can't be opened.
void VirtualGPU::Queue::DumpMemoryReferences() const {
  std::fstream dump;
  std::stringstream file_name("ocl_hang_dump.txt");
  uint64_t start = amd::Os::timeNanos() / 1e9;

  dump.open(file_name.str().c_str(), (std::fstream::out | std::fstream::app));
  // Proceed only if the dump file is available
  if (dump.is_open()) {
    dump << start << " Queue: ";
    switch (iQueue_->Type()) {
      case Pal::QueueTypeCompute:
        dump << "Compute";
        break;
      case Pal::QueueTypeDma:
        dump << "SDMA";
        break;
      default:
        dump << "unknown";
        break;
    }
    dump << "\n"
         << "Resident memory resources:\n";
    uint idx = 0;
    for (const auto& it : memReferences_) {
      dump << " " << idx << "\t[";
      dump.setf(std::ios::hex, std::ios::basefield);
      dump.setf(std::ios::showbase);
      dump << (it.first)->iMem()->Desc().gpuVirtAddr << ", "
           << (it.first)->iMem()->Desc().gpuVirtAddr + (it.first)->iMem()->Desc().size;
      // Clear the hex bit as well when switching back to decimal
      dump.setf(std::ios::dec, std::ios::basefield);
      dump << "] CbId:" << it.second << ", Heap: " << (it.first)->iMem()->Desc().heaps[0] << "\n";
      idx++;
    }

    if (last_kernel_ != nullptr) {
      const amd::KernelSignature& signature = last_kernel_->signature();
      dump << last_kernel_->name() << std::endl;
      for (size_t i = 0; i < signature.numParameters(); ++i) {
        const amd::KernelParameterDescriptor& desc = signature.at(i);
        // Find if the current argument is a memory object
        if ((desc.type_ == T_POINTER) &&
            (desc.addressQualifier_ != CL_KERNEL_ARG_ADDRESS_LOCAL)) {
          dump << " " << desc.name_ << ": " << std::endl;
        }
      }
    }
    dump.close();
  }
}

// Allocates the tracking array for `numMemObj` entries. Tracking stays disabled when
// numMemObj is 0.
bool VirtualGPU::MemoryDependency::create(size_t numMemObj) {
  if (numMemObj > 0) {
    // Allocate the array of memory objects for dependency tracking
    memObjectsInQueue_ = new MemoryState[numMemObj];
    if (nullptr == memObjectsInQueue_) {
      return false;
    }
    memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj);
    maxMemObjectsInQueue_ = numMemObj;
  }

  return true;
}

// Checks the address range of `memory` against the ranges recorded by the previous launches
// and inserts a barrier if there is a hazard or if the tracking array is full. The range is
// then always recorded for the current launch.
void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory,
                                            bool readOnly) {
  bool flushL1Cache = false;

  if (maxMemObjectsInQueue_ == 0) {
    // Return earlier if tracking is disabled
    return;
  }

  uint64_t curStart = memory->vmAddress();
  uint64_t curEnd = curStart + memory->size();

  if (memory->isModified(gpu) || !readOnly) {
    // Mark resource as modified
    memory->setModified(gpu, !readOnly);

    // Loop through all memory objects in the queue and find dependency
    // @note don't include objects from the current kernel
    for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
      // Check if the queue already contains this mem object and
      // GPU operations aren't readonly
      uint64_t busyStart = memObjectsInQueue_[j].start_;
      uint64_t busyEnd = memObjectsInQueue_[j].end_;

      // Check if the start inside the busy region
      if ((((curStart >= busyStart) && (curStart < busyEnd)) ||
           // Check if the end inside the busy region
           ((curEnd > busyStart) && (curEnd <= busyEnd)) ||
           // Check if the start/end cover the busy region
           ((curStart <= busyStart) && (curEnd >= busyEnd))) &&
          // If the busy region was written or the current one is for write
          (!memObjectsInQueue_[j].readOnly_ || !readOnly)) {
        flushL1Cache = true;
        break;
      }
    }
  }

  // Did we reach the limit?
  if (maxMemObjectsInQueue_ <= numMemObjectsInQueue_) {
    flushL1Cache = true;
  }

  if (flushL1Cache) {
    // Flush cache
    gpu.addBarrier(RgpSqqtBarrierReason::MemDependency);

    // Clear memory dependency state
    const static bool All = true;
    clear(!All);
  }

  // Insert current memory object into the queue always,
  // since runtime calls flush before kernel execution and it has to keep
  // current kernel in tracking
  memObjectsInQueue_[numMemObjectsInQueue_].start_ = curStart;
  memObjectsInQueue_[numMemObjectsInQueue_].end_ = curEnd;
  memObjectsInQueue_[numMemObjectsInQueue_].readOnly_ = readOnly;
  numMemObjectsInQueue_++;
}

// Drops the entries of the previous launches. With `all` set the entries of the current
// launch are dropped as well. If nothing can be dropped and the array is full, it is doubled.
void VirtualGPU::MemoryDependency::clear(bool all) {
  if (numMemObjectsInQueue_ > 0) {
    if (all) {
      endMemObjectsInQueue_ = numMemObjectsInQueue_;
    }

    // If the current launch didn't start from the beginning, then move the data
    if (0 != endMemObjectsInQueue_) {
      size_t i, j;
      // Preserve all objects from the current kernel
      for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) {
        memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_;
        memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_;
        memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_;
      }
    } else if (numMemObjectsInQueue_ >= maxMemObjectsInQueue_) {
      // note: The array growth shouldn't occur under the normal conditions,
      // but in a case when SVM path sends the amount of SVM ptrs over
      // the max size of kernel arguments
      MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1];
      if (nullptr == ptr) {
        numMemObjectsInQueue_ = 0;
        return;
      }
      maxMemObjectsInQueue_ <<= 1;
      memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_);
      delete[] memObjectsInQueue_;
      memObjectsInQueue_ = ptr;
    }

    // Adjust the number of active objects
    numMemObjectsInQueue_ -= endMemObjectsInQueue_;
    endMemObjectsInQueue_ = 0;
  }
}

// Keeps `mem` in the list of pinned memory objects to delay its destruction. The oldest entry
// is released once the list holds more than 7 objects. Nothing is done if the list already has
// an object that covers the same host range.
void VirtualGPU::addPinnedMem(amd::Memory* mem) {
  if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
    if (pinnedMems_.size() > 7) {
      pinnedMems_.front()->release();
      pinnedMems_.erase(pinnedMems_.begin());
    }

    // Start operation, since we should release mem object
    flushDMA(dev().getGpuMemory(mem)->getGpuEvent(*this)->engineId_);

    // Delay destruction
    pinnedMems_.push_back(mem);
  }
}

// Releases every pinned memory object and empties the list.
void VirtualGPU::releasePinnedMem() {
  for (auto& amdMemory : pinnedMems_) {
    amdMemory->release();
  }
  pinnedMems_.clear();
}

// Returns a pinned memory object with host pointer `addr` that is at least `size` bytes,
// or nullptr if there is none.
amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
  for (auto& amdMemory : pinnedMems_) {
    if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) {
      return amdMemory;
    }
  }
  return nullptr;
}

// Allocates and initializes the memory of the device (virtual) queue. The buffer holds the
// queue header, the AmdAqlWrap slots, the per-slot kernel arguments with wait events, the
// device events and the two mask arrays. Returns true right away if a queue of the same
// (aligned) size already exists.
bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
  uint MinDeviceQueueSize = 16 * 1024;
  deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize);

  maskGroups_ = deviceQueueSize / (512 * Ki);
  maskGroups_ = (maskGroups_ == 0) ? 1 : maskGroups_;

  // Align the queue size for the multiple dispatch scheduler.
  // Each thread works with 32 entries * maskGroups
  uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_);
  if (extra != 0) {
    deviceQueueSize += (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_) - extra;
  }

  if (deviceQueueSize_ == deviceQueueSize) {
    return true;
  } else {
    // The size has changed, drop the old queue
    delete vqHeader_;
    delete virtualQueue_;
    vqHeader_ = nullptr;
    virtualQueue_ = nullptr;
    deviceQueueSize_ = 0;
  }

  uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
  uint allocSize = deviceQueueSize;

  // Add the virtual queue header
  allocSize += sizeof(AmdVQueueHeader);
  allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap));

  uint argOffs = allocSize;

  // Add the kernel arguments and wait events
  uint singleArgSize = amd::alignUp(
      dev().info().maxParameterSize_ + 64 + dev().settings().numWaitEvents_ * sizeof(uint64_t),
      sizeof(AmdAqlWrap));
  allocSize += singleArgSize * numSlots;

  uint eventsOffs = allocSize;
  // Add the device events
  allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent);

  uint eventMaskOffs = allocSize;
  // Add mask array for events
  allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8;

  uint slotMaskOffs = allocSize;
  // Add mask array for AmdAqlWrap slots
  allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;

  // Align size to 64 bytes for more efficient fill operation
  allocSize = amd::alignUp(allocSize, 8 * sizeof(uint64_t));

  virtualQueue_ = new Memory(dev(), allocSize);
  Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? Resource::Local : Resource::Remote;
  if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
    return false;
  }

  if (GPU_PRINT_CHILD_KERNEL != 0) {
    address ptr = reinterpret_cast<address>(virtualQueue_->map(this, Resource::WriteOnly));
    if (nullptr == ptr) {
      return false;
    }
  }

  // Zero the whole buffer
  uint64_t pattern = 0;
  amd::Coord3D origin(0, 0, 0);
  amd::Coord3D region(virtualQueue_->size(), 0, 0);
  if (!dev().xferMgr().fillBuffer(*virtualQueue_, &pattern, sizeof(pattern), region, origin,
                                  region)) {
    return false;
  }

  uint64_t vaBase = virtualQueue_->vmAddress();
  AmdVQueueHeader header = {};
  // Initialize the virtual queue header
  header.aql_slot_num = numSlots;
  header.event_slot_num = dev().settings().numDeviceEvents_;
  header.event_slot_mask = vaBase + eventMaskOffs;
  header.event_slots = vaBase + eventsOffs;
  header.aql_slot_mask = vaBase + slotMaskOffs;
  header.wait_size = dev().settings().numWaitEvents_;
  header.arg_size = dev().info().maxParameterSize_ + 64;
  header.mask_groups = maskGroups_;

  vqHeader_ = new AmdVQueueHeader;
  if (nullptr == vqHeader_) {
    return false;
  }
  *vqHeader_ = header;

  virtualQueue_->writeRawData(*this, 0, sizeof(AmdVQueueHeader), &header, false);

  // Go over all slots and perform initialization
  AmdAqlWrap slot = {};
  size_t offset = sizeof(AmdVQueueHeader);
  for (uint i = 0; i < numSlots; ++i) {
    uint64_t argStart = vaBase + argOffs + i * singleArgSize;
    slot.aql.kernarg_address = reinterpret_cast<void*>(argStart);
    slot.wait_list = argStart + dev().info().maxParameterSize_ + 64;
    virtualQueue_->writeRawData(*this, offset, sizeof(AmdAqlWrap), &slot, false);
    offset += sizeof(AmdAqlWrap);
  }

  deviceQueueSize_ = deviceQueueSize;

  return true;
}

// ================================================================================================
// Registers the new virtual GPU in the device list. The actual resources are allocated
// in create().
VirtualGPU::VirtualGPU(Device& device)
    : device::VirtualDevice(device),
      engineID_(MainEngine),
      gpuDevice_(static_cast<Device&>(device)),
      printfDbgHSA_(nullptr),
      tsCache_(nullptr),
      managedBuffer_(*this, device.settings().stagedXferSize_ + 32 * Ki),
      writeBuffer_(device, managedBuffer_, device.settings().stagedXferSize_),
      hwRing_(0),
      readjustTimeGPU_(0),
      lastTS_(nullptr),
      profileTs_(nullptr),
      vqHeader_(nullptr),
      virtualQueue_(nullptr),
      schedParams_(nullptr),
      deviceQueueSize_(0),
      maskGroups_(1),
      hsaQueueMem_(nullptr),
      cmdAllocator_(nullptr) {
  // Note: Virtual GPU device creation must be a thread safe operation
  index_ = gpuDevice_.numOfVgpus_++;
  gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus());
  gpuDevice_.vgpus_[index()] = this;

  queues_[MainEngine] = nullptr;
  queues_[SdmaEngine] = nullptr;

  // The hostcall buffer for this vqueue is initialized on demand.
  hostcallBuffer_ = nullptr;
}

// ================================================================================================
// Allocates all resources of the virtual GPU: the PAL command allocator, the compute and the
// optional SDMA queues, the kernel argument buffer, the constant buffers, the printf and
// timestamp helpers, the dependency tracker, the dummy HSA queue, the optional device queue
// and the blit manager. Returns false on the first failure.
bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
                        amd::CommandQueue::Priority priority) {
  device::BlitManager::Setup blitSetup;

  // Resize the list of device resources always,
  // because destructor calls eraseResourceList() even if create() failed
  dev().resizeResoureList(index());

  // Virtual GPU will have profiling enabled
  state_.profiling_ = profiling;

  Pal::CmdAllocatorCreateInfo createInfo = {};
  createInfo.flags.threadSafe = true;
  // \todo forces PAL to reuse CBs, but requires postamble
  createInfo.flags.autoMemoryReuse = false;
  createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
  createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize = VirtualGPU::Queue::MaxCommands *
      (320 + ((profiling) ? 96 : 0) + ((dev().captureMgr() != nullptr) ? 512 : 0));
  createInfo.allocInfo[Pal::CommandDataAlloc].allocSize =
      dev().settings().maxCmdBuffers_ * createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize;

  createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
  createInfo.allocInfo[Pal::EmbeddedDataAlloc].allocSize = 256 * Ki;
  createInfo.allocInfo[Pal::EmbeddedDataAlloc].suballocSize = 64 * Ki;

  createInfo.allocInfo[Pal::LargeEmbeddedDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
  createInfo.allocInfo[Pal::LargeEmbeddedDataAlloc].allocSize = 64 * Ki;
  createInfo.allocInfo[Pal::LargeEmbeddedDataAlloc].suballocSize = 32 * Ki;

  createInfo.allocInfo[Pal::GpuScratchMemAlloc].allocHeap = Pal::GpuHeapInvisible;
  createInfo.allocInfo[Pal::GpuScratchMemAlloc].allocSize = 64 * Ki;
  createInfo.allocInfo[Pal::GpuScratchMemAlloc].suballocSize = 4 * Ki;

  Pal::Result result;
  size_t cmdAllocSize = dev().iDev()->GetCmdAllocatorSize(createInfo, &result);
  if (Pal::Result::Success != result) {
    return false;
  }
  char* addr = new char[cmdAllocSize];
  if (Pal::Result::Success != dev().iDev()->CreateCmdAllocator(createInfo, addr, &cmdAllocator_)) {
    // No allocator object was created in the storage, so release it here and
    // make sure nothing refers to it later
    delete[] addr;
    cmdAllocator_ = nullptr;
    return false;
  }

  // The engine count is used as a divisor below, so validate it first
  if (dev().numComputeEngines() == 0) {
    LogError("Runtme couldn't find compute queues!");
    return false;
  }

  uint idx = index() % dev().numComputeEngines();
  uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs
      ? 0
      : (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
  uint max_cmd_buffers = dev().settings().maxCmdBuffers_;

  queues_[MainEngine] = Queue::Create(*this, Pal::QueueTypeCompute, idx, cmdAllocator_, rtCUs,
                                      priority, residency_limit, max_cmd_buffers);
  if (nullptr == queues_[MainEngine]) {
    return false;
  }

  // Pooled queues take the HW ring index from the pool. Queues that aren't in the pool
  // are placed after the GPU_MAX_HW_QUEUES range.
  const auto& info = dev().QueuePool().find(queues_[MainEngine]->iQueue_);
  hwRing_ = (info != dev().QueuePool().end())
      ? info->second->index_
      : (index() % dev().numExclusiveComputeEngines()) + GPU_MAX_HW_QUEUES;

  // Check if device has SDMA engines
  if (dev().numDMAEngines() != 0 && !dev().settings().disableSdma_) {
    uint sdma;
    // If only 1 SDMA engine is available then use that one, otherwise it's a round-robin manner
    if ((dev().numDMAEngines() < 2) || ((idx + 1) & 0x1)) {
      sdma = 0;
    } else {
      sdma = 1;
    }
    queues_[SdmaEngine] = Queue::Create(
        *this, Pal::QueueTypeDma, sdma, cmdAllocator_, amd::CommandQueue::RealTimeDisabled,
        amd::CommandQueue::Priority::Normal, residency_limit, max_cmd_buffers);
    if (nullptr == queues_[SdmaEngine]) {
      return false;
    }
  }

  // Create buffers for kernel arg management
  if (!managedBuffer_.create(dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs
                                 ? Resource::Persistent
                                 : Resource::RemoteUSWC)) {
    // Try just USWC if persistent memory failed
    if (dev().settings().kernel_arg_impl_ == KernelArgImpl::DeviceKernelArgs) {
      if (!managedBuffer_.create(Resource::RemoteUSWC)) {
        return false;
      }
    } else {
      return false;
    }
  }

  // Disable double copy optimization,
  // since UAV read from nonlocal is fast enough
  blitSetup.disableCopyBufferToImageOpt_ = true;

  if (!allocConstantBuffers()) {
    return false;
  }

  // Create HSAILPrintf class
  printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_);
  if (nullptr == printfDbgHSA_) {
    LogError("Could not create PrintfDbgHSA class!");
    return false;
  }

  tsCache_ = new TimeStampCache(*this);
  if (nullptr == tsCache_) {
    LogError("Could not create TimeStamp cache!");
    return false;
  }

  if (!memoryDependency().create(dev().settings().numMemDependencies_)) {
    LogError("Could not create the array of memory objects!");
    return false;
  }

  if (!allocHsaQueueMem()) {
    LogError("Could not create hsaQueueMem object!");
    return false;
  }

  // Check if the app requested a device queue creation
  if (dev().settings().useDeviceQueue_ && (0 != deviceQueueSize) &&
      !createVirtualQueue(deviceQueueSize)) {
    LogError("Could not create a virtual queue!");
    return false;
  }

  // Choose the appropriate class for blit engine
  switch (dev().settings().blitEngine_) {
    default:
    // Fall through ...
    case Settings::BlitEngineHost:
      blitSetup.disableAll();
    // Fall through ...
    case Settings::BlitEngineCAL:
    case Settings::BlitEngineKernel:
      blitMgr_ = new KernelBlitManager(*this, blitSetup);
      break;
  }
  if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
    LogError("Could not create BlitManager!");
    return false;
  }

  // If the developer mode manager is available and it's not a device queue,
  // then enable RGP capturing
  if ((index() != 0) && dev().captureMgr() != nullptr) {
    bool dbg_vmid = false;
    state_.rgpCaptureEnabled_ = true;
    dev().captureMgr()->RegisterTimedQueue(2 * index(), queue(MainEngine).iQueue_, &dbg_vmid);
    // The SDMA queue is created above only when DMA engines exist and SDMA isn't disabled
    if (queues_[SdmaEngine] != nullptr) {
      dev().captureMgr()->RegisterTimedQueue(2 * index() + 1, queue(SdmaEngine).iQueue_,
                                             &dbg_vmid);
    }
  }

  return true;
}

// ================================================================================================
// Allocates the device memory for a dummy HSA queue and uploads the upper parts of the
// private and shared aperture bases. On failure hsaQueueMem_ is released and reset.
bool VirtualGPU::allocHsaQueueMem() {
  // Allocate a dummy HSA queue
  hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t));
  if ((hsaQueueMem_ == nullptr) || (!hsaQueueMem_->create(Resource::Local))) {
    delete hsaQueueMem_;
    // The destructor deletes hsaQueueMem_ again, so don't leave a dangling pointer
    hsaQueueMem_ = nullptr;
    return false;
  }
  amd_queue_t hsa_queue = {};

  // Provide private and local heap addresses
  constexpr uint addressShift = LP64_SWITCH(0, 32);
  hsa_queue.private_segment_aperture_base_hi = static_cast<uint32_t>(
      dev().properties().gpuMemoryProperties.privateApertureBase >> addressShift);
  hsa_queue.group_segment_aperture_base_hi = static_cast<uint32_t>(
      dev().properties().gpuMemoryProperties.sharedApertureBase >> addressShift);

  hsaQueueMem_->writeRawData(*this, 0, sizeof(amd_queue_t), &hsa_queue, true);

  return true;
}

VirtualGPU::~VirtualGPU() {
  // Not safe to remove a queue. So lock the device
  amd::ScopedLock k(dev().lockAsyncOps());
  amd::ScopedLock lock(dev().vgpusAccess());

  if (queues_[MainEngine] != nullptr) {
    // Clear all timestamps, associated with this virtual GPU
    auto& mgmt = *queues_[MainEngine]->aql_mgmt_;
    for (uint32_t i = 0; i < AqlPacketMgmt::kAqlPacketsListSize; ++i) {
      if (mgmt.aql_vgpus_[i] == this) {
        mgmt.aql_vgpus_[i] = nullptr;
        mgmt.aql_events_[i].invalidate();
      }
    }
  }

  // Destroy RGP trace
  if (rgpCaptureEna()) {
    dev().captureMgr()->FinishRGPTrace(this, true);
  }

  while (!freeCbQueue_.empty()) {
    auto cb = freeCbQueue_.front();
    delete cb;
    freeCbQueue_.pop();
  }

  // Destroy printfHSA object
  delete printfDbgHSA_;

  // Destroy TimeStamp cache
  delete tsCache_;

  // Destroy resource list with the constant buffers
  for (uint i = 0; i < constBufs_.size(); ++i) {
    delete constBufs_[i];
  }

  managedBuffer_.release();

  delete vqHeader_;
  delete virtualQueue_;
  delete hsaQueueMem_;

  // Release scratch buffer memory to reduce memory pressure
  //!@note OCLtst uses single device with multiple tests
  //! Release memory only if it's the last command queue.
  //!
The first queue is reserved for the transfers on device if (static_cast(gpuDevice_.numOfVgpus_ - 1) <= 1) { gpuDevice_.destroyScratchBuffers(); } // Destroy BlitManager object delete blitMgr_; { // Destroy queues delete queues_[MainEngine]; delete queues_[SdmaEngine]; if (nullptr != cmdAllocator_) { cmdAllocator_->Destroy(); delete[] reinterpret_cast(cmdAllocator_); } } { // Find all available virtual GPUs and lock them // from the execution of commands, since the queue index and resource list // Will be adjusted for (auto it : dev().vgpus()) { if (it != this) { it->execution().lock(); } } // Not safe to add a resource if create/destroy queue is in process, since // the size of the TS array can change amd::ScopedLock r(dev().lockResources()); gpuDevice_.numOfVgpus_--; gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index()); for (uint idx = index(); idx < dev().vgpus().size(); ++idx) { dev().vgpus()[idx]->index_--; } dev().eraseResoureList(index()); // Find all available virtual GPUs and unlock them // for the execution of commands for (auto it : dev().vgpus()) { it->execution().unlock(); } } if (hostcallBuffer_ != nullptr) { ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for virtual queue %p", hostcallBuffer_, this); amd::disableHostcalls(hostcallBuffer_); dev().svmFree(hostcallBuffer_); } } void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); // Translate memory references and ensure cache up-to-date pal::Memory* memory = dev().getGpuMemory(&vcmd.source()); size_t offset = 0; // Find if virtual address is a CL allocation device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset); profilingBegin(vcmd); memory->syncCacheFromHost(*this); cl_command_type type = vcmd.type(); bool result = false; amd::Memory* bufferFromImage = nullptr; // Force buffer read for IMAGE1D_BUFFER if ((type == CL_COMMAND_READ_IMAGE) 
&& (vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { bufferFromImage = createBufferFromImage(vcmd.source()); if (nullptr == bufferFromImage) { LogError("We should not fail buffer creation from image_buffer!"); } else { type = CL_COMMAND_READ_BUFFER; memory = dev().getGpuMemory(bufferFromImage); } } // Process different write commands switch (type) { case CL_COMMAND_READ_BUFFER: { amd::Coord3D origin(vcmd.origin()[0]); amd::Coord3D size(vcmd.size()[0]); if (nullptr != bufferFromImage) { size_t elemSize = vcmd.source().asImage()->getImageFormat().getElementSize(); origin.c[0] *= elemSize; size.c[0] *= elemSize; } if (hostMemory != nullptr) { // Accelerated transfer without pinning amd::Coord3D dstOrigin(offset); result = blitMgr().copyBuffer(*memory, *hostMemory, origin, dstOrigin, size, vcmd.isEntireMemory(), vcmd.copyMetadata()); } else { // The logic below will perform 2 step copy to make sure memory pinning doesn't // occur on the first unaligned page, because in Windows memory manager can // have CPU access to the allocation header in another thread // and a race condition is possible. 
char* tmpHost = amd::alignUp(reinterpret_cast(vcmd.destination()), PinnedMemoryAlignment); // Find the partial size for unaligned copy size_t partial = tmpHost - reinterpret_cast(vcmd.destination()); result = true; // Check if it's staging copy, then ignore unaligned address if (size[0] <= dev().settings().pinnedMinXferSize_) { partial = size[0]; } // Make first step transfer if (partial > 0) { result = blitMgr().readBuffer(*memory, vcmd.destination(), origin, partial, false, vcmd.copyMetadata()); } // Second step transfer if something left to copy if (partial < size[0]) { result &= blitMgr().readBuffer(*memory, tmpHost, origin[0] + partial, size[0] - partial, false, vcmd.copyMetadata()); } } if (nullptr != bufferFromImage) { bufferFromImage->release(); } } break; case CL_COMMAND_READ_BUFFER_RECT: { amd::BufferRect hostbufferRect; amd::Coord3D region(0); amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset); hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_); if (hostMemory != nullptr) { result = blitMgr().copyBufferRect(*memory, *hostMemory, vcmd.bufRect(), hostbufferRect, vcmd.size(), vcmd.isEntireMemory(), vcmd.copyMetadata()); } else { result = blitMgr().readBufferRect(*memory, vcmd.destination(), vcmd.bufRect(), vcmd.hostRect(), vcmd.size(), vcmd.isEntireMemory(), vcmd.copyMetadata()); } } break; case CL_COMMAND_READ_IMAGE: if (memory->memoryType() == Resource::ImageBuffer) { Image* imageBuffer = static_cast(memory); // Check if synchronization has to be performed if (nullptr != imageBuffer->CopyImageBuffer()) { memory = imageBuffer->CopyImageBuffer(); Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent()); amd::Image* image = imageBuffer->owner()->asImage(); amd::Coord3D offs(0); // Copy memory from the original image buffer into the backing store image result = blitMgr().copyBufferToImage(*buffer, *imageBuffer->CopyImageBuffer(), offs, offs, image->getRegion(), true, 
image->getRowPitch(), image->getSlicePitch(), vcmd.copyMetadata()); } } if (hostMemory != nullptr) { // Accelerated image to buffer transfer without pinning amd::Coord3D dstOrigin(offset); result = blitMgr().copyImageToBuffer(*memory, *hostMemory, vcmd.origin(), dstOrigin, vcmd.size(), vcmd.isEntireMemory(), vcmd.rowPitch(), vcmd.slicePitch(), vcmd.copyMetadata()); } else { result = blitMgr().readImage(*memory, vcmd.destination(), vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory(), vcmd.copyMetadata()); } break; default: LogError("Unsupported type for the read command"); break; } if (!result) { LogError("submitReadMemory failed!"); vcmd.setStatus(CL_INVALID_OPERATION); } profilingEnd(vcmd); } void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); // Translate memory references and ensure cache up to date pal::Memory* memory = dev().getGpuMemory(&vcmd.destination()); size_t offset = 0; // Find if virtual address is a CL allocation device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset); profilingBegin(vcmd); bool entire = vcmd.isEntireMemory(); // Synchronize memory from host if necessary device::Memory::SyncFlags syncFlags; syncFlags.skipEntire_ = entire; memory->syncCacheFromHost(*this, syncFlags); cl_command_type type = vcmd.type(); bool result = false; amd::Memory* bufferFromImage = nullptr; // Force buffer write for IMAGE1D_BUFFER if ((type == CL_COMMAND_WRITE_IMAGE) && (vcmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { bufferFromImage = createBufferFromImage(vcmd.destination()); if (nullptr == bufferFromImage) { LogError("We should not fail buffer creation from image_buffer!"); } else { type = CL_COMMAND_WRITE_BUFFER; memory = dev().getGpuMemory(bufferFromImage); } } // Process different write commands switch (type) { case CL_COMMAND_WRITE_BUFFER: { amd::Coord3D 
origin(vcmd.origin()[0]); amd::Coord3D size(vcmd.size()[0]); if (nullptr != bufferFromImage) { size_t elemSize = vcmd.destination().asImage()->getImageFormat().getElementSize(); origin.c[0] *= elemSize; size.c[0] *= elemSize; } if ((hostMemory != nullptr) && (vcmd.size()[0] > dev().settings().prepinnedMinSize_)) { // Accelerated transfer without pinning amd::Coord3D srcOrigin(offset); result = blitMgr().copyBuffer(*hostMemory, *memory, srcOrigin, origin, size, vcmd.isEntireMemory(), vcmd.copyMetadata()); } else { // The logic below will perform 2 step copy to make sure memory pinning doesn't // occur on the first unaligned page, because in Windows memory manager can // have CPU access to the allocation header in another thread // and a race condition is possible. const char* tmpHost = amd::alignUp(reinterpret_cast(vcmd.source()), PinnedMemoryAlignment); // Find the partial size for unaligned copy size_t partial = tmpHost - reinterpret_cast(vcmd.source()); result = true; // Check if it's staging copy, then ignore unaligned address if (size[0] <= dev().settings().pinnedMinXferSize_) { partial = size[0]; } // Make first step transfer if (partial > 0) { result = blitMgr().writeBuffer(vcmd.source(), *memory, origin, partial, false, vcmd.copyMetadata()); } // Second step transfer if something left to copy if (partial < size[0]) { result &= blitMgr().writeBuffer(tmpHost, *memory, origin[0] + partial, size[0] - partial, false, vcmd.copyMetadata()); } } if (nullptr != bufferFromImage) { bufferFromImage->release(); } } break; case CL_COMMAND_WRITE_BUFFER_RECT: { amd::BufferRect hostbufferRect; amd::Coord3D region(0); amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset); hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_); if (hostMemory != nullptr) { result = blitMgr().copyBufferRect(*hostMemory, *memory, hostbufferRect, vcmd.bufRect(), vcmd.size(), vcmd.isEntireMemory(), vcmd.copyMetadata()); } else { result = 
blitMgr().writeBufferRect(vcmd.source(), *memory, vcmd.hostRect(), vcmd.bufRect(), vcmd.size(), vcmd.isEntireMemory(), vcmd.copyMetadata()); } } break; case CL_COMMAND_WRITE_IMAGE: if (hostMemory != nullptr) { // Accelerated buffer to image transfer without pinning amd::Coord3D srcOrigin(offset); result = blitMgr().copyBufferToImage(*hostMemory, *memory, srcOrigin, vcmd.origin(), vcmd.size(), vcmd.isEntireMemory(), vcmd.rowPitch(), vcmd.slicePitch(), vcmd.copyMetadata()); } else { result = blitMgr().writeImage(vcmd.source(), *memory, vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory(), vcmd.copyMetadata()); } break; default: LogError("Unsupported type for the write command"); break; } if (!result) { LogError("submitWriteMemory failed!"); vcmd.setStatus(CL_INVALID_OPERATION); } else { // Mark this as the most-recently written cache of the destination vcmd.destination().signalWrite(&gpuDevice_); } profilingEnd(vcmd); } bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memory& dstMem, bool entire, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, const amd::BufferRect& srcRect, const amd::BufferRect& dstRect, amd::CopyMetadata copyMetadata) { // Translate memory references and ensure cache up-to-date pal::Memory* dstMemory = dev().getGpuMemory(&dstMem); pal::Memory* srcMemory = dev().getGpuMemory(&srcMem); if (dstMemory == nullptr || srcMemory == nullptr) { LogError("submitcopyMemory Failed!"); return false; } // Synchronize source and destination memory device::Memory::SyncFlags syncFlags; syncFlags.skipEntire_ = entire; dstMemory->syncCacheFromHost(*this, syncFlags); srcMemory->syncCacheFromHost(*this); amd::Memory* bufferFromImageSrc = nullptr; amd::Memory* bufferFromImageDst = nullptr; // Force buffer read for IMAGE1D_BUFFER if (srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { bufferFromImageSrc = createBufferFromImage(srcMem); if (nullptr == bufferFromImageSrc) { 
LogError("We should not fail buffer creation from image_buffer!"); } else { srcMemory = dev().getGpuMemory(bufferFromImageSrc); } } // Force buffer write for IMAGE1D_BUFFER if (dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) { bufferFromImageDst = createBufferFromImage(dstMem); if (nullptr == bufferFromImageDst) { LogError("We should not fail buffer creation from image_buffer!"); } else { dstMemory = dev().getGpuMemory(bufferFromImageDst); } } type = getCopyCommandType(type, srcMem.getType(), dstMem.getType()); bool result = false; // Check if HW can be used for memory copy switch (type) { case CL_COMMAND_MAKE_BUFFERS_RESIDENT_AMD: case CL_COMMAND_SVM_MEMCPY: case CL_COMMAND_COPY_BUFFER: { amd::Coord3D realSrcOrigin(srcOrigin[0]); amd::Coord3D realDstOrigin(dstOrigin[0]); amd::Coord3D realSize(size.c[0], size.c[1], size.c[2]); if (nullptr != bufferFromImageSrc) { const size_t elemSize = srcMem.asImage()->getImageFormat().getElementSize(); realSrcOrigin.c[0] *= elemSize; if (nullptr != bufferFromImageDst) { realDstOrigin.c[0] *= elemSize; } realSize.c[0] *= elemSize; } else if (nullptr != bufferFromImageDst) { const size_t elemSize = dstMem.asImage()->getImageFormat().getElementSize(); realDstOrigin.c[0] *= elemSize; realSize.c[0] *= elemSize; } result = blitMgr().copyBuffer(*srcMemory, *dstMemory, realSrcOrigin, realDstOrigin, realSize, entire, copyMetadata); } break; case CL_COMMAND_COPY_BUFFER_RECT: result = blitMgr().copyBufferRect(*srcMemory, *dstMemory, srcRect, dstRect, size, entire, copyMetadata); break; case CL_COMMAND_COPY_IMAGE_TO_BUFFER: { amd::Coord3D realDstOrigin(dstOrigin); if (nullptr != bufferFromImageDst) { const size_t elemSize = dstMem.asImage()->getImageFormat().getElementSize(); realDstOrigin.c[0] *= elemSize; } result = blitMgr().copyImageToBuffer(*srcMemory, *dstMemory, srcOrigin, realDstOrigin, size, entire, dstRect.rowPitch_, dstRect.slicePitch_, copyMetadata); break; } case CL_COMMAND_COPY_BUFFER_TO_IMAGE: { amd::Coord3D 
realSrcOrigin(srcOrigin); if (nullptr != bufferFromImageSrc) { const size_t elemSize = srcMem.asImage()->getImageFormat().getElementSize(); realSrcOrigin.c[0] *= elemSize; } result = blitMgr().copyBufferToImage(*srcMemory, *dstMemory, realSrcOrigin, dstOrigin, size, entire, srcRect.rowPitch_, srcRect.slicePitch_, copyMetadata); break; } case CL_COMMAND_COPY_IMAGE: result = blitMgr().copyImage(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire, copyMetadata); break; default: LogError("Unsupported command type for memory copy!"); break; } if (nullptr != bufferFromImageSrc) { bufferFromImageSrc->release(); } if (nullptr != bufferFromImageDst) { bufferFromImageDst->release(); } if (!result) { LogError("submitCopyMemory failed!"); return false; } else { // Mark this as the most-recently written cache of the destination dstMem.signalWrite(&gpuDevice_); } return true; } void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); cl_command_type type = vcmd.type(); bool entire = vcmd.isEntireMemory(); if (!copyMemory(type, vcmd.source(), vcmd.destination(), entire, vcmd.srcOrigin(), vcmd.dstOrigin(), vcmd.size(), vcmd.srcRect(), vcmd.dstRect(), vcmd.copyMetadata())) { vcmd.setStatus(CL_INVALID_OPERATION); } profilingEnd(vcmd); } void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); cl_command_type type = vcmd.type(); // no op for FGS supported device if (!dev().isFineGrainedSystem()) { amd::Coord3D srcOrigin(0, 0, 0); amd::Coord3D dstOrigin(0, 0, 0); amd::Coord3D size(vcmd.srcSize(), 1, 1); amd::BufferRect srcRect; amd::BufferRect dstRect; bool result = false; amd::Memory* srcMem = amd::MemObjMap::FindMemObj(vcmd.src()); amd::Memory* dstMem = amd::MemObjMap::FindMemObj(vcmd.dst()); 
device::Memory::SyncFlags syncFlags; if (nullptr != srcMem) { srcMem->commitSvmMemory(); srcOrigin.c[0] = static_cast(vcmd.src()) - static_cast

(srcMem->getSvmPtr()); if (!(srcMem->validateRegion(srcOrigin, size))) { vcmd.setStatus(CL_INVALID_OPERATION); return; } } if (nullptr != dstMem) { dstMem->commitSvmMemory(); dstOrigin.c[0] = static_cast(vcmd.dst()) - static_cast

(dstMem->getSvmPtr()); if (!(dstMem->validateRegion(dstOrigin, size))) { vcmd.setStatus(CL_INVALID_OPERATION); return; } } if (nullptr == srcMem && nullptr == dstMem) { // both not in svm space std::memcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize()); result = true; } else if (nullptr == srcMem && nullptr != dstMem) { // src not in svm space Memory* memory = dev().getGpuMemory(dstMem); // Synchronize source and destination memory syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size); memory->syncCacheFromHost(*this, syncFlags); result = blitMgr().writeBuffer(vcmd.src(), *memory, dstOrigin, size, dstMem->isEntirelyCovered(dstOrigin, size)); // Mark this as the most-recently written cache of the destination dstMem->signalWrite(&gpuDevice_); } else if (nullptr != srcMem && nullptr == dstMem) { // dst not in svm space Memory* memory = dev().getGpuMemory(srcMem); // Synchronize source and destination memory memory->syncCacheFromHost(*this); result = blitMgr().readBuffer(*memory, vcmd.dst(), srcOrigin, size, srcMem->isEntirelyCovered(srcOrigin, size)); } else if (nullptr != srcMem && nullptr != dstMem) { // both in svm space bool entire = srcMem->isEntirelyCovered(srcOrigin, size) && dstMem->isEntirelyCovered(dstOrigin, size); result = copyMemory(type, *srcMem, *dstMem, entire, srcOrigin, dstOrigin, size, srcRect, dstRect); } if (!result) { vcmd.setStatus(CL_INVALID_OPERATION); } } else { // direct memcpy for FGS enabled system amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1); } profilingEnd(vcmd); } void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); pal::Memory* memory = dev().getGpuMemory(&vcmd.memory()); // Save map info for unmap operation memory->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(), vcmd.isEntireMemory()); // If we have host memory, use it if ((memory->owner()->getHostMem() 
!= nullptr) && memory->isDirectMap()) { if (!memory->isHostMemDirectAccess()) { // Make sure GPU finished operation before // synchronization with the backing store memory->wait(*this); } // Target is the backing store, so just ensure that owner is up-to-date memory->owner()->cacheWriteBack(this); // Add memory to VA cache, so rutnime can detect direct access to VA dev().addVACache(memory); } else if (memory->isPersistentMapped()) { // Nothing to do here } else if (memory->mapMemory() != nullptr) { // Target is a remote resource, so copy assert(memory->mapMemory() != nullptr); if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { amd::Coord3D dstOrigin(0, 0, 0); if (memory->desc().buffer_) { if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(), vcmd.size(), vcmd.isEntireMemory())) { LogError("submitMapMemory() - copy failed"); vcmd.setStatus(CL_MAP_FAILURE); } } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { Memory* memoryBuf = memory; amd::Coord3D origin(vcmd.origin()[0]); amd::Coord3D size(vcmd.size()[0]); size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize(); origin.c[0] *= elemSize; size.c[0] *= elemSize; amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory()); if (nullptr == bufferFromImage) { LogError("We should not fail buffer creation from image_buffer!"); } else { memoryBuf = dev().getGpuMemory(bufferFromImage); } if (!blitMgr().copyBuffer(*memoryBuf, *memory->mapMemory(), origin, dstOrigin, size, vcmd.isEntireMemory())) { LogError("submitMapMemory() - copy failed"); vcmd.setStatus(CL_MAP_FAILURE); } if (nullptr != bufferFromImage) { bufferFromImage->release(); } } else { // Validate if it's a view for a map of mip level if (vcmd.memory().parent() != nullptr) { amd::Image* amdImage = vcmd.memory().parent()->asImage(); if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1)) { // Save map write info in the parent object 
dev().getGpuMemory(amdImage)->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(), vcmd.isEntireMemory(), vcmd.memory().asImage()); } } if (!blitMgr().copyImageToBuffer(*memory, *memory->mapMemory(), vcmd.origin(), dstOrigin, vcmd.size(), vcmd.isEntireMemory())) { LogError("submitMapMemory() - copy failed"); vcmd.setStatus(CL_MAP_FAILURE); } } } } else { LogError("Unhandled map!"); } profilingEnd(vcmd); } void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) { bool unmapMip = false; amd::Image* amdImage; { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); pal::Memory* memory = dev().getGpuMemory(&vcmd.memory()); amd::Memory* owner = memory->owner(); const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); if (nullptr == writeMapInfo) { LogError("Unmap without map call"); return; } profilingBegin(vcmd); // Check if image is a mipmap and assign a saved view amdImage = owner->asImage(); if ((amdImage != nullptr) && (amdImage->getMipLevels() > 1) && (writeMapInfo->baseMip_ != nullptr)) { // Assign mip level view amdImage = writeMapInfo->baseMip_; // Clear unmap flags from the parent image memory->clearUnmapInfo(vcmd.mapPtr()); memory = dev().getGpuMemory(amdImage); unmapMip = true; writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); } // We used host memory if ((owner->getHostMem() != nullptr) && memory->isDirectMap()) { if (writeMapInfo->isUnmapWrite()) { // Target is the backing store, so sync owner->signalWrite(nullptr); memory->syncCacheFromHost(*this); } // Remove memory from VA cache dev().removeVACache(memory); } // data check was added for persistent memory that failed to get aperture // and therefore are treated like a remote resource else if (memory->isPersistentMapped()) { // Map/unmap must be serialized amd::ScopedLock lock(owner->lockMemoryOps()); memory->unmap(this); if (memory->getMapCount() == 0) { memory->setPersistentMapFlag(false); } } else 
if (memory->mapMemory() != nullptr) { if (writeMapInfo->isUnmapWrite()) { amd::Coord3D srcOrigin(0, 0, 0); // Target is a remote resource, so copy assert(memory->mapMemory() != nullptr); if (memory->desc().buffer_) { if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_, writeMapInfo->origin_, writeMapInfo->region_, writeMapInfo->isEntire())) { LogError("submitUnmapMemory() - copy failed"); vcmd.setStatus(CL_OUT_OF_RESOURCES); } } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { Memory* memoryBuf = memory; amd::Coord3D origin(writeMapInfo->origin_[0]); amd::Coord3D size(writeMapInfo->region_[0]); size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize(); origin.c[0] *= elemSize; size.c[0] *= elemSize; amd::Memory* bufferFromImage = createBufferFromImage(vcmd.memory()); if (nullptr == bufferFromImage) { LogError("We should not fail buffer creation from image_buffer!"); } else { memoryBuf = dev().getGpuMemory(bufferFromImage); } if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size, writeMapInfo->isEntire())) { LogError("submitUnmapMemory() - copy failed"); vcmd.setStatus(CL_OUT_OF_RESOURCES); } if (nullptr != bufferFromImage) { bufferFromImage->release(); } } else { if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin, writeMapInfo->origin_, writeMapInfo->region_, writeMapInfo->isEntire())) { LogError("submitUnmapMemory() - copy failed"); vcmd.setStatus(CL_OUT_OF_RESOURCES); } } } } else { LogError("Unhandled unmap!"); vcmd.setStatus(CL_INVALID_VALUE); } // Clear unmap flags memory->clearUnmapInfo(vcmd.mapPtr()); profilingEnd(vcmd); } // Release a view for a mipmap map if (unmapMip) { // Memory release should be outside of the execution lock, // because mapMemory_ isn't marked for a specifc GPU amdImage->release(); } } bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern, size_t patternSize, const amd::Coord3D& 
origin, const amd::Coord3D& size, bool forceBlit) { pal::Memory* memory = dev().getGpuMemory(amdMemory); bool entire = amdMemory->isEntirelyCovered(origin, size); // Synchronize memory from host if necessary device::Memory::SyncFlags syncFlags; syncFlags.skipEntire_ = entire; memory->syncCacheFromHost(*this, syncFlags); bool result = false; amd::Memory* bufferFromImage = nullptr; float fillValue[4]; // Force fill buffer for IMAGE1D_BUFFER if ((type == CL_COMMAND_FILL_IMAGE) && (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { bufferFromImage = createBufferFromImage(*amdMemory); if (nullptr == bufferFromImage) { LogError("We should not fail buffer creation from image_buffer!"); } else { type = CL_COMMAND_FILL_BUFFER; memory = dev().getGpuMemory(bufferFromImage); } } // Find the the right fill operation switch (type) { case CL_COMMAND_FILL_BUFFER: case CL_COMMAND_SVM_MEMFILL: { amd::Coord3D realOrigin(origin[0]); amd::Coord3D realSize(size[0]); // Reprogram fill parameters if it's an IMAGE1D_BUFFER object if (nullptr != bufferFromImage) { size_t elemSize = amdMemory->asImage()->getImageFormat().getElementSize(); realOrigin.c[0] *= elemSize; realSize.c[0] *= elemSize; memset(fillValue, 0, sizeof(fillValue)); amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue); pattern = fillValue; patternSize = elemSize; } result = blitMgr().fillBuffer(*memory, pattern, patternSize, realSize, realOrigin, realSize, amdMemory->isEntirelyCovered(origin, size), forceBlit); if (nullptr != bufferFromImage) { bufferFromImage->release(); } } break; case CL_COMMAND_FILL_IMAGE: result = blitMgr().fillImage(*memory, pattern, origin, size, amdMemory->isEntirelyCovered(origin, size)); break; default: LogError("Unsupported command type for FillMemory!"); break; } if (!result) { LogError("fillMemory failed!"); return false; } // Mark this as the most-recently written cache of the destination amdMemory->signalWrite(&gpuDevice_); return true; } void 
VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(cmd); if (cmd.type() == CL_COMMAND_FILL_IMAGE) { if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), cmd.origin(), cmd.size())) { cmd.setStatus(CL_INVALID_OPERATION); } } else { size_t width = cmd.size().c[0]; size_t height = cmd.size().c[1]; size_t depth = cmd.size().c[2]; size_t pitch = cmd.surface().c[0]; amd::Coord3D origin = cmd.origin(); amd::Coord3D region{cmd.surface().c[1], cmd.surface().c[2], depth}; amd::BufferRect rect; rect.create(static_cast(origin), static_cast(region), pitch, 0); bool force_blit = false; if (amd::IS_HIP) { constexpr uint32_t kManagedAlloc = (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR); // In case of HMM, use blit kernel instead of CPU memcpy if ((cmd.memory().getMemFlags() & kManagedAlloc) == kManagedAlloc) { force_blit = true; } } for (size_t slice = 0; slice < depth; slice++) { for (size_t row = 0; row < height; row++) { const size_t rowOffset = rect.offset(0, row, slice); if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), amd::Coord3D{rowOffset, 0, 0}, amd::Coord3D{width, 1, 1}, force_blit)) { cmd.setStatus(CL_INVALID_OPERATION); } } } } profilingEnd(cmd); } void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(cmd); // Get the device memory objects for the current device Memory* srcDevMem = dev().getGpuMemory(&cmd.source()); Memory* dstDevMem = dev().getGpuMemory(&cmd.destination()); bool p2pAllowed = true; // If any device object is null, then no HW P2P and runtime has to use staging if (srcDevMem == nullptr) { srcDevMem = static_cast( cmd.source().getDeviceMemory(*cmd.source().getContext().devices()[0])); p2pAllowed = false; } else if (dstDevMem == nullptr) { 
dstDevMem = static_cast( cmd.destination().getDeviceMemory(*cmd.destination().getContext().devices()[0])); p2pAllowed = false; } // Synchronize source and destination memory device::Memory::SyncFlags syncFlags; syncFlags.skipEntire_ = cmd.isEntireMemory(); amd::Coord3D size = cmd.size(); bool result = false; switch (cmd.type()) { case CL_COMMAND_COPY_BUFFER: { amd::Coord3D srcOrigin(cmd.srcOrigin()[0]); amd::Coord3D dstOrigin(cmd.dstOrigin()[0]); if (p2pAllowed) { result = blitMgr().copyBuffer(*srcDevMem, *dstDevMem, srcOrigin, dstOrigin, size, cmd.isEntireMemory()); } else { amd::ScopedLock lock(dev().P2PStageOps()); Memory* dstStgMem = static_cast( dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0])); Memory* srcStgMem = static_cast( dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0])); size_t copy_size = Device::kP2PStagingSize; size_t left_size = size[0]; amd::Coord3D stageOffset(0); result = true; do { if (left_size <= copy_size) { copy_size = left_size; } left_size -= copy_size; amd::Coord3D cpSize(copy_size); // Perform 2 step transfer with staging buffer result &= srcDevMem->dev().xferMgr().copyBuffer(*srcDevMem, *dstStgMem, srcOrigin, stageOffset, cpSize); srcOrigin.c[0] += copy_size; result &= dstDevMem->dev().xferMgr().copyBuffer(*srcStgMem, *dstDevMem, stageOffset, dstOrigin, cpSize); dstOrigin.c[0] += copy_size; } while (left_size > 0); } break; } case CL_COMMAND_COPY_BUFFER_RECT: { if (p2pAllowed) { result = blitMgr().copyBufferRect(*srcDevMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(), size, cmd.isEntireMemory(), cmd.copyMetadata()); } else { amd::ScopedLock lock(dev().P2PStageOps()); Memory* dstStgMem = static_cast( dev().P2PStage()->getDeviceMemory(*cmd.source().getContext().devices()[0])); Memory* srcStgMem = static_cast( dev().P2PStage()->getDeviceMemory(*cmd.destination().getContext().devices()[0])); if ((cmd.srcRect().slicePitch_ * size[2]) <= Device::kP2PStagingSize) { result = true; // Perform 
2 step transfer with staging buffer result &= srcDevMem->dev().xferMgr().copyBufferRect(*srcDevMem, *dstStgMem, cmd.srcRect(), cmd.srcRect(), size, false, cmd.copyMetadata()); result &= dstDevMem->dev().xferMgr().copyBufferRect(*srcStgMem, *dstDevMem, cmd.srcRect(), cmd.dstRect(), size, false, cmd.copyMetadata()); } else { size_t srcOffset; size_t dstOffset; result = true; for (size_t z = 0; z < size[2]; ++z) { for (size_t y = 0; y < size[1]; ++y) { srcOffset = cmd.srcRect().offset(0, y, z); dstOffset = cmd.dstRect().offset(0, y, z); amd::Coord3D srcOrigin(srcOffset); amd::Coord3D dstOrigin(dstOffset); size_t copy_size = Device::kP2PStagingSize; size_t left_size = size[0]; amd::Coord3D stageOffset(0); do { if (left_size <= copy_size) { copy_size = left_size; } left_size -= copy_size; // Perform 2 step transfer with staging buffer result &= srcDevMem->partialMemCopyTo(*(srcDevMem->dev().xferQueue()), srcOrigin, stageOffset, copy_size, *dstStgMem); srcDevMem->dev().xferQueue()->waitAllEngines(); result &= srcStgMem->partialMemCopyTo(*(dstDevMem->dev().xferQueue()), stageOffset, dstOrigin, copy_size, *dstDevMem); srcStgMem->dev().xferQueue()->waitAllEngines(); srcOrigin.c[0] += copy_size; dstOrigin.c[0] += copy_size; } while (left_size > 0); } } } } break; } case CL_COMMAND_COPY_IMAGE: case CL_COMMAND_COPY_IMAGE_TO_BUFFER: case CL_COMMAND_COPY_BUFFER_TO_IMAGE: LogError("Unsupported P2P type!"); break; default: ShouldNotReachHere(); break; } if (!result) { LogError("submitCopyMemoryP2P failed!"); cmd.setStatus(CL_OUT_OF_RESOURCES); } cmd.destination().signalWrite(&dstDevMem->dev()); profilingEnd(cmd); } void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); // no op for FGS supported device if (!dev().isFineGrainedSystem()) { // Make sure we have memory for the command execution pal::Memory* memory = 
dev().getGpuMemory(vcmd.getSvmMem()); memory->saveMapInfo(vcmd.svmPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(), vcmd.isEntireMemory()); if (memory->mapMemory() != nullptr) { if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { assert(memory->desc().buffer_ && "SVM memory can't be an image"); if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(), vcmd.size(), vcmd.isEntireMemory())) { LogError("submitSVMMapMemory() - copy failed"); vcmd.setStatus(CL_MAP_FAILURE); } } } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { if (!memory->isHostMemDirectAccess()) { // Make sure GPU finished operation before // synchronization with the backing store memory->wait(*this); } // Target is the backing store, so just ensure that owner is up-to-date memory->owner()->cacheWriteBack(this); } else { LogError("Unhandled svm map!"); } } profilingEnd(vcmd); } void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); // no op for FGS supported device if (!dev().isFineGrainedSystem()) { pal::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.svmPtr()); if (memory->mapMemory() != nullptr) { if (writeMapInfo->isUnmapWrite()) { amd::Coord3D srcOrigin(0, 0, 0); // Target is a remote resource, so copy assert(memory->desc().buffer_ && "SVM memory can't be an image"); if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_, writeMapInfo->origin_, writeMapInfo->region_, writeMapInfo->isEntire())) { LogError("submitSvmUnmapMemory() - copy failed"); vcmd.setStatus(CL_OUT_OF_RESOURCES); } } } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { if (writeMapInfo->isUnmapWrite()) { // Target is the backing store, so sync memory->owner()->signalWrite(nullptr); 
memory->syncCacheFromHost(*this); } } memory->clearUnmapInfo(vcmd.svmPtr()); } profilingEnd(vcmd); } void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); if (!dev().isFineGrainedSystem()) { size_t patternSize = vcmd.patternSize(); size_t fillSize = patternSize * vcmd.times(); amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(vcmd.dst()); assert(dstMemory && "No svm Buffer to fill with!"); size_t offset = reinterpret_cast(vcmd.dst()) - reinterpret_cast(dstMemory->getSvmPtr()); pal::Memory* memory = dev().getGpuMemory(dstMemory); amd::Coord3D origin(offset, 0, 0); amd::Coord3D size(fillSize, 1, 1); assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!"); if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(), vcmd.patternSize(), origin, size)) { vcmd.setStatus(CL_INVALID_OPERATION); } } else { // for FGS capable device, fill CPU memory directly amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times()); } profilingEnd(vcmd); } void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); for (const auto& it : vcmd.memObjects()) { // Find device memory pal::Memory* memory = dev().getGpuMemory(it); if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) { memory->mgpuCacheWriteBack(*this); } else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { // Synchronize memory from host if necessary. 
// The sync function will perform memory migration from // another device if necessary device::Memory::SyncFlags syncFlags; memory->syncCacheFromHost(*this, syncFlags); } else { LogWarning("Unknown operation for memory migration!"); } } profilingEnd(vcmd); } void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) { // in-order semantics: previous commands need to be done before we start // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); std::vector& svmPointers = vcmd.svmPointers(); if (vcmd.pfnFreeFunc() == nullptr) { // pointers allocated using clSVMAlloc for (uint32_t i = 0; i < svmPointers.size(); ++i) { dev().svmFree(svmPointers[i]); } } else { vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), svmPointers.size(), static_cast(&(svmPointers[0])), vcmd.userData()); } profilingEnd(vcmd); } void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(cmd); const cl_command_type type = cmd.type(); const uint64_t value = cmd.value(); const uint64_t mask = cmd.mask(); const unsigned int flags = cmd.flags(); const size_t sizeBytes = cmd.sizeBytes(); const size_t offset = cmd.offset(); amd::Memory* amdMemory = &cmd.memory(); Memory* memory = dev().getGpuMemory(amdMemory); if (type == ROCCLR_COMMAND_STREAM_WAIT_VALUE) { // Use a blit kernel to perform the wait operation // mask is applied on value before performing // the comparision defined by 'condition' bool result = static_cast(blitMgr()).streamOpsWait(*memory, value, offset, sizeBytes, flags, mask); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Waiting for value: 0x%lx." 
" Flags: 0x%lx mask: 0x%lx", value, flags, mask); if (!result) { LogError("submitStreamOperation: Wait failed!"); } } else if (type == ROCCLR_COMMAND_STREAM_WRITE_VALUE) { bool result = static_cast(blitMgr()).streamOpsWrite(*memory, value, offset, sizeBytes); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value); if (!result) { LogError("submitStreamOperation: Write failed!"); } } else { ShouldNotReachHere(); } profilingEnd(cmd); } // ================================================================================================ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); amd::Memory* phys_mem_obj = vcmd.memory(); amd::Memory* vaddr_base_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr()); if (vaddr_base_obj == nullptr || !(vaddr_base_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { profilingEnd(vcmd); return; } // Create a view, since original base obj will map the whole memory and multimap cases wont work. amd::Memory* vaddr_sub_obj = nullptr; size_t vaddr_offset = 0; if (phys_mem_obj != nullptr) { constexpr bool kParent = false; vaddr_sub_obj = phys_mem_obj->getContext().devices()[0]->CreateVirtualBuffer( phys_mem_obj->getContext(), const_cast(vcmd.ptr()), vcmd.size(), phys_mem_obj->getUserData().deviceId, phys_mem_obj->getUserData().locationType, kParent); // Calculate the offset from the original pointer. vaddr_offset = (reinterpret_cast

(vaddr_sub_obj->getSvmPtr()) - reinterpret_cast

(vaddr_base_obj->getSvmPtr())); } // The imem() in the backend is shared between base and sub/view object. pal::Memory* vaddr_pal_mem = dev().getGpuMemory(vaddr_base_obj); Pal::IGpuMemory* phymem_igpu_mem = (phys_mem_obj == nullptr) ? nullptr : dev().getGpuMemory(phys_mem_obj)->iMem(); Pal::VirtualMemoryRemapRange range{vaddr_pal_mem->iMem(), vaddr_offset, phymem_igpu_mem, 0, vcmd.size(), Pal::VirtualGpuMemAccessMode::NoAccess}; // Wait for previous operations before unmap if (phys_mem_obj == nullptr) { // @note: Need to verify if compute requires a wait or IB flush is enough WaitForIdleCompute(); WaitForIdleSdma(); } eventBegin(MainEngine); auto result = queue(MainEngine).iQueue_->RemapVirtualMemoryPages(1, &range, false, nullptr); // Capture GPU event for the paging operation GpuEvent event; eventEnd(MainEngine, event); setGpuEvent(event); if (result == Pal::Result::Success) { if (phys_mem_obj != nullptr) { // assert the vaddr_mem_obj wasn't mapped already assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr); amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_sub_obj); vaddr_sub_obj->getUserData().phys_mem_obj = phys_mem_obj; phys_mem_obj->getUserData().vaddr_mem_obj = vaddr_sub_obj; } else { // assert the vaddr_mem_obj is mapped and needs to be removed amd::Memory* vaddr_sub_obj = amd::MemObjMap::FindMemObj(vcmd.ptr()); assert(vaddr_sub_obj != nullptr); assert(vcmd.ptr() == vaddr_sub_obj->getSvmPtr()); amd::MemObjMap::RemoveMemObj(vcmd.ptr()); if (vaddr_sub_obj->getUserData().phys_mem_obj != nullptr) { vaddr_sub_obj->getUserData().phys_mem_obj->getUserData().vaddr_mem_obj = nullptr; vaddr_sub_obj->getUserData().phys_mem_obj = nullptr; } } } profilingEnd(vcmd); } // ================================================================================================ void VirtualGPU::PrintChildren(const pal::Kernel& hsaKernel, VirtualGPU* gpuDefQueue) { AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]); uint p = 0; for (uint i 
= 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) { if (wraps[i].state != 0) { uint j; if (p == GPU_PRINT_CHILD_KERNEL) { break; } p++; std::stringstream print; print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase); print << "Slot#: " << i << "\n"; print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n"; print << "\tcommand_id: " << wraps[i].command_id << "\n"; print << "\tchild_counter: " << wraps[i].child_counter << "\n"; print << "\tcompletion: " << wraps[i].completion << "\n"; print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n"; print << "\twait_list: " << wraps[i].wait_list << "\n"; print << "\twait_num: " << wraps[i].wait_num << "\n"; uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress(); size_t* events = reinterpret_cast(gpuDefQueue->virtualQueue_->data() + offsEvents); for (j = 0; j < wraps[i].wait_num; ++j) { uint offs = static_cast(events[j]) - gpuDefQueue->virtualQueue_->vmAddress(); AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs); print << "Wait Event#: " << j << "\n"; print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n"; } print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", "; print << wraps[i].aql.workgroup_size_y << ", "; print << wraps[i].aql.workgroup_size_z << "]\n"; print << "GridSize[ " << wraps[i].aql.grid_size_x << ", "; print << wraps[i].aql.grid_size_y << ", "; print << wraps[i].aql.grid_size_z << "]\n"; pal::Kernel* child = nullptr; for (auto it = hsaKernel.prog().kernels().begin(); it != hsaKernel.prog().kernels().end(); ++it) { if (wraps[i].aql.kernel_object == static_cast(it->second)->gpuAqlCode()) { child = static_cast(it->second); } } if (child == nullptr) { printf("Error: couldn't find child kernel!\n"); continue; } const uint64_t kernarg_address = static_cast(reinterpret_cast(wraps[i].aql.kernarg_address)); uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress(); address argum = 
gpuDefQueue->virtualQueue_->data() + offsArg; print << "Kernel: " << child->name() << "\n"; const amd::KernelSignature& signature = child->signature(); // Check if runtime has to setup hidden arguments for (const auto it : signature.parameters()) { const char* extraArgName = nullptr; switch (it.info_.oclObject_) { case amd::KernelParameterDescriptor::HiddenNone: // void* zero = 0; // WriteAqlArgAt(const_cast

(parameters), zero, it.size_, it.offset_); break; case amd::KernelParameterDescriptor::HiddenGlobalOffsetX: extraArgName = "Offset0: "; break; case amd::KernelParameterDescriptor::HiddenGlobalOffsetY: extraArgName = "Offset1: "; break; case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ: extraArgName = "Offset2: "; break; case amd::KernelParameterDescriptor::HiddenPrintfBuffer: extraArgName = "PrintfBuf: "; break; case amd::KernelParameterDescriptor::HiddenDefaultQueue: extraArgName = "VqueuePtr: "; break; case amd::KernelParameterDescriptor::HiddenCompletionAction: extraArgName = "AqlWrap: "; break; default: break; } if (extraArgName) { print << "\t" << extraArgName << *reinterpret_cast(argum); print << "\n"; argum += sizeof(size_t); continue; } print << "\t" << it.name_ << ": "; for (int s = it.size_ - 1; s >= 0; --s) { print.width(2); print.fill('0'); print << static_cast(argum[s]); } argum += it.offset_; print << "\n"; } printf("%s", print.str().c_str()); } } } // ================================================================================================ bool VirtualGPU::PreDeviceEnqueue(const amd::Kernel& kernel, const pal::Kernel& hsaKernel, VirtualGPU** gpuDefQueue, uint64_t* vmDefQueue) { amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev()); if (nullptr == defQueue) { LogError("Default device queue wasn't allocated"); return false; } else { if (dev().settings().useDeviceQueue_) { *gpuDefQueue = static_cast(defQueue->vDev()); if ((*gpuDefQueue)->hwRing() == hwRing()) { LogError("Can't submit the child kernels to the same HW ring as the host queue!"); return false; } } else { createVirtualQueue(defQueue->size()); *gpuDefQueue = this; } } *vmDefQueue = (*gpuDefQueue)->virtualQueue_->vmAddress(); (*gpuDefQueue)->writeVQueueHeader(*this, hsaKernel.prog().kernelTable()); // Acquire USWC memory for the scheduler parameters (*gpuDefQueue)->schedParams_ = &xferWrite().Acquire(sizeof(SchedulerParam)); // Add memory handles before 
the actual dispatch addVmMemory((*gpuDefQueue)->virtualQueue_); addVmMemory((*gpuDefQueue)->schedParams_); return true; } // ================================================================================================ void VirtualGPU::PostDeviceEnqueue(const amd::Kernel& kernel, const pal::Kernel& hsaKernel, VirtualGPU* gpuDefQueue, uint64_t vmDefQueue, uint64_t vmParentWrap, GpuEvent* gpuEvent) { uint32_t id = gpuEvent->id_; amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev()); // Make sure exculsive access to the device queue amd::ScopedLock(defQueue->lock()); Memory& schedParams = xferWrite().Acquire(sizeof(SchedulerParam)); if (GPU_PRINT_CHILD_KERNEL != 0) { waitForEvent(gpuEvent); PrintChildren(hsaKernel, gpuDefQueue); } if (!dev().settings().useDeviceQueue_) { // Add the termination handshake to the host queue eventBegin(MainEngine); iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0, dev().settings().useDeviceQueue_); eventEnd(MainEngine, *gpuEvent); } // Get the global loop start before the scheduler Pal::gpusize loopStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); static_cast(gpuDefQueue->blitMgr()) .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, 0, gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); gpuDefQueue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue, BarrierType::FlushL2); // Get the address of PM4 template and add write it to params //! 
@note DMA flush must not occur between patch and the scheduler Pal::gpusize patchStart = gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherStart(); // Program parameters for the scheduler SchedulerParam* param = reinterpret_cast(gpuDefQueue->schedParams_->data()); param->signal = 1; // Scale clock to 1024 to avoid 64 bit div in the scheduler param->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; param->hw_queue = patchStart + sizeof(uint32_t) /* Rewind packet*/; param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); param->releaseHostCP = 0; param->parentAQL = vmParentWrap; param->dedicatedQueue = dev().settings().useDeviceQueue_; // Fill the scratch buffer information if (hsaKernel.prog().maxScratchRegs() > 0) { pal::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_; param->scratchSize = scratchBuf->size(); param->scratch = scratchBuf->vmAddress(); param->numMaxWaves = 32 * dev().info().maxComputeUnits_; param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_; addVmMemory(scratchBuf); } else { param->numMaxWaves = 0; param->scratchSize = 0; param->scratch = 0; param->scratchOffset = 0; } // Add all kernels in the program to the mem list. //! \note Runtime doesn't know which one will be called hsaKernel.prog().fillResListWithKernels(*this); Pal::gpusize signalAddr = gpuDefQueue->schedParams_->vmAddress(); gpuDefQueue->eventBegin(MainEngine); gpuDefQueue->iCmd()->CmdVirtualQueueDispatcherEnd( signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); // Note: Device enqueue can't have extra commands after INDIRECT_BUFFER call. // Thus TS command for profiling has to follow in the next CB. 
constexpr bool ForceSubmitFirst = true; gpuDefQueue->eventEnd(MainEngine, *gpuEvent, ForceSubmitFirst); if (dev().settings().useDeviceQueue_) { // Add the termination handshake to the host queue eventBegin(MainEngine); iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, vmParentWrap + offsetof(AmdAqlWrap, child_counter), signalAddr, dev().settings().useDeviceQueue_); if (id != gpuEvent->id_) { LogError("Something is wrong. ID mismatch!\n"); } eventEnd(MainEngine, *gpuEvent); } xferWrite().Release(*gpuDefQueue->schedParams_); gpuDefQueue->schedParams_ = nullptr; } // ================================================================================================ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { if (vcmd.cooperativeGroups()) { uint32_t workgroups = 1; for (uint i = 0; i < vcmd.sizes().dimensions(); i++) { if (vcmd.sizes().local()[i] != 0) { workgroups *= (vcmd.sizes().global()[i] / vcmd.sizes().local()[i]); } } bool test = true; VirtualGPU* queue = (test) ? this : dev().xferQueue(); // Wait for the execution on the current queue, since the coop groups will use the device queue waitAllEngines(); amd::ScopedLock lock(queue->blitMgr().lockXfer()); queue->profilingBegin(vcmd); static_cast(queue->blitMgr()).RunGwsInit(workgroups); queue->addBarrier(RgpSqqtBarrierReason::PostDeviceEnqueue); // Submit kernel to HW if (!queue->submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, vcmd.sharedMemBytes())) { vcmd.setStatus(CL_INVALID_OPERATION); } queue->profilingEnd(vcmd); // Wait for the execution on the device queue. 
Keep the current queue in-order queue->waitAllEngines(); } else { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); // Submit kernel to HW if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, vcmd.sharedMemBytes(), vcmd.getAnyOrderLaunchFlag())) { vcmd.setStatus(CL_INVALID_OPERATION); } profilingEnd(vcmd); } } // ================================================================================================ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, bool nativeMem, uint32_t sharedMemBytes, bool anyOrder) { state_.anyOrder_ = anyOrder; // Get the HSA kernel object const pal::Kernel& hsaKernel = static_cast(*(kernel.getDeviceKernel(dev()))); // If RGP capturing is enabled, then start SQTT trace if (rgpCaptureEna()) { size_t newLocalSize[3] = {1, 1, 1}; size_t newGlobalSize[3] = {0, 0, 0}; for (uint i = 0; i < sizes.dimensions(); i++) { newGlobalSize[i] = sizes.global()[i]; if (sizes.local()[i] != 0) { newLocalSize[i] = sizes.local()[i]; } } dev().captureMgr()->PreDispatch( this, hsaKernel, // Report global size in workgroups, since that's the RGP trace semantics newGlobalSize[0] / newLocalSize[0], newGlobalSize[1] / newLocalSize[1], newGlobalSize[2] / newLocalSize[2]); } bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? 
true : false; if (printfEnabled && !printfDbgHSA().init(*this, printfEnabled)) { LogError("Printf debug buffer initialization failed!"); return false; } uint64_t vmDefQueue = 0; VirtualGPU* gpuDefQueue = nullptr; if (hsaKernel.dynamicParallelism()) { // Initialize GPU device queue for execution (gpuDefQueue) if (!PreDeviceEnqueue(kernel, hsaKernel, &gpuDefQueue, &vmDefQueue)) { return false; } } size_t ldsSize; ClPrint(amd::LOG_INFO, amd::LOG_KERN, "!\tkernel : %s\n", hsaKernel.name().c_str()); if (PAL_EMBED_KERNEL_MD) { char buf[256]; sprintf(buf, "kernel: %s\n private mem size: %x\n group mem size: %x\n", hsaKernel.name().c_str(), hsaKernel.spillSegSize(), hsaKernel.ldsSize()); iCmd()->CmdCommentString(buf); } bool imageBufferWrtBack = false; // Image buffer write back is required std::vector wrtBackImageBuffer; // Array of images for write back // Check memory dependency and SVM objects if (!processMemObjectsHSA(kernel, parameters, nativeMem, ldsSize, imageBufferWrtBack, wrtBackImageBuffer)) { LogError("Wrong memory objects!"); return false; } // Add ISA memory object to the resource tracking list AddKernel(kernel); GpuEvent gpuEvent(queues_[MainEngine]->cmdBufId()); uint32_t id = gpuEvent.id_; uint64_t vmParentWrap = 0; uint32_t aql_index = 0; // Program the kernel arguments for the GPU execution hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(*this, kernel, sizes, parameters, ldsSize + sharedMemBytes, vmDefQueue, &vmParentWrap, &aql_index); assert((nullptr != aqlPkt) && "Couldn't load kernel arguments"); // Dynamic call stack size is considered to calculate private segment size and scratch regs // in pal::Kernel::postLoad(). As it is not called during hipModuleLaunchKernel unlike // hipLaunchKernel/hipLaunchKernelGGL, Updated value is passed to dispatch packet. 
size_t privateMemSize = hsaKernel.spillSegSize(); if ((hsaKernel.workGroupInfo()->usedStackSize_ & 0x1) == 0x1) { privateMemSize = std::max(static_cast(device().StackSize()), hsaKernel.workGroupInfo()->scratchRegs_ * sizeof(uint32_t)); // Validate privateMemSize is more than max allowed. size_t maxStackSize = device().MaxStackSize(); if (privateMemSize > maxStackSize) { ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Scratch size (%zu) exceeds max allowed (%zu) for kernel : %s", privateMemSize, maxStackSize, hsaKernel.name().c_str()); LogError("Scratch size exceeds max allowed."); return false; } } // Set up the dispatch information Pal::DispatchAqlParams dispatchParam = {}; dispatchParam.pAqlPacket = aqlPkt; if (privateMemSize > 0) { const Device::ScratchBuffer* scratch = dev().scratch(hwRing()); dispatchParam.scratchAddr = scratch->memObj_->vmAddress(); dispatchParam.scratchSize = scratch->size_; dispatchParam.scratchOffset = scratch->offset_; dispatchParam.workitemPrivateSegmentSize = privateMemSize; } dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlKd(); dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress(); dispatchParam.wavesPerSh = 0; dispatchParam.useAtc = dev().settings().svmFineGrainSystem_ ? true : false; dispatchParam.kernargSegmentSize = hsaKernel.argsBufferSize(); dispatchParam.aqlPacketIndex = aql_index; // Run AQL dispatch in HW eventBegin(MainEngine); iCmd()->CmdDispatchAql(dispatchParam); if (id != gpuEvent.id_) { LogError("Something is wrong. 
ID mismatch!\n"); } eventEnd(MainEngine, gpuEvent); AqlPacketUpdateTs(aql_index, gpuEvent); // Execute scheduler for device enqueue if (hsaKernel.dynamicParallelism()) { PostDeviceEnqueue(kernel, hsaKernel, gpuDefQueue, vmDefQueue, vmParentWrap, &gpuEvent); } // Update the global GPU event constexpr bool kNeedFLush = false; setGpuEvent(gpuEvent, kNeedFLush); if (printfEnabled && !printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) { LogError("Couldn't read printf data from the buffer!\n"); return false; } // Check if image buffer write back is required if (imageBufferWrtBack) { // Make sure the original kernel execution is done addBarrier(RgpSqqtBarrierReason::MemDependency); for (const auto imageBuffer : wrtBackImageBuffer) { Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent()); amd::Image* image = imageBuffer->owner()->asImage(); amd::Coord3D offs(0); // Copy memory from the the backing store image into original buffer bool result = blitMgr().copyImageToBuffer(*imageBuffer->CopyImageBuffer(), *buffer, offs, offs, image->getRegion(), true, image->getRowPitch(), image->getSlicePitch()); } } // Perform post dispatch logic for RGP traces if (rgpCaptureEna()) { dev().captureMgr()->PostDispatch(this); } return true; } // ================================================================================================ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); Unimplemented(); //!< @todo: Unimplemented } // ================================================================================================ void VirtualGPU::submitMarker(amd::Marker& vcmd) { //!@note runtime doesn't need to lock this command on execution if (vcmd.waitingEvent() != nullptr) { bool foundEvent = false; // Loop through all outstanding command batches while (!cbQueue_.empty()) { auto cb = cbQueue_.front(); // Wait for completion foundEvent = 
awaitCompletion(cb, vcmd.waitingEvent()); // Release a command batch freeCbQueue_.push(cb); // Remove command batch from the list cbQueue_.pop(); // Early exit if we found a command if (foundEvent) break; } // Event should be in the current command batch if (!foundEvent) { state_.forceWait_ = true; } } else if (amd::IS_HIP) { // Use GPU based timing for HIP events // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); GpuEvent event; profilingBegin(vcmd); eventBegin(MainEngine); eventEnd(MainEngine, event); setGpuEvent(event); profilingEnd(vcmd); } } // ================================================================================================ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {} // ================================================================================================ void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) { const Pal::IQueueSemaphore* sem = reinterpret_cast(cmd.sem_ptr()); if (cmd.semaphoreCmd() == amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE) { flushDMA(MainEngine); if (Pal::Result::Success != queues_[MainEngine]->iQueue_->SignalQueueSemaphore( const_cast(sem), cmd.fence())) { LogError("Failed to signal external semaphore"); } } else { if (Pal::Result::Success != queues_[MainEngine]->iQueue_->WaitQueueSemaphore( const_cast(sem), cmd.fence())) { LogError("Failed to wait on external semaphore"); } } } void VirtualGPU::releaseMemory(GpuMemoryReference* mem) { queues_[MainEngine]->removeCmdMemRef(mem); if (!dev().settings().disableSdma_) { queues_[SdmaEngine]->removeCmdMemRef(mem); } } void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); const amd::PerfCounterCommand::PerfCounterList counters = vcmd.getCounters(); PalCounterReference* palRef = PalCounterReference::Create(*this); if (palRef == nullptr) { LogError("We 
failed to allocate memory for the GPU perfcounter"); vcmd.setStatus(CL_INVALID_OPERATION); return; } bool newExperiment = false; for (uint i = 0; i < vcmd.getNumCounters(); ++i) { amd::PerfCounter* amdCounter = static_cast(counters[i]); const PerfCounter* counter = static_cast(amdCounter->getDeviceCounter()); // Make sure we have a valid gpu performance counter if (nullptr == counter) { amd::PerfCounter::Properties prop = amdCounter->properties(); PerfCounter* gpuCounter = new PerfCounter( gpuDevice_, palRef, prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX], prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX], prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]); if (nullptr == gpuCounter) { LogError("We failed to allocate memory for the GPU perfcounter"); vcmd.setStatus(CL_INVALID_OPERATION); return; } else if (gpuCounter->create()) { newExperiment = true; } else { LogPrintfError( "We failed to allocate a perfcounter in PAL.\ Block: %d, counter: #d, event: %d", gpuCounter->info()->blockIndex_, gpuCounter->info()->counterIndex_, gpuCounter->info()->eventIndex_); } amdCounter->setDeviceCounter(gpuCounter); } } if (newExperiment) { palRef->finalize(); } palRef->release(); Pal::IPerfExperiment* palPerf = nullptr; for (uint i = 0; i < vcmd.getNumCounters(); ++i) { amd::PerfCounter* amdCounter = static_cast(counters[i]); const PerfCounter* counter = static_cast(amdCounter->getDeviceCounter()); if (palPerf != counter->iPerf()) { palPerf = counter->iPerf(); // Find the state and sends the command to PAL if (vcmd.getState() == amd::PerfCounterCommand::Begin) { state_.perfCounterEnabled_ = true; GpuEvent event; eventBegin(MainEngine); iCmd()->CmdBeginPerfExperiment(palPerf); eventEnd(MainEngine, event); setGpuEvent(event); } else if (vcmd.getState() == amd::PerfCounterCommand::End) { GpuEvent event; eventBegin(MainEngine); iCmd()->CmdEndPerfExperiment(palPerf); eventEnd(MainEngine, event); setGpuEvent(event); state_.perfCounterEnabled_ = false; } else { LogError("Unsupported performance counter state"); 
vcmd.setStatus(CL_INVALID_OPERATION); return; } } } } void VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(cmd); switch (cmd.type()) { case CL_COMMAND_THREAD_TRACE_MEM: { amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace(); ThreadTrace* threadTrace = static_cast(amdThreadTrace->getDeviceThreadTrace()); if (threadTrace == nullptr) { PalThreadTraceReference* palRef = PalThreadTraceReference::Create(*this); if (palRef == nullptr) { LogError("Failure in memory allocation for the GPU threadtrace"); cmd.setStatus(CL_INVALID_OPERATION); return; } size_t numSe = amdThreadTrace->deviceSeNumThreadTrace(); ThreadTrace* gpuThreadTrace = new ThreadTrace(gpuDevice_, palRef, cmd.getMemList(), numSe); if (nullptr == gpuThreadTrace) { LogError("Failure in memory allocation for the GPU threadtrace"); cmd.setStatus(CL_INVALID_OPERATION); return; } if (gpuThreadTrace->create()) { amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace); } else { LogError("Failure in memory allocation for the GPU threadtrace"); delete gpuThreadTrace; cmd.setStatus(CL_INVALID_OPERATION); return; } palRef->finalize(); palRef->release(); } break; } default: LogError("Unsupported command type for ThreadTraceMemObjects!"); break; } profilingEnd(cmd); } void VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(cmd); switch (cmd.type()) { case CL_COMMAND_THREAD_TRACE: { amd::ThreadTrace* amdThreadTrace = static_cast(&cmd.getThreadTrace()); ThreadTrace* threadTrace = static_cast(amdThreadTrace->getDeviceThreadTrace()); // gpu thread trace object had to be generated prior to begin/end/pause/resume due // to ThreadTraceMemObjectsCommand execution if (threadTrace == nullptr) { return; } else { Pal::IPerfExperiment* palPerf = 
threadTrace->iPerf(); if (cmd.getState() == amd::ThreadTraceCommand::Begin) { amd::ThreadTrace::ThreadTraceConfig* traceCfg = static_cast(cmd.threadTraceConfig()); iCmd()->CmdBeginPerfExperiment(palPerf); } else if (cmd.getState() == amd::ThreadTraceCommand::End) { GpuEvent event; eventBegin(MainEngine); iCmd()->CmdEndPerfExperiment(palPerf); threadTrace->populateUserMemory(); eventEnd(MainEngine, event); setGpuEvent(event); } else if (cmd.getState() == amd::ThreadTraceCommand::Pause) { // There's no Pause from the PerfExperiment interface } else if (cmd.getState() == amd::ThreadTraceCommand::Resume) { // There's no Resume from the PerfExperiment interface } } break; } default: LogError("Unsupported command type for ThreadTrace!"); break; } profilingEnd(cmd); } void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); for (const auto& it : vcmd.getMemList()) { // amd::Memory object should never be nullptr assert(it && "Memory object for interop is nullptr"); pal::Memory* memory = dev().getGpuMemory(it); // If resource is a shared copy of original resource, then // runtime needs to copy data from original resource it->getInteropObj()->copyOrigToShared(); } profilingEnd(vcmd); } void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); for (const auto& it : vcmd.getMemList()) { // amd::Memory object should never be nullptr assert(it && "Memory object for interop is nullptr"); pal::Memory* memory = dev().getGpuMemory(it); // If resource is a shared copy of original resource, then // runtime needs to copy data back to original resource it->getInteropObj()->copySharedToOrig(); } profilingEnd(vcmd); } void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) { amd::ScopedLock 
lock(execution()); profilingBegin(vcmd); pal::Memory* pGpuMemory = dev().getGpuMemory(&vcmd.memory()); GpuEvent gpuEvent; uint32_t value = vcmd.markerValue(); if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) { eventBegin(MainEngine); addVmMemory(pGpuMemory); iCmd()->CmdWaitBusAddressableMemoryMarker(*(pGpuMemory->iMem()), value, 0xFFFFFFFF, Pal::CompareFunc::GreaterEqual); eventEnd(MainEngine, gpuEvent); } else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) { EngineType activeEngineID = engineID_; engineID_ = static_cast(pGpuMemory->getGpuEvent(*this)->engineId_); // Make sure GPU finished operation and data reached memory before the marker write addBarrier(RgpSqqtBarrierReason::SignalSubmit, BarrierType::FlushL2); // Workarounds: We had systems where an extra delay was necessary. { // Flush CB associated with the DGMA buffer isDone(pGpuMemory->getGpuEvent(*this)); } eventBegin(engineID_); queues_[engineID_]->addCmdMemRef(pGpuMemory->memRef()); queues_[engineID_]->iCmd()-> #if (PAL_CLIENT_INTERFACE_MAJOR_VERSION < 396) CmdUpdateBusAddressableMemoryMarker(*(pGpuMemory->iMem()), value); #else CmdUpdateBusAddressableMemoryMarker(*(pGpuMemory->iMem()), vcmd.markerOffset(), value); #endif eventEnd(engineID_, gpuEvent); // Restore the original engine engineID_ = activeEngineID; } // Update the global GPU event setGpuEvent(gpuEvent); profilingEnd(vcmd); } void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd) { amd::ScopedLock lock(execution()); profilingBegin(vcmd); std::vector memObjects = vcmd.memObjects(); uint32_t numObjects = memObjects.size(); Pal::GpuMemoryRef* pGpuMemRef = new Pal::GpuMemoryRef[numObjects]; Pal::IGpuMemory** pGpuMems = new Pal::IGpuMemory*[numObjects]; for (uint i = 0; i < numObjects; i++) { pal::Memory* pGpuMemory = dev().getGpuMemory(memObjects[i]); pGpuMemory->syncCacheFromHost(*this); pGpuMemRef[i].pGpuMemory = pGpuMemory->iMem(); pGpuMems[i] = pGpuMemory->iMem(); } dev().iDev()->AddGpuMemoryReferences(numObjects, 
pGpuMemRef, queues_[MainEngine]->iQueue_, Pal::GpuMemoryRefCantTrim); { amd::ScopedLock l(queues_[MainEngine]->lock_); dev().iDev()->InitBusAddressableGpuMemory(queues_[MainEngine]->iQueue_, numObjects, pGpuMems); } if (numObjects != 0) { dev().iDev()->RemoveGpuMemoryReferences(numObjects, &pGpuMems[0], queues_[MainEngine]->iQueue_); } for (uint i = 0; i < numObjects; i++) { vcmd.busAddress()[i].surface_bus_address = pGpuMems[i]->Desc().surfaceBusAddr; vcmd.busAddress()[i].marker_bus_address = pGpuMems[i]->Desc().markerBusAddr; } profilingEnd(vcmd); } bool VirtualGPU::awaitCompletion(CommandBatch* cb, const amd::Event* waitingEvent) { bool found = false; amd::Command* current; amd::Command* head = cb->head_; // Make sure that profiling is enabled if (state_.profileEnabled_) { return profilingCollectResults(cb, waitingEvent); } // Mark the first command in the batch as running if (head != nullptr) { head->setStatus(CL_RUNNING); } else { return found; } // Wait for the last known GPU event waitEventLock(cb); while (nullptr != head) { current = head->getNext(); if (head->status() == CL_SUBMITTED) { head->setStatus(CL_RUNNING); head->setStatus(CL_COMPLETE); } else if (head->status() == CL_RUNNING) { head->setStatus(CL_COMPLETE); } else if ((head->status() != CL_COMPLETE) && (current != nullptr)) { LogPrintfError("Unexpected command status - %d!", head->status()); } // Check if it's a waiting command if (head == waitingEvent) { found = true; } head->release(); head = current; } return found; } void VirtualGPU::flush(amd::Command* list, bool wait) { CommandBatch* cb = nullptr; bool gpuCommand = false; for (uint i = 0; i < AllEngines; ++i) { if (events_[i].isValid()) { gpuCommand = true; } } // If the batch doesn't have any GPU command and the list is empty if (!gpuCommand && cbQueue_.empty()) { state_.forceWait_ = true; } // Insert the current batch into a list if (nullptr != list) { if (!freeCbQueue_.empty()) { cb = freeCbQueue_.front(); } if (nullptr == cb) { cb = new 
CommandBatch(list, events_, lastTS_);  // operand of `cb = new` on the previous line
    } else {
      // Reuse the batch object that was taken from the front of the free queue
      freeCbQueue_.pop();
      cb->init(list, events_, lastTS_);
    }
  }

  {
    //! @todo: Check if really need a lock
    amd::ScopedLock lock(execution());
    for (uint i = 0; i < AllEngines; ++i) {
      flushDMA(i);
      // Reset event so we won't try to wait again,
      // if runtime didn't submit any commands
      //! @note: it's safe to invalidate events, since
      //! we already saved them with the batch creation step above
      events_[i].invalidate();
    }
  }

  // Mark last TS as nullptr, so runtime won't process empty batches with the old TS
  lastTS_ = nullptr;
  if (nullptr != cb) {
    cbQueue_.push(cb);
  }

  // A wait forced earlier in this function overrides the caller's request
  wait |= state_.forceWait_;
  // Loop through all outstanding command batches
  while (!cbQueue_.empty()) {
    cb = cbQueue_.front();
    // Check if command batch finished without a wait
    bool finished = true;
    for (uint i = 0; i < AllEngines; ++i) {
      finished &= isDone(&cb->events_[i]);
    }
    if (finished || wait) {
      // Wait for completion
      awaitCompletion(cb);
      // Release a command batch
      freeCbQueue_.push(cb);
      // Remove command batch from the list
      cbQueue_.pop();
    } else {
      // Early exit if not finished
      break;
    }
  }
  state_.forceWait_ = false;
}

// ================================================================================================
//! Forwards to the blit manager's enableSynchronization().
void VirtualGPU::enableSyncedBlit() const { return blitMgr_->enableSynchronization(); }

// ================================================================================================
//! Records \p gpuEvent as the latest event of the currently active engine.
//!
//! \param gpuEvent  Event to store for engineID_
//! \param flush     If true, flushDMA() is called for the active engine afterwards
void VirtualGPU::setGpuEvent(GpuEvent gpuEvent, bool flush) {
  events_[engineID_] = gpuEvent;

  // Flush current DMA buffer if requested
  if (flush) {
    flushDMA(engineID_);
  }
}

// ================================================================================================
//! Flushes the given engine by calling isDone() on its latest event.
//!
//! \param engineID  Index of the engine to flush
void VirtualGPU::flushDMA(uint engineID) {
  if (engineID == MainEngine) {
    // Clear memory dependency state, since runtime flushes compute
    // memoryDependency().clear();
    //!@todo Keep memory dependency alive even if we flush DMA,
    //! since only L2 cache is flushed in KMD frame,
    //! but L1 still has to be invalidated.
  }

  isDone(&events_[engineID]);
}

// ================================================================================================
//! Waits until all engines have finished the events of a command batch.
//!
//! \param cb  Command batch to wait for; nullptr selects the current events_ of this queue
//! \return true if every engine was already done when first checked (no real wait was needed)
bool VirtualGPU::waitAllEngines(CommandBatch* cb) {
  uint i;
  GpuEvent* events;  //!< GPU events for the batch

  // If command batch is nullptr then wait for the current
  if (nullptr == cb) {
    events = events_;
  } else {
    events = cb->events_;
  }

  bool earlyDone = true;
  // The first loop is to flush all engines and/or check if
  // engines are idle already
  for (i = 0; i < AllEngines; ++i) {
    earlyDone &= isDone(&events[i]);
  }

  // Release all pinned memory
  releasePinnedMem();

  // The second loop is to wait all engines
  for (i = 0; i < AllEngines; ++i) {
    waitForEvent(&events[i]);
  }

  return earlyDone;
}

// ================================================================================================
//! Waits for a command batch under the execution lock, trims the resource cache and, when
//! required, recalibrates the GPU-to-CPU timestamp adjustment (readjustTimeGPU_).
//!
//! \param cb  Command batch to wait for.
//!            NOTE(review): cb->lastTS_ is dereferenced unconditionally below, although
//!            waitAllEngines() accepts nullptr - callers must pass a valid batch.
void VirtualGPU::waitEventLock(CommandBatch* cb) {
  bool earlyDone = false;
  {
    // Make sure VirtualGPU has an exclusive access to the resources
    amd::ScopedLock lock(execution());
    earlyDone = waitAllEngines(cb);
  }

  // Get timestamp, in case readjustTimeGPU_ needs to be updated
  uint64_t endTimeStampCPU = amd::Os::timeNanos();

  // Free resource cache if we have too many entries
  //! \note we do it here, when all engines are idle,
  // because Vista/Win7 idles GPU on a resource destruction
  static const size_t MinCacheEntries = 4096;
  dev().resourceCache().free(MinCacheEntries);

  // Find the timestamp object of the last command in the batch
  if (cb->lastTS_ != nullptr) {
    // If earlyDone is TRUE, then CPU didn't wait for GPU.
    // Thus the sync point between CPU and GPU is unclear and runtime
    // will use an older adjustment value to maintain the same timeline
    if (!earlyDone ||
        //! \note Workaround for APU(s).
        //! GPU-CPU timelines may go off too much, thus always
        //! force calibration with the last batch in the list
        (cbQueue_.size() <= 1) || (readjustTimeGPU_ == 0)) {
      uint64_t startTimeStampGPU = 0;
      uint64_t endTimeStampGPU = 0;

      // Get the timestamp value of the last command in the batch
      cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU);
      // Adjust the base time by the execution time
      readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU;
    }
  }
}

// ================================================================================================
//! Creates MaxConstBuffersArguments constant buffers and registers each with addConstBuffer().
//!
//! \return true on success; false as soon as one buffer cannot be created
bool VirtualGPU::allocConstantBuffers() {
  // Allocate constant buffers.
  // Use double size, reported to the app to account for internal arguments
  const uint32_t MinCbSize = 2 * dev().info().maxParameterSize_;
  uint i;

  // Create/reallocate constant buffer resources
  for (i = 0; i < MaxConstBuffersArguments; ++i) {
    ConstantBuffer* constBuf = new ConstantBuffer(managedBuffer_, MinCbSize);

    if ((constBuf != nullptr) && constBuf->Create()) {
      addConstBuffer(constBuf);
    } else {
      // We failed to create a constant buffer
      delete constBuf;
      return false;
    }
  }

  return true;
}

// ================================================================================================
//! Starts profiling of a command: if profiling is enabled for it, a TimeStamp object is taken
//! from the cache, appended to command.data() and remembered in profileTs_.
//!
//! \param command  Command that is about to be submitted
void VirtualGPU::profilingBegin(amd::Command& command) {
  // Is profiling enabled?
  if (command.profilingInfo().enabled_) {
    // Allocate a timestamp object from the cache
    TimeStamp* ts = tsCache_->allocTimeStamp();
    if (nullptr == ts) {
      return;
    }
    // Save the TimeStamp object in the current OCL event
    command.data().emplace_back(ts);
    profileTs_ = ts;
    state_.profileEnabled_ = true;
  }
}

// ================================================================================================
//! Finishes profiling of a command: a valid TimeStamp becomes lastTS_, an invalid one is
//! returned to the cache and removed from the command.
//!
//! \param command  Command whose submission has just ended
void VirtualGPU::profilingEnd(amd::Command& command) {
  // Get the TimeStamp object associated with the current command
  TimeStamp* ts = !command.data().empty() ?
reinterpret_cast(command.data().back()) : nullptr;
  // NOTE(review): the template argument of the reinterpret_cast above (and of the two casts in
  // profilingCollectResults below) is missing in this copy of the file; presumably it is
  // TimeStamp*, matching the declared type of `ts` - restore before building.
  if (ts != nullptr) {
    // Check if the command actually did any GPU submission
    if (ts->isValid()) {
      lastTS_ = ts;
    } else {
      // Destroy the TimeStamp object
      tsCache_->freeTimeStamp(ts);
      command.data().clear();
    }
  }
}

// ================================================================================================
//! Waits for a command batch and completes its commands with profiling timestamps.
//!
//! The GPU timestamps of every command are shifted by readjustTimeGPU_ and passed to
//! setStatus(). Commands without a TimeStamp object reuse the end time of the previous command
//! (or the start of the first valid command, if they precede it). Every command is released.
//!
//! \param cb            Command batch to process
//! \param waitingEvent  Event the caller is interested in
//! \return true if \p waitingEvent was one of the commands in the batch
bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent) {
  bool found = false;
  amd::Command* current;
  amd::Command* first = cb->head_;

  // If the command list is empty, then exit
  if (nullptr == first) {
    return found;
  }

  // Wait for the last known GPU events on all engines
  waitEventLock(cb);

  // Find the CPU base time of the entire command batch execution
  uint64_t endTimeStamp = amd::Os::timeNanos();
  uint64_t startTimeStamp = endTimeStamp;

  // First step, walk the command list to find the first valid command
  //! \note The batch may have empty markers at the beginning.
  //! So the start/end of the empty commands is equal to
  //! the start of the first valid command in the batch.
  first = cb->head_;
  while (nullptr != first) {
    // Get the TimeStamp object associated with the current command
    TimeStamp* ts = !first->data().empty() ? reinterpret_cast(first->data().back()) : nullptr;

    if (ts != nullptr) {
      ts->value(&startTimeStamp, &endTimeStamp);
      endTimeStamp -= readjustTimeGPU_;
      startTimeStamp -= readjustTimeGPU_;
      // Assign to endTimeStamp the start of the first valid command
      endTimeStamp = startTimeStamp;
      break;
    }
    first = first->getNext();
  }

  // Second step, walk the command list to construct the time line
  first = cb->head_;
  while (nullptr != first) {
    // Get the TimeStamp object associated with the current command
    TimeStamp* ts = !first->data().empty() ?
        reinterpret_cast(first->data().back()) : nullptr;

    // Fetch the successor first, because the command is released at the end of the iteration
    current = first->getNext();

    if (ts != nullptr) {
      ts->value(&startTimeStamp, &endTimeStamp);
      endTimeStamp -= readjustTimeGPU_;
      startTimeStamp -= readjustTimeGPU_;
      // Destroy the TimeStamp object
      tsCache_->freeTimeStamp(ts);
      first->data().clear();
    } else {
      // For empty commands start/end is equal to
      // the end of the last valid command
      startTimeStamp = endTimeStamp;
    }

    // Update the command status with the proper timestamps
    if (first->status() == CL_SUBMITTED) {
      first->setStatus(CL_RUNNING, startTimeStamp);
      first->setStatus(CL_COMPLETE, endTimeStamp);
    } else if (first->status() == CL_RUNNING) {
      first->setStatus(CL_COMPLETE, endTimeStamp);
    } else if ((first->status() != CL_COMPLETE) && (current != nullptr)) {
      LogPrintfError("Unexpected command status - %d!", first->status());
    }

    // Do we wait this event?
    if (first == waitingEvent) {
      found = true;
    }

    first->release();
    first = current;
  }

  return found;
}

// ================================================================================================
//! Adds a DOPP reference for \p memory to the main engine's queue.
//!
//! \param memory       Memory object whose PAL memory is referenced
//! \param lastDoppCmd  Forwarded unchanged to Queue::addCmdDoppRef()
//! \param pfpaDoppCmd  Forwarded unchanged to Queue::addCmdDoppRef()
void VirtualGPU::addDoppRef(const Memory* memory, bool lastDoppCmd, bool pfpaDoppCmd) {
  queues_[MainEngine]->addCmdDoppRef(memory->iMem(), lastDoppCmd, pfpaDoppCmd);
}

// ================================================================================================
//! Marks the begin or the end of the current profiling timestamp; no-op without profileTs_.
//!
//! \param engine  Not used by this implementation
//! \param type    true calls begin() on the timestamp, false calls end()
void VirtualGPU::profileEvent(EngineType engine, bool type) const {
  if (nullptr == profileTs_) {
    return;
  }
  if (type) {
    profileTs_->begin();
  } else {
    profileTs_->end();
  }
}

// ================================================================================================
//! Validates and patches all memory related kernel arguments before an HSA kernel dispatch.
//!
//! \param kernel              Kernel that is going to be dispatched
//! \param params              Kernel argument buffer; patched in place
//! \param nativeMem           If true, the memory array in \p params holds device memory objects
//! \param ldsAddress          [out] LDS size consumed by the kernel and its local arguments
//! \param imageBufferWrtBack  [out] Set to true when an image buffer needs a write back
//! \param wrtBackImageBuffer  [out] Receives the image buffers that need a write back
//! \return true on success, false if an argument can't be processed
//!
//! NOTE(review): the element type of the `std::vector&` parameter is missing in this copy of
//! the file - restore it from the class declaration before building.
bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params,
                                      bool nativeMem, size_t& ldsAddress,
                                      bool& imageBufferWrtBack,
                                      std::vector& wrtBackImageBuffer) {
  const amd::KernelParameters& kernelParams = kernel.parameters();

  // Mark the tracker with a new kernel,
  // so we can avoid checks of the aliased objects
  memoryDependency().newKernel();

  // SVM pointers that are passed outside of the regular argument list
  size_t count = kernelParams.getNumberOfSvmPtr();
  if (count > 0) {
    bool supportFineGrainedSystem = dev().isFineGrainedSystem(true);
    FGSStatus status = kernelParams.getSvmSystemPointersSupport();
    switch (status) {
      case FGS_YES:
        // The kernel requires fine-grained system support; fail if the device lacks it
        if (!supportFineGrainedSystem) {
          return false;
        }
        break;
      case FGS_NO:
        supportFineGrainedSystem = false;
break;
      case FGS_DEFAULT:
      default:
        break;
    }
    // NOTE(review): the template arguments of the casts in the rest of this function
    // (reinterpret_cast/static_cast/const_cast) are missing in this copy of the file;
    // restore them from the declared types before building.

    // get svm non argument information
    void* const* svmPtrArray =
        reinterpret_cast(params + kernelParams.getExecInfoOffset());
    for (size_t i = 0; i < count; i++) {
      amd::Memory* memory = amd::MemObjMap::FindMemObj(svmPtrArray[i]);
      if (nullptr == memory) {
        // Unknown pointer: only acceptable as a fine-grained system pointer
        if (!supportFineGrainedSystem) {
          return false;
        } else {
          addBarrier(RgpSqqtBarrierReason::MemDependency);
          // Clear memory dependency state
          const static bool All = true;
          memoryDependency().clear(!All);
          continue;
        }
      } else {
        // Validate Mem Access in case of VMM Memory
        if (!memory->ValidateMemAccess(dev(), true)) {
          return false;
        }
        Memory* gpuMemory = dev().getGpuMemory(memory);
        if (nullptr != gpuMemory) {
          // Synchronize data with other memory instances if necessary
          gpuMemory->syncCacheFromHost(*this);

          const static bool IsReadOnly = false;
          // Validate SVM passed in the non argument list
          memoryDependency().validate(*this, gpuMemory, IsReadOnly);

          // Wait for resource if it was used on an inactive engine
          //! \note syncCache may call DRM transfer
          constexpr bool WaitOnBusyEngine = true;
          gpuMemory->wait(*this, WaitOnBusyEngine);

          // Mark signal write for cache coherency,
          // since this object isn't a part of kernel arg setup
          if ((memory->getMemFlags() & CL_MEM_READ_ONLY) == 0) {
            memory->signalWrite(&dev());
          }

          addVmMemory(gpuMemory);
        } else {
          return false;
        }
      }
    }
  }

  // Set when an argument needs the SRD resource list (see fillResourceList() below)
  bool srdResource = false;
  amd::Memory* const* memories =
      reinterpret_cast(params + kernelParams.memoryObjOffset());
  const pal::Kernel& hsaKernel =
      static_cast(*(kernel.getDeviceKernel(dev())));
  const amd::KernelSignature& signature = kernel.signature();
  // Start with the kernel's own LDS usage; local arguments are appended below
  ldsAddress = hsaKernel.ldsSize();

  if (!nativeMem) {
    // Process cache coherency first, since the extra transfers may affect
    // other mem dependency tracking logic: TS and signalWrite()
    for (uint i = 0; i < signature.numMemories(); ++i) {
      amd::Memory* mem = memories[i];
      if (mem != nullptr) {
        // Synchronize data with other memory instances if necessary
        // NOTE(review): the result of getGpuMemory() is dereferenced without a null check
        dev().getGpuMemory(mem)->syncCacheFromHost(*this);
      }
    }
  }

  // Check all parameters for the current kernel
  for (size_t i = 0; i < signature.numParameters(); ++i) {
    const amd::KernelParameterDescriptor& desc = signature.at(i);
    const amd::KernelParameterDescriptor::InfoData& info = desc.info_;

    // Find if current argument is a buffer
    if (desc.type_ == T_POINTER) {
      // If it is a local pointer
      if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
        // For local pointers info_.arrayIndex_ is used as the alignment of the LDS offset
        ldsAddress = amd::alignUp(ldsAddress, desc.info_.arrayIndex_);
        if (desc.size_ == 8) {
          // Save the original LDS size
          uint64_t ldsSize = *reinterpret_cast(params + desc.offset_);
          // Patch the LDS address in the original arguments with an LDS address(offset)
          WriteAqlArgAt(const_cast(params), ldsAddress, desc.size_, desc.offset_);
          // Add the original size
          ldsAddress += ldsSize;
        } else {
          // Save the original LDS size
          uint32_t ldsSize = *reinterpret_cast(params + desc.offset_);
          // Patch the LDS address in the original arguments with an LDS address(offset)
          uint32_t ldsAddr = ldsAddress;
          WriteAqlArgAt(const_cast(params), ldsAddr, desc.size_, desc.offset_);
          // Add the original size
          ldsAddress += ldsSize;
        }
      } else {
        Memory* gpuMem = nullptr;
        amd::Memory* mem = nullptr;
        uint32_t index = info.arrayIndex_;
        if (nativeMem) {
          // The array already holds device memory objects
          gpuMem = reinterpret_cast(memories)[index];
          if (nullptr != gpuMem) {
            mem = gpuMem->owner();
          }
        } else {
          mem = memories[index];
          if (mem != nullptr) {
            gpuMem = dev().getGpuMemory(mem);
          }
        }
        //! This condition is for SVM fine-grain
        if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) {
          addBarrier(RgpSqqtBarrierReason::MemDependency);
          // Clear memory dependency state
          const static bool All = true;
          memoryDependency().clear(!All);
          continue;
        } else if (gpuMem != nullptr) {
          // Validate memory for a dependency in the queue
          memoryDependency().validate(*this, gpuMem, info.readOnly_);
          // Wait for resource if it was used on an inactive engine
          //! \note syncCache may call DRM transfer
          constexpr bool WaitOnBusyEngine = true;
          gpuMem->wait(*this, WaitOnBusyEngine);

          addVmMemory(gpuMem);
          // NOTE(review): globalAddress is not used anywhere in this function
          const void* globalAddress = *reinterpret_cast(params + desc.offset_);
          logVmMemory(desc.name_, gpuMem);

          //! Check if compiler expects read/write.
          //! Note: SVM with subbuffers has an issue with tracking.
          //! Conformance can send read only subbuffer, but update the region
          //! in the kernel.
          if ((mem != nullptr) &&
              ((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) ||
               ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) {
            mem->signalWrite(&dev());
          }
          if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
            if (gpuMem->memoryType() == Resource::ImageBuffer) {
              Image* imageBuffer = static_cast(gpuMem);
              // Check if synchronization has to be performed
              if (imageBuffer->CopyImageBuffer() != nullptr) {
                Memory* buffer = dev().getGpuMemory(mem->parent());
                amd::Image* image = mem->asImage();
                amd::Coord3D offs(0);
                // Copy memory from the original image buffer into the backing store image
                // NOTE(review): the result of the copy is stored but never checked
                bool result = blitMgr().copyBufferToImage(
                    *buffer, *imageBuffer->CopyImageBuffer(), offs, offs, image->getRegion(), true,
                    image->getRowPitch(), image->getSlicePitch());
                // Make sure the copy operation is done
                addBarrier(RgpSqqtBarrierReason::MemDependency);
                // Use backing store SRD as the replacement
                uint64_t srd = imageBuffer->CopyImageBuffer()->hwSrd();
                WriteAqlArgAt(const_cast(params), srd, sizeof(srd), desc.offset_);
                // Add backing store image to the list of memory handles
                addVmMemory(imageBuffer->CopyImageBuffer());
                // If it's not a read only resource, then runtime has to write back
                if (!info.readOnly_) {
                  wrtBackImageBuffer.push_back(imageBuffer);
                  imageBufferWrtBack = true;
                }
              }
            }
            //! \note Special case for the image views.
            //! Copy SRD to CB1, so blit manager will be able to release
            //! this view without a wait for SRD resource.
            if (gpuMem->memoryType() == Resource::ImageView) {
              // Copy the current image SRD into CB1
              uint64_t srd = cb(1)->UploadDataToHw(gpuMem->hwState(), HsaImageObjectSize);
              // Then use a pointer in aqlArgBuffer to CB1
              // Patch the GPU VA address in the original arguments
              WriteAqlArgAt(const_cast(params), srd, sizeof(srd), desc.offset_);
              addVmMemory(cb(1)->ActiveMemory());
            } else {
              srdResource = true;
            }
            if (gpuMem->desc().isDoppTexture_) {
              addDoppRef(gpuMem, kernel.parameters().getExecNewVcop(),
                         kernel.parameters().getExecPfpaVcop());
            }
          }
        }
      }
    } else if (desc.type_ == T_VOID) {
      if (desc.info_.oclObject_ == amd::KernelParameterDescriptor::ReferenceObject) {
        // Copy the current structure into CB1
        size_t gpuPtr =
            static_cast(cb(1)->UploadDataToHw(params + desc.offset_, desc.size_));
        // Then use a pointer in aqlArgBuffer to CB1
        // NOTE(review): `it` is dereferenced below without comparing it against patch().end()
        const auto it = hsaKernel.patch().find(desc.offset_);
        // Patch the GPU VA address in the original arguments
        WriteAqlArgAt(const_cast(params), gpuPtr, sizeof(size_t), it->second);
        addVmMemory(cb(1)->ActiveMemory());
      }
    } else if (desc.type_ == T_SAMPLER) {
      srdResource = true;
    } else if (desc.type_ == T_QUEUE) {
      uint32_t index = desc.info_.arrayIndex_;
      const amd::DeviceQueue* queue =
          reinterpret_cast(params + kernelParams.queueObjOffset())[index];
      VirtualGPU* gpuQueue = static_cast(queue->vDev());
      uint64_t vmQueue;
      if (dev().settings().useDeviceQueue_) {
        vmQueue = gpuQueue->vQueue()->vmAddress();
      } else {
        if (!createVirtualQueue(queue->size())) {
          LogError("Virtual queue creation failed!");
          return false;
        }
        vmQueue = vQueue()->vmAddress();
      }
      // Patch the GPU VA address in the original arguments
      WriteAqlArgAt(const_cast(params), vmQueue, sizeof(vmQueue), desc.offset_);
      // NOTE(review): this `break` leaves the parameter loop, so parameters that follow the
      // first queue argument are not processed - confirm that this is intended
      break;
    }
  }

  if (ldsAddress > dev().info().localMemSize_) {
    LogError("No local memory available\n");
    return false;
  }

  if (srdResource || hsaKernel.prog().isStaticSampler()) {
    dev().srds().fillResourceList(*this);
  }

  const static bool IsReadOnly = false;
  for (const pal::Memory* mem : hsaKernel.prog().globalStores()) {
    // Validate global store for a dependency in the queue
    memoryDependency().validate(*this, mem, IsReadOnly);
    addVmMemory(mem);
  }
  if (hsaKernel.prog().hasGlobalStores()) {
    // Validate code object for a dependency in the queue
    memoryDependency().validate(*this, &hsaKernel.prog().codeSegGpu(), IsReadOnly);
  }
  addVmMemory(&hsaKernel.prog().codeSegGpu());

  if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) {
    const Device::ScratchBuffer* scratch = dev().scratch(hwRing());
    // Validate scratch buffer to force sync mode, because
    // the current scratch logic is optimized for size and performance
    // Note: runtime can skip sync if the same kernel is used,
    // since the number of scratch regs remains the same
    if (!IsSameKernel(kernel)) {
      memoryDependency().validate(*this, scratch->memObj_, IsReadOnly);
    }
    addVmMemory(scratch->memObj_);
    logVmMemory("scratch", scratch->memObj_);
  }

  // Synchronize dispatches unconditionally in case memory tracking is disabled
  memoryDependency().sync(*this);

  return true;
}

// ================================================================================================
//! Updates the kernel table address in the virtual queue header and writes the header to the
//! virtual queue memory.
//!
//! \param hostQ        Queue passed to writeRawData() for the upload
//! \param kernelTable  Kernel table memory; nullptr stores a zero address
void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, const Memory* kernelTable) {
  if (nullptr == kernelTable) {
    vqHeader_->kernel_table = 0;
  } else {
    vqHeader_->kernel_table = kernelTable->vmAddress();
    addVmMemory(kernelTable);
  }
  virtualQueue_->writeRawData(hostQ, 0, sizeof(AmdVQueueHeader), vqHeader_, true);
}

// ================================================================================================
//! Checks whether \p src overlaps the tracked SDMA destination range (sdmaRange_).
//!
//! On overlap the range is reset to the extent of \p dst and true is returned; otherwise the
//! range is grown to include \p dst and false is returned.
//!
//! \param src  Source resource of the transfer
//! \param dst  Destination resource of the transfer
//! \return true if \p src overlaps the previously tracked range
bool VirtualGPU::validateSdmaOverlap(const Resource& src, const Resource& dst) {
  uint64_t srcVmEnd = src.vmAddress() + src.vmSize();
  // Overlap if src starts inside the range, ends inside the range, or encloses the range
  if (((src.vmAddress() >= sdmaRange_.start_) && (src.vmAddress() <= sdmaRange_.end_)) ||
      ((srcVmEnd >= sdmaRange_.start_) && (srcVmEnd <= sdmaRange_.end_)) ||
      ((src.vmAddress() <=
sdmaRange_.start_) && (srcVmEnd >= sdmaRange_.end_))) { sdmaRange_.start_ = dst.vmAddress(); sdmaRange_.end_ = dst.vmAddress() + dst.vmSize(); return true; } sdmaRange_.start_ = std::min(sdmaRange_.start_, dst.vmAddress()); sdmaRange_.end_ = std::max(sdmaRange_.end_, dst.vmAddress() + dst.vmSize()); return false; } // ================================================================================================ void* VirtualGPU::getOrCreateHostcallBuffer() { if (hostcallBuffer_ != nullptr) { return hostcallBuffer_; } // The number of packets required in each buffer is at least equal to the // maximum number of waves supported by the device. auto wavesPerCu = dev().info().maxThreadsPerCU_ / dev().info().wavefrontWidth_; auto numPackets = dev().info().maxComputeUnits_ * wavesPerCu; auto size = amd::getHostcallBufferSize(numPackets); auto align = amd::getHostcallBufferAlignment(); hostcallBuffer_ = dev().svmAlloc(dev().context(), size, align, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, nullptr); if (!hostcallBuffer_) { ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, "Failed to create hostcall buffer"); return nullptr; } ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "Created hostcall buffer %p (numPackets == %d, size == %d, align == %d) for virtual " "queue %p\n", hostcallBuffer_, numPackets, size, align, this); if (!amd::enableHostcalls(dev(), hostcallBuffer_, numPackets)) { ClPrint(amd::LOG_ERROR, amd::LOG_QUEUE, "Failed to register hostcall buffer %p with listener", hostcallBuffer_); return nullptr; } return hostcallBuffer_; } } // namespace amd::pal