// // Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. // #include "platform/perfctr.hpp" #include "platform/threadtrace.hpp" #include "platform/kernel.hpp" #include "platform/commandqueue.hpp" #include "device/gpu/gpuconstbuf.hpp" #include "device/gpu/gpuvirtual.hpp" #include "device/gpu/gpukernel.hpp" #include "device/gpu/gpuprogram.hpp" #include "device/gpu/gpucounters.hpp" #include "device/gpu/gputhreadtrace.hpp" #include "device/gpu/gputimestamp.hpp" #include "device/gpu/gpublit.hpp" #include "device/gpu/gpudebugger.hpp" #include "shader/ComputeProgramObject.h" #include "hsa.h" #include "amd_hsa_kernel_code.h" #include "amd_hsa_queue.h" #include #include #include #ifdef _WIN32 #include #include "amdocl/cl_d3d9_amd.hpp" #include "amdocl/cl_d3d10_amd.hpp" #include "amdocl/cl_d3d11_amd.hpp" #endif // _WIN32 namespace gpu { bool VirtualGPU::MemoryDependency::create(size_t numMemObj) { if (numMemObj > 0) { // Allocate the array of memory objects for dependency tracking memObjectsInQueue_ = new MemoryState[numMemObj]; if (NULL == memObjectsInQueue_) { return false; } memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj); maxMemObjectsInQueue_ = numMemObj; } return true; } void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memory, bool readOnly) { bool flushL1Cache = false; if (maxMemObjectsInQueue_ == 0) { // Flush cache gpu.flushCUCaches(); return; } uint64_t curStart = memory->hbOffset(); uint64_t curEnd = curStart + memory->hbSize(); // Loop through all memory objects in the queue and find dependency // @note don't include objects from the current kernel for (size_t j = 0; j < endMemObjectsInQueue_; ++j) { // Check if the queue already contains this mem object and // GPU operations aren't readonly uint64_t busyStart = memObjectsInQueue_[j].start_; uint64_t busyEnd = memObjectsInQueue_[j].end_; // Check if the start inside the busy region if ((((curStart >= busyStart) && (curStart < busyEnd)) || // Check if the end inside the busy region ((curEnd > busyStart) && (curEnd <= busyEnd)) || // Check if the start/end cover the busy region ((curStart <= busyStart) && (curEnd >= busyEnd))) && // If the buys region was written or the current one is for write (!memObjectsInQueue_[j].readOnly_ || !readOnly)) { flushL1Cache = true; break; } } // Did we reach the limit? if (maxMemObjectsInQueue_ <= numMemObjectsInQueue_) { flushL1Cache = true; } if (flushL1Cache) { // Flush cache gpu.flushCUCaches(); // Clear memory dependency state const static bool All = true; clear(!All); } // Insert current memory object into the queue always, // since runtime calls flush before kernel execution and it has to keep // current kernel in tracking memObjectsInQueue_[numMemObjectsInQueue_].start_ = curStart; memObjectsInQueue_[numMemObjectsInQueue_].end_ = curEnd; memObjectsInQueue_[numMemObjectsInQueue_].readOnly_ = readOnly; numMemObjectsInQueue_++; } void VirtualGPU::MemoryDependency::clear(bool all) { if (numMemObjectsInQueue_ > 0) { size_t i, j; if (all) { endMemObjectsInQueue_ = numMemObjectsInQueue_; } // If the current launch didn't start from the beginning, then move the data if (0 != endMemObjectsInQueue_) { // Preserve all objects from the current kernel for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) { memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_; memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_; memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_; } } else if (numMemObjectsInQueue_ >= maxMemObjectsInQueue_) { // note: The array growth shouldn't occur under the normal conditions, // but in a case when SVM path sends the amount of SVM ptrs over // the max size of kernel arguments MemoryState* ptr = new MemoryState[maxMemObjectsInQueue_ << 1]; if (nullptr == ptr) { numMemObjectsInQueue_ = 0; return; } maxMemObjectsInQueue_ <<= 1; memcpy(ptr, memObjectsInQueue_, sizeof(MemoryState) * numMemObjectsInQueue_); delete[] memObjectsInQueue_; memObjectsInQueue_= ptr; } numMemObjectsInQueue_ -= endMemObjectsInQueue_; endMemObjectsInQueue_ = 0; } } VirtualGPU::DmaFlushMgmt::DmaFlushMgmt(const Device& dev) : cbWorkload_(0), dispatchSplitSize_(0) { aluCnt_ = dev.info().simdPerCU_ * dev.info().simdWidth_ * dev.info().maxComputeUnits_; maxDispatchWorkload_ = static_cast(dev.info().maxEngineClockFrequency_) * // find time in us dev.settings().maxWorkloadTime_ * aluCnt_; resetCbWorkload(dev); } void VirtualGPU::DmaFlushMgmt::resetCbWorkload(const Device& dev) { cbWorkload_ = 0; maxCbWorkload_ = static_cast(dev.info().maxEngineClockFrequency_) * // find time in us dev.settings().minWorkloadTime_ * aluCnt_; } void VirtualGPU::DmaFlushMgmt::findSplitSize(const Device& dev, uint64_t threads, uint instructions) { uint64_t workload = threads * instructions; if (maxDispatchWorkload_ < workload) { dispatchSplitSize_ = static_cast(maxDispatchWorkload_ / instructions); uint fullLoad = dev.info().maxComputeUnits_ * dev.info().preferredWorkGroupSize_; if ((dispatchSplitSize_ % fullLoad) != 0) { dispatchSplitSize_ = (dispatchSplitSize_ / fullLoad + 1) * fullLoad; } } else { dispatchSplitSize_ = (threads > dev.settings().workloadSplitSize_) ? dev.settings().workloadSplitSize_ : 0; } } bool VirtualGPU::DmaFlushMgmt::isCbReady(VirtualGPU& gpu, uint64_t threads, uint instructions) { bool cbReady = false; uint64_t workload = amd::alignUp(threads, 4 * aluCnt_) * instructions; // Add current workload to the overall workload in the current DMA cbWorkload_ += workload; // Did it exceed maximum? if (cbWorkload_ > maxCbWorkload_) { // Reset DMA workload cbWorkload_ = 0; // Increase workload of the next DMA buffer by 50% maxCbWorkload_ = maxCbWorkload_ * 3 / 2; if (maxCbWorkload_ > maxDispatchWorkload_) { maxCbWorkload_ = maxDispatchWorkload_; } cbReady = true; } return cbReady; } bool VirtualGPU::gslOpen(uint nEngines, gslEngineDescriptor* engines, uint32_t rtCUs) { // GSL device initialization dev().PerformFullInitialization(); // Wait the event m_waitType = dev().settings().syncObject_ ? CAL_WAIT_LOW_CPU_UTILIZATION : CAL_WAIT_POLLING; if (!open(&dev(), nEngines, engines, rtCUs)) { return false; } return true; } void VirtualGPU::gslDestroy() { close(dev().getNative()); } void VirtualGPU::addXferWrite(Memory& memory) { if (xferWriteBuffers_.size() > 7) { dev().xferWrite().release(*this, *xferWriteBuffers_.front()); xferWriteBuffers_.pop_front(); } // Delay destruction xferWriteBuffers_.push_back(&memory); } void VirtualGPU::releaseXferWrite() { for (auto& memory : xferWriteBuffers_) { dev().xferWrite().release(*this, *memory); } xferWriteBuffers_.clear(); } void VirtualGPU::addPinnedMem(amd::Memory* mem) { if (NULL == findPinnedMem(mem->getHostMem(), mem->getSize())) { if (pinnedMems_.size() > 7) { pinnedMems_.front()->release(); pinnedMems_.pop_front(); } // Start operation, since we should release mem object flushDMA(getGpuEvent(dev().getGpuMemory(mem)->gslResource())->engineId_); // Delay destruction pinnedMems_.push_back(mem); } } void VirtualGPU::releasePinnedMem() { for (auto& amdMemory : pinnedMems_) { amdMemory->release(); } pinnedMems_.clear(); } amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) { for (auto& amdMemory : pinnedMems_) { if ((amdMemory->getHostMem() == addr) && (size <= amdMemory->getSize())) { return amdMemory; } } return NULL; } bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) { uint MinDeviceQueueSize = 16 * 1024; deviceQueueSize = std::max(deviceQueueSize, MinDeviceQueueSize); maskGroups_ = deviceQueueSize / (512 * Ki); maskGroups_ = (maskGroups_ == 0) ? 1 : maskGroups_; // Align the queue size for the multiple dispatch scheduler. // Each thread works with 32 entries * maskGroups uint extra = deviceQueueSize % (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_); if (extra != 0) { deviceQueueSize += (sizeof(AmdAqlWrap) * DeviceQueueMaskSize * maskGroups_) - extra; } if (deviceQueueSize_ == deviceQueueSize) { return true; } else { //! @todo Temporarily keep the buffer mapped for debug purpose if (NULL != schedParams_) { schedParams_->unmap(this); } delete vqHeader_; delete virtualQueue_; delete schedParams_; vqHeader_ = NULL; virtualQueue_ = NULL; schedParams_ = NULL; schedParamIdx_ = 0; deviceQueueSize_ = 0; } uint numSlots = deviceQueueSize / sizeof(AmdAqlWrap); uint allocSize = deviceQueueSize; // Add the virtual queue header allocSize += sizeof(AmdVQueueHeader); allocSize = amd::alignUp(allocSize, sizeof(AmdAqlWrap)); uint argOffs = allocSize; // Add the kernel arguments and wait events uint singleArgSize = amd::alignUp( dev().info().maxParameterSize_ + 64 + dev().settings().numWaitEvents_ * sizeof(uint64_t), sizeof(AmdAqlWrap)); allocSize += singleArgSize * numSlots; uint eventsOffs = allocSize; // Add the device events allocSize += dev().settings().numDeviceEvents_ * sizeof(AmdEvent); uint eventMaskOffs = allocSize; // Add mask array for events allocSize += amd::alignUp(dev().settings().numDeviceEvents_, DeviceQueueMaskSize) / 8; uint slotMaskOffs = allocSize; // Add mask array for AmdAqlWrap slots allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8; virtualQueue_ = new Memory(dev(), allocSize); Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? Resource::Local : Resource::Remote; if ((virtualQueue_ == NULL) || !virtualQueue_->create(type)) { return false; } address ptr = reinterpret_cast

(virtualQueue_->map(this, Resource::WriteOnly)); if (NULL == ptr) { return false; } // Clear memory memset(ptr, 0, allocSize); uint64_t vaBase = virtualQueue_->vmAddress(); AmdVQueueHeader* header = reinterpret_cast(ptr); // Initialize the virtual queue header header->aql_slot_num = numSlots; header->event_slot_num = dev().settings().numDeviceEvents_; header->event_slot_mask = vaBase + eventMaskOffs; header->event_slots = vaBase + eventsOffs; header->aql_slot_mask = vaBase + slotMaskOffs; header->wait_size = dev().settings().numWaitEvents_; header->arg_size = dev().info().maxParameterSize_ + 64; header->mask_groups = maskGroups_; vqHeader_ = new AmdVQueueHeader; if (NULL == vqHeader_) { return false; } *vqHeader_ = *header; // Go over all slots and perform initialization AmdAqlWrap* slots = reinterpret_cast(&header[1]); for (uint i = 0; i < numSlots; ++i) { uint64_t argStart = vaBase + argOffs + i * singleArgSize; slots[i].aql.kernarg_address = reinterpret_cast(argStart); slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64; } // Upload data back to local memory if (GPU_PRINT_CHILD_KERNEL == 0) { virtualQueue_->unmap(this); } schedParams_ = new Memory(dev(), 64 * Ki); if ((schedParams_ == NULL) || !schedParams_->create(Resource::RemoteUSWC)) { return false; } ptr = reinterpret_cast

(schedParams_->map(this)); deviceQueueSize_ = deviceQueueSize; return true; } VirtualGPU::VirtualGPU(Device& device) : device::VirtualDevice(device), CALGSLContext(), engineID_(MainEngine), activeKernelDesc_(NULL), gpuDevice_(static_cast(device)), execution_("Virtual GPU execution lock", true), printfDbg_(NULL), printfDbgHSA_(NULL), tsCache_(NULL), vmMems_(NULL), numVmMems_(0), dmaFlushMgmt_(device), hwRing_(0), readjustTimeGPU_(0), currTs_(NULL), vqHeader_(NULL), virtualQueue_(NULL), schedParams_(NULL), schedParamIdx_(0), deviceQueueSize_(0), maskGroups_(1), hsaQueueMem_(NULL), profileEnabled_(false) { memset(&cal_, 0, sizeof(CalVirtualDesc)); for (uint i = 0; i < AllEngines; ++i) { cal_.events_[i].invalidate(); } memset(&cal_.samplersState_, 0xff, sizeof(cal_.samplersState_)); // Note: Virtual GPU device creation must be a thread safe operation index_ = gpuDevice_.numOfVgpus_++; gpuDevice_.vgpus_.resize(gpuDevice_.numOfVgpus()); gpuDevice_.vgpus_[index()] = this; } bool VirtualGPU::create(bool profiling, uint rtCUs, uint deviceQueueSize, amd::CommandQueue::Priority priority) { device::BlitManager::Setup blitSetup; gslEngineDescriptor engines[2]; uint engineMask = 0; uint32_t num = 0; if (index() >= GPU_MAX_COMMAND_QUEUES) { // Cap the maximum number of concurrent Virtual GPUs. return false; } // Virtual GPU will have profiling enabled state_.profiling_ = profiling; { if (dev().engines().numComputeRings()) { uint idx; if ((amd::CommandQueue::RealTimeDisabled == rtCUs) && (priority == amd::CommandQueue::Priority::Normal)) { idx = index() % dev().engines().numComputeRings(); engineMask = dev().engines().getMask((gslEngineID)( dev().isComputeRingIDForced() ? dev().getforcedComputeEngineID() : (dev().getFirstAvailableComputeEngineID() + idx))); } else { if ((priority == amd::CommandQueue::Priority::Medium) && (amd::CommandQueue::RealTimeDisabled == rtCUs)) { engineMask = dev().engines().getMask((gslEngineID)(GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY)); } else { if (priority == amd::CommandQueue::Priority::Medium) { engineMask = dev().engines().getMask((gslEngineID)(GSL_ENGINEID_COMPUTE_RT1)); } else { engineMask = dev().engines().getMask((gslEngineID)(GSL_ENGINEID_COMPUTE_RT)); } } //!@todo This is not a generic solution and // may have issues with > 8 queues idx = index() % (dev().engines().numComputeRings() + dev().engines().numComputeRingsRT()); } // hwRing_ should be set 0 if forced to have single scratch buffer hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx; if (dev().canDMA()) { // If only 1 DMA engine is available then use that one if (dev().engines().numDMAEngines() < 2) { engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA0); } else if (index() & 0x1) { engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA0); } else { engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA1); } } } else { engineMask = dev().engines().getMask(GSL_ENGINEID_3DCOMPUTE0); if (dev().canDMA()) { engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA0); } } } num = dev().engines().getRequested(engineMask, engines); // Open GSL context if ((num == 0) || !gslOpen(num, engines, rtCUs)) { return false; } // Diable double copy optimization, // since UAV read from nonlocal is fast enough blitSetup.disableCopyBufferToImageOpt_ = true; if (!allocConstantBuffers()) { return false; } // Create Printf class printfDbg_ = new PrintfDbg(gpuDevice_); if ((NULL == printfDbg_) || !printfDbg_->create()) { delete printfDbg_; LogError("Could not allocate debug buffer for printf()!"); return false; } // Create HSAILPrintf class printfDbgHSA_ = new PrintfDbgHSA(gpuDevice_); if (NULL == printfDbgHSA_) { delete printfDbgHSA_; LogError("Could not create PrintfDbgHSA class!"); return false; } // Choose the appropriate class for blit engine switch (dev().settings().blitEngine_) { default: // Fall through ... case Settings::BlitEngineHost: blitSetup.disableAll(); // Fall through ... case Settings::BlitEngineCAL: case Settings::BlitEngineKernel: // use host blit for HW debug if (dev().settings().enableHwDebug_) { blitSetup.disableCopyImageToBuffer_ = true; blitSetup.disableCopyBufferToImage_ = true; } blitMgr_ = new KernelBlitManager(*this, blitSetup); break; } if ((NULL == blitMgr_) || !blitMgr_->create(gpuDevice_)) { LogError("Could not create BlitManager!"); return false; } tsCache_ = new TimeStampCache(*this); if (NULL == tsCache_) { LogError("Could not create TimeStamp cache!"); return false; } if (!memoryDependency().create(dev().settings().numMemDependencies_)) { LogError("Could not create the array of memory objects!"); return false; } if (!allocHsaQueueMem()) { LogError("Could not create hsaQueueMem object!"); return false; } // Check if the app requested a device queue creation if (dev().settings().useDeviceQueue_ && (0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) { LogError("Could not create a virtual queue!"); return false; } return true; } bool VirtualGPU::allocHsaQueueMem() { // Allocate a dummy HSA queue hsaQueueMem_ = new Memory(dev(), sizeof(amd_queue_t)); if ((hsaQueueMem_ == NULL) || (!hsaQueueMem_->create(Resource::Local))) { delete hsaQueueMem_; return false; } amd_queue_t* queue = reinterpret_cast(hsaQueueMem_->map(NULL, Resource::WriteOnly)); if (NULL == queue) { delete hsaQueueMem_; return false; } memset(queue, 0, sizeof(amd_queue_t)); // Provide private and local heap addresses const static uint addressShift = LP64_SWITCH(0, 32); queue->private_segment_aperture_base_hi = static_cast(dev().gslCtx()->getPrivateApertureBase() >> addressShift); queue->group_segment_aperture_base_hi = static_cast(dev().gslCtx()->getSharedApertureBase() >> addressShift); hsaQueueMem_->unmap(NULL); return true; } VirtualGPU::~VirtualGPU() { // Not safe to remove a queue. So lock the device amd::ScopedLock k(dev().lockAsyncOps()); amd::ScopedLock lock(dev().vgpusAccess()); uint i; // Destroy all kernels for (const auto& it : gslKernels_) { if (it.first != 0) { freeKernelDesc(it.second); } } gslKernels_.clear(); // Destroy all memories static const bool SkipScratch = false; releaseMemObjects(SkipScratch); // Destroy printf object delete printfDbg_; // Destroy printfHSA object delete printfDbgHSA_; // Destroy BlitManager object delete blitMgr_; // Destroy TimeStamp cache delete tsCache_; // Destroy resource list with the constant buffers for (i = 0; i < constBufs_.size(); ++i) { delete constBufs_[i]; } gslDestroy(); gpuDevice_.numOfVgpus_--; gpuDevice_.vgpus_.erase(gpuDevice_.vgpus_.begin() + index()); for (uint idx = index(); idx < dev().vgpus().size(); ++idx) { dev().vgpus()[idx]->index_--; } // Release scratch buffer memory to reduce memory pressure //!@note OCLtst uses single device with multiple tests //! Release memory only if it's the last command queue. //! The first queue is reserved for the transfers on device if (gpuDevice_.numOfVgpus_ <= 1) { gpuDevice_.destroyScratchBuffers(); } delete[] vmMems_; //! @todo Temporarily keep the buffer mapped for debug purpose if (NULL != schedParams_) { schedParams_->unmap(this); } delete vqHeader_; delete virtualQueue_; delete schedParams_; delete hsaQueueMem_; } void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); // Translate memory references and ensure cache up-to-date gpu::Memory* memory = dev().getGpuMemory(&vcmd.source()); size_t offset = 0; // Find if virtual address is a CL allocation device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.destination(), &offset); profilingBegin(vcmd, true); memory->syncCacheFromHost(*this); cl_command_type type = vcmd.type(); bool result = false; amd::Memory* bufferFromImage = NULL; // Force buffer read for IMAGE1D_BUFFER if ((type == CL_COMMAND_READ_IMAGE) && (vcmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { bufferFromImage = createBufferFromImage(vcmd.source()); if (NULL == bufferFromImage) { LogError("We should not fail buffer creation from image_buffer!"); } else { type = CL_COMMAND_READ_BUFFER; memory = dev().getGpuMemory(bufferFromImage); } } // Process different write commands switch (type) { case CL_COMMAND_READ_BUFFER: { amd::Coord3D origin(vcmd.origin()[0]); amd::Coord3D size(vcmd.size()[0]); if (NULL != bufferFromImage) { size_t elemSize = vcmd.source().asImage()->getImageFormat().getElementSize(); origin.c[0] *= elemSize; size.c[0] *= elemSize; } if (hostMemory != NULL) { // Accelerated transfer without pinning amd::Coord3D dstOrigin(offset); result = blitMgr().copyBuffer(*memory, *hostMemory, origin, dstOrigin, size, vcmd.isEntireMemory()); } else { result = blitMgr().readBuffer(*memory, vcmd.destination(), origin, size, vcmd.isEntireMemory()); } if (NULL != bufferFromImage) { bufferFromImage->release(); } } break; case CL_COMMAND_READ_BUFFER_RECT: { amd::BufferRect hostbufferRect; amd::Coord3D region(0); amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset); hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_); if (hostMemory != NULL) { result = blitMgr().copyBufferRect(*memory, *hostMemory, vcmd.bufRect(), hostbufferRect, vcmd.size(), vcmd.isEntireMemory()); } else { result = blitMgr().readBufferRect(*memory, vcmd.destination(), vcmd.bufRect(), vcmd.hostRect(), vcmd.size(), vcmd.isEntireMemory()); } } break; case CL_COMMAND_READ_IMAGE: if (hostMemory != NULL) { // Accelerated image to buffer transfer without pinning amd::Coord3D dstOrigin(offset); result = blitMgr().copyImageToBuffer(*memory, *hostMemory, vcmd.origin(), dstOrigin, vcmd.size(), vcmd.isEntireMemory(), vcmd.rowPitch(), vcmd.slicePitch()); } else { result = blitMgr().readImage(*memory, vcmd.destination(), vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory()); } break; default: LogError("Unsupported type for the read command"); break; } if (!result) { LogError("submitReadMemory failed!"); vcmd.setStatus(CL_INVALID_OPERATION); } profilingEnd(vcmd); } void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); // Translate memory references and ensure cache up to date gpu::Memory* memory = dev().getGpuMemory(&vcmd.destination()); size_t offset = 0; // Find if virtual address is a CL allocation device::Memory* hostMemory = dev().findMemoryFromVA(vcmd.source(), &offset); profilingBegin(vcmd, true); bool entire = vcmd.isEntireMemory(); // Synchronize memory from host if necessary device::Memory::SyncFlags syncFlags; syncFlags.skipEntire_ = entire; memory->syncCacheFromHost(*this, syncFlags); cl_command_type type = vcmd.type(); bool result = false; amd::Memory* bufferFromImage = NULL; // Force buffer write for IMAGE1D_BUFFER if ((type == CL_COMMAND_WRITE_IMAGE) && (vcmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { bufferFromImage = createBufferFromImage(vcmd.destination()); if (NULL == bufferFromImage) { LogError("We should not fail buffer creation from image_buffer!"); } else { type = CL_COMMAND_WRITE_BUFFER; memory = dev().getGpuMemory(bufferFromImage); } } // Process different write commands switch (type) { case CL_COMMAND_WRITE_BUFFER: { amd::Coord3D origin(vcmd.origin()[0]); amd::Coord3D size(vcmd.size()[0]); if (NULL != bufferFromImage) { size_t elemSize = vcmd.destination().asImage()->getImageFormat().getElementSize(); origin.c[0] *= elemSize; size.c[0] *= elemSize; } if (hostMemory != NULL) { // Accelerated transfer without pinning amd::Coord3D srcOrigin(offset); result = blitMgr().copyBuffer(*hostMemory, *memory, srcOrigin, origin, size, vcmd.isEntireMemory()); } else { result = blitMgr().writeBuffer(vcmd.source(), *memory, origin, size, vcmd.isEntireMemory()); } if (NULL != bufferFromImage) { bufferFromImage->release(); } } break; case CL_COMMAND_WRITE_BUFFER_RECT: { amd::BufferRect hostbufferRect; amd::Coord3D region(0); amd::Coord3D hostOrigin(vcmd.hostRect().start_ + offset); hostbufferRect.create(hostOrigin.c, vcmd.size().c, vcmd.hostRect().rowPitch_, vcmd.hostRect().slicePitch_); if (hostMemory != NULL) { result = blitMgr().copyBufferRect(*hostMemory, *memory, hostbufferRect, vcmd.bufRect(), vcmd.size(), vcmd.isEntireMemory()); } else { result = blitMgr().writeBufferRect(vcmd.source(), *memory, vcmd.hostRect(), vcmd.bufRect(), vcmd.size(), vcmd.isEntireMemory()); } } break; case CL_COMMAND_WRITE_IMAGE: if (hostMemory != NULL) { // Accelerated buffer to image transfer without pinning amd::Coord3D srcOrigin(offset); result = blitMgr().copyBufferToImage(*hostMemory, *memory, srcOrigin, vcmd.origin(), vcmd.size(), vcmd.isEntireMemory(), vcmd.rowPitch(), vcmd.slicePitch()); } else { result = blitMgr().writeImage(vcmd.source(), *memory, vcmd.origin(), vcmd.size(), vcmd.rowPitch(), vcmd.slicePitch(), vcmd.isEntireMemory()); } break; default: LogError("Unsupported type for the write command"); break; } if (!result) { LogError("submitWriteMemory failed!"); vcmd.setStatus(CL_INVALID_OPERATION); } else { // Mark this as the most-recently written cache of the destination vcmd.destination().signalWrite(&gpuDevice_); } profilingEnd(vcmd); } bool VirtualGPU::copyMemory(cl_command_type type, amd::Memory& srcMem, amd::Memory& dstMem, bool entire, const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, const amd::BufferRect& srcRect, const amd::BufferRect& dstRect) { // Translate memory references and ensure cache up-to-date gpu::Memory* dstMemory = dev().getGpuMemory(&dstMem); gpu::Memory* srcMemory = dev().getGpuMemory(&srcMem); // Synchronize source and destination memory device::Memory::SyncFlags syncFlags; syncFlags.skipEntire_ = entire; dstMemory->syncCacheFromHost(*this, syncFlags); srcMemory->syncCacheFromHost(*this); amd::Memory* bufferFromImageSrc = NULL; amd::Memory* bufferFromImageDst = NULL; // Force buffer read for IMAGE1D_BUFFER if ((srcMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { bufferFromImageSrc = createBufferFromImage(srcMem); if (NULL == bufferFromImageSrc) { LogError("We should not fail buffer creation from image_buffer!"); } else { type = CL_COMMAND_COPY_BUFFER; srcMemory = dev().getGpuMemory(bufferFromImageSrc); } } // Force buffer write for IMAGE1D_BUFFER if ((dstMem.getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { bufferFromImageDst = createBufferFromImage(dstMem); if (NULL == bufferFromImageDst) { LogError("We should not fail buffer creation from image_buffer!"); } else { type = CL_COMMAND_COPY_BUFFER; dstMemory = dev().getGpuMemory(bufferFromImageDst); } } bool result = false; // Check if HW can be used for memory copy switch (type) { case CL_COMMAND_SVM_MEMCPY: case CL_COMMAND_COPY_BUFFER: { amd::Coord3D realSrcOrigin(srcOrigin[0]); amd::Coord3D realDstOrigin(dstOrigin[0]); amd::Coord3D realSize(size.c[0], size.c[1], size.c[2]); if (NULL != bufferFromImageSrc) { size_t elemSize = srcMem.asImage()->getImageFormat().getElementSize(); realSrcOrigin.c[0] *= elemSize; if (NULL != bufferFromImageDst) { realDstOrigin.c[0] *= elemSize; } realSize.c[0] *= elemSize; } else if (NULL != bufferFromImageDst) { size_t elemSize = dstMem.asImage()->getImageFormat().getElementSize(); realDstOrigin.c[0] *= elemSize; realSize.c[0] *= elemSize; } result = blitMgr().copyBuffer(*srcMemory, *dstMemory, realSrcOrigin, realDstOrigin, realSize, entire); if (NULL != bufferFromImageSrc) { bufferFromImageSrc->release(); } if (NULL != bufferFromImageDst) { bufferFromImageDst->release(); } } break; case CL_COMMAND_COPY_BUFFER_RECT: result = blitMgr().copyBufferRect(*srcMemory, *dstMemory, srcRect, dstRect, size, entire); break; case CL_COMMAND_COPY_IMAGE_TO_BUFFER: result = blitMgr().copyImageToBuffer(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire); break; case CL_COMMAND_COPY_BUFFER_TO_IMAGE: result = blitMgr().copyBufferToImage(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire); break; case CL_COMMAND_COPY_IMAGE: result = blitMgr().copyImage(*srcMemory, *dstMemory, srcOrigin, dstOrigin, size, entire); break; default: LogError("Unsupported command type for memory copy!"); break; } if (!result) { LogError("submitCopyMemory failed!"); return false; } else { // Mark this as the most-recently written cache of the destination dstMem.signalWrite(&gpuDevice_); } return true; } void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); cl_command_type type = vcmd.type(); bool entire = vcmd.isEntireMemory(); if (!copyMemory(type, vcmd.source(), vcmd.destination(), entire, vcmd.srcOrigin(), vcmd.dstOrigin(), vcmd.size(), vcmd.srcRect(), vcmd.dstRect())) { vcmd.setStatus(CL_INVALID_OPERATION); } profilingEnd(vcmd); } void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); cl_command_type type = vcmd.type(); // no op for FGS supported device if (!dev().isFineGrainedSystem()) { amd::Coord3D srcOrigin(0, 0, 0); amd::Coord3D dstOrigin(0, 0, 0); amd::Coord3D size(vcmd.srcSize(), 1, 1); amd::BufferRect srcRect; amd::BufferRect dstRect; bool result = false; amd::Memory* srcMem = amd::MemObjMap::FindMemObj(vcmd.src()); amd::Memory* dstMem = amd::MemObjMap::FindMemObj(vcmd.dst()); device::Memory::SyncFlags syncFlags; if (nullptr != srcMem) { srcMem->commitSvmMemory(); srcOrigin.c[0] = static_cast(vcmd.src()) - static_cast

(srcMem->getSvmPtr()); if (!(srcMem->validateRegion(srcOrigin, size))) { vcmd.setStatus(CL_INVALID_OPERATION); return; } } if (nullptr != dstMem) { dstMem->commitSvmMemory(); dstOrigin.c[0] = static_cast(vcmd.dst()) - static_cast

(dstMem->getSvmPtr()); if (!(dstMem->validateRegion(dstOrigin, size))) { vcmd.setStatus(CL_INVALID_OPERATION); return; } } if (nullptr == srcMem && nullptr == dstMem) { // both not in svm space amd::Os::fastMemcpy(vcmd.dst(), vcmd.src(), vcmd.srcSize()); result = true; } else if (nullptr == srcMem && nullptr != dstMem) { // src not in svm space Memory* memory = dev().getGpuMemory(dstMem); // Synchronize source and destination memory syncFlags.skipEntire_ = dstMem->isEntirelyCovered(dstOrigin, size); memory->syncCacheFromHost(*this, syncFlags); result = blitMgr().writeBuffer(vcmd.src(), *memory, dstOrigin, size, dstMem->isEntirelyCovered(dstOrigin, size)); // Mark this as the most-recently written cache of the destination dstMem->signalWrite(&gpuDevice_); } else if (nullptr != srcMem && nullptr == dstMem) { // dst not in svm space Memory* memory = dev().getGpuMemory(srcMem); // Synchronize source and destination memory memory->syncCacheFromHost(*this); result = blitMgr().readBuffer(*memory, vcmd.dst(), srcOrigin, size, srcMem->isEntirelyCovered(srcOrigin, size)); } else if (nullptr != srcMem && nullptr != dstMem) { // both in svm space bool entire = srcMem->isEntirelyCovered(srcOrigin, size) && dstMem->isEntirelyCovered(dstOrigin, size); result = copyMemory(type, *srcMem, *dstMem, entire, srcOrigin, dstOrigin, size, srcRect, dstRect); } if (!result) { vcmd.setStatus(CL_INVALID_OPERATION); } } else { // direct memcpy for FGS enabled system amd::SvmBuffer::memFill(vcmd.dst(), vcmd.src(), vcmd.srcSize(), 1); } profilingEnd(vcmd); } void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd, true); gpu::Memory* memory = dev().getGpuMemory(&vcmd.memory()); // Save map info for unmap operation memory->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(), vcmd.isEntireMemory()); // If we have host memory, use it if ((memory->owner()->getHostMem() != NULL) && memory->isDirectMap()) { if (!memory->isHostMemDirectAccess()) { // Make sure GPU finished operation before // synchronization with the backing store memory->wait(*this); } // Target is the backing store, so just ensure that owner is up-to-date memory->owner()->cacheWriteBack(); // Add memory to VA cache, so rutnime can detect direct access to VA dev().addVACache(memory); } else if (memory->isPersistentDirectMap()) { // Nothing to do here } else if (memory->mapMemory() != NULL) { // Target is a remote resource, so copy assert(memory->mapMemory() != NULL); if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { amd::Coord3D dstOrigin(0, 0, 0); if (memory->cal()->buffer_) { if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(), vcmd.size(), vcmd.isEntireMemory())) { LogError("submitMapMemory() - copy failed"); vcmd.setStatus(CL_MAP_FAILURE); } } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { amd::Memory* bufferFromImage = NULL; Memory* memoryBuf = memory; amd::Coord3D origin(vcmd.origin()[0]); amd::Coord3D size(vcmd.size()[0]); size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize(); origin.c[0] *= elemSize; size.c[0] *= elemSize; bufferFromImage = createBufferFromImage(vcmd.memory()); if (NULL == bufferFromImage) { LogError("We should not fail buffer creation from image_buffer!"); } else { memoryBuf = dev().getGpuMemory(bufferFromImage); } if (!blitMgr().copyBuffer(*memoryBuf, *memory->mapMemory(), origin, dstOrigin, size, vcmd.isEntireMemory())) { LogError("submitMapMemory() - copy failed"); vcmd.setStatus(CL_MAP_FAILURE); } if (NULL != bufferFromImage) { bufferFromImage->release(); } } else { // Validate if it's a view for a map of mip level if (vcmd.memory().parent() != NULL) { amd::Image* amdImage = vcmd.memory().parent()->asImage(); if ((amdImage != NULL) && (amdImage->getMipLevels() > 1)) { // Save map write info in the parent object dev().getGpuMemory(amdImage)->saveMapInfo(vcmd.mapPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(), vcmd.isEntireMemory(), vcmd.memory().asImage()); } } if (!blitMgr().copyImageToBuffer(*memory, *memory->mapMemory(), vcmd.origin(), dstOrigin, vcmd.size(), vcmd.isEntireMemory())) { LogError("submitMapMemory() - copy failed"); vcmd.setStatus(CL_MAP_FAILURE); } } } } else { LogError("Unhandled map!"); } profilingEnd(vcmd); } void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); gpu::Memory* memory = dev().getGpuMemory(&vcmd.memory()); amd::Memory* owner = memory->owner(); bool unmapMip = false; const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); if (nullptr == writeMapInfo) { LogError("Unmap without map call"); return; } profilingBegin(vcmd, true); // Check if image is a mipmap and assign a saved view amd::Image* amdImage = owner->asImage(); if ((amdImage != NULL) && (amdImage->getMipLevels() > 1) && (writeMapInfo->baseMip_ != NULL)) { // Assign mip level view amdImage = writeMapInfo->baseMip_; // Clear unmap flags from the parent image memory->clearUnmapInfo(vcmd.mapPtr()); memory = dev().getGpuMemory(amdImage); unmapMip = true; writeMapInfo = memory->writeMapInfo(vcmd.mapPtr()); } // We used host memory if ((owner->getHostMem() != NULL) && memory->isDirectMap()) { if (writeMapInfo->isUnmapWrite()) { // Target is the backing store, so sync owner->signalWrite(NULL); memory->syncCacheFromHost(*this); } // Remove memory from VA cache dev().removeVACache(memory); } // data check was added for persistent memory that failed to get aperture // and therefore are treated like a remote resource else if (memory->isPersistentDirectMap() && (memory->data() != NULL)) { memory->unmap(this); } else if (memory->mapMemory() != NULL) { if (writeMapInfo->isUnmapWrite()) { amd::Coord3D srcOrigin(0, 0, 0); // Target is a remote resource, so copy assert(memory->mapMemory() != NULL); if (memory->cal()->buffer_) { if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_, writeMapInfo->origin_, writeMapInfo->region_, writeMapInfo->isEntire())) { LogError("submitUnmapMemory() - copy failed"); vcmd.setStatus(CL_OUT_OF_RESOURCES); } } else if ((vcmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { amd::Memory* bufferFromImage = NULL; Memory* memoryBuf = memory; amd::Coord3D origin(writeMapInfo->origin_[0]); amd::Coord3D size(writeMapInfo->region_[0]); size_t elemSize = vcmd.memory().asImage()->getImageFormat().getElementSize(); origin.c[0] *= elemSize; size.c[0] *= elemSize; bufferFromImage = createBufferFromImage(vcmd.memory()); if (NULL == bufferFromImage) { LogError("We should not fail buffer creation from image_buffer!"); } else { memoryBuf = dev().getGpuMemory(bufferFromImage); } if (!blitMgr().copyBuffer(*memory->mapMemory(), *memoryBuf, srcOrigin, origin, size, writeMapInfo->isEntire())) { LogError("submitUnmapMemory() - copy failed"); vcmd.setStatus(CL_OUT_OF_RESOURCES); } if (NULL != bufferFromImage) { bufferFromImage->release(); } } else { if (!blitMgr().copyBufferToImage(*memory->mapMemory(), *memory, srcOrigin, writeMapInfo->origin_, writeMapInfo->region_, writeMapInfo->isEntire())) { LogError("submitUnmapMemory() - copy failed"); vcmd.setStatus(CL_OUT_OF_RESOURCES); } } } } else { LogError("Unhandled unmap!"); vcmd.setStatus(CL_INVALID_VALUE); } // Clear unmap flags memory->clearUnmapInfo(vcmd.mapPtr()); // Release a view for a mipmap map if (unmapMip) { amdImage->release(); } profilingEnd(vcmd); } bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern, size_t patternSize, const amd::Coord3D& origin, const amd::Coord3D& size) { gpu::Memory* memory = dev().getGpuMemory(amdMemory); bool entire = amdMemory->isEntirelyCovered(origin, size); // Synchronize memory from host if necessary device::Memory::SyncFlags syncFlags; syncFlags.skipEntire_ = entire; memory->syncCacheFromHost(*this, syncFlags); bool result = false; amd::Memory* bufferFromImage = NULL; float fillValue[4]; // Force fill buffer for IMAGE1D_BUFFER if ((type == CL_COMMAND_FILL_IMAGE) && (amdMemory->getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) { bufferFromImage = createBufferFromImage(*amdMemory); if (NULL == bufferFromImage) { LogError("We should not fail buffer creation from image_buffer!"); } else { type = CL_COMMAND_FILL_BUFFER; memory = dev().getGpuMemory(bufferFromImage); } } // Find the the right fill operation switch (type) { case CL_COMMAND_FILL_BUFFER: case CL_COMMAND_SVM_MEMFILL: { amd::Coord3D realOrigin(origin[0]); amd::Coord3D realSize(size[0]); // Reprogram fill parameters if it's an IMAGE1D_BUFFER object if (NULL != bufferFromImage) { size_t elemSize = amdMemory->asImage()->getImageFormat().getElementSize(); realOrigin.c[0] *= elemSize; realSize.c[0] *= elemSize; memset(fillValue, 0, sizeof(fillValue)); amdMemory->asImage()->getImageFormat().formatColor(pattern, fillValue); pattern = fillValue; patternSize = elemSize; } result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize, amdMemory->isEntirelyCovered(origin, size)); if (NULL != bufferFromImage) { bufferFromImage->release(); } } break; case CL_COMMAND_FILL_IMAGE: result = blitMgr().fillImage(*memory, pattern, origin, size, amdMemory->isEntirelyCovered(origin, size)); break; default: LogError("Unsupported command type for FillMemory!"); break; } if (!result) { LogError("fillMemory failed!"); return false; } // Mark this as the most-recently written cache of the destination amdMemory->signalWrite(&gpuDevice_); return true; } void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd, true); if (!fillMemory(vcmd.type(), &vcmd.memory(), vcmd.pattern(), vcmd.patternSize(), vcmd.origin(), vcmd.size())) { vcmd.setStatus(CL_INVALID_OPERATION); } profilingEnd(vcmd); } void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd, true); // no op for FGS supported device if (!dev().isFineGrainedSystem()) { // Make sure we have memory for the command execution gpu::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); memory->saveMapInfo(vcmd.svmPtr(), vcmd.origin(), vcmd.size(), vcmd.mapFlags(), vcmd.isEntireMemory()); if (memory->mapMemory() != NULL) { if (vcmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) { assert(memory->cal()->buffer_ && "SVM memory can't be an image"); if (!blitMgr().copyBuffer(*memory, *memory->mapMemory(), vcmd.origin(), vcmd.origin(), vcmd.size(), vcmd.isEntireMemory())) { LogError("submitSVMMapMemory() - copy failed"); vcmd.setStatus(CL_MAP_FAILURE); } } } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { if (!memory->isHostMemDirectAccess()) { // Make sure GPU finished operation before // synchronization with the backing store memory->wait(*this); } // Target is the backing store, so just ensure that owner is up-to-date memory->owner()->cacheWriteBack(); } else { LogError("Unhandled svm map!"); } } profilingEnd(vcmd); } void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd, true); // no op for FGS supported device if (!dev().isFineGrainedSystem()) { gpu::Memory* memory = dev().getGpuMemory(vcmd.getSvmMem()); const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(vcmd.svmPtr()); if (memory->mapMemory() != NULL) { if (writeMapInfo->isUnmapWrite()) { // Target is a remote resource, so copy assert(memory->cal()->buffer_ && "SVM memory can't be an image"); if (!blitMgr().copyBuffer(*memory->mapMemory(), *memory, writeMapInfo->origin_, writeMapInfo->origin_, writeMapInfo->region_, writeMapInfo->isEntire())) { LogError("submitSvmUnmapMemory() - copy failed"); vcmd.setStatus(CL_OUT_OF_RESOURCES); } } } else if ((memory->owner()->getHostMem() != nullptr) && memory->isDirectMap()) { if (writeMapInfo->isUnmapWrite()) { // Target is the backing store, so sync memory->owner()->signalWrite(nullptr); memory->syncCacheFromHost(*this); } } memory->clearUnmapInfo(vcmd.svmPtr()); } profilingEnd(vcmd); } void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd, true); if (!dev().isFineGrainedSystem()) { size_t patternSize = vcmd.patternSize(); size_t fillSize = patternSize * vcmd.times(); size_t offset = 0; amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(vcmd.dst()); assert(dstMemory && "No svm Buffer to fill with!"); offset = reinterpret_cast(vcmd.dst()) - reinterpret_cast(dstMemory->getSvmPtr()); assert((offset >= 0) && "wrong svm ptr to fill with!"); gpu::Memory* memory = dev().getGpuMemory(dstMemory); amd::Coord3D origin(offset, 0, 0); amd::Coord3D size(fillSize, 1, 1); assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!"); if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(), vcmd.patternSize(), origin, size)) { vcmd.setStatus(CL_INVALID_OPERATION); } // Mark this as the most-recently written cache of the destination dstMemory->signalWrite(&gpuDevice_); } else { // for FGS capable device, fill CPU memory directly amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times()); } profilingEnd(vcmd); } void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd, true); for (const auto& it : vcmd.memObjects()) { // Find device memory gpu::Memory* memory = dev().getGpuMemory(it); if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) { memory->mgpuCacheWriteBack(); } else if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) { // Synchronize memory from host if necessary. // The sync function will perform memory migration from // another device if necessary device::Memory::SyncFlags syncFlags; memory->syncCacheFromHost(*this, syncFlags); } else { LogWarning("Unknown operation for memory migration!"); } } profilingEnd(vcmd); } void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) { // in-order semantics: previous commands need to be done before we start // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); std::vector& svmPointers = vcmd.svmPointers(); if (vcmd.pfnFreeFunc() == NULL) { // pointers allocated using clSVMAlloc for (cl_uint i = 0; i < svmPointers.size(); ++i) { dev().svmFree(svmPointers[i]); } } else { vcmd.pfnFreeFunc()(as_cl(vcmd.queue()->asCommandQueue()), svmPointers.size(), static_cast(&(svmPointers[0])), vcmd.userData()); } profilingEnd(vcmd); } void VirtualGPU::findIterations(const amd::NDRangeContainer& sizes, const amd::NDRange& local, amd::NDRange& groups, amd::NDRange& remainder, size_t& extra) { size_t dimensions = sizes.dimensions(); if (cal()->iterations_ > 1) { size_t iterations = cal()->iterations_; cal_.iterations_ = 1; // Find the total amount of all groups groups = sizes.global() / local; for (uint j = 0; j < dimensions; ++j) { if ((sizes.global()[j] % local[j]) != 0) { groups[j]++; } } // Calculate the real number of required iterations and // the workgroup size of each iteration for (int j = (dimensions - 1); j >= 0; --j) { // Find possible size of each iteration size_t tmp = (groups[j] / iterations); // Make sure the group size is more than 1 if (tmp > 0) { remainder = groups; remainder[j] = (groups[j] % tmp); extra = ((groups[j] / tmp) + // Check for the remainder ((remainder[j] != 0) ? 1 : 0)); // Recalculate the number of iterations cal_.iterations_ *= extra; if (remainder[j] == 0) { extra = 0; } groups[j] = tmp; break; } else { iterations = ((iterations / groups[j]) + (((iterations % groups[j]) != 0) ? 1 : 0)); cal_.iterations_ *= groups[j]; groups[j] = 1; } } } } void VirtualGPU::setupIteration(uint iteration, const amd::NDRangeContainer& sizes, Kernel& gpuKernel, amd::NDRange& global, amd::NDRange& offsets, amd::NDRange& local, amd::NDRange& groups, amd::NDRange& groupOffset, amd::NDRange& divider, amd::NDRange& remainder, size_t extra) { size_t dimensions = sizes.dimensions(); // Calculate the workload size for the remainder if ((extra != 0) && ((iteration % extra) == 0)) { groups = remainder; } else { groups = divider; } global = groups * local; for (uint j = 0; j < dimensions; ++j) { size_t offset = groupOffset[j] * local[j]; if ((offset + global[j]) > sizes.global()[j]) { global[j] = sizes.global()[j] - offset; } } // Reprogram the kernel parameters for the GPU execution gpuKernel.setupProgramGrid(*this, dimensions, offsets, global, local, groupOffset, sizes.offset(), sizes.global()); // Update the constant buffers gpuKernel.bindConstantBuffers(*this); uint sub = 0; // Find the offsets for the next execution for (uint j = 0; j < dimensions; ++j) { groupOffset[j] += groups[j]; // Make sure the offset doesn't go over the size limit if (sizes.global()[j] <= groupOffset[j] * local[j]) { // Check if we counted a group in one dimension already if (sub) { groupOffset[j] -= groups[j]; } else { groupOffset[j] = 0; } } else { groupOffset[j] -= sub; // We already counted elements in one dimension sub = 1; } offsets[j] = groupOffset[j] * local[j] + sizes.offset()[j]; } } void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); // Submit kernel to HW if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, &vcmd.event())) { vcmd.setStatus(CL_INVALID_OPERATION); } profilingEnd(vcmd); } bool VirtualGPU::submitKernelInternalHSA(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, bool nativeMem, amd::Event* enqueueEvent) { uint64_t vmParentWrap = 0; uint64_t vmDefQueue = 0; amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev()); VirtualGPU* gpuDefQueue = NULL; amd::HwDebugManager* dbgManager = dev().hwDebugMgr(); // Get the HSA kernel object const HSAILKernel& hsaKernel = static_cast(*(kernel.getDeviceKernel(dev()))); std::vector memList; bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true : false; if (!printfDbgHSA().init(*this, printfEnabled)) { LogError("Printf debug buffer initialization failed!"); return false; } // Check memory dependency and SVM objects if (!processMemObjectsHSA(kernel, parameters, nativeMem, &memList)) { LogError("Wrong memory objects!"); return false; } cal_.memCount_ = 0; if (hsaKernel.dynamicParallelism()) { if (NULL == defQueue) { LogError("Default device queue wasn't allocated"); return false; } else { if (dev().settings().useDeviceQueue_) { gpuDefQueue = static_cast(defQueue->vDev()); if (gpuDefQueue->hwRing() == hwRing()) { LogError("Can't submit the child kernels to the same HW ring as the host queue!"); return false; } } else { createVirtualQueue(defQueue->size()); gpuDefQueue = this; } } vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress(); // Add memory handles before the actual dispatch memList.push_back(gpuDefQueue->virtualQueue_); memList.push_back(gpuDefQueue->schedParams_); memList.push_back(hsaKernel.prog().kernelTable()); gpuDefQueue->writeVQueueHeader(*this, hsaKernel.prog().kernelTable()->vmAddress()); } // setup the storage for the memory pointers of the kernel parameters uint numParams = kernel.signature().numParameters(); if (dbgManager) { dbgManager->allocParamMemList(numParams); } bool needFlush = false; dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), hsaKernel.aqlCodeSize()); if (dmaFlushMgmt().dispatchSplitSize() != 0) { needFlush = true; } size_t newOffset[3] = {0, 0, 0}; size_t newGlobalSize[3] = {0, 0, 0}; int dim = -1; int iteration = 1; size_t globalStep = 0; for (uint i = 0; i < sizes.dimensions(); i++) { newGlobalSize[i] = sizes.global()[i]; newOffset[i] = sizes.offset()[i]; } // Check if it is blit kernel. If it is, then check if split is needed. if (hsaKernel.isInternalKernel()) { // Calculate new group size for each submission for (uint i = 0; i < sizes.dimensions(); i++) { if (sizes.global()[i] > static_cast(0xffffffff)) { dim = i; iteration = sizes.global()[i] / 0xC0000000 + ((sizes.global()[i] % 0xC0000000) ? 1 : 0); globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration * sizes.local()[dim]; break; } } } for (int j = 0; j < iteration; j++) { // Reset global size for dimension dim if split is needed if (dim != -1) { newOffset[dim] = sizes.offset()[dim] + globalStep * j; if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && (j != (iteration - 1))) { newGlobalSize[dim] = globalStep; } else { newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim]; } } amd::NDRangeContainer tmpSizes(sizes.dimensions(), &newOffset[0], &newGlobalSize[0], &(const_cast(sizes).local()[0])); // Program the kernel arguments for the GPU execution hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments( *this, kernel, tmpSizes, parameters, nativeMem, vmDefQueue, &vmParentWrap, memList); if (NULL == aqlPkt) { LogError("Couldn't load kernel arguments"); return false; } gslMemObject scratch = NULL; uint scratchOffset = 0; // Check if the device allocated more registers than the old setup if (hsaKernel.workGroupInfo()->scratchRegs_ > 0) { const Device::ScratchBuffer* scratchObj = dev().scratch(hwRing()); scratch = scratchObj->memObj_->gslResource(); memList.push_back(scratchObj->memObj_); scratchOffset = scratchObj->offset_; } // Add GSL handle to the memory list for VidMM for (uint i = 0; i < memList.size(); ++i) { addVmMemory(memList[i]); } // HW Debug for the kernel? HwDbgKernelInfo kernelInfo; HwDbgKernelInfo* pKernelInfo = NULL; if (dbgManager) { buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent); pKernelInfo = &kernelInfo; } // Set up the dispatch information KernelDispatchInfo dispatchInfo; dispatchInfo.aqlPacket = aqlPkt; dispatchInfo.mems = vmMems(); dispatchInfo.numMems = cal_.memCount_; dispatchInfo.scratch = scratch; dispatchInfo.scratchOffset = scratchOffset; dispatchInfo.cpuAqlCode = hsaKernel.cpuAqlCode(); dispatchInfo.hsaQueueVA = hsaQueueMem_->vmAddress(); dispatchInfo.kernelInfo = pKernelInfo; dispatchInfo.wavesPerSH = hsaKernel.getWavesPerSH(this); dispatchInfo.lastDoppSubmission = kernel.parameters().getExecNewVcop(); dispatchInfo.pfpaDoppSubmission = kernel.parameters().getExecPfpaVcop(); GpuEvent gpuEvent; // Run AQL dispatch in HW eventBegin(MainEngine); cs()->AqlDispatch(&dispatchInfo); eventEnd(MainEngine, gpuEvent); if (dbgManager && (NULL != dbgManager->postDispatchCallBackFunc())) { dbgManager->executePostDispatchCallBack(); } if (hsaKernel.dynamicParallelism()) { // Make sure exculsive access to the device queue amd::ScopedLock(defQueue->lock()); if (GPU_PRINT_CHILD_KERNEL != 0) { waitForEvent(&gpuEvent); AmdAqlWrap* wraps = (AmdAqlWrap*)(&((AmdVQueueHeader*)gpuDefQueue->virtualQueue_->data())[1]); uint p = 0; for (uint i = 0; i < gpuDefQueue->vqHeader_->aql_slot_num; ++i) { if (wraps[i].state != 0) { uint j; if (p == GPU_PRINT_CHILD_KERNEL) { break; } p++; std::stringstream print; print.flags(std::ios::right | std::ios_base::hex | std::ios_base::uppercase); print << "Slot#: " << i << "\n"; print << "\tenqueue_flags: " << wraps[i].enqueue_flags << "\n"; print << "\tcommand_id: " << wraps[i].command_id << "\n"; print << "\tchild_counter: " << wraps[i].child_counter << "\n"; print << "\tcompletion: " << wraps[i].completion << "\n"; print << "\tparent_wrap: " << wraps[i].parent_wrap << "\n"; print << "\twait_list: " << wraps[i].wait_list << "\n"; print << "\twait_num: " << wraps[i].wait_num << "\n"; uint offsEvents = wraps[i].wait_list - gpuDefQueue->virtualQueue_->vmAddress(); size_t* events = reinterpret_cast(gpuDefQueue->virtualQueue_->data() + offsEvents); for (j = 0; j < wraps[i].wait_num; ++j) { uint offs = static_cast(events[j]) - gpuDefQueue->virtualQueue_->vmAddress(); AmdEvent* eventD = (AmdEvent*)(gpuDefQueue->virtualQueue_->data() + offs); print << "Wait Event#: " << j << "\n"; print << "\tState: " << eventD->state << "; Counter: " << eventD->counter << "\n"; } print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", "; print << wraps[i].aql.workgroup_size_y << ", "; print << wraps[i].aql.workgroup_size_z << "]\n"; print << "GridSize[ " << wraps[i].aql.grid_size_x << ", "; print << wraps[i].aql.grid_size_y << ", "; print << wraps[i].aql.grid_size_z << "]\n"; uint64_t* kernels = (uint64_t*)(const_cast(hsaKernel.prog().kernelTable())->map(this)); for (j = 0; j < hsaKernel.prog().kernels().size(); ++j) { if (kernels[j] == wraps[i].aql.kernel_object) { break; } } const_cast(hsaKernel.prog().kernelTable())->unmap(this); HSAILKernel* child = NULL; for (auto it = hsaKernel.prog().kernels().begin(); it != hsaKernel.prog().kernels().end(); ++it) { if (j == static_cast(it->second)->index()) { child = static_cast(it->second); } } if (child == NULL) { printf("Error: couldn't find child kernel!\n"); continue; } const uint64_t kernarg_address = static_cast(reinterpret_cast(wraps[i].aql.kernarg_address)); uint offsArg = kernarg_address - gpuDefQueue->virtualQueue_->vmAddress(); address argum = gpuDefQueue->virtualQueue_->data() + offsArg; print << "Kernel: " << child->name() << "\n"; static const char* Names[HSAILKernel::MaxExtraArgumentsNum] = { "Offset0: ", "Offset1: ", "Offset2: ", "PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "}; for (j = 0; j < child->extraArgumentsNum(); ++j) { print << "\t" << Names[j] << *(size_t*)argum; print << "\n"; argum += sizeof(size_t); } for (j = 0; j < child->numArguments(); ++j) { print << "\t" << child->argument(j)->name_ << ": "; for (int s = child->argument(j)->size_ - 1; s >= 0; --s) { print.width(2); print.fill('0'); print << (uint32_t)(argum[s]); } argum += child->argument(j)->size_; print << "\n"; } printf("%s", print.str().c_str()); } } } if (!dev().settings().useDeviceQueue_) { // Add the termination handshake to the host queue eventBegin(MainEngine); cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->gslResource(), vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, vmParentWrap + offsetof(AmdAqlWrap, child_counter), 0, dev().settings().useDeviceQueue_); eventEnd(MainEngine, gpuEvent); } // Get the global loop start before the scheduler mcaddr loopStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); static_cast(gpuDefQueue->blitMgr()) .runScheduler(*gpuDefQueue->virtualQueue_, *gpuDefQueue->schedParams_, gpuDefQueue->schedParamIdx_, gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); const static bool FlushL2 = true; gpuDefQueue->flushCUCaches(FlushL2); // Get the address of PM4 template and add write it to params //! @note DMA flush must not occur between patch and the scheduler mcaddr patchStart = gpuDefQueue->cs()->VirtualQueueDispatcherStart(); // Program parameters for the scheduler SchedulerParam* param = &reinterpret_cast( gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_]; param->signal = 1; // Scale clock to 1024 to avoid 64 bit div in the scheduler param->eng_clk = (1000 * 1024) / dev().info().maxEngineClockFrequency_; param->hw_queue = patchStart + sizeof(uint32_t) /* Rewind packet*/; param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress(); param->releaseHostCP = 0; param->parentAQL = vmParentWrap; param->dedicatedQueue = dev().settings().useDeviceQueue_; param->useATC = dev().settings().svmFineGrainSystem_; // Fill the scratch buffer information if (hsaKernel.prog().maxScratchRegs() > 0) { gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObj_; param->scratchSize = scratchBuf->size(); param->scratch = scratchBuf->vmAddress(); param->numMaxWaves = 32 * dev().info().maxComputeUnits_; param->scratchOffset = dev().scratch(gpuDefQueue->hwRing())->offset_; memList.push_back(scratchBuf); } else { param->numMaxWaves = 0; param->scratchSize = 0; param->scratch = 0; param->scratchOffset = 0; } // Add all kernels in the program to the mem list. //! \note Runtime doesn't know which one will be called hsaKernel.prog().fillResListWithKernels(memList); // Add GSL handle to the memory list for VidMM for (uint i = 0; i < memList.size(); ++i) { gpuDefQueue->addVmMemory(memList[i]); } mcaddr signalAddr = gpuDefQueue->schedParams_->vmAddress() + gpuDefQueue->schedParamIdx_ * sizeof(SchedulerParam); gpuDefQueue->eventBegin(MainEngine); gpuDefQueue->cs()->VirtualQueueDispatcherEnd( gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_, signalAddr, loopStart, gpuDefQueue->vqHeader_->aql_slot_num / (DeviceQueueMaskSize * maskGroups_)); gpuDefQueue->eventEnd(MainEngine, gpuEvent); // Set GPU event for the used resources for (uint i = 0; i < memList.size(); ++i) { memList[i]->setBusy(*gpuDefQueue, gpuEvent); } if (dev().settings().useDeviceQueue_) { // Add the termination handshake to the host queue eventBegin(MainEngine); cs()->VirtualQueueHandshake(gpuDefQueue->schedParams_->gslResource(), vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE, vmParentWrap + offsetof(AmdAqlWrap, child_counter), signalAddr, dev().settings().useDeviceQueue_); eventEnd(MainEngine, gpuEvent); } ++gpuDefQueue->schedParamIdx_ %= gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam); //! \todo optimize the wrap around if (gpuDefQueue->schedParamIdx_ == 0) { gpuDefQueue->schedParams_->wait(*gpuDefQueue); } } // Set GPU event for the used resources for (uint i = 0; i < memList.size(); ++i) { memList[i]->setBusy(*this, gpuEvent); } // Update the global GPU event setGpuEvent(gpuEvent, needFlush); if (!printfDbgHSA().output(*this, printfEnabled, hsaKernel.printfInfo())) { LogError("Couldn't read printf data from the buffer!\n"); return false; } } // Runtime submitted a HSAIL kernel state_.hsailKernel_ = true; return true; } bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, bool nativeMem, amd::Event* enqueueEvent) { bool result = true; uint i; size_t dimensions = sizes.dimensions(); amd::NDRange local(sizes.local()); amd::NDRange groupOffset(dimensions); GpuEvent gpuEvent; groupOffset = 0; // Get the GPU kernel object device::Kernel* devKernel = const_cast(kernel.getDeviceKernel(dev())); Kernel& gpuKernelOpt = static_cast(*devKernel); if (gpuKernelOpt.hsa()) { return submitKernelInternalHSA(sizes, kernel, parameters, nativeMem, enqueueEvent); } else if (state_.hsailKernel_) { // Reload GSL state to HW, so runtime could run AMDIL kernel flushDMA(MainEngine); // Reset HSAIL state state_.hsailKernel_ = false; } // Find if arguments contain memory aliases or a dependency in the queue gpuKernelOpt.processMemObjects(*this, kernel, parameters, nativeMem); Kernel& gpuKernel = static_cast(*devKernel); bool printfEnabled = (gpuKernel.flags() & gpu::NullKernel::PrintfOutput) ? true : false; // Set current kernel CAL descriptor as active if (!setActiveKernelDesc(sizes, &gpuKernel) || // Initialize printf support !printfDbg().init(*this, printfEnabled, sizes.global())) { LogPrintfError("We couldn't set \"%s\" kernel as active!", gpuKernel.name().data()); return false; } // Find if we have to split workload dmaFlushMgmt_.findSplitSize(dev(), sizes.global().product(), gpuKernel.instructionCnt()); // Program the kernel parameters for the GPU execution cal_.memCount_ = 0; gpuKernel.setupProgramGrid(*this, dimensions, sizes.offset(), sizes.global(), local, groupOffset, sizes.offset(), sizes.global()); // Load kernel arguments if (gpuKernel.loadParameters(*this, kernel, parameters, nativeMem)) { amd::NDRange global(sizes.global()); amd::NDRange groups(dimensions); amd::NDRange offsets(sizes.offset()); amd::NDRange divider(dimensions); amd::NDRange remainder(dimensions); size_t extra = 0; // Split the workload if necessary for local/private emulation or printf findIterations(sizes, local, groups, remainder, extra); divider = groups; i = 0; do { bool lastRun = (i == (cal()->iterations_ - 1)) ? true : false; // Reprogram the CAL grid and constant buffers if // the workload split is on if (cal()->iterations_ > 1) { // Initialize printf support if (!printfDbg().init(*this, printfEnabled, local)) { result = false; break; } // Reprogram the CAL grid and constant buffers setupIteration(i, sizes, gpuKernel, global, offsets, local, groups, groupOffset, divider, remainder, extra); } // Execute the kernel if (gpuKernel.run(*this, &gpuEvent, lastRun, kernel.parameters().getExecNewVcop(), kernel.parameters().getExecPfpaVcop())) { //! @todo A flush is necessary to make sure // that 2 consecutive runs won't access to the same // private/local memory. CAL has to generate cache flush // and wait for idle commands bool flush = ((cal()->iterations_ > 1) || dmaFlushMgmt_.isCbReady(*this, global.product(), gpuKernel.instructionCnt())) ? true : false; // Update the global GPU event setGpuEvent(gpuEvent, flush); // This code for the kernel execution debugging if (dev().settings().debugFlags_ & Settings::LockGlobalMemory) { gpuKernel.debug(*this); } } else { result = false; break; } // Print the debug buffer output result if (printfDbg().output(*this, printfEnabled, (cal()->iterations_ > 1) ? local : sizes.global(), gpuKernel.prog().printfInfo())) { // Go to the next iteration ++i; } else { result = false; break; } } // Check if we have to make multiple iterations while (i < cal()->iterations_); } else { result = false; } if (!result) { LogPrintfError("submitKernel failed to execute the \"%s\" kernel on HW!", gpuKernel.name().data()); } return result; } void VirtualGPU::submitNativeFn(amd::NativeFnCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); Unimplemented(); //!< @todo: Unimplemented } void VirtualGPU::submitMarker(amd::Marker& vcmd) { //!@note runtime doesn't need to lock this command on execution if (vcmd.waitingEvent() != NULL) { bool foundEvent = false; // Loop through all outstanding command batches while (!cbList_.empty()) { const auto it = cbList_.cbegin(); // Wait for completion foundEvent = awaitCompletion(*it, vcmd.waitingEvent()); // Release a command batch delete *it; // Remove command batch from the list cbList_.pop_front(); // Early exit if we found a command if (foundEvent) break; } // Event should be in the current command batch if (!foundEvent) { state_.forceWait_ = true; } // If we don't have any more batches, then assume GPU is idle else if (cbList_.empty()) { dmaFlushMgmt_.resetCbWorkload(dev()); } } } void VirtualGPU::releaseMemory(gslMemObject gslResource, bool wait) { bool result = true; if (wait) { waitForEvent(&gpuEvents_[gslResource]); } // Unbind resource if it's active kernel desc for (uint i = 0; i < MaxUavArguments; ++i) { if (gslResource == cal_.uavs_[i]) { result = setUAVBuffer(i, 0, GSL_UAV_TYPE_UNKNOWN); cal_.uavs_[i] = 0; } } for (uint i = 0; i < MaxReadImage; ++i) { if (gslResource == cal_.readImages_[i]) { result = setInput(i, 0); cal_.readImages_[i] = 0; } } for (uint i = 0; i < MaxConstBuffers; ++i) { if (gslResource == cal_.constBuffers_[i]) { result = setConstantBuffer(i, 0, 0, 0); cal_.constBuffers_[i] = 0; } } if ((dev().scratch(hwRing()) != NULL) && (dev().scratch(hwRing())->regNum_ > 0)) { // Unbind scratch memory const Device::ScratchBuffer* scratch = dev().scratch(hwRing()); if ((scratch->memObj_ != NULL) && (scratch->memObj_->gslResource() == gslResource)) { setScratchBuffer(NULL, 0); } } gpuEvents_.erase(gslResource); } void VirtualGPU::releaseKernel(CALimage calImage) { GslKernelDesc* desc = gslKernels_[calImage]; if (desc != NULL) { freeKernelDesc(desc); } gslKernels_.erase(calImage); } void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); gslQueryObject gslCounter; const amd::PerfCounterCommand::PerfCounterList counters = vcmd.getCounters(); // Create a HW counter gslCounter = cs()->createQuery(GSL_PERFORMANCE_COUNTERS_ATI); if (0 == gslCounter) { LogError("We failed to allocate memory for the GPU perfcounter"); vcmd.setStatus(CL_INVALID_OPERATION); return; } CalCounterReference* calRef = new CalCounterReference(*this, gslCounter); if (calRef == NULL) { LogError("We failed to allocate memory for the GPU perfcounter"); vcmd.setStatus(CL_INVALID_OPERATION); return; } gslCounter = 0; for (uint i = 0; i < vcmd.getNumCounters(); ++i) { amd::PerfCounter* amdCounter = static_cast(counters[i]); const PerfCounter* counter = static_cast(amdCounter->getDeviceCounter()); // Make sure we have a valid gpu performance counter if (NULL == counter) { amd::PerfCounter::Properties prop = amdCounter->properties(); PerfCounter* gpuCounter = new PerfCounter( gpuDevice_, *this, prop[CL_PERFCOUNTER_GPU_BLOCK_INDEX], prop[CL_PERFCOUNTER_GPU_COUNTER_INDEX], prop[CL_PERFCOUNTER_GPU_EVENT_INDEX]); if (NULL == gpuCounter) { LogError("We failed to allocate memory for the GPU perfcounter"); vcmd.setStatus(CL_INVALID_OPERATION); return; } else if (gpuCounter->create(calRef)) { amdCounter->setDeviceCounter(gpuCounter); } else { LogPrintfError( "We failed to allocate a perfcounter in CAL.\ Block: %d, counter: #d, event: %d", gpuCounter->info()->blockIndex_, gpuCounter->info()->counterIndex_, gpuCounter->info()->eventIndex_); delete gpuCounter; vcmd.setStatus(CL_INVALID_OPERATION); return; } counter = gpuCounter; } } calRef->release(); for (uint i = 0; i < vcmd.getNumCounters(); ++i) { amd::PerfCounter* amdCounter = static_cast(counters[i]); const PerfCounter* counter = static_cast(amdCounter->getDeviceCounter()); if (gslCounter != counter->gslCounter()) { gslCounter = counter->gslCounter(); // Find the state and sends the command to CAL if (vcmd.getState() == amd::PerfCounterCommand::Begin) { gslCounter->BeginQuery(cs(), GSL_PERFORMANCE_COUNTERS_ATI, 0); } else if (vcmd.getState() == amd::PerfCounterCommand::End) { GpuEvent event; eventBegin(MainEngine); gslCounter->EndQuery(cs(), 0); eventEnd(MainEngine, event); setGpuEvent(event); } else { LogError("Unsupported performance counter state"); vcmd.setStatus(CL_INVALID_OPERATION); return; } } } } void VirtualGPU::submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(cmd); switch (cmd.type()) { case CL_COMMAND_THREAD_TRACE_MEM: { amd::ThreadTrace* amdThreadTrace = &cmd.getThreadTrace(); ThreadTrace* threadTrace = static_cast(amdThreadTrace->getDeviceThreadTrace()); if (threadTrace == NULL) { gslQueryObject gslThreadTrace; // Create a HW thread trace query object gslThreadTrace = cs()->createQuery(GSL_SHADER_TRACE_BYTES_WRITTEN); if (0 == gslThreadTrace) { LogError("Failure in memory allocation for the GPU threadtrace"); cmd.setStatus(CL_INVALID_OPERATION); return; } CalThreadTraceReference* calRef = new CalThreadTraceReference(*this, gslThreadTrace); if (calRef == NULL) { LogError("Failure in memory allocation for the GPU threadtrace"); cmd.setStatus(CL_INVALID_OPERATION); return; } size_t seNum = amdThreadTrace->deviceSeNumThreadTrace(); ThreadTrace* gpuThreadTrace = new ThreadTrace(gpuDevice_, *this, seNum); if (NULL == gpuThreadTrace) { LogError("Failure in memory allocation for the GPU threadtrace"); cmd.setStatus(CL_INVALID_OPERATION); return; } if (gpuThreadTrace->create(calRef)) { amdThreadTrace->setDeviceThreadTrace(gpuThreadTrace); } else { LogError("Failure in memory allocation for the GPU threadtrace"); delete gpuThreadTrace; cmd.setStatus(CL_INVALID_OPERATION); return; } threadTrace = gpuThreadTrace; calRef->release(); } gslShaderTraceBufferObject* threadTraceBufferObjects = threadTrace->getThreadTraceBufferObjects(); const size_t memObjSize = cmd.getMemoryObjectSize(); const std::vector& memObj = cmd.getMemList(); size_t se = 0; for (auto itMemObj = memObj.cbegin(); itMemObj != memObj.cend(); ++itMemObj, ++se) { // Find GSL Mem Object gslMemObject gslMemObj = dev().getGpuMemory(*itMemObj)->gslResource(); // Bind GSL MemObject to the appropriate SE Thread Trace Buffer Object threadTraceBufferObjects[se]->attachMemObject(cs(), gslMemObj, 0, 0, memObjSize, se); } break; } default: LogError("Unsupported command type for ThreadTraceMemObjects!"); break; } } void VirtualGPU::submitThreadTrace(amd::ThreadTraceCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(cmd); switch (cmd.type()) { case CL_COMMAND_THREAD_TRACE: { amd::ThreadTrace* amdThreadTrace = static_cast(&cmd.getThreadTrace()); ThreadTrace* threadTrace = static_cast(amdThreadTrace->getDeviceThreadTrace()); // gpu thread trace object had to be generated prior to begin/end/pause/resume due // to ThreadTraceMemObjectsCommand execution if (threadTrace == NULL) { return; } else { gslQueryObject gslThreadTrace; gslThreadTrace = threadTrace->gslThreadTrace(); uint32_t seNum = amdThreadTrace->deviceSeNumThreadTrace(); // Find the state and sends the commands to GSL if (cmd.getState() == amd::ThreadTraceCommand::Begin) { amd::ThreadTrace::ThreadTraceConfig* traceCfg = static_cast(cmd.threadTraceConfig()); const gslErrorCode ec = gslThreadTrace->BeginQuery(cs(), GSL_SHADER_TRACE_BYTES_WRITTEN, 0); assert(ec == GSL_NO_ERROR); for (uint32_t idx = 0; idx < seNum; ++idx) { rs()->enableShaderTrace(cs(), idx, true); rs()->setShaderTraceComputeUnit(idx, traceCfg->cu_); rs()->setShaderTraceShaderArray(idx, traceCfg->sh_); rs()->setShaderTraceSIMDMask(idx, traceCfg->simdMask_); rs()->setShaderTraceVmIdMask(idx, traceCfg->vmIdMask_); rs()->setShaderTraceTokenMask(idx, traceCfg->tokenMask_); rs()->setShaderTraceRegisterMask(idx, traceCfg->regMask_); rs()->setShaderTraceIssueMask(idx, traceCfg->instMask_); rs()->setShaderTraceRandomSeed(idx, traceCfg->randomSeed_); rs()->setShaderTraceCaptureMode(idx, traceCfg->captureMode_); rs()->setShaderTraceWrap(idx, traceCfg->isWrapped_); rs()->setShaderTraceUserData(idx, (traceCfg->isUserData_) ? traceCfg->userData_ : 0); } } else if (cmd.getState() == amd::ThreadTraceCommand::End) { for (uint32_t idx = 0; idx < seNum; ++idx) { rs()->enableShaderTrace(cs(), idx, false); } gslThreadTrace->EndQuery(cs(), 0); } else if (cmd.getState() == amd::ThreadTraceCommand::Pause) { for (uint32_t idx = 0; idx < seNum; ++idx) { rs()->setShaderTraceIsPaused(cs(), idx, true); } } else if (cmd.getState() == amd::ThreadTraceCommand::Resume) { for (uint32_t idx = 0; idx < seNum; ++idx) { rs()->setShaderTraceIsPaused(cs(), idx, false); } } } break; } default: LogError("Unsupported command type for ThreadTrace!"); break; } } void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); for (const auto& it : vcmd.getMemList()) { // amd::Memory object should never be NULL assert(it && "Memory object for interop is NULL"); gpu::Memory* memory = dev().getGpuMemory(it); // If resource is a shared copy of original resource, then // runtime needs to copy data from original resource it->getInteropObj()->copyOrigToShared(); // Check if OpenCL has direct access to the interop memory if (memory->interopType() == Memory::InteropDirectAccess) { continue; } // Does interop use HW emulation? if (memory->interopType() == Memory::InteropHwEmulation) { static const bool Entire = true; amd::Coord3D origin(0, 0, 0); amd::Coord3D region(memory->size()); // Synchronize the object if (!blitMgr().copyBuffer(*memory->interop(), *memory, origin, origin, region, Entire)) { LogError("submitAcquireExtObjects - Interop synchronization failed!"); vcmd.setStatus(CL_INVALID_OPERATION); return; } } } profilingEnd(vcmd); } void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); profilingBegin(vcmd); for (const auto& it : vcmd.getMemList()) { // amd::Memory object should never be NULL assert(it && "Memory object for interop is NULL"); gpu::Memory* memory = dev().getGpuMemory(it); // Check if we can use HW interop if (memory->interopType() == Memory::InteropHwEmulation) { static const bool Entire = true; amd::Coord3D origin(0, 0, 0); amd::Coord3D region(memory->size()); // Synchronize the object if (!blitMgr().copyBuffer(*memory, *memory->interop(), origin, origin, region, Entire)) { LogError("submitReleaseExtObjects interop synchronization failed!"); vcmd.setStatus(CL_INVALID_OPERATION); return; } } else { if (memory->interopType() != Memory::InteropDirectAccess) { LogError("None interop release!"); } } // If resource is a shared copy of original resource, then // runtime needs to copy data back to original resource it->getInteropObj()->copySharedToOrig(); } profilingEnd(vcmd); } void VirtualGPU::submitSignal(amd::SignalCommand& vcmd) { amd::ScopedLock lock(execution()); profilingBegin(vcmd); gpu::Memory* gpuMemory = dev().getGpuMemory(&vcmd.memory()); GpuEvent gpuEvent; eventBegin(MainEngine); if (vcmd.type() == CL_COMMAND_WAIT_SIGNAL_AMD) { uint64_t surfAddr = gpuMemory->gslResource()->getPhysicalAddress(cs()); uint64_t markerAddr = gpuMemory->gslResource()->getMarkerAddress(cs()); uint64_t markerOffset = markerAddr - surfAddr; cs()->p2pMarkerOp(gpuMemory->gslResource(), vcmd.markerValue(), markerOffset, false); } else if (vcmd.type() == CL_COMMAND_WRITE_SIGNAL_AMD) { static constexpr bool FlushL2 = true; flushCUCaches(FlushL2); cs()->p2pMarkerOp(gpuMemory->gslResource(), vcmd.markerValue(), vcmd.markerOffset(), true); } eventEnd(MainEngine, gpuEvent); gpuMemory->setBusy(*this, gpuEvent); // Update the global GPU event setGpuEvent(gpuEvent); profilingEnd(vcmd); } void VirtualGPU::submitMakeBuffersResident(amd::MakeBuffersResidentCommand& vcmd) { amd::ScopedLock lock(execution()); profilingBegin(vcmd); std::vector memObjects = vcmd.memObjects(); cl_uint numObjects = memObjects.size(); gslMemObject* pGSLMemObjects = new gslMemObject[numObjects]; for (cl_uint i = 0; i < numObjects; ++i) { gpu::Memory* gpuMemory = dev().getGpuMemory(memObjects[i]); pGSLMemObjects[i] = gpuMemory->gslResource(); gpuMemory->syncCacheFromHost(*this); } uint64* surfBusAddr = new uint64[numObjects]; uint64* markerBusAddr = new uint64[numObjects]; gslErrorCode res = cs()->makeBuffersResident(numObjects, pGSLMemObjects, surfBusAddr, markerBusAddr); if (res != GSL_NO_ERROR) { LogError("MakeBuffersResident failed"); vcmd.setStatus(CL_INVALID_OPERATION); } else { cl_bus_address_amd* busAddr = vcmd.busAddress(); for (cl_uint i = 0; i < numObjects; ++i) { busAddr[i].surface_bus_address = surfBusAddr[i]; busAddr[i].marker_bus_address = markerBusAddr[i]; } } delete[] pGSLMemObjects; delete[] surfBusAddr; delete[] markerBusAddr; profilingEnd(vcmd); } bool VirtualGPU::awaitCompletion(CommandBatch* cb, const amd::Event* waitingEvent) { bool found = false; amd::Command* current; amd::Command* head = cb->head_; // Make sure that profiling is enabled if (profileEnabled_) { return profilingCollectResults(cb, waitingEvent); } // Mark the first command in the batch as running if (head != NULL) { head->setStatus(CL_RUNNING); } else { return found; } // Wait for the last known GPU event waitEventLock(cb); while (NULL != head) { current = head->getNext(); if (head->status() == CL_SUBMITTED) { head->setStatus(CL_RUNNING); head->setStatus(CL_COMPLETE); } else if (head->status() == CL_RUNNING) { head->setStatus(CL_COMPLETE); } else if ((head->status() != CL_COMPLETE) && (current != NULL)) { LogPrintfError("Unexpected command status - %d!", head->status()); } // Check if it's a waiting command if (head == waitingEvent) { found = true; } head->release(); head = current; } return found; } void VirtualGPU::flush(amd::Command* list, bool wait) { CommandBatch* cb = NULL; bool gpuCommand = false; for (uint i = 0; i < AllEngines; ++i) { if (cal_.events_[i].isValid()) { gpuCommand = true; } } // If the batch doesn't have any GPU command and the list is empty if (!gpuCommand && cbList_.empty()) { state_.forceWait_ = true; } // Insert the current batch into a list if (NULL != list) { cb = new CommandBatch(list, cal()->events_, cal()->lastTS_); } { //! @note: flushDMA() requires a lock, because GSL can //! defer destruction of internal memory objects and releases them //! on GSL flush. If runtime calls another GSL flush at the same time, //! then double release can occur. amd::ScopedLock lock(execution()); for (uint i = 0; i < AllEngines; ++i) { flushDMA(i); // Reset event so we won't try to wait again, // if runtime didn't submit any commands //! @note: it's safe to invalidate events, since //! we already saved them with the batch creation step above cal_.events_[i].invalidate(); } } // Mark last TS as NULL, so runtime won't process empty batches with the old TS cal_.lastTS_ = NULL; if (NULL != cb) { cbList_.push_back(cb); } wait |= state_.forceWait_; // Loop through all outstanding command batches while (!cbList_.empty()) { const auto it = cbList_.cbegin(); // Check if command batch finished without a wait bool finished = true; for (uint i = 0; i < AllEngines; ++i) { finished &= isDone(&(*it)->events_[i]); } if (finished || wait) { // Wait for completion awaitCompletion(*it); // Release a command batch delete *it; // Remove command batch from the list cbList_.pop_front(); } else { // Early exit if no finished break; } } state_.forceWait_ = false; } void VirtualGPU::enableSyncedBlit() const { return blitMgr_->enableSynchronization(); } void VirtualGPU::releaseMemObjects(bool scratch) { for (const auto& it : gpuEvents_) { GpuEvent event = it.second; waitForEvent(&event); } // Unbind all resources.So the queue won't have any bound mem objects for (uint i = 0; i < MaxUavArguments; ++i) { if (NULL != cal_.uavs_[i]) { setUAVBuffer(i, 0, GSL_UAV_TYPE_UNKNOWN); cal_.uavs_[i] = 0; } } for (uint i = 0; i < MaxReadImage; ++i) { if (NULL != cal_.readImages_[i]) { setInput(i, 0); cal_.readImages_[i] = 0; } } for (uint i = 0; i < MaxConstBuffers; ++i) { if (NULL != cal_.constBuffers_[i]) { setConstantBuffer(i, 0, 0, 0); cal_.constBuffers_[i] = 0; } } if (scratch) { setScratchBuffer(NULL, 0); } gpuEvents_.clear(); } void VirtualGPU::setGpuEvent(GpuEvent gpuEvent, bool flush) { cal_.events_[engineID_] = gpuEvent; // Flush current DMA buffer if requested if (flush || GPU_FLUSH_ON_EXECUTION) { flushDMA(engineID_); } } void VirtualGPU::flushDMA(uint engineID) { if (engineID == MainEngine) { // Clear memory dependency state, since runtime flushes compute // memoryDependency().clear(); //!@todo Keep memory dependency alive even if we flush DMA, //! since only L2 cache is flushed in KMD frame, //! but L1 still has to be invalidated. } //! \note Use CtxIsEventDone, so we won't flush compute for DRM engine isDone(&cal_.events_[engineID]); } bool VirtualGPU::waitAllEngines(CommandBatch* cb) { uint i; GpuEvent* events; //!< GPU events for the batch // If command batch is NULL then wait for the current if (NULL == cb) { events = cal_.events_; } else { events = cb->events_; } bool earlyDone = true; // The first loop is to flush all engines and/or check if // engines are idle already for (i = 0; i < AllEngines; ++i) { earlyDone &= isDone(&events[i]); } // Release all transfer buffers on this command queue releaseXferWrite(); // Rlease all pinned memory releasePinnedMem(); // The second loop is to wait all engines for (i = 0; i < AllEngines; ++i) { waitForEvent(&events[i]); } return earlyDone; } void VirtualGPU::waitEventLock(CommandBatch* cb) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); bool earlyDone = waitAllEngines(cb); // Free resource cache if we have too many entries //! \note we do it here, when all engines are idle, // because Vista/Win7 idles GPU on a resource destruction static const size_t MinCacheEntries = 4096; dev().resourceCache().free(MinCacheEntries); // Find the timestamp object of the last command in the batch if (cb->lastTS_ != NULL) { // If earlyDone is TRUE, then CPU didn't wait for GPU. // Thus the sync point between CPU and GPU is unclear and runtime // will use an older adjustment value to maintain the same timeline if (!earlyDone || //! \note Workaround for APU(s). //! GPU-CPU timelines may go off too much, thus always //! force calibration with the last batch in the list (cbList_.size() <= 1) || (readjustTimeGPU_ == 0)) { uint64_t startTimeStampGPU = 0; uint64_t endTimeStampGPU = 0; // Get the timestamp value of the last command in the batch cb->lastTS_->value(&startTimeStampGPU, &endTimeStampGPU); uint64_t endTimeStampCPU = amd::Os::timeNanos(); // Make sure the command batch has a valid GPU TS if (!GPU_RAW_TIMESTAMP) { // Adjust the base time by the execution time readjustTimeGPU_ = endTimeStampGPU - endTimeStampCPU; } } } } void VirtualGPU::validateScratchBuffer(const Kernel* kernel) { // Check if a scratch buffer is required if (dev().scratch(hwRing())->regNum_ > 0) { // Setup scratch buffer setScratchBuffer(dev().scratch(hwRing())->memObj_->gslResource(), 0); } } bool VirtualGPU::setActiveKernelDesc(const amd::NDRangeContainer& sizes, const Kernel* kernel) { bool result = true; CALimage calImage = kernel->calImage(); GslKernelDesc* desc = gslKernels_[calImage]; validateScratchBuffer(kernel); // Early exit if ((activeKernelDesc_ == desc) && (desc != NULL)) { return result; } // Does the kernel descriptor for this virtual device exist? if (desc == NULL) { desc = allocKernelDesc(kernel, calImage); if (desc == NULL) { return false; } gslKernels_[calImage] = desc; } // Set the descriptor as active activeKernelDesc_ = desc; // Program the samplers defined in the kernel if (!kernel->setInternalSamplers(*this)) { result = false; } // Bind global HW constant buffers if (!kernel->bindGlobalHwCb(*this, desc)) { result = false; } if (result) { // Set program in GSL rs()->setCurrentProgramObject(GSL_COMPUTE_PROGRAM, desc->func_); // Update internal constant buffer if (desc->intCb_ != 0) { cs()->setIntConstants(GSL_COMPUTE_PROGRAM, desc->intCb_); } } return result; } bool VirtualGPU::allocConstantBuffers() { // Allocate/reallocate constant buffers size_t minCbSize; // GCN doesn't really have a limit minCbSize = 128 * Ki; uint i; // Create/reallocate constant buffer resources for (i = 0; i < MaxConstBuffersArguments; ++i) { ConstBuffer* constBuf = new ConstBuffer( *this, ((minCbSize + ConstBuffer::VectorSize - 1) / ConstBuffer::VectorSize)); if ((constBuf != NULL) && constBuf->create()) { addConstBuffer(constBuf); } else { // We failed to create a constant buffer delete constBuf; return false; } } return true; } VirtualGPU::GslKernelDesc* VirtualGPU::allocKernelDesc(const Kernel* kernel, CALimage calImage) { // Sanity checks assert(kernel != NULL); GslKernelDesc* desc = new GslKernelDesc; if (desc != NULL) { memset(desc, 0, sizeof(GslKernelDesc)); if (kernel->calImage() != calImage) { desc->image_ = calImage; } if (!moduleLoad(calImage, &desc->func_, &desc->intCb_)) { LogPrintfError("calModuleLoad failed for \"%s\" kernel!", kernel->name().c_str()); delete desc; return NULL; } } if (kernel->argSize() > slots_.size()) { slots_.resize(kernel->argSize()); } return desc; } void VirtualGPU::freeKernelDesc(VirtualGPU::GslKernelDesc* desc) { if (desc) { if (gslKernelDesc() == desc) { // Clear active kernel desc activeKernelDesc_ = NULL; rs()->setCurrentProgramObject(GSL_COMPUTE_PROGRAM, 0); } if (desc->image_ != 0) { // Free CAL image free(desc->image_); } if (desc->func_ != 0) { if (desc->intCb_ != 0) { cs()->setIntConstants(GSL_COMPUTE_PROGRAM, 0); cs()->destroyMemObject(desc->intCb_); } cs()->destroyProgramObject(desc->func_); } delete desc; } } void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) { // Is profiling enabled? if (command.profilingInfo().enabled_) { // Allocate a timestamp object from the cache TimeStamp* ts = tsCache_->allocTimeStamp(); if (NULL == ts) { return; } // Save the TimeStamp object in the current OCL event command.setData(ts); currTs_ = ts; profileEnabled_ = true; } } void VirtualGPU::profilingEnd(amd::Command& command) { // Get the TimeStamp object associated witht the current command TimeStamp* ts = reinterpret_cast(command.data()); if (ts != NULL) { // Check if the command actually did any GPU submission if (ts->isValid()) { cal_.lastTS_ = ts; } else { // Destroy the TimeStamp object tsCache_->freeTimeStamp(ts); command.setData(NULL); } } } bool VirtualGPU::profilingCollectResults(CommandBatch* cb, const amd::Event* waitingEvent) { bool found = false; amd::Command* current; amd::Command* first = cb->head_; // If the command list is, empty then exit if (NULL == first) { return found; } // Wait for the last known GPU events on all engines waitEventLock(cb); // Find the CPU base time of the entire command batch execution uint64_t endTimeStamp = amd::Os::timeNanos(); uint64_t startTimeStamp = endTimeStamp; // First step, walk the command list to find the first valid command //! \note The batch may have empty markers at the beginning. //! So the start/end of the empty commands is equal to //! the start of the first valid command in the batch. first = cb->head_; while (NULL != first) { // Get the TimeStamp object associated witht the current command TimeStamp* ts = reinterpret_cast(first->data()); if (ts != NULL) { ts->value(&startTimeStamp, &endTimeStamp); endTimeStamp -= readjustTimeGPU_; startTimeStamp -= readjustTimeGPU_; // Assign to endTimeStamp the start of the first valid command endTimeStamp = startTimeStamp; break; } first = first->getNext(); } // Second step, walk the command list to construct the time line first = cb->head_; while (NULL != first) { // Get the TimeStamp object associated witht the current command TimeStamp* ts = reinterpret_cast(first->data()); current = first->getNext(); if (ts != NULL) { ts->value(&startTimeStamp, &endTimeStamp); endTimeStamp -= readjustTimeGPU_; startTimeStamp -= readjustTimeGPU_; // Destroy the TimeStamp object tsCache_->freeTimeStamp(ts); first->setData(NULL); } else { // For empty commands start/end is equal to // the end of the last valid command startTimeStamp = endTimeStamp; } // Update the command status with the proper timestamps if (first->status() == CL_SUBMITTED) { first->setStatus(CL_RUNNING, startTimeStamp); first->setStatus(CL_COMPLETE, endTimeStamp); } else if (first->status() == CL_RUNNING) { first->setStatus(CL_COMPLETE, endTimeStamp); } else if ((first->status() != CL_COMPLETE) && (current != NULL)) { LogPrintfError("Unexpected command status - %d!", first->status()); } // Do we wait this event? if (first == waitingEvent) { found = true; } first->release(); first = current; } return found; } bool VirtualGPU::addVmMemory(const Memory* memory) { uint* cnt = &cal_.memCount_; (*cnt)++; // Reallocate array if kernel uses more memory objects if (numVmMems_ < *cnt) { gslMemObject* tmp; tmp = new gslMemObject[*cnt]; if (tmp == NULL) { return false; } memcpy(tmp, vmMems_, sizeof(gslMemObject) * numVmMems_); delete[] vmMems_; vmMems_ = tmp; numVmMems_ = *cnt; } vmMems_[*cnt - 1] = memory->gslResource(); return true; } void VirtualGPU::profileEvent(EngineType engine, bool type) const { if (NULL == currTs_) { return; } if (type) { currTs_->begin((engine == SdmaEngine) ? true : false); } else { currTs_->end((engine == SdmaEngine) ? true : false); } } bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address params, bool nativeMem, std::vector* memList) { const HSAILKernel& hsaKernel = static_cast(*(kernel.getDeviceKernel(dev()))); const amd::KernelSignature& signature = kernel.signature(); const amd::KernelParameters& kernelParams = kernel.parameters(); // Mark the tracker with a new kernel, // so we can avoid checks of the aliased objects memoryDependency().newKernel(); bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true); bool supportFineGrainedSystem = deviceSupportFGS; FGSStatus status = kernelParams.getSvmSystemPointersSupport(); switch (status) { case FGS_YES: if (!deviceSupportFGS) { return false; } supportFineGrainedSystem = true; break; case FGS_NO: supportFineGrainedSystem = false; break; case FGS_DEFAULT: default: break; } size_t count = kernelParams.getNumberOfSvmPtr(); size_t execInfoOffset = kernelParams.getExecInfoOffset(); bool sync = true; amd::Memory* memory = NULL; // get svm non arugment information void* const* svmPtrArray = reinterpret_cast(params + execInfoOffset); for (size_t i = 0; i < count; i++) { memory = amd::MemObjMap::FindMemObj(svmPtrArray[i]); if (NULL == memory) { if (!supportFineGrainedSystem) { return false; } else if (sync) { flushCUCaches(); // Clear memory dependency state const static bool All = true; memoryDependency().clear(!All); continue; } } else { Memory* gpuMemory = dev().getGpuMemory(memory); if (NULL != gpuMemory) { // Synchronize data with other memory instances if necessary gpuMemory->syncCacheFromHost(*this); const static bool IsReadOnly = false; // Validate SVM passed in the non argument list memoryDependency().validate(*this, gpuMemory, IsReadOnly); // Mark signal write for cache coherency, // since this object isn't a part of kernel arg setup if ((memory->getMemFlags() & CL_MEM_READ_ONLY) == 0) { memory->signalWrite(&dev()); } memList->push_back(gpuMemory); } else { return false; } } } amd::Memory* const* memories = reinterpret_cast(params + kernelParams.memoryObjOffset()); // Check all parameters for the current kernel for (size_t i = 0; i < signature.numParameters(); ++i) { const amd::KernelParameterDescriptor& desc = signature.at(i); const HSAILKernel::Argument* arg = hsaKernel.argument(i); Memory* gpuMem = nullptr; bool readOnly = false; amd::Memory* mem = nullptr; // Find if current argument is a buffer if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) { uint32_t index = desc.info_.arrayIndex_; if (nativeMem) { gpuMem = reinterpret_cast(memories)[index]; if (nullptr != gpuMem) { mem = gpuMem->owner(); } } else { mem = memories[index]; if (mem != nullptr) { gpuMem = dev().getGpuMemory(mem); // Synchronize data with other memory instances if necessary gpuMem->syncCacheFromHost(*this); } } //! This condition is for SVM fine-grain if ((gpuMem == nullptr) && dev().isFineGrainedSystem(true)) { flushCUCaches(); // Clear memory dependency state const static bool All = true; memoryDependency().clear(!All); continue; } else if (gpuMem != nullptr) { // Check image readOnly = (desc.accessQualifier_ == CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false; // Check buffer readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false; // Validate memory for a dependency in the queue memoryDependency().validate(*this, gpuMem, readOnly); } } } for (gpu::Memory* mem : hsaKernel.prog().globalStores()) { const static bool IsReadOnly = false; // Validate global store for a dependency in the queue memoryDependency().validate(*this, mem, IsReadOnly); } return true; } amd::Memory* VirtualGPU::createBufferFromImage(amd::Memory& amdImage) { amd::Memory* mem = new (amdImage.getContext()) amd::Buffer(amdImage, 0, 0, amdImage.getSize()); mem->setVirtualDevice(this); if ((mem != NULL) && !mem->create()) { mem->release(); } return mem; } void VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) { const static bool Wait = true; vqHeader_->kernel_table = kernelTable; virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait); } void VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask) { //! @todo: fix issue of no event available for the flush/invalidate cache command InvalidateSqCaches(cache_mask.sqICache_, cache_mask.sqKCache_, cache_mask.tcL1_, cache_mask.tcL2_); flushDMA(engineID_); return; } void VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, hsa_kernel_dispatch_packet_t* aqlPkt, HwDbgKernelInfo& kernelInfo, amd::Event* enqueueEvent) { amd::HwDebugManager* dbgManager = dev().hwDebugMgr(); assert(dbgManager && "No HW Debug Manager!"); // Initialize structure with default values if (hsaKernel.prog().maxScratchRegs() > 0) { gpu::Memory* scratchBuf = dev().scratch(hwRing())->memObj_; kernelInfo.scratchBufAddr = scratchBuf->vmAddress(); kernelInfo.scratchBufferSizeInBytes = scratchBuf->size(); // Get the address of the scratch buffer and its size for CPU access address scratchRingAddr = NULL; scratchRingAddr = static_cast

(scratchBuf->map(NULL, 0)); dbgManager->setScratchRing(scratchRingAddr, scratchBuf->size()); scratchBuf->unmap(NULL); } else { kernelInfo.scratchBufAddr = 0; kernelInfo.scratchBufferSizeInBytes = 0; dbgManager->setScratchRing(NULL, 0); } //! @todo: need to verify what is wanted for the global memory kernelInfo.heapBufAddr = (dev().globalMem()).vmAddress(); kernelInfo.pAqlDispatchPacket = aqlPkt; kernelInfo.pAqlQueuePtr = reinterpret_cast(hsaQueueMem_->vmAddress()); // Get the address of the kernel code and its size for CPU access gpu::Memory* aqlCode = hsaKernel.gpuAqlCode(); if (NULL != aqlCode) { address aqlCodeAddr = static_cast

(aqlCode->map(NULL, 0)); dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize()); aqlCode->unmap(NULL); } else { dbgManager->setKernelCodeInfo(NULL, 0); } kernelInfo.trapPresent = false; kernelInfo.trapHandler = NULL; kernelInfo.trapHandlerBuffer = NULL; kernelInfo.excpEn = 0; kernelInfo.cacheDisableMask = 0; kernelInfo.sqDebugMode = 0; kernelInfo.mgmtSe0Mask = 0xFFFFFFFF; kernelInfo.mgmtSe1Mask = 0xFFFFFFFF; // set kernel info for HW debug and call the callback function if (NULL != dbgManager->preDispatchCallBackFunc()) { DebugToolInfo dbgSetting = {0}; dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr; dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes; dbgSetting.globalAddress_ = kernelInfo.heapBufAddr; dbgSetting.aclBinary_ = hsaKernel.prog().binaryElf(); dbgSetting.event_ = enqueueEvent; // Call the predispatch callback function & set the trap info AqlCodeInfo aqlCodeInfo; aqlCodeInfo.aqlCode_ = (amd_kernel_code_t*)hsaKernel.cpuAqlCode(); aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize(); // Execute the pre-dispatch call back function dbgManager->executePreDispatchCallBack(reinterpret_cast(aqlPkt), &dbgSetting); // assign the debug TMA and TBA for kernel dispatch if (NULL != dbgSetting.trapHandler_ && NULL != dbgSetting.trapBuffer_) { assignDebugTrapHandler(dbgSetting, kernelInfo); } kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false; // Execption policy kernelInfo.excpEn = dbgSetting.exceptionMask_; kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_; kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_; // Compute the mask for reserved CUs. These two dwords correspond to // two registers used for reserving CUs for display. In the current // implementation, the number of CUs reserved can be 0 to 7, and it // is set by debugger users. if (dbgSetting.monitorMode_) { uint32_t i = dbgSetting.reservedCuNum_ / 2; kernelInfo.mgmtSe0Mask <<= i; i = dbgSetting.reservedCuNum_ - i; kernelInfo.mgmtSe1Mask <<= i; } // flush/invalidate the instruction, data, L1 and L2 caches InvalidateSqCaches(); } } void VirtualGPU::assignDebugTrapHandler(const DebugToolInfo& dbgSetting, HwDbgKernelInfo& kernelInfo) { // setup the runtime trap handler code and trap buffer to be assigned before kernel dispatching // Memory* rtTrapHandlerMem = static_cast(dev().hwDebugMgr()->runtimeTBA()); Memory* rtTrapBufferMem = static_cast(dev().hwDebugMgr()->runtimeTMA()); kernelInfo.trapHandler = reinterpret_cast(rtTrapHandlerMem->vmAddress() + TbaStartOffset); // With the TMA corruption hw bug workaround, the trap handler buffer can be set to zero. // However, by setting the runtime trap buffer (TMA) correct, the runtime trap hander // without the workaround can still function correctly. kernelInfo.trapHandlerBuffer = reinterpret_cast(rtTrapBufferMem->vmAddress()); address rtTrapBufferAddress = static_cast

(rtTrapBufferMem->map(this)); Memory* trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_); Memory* trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_); // Address of the trap handler code/buffer should be 256-byte aligned uint64_t tbaAddress = trapHandlerMem->vmAddress(); uint64_t tmaAddress = trapBufferMem->vmAddress(); if ((tbaAddress & 0xFF) != 0 || (tmaAddress & 0xFF) != 0) { assert(false && "Trap handler/buffer is not 256-byte aligned"); } // The addresses of the debug trap handler code (TBA) and buffer (TMA) are // stored in the runtime trap handler buffer with offset location of 0x18-19 // and 0x20-21, respectively. uint64_t* rtTmaPtr = reinterpret_cast(rtTrapBufferAddress + 0x18); rtTmaPtr[0] = tbaAddress; rtTmaPtr[1] = tmaAddress; rtTrapBufferMem->unmap(NULL); // Add GSL handle to the memory list for VidMM addVmMemory(trapHandlerMem); addVmMemory(trapBufferMem); addVmMemory(rtTrapHandlerMem); addVmMemory(rtTrapBufferMem); } void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { size_t copySize = cmd.size()[0]; size_t fileOffset = cmd.fileOffset(); Memory* mem = dev().getGpuMemory(&cmd.memory()); uint idx = 0; assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); if (writeBuffer) { size_t dstOffset = cmd.origin()[0]; while (copySize > 0) { Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; dstSize = std::min(dstSize, copySize); void* dstBuffer = staging->cpuMap(*this); if (!cmd.file()->transferBlock(writeBuffer, dstBuffer, staging->size(), fileOffset, 0, dstSize)) { cmd.setStatus(CL_INVALID_OPERATION); return; } staging->cpuUnmap(*this); bool result = blitMgr().copyBuffer(*staging, *mem, 0, dstOffset, dstSize, false); flushDMA(getGpuEvent(staging->gslResource())->engineId_); fileOffset += dstSize; dstOffset += dstSize; copySize -= dstSize; } } else { size_t srcOffset = cmd.origin()[0]; while (copySize > 0) { Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; srcSize = std::min(srcSize, copySize); bool result = blitMgr().copyBuffer(*mem, *staging, srcOffset, 0, srcSize, false); void* srcBuffer = staging->cpuMap(*this); if (!cmd.file()->transferBlock(writeBuffer, srcBuffer, staging->size(), fileOffset, 0, srcSize)) { cmd.setStatus(CL_INVALID_OPERATION); return; } staging->cpuUnmap(*this); fileOffset += srcSize; srcOffset += srcSize; copySize -= srcSize; } } } } // namespace gpu