diff --git a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp index 930bcbdab2..2203b01e66 100644 --- a/projects/clr/rocclr/runtime/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/runtime/device/pal/paldevice.cpp @@ -466,8 +466,10 @@ void NullDevice::fillDeviceInfo( info_.localMemBanks_ = hwInfo()->localMemBanks_; info_.gfxipVersion_ = hwInfo()->gfxipVersion_; info_.numAsyncQueues_ = numComputeRings; - info_.numRTQueues_ = 2; - info_.numRTCUs_ = 4; + info_.numRTQueues_ = + palProp.engineProperties[Pal::EngineTypeExclusiveCompute].engineCount - 1; + info_.numRTCUs_ = 0x8; + //palProp.engineProperties[Pal::EngineTypeExclusiveCompute].maxNumDedicatedCu; info_.threadTraceEnable_ = settings().threadTraceEnable_; } } @@ -693,13 +695,11 @@ Device::create(Pal::IDevice* device) return false; } - // Find the number of available engines numComputeEngines_ = - properties().engineProperties[Pal::QueueTypeCompute].engineCount - - properties().engineProperties[Pal::QueueTypeCompute].numExclusiveComputeEngines; + properties().engineProperties[Pal::EngineTypeCompute].engineCount; numDmaEngines_ = - properties().engineProperties[Pal::QueueTypeDma].engineCount; + properties().engineProperties[Pal::EngineTypeDma].engineCount; Pal::PalPublicSettings*const palSettings = iDev()->GetPublicSettings(); // Modify settings here @@ -715,9 +715,13 @@ Device::create(Pal::IDevice* device) Pal::DeviceFinalizeInfo finalizeInfo = {}; // Request all compute engines - finalizeInfo.engineCounts[Pal::QueueTypeCompute] = numComputeEngines_; + finalizeInfo.requestedEngineCounts[Pal::EngineTypeCompute].engines = + ((1 << numComputeEngines_) - 1); + // Request real time compute engines + //finalizeInfo.requestedEngineCounts[Pal::EngineTypeExclusiveCompute].engines = 3; // Request all SDMA engines - finalizeInfo.engineCounts[Pal::QueueTypeDma] = numDmaEngines_; + finalizeInfo.requestedEngineCounts[Pal::EngineTypeDma].engines = + (1 << numDmaEngines_) - 1; result = iDev()->Finalize(finalizeInfo); } @@ -943,10 +947,7 @@ Device::createVirtualDevice( } VirtualGPU* vgpu = new VirtualGPU(*this); - if (vgpu && vgpu->create( - profiling - , deviceQueueSize - )) { + if (vgpu && vgpu->create(profiling, deviceQueueSize, rtCUs, queue->priority())) { return vgpu; } else { delete vgpu; diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp index 34549e699d..bfa1a5b032 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp @@ -37,21 +37,41 @@ VirtualGPU::Queue::Create( Pal::IDevice* palDev, Pal::QueueType queueType, uint engineIdx, - Pal::ICmdAllocator* cmdAllocator) + Pal::ICmdAllocator* cmdAllocator, + uint rtCU, + amd::CommandQueue::Priority priority) { Pal::Result result; + Pal::CmdBufferCreateInfo cmdCreateInfo = {}; Pal::QueueCreateInfo qCreateInfo = {}; - qCreateInfo.engineType = queueType; qCreateInfo.engineIndex = engineIdx; - qCreateInfo.aqlQueue = true; + qCreateInfo.aqlQueue = true; + qCreateInfo.queueType = queueType; + if (queueType == Pal::QueueTypeDma) { + cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeDma; + } + else { + cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeCompute; + } +/* + if (priority == amd::CommandQueue::Priority::Medium) { + qCreateInfo.engineIndex = 0x1; + cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute; + } + else if (amd::CommandQueue::RealTimeDisabled != rtCU) { + qCreateInfo.numReservedCu = rtCU; + qCreateInfo.engineIndex = 0x0; + cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute; + cmdCreateInfo.flags.rtCu = true; + } +*/ // Find queue object size size_t qSize = palDev->GetQueueSize(qCreateInfo, &result); if (result != Pal::Result::Success) { return nullptr; } - Pal::CmdBufferCreateInfo cmdCreateInfo = {}; cmdCreateInfo.pCmdAllocator = cmdAllocator; cmdCreateInfo.queueType = queueType; @@ -678,7 +698,8 @@ VirtualGPU::VirtualGPU( } bool -VirtualGPU::create(bool profiling, uint deviceQueueSize) +VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs, + amd::CommandQueue::Priority priority) { device::BlitManager::Setup blitSetup; @@ -726,7 +747,7 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize) hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx; queues_[MainEngine] = Queue::Create( - dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue, cmdAllocator_); + dev().iDev(), Pal::QueueTypeCompute, idx + firstQueue, cmdAllocator_, rtCUs, priority); if (nullptr == queues_[MainEngine]) { return false; } @@ -743,7 +764,8 @@ VirtualGPU::create(bool profiling, uint deviceQueueSize) } queues_[SdmaEngine] = Queue::Create( - dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_); + dev().iDev(), Pal::QueueTypeDma, sdma, cmdAllocator_, + amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal); if (nullptr == queues_[SdmaEngine]) { return false; } @@ -2011,15 +2033,15 @@ VirtualGPU::submitKernelInternal( GpuEvent gpuEvent; // Set up the dispatch information Pal::DispatchAqlParams dispatchParam = {}; - dispatchParam.pAqlPacket = aqlPkt; + dispatchParam.pAqlPacket = aqlPkt; if (nullptr != scratch) { - dispatchParam.scratchAddr = scratch->memObj_->vmAddress(); - dispatchParam.scratchSize = scratch->size_; - dispatchParam.scratchOffset = scratch->offset_; + dispatchParam.scratchAddr = scratch->memObj_->vmAddress(); + dispatchParam.scratchSize = scratch->size_; + dispatchParam.scratchOffset = scratch->offset_; } - dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode(); - dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress(); - dispatchParam.wavesPerSh = hsaKernel.getWavesPerSH(this); + dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode(); + dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress(); + dispatchParam.wavesPerSh = hsaKernel.getWavesPerSH(this); // Run AQL dispatch in HW eventBegin(MainEngine); @@ -3415,31 +3437,56 @@ VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) { size_t copySize = cmd.size()[0]; size_t fileOffset = cmd.fileOffset(); - size_t srcDstOffset = cmd.origin()[0]; Memory* mem = dev().getGpuMemory(&cmd.memory()); uint idx = 0; assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) || (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD)); - bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); + const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD); - while (copySize > 0) { - Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); - size_t srcDstSize = amd::TransferBufferFileCommand::StagingBufferSize; - srcDstSize = std::min(srcDstSize, copySize); - void* srcDstBuffer = staging->cpuMap(*this); - if (!cmd.file()->transferBlock(writeBuffer, - srcDstBuffer, staging->size(), fileOffset, 0, srcDstSize)) { - cmd.setStatus(CL_INVALID_OPERATION); - return; + if (writeBuffer) { + size_t dstOffset = cmd.origin()[0]; + while (copySize > 0) { + Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); + size_t dstSize = amd::TransferBufferFileCommand::StagingBufferSize; + dstSize = std::min(dstSize, copySize); + void* dstBuffer = staging->cpuMap(*this); + if (!cmd.file()->transferBlock(writeBuffer, + dstBuffer, staging->size(), fileOffset, 0, dstSize)) { + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + staging->cpuUnmap(*this); + + bool result = blitMgr().copyBuffer(*staging, *mem, + 0, dstOffset, dstSize, false); + flushDMA(getGpuEvent(staging->iMem())->engineId_); + fileOffset += dstSize; + dstOffset += dstSize; + copySize -= dstSize; } - staging->cpuUnmap(*this); + } + else { + size_t srcOffset = cmd.origin()[0]; + while (copySize > 0) { + Memory* staging = dev().getGpuMemory(&cmd.staging(idx)); + size_t srcSize = amd::TransferBufferFileCommand::StagingBufferSize; + srcSize = std::min(srcSize, copySize); + bool result = blitMgr().copyBuffer(*mem, *staging, + srcOffset, 0, srcSize, false); - bool result = blitMgr().copyBuffer(*staging, *mem, - fileOffset, srcDstOffset, srcDstSize, false); - flushDMA(getGpuEvent(staging->iMem())->engineId_); - srcDstOffset += srcDstSize; - copySize -= srcDstSize; + void* srcBuffer = staging->cpuMap(*this); + if (!cmd.file()->transferBlock(writeBuffer, + srcBuffer, staging->size(), fileOffset, 0, srcSize)) { + cmd.setStatus(CL_INVALID_OPERATION); + return; + } + staging->cpuUnmap(*this); + + fileOffset += srcSize; + srcOffset += srcSize; + copySize -= srcSize; + } } } } // namespace pal diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp index e68248c68c..80f2b8ec9b 100644 --- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp @@ -10,6 +10,7 @@ #include "device/pal/paltimestamp.hpp" #include "device/pal/palsched.hpp" #include "device/pal/paldebugger.hpp" +#include "platform/commandqueue.hpp" #include "device/blit.hpp" #include "palUtil.h" #include "palCmdBuffer.h" @@ -47,12 +48,15 @@ public: static const uint StartCmdBufIdx = 1; static const uint FirstMemoryReference = 0x80000000; static const uint64_t WaitTimeoutInNsec = 6000000000; + static const uint64_t PollIntervalInNsec = 500000; static Queue* Create( - Pal::IDevice* palDev, //!< PAL device object - Pal::QueueType queueType, //!< PAL queue type - uint engineIdx, //!< Select particular engine index - Pal::ICmdAllocator* cmdAlloc//!< PAL CMD buffer allocator + Pal::IDevice* palDev, //!< PAL device object + Pal::QueueType queueType, //!< PAL queue type + uint engineIdx, //!< Select particular engine index + Pal::ICmdAllocator* cmdAlloc, //!< PAL CMD buffer allocator + uint rtCU, //!< The number of reserved CUs + amd::CommandQueue::Priority priority //!< Queue priority ); Queue(Pal::IDevice* palDev) @@ -87,11 +91,14 @@ public: bool waifForFence(uint cbId) const { Pal::Result result = Pal::Result::Success; + uint64_t start = amd::Os::timeNanos(); while (Pal::Result::Success != (result = iCmdFences_[cbId]->GetStatus())) { if (result == Pal::Result::ErrorFenceNeverSubmitted) { result = Pal::Result::Success; break; } + uint64_t end = amd::Os::timeNanos(); + if ((end - start) < PollIntervalInNsec) continue; result = iDev_->WaitForFences(1, &iCmdFences_[cbId], true, WaitTimeoutInNsec); if (Pal::Result::Success == result) { break; @@ -170,11 +177,8 @@ public: { struct { - uint boundGlobal_ : 1; //!< Global buffer was bound uint profiling_ : 1; //!< Profiling is enabled uint forceWait_ : 1; //!< Forces wait in flush() - uint boundCb_ : 1; //!< Constant buffer was bound - uint boundPrintf_ : 1; //!< Printf buffer was bound uint profileEnabled_: 1; //!< Profiling is enabled for WaveLimiter }; uint value_; @@ -265,8 +269,10 @@ public: VirtualGPU(Device& device); //! Creates virtual gpu object bool create( - bool profiling, //!< Enables profilng on the queue - uint deviceQueueSize = 0 //!< Device queue size, 0 if host queue + bool profiling, //!< Enables profilng on the queue + uint deviceQueueSize = 0, //!< Device queue size, 0 if host queue + uint rtCUs = amd::CommandQueue::RealTimeDisabled, + amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal ); ~VirtualGPU();