diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp index 88534a51f3..e2f7d137fd 100644 --- a/rocclr/runtime/device/gpu/gpudevice.cpp +++ b/rocclr/runtime/device/gpu/gpudevice.cpp @@ -174,7 +174,7 @@ NullDevice::create(CALtarget target) calAttr.localRAM = 512; // Fill the device info structure - fillDeviceInfo(calAttr, memInfo, 4096, 1); + fillDeviceInfo(calAttr, memInfo, 4096, 1, 0); if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) { // Runtime doesn't know what local size could be on the real board @@ -280,11 +280,14 @@ NullDevice::createProgram(amd::option::Options* options) return new NullProgram(*this); } -void NullDevice::fillDeviceInfo( +void +NullDevice::fillDeviceInfo( const CALdeviceattribs& calAttr, const gslMemInfo& memInfo, size_t maxTextureSize, - uint numComputeRings) + uint numComputeRings, + uint numComputeRingsRT + ) { info_.type_ = CL_DEVICE_TYPE_GPU; info_.vendorId_ = 0x1002; @@ -549,8 +552,8 @@ void NullDevice::fillDeviceInfo( info_.localMemBanks_ = hwInfo()->localMemBanks_; info_.gfxipVersion_ = hwInfo()->gfxipVersion_; info_.numAsyncQueues_ = numComputeRings; - info_.numRTQueues_ = 2; - info_.numRTCUs_ = 4; + info_.numRTQueues_ = numComputeRingsRT; + info_.numRTCUs_ = calAttr.maxRTCUs; info_.threadTraceEnable_ = settings().threadTraceEnable_; } } @@ -576,6 +579,7 @@ void Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings) { numComputeRings_ = 0; + numComputeRingsRT_ = 0; numDmaEngines_ = 0; for (uint i = 0; i < num; ++i) { @@ -587,6 +591,13 @@ Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeR numComputeRings_++; } + if (desc[i].id == GSL_ENGINEID_COMPUTE_RT) { + numComputeRingsRT_++; + } + if (desc[i].id == GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY) { + numComputeRingsRT_++; + } + if (desc[i].id >= GSL_ENGINEID_DRMDMA0 && desc[i].id <= GSL_ENGINEID_DRMDMA1) { numDmaEngines_++; @@ -910,7 +921,7 @@ Device::create(CALuint ordinal, 
CALuint numOfDevices) // Fill the device info structure fillDeviceInfo(getAttribs(), getMemInfo(), static_cast(getMaxTextureSize()), - engines().numComputeRings()); + engines().numComputeRings(), engines().numComputeRingsRT()); if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) { if (NULL == hsaCompiler_) { @@ -969,7 +980,7 @@ Device::initializeHeapResources() PerformFullInitialization(); - uint numComputeRings = engines_.numComputeRings(); + uint numComputeRings = engines_.numComputeRings() + engines_.numComputeRingsRT(); scratch_.resize((settings().useSingleScratch_) ? 1 : (numComputeRings ? numComputeRings : 1)); // Initialize the number of mem object for the scratch buffer @@ -1074,7 +1085,7 @@ Device::createVirtualDevice( { bool profiling = false; bool interopQueue = false; - uint rtCUs = 0; + uint rtCUs = amd::CommandQueue::RealTimeDisabled; uint deviceQueueSize = 0; if (queue != NULL) { @@ -1101,10 +1112,7 @@ Device::createVirtualDevice( } VirtualGPU* vgpu = new VirtualGPU(*this); - if (vgpu && vgpu->create( - profiling - , deviceQueueSize - )) { + if (vgpu && vgpu->create(profiling, rtCUs, deviceQueueSize, (queue != NULL) ? queue->priority() : amd::CommandQueue::Priority::Normal)) { /* queue may be NULL (see the check above), so fall back to normal priority */ return vgpu; } else { delete vgpu; diff --git a/rocclr/runtime/device/gpu/gpudevice.hpp b/rocclr/runtime/device/gpu/gpudevice.hpp index 55328d48ae..9c4d4027ea 100644 --- a/rocclr/runtime/device/gpu/gpudevice.hpp +++ b/rocclr/runtime/device/gpu/gpudevice.hpp @@ -129,7 +129,8 @@ protected: const CALdeviceattribs& calAttr, //!< CAL device attributes info const gslMemInfo& memInfo, //!< GSL mem info size_t maxTextureSize, //!< Maximum texture size supported in HW - uint numComputeRings //!< Number of compute rings + uint numComputeRings, //!< Number of compute rings + uint numComputeRingsRT //!< Number of RT compute rings ); }; @@ -237,7 +238,11 @@ public: { public: //! 
Default constructor - Engines() { memset(desc_, 0xff, sizeof(desc_)); } + Engines() + : numComputeRings_(0) + , numComputeRingsRT_(0) + , numDmaEngines_(0) + { memset(desc_, 0xff, sizeof(desc_)); } //! Creates engine descriptor for this class void create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings); @@ -251,11 +256,15 @@ public: //! Returns the number of available compute rings uint numComputeRings() const { return numComputeRings_; } + //! Returns the number of available real time compute rings + uint numComputeRingsRT() const { return numComputeRingsRT_; } + //! Returns the number of available DMA engines uint numDMAEngines() const { return numDmaEngines_; } private: uint numComputeRings_; + uint numComputeRingsRT_; uint numDmaEngines_; gslEngineDescriptor desc_[GSL_ENGINEID_MAX]; //!< Engine descriptor }; diff --git a/rocclr/runtime/device/gpu/gpumemory.cpp b/rocclr/runtime/device/gpu/gpumemory.cpp index 807b26d6d4..6fd4bfd7b3 100644 --- a/rocclr/runtime/device/gpu/gpumemory.cpp +++ b/rocclr/runtime/device/gpu/gpumemory.cpp @@ -171,6 +171,7 @@ Memory::create( // Check if CAL created a resource if (result) { switch (memoryType()) { + case Resource::Persistent: case Resource::Pinned: case Resource::ExternalPhysical: // Marks memory object for direct GPU access to the host memory @@ -186,6 +187,9 @@ Memory::create( case Resource::View: { Resource::ViewParams* view = reinterpret_cast(params); + if (view->resource_->memoryType() == Resource::Persistent) { + flags_ |= HostMemoryDirectAccess; + } // Check if parent was allocated in system memory if ((view->resource_->memoryType() == Resource::Pinned) || (((view->resource_->memoryType() == Resource::Remote) || diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp index ef9773d503..2ec9386fae 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -196,7 +196,7 @@ VirtualGPU::DmaFlushMgmt::isCbReady( } bool 
-VirtualGPU::gslOpen(uint nEngines, gslEngineDescriptor *engines) +VirtualGPU::gslOpen(uint nEngines, gslEngineDescriptor *engines, uint32_t rtCUs) { // GSL device initialization dev().PerformFullInitialization(); @@ -206,7 +206,7 @@ VirtualGPU::gslOpen(uint nEngines, gslEngineDescriptor *engines) ? CAL_WAIT_LOW_CPU_UTILIZATION : CAL_WAIT_POLLING; - if (!open(&dev(), nEngines, engines)) { + if (!open(&dev(), nEngines, engines, rtCUs)) { return false; } @@ -432,10 +432,8 @@ VirtualGPU::VirtualGPU( } bool -VirtualGPU::create( - bool profiling - , uint deviceQueueSize - ) +VirtualGPU::create(bool profiling, uint rtCUs, uint deviceQueueSize, + amd::CommandQueue::Priority priority) { device::BlitManager::Setup blitSetup; gslEngineDescriptor engines[2]; @@ -452,14 +450,34 @@ VirtualGPU::create( { if (dev().engines().numComputeRings()) { - uint idx = index() % dev().engines().numComputeRings(); + uint idx; + if ((amd::CommandQueue::RealTimeDisabled == rtCUs) && + (priority == amd::CommandQueue::Priority::Normal)) { + idx = index() % dev().engines().numComputeRings(); + engineMask = dev().engines().getMask( + (gslEngineID)(dev().isComputeRingIDForced() ? + dev().getforcedComputeEngineID() : + (dev().getFirstAvailableComputeEngineID() + idx))); + + } + else { + if (priority == amd::CommandQueue::Priority::Medium) { + engineMask = dev().engines().getMask((gslEngineID) + (GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY)); + } + else { + engineMask = dev().engines().getMask((gslEngineID) + (GSL_ENGINEID_COMPUTE_RT)); + } + //!@todo This is not a generic solution and + // may have issues with > 8 queues + idx = index() % (dev().engines().numComputeRings() + + dev().engines().numComputeRingsRT()); + } // hwRing_ should be set 0 if forced to have single scratch buffer hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx; - engineMask = dev().engines().getMask((gslEngineID)(dev().isComputeRingIDForced() ? 
- dev().getforcedComputeEngineID() : - (dev().getFirstAvailableComputeEngineID() + idx))); if (dev().canDMA()) { // If only 1 DMA engine is available then use that one if (dev().engines().numDMAEngines() < 2) { @@ -479,12 +497,12 @@ VirtualGPU::create( engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA0); } } - num = dev().engines().getRequested(engineMask, engines); + } + num = dev().engines().getRequested(engineMask, engines); - // Open GSL context - if ((num == 0) || !gslOpen(num, engines)) { - return false; - } + // Open GSL context + if ((num == 0) || !gslOpen(num, engines, rtCUs)) { + return false; } // Diable double copy optimization, @@ -1178,7 +1196,6 @@ VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - gpu::Memory* memory = dev().getGpuMemory(&vcmd.memory()); amd::Memory* owner = memory->owner(); bool unmapMip = false; @@ -2831,7 +2848,6 @@ VirtualGPU::flushDMA(uint engineID) //! since only L2 cache is flushed in KMD frame, //! but L1 still has to be invalidated. } - //! 
\note Use CtxIsEventDone, so we won't flush compute for DRM engine isDone(&cal_.events_[engineID]); } @@ -2841,7 +2857,6 @@ VirtualGPU::waitAllEngines(CommandBatch* cb) { uint i; GpuEvent* events; //!< GPU events for the batch - // If command batch is NULL then wait for the current if (NULL == cb) { events = cal_.events_; diff --git a/rocclr/runtime/device/gpu/gpuvirtual.hpp b/rocclr/runtime/device/gpu/gpuvirtual.hpp index c8dc7a9bc0..e3cce21d2f 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.hpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.hpp @@ -10,6 +10,7 @@ #include "device/gpu/gpuprintf.hpp" #include "device/gpu/gputimestamp.hpp" #include "device/gpu/gpusched.hpp" +#include "platform/commandqueue.hpp" #include "device/blit.hpp" #include "device/gpu/gpudebugger.hpp" @@ -199,12 +200,10 @@ public: typedef std::vector ResourceSlots; public: - VirtualGPU(Device& device); - bool create( - bool profiling - , uint deviceQueueSize = 0 - ); + bool create(bool profiling, uint rtCUs = amd::CommandQueue::RealTimeDisabled, + uint deviceQueueSize = 0, + amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal); ~VirtualGPU(); void submitReadMemory(amd::ReadMemoryCommand& vcmd); @@ -443,7 +442,7 @@ private: //! Frees CAL kernel descriptor of the virtual device void freeKernelDesc(GslKernelDesc* desc); - bool gslOpen(uint nEngines, gslEngineDescriptor *engines); + bool gslOpen(uint nEngines, gslEngineDescriptor *engines, uint32_t rtCUs); void gslDestroy(); //! 
Releases stage write buffers diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp index 7e52e6c493..41fe0c427c 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp @@ -46,7 +46,8 @@ bool CALGSLContext::open( const CALGSLDevice* pDeviceObject, uint32 nEngines, - gslEngineDescriptor* engines) + gslEngineDescriptor* engines, + uint32 rtCUs) { m_Dev = pDeviceObject; @@ -63,7 +64,7 @@ CALGSLContext::open( for (uint i = 0; i < nEngines; i++) { if (engines[i].id >= GSL_ENGINEID_3DCOMPUTE0 && - engines[i].id <= GSL_ENGINEID_COMPUTE7) + engines[i].id <= GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY) { mainEngineOrdinal = engines[i].id; } @@ -76,7 +77,7 @@ CALGSLContext::open( } } - m_cs = native->createComputeContext(mainEngineOrdinal, sdmaOrdinal, false); + m_cs = native->createComputeContext(mainEngineOrdinal, sdmaOrdinal, false, rtCUs); if (m_cs == 0) { diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h index 61b77c9435..06cde42b6d 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h @@ -29,7 +29,7 @@ public: CALGSLContext(); ~CALGSLContext(); - bool open(const CALGSLDevice* pDeviceObject, uint32 nEngines, gslEngineDescriptor *engines); + bool open(const CALGSLDevice* pDeviceObject, uint32 nEngines, gslEngineDescriptor *engines, uint32 rtCUs = 0); void close(gsl::gsAdaptor* native); bool setInput(uint32 physUnit, gslMemObject mem); diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp index 473c96d04d..05b858c1ca 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp @@ -119,6 +119,7 @@ CALGSLDevice::getAttribs_int(gsl::gsCtx* cs) m_attribs.isOpenCL200Device = 
m_adp->pAsicInfo->bIsOpen2Device; m_attribs.isSVMFineGrainSystem = m_adp->pAsicInfo->svmFineGrainSystem; m_attribs.isWDDM2Enabled = m_adp->pAsicInfo->vaAvailable && m_adp->pAsicInfo->bNoVATranslation; + m_attribs.maxRTCUs = cs->getMaxRTCUs(); } bool diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/backend.h b/rocclr/runtime/device/gpu/gslbe/src/rt/backend.h index 0308691384..f94332bcf8 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/backend.h +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/backend.h @@ -84,6 +84,7 @@ typedef struct CALdeviceattribsRec { bool isOpenCL200Device; /**< the flag to mark if the device is OpenCL 200 */ bool isSVMFineGrainSystem; /**< check if SVM finegrainsystem */ bool isWDDM2Enabled; /**< check if WDDM2 is enabled */ + CALuint maxRTCUs; /**< The maximum number of RT CUs for RT queues */ } CALdeviceattribs; diff --git a/rocclr/runtime/platform/commandqueue.cpp b/rocclr/runtime/platform/commandqueue.cpp index 1f3071440d..a954685ac2 100644 --- a/rocclr/runtime/platform/commandqueue.cpp +++ b/rocclr/runtime/platform/commandqueue.cpp @@ -18,10 +18,11 @@ namespace amd { HostQueue::HostQueue( - Context& context, Device& device, cl_command_queue_properties properties, uint queueRTCUs + Context& context, Device& device, + cl_command_queue_properties properties, uint queueRTCUs, Priority priority ) : CommandQueue(context, device, properties, device.info().queueProperties_ - | CL_QUEUE_COMMAND_INTERCEPT_ENABLE_AMD, queueRTCUs) + | CL_QUEUE_COMMAND_INTERCEPT_ENABLE_AMD, queueRTCUs, priority) { if (thread_.state() >= Thread::INITIALIZED) { ScopedLock sl(queueLock_); diff --git a/rocclr/runtime/platform/commandqueue.hpp b/rocclr/runtime/platform/commandqueue.hpp index 05ad3b42d3..3bea84fc45 100644 --- a/rocclr/runtime/platform/commandqueue.hpp +++ b/rocclr/runtime/platform/commandqueue.hpp @@ -31,6 +31,12 @@ class DeviceQueue; class CommandQueue : public RuntimeObject { public: + static const uint RealTimeDisabled = 0xffffffff; + enum class 
Priority : uint { + Normal = 0, + Medium + }; + struct Properties { typedef cl_command_queue_properties value_type; @@ -92,6 +98,9 @@ public: //! Returns the number or requested real time CUs uint rtCUs() const { return rtCUs_; } + //! Returns the queue priority + Priority priority() const { return priority_; } + protected: //! CommandQueue constructor is protected //! to keep the CommandQueue class as a virtual interface @@ -100,16 +109,19 @@ protected: Device& device, //!< Device object cl_command_queue_properties properties, //!< Queue properties cl_command_queue_properties propMask, //!< Queue properties mask - uint rtCUs = 0 //!< Avaialble real time compute units + uint rtCUs = RealTimeDisabled, //!< Available real time compute units + Priority priority = Priority::Normal //!< Queue priority ) : properties_(propMask, properties) , rtCUs_(rtCUs) + , priority_(priority) , queueLock_("CommandQueue::queueLock") , device_(device) , context_(context) {} Properties properties_; //!< Queue properties uint rtCUs_; //!< The number of used RT compute units + Priority priority_; //!< Queue priority Monitor queueLock_; //!< Lock protecting the queue Device& device_; //!< The device SharedReference context_; //!< The context of this command queue @@ -179,7 +191,8 @@ public: Context& context, Device& device, cl_command_queue_properties properties, - uint queueRTCUs = 0 + uint queueRTCUs = RealTimeDisabled, //!< Requested real time CUs; defaults to the same sentinel as CommandQueue + Priority priority = Priority::Normal ); //! Returns TRUE if this command queue can accept commands.