From 9d158024305dc9941562b4629584c0558fba79c1 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Tue, 3 May 2016 14:32:32 -0400
Subject: [PATCH] P4 to Git Change 1264675 by gandryey@gera-w8 on 2016/05/03
 14:13:52

	SWDEV-86170 - Need OCL changes for Compute Unit Reservation
	- Add support for real-time (RT) and medium-priority queues
	- Use the new packet for CU mask programming. It allows CU reservation for RT queues in the KMD.

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_command.cpp#11 edit
... //depot/stg/opencl/drivers/opencl/library/hsa/hsail/src/devenq/schedule.cl#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#546 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#159 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpumemory.cpp#127 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#402 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#139 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#81 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#52 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp#165 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/backend.h#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/commandqueue.cpp#22 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/commandqueue.hpp#17 edit
---
 rocclr/runtime/device/gpu/gpudevice.cpp       | 32 +++++++-----
 rocclr/runtime/device/gpu/gpudevice.hpp       | 13 ++++-
 rocclr/runtime/device/gpu/gpumemory.cpp       |  4 ++
 rocclr/runtime/device/gpu/gpuvirtual.cpp      | 51 ++++++++++++-------
 rocclr/runtime/device/gpu/gpuvirtual.hpp      | 11 ++--
 .../device/gpu/gslbe/src/rt/GSLContext.cpp    |  7 +--
 .../device/gpu/gslbe/src/rt/GSLContext.h      |  2 +-
 .../device/gpu/gslbe/src/rt/GSLDevice.cpp     |  1 +
 .../runtime/device/gpu/gslbe/src/rt/backend.h |  1 +
 rocclr/runtime/platform/commandqueue.cpp      |  5 +-
 rocclr/runtime/platform/commandqueue.hpp      | 17 ++++++-
 11 files changed, 98 insertions(+), 46 deletions(-)

diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp
index 88534a51f3..e2f7d137fd 100644
--- a/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -174,7 +174,7 @@ NullDevice::create(CALtarget target)
     calAttr.localRAM = 512;
 
     // Fill the device info structure
-    fillDeviceInfo(calAttr, memInfo, 4096, 1);
+    fillDeviceInfo(calAttr, memInfo, 4096, 1, 0);
 
     if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
         // Runtime doesn't know what local size could be on the real board
@@ -280,11 +280,14 @@ NullDevice::createProgram(amd::option::Options* options)
     return new NullProgram(*this);
 }
 
-void NullDevice::fillDeviceInfo(
+void
+NullDevice::fillDeviceInfo(
     const CALdeviceattribs& calAttr,
     const gslMemInfo& memInfo,
     size_t  maxTextureSize,
-    uint    numComputeRings)
+    uint    numComputeRings,
+    uint    numComputeRingsRT
+    )
 {
     info_.type_     = CL_DEVICE_TYPE_GPU;
     info_.vendorId_ = 0x1002;
@@ -549,8 +552,8 @@ void NullDevice::fillDeviceInfo(
         info_.localMemBanks_        = hwInfo()->localMemBanks_;
         info_.gfxipVersion_         = hwInfo()->gfxipVersion_;
         info_.numAsyncQueues_       = numComputeRings;
-        info_.numRTQueues_          = 2;
-        info_.numRTCUs_             = 4;
+        info_.numRTQueues_          = numComputeRingsRT;
+        info_.numRTCUs_             = calAttr.maxRTCUs;
         info_.threadTraceEnable_    = settings().threadTraceEnable_;
     }
 }
@@ -576,6 +579,7 @@ void
 Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings)
 {
     numComputeRings_ = 0;
+    numComputeRingsRT_ = 0;
     numDmaEngines_ = 0;
 
     for (uint i = 0; i < num; ++i) {
@@ -587,6 +591,13 @@ Device::Engines::create(uint num, gslEngineDescriptor* desc, uint maxNumComputeR
             numComputeRings_++;
         }
 
+        if (desc[i].id == GSL_ENGINEID_COMPUTE_RT) {
+            numComputeRingsRT_++;
+        }
+        if (desc[i].id == GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY) {
+            numComputeRingsRT_++;
+        }
+
         if (desc[i].id >= GSL_ENGINEID_DRMDMA0 &&
             desc[i].id <= GSL_ENGINEID_DRMDMA1) {
             numDmaEngines_++;
@@ -910,7 +921,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
     // Fill the device info structure
     fillDeviceInfo(getAttribs(), getMemInfo(),
         static_cast<size_t>(getMaxTextureSize()),
-        engines().numComputeRings());
+        engines().numComputeRings(), engines().numComputeRingsRT());
 
     if (settings().hsail_ || (settings().oclVersion_ == OpenCL20)) {
         if (NULL == hsaCompiler_) {
@@ -969,7 +980,7 @@ Device::initializeHeapResources()
 
         PerformFullInitialization();
 
-        uint numComputeRings = engines_.numComputeRings();
+        uint numComputeRings = engines_.numComputeRings() + engines_.numComputeRingsRT();
         scratch_.resize((settings().useSingleScratch_) ? 1 : (numComputeRings ? numComputeRings : 1));
 
         // Initialize the number of mem object for the scratch buffer
@@ -1074,7 +1085,7 @@ Device::createVirtualDevice(
 {
     bool    profiling = false;
     bool    interopQueue = false;
-    uint    rtCUs  = 0;
+    uint    rtCUs  = amd::CommandQueue::RealTimeDisabled;
     uint    deviceQueueSize = 0;
 
     if (queue != NULL) {
@@ -1101,10 +1112,10 @@ Device::createVirtualDevice(
     }
 
     VirtualGPU* vgpu = new VirtualGPU(*this);
-    if (vgpu && vgpu->create(
-        profiling
-        , deviceQueueSize
-        )) {
+    // This function checks queue against NULL above, so don't dereference it blindly;
+    // fall back to the normal priority when no queue is supplied
+    amd::CommandQueue::Priority priority = (queue != NULL) ?
+        queue->priority() : amd::CommandQueue::Priority::Normal;
+    if (vgpu && vgpu->create(profiling, rtCUs, deviceQueueSize, priority)) {
         return vgpu;
     } else {
         delete vgpu;
diff --git a/rocclr/runtime/device/gpu/gpudevice.hpp b/rocclr/runtime/device/gpu/gpudevice.hpp
index 55328d48ae..9c4d4027ea 100644
--- a/rocclr/runtime/device/gpu/gpudevice.hpp
+++ b/rocclr/runtime/device/gpu/gpudevice.hpp
@@ -129,7 +129,8 @@ protected:
         const CALdeviceattribs& calAttr,    //!< CAL device attributes info
         const gslMemInfo&  memInfo,         //!< GSL mem info
         size_t  maxTextureSize,             //!< Maximum texture size supported in HW
-        uint    numComputeRings             //!< Number of compute rings
+        uint    numComputeRings,            //!< Number of compute rings
+        uint    numComputeRingsRT           //!< Number of RT compute rings
         );
 };
 
@@ -237,7 +238,11 @@ public:
     {
     public:
         //! Default constructor
-        Engines() { memset(desc_, 0xff, sizeof(desc_)); }
+        Engines()
+            : numComputeRings_(0)
+            , numComputeRingsRT_(0)
+            , numDmaEngines_(0)
+            { memset(desc_, 0xff, sizeof(desc_)); }
 
         //! Creates engine descriptor for this class
         void create(uint num, gslEngineDescriptor* desc, uint maxNumComputeRings);
@@ -251,11 +256,15 @@ public:
         //! Returns the number of available compute rings
         uint numComputeRings() const { return numComputeRings_; }
 
+        //! Returns the number of available real time compute rings
+        uint numComputeRingsRT() const { return numComputeRingsRT_; }
+
         //! Returns the number of available DMA engines
         uint numDMAEngines() const { return numDmaEngines_; }
 
     private:
         uint numComputeRings_;
+        uint numComputeRingsRT_;
         uint numDmaEngines_;
         gslEngineDescriptor desc_[GSL_ENGINEID_MAX];    //!< Engine descriptor
     };
diff --git a/rocclr/runtime/device/gpu/gpumemory.cpp b/rocclr/runtime/device/gpu/gpumemory.cpp
index 807b26d6d4..6fd4bfd7b3 100644
--- a/rocclr/runtime/device/gpu/gpumemory.cpp
+++ b/rocclr/runtime/device/gpu/gpumemory.cpp
@@ -171,6 +171,7 @@ Memory::create(
     // Check if CAL created a resource
     if (result) {
         switch (memoryType()) {
+        case Resource::Persistent:
         case Resource::Pinned:
         case Resource::ExternalPhysical:
             // Marks memory object for direct GPU access to the host memory
@@ -186,6 +187,9 @@ Memory::create(
         case Resource::View: {
             Resource::ViewParams* view =
                 reinterpret_cast<Resource::ViewParams*>(params);
+            if (view->resource_->memoryType() == Resource::Persistent) {
+                flags_ |= HostMemoryDirectAccess;
+            }
             // Check if parent was allocated in system memory
             if ((view->resource_->memoryType() == Resource::Pinned) ||
                 (((view->resource_->memoryType() == Resource::Remote) ||
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index ef9773d503..2ec9386fae 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -196,7 +196,7 @@ VirtualGPU::DmaFlushMgmt::isCbReady(
 }
 
 bool
-VirtualGPU::gslOpen(uint nEngines, gslEngineDescriptor *engines)
+VirtualGPU::gslOpen(uint nEngines, gslEngineDescriptor *engines, uint32_t rtCUs)
 {
     // GSL device initialization
     dev().PerformFullInitialization();
@@ -206,7 +206,7 @@ VirtualGPU::gslOpen(uint nEngines, gslEngineDescriptor *engines)
         ? CAL_WAIT_LOW_CPU_UTILIZATION
         : CAL_WAIT_POLLING;
 
-    if (!open(&dev(), nEngines, engines)) {
+    if (!open(&dev(), nEngines, engines, rtCUs)) {
         return false;
     }
 
@@ -432,10 +432,8 @@ VirtualGPU::VirtualGPU(
 }
 
 bool
-VirtualGPU::create(
-    bool    profiling
-    , uint  deviceQueueSize
-    )
+VirtualGPU::create(bool profiling, uint rtCUs, uint  deviceQueueSize,
+    amd::CommandQueue::Priority priority)
 {
     device::BlitManager::Setup  blitSetup;
     gslEngineDescriptor engines[2];
@@ -452,14 +450,34 @@ VirtualGPU::create(
 
     {
         if (dev().engines().numComputeRings()) {
-            uint    idx = index() % dev().engines().numComputeRings();
+            uint    idx;
 
+            if ((amd::CommandQueue::RealTimeDisabled == rtCUs) &&
+                (priority == amd::CommandQueue::Priority::Normal)) {
+                idx = index() % dev().engines().numComputeRings();
+                engineMask = dev().engines().getMask(
+                    (gslEngineID)(dev().isComputeRingIDForced() ?
+                    dev().getforcedComputeEngineID() :
+                    (dev().getFirstAvailableComputeEngineID() + idx)));
+
+            }
+            else {
+                if (priority == amd::CommandQueue::Priority::Medium) {
+                    engineMask = dev().engines().getMask((gslEngineID)
+                        (GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY));
+                }
+                else {
+                    engineMask = dev().engines().getMask((gslEngineID)
+                        (GSL_ENGINEID_COMPUTE_RT));
+                }
+                //!@todo This is not a generic solution and
+                // may have issues with > 8 queues
+                idx = index() % (dev().engines().numComputeRings() + 
+                        dev().engines().numComputeRingsRT());
+            }
             // hwRing_ should be set 0 if forced to have single scratch buffer
             hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
 
-            engineMask = dev().engines().getMask((gslEngineID)(dev().isComputeRingIDForced() ?
-                         dev().getforcedComputeEngineID() :
-                         (dev().getFirstAvailableComputeEngineID() + idx)));
             if (dev().canDMA()) {
                 // If only 1 DMA engine is available then use that one
                 if (dev().engines().numDMAEngines() < 2) {
@@ -479,12 +497,12 @@ VirtualGPU::create(
                 engineMask |= dev().engines().getMask(GSL_ENGINEID_DRMDMA0);
             }
         }
-        num = dev().engines().getRequested(engineMask, engines);
+    }
+    num = dev().engines().getRequested(engineMask, engines);
 
-        // Open GSL context
-        if ((num == 0) || !gslOpen(num, engines)) {
-            return false;
-        }
+    // Open GSL context
+    if ((num == 0) || !gslOpen(num, engines, rtCUs)) {
+        return false;
     }
 
     // Diable double copy optimization,
@@ -1178,7 +1196,6 @@ VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd)
 {
     // Make sure VirtualGPU has an exclusive access to the resources
     amd::ScopedLock lock(execution());
-
     gpu::Memory* memory = dev().getGpuMemory(&vcmd.memory());
     amd::Memory* owner = memory->owner();
     bool    unmapMip = false;
@@ -2831,7 +2848,6 @@ VirtualGPU::flushDMA(uint engineID)
         //! since only L2 cache is flushed in KMD frame,
         //! but L1 still has to be invalidated.
     }
-
     //! \note Use CtxIsEventDone, so we won't flush compute for DRM engine
     isDone(&cal_.events_[engineID]);
 }
@@ -2841,7 +2857,6 @@ VirtualGPU::waitAllEngines(CommandBatch* cb)
 {
     uint i;
     GpuEvent*   events;    //!< GPU events for the batch
-
     // If command batch is NULL then wait for the current
     if (NULL == cb) {
         events = cal_.events_;
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.hpp b/rocclr/runtime/device/gpu/gpuvirtual.hpp
index c8dc7a9bc0..e3cce21d2f 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.hpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.hpp
@@ -10,6 +10,7 @@
 #include "device/gpu/gpuprintf.hpp"
 #include "device/gpu/gputimestamp.hpp"
 #include "device/gpu/gpusched.hpp"
+#include "platform/commandqueue.hpp"
 #include "device/blit.hpp"
 
 #include "device/gpu/gpudebugger.hpp"
@@ -199,12 +200,10 @@ public:
     typedef std::vector<ResourceSlot> ResourceSlots;
 
 public:
-
     VirtualGPU(Device& device);
-    bool create(
-        bool    profiling
-        , uint  deviceQueueSize = 0
-        );
+    bool create(bool profiling, uint rtCUs = amd::CommandQueue::RealTimeDisabled,
+            uint deviceQueueSize = 0,
+            amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
     ~VirtualGPU();
 
     void submitReadMemory(amd::ReadMemoryCommand& vcmd);
@@ -443,7 +442,7 @@ private:
     //! Frees CAL kernel descriptor of the virtual device
     void freeKernelDesc(GslKernelDesc* desc);
 
-    bool gslOpen(uint nEngines, gslEngineDescriptor *engines);
+    bool gslOpen(uint nEngines, gslEngineDescriptor *engines, uint32_t rtCUs);
     void gslDestroy();
 
     //! Releases stage write buffers
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
index 7e52e6c493..41fe0c427c 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
@@ -46,7 +46,8 @@ bool
 CALGSLContext::open(
     const CALGSLDevice* pDeviceObject,
     uint32              nEngines,
-    gslEngineDescriptor* engines)
+    gslEngineDescriptor* engines,
+    uint32              rtCUs)
 {
     m_Dev = pDeviceObject;
 
@@ -63,7 +64,7 @@ CALGSLContext::open(
     for (uint i = 0; i < nEngines; i++)
     {
         if (engines[i].id >= GSL_ENGINEID_3DCOMPUTE0 &&
-            engines[i].id <= GSL_ENGINEID_COMPUTE7)
+            engines[i].id <= GSL_ENGINEID_COMPUTE_MEDIUM_PRIORITY)
         {
             mainEngineOrdinal = engines[i].id;
         }
@@ -76,7 +77,7 @@ CALGSLContext::open(
         }
     }
 
-    m_cs = native->createComputeContext(mainEngineOrdinal, sdmaOrdinal, false);
+    m_cs = native->createComputeContext(mainEngineOrdinal, sdmaOrdinal, false, rtCUs);
 
     if (m_cs == 0)
     {
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
index 61b77c9435..06cde42b6d 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
@@ -29,7 +29,7 @@ public:
     CALGSLContext();
     ~CALGSLContext();
 
-    bool open(const CALGSLDevice* pDeviceObject, uint32 nEngines, gslEngineDescriptor *engines);
+    bool open(const CALGSLDevice* pDeviceObject, uint32 nEngines, gslEngineDescriptor *engines, uint32 rtCUs = 0);
     void close(gsl::gsAdaptor* native);
 
     bool             setInput(uint32 physUnit, gslMemObject mem);
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp
index 473c96d04d..05b858c1ca 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLDevice.cpp
@@ -119,6 +119,7 @@ CALGSLDevice::getAttribs_int(gsl::gsCtx* cs)
     m_attribs.isOpenCL200Device = m_adp->pAsicInfo->bIsOpen2Device;
     m_attribs.isSVMFineGrainSystem = m_adp->pAsicInfo->svmFineGrainSystem;
     m_attribs.isWDDM2Enabled = m_adp->pAsicInfo->vaAvailable && m_adp->pAsicInfo->bNoVATranslation;
+    m_attribs.maxRTCUs = cs->getMaxRTCUs();
 }
 
 bool
diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/backend.h b/rocclr/runtime/device/gpu/gslbe/src/rt/backend.h
index 0308691384..f94332bcf8 100644
--- a/rocclr/runtime/device/gpu/gslbe/src/rt/backend.h
+++ b/rocclr/runtime/device/gpu/gslbe/src/rt/backend.h
@@ -84,6 +84,7 @@ typedef struct CALdeviceattribsRec {
     bool       isOpenCL200Device;                   /**< the flag to mark if the device is OpenCL 200 */
     bool       isSVMFineGrainSystem;                /**< check if SVM finegrainsystem */
     bool       isWDDM2Enabled;                      /**< check if WDDM2 is enabled */
+    CALuint    maxRTCUs;                            /**< The maximum number of RT CUs for RT queues */
 } CALdeviceattribs;
 
 
diff --git a/rocclr/runtime/platform/commandqueue.cpp b/rocclr/runtime/platform/commandqueue.cpp
index 1f3071440d..a954685ac2 100644
--- a/rocclr/runtime/platform/commandqueue.cpp
+++ b/rocclr/runtime/platform/commandqueue.cpp
@@ -18,10 +18,11 @@
 namespace amd {
 
 HostQueue::HostQueue(
-    Context& context, Device& device, cl_command_queue_properties properties, uint queueRTCUs
+    Context& context, Device& device,
+    cl_command_queue_properties properties, uint queueRTCUs, Priority priority
     )
         : CommandQueue(context, device, properties, device.info().queueProperties_
-            | CL_QUEUE_COMMAND_INTERCEPT_ENABLE_AMD, queueRTCUs)
+            | CL_QUEUE_COMMAND_INTERCEPT_ENABLE_AMD, queueRTCUs, priority)
 {
     if (thread_.state() >= Thread::INITIALIZED) {
         ScopedLock sl(queueLock_);
diff --git a/rocclr/runtime/platform/commandqueue.hpp b/rocclr/runtime/platform/commandqueue.hpp
index 05ad3b42d3..3bea84fc45 100644
--- a/rocclr/runtime/platform/commandqueue.hpp
+++ b/rocclr/runtime/platform/commandqueue.hpp
@@ -31,6 +31,12 @@ class DeviceQueue;
 class CommandQueue : public RuntimeObject
 {
 public:
+    static const uint RealTimeDisabled = 0xffffffff;
+    enum class Priority : uint {
+        Normal = 0,
+        Medium
+    };
+
     struct Properties
     {
         typedef cl_command_queue_properties value_type;
@@ -92,6 +98,9 @@ public:
     //! Returns the number or requested real time CUs
     uint    rtCUs() const { return rtCUs_; }
 
+    //! Returns the queue priority
+    Priority    priority() const { return priority_; }
+
 protected:
     //! CommandQueue constructor is protected
     //! to keep the CommandQueue class as a virtual interface
@@ -100,16 +109,19 @@ protected:
         Device&     device,     //!< Device object
         cl_command_queue_properties properties, //!< Queue properties
         cl_command_queue_properties propMask,   //!< Queue properties mask
-        uint        rtCUs = 0   //!< Avaialble real time compute units
+        uint        rtCUs = RealTimeDisabled,   //!< Available real time compute units
+        Priority    priority = Priority::Normal //!< Queue priority
         )
         : properties_(propMask, properties)
         , rtCUs_(rtCUs)
+        , priority_(priority)
         , queueLock_("CommandQueue::queueLock")
         , device_(device)
         , context_(context) {}
 
     Properties  properties_;    //!< Queue properties
     uint        rtCUs_;         //!< The number of used RT compute units
+    Priority    priority_;      //!< Queue priority
     Monitor     queueLock_;     //!< Lock protecting the queue
     Device&     device_;        //!< The device
     SharedReference<Context> context_;  //!< The context of this command queue
@@ -179,7 +191,8 @@ public:
         Context& context,
         Device& device,
         cl_command_queue_properties properties,
-        uint    queueRTCUs = 0
+        uint    queueRTCUs = RealTimeDisabled,  //!< Requested RT CUs (real time is off by default)
+        Priority priority = Priority::Normal
     );
 
     //! Returns TRUE if this command queue can accept commands.