From f8dc564915875c6eff34acc204091040c7facc0d Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Fri, 25 Jul 2014 20:41:05 -0400
Subject: [PATCH] P4 to Git Change 1059564 by gandryey@gera-dev-w7 on
 2014/07/25 18:14:33

	ECR #304775 - Device enqueuing
	- Run the scheduler in the host queue by default.
	- Setting GPU_USE_DEVICE_QUEUE=1 forces submissions to a dedicated device queue instead

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#451 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#128 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#260 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#10 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuschedcl.cpp#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#273 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.hpp#86 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#327 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#119 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#62 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#208 edit


[ROCm/clr commit: 3a4400135d4f1716beb799527806e06582b97a75]
---
 .../rocclr/runtime/device/gpu/gpudevice.cpp   |  1 -
 .../rocclr/runtime/device/gpu/gpudevice.hpp   |  1 -
 .../rocclr/runtime/device/gpu/gpukernel.cpp   | 12 ++-
 .../rocclr/runtime/device/gpu/gpusched.hpp    |  3 +
 .../rocclr/runtime/device/gpu/gpuschedcl.cpp  | 25 ++++--
 .../rocclr/runtime/device/gpu/gpusettings.cpp |  3 +
 .../rocclr/runtime/device/gpu/gpusettings.hpp |  3 +-
 .../rocclr/runtime/device/gpu/gpuvirtual.cpp  | 90 ++++++++++++-------
 .../rocclr/runtime/device/gpu/gpuvirtual.hpp  | 11 +--
 .../device/gpu/gslbe/src/rt/GSLContext.cpp    |  4 +-
 .../device/gpu/gslbe/src/rt/GSLContext.h      |  3 +-
 projects/clr/rocclr/runtime/utils/flags.hpp   |  2 +
 12 files changed, 107 insertions(+), 51 deletions(-)

diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
index 025322fa8c..1d088139e2 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -359,7 +359,6 @@ Device::Device()
     : NullDevice()
     , CALGSLDevice()
     , numOfVgpus_(0)
-    , numDeviceQueues_(0)
     , context_(NULL)
     , heap_(NULL)
     , dummyPage_(NULL)
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
index 2a6cb24aaf..e7f384ecd5 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp
@@ -468,7 +468,6 @@ public:
     //! Returns the number of virtual GPUs allocated on this device
     uint    numOfVgpus() const { return numOfVgpus_; }
     uint    numOfVgpus_;        //!< The number of virtual GPUs (lock protected)
-    uint    numDeviceQueues_;   //!< Number of device queues
 
     typedef std::vector<VirtualGPU*> VirtualGPUs;
 
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
index 0a40baefed..142e3d7029 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -3975,7 +3975,17 @@ HSAILKernel::loadArguments(
             const amd::DeviceQueue* queue =
                 *reinterpret_cast<amd::DeviceQueue* const*>(paramaddr);
             VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
-            uint64_t vmQueue = gpuQueue->vQueue()->vmAddress();
+            uint64_t vmQueue;
+            if (dev().settings().useDeviceQueue_) {
+                vmQueue = gpuQueue->vQueue()->vmAddress();
+            }
+            else {
+                if (!gpu.createVirtualQueue(queue->size())) {
+                    LogError( "Virtual queue creaiton failed!");
+                    return false;
+                }
+                vmQueue = gpu.vQueue()->vmAddress();
+            }
             WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
             break;
         }
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp b/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp
index f0bcd2f4c0..bb456c3b60 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpusched.hpp
@@ -66,6 +66,9 @@ struct SchedulerParam {
     uint64_t    scratch;        //!< GPU address to the scratch buffer
     uint32_t    numMaxWaves;    //!< The max number of possible waves
     uint32_t    releaseHostCP;  //!< Releases CP on the host queue
+    uint64_t    parentAQL;      //!< Host parent AmdAqlWrap packet
+    uint32_t    dedicatedQueue; //!< Scheduler uses a dedicated queue
+    uint32_t    reserved;       //!< Reserved field
 };
 
 } // namespace gpu
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
index 6390025655..98be8f0c39 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuschedcl.cpp
@@ -90,6 +90,9 @@ typedef struct _SchedulerParam {
     ulong   scratch;        //!< GPU address to the scratch buffer
     uint    numMaxWaves;    //!< Num max waves on the asic
     uint    releaseHostCP;  //!< Releases CP on the host queue
+    ulong   parentAQL;      //!< Host parent AmdAqlWrap packet
+    uint    dedicatedQueue; //!< Scheduler uses a dedicated queue
+    uint    reserved;       //!< Reserved field
 } SchedulerParam;
 
 typedef struct _HwDispatch {
@@ -276,7 +279,9 @@ static inline bool
 checkWaitEvents(__global AmdEvent** events, uint numEvents)
 {
     for (uint i = 0; i < numEvents; ++i) {
-        if (atomic_and(&events[i]->state, 0xffffffff) != CL_COMPLETE) {
+        if (atomic_load_explicit(
+            (__global atomic_uint*)(&events[i]->state),
+            memory_order_acquire, memory_scope_device) != CL_COMPLETE) {
             return false;
         }
     }
@@ -348,6 +353,8 @@ scheduler(
     __global  SchedulerParam* param = &params[paramIdx];
     volatile __global HwDispatch* hwDisp =
             (volatile __global HwDispatch*)param->hw_queue;
+    __global AmdAqlWrap*    hostParent = (__global AmdAqlWrap*)(param->parentAQL);
+    __global uint*          counter = (__global uint*)(&hostParent->child_counter);
     __global uint*          signal = (__global uint*)(&param->signal);
     __global AmdAqlWrap*    wraps = (__global AmdAqlWrap*)&queue[1];
     __global uint*          amask = (__global uint *)queue->aql_slot_mask;
@@ -360,7 +367,7 @@ scheduler(
     }
 
     uint launch = 0;
-    uint loop;
+    uint loop = 1;
 
     do {
         uint mask = atomic_load_explicit((__global atomic_uint*)(&amask[get_group_id(0)]),
@@ -491,10 +498,16 @@ scheduler(
         launch = atomic_load_explicit((__global atomic_uint*)&param->launch,
             memory_order_acquire, memory_scope_device);
 
-        loop = atomic_load_explicit((__global atomic_uint*)signal,
-            memory_order_acquire, memory_scope_device);
+        if (param->dedicatedQueue) {
+            loop = atomic_load_explicit((__global atomic_uint*)signal,
+                memory_order_acquire, memory_scope_device);
+        }
+        else {
+            loop = atomic_load_explicit((__global atomic_uint*)counter,
+                memory_order_acquire, memory_scope_device);
+        }
 
-    } while ((launch == 0) && (loop == 1));
+    } while ((launch == 0) && (loop != 0));
 
     if (loop == 0) {
         //! \todo Write deadcode to the template, but somehow
@@ -504,6 +517,8 @@ scheduler(
         hwDisp[1].condExe1 = 0xdeadc0de;
         hwDisp[1].condExe2 = 0xdeadc0de;
         hwDisp[1].condExe3 = 0xdeadc0de;
+        atomic_store_explicit((__global atomic_uint*)signal,
+            0, memory_order_release, memory_scope_device);
         barrier(CLK_GLOBAL_MEM_FENCE);
         atomic_store_explicit((__global atomic_uint*)&hwDisp->startExe,
             ResumeExecution, memory_order_release, memory_scope_device);
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp b/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp
index 580976e51a..363ba49192 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp
@@ -137,6 +137,9 @@ Settings::Settings()
 
     // Use direct SRD by default
     hsailDirectSRD_ = GPU_DIRECT_SRD;
+
+    // Use host queue for device enqueuing by default
+    useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
 }
 
 bool
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusettings.hpp b/projects/clr/rocclr/runtime/device/gpu/gpusettings.hpp
index 5835b6ca39..9ea5f6b1c6 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpusettings.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpusettings.hpp
@@ -73,7 +73,8 @@ public:
             uint    apuSystem_: 1;      //!< Device is APU system with shared memory
             uint    asyncMemCopy_: 1;   //!< Use async memory transfers
             uint    hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
-            uint    reserved_: 2;
+            uint    useDeviceQueue_: 1; //!< Submit to separate device queue
+            uint    reserved_: 1;
         };
         uint    value_;
     };
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 865c216ea4..2664676f5e 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -264,6 +264,24 @@ VirtualGPU::releasePinnedMem()
 bool
 VirtualGPU::createVirtualQueue(uint deviceQueueSize)
 {
+    if (deviceQueueSize_ == deviceQueueSize) {
+        return true;
+    }
+    else {
+        //! @todo Temporarily keep the buffer mapped for debug purpose
+        if (NULL != schedParams_) {
+            schedParams_->unmap(this);
+        }
+        delete vqHeader_;
+        delete virtualQueue_;
+        delete schedParams_;
+        vqHeader_ = NULL;
+        virtualQueue_ = NULL;
+        schedParams_ = NULL;
+        schedParamIdx_ = 0;
+        deviceQueueSize_ = 0;
+    }
+
     uint    numSlots = deviceQueueSize / sizeof(AmdAqlWrap);
     uint    allocSize = deviceQueueSize;
 
@@ -339,6 +357,8 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
 
     ptr  = reinterpret_cast<address>(schedParams_->map(this));
 
+    deviceQueueSize_ = deviceQueueSize;
+
     return true;
 }
 
@@ -365,6 +385,7 @@ VirtualGPU::VirtualGPU(
     , virtualQueue_(NULL)
     , schedParams_(NULL)
     , schedParamIdx_(0)
+    , deviceQueueSize_(0)
     , hsaQueueMem_(NULL)
 {
     memset(&cal_, 0, sizeof(CalVirtualDesc));
@@ -453,23 +474,7 @@ VirtualGPU::create(
 #endif // !cl_amd_open_video
     {
         if (dev().engines().numComputeRings()) {
-            uint    idx;
-
-            //! @todo Temporary workaround for Linux, because 2 HW queues only
-            //! Fixes conformance failures with multi queues
-            if ((0 == deviceQueueSize) || IS_WINDOWS) {
-                //! @note: Add 1 to account the device queue for transfers
-                idx = (index() + 1) % (dev().engines().numComputeRings() -
-                    gpuDevice_.numDeviceQueues_);
-            }
-            else {
-                gpuDevice_.numDeviceQueues_++;
-                if (gpuDevice_.numDeviceQueues_ >= dev().engines().numComputeRings()) {
-                    return false;
-                }
-                idx = (dev().engines().numComputeRings() - gpuDevice_.numDeviceQueues_)
-                    % dev().engines().numComputeRings();
-            }
+            uint    idx = index() % dev().engines().numComputeRings();
 
             // hwRing_ should be set 0 if forced to have single scratch buffer
             hwRing_ = (dev().settings().useSingleScratch_) ? 0 : idx;
@@ -558,10 +563,9 @@ VirtualGPU::create(
         return false;
     }
 
-    //! @todo for testing only
-    //deviceQueueSize = (deviceQueueSize == 0) ? (128 * Ki) : deviceQueueSize;
     // Check if the app requested a device queue creation
-    if ((0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) {
+    if (dev().settings().useDeviceQueue_ &&
+        (0 != deviceQueueSize) && !createVirtualQueue(deviceQueueSize)) {
         LogError("Could not create a virtual queue!");
         return false;
     }
@@ -598,10 +602,6 @@ VirtualGPU::~VirtualGPU()
     amd::ScopedLock k(dev().lockAsyncOps());
     amd::ScopedLock lock(dev().vgpusAccess());
 
-    if ((NULL != virtualQueue_) && IS_LINUX) {
-        gpuDevice_.numDeviceQueues_--;
-    }
-
     uint    i;
     // Destroy all kernels
     for (GslKernels::const_iterator it = gslKernels_.begin();
@@ -1696,13 +1696,19 @@ VirtualGPU::submitKernelInternalHSA(
             return false;
         }
         else {
-            gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
+            if (dev().settings().useDeviceQueue_) {
+                gpuDefQueue = static_cast<VirtualGPU*>(defQueue->vDev());
+                if (gpuDefQueue->hwRing() == hwRing()) {
+                    LogError("Can't submit the child kernels to the same HW ring as the host queue!");
+                    return false;
+                }
+            }
+            else {
+                createVirtualQueue(defQueue->size());
+                gpuDefQueue = this;
+            }
         }
         vmDefQueue = gpuDefQueue->virtualQueue_->vmAddress();
-        if (gpuDefQueue->hwRing() == hwRing()) {
-            LogError("Can't submit the child kernels to the same HW ring as the host queue!");
-            return false;
-        }
 
         // Add memory handles before the actual dispatch
         memList.push_back(gpuDefQueue->virtualQueue_);
@@ -1830,6 +1836,14 @@ VirtualGPU::submitKernelInternalHSA(
             }
         }
 
+        if (!dev().settings().useDeviceQueue_) {
+            // Add the termination handshake to the host queue
+            virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
+                vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+                vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+                0, dev().settings().useDeviceQueue_);
+        }
+
         // Get the global loop start before the scheduler
         mcaddr loopStart = gpuDefQueue->virtualQueueDispatcherStart();
         static_cast<KernelBlitManager&>(gpuDefQueue->blitMgr()).runScheduler(
@@ -1842,6 +1856,7 @@ VirtualGPU::submitKernelInternalHSA(
         // Get the address of PM4 template and add write it to params
         //! @note DMA flush must not occur between patch and the scheduler
         mcaddr patchStart = gpuDefQueue->virtualQueueDispatcherStart();
+
         // Program parameters for the scheduler
         SchedulerParam* param = &reinterpret_cast<SchedulerParam*>
             (gpuDefQueue->schedParams_->data())[gpuDefQueue->schedParamIdx_];
@@ -1852,6 +1867,9 @@ VirtualGPU::submitKernelInternalHSA(
         param->hsa_queue = gpuDefQueue->hsaQueueMem()->vmAddress();
         param->launch = 0;
         param->releaseHostCP = 0;
+        param->parentAQL = vmParentWrap;
+        param->dedicatedQueue = dev().settings().useDeviceQueue_;
+
         // Fill the scratch buffer information
         if (hsaKernel.prog().maxScratchRegs() > 0) {
             gpu::Memory* scratchBuf = dev().scratch(gpuDefQueue->hwRing())->memObjs_[0];
@@ -1880,16 +1898,20 @@ VirtualGPU::submitKernelInternalHSA(
         gpuDefQueue->virtualQueueDispatcherEnd(gpuEvent,
             gpuDefQueue->vmMems(), gpuDefQueue->cal_.memCount_,
             signalAddr, loopStart);
+
         // Set GPU event for the used resources
         for (uint i = 0; i < memList.size(); ++i) {
             memList[i]->setBusy(*gpuDefQueue, gpuEvent);
         }
 
-        // Add the termination handshake to the host queue
-        virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
-            vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
-            vmParentWrap + offsetof(AmdAqlWrap, child_counter),
-            signalAddr);
+        if (dev().settings().useDeviceQueue_) {
+            // Add the termination handshake to the host queue
+            virtualQueueHandshake(gpuEvent, gpuDefQueue->schedParams_->gslResource(),
+                vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
+                vmParentWrap + offsetof(AmdAqlWrap, child_counter),
+                signalAddr, dev().settings().useDeviceQueue_);
+        }
+
         ++gpuDefQueue->schedParamIdx_ %=
             gpuDefQueue->schedParams_->size() / sizeof(SchedulerParam);
         //! \todo optimize the wrap around
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
index 79af320c25..93b09a5a0f 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp
@@ -387,6 +387,11 @@ public:
     //! Update virtual queue header
     void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);
 
+    //! Returns TRUE if virtual queue was successfully allocated
+    bool createVirtualQueue(
+        uint deviceQueueSize            //!< Device queue size
+        );
+
     EngineType      engineID_;  //!< Engine ID for this VirtualGPU
     ResourceSlots   slots_;     //!< Resource slots for kernel arguments
     State           state_;     //!< virtual GPU current state
@@ -488,11 +493,6 @@ private:
         const amd::BufferRect& dstRect      //!< region of destination for copy
         );
 
-    //! Returns TRUE if virtual queue was successfully allocatted
-    bool createVirtualQueue(
-        uint deviceQueueSize            //!< Device queue size
-        );
-
     GslKernels      gslKernels_;        //!< GSL kernel descriptors
     GslKernelDesc*  activeKernelDesc_;  //!< active GSL kernel descriptors
     GpuEvents       gpuEvents_;         //!< GPU events
@@ -529,6 +529,7 @@ private:
     Memory*         virtualQueue_;  //!< Virtual device queue
     Memory*         schedParams_;   //!< The scheduler parameters
     uint            schedParamIdx_; //!< Index in the scheduler parameters buffer
+    uint            deviceQueueSize_;   //!< Device queue size
 
     Memory*         hsaQueueMem_;   //!< Memory for the amd_queue_t object
 };
diff --git a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
index 46c078df59..303ff0c120 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp
@@ -1290,9 +1290,9 @@ CALGSLContext::virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* me
 
 void
 CALGSLContext::virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState,
-    uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal)
+    uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal, bool dedicatedQueue)
 {
     eventBegin(MainEngine);
-    m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal);
+    m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal, dedicatedQueue);
     eventEnd(MainEngine, event);
 }
diff --git a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
index 849101423e..2f7cc62cea 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
+++ b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h
@@ -47,7 +47,8 @@ public:
                         uint32 numMems, gslMemObject scratch, const void* cpuKernelCode, uint64 hsaQueueVA);
     mcaddr           virtualQueueDispatcherStart();
     void             virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems, mcaddr signal, mcaddr loopStart);
-    void             virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState, uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal);
+    void             virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mcaddr parentState,
+                        uint32 newStateValue, mcaddr parentChildCounter, mcaddr signal, bool dedicatedQueue);
     bool             isDone(GpuEvent* event);
     void             waitForEvent(GpuEvent* event);
     void             flushIOCaches() const;
diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp
index a36f7ea156..862c7d199b 100644
--- a/projects/clr/rocclr/runtime/utils/flags.hpp
+++ b/projects/clr/rocclr/runtime/utils/flags.hpp
@@ -150,6 +150,8 @@ release(uint, GPU_PRINT_CHILD_KERNEL, 0,                                      \
         "Prints the specified number of the child kernels")                   \
 release(bool, GPU_DIRECT_SRD, true,                                           \
         "Use indirect SRD access in HSAIL")                                   \
+release(bool, GPU_USE_DEVICE_QUEUE, false,                                    \
+        "Use a dedicated device queue for the actual submissions")            \
 release(bool, AMD_DEPTH_MSAA_INTEROP, false,                                  \
         "Enable depth stencil and MSAA buffer interop")                       \
 release(bool, AMD_THREAD_TRACE_ENABLE, false,                                 \