diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index 09db38f4fd..ed04665560 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -1313,6 +1313,7 @@ class VirtualDevice : public amd::HeapObject {
   virtual void submitVirtualMap(amd::VirtualMapCommand& cmd) { ShouldNotReachHere(); }
 
   virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
+  virtual void ReleaseHwQueue() {}  //!< Hook to give up the HW queue; no-op unless overridden
 
   //! Get the blit manager object
   device::BlitManager& blitMgr() const { return *blitMgr_; }
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 219cc3a46c..0ecb1e7c59 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -2876,10 +2876,21 @@ hsa_queue_t* Device::getQueueFromPool(const uint qIndex) {
   return nullptr;
 }
 
+// ================================================================================================
+hsa_queue_t* Device::AcquireActiveNormalQueue() {
+  // managed == true: acquireQueue() skips the num_normal_queues_ accounting for this request
+  const uint32_t size_hint = ROC_AQL_QUEUE_SIZE;
+  const std::vector<uint32_t> no_cu_mask;
+  return acquireQueue(size_hint, false, no_cu_mask, amd::CommandQueue::Priority::Normal, true);
+}
+
 // ================================================================================================
 hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
                                   const std::vector<uint32_t>& cuMask,
-                                  amd::CommandQueue::Priority priority) {
+                                  amd::CommandQueue::Priority priority,
+                                  bool managed) {
+  amd::ScopedLock l(active_queue_access_);
+
   assert(queuePool_[QueuePriority::Low].size() <= GPU_MAX_HW_QUEUES ||
          queuePool_[QueuePriority::Normal].size() <= GPU_MAX_HW_QUEUES ||
          queuePool_[QueuePriority::High].size() <= GPU_MAX_HW_QUEUES);
@@ -2916,6 +2927,9 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
       ((queuePool_[qIndex].size() == GPU_MAX_HW_QUEUES) || queuePool_[qIndex].size() > 0)) {
     hsa_queue_t* queue = getQueueFromPool(qIndex);
     if (queue != nullptr) {
+      if (!managed && (qIndex  == QueuePriority::Normal)) {
+        num_normal_queues_++;
+      }
       return queue;
     }
   }
@@ -3057,13 +3071,33 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
   qInfo.refCount = 1;
   ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "acquireQueue refCount: %p (%d)",
           result.first->first->base_address, result.first->second.refCount);
+  if (!managed && (cuMask.size() == 0) && (qIndex == QueuePriority::Normal)) {
+    num_normal_queues_++;  // compare (not assign) qIndex: count only unmanaged normal queues
+  }
   return queue;
 }
 
-void Device::releaseQueue(hsa_queue_t* queue, const std::vector<uint32_t>& cuMask, bool coop_queue) {
+// ================================================================================================
+bool Device::ReleaseActiveNormalQueue(hsa_queue_t* queue) {
+  // Keep the queue while the normal queue count still fits within the HW queue limit
+  if (num_normal_queues_.load() <= GPU_MAX_HW_QUEUES) {
+    return false;
+  }
+  // Managed release: releaseQueue() leaves num_normal_queues_ untouched for this call
+  releaseQueue(queue, std::vector<uint32_t>{}, false, true);
+  return true;
+}
+
+// ================================================================================================
+void Device::releaseQueue(hsa_queue_t* queue, const std::vector<uint32_t>& cuMask,
+    bool coop_queue, bool managed) {
+  amd::ScopedLock l(active_queue_access_);
   for (auto& it : cuMask.size() == 0 ? queuePool_ : queueWithCUMaskPool_) {
     auto qIter = it.find(queue);
     if (qIter != it.end()) {
+      if (!managed && (cuMask.size() == 0) && (&it == &queuePool_[QueuePriority::Normal])) {
+        num_normal_queues_--;
+      }
       auto &qInfo = qIter->second;
       assert(qInfo.refCount > 0);
       qInfo.refCount--;
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index b1307486e8..8980d98530 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -511,10 +511,15 @@ class Device : public NullDevice {
   //! share previously created
   hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
                             const std::vector<uint32_t>& cuMask = {},
-                            amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
+                            amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal,
+                            bool managed = false);
 
   //! Release HSA queue
-  void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {}, bool coop_queue = false);
+  void releaseQueue(hsa_queue_t*, const std::vector<uint32_t>& cuMask = {},
+                    bool coop_queue = false, bool managed = false);
+
+  hsa_queue_t* AcquireActiveNormalQueue();            //!< Acquire a managed normal priority queue
+  bool ReleaseActiveNormalQueue(hsa_queue_t* queue);  //!< Returns true if the queue was released
 
   //! For the given HSA queue, return an existing hostcall buffer or create a
   //! new one. queuePool_ keeps a mapping from HSA queue to hostcall buffer.
@@ -620,6 +625,8 @@ class Device : public NullDevice {
   };
   //! a vector for keeping Pool of HSA queues with low, normal and high priorities for recycling
   std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queuePool_;
+  amd::Monitor active_queue_access_;                //!< Lock to serialise HW queue acquire/release
+  std::atomic<uint32_t> num_normal_queues_{0};      //!< Normal queues acquired with managed=false
 
   //! returns a hsa queue from queuePool with least refCount and updates the refCount as well
   hsa_queue_t* getQueueFromPool(const uint qIndex);
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp
index c68065f9f5..5ddf853dd3 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp
@@ -95,6 +95,8 @@ Settings::Settings() {
   kernel_arg_impl_ = KernelArgImpl::HostKernelArgs;
   gwsInitSupported_ = true;
   limit_blit_wg_ = 16;
+
+  dynamic_queues_ = amd::IS_HIP ? DEBUG_HIP_DYNAMIC_QUEUES : false;
 }
 
 // ================================================================================================
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp
index b4da5df0d5..dd323b86ab 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.hpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp
@@ -50,7 +50,8 @@ class Settings : public device::Settings {
       uint system_scope_signal_ : 1;    //!< HSA signal is visibile to the entire system
       uint fgs_kernel_arg_ : 1;         //!< Use fine grain kernel arg segment
       uint barrier_value_packet_ : 1;   //!< Barrier value packet functionality
-      uint reserved_ : 23;
+      uint dynamic_queues_ : 1;         //!< Dynamic queues management
+      uint reserved_ : 22;
     };
     uint value_;
   };
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 7f43aaef35..6881654509 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -883,6 +883,15 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
   return true;
 }
 
+// ================================================================================================
+uint64_t VirtualGPU::getQueueID() {
+  // Taken under the same execution() lock that ReleaseHwQueue() uses when clearing gpu_queue_
+  amd::ScopedLock lock(execution());
+  // Lazily re-acquire a normal HW queue if none is currently attached to this vgpu
+  gpu_queue_ = (gpu_queue_ != nullptr) ? gpu_queue_ : roc_device_.AcquireActiveNormalQueue();
+  return gpu_queue_->id;
+}
+
 // ================================================================================================
 static inline void packet_store_release(uint32_t* packet, uint16_t header, uint16_t rest) {
   __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
@@ -1463,7 +1472,7 @@ VirtualGPU::~VirtualGPU() {
     roc_device_.vgpus()[idx]->index_--;
   }
 
-  if (gpu_queue_) {
+  if (gpu_queue_ != nullptr) {
     roc_device_.releaseQueue(gpu_queue_, cuMask_, cooperative_);
   }
 }
@@ -1522,6 +1531,8 @@ bool VirtualGPU::create() {
     LogError("Could not create managed buffer for this queue!");
     return false;
   }
+  // Release HW queue until the first usage
+  ReleaseHwQueue();
   return true;
 }
 
@@ -1607,7 +1618,6 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
   return result;
 }
 
-
 // ================================================================================================
 void VirtualGPU::ManagedBuffer::ResetPool() {
   pool_cur_offset_ = 0;
@@ -1631,12 +1641,27 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
   }
 }
 
+// ================================================================================================
+void VirtualGPU::ReleaseHwQueue() {
+  const bool is_normal = (priority_ == amd::CommandQueue::Priority::Normal);
+  // Dynamic management applies only to normal priority queues and only when enabled
+  if (!roc_device_.settings().dynamic_queues_ || !is_normal) {
+    return;
+  }
+  amd::ScopedLock lock(execution());
+  const bool freed = (gpu_queue_ != nullptr) && roc_device_.ReleaseActiveNormalQueue(gpu_queue_);
+  if (freed) { gpu_queue_ = nullptr; }  // forget the queue only when the device released it
+}
+
 // ================================================================================================
 /* profilingBegin, when profiling is enabled, creates a timestamp to save in
 * virtualgpu's timestamp_, saves the pointer timestamp_ to the command's data
 * and then calls start() to get the current host timestamp.
 */
 void VirtualGPU::profilingBegin(amd::Command& command, bool sdmaProfiling) {
+  if (gpu_queue_ == nullptr) {
+    gpu_queue_ = roc_device_.AcquireActiveNormalQueue();
+  }
   // Track the current command
   command_ = &command;
 
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index c243f4cb71..43c927ff6e 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -376,6 +376,7 @@ class VirtualGPU : public device::VirtualDevice {
   virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd){}
 
   virtual address allocKernelArguments(size_t size, size_t alignment) final;
+  virtual void ReleaseHwQueue() final;
 
   /**
    * @brief Waits on an outstanding kernel without regard to how
@@ -436,7 +437,7 @@ class VirtualGPU : public device::VirtualDevice {
 
   void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
   uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
-  uint64_t getQueueID() { return gpu_queue_->id; }
+  uint64_t getQueueID();
 
   //! Analyzes a crashed AQL queue to find a broken AQL packet
   void AnalyzeAqlQueue() const;
@@ -531,7 +532,7 @@ class VirtualGPU : public device::VirtualDevice {
   Timestamp* timestamp_;
   amd::Command* command_;   //!< Current command
   hsa_agent_t gpu_device_;  //!< Physical device
-  hsa_queue_t* gpu_queue_;  //!< Queue associated with a gpu
+  hsa_queue_t* gpu_queue_;  //!< Active queue associated with a vgpu
   hsa_barrier_and_packet_t barrier_packet_;
   hsa_amd_barrier_value_packet_t barrier_value_packet_;
 
diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp
index a6d7077358..77aa26eb7a 100644
--- a/projects/clr/rocclr/platform/commandqueue.cpp
+++ b/projects/clr/rocclr/platform/commandqueue.cpp
@@ -197,6 +197,8 @@ void HostQueue::finish(bool cpu_wait) {
       }
     }
   }
+  // Release HW queue to the pool for dynamic management if enabled
+  vdev()->ReleaseHwQueue();
 
   command->release();
   ClPrint(LOG_DEBUG, LOG_CMD, "All commands finished for host queue : %p", this);
diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp
index 50edf21e11..24903a48fa 100644
--- a/projects/clr/rocclr/utils/flags.hpp
+++ b/projects/clr/rocclr/utils/flags.hpp
@@ -273,6 +273,8 @@ release(bool, DEBUG_CLR_KERNARG_HDP_FLUSH_WA, false,                          \
         "Toggle kernel arg copy workaround")                                  \
 release(bool, DEBUG_CLR_SKIP_RELEASE_SCOPE, false,                            \
         "Forces release scope to SCOPE_NONE for aql packets")                 \
+release(bool, DEBUG_HIP_DYNAMIC_QUEUES, true,                                 \
+        "Forces dynamic queue management")                                    \
 release(uint, DEBUG_HIP_7_PREVIEW, 0,                                         \
         "Enables specific backward incompatible changes support before 7.0,"  \
         "using the mask. By default the changes are disabled and is set to 0")\