diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index 16a37f3c4c..b0e8e2e259 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -171,7 +171,8 @@ Device::Device(hsa_agent_t bkendDevice)
     , freeMem_(0)
     , vgpusAccess_("Virtual GPU List Ops Lock", true)
     , hsa_exclusive_gpu_access_(false)
-    , numOfVgpus_(0) {
+    , numOfVgpus_(0)
+    , queuePool_(QueuePriority::Total) {
   group_segment_.handle = 0;
   system_segment_.handle = 0;
   system_coarse_segment_.handle = 0;
@@ -1503,8 +1504,11 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
   // Initialization of heap and other resources occur during the command
   // queue creation time.
   const std::vector<uint32_t> defaultCuMask = {};
+  bool q = (queue != nullptr);
   VirtualGPU* virtualDevice = new VirtualGPU(*this, profiling, cooperative,
-                                            (queue != nullptr) ? queue->cuMask() : defaultCuMask);
+                                            q ? queue->cuMask() : defaultCuMask,
+                                            q ? queue->priority()
+                                              : amd::CommandQueue::Priority::Normal);
 
   if (!virtualDevice->create()) {
     delete virtualDevice;
@@ -1935,17 +1939,43 @@ static void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
 }
 
 hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
-                                  const std::vector<uint32_t>& cuMask) {
-  assert(queuePool_.size() <= GPU_MAX_HW_QUEUES);
-  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d",
-          queuePool_.size(), GPU_MAX_HW_QUEUES);
+                                  const std::vector<uint32_t>& cuMask,
+                                  amd::CommandQueue::Priority priority) {
+  assert(queuePool_[QueuePriority::Low].size() <= GPU_MAX_HW_QUEUES &&
+         queuePool_[QueuePriority::Normal].size() <= GPU_MAX_HW_QUEUES &&
+         queuePool_[QueuePriority::High].size() <= GPU_MAX_HW_QUEUES);  // cap applies to every pool
 
-  // If we have reached the max number of queues, reuse an existing queue,
+  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues with low priority: %zu,"
+      " with normal priority: %zu, with high priority: %zu, maximum per priority is: %d",
+      queuePool_[QueuePriority::Low].size(),
+      queuePool_[QueuePriority::Normal].size(),
+      queuePool_[QueuePriority::High].size(), GPU_MAX_HW_QUEUES);  // size() is size_t, hence %zu
+
+  hsa_amd_queue_priority_t queue_priority;
+  uint qIndex;
+  switch (priority) {
+    case amd::CommandQueue::Priority::Low:
+      queue_priority = HSA_AMD_QUEUE_PRIORITY_LOW;
+      qIndex = QueuePriority::Low;
+      break;
+    case amd::CommandQueue::Priority::High:
+      queue_priority = HSA_AMD_QUEUE_PRIORITY_HIGH;
+      qIndex = QueuePriority::High;
+      break;
+    case amd::CommandQueue::Priority::Normal:
+    case amd::CommandQueue::Priority::Medium:
+    default:
+      queue_priority = HSA_AMD_QUEUE_PRIORITY_NORMAL;
+      qIndex = QueuePriority::Normal;
+      break;
+  }
+
+  // If we have reached the max number of queues, reuse an existing queue with the matching queue priority,
   // choosing the one with the least number of users.
   // Note: Don't attempt to reuse the cooperative queue, since it's single per device
-  if (!coop_queue && (cuMask.size() == 0) && (queuePool_.size() == GPU_MAX_HW_QUEUES)) {
-    typedef decltype(queuePool_)::const_reference PoolRef;
-    auto lowest = std::min_element(queuePool_.begin(), queuePool_.end(),
+  if (!coop_queue && (cuMask.size() == 0) && (queuePool_[qIndex].size() == GPU_MAX_HW_QUEUES)) {
+    typedef decltype(queuePool_)::value_type::const_reference PoolRef;
+    auto lowest = std::min_element(queuePool_[qIndex].begin(), queuePool_[qIndex].end(),
                                    [] (PoolRef A, PoolRef B) {
                                      return A.second.refCount < B.second.refCount;
                                    });
@@ -1982,8 +2012,18 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
       return nullptr;
     }
   }
-  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d, cooperative: %i",
-                queue, queue_size, coop_queue);
+
+  hsa_status_t st = HSA_STATUS_SUCCESS;
+  st = hsa_amd_queue_set_priority(queue, queue_priority);
+  if (st != HSA_STATUS_SUCCESS) {
+    DevLogError("Device::acquireQueue: hsa_amd_queue_set_priority failed!");
+    hsa_queue_destroy(queue);
+    return nullptr;
+  }
+
+  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d with priority %d,"
+      " cooperative: %i", queue, queue_size, queue_priority, coop_queue);
+
   hsa_amd_profiling_set_profiler_enabled(queue, 1);
   if (cuMask.size() != 0) {
     std::stringstream ss;
@@ -2008,7 +2048,7 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
     // per device.
     return queue;
   }
-  auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo()));
+  auto result = queuePool_[qIndex].emplace(std::make_pair(queue, QueueInfo()));
   assert(result.second && "QueueInfo already exists");
   auto &qInfo = result.first->second;
   qInfo.refCount = 1;
@@ -2016,32 +2056,45 @@ hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue,
 }
 
 void Device::releaseQueue(hsa_queue_t* queue) {
-  auto qIter = queuePool_.find(queue);
-  if (qIter != queuePool_.end()) {
-    auto &qInfo = qIter->second;
-    assert(qInfo.refCount > 0);
-    qInfo.refCount--;
-    if (qInfo.refCount != 0) {
+  for (auto& it : queuePool_) {
+    auto qIter = it.find(queue);
+    if (qIter != it.end()) {
+      auto &qInfo = qIter->second;
+      assert(qInfo.refCount > 0);
+      qInfo.refCount--;
+      if (qInfo.refCount != 0) {
         return;
-    }
-    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
+      }
+      ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
+          "deleting hardware queue %p with refCount 0", queue);
 
-    if (qInfo.hostcallBuffer_) {
-      ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p",
-              qInfo.hostcallBuffer_, queue);
-      disableHostcalls(qInfo.hostcallBuffer_, queue);
-      context().svmFree(qInfo.hostcallBuffer_);
-    }
+      if (qInfo.hostcallBuffer_) {
+        ClPrint(amd::LOG_INFO, amd::LOG_QUEUE,
+            "deleting hostcall buffer %p for hardware queue %p",
+            qInfo.hostcallBuffer_, queue);
+        disableHostcalls(qInfo.hostcallBuffer_, queue);
+        context().svmFree(qInfo.hostcallBuffer_);
+      }
 
-    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
-    queuePool_.erase(qIter);
+      // The release was already logged above; erase() invalidates qIter and qInfo,
+      // so drop the bookkeeping entry last and leave the pool loop right away.
+      it.erase(qIter);
+      break;
+    }
   }
   hsa_queue_destroy(queue);
 }
 
 void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) {
-  auto qIter = queuePool_.find(queue);
-  assert(qIter != queuePool_.end());
+  decltype(queuePool_)::value_type::iterator qIter;
+  bool found = false;  // set once a pool containing `queue` has been located
+  for (auto& pool : queuePool_) {
+    qIter = pool.find(queue);
+    found = (qIter != pool.end());
+    if (found) break;
+  }
+  // Comparing qIter with another pool's end() is undefined, so test the flag instead
+  assert(found && "hardware queue is not tracked by any priority pool");
 
   auto& qInfo = qIter->second;
   if (qInfo.hostcallBuffer_) {
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index ebd7b2c79d..75acf93813 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -444,7 +444,8 @@ class Device : public NullDevice {
   //! Acquire HSA queue. This method can create a new HSA queue or
   //! share previously created
   hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
-                            const std::vector<uint32_t>& cuMask = {});
+                            const std::vector<uint32_t>& cuMask = {},
+                            amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
 
   //! Release HSA queue
   void releaseQueue(hsa_queue_t*);
@@ -503,10 +504,16 @@ class Device : public NullDevice {
     int refCount;
     void* hostcallBuffer_;
   };
-  std::map<hsa_queue_t*, QueueInfo> queuePool_;  //!< Pool of HSA queues for recycling
+
+  //! Pools of HSA queues kept for recycling, one map per priority; indexed by QueuePriority
+  std::vector<std::map<hsa_queue_t*, QueueInfo>> queuePool_;
 
  public:
   amd::Atomic<uint> numOfVgpus_;  //!< Virtual gpu unique index
+
+  //! Indices of the per-priority queue pools; Total is the number of pools
+  enum QueuePriority : uint { Low = 0, Normal = 1, High = 2, Total = 3};
+
 };                                // class roc::Device
 }  // namespace roc
 
diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp
index e936468fd8..cc5dc4d3e2 100644
--- a/rocclr/device/rocm/rocvirtual.cpp
+++ b/rocclr/device/rocm/rocvirtual.cpp
@@ -591,7 +591,8 @@ bool VirtualGPU::releaseGpuMemoryFence() {
 }
 
 VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
-                       const std::vector<uint32_t>& cuMask)
+                       const std::vector<uint32_t>& cuMask,
+                       amd::CommandQueue::Priority priority)
     : device::VirtualDevice(device),
       state_(0),
       gpu_queue_(nullptr),
@@ -603,7 +604,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
       schedulerParam_(nullptr),
       schedulerQueue_(nullptr),
       schedulerSignal_({0}),
-      cuMask_(cuMask)
+      cuMask_(cuMask),
+      priority_(priority)
 {
   index_ = device.numOfVgpus_++;
   gpu_device_ = device.getBackendDevice();
@@ -705,7 +707,7 @@ VirtualGPU::~VirtualGPU() {
 bool VirtualGPU::create() {
   // Pick a reasonable queue size
   uint32_t queue_size = 1024;
-  gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_);
+  gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_, priority_);
   if (!gpu_queue_) return false;
 
   if (!initPool(dev().settings().kernargPoolSize_, (profiling_) ? queue_size : 0)) {
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index e14a5101d2..e3a9cfd879 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -20,6 +20,7 @@
 
 #pragma once
 
+#include "platform/commandqueue.hpp"
 #include "rocdevice.hpp"
 #include "utils/util.hpp"
 #include "hsa.h"
@@ -164,7 +165,8 @@ class VirtualGPU : public device::VirtualDevice {
   };
 
   VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
-             const std::vector<uint32_t>& cuMask = {});
+             const std::vector<uint32_t>& cuMask = {},
+             amd::CommandQueue::Priority priority = amd::CommandQueue::Priority::Normal);
   ~VirtualGPU();
 
   bool create();
@@ -364,7 +366,9 @@ class VirtualGPU : public device::VirtualDevice {
   uint16_t dispatchPacketHeaderNoSync_;
   uint16_t dispatchPacketHeader_;
 
-  const std::vector<uint32_t>& cuMask_;  //!< The CU mask
+  //! Bit-vector representing the CU mask. Each active bit represents using one CU
+  const std::vector<uint32_t>& cuMask_;
+  amd::CommandQueue::Priority priority_; //!< The priority for the hsa queue
 };
 
 template <typename T>
diff --git a/rocclr/platform/commandqueue.hpp b/rocclr/platform/commandqueue.hpp
index e051bdecd8..7def25bb73 100644
--- a/rocclr/platform/commandqueue.hpp
+++ b/rocclr/platform/commandqueue.hpp
@@ -47,7 +47,7 @@ class DeviceQueue;
 class CommandQueue : public RuntimeObject {
  public:
   static const uint RealTimeDisabled = 0xffffffff;
-  enum class Priority : uint { Normal = 0, Medium, High };
+  enum class Priority : uint { Low = 0, Normal , Medium, High };
 
   struct Properties {
     typedef cl_command_queue_properties value_type;