diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp
index 3e9f96b2ef..fc30c0f0db 100644
--- a/projects/clr/hipamd/src/hip_memory.cpp
+++ b/projects/clr/hipamd/src/hip_memory.cpp
@@ -599,7 +599,9 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin
     return hipSuccess;
   } else if (((srcMemory == nullptr) && (dstMemory != nullptr)) ||
              ((srcMemory != nullptr) && (dstMemory == nullptr))) {
-    isHostAsync = false;
+    // Don't wait for unpinned H2D copy if staging is used for copy
+    isHostAsync &= ((srcMemory == nullptr) && (dstMemory != nullptr) && AMD_DIRECT_DISPATCH &&
+      (sizeBytes <= stream.device().settings().stagedXferSize_)) ? true : false;
   } else if (srcMemory->GetDeviceById() == dstMemory->GetDeviceById()) {
     hipMemoryType srcMemoryType = ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) &
         srcMemory->getMemFlags())? hipMemoryTypeHost : hipMemoryTypeDevice;
diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index c995e5e85e..ef82630325 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -707,6 +707,8 @@ class Settings : public amd::HeapObject {
   //! Enable the specified extension
   void enableExtension(uint name) { extensions_ |= static_cast<uint64_t>(1) << name; }
 
+  size_t stagedXferSize_ = 0;     //!< Staged buffer size
+
  private:
   //! Disable copy constructor
   Settings(const Settings&);
diff --git a/projects/clr/rocclr/device/pal/palsettings.hpp b/projects/clr/rocclr/device/pal/palsettings.hpp
index bd8aac9a4f..cb270ad5c6 100644
--- a/projects/clr/rocclr/device/pal/palsettings.hpp
+++ b/projects/clr/rocclr/device/pal/palsettings.hpp
@@ -98,7 +98,6 @@ class Settings : public device::Settings {
   uint hostMemDirectAccess_;     //!< Enables direct access to the host memory
   uint numScratchWavesPerCu_;    //!< Maximum number of waves when scratch is enabled
   size_t xferBufSize_;           //!< Transfer buffer size for image copy optimization
-  size_t stagedXferSize_;        //!< Staged buffer size
   size_t pinnedXferSize_;        //!< Pinned buffer size for transfer
   size_t pinnedMinXferSize_;     //!< Minimal buffer size for pinned transfer
   size_t cpDmaCopySizeMax_;      //!< Threshold for CP DMA path in copy
diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp
index 132fe7d2fb..84f659d44e 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.cpp
@@ -217,11 +217,10 @@ bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
 }
 
 // ================================================================================================
-bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, Memory& xferBuf,
+bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, address staging,
                                        size_t origin, size_t& offset, size_t& totalSize,
                                        size_t xferSize) const {
   address dst = dstMemory.getDeviceMemory();
-  address staging = xferBuf.getDeviceMemory();
 
   // Copy data from host to device
   dst += origin + offset;
@@ -308,16 +307,15 @@ bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
     }
 
     if (dstSize != 0) {
-      Memory& xferBuf = dev().xferWrite().acquire();
+      address staging = gpu().Staging().Acquire(
+        std::min(dstSize, dev().settings().stagedXferSize_));
 
       // Write memory using a staging resource
-      if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), xferBuf, origin[0], offset, dstSize,
+      if (!writeMemoryStaged(srcHost, gpuMem(dstMemory), staging, origin[0], offset, dstSize,
                              dstSize)) {
         LogError("DmaBlitManager::writeBuffer failed!");
         return false;
       }
-
-      gpu().addXferWrite(xferBuf);
     }
   }
 
@@ -338,8 +336,8 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
     return HostBlitManager::writeBufferRect(srcHost, dstMemory, hostRect, bufRect, size, entire,
                                             copyMetadata);
   } else {
-    Memory& xferBuf = dev().xferWrite().acquire();
-    address staging = xferBuf.getDeviceMemory();
+    address staging = gpu().Staging().Acquire(
+      std::min(size[0], dev().settings().stagedXferSize_));
     address dst = static_cast<roc::Memory&>(dstMemory).getDeviceMemory();
 
     size_t srcOffset;
@@ -358,7 +356,6 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
         }
       }
     }
-    gpu().addXferWrite(xferBuf);
   }
 
   return true;
@@ -780,7 +777,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
 bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
                                    address staging, bool hostToDev) const {
   // Stall GPU, sicne CPU copy is possible
-  gpu().releaseGpuMemoryFence();
+  gpu().releaseGpuMemoryFence(hostToDev);
 
   // No allocation is necessary for Full Profile
   hsa_status_t status;
@@ -826,8 +823,11 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
         LogPrintfError("Hsa copy from host to device failed with code %d", status);
         return false;
       }
-      gpu().Barriers().WaitCurrent();
       totalSize -= size;
+      if (totalSize > 0) {
+        // Wait if there are extra copies, which don't fit in a single staging buffer
+        gpu().Barriers().WaitCurrent();
+      }
       offset += size;
       continue;
     }
diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp
index f01ce40bdf..537fb4f628 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.hpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.hpp
@@ -261,7 +261,7 @@ class DmaBlitManager : public device::HostBlitManager {
   //! Write into video memory, using a staged buffer
   bool writeMemoryStaged(const void* srcHost,  //!< Source host memory
                          Memory& dstMemory,    //!< Destination memory object
-                         Memory& xferBuf,      //!< Staged buffer for write
+                         address staging,      //!< Staged buffer for write
                          size_t origin,        //!< Original offset in the destination memory
                          size_t& offset,       //!< Offset for the current copy pointer
                          size_t& totalSize,    //!< Total size for the copy region
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index dd4f37b091..e2a82d8c71 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -179,7 +179,6 @@ Device::Device(hsa_agent_t bkendDevice)
     , alloc_granularity_(0)
     , xferQueue_(nullptr)
     , xferRead_(nullptr)
-    , xferWrite_(nullptr)
     , freeMem_(0)
     , vgpusAccess_(true) /* Virtual GPU List Ops Lock */
     , hsa_exclusive_gpu_access_(false)
@@ -290,7 +289,6 @@ Device::~Device() {
 
   // Destroy temporary buffers for read/write
   delete xferRead_;
-  delete xferWrite_;
 
   // Destroy transfer queue
   delete xferQueue_;
@@ -823,15 +821,6 @@ bool Device::create() {
   mapCache_->push_back(nullptr);
 
   if (settings().stagedXferSize_ != 0) {
-    // Initialize staged write buffers
-    if (settings().stagedXferWrite_) {
-      xferWrite_ = new XferBuffers(*this, amd::alignUp(settings().stagedXferSize_, 4 * Ki));
-      if ((xferWrite_ == nullptr) || !xferWrite_->create()) {
-        LogError("Couldn't allocate transfer buffer objects for read");
-        return false;
-      }
-    }
-
     // Initialize staged read buffers
     if (settings().stagedXferRead_) {
       xferRead_ = new XferBuffers(*this, amd::alignUp(settings().stagedXferSize_, 4 * Ki));
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index 7111fcec14..6417c9d881 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -520,9 +520,6 @@ class Device : public NullDevice {
   //! Adds a map target to the cache
   bool addMapTarget(amd::Memory* memory) const;
 
-  //! Returns transfer buffer object
-  XferBuffers& xferWrite() const { return *xferWrite_; }
-
   //! Returns transfer buffer object
   XferBuffers& xferRead() const { return *xferRead_; }
 
@@ -653,7 +650,6 @@ class Device : public NullDevice {
   VirtualGPU* xferQueue_;  //!< Transfer queue, created on demand
 
   XferBuffers* xferRead_;   //!< Transfer buffers read
-  XferBuffers* xferWrite_;  //!< Transfer buffers write
   std::atomic<size_t> freeMem_;   //!< Total of free memory available
   mutable amd::Monitor vgpusAccess_;     //!< Lock to serialise virtual gpu list access
   bool hsa_exclusive_gpu_access_;  //!< TRUE if current device was moved into exclusive GPU access mode
diff --git a/projects/clr/rocclr/device/rocm/rocsettings.hpp b/projects/clr/rocclr/device/rocm/rocsettings.hpp
index 525cfefc38..b15b8cbe64 100644
--- a/projects/clr/rocclr/device/rocm/rocsettings.hpp
+++ b/projects/clr/rocclr/device/rocm/rocsettings.hpp
@@ -68,7 +68,6 @@ class Settings : public device::Settings {
   uint numWaitEvents_;        //!< The number of wait events for device enqueue
 
   size_t xferBufSize_;        //!< Transfer buffer size for image copy optimization
-  size_t stagedXferSize_;     //!< Staged buffer size
   size_t pinnedXferSize_;     //!< Pinned buffer size for transfer
   size_t pinnedMinXferSize_;  //!< Minimal buffer size for pinned transfer
 
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 4f0a93348b..ea3716b4fb 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -1065,7 +1065,7 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal,
   fence_dirty_ = true;
   auto cache_state = extractAqlBits(packetHeader, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
                          HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE);
-  if (!skipSignal) {
+  if (!skipSignal && (signal.handle == 0)) {
     // Get active signal for current dispatch if profiling is necessary
     barrier_packet_.completion_signal =
       Barriers().ActiveSignal(kInitSignalValueOne, timestamp_);
@@ -1188,9 +1188,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
 
 // ================================================================================================
 void VirtualGPU::ResetQueueStates() {
-  // Release all transfer buffers on this command queue
-  releaseXferWrite();
-
   // Release all memory dependencies
   memoryDependency().clear();
 
@@ -1234,6 +1231,7 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
       schedulerSignal_({0}),
       barriers_(*this),
       kernarg_pool_signal_(KernelArgPoolNumSignal),
+      managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
       cuMask_(cuMask),
       priority_(priority),
       copy_command_type_(0),
@@ -1385,9 +1383,77 @@ bool VirtualGPU::create() {
     LogError("Could not create signal for copy queue!");
     return false;
   }
+  // Create managed buffer for staging copies
+  if (!managed_buffer_.Create()) {
+    LogError("Could not create managed buffer for this queue!");
+    return false;
+  }
   return true;
 }
 
+// ================================================================================================
+VirtualGPU::ManagedBuffer::~ManagedBuffer() {
+  for (auto& it : pool_signal_) {
+    if (it.handle != 0) {
+      hsa_signal_destroy(it);
+    }
+  }
+  if (pool_base_ != nullptr) {
+    gpu_.dev().hostFree(pool_base_, pool_size_);
+  }
+}
+
+// ================================================================================================
+bool VirtualGPU::ManagedBuffer::Create() {
+  pool_chunk_end_ = pool_size_ / kPoolNumSignals;
+  active_chunk_ = 0;
+  // Allocate memory for managed buffer
+  pool_base_ = reinterpret_cast<address>(
+    gpu_.dev().hostAlloc(pool_size_, 0, Device::MemorySegment::kNoAtomics));
+  if (pool_base_ == nullptr) {
+    return false;
+  }
+  hsa_agent_t agent = gpu_.dev().getBackendDevice();
+  for (auto& it : pool_signal_) {
+    if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 1, &agent, &it)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// ================================================================================================
+address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) {
+  auto alignment = gpu_.dev().info().globalMemCacheLineSize_;
+  address result = nullptr;
+  result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
+  const size_t pool_new_usage = (result + size) - pool_base_;
+  if (pool_new_usage <= pool_chunk_end_) {
+    pool_cur_offset_ = pool_new_usage;
+    return result;
+  } else {
+    // Reset the signal for the barrier packet
+    hsa_signal_silent_store_relaxed(pool_signal_[active_chunk_], kInitSignalValueOne);
+    // Currently don't skip wait signal check, because SDMA engine cna be used in staging copy
+    constexpr bool kSkipSignal = false;
+    // Dispatch a barrier packet into the queue
+    gpu_.dispatchBarrierPacket(kBarrierPacketHeader, kSkipSignal, pool_signal_[active_chunk_]);
+    // Get the next chunk
+    active_chunk_ = ++active_chunk_ % kPoolNumSignals;
+    // Make sure the new active chunk is free
+    bool test = WaitForSignal(pool_signal_[active_chunk_], gpu_.ActiveWait());
+    assert(test && "Runtime can't fail a wait for chunk!");
+    // Make sure the current offset matches the new chunk to avoid possible overlaps
+    // between chunks and issues during recycle
+    pool_cur_offset_ = (active_chunk_ == 0) ? 0 : pool_chunk_end_;
+    pool_chunk_end_ = pool_cur_offset_ + pool_size_ / kPoolNumSignals;
+    result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
+    pool_cur_offset_ = (result + size) - pool_base_;
+  }
+
+  return result;
+}
+
 // ================================================================================================
 bool VirtualGPU::initPool(size_t kernarg_pool_size) {
   kernarg_pool_size_ = kernarg_pool_size;
@@ -3562,28 +3628,6 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
   releasePinnedMem();
 }
 
-// ================================================================================================
-void VirtualGPU::addXferWrite(Memory& memory) {
-  //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
-  //!        unconditionally, before it can release pinned memory
-  releaseGpuMemoryFence();
-  if (xferWriteBuffers_.size() > 7) {
-    dev().xferWrite().release(*this, *xferWriteBuffers_.front());
-    xferWriteBuffers_.erase(xferWriteBuffers_.begin());
-  }
-
-  // Delay destruction
-  xferWriteBuffers_.push_back(&memory);
-}
-
-// ================================================================================================
-void VirtualGPU::releaseXferWrite() {
-  for (auto& memory : xferWriteBuffers_) {
-    dev().xferWrite().release(*this, *memory);
-  }
-  xferWriteBuffers_.resize(0);
-}
-
 // ================================================================================================
 void VirtualGPU::addPinnedMem(amd::Memory* mem) {
   //! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index 35b0fe7b05..3aeebe02b4 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -185,6 +185,31 @@ class Timestamp : public amd::ReferenceCountedObject {
 
 class VirtualGPU : public device::VirtualDevice {
  public:
+  class ManagedBuffer : public amd::EmbeddedObject {
+  public:
+    //! The number of chunks the arg pool will be divided
+    static constexpr uint32_t kPoolNumSignals = 4;
+    ManagedBuffer(VirtualGPU& gpu, uint32_t pool_size)
+      : gpu_(gpu)
+      , pool_size_(pool_size)
+      , pool_signal_(kPoolNumSignals) {}
+    ~ManagedBuffer();
+
+    //! Allocates all necessary resources to manage memory
+    bool Create();
+
+    //! Acquires memory for use on the gpu
+    address Acquire(uint32_t size);
+
+  private:
+    VirtualGPU& gpu_;                 //!< Queue object for ROCm device
+    address   pool_base_ = nullptr;   //!< Memory pool base address
+    uint32_t  pool_size_;             //!< Memory pool base size
+    uint32_t  pool_chunk_end_ = 0;    //!< The end offset of the current chunk
+    uint32_t  active_chunk_ = 0;      //!< The index of the current active chunk
+    uint32_t  pool_cur_offset_ = 0;   //!< Current active offset for update
+    std::vector<hsa_signal_t> pool_signal_; //!< Pool of HSA signals to manage multiple chunks
+  };
   class MemoryDependency : public amd::EmbeddedObject {
    public:
     //! Default constructor
@@ -386,11 +411,8 @@ class VirtualGPU : public device::VirtualDevice {
                          std::vector<device::Memory*>& wrtBackImageBuffer //!< Images for writeback
                          );
 
-  //! Adds a stage write buffer into a list
-  void addXferWrite(Memory& memory);
-
-  //! Releases stage write buffers
-  void releaseXferWrite();
+  //! Returns a managed buffer for staging copies
+  ManagedBuffer& Staging() { return managed_buffer_; }
 
   //! Adds a pinned memory object into a map
   void addPinnedMem(amd::Memory* mem);
@@ -422,6 +444,7 @@ class VirtualGPU : public device::VirtualDevice {
 
   void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
   uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
+
   // } roc OpenCL integration
  private:
   //! Dispatches a barrier with blocking HSA signals
@@ -437,10 +460,10 @@ class VirtualGPU : public device::VirtualDevice {
   template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
                                                               uint16_t rest, bool blocking);
 
-  void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
-                             hsa_signal_t signal = hsa_signal_t{0});
   bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
                                 bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
+  void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
+                             hsa_signal_t signal = hsa_signal_t{0});
   void dispatchBarrierValuePacket(uint16_t packetHeader,
                                   bool resolveDepSignal = false,
                                   hsa_signal_t signal = hsa_signal_t{0},
@@ -499,7 +522,6 @@ class VirtualGPU : public device::VirtualDevice {
   //! Resets the current queue state. Note: should be called after AQL queue becomes idle
   void ResetQueueStates();
 
-  std::vector<Memory*> xferWriteBuffers_;  //!< Stage write buffers
   std::vector<amd::Memory*> pinnedMems_;   //!< Pinned memory list
 
   //! Queue state flags
@@ -549,6 +571,8 @@ class VirtualGPU : public device::VirtualDevice {
   std::vector<hsa_signal_t> kernarg_pool_signal_; //!< Pool of HSA signals to manage
                                                   //!< multiple chunks
 
+  ManagedBuffer managed_buffer_;  //!< Memory manager for staging copies
+
   friend class Timestamp;
 
   //  PM4 packet for gfx8 performance counter