diff --git a/projects/clr/rocclr/runtime/device/pal/palblit.cpp b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
index e392c018f2..542028f300 100644
--- a/projects/clr/rocclr/runtime/device/pal/palblit.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palblit.cpp
@@ -267,7 +267,7 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
     chunkSize = gpu().xferWrite().MaxSize();
   } else {
     chunkSize = std::min(amd::alignUp(xferSize / 4, 256), gpu().xferWrite().MaxSize());
-    chunkSize = std::max(chunkSize, 128 * Ki);
+    chunkSize = std::max(chunkSize, 64 * Ki);
     bool flushDMA = true;
   }
 
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
index c0dfd536e2..ccd6dfb583 100644
--- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.cpp
@@ -11,32 +11,34 @@ namespace pal {
 
 // ================================================================================================
 ManagedBuffer::ManagedBuffer(VirtualGPU& gpu, uint32_t size)
-    : gpu_(gpu)
-    , buffers_(MaxNumberOfBuffers)
-    , activeBuffer_(0)
-    , size_(size)
-    , wrtOffset_(0)
-    , wrtAddress_(nullptr) {}
+  : gpu_(gpu)
+  , pool_(MaxNumberOfBuffers)
+  , activeBuffer_(0)
+  , size_(size)
+  , wrtOffset_(0)
+  , wrtAddress_(nullptr) {}
 
 // ================================================================================================
 void ManagedBuffer::release() {
-  for (auto it : buffers_) {
-    if ((it != nullptr) && (it->data() != nullptr)) {
-      it->unmap(&gpu_);
+  for (auto it : pool_) {
+    if ((it.buf != nullptr) && (it.buf->data() != nullptr)) {
+      it.buf->unmap(&gpu_);
     }
-    delete it;
+    delete it.buf;
   }
 }
 
 // ================================================================================================
 bool ManagedBuffer::create(Resource::MemoryType type) {
-  for (uint i = 0; i < buffers_.size(); ++i) {
-    buffers_[i] = new Memory(const_cast<pal::Device&>(gpu_.dev()), size_);
-    if (nullptr == buffers_[i] || !buffers_[i]->create(type)) {
+  for (uint i = 0; i < pool_.size(); ++i) {
+    pool_[i].buf = new Memory(const_cast<pal::Device&>(gpu_.dev()), size_);
+    if (nullptr == pool_[i].buf || !pool_[i].buf->create(type)) {
       LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_);
       return false;
     }
-    void* wrtAddress = buffers_[i]->map(&gpu_);
+    // Assign virtual gpu to the allocation. Buffer will be used only on a particular queue
+    pool_[i].buf->memRef()->gpu_ = &gpu_;
+    void* wrtAddress = pool_[i].buf->map(&gpu_);
     if (wrtAddress == nullptr) {
         LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
         return false;
@@ -45,9 +47,9 @@ bool ManagedBuffer::create(Resource::MemoryType type) {
     uint dummy = 0;
     static constexpr bool Wait = true;
     // Write 0 for the buffer paging by VidMM
-    buffers_[i]->writeRawData(gpu_, 0, sizeof(dummy), &dummy, Wait);
+    pool_[i].buf->writeRawData(gpu_, 0, sizeof(dummy), &dummy, Wait);
   }
-  wrtAddress_ = buffers_[activeBuffer_]->data();
+  wrtAddress_ = pool_[activeBuffer_].buf->data();
   return true;
 }
 
@@ -59,18 +61,22 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) {
   // Align reserve size on the vector's boundary
   uint32_t count = amd::alignUp(size, MemAlignment);
 
+  // Save previous event
+  pinGpuEvent();
+
   // Check if buffer has enough space for reservation
   if ((wrtOffset_ + count) > size_) {
     // Get the next buffer in the list
     ++activeBuffer_;
     activeBuffer_ %= MaxNumberOfBuffers;
     // Make sure the buffer isn't busy
-    buffers_[activeBuffer_]->wait(gpu_);
-    wrtAddress_ = buffers_[activeBuffer_]->data();
+    gpu().waitForEvent(&pool_[activeBuffer_].events[SdmaEngine]);
+    gpu().waitForEvent(&pool_[activeBuffer_].events[MainEngine]);
+    wrtAddress_ = pool_[activeBuffer_].buf->data();
     wrtOffset_ = 0;
   }
 
-  *gpu_address = buffers_[activeBuffer_]->vmAddress() + wrtOffset_;
+  *gpu_address = pool_[activeBuffer_].buf->vmAddress() + wrtOffset_;
   address cpu_address = wrtAddress_ + wrtOffset_;
 
   // Adjust the offset by the reserved size
@@ -80,23 +86,17 @@ address ManagedBuffer::reserve(uint32_t size, uint64_t* gpu_address) {
 }
 
 // ================================================================================================
-Memory& ManagedBuffer::reserveAtTheTop(uint32_t size)
-{
-  // Get the next buffer in the list
-  ++activeBuffer_;
-  activeBuffer_ %= MaxNumberOfBuffers;
-  // Make sure the buffer isn't busy
-  buffers_[activeBuffer_]->wait(gpu_);
-  wrtAddress_ = buffers_[activeBuffer_]->data();
-  wrtOffset_ = 0;
-  return *buffers_[activeBuffer_];
+void ManagedBuffer::pinGpuEvent() {
+  GpuEvent* event = activeMemory()->getGpuEvent(gpu());
+  pool_[activeBuffer_].events[event->engineId_] = *event;
+  activeMemory()->setBusy(gpu(), GpuEvent::InvalidID);
 }
 
 // ================================================================================================
 ConstantBuffer::ConstantBuffer(ManagedBuffer& mbuf, uint32_t size)
-    : mbuf_(mbuf)
-    , sys_mem_copy_(nullptr)
-    , size_(size)
+  : mbuf_(mbuf)
+  , sys_mem_copy_(nullptr)
+  , size_(size)
 {}
 
 // ================================================================================================
@@ -127,11 +127,11 @@ uint64_t ConstantBuffer::UploadDataToHw(uint32_t size) const {
 
 // ================================================================================================
 uint64_t ConstantBuffer::UploadDataToHw(const void* sysmem, uint32_t size) const {
-    uint64_t  vm_address;
-    address   cpu_address = mbuf_.reserve(size, &vm_address);
-    // Update memory with new CB data
-    memcpy(cpu_address, sysmem, size);
-    return vm_address;
+  uint64_t  vm_address;
+  address   cpu_address = mbuf_.reserve(size, &vm_address);
+  // Update memory with new CB data
+  memcpy(cpu_address, sysmem, size);
+  return vm_address;
 }
 
 // ================================================================================================
diff --git a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
index da1984f636..c1853b0537 100644
--- a/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palconstbuf.hpp
@@ -30,9 +30,6 @@ class ManagedBuffer : public amd::EmbeddedObject {
   address reserve(uint32_t size,  //!< real data size for upload
                   uint64_t* gpu_address);
 
-  //! Reserves memory at the top of the active buffer
-  Memory& reserveAtTheTop(uint32_t size);
-
   //! Returns CB size
   uint32_t size() const { return size_; }
 
@@ -40,14 +37,23 @@ class ManagedBuffer : public amd::EmbeddedObject {
   uint32_t wrtOffset() const { return wrtOffset_; }
 
   //! Returns active GPU buffer
-  Memory* activeMemory() const { return buffers_[activeBuffer_]; }
+  Memory* activeMemory() const { return pool_[activeBuffer_].buf; }
 
-  uint64_t vmAddress() const { return buffers_[activeBuffer_]->vmAddress(); }
+  //! Returns VM address for the active buffer
+  uint64_t vmAddress() const { return pool_[activeBuffer_].buf->vmAddress(); }
+
+  //! Update the timestamp for the HW operation
+  void pinGpuEvent();
 
   //! Returns VirtualGPU object this managed resource associated
   VirtualGPU& gpu() const { return gpu_; }
 
  private:
+  struct TimeStampedBuffer {
+    Memory*   buf;
+    GpuEvent  events[AllEngines];
+  };
+
   //! The maximum number of the managed buffers
   static constexpr uint32_t MaxNumberOfBuffers = 3;
 
@@ -58,7 +64,7 @@ class ManagedBuffer : public amd::EmbeddedObject {
   ManagedBuffer& operator=(const ManagedBuffer&) = delete;
 
   VirtualGPU& gpu_;                 //!< Virtual GPU object
-  std::vector<Memory*>  buffers_;   //!< Buffers for management
+  std::vector<TimeStampedBuffer>  pool_;   //!< Buffers for management
   uint32_t  activeBuffer_;          //!< Current active buffer
   uint32_t  size_;                  //!< Constant buffer size
   uint32_t  wrtOffset_;             //!< Current write offset
diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
index 6825c8b8c5..fbb95bcc80 100644
--- a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
@@ -145,6 +145,7 @@ class Memory : public device::Memory, public Resource {
   //! Quick view update for managed buffers. It should avoid expensive object allocations
   void updateView(Resource* view, size_t offset, size_t size) {
     size_ = size;
+    flags_ |= HostMemoryDirectAccess;
     Resource::updateView(view, offset, size);
   }
 
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
index a838f2673a..3ae73d7ec6 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
@@ -378,6 +378,7 @@ class Resource : public amd::HeapObject {
       memRef_->retain();
       desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) /
         Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
+      setBusy(*memRef()->gpu_, GpuEvent::InvalidID);
     }
   }