diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index cf57b6b45d..b759a479ee 100644
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -774,8 +774,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
 KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
     : DmaBlitManager(gpu, setup),
       program_(nullptr),
-      constantBuffer_(nullptr),
-      constantBufferOffset_(0),
       xferBufferSize_(0),
       lockXferOps_("Transfer Ops Lock", true) {
   for (uint i = 0; i < BlitTotal; ++i) {
@@ -799,10 +797,6 @@ KernelBlitManager::~KernelBlitManager() {
     // Release a dummy context
     context_->release();
   }
-
-  if (nullptr != constantBuffer_) {
-    constantBuffer_->release();
-  }
 }
 
 bool KernelBlitManager::create(amd::Device& device) {
@@ -854,18 +848,6 @@ bool KernelBlitManager::createProgram(Device& device) {
     result = true;
   } while (!result);
 
-  // Create an internal constant buffer
-  constantBuffer_ = new (*context_) amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki);
-  // Assign the constant buffer to the current virtual GPU
-  constantBuffer_->setVirtualDevice(&gpu());
-  if ((constantBuffer_ != nullptr) && !constantBuffer_->create(nullptr)) {
-    constantBuffer_->release();
-    constantBuffer_ = nullptr;
-    return false;
-  } else if (constantBuffer_ == nullptr) {
-    return false;
-  }
-
   return result;
 }
 
@@ -2030,14 +2012,7 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
         setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
       }
 
-      Memory* gpuCB = dev().getRocMemory(constantBuffer_);
-      if (gpuCB == nullptr) {
-        return false;
-      }
-
-      // Find offset in the current constant buffer to allow multipel fills
-      uint32_t  constBufOffset = ConstantBufferOffset();
-      auto constBuf = reinterpret_cast<address>(constantBuffer_->getHostMem()) + constBufOffset;
+      auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment);
 
       // If pattern has been expanded, use the expanded pattern, otherwise use the default pattern.
       if (packed_obj.pattern_expanded_) {
@@ -2045,9 +2020,8 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
       } else {
         memcpy(constBuf, pattern, kpattern_size32);
       }
-
-      mem = as_cl<amd::Memory>(gpuCB->owner());
-      setArgument(kernels_[fillType], 4, sizeof(cl_mem), &mem, constBufOffset);
+      constexpr bool kDirectVa = true;
+      setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
 
       koffset /= alignment;
       kpattern_size32 /= alignment;
@@ -2127,18 +2101,12 @@ bool KernelBlitManager::fillBuffer2D(device::Memory& memory, const void* pattern
       setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
     }
 
-    Memory* gpuCB = dev().getRocMemory(constantBuffer_);
-    if (gpuCB == nullptr) {
-      return false;
-    }
-
-    // Find offset in the current constant buffer to allow multipel fills
-    uint32_t  constBufOffset = ConstantBufferOffset();
-    auto constBuf = reinterpret_cast<address>(constantBuffer_->getHostMem()) + constBufOffset;
+    // Get constant buffer to allow multiple fills
+    auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment);
     memcpy(constBuf, pattern, patternSize);
 
-    mem = as_cl<amd::Memory>(gpuCB->owner());
-    setArgument(kernels_[fillType], 4, sizeof(cl_mem), &mem, constBufOffset);
+    constexpr bool kDirectVa = true;
+    setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
 
     uint64_t mem_origin = static_cast<uint64_t>(origin[0]);
     uint64_t width = static_cast<uint64_t>(size[0]);
diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp
index aee3151845..6d9444431d 100644
--- a/rocclr/device/rocm/rocblit.hpp
+++ b/rocclr/device/rocm/rocblit.hpp
@@ -486,21 +486,11 @@ class KernelBlitManager : public DmaBlitManager {
 
   inline void setArgument(amd::Kernel* kernel, size_t index,
                           size_t size, const void* value, size_t offset = 0,
-                          const device::Memory* dev_mem = nullptr) const;
+                          const device::Memory* dev_mem = nullptr,
+                          bool writeVAImmediate = false) const;
 
-  uint32_t ConstantBufferOffset() const {
-    // Make sure it can fit at least 128 bytes for OCL memory fill of double16
-    constexpr uint32_t kManagedSize = 0x80;
-    // Adjust the ofset to the new location
-    constantBufferOffset_ += kManagedSize;
-    // Check if the allocation exceeds the limit
-    if ((constantBufferOffset_ + kManagedSize) > constantBuffer_->getSize()) {
-      // Stall GPU and reset the ofset
-      gpu().releaseGpuMemoryFence();
-      constantBufferOffset_ = 0;
-    }
-    return constantBufferOffset_;
-  }
+  static constexpr uint32_t kCBSize = 0x80;      //!< Fill CB size: 128 bytes, fits OCL double16
+  static constexpr size_t   kCBAlignment = 0x80; //!< Alignment requested for the fill CB
 
   inline uint32_t NumBlitKernels() {
     return (dev().info().imageSupport_) ? BlitTotal : BlitLinearTotal;
@@ -514,8 +504,6 @@ class KernelBlitManager : public DmaBlitManager {
 
   amd::Program* program_;             //!< GPU program object
   amd::Kernel* kernels_[BlitTotal];   //!< GPU kernels for blit
-  amd::Memory* constantBuffer_;       //!< An internal CB for blits
-  mutable uint32_t constantBufferOffset_; //!< Current offset in the constant buffer
   size_t xferBufferSize_;             //!< Transfer buffer size
   mutable amd::Monitor  lockXferOps_; //!< Lock transfer operation
 };
@@ -531,7 +519,7 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
 
 inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
                                            size_t size, const void* value, size_t offset,
-                                           const device::Memory* dev_mem) const {
+                                           const device::Memory* dev_mem, bool writeVAImmediate) const {
   const amd::KernelParameterDescriptor& desc = kernel->signature().at(index);
 
   void* param = kernel->parameters().values() + desc.offset_;
@@ -548,16 +536,23 @@ inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
       reinterpret_cast<Memory**>(kernel->parameters().values() +
         kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
     } else {
-      amd::Memory* mem = as_amd(*static_cast<const cl_mem*>(value));
       // convert cl_mem to amd::Memory*, return false if invalid.
-      reinterpret_cast<amd::Memory**>(kernel->parameters().values() +
-        kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = mem;
-      if (dev_mem == nullptr) {
-        LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>(
-          mem->getDeviceMemory(dev())->virtualAddress()) + offset;
+      amd::Memory* mem = as_amd(*static_cast<const cl_mem*>(value));
+      if (!writeVAImmediate) {
+        reinterpret_cast<amd::Memory**>(kernel->parameters().values() +
+          kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = mem;
+        if (dev_mem == nullptr) {
+          LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>(
+            mem->getDeviceMemory(dev())->virtualAddress()) + offset;
+        } else {
+          LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>(
+            dev_mem->virtualAddress()) + offset;
+        }
       } else {
-        LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>(
-          dev_mem->virtualAddress()) + offset;
+        reinterpret_cast<amd::Memory**>(kernel->parameters().values() +
+          kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
+        uintptr_t addr = reinterpret_cast<uintptr_t>(value);
+        LP64_SWITCH(uint32_value, uint64_value) = addr + offset;
       }
     }
   } else if (desc.type_ == T_SAMPLER) {
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index f935fdce65..67b2648738 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -403,6 +403,8 @@ class VirtualGPU : public device::VirtualDevice {
   //! Indicates the status of the callback handler. The callback would process the commands
   //! and would collect profiling data, update refcounts
   bool isHandlerPending() const { return barriers_.IsHandlerPending(); }
+
+  void* allocKernArg(size_t size, size_t alignment);  //!< Public: used by KernelBlitManager fills
   // } roc OpenCL integration
  private:
   //! Dispatches a barrier with blocking HSA signals
@@ -427,7 +429,6 @@ class VirtualGPU : public device::VirtualDevice {
   bool initPool(size_t kernarg_pool_size);
   void destroyPool();
 
-  void* allocKernArg(size_t size, size_t alignment);
   void resetKernArgPool() {
     kernarg_pool_cur_offset_ = 0;
     kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;