diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp
index cf57b6b45d..b759a479ee 100644
--- a/rocclr/device/rocm/rocblit.cpp
+++ b/rocclr/device/rocm/rocblit.cpp
@@ -774,8 +774,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
: DmaBlitManager(gpu, setup),
program_(nullptr),
- constantBuffer_(nullptr),
- constantBufferOffset_(0),
xferBufferSize_(0),
lockXferOps_("Transfer Ops Lock", true) {
for (uint i = 0; i < BlitTotal; ++i) {
@@ -799,10 +797,6 @@ KernelBlitManager::~KernelBlitManager() {
// Release a dummy context
context_->release();
}
-
- if (nullptr != constantBuffer_) {
- constantBuffer_->release();
- }
}
bool KernelBlitManager::create(amd::Device& device) {
@@ -854,18 +848,6 @@ bool KernelBlitManager::createProgram(Device& device) {
result = true;
} while (!result);
- // Create an internal constant buffer
- constantBuffer_ = new (*context_) amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki);
- // Assign the constant buffer to the current virtual GPU
- constantBuffer_->setVirtualDevice(&gpu());
- if ((constantBuffer_ != nullptr) && !constantBuffer_->create(nullptr)) {
- constantBuffer_->release();
- constantBuffer_ = nullptr;
- return false;
- } else if (constantBuffer_ == nullptr) {
- return false;
- }
-
return result;
}
@@ -2030,14 +2012,7 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
}
- Memory* gpuCB = dev().getRocMemory(constantBuffer_);
- if (gpuCB == nullptr) {
- return false;
- }
-
- // Find offset in the current constant buffer to allow multipel fills
- uint32_t constBufOffset = ConstantBufferOffset();
- auto constBuf = reinterpret_cast
(constantBuffer_->getHostMem()) + constBufOffset;
+ auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment);
// If pattern has been expanded, use the expanded pattern, otherwise use the default pattern.
if (packed_obj.pattern_expanded_) {
@@ -2045,9 +2020,8 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
} else {
memcpy(constBuf, pattern, kpattern_size32);
}
-
- mem = as_cl(gpuCB->owner());
- setArgument(kernels_[fillType], 4, sizeof(cl_mem), &mem, constBufOffset);
+ constexpr bool kDirectVa = true;
+ setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
koffset /= alignment;
kpattern_size32 /= alignment;
@@ -2127,18 +2101,12 @@ bool KernelBlitManager::fillBuffer2D(device::Memory& memory, const void* pattern
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
}
- Memory* gpuCB = dev().getRocMemory(constantBuffer_);
- if (gpuCB == nullptr) {
- return false;
- }
-
- // Find offset in the current constant buffer to allow multipel fills
- uint32_t constBufOffset = ConstantBufferOffset();
- auto constBuf = reinterpret_cast(constantBuffer_->getHostMem()) + constBufOffset;
+ // Get constant buffer to allow multiple fills
+ auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment);
memcpy(constBuf, pattern, patternSize);
- mem = as_cl(gpuCB->owner());
- setArgument(kernels_[fillType], 4, sizeof(cl_mem), &mem, constBufOffset);
+ constexpr bool kDirectVa = true;
+ setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
uint64_t mem_origin = static_cast(origin[0]);
uint64_t width = static_cast(size[0]);
diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp
index aee3151845..6d9444431d 100644
--- a/rocclr/device/rocm/rocblit.hpp
+++ b/rocclr/device/rocm/rocblit.hpp
@@ -486,21 +486,11 @@ class KernelBlitManager : public DmaBlitManager {
inline void setArgument(amd::Kernel* kernel, size_t index,
size_t size, const void* value, size_t offset = 0,
- const device::Memory* dev_mem = nullptr) const;
+ const device::Memory* dev_mem = nullptr,
+ bool writeVAImmediate = false) const;
- uint32_t ConstantBufferOffset() const {
- // Make sure it can fit at least 128 bytes for OCL memory fill of double16
- constexpr uint32_t kManagedSize = 0x80;
- // Adjust the ofset to the new location
- constantBufferOffset_ += kManagedSize;
- // Check if the allocation exceeds the limit
- if ((constantBufferOffset_ + kManagedSize) > constantBuffer_->getSize()) {
- // Stall GPU and reset the ofset
- gpu().releaseGpuMemoryFence();
- constantBufferOffset_ = 0;
- }
- return constantBufferOffset_;
- }
+ static constexpr uint32_t kCBSize = 0x80;
+ static constexpr size_t kCBAlignment = 0x80;
inline uint32_t NumBlitKernels() {
return (dev().info().imageSupport_) ? BlitTotal : BlitLinearTotal;
@@ -514,8 +504,6 @@ class KernelBlitManager : public DmaBlitManager {
amd::Program* program_; //!< GPU program object
amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit
- amd::Memory* constantBuffer_; //!< An internal CB for blits
- mutable uint32_t constantBufferOffset_; //!< Current offset in the constant buffer
size_t xferBufferSize_; //!< Transfer buffer size
mutable amd::Monitor lockXferOps_; //!< Lock transfer operation
};
@@ -531,7 +519,7 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
size_t size, const void* value, size_t offset,
- const device::Memory* dev_mem) const {
+ const device::Memory* dev_mem, bool writeVAImmediate) const {
const amd::KernelParameterDescriptor& desc = kernel->signature().at(index);
void* param = kernel->parameters().values() + desc.offset_;
@@ -548,16 +536,23 @@ inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
reinterpret_cast(kernel->parameters().values() +
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
} else {
- amd::Memory* mem = as_amd(*static_cast(value));
// convert cl_mem to amd::Memory*, return false if invalid.
- reinterpret_cast(kernel->parameters().values() +
- kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = mem;
- if (dev_mem == nullptr) {
- LP64_SWITCH(uint32_value, uint64_value) = static_cast(
- mem->getDeviceMemory(dev())->virtualAddress()) + offset;
+ amd::Memory* mem = as_amd(*static_cast(value));
+ if (!writeVAImmediate) {
+ reinterpret_cast(kernel->parameters().values() +
+ kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = mem;
+ if (dev_mem == nullptr) {
+ LP64_SWITCH(uint32_value, uint64_value) = static_cast(
+ mem->getDeviceMemory(dev())->virtualAddress()) + offset;
+ } else {
+ LP64_SWITCH(uint32_value, uint64_value) = static_cast(
+ dev_mem->virtualAddress()) + offset;
+ }
} else {
- LP64_SWITCH(uint32_value, uint64_value) = static_cast(
- dev_mem->virtualAddress()) + offset;
+ reinterpret_cast(kernel->parameters().values() +
+ kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
+ uintptr_t addr = reinterpret_cast(value);
+ LP64_SWITCH(uint32_value, uint64_value) = addr + offset;
}
}
} else if (desc.type_ == T_SAMPLER) {
diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp
index f935fdce65..67b2648738 100644
--- a/rocclr/device/rocm/rocvirtual.hpp
+++ b/rocclr/device/rocm/rocvirtual.hpp
@@ -403,6 +403,8 @@ class VirtualGPU : public device::VirtualDevice {
//! Indicates the status of the callback handler. The callback would process the commands
//! and would collect profiling data, update refcounts
bool isHandlerPending() const { return barriers_.IsHandlerPending(); }
+
+ void* allocKernArg(size_t size, size_t alignment);
// } roc OpenCL integration
private:
//! Dispatches a barrier with blocking HSA signals
@@ -427,7 +429,6 @@ class VirtualGPU : public device::VirtualDevice {
bool initPool(size_t kernarg_pool_size);
void destroyPool();
- void* allocKernArg(size_t size, size_t alignment);
void resetKernArgPool() {
kernarg_pool_cur_offset_ = 0;
kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;