diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index cf57b6b45d..b759a479ee 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -774,8 +774,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_ KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup) : DmaBlitManager(gpu, setup), program_(nullptr), - constantBuffer_(nullptr), - constantBufferOffset_(0), xferBufferSize_(0), lockXferOps_("Transfer Ops Lock", true) { for (uint i = 0; i < BlitTotal; ++i) { @@ -799,10 +797,6 @@ KernelBlitManager::~KernelBlitManager() { // Release a dummy context context_->release(); } - - if (nullptr != constantBuffer_) { - constantBuffer_->release(); - } } bool KernelBlitManager::create(amd::Device& device) { @@ -854,18 +848,6 @@ bool KernelBlitManager::createProgram(Device& device) { result = true; } while (!result); - // Create an internal constant buffer - constantBuffer_ = new (*context_) amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki); - // Assign the constant buffer to the current virtual GPU - constantBuffer_->setVirtualDevice(&gpu()); - if ((constantBuffer_ != nullptr) && !constantBuffer_->create(nullptr)) { - constantBuffer_->release(); - constantBuffer_ = nullptr; - return false; - } else if (constantBuffer_ == nullptr) { - return false; - } - return result; } @@ -2030,14 +2012,7 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr); } - Memory* gpuCB = dev().getRocMemory(constantBuffer_); - if (gpuCB == nullptr) { - return false; - } - - // Find offset in the current constant buffer to allow multipel fills - uint32_t constBufOffset = ConstantBufferOffset(); - auto constBuf = reinterpret_cast
(constantBuffer_->getHostMem()) + constBufOffset; + auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment); // If pattern has been expanded, use the expanded pattern, otherwise use the default pattern. if (packed_obj.pattern_expanded_) { @@ -2045,9 +2020,8 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern } else { memcpy(constBuf, pattern, kpattern_size32); } - - mem = as_cl(gpuCB->owner()); - setArgument(kernels_[fillType], 4, sizeof(cl_mem), &mem, constBufOffset); + constexpr bool kDirectVa = true; + setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa); koffset /= alignment; kpattern_size32 /= alignment; @@ -2127,18 +2101,12 @@ bool KernelBlitManager::fillBuffer2D(device::Memory& memory, const void* pattern setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr); } - Memory* gpuCB = dev().getRocMemory(constantBuffer_); - if (gpuCB == nullptr) { - return false; - } - - // Find offset in the current constant buffer to allow multipel fills - uint32_t constBufOffset = ConstantBufferOffset(); - auto constBuf = reinterpret_cast
(constantBuffer_->getHostMem()) + constBufOffset; + // Get constant buffer to allow multipel fills + auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment); memcpy(constBuf, pattern, patternSize); - mem = as_cl(gpuCB->owner()); - setArgument(kernels_[fillType], 4, sizeof(cl_mem), &mem, constBufOffset); + constexpr bool kDirectVa = true; + setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa); uint64_t mem_origin = static_cast(origin[0]); uint64_t width = static_cast(size[0]); diff --git a/rocclr/device/rocm/rocblit.hpp b/rocclr/device/rocm/rocblit.hpp index aee3151845..6d9444431d 100644 --- a/rocclr/device/rocm/rocblit.hpp +++ b/rocclr/device/rocm/rocblit.hpp @@ -486,21 +486,11 @@ class KernelBlitManager : public DmaBlitManager { inline void setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value, size_t offset = 0, - const device::Memory* dev_mem = nullptr) const; + const device::Memory* dev_mem = nullptr, + bool writeVAImmediate = false) const; - uint32_t ConstantBufferOffset() const { - // Make sure it can fit at least 128 bytes for OCL memory fill of double16 - constexpr uint32_t kManagedSize = 0x80; - // Adjust the ofset to the new location - constantBufferOffset_ += kManagedSize; - // Check if the allocation exceeds the limit - if ((constantBufferOffset_ + kManagedSize) > constantBuffer_->getSize()) { - // Stall GPU and reset the ofset - gpu().releaseGpuMemoryFence(); - constantBufferOffset_ = 0; - } - return constantBufferOffset_; - } + static constexpr uint32_t kCBSize = 0x80; + static constexpr size_t kCBAlignment = 0x80; inline uint32_t NumBlitKernels() { return (dev().info().imageSupport_) ? BlitTotal : BlitLinearTotal; @@ -514,8 +504,6 @@ class KernelBlitManager : public DmaBlitManager { amd::Program* program_; //!< GPU program object amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit - amd::Memory* constantBuffer_; //!< An internal CB for blits - mutable uint32_t constantBufferOffset_; //!< Current offset in the constant buffer size_t xferBufferSize_; //!< Transfer buffer size mutable amd::Monitor lockXferOps_; //!< Lock transfer operation }; @@ -531,7 +519,7 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = { inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value, size_t offset, - const device::Memory* dev_mem) const { + const device::Memory* dev_mem, bool writeVAImmediate) const { const amd::KernelParameterDescriptor& desc = kernel->signature().at(index); void* param = kernel->parameters().values() + desc.offset_; @@ -548,16 +536,23 @@ inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index, reinterpret_cast(kernel->parameters().values() + kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr; } else { - amd::Memory* mem = as_amd(*static_cast(value)); // convert cl_mem to amd::Memory*, return false if invalid. - reinterpret_cast(kernel->parameters().values() + - kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = mem; - if (dev_mem == nullptr) { - LP64_SWITCH(uint32_value, uint64_value) = static_cast( - mem->getDeviceMemory(dev())->virtualAddress()) + offset; + amd::Memory* mem = as_amd(*static_cast(value)); + if (!writeVAImmediate) { + reinterpret_cast(kernel->parameters().values() + + kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = mem; + if (dev_mem == nullptr) { + LP64_SWITCH(uint32_value, uint64_value) = static_cast( + mem->getDeviceMemory(dev())->virtualAddress()) + offset; + } else { + LP64_SWITCH(uint32_value, uint64_value) = static_cast( + dev_mem->virtualAddress()) + offset; + } } else { - LP64_SWITCH(uint32_value, uint64_value) = static_cast( - dev_mem->virtualAddress()) + offset; + reinterpret_cast(kernel->parameters().values() + + kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr; + uintptr_t addr = reinterpret_cast(value); + LP64_SWITCH(uint32_value, uint64_value) = addr + offset; } } } else if (desc.type_ == T_SAMPLER) { diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index f935fdce65..67b2648738 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -403,6 +403,8 @@ class VirtualGPU : public device::VirtualDevice { //! Indicates the status of the callback handler. The callback would process the commands //! and would collect profiling data, update refcounts bool isHandlerPending() const { return barriers_.IsHandlerPending(); } + + void* allocKernArg(size_t size, size_t alignment); // } roc OpenCL integration private: //! Dispatches a barrier with blocking HSA signals @@ -427,7 +429,6 @@ class VirtualGPU : public device::VirtualDevice { bool initPool(size_t kernarg_pool_size); void destroyPool(); - void* allocKernArg(size_t size, size_t alignment); void resetKernArgPool() { kernarg_pool_cur_offset_ = 0; kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;