SWDEV-260345 - Manage constant buffer for blit
- Leverage a managed buffer that uses chunks for the fill pattern. Use a different chunk for the next fill to avoid a wait. Change-Id: I254483c867e112f66564ffd8f55e0a605d8896c9
Этот коммит содержится в:
@@ -774,8 +774,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
|
||||
KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
|
||||
: DmaBlitManager(gpu, setup),
|
||||
program_(nullptr),
|
||||
constantBuffer_(nullptr),
|
||||
constantBufferOffset_(0),
|
||||
xferBufferSize_(0),
|
||||
lockXferOps_("Transfer Ops Lock", true) {
|
||||
for (uint i = 0; i < BlitTotal; ++i) {
|
||||
@@ -799,10 +797,6 @@ KernelBlitManager::~KernelBlitManager() {
|
||||
// Release a dummy context
|
||||
context_->release();
|
||||
}
|
||||
|
||||
if (nullptr != constantBuffer_) {
|
||||
constantBuffer_->release();
|
||||
}
|
||||
}
|
||||
|
||||
bool KernelBlitManager::create(amd::Device& device) {
|
||||
@@ -854,18 +848,6 @@ bool KernelBlitManager::createProgram(Device& device) {
|
||||
result = true;
|
||||
} while (!result);
|
||||
|
||||
// Create an internal constant buffer
|
||||
constantBuffer_ = new (*context_) amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki);
|
||||
// Assign the constant buffer to the current virtual GPU
|
||||
constantBuffer_->setVirtualDevice(&gpu());
|
||||
if ((constantBuffer_ != nullptr) && !constantBuffer_->create(nullptr)) {
|
||||
constantBuffer_->release();
|
||||
constantBuffer_ = nullptr;
|
||||
return false;
|
||||
} else if (constantBuffer_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -2030,14 +2012,7 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
|
||||
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
|
||||
}
|
||||
|
||||
Memory* gpuCB = dev().getRocMemory(constantBuffer_);
|
||||
if (gpuCB == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Find offset in the current constant buffer to allow multiple fills
|
||||
uint32_t constBufOffset = ConstantBufferOffset();
|
||||
auto constBuf = reinterpret_cast<address>(constantBuffer_->getHostMem()) + constBufOffset;
|
||||
auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment);
|
||||
|
||||
// If pattern has been expanded, use the expanded pattern, otherwise use the default pattern.
|
||||
if (packed_obj.pattern_expanded_) {
|
||||
@@ -2045,9 +2020,8 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
|
||||
} else {
|
||||
memcpy(constBuf, pattern, kpattern_size32);
|
||||
}
|
||||
|
||||
mem = as_cl<amd::Memory>(gpuCB->owner());
|
||||
setArgument(kernels_[fillType], 4, sizeof(cl_mem), &mem, constBufOffset);
|
||||
constexpr bool kDirectVa = true;
|
||||
setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
|
||||
|
||||
koffset /= alignment;
|
||||
kpattern_size32 /= alignment;
|
||||
@@ -2127,18 +2101,12 @@ bool KernelBlitManager::fillBuffer2D(device::Memory& memory, const void* pattern
|
||||
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
|
||||
}
|
||||
|
||||
Memory* gpuCB = dev().getRocMemory(constantBuffer_);
|
||||
if (gpuCB == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Find offset in the current constant buffer to allow multiple fills
|
||||
uint32_t constBufOffset = ConstantBufferOffset();
|
||||
auto constBuf = reinterpret_cast<address>(constantBuffer_->getHostMem()) + constBufOffset;
|
||||
// Get constant buffer to allow multiple fills
|
||||
auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment);
|
||||
memcpy(constBuf, pattern, patternSize);
|
||||
|
||||
mem = as_cl<amd::Memory>(gpuCB->owner());
|
||||
setArgument(kernels_[fillType], 4, sizeof(cl_mem), &mem, constBufOffset);
|
||||
constexpr bool kDirectVa = true;
|
||||
setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
|
||||
|
||||
uint64_t mem_origin = static_cast<uint64_t>(origin[0]);
|
||||
uint64_t width = static_cast<uint64_t>(size[0]);
|
||||
|
||||
@@ -486,21 +486,11 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
|
||||
inline void setArgument(amd::Kernel* kernel, size_t index,
|
||||
size_t size, const void* value, size_t offset = 0,
|
||||
const device::Memory* dev_mem = nullptr) const;
|
||||
const device::Memory* dev_mem = nullptr,
|
||||
bool writeVAImmediate = false) const;
|
||||
|
||||
//! Advances the running offset into constantBuffer_ by one kManagedSize slot and returns it.
//! The method is const but updates constantBufferOffset_, which is a mutable member.
//! When the slot after the new offset would run past constantBuffer_->getSize(), it calls
//! gpu().releaseGpuMemoryFence() and restarts from offset 0.
uint32_t ConstantBufferOffset() const {
|
||||
// Make sure it can fit at least 128 bytes for OCL memory fill of double16
|
||||
constexpr uint32_t kManagedSize = 0x80;
|
||||
// Adjust the offset to the new location (incremented before the range check below)
|
||||
constantBufferOffset_ += kManagedSize;
|
||||
// Check if the allocation exceeds the limit
|
||||
if ((constantBufferOffset_ + kManagedSize) > constantBuffer_->getSize()) {
|
||||
// Stall GPU and reset the offset
|
||||
gpu().releaseGpuMemoryFence();
|
||||
constantBufferOffset_ = 0;
|
||||
}
|
||||
return constantBufferOffset_;
|
||||
}
|
||||
static constexpr uint32_t kCBSize = 0x80;
|
||||
static constexpr size_t kCBAlignment = 0x80;
|
||||
|
||||
inline uint32_t NumBlitKernels() {
|
||||
return (dev().info().imageSupport_) ? BlitTotal : BlitLinearTotal;
|
||||
@@ -514,8 +504,6 @@ class KernelBlitManager : public DmaBlitManager {
|
||||
|
||||
amd::Program* program_; //!< GPU program object
|
||||
amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit
|
||||
amd::Memory* constantBuffer_; //!< An internal CB for blits
|
||||
mutable uint32_t constantBufferOffset_; //!< Current offset in the constant buffer
|
||||
size_t xferBufferSize_; //!< Transfer buffer size
|
||||
mutable amd::Monitor lockXferOps_; //!< Lock transfer operation
|
||||
};
|
||||
@@ -531,7 +519,7 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
|
||||
|
||||
inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
|
||||
size_t size, const void* value, size_t offset,
|
||||
const device::Memory* dev_mem) const {
|
||||
const device::Memory* dev_mem, bool writeVAImmediate) const {
|
||||
const amd::KernelParameterDescriptor& desc = kernel->signature().at(index);
|
||||
|
||||
void* param = kernel->parameters().values() + desc.offset_;
|
||||
@@ -548,16 +536,23 @@ inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
|
||||
reinterpret_cast<Memory**>(kernel->parameters().values() +
|
||||
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
|
||||
} else {
|
||||
amd::Memory* mem = as_amd(*static_cast<const cl_mem*>(value));
|
||||
// convert cl_mem to amd::Memory*, return false if invalid.
|
||||
reinterpret_cast<amd::Memory**>(kernel->parameters().values() +
|
||||
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = mem;
|
||||
if (dev_mem == nullptr) {
|
||||
LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>(
|
||||
mem->getDeviceMemory(dev())->virtualAddress()) + offset;
|
||||
amd::Memory* mem = as_amd(*static_cast<const cl_mem*>(value));
|
||||
if (!writeVAImmediate) {
|
||||
reinterpret_cast<amd::Memory**>(kernel->parameters().values() +
|
||||
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = mem;
|
||||
if (dev_mem == nullptr) {
|
||||
LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>(
|
||||
mem->getDeviceMemory(dev())->virtualAddress()) + offset;
|
||||
} else {
|
||||
LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>(
|
||||
dev_mem->virtualAddress()) + offset;
|
||||
}
|
||||
} else {
|
||||
LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>(
|
||||
dev_mem->virtualAddress()) + offset;
|
||||
reinterpret_cast<amd::Memory**>(kernel->parameters().values() +
|
||||
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
|
||||
uintptr_t addr = reinterpret_cast<uintptr_t>(value);
|
||||
LP64_SWITCH(uint32_value, uint64_value) = addr + offset;
|
||||
}
|
||||
}
|
||||
} else if (desc.type_ == T_SAMPLER) {
|
||||
|
||||
@@ -403,6 +403,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Indicates the status of the callback handler. The callback would process the commands
|
||||
//! and would collect profiling data, update refcounts
|
||||
bool isHandlerPending() const { return barriers_.IsHandlerPending(); }
|
||||
|
||||
void* allocKernArg(size_t size, size_t alignment);
|
||||
// } roc OpenCL integration
|
||||
private:
|
||||
//! Dispatches a barrier with blocking HSA signals
|
||||
@@ -427,7 +429,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
bool initPool(size_t kernarg_pool_size);
|
||||
void destroyPool();
|
||||
|
||||
void* allocKernArg(size_t size, size_t alignment);
|
||||
void resetKernArgPool() {
|
||||
kernarg_pool_cur_offset_ = 0;
|
||||
kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;
|
||||
|
||||
Ссылка в новой задаче
Block a user