SWDEV-260345 - Manage constant buffer for blit

- Leverage a managed buffer that hands out chunks for the fill pattern. Use a
different chunk for the next fill to avoid a wait.

Change-Id: I254483c867e112f66564ffd8f55e0a605d8896c9
Этот коммит содержится в:
Saleel Kudchadker
2022-07-11 16:16:07 -07:00
родитель 46b9430a4e
Коммит 175ad024d3
3 изменённых файлов: 29 добавлений и 65 удалений
+7 -39
Просмотреть файл
@@ -774,8 +774,6 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
: DmaBlitManager(gpu, setup),
program_(nullptr),
constantBuffer_(nullptr),
constantBufferOffset_(0),
xferBufferSize_(0),
lockXferOps_("Transfer Ops Lock", true) {
for (uint i = 0; i < BlitTotal; ++i) {
@@ -799,10 +797,6 @@ KernelBlitManager::~KernelBlitManager() {
// Release a dummy context
context_->release();
}
if (nullptr != constantBuffer_) {
constantBuffer_->release();
}
}
bool KernelBlitManager::create(amd::Device& device) {
@@ -854,18 +848,6 @@ bool KernelBlitManager::createProgram(Device& device) {
result = true;
} while (!result);
// Create an internal constant buffer
constantBuffer_ = new (*context_) amd::Buffer(*context_, CL_MEM_ALLOC_HOST_PTR, 4 * Ki);
// Assign the constant buffer to the current virtual GPU
constantBuffer_->setVirtualDevice(&gpu());
if ((constantBuffer_ != nullptr) && !constantBuffer_->create(nullptr)) {
constantBuffer_->release();
constantBuffer_ = nullptr;
return false;
} else if (constantBuffer_ == nullptr) {
return false;
}
return result;
}
@@ -2030,14 +2012,7 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
}
Memory* gpuCB = dev().getRocMemory(constantBuffer_);
if (gpuCB == nullptr) {
return false;
}
// Find offset in the current constant buffer to allow multiple fills
uint32_t constBufOffset = ConstantBufferOffset();
auto constBuf = reinterpret_cast<address>(constantBuffer_->getHostMem()) + constBufOffset;
auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment);
// If pattern has been expanded, use the expanded pattern, otherwise use the default pattern.
if (packed_obj.pattern_expanded_) {
@@ -2045,9 +2020,8 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
} else {
memcpy(constBuf, pattern, kpattern_size32);
}
mem = as_cl<amd::Memory>(gpuCB->owner());
setArgument(kernels_[fillType], 4, sizeof(cl_mem), &mem, constBufOffset);
constexpr bool kDirectVa = true;
setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
koffset /= alignment;
kpattern_size32 /= alignment;
@@ -2127,18 +2101,12 @@ bool KernelBlitManager::fillBuffer2D(device::Memory& memory, const void* pattern
setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
}
Memory* gpuCB = dev().getRocMemory(constantBuffer_);
if (gpuCB == nullptr) {
return false;
}
// Find offset in the current constant buffer to allow multiple fills
uint32_t constBufOffset = ConstantBufferOffset();
auto constBuf = reinterpret_cast<address>(constantBuffer_->getHostMem()) + constBufOffset;
// Get constant buffer to allow multiple fills
auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment);
memcpy(constBuf, pattern, patternSize);
mem = as_cl<amd::Memory>(gpuCB->owner());
setArgument(kernels_[fillType], 4, sizeof(cl_mem), &mem, constBufOffset);
constexpr bool kDirectVa = true;
setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
uint64_t mem_origin = static_cast<uint64_t>(origin[0]);
uint64_t width = static_cast<uint64_t>(size[0]);
+20 -25
Просмотреть файл
@@ -486,21 +486,11 @@ class KernelBlitManager : public DmaBlitManager {
inline void setArgument(amd::Kernel* kernel, size_t index,
size_t size, const void* value, size_t offset = 0,
const device::Memory* dev_mem = nullptr) const;
const device::Memory* dev_mem = nullptr,
bool writeVAImmediate = false) const;
//! Advances the offset into the internal constant buffer by one 0x80-byte slot and returns it.
//! When a further slot would no longer fit within constantBuffer_->getSize(), it calls
//! gpu().releaseGpuMemoryFence() and restarts from offset 0.
//! Declared const; it updates the mutable member constantBufferOffset_.
uint32_t ConstantBufferOffset() const {
// Make sure it can fit at least 128 bytes for OCL memory fill of double16
constexpr uint32_t kManagedSize = 0x80;
// Adjust the offset to the new location
constantBufferOffset_ += kManagedSize;
// Check if the allocation exceeds the limit
if ((constantBufferOffset_ + kManagedSize) > constantBuffer_->getSize()) {
// Stall GPU and reset the offset
gpu().releaseGpuMemoryFence();
constantBufferOffset_ = 0;
}
return constantBufferOffset_;
}
static constexpr uint32_t kCBSize = 0x80;
static constexpr size_t kCBAlignment = 0x80;
inline uint32_t NumBlitKernels() {
return (dev().info().imageSupport_) ? BlitTotal : BlitLinearTotal;
@@ -514,8 +504,6 @@ class KernelBlitManager : public DmaBlitManager {
amd::Program* program_; //!< GPU program object
amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit
amd::Memory* constantBuffer_; //!< An internal CB for blits
mutable uint32_t constantBufferOffset_; //!< Current offset in the constant buffer
size_t xferBufferSize_; //!< Transfer buffer size
mutable amd::Monitor lockXferOps_; //!< Lock transfer operation
};
@@ -531,7 +519,7 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
size_t size, const void* value, size_t offset,
const device::Memory* dev_mem) const {
const device::Memory* dev_mem, bool writeVAImmediate) const {
const amd::KernelParameterDescriptor& desc = kernel->signature().at(index);
void* param = kernel->parameters().values() + desc.offset_;
@@ -548,16 +536,23 @@ inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index,
reinterpret_cast<Memory**>(kernel->parameters().values() +
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
} else {
amd::Memory* mem = as_amd(*static_cast<const cl_mem*>(value));
// convert cl_mem to amd::Memory*, return false if invalid.
reinterpret_cast<amd::Memory**>(kernel->parameters().values() +
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = mem;
if (dev_mem == nullptr) {
LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>(
mem->getDeviceMemory(dev())->virtualAddress()) + offset;
amd::Memory* mem = as_amd(*static_cast<const cl_mem*>(value));
if (!writeVAImmediate) {
reinterpret_cast<amd::Memory**>(kernel->parameters().values() +
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = mem;
if (dev_mem == nullptr) {
LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>(
mem->getDeviceMemory(dev())->virtualAddress()) + offset;
} else {
LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>(
dev_mem->virtualAddress()) + offset;
}
} else {
LP64_SWITCH(uint32_value, uint64_value) = static_cast<uintptr_t>(
dev_mem->virtualAddress()) + offset;
reinterpret_cast<amd::Memory**>(kernel->parameters().values() +
kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] = nullptr;
uintptr_t addr = reinterpret_cast<uintptr_t>(value);
LP64_SWITCH(uint32_value, uint64_value) = addr + offset;
}
}
} else if (desc.type_ == T_SAMPLER) {
+2 -1
Просмотреть файл
@@ -403,6 +403,8 @@ class VirtualGPU : public device::VirtualDevice {
//! Indicates the status of the callback handler. The callback processes the commands,
//! collects profiling data and updates refcounts.
bool isHandlerPending() const { return barriers_.IsHandlerPending(); }
void* allocKernArg(size_t size, size_t alignment);
// } roc OpenCL integration
private:
//! Dispatches a barrier with blocking HSA signals
@@ -427,7 +429,6 @@ class VirtualGPU : public device::VirtualDevice {
bool initPool(size_t kernarg_pool_size);
void destroyPool();
void* allocKernArg(size_t size, size_t alignment);
void resetKernArgPool() {
kernarg_pool_cur_offset_ = 0;
kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;