SWDEV-499997 - Unify ManagedBuffer and KernelArg buffer implementation
Change-Id: I95421c87904dd62d7ee214539a57c7bda1097ff4
Этот коммит содержится в:
коммит произвёл
Saleel Kudchadker
родитель
28cbf2bc4f
Коммит
cfcc743824
@@ -1279,8 +1279,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
schedulerQueue_(nullptr),
|
||||
schedulerSignal_({0}),
|
||||
barriers_(*this),
|
||||
kernarg_pool_signal_(KernelArgPoolNumSignal),
|
||||
managed_buffer_(*this, ManagedBuffer::kPoolNumSignals * device.settings().stagedXferSize_),
|
||||
managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_),
|
||||
cuMask_(cuMask),
|
||||
priority_(priority),
|
||||
copy_command_type_(0),
|
||||
@@ -1298,10 +1298,6 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
profiling_ = profiling;
|
||||
cooperative_ = cooperative;
|
||||
|
||||
kernarg_pool_base_ = nullptr;
|
||||
kernarg_pool_size_ = 0;
|
||||
kernarg_pool_cur_offset_ = 0;
|
||||
|
||||
if (device.settings().fenceScopeAgent_) {
|
||||
dispatchPacketHeaderNoSync_ =
|
||||
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
@@ -1341,8 +1337,6 @@ VirtualGPU::~VirtualGPU() {
|
||||
releaseGpuMemoryFence();
|
||||
}
|
||||
|
||||
destroyPool();
|
||||
|
||||
releasePinnedMem();
|
||||
|
||||
if (timestamp_ != nullptr) {
|
||||
@@ -1390,7 +1384,7 @@ bool VirtualGPU::create() {
|
||||
gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_, cuMask_, priority_);
|
||||
if (!gpu_queue_) return false;
|
||||
|
||||
if (!initPool(dev().settings().kernargPoolSize_)) {
|
||||
if (!managed_kernarg_buffer_.Create(Device::MemorySegment::kKernArg)) {
|
||||
LogError("Couldn't allocate arguments/signals for the queue");
|
||||
return false;
|
||||
}
|
||||
@@ -1433,7 +1427,7 @@ bool VirtualGPU::create() {
|
||||
return false;
|
||||
}
|
||||
// Create managed buffer for staging copies
|
||||
if (!managed_buffer_.Create()) {
|
||||
if (!managed_buffer_.Create(Device::MemorySegment::kNoAtomics)) {
|
||||
LogError("Could not create managed buffer for this queue!");
|
||||
return false;
|
||||
}
|
||||
@@ -1453,12 +1447,24 @@ VirtualGPU::ManagedBuffer::~ManagedBuffer() {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::ManagedBuffer::Create() {
|
||||
bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) {
|
||||
pool_chunk_end_ = pool_size_ / kPoolNumSignals;
|
||||
active_chunk_ = 0;
|
||||
// Allocate memory for managed buffer
|
||||
pool_base_ = reinterpret_cast<address>(
|
||||
gpu_.dev().hostAlloc(pool_size_, 0, Device::MemorySegment::kNoAtomics));
|
||||
if (mem_segment == Device::MemorySegment::kKernArg &&
|
||||
(gpu_.dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) &&
|
||||
gpu_.dev().info().largeBar_) {
|
||||
pool_base_ =
|
||||
reinterpret_cast<address>(gpu_.dev().deviceLocalAlloc(pool_size_));
|
||||
if (pool_base_ != nullptr) {
|
||||
// @note Workaround first access penalty.
|
||||
// KFD may update CPU page tables on the first CPU access
|
||||
*pool_base_ = 0;
|
||||
}
|
||||
} else {
|
||||
pool_base_ = reinterpret_cast<address>(
|
||||
gpu_.dev().hostAlloc(pool_size_, 0, mem_segment));
|
||||
}
|
||||
if (pool_base_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
@@ -1474,6 +1480,12 @@ bool VirtualGPU::ManagedBuffer::Create() {
|
||||
// ================================================================================================
|
||||
//! Convenience overload of Acquire() that supplies a default alignment.
//! The alignment is amd::alignUp(256u, globalMemCacheLineSize_) taken from the device info,
//! and the request is then forwarded unchanged to Acquire(size, alignment).
//! @param size  Requested size, passed through to the two-argument overload
//! @return Whatever Acquire(size, alignment) returns
address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) {
|
||||
auto alignment = amd::alignUp(256u, gpu_.dev().info().globalMemCacheLineSize_);
|
||||
return Acquire(size, alignment);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
address VirtualGPU::ManagedBuffer::Acquire(uint32_t size, uint32_t alignment) {
|
||||
assert(alignment != 0);
|
||||
address result = nullptr;
|
||||
result = amd::alignUp(pool_base_ + pool_cur_offset_, alignment);
|
||||
const size_t pool_new_usage = (result + size) - pool_base_;
|
||||
@@ -1483,6 +1495,7 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) {
|
||||
} else {
|
||||
// Reset the signal for the barrier packet
|
||||
hsa_signal_silent_store_relaxed(pool_signal_[active_chunk_], kInitSignalValueOne);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Issue barrier to flush chunk %d", active_chunk_);
|
||||
// Currently don't skip wait signal check, because SDMA engine can be used in staging copy
|
||||
constexpr bool kSkipSignal = false;
|
||||
// Dispatch a barrier packet into the queue
|
||||
@@ -1503,80 +1516,17 @@ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) {
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Sets up the kernel-argument pool: records the pool size, resets the chunk bookkeeping,
//! allocates the backing memory and creates one HSA signal per entry of kernarg_pool_signal_.
//! @param kernarg_pool_size  Total pool size handed to the allocator; a single chunk spans
//!                           kernarg_pool_size / KernelArgPoolNumSignal
//! @return true on success; false if the allocation or any hsa_signal_create() call fails
//! NOTE(review): the failure paths return without releasing anything acquired so far;
//! presumably cleanup is left to destroyPool() - confirm against the caller.
bool VirtualGPU::initPool(size_t kernarg_pool_size) {
|
||||
kernarg_pool_size_ = kernarg_pool_size;
|
||||
// The first chunk ends after 1/KernelArgPoolNumSignal of the pool
kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;
|
||||
active_chunk_ = 0;
|
||||
// Use device-local memory only when kernel args are not forced to host memory
// and the device reports largeBar_; otherwise fall back to a host allocation
if ((dev().settings().kernel_arg_impl_ != KernelArgImpl::HostKernelArgs) &&
|
||||
roc_device_.info().largeBar_) {
|
||||
kernarg_pool_base_ =
|
||||
reinterpret_cast<address>(roc_device_.deviceLocalAlloc(kernarg_pool_size_));
|
||||
if (kernarg_pool_base_ != nullptr) {
|
||||
// @note Workaround first access penalty.
|
||||
// KFD may update CPU page tables on the first CPU access
|
||||
*kernarg_pool_base_ = 0;
|
||||
}
|
||||
} else {
|
||||
kernarg_pool_base_ = reinterpret_cast<address>(roc_device_.hostAlloc(kernarg_pool_size_, 0,
|
||||
Device::MemorySegment::kKernArg));
|
||||
}
|
||||
// Either allocation path may have failed
if (kernarg_pool_base_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
hsa_agent_t agent = gpu_device();
|
||||
// One signal per slot, created with initial value 0 and the agent above as the only consumer
for (auto& it : kernarg_pool_signal_) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 1, &agent, &it)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void VirtualGPU::destroyPool() {
|
||||
for (auto& it : kernarg_pool_signal_) {
|
||||
if (it.handle != 0) {
|
||||
hsa_signal_destroy(it);
|
||||
}
|
||||
}
|
||||
if (kernarg_pool_base_ != nullptr) {
|
||||
roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_);
|
||||
}
|
||||
//! Rewinds the pool bookkeeping to its starting state: current offset 0, chunk end at
//! pool_size_ / kPoolNumSignals and active chunk 0. Only these three fields are written;
//! no memory or signal is touched here.
void VirtualGPU::ManagedBuffer::ResetPool() {
|
||||
pool_cur_offset_ = 0;
|
||||
pool_chunk_end_ = pool_size_ / kPoolNumSignals;
|
||||
active_chunk_ = 0;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
//! Hands out an aligned slice of the kernel-argument pool.
//! The pool is used chunk by chunk: while the request fits below kernarg_pool_chunk_end_ the
//! offset is simply advanced; otherwise a barrier packet is queued for the current chunk, the
//! next chunk (modulo KernelArgPoolNumSignal) becomes active once its signal is waited on, and
//! the slice is taken from the start of that chunk.
//! @param size       Size of the requested slice
//! @param alignment  Required alignment of the returned address; must not be 0
//! @return Address of the slice inside the pool
//! NOTE(review): after switching chunks there is no check that the slice still fits in the
//! new chunk - confirm callers never request more than one chunk.
void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
|
||||
assert(alignment != 0);
|
||||
address result = nullptr;
|
||||
result = amd::alignUp(kernarg_pool_base_ + kernarg_pool_cur_offset_, alignment);
|
||||
// Pool usage measured from the base, including any padding added by the alignment
const size_t pool_new_usage = (result + size) - kernarg_pool_base_;
|
||||
if (pool_new_usage <= kernarg_pool_chunk_end_) {
|
||||
kernarg_pool_cur_offset_ = pool_new_usage;
|
||||
return result;
|
||||
} else {
|
||||
//! That means the app didn't call clFlush/clFinish for a very long time.
|
||||
// Reset the signal for the barrier packet
|
||||
hsa_signal_silent_store_relaxed(kernarg_pool_signal_[active_chunk_], kInitSignalValueOne);
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Issue barrier to flush kernel arg chunk %d",
|
||||
active_chunk_);
|
||||
// Dispatch a barrier packet into the queue
|
||||
dispatchBarrierPacket(kBarrierPacketHeader, true, kernarg_pool_signal_[active_chunk_]);
|
||||
// Get the next chunk
|
||||
active_chunk_ = ++active_chunk_ % KernelArgPoolNumSignal;
|
||||
// Make sure the new active chunk is free
|
||||
bool test = WaitForSignal(kernarg_pool_signal_[active_chunk_], ActiveWait());
|
||||
// The wait result is only verified through this assert
assert(test && "Runtime can't fail a wait for chunk!");
|
||||
// Make sure the current offset matches the new chunk to avoid possible overlaps
|
||||
// between chunks and issues during recycle
|
||||
kernarg_pool_cur_offset_ = (active_chunk_ == 0) ? 0 : kernarg_pool_chunk_end_;
|
||||
kernarg_pool_chunk_end_ = kernarg_pool_cur_offset_ +
|
||||
kernarg_pool_size_ / KernelArgPoolNumSignal;
|
||||
result = amd::alignUp(kernarg_pool_base_ + kernarg_pool_cur_offset_, alignment);
|
||||
kernarg_pool_cur_offset_ = (result + size) - kernarg_pool_base_;
|
||||
}
|
||||
|
||||
return result;
|
||||
// NOTE(review): unreachable as written, because it follows an unconditional return.
// This looks like diff residue where the removed pool-based body and the added
// one-line delegation to managed_kernarg_buffer_ are interleaved - confirm.
return managed_kernarg_buffer_.Acquire(size, alignment);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
|
||||
@@ -198,11 +198,17 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
~ManagedBuffer();
|
||||
|
||||
//! Allocates all necessary resources to manage memory
|
||||
bool Create();
|
||||
bool Create(amd::Device::MemorySegment mem_segment);
|
||||
|
||||
//! Acquires memory for use on the gpu
|
||||
address Acquire(uint32_t size);
|
||||
|
||||
//! Acquires custom aligned memory for use on the gpu
|
||||
address Acquire(uint32_t size, uint32_t alignment);
|
||||
|
||||
//! Reset mem pool
|
||||
void ResetPool();
|
||||
|
||||
private:
|
||||
VirtualGPU& gpu_; //!< Queue object for ROCm device
|
||||
address pool_base_ = nullptr; //!< Memory pool base address
|
||||
@@ -478,13 +484,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet,
|
||||
amd::NDRangeContainer& sizes);
|
||||
|
||||
bool initPool(size_t kernarg_pool_size);
|
||||
void destroyPool();
|
||||
|
||||
//! Rewinds the kernel-argument pool bookkeeping: offset 0, chunk end at
//! kernarg_pool_size_ / KernelArgPoolNumSignal, active chunk 0, followed by a
//! ResetPool() call on managed_kernarg_buffer_.
//! NOTE(review): the direct field writes and the ResetPool() call look like the removed and
//! added variants of the same reset shown together - confirm which one is current.
void resetKernArgPool() {
|
||||
kernarg_pool_cur_offset_ = 0;
|
||||
kernarg_pool_chunk_end_ = kernarg_pool_size_ / KernelArgPoolNumSignal;
|
||||
active_chunk_ = 0;
|
||||
managed_kernarg_buffer_.ResetPool();
|
||||
}
|
||||
|
||||
uint64_t getVQVirtualAddress();
|
||||
@@ -564,17 +565,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
HwQueueTracker barriers_; //!< Tracks active barriers in ROCr
|
||||
|
||||
//! The number of chunks the kernel arg pool will be divided into
|
||||
static constexpr uint32_t KernelArgPoolNumSignal = 4;
|
||||
address kernarg_pool_base_;
|
||||
uint32_t kernarg_pool_size_;
|
||||
uint32_t kernarg_pool_chunk_end_; //!< The end offset of the current chunk
|
||||
uint32_t active_chunk_; //!< The index of the current active chunk
|
||||
uint32_t kernarg_pool_cur_offset_;
|
||||
std::vector<hsa_signal_t> kernarg_pool_signal_; //!< Pool of HSA signals to manage
|
||||
//!< multiple chunks
|
||||
|
||||
ManagedBuffer managed_buffer_; //!< Memory manager for staging copies
|
||||
ManagedBuffer managed_kernarg_buffer_; //!< Managed memory for kernel args
|
||||
|
||||
friend class Timestamp;
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user