From 1cd46afe6d25a323ec972ad0739d5e366aa709c5 Mon Sep 17 00:00:00 2001 From: Jay Cornwall Date: Tue, 24 Jan 2017 18:55:55 -0600 Subject: [PATCH] Implement SDMA path for Gfx9 Gfx9 requires monotonic write pointer and doorbell. Count fields are 1-based compared with 0-based pre-Gfx9. - Restructure implementation to use monotonic ring indices - Remove redundant submission size checks (handled by AcquireWriteAddress) - Unify copy/fill per-command limit (documentation is unclear) Change-Id: I57c1675221d2e63aa319fee700d9951671e1bd65 --- runtime/hsa-runtime/core/inc/amd_blit_sdma.h | 71 ++-- .../core/runtime/amd_blit_sdma.cpp | 354 +++++++++--------- .../core/runtime/amd_gpu_agent.cpp | 14 +- 3 files changed, 229 insertions(+), 210 deletions(-) diff --git a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h index 6212c3dcc2..fb0eb1abf8 100644 --- a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h +++ b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h @@ -55,7 +55,19 @@ #include "core/util/utils.h" namespace amd { -class BlitSdma : public core::Blit { +class BlitSdmaBase : public core::Blit { + public: + static const size_t kQueueSize; + static const size_t kCopyPacketSize; + static const size_t kMaxSingleCopySize; + static const size_t kMaxSingleFillSize; +}; + +// RingIndexTy: 32/64-bit monotonic ring index, counting in bytes. +// HwIndexMonotonic: true if SDMA HW index is monotonic, false if it wraps at end of ring. +// SizeToCountOffset: value added to size (in bytes) to form SDMA command count field. +template +class BlitSdma : public BlitSdmaBase { public: explicit BlitSdma(); @@ -113,10 +125,6 @@ class BlitSdma : public core::Blit { virtual hsa_status_t EnableProfiling(bool enable) override; - static const size_t kQueueSize; - - static const size_t kCopyPacketSize; - protected: /// @brief Acquires the address into queue buffer where a new command /// packet of specified size could be written. 
The address that is @@ -126,13 +134,15 @@ class BlitSdma : public core::Blit { /// /// @param cmd_size Command packet size in bytes. /// + /// @param curr_index (output) Index to pass to ReleaseWriteAddress. + /// /// @return pointer into the queue buffer where a PM4 packet of specified size /// could be written. NULL if input size is greater than the size of queue /// buffer. - char* AcquireWriteAddress(uint32_t cmd_size); - void UpdateWriteAndDoorbellRegister(uint32_t current_offset, - uint32_t new_offset); + char* AcquireWriteAddress(uint32_t cmd_size, RingIndexTy& curr_index); + + void UpdateWriteAndDoorbellRegister(RingIndexTy curr_index, RingIndexTy new_index); /// @brief Updates the Write Register of compute device to the end of /// SDMA packet written into queue buffer. The update to Write Register @@ -142,17 +152,19 @@ class BlitSdma : public core::Blit { /// will block until T1 has completed its update (assumes T1 acquired the /// write address first). /// - /// @param cmd_addr pointer into the queue buffer where a PM4 packet was - /// written. + /// @param curr_index Index passed back from AcquireWriteAddress. /// /// @param cmd_size Command packet size in bytes. - void ReleaseWriteAddress(char* cmd_addr, uint32_t cmd_size); + void ReleaseWriteAddress(RingIndexTy curr_index, uint32_t cmd_size); /// @brief Writes NO-OP words into queue buffer in case writing a command /// causes the queue buffer to wrap. /// - /// @param cmd_size Size in bytes of command causing queue buffer to wrap. - void WrapQueue(uint32_t cmd_size); + /// @param curr_index Index to begin padding from. + void PadRingToEnd(RingIndexTy curr_index); + + uint32_t WrapIntoRing(RingIndexTy index); + bool CanWriteUpto(RingIndexTy upto_index); /// @brief Build fence command void BuildFenceCommand(char* fence_command_addr, uint32_t* fence, @@ -176,9 +188,6 @@ class BlitSdma : public core::Blit { // Agent object owning the SDMA engine. 
GpuAgent* agent_; - /// Indicates size of Queue buffer in bytes. - uint32_t queue_size_; - /// Base address of the Queue buffer at construction time. char* queue_start_addr_; @@ -191,20 +200,9 @@ class BlitSdma : public core::Blit { /// and write indices HsaQueueResource queue_resource_; - /// @brief Current address of execution in Queue buffer. - /// - /// @note: The value of address is obtained by reading - /// the value of Write Register of the compute device. - /// Users should write to the Queue buffer at the current - /// address, else it will lead to execution error and potentially - /// a hang. - /// - /// @note: The value of Write Register does not always begin - /// with Zero after a Queue has been created. This needs to be - /// understood better. This means that current address number of - /// words of Queue buffer is unavailable for use. - volatile uint32_t cached_reserve_offset_; - volatile uint32_t cached_commit_offset_; + // Monotonic ring indices, in bytes, tracking written and submitted commands. + RingIndexTy cached_reserve_index_; + RingIndexTy cached_commit_index_; uint32_t linear_copy_command_size_; @@ -235,6 +233,19 @@ class BlitSdma : public core::Blit { /// True if platform atomic is supported. bool platform_atomic_support_; }; + +class BlitSdmaV2V3 + // Ring indices are 32-bit. + // HW ring indices are not monotonic (wrap at end of ring). + // Count fields of SDMA commands are 0-based. + : public BlitSdma {}; + +class BlitSdmaV4 + // Ring indices are 64-bit. + // HW ring indices are monotonic (do not wrap at end of ring). + // Count fields of SDMA commands are 1-based. 
+ : public BlitSdma {}; + } // namespace amd #endif // header guard diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp index c397bfb4e2..7daf92ba89 100644 --- a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp @@ -46,14 +46,13 @@ #include #include #include +#include #include "core/inc/amd_gpu_agent.h" #include "core/inc/amd_memory_region.h" #include "core/inc/runtime.h" #include "core/inc/signal.h" -#define SDMA_QUEUE_SIZE 1024 * 1024 - namespace amd { // SDMA packet for VI device. // Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt @@ -373,29 +372,33 @@ inline uint32_t ptrhigh32(const void* p) { #endif } -const size_t BlitSdma::kQueueSize = SDMA_QUEUE_SIZE; -const size_t BlitSdma::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR); +const size_t BlitSdmaBase::kQueueSize = 1024 * 1024; +const size_t BlitSdmaBase::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR); +const size_t BlitSdmaBase::kMaxSingleCopySize = 0x3fffe0; // From HW documentation +const size_t BlitSdmaBase::kMaxSingleFillSize = 0x3fffe0; -BlitSdma::BlitSdma() - : core::Blit(), - agent_(NULL), - queue_size_(0), +template +BlitSdma::BlitSdma() + : agent_(NULL), queue_start_addr_(NULL), fence_base_addr_(NULL), fence_pool_size_(0), fence_pool_counter_(0), - cached_reserve_offset_(0), - cached_commit_offset_(0), + cached_reserve_index_(0), + cached_commit_index_(0), platform_atomic_support_(true) { std::memset(&queue_resource_, 0, sizeof(queue_resource_)); } -BlitSdma::~BlitSdma() {} +template +BlitSdma::~BlitSdma() {} -hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { +template +hsa_status_t BlitSdma::Initialize( + const core::Agent& agent) { agent_ = reinterpret_cast(&const_cast(agent)); - if (queue_start_addr_ != NULL && queue_size_ != 0) { + if (queue_start_addr_ != NULL) { // Already initialized. 
return HSA_STATUS_SUCCESS; } @@ -412,26 +415,6 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP); trap_command_size_ = sizeof(SDMA_PKT_TRAP); - const uint32_t sync_command_size = fence_command_size_; - const uint32_t max_num_copy_command = - std::floor((static_cast(queue_size_) - sync_command_size) / - linear_copy_command_size_); - const uint32_t max_num_fill_command = - std::floor((static_cast(queue_size_) - sync_command_size) / - fill_command_size_); - - max_single_linear_copy_size_ = 0x3fffe0; - max_total_linear_copy_size_ = static_cast( - std::min(static_cast(SIZE_MAX), - static_cast(max_num_copy_command) * - static_cast(max_single_linear_copy_size_))); - - max_single_fill_size_ = (1 << 22) - sizeof(uint32_t); - max_total_fill_size_ = static_cast( - std::min(static_cast(SIZE_MAX), - static_cast(max_num_fill_command) * - static_cast(max_single_fill_size_))); - const amd::GpuAgentInt& amd_gpu_agent = static_cast(agent); @@ -445,35 +428,31 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { } // Allocate queue buffer. - queue_size_ = kQueueSize; - - queue_start_addr_ = - (char*)core::Runtime::runtime_singleton_->system_allocator()( - queue_size_, 0x1000, core::MemoryRegion::AllocateExecutable); + queue_start_addr_ = (char*)core::Runtime::runtime_singleton_->system_allocator()( + kQueueSize, 0x1000, core::MemoryRegion::AllocateExecutable); if (queue_start_addr_ == NULL) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - std::memset(queue_start_addr_, 0, queue_size_); + std::memset(queue_start_addr_, 0, kQueueSize); // Access kernel driver to initialize the queue control block // This call binds user mode queue object to underlying compute // device. 
const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA; - if (HSAKMT_STATUS_SUCCESS != - hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100, - HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_, - queue_size_, NULL, &queue_resource_)) { + if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100, + HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_, + kQueueSize, NULL, &queue_resource_)) { Destroy(agent); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } - cached_reserve_offset_ = *(queue_resource_.Queue_write_ptr); - cached_commit_offset_ = cached_reserve_offset_; + cached_reserve_index_ = *reinterpret_cast(queue_resource_.Queue_write_ptr); + cached_commit_index_ = cached_reserve_index_; - fence_pool_size_ = static_cast( - (kQueueSize + fence_command_size_ - 1) / fence_command_size_); + fence_pool_size_ = + static_cast((kQueueSize + fence_command_size_ - 1) / fence_command_size_); fence_pool_mask_ = fence_pool_size_ - 1; @@ -490,7 +469,9 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { return HSA_STATUS_SUCCESS; } -hsa_status_t BlitSdma::Destroy(const core::Agent& agent) { +template +hsa_status_t BlitSdma::Destroy( + const core::Agent& agent) { // Release all allocated resources and reset them to zero. if (queue_resource_.QueueId != 0) { @@ -500,7 +481,7 @@ hsa_status_t BlitSdma::Destroy(const core::Agent& agent) { memset(&queue_resource_, 0, sizeof(queue_resource_)); } - if (queue_start_addr_ != NULL && queue_size_ != 0) { + if (queue_start_addr_ != NULL) { // Release queue buffer. 
core::Runtime::runtime_singleton_->system_deallocator()(queue_start_addr_); } @@ -509,24 +490,19 @@ hsa_status_t BlitSdma::Destroy(const core::Agent& agent) { core::Runtime::runtime_singleton_->system_deallocator()(fence_base_addr_); } - queue_size_ = 0; queue_start_addr_ = NULL; - cached_reserve_offset_ = 0; - cached_commit_offset_ = 0; + cached_reserve_index_ = 0; + cached_commit_index_ = 0; return HSA_STATUS_SUCCESS; } -hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src, - size_t size) { - if (size > max_total_linear_copy_size_) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - +template +hsa_status_t BlitSdma::SubmitLinearCopyCommand( + void* dst, const void* src, size_t size) { // Break the copy into multiple copy operation incase the copy size exceeds // the SDMA linear copy limit. - const uint32_t num_copy_command = - (size + max_single_linear_copy_size_ - 1) / max_single_linear_copy_size_; + const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize; const uint32_t total_copy_command_size = num_copy_command * linear_copy_command_size_; @@ -538,8 +514,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src, uint32_t* fence_addr = ObtainFenceObject(); *fence_addr = 0; - char* command_addr = AcquireWriteAddress(total_command_size); - char* const command_addr_temp = command_addr; + RingIndexTy curr_index; + char* command_addr = AcquireWriteAddress(total_command_size, curr_index); if (command_addr == NULL) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; @@ -551,20 +527,17 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src, BuildFenceCommand(command_addr, fence_addr, kFenceValue); - ReleaseWriteAddress(command_addr_temp, total_command_size); + ReleaseWriteAddress(curr_index, total_command_size); WaitFence(fence_addr, kFenceValue); return HSA_STATUS_SUCCESS; } -hsa_status_t BlitSdma::SubmitLinearCopyCommand( - void* dst, const void* src, size_t size, - std::vector& 
dep_signals, core::Signal& out_signal) { - if (size > max_total_linear_copy_size_) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - +template +hsa_status_t BlitSdma::SubmitLinearCopyCommand( + void* dst, const void* src, size_t size, std::vector& dep_signals, + core::Signal& out_signal) { // The signal is 64 bit value, and poll checks for 32 bit value. So we // need to use two poll operations per dependent signal. const uint32_t num_poll_command = @@ -574,8 +547,7 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand( // Break the copy into multiple copy operation incase the copy size exceeds // the SDMA linear copy limit. - const uint32_t num_copy_command = - (size + max_single_linear_copy_size_ - 1) / max_single_linear_copy_size_; + const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize; const uint32_t total_copy_command_size = num_copy_command * linear_copy_command_size_; @@ -624,8 +596,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand( total_poll_command_size + total_copy_command_size + sync_command_size + total_timestamp_command_size + interrupt_command_size; - char* command_addr = AcquireWriteAddress(total_command_size); - char* const command_addr_temp = command_addr; + RingIndexTy curr_index; + char* command_addr = AcquireWriteAddress(total_command_size, curr_index); if (command_addr == NULL) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; @@ -695,23 +667,19 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand( BuildTrapCommand(command_addr); } - ReleaseWriteAddress(command_addr_temp, total_command_size); + ReleaseWriteAddress(curr_index, total_command_size); return HSA_STATUS_SUCCESS; } -hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value, - size_t count) { +template +hsa_status_t BlitSdma::SubmitLinearFillCommand( + void* ptr, uint32_t value, size_t count) { const size_t size = count * sizeof(uint32_t); - if (size > max_total_fill_size_) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - // Break the copy 
into multiple copy operation incase the copy size exceeds // the SDMA linear copy limit. - const uint32_t num_fill_command = - (size + max_single_fill_size_ - 1) / max_single_fill_size_; + const uint32_t num_fill_command = (size + kMaxSingleFillSize - 1) / kMaxSingleFillSize; const uint32_t total_fill_command_size = num_fill_command * fill_command_size_; @@ -719,8 +687,8 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value, const uint32_t total_command_size = total_fill_command_size + fence_command_size_; - char* command_addr = AcquireWriteAddress(total_command_size); - char* const command_addr_temp = command_addr; + RingIndexTy curr_index; + char* command_addr = AcquireWriteAddress(total_command_size, curr_index); if (command_addr == NULL) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; @@ -729,8 +697,8 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value, const uint32_t fill_command_size = fill_command_size_; size_t cur_size = 0; for (uint32_t i = 0; i < num_fill_command; ++i) { - const uint32_t fill_size = static_cast( - std::min((size - cur_size), max_single_fill_size_)); + const uint32_t fill_size = + static_cast(std::min((size - cur_size), kMaxSingleFillSize)); void* cur_ptr = static_cast(ptr) + cur_size; @@ -747,7 +715,7 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value, packet_addr->DATA_UNION.src_data_31_0 = value; - packet_addr->COUNT_UNION.count = fill_size; + packet_addr->COUNT_UNION.count = fill_size + SizeToCountOffset; command_addr += fill_command_size; cur_size += fill_size; @@ -761,139 +729,160 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value, BuildFenceCommand(command_addr, fence_addr, kFenceValue); - ReleaseWriteAddress(command_addr_temp, total_command_size); + ReleaseWriteAddress(curr_index, total_command_size); WaitFence(fence_addr, kFenceValue); return HSA_STATUS_SUCCESS; } -hsa_status_t BlitSdma::EnableProfiling(bool enable) { +template 
+hsa_status_t BlitSdma::EnableProfiling( + bool enable) { return HSA_STATUS_SUCCESS; } -char* BlitSdma::AcquireWriteAddress(uint32_t cmd_size) { - if (cmd_size > queue_size_) { +template +char* BlitSdma::AcquireWriteAddress( + uint32_t cmd_size, RingIndexTy& curr_index) { + // Ring is full when all but one byte is written. + if (cmd_size >= kQueueSize) { return NULL; } while (true) { - const uint32_t curr_offset = - atomic::Load(&cached_reserve_offset_, std::memory_order_acquire); - const uint32_t end_offset = curr_offset + cmd_size; + curr_index = atomic::Load(&cached_reserve_index_, std::memory_order_acquire); - if (end_offset >= queue_size_) { - // Queue buffer is not enough to contain the new command. - WrapQueue(cmd_size); + // Check whether a linear region of the requested size is available. + // If == cmd_size: region is at beginning of ring. + // If < cmd_size: region intersects end of ring, pad with no-ops and retry. + if (WrapIntoRing(curr_index + cmd_size) < cmd_size) { + PadRingToEnd(curr_index); continue; } - const uint32_t curr_read_ptr_val = - atomic::Load(queue_resource_.Queue_read_ptr, std::memory_order_acquire); - if (curr_offset < curr_read_ptr_val && end_offset > curr_read_ptr_val) { - // Queue is wrapping and there is not enough space to recycle. + // Check whether the engine has finished using this region. + const RingIndexTy new_index = curr_index + cmd_size; + + if (CanWriteUpto(new_index) == false) { + // Wait for read index to move and try again. + os::YieldThread(); continue; } - if (atomic::Cas(&cached_reserve_offset_, end_offset, curr_offset, - std::memory_order_release) == curr_offset) { - return queue_start_addr_ + curr_offset; + // Try to reserve this part of the ring. + if (atomic::Cas(&cached_reserve_index_, new_index, curr_index, std::memory_order_release) == + curr_index) { + return queue_start_addr_ + WrapIntoRing(curr_index); } + + // Another thread reserved curr_index, try again. 
+ os::YieldThread(); } return NULL; } -void BlitSdma::UpdateWriteAndDoorbellRegister(uint32_t current_offset, - uint32_t new_offset) { +template +void BlitSdma::UpdateWriteAndDoorbellRegister( + RingIndexTy curr_index, RingIndexTy new_index) { while (true) { - // Make sure that the address before ::current_offset is already released. + // Make sure that the address before ::curr_index is already released. // Otherwise the CP may read invalid packets. - if (atomic::Load(&cached_commit_offset_, std::memory_order_acquire) == - current_offset) { + if (atomic::Load(&cached_commit_index_, std::memory_order_acquire) == curr_index) { if (core::Runtime::runtime_singleton_->flag().sdma_wait_idle()) { // TODO: remove when sdma wpointer issue is resolved. // Wait until the SDMA engine finish processing all packets before // updating the wptr and doorbell. - while (atomic::Load(queue_resource_.Queue_read_ptr, - std::memory_order_acquire) != current_offset) { + while (WrapIntoRing(*reinterpret_cast(queue_resource_.Queue_read_ptr)) != + WrapIntoRing(curr_index)) { os::YieldThread(); } } // Update write pointer and doorbel register. - atomic::Store(queue_resource_.Queue_write_ptr, new_offset); + *reinterpret_cast(queue_resource_.Queue_write_ptr) = + (HwIndexMonotonic ? new_index : WrapIntoRing(new_index)); + // Ensure write pointer is visible to GPU before doorbell. std::atomic_thread_fence(std::memory_order_release); - atomic::Store(queue_resource_.Queue_DoorBell, new_offset); + *reinterpret_cast(queue_resource_.Queue_DoorBell) = + (HwIndexMonotonic ? new_index : WrapIntoRing(new_index)); - std::atomic_thread_fence(std::memory_order_release); - - atomic::Store(&cached_commit_offset_, new_offset); + atomic::Store(&cached_commit_index_, new_index, std::memory_order_release); break; } + + // Waiting for another thread to submit preceding commands first. 
+ os::YieldThread(); } } -void BlitSdma::ReleaseWriteAddress(char* cmd_addr, uint32_t cmd_size) { - assert(cmd_addr != NULL); - assert(cmd_addr >= queue_start_addr_); - - if (cmd_size > queue_size_) { +template +void BlitSdma::ReleaseWriteAddress( + RingIndexTy curr_index, uint32_t cmd_size) { + if (cmd_size > kQueueSize) { assert(false && "cmd_addr is outside the queue buffer range"); return; } - // Update write register. - const uint32_t curent_offset = cmd_addr - queue_start_addr_; - const uint32_t new_offset = curent_offset + cmd_size; - UpdateWriteAndDoorbellRegister(curent_offset, new_offset); + UpdateWriteAndDoorbellRegister(curr_index, curr_index + cmd_size); } -void BlitSdma::WrapQueue(uint32_t cmd_size) { - // Re-determine the offset into queue buffer where NOOP instructions - // should be written. - while (true) { - const uint32_t full_offset = queue_size_ + 1; +template +void BlitSdma::PadRingToEnd( + RingIndexTy curr_index) { + // Reserve region from here to the end of the ring. + RingIndexTy new_index = curr_index + (kQueueSize - WrapIntoRing(curr_index)); - uint32_t curent_offset = - atomic::Load(&cached_reserve_offset_, std::memory_order_acquire); - const uint32_t end_offset = curent_offset + cmd_size; - if (end_offset < queue_size_) { - return; - } + // Check whether the engine has finished using this region. + if (CanWriteUpto(new_index) == false) { + // Wait for read index to move and try again. + return; + } - if (curent_offset == full_offset) { - // Another thread is already wrapping the queue. - continue; - } + if (atomic::Cas(&cached_reserve_index_, new_index, curr_index, std::memory_order_release) == + curr_index) { + // Write and submit NOP commands in reserved region. + char* nop_address = queue_start_addr_ + WrapIntoRing(curr_index); + memset(nop_address, 0, new_index - curr_index); - // Close reservation to queue temporarily by "making" it full. 
- if (atomic::Cas(&cached_reserve_offset_, full_offset, curent_offset, - std::memory_order_release) == curent_offset) { - // Wait till all reserved packets are commited. - while (atomic::Load(&cached_commit_offset_, std::memory_order_acquire) != - curent_offset) { - os::YieldThread(); - } - - // Fill the remainder of the queue with NOOP commands. - char* noop_address = queue_start_addr_ + curent_offset; - const size_t noop_commands_size = queue_size_ - curent_offset; - memset(noop_address, 0, noop_commands_size); - - // Update write and doorbell registers to execute NOOP instructions. - UpdateWriteAndDoorbellRegister(curent_offset, 0); - - // Open access to queue. - atomic::Store(&cached_reserve_offset_, 0U, std::memory_order_release); - } + UpdateWriteAndDoorbellRegister(curr_index, new_index); } } -void BlitSdma::BuildFenceCommand(char* fence_command_addr, uint32_t* fence, - uint32_t fence_value) { +template +uint32_t BlitSdma::WrapIntoRing( + RingIndexTy index) { + return index & (kQueueSize - 1); +} + +template +bool BlitSdma::CanWriteUpto( + RingIndexTy upto_index) { + // Get/calculate the monotonic read index. + RingIndexTy hw_read_index = *reinterpret_cast(queue_resource_.Queue_read_ptr); + RingIndexTy read_index; + + if (HwIndexMonotonic) { + read_index = hw_read_index; + } else { + // Calculate distance from commit index to HW read index. + // Commit index is always < kQueueSize away from HW read index. + RingIndexTy commit_index = atomic::Load(&cached_commit_index_, std::memory_order_relaxed); + RingIndexTy dist_to_read_index = WrapIntoRing(commit_index - hw_read_index); + read_index = commit_index - dist_to_read_index; + } + + // Check whether the read pointer has passed the given index. + // At most we can submit (kQueueSize - 1) bytes at a time. 
+ return (upto_index - read_index) < kQueueSize; +} + +template +void BlitSdma::BuildFenceCommand( + char* fence_command_addr, uint32_t* fence, uint32_t fence_value) { assert(fence_command_addr != NULL); SDMA_PKT_FENCE* packet_addr = reinterpret_cast(fence_command_addr); @@ -909,7 +898,8 @@ void BlitSdma::BuildFenceCommand(char* fence_command_addr, uint32_t* fence, packet_addr->DATA_UNION.data = fence_value; } -uint32_t* BlitSdma::ObtainFenceObject() { +template +uint32_t* BlitSdma::ObtainFenceObject() { const uint32_t fence_index = atomic::Add(&fence_pool_counter_, 1U, std::memory_order_acquire); uint32_t* fence_addr = &fence_base_addr_[fence_index & fence_pool_mask_]; @@ -917,7 +907,9 @@ uint32_t* BlitSdma::ObtainFenceObject() { return fence_addr; } -void BlitSdma::WaitFence(uint32_t* fence, uint32_t fence_value) { +template +void BlitSdma::WaitFence(uint32_t* fence, + uint32_t fence_value) { int spin_count = 51; while (atomic::Load(fence, std::memory_order_acquire) != fence_value) { if (--spin_count > 0) { @@ -927,12 +919,13 @@ void BlitSdma::WaitFence(uint32_t* fence, uint32_t fence_value) { } } -void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command, - void* dst, const void* src, size_t size) { +template +void BlitSdma::BuildCopyCommand( + char* cmd_addr, uint32_t num_copy_command, void* dst, const void* src, size_t size) { size_t cur_size = 0; for (uint32_t i = 0; i < num_copy_command; ++i) { - const uint32_t copy_size = static_cast( - std::min((size - cur_size), max_single_linear_copy_size_)); + const uint32_t copy_size = + static_cast(std::min((size - cur_size), kMaxSingleCopySize)); void* cur_dst = static_cast(dst) + cur_size; const void* cur_src = static_cast(src) + cur_size; @@ -945,7 +938,7 @@ void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command, packet_addr->HEADER_UNION.op = SDMA_OP_COPY; packet_addr->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR; - packet_addr->COUNT_UNION.count = copy_size; + 
packet_addr->COUNT_UNION.count = copy_size + SizeToCountOffset; packet_addr->SRC_ADDR_LO_UNION.src_addr_31_0 = ptrlow32(cur_src); packet_addr->SRC_ADDR_HI_UNION.src_addr_63_32 = ptrhigh32(cur_src); @@ -960,8 +953,9 @@ void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command, assert(cur_size == size); } -void BlitSdma::BuildPollCommand(char* cmd_addr, void* addr, - uint32_t reference) { +template +void BlitSdma::BuildPollCommand( + char* cmd_addr, void* addr, uint32_t reference) { SDMA_PKT_POLL_REGMEM* packet_addr = reinterpret_cast(cmd_addr); @@ -981,7 +975,9 @@ void BlitSdma::BuildPollCommand(char* cmd_addr, void* addr, packet_addr->DW5_UNION.retry_count = 0xfff; // Retry forever. } -void BlitSdma::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) { +template +void BlitSdma::BuildAtomicDecrementCommand( + char* cmd_addr, void* addr) { SDMA_PKT_ATOMIC* packet_addr = reinterpret_cast(cmd_addr); memset(packet_addr, 0, sizeof(SDMA_PKT_ATOMIC)); @@ -996,8 +992,9 @@ void BlitSdma::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) { packet_addr->SRC_DATA_HI_UNION.src_data_63_32 = 0xffffffff; } -void BlitSdma::BuildGetGlobalTimestampCommand(char* cmd_addr, - void* write_address) { +template +void BlitSdma::BuildGetGlobalTimestampCommand( + char* cmd_addr, void* write_address) { SDMA_PKT_TIMESTAMP* packet_addr = reinterpret_cast(cmd_addr); @@ -1010,7 +1007,8 @@ void BlitSdma::BuildGetGlobalTimestampCommand(char* cmd_addr, packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(write_address); } -void BlitSdma::BuildTrapCommand(char* cmd_addr) { +template +void BlitSdma::BuildTrapCommand(char* cmd_addr) { SDMA_PKT_TRAP* packet_addr = reinterpret_cast(cmd_addr); @@ -1018,4 +1016,8 @@ void BlitSdma::BuildTrapCommand(char* cmd_addr) { packet_addr->HEADER_UNION.op = SDMA_OP_TRAP; } + +template class BlitSdma; +template class BlitSdma; + } // namespace amd diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp 
b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 7c98e63372..140e37c989 100755 --- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -389,9 +389,9 @@ bool GpuAgent::InitEndTsPool() { return true; } - end_ts_pool_size_ = static_cast( - (BlitSdma::kQueueSize + BlitSdma::kCopyPacketSize - 1) / - (BlitSdma::kCopyPacketSize)); + end_ts_pool_size_ = + static_cast((BlitSdmaBase::kQueueSize + BlitSdmaBase::kCopyPacketSize - 1) / + (BlitSdmaBase::kCopyPacketSize)); // Allocate end timestamp object for both h2d and d2h DMA. const size_t alloc_size = 2 * end_ts_pool_size_ * kTsSize; @@ -510,7 +510,13 @@ core::Queue* GpuAgent::CreateInterceptibleQueue() { } core::Blit* GpuAgent::CreateBlitSdma() { - BlitSdma* sdma = new BlitSdma(); + core::Blit* sdma; + + if (isa_->GetMajorVersion() <= 8) { + sdma = new BlitSdmaV2V3; + } else { + sdma = new BlitSdmaV4; + } if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) { sdma->Destroy(*this);