Implement SDMA path for Gfx9

Gfx9 requires monotonic write pointer and doorbell.
Cound fields are 1-based compared with 0-based pre-Gfx9.

- Restructure implementation to use monotonic ring indices
- Remove redundant submission size checks (handled by AcquireWriteAddress)
- Unify copy/fill per-command limit (documentation is unclear)

Change-Id: I57c1675221d2e63aa319fee700d9951671e1bd65
This commit is contained in:
Jay Cornwall
2017-01-24 18:55:55 -06:00
förälder a324f21a46
incheckning 1cd46afe6d
3 ändrade filer med 229 tillägg och 210 borttagningar
+41 -30
Visa fil
@@ -55,7 +55,19 @@
#include "core/util/utils.h"
namespace amd {
class BlitSdma : public core::Blit {
class BlitSdmaBase : public core::Blit {
public:
static const size_t kQueueSize;
static const size_t kCopyPacketSize;
static const size_t kMaxSingleCopySize;
static const size_t kMaxSingleFillSize;
};
// RingIndexTy: 32/64-bit monotonic ring index, counting in bytes.
// HwIndexMonotonic: true if SDMA HW index is monotonic, false if it wraps at end of ring.
// SizeToCountOffset: value added to size (in bytes) to form SDMA command count field.
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
class BlitSdma : public BlitSdmaBase {
public:
explicit BlitSdma();
@@ -113,10 +125,6 @@ class BlitSdma : public core::Blit {
virtual hsa_status_t EnableProfiling(bool enable) override;
static const size_t kQueueSize;
static const size_t kCopyPacketSize;
protected:
/// @brief Acquires the address into queue buffer where a new command
/// packet of specified size could be written. The address that is
@@ -126,13 +134,15 @@ class BlitSdma : public core::Blit {
///
/// @param cmd_size Command packet size in bytes.
///
/// @param curr_index (output) Index to pass to ReleaseWriteAddress.
///
/// @return pointer into the queue buffer where a PM4 packet of specified size
/// could be written. NULL if input size is greater than the size of queue
/// buffer.
char* AcquireWriteAddress(uint32_t cmd_size);
void UpdateWriteAndDoorbellRegister(uint32_t current_offset,
uint32_t new_offset);
char* AcquireWriteAddress(uint32_t cmd_size, RingIndexTy& curr_index);
void UpdateWriteAndDoorbellRegister(RingIndexTy curr_index, RingIndexTy new_index);
/// @brief Updates the Write Register of compute device to the end of
/// SDMA packet written into queue buffer. The update to Write Register
@@ -142,17 +152,19 @@ class BlitSdma : public core::Blit {
/// will block until T1 has completed its update (assumes T1 acquired the
/// write address first).
///
/// @param cmd_addr pointer into the queue buffer where a PM4 packet was
/// written.
/// @param curr_index Index passed back from AcquireWriteAddress.
///
/// @param cmd_size Command packet size in bytes.
void ReleaseWriteAddress(char* cmd_addr, uint32_t cmd_size);
void ReleaseWriteAddress(RingIndexTy curr_index, uint32_t cmd_size);
/// @brief Writes NO-OP words into queue buffer in case writing a command
/// causes the queue buffer to wrap.
///
/// @param cmd_size Size in bytes of command causing queue buffer to wrap.
void WrapQueue(uint32_t cmd_size);
/// @param curr_index Index to begin padding from.
void PadRingToEnd(RingIndexTy curr_index);
uint32_t WrapIntoRing(RingIndexTy index);
bool CanWriteUpto(RingIndexTy upto_index);
/// @brief Build fence command
void BuildFenceCommand(char* fence_command_addr, uint32_t* fence,
@@ -176,9 +188,6 @@ class BlitSdma : public core::Blit {
// Agent object owning the SDMA engine.
GpuAgent* agent_;
/// Indicates size of Queue buffer in bytes.
uint32_t queue_size_;
/// Base address of the Queue buffer at construction time.
char* queue_start_addr_;
@@ -191,20 +200,9 @@ class BlitSdma : public core::Blit {
/// and write indices
HsaQueueResource queue_resource_;
/// @brief Current address of execution in Queue buffer.
///
/// @note: The value of address is obtained by reading
/// the value of Write Register of the compute device.
/// Users should write to the Queue buffer at the current
/// address, else it will lead to execution error and potentially
/// a hang.
///
/// @note: The value of Write Register does not always begin
/// with Zero after a Queue has been created. This needs to be
/// understood better. This means that current address number of
/// words of Queue buffer is unavailable for use.
volatile uint32_t cached_reserve_offset_;
volatile uint32_t cached_commit_offset_;
// Monotonic ring indices, in bytes, tracking written and submitted commands.
RingIndexTy cached_reserve_index_;
RingIndexTy cached_commit_index_;
uint32_t linear_copy_command_size_;
@@ -235,6 +233,19 @@ class BlitSdma : public core::Blit {
/// True if platform atomic is supported.
bool platform_atomic_support_;
};
class BlitSdmaV2V3
// Ring indices are 32-bit.
// HW ring indices are not monotonic (wrap at end of ring).
// Count fields of SDMA commands are 0-based.
: public BlitSdma<uint32_t, false, 0> {};
class BlitSdmaV4
// Ring indices are 64-bit.
// HW ring indices are monotonic (do not wrap at end of ring).
// Count fields of SDMA commands are 1-based.
: public BlitSdma<uint64_t, true, -1> {};
} // namespace amd
#endif // header guard
+178 -176
Visa fil
@@ -46,14 +46,13 @@
#include <atomic>
#include <cmath>
#include <cstring>
#include <limits>
#include "core/inc/amd_gpu_agent.h"
#include "core/inc/amd_memory_region.h"
#include "core/inc/runtime.h"
#include "core/inc/signal.h"
#define SDMA_QUEUE_SIZE 1024 * 1024
namespace amd {
// SDMA packet for VI device.
// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt
@@ -373,29 +372,33 @@ inline uint32_t ptrhigh32(const void* p) {
#endif
}
const size_t BlitSdma::kQueueSize = SDMA_QUEUE_SIZE;
const size_t BlitSdma::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR);
const size_t BlitSdmaBase::kQueueSize = 1024 * 1024;
const size_t BlitSdmaBase::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR);
const size_t BlitSdmaBase::kMaxSingleCopySize = 0x3fffe0; // From HW documentation
const size_t BlitSdmaBase::kMaxSingleFillSize = 0x3fffe0;
BlitSdma::BlitSdma()
: core::Blit(),
agent_(NULL),
queue_size_(0),
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BlitSdma()
: agent_(NULL),
queue_start_addr_(NULL),
fence_base_addr_(NULL),
fence_pool_size_(0),
fence_pool_counter_(0),
cached_reserve_offset_(0),
cached_commit_offset_(0),
cached_reserve_index_(0),
cached_commit_index_(0),
platform_atomic_support_(true) {
std::memset(&queue_resource_, 0, sizeof(queue_resource_));
}
BlitSdma::~BlitSdma() {}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::~BlitSdma() {}
hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initialize(
const core::Agent& agent) {
agent_ = reinterpret_cast<amd::GpuAgent*>(&const_cast<core::Agent&>(agent));
if (queue_start_addr_ != NULL && queue_size_ != 0) {
if (queue_start_addr_ != NULL) {
// Already initialized.
return HSA_STATUS_SUCCESS;
}
@@ -412,26 +415,6 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP);
trap_command_size_ = sizeof(SDMA_PKT_TRAP);
const uint32_t sync_command_size = fence_command_size_;
const uint32_t max_num_copy_command =
std::floor((static_cast<uint32_t>(queue_size_) - sync_command_size) /
linear_copy_command_size_);
const uint32_t max_num_fill_command =
std::floor((static_cast<uint32_t>(queue_size_) - sync_command_size) /
fill_command_size_);
max_single_linear_copy_size_ = 0x3fffe0;
max_total_linear_copy_size_ = static_cast<size_t>(
std::min(static_cast<uint64_t>(SIZE_MAX),
static_cast<uint64_t>(max_num_copy_command) *
static_cast<uint64_t>(max_single_linear_copy_size_)));
max_single_fill_size_ = (1 << 22) - sizeof(uint32_t);
max_total_fill_size_ = static_cast<size_t>(
std::min(static_cast<uint64_t>(SIZE_MAX),
static_cast<uint64_t>(max_num_fill_command) *
static_cast<uint64_t>(max_single_fill_size_)));
const amd::GpuAgentInt& amd_gpu_agent =
static_cast<const amd::GpuAgentInt&>(agent);
@@ -445,35 +428,31 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
}
// Allocate queue buffer.
queue_size_ = kQueueSize;
queue_start_addr_ =
(char*)core::Runtime::runtime_singleton_->system_allocator()(
queue_size_, 0x1000, core::MemoryRegion::AllocateExecutable);
queue_start_addr_ = (char*)core::Runtime::runtime_singleton_->system_allocator()(
kQueueSize, 0x1000, core::MemoryRegion::AllocateExecutable);
if (queue_start_addr_ == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
std::memset(queue_start_addr_, 0, queue_size_);
std::memset(queue_start_addr_, 0, kQueueSize);
// Access kernel driver to initialize the queue control block
// This call binds user mode queue object to underlying compute
// device.
const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA;
if (HSAKMT_STATUS_SUCCESS !=
hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100,
HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
queue_size_, NULL, &queue_resource_)) {
if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100,
HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
kQueueSize, NULL, &queue_resource_)) {
Destroy(agent);
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
cached_reserve_offset_ = *(queue_resource_.Queue_write_ptr);
cached_commit_offset_ = cached_reserve_offset_;
cached_reserve_index_ = *reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_write_ptr);
cached_commit_index_ = cached_reserve_index_;
fence_pool_size_ = static_cast<uint32_t>(
(kQueueSize + fence_command_size_ - 1) / fence_command_size_);
fence_pool_size_ =
static_cast<uint32_t>((kQueueSize + fence_command_size_ - 1) / fence_command_size_);
fence_pool_mask_ = fence_pool_size_ - 1;
@@ -490,7 +469,9 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
return HSA_STATUS_SUCCESS;
}
hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Destroy(
const core::Agent& agent) {
// Release all allocated resources and reset them to zero.
if (queue_resource_.QueueId != 0) {
@@ -500,7 +481,7 @@ hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
memset(&queue_resource_, 0, sizeof(queue_resource_));
}
if (queue_start_addr_ != NULL && queue_size_ != 0) {
if (queue_start_addr_ != NULL) {
// Release queue buffer.
core::Runtime::runtime_singleton_->system_deallocator()(queue_start_addr_);
}
@@ -509,24 +490,19 @@ hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
core::Runtime::runtime_singleton_->system_deallocator()(fence_base_addr_);
}
queue_size_ = 0;
queue_start_addr_ = NULL;
cached_reserve_offset_ = 0;
cached_commit_offset_ = 0;
cached_reserve_index_ = 0;
cached_commit_index_ = 0;
return HSA_STATUS_SUCCESS;
}
hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src,
size_t size) {
if (size > max_total_linear_copy_size_) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
void* dst, const void* src, size_t size) {
// Break the copy into multiple copy operation incase the copy size exceeds
// the SDMA linear copy limit.
const uint32_t num_copy_command =
(size + max_single_linear_copy_size_ - 1) / max_single_linear_copy_size_;
const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
const uint32_t total_copy_command_size =
num_copy_command * linear_copy_command_size_;
@@ -538,8 +514,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src,
uint32_t* fence_addr = ObtainFenceObject();
*fence_addr = 0;
char* command_addr = AcquireWriteAddress(total_command_size);
char* const command_addr_temp = command_addr;
RingIndexTy curr_index;
char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
if (command_addr == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
@@ -551,20 +527,17 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src,
BuildFenceCommand(command_addr, fence_addr, kFenceValue);
ReleaseWriteAddress(command_addr_temp, total_command_size);
ReleaseWriteAddress(curr_index, total_command_size);
WaitFence(fence_addr, kFenceValue);
return HSA_STATUS_SUCCESS;
}
hsa_status_t BlitSdma::SubmitLinearCopyCommand(
void* dst, const void* src, size_t size,
std::vector<core::Signal*>& dep_signals, core::Signal& out_signal) {
if (size > max_total_linear_copy_size_) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
void* dst, const void* src, size_t size, std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) {
// The signal is 64 bit value, and poll checks for 32 bit value. So we
// need to use two poll operations per dependent signal.
const uint32_t num_poll_command =
@@ -574,8 +547,7 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
// Break the copy into multiple copy operation incase the copy size exceeds
// the SDMA linear copy limit.
const uint32_t num_copy_command =
(size + max_single_linear_copy_size_ - 1) / max_single_linear_copy_size_;
const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
const uint32_t total_copy_command_size =
num_copy_command * linear_copy_command_size_;
@@ -624,8 +596,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
total_poll_command_size + total_copy_command_size + sync_command_size +
total_timestamp_command_size + interrupt_command_size;
char* command_addr = AcquireWriteAddress(total_command_size);
char* const command_addr_temp = command_addr;
RingIndexTy curr_index;
char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
if (command_addr == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
@@ -695,23 +667,19 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
BuildTrapCommand(command_addr);
}
ReleaseWriteAddress(command_addr_temp, total_command_size);
ReleaseWriteAddress(curr_index, total_command_size);
return HSA_STATUS_SUCCESS;
}
hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
size_t count) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearFillCommand(
void* ptr, uint32_t value, size_t count) {
const size_t size = count * sizeof(uint32_t);
if (size > max_total_fill_size_) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
// Break the copy into multiple copy operation incase the copy size exceeds
// the SDMA linear copy limit.
const uint32_t num_fill_command =
(size + max_single_fill_size_ - 1) / max_single_fill_size_;
const uint32_t num_fill_command = (size + kMaxSingleFillSize - 1) / kMaxSingleFillSize;
const uint32_t total_fill_command_size =
num_fill_command * fill_command_size_;
@@ -719,8 +687,8 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
const uint32_t total_command_size =
total_fill_command_size + fence_command_size_;
char* command_addr = AcquireWriteAddress(total_command_size);
char* const command_addr_temp = command_addr;
RingIndexTy curr_index;
char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
if (command_addr == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
@@ -729,8 +697,8 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
const uint32_t fill_command_size = fill_command_size_;
size_t cur_size = 0;
for (uint32_t i = 0; i < num_fill_command; ++i) {
const uint32_t fill_size = static_cast<uint32_t>(
std::min((size - cur_size), max_single_fill_size_));
const uint32_t fill_size =
static_cast<uint32_t>(std::min((size - cur_size), kMaxSingleFillSize));
void* cur_ptr = static_cast<char*>(ptr) + cur_size;
@@ -747,7 +715,7 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
packet_addr->DATA_UNION.src_data_31_0 = value;
packet_addr->COUNT_UNION.count = fill_size;
packet_addr->COUNT_UNION.count = fill_size + SizeToCountOffset;
command_addr += fill_command_size;
cur_size += fill_size;
@@ -761,139 +729,160 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
BuildFenceCommand(command_addr, fence_addr, kFenceValue);
ReleaseWriteAddress(command_addr_temp, total_command_size);
ReleaseWriteAddress(curr_index, total_command_size);
WaitFence(fence_addr, kFenceValue);
return HSA_STATUS_SUCCESS;
}
hsa_status_t BlitSdma::EnableProfiling(bool enable) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::EnableProfiling(
bool enable) {
return HSA_STATUS_SUCCESS;
}
char* BlitSdma::AcquireWriteAddress(uint32_t cmd_size) {
if (cmd_size > queue_size_) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
char* BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::AcquireWriteAddress(
uint32_t cmd_size, RingIndexTy& curr_index) {
// Ring is full when all but one byte is written.
if (cmd_size >= kQueueSize) {
return NULL;
}
while (true) {
const uint32_t curr_offset =
atomic::Load(&cached_reserve_offset_, std::memory_order_acquire);
const uint32_t end_offset = curr_offset + cmd_size;
curr_index = atomic::Load(&cached_reserve_index_, std::memory_order_acquire);
if (end_offset >= queue_size_) {
// Queue buffer is not enough to contain the new command.
WrapQueue(cmd_size);
// Check whether a linear region of the requested size is available.
// If == cmd_size: region is at beginning of ring.
// If < cmd_size: region intersects end of ring, pad with no-ops and retry.
if (WrapIntoRing(curr_index + cmd_size) < cmd_size) {
PadRingToEnd(curr_index);
continue;
}
const uint32_t curr_read_ptr_val =
atomic::Load(queue_resource_.Queue_read_ptr, std::memory_order_acquire);
if (curr_offset < curr_read_ptr_val && end_offset > curr_read_ptr_val) {
// Queue is wrapping and there is not enough space to recycle.
// Check whether the engine has finished using this region.
const RingIndexTy new_index = curr_index + cmd_size;
if (CanWriteUpto(new_index) == false) {
// Wait for read index to move and try again.
os::YieldThread();
continue;
}
if (atomic::Cas(&cached_reserve_offset_, end_offset, curr_offset,
std::memory_order_release) == curr_offset) {
return queue_start_addr_ + curr_offset;
// Try to reserve this part of the ring.
if (atomic::Cas(&cached_reserve_index_, new_index, curr_index, std::memory_order_release) ==
curr_index) {
return queue_start_addr_ + WrapIntoRing(curr_index);
}
// Another thread reserved curr_index, try again.
os::YieldThread();
}
return NULL;
}
void BlitSdma::UpdateWriteAndDoorbellRegister(uint32_t current_offset,
uint32_t new_offset) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::UpdateWriteAndDoorbellRegister(
RingIndexTy curr_index, RingIndexTy new_index) {
while (true) {
// Make sure that the address before ::current_offset is already released.
// Make sure that the address before ::curr_index is already released.
// Otherwise the CP may read invalid packets.
if (atomic::Load(&cached_commit_offset_, std::memory_order_acquire) ==
current_offset) {
if (atomic::Load(&cached_commit_index_, std::memory_order_acquire) == curr_index) {
if (core::Runtime::runtime_singleton_->flag().sdma_wait_idle()) {
// TODO: remove when sdma wpointer issue is resolved.
// Wait until the SDMA engine finish processing all packets before
// updating the wptr and doorbell.
while (atomic::Load(queue_resource_.Queue_read_ptr,
std::memory_order_acquire) != current_offset) {
while (WrapIntoRing(*reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_read_ptr)) !=
WrapIntoRing(curr_index)) {
os::YieldThread();
}
}
// Update write pointer and doorbel register.
atomic::Store(queue_resource_.Queue_write_ptr, new_offset);
*reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_write_ptr) =
(HwIndexMonotonic ? new_index : WrapIntoRing(new_index));
// Ensure write pointer is visible to GPU before doorbell.
std::atomic_thread_fence(std::memory_order_release);
atomic::Store(queue_resource_.Queue_DoorBell, new_offset);
*reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_DoorBell) =
(HwIndexMonotonic ? new_index : WrapIntoRing(new_index));
std::atomic_thread_fence(std::memory_order_release);
atomic::Store(&cached_commit_offset_, new_offset);
atomic::Store(&cached_commit_index_, new_index, std::memory_order_release);
break;
}
// Waiting for another thread to submit preceding commands first.
os::YieldThread();
}
}
void BlitSdma::ReleaseWriteAddress(char* cmd_addr, uint32_t cmd_size) {
assert(cmd_addr != NULL);
assert(cmd_addr >= queue_start_addr_);
if (cmd_size > queue_size_) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::ReleaseWriteAddress(
RingIndexTy curr_index, uint32_t cmd_size) {
if (cmd_size > kQueueSize) {
assert(false && "cmd_addr is outside the queue buffer range");
return;
}
// Update write register.
const uint32_t curent_offset = cmd_addr - queue_start_addr_;
const uint32_t new_offset = curent_offset + cmd_size;
UpdateWriteAndDoorbellRegister(curent_offset, new_offset);
UpdateWriteAndDoorbellRegister(curr_index, curr_index + cmd_size);
}
void BlitSdma::WrapQueue(uint32_t cmd_size) {
// Re-determine the offset into queue buffer where NOOP instructions
// should be written.
while (true) {
const uint32_t full_offset = queue_size_ + 1;
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::PadRingToEnd(
RingIndexTy curr_index) {
// Reserve region from here to the end of the ring.
RingIndexTy new_index = curr_index + (kQueueSize - WrapIntoRing(curr_index));
uint32_t curent_offset =
atomic::Load(&cached_reserve_offset_, std::memory_order_acquire);
const uint32_t end_offset = curent_offset + cmd_size;
if (end_offset < queue_size_) {
return;
}
// Check whether the engine has finished using this region.
if (CanWriteUpto(new_index) == false) {
// Wait for read index to move and try again.
return;
}
if (curent_offset == full_offset) {
// Another thread is already wrapping the queue.
continue;
}
if (atomic::Cas(&cached_reserve_index_, new_index, curr_index, std::memory_order_release) ==
curr_index) {
// Write and submit NOP commands in reserved region.
char* nop_address = queue_start_addr_ + WrapIntoRing(curr_index);
memset(nop_address, 0, new_index - curr_index);
// Close reservation to queue temporarily by "making" it full.
if (atomic::Cas(&cached_reserve_offset_, full_offset, curent_offset,
std::memory_order_release) == curent_offset) {
// Wait till all reserved packets are commited.
while (atomic::Load(&cached_commit_offset_, std::memory_order_acquire) !=
curent_offset) {
os::YieldThread();
}
// Fill the remainder of the queue with NOOP commands.
char* noop_address = queue_start_addr_ + curent_offset;
const size_t noop_commands_size = queue_size_ - curent_offset;
memset(noop_address, 0, noop_commands_size);
// Update write and doorbell registers to execute NOOP instructions.
UpdateWriteAndDoorbellRegister(curent_offset, 0);
// Open access to queue.
atomic::Store(&cached_reserve_offset_, 0U, std::memory_order_release);
}
UpdateWriteAndDoorbellRegister(curr_index, new_index);
}
}
void BlitSdma::BuildFenceCommand(char* fence_command_addr, uint32_t* fence,
uint32_t fence_value) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::WrapIntoRing(
RingIndexTy index) {
return index & (kQueueSize - 1);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
bool BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::CanWriteUpto(
RingIndexTy upto_index) {
// Get/calculate the monotonic read index.
RingIndexTy hw_read_index = *reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_read_ptr);
RingIndexTy read_index;
if (HwIndexMonotonic) {
read_index = hw_read_index;
} else {
// Calculate distance from commit index to HW read index.
// Commit index is always < kQueueSize away from HW read index.
RingIndexTy commit_index = atomic::Load(&cached_commit_index_, std::memory_order_relaxed);
RingIndexTy dist_to_read_index = WrapIntoRing(commit_index - hw_read_index);
read_index = commit_index - dist_to_read_index;
}
// Check whether the read pointer has passed the given index.
// At most we can submit (kQueueSize - 1) bytes at a time.
return (upto_index - read_index) < kQueueSize;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildFenceCommand(
char* fence_command_addr, uint32_t* fence, uint32_t fence_value) {
assert(fence_command_addr != NULL);
SDMA_PKT_FENCE* packet_addr =
reinterpret_cast<SDMA_PKT_FENCE*>(fence_command_addr);
@@ -909,7 +898,8 @@ void BlitSdma::BuildFenceCommand(char* fence_command_addr, uint32_t* fence,
packet_addr->DATA_UNION.data = fence_value;
}
uint32_t* BlitSdma::ObtainFenceObject() {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
uint32_t* BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::ObtainFenceObject() {
const uint32_t fence_index =
atomic::Add(&fence_pool_counter_, 1U, std::memory_order_acquire);
uint32_t* fence_addr = &fence_base_addr_[fence_index & fence_pool_mask_];
@@ -917,7 +907,9 @@ uint32_t* BlitSdma::ObtainFenceObject() {
return fence_addr;
}
void BlitSdma::WaitFence(uint32_t* fence, uint32_t fence_value) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::WaitFence(uint32_t* fence,
uint32_t fence_value) {
int spin_count = 51;
while (atomic::Load(fence, std::memory_order_acquire) != fence_value) {
if (--spin_count > 0) {
@@ -927,12 +919,13 @@ void BlitSdma::WaitFence(uint32_t* fence, uint32_t fence_value) {
}
}
void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command,
void* dst, const void* src, size_t size) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildCopyCommand(
char* cmd_addr, uint32_t num_copy_command, void* dst, const void* src, size_t size) {
size_t cur_size = 0;
for (uint32_t i = 0; i < num_copy_command; ++i) {
const uint32_t copy_size = static_cast<uint32_t>(
std::min((size - cur_size), max_single_linear_copy_size_));
const uint32_t copy_size =
static_cast<uint32_t>(std::min((size - cur_size), kMaxSingleCopySize));
void* cur_dst = static_cast<char*>(dst) + cur_size;
const void* cur_src = static_cast<const char*>(src) + cur_size;
@@ -945,7 +938,7 @@ void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command,
packet_addr->HEADER_UNION.op = SDMA_OP_COPY;
packet_addr->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR;
packet_addr->COUNT_UNION.count = copy_size;
packet_addr->COUNT_UNION.count = copy_size + SizeToCountOffset;
packet_addr->SRC_ADDR_LO_UNION.src_addr_31_0 = ptrlow32(cur_src);
packet_addr->SRC_ADDR_HI_UNION.src_addr_63_32 = ptrhigh32(cur_src);
@@ -960,8 +953,9 @@ void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command,
assert(cur_size == size);
}
void BlitSdma::BuildPollCommand(char* cmd_addr, void* addr,
uint32_t reference) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildPollCommand(
char* cmd_addr, void* addr, uint32_t reference) {
SDMA_PKT_POLL_REGMEM* packet_addr =
reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(cmd_addr);
@@ -981,7 +975,9 @@ void BlitSdma::BuildPollCommand(char* cmd_addr, void* addr,
packet_addr->DW5_UNION.retry_count = 0xfff; // Retry forever.
}
void BlitSdma::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildAtomicDecrementCommand(
char* cmd_addr, void* addr) {
SDMA_PKT_ATOMIC* packet_addr = reinterpret_cast<SDMA_PKT_ATOMIC*>(cmd_addr);
memset(packet_addr, 0, sizeof(SDMA_PKT_ATOMIC));
@@ -996,8 +992,9 @@ void BlitSdma::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) {
packet_addr->SRC_DATA_HI_UNION.src_data_63_32 = 0xffffffff;
}
void BlitSdma::BuildGetGlobalTimestampCommand(char* cmd_addr,
void* write_address) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildGetGlobalTimestampCommand(
char* cmd_addr, void* write_address) {
SDMA_PKT_TIMESTAMP* packet_addr =
reinterpret_cast<SDMA_PKT_TIMESTAMP*>(cmd_addr);
@@ -1010,7 +1007,8 @@ void BlitSdma::BuildGetGlobalTimestampCommand(char* cmd_addr,
packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(write_address);
}
void BlitSdma::BuildTrapCommand(char* cmd_addr) {
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildTrapCommand(char* cmd_addr) {
SDMA_PKT_TRAP* packet_addr =
reinterpret_cast<SDMA_PKT_TRAP*>(cmd_addr);
@@ -1018,4 +1016,8 @@ void BlitSdma::BuildTrapCommand(char* cmd_addr) {
packet_addr->HEADER_UNION.op = SDMA_OP_TRAP;
}
template class BlitSdma<uint32_t, false, 0>;
template class BlitSdma<uint64_t, true, -1>;
} // namespace amd
@@ -389,9 +389,9 @@ bool GpuAgent::InitEndTsPool() {
return true;
}
end_ts_pool_size_ = static_cast<uint32_t>(
(BlitSdma::kQueueSize + BlitSdma::kCopyPacketSize - 1) /
(BlitSdma::kCopyPacketSize));
end_ts_pool_size_ =
static_cast<uint32_t>((BlitSdmaBase::kQueueSize + BlitSdmaBase::kCopyPacketSize - 1) /
(BlitSdmaBase::kCopyPacketSize));
// Allocate end timestamp object for both h2d and d2h DMA.
const size_t alloc_size = 2 * end_ts_pool_size_ * kTsSize;
@@ -510,7 +510,13 @@ core::Queue* GpuAgent::CreateInterceptibleQueue() {
}
core::Blit* GpuAgent::CreateBlitSdma() {
BlitSdma* sdma = new BlitSdma();
core::Blit* sdma;
if (isa_->GetMajorVersion() <= 8) {
sdma = new BlitSdmaV2V3;
} else {
sdma = new BlitSdmaV4;
}
if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) {
sdma->Destroy(*this);