Implement SDMA path for Gfx9
Gfx9 requires monotonic write pointer and doorbell. Count fields are 1-based compared with 0-based pre-Gfx9. - Restructure implementation to use monotonic ring indices - Remove redundant submission size checks (handled by AcquireWriteAddress) - Unify copy/fill per-command limit (documentation is unclear) Change-Id: I57c1675221d2e63aa319fee700d9951671e1bd65
This commit is contained in:
@@ -55,7 +55,19 @@
|
||||
#include "core/util/utils.h"
|
||||
|
||||
namespace amd {
|
||||
class BlitSdma : public core::Blit {
|
||||
// Non-template base class of the BlitSdma<> class template. It declares the
// static size limits, which are defined once in the implementation file as
// BlitSdmaBase:: members.
class BlitSdmaBase : public core::Blit {
|
||||
public:
|
||||
// Size in bytes of the SDMA queue (ring) buffer; defined as 1024 * 1024.
static const size_t kQueueSize;
|
||||
// Size in bytes of one linear copy packet; defined as
// sizeof(SDMA_PKT_COPY_LINEAR).
static const size_t kCopyPacketSize;
|
||||
// Largest number of bytes placed in a single linear copy command; larger
// copies are split into several commands.
static const size_t kMaxSingleCopySize;
|
||||
// Largest number of bytes placed in a single fill command; larger fills are
// split into several commands.
static const size_t kMaxSingleFillSize;
|
||||
};
|
||||
|
||||
// RingIndexTy: 32/64-bit monotonic ring index, counting in bytes.
|
||||
// HwIndexMonotonic: true if SDMA HW index is monotonic, false if it wraps at end of ring.
|
||||
// SizeToCountOffset: value added to size (in bytes) to form SDMA command count field.
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
class BlitSdma : public BlitSdmaBase {
|
||||
public:
|
||||
explicit BlitSdma();
|
||||
|
||||
@@ -113,10 +125,6 @@ class BlitSdma : public core::Blit {
|
||||
|
||||
virtual hsa_status_t EnableProfiling(bool enable) override;
|
||||
|
||||
static const size_t kQueueSize;
|
||||
|
||||
static const size_t kCopyPacketSize;
|
||||
|
||||
protected:
|
||||
/// @brief Acquires the address into queue buffer where a new command
|
||||
/// packet of specified size could be written. The address that is
|
||||
@@ -126,13 +134,15 @@ class BlitSdma : public core::Blit {
|
||||
///
|
||||
/// @param cmd_size Command packet size in bytes.
|
||||
///
|
||||
/// @param curr_index (output) Index to pass to ReleaseWriteAddress.
|
||||
///
|
||||
/// @return pointer into the queue buffer where a PM4 packet of specified size
|
||||
/// could be written. NULL if input size is greater than or equal to the
|
||||
/// size of the queue buffer.
|
||||
char* AcquireWriteAddress(uint32_t cmd_size);
|
||||
|
||||
void UpdateWriteAndDoorbellRegister(uint32_t current_offset,
|
||||
uint32_t new_offset);
|
||||
char* AcquireWriteAddress(uint32_t cmd_size, RingIndexTy& curr_index);
|
||||
|
||||
void UpdateWriteAndDoorbellRegister(RingIndexTy curr_index, RingIndexTy new_index);
|
||||
|
||||
/// @brief Updates the Write Register of compute device to the end of
|
||||
/// SDMA packet written into queue buffer. The update to Write Register
|
||||
@@ -142,17 +152,19 @@ class BlitSdma : public core::Blit {
|
||||
/// will block until T1 has completed its update (assumes T1 acquired the
|
||||
/// write address first).
|
||||
///
|
||||
/// @param cmd_addr pointer into the queue buffer where a PM4 packet was
|
||||
/// written.
|
||||
/// @param curr_index Index passed back from AcquireWriteAddress.
|
||||
///
|
||||
/// @param cmd_size Command packet size in bytes.
|
||||
void ReleaseWriteAddress(char* cmd_addr, uint32_t cmd_size);
|
||||
void ReleaseWriteAddress(RingIndexTy curr_index, uint32_t cmd_size);
|
||||
|
||||
/// @brief Writes NO-OP words into queue buffer in case writing a command
|
||||
/// causes the queue buffer to wrap.
|
||||
///
|
||||
/// @param cmd_size Size in bytes of command causing queue buffer to wrap.
|
||||
void WrapQueue(uint32_t cmd_size);
|
||||
/// @param curr_index Index to begin padding from.
|
||||
void PadRingToEnd(RingIndexTy curr_index);
|
||||
|
||||
uint32_t WrapIntoRing(RingIndexTy index);
|
||||
bool CanWriteUpto(RingIndexTy upto_index);
|
||||
|
||||
/// @brief Build fence command
|
||||
void BuildFenceCommand(char* fence_command_addr, uint32_t* fence,
|
||||
@@ -176,9 +188,6 @@ class BlitSdma : public core::Blit {
|
||||
// Agent object owning the SDMA engine.
|
||||
GpuAgent* agent_;
|
||||
|
||||
/// Indicates size of Queue buffer in bytes.
|
||||
uint32_t queue_size_;
|
||||
|
||||
/// Base address of the Queue buffer at construction time.
|
||||
char* queue_start_addr_;
|
||||
|
||||
@@ -191,20 +200,9 @@ class BlitSdma : public core::Blit {
|
||||
/// and write indices
|
||||
HsaQueueResource queue_resource_;
|
||||
|
||||
/// @brief Current address of execution in Queue buffer.
|
||||
///
|
||||
/// @note: The value of address is obtained by reading
|
||||
/// the value of Write Register of the compute device.
|
||||
/// Users should write to the Queue buffer at the current
|
||||
/// address, else it will lead to execution error and potentially
|
||||
/// a hang.
|
||||
///
|
||||
/// @note: The value of Write Register does not always begin
|
||||
/// with Zero after a Queue has been created. This needs to be
|
||||
/// understood better. This means that current address number of
|
||||
/// words of Queue buffer is unavailable for use.
|
||||
volatile uint32_t cached_reserve_offset_;
|
||||
volatile uint32_t cached_commit_offset_;
|
||||
// Monotonic ring indices, in bytes, tracking written and submitted commands.
|
||||
RingIndexTy cached_reserve_index_;
|
||||
RingIndexTy cached_commit_index_;
|
||||
|
||||
uint32_t linear_copy_command_size_;
|
||||
|
||||
@@ -235,6 +233,19 @@ class BlitSdma : public core::Blit {
|
||||
/// True if platform atomic is supported.
|
||||
bool platform_atomic_support_;
|
||||
};
|
||||
|
||||
// Concrete BlitSdma instantiation with RingIndexTy = uint32_t,
// HwIndexMonotonic = false and SizeToCountOffset = 0. The class adds no
// members of its own; each template argument is explained below.
class BlitSdmaV2V3
|
||||
// Ring indices are 32-bit.
|
||||
// HW ring indices are not monotonic (wrap at end of ring).
|
||||
// Count fields of SDMA commands are 0-based.
|
||||
: public BlitSdma<uint32_t, false, 0> {};
|
||||
|
||||
// Concrete BlitSdma instantiation with RingIndexTy = uint64_t,
// HwIndexMonotonic = true and SizeToCountOffset = -1 (the value added to a
// byte size to form a command count field). The class adds no members of its
// own; each template argument is explained below.
class BlitSdmaV4
|
||||
// Ring indices are 64-bit.
|
||||
// HW ring indices are monotonic (do not wrap at end of ring).
|
||||
// Count fields of SDMA commands are 1-based.
|
||||
: public BlitSdma<uint64_t, true, -1> {};
|
||||
|
||||
} // namespace amd
|
||||
|
||||
#endif // header guard
|
||||
|
||||
@@ -46,14 +46,13 @@
|
||||
#include <atomic>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
|
||||
#include "core/inc/amd_gpu_agent.h"
|
||||
#include "core/inc/amd_memory_region.h"
|
||||
#include "core/inc/runtime.h"
|
||||
#include "core/inc/signal.h"
|
||||
|
||||
#define SDMA_QUEUE_SIZE 1024 * 1024
|
||||
|
||||
namespace amd {
|
||||
// SDMA packet for VI device.
|
||||
// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt
|
||||
@@ -373,29 +372,33 @@ inline uint32_t ptrhigh32(const void* p) {
|
||||
#endif
|
||||
}
|
||||
|
||||
const size_t BlitSdma::kQueueSize = SDMA_QUEUE_SIZE;
|
||||
const size_t BlitSdma::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR);
|
||||
// Queue (ring) buffer size in bytes. WrapIntoRing() masks indices with
// (kQueueSize - 1), so this value must stay a power of two.
const size_t BlitSdmaBase::kQueueSize = 1024 * 1024;
|
||||
// Size in bytes of a single linear copy packet.
const size_t BlitSdmaBase::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR);
|
||||
// Maximum number of bytes transferred by one linear copy command.
const size_t BlitSdmaBase::kMaxSingleCopySize = 0x3fffe0; // From HW documentation
|
||||
// Maximum number of bytes written by one fill command; set to the same value
// as the copy limit.
const size_t BlitSdmaBase::kMaxSingleFillSize = 0x3fffe0;
|
||||
|
||||
BlitSdma::BlitSdma()
|
||||
: core::Blit(),
|
||||
agent_(NULL),
|
||||
queue_size_(0),
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BlitSdma()
|
||||
: agent_(NULL),
|
||||
queue_start_addr_(NULL),
|
||||
fence_base_addr_(NULL),
|
||||
fence_pool_size_(0),
|
||||
fence_pool_counter_(0),
|
||||
cached_reserve_offset_(0),
|
||||
cached_commit_offset_(0),
|
||||
cached_reserve_index_(0),
|
||||
cached_commit_index_(0),
|
||||
platform_atomic_support_(true) {
|
||||
std::memset(&queue_resource_, 0, sizeof(queue_resource_));
|
||||
}
|
||||
|
||||
BlitSdma::~BlitSdma() {}
|
||||
// Destructor with an empty body: nothing is released here. The queue buffer
// and the fence pool are freed in Destroy().
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::~BlitSdma() {}
|
||||
|
||||
hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initialize(
|
||||
const core::Agent& agent) {
|
||||
agent_ = reinterpret_cast<amd::GpuAgent*>(&const_cast<core::Agent&>(agent));
|
||||
|
||||
if (queue_start_addr_ != NULL && queue_size_ != 0) {
|
||||
if (queue_start_addr_ != NULL) {
|
||||
// Already initialized.
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -412,26 +415,6 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
|
||||
timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP);
|
||||
trap_command_size_ = sizeof(SDMA_PKT_TRAP);
|
||||
|
||||
const uint32_t sync_command_size = fence_command_size_;
|
||||
const uint32_t max_num_copy_command =
|
||||
std::floor((static_cast<uint32_t>(queue_size_) - sync_command_size) /
|
||||
linear_copy_command_size_);
|
||||
const uint32_t max_num_fill_command =
|
||||
std::floor((static_cast<uint32_t>(queue_size_) - sync_command_size) /
|
||||
fill_command_size_);
|
||||
|
||||
max_single_linear_copy_size_ = 0x3fffe0;
|
||||
max_total_linear_copy_size_ = static_cast<size_t>(
|
||||
std::min(static_cast<uint64_t>(SIZE_MAX),
|
||||
static_cast<uint64_t>(max_num_copy_command) *
|
||||
static_cast<uint64_t>(max_single_linear_copy_size_)));
|
||||
|
||||
max_single_fill_size_ = (1 << 22) - sizeof(uint32_t);
|
||||
max_total_fill_size_ = static_cast<size_t>(
|
||||
std::min(static_cast<uint64_t>(SIZE_MAX),
|
||||
static_cast<uint64_t>(max_num_fill_command) *
|
||||
static_cast<uint64_t>(max_single_fill_size_)));
|
||||
|
||||
const amd::GpuAgentInt& amd_gpu_agent =
|
||||
static_cast<const amd::GpuAgentInt&>(agent);
|
||||
|
||||
@@ -445,35 +428,31 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
|
||||
}
|
||||
|
||||
// Allocate queue buffer.
|
||||
queue_size_ = kQueueSize;
|
||||
|
||||
queue_start_addr_ =
|
||||
(char*)core::Runtime::runtime_singleton_->system_allocator()(
|
||||
queue_size_, 0x1000, core::MemoryRegion::AllocateExecutable);
|
||||
queue_start_addr_ = (char*)core::Runtime::runtime_singleton_->system_allocator()(
|
||||
kQueueSize, 0x1000, core::MemoryRegion::AllocateExecutable);
|
||||
|
||||
if (queue_start_addr_ == NULL) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
std::memset(queue_start_addr_, 0, queue_size_);
|
||||
std::memset(queue_start_addr_, 0, kQueueSize);
|
||||
|
||||
// Access kernel driver to initialize the queue control block
|
||||
// This call binds user mode queue object to underlying compute
|
||||
// device.
|
||||
const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA;
|
||||
if (HSAKMT_STATUS_SUCCESS !=
|
||||
hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100,
|
||||
HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
|
||||
queue_size_, NULL, &queue_resource_)) {
|
||||
if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100,
|
||||
HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
|
||||
kQueueSize, NULL, &queue_resource_)) {
|
||||
Destroy(agent);
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
cached_reserve_offset_ = *(queue_resource_.Queue_write_ptr);
|
||||
cached_commit_offset_ = cached_reserve_offset_;
|
||||
cached_reserve_index_ = *reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_write_ptr);
|
||||
cached_commit_index_ = cached_reserve_index_;
|
||||
|
||||
fence_pool_size_ = static_cast<uint32_t>(
|
||||
(kQueueSize + fence_command_size_ - 1) / fence_command_size_);
|
||||
fence_pool_size_ =
|
||||
static_cast<uint32_t>((kQueueSize + fence_command_size_ - 1) / fence_command_size_);
|
||||
|
||||
fence_pool_mask_ = fence_pool_size_ - 1;
|
||||
|
||||
@@ -490,7 +469,9 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Destroy(
|
||||
const core::Agent& agent) {
|
||||
// Release all allocated resources and reset them to zero.
|
||||
|
||||
if (queue_resource_.QueueId != 0) {
|
||||
@@ -500,7 +481,7 @@ hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
|
||||
memset(&queue_resource_, 0, sizeof(queue_resource_));
|
||||
}
|
||||
|
||||
if (queue_start_addr_ != NULL && queue_size_ != 0) {
|
||||
if (queue_start_addr_ != NULL) {
|
||||
// Release queue buffer.
|
||||
core::Runtime::runtime_singleton_->system_deallocator()(queue_start_addr_);
|
||||
}
|
||||
@@ -509,24 +490,19 @@ hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
|
||||
core::Runtime::runtime_singleton_->system_deallocator()(fence_base_addr_);
|
||||
}
|
||||
|
||||
queue_size_ = 0;
|
||||
queue_start_addr_ = NULL;
|
||||
cached_reserve_offset_ = 0;
|
||||
cached_commit_offset_ = 0;
|
||||
cached_reserve_index_ = 0;
|
||||
cached_commit_index_ = 0;
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src,
|
||||
size_t size) {
|
||||
if (size > max_total_linear_copy_size_) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
|
||||
void* dst, const void* src, size_t size) {
|
||||
// Break the copy into multiple copy operations in case the copy size exceeds
|
||||
// the SDMA linear copy limit.
|
||||
const uint32_t num_copy_command =
|
||||
(size + max_single_linear_copy_size_ - 1) / max_single_linear_copy_size_;
|
||||
const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
|
||||
|
||||
const uint32_t total_copy_command_size =
|
||||
num_copy_command * linear_copy_command_size_;
|
||||
@@ -538,8 +514,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src,
|
||||
uint32_t* fence_addr = ObtainFenceObject();
|
||||
*fence_addr = 0;
|
||||
|
||||
char* command_addr = AcquireWriteAddress(total_command_size);
|
||||
char* const command_addr_temp = command_addr;
|
||||
RingIndexTy curr_index;
|
||||
char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
|
||||
|
||||
if (command_addr == NULL) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
@@ -551,20 +527,17 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src,
|
||||
|
||||
BuildFenceCommand(command_addr, fence_addr, kFenceValue);
|
||||
|
||||
ReleaseWriteAddress(command_addr_temp, total_command_size);
|
||||
ReleaseWriteAddress(curr_index, total_command_size);
|
||||
|
||||
WaitFence(fence_addr, kFenceValue);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t BlitSdma::SubmitLinearCopyCommand(
|
||||
void* dst, const void* src, size_t size,
|
||||
std::vector<core::Signal*>& dep_signals, core::Signal& out_signal) {
|
||||
if (size > max_total_linear_copy_size_) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
|
||||
void* dst, const void* src, size_t size, std::vector<core::Signal*>& dep_signals,
|
||||
core::Signal& out_signal) {
|
||||
// The signal is 64 bit value, and poll checks for 32 bit value. So we
|
||||
// need to use two poll operations per dependent signal.
|
||||
const uint32_t num_poll_command =
|
||||
@@ -574,8 +547,7 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
|
||||
|
||||
// Break the copy into multiple copy operations in case the copy size exceeds
|
||||
// the SDMA linear copy limit.
|
||||
const uint32_t num_copy_command =
|
||||
(size + max_single_linear_copy_size_ - 1) / max_single_linear_copy_size_;
|
||||
const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
|
||||
const uint32_t total_copy_command_size =
|
||||
num_copy_command * linear_copy_command_size_;
|
||||
|
||||
@@ -624,8 +596,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
|
||||
total_poll_command_size + total_copy_command_size + sync_command_size +
|
||||
total_timestamp_command_size + interrupt_command_size;
|
||||
|
||||
char* command_addr = AcquireWriteAddress(total_command_size);
|
||||
char* const command_addr_temp = command_addr;
|
||||
RingIndexTy curr_index;
|
||||
char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
|
||||
|
||||
if (command_addr == NULL) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
@@ -695,23 +667,19 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
|
||||
BuildTrapCommand(command_addr);
|
||||
}
|
||||
|
||||
ReleaseWriteAddress(command_addr_temp, total_command_size);
|
||||
ReleaseWriteAddress(curr_index, total_command_size);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
|
||||
size_t count) {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearFillCommand(
|
||||
void* ptr, uint32_t value, size_t count) {
|
||||
const size_t size = count * sizeof(uint32_t);
|
||||
|
||||
if (size > max_total_fill_size_) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
// Break the fill into multiple fill operations in case the fill size exceeds
|
||||
// the SDMA per-command fill limit.
|
||||
const uint32_t num_fill_command =
|
||||
(size + max_single_fill_size_ - 1) / max_single_fill_size_;
|
||||
const uint32_t num_fill_command = (size + kMaxSingleFillSize - 1) / kMaxSingleFillSize;
|
||||
|
||||
const uint32_t total_fill_command_size =
|
||||
num_fill_command * fill_command_size_;
|
||||
@@ -719,8 +687,8 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
|
||||
const uint32_t total_command_size =
|
||||
total_fill_command_size + fence_command_size_;
|
||||
|
||||
char* command_addr = AcquireWriteAddress(total_command_size);
|
||||
char* const command_addr_temp = command_addr;
|
||||
RingIndexTy curr_index;
|
||||
char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
|
||||
|
||||
if (command_addr == NULL) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
@@ -729,8 +697,8 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
|
||||
const uint32_t fill_command_size = fill_command_size_;
|
||||
size_t cur_size = 0;
|
||||
for (uint32_t i = 0; i < num_fill_command; ++i) {
|
||||
const uint32_t fill_size = static_cast<uint32_t>(
|
||||
std::min((size - cur_size), max_single_fill_size_));
|
||||
const uint32_t fill_size =
|
||||
static_cast<uint32_t>(std::min((size - cur_size), kMaxSingleFillSize));
|
||||
|
||||
void* cur_ptr = static_cast<char*>(ptr) + cur_size;
|
||||
|
||||
@@ -747,7 +715,7 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
|
||||
|
||||
packet_addr->DATA_UNION.src_data_31_0 = value;
|
||||
|
||||
packet_addr->COUNT_UNION.count = fill_size;
|
||||
packet_addr->COUNT_UNION.count = fill_size + SizeToCountOffset;
|
||||
|
||||
command_addr += fill_command_size;
|
||||
cur_size += fill_size;
|
||||
@@ -761,139 +729,160 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
|
||||
|
||||
BuildFenceCommand(command_addr, fence_addr, kFenceValue);
|
||||
|
||||
ReleaseWriteAddress(command_addr_temp, total_command_size);
|
||||
ReleaseWriteAddress(curr_index, total_command_size);
|
||||
|
||||
WaitFence(fence_addr, kFenceValue);
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
hsa_status_t BlitSdma::EnableProfiling(bool enable) {
|
||||
// Override of the profiling toggle declared virtual in the class header.
// This implementation does not use `enable`.
//
// @param enable Requested profiling state (ignored).
//
// @return Always HSA_STATUS_SUCCESS.
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::EnableProfiling(
|
||||
bool enable) {
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
char* BlitSdma::AcquireWriteAddress(uint32_t cmd_size) {
|
||||
if (cmd_size > queue_size_) {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
char* BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::AcquireWriteAddress(
|
||||
uint32_t cmd_size, RingIndexTy& curr_index) {
|
||||
// Ring is full when all but one byte is written.
|
||||
if (cmd_size >= kQueueSize) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
const uint32_t curr_offset =
|
||||
atomic::Load(&cached_reserve_offset_, std::memory_order_acquire);
|
||||
const uint32_t end_offset = curr_offset + cmd_size;
|
||||
curr_index = atomic::Load(&cached_reserve_index_, std::memory_order_acquire);
|
||||
|
||||
if (end_offset >= queue_size_) {
|
||||
// Queue buffer is not enough to contain the new command.
|
||||
WrapQueue(cmd_size);
|
||||
// Check whether a linear region of the requested size is available.
|
||||
// If == cmd_size: region is at beginning of ring.
|
||||
// If < cmd_size: region intersects end of ring, pad with no-ops and retry.
|
||||
if (WrapIntoRing(curr_index + cmd_size) < cmd_size) {
|
||||
PadRingToEnd(curr_index);
|
||||
continue;
|
||||
}
|
||||
|
||||
const uint32_t curr_read_ptr_val =
|
||||
atomic::Load(queue_resource_.Queue_read_ptr, std::memory_order_acquire);
|
||||
if (curr_offset < curr_read_ptr_val && end_offset > curr_read_ptr_val) {
|
||||
// Queue is wrapping and there is not enough space to recycle.
|
||||
// Check whether the engine has finished using this region.
|
||||
const RingIndexTy new_index = curr_index + cmd_size;
|
||||
|
||||
if (CanWriteUpto(new_index) == false) {
|
||||
// Wait for read index to move and try again.
|
||||
os::YieldThread();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (atomic::Cas(&cached_reserve_offset_, end_offset, curr_offset,
|
||||
std::memory_order_release) == curr_offset) {
|
||||
return queue_start_addr_ + curr_offset;
|
||||
// Try to reserve this part of the ring.
|
||||
if (atomic::Cas(&cached_reserve_index_, new_index, curr_index, std::memory_order_release) ==
|
||||
curr_index) {
|
||||
return queue_start_addr_ + WrapIntoRing(curr_index);
|
||||
}
|
||||
|
||||
// Another thread reserved curr_index, try again.
|
||||
os::YieldThread();
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void BlitSdma::UpdateWriteAndDoorbellRegister(uint32_t current_offset,
|
||||
uint32_t new_offset) {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::UpdateWriteAndDoorbellRegister(
|
||||
RingIndexTy curr_index, RingIndexTy new_index) {
|
||||
while (true) {
|
||||
// Make sure that the address before ::current_offset is already released.
|
||||
// Make sure that the address before ::curr_index is already released.
|
||||
// Otherwise the CP may read invalid packets.
|
||||
if (atomic::Load(&cached_commit_offset_, std::memory_order_acquire) ==
|
||||
current_offset) {
|
||||
if (atomic::Load(&cached_commit_index_, std::memory_order_acquire) == curr_index) {
|
||||
if (core::Runtime::runtime_singleton_->flag().sdma_wait_idle()) {
|
||||
// TODO: remove when sdma wpointer issue is resolved.
|
||||
// Wait until the SDMA engine finish processing all packets before
|
||||
// updating the wptr and doorbell.
|
||||
while (atomic::Load(queue_resource_.Queue_read_ptr,
|
||||
std::memory_order_acquire) != current_offset) {
|
||||
while (WrapIntoRing(*reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_read_ptr)) !=
|
||||
WrapIntoRing(curr_index)) {
|
||||
os::YieldThread();
|
||||
}
|
||||
}
|
||||
|
||||
// Update write pointer and doorbell register.
|
||||
atomic::Store(queue_resource_.Queue_write_ptr, new_offset);
|
||||
*reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_write_ptr) =
|
||||
(HwIndexMonotonic ? new_index : WrapIntoRing(new_index));
|
||||
|
||||
// Ensure write pointer is visible to GPU before doorbell.
|
||||
std::atomic_thread_fence(std::memory_order_release);
|
||||
|
||||
atomic::Store(queue_resource_.Queue_DoorBell, new_offset);
|
||||
*reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_DoorBell) =
|
||||
(HwIndexMonotonic ? new_index : WrapIntoRing(new_index));
|
||||
|
||||
std::atomic_thread_fence(std::memory_order_release);
|
||||
|
||||
atomic::Store(&cached_commit_offset_, new_offset);
|
||||
atomic::Store(&cached_commit_index_, new_index, std::memory_order_release);
|
||||
break;
|
||||
}
|
||||
|
||||
// Waiting for another thread to submit preceding commands first.
|
||||
os::YieldThread();
|
||||
}
|
||||
}
|
||||
|
||||
void BlitSdma::ReleaseWriteAddress(char* cmd_addr, uint32_t cmd_size) {
|
||||
assert(cmd_addr != NULL);
|
||||
assert(cmd_addr >= queue_start_addr_);
|
||||
|
||||
if (cmd_size > queue_size_) {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::ReleaseWriteAddress(
|
||||
RingIndexTy curr_index, uint32_t cmd_size) {
|
||||
if (cmd_size > kQueueSize) {
|
||||
assert(false && "cmd_addr is outside the queue buffer range");
|
||||
return;
|
||||
}
|
||||
|
||||
// Update write register.
|
||||
const uint32_t curent_offset = cmd_addr - queue_start_addr_;
|
||||
const uint32_t new_offset = curent_offset + cmd_size;
|
||||
UpdateWriteAndDoorbellRegister(curent_offset, new_offset);
|
||||
UpdateWriteAndDoorbellRegister(curr_index, curr_index + cmd_size);
|
||||
}
|
||||
|
||||
void BlitSdma::WrapQueue(uint32_t cmd_size) {
|
||||
// Re-determine the offset into queue buffer where NOOP instructions
|
||||
// should be written.
|
||||
while (true) {
|
||||
const uint32_t full_offset = queue_size_ + 1;
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::PadRingToEnd(
|
||||
RingIndexTy curr_index) {
|
||||
// Reserve region from here to the end of the ring.
|
||||
RingIndexTy new_index = curr_index + (kQueueSize - WrapIntoRing(curr_index));
|
||||
|
||||
uint32_t curent_offset =
|
||||
atomic::Load(&cached_reserve_offset_, std::memory_order_acquire);
|
||||
const uint32_t end_offset = curent_offset + cmd_size;
|
||||
if (end_offset < queue_size_) {
|
||||
return;
|
||||
}
|
||||
// Check whether the engine has finished using this region.
|
||||
if (CanWriteUpto(new_index) == false) {
|
||||
// Wait for read index to move and try again.
|
||||
return;
|
||||
}
|
||||
|
||||
if (curent_offset == full_offset) {
|
||||
// Another thread is already wrapping the queue.
|
||||
continue;
|
||||
}
|
||||
if (atomic::Cas(&cached_reserve_index_, new_index, curr_index, std::memory_order_release) ==
|
||||
curr_index) {
|
||||
// Write and submit NOP commands in reserved region.
|
||||
char* nop_address = queue_start_addr_ + WrapIntoRing(curr_index);
|
||||
memset(nop_address, 0, new_index - curr_index);
|
||||
|
||||
// Close reservation to queue temporarily by "making" it full.
|
||||
if (atomic::Cas(&cached_reserve_offset_, full_offset, curent_offset,
|
||||
std::memory_order_release) == curent_offset) {
|
||||
// Wait till all reserved packets are commited.
|
||||
while (atomic::Load(&cached_commit_offset_, std::memory_order_acquire) !=
|
||||
curent_offset) {
|
||||
os::YieldThread();
|
||||
}
|
||||
|
||||
// Fill the remainder of the queue with NOOP commands.
|
||||
char* noop_address = queue_start_addr_ + curent_offset;
|
||||
const size_t noop_commands_size = queue_size_ - curent_offset;
|
||||
memset(noop_address, 0, noop_commands_size);
|
||||
|
||||
// Update write and doorbell registers to execute NOOP instructions.
|
||||
UpdateWriteAndDoorbellRegister(curent_offset, 0);
|
||||
|
||||
// Open access to queue.
|
||||
atomic::Store(&cached_reserve_offset_, 0U, std::memory_order_release);
|
||||
}
|
||||
UpdateWriteAndDoorbellRegister(curr_index, new_index);
|
||||
}
|
||||
}
|
||||
|
||||
void BlitSdma::BuildFenceCommand(char* fence_command_addr, uint32_t* fence,
|
||||
uint32_t fence_value) {
|
||||
// Converts a monotonic ring index into a byte offset inside the queue buffer.
//
// @param index Monotonic ring index, in bytes.
//
// @return index masked with (kQueueSize - 1). The mask equals
// index modulo kQueueSize only because kQueueSize (1024 * 1024) is a power
// of two.
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::WrapIntoRing(
|
||||
RingIndexTy index) {
|
||||
return index & (kQueueSize - 1);
|
||||
}
|
||||
|
||||
// Reports whether the ring may be written up to (but not including)
// upto_index without running into data the engine has not read yet.
//
// @param upto_index Monotonic ring index, in bytes, one past the end of the
// region the caller wants to write.
//
// @return true when upto_index is less than kQueueSize bytes ahead of the
// monotonic read index, false otherwise.
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
bool BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::CanWriteUpto(
|
||||
RingIndexTy upto_index) {
|
||||
// Get/calculate the monotonic read index.
|
||||
// NOTE(review): the read pointer is loaded with a plain dereference rather
// than atomic::Load; confirm this is intended.
RingIndexTy hw_read_index = *reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_read_ptr);
|
||||
RingIndexTy read_index;
|
||||
|
||||
if (HwIndexMonotonic) {
|
||||
// The hardware value is already monotonic and is used unchanged.
read_index = hw_read_index;
|
||||
} else {
|
||||
// Calculate distance from commit index to HW read index.
|
||||
// Commit index is always < kQueueSize away from HW read index.
|
||||
RingIndexTy commit_index = atomic::Load(&cached_commit_index_, std::memory_order_relaxed);
|
||||
// Masking the difference yields the distance within one ring length.
RingIndexTy dist_to_read_index = WrapIntoRing(commit_index - hw_read_index);
|
||||
// Step back from the commit index to get the monotonic read index.
read_index = commit_index - dist_to_read_index;
|
||||
}
|
||||
|
||||
// Check whether the read pointer has passed the given index.
|
||||
// At most we can submit (kQueueSize - 1) bytes at a time.
|
||||
return (upto_index - read_index) < kQueueSize;
|
||||
}
|
||||
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildFenceCommand(
|
||||
char* fence_command_addr, uint32_t* fence, uint32_t fence_value) {
|
||||
assert(fence_command_addr != NULL);
|
||||
SDMA_PKT_FENCE* packet_addr =
|
||||
reinterpret_cast<SDMA_PKT_FENCE*>(fence_command_addr);
|
||||
@@ -909,7 +898,8 @@ void BlitSdma::BuildFenceCommand(char* fence_command_addr, uint32_t* fence,
|
||||
packet_addr->DATA_UNION.data = fence_value;
|
||||
}
|
||||
|
||||
uint32_t* BlitSdma::ObtainFenceObject() {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
uint32_t* BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::ObtainFenceObject() {
|
||||
const uint32_t fence_index =
|
||||
atomic::Add(&fence_pool_counter_, 1U, std::memory_order_acquire);
|
||||
uint32_t* fence_addr = &fence_base_addr_[fence_index & fence_pool_mask_];
|
||||
@@ -917,7 +907,9 @@ uint32_t* BlitSdma::ObtainFenceObject() {
|
||||
return fence_addr;
|
||||
}
|
||||
|
||||
void BlitSdma::WaitFence(uint32_t* fence, uint32_t fence_value) {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::WaitFence(uint32_t* fence,
|
||||
uint32_t fence_value) {
|
||||
int spin_count = 51;
|
||||
while (atomic::Load(fence, std::memory_order_acquire) != fence_value) {
|
||||
if (--spin_count > 0) {
|
||||
@@ -927,12 +919,13 @@ void BlitSdma::WaitFence(uint32_t* fence, uint32_t fence_value) {
|
||||
}
|
||||
}
|
||||
|
||||
void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command,
|
||||
void* dst, const void* src, size_t size) {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildCopyCommand(
|
||||
char* cmd_addr, uint32_t num_copy_command, void* dst, const void* src, size_t size) {
|
||||
size_t cur_size = 0;
|
||||
for (uint32_t i = 0; i < num_copy_command; ++i) {
|
||||
const uint32_t copy_size = static_cast<uint32_t>(
|
||||
std::min((size - cur_size), max_single_linear_copy_size_));
|
||||
const uint32_t copy_size =
|
||||
static_cast<uint32_t>(std::min((size - cur_size), kMaxSingleCopySize));
|
||||
|
||||
void* cur_dst = static_cast<char*>(dst) + cur_size;
|
||||
const void* cur_src = static_cast<const char*>(src) + cur_size;
|
||||
@@ -945,7 +938,7 @@ void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command,
|
||||
packet_addr->HEADER_UNION.op = SDMA_OP_COPY;
|
||||
packet_addr->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR;
|
||||
|
||||
packet_addr->COUNT_UNION.count = copy_size;
|
||||
packet_addr->COUNT_UNION.count = copy_size + SizeToCountOffset;
|
||||
|
||||
packet_addr->SRC_ADDR_LO_UNION.src_addr_31_0 = ptrlow32(cur_src);
|
||||
packet_addr->SRC_ADDR_HI_UNION.src_addr_63_32 = ptrhigh32(cur_src);
|
||||
@@ -960,8 +953,9 @@ void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command,
|
||||
assert(cur_size == size);
|
||||
}
|
||||
|
||||
void BlitSdma::BuildPollCommand(char* cmd_addr, void* addr,
|
||||
uint32_t reference) {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildPollCommand(
|
||||
char* cmd_addr, void* addr, uint32_t reference) {
|
||||
SDMA_PKT_POLL_REGMEM* packet_addr =
|
||||
reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(cmd_addr);
|
||||
|
||||
@@ -981,7 +975,9 @@ void BlitSdma::BuildPollCommand(char* cmd_addr, void* addr,
|
||||
packet_addr->DW5_UNION.retry_count = 0xfff; // Retry forever.
|
||||
}
|
||||
|
||||
void BlitSdma::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildAtomicDecrementCommand(
|
||||
char* cmd_addr, void* addr) {
|
||||
SDMA_PKT_ATOMIC* packet_addr = reinterpret_cast<SDMA_PKT_ATOMIC*>(cmd_addr);
|
||||
|
||||
memset(packet_addr, 0, sizeof(SDMA_PKT_ATOMIC));
|
||||
@@ -996,8 +992,9 @@ void BlitSdma::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) {
|
||||
packet_addr->SRC_DATA_HI_UNION.src_data_63_32 = 0xffffffff;
|
||||
}
|
||||
|
||||
void BlitSdma::BuildGetGlobalTimestampCommand(char* cmd_addr,
|
||||
void* write_address) {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildGetGlobalTimestampCommand(
|
||||
char* cmd_addr, void* write_address) {
|
||||
SDMA_PKT_TIMESTAMP* packet_addr =
|
||||
reinterpret_cast<SDMA_PKT_TIMESTAMP*>(cmd_addr);
|
||||
|
||||
@@ -1010,7 +1007,8 @@ void BlitSdma::BuildGetGlobalTimestampCommand(char* cmd_addr,
|
||||
packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(write_address);
|
||||
}
|
||||
|
||||
void BlitSdma::BuildTrapCommand(char* cmd_addr) {
|
||||
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
|
||||
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildTrapCommand(char* cmd_addr) {
|
||||
SDMA_PKT_TRAP* packet_addr =
|
||||
reinterpret_cast<SDMA_PKT_TRAP*>(cmd_addr);
|
||||
|
||||
@@ -1018,4 +1016,8 @@ void BlitSdma::BuildTrapCommand(char* cmd_addr) {
|
||||
|
||||
packet_addr->HEADER_UNION.op = SDMA_OP_TRAP;
|
||||
}
|
||||
|
||||
template class BlitSdma<uint32_t, false, 0>;  // Explicit instantiation: 32-bit ring index, HW index wraps at end of ring, count field = size + 0.
|
||||
template class BlitSdma<uint64_t, true, -1>;  // Explicit instantiation: 64-bit ring index, monotonic HW index, count field = size - 1.
|
||||
|
||||
} // namespace amd
|
||||
|
||||
@@ -389,9 +389,9 @@ bool GpuAgent::InitEndTsPool() {
|
||||
return true;
|
||||
}
|
||||
|
||||
end_ts_pool_size_ = static_cast<uint32_t>(
|
||||
(BlitSdma::kQueueSize + BlitSdma::kCopyPacketSize - 1) /
|
||||
(BlitSdma::kCopyPacketSize));
|
||||
end_ts_pool_size_ =
|
||||
static_cast<uint32_t>((BlitSdmaBase::kQueueSize + BlitSdmaBase::kCopyPacketSize - 1) /
|
||||
(BlitSdmaBase::kCopyPacketSize));
|
||||
|
||||
// Allocate end timestamp object for both h2d and d2h DMA.
|
||||
const size_t alloc_size = 2 * end_ts_pool_size_ * kTsSize;
|
||||
@@ -510,7 +510,13 @@ core::Queue* GpuAgent::CreateInterceptibleQueue() {
|
||||
}
|
||||
|
||||
core::Blit* GpuAgent::CreateBlitSdma() {
|
||||
BlitSdma* sdma = new BlitSdma();
|
||||
core::Blit* sdma;
|
||||
|
||||
if (isa_->GetMajorVersion() <= 8) {
|
||||
sdma = new BlitSdmaV2V3;
|
||||
} else {
|
||||
sdma = new BlitSdmaV4;
|
||||
}
|
||||
|
||||
if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) {
|
||||
sdma->Destroy(*this);
|
||||
|
||||
Referens i nytt ärende
Block a user