rocr: Remove SDMA code for gfx7 and gfx8

Remove deprecated SDMA code for gfx7 and gfx8 ASICs.
Этот коммит содержится в:
David Yat Sin
2025-07-07 22:12:20 +00:00
коммит произвёл Yat Sin, David
родитель 5285c24657
Коммит d3f70910e1
3 изменённых файлов: 105 добавлений и 185 удалений
+11 -25
Просмотреть файл
@@ -73,11 +73,7 @@ class BlitSdmaBase : public core::Blit {
core::Signal& out_signal) = 0;
};
// RingIndexTy: 32/64-bit monotonic ring index, counting in bytes.
// HwIndexMonotonic: true if SDMA HW index is monotonic, false if it wraps at end of ring.
// SizeToCountOffset: value added to size (in bytes) to form SDMA command count field.
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
class BlitSdma : public BlitSdmaBase {
template <bool useGCR> class BlitSdma : public BlitSdmaBase {
public:
BlitSdma();
@@ -163,9 +159,9 @@ class BlitSdma : public BlitSdmaBase {
/// could be written. NULL if input size is greater than the size of queue
/// buffer.
char* AcquireWriteAddress(uint32_t cmd_size, RingIndexTy& curr_index);
char* AcquireWriteAddress(uint32_t cmd_size, uint64_t& curr_index);
void UpdateWriteAndDoorbellRegister(RingIndexTy curr_index, RingIndexTy new_index);
void UpdateWriteAndDoorbellRegister(uint64_t curr_index, uint64_t new_index);
/// @brief Updates the Write Register of compute device to the end of
/// SDMA packet written into queue buffer. The update to Write Register
@@ -178,16 +174,16 @@ class BlitSdma : public BlitSdmaBase {
/// @param curr_index Index passed back from AcquireWriteAddress.
///
/// @param cmd_size Command packet size in bytes.
void ReleaseWriteAddress(RingIndexTy curr_index, uint32_t cmd_size);
void ReleaseWriteAddress(uint64_t curr_index, uint32_t cmd_size);
/// @brief Writes NO-OP words into queue buffer in case writing a command
/// causes the queue buffer to wrap.
///
/// @param curr_index Index to begin padding from.
void PadRingToEnd(RingIndexTy curr_index);
void PadRingToEnd(uint64_t curr_index);
uint32_t WrapIntoRing(RingIndexTy index);
bool CanWriteUpto(RingIndexTy upto_index);
uint32_t WrapIntoRing(uint64_t index);
bool CanWriteUpto(uint64_t upto_index);
/// @brief Build fence command
void BuildFenceCommand(char* fence_command_addr, uint32_t* fence,
@@ -265,8 +261,8 @@ class BlitSdma : public BlitSdmaBase {
HsaQueueResource queue_resource_;
// Monotonic ring indices, in bytes, tracking written and submitted commands.
RingIndexTy cached_reserve_index_;
RingIndexTy cached_commit_index_;
uint64_t cached_reserve_index_;
uint64_t cached_commit_index_;
static const uint32_t linear_copy_command_size_;
@@ -314,21 +310,11 @@ class BlitSdma : public BlitSdmaBase {
size_t min_submission_size_;
};
// Ring indices are 32-bit.
// HW ring indices are not monotonic (wrap at end of ring).
// Count fields of SDMA commands are 0-based.
typedef BlitSdma<uint32_t, false, 0, false> BlitSdmaV2V3;
// Ring indices are 64-bit.
// HW ring indices are monotonic (do not wrap at end of ring).
// Count fields of SDMA commands are 1-based.
typedef BlitSdma<uint64_t, true, -1, false> BlitSdmaV4;
typedef BlitSdma<false> BlitSdmaV4;
// Ring indices are 64-bit.
// HW ring indices are monotonic (do not wrap at end of ring).
// Count fields of SDMA commands are 1-based.
// SDMA is connected to gL2.
typedef BlitSdma<uint64_t, true, -1, true> BlitSdmaV5;
typedef BlitSdma<true> BlitSdmaV5;
} // namespace amd
} // namespace rocr
+94 -156
Просмотреть файл
@@ -77,44 +77,33 @@ const size_t BlitSdmaBase::kMaxSingleCopySize = SDMA_PKT_COPY_LINEAR::kMaxSize_;
const size_t BlitSdmaBase::kMaxSingleFillSize = SDMA_PKT_CONSTANT_FILL::kMaxSize_;
// Initialize the sizes of the various SDMA commands used by this module
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::linear_copy_command_size_ = sizeof(SDMA_PKT_COPY_LINEAR);
template <bool useGCR>
const uint32_t BlitSdma<useGCR>::linear_copy_command_size_ = sizeof(SDMA_PKT_COPY_LINEAR);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::fill_command_size_ = sizeof(SDMA_PKT_CONSTANT_FILL);
template <bool useGCR>
const uint32_t BlitSdma<useGCR>::fill_command_size_ = sizeof(SDMA_PKT_CONSTANT_FILL);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::fence_command_size_ = sizeof(SDMA_PKT_FENCE);
template <bool useGCR>
const uint32_t BlitSdma<useGCR>::fence_command_size_ = sizeof(SDMA_PKT_FENCE);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM);
template <bool useGCR>
const uint32_t BlitSdma<useGCR>::poll_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::flush_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM);
template <bool useGCR>
const uint32_t BlitSdma<useGCR>::flush_command_size_ = sizeof(SDMA_PKT_POLL_REGMEM);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC);
template <bool useGCR>
const uint32_t BlitSdma<useGCR>::atomic_command_size_ = sizeof(SDMA_PKT_ATOMIC);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP);
template <bool useGCR>
const uint32_t BlitSdma<useGCR>::timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::trap_command_size_ = sizeof(SDMA_PKT_TRAP);
template <bool useGCR> const uint32_t BlitSdma<useGCR>::trap_command_size_ = sizeof(SDMA_PKT_TRAP);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
const uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::gcr_command_size_ = sizeof(SDMA_PKT_GCR);
template <bool useGCR> const uint32_t BlitSdma<useGCR>::gcr_command_size_ = sizeof(SDMA_PKT_GCR);
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BlitSdma()
template <bool useGCR>
BlitSdma<useGCR>::BlitSdma()
: agent_(NULL),
queue_start_addr_(NULL),
bytes_queued_(0),
@@ -129,12 +118,11 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BlitSdma()
std::memset(&queue_resource_, 0, sizeof(queue_resource_));
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::~BlitSdma() {}
template <bool useGCR> BlitSdma<useGCR>::~BlitSdma() {}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::Initialize(
const core::Agent& agent, bool use_xgmi, size_t linear_copy_size_override, int rec_eng) {
template <bool useGCR>
hsa_status_t BlitSdma<useGCR>::Initialize(const core::Agent& agent, bool use_xgmi,
size_t linear_copy_size_override, int rec_eng) {
if (queue_start_addr_ != NULL) {
// Already initialized.
return HSA_STATUS_SUCCESS;
@@ -201,7 +189,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
cached_reserve_index_ = *reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_write_ptr);
cached_reserve_index_ = *reinterpret_cast<uint64_t*>(queue_resource_.Queue_write_ptr);
cached_commit_index_ = cached_reserve_index_;
if (core::g_use_interrupt_wait) {
@@ -218,9 +206,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
return HSA_STATUS_SUCCESS;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::Destroy(
const core::Agent& agent) {
template <bool useGCR> hsa_status_t BlitSdma<useGCR>::Destroy(const core::Agent& agent) {
// Release all allocated resources and reset them to zero.
if (queue_resource_.QueueId != 0) {
@@ -245,9 +231,8 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
return HSA_STATUS_SUCCESS;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::SubmitBlockingCommand(const void* cmd, size_t cmd_size,
template <bool useGCR>
hsa_status_t BlitSdma<useGCR>::SubmitBlockingCommand(const void* cmd, size_t cmd_size,
uint64_t size) {
ScopedAcquire<KernelMutex> lock(&lock_);
@@ -278,11 +263,11 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
return ret;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::SubmitCommand(
const void* cmd, size_t cmd_size, uint64_t size, const std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal, std::vector<core::Signal*>& gang_signals) {
template <bool useGCR>
hsa_status_t BlitSdma<useGCR>::SubmitCommand(const void* cmd, size_t cmd_size, uint64_t size,
const std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal,
std::vector<core::Signal*>& gang_signals) {
uint32_t num_poll_command = 0;
// Cached copy of dep_signals[i]->LoadRelaxed
@@ -355,9 +340,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
// Add space for acquire or release Hdp flush command
uint32_t flush_cmd_size = 0;
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
if ((HwIndexMonotonic) && (hdp_flush_support_)) {
flush_cmd_size = flush_command_size_;
}
if (hdp_flush_support_) flush_cmd_size = flush_command_size_;
}
// Add space for cache flush.
@@ -368,7 +351,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
const uint32_t pad_size = total_command_size < min_submission_size_ ?
min_submission_size_ - total_command_size : 0;
RingIndexTy curr_index;
uint64_t curr_index;
char* command_addr;
uint64_t prior_bytes, post_bytes;
{
@@ -426,7 +409,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
// Issue a Hdp flush cmd
if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
if ((HwIndexMonotonic) && (hdp_flush_support_)) {
if (hdp_flush_support_) {
BuildHdpFlushCommand(command_addr);
command_addr += flush_command_size_;
bytes_written_[wrapped_index] = prior_bytes;
@@ -542,9 +525,8 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
return HSA_STATUS_SUCCESS;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::SubmitLinearCopyCommand(void* dst, const void* src, size_t size) {
template <bool useGCR>
hsa_status_t BlitSdma<useGCR>::SubmitLinearCopyCommand(void* dst, const void* src, size_t size) {
// Break the copy into multiple copy operations in case the copy size exceeds
// the SDMA linear copy limit.
const size_t max_copy_size = max_single_linear_copy_size_ ? max_single_linear_copy_size_ :
@@ -557,9 +539,8 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
return SubmitBlockingCommand(&buff[0], buff.size() * sizeof(SDMA_PKT_COPY_LINEAR), size);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::SubmitLinearCopyCommand(void* dst, const void* src, size_t size,
template <bool useGCR>
hsa_status_t BlitSdma<useGCR>::SubmitLinearCopyCommand(void* dst, const void* src, size_t size,
std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal,
std::vector<core::Signal*>& gang_signals) {
@@ -577,9 +558,8 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
out_signal, gang_signals);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t
BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::SubmitCopyRectCommand(
template <bool useGCR>
hsa_status_t BlitSdma<useGCR>::SubmitCopyRectCommand(
const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
const hsa_dim3_t* src_offset, const hsa_dim3_t* range, std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) {
@@ -653,9 +633,8 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::SubmitCopyRe
out_signal, gang_signals);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::SubmitLinearFillCommand(void* ptr, uint32_t value, size_t count) {
template <bool useGCR>
hsa_status_t BlitSdma<useGCR>::SubmitLinearFillCommand(void* ptr, uint32_t value, size_t count) {
const size_t size = count * sizeof(uint32_t);
const uint32_t num_fill_command = (size + kMaxSingleFillSize - 1) / kMaxSingleFillSize;
@@ -666,15 +645,12 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
return SubmitBlockingCommand(&buff[0], buff.size() * sizeof(SDMA_PKT_CONSTANT_FILL), size);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::EnableProfiling(
bool enable) {
template <bool useGCR> hsa_status_t BlitSdma<useGCR>::EnableProfiling(bool enable) {
return HSA_STATUS_SUCCESS;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
char* BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::AcquireWriteAddress(
uint32_t cmd_size, RingIndexTy& curr_index) {
template <bool useGCR>
char* BlitSdma<useGCR>::AcquireWriteAddress(uint32_t cmd_size, uint64_t& curr_index) {
// Ring is full when all but one byte is written.
if (cmd_size >= kQueueSize) {
return nullptr;
@@ -692,7 +668,7 @@ char* BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::Acquir
}
// Check whether the engine has finished using this region.
const RingIndexTy new_index = curr_index + cmd_size;
const uint64_t new_index = curr_index + cmd_size;
if (CanWriteUpto(new_index) == false) {
// Wait for read index to move and try again.
@@ -713,10 +689,8 @@ char* BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::Acquir
return nullptr;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::UpdateWriteAndDoorbellRegister(RingIndexTy curr_index,
RingIndexTy new_index) {
template <bool useGCR>
void BlitSdma<useGCR>::UpdateWriteAndDoorbellRegister(uint64_t curr_index, uint64_t new_index) {
while (true) {
// Make sure that the address before ::curr_index is already released.
// Otherwise the CP may read invalid packets.
@@ -725,21 +699,19 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
// TODO: remove when sdma wpointer issue is resolved.
// Wait until the SDMA engine finishes processing all packets before
// updating the wptr and doorbell.
while (WrapIntoRing(*reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_read_ptr)) !=
while (WrapIntoRing(*reinterpret_cast<uint64_t*>(queue_resource_.Queue_read_ptr)) !=
WrapIntoRing(curr_index)) {
os::YieldThread();
}
}
// Update write pointer and doorbell register.
*reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_write_ptr) =
(HwIndexMonotonic ? new_index : WrapIntoRing(new_index));
*reinterpret_cast<uint64_t*>(queue_resource_.Queue_write_ptr) = new_index;
// Ensure write pointer is visible to GPU before doorbell.
std::atomic_thread_fence(std::memory_order_release);
*reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_DoorBell) =
(HwIndexMonotonic ? new_index : WrapIntoRing(new_index));
*reinterpret_cast<uint64_t*>(queue_resource_.Queue_DoorBell) = new_index;
atomic::Store(&cached_commit_index_, new_index, std::memory_order_release);
break;
@@ -750,9 +722,8 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
}
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::ReleaseWriteAddress(
RingIndexTy curr_index, uint32_t cmd_size) {
template <bool useGCR>
void BlitSdma<useGCR>::ReleaseWriteAddress(uint64_t curr_index, uint32_t cmd_size) {
if (cmd_size > kQueueSize) {
assert(false && "cmd_addr is outside the queue buffer range");
return;
@@ -761,11 +732,9 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::Release
UpdateWriteAndDoorbellRegister(curr_index, curr_index + cmd_size);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::PadRingToEnd(
RingIndexTy curr_index) {
template <bool useGCR> void BlitSdma<useGCR>::PadRingToEnd(uint64_t curr_index) {
// Reserve region from here to the end of the ring.
RingIndexTy new_index = curr_index + (kQueueSize - WrapIntoRing(curr_index));
uint64_t new_index = curr_index + (kQueueSize - WrapIntoRing(curr_index));
// Check whether the engine has finished using this region.
if (CanWriteUpto(new_index) == false) {
@@ -786,37 +755,22 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::PadRing
}
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::WrapIntoRing(
RingIndexTy index) {
template <bool useGCR> uint32_t BlitSdma<useGCR>::WrapIntoRing(uint64_t index) {
return index & (kQueueSize - 1);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
bool BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::CanWriteUpto(
RingIndexTy upto_index) {
template <bool useGCR> bool BlitSdma<useGCR>::CanWriteUpto(uint64_t upto_index) {
// Get/calculate the monotonic read index.
RingIndexTy hw_read_index = *reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_read_ptr);
RingIndexTy read_index;
if (HwIndexMonotonic) {
read_index = hw_read_index;
} else {
// Calculate distance from commit index to HW read index.
// Commit index is always < kQueueSize away from HW read index.
RingIndexTy commit_index = atomic::Load(&cached_commit_index_, std::memory_order_relaxed);
RingIndexTy dist_to_read_index = WrapIntoRing(commit_index - hw_read_index);
read_index = commit_index - dist_to_read_index;
}
uint64_t hw_read_index = *reinterpret_cast<uint64_t*>(queue_resource_.Queue_read_ptr);
// Check whether the read pointer has passed the given index.
// At most we can submit (kQueueSize - 1) bytes at a time.
return (upto_index - read_index) < kQueueSize;
return (upto_index - hw_read_index) < kQueueSize;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildFenceCommand(
char* fence_command_addr, uint32_t* fence, uint32_t fence_value) {
template <bool useGCR>
void BlitSdma<useGCR>::BuildFenceCommand(char* fence_command_addr, uint32_t* fence,
uint32_t fence_value) {
assert(fence_command_addr != NULL);
SDMA_PKT_FENCE* packet_addr =
reinterpret_cast<SDMA_PKT_FENCE*>(fence_command_addr);
@@ -836,9 +790,9 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildFe
packet_addr->DATA_UNION.data = fence_value;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildCopyCommand(
char* cmd_addr, uint32_t num_copy_command, void* dst, const void* src, size_t size) {
template <bool useGCR>
void BlitSdma<useGCR>::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command, void* dst,
const void* src, size_t size) {
size_t cur_size = 0;
const size_t max_copy_size = max_single_linear_copy_size_ ? max_single_linear_copy_size_ :
kMaxSingleCopySize;
@@ -858,9 +812,9 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildCo
packet_addr->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR;
if (max_copy_size == (1 << 30) -1)
packet_addr->COUNT_UNION.count_ext.count = copy_size + SizeToCountOffset;
packet_addr->COUNT_UNION.count_ext.count = copy_size - 1; /* count is 1-based */
else
packet_addr->COUNT_UNION.count.count = copy_size + SizeToCountOffset;
packet_addr->COUNT_UNION.count.count = copy_size - 1; /* count is 1-based */
packet_addr->SRC_ADDR_LO_UNION.src_addr_31_0 = ptrlow32(cur_src);
packet_addr->SRC_ADDR_HI_UNION.src_addr_63_32 = ptrhigh32(cur_src);
@@ -881,11 +835,12 @@ Elements are coded by the log2 of the element size in bytes (ie. element 0=1 byt
This routine breaks a large rect into tiles that can be handled by hardware. Pitches and offsets
must be representable in terms of elements in all tiles of the copy.
*/
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildCopyRectCommand(
const std::function<void*(size_t)>& append, const hsa_pitched_ptr_t* dst,
const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
const hsa_dim3_t* range) {
template <bool useGCR>
void BlitSdma<useGCR>::BuildCopyRectCommand(const std::function<void*(size_t)>& append,
const hsa_pitched_ptr_t* dst,
const hsa_dim3_t* dst_offset,
const hsa_pitched_ptr_t* src,
const hsa_dim3_t* src_offset, const hsa_dim3_t* range) {
// Returns the index of the first set bit (ie log2 of the largest power of 2 that evenly divides
// width), the largest element that perfectly covers width.
// width | 16 ensures that we don't return a higher element than is supported and avoids
@@ -1029,9 +984,9 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildCo
}
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildFillCommand(
char* cmd_addr, uint32_t num_fill_command, void* ptr, uint32_t value, size_t count) {
template <bool useGCR>
void BlitSdma<useGCR>::BuildFillCommand(char* cmd_addr, uint32_t num_fill_command, void* ptr,
uint32_t value, size_t count) {
char* cur_ptr = reinterpret_cast<char*>(ptr);
const uint32_t maxDwordCount = kMaxSingleFillSize / sizeof(uint32_t);
SDMA_PKT_CONSTANT_FILL* packet_addr = reinterpret_cast<SDMA_PKT_CONSTANT_FILL*>(cmd_addr);
@@ -1050,7 +1005,8 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildFi
packet_addr->DATA_UNION.src_data_31_0 = value;
packet_addr->COUNT_UNION.count = (fill_count + SizeToCountOffset) * sizeof(uint32_t);
/* count is 1-based */
packet_addr->COUNT_UNION.count = (fill_count - 1) * sizeof(uint32_t);
packet_addr++;
cur_ptr += fill_count * sizeof(uint32_t);
@@ -1059,9 +1015,8 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildFi
assert(count == 0 && "SDMA fill command count error.");
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildPollCommand(
char* cmd_addr, void* addr, uint32_t reference) {
template <bool useGCR>
void BlitSdma<useGCR>::BuildPollCommand(char* cmd_addr, void* addr, uint32_t reference) {
SDMA_PKT_POLL_REGMEM* packet_addr =
reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(cmd_addr);
@@ -1081,9 +1036,8 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildPo
packet_addr->DW5_UNION.retry_count = 0xfff; // Retry forever.
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) {
template <bool useGCR>
void BlitSdma<useGCR>::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) {
SDMA_PKT_ATOMIC* packet_addr = reinterpret_cast<SDMA_PKT_ATOMIC*>(cmd_addr);
memset(packet_addr, 0, sizeof(SDMA_PKT_ATOMIC));
@@ -1098,9 +1052,8 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
packet_addr->SRC_DATA_HI_UNION.src_data_63_32 = 0xffffffff;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
useGCR>::BuildGetGlobalTimestampCommand(char* cmd_addr, void* write_address) {
template <bool useGCR>
void BlitSdma<useGCR>::BuildGetGlobalTimestampCommand(char* cmd_addr, void* write_address) {
SDMA_PKT_TIMESTAMP* packet_addr =
reinterpret_cast<SDMA_PKT_TIMESTAMP*>(cmd_addr);
@@ -1113,9 +1066,7 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset,
packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(write_address);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildTrapCommand(
char* cmd_addr, uint32_t event_id) {
template <bool useGCR> void BlitSdma<useGCR>::BuildTrapCommand(char* cmd_addr, uint32_t event_id) {
SDMA_PKT_TRAP* packet_addr =
reinterpret_cast<SDMA_PKT_TRAP*>(cmd_addr);
@@ -1125,17 +1076,13 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildTr
packet_addr->INT_CONTEXT_UNION.int_ctx = event_id;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildHdpFlushCommand(
char* cmd_addr) {
template <bool useGCR> void BlitSdma<useGCR>::BuildHdpFlushCommand(char* cmd_addr) {
assert(cmd_addr != NULL);
SDMA_PKT_POLL_REGMEM* addr = reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(cmd_addr);
memcpy(addr, &hdp_flush_cmd, flush_command_size_);
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildGCRCommand(
char* cmd_addr, bool invalidate) {
template <bool useGCR> void BlitSdma<useGCR>::BuildGCRCommand(char* cmd_addr, bool invalidate) {
assert(cmd_addr != NULL);
assert(useGCR && "Unsupported SDMA command - GCR.");
SDMA_PKT_GCR* addr = reinterpret_cast<SDMA_PKT_GCR*>(cmd_addr);
@@ -1154,25 +1101,16 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildGC
addr->WORD2_UNION.GCR_CONTROL_GL2_RANGE = 0;
}
template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset, bool useGCR>
uint64_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::PendingBytes() {
RingIndexTy commit = atomic::Load(&cached_commit_index_, std::memory_order_acquire);
RingIndexTy hw_read_index = *reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_read_ptr);
RingIndexTy read;
if (HwIndexMonotonic) {
read = hw_read_index;
} else {
RingIndexTy dist_to_read_index = WrapIntoRing(commit - hw_read_index);
read = commit - dist_to_read_index;
}
template <bool useGCR> uint64_t BlitSdma<useGCR>::PendingBytes() {
uint64_t commit = atomic::Load(&cached_commit_index_, std::memory_order_acquire);
uint64_t hw_read_index = *reinterpret_cast<uint64_t*>(queue_resource_.Queue_read_ptr);
if (commit == read) return 0;
return bytes_queued_ - bytes_written_[WrapIntoRing(read)];
if (commit == hw_read_index) return 0;
return bytes_queued_ - bytes_written_[WrapIntoRing(hw_read_index)];
}
template class BlitSdma<uint32_t, false, 0, false>;
template class BlitSdma<uint64_t, true, -1, false>;
template class BlitSdma<uint64_t, true, -1, true>;
template class BlitSdma<false>;
template class BlitSdma<true>;
} // namespace amd
} // namespace rocr
-4
Просмотреть файл
@@ -711,10 +711,6 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi, int rec_eng) {
const size_t copy_size_overrides[2] = {0x3fffff, 0x3fffffff};
switch (isa_->GetMajorVersion()) {
case 7:
case 8:
sdma = new BlitSdmaV2V3();
break;
case 9:
sdma = new BlitSdmaV4();
copy_size_override = (isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 10) ?