From 1cd46afe6d25a323ec972ad0739d5e366aa709c5 Mon Sep 17 00:00:00 2001
From: Jay Cornwall <Jay.Cornwall@amd.com>
Date: Tue, 24 Jan 2017 18:55:55 -0600
Subject: [PATCH] Implement SDMA path for Gfx9

Gfx9 requires monotonic write pointer and doorbell.
Cound fields are 1-based compared with 0-based pre-Gfx9.

- Restructure implementation to use monotonic ring indices
- Remove redundant submission size checks (handled by AcquireWriteAddress)
- Unify copy/fill per-command limit (documentation is unclear)

Change-Id: I57c1675221d2e63aa319fee700d9951671e1bd65
---
 runtime/hsa-runtime/core/inc/amd_blit_sdma.h  |  71 ++--
 .../core/runtime/amd_blit_sdma.cpp            | 354 +++++++++---------
 .../core/runtime/amd_gpu_agent.cpp            |  14 +-
 3 files changed, 229 insertions(+), 210 deletions(-)
diff --git a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
index 6212c3dcc2..fb0eb1abf8 100644
--- a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
+++ b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h
@@ -55,7 +55,19 @@
 #include "core/util/utils.h"
 
 namespace amd {
-class BlitSdma : public core::Blit {
+class BlitSdmaBase : public core::Blit {
+ public:
+  static const size_t kQueueSize;
+  static const size_t kCopyPacketSize;
+  static const size_t kMaxSingleCopySize;
+  static const size_t kMaxSingleFillSize;
+};
+
+// RingIndexTy: 32/64-bit monotonic ring index, counting in bytes.
+// HwIndexMonotonic: true if SDMA HW index is monotonic, false if it wraps at end of ring.
+// SizeToCountOffset: value added to size (in bytes) to form SDMA command count field.
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+class BlitSdma : public BlitSdmaBase {
  public:
   explicit BlitSdma();
 
@@ -113,10 +125,6 @@ class BlitSdma : public core::Blit {
 
   virtual hsa_status_t EnableProfiling(bool enable) override;
 
-  static const size_t kQueueSize;
-
-  static const size_t kCopyPacketSize;
-
  protected:
   /// @brief Acquires the address into queue buffer where a new command
   /// packet of specified size could be written. The address that is
@@ -126,13 +134,15 @@ class BlitSdma : public core::Blit {
   ///
   /// @param cmd_size Command packet size in bytes.
   ///
+  /// @param curr_index (output) Index to pass to ReleaseWriteAddress.
+  ///
   /// @return pointer into the queue buffer where a PM4 packet of specified size
   /// could be written. NULL if input size is greater than the size of queue
   /// buffer.
-  char* AcquireWriteAddress(uint32_t cmd_size);
 
-  void UpdateWriteAndDoorbellRegister(uint32_t current_offset,
-                                      uint32_t new_offset);
+  char* AcquireWriteAddress(uint32_t cmd_size, RingIndexTy& curr_index);
+
+  void UpdateWriteAndDoorbellRegister(RingIndexTy curr_index, RingIndexTy new_index);
 
   /// @brief Updates the Write Register of compute device to the end of
   /// SDMA packet written into queue buffer. The update to Write Register
@@ -142,17 +152,19 @@ class BlitSdma : public core::Blit {
   /// will block until T1 has completed its update (assumes T1 acquired the
   /// write address first).
   ///
-  /// @param cmd_addr pointer into the queue buffer where a PM4 packet was
-  /// written.
+  /// @param curr_index Index passed back from AcquireWriteAddress.
   ///
   /// @param cmd_size Command packet size in bytes.
-  void ReleaseWriteAddress(char* cmd_addr, uint32_t cmd_size);
+  void ReleaseWriteAddress(RingIndexTy curr_index, uint32_t cmd_size);
 
   /// @brief Writes NO-OP words into queue buffer in case writing a command
   /// causes the queue buffer to wrap.
   ///
-  /// @param cmd_size Size in bytes of command causing queue buffer to wrap.
-  void WrapQueue(uint32_t cmd_size);
+  /// @param curr_index Index to begin padding from.
+  void PadRingToEnd(RingIndexTy curr_index);
+
+  uint32_t WrapIntoRing(RingIndexTy index);
+  bool CanWriteUpto(RingIndexTy upto_index);
 
   /// @brief Build fence command
   void BuildFenceCommand(char* fence_command_addr, uint32_t* fence,
@@ -176,9 +188,6 @@ class BlitSdma : public core::Blit {
   // Agent object owning the SDMA engine.
   GpuAgent* agent_;
 
-  /// Indicates size of Queue buffer in bytes.
-  uint32_t queue_size_;
-
   /// Base address of the Queue buffer at construction time.
   char* queue_start_addr_;
 
@@ -191,20 +200,9 @@ class BlitSdma : public core::Blit {
   /// and write indices
   HsaQueueResource queue_resource_;
 
-  /// @brief Current address of execution in Queue buffer.
-  ///
-  /// @note: The value of address is obtained by reading
-  /// the value of Write Register of the compute device.
-  /// Users should write to the Queue buffer at the current
-  /// address, else it will lead to execution error and potentially
-  /// a hang.
-  ///
-  /// @note: The value of Write Register does not always begin
-  /// with Zero after a Queue has been created. This needs to be
-  /// understood better. This means that current address number of
-  /// words of Queue buffer is unavailable for use.
-  volatile uint32_t cached_reserve_offset_;
-  volatile uint32_t cached_commit_offset_;
+  // Monotonic ring indices, in bytes, tracking written and submitted commands.
+  RingIndexTy cached_reserve_index_;
+  RingIndexTy cached_commit_index_;
 
   uint32_t linear_copy_command_size_;
 
@@ -235,6 +233,19 @@ class BlitSdma : public core::Blit {
   /// True if platform atomic is supported.
   bool platform_atomic_support_;
 };
+
+class BlitSdmaV2V3
+    // Ring indices are 32-bit.
+    // HW ring indices are not monotonic (wrap at end of ring).
+    // Count fields of SDMA commands are 0-based.
+    : public BlitSdma<uint32_t, false, 0> {};
+
+class BlitSdmaV4
+    // Ring indices are 64-bit.
+    // HW ring indices are monotonic (do not wrap at end of ring).
+    // Count fields of SDMA commands are 1-based.
+    : public BlitSdma<uint64_t, true, -1> {};
+
 }  // namespace amd
 
 #endif  // header guard
diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
index c397bfb4e2..7daf92ba89 100644
--- a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
@@ -46,14 +46,13 @@
 #include <atomic>
 #include <cmath>
 #include <cstring>
+#include <limits>
 
 #include "core/inc/amd_gpu_agent.h"
 #include "core/inc/amd_memory_region.h"
 #include "core/inc/runtime.h"
 #include "core/inc/signal.h"
 
-#define SDMA_QUEUE_SIZE 1024 * 1024
-
 namespace amd {
 // SDMA packet for VI device.
 // Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt
@@ -373,29 +372,33 @@ inline uint32_t ptrhigh32(const void* p) {
 #endif
 }
 
-const size_t BlitSdma::kQueueSize = SDMA_QUEUE_SIZE;
-const size_t BlitSdma::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR);
+const size_t BlitSdmaBase::kQueueSize = 1024 * 1024;
+const size_t BlitSdmaBase::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR);
+const size_t BlitSdmaBase::kMaxSingleCopySize = 0x3fffe0;  // From HW documentation
+const size_t BlitSdmaBase::kMaxSingleFillSize = 0x3fffe0;
 
-BlitSdma::BlitSdma()
-    : core::Blit(),
-      agent_(NULL),
-      queue_size_(0),
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BlitSdma()
+    : agent_(NULL),
       queue_start_addr_(NULL),
       fence_base_addr_(NULL),
       fence_pool_size_(0),
       fence_pool_counter_(0),
-      cached_reserve_offset_(0),
-      cached_commit_offset_(0),
+      cached_reserve_index_(0),
+      cached_commit_index_(0),
       platform_atomic_support_(true) {
   std::memset(&queue_resource_, 0, sizeof(queue_resource_));
 }
 
-BlitSdma::~BlitSdma() {}
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::~BlitSdma() {}
 
-hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initialize(
+    const core::Agent& agent) {
   agent_ = reinterpret_cast<amd::GpuAgent*>(&const_cast<core::Agent&>(agent));
 
-  if (queue_start_addr_ != NULL && queue_size_ != 0) {
+  if (queue_start_addr_ != NULL) {
     // Already initialized.
     return HSA_STATUS_SUCCESS;
   }
@@ -412,26 +415,6 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
   timestamp_command_size_ = sizeof(SDMA_PKT_TIMESTAMP);
   trap_command_size_ = sizeof(SDMA_PKT_TRAP);
 
-  const uint32_t sync_command_size = fence_command_size_;
-  const uint32_t max_num_copy_command =
-      std::floor((static_cast<uint32_t>(queue_size_) - sync_command_size) /
-                 linear_copy_command_size_);
-  const uint32_t max_num_fill_command =
-      std::floor((static_cast<uint32_t>(queue_size_) - sync_command_size) /
-                 fill_command_size_);
-
-  max_single_linear_copy_size_ = 0x3fffe0;
-  max_total_linear_copy_size_ = static_cast<size_t>(
-      std::min(static_cast<uint64_t>(SIZE_MAX),
-               static_cast<uint64_t>(max_num_copy_command) *
-                   static_cast<uint64_t>(max_single_linear_copy_size_)));
-
-  max_single_fill_size_ = (1 << 22) - sizeof(uint32_t);
-  max_total_fill_size_ = static_cast<size_t>(
-      std::min(static_cast<uint64_t>(SIZE_MAX),
-               static_cast<uint64_t>(max_num_fill_command) *
-                   static_cast<uint64_t>(max_single_fill_size_)));
-
   const amd::GpuAgentInt& amd_gpu_agent =
       static_cast<const amd::GpuAgentInt&>(agent);
 
@@ -445,35 +428,31 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
   }
 
   // Allocate queue buffer.
-  queue_size_ = kQueueSize;
-
-  queue_start_addr_ =
-      (char*)core::Runtime::runtime_singleton_->system_allocator()(
-          queue_size_, 0x1000, core::MemoryRegion::AllocateExecutable);
+  queue_start_addr_ = (char*)core::Runtime::runtime_singleton_->system_allocator()(
+      kQueueSize, 0x1000, core::MemoryRegion::AllocateExecutable);
 
   if (queue_start_addr_ == NULL) {
     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
   }
 
-  std::memset(queue_start_addr_, 0, queue_size_);
+  std::memset(queue_start_addr_, 0, kQueueSize);
 
   // Access kernel driver to initialize the queue control block
   // This call binds user mode queue object to underlying compute
   // device.
   const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA;
-  if (HSAKMT_STATUS_SUCCESS !=
-      hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100,
-                        HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
-                        queue_size_, NULL, &queue_resource_)) {
+  if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100,
+                                                 HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
+                                                 kQueueSize, NULL, &queue_resource_)) {
     Destroy(agent);
     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
   }
 
-  cached_reserve_offset_ = *(queue_resource_.Queue_write_ptr);
-  cached_commit_offset_ = cached_reserve_offset_;
+  cached_reserve_index_ = *reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_write_ptr);
+  cached_commit_index_ = cached_reserve_index_;
 
-  fence_pool_size_ = static_cast<uint32_t>(
-      (kQueueSize + fence_command_size_ - 1) / fence_command_size_);
+  fence_pool_size_ =
+      static_cast<uint32_t>((kQueueSize + fence_command_size_ - 1) / fence_command_size_);
 
   fence_pool_mask_ = fence_pool_size_ - 1;
 
@@ -490,7 +469,9 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Destroy(
+    const core::Agent& agent) {
   // Release all allocated resources and reset them to zero.
 
   if (queue_resource_.QueueId != 0) {
@@ -500,7 +481,7 @@ hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
     memset(&queue_resource_, 0, sizeof(queue_resource_));
   }
 
-  if (queue_start_addr_ != NULL && queue_size_ != 0) {
+  if (queue_start_addr_ != NULL) {
     // Release queue buffer.
     core::Runtime::runtime_singleton_->system_deallocator()(queue_start_addr_);
   }
@@ -509,24 +490,19 @@ hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
     core::Runtime::runtime_singleton_->system_deallocator()(fence_base_addr_);
   }
 
-  queue_size_ = 0;
   queue_start_addr_ = NULL;
-  cached_reserve_offset_ = 0;
-  cached_commit_offset_ = 0;
+  cached_reserve_index_ = 0;
+  cached_commit_index_ = 0;
 
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src,
-                                               size_t size) {
-  if (size > max_total_linear_copy_size_) {
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-  }
-
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
+    void* dst, const void* src, size_t size) {
   // Break the copy into multiple copy operation incase the copy size exceeds
   // the SDMA linear copy limit.
-  const uint32_t num_copy_command =
-      (size + max_single_linear_copy_size_ - 1) / max_single_linear_copy_size_;
+  const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
 
   const uint32_t total_copy_command_size =
       num_copy_command * linear_copy_command_size_;
@@ -538,8 +514,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src,
   uint32_t* fence_addr = ObtainFenceObject();
   *fence_addr = 0;
 
-  char* command_addr = AcquireWriteAddress(total_command_size);
-  char* const command_addr_temp = command_addr;
+  RingIndexTy curr_index;
+  char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
 
   if (command_addr == NULL) {
     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
@@ -551,20 +527,17 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src,
 
   BuildFenceCommand(command_addr, fence_addr, kFenceValue);
 
-  ReleaseWriteAddress(command_addr_temp, total_command_size);
+  ReleaseWriteAddress(curr_index, total_command_size);
 
   WaitFence(fence_addr, kFenceValue);
 
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t BlitSdma::SubmitLinearCopyCommand(
-    void* dst, const void* src, size_t size,
-    std::vector<core::Signal*>& dep_signals, core::Signal& out_signal) {
-  if (size > max_total_linear_copy_size_) {
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-  }
-
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
+    void* dst, const void* src, size_t size, std::vector<core::Signal*>& dep_signals,
+    core::Signal& out_signal) {
   // The signal is 64 bit value, and poll checks for 32 bit value. So we
   // need to use two poll operations per dependent signal.
   const uint32_t num_poll_command =
@@ -574,8 +547,7 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
 
   // Break the copy into multiple copy operation incase the copy size exceeds
   // the SDMA linear copy limit.
-  const uint32_t num_copy_command =
-      (size + max_single_linear_copy_size_ - 1) / max_single_linear_copy_size_;
+  const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
   const uint32_t total_copy_command_size =
       num_copy_command * linear_copy_command_size_;
 
@@ -624,8 +596,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
       total_poll_command_size + total_copy_command_size + sync_command_size +
       total_timestamp_command_size + interrupt_command_size;
 
-  char* command_addr = AcquireWriteAddress(total_command_size);
-  char* const command_addr_temp = command_addr;
+  RingIndexTy curr_index;
+  char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
 
   if (command_addr == NULL) {
     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
@@ -695,23 +667,19 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand(
     BuildTrapCommand(command_addr);
   }
 
-  ReleaseWriteAddress(command_addr_temp, total_command_size);
+  ReleaseWriteAddress(curr_index, total_command_size);
 
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
-                                               size_t count) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearFillCommand(
+    void* ptr, uint32_t value, size_t count) {
   const size_t size = count * sizeof(uint32_t);
 
-  if (size > max_total_fill_size_) {
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-  }
-
   // Break the copy into multiple copy operation incase the copy size exceeds
   // the SDMA linear copy limit.
-  const uint32_t num_fill_command =
-      (size + max_single_fill_size_ - 1) / max_single_fill_size_;
+  const uint32_t num_fill_command = (size + kMaxSingleFillSize - 1) / kMaxSingleFillSize;
 
   const uint32_t total_fill_command_size =
       num_fill_command * fill_command_size_;
@@ -719,8 +687,8 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
   const uint32_t total_command_size =
       total_fill_command_size + fence_command_size_;
 
-  char* command_addr = AcquireWriteAddress(total_command_size);
-  char* const command_addr_temp = command_addr;
+  RingIndexTy curr_index;
+  char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
 
   if (command_addr == NULL) {
     return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
@@ -729,8 +697,8 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
   const uint32_t fill_command_size = fill_command_size_;
   size_t cur_size = 0;
   for (uint32_t i = 0; i < num_fill_command; ++i) {
-    const uint32_t fill_size = static_cast<uint32_t>(
-        std::min((size - cur_size), max_single_fill_size_));
+    const uint32_t fill_size =
+        static_cast<uint32_t>(std::min((size - cur_size), kMaxSingleFillSize));
 
     void* cur_ptr = static_cast<char*>(ptr) + cur_size;
 
@@ -747,7 +715,7 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
 
     packet_addr->DATA_UNION.src_data_31_0 = value;
 
-    packet_addr->COUNT_UNION.count = fill_size;
+    packet_addr->COUNT_UNION.count = fill_size + SizeToCountOffset;
 
     command_addr += fill_command_size;
     cur_size += fill_size;
@@ -761,139 +729,160 @@ hsa_status_t BlitSdma::SubmitLinearFillCommand(void* ptr, uint32_t value,
 
   BuildFenceCommand(command_addr, fence_addr, kFenceValue);
 
-  ReleaseWriteAddress(command_addr_temp, total_command_size);
+  ReleaseWriteAddress(curr_index, total_command_size);
 
   WaitFence(fence_addr, kFenceValue);
 
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t BlitSdma::EnableProfiling(bool enable) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::EnableProfiling(
+    bool enable) {
   return HSA_STATUS_SUCCESS;
 }
 
-char* BlitSdma::AcquireWriteAddress(uint32_t cmd_size) {
-  if (cmd_size > queue_size_) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+char* BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::AcquireWriteAddress(
+    uint32_t cmd_size, RingIndexTy& curr_index) {
+  // Ring is full when all but one byte is written.
+  if (cmd_size >= kQueueSize) {
     return NULL;
   }
 
   while (true) {
-    const uint32_t curr_offset =
-        atomic::Load(&cached_reserve_offset_, std::memory_order_acquire);
-    const uint32_t end_offset = curr_offset + cmd_size;
+    curr_index = atomic::Load(&cached_reserve_index_, std::memory_order_acquire);
 
-    if (end_offset >= queue_size_) {
-      // Queue buffer is not enough to contain the new command.
-      WrapQueue(cmd_size);
+    // Check whether a linear region of the requested size is available.
+    // If == cmd_size: region is at beginning of ring.
+    // If < cmd_size: region intersects end of ring, pad with no-ops and retry.
+    if (WrapIntoRing(curr_index + cmd_size) < cmd_size) {
+      PadRingToEnd(curr_index);
       continue;
     }
 
-    const uint32_t curr_read_ptr_val =
-        atomic::Load(queue_resource_.Queue_read_ptr, std::memory_order_acquire);
-    if (curr_offset < curr_read_ptr_val && end_offset > curr_read_ptr_val) {
-      // Queue is wrapping and there is not enough space to recycle.
+    // Check whether the engine has finished using this region.
+    const RingIndexTy new_index = curr_index + cmd_size;
+
+    if (CanWriteUpto(new_index) == false) {
+      // Wait for read index to move and try again.
+      os::YieldThread();
       continue;
     }
 
-    if (atomic::Cas(&cached_reserve_offset_, end_offset, curr_offset,
-                    std::memory_order_release) == curr_offset) {
-      return queue_start_addr_ + curr_offset;
+    // Try to reserve this part of the ring.
+    if (atomic::Cas(&cached_reserve_index_, new_index, curr_index, std::memory_order_release) ==
+        curr_index) {
+      return queue_start_addr_ + WrapIntoRing(curr_index);
     }
+
+    // Another thread reserved curr_index, try again.
+    os::YieldThread();
   }
 
   return NULL;
 }
 
-void BlitSdma::UpdateWriteAndDoorbellRegister(uint32_t current_offset,
-                                              uint32_t new_offset) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::UpdateWriteAndDoorbellRegister(
+    RingIndexTy curr_index, RingIndexTy new_index) {
   while (true) {
-    // Make sure that the address before ::current_offset is already released.
+    // Make sure that the address before ::curr_index is already released.
     // Otherwise the CP may read invalid packets.
-    if (atomic::Load(&cached_commit_offset_, std::memory_order_acquire) ==
-        current_offset) {
+    if (atomic::Load(&cached_commit_index_, std::memory_order_acquire) == curr_index) {
       if (core::Runtime::runtime_singleton_->flag().sdma_wait_idle()) {
         // TODO: remove when sdma wpointer issue is resolved.
         // Wait until the SDMA engine finish processing all packets before
         // updating the wptr and doorbell.
-        while (atomic::Load(queue_resource_.Queue_read_ptr,
-                            std::memory_order_acquire) != current_offset) {
+        while (WrapIntoRing(*reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_read_ptr)) !=
+               WrapIntoRing(curr_index)) {
           os::YieldThread();
         }
       }
 
       // Update write pointer and doorbel register.
-      atomic::Store(queue_resource_.Queue_write_ptr, new_offset);
+      *reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_write_ptr) =
+          (HwIndexMonotonic ? new_index : WrapIntoRing(new_index));
 
+      // Ensure write pointer is visible to GPU before doorbell.
       std::atomic_thread_fence(std::memory_order_release);
 
-      atomic::Store(queue_resource_.Queue_DoorBell, new_offset);
+      *reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_DoorBell) =
+          (HwIndexMonotonic ? new_index : WrapIntoRing(new_index));
 
-      std::atomic_thread_fence(std::memory_order_release);
-
-      atomic::Store(&cached_commit_offset_, new_offset);
+      atomic::Store(&cached_commit_index_, new_index, std::memory_order_release);
       break;
     }
+
+    // Waiting for another thread to submit preceding commands first.
+    os::YieldThread();
   }
 }
 
-void BlitSdma::ReleaseWriteAddress(char* cmd_addr, uint32_t cmd_size) {
-  assert(cmd_addr != NULL);
-  assert(cmd_addr >= queue_start_addr_);
-
-  if (cmd_size > queue_size_) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::ReleaseWriteAddress(
+    RingIndexTy curr_index, uint32_t cmd_size) {
+  if (cmd_size > kQueueSize) {
     assert(false && "cmd_addr is outside the queue buffer range");
     return;
   }
 
-  // Update write register.
-  const uint32_t curent_offset = cmd_addr - queue_start_addr_;
-  const uint32_t new_offset = curent_offset + cmd_size;
-  UpdateWriteAndDoorbellRegister(curent_offset, new_offset);
+  UpdateWriteAndDoorbellRegister(curr_index, curr_index + cmd_size);
 }
 
-void BlitSdma::WrapQueue(uint32_t cmd_size) {
-  // Re-determine the offset into queue buffer where NOOP instructions
-  // should be written.
-  while (true) {
-    const uint32_t full_offset = queue_size_ + 1;
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::PadRingToEnd(
+    RingIndexTy curr_index) {
+  // Reserve region from here to the end of the ring.
+  RingIndexTy new_index = curr_index + (kQueueSize - WrapIntoRing(curr_index));
 
-    uint32_t curent_offset =
-        atomic::Load(&cached_reserve_offset_, std::memory_order_acquire);
-    const uint32_t end_offset = curent_offset + cmd_size;
-    if (end_offset < queue_size_) {
-      return;
-    }
+  // Check whether the engine has finished using this region.
+  if (CanWriteUpto(new_index) == false) {
+    // Wait for read index to move and try again.
+    return;
+  }
 
-    if (curent_offset == full_offset) {
-      // Another thread is already wrapping the queue.
-      continue;
-    }
+  if (atomic::Cas(&cached_reserve_index_, new_index, curr_index, std::memory_order_release) ==
+      curr_index) {
+    // Write and submit NOP commands in reserved region.
+    char* nop_address = queue_start_addr_ + WrapIntoRing(curr_index);
+    memset(nop_address, 0, new_index - curr_index);
 
-    // Close reservation to queue temporarily by "making" it full.
-    if (atomic::Cas(&cached_reserve_offset_, full_offset, curent_offset,
-                    std::memory_order_release) == curent_offset) {
-      // Wait till all reserved packets are commited.
-      while (atomic::Load(&cached_commit_offset_, std::memory_order_acquire) !=
-             curent_offset) {
-        os::YieldThread();
-      }
-
-      // Fill the remainder of the queue with NOOP commands.
-      char* noop_address = queue_start_addr_ + curent_offset;
-      const size_t noop_commands_size = queue_size_ - curent_offset;
-      memset(noop_address, 0, noop_commands_size);
-
-      // Update write and doorbell registers to execute NOOP instructions.
-      UpdateWriteAndDoorbellRegister(curent_offset, 0);
-
-      // Open access to queue.
-      atomic::Store(&cached_reserve_offset_, 0U, std::memory_order_release);
-    }
+    UpdateWriteAndDoorbellRegister(curr_index, new_index);
   }
 }
 
-void BlitSdma::BuildFenceCommand(char* fence_command_addr, uint32_t* fence,
-                                 uint32_t fence_value) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+uint32_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::WrapIntoRing(
+    RingIndexTy index) {
+  return index & (kQueueSize - 1);
+}
+
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+bool BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::CanWriteUpto(
+    RingIndexTy upto_index) {
+  // Get/calculate the monotonic read index.
+  RingIndexTy hw_read_index = *reinterpret_cast<RingIndexTy*>(queue_resource_.Queue_read_ptr);
+  RingIndexTy read_index;
+
+  if (HwIndexMonotonic) {
+    read_index = hw_read_index;
+  } else {
+    // Calculate distance from commit index to HW read index.
+    // Commit index is always < kQueueSize away from HW read index.
+    RingIndexTy commit_index = atomic::Load(&cached_commit_index_, std::memory_order_relaxed);
+    RingIndexTy dist_to_read_index = WrapIntoRing(commit_index - hw_read_index);
+    read_index = commit_index - dist_to_read_index;
+  }
+
+  // Check whether the read pointer has passed the given index.
+  // At most we can submit (kQueueSize - 1) bytes at a time.
+  return (upto_index - read_index) < kQueueSize;
+}
+
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildFenceCommand(
+    char* fence_command_addr, uint32_t* fence, uint32_t fence_value) {
   assert(fence_command_addr != NULL);
   SDMA_PKT_FENCE* packet_addr =
       reinterpret_cast<SDMA_PKT_FENCE*>(fence_command_addr);
@@ -909,7 +898,8 @@ void BlitSdma::BuildFenceCommand(char* fence_command_addr, uint32_t* fence,
   packet_addr->DATA_UNION.data = fence_value;
 }
 
-uint32_t* BlitSdma::ObtainFenceObject() {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+uint32_t* BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::ObtainFenceObject() {
   const uint32_t fence_index =
       atomic::Add(&fence_pool_counter_, 1U, std::memory_order_acquire);
   uint32_t* fence_addr = &fence_base_addr_[fence_index & fence_pool_mask_];
@@ -917,7 +907,9 @@ uint32_t* BlitSdma::ObtainFenceObject() {
   return fence_addr;
 }
 
-void BlitSdma::WaitFence(uint32_t* fence, uint32_t fence_value) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::WaitFence(uint32_t* fence,
+                                                                           uint32_t fence_value) {
   int spin_count = 51;
   while (atomic::Load(fence, std::memory_order_acquire) != fence_value) {
     if (--spin_count > 0) {
@@ -927,12 +919,13 @@ void BlitSdma::WaitFence(uint32_t* fence, uint32_t fence_value) {
   }
 }
 
-void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command,
-                                void* dst, const void* src, size_t size) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildCopyCommand(
+    char* cmd_addr, uint32_t num_copy_command, void* dst, const void* src, size_t size) {
   size_t cur_size = 0;
   for (uint32_t i = 0; i < num_copy_command; ++i) {
-    const uint32_t copy_size = static_cast<uint32_t>(
-        std::min((size - cur_size), max_single_linear_copy_size_));
+    const uint32_t copy_size =
+        static_cast<uint32_t>(std::min((size - cur_size), kMaxSingleCopySize));
 
     void* cur_dst = static_cast<char*>(dst) + cur_size;
     const void* cur_src = static_cast<const char*>(src) + cur_size;
@@ -945,7 +938,7 @@ void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command,
     packet_addr->HEADER_UNION.op = SDMA_OP_COPY;
     packet_addr->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR;
 
-    packet_addr->COUNT_UNION.count = copy_size;
+    packet_addr->COUNT_UNION.count = copy_size + SizeToCountOffset;
 
     packet_addr->SRC_ADDR_LO_UNION.src_addr_31_0 = ptrlow32(cur_src);
     packet_addr->SRC_ADDR_HI_UNION.src_addr_63_32 = ptrhigh32(cur_src);
@@ -960,8 +953,9 @@ void BlitSdma::BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command,
   assert(cur_size == size);
 }
 
-void BlitSdma::BuildPollCommand(char* cmd_addr, void* addr,
-                                uint32_t reference) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildPollCommand(
+    char* cmd_addr, void* addr, uint32_t reference) {
   SDMA_PKT_POLL_REGMEM* packet_addr =
       reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(cmd_addr);
 
@@ -981,7 +975,9 @@ void BlitSdma::BuildPollCommand(char* cmd_addr, void* addr,
   packet_addr->DW5_UNION.retry_count = 0xfff;  // Retry forever.
 }
 
-void BlitSdma::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildAtomicDecrementCommand(
+    char* cmd_addr, void* addr) {
   SDMA_PKT_ATOMIC* packet_addr = reinterpret_cast<SDMA_PKT_ATOMIC*>(cmd_addr);
 
   memset(packet_addr, 0, sizeof(SDMA_PKT_ATOMIC));
@@ -996,8 +992,9 @@ void BlitSdma::BuildAtomicDecrementCommand(char* cmd_addr, void* addr) {
   packet_addr->SRC_DATA_HI_UNION.src_data_63_32 = 0xffffffff;
 }
 
-void BlitSdma::BuildGetGlobalTimestampCommand(char* cmd_addr,
-                                              void* write_address) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildGetGlobalTimestampCommand(
+    char* cmd_addr, void* write_address) {
   SDMA_PKT_TIMESTAMP* packet_addr =
       reinterpret_cast<SDMA_PKT_TIMESTAMP*>(cmd_addr);
 
@@ -1010,7 +1007,8 @@ void BlitSdma::BuildGetGlobalTimestampCommand(char* cmd_addr,
   packet_addr->ADDR_HI_UNION.addr_63_32 = ptrhigh32(write_address);
 }
 
-void BlitSdma::BuildTrapCommand(char* cmd_addr) {
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildTrapCommand(char* cmd_addr) {
   SDMA_PKT_TRAP* packet_addr =
       reinterpret_cast<SDMA_PKT_TRAP*>(cmd_addr);
 
@@ -1018,4 +1016,8 @@ void BlitSdma::BuildTrapCommand(char* cmd_addr) {
 
   packet_addr->HEADER_UNION.op = SDMA_OP_TRAP;
 }
+
+template class BlitSdma<uint32_t, false, 0>;
+template class BlitSdma<uint64_t, true, -1>;
+
 }  // namespace amd
diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 7c98e63372..140e37c989 100755
--- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -389,9 +389,9 @@ bool GpuAgent::InitEndTsPool() {
     return true;
   }
 
-  end_ts_pool_size_ = static_cast<uint32_t>(
-      (BlitSdma::kQueueSize + BlitSdma::kCopyPacketSize - 1) /
-      (BlitSdma::kCopyPacketSize));
+  end_ts_pool_size_ =
+      static_cast<uint32_t>((BlitSdmaBase::kQueueSize + BlitSdmaBase::kCopyPacketSize - 1) /
+                            (BlitSdmaBase::kCopyPacketSize));
 
   // Allocate end timestamp object for both h2d and d2h DMA.
   const size_t alloc_size = 2 * end_ts_pool_size_ * kTsSize;
@@ -510,7 +510,13 @@ core::Queue* GpuAgent::CreateInterceptibleQueue() {
 }
 
 core::Blit* GpuAgent::CreateBlitSdma() {
-  BlitSdma* sdma = new BlitSdma();
+  core::Blit* sdma;
+
+  if (isa_->GetMajorVersion() <= 8) {
+    sdma = new BlitSdmaV2V3;
+  } else {
+    sdma = new BlitSdmaV4;
+  }
 
   if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) {
     sdma->Destroy(*this);