clr: Implement per-stream SDMA engine affinity for improved copy performance (#2480)

Problem: The existing SDMA engine selection logic had several issues:
1. The same VirtualGPU/stream could use different SDMA engines for consecutive async copies, since copy_engine_status may report engines as busy
2. The busy and preferred engine queries ran on every copy
3. There was no global tracking of which VirtualGPU uses which engine, leading to suboptimal resource allocation

Solution: Implemented a global SDMA engine allocator with per-stream affinity:
- Added Device::SdmaEngineAllocator to manage VirtualGPU → engine assignments
  * Maintains a global map of active assignments
  * Enforces exclusivity: different streams use different engines (except inter-GPU copies, where preferred engines are prioritized for optimal hardware paths like XGMI links)
  * Thread-safe allocation/release with a Monitor lock
- Modified VirtualGPU to cache the assigned engine locally (assigned_sdma_engine_) for fast lookup without map access on the hot path
- Refactored rocrCopyBuffer() to:
  1. Check the locally cached engine first → use it if assigned
  2. Call AllocateSdmaEngine() if not assigned → cache the result
- Moved HSA API queries (memory_copy_engine_status, memory_get_preferred_copy_engine) into AllocateEngine() for cleaner separation of concerns
- Engine release now happens on HostQueue::finish() instead of only on VirtualGPU destruction
  * Improves engine utilization by releasing earlier
  * Added virtual ReleaseSdmaEngines() method to device::VirtualDevice
- Added a future path for simple round-robin allocation (kUseSimpleRR) for next-gen GPUs with uniform SDMA bandwidth (disabled by default)

Cleanup:
- Removed selectSdmaEngine() helper (logic moved to allocator)
- Removed getSdmaRWMasks() (allocator accesses maxSdmaReadMask_/WriteMask_ directly)
- Removed the now-unused sdmaEngineReadMask_/WriteMask_ member variables from DmaBlitManager

Benefits:
- Ensures consistent per-stream SDMA engine usage
- Prevents cross-stream contention and engine thrashing
- Prioritizes hardware-optimal paths for inter-GPU transfers
- Better resource utilization through earlier release
- Cleaner, more maintainable code structure
2026-01-07 19:37:45 -08:00
@@ -1305,6 +1305,7 @@ class VirtualDevice : public amd::ReferenceCountedObject {
  virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); }

  virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
+  virtual void ReleaseSdmaEngines() {}  //!< Drop SDMA engine assignments; no-op unless overridden (ROCm)
  virtual void ReleaseAllHwQueues() {}
  virtual void ReleaseHwQueue() {}

@@ -35,7 +35,6 @@ DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
      StagingXferSize(dev().settings().stagedXferSize_),
      completeOperation_(false),
      context_(nullptr) {
-  dev().getSdmaRWMasks(&sdmaEngineReadMask_, &sdmaEngineWriteMask_);
 }

 inline void DmaBlitManager::synchronize() const {
@@ -470,31 +469,13 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem
  return result;
 }

-// Select an SDMA engine using priority-based scheduling
-// Prefers engines in preferredMask (high-bandwidth engines), otherwise any free engine
-static inline uint32_t selectSdmaEngine(uint32_t freeMask, uint32_t preferredMask) {
-  if (freeMask == 0) return 0;
-
-  // Try preferred engines first (high-bandwidth engines)
-  uint32_t preferredFree = freeMask & preferredMask;
-  if (preferredFree != 0) {
-    return preferredFree & (~preferredFree + 1);  // Extract lowest preferred engine
-  }
-
-  // Fall back to non-preferred engines (slower engines)
-  return freeMask & (~freeMask + 1);  // Extract lowest available engine
-}
-
 // ================================================================================================
 inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, const_address src,
                                           hsa_agent_t& srcAgent, size_t size,
                                           amd::CopyMetadata& copyMetadata) const {
  hsa_status_t status = HSA_STATUS_SUCCESS;
-
  uint32_t copyMask = 0;
-  uint32_t freeEngineMask = 0;
-  uint32_t recIdMask = 0;
-  bool kUseRegularCopyApi = 0;
+  bool kUseRegularCopyApi = false;
  constexpr size_t kRetainCountThreshold = 8;
  bool forceSDMA =
      (copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::SDMA);
@@ -523,33 +504,34 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
  hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());

  if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
-    // Get the mask of valid engines for this operation (read or write)
-    uint32_t validEngineMask =
-        (engine == HwQueueEngine::SdmaRead ? sdmaEngineReadMask_ : sdmaEngineWriteMask_);
+    // Check if this VirtualGPU already has an assigned engine with affinity
+    uint32_t assignedEngineMask = gpu().AssignedSdmaEngine();

-    // Check SDMA engine status to get currently free engines
-    status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
-
-    if (status == HSA_STATUS_SUCCESS) {
-      status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &recIdMask);
-    }
-
-    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
-            "Query copy engine status %x, srcAgent %p, "
-            "dstAgent %p, free_engine_mask 0x%x, rec_engine_mask 0x%x",
-            status, srcAgent.handle, dstAgent.handle, freeEngineMask, recIdMask);
-
-    // Constrain to valid engines for this operation
-    freeEngineMask &= validEngineMask;
-    recIdMask &= validEngineMask;
-
-    if (freeEngineMask != 0) {
-      // Use priority-based scheduling: prefer high-bandwidth engines (recIdMask)
-      copyMask = selectSdmaEngine(freeEngineMask, recIdMask);
+    if (assignedEngineMask != 0) {
+      // This VirtualGPU/stream already has an assigned engine - just use it
+      // Stream ordering handles any busy conditions naturally
+      copyMask = assignedEngineMask;

      ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
-              "Selected SDMA engine: free_mask=0x%x, preferred_mask=0x%x, selected_mask=0x%x",
-              freeEngineMask, recIdMask, copyMask);
+              "Using assigned SDMA engine for VirtualGPU %p: mask=0x%x, engine_type=%d",
+              &gpu(), copyMask, engine);
+    } else {
+      // No assigned engine yet - allocate one using device-level allocator
+      copyMask = dev().AllocateSdmaEngine(&gpu(), engine, dstAgent, srcAgent);
+
+      if (copyMask != 0) {
+        // Store the assigned engine in the VirtualGPU for future use
+        gpu().SetAssignedSdmaEngine(copyMask);
+
+        ClPrint(amd::LOG_INFO, amd::LOG_COPY,
+                "Allocated new SDMA engine for VirtualGPU %p: mask=0x%x, engine_type=%d",
+                &gpu(), copyMask, engine);
+      } else {
+        ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
+                "Failed to allocate SDMA engine for VirtualGPU %p, falling back to regular copy",
+                &gpu());
+        kUseRegularCopyApi = true;
+      }
    }

    if (copyMask != 0 && status == HSA_STATUS_SUCCESS) {
@@ -259,8 +259,6 @@ class DmaBlitManager : public device::HostBlitManager {

  bool completeOperation_;        //!< DMA blit manager must complete operation
  amd::Context* context_;         //!< A dummy context
-  uint32_t sdmaEngineReadMask_;   //!< SDMA Engine Read Mask
-  uint32_t sdmaEngineWriteMask_;  //!< SDMA Engine Write Mask

 private:
  //! Disable copy constructor
@@ -146,6 +146,7 @@ Device::Device(hsa_agent_t bkendDevice)
      preferred_numa_node_(0),
      maxSdmaReadMask_(0),
      maxSdmaWriteMask_(0),
+      sdma_engine_allocator_(*this),
      cpu_agent_info_(nullptr) {
  group_segment_.handle = 0;
  gpuvm_segment_.handle = 0;
@@ -3509,9 +3510,143 @@ void Device::HiddenHeapInit(const VirtualGPU& gpu) {
 }

 // ================================================================================================
-void Device::getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const {
-  *readMask = maxSdmaReadMask_;
-  *writeMask = maxSdmaWriteMask_;
+uint32_t Device::SdmaEngineAllocator::AllocateEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
+                                                      hsa_agent_t dstAgent, hsa_agent_t srcAgent) {
+  amd::ScopedLock lock(lock_);
+  // Picks one SDMA engine bit for vgpu, records it in vgpu_to_engine_ and returns it (0 = none)
+  // Valid engine mask: the read mask for SdmaRead, the write mask for every other engine type
+  uint32_t validEngineMask = (engine_type == HwQueueEngine::SdmaRead)
+                              ? device_.maxSdmaReadMask_
+                              : device_.maxSdmaWriteMask_;
+
+  // Simple round-robin path, intended for devices where all engines have equal bandwidth
+  // Compile-time disabled - the preferred-engine logic further down is used for current GPUs
+  constexpr bool kUseSimpleRR = false;
+
+  if (kUseSimpleRR) {
+    // Simple round-robin: cycle through the valid engines without querying HSA
+    // This will be enabled for future GPUs where engine selection doesn't matter
+    if (validEngineMask == 0) {
+      ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
+              "No valid SDMA engines for VirtualGPU %p", vgpu);
+      return 0;
+    }
+
+    // Advance the shared counter; its previous value (mod 32) is where the scan starts
+    uint32_t start_bit = next_rr_engine_.fetch_add(1, std::memory_order_relaxed);
+    uint32_t selected_mask = 0;
+
+    // Scan all 32 bit positions; validEngineMask != 0 (checked above) guarantees a hit
+    for (uint32_t i = 0; i < 32; ++i) {
+      uint32_t bit = (start_bit + i) % 32;
+      uint32_t mask = 1u << bit;
+      if (validEngineMask & mask) {
+        selected_mask = mask;
+        break;
+      }
+    }
+
+    vgpu_to_engine_[vgpu] = selected_mask;
+
+    ClPrint(amd::LOG_INFO, amd::LOG_COPY,
+            "Assigned SDMA engine (simple RR) to VirtualGPU %p: mask=0x%x, engine_type=%d",
+            vgpu, selected_mask, engine_type);
+
+    return selected_mask;
+  }
+
+  // Current path: ask HSA which engines are free and which are preferred for this agent pair
+  uint32_t freeEngineMask = 0;
+  uint32_t preferredMask = 0;
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+
+  // Query current engine status (free engines between dstAgent and srcAgent)
+  status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
+  if (status == HSA_STATUS_SUCCESS) {
+    // Query preferred (high-bandwidth) engines; skipped when the status query failed
+    status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &preferredMask);
+  }
+
+  // NOTE(review): freeEngineMask and status are only logged below; selection ignores both
+  freeEngineMask &= validEngineMask;
+  preferredMask &= validEngineMask;
+
+  ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+          "Engine query for VirtualGPU %p: status=%x, free_mask=0x%x, preferred_mask=0x%x, "
+          "valid_mask=0x%x, engine_type=%d",
+          vgpu, status, freeEngineMask, preferredMask, validEngineMask, engine_type);
+
+  uint32_t candidate_mask = 0;
+  uint32_t allocated_mask = 0;
+
+  // Inter-GPU copies take the recommended engines whenever HSA reported any
+  bool is_inter_gpu = (engine_type == HwQueueEngine::SdmaInter);
+
+  if (is_inter_gpu && (preferredMask != 0)) {
+    // Inter-GPU: candidates are the preferred engines; existing assignments are not consulted
+    candidate_mask = validEngineMask & preferredMask;
+
+    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+            "Inter-GPU copy for VirtualGPU %p: prioritizing preferred engines, "
+            "candidate_mask=0x%x",
+            vgpu, candidate_mask);
+  } else {
+    // Regular read/write/intra (or inter-GPU with no preferred engine): enforce exclusivity
+    // Build a mask of every engine recorded in the map (all entries, none are skipped)
+    for (const auto& pair : vgpu_to_engine_) {
+      allocated_mask |= pair.second;
+    }
+
+    uint32_t available_mask = validEngineMask & ~allocated_mask;
+
+    if (available_mask == 0) {
+      ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
+              "No unallocated SDMA engines available for VirtualGPU %p, engine_type=%d "
+              "(valid_mask=0x%x, allocated_mask=0x%x)",
+              vgpu, engine_type, validEngineMask, allocated_mask);
+      return 0;
+    }
+
+    // Prefer recommended engines among the unallocated ones; otherwise take any unallocated
+    candidate_mask = available_mask & preferredMask;
+    if (candidate_mask == 0) {
+      candidate_mask = available_mask;
+    }
+  }
+
+  if (candidate_mask == 0) {
+    ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
+            "No candidate SDMA engines for VirtualGPU %p, engine_type=%d",
+            vgpu, engine_type);
+    return 0;
+  }
+
+  // Isolate the lowest set bit (x & -x), i.e. the lowest-numbered candidate engine
+  uint32_t selected_mask = candidate_mask & (~candidate_mask + 1);
+
+  // Record the assignment so later exclusive allocations see this engine as taken
+  vgpu_to_engine_[vgpu] = selected_mask;
+
+  ClPrint(amd::LOG_INFO, amd::LOG_COPY,
+          "Assigned SDMA engine to VirtualGPU %p: mask=0x%x, engine_type=%d, "
+          "valid_mask=0x%x, preferred_mask=0x%x, allocated_mask=0x%x, is_inter_gpu=%d",
+          vgpu, selected_mask, engine_type, validEngineMask, preferredMask,
+          allocated_mask, is_inter_gpu);
+
+  return selected_mask;
+}
+
+// ================================================================================================
+void Device::SdmaEngineAllocator::ReleaseEngine(VirtualGPU* vgpu) {
+  // Removes the map entry for vgpu, if one exists, while holding the allocator lock
+  amd::ScopedLock guard(lock_);
+  const auto entry = vgpu_to_engine_.find(vgpu);
+  if (entry == vgpu_to_engine_.end()) {
+    return;  // Nothing recorded for this VirtualGPU
+  }
+  ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+          "Released SDMA engine for VirtualGPU %p: mask=0x%x", vgpu, entry->second);
+  vgpu_to_engine_.erase(entry);
 }

 // ================================================================================================
@@ -602,9 +602,16 @@ class Device : public NullDevice {
  void HiddenHeapAlloc(const VirtualGPU& gpu);
  //! Init hidden heap for device memory allocations
  void HiddenHeapInit(const VirtualGPU& gpu);
-  void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
  bool isXgmi() const override { return isXgmi_; }

+  //! Allocates an SDMA engine mask for \p vgpu via the device-wide allocator (0 if none)
+  uint32_t AllocateSdmaEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
+                              hsa_agent_t dstAgent, hsa_agent_t srcAgent) const {
+    return sdma_engine_allocator_.AllocateEngine(vgpu, engine_type, dstAgent, srcAgent);
+  }
+  //! Forgets the SDMA engine recorded for \p vgpu in the device-wide allocator
+  //! (callable on a const Device because the allocator member is declared mutable)
+  void ReleaseSdmaEngine(VirtualGPU* vgpu) const { sdma_engine_allocator_.ReleaseEngine(vgpu); }
  //! Returns the map of code objects to kernels
  const auto& KernelMap() const { return kernel_map_; }
  //! Adds a kernel to the kernel map
@@ -706,6 +713,27 @@ class Device : public NullDevice {
  bool isXgmi_;  //!< Flag to indicate if there is XGMI between CPU<->GPU
  bool pm4_emulation_ = false;  //!< Flag to indicate if PM4 emulation is enabled

+  //! SDMA engine allocator for per-stream affinity: owns the VirtualGPU -> engine-mask table
+  struct SdmaEngineAllocator {
+    amd::Monitor lock_;  //!< Protects the allocation state
+    std::unordered_map<VirtualGPU*, uint32_t> vgpu_to_engine_;  //!< VirtualGPU -> engine mask
+    std::atomic<uint32_t> next_rr_engine_{0};  //!< Start-bit counter for the round-robin path
+    const Device& device_;  //!< Parent device; source of maxSdmaReadMask_/maxSdmaWriteMask_
+
+    //! Binds the allocator to its parent device; explicit so a Device never converts implicitly
+    explicit SdmaEngineAllocator(const Device& device) : lock_(true), device_(device) {}
+
+    //! Allocate an SDMA engine for a VirtualGPU and record it; returns the mask, 0 if none
+    //! Queries HSA for engine status and preferred engines, then allocates
+    //! For inter-GPU copies, strongly prefers recommended engines even if already allocated
+    uint32_t AllocateEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
+                           hsa_agent_t dstAgent, hsa_agent_t srcAgent);
+
+    //! Release engine allocation for a VirtualGPU (no effect if none is recorded)
+    void ReleaseEngine(VirtualGPU* vgpu);
+  };
+  mutable SdmaEngineAllocator sdma_engine_allocator_;
+
  //! Code object to kernel info map (used in the crash dump analysis)
  mutable std::map<uint64_t, Kernel&> kernel_map_;

@@ -1783,6 +1783,10 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,

 // ================================================================================================
 VirtualGPU::~VirtualGPU() {
+  // Release SDMA engine assignment for this VirtualGPU
+  dev().ReleaseSdmaEngine(this);
+  ClearAssignedSdmaEngine();
+
  delete blitMgr_;

  if (tracking_created_) {
@@ -1990,6 +1994,14 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
  }
 }

+// ================================================================================================
+void VirtualGPU::ReleaseSdmaEngines() {
+  // Drop this queue's entry from the device-wide allocator so the engine can be reassigned,
+  // then reset the locally cached mask so the next SDMA copy requests a fresh allocation
+  dev().ReleaseSdmaEngine(this);
+  ClearAssignedSdmaEngine();
+}
+
 // ================================================================================================
 void VirtualGPU::ReleaseAllHwQueues() {
  if (roc_device_.settings().dynamic_queues_ &&
@@ -390,6 +390,7 @@ class VirtualGPU : public device::VirtualDevice {
  virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {}

  virtual address allocKernelArguments(size_t size, size_t alignment) final;
+  virtual void ReleaseSdmaEngines() final;  //!< Release this queue's SDMA engine assignment
  virtual void ReleaseAllHwQueues() final;
  virtual void ReleaseHwQueue() final;

@@ -459,6 +460,17 @@ class VirtualGPU : public device::VirtualDevice {
  void AnalyzeAqlQueue() const;
  bool ForceIrq() const { return force_irq_; }

+  //! SDMA engine affinity management
+  //! Returns the SDMA engine mask cached for this VirtualGPU
+  //! (0 means no engine is currently assigned)
+  uint32_t AssignedSdmaEngine() const { return assigned_sdma_engine_; }
+  //! Caches \p engine_mask as the SDMA engine mask of this VirtualGPU.
+  //! Only the local member is written; the device-wide map is not touched here.
+  void SetAssignedSdmaEngine(uint32_t engine_mask) { assigned_sdma_engine_ = engine_mask; }
+  //! Resets the cached SDMA engine mask to 0 (unassigned).
+  //! Only the local member is written; the device-wide map is not touched here.
+  void ClearAssignedSdmaEngine() { assigned_sdma_engine_ = 0; }
+
 private:
  //! Dispatches a barrier with blocking HSA signals
  void dispatchBlockingWait();
@@ -628,6 +640,9 @@ class VirtualGPU : public device::VirtualDevice {
                                              //!< with a completion signal
  hsa_signal_t last_completion_signal_{};     //!< The last completion signal

+  //! SDMA engine affinity tracking for this VirtualGPU/stream
+  uint32_t assigned_sdma_engine_ = 0;           //!< Assigned SDMA engine mask for all operations
+
  using KernelArgImpl = device::Settings::KernelArgImpl;
 };
 }  // namespace amd::roc
@@ -226,6 +226,8 @@ void HostQueue::finish(bool cpu_wait) {
      }
    }
  }
+  // Release SDMA engine assignments
+  vdev()->ReleaseSdmaEngines();
  // Release all HW queues, which are idle or nearly idle
  vdev()->ReleaseAllHwQueues();