clr: Implement per-stream SDMA engine affinity for improved copy performance (#2480)

Problem:
The existing SDMA engine selection logic had several issues:
1. Same VirtualGPU/stream could use different SDMA engines for consecutive
   async copies since copy_engine_status may report engines as busy
2. The busy-engine and preferred-engine queries were issued on every copy
3. No global tracking of which VirtualGPU uses which engine, leading to
   suboptimal resource allocation

Solution:
Implemented a global SDMA engine allocator with per-stream affinity:

- Added Device::SdmaEngineAllocator to manage VirtualGPU → engine assignments
  * Maintains global map of active assignments
  * Enforces exclusivity: different streams use different engines (except
    inter-GPU copies where preferred engines are prioritized for optimal
    hardware paths like XGMI links)
  * Thread-safe allocation/release with Monitor lock

- Modified VirtualGPU to cache assigned engine locally (assigned_sdma_engine_)
  for fast lookup without map access on hot path

- Refactored rocrCopyBuffer() to:
  1. Check local cached engine first → use if assigned
  2. Call AllocateSdmaEngine() if not assigned → cache result

- Moved HSA API queries (memory_copy_engine_status, memory_get_preferred_copy_engine)
  into AllocateEngine() for cleaner separation of concerns

- Engines are now released in HostQueue::finish(), not only on VirtualGPU destruction
  * Improves engine utilization by releasing earlier
  * Added virtual ReleaseSdmaEngines() method to device::VirtualDevice

- Added future path for simple round-robin allocation (kUseSimpleRR) for
  next-gen GPUs with uniform SDMA bandwidth (disabled by default)

Cleanup:
- Removed selectSdmaEngine() helper (logic moved to allocator)
- Removed getSdmaRWMasks() (allocator accesses maxSdmaReadMask_/WriteMask_ directly)
- Removed unused sdmaEngineReadMask_/WriteMask_ member variables from DmaBlitManager

Benefits:
- Ensures consistent per-stream SDMA engine usage
- Prevents cross-stream contention and engine thrashing
- Prioritizes hardware-optimal paths for inter-GPU transfers
- Better resource utilization through earlier release
- Cleaner, more maintainable code structure
This commit is contained in:
SaleelK
2026-01-07 19:37:45 -08:00
committed by GitHub
parent be04fa8250
commit 6b28faa532
8 changed files with 223 additions and 50 deletions
@@ -1305,6 +1305,7 @@ class VirtualDevice : public amd::ReferenceCountedObject {
virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); }
virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
virtual void ReleaseSdmaEngines() {} //!< Release SDMA engine assignments (ROCm specific)
virtual void ReleaseAllHwQueues() {}
virtual void ReleaseHwQueue() {}
@@ -35,7 +35,6 @@ DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
StagingXferSize(dev().settings().stagedXferSize_),
completeOperation_(false),
context_(nullptr) {
dev().getSdmaRWMasks(&sdmaEngineReadMask_, &sdmaEngineWriteMask_);
}
inline void DmaBlitManager::synchronize() const {
@@ -470,31 +469,13 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem
return result;
}
// Choose a single SDMA engine bit out of the set of free engines.
// Engines that are both free and preferred (high-bandwidth) win; when no preferred
// engine is free, the lowest-numbered free engine is taken instead.
// Returns 0 when no engine is free.
static inline uint32_t selectSdmaEngine(uint32_t freeMask, uint32_t preferredMask) {
  // Narrow the pool to the preferred engines when at least one of them is free
  const uint32_t favoured = freeMask & preferredMask;
  const uint32_t pool = (favoured != 0) ? favoured : freeMask;
  // pool & -pool keeps only the lowest set bit; an empty pool yields 0
  return pool & (0u - pool);
}
// ================================================================================================
inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, const_address src,
hsa_agent_t& srcAgent, size_t size,
amd::CopyMetadata& copyMetadata) const {
hsa_status_t status = HSA_STATUS_SUCCESS;
uint32_t copyMask = 0;
uint32_t freeEngineMask = 0;
uint32_t recIdMask = 0;
bool kUseRegularCopyApi = 0;
bool kUseRegularCopyApi = false;
constexpr size_t kRetainCountThreshold = 8;
bool forceSDMA =
(copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::SDMA);
@@ -523,33 +504,34 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
// Get the mask of valid engines for this operation (read or write)
uint32_t validEngineMask =
(engine == HwQueueEngine::SdmaRead ? sdmaEngineReadMask_ : sdmaEngineWriteMask_);
// Check if this VirtualGPU already has an assigned engine with affinity
uint32_t assignedEngineMask = gpu().AssignedSdmaEngine();
// Check SDMA engine status to get currently free engines
status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
if (status == HSA_STATUS_SUCCESS) {
status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &recIdMask);
}
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"Query copy engine status %x, srcAgent %p, "
"dstAgent %p, free_engine_mask 0x%x, rec_engine_mask 0x%x",
status, srcAgent.handle, dstAgent.handle, freeEngineMask, recIdMask);
// Constrain to valid engines for this operation
freeEngineMask &= validEngineMask;
recIdMask &= validEngineMask;
if (freeEngineMask != 0) {
// Use priority-based scheduling: prefer high-bandwidth engines (recIdMask)
copyMask = selectSdmaEngine(freeEngineMask, recIdMask);
if (assignedEngineMask != 0) {
// This VirtualGPU/stream already has an assigned engine - just use it
// Stream ordering handles any busy conditions naturally
copyMask = assignedEngineMask;
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"Selected SDMA engine: free_mask=0x%x, preferred_mask=0x%x, selected_mask=0x%x",
freeEngineMask, recIdMask, copyMask);
"Using assigned SDMA engine for VirtualGPU %p: mask=0x%x, engine_type=%d",
&gpu(), copyMask, engine);
} else {
// No assigned engine yet - allocate one using device-level allocator
copyMask = dev().AllocateSdmaEngine(&gpu(), engine, dstAgent, srcAgent);
if (copyMask != 0) {
// Store the assigned engine in the VirtualGPU for future use
gpu().SetAssignedSdmaEngine(copyMask);
ClPrint(amd::LOG_INFO, amd::LOG_COPY,
"Allocated new SDMA engine for VirtualGPU %p: mask=0x%x, engine_type=%d",
&gpu(), copyMask, engine);
} else {
ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
"Failed to allocate SDMA engine for VirtualGPU %p, falling back to regular copy",
&gpu());
kUseRegularCopyApi = true;
}
}
if (copyMask != 0 && status == HSA_STATUS_SUCCESS) {
@@ -259,8 +259,6 @@ class DmaBlitManager : public device::HostBlitManager {
bool completeOperation_; //!< DMA blit manager must complete operation
amd::Context* context_; //!< A dummy context
uint32_t sdmaEngineReadMask_; //!< SDMA Engine Read Mask
uint32_t sdmaEngineWriteMask_; //!< SDMA Engine Write Mask
private:
//! Disable copy constructor
@@ -146,6 +146,7 @@ Device::Device(hsa_agent_t bkendDevice)
preferred_numa_node_(0),
maxSdmaReadMask_(0),
maxSdmaWriteMask_(0),
sdma_engine_allocator_(*this),
cpu_agent_info_(nullptr) {
group_segment_.handle = 0;
gpuvm_segment_.handle = 0;
@@ -3509,9 +3510,143 @@ void Device::HiddenHeapInit(const VirtualGPU& gpu) {
}
// ================================================================================================
void Device::getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const {
*readMask = maxSdmaReadMask_;
*writeMask = maxSdmaWriteMask_;
//! Picks one SDMA engine bit for \a vgpu and records it in vgpu_to_engine_.
//!
//! lock_ is held for the whole call. The valid-engine mask is the device's
//! maxSdmaReadMask_ when engine_type is HwQueueEngine::SdmaRead and
//! maxSdmaWriteMask_ for every other engine type.
//!
//! \param vgpu        Key under which the selection is stored in the map
//! \param engine_type Kind of copy; selects the valid mask and the inter-GPU policy
//! \param dstAgent    Destination agent passed to the HSA copy-engine queries
//! \param srcAgent    Source agent passed to the HSA copy-engine queries
//! \return A mask with exactly one bit set for the chosen engine, or 0 when no
//!         engine could be selected (the map is left untouched in that case)
uint32_t Device::SdmaEngineAllocator::AllocateEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
hsa_agent_t dstAgent, hsa_agent_t srcAgent) {
amd::ScopedLock lock(lock_);
// Get valid engine mask based on operation type (read vs write)
uint32_t validEngineMask = (engine_type == HwQueueEngine::SdmaRead)
? device_.maxSdmaReadMask_
: device_.maxSdmaWriteMask_;
// Simple round-robin path if all engines have equal bandwidth
// Disabled by default - use preferred engine logic for current GPUs
// The flag is a compile-time constant, so the branch below is never taken while it is false
constexpr bool kUseSimpleRR = false;
if (kUseSimpleRR) {
// Simple round-robin: just cycle through valid engines
// This will be enabled for future GPUs where engine selection doesn't matter
if (validEngineMask == 0) {
ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
"No valid SDMA engines for VirtualGPU %p", vgpu);
return 0;
}
// Cycle through bit positions, find next valid engine
// The counter is never reset here; the modulo in the loop maps it onto bit positions 0..31
uint32_t start_bit = next_rr_engine_.fetch_add(1, std::memory_order_relaxed);
uint32_t selected_mask = 0;
// Try up to 32 positions to find a valid engine
for (uint32_t i = 0; i < 32; ++i) {
uint32_t bit = (start_bit + i) % 32;
uint32_t mask = 1u << bit;
if (validEngineMask & mask) {
selected_mask = mask;
break;
}
}
// No exclusivity check on this path: the same engine may be recorded for several VirtualGPUs
vgpu_to_engine_[vgpu] = selected_mask;
ClPrint(amd::LOG_INFO, amd::LOG_COPY,
"Assigned SDMA engine (simple RR) to VirtualGPU %p: mask=0x%x, engine_type=%d",
vgpu, selected_mask, engine_type);
return selected_mask;
}
// Current path: Query HSA for engine status and preferences
uint32_t freeEngineMask = 0;
uint32_t preferredMask = 0;
hsa_status_t status = HSA_STATUS_SUCCESS;
// Query current engine status
// NOTE(review): freeEngineMask is only logged below; it takes no part in the selection - confirm intended
status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
if (status == HSA_STATUS_SUCCESS) {
// Query preferred (high-bandwidth) engines
status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &preferredMask);
}
// NOTE(review): a failing status is only logged; selection continues with the masks as they are
// Constrain to valid engines
freeEngineMask &= validEngineMask;
preferredMask &= validEngineMask;
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"Engine query for VirtualGPU %p: status=%x, free_mask=0x%x, preferred_mask=0x%x, "
"valid_mask=0x%x, engine_type=%d",
vgpu, status, freeEngineMask, preferredMask, validEngineMask, engine_type);
uint32_t candidate_mask = 0;
uint32_t allocated_mask = 0;
// For inter-GPU copies, strongly prefer the recommended engines
bool is_inter_gpu = (engine_type == HwQueueEngine::SdmaInter);
if (is_inter_gpu && (preferredMask != 0)) {
// Inter-GPU: prioritize preferredMask, even if engines are already allocated
// allocated_mask is not computed on this path, so the final log prints it as 0
candidate_mask = validEngineMask & preferredMask;
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"Inter-GPU copy for VirtualGPU %p: prioritizing preferred engines, "
"candidate_mask=0x%x",
vgpu, candidate_mask);
} else {
// Regular read/write/intra: enforce exclusivity (don't share engines)
// Build a mask of engines already allocated to other VirtualGPUs
// NOTE(review): the loop covers every map entry, including one keyed by vgpu itself if present
for (const auto& pair : vgpu_to_engine_) {
allocated_mask |= pair.second;
}
uint32_t available_mask = validEngineMask & ~allocated_mask;
if (available_mask == 0) {
ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
"No unallocated SDMA engines available for VirtualGPU %p, engine_type=%d "
"(valid_mask=0x%x, allocated_mask=0x%x)",
vgpu, engine_type, validEngineMask, allocated_mask);
return 0;
}
// Prefer high-bandwidth (recommended) engines if available
candidate_mask = available_mask & preferredMask;
if (candidate_mask == 0) {
candidate_mask = available_mask;
}
}
// NOTE(review): both branches above appear to leave candidate_mask non-zero, so this looks purely defensive
if (candidate_mask == 0) {
ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
"No candidate SDMA engines for VirtualGPU %p, engine_type=%d",
vgpu, engine_type);
return 0;
}
// Select the lowest bit (first available engine)
// ~x + 1 is the two's-complement negation of x, so x & (~x + 1) isolates the lowest set bit
uint32_t selected_mask = candidate_mask & (~candidate_mask + 1);
// Update the map
vgpu_to_engine_[vgpu] = selected_mask;
ClPrint(amd::LOG_INFO, amd::LOG_COPY,
"Assigned SDMA engine to VirtualGPU %p: mask=0x%x, engine_type=%d, "
"valid_mask=0x%x, preferred_mask=0x%x, allocated_mask=0x%x, is_inter_gpu=%d",
vgpu, selected_mask, engine_type, validEngineMask, preferredMask,
allocated_mask, is_inter_gpu);
return selected_mask;
}
// ================================================================================================
//! Drops the engine assignment recorded for \a vgpu, if there is one.
//! The lookup and the removal both happen while lock_ is held; a VirtualGPU without
//! an entry is ignored silently.
void Device::SdmaEngineAllocator::ReleaseEngine(VirtualGPU* vgpu) {
  amd::ScopedLock guard(lock_);

  const auto entry = vgpu_to_engine_.find(vgpu);
  if (entry == vgpu_to_engine_.end()) {
    // Nothing was assigned to this VirtualGPU
    return;
  }

  // Log the mask before the entry disappears from the map
  ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
          "Released SDMA engine for VirtualGPU %p: mask=0x%x",
          vgpu, entry->second);
  vgpu_to_engine_.erase(entry);
}
// ================================================================================================
@@ -602,9 +602,16 @@ class Device : public NullDevice {
void HiddenHeapAlloc(const VirtualGPU& gpu);
//! Init hidden heap for device memory allocations
void HiddenHeapInit(const VirtualGPU& gpu);
void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
bool isXgmi() const override { return isXgmi_; }
//! SDMA engine allocation for per-stream affinity
//! Forwards to sdma_engine_allocator_.AllocateEngine() and returns its result:
//! the selected engine mask, or 0 when no engine was selected.
//! The method is const; the allocator member it touches is declared mutable.
uint32_t AllocateSdmaEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
hsa_agent_t dstAgent, hsa_agent_t srcAgent) const {
return sdma_engine_allocator_.AllocateEngine(vgpu, engine_type, dstAgent, srcAgent);
}
//! Forwards to sdma_engine_allocator_.ReleaseEngine(), which removes the
//! assignment recorded for \a vgpu if one exists
void ReleaseSdmaEngine(VirtualGPU* vgpu) const {
sdma_engine_allocator_.ReleaseEngine(vgpu);
}
//! Returns the map of code objects to kernels
const auto& KernelMap() const { return kernel_map_; }
//! Adds a kernel to the kernel map
@@ -706,6 +713,27 @@ class Device : public NullDevice {
bool isXgmi_; //!< Flag to indicate if there is XGMI between CPU<->GPU
bool pm4_emulation_ = false; //!< Flag to indicate if PM4 emulation is enabled
//! SDMA engine allocator for per-stream affinity.
//! Records which engine mask AllocateEngine() handed to each VirtualGPU; entries are
//! removed again by ReleaseEngine(). Both operations take lock_.
struct SdmaEngineAllocator {
  amd::Monitor lock_;  //!< Protects the allocation state
  std::unordered_map<VirtualGPU*, uint32_t> vgpu_to_engine_;  //!< VirtualGPU -> engine mask
  std::atomic<uint32_t> next_rr_engine_{0};  //!< Simple RR counter for future use
  const Device& device_;  //!< Reference to parent device for accessing masks

  //! Binds the allocator to its parent device.
  //! Marked explicit so that a Device never converts to an allocator implicitly.
  //! NOTE(review): lock_(true) presumably requests a recursive Monitor - confirm
  explicit SdmaEngineAllocator(const Device& device)
      : lock_(true), device_(device) {}

  //! Allocate an SDMA engine for a VirtualGPU
  //! Queries HSA for engine status and preferred engines, then allocates
  //! For inter-GPU copies, strongly prefers recommended engines even if already allocated
  //! \return Selected engine mask, or 0 when no engine could be selected
  uint32_t AllocateEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
                          hsa_agent_t dstAgent, hsa_agent_t srcAgent);

  //! Release engine allocation for a VirtualGPU
  void ReleaseEngine(VirtualGPU* vgpu);
};
mutable SdmaEngineAllocator sdma_engine_allocator_;
//! Code object to kernel info map (used in the crash dump analysis)
mutable std::map<uint64_t, Kernel&> kernel_map_;
@@ -1783,6 +1783,10 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
// ================================================================================================
VirtualGPU::~VirtualGPU() {
// Release SDMA engine assignment for this VirtualGPU
dev().ReleaseSdmaEngine(this);
ClearAssignedSdmaEngine();
delete blitMgr_;
if (tracking_created_) {
@@ -1990,6 +1994,14 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
}
}
// ================================================================================================
//! Gives up this VirtualGPU's SDMA engine assignment: first asks the device to
//! release it, then resets the locally cached engine mask to 0.
void VirtualGPU::ReleaseSdmaEngines() {
// Release SDMA engine assignment when queue is idle
// This allows the engine to be reassigned to other active streams
dev().ReleaseSdmaEngine(this);
ClearAssignedSdmaEngine();
}
// ================================================================================================
void VirtualGPU::ReleaseAllHwQueues() {
if (roc_device_.settings().dynamic_queues_ &&
@@ -390,6 +390,7 @@ class VirtualGPU : public device::VirtualDevice {
virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {}
virtual address allocKernelArguments(size_t size, size_t alignment) final;
virtual void ReleaseSdmaEngines() final; //!< Release SDMA engine assignments
virtual void ReleaseAllHwQueues() final;
virtual void ReleaseHwQueue() final;
@@ -459,6 +460,17 @@ class VirtualGPU : public device::VirtualDevice {
void AnalyzeAqlQueue() const;
bool ForceIrq() const { return force_irq_; }
//! SDMA engine affinity management
//! Returns the cached SDMA engine mask for this VirtualGPU; 0 means none is assigned
uint32_t AssignedSdmaEngine() const {
return assigned_sdma_engine_;
}
//! Caches \a engine_mask as the SDMA engine assigned to this VirtualGPU
void SetAssignedSdmaEngine(uint32_t engine_mask) {
assigned_sdma_engine_ = engine_mask;
}
//! Resets the cached SDMA engine mask to 0 (no engine assigned).
//! Only the local cache is touched; the device-level assignment is not released here.
void ClearAssignedSdmaEngine() {
assigned_sdma_engine_ = 0;
}
private:
//! Dispatches a barrier with blocking HSA signals
void dispatchBlockingWait();
@@ -628,6 +640,9 @@ class VirtualGPU : public device::VirtualDevice {
//!< with a completion signal
hsa_signal_t last_completion_signal_{}; //!< The last completion signal
//! SDMA engine affinity tracking for this VirtualGPU/stream
uint32_t assigned_sdma_engine_ = 0; //!< Assigned SDMA engine mask for all operations
using KernelArgImpl = device::Settings::KernelArgImpl;
};
} // namespace amd::roc
@@ -226,6 +226,8 @@ void HostQueue::finish(bool cpu_wait) {
}
}
}
// Release SDMA engine assignments
vdev()->ReleaseSdmaEngines();
// Release all HW queues, which are idle or nearly idle
vdev()->ReleaseAllHwQueues();