diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index 6d9a74a47e..6a68c0fedc 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -1305,6 +1305,7 @@ class VirtualDevice : public amd::ReferenceCountedObject {
   virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); }
 
   virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
+  virtual void ReleaseSdmaEngines() {}  //!< Release SDMA engine assignments (ROCm only; no-op here)
   virtual void ReleaseAllHwQueues() {}
   virtual void ReleaseHwQueue() {}
 
diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp
index 09c3c03b04..daca663f26 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.cpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.cpp
@@ -35,7 +35,6 @@ DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
       StagingXferSize(dev().settings().stagedXferSize_),
       completeOperation_(false),
       context_(nullptr) {
-  dev().getSdmaRWMasks(&sdmaEngineReadMask_, &sdmaEngineWriteMask_);
 }
 
 inline void DmaBlitManager::synchronize() const {
@@ -470,31 +469,13 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem
   return result;
 }
 
-// Select an SDMA engine using priority-based scheduling
-// Prefers engines in preferredMask (high-bandwidth engines), otherwise any free engine
-static inline uint32_t selectSdmaEngine(uint32_t freeMask, uint32_t preferredMask) {
-  if (freeMask == 0) return 0;
-
-  // Try preferred engines first (high-bandwidth engines)
-  uint32_t preferredFree = freeMask & preferredMask;
-  if (preferredFree != 0) {
-    return preferredFree & (~preferredFree + 1);  // Extract lowest preferred engine
-  }
-
-  // Fall back to non-preferred engines (slower engines)
-  return freeMask & (~freeMask + 1);  // Extract lowest available engine
-}
-
 // 
================================================================================================
 inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, const_address src,
                                            hsa_agent_t& srcAgent, size_t size,
                                            amd::CopyMetadata& copyMetadata) const {
   hsa_status_t status = HSA_STATUS_SUCCESS;
-  uint32_t copyMask = 0;
-  uint32_t freeEngineMask = 0;
-  uint32_t recIdMask = 0;
-  bool kUseRegularCopyApi = 0;
+  bool kUseRegularCopyApi = false;
   constexpr size_t kRetainCountThreshold = 8;
   bool forceSDMA =
       (copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::SDMA);
@@ -523,33 +504,38 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
   hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
 
+  // SDMA engine mask for this copy; stays 0 when no engine is assigned or allocated
+  uint32_t copyMask = 0;
   if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
-    // Get the mask of valid engines for this operation (read or write)
-    uint32_t validEngineMask =
-        (engine == HwQueueEngine::SdmaRead ? sdmaEngineReadMask_ : sdmaEngineWriteMask_);
+    // Check if this VirtualGPU already has an assigned engine with affinity
+    // NOTE(review): the cached mask is reused for every engine type and agent pair, although
+    // it was picked from the read or write mask of the first copy - confirm this is intended
+    uint32_t assignedEngineMask = gpu().AssignedSdmaEngine();
 
-    // Check SDMA engine status to get currently free engines
-    status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
-
-    if (status == HSA_STATUS_SUCCESS) {
-      status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &recIdMask);
-    }
-
-    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
-            "Query copy engine status %x, srcAgent %p, "
-            "dstAgent %p, free_engine_mask 0x%x, rec_engine_mask 0x%x",
-            status, srcAgent.handle, dstAgent.handle, freeEngineMask, recIdMask);
-
-    // Constrain to valid engines for this operation
-    freeEngineMask &= validEngineMask;
-    recIdMask &= validEngineMask;
-
-    if (freeEngineMask != 0) {
-      // Use priority-based scheduling: prefer high-bandwidth engines (recIdMask)
-      copyMask = selectSdmaEngine(freeEngineMask, recIdMask);
+    if (assignedEngineMask != 0) {
+      // This VirtualGPU/stream already has an assigned engine - just use it
+      // Stream ordering handles any busy conditions naturally
+      copyMask = assignedEngineMask;
       ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
-              "Selected SDMA engine: free_mask=0x%x, preferred_mask=0x%x, selected_mask=0x%x",
-              freeEngineMask, recIdMask, copyMask);
+              "Using assigned SDMA engine for VirtualGPU %p: mask=0x%x, engine_type=%d",
+              &gpu(), copyMask, engine);
+    } else {
+      // No assigned engine yet - allocate one using device-level allocator
+      copyMask = dev().AllocateSdmaEngine(&gpu(), engine, dstAgent, srcAgent);
+
+      if (copyMask != 0) {
+        // Store the assigned engine in the VirtualGPU for future use
+        gpu().SetAssignedSdmaEngine(copyMask);
+
+        ClPrint(amd::LOG_INFO, amd::LOG_COPY,
+                "Allocated new SDMA engine for VirtualGPU %p: mask=0x%x, engine_type=%d",
+                &gpu(), copyMask, engine);
+      } else {
+        ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
+                "Failed to allocate SDMA engine for VirtualGPU %p, falling back to regular copy",
+                &gpu());
+        kUseRegularCopyApi = true;
+      }
     }
 
     if (copyMask != 0 && status == HSA_STATUS_SUCCESS) {
diff --git a/projects/clr/rocclr/device/rocm/rocblit.hpp b/projects/clr/rocclr/device/rocm/rocblit.hpp
index 4c03582f45..9b60899c20 100644
--- a/projects/clr/rocclr/device/rocm/rocblit.hpp
+++ b/projects/clr/rocclr/device/rocm/rocblit.hpp
@@ -259,8 +259,6 @@ class DmaBlitManager : public device::HostBlitManager {
 
   bool completeOperation_;  //!< DMA blit manager must complete operation
   amd::Context* context_;   //!< A dummy context
-  uint32_t sdmaEngineReadMask_;   //!< SDMA Engine Read Mask
-  uint32_t sdmaEngineWriteMask_;  //!< SDMA Engine Write Mask
 
  private:
   //! Disable copy constructor
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index ad1f993578..8982037806 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -146,6 +146,7 @@ Device::Device(hsa_agent_t bkendDevice)
       preferred_numa_node_(0),
       maxSdmaReadMask_(0),
       maxSdmaWriteMask_(0),
+      sdma_engine_allocator_(*this),
       cpu_agent_info_(nullptr) {
   group_segment_.handle = 0;
   gpuvm_segment_.handle = 0;
@@ -3509,9 +3510,149 @@ void Device::HiddenHeapInit(const VirtualGPU& gpu) {
 }
 
 // ================================================================================================
-void Device::getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const {
-  *readMask = maxSdmaReadMask_;
-  *writeMask = maxSdmaWriteMask_;
+//! Assigns an SDMA engine to \p vgpu and records it in vgpu_to_engine_.
+//! Read copies pick from maxSdmaReadMask_, every other engine type from maxSdmaWriteMask_.
+//! Inter-GPU copies may share a preferred engine; other types only take engines that no other
+//! VirtualGPU holds. Returns a single-bit engine mask, or 0 when no engine could be assigned.
+uint32_t Device::SdmaEngineAllocator::AllocateEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
+                                                      hsa_agent_t dstAgent, hsa_agent_t srcAgent) {
+  amd::ScopedLock lock(lock_);
+
+  // Get valid engine mask based on operation type (read vs write)
+  uint32_t validEngineMask = (engine_type == HwQueueEngine::SdmaRead)
+                                 ? device_.maxSdmaReadMask_
+                                 : device_.maxSdmaWriteMask_;
+
+  // Simple round-robin path if all engines have equal bandwidth
+  // Disabled by default - use preferred engine logic for current GPUs
+  constexpr bool kUseSimpleRR = false;
+
+  if (kUseSimpleRR) {
+    // Simple round-robin: just cycle through valid engines
+    // This will be enabled for future GPUs where engine selection doesn't matter
+    if (validEngineMask == 0) {
+      ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
+              "No valid SDMA engines for VirtualGPU %p", vgpu);
+      return 0;
+    }
+
+    // Cycle through bit positions, find next valid engine
+    uint32_t start_bit = next_rr_engine_.fetch_add(1, std::memory_order_relaxed);
+    uint32_t selected_mask = 0;
+
+    // Try up to 32 positions to find a valid engine
+    for (uint32_t i = 0; i < 32; ++i) {
+      uint32_t bit = (start_bit + i) % 32;
+      uint32_t mask = 1u << bit;
+      if (validEngineMask & mask) {
+        selected_mask = mask;
+        break;
+      }
+    }
+
+    vgpu_to_engine_[vgpu] = selected_mask;
+
+    ClPrint(amd::LOG_INFO, amd::LOG_COPY,
+            "Assigned SDMA engine (simple RR) to VirtualGPU %p: mask=0x%x, engine_type=%d",
+            vgpu, selected_mask, engine_type);
+
+    return selected_mask;
+  }
+
+  // Current path: Query HSA for engine status and preferences
+  // NOTE(review): freeEngineMask and status are only logged; selection below ignores them, and a
+  // failed status query skips the preferred-engine query, leaving preferredMask at 0.
+  uint32_t freeEngineMask = 0;
+  uint32_t preferredMask = 0;
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+
+  // Query current engine status
+  status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
+  if (status == HSA_STATUS_SUCCESS) {
+    // Query preferred (high-bandwidth) engines
+    status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &preferredMask);
+  }
+
+  // Constrain to valid engines
+  freeEngineMask &= validEngineMask;
+  preferredMask &= validEngineMask;
+
+  ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+          "Engine query for VirtualGPU %p: status=%x, free_mask=0x%x, preferred_mask=0x%x, "
+          "valid_mask=0x%x, engine_type=%d",
+          vgpu, status, freeEngineMask, preferredMask, validEngineMask, engine_type);
+
+  uint32_t candidate_mask = 0;
+  uint32_t allocated_mask = 0;
+
+  // For inter-GPU copies, strongly prefer the recommended engines
+  bool is_inter_gpu = (engine_type == HwQueueEngine::SdmaInter);
+
+  if (is_inter_gpu && (preferredMask != 0)) {
+    // Inter-GPU: prioritize preferredMask, even if engines are already allocated
+    candidate_mask = validEngineMask & preferredMask;
+
+    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+            "Inter-GPU copy for VirtualGPU %p: prioritizing preferred engines, "
+            "candidate_mask=0x%x",
+            vgpu, candidate_mask);
+  } else {
+    // Regular read/write/intra: enforce exclusivity (don't share engines)
+    // Build a mask of engines already allocated to other VirtualGPUs
+    for (const auto& pair : vgpu_to_engine_) {
+      if (pair.first != vgpu) allocated_mask |= pair.second;
+    }
+
+    uint32_t available_mask = validEngineMask & ~allocated_mask;
+
+    if (available_mask == 0) {
+      ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
+              "No unallocated SDMA engines available for VirtualGPU %p, engine_type=%d "
+              "(valid_mask=0x%x, allocated_mask=0x%x)",
+              vgpu, engine_type, validEngineMask, allocated_mask);
+      return 0;
+    }
+
+    // Prefer high-bandwidth (recommended) engines if available
+    candidate_mask = available_mask & preferredMask;
+    if (candidate_mask == 0) {
+      candidate_mask = available_mask;
+    }
+  }
+
+  if (candidate_mask == 0) {
+    ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
+            "No candidate SDMA engines for VirtualGPU %p, engine_type=%d",
+            vgpu, engine_type);
+    return 0;
+  }
+
+  // Select the lowest bit (first available engine)
+  uint32_t selected_mask = candidate_mask & (~candidate_mask + 1);
+
+  // Update the map
+  vgpu_to_engine_[vgpu] = selected_mask;
+
+  ClPrint(amd::LOG_INFO, amd::LOG_COPY,
+          "Assigned SDMA engine to VirtualGPU %p: mask=0x%x, engine_type=%d, "
+          "valid_mask=0x%x, preferred_mask=0x%x, allocated_mask=0x%x, is_inter_gpu=%d",
+          vgpu, selected_mask, engine_type, validEngineMask, preferredMask,
+          allocated_mask, is_inter_gpu);
+
+  return selected_mask;
+}
+
+// 
================================================================================================
+void Device::SdmaEngineAllocator::ReleaseEngine(VirtualGPU* vgpu) {
+  amd::ScopedLock guard(lock_);
+  auto entry = vgpu_to_engine_.find(vgpu);
+  // Nothing to do when this VirtualGPU holds no engine assignment
+  if (entry == vgpu_to_engine_.end()) return;
+
+  ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
+          "Released SDMA engine for VirtualGPU %p: mask=0x%x",
+          vgpu, entry->second);
+  vgpu_to_engine_.erase(entry);
 }
 
 // ================================================================================================
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index b6f42cf97c..4b90935533 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -602,9 +602,16 @@ class Device : public NullDevice {
   void HiddenHeapAlloc(const VirtualGPU& gpu);
   //! Init hidden heap for device memory allocations
   void HiddenHeapInit(const VirtualGPU& gpu);
-  void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
   bool isXgmi() const override { return isXgmi_; }
+  //! Picks an SDMA engine for \p vgpu through the device-wide allocator (per-stream affinity).
+  //! Returns the selected engine mask, or 0 when no engine could be assigned.
+  uint32_t AllocateSdmaEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
+                              hsa_agent_t dstAgent, hsa_agent_t srcAgent) const {
+    return sdma_engine_allocator_.AllocateEngine(vgpu, engine_type, dstAgent, srcAgent);
+  }
+  //! Drops the allocator's engine record for \p vgpu, if there is one.
+  void ReleaseSdmaEngine(VirtualGPU* vgpu) const { sdma_engine_allocator_.ReleaseEngine(vgpu); }
 
   //! Returns the map of code objects to kernels
   const auto& KernelMap() const { return kernel_map_; }
   //! Adds a kernel to the kernel map
@@ -706,6 +713,27 @@ class Device : public NullDevice {
   bool isXgmi_;                 //!< Flag to indicate if there is XGMI between CPU<->GPU
   bool pm4_emulation_ = false;  //!< Flag to indicate if PM4 emulation is enabled
 
+  //! 
SDMA engine allocator for per-stream affinity (one engine mask per VirtualGPU)
+  struct SdmaEngineAllocator {
+    amd::Monitor lock_;  //!< Protects the allocation state
+    std::unordered_map<VirtualGPU*, uint32_t> vgpu_to_engine_;  //!< VirtualGPU -> engine mask
+    std::atomic<uint32_t> next_rr_engine_{0};  //!< Simple RR counter for future use
+    const Device& device_;  //!< Reference to parent device for accessing masks
+
+    explicit SdmaEngineAllocator(const Device& device)
+        : lock_(true), device_(device) {}
+
+    //! Allocate an SDMA engine for a VirtualGPU; returns its mask, or 0 if none was assigned
+    //! Queries HSA for engine status and preferred engines, then allocates
+    //! For inter-GPU copies, strongly prefers recommended engines even if already allocated
+    uint32_t AllocateEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
+                            hsa_agent_t dstAgent, hsa_agent_t srcAgent);
+
+    //! Release engine allocation for a VirtualGPU (no effect if it has none)
+    void ReleaseEngine(VirtualGPU* vgpu);
+  };
+  mutable SdmaEngineAllocator sdma_engine_allocator_;
+
   //! Code object to kernel info map (used in the crash dump analysis)
   mutable std::map kernel_map_;
 
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 0814e460fe..00c54e15e9 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -1783,6 +1783,10 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
 
 // ================================================================================================
 VirtualGPU::~VirtualGPU() {
+  // Release SDMA engine assignment for this VirtualGPU
+  dev().ReleaseSdmaEngine(this);
+  ClearAssignedSdmaEngine();
+
   delete blitMgr_;
 
   if (tracking_created_) {
@@ -1990,6 +1994,14 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
   }
 }
 
+// ================================================================================================
+void VirtualGPU::ReleaseSdmaEngines() {
+  // Drop both the device-level record and the cached mask for this VirtualGPU
+  // This allows the engine to be reassigned to other active streams
+  dev().ReleaseSdmaEngine(this);
+  ClearAssignedSdmaEngine();
+}
+
 // ================================================================================================
 void VirtualGPU::ReleaseAllHwQueues() {
   if (roc_device_.settings().dynamic_queues_ &&
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index de3185fd06..518adfec63 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -390,6 +390,7 @@ class VirtualGPU : public device::VirtualDevice {
   virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {}
 
   virtual address allocKernelArguments(size_t size, size_t alignment) final;
+  virtual void ReleaseSdmaEngines() final;  //!< Release SDMA engine assignments
   virtual void ReleaseAllHwQueues() final;
   virtual void ReleaseHwQueue() final;
 
@@ -459,6 +460,17 @@ class VirtualGPU : public device::VirtualDevice {
   void AnalyzeAqlQueue() const;
   bool ForceIrq() const { return force_irq_; }
 
+  //! SDMA engine affinity management (the mask is 0 while no engine is assigned)
+  uint32_t AssignedSdmaEngine() const {
+    return assigned_sdma_engine_;
+  }
+  void SetAssignedSdmaEngine(uint32_t engine_mask) {
+    assigned_sdma_engine_ = engine_mask;
+  }
+  void ClearAssignedSdmaEngine() {
+    assigned_sdma_engine_ = 0;
+  }
+
  private:
   //! Dispatches a barrier with blocking HSA signals
   void dispatchBlockingWait();
@@ -628,6 +640,9 @@ class VirtualGPU : public device::VirtualDevice {
                                            //!< with a completion signal
   hsa_signal_t last_completion_signal_{};  //!< The last completion signal
 
+  //! 
SDMA engine affinity tracking for this VirtualGPU/stream
+  uint32_t assigned_sdma_engine_ = 0;  //!< Mask reused for all SDMA operations; 0 = none assigned
+
   using KernelArgImpl = device::Settings::KernelArgImpl;
 };
 }  // namespace amd::roc
diff --git a/projects/clr/rocclr/platform/commandqueue.cpp b/projects/clr/rocclr/platform/commandqueue.cpp
index f8fa12f3ad..4fead460b0 100644
--- a/projects/clr/rocclr/platform/commandqueue.cpp
+++ b/projects/clr/rocclr/platform/commandqueue.cpp
@@ -226,6 +226,8 @@ void HostQueue::finish(bool cpu_wait) {
       }
     }
   }
+  // Release SDMA engine assignments first, then the HW queues below
+  vdev()->ReleaseSdmaEngines();
   // Release all HW queues, which are idle or nearly idle
   vdev()->ReleaseAllHwQueues();
 