From e6e0378acdd84470de81855b27e4c1aba96b3267 Mon Sep 17 00:00:00 2001 From: SaleelK Date: Mon, 12 Jan 2026 11:01:02 -0800 Subject: [PATCH] clr: Always query new engine for intergpu copies (#2559) --- projects/clr/rocclr/device/rocm/rocblit.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index daca663f26..cc8bd1c46d 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -507,7 +507,11 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c // Check if this VirtualGPU already has an assigned engine with affinity uint32_t assignedEngineMask = gpu().AssignedSdmaEngine(); - if (assignedEngineMask != 0) { + // For inter-GPU copies, always query the preferred engine for the src/dst agents, + // because the allocator has special logic to select high-bandwidth engines + // for specific src/dst pairs; an engine assigned for a different copy type must not be reused + + if (assignedEngineMask != 0 && engine != HwQueueEngine::SdmaInter) { // This VirtualGPU/stream already has an assigned engine - just use it // Stream ordering handles any busy conditions naturally copyMask = assignedEngineMask;