diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index 871319cfc8..697fe7c33d 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -679,20 +679,23 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, uint32_t copyMask = 0; uint32_t freeEngineMask = 0; - bool useRegularCopyApi = false; + bool useRegularCopyApi = !HIP_USE_SDMA_QUERY; HwQueueEngine engine = HwQueueEngine::Unknown; if ((srcAgent.handle == dev().getCpuAgent().handle) && (dstAgent.handle != dev().getCpuAgent().handle)) { engine = HwQueueEngine::SdmaWrite; - copyMask = dev().fetchSDMAMask(this, false); + copyMask = useRegularCopyApi ? 0 : dev().fetchSDMAMask(this, false); } else if ((srcAgent.handle != dev().getCpuAgent().handle) && (dstAgent.handle == dev().getCpuAgent().handle)) { engine = HwQueueEngine::SdmaRead; - copyMask = dev().fetchSDMAMask(this, true); + copyMask = useRegularCopyApi ? 0 : dev().fetchSDMAMask(this, true); } - if (engine != HwQueueEngine::Unknown) { + auto wait_events = gpu().Barriers().WaitingSignal(engine); + hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); + + if (!useRegularCopyApi && engine != HwQueueEngine::Unknown) { if (copyMask == 0) { // Check SDMA engine status status = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask); @@ -703,8 +706,6 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, } if (copyMask != 0 && status == HSA_STATUS_SUCCESS) { - auto wait_events = gpu().Barriers().WaitingSignal(engine); - hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); // Copy on the first available free engine if ROCr returns a valid mask hsa_amd_sdma_engine_id_t copyEngine = static_cast(copyMask); @@ -717,33 +718,26 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory, status = hsa_amd_memory_async_copy_on_engine(dst, dstAgent, src, srcAgent, size[0], wait_events.size(), wait_events.data(), active, copyEngine, false); - if (status != HSA_STATUS_SUCCESS) { - gpu().Barriers().ResetCurrentSignal(); - } } else { useRegularCopyApi = true; } } if (engine == HwQueueEngine::Unknown || useRegularCopyApi) { - auto wait_events = gpu().Barriers().WaitingSignal(engine); - hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp()); ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, - "HSA Async Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, " - "completion_signal=0x%zx", - dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0, - active.handle); + "HSA Async Copy dst=0x%zx, src=0x%zx, size=%ld, wait_event=0x%zx, " + "completion_signal=0x%zx", + dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0, + active.handle); status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], wait_events.size(), wait_events.data(), active); - if (status != HSA_STATUS_SUCCESS) { - gpu().Barriers().ResetCurrentSignal(); - } } if (status == HSA_STATUS_SUCCESS) { gpu().addSystemScope(); } else { + gpu().Barriers().ResetCurrentSignal(); LogPrintfError("HSA copy failed with code %d, falling to Blit copy", status); } diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp index 6a33dd0918..5b95d4a7ab 100644 --- a/rocclr/utils/flags.hpp +++ b/rocclr/utils/flags.hpp @@ -285,7 +285,9 @@ release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, false, \ release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi, \ "Set initial heap size for device malloc.") \ release(bool, HIP_FORCE_DEV_KERNARG, 0, \ - "Force device mem for kernel args") + "Force device mem for kernel args") \ +release(bool, HIP_USE_SDMA_QUERY, 0, \ + "Use SDMA query API to make copy decisions") namespace amd {