From c8cd6e607e61908aac6c5080cc217fb243c09f34 Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 19 Dec 2018 13:46:12 -0500 Subject: [PATCH] P4 to Git Change 1722452 by skudchad@skudchad_test2_win_opencl on 2018/12/19 13:24:25 SWDEV-145570 - Use Subwindow copy SDMA for D->H and H->D copies if possible or fall back to line-by-line copies if the pitch is unaligned. - Set correct flags for SVM finegrain buffer for ROC backend ReviewBoardURL = http://ocltc.amd.com/reviews/r/16353/diff/ Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#27 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#41 edit --- rocclr/runtime/device/rocm/rocblit.cpp | 101 ++++++++++++++++++----- rocclr/runtime/device/rocm/rocmemory.cpp | 1 + 2 files changed, 80 insertions(+), 22 deletions(-) diff --git a/rocclr/runtime/device/rocm/rocblit.cpp b/rocclr/runtime/device/rocm/rocblit.cpp index 588fc5b3a6..88e723dc57 100644 --- a/rocclr/runtime/device/rocm/rocblit.cpp +++ b/rocclr/runtime/device/rocm/rocblit.cpp @@ -353,7 +353,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d dstMemory.isHostMemDirectAccess())) { return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire); } else { - return false; + void* src = gpuMem(srcMemory).getDeviceMemory(); void* dst = gpuMem(dstMemory).getDeviceMemory(); @@ -363,33 +363,90 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d const hsa_agent_t dstAgent = (dstMemory.isHostMemDirectAccess()) ? 
dev().getCpuAgent() : dev().getBackendDevice(); - const hsa_signal_value_t kInitVal = size[2] * size[1]; - hsa_signal_store_relaxed(completion_signal_, kInitVal); + bool isSubwindowRectCopy = true; + hsa_amd_copy_direction_t direction = hsaHostToHost; - for (size_t z = 0; z < size[2]; ++z) { - for (size_t y = 0; y < size[1]; ++y) { - size_t srcOffset = srcRect.offset(0, y, z); - size_t dstOffset = dstRect.offset(0, y, z); + hsa_agent_t agent = dev().getBackendDevice(); + //Determine copy direction + if (srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess()) { + direction = hsaHostToDevice; + } else if (!srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess()) { + direction = hsaDeviceToHost; + } else if (!srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess()) { + direction = hsaDeviceToDevice; + } - // Copy memory line by line - hsa_status_t status = - hsa_amd_memory_async_copy((reinterpret_cast
(dst) + dstOffset), dstAgent, - (reinterpret_cast(src) + srcOffset), srcAgent, - size[0], 0, nullptr, completion_signal_); - if (status != HSA_STATUS_SUCCESS) { - LogPrintfError("DMA buffer failed with code %d", status); - return false; + hsa_pitched_ptr_t srcMem = { (reinterpret_cast
(src) + srcRect.offset(0, 0, 0)), + srcRect.rowPitch_, + srcRect.slicePitch_ }; + + hsa_pitched_ptr_t dstMem = { (reinterpret_cast
(dst) + dstRect.offset(0, 0, 0)), + dstRect.rowPitch_, + dstRect.slicePitch_ }; + + hsa_dim3_t dim = { static_cast(size[0]), + static_cast(size[1]), + static_cast(size[2]) }; + hsa_dim3_t offset = { 0, 0 ,0 }; + + + if ((srcRect.rowPitch_ % 4 != 0) || + (srcRect.slicePitch_ % 4 != 0) || + (dstRect.rowPitch_ % 4 != 0) || + (dstRect.slicePitch_ % 4 != 0)) { + isSubwindowRectCopy = false; + } + + if (isSubwindowRectCopy ) { + const hsa_signal_value_t kInitVal = 1; + hsa_signal_store_relaxed(completion_signal_, kInitVal); + + // Copy memory line by line + hsa_status_t status = + hsa_amd_memory_async_copy_rect(&srcMem, &offset, &dstMem, &offset, &dim, agent, + direction, 0, nullptr, completion_signal_); + if (status != HSA_STATUS_SUCCESS) { + LogPrintfError("DMA buffer failed with code %d", status); + return false; + } + + + hsa_signal_value_t val = hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, + uint64_t(-1), HSA_WAIT_STATE_BLOCKED); + if (val != 0) { + LogError("Async copy failed"); + return false; + } + } else { + // Fall to line by line copies + const hsa_signal_value_t kInitVal = size[2] * size[1]; + hsa_signal_store_relaxed(completion_signal_, kInitVal); + + for (size_t z = 0; z < size[2]; ++z) { + for (size_t y = 0; y < size[1]; ++y) { + size_t srcOffset = srcRect.offset(0, y, z); + size_t dstOffset = dstRect.offset(0, y, z); + + // Copy memory line by line + hsa_status_t status = + hsa_amd_memory_async_copy((reinterpret_cast
(dst) + dstOffset), dstAgent, + (reinterpret_cast(src) + srcOffset), srcAgent, + size[0], 0, nullptr, completion_signal_); + if (status != HSA_STATUS_SUCCESS) { + LogPrintfError("DMA buffer failed with code %d", status); + return false; + } } } + + hsa_signal_value_t val = hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, + uint64_t(-1), HSA_WAIT_STATE_BLOCKED); + if (val != 0) { + LogError("Async copy failed"); + return false; + } } - hsa_signal_value_t val = hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, - uint64_t(-1), HSA_WAIT_STATE_BLOCKED); - - if (val != 0) { - LogError("Async copy failed"); - return false; - } } return true; } diff --git a/rocclr/runtime/device/rocm/rocmemory.cpp b/rocclr/runtime/device/rocm/rocmemory.cpp index a5f425cfc9..edfdea8e62 100644 --- a/rocclr/runtime/device/rocm/rocmemory.cpp +++ b/rocclr/runtime/device/rocm/rocmemory.cpp @@ -671,6 +671,7 @@ bool Buffer::create() { if (owner()->getSvmPtr() == reinterpret_cast(1)) { if (isFineGrain) { deviceMemory_ = dev().hostAlloc(size(), 1, false); + flags_ |= HostMemoryDirectAccess; } else { deviceMemory_ = dev().deviceLocalAlloc(size()); }