From c8cd6e607e61908aac6c5080cc217fb243c09f34 Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 19 Dec 2018 13:46:12 -0500
Subject: [PATCH] P4 to Git Change 1722452 by
skudchad@skudchad_test2_win_opencl on 2018/12/19 13:24:25
SWDEV-145570 - Use Subwindow copy SDMA for D->H and H->D copies if possible, or fall back to line-by-line copies if the pitch is unaligned.
- Set correct flags for SVM finegrain buffer for ROC backend
ReviewBoardURL = http://ocltc.amd.com/reviews/r/16353/diff/
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocblit.cpp#27 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#41 edit
---
rocclr/runtime/device/rocm/rocblit.cpp | 101 ++++++++++++++++++-----
rocclr/runtime/device/rocm/rocmemory.cpp | 1 +
2 files changed, 80 insertions(+), 22 deletions(-)
diff --git a/rocclr/runtime/device/rocm/rocblit.cpp b/rocclr/runtime/device/rocm/rocblit.cpp
index 588fc5b3a6..88e723dc57 100644
--- a/rocclr/runtime/device/rocm/rocblit.cpp
+++ b/rocclr/runtime/device/rocm/rocblit.cpp
@@ -353,7 +353,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
dstMemory.isHostMemDirectAccess())) {
return HostBlitManager::copyBufferRect(srcMemory, dstMemory, srcRect, dstRect, size, entire);
} else {
- return false;
+
void* src = gpuMem(srcMemory).getDeviceMemory();
void* dst = gpuMem(dstMemory).getDeviceMemory();
@@ -363,33 +363,90 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
const hsa_agent_t dstAgent =
(dstMemory.isHostMemDirectAccess()) ? dev().getCpuAgent() : dev().getBackendDevice();
- const hsa_signal_value_t kInitVal = size[2] * size[1];
- hsa_signal_store_relaxed(completion_signal_, kInitVal);
+ bool isSubwindowRectCopy = true;
+ hsa_amd_copy_direction_t direction = hsaHostToHost;
- for (size_t z = 0; z < size[2]; ++z) {
- for (size_t y = 0; y < size[1]; ++y) {
- size_t srcOffset = srcRect.offset(0, y, z);
- size_t dstOffset = dstRect.offset(0, y, z);
+ hsa_agent_t agent = dev().getBackendDevice();
+ //Determine copy direction
+ if (srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess()) {
+ direction = hsaHostToDevice;
+ } else if (!srcMemory.isHostMemDirectAccess() && dstMemory.isHostMemDirectAccess()) {
+ direction = hsaDeviceToHost;
+ } else if (!srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess()) {
+ direction = hsaDeviceToDevice;
+ }
- // Copy memory line by line
- hsa_status_t status =
- hsa_amd_memory_async_copy((reinterpret_cast(dst) + dstOffset), dstAgent,
- (reinterpret_cast(src) + srcOffset), srcAgent,
- size[0], 0, nullptr, completion_signal_);
- if (status != HSA_STATUS_SUCCESS) {
- LogPrintfError("DMA buffer failed with code %d", status);
- return false;
+ hsa_pitched_ptr_t srcMem = { (reinterpret_cast(src) + srcRect.offset(0, 0, 0)),
+ srcRect.rowPitch_,
+ srcRect.slicePitch_ };
+
+ hsa_pitched_ptr_t dstMem = { (reinterpret_cast(dst) + dstRect.offset(0, 0, 0)),
+ dstRect.rowPitch_,
+ dstRect.slicePitch_ };
+
+ hsa_dim3_t dim = { static_cast(size[0]),
+ static_cast(size[1]),
+ static_cast(size[2]) };
+ hsa_dim3_t offset = { 0, 0 ,0 };
+
+
+ if ((srcRect.rowPitch_ % 4 != 0) ||
+ (srcRect.slicePitch_ % 4 != 0) ||
+ (dstRect.rowPitch_ % 4 != 0) ||
+ (dstRect.slicePitch_ % 4 != 0)) {
+ isSubwindowRectCopy = false;
+ }
+
+ if (isSubwindowRectCopy ) {
+ const hsa_signal_value_t kInitVal = 1;
+ hsa_signal_store_relaxed(completion_signal_, kInitVal);
+
+ // Copy the whole rectangle with a single subwindow (rect) copy
+ hsa_status_t status =
+ hsa_amd_memory_async_copy_rect(&srcMem, &offset, &dstMem, &offset, &dim, agent,
+ direction, 0, nullptr, completion_signal_);
+ if (status != HSA_STATUS_SUCCESS) {
+ LogPrintfError("DMA buffer failed with code %d", status);
+ return false;
+ }
+
+
+ hsa_signal_value_t val = hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
+ uint64_t(-1), HSA_WAIT_STATE_BLOCKED);
+ if (val != 0) {
+ LogError("Async copy failed");
+ return false;
+ }
+ } else {
+ // Fall back to line-by-line copies
+ const hsa_signal_value_t kInitVal = size[2] * size[1];
+ hsa_signal_store_relaxed(completion_signal_, kInitVal);
+
+ for (size_t z = 0; z < size[2]; ++z) {
+ for (size_t y = 0; y < size[1]; ++y) {
+ size_t srcOffset = srcRect.offset(0, y, z);
+ size_t dstOffset = dstRect.offset(0, y, z);
+
+ // Copy memory line by line
+ hsa_status_t status =
+ hsa_amd_memory_async_copy((reinterpret_cast(dst) + dstOffset), dstAgent,
+ (reinterpret_cast(src) + srcOffset), srcAgent,
+ size[0], 0, nullptr, completion_signal_);
+ if (status != HSA_STATUS_SUCCESS) {
+ LogPrintfError("DMA buffer failed with code %d", status);
+ return false;
+ }
}
}
+
+ hsa_signal_value_t val = hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
+ uint64_t(-1), HSA_WAIT_STATE_BLOCKED);
+ if (val != 0) {
+ LogError("Async copy failed");
+ return false;
+ }
}
- hsa_signal_value_t val = hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
- uint64_t(-1), HSA_WAIT_STATE_BLOCKED);
-
- if (val != 0) {
- LogError("Async copy failed");
- return false;
- }
}
return true;
}
diff --git a/rocclr/runtime/device/rocm/rocmemory.cpp b/rocclr/runtime/device/rocm/rocmemory.cpp
index a5f425cfc9..edfdea8e62 100644
--- a/rocclr/runtime/device/rocm/rocmemory.cpp
+++ b/rocclr/runtime/device/rocm/rocmemory.cpp
@@ -671,6 +671,7 @@ bool Buffer::create() {
if (owner()->getSvmPtr() == reinterpret_cast(1)) {
if (isFineGrain) {
deviceMemory_ = dev().hostAlloc(size(), 1, false);
+ flags_ |= HostMemoryDirectAccess;
} else {
deviceMemory_ = dev().deviceLocalAlloc(size());
}