From a669c5ab36accbf73edf8b301728bab75f1dfa3d Mon Sep 17 00:00:00 2001
From: foreman
Date: Mon, 18 Aug 2014 16:46:45 -0400
Subject: [PATCH] P4 to Git Change 1067573 by skudchad@skudchad_opencl_win_2 on
2014/08/18 16:38:03
ECR #304775 - Refactor code to do line-by-line copies for read/write Rect when the sub-window DMA packet restrictions are not met. This avoids taking the blit copy path, which may be even slower.
ReviewBoardURL = http://ocltc.amd.com/reviews/r/5567/
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#108 edit
[ROCm/clr commit: a5e788c9f8d0dae140a00ad2de755afb40801d41]
---
.../clr/rocclr/runtime/device/gpu/gpublit.cpp | 67 ++++++++++---------
1 file changed, 35 insertions(+), 32 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp b/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp
index 9a64e6efed..4d638e5e31 100644
--- a/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp
+++ b/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp
@@ -613,7 +613,41 @@ DmaBlitManager::copyBufferRect(
size_t srcOffset;
size_t dstOffset;
- if (!dev().settings().rectLinearDMA_) {
+ bool subWindowRectCopy = dev().settings().rectLinearDMA_;
+ // 19-bit limit in HW on SI and 16-bit limit on CI+ (we adjust the ElementSize to 4 bytes, but the packet still has 14 bits)
+ size_t pitchLimit = dev().settings().ciPlus_ ? 0xFFFF : 0x7FFFF;
+
+ srcOffset = srcRect.offset(0, 0, 0);
+ dstOffset = dstRect.offset(0, 0, 0);
+
+ if (subWindowRectCopy &&
+ (((srcOffset % 4) != 0) ||
+ ((dstOffset % 4) != 0) ||
+ ((size[0] % 4) != 0) ||
+ ((srcRect.rowPitch_ % 4) != 0) ||
+ ((srcRect.slicePitch_ % 4) != 0) ||
+ ((dstRect.rowPitch_ % 4) != 0) ||
+ ((dstRect.slicePitch_ % 4) != 0) ||
+ (srcRect.rowPitch_ > pitchLimit) ||
+ (dstRect.rowPitch_ > pitchLimit) ||
+ (size[0] > 0x3fff) || // 14 bits limit in HW
+ (size[1] > 0x3fff) || // 14 bits limit in HW
+ (size[2] > 0x7ff))) { // 11 bits limit in HW
+ // Restriction with rectLinearDRMDMA packet
+ subWindowRectCopy = false;
+ }
+
+ if (subWindowRectCopy) {
+ // Copy data with subwindow copy packet
+ if (!gpuMem(srcMemory).partialMemCopyTo(gpu(),
+ amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_),
+ amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_),
+ size, gpuMem(dstMemory), true)) {
+ LogError("copyBufferRect failed!");
+ return false;
+ }
+ }
+ else {
for (size_t z = 0; z < size[2]; ++z) {
for (size_t y = 0; y < size[1]; ++y) {
srcOffset = srcRect.offset(0, y, z);
@@ -632,37 +666,6 @@ DmaBlitManager::copyBufferRect(
}
}
}
- else {
- srcOffset = srcRect.offset(0, 0, 0);
- dstOffset = dstRect.offset(0, 0, 0);
-
- // 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
- size_t pitchLimit = dev().settings().ciPlus_ ? 0xFFFF : 0x7FFFF;
-
- if (((srcOffset % 4) != 0) ||
- ((dstOffset % 4) != 0) ||
- ((size[0] % 4) != 0) ||
- ((srcRect.rowPitch_ % 4) != 0) ||
- ((srcRect.slicePitch_ % 4) != 0) ||
- ((dstRect.rowPitch_ % 4) != 0) ||
- ((dstRect.slicePitch_ % 4) != 0) ||
- (srcRect.rowPitch_ > pitchLimit) ||
- (dstRect.rowPitch_ > pitchLimit) ||
- (size[0] > 0x3fff) || // 14 bits limit in HW
- (size[1] > 0x3fff) || // 14 bits limit in HW
- (size[2] > 0x7ff)) { // 11 bits limit in HW
- // Restriction with rectLinearDRMDMA packet
- return false;
- }
- // Copy data
- if (!gpuMem(srcMemory).partialMemCopyTo(gpu(),
- amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_),
- amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_),
- size, gpuMem(dstMemory), true)) {
- LogError("copyBufferRect failed!");
- return false;
- }
- }
}
return true;
}