From a669c5ab36accbf73edf8b301728bab75f1dfa3d Mon Sep 17 00:00:00 2001 From: foreman Date: Mon, 18 Aug 2014 16:46:45 -0400 Subject: [PATCH] P4 to Git Change 1067573 by skudchad@skudchad_opencl_win_2 on 2014/08/18 16:38:03 ECR #304775 - Refactor code to do line-by-line copies for read/write Rect. This avoids taking the blit copy path, which may be even slower. ReviewBoardURL = http://ocltc.amd.com/reviews/r/5567/ Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#108 edit [ROCm/clr commit: a5e788c9f8d0dae140a00ad2de755afb40801d41] --- .../clr/rocclr/runtime/device/gpu/gpublit.cpp | 67 ++++++++++--------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp b/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp index 9a64e6efed..4d638e5e31 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpublit.cpp @@ -613,7 +613,41 @@ DmaBlitManager::copyBufferRect( size_t srcOffset; size_t dstOffset; - if (!dev().settings().rectLinearDMA_) { + bool subWindowRectCopy = dev().settings().rectLinearDMA_; + // 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits) + size_t pitchLimit = dev().settings().ciPlus_ ? 
0xFFFF : 0x7FFFF; + + srcOffset = srcRect.offset(0, 0, 0); + dstOffset = dstRect.offset(0, 0, 0); + + if (subWindowRectCopy && + (((srcOffset % 4) != 0) || + ((dstOffset % 4) != 0) || + ((size[0] % 4) != 0) || + ((srcRect.rowPitch_ % 4) != 0) || + ((srcRect.slicePitch_ % 4) != 0) || + ((dstRect.rowPitch_ % 4) != 0) || + ((dstRect.slicePitch_ % 4) != 0) || + (srcRect.rowPitch_ > pitchLimit) || + (dstRect.rowPitch_ > pitchLimit) || + (size[0] > 0x3fff) || // 14 bits limit in HW + (size[1] > 0x3fff) || // 14 bits limit in HW + (size[2] > 0x7ff))) { // 11 bits limit in HW + // Restriction with rectLinearDRMDMA packet + subWindowRectCopy = false; + } + + if (subWindowRectCopy) { + // Copy data with subwindow copy packet + if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), + amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_), + amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_), + size, gpuMem(dstMemory), true)) { + LogError("copyBufferRect failed!"); + return false; + } + } + else { for (size_t z = 0; z < size[2]; ++z) { for (size_t y = 0; y < size[1]; ++y) { srcOffset = srcRect.offset(0, y, z); @@ -632,37 +666,6 @@ DmaBlitManager::copyBufferRect( } } } - else { - srcOffset = srcRect.offset(0, 0, 0); - dstOffset = dstRect.offset(0, 0, 0); - - // 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits) - size_t pitchLimit = dev().settings().ciPlus_ ? 
0xFFFF : 0x7FFFF; - - if (((srcOffset % 4) != 0) || - ((dstOffset % 4) != 0) || - ((size[0] % 4) != 0) || - ((srcRect.rowPitch_ % 4) != 0) || - ((srcRect.slicePitch_ % 4) != 0) || - ((dstRect.rowPitch_ % 4) != 0) || - ((dstRect.slicePitch_ % 4) != 0) || - (srcRect.rowPitch_ > pitchLimit) || - (dstRect.rowPitch_ > pitchLimit) || - (size[0] > 0x3fff) || // 14 bits limit in HW - (size[1] > 0x3fff) || // 14 bits limit in HW - (size[2] > 0x7ff)) { // 11 bits limit in HW - // Restriction with rectLinearDRMDMA packet - return false; - } - // Copy data - if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), - amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_), - amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_), - size, gpuMem(dstMemory), true)) { - LogError("copyBufferRect failed!"); - return false; - } - } } return true; }