From 5efe63df44eda095f5d10cf075226bf7a450d3cc Mon Sep 17 00:00:00 2001 From: foreman Date: Mon, 25 Aug 2014 15:09:01 -0400 Subject: [PATCH] P4 to Git Change 1069927 by skudchad@skudchad_test_win_opencl2 on 2014/08/25 14:51:55 ECR #304775 - Optimization for rectangular copies (Part 2). Due to the HW restriction of 14 bits for src and dst pitch, it's advantageous to choose the optimal bpp. The higher the bpp, the larger the byte pitch. This indirectly helps to reduce the number of packets for a buffer copy (line by line vs a single sub_win raw packet). ReviewBoardURL = http://ocltc.amd.com/reviews/r/5605/diff/ Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#109 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#191 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.hpp#76 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#64 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#38 edit --- rocclr/runtime/device/gpu/gpublit.cpp | 34 +++++++++++++------ rocclr/runtime/device/gpu/gpuresource.cpp | 5 +-- rocclr/runtime/device/gpu/gpuresource.hpp | 3 +- .../device/gpu/gslbe/src/rt/GSLContext.cpp | 5 +-- .../device/gpu/gslbe/src/rt/GSLContext.h | 2 +- 5 files changed, 32 insertions(+), 17 deletions(-) diff --git a/rocclr/runtime/device/gpu/gpublit.cpp b/rocclr/runtime/device/gpu/gpublit.cpp index 4d638e5e31..32e1c9e5a7 100644 --- a/rocclr/runtime/device/gpu/gpublit.cpp +++ b/rocclr/runtime/device/gpu/gpublit.cpp @@ -613,24 +613,36 @@ DmaBlitManager::copyBufferRect( size_t srcOffset; size_t dstOffset; + uint bytesPerElement = 16; + bool optimalElementSize = false; bool subWindowRectCopy = dev().settings().rectLinearDMA_; - // 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits) - size_t pitchLimit = dev().settings().ciPlus_ ? 
0xFFFF : 0x7FFFF; srcOffset = srcRect.offset(0, 0, 0); dstOffset = dstRect.offset(0, 0, 0); + while (bytesPerElement >= 1) { + if (((srcOffset % 4) == 0) && + ((dstOffset % 4) == 0) && + ((size[0] % bytesPerElement) == 0) && + ((srcRect.rowPitch_ % bytesPerElement) == 0) && + ((srcRect.slicePitch_ % bytesPerElement) == 0) && + ((dstRect.rowPitch_ % bytesPerElement) == 0) && + ((dstRect.slicePitch_ % bytesPerElement) == 0)) { + optimalElementSize = true; + break; + } + bytesPerElement = bytesPerElement >> 1; + } + + // 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits) + size_t pitchLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x7FFFF; + size_t sizeLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x3FFF; + if (subWindowRectCopy && - (((srcOffset % 4) != 0) || - ((dstOffset % 4) != 0) || - ((size[0] % 4) != 0) || - ((srcRect.rowPitch_ % 4) != 0) || - ((srcRect.slicePitch_ % 4) != 0) || - ((dstRect.rowPitch_ % 4) != 0) || - ((dstRect.slicePitch_ % 4) != 0) || + (!optimalElementSize || (srcRect.rowPitch_ > pitchLimit) || (dstRect.rowPitch_ > pitchLimit) || - (size[0] > 0x3fff) || // 14 bits limit in HW + (size[0] > sizeLimit) || // See above (size[1] > 0x3fff) || // 14 bits limit in HW (size[2] > 0x7ff))) { // 11 bits limit in HW // Restriction with rectLinearDRMDMA packet @@ -642,7 +654,7 @@ DmaBlitManager::copyBufferRect( if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_), amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_), - size, gpuMem(dstMemory), true)) { + size, gpuMem(dstMemory), true, false, bytesPerElement)) { LogError("copyBufferRect failed!"); return false; } diff --git a/rocclr/runtime/device/gpu/gpuresource.cpp b/rocclr/runtime/device/gpu/gpuresource.cpp index 9ae11fba57..7035246240 100644 --- a/rocclr/runtime/device/gpu/gpuresource.cpp +++ 
b/rocclr/runtime/device/gpu/gpuresource.cpp @@ -1158,7 +1158,8 @@ Resource::partialMemCopyTo( const amd::Coord3D& size, Resource& dstResource, bool enableCopyRect, - bool flushDMA) const + bool flushDMA, + uint bytesPerElement) const { GpuEvent event; bool result; @@ -1198,7 +1199,7 @@ Resource::partialMemCopyTo( result = gpu.copyPartial(event, gslResource(), calSrcOrigin, dstResource.gslResource(), calDstOrigin, - calSize, static_cast(syncFlags), enableCopyRect); + calSize, static_cast(syncFlags), enableCopyRect, bytesPerElement); if (result) { // Mark source and destination as busy diff --git a/rocclr/runtime/device/gpu/gpuresource.hpp b/rocclr/runtime/device/gpu/gpuresource.hpp index c7dcc635cf..0c891c2dde 100644 --- a/rocclr/runtime/device/gpu/gpuresource.hpp +++ b/rocclr/runtime/device/gpu/gpuresource.hpp @@ -232,7 +232,8 @@ public: const amd::Coord3D& size, //!< Size of the region to copy Resource& dstResource, //!< Destination resource bool enableRectCopy = false, //!< Rectangular DMA support - bool flushDMA = false //!< Flush DMA if requested + bool flushDMA = false, //!< Flush DMA if requested + uint bytesPerElement = 1 //!< Bytes Per Element ) const; /*! 
\brief Copies size/4 DWORD of memory to a surface diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp index 6c657344e0..dfd1f7dc84 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp @@ -513,7 +513,8 @@ CALGSLContext::copyPartial(GpuEvent& event, size_t* destOffset, size_t* size, CALmemcopyflags flags, - bool enableRectCopy) + bool enableRectCopy, + uint32 bytesPerElement) { uint64 surfaceSize; uint32 mode = GSL_SYNCUPLOAD_IGNORE_ELEMENTSIZE; @@ -563,7 +564,7 @@ CALGSLContext::copyPartial(GpuEvent& event, } m_cs->syncUploadRawRect(srcMem, srcOffset[0], (uint32)srcOffset[1], (uint32)srcOffset[2], destMem, destOffset[0], (uint32)destOffset[1], (uint32)destOffset[2], - size[0], (uint32)size[1], (uint32)size[2], mode); + size[0], (uint32)size[1], (uint32)size[2], mode, bytesPerElement); } else { diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h index cd02a46bc3..ecd04024d9 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h @@ -71,7 +71,7 @@ public: void destroyProgramObject(gslProgramObject func); bool copyPartial(GpuEvent& event, gslMemObject srcMem, size_t* srcOffset, - gslMemObject destMem, size_t* destOffset, size_t* size, CALmemcopyflags flags, bool enableCopyRect); + gslMemObject destMem, size_t* destOffset, size_t* size, CALmemcopyflags flags, bool enableCopyRect, uint32 bytesPerElement); void setSamplerParameter(uint32 sampler, gslTexParameterPname param, CALvoid* vals);