diff --git a/rocclr/runtime/device/gpu/gpublit.cpp b/rocclr/runtime/device/gpu/gpublit.cpp index 4d638e5e31..32e1c9e5a7 100644 --- a/rocclr/runtime/device/gpu/gpublit.cpp +++ b/rocclr/runtime/device/gpu/gpublit.cpp @@ -613,24 +613,36 @@ DmaBlitManager::copyBufferRect( size_t srcOffset; size_t dstOffset; + uint bytesPerElement = 16; + bool optimalElementSize = false; bool subWindowRectCopy = dev().settings().rectLinearDMA_; - // 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits) - size_t pitchLimit = dev().settings().ciPlus_ ? 0xFFFF : 0x7FFFF; srcOffset = srcRect.offset(0, 0, 0); dstOffset = dstRect.offset(0, 0, 0); + while (bytesPerElement >= 1) { + if (((srcOffset % 4) == 0) && + ((dstOffset % 4) == 0) && + ((size[0] % bytesPerElement) == 0) && + ((srcRect.rowPitch_ % bytesPerElement) == 0) && + ((srcRect.slicePitch_ % bytesPerElement) == 0) && + ((dstRect.rowPitch_ % bytesPerElement) == 0) && + ((dstRect.slicePitch_ % bytesPerElement) == 0)) { + optimalElementSize = true; + break; + } + bytesPerElement = bytesPerElement >> 1; + } + + // 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits) + size_t pitchLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x7FFFF; + size_t sizeLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x3FFF; + if (subWindowRectCopy && - (((srcOffset % 4) != 0) || - ((dstOffset % 4) != 0) || - ((size[0] % 4) != 0) || - ((srcRect.rowPitch_ % 4) != 0) || - ((srcRect.slicePitch_ % 4) != 0) || - ((dstRect.rowPitch_ % 4) != 0) || - ((dstRect.slicePitch_ % 4) != 0) || + (!optimalElementSize || (srcRect.rowPitch_ > pitchLimit) || (dstRect.rowPitch_ > pitchLimit) || - (size[0] > 0x3fff) || // 14 bits limit in HW + (size[0] > sizeLimit) || // See above (size[1] > 0x3fff) || // 14 bits limit in HW (size[2] > 0x7ff))) { // 11 bits limit in HW // Restriction with rectLinearDRMDMA packet @@ -642,7 +654,7 @@ DmaBlitManager::copyBufferRect( if (!gpuMem(srcMemory).partialMemCopyTo(gpu(), amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_), amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_), - size, gpuMem(dstMemory), true)) { + size, gpuMem(dstMemory), true, false, bytesPerElement)) { LogError("copyBufferRect failed!"); return false; } diff --git a/rocclr/runtime/device/gpu/gpuresource.cpp b/rocclr/runtime/device/gpu/gpuresource.cpp index 9ae11fba57..7035246240 100644 --- a/rocclr/runtime/device/gpu/gpuresource.cpp +++ b/rocclr/runtime/device/gpu/gpuresource.cpp @@ -1158,7 +1158,8 @@ Resource::partialMemCopyTo( const amd::Coord3D& size, Resource& dstResource, bool enableCopyRect, - bool flushDMA) const + bool flushDMA, + uint bytesPerElement) const { GpuEvent event; bool result; @@ -1198,7 +1199,7 @@ Resource::partialMemCopyTo( result = gpu.copyPartial(event, gslResource(), calSrcOrigin, dstResource.gslResource(), calDstOrigin, - calSize, static_cast(syncFlags), enableCopyRect); + calSize, static_cast(syncFlags), enableCopyRect, bytesPerElement); if (result) { // Mark source and destination as busy diff --git a/rocclr/runtime/device/gpu/gpuresource.hpp b/rocclr/runtime/device/gpu/gpuresource.hpp index c7dcc635cf..0c891c2dde 100644 --- a/rocclr/runtime/device/gpu/gpuresource.hpp +++ b/rocclr/runtime/device/gpu/gpuresource.hpp @@ -232,7 +232,8 @@ public: const amd::Coord3D& size, //!< Size of the region to copy Resource& dstResource, //!< Destination resource bool enableRectCopy = false, //!< Rectangular DMA support - bool flushDMA = false //!< Flush DMA if requested + bool flushDMA = false, //!< Flush DMA if requested + uint bytesPerElement = 1 //!< Bytes Per Element ) const; /*! \brief Copies size/4 DWORD of memory to a surface diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp index 6c657344e0..dfd1f7dc84 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp @@ -513,7 +513,8 @@ CALGSLContext::copyPartial(GpuEvent& event, size_t* destOffset, size_t* size, CALmemcopyflags flags, - bool enableRectCopy) + bool enableRectCopy, + uint32 bytesPerElement) { uint64 surfaceSize; uint32 mode = GSL_SYNCUPLOAD_IGNORE_ELEMENTSIZE; @@ -563,7 +564,7 @@ CALGSLContext::copyPartial(GpuEvent& event, } m_cs->syncUploadRawRect(srcMem, srcOffset[0], (uint32)srcOffset[1], (uint32)srcOffset[2], destMem, destOffset[0], (uint32)destOffset[1], (uint32)destOffset[2], - size[0], (uint32)size[1], (uint32)size[2], mode); + size[0], (uint32)size[1], (uint32)size[2], mode, bytesPerElement); } else { diff --git a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h index cd02a46bc3..ecd04024d9 100644 --- a/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h +++ b/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h @@ -71,7 +71,7 @@ public: void destroyProgramObject(gslProgramObject func); bool copyPartial(GpuEvent& event, gslMemObject srcMem, size_t* srcOffset, - gslMemObject destMem, size_t* destOffset, size_t* size, CALmemcopyflags flags, bool enableCopyRect); + gslMemObject destMem, size_t* destOffset, size_t* size, CALmemcopyflags flags, bool enableCopyRect, uint32 bytesPerElement); void setSamplerParameter(uint32 sampler, gslTexParameterPname param, CALvoid* vals);