P4 to Git Change 1069927 by skudchad@skudchad_test_win_opencl2 on 2014/08/25 14:51:55

ECR #304775 - Optimization for rectangular copies (Part 2). Due to the HW restriction of 14 bits for the src and dst pitch, it is advantageous to choose an optimal bpp: the higher the bpp, the larger the byte pitch that can be expressed. This indirectly helps to reduce the number of packets for a buffer copy (line by line vs. a single sub_win raw packet). ReviewBoardURL = http://ocltc.amd.com/reviews/r/5605/diff/ Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpublit.cpp#109 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#191 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.hpp#76 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#64 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#38 edit [ROCm/clr commit: 5efe63df44]
2014-08-25 15:09:01 -04:00
@@ -613,24 +613,36 @@ DmaBlitManager::copyBufferRect(
        size_t  srcOffset;
        size_t  dstOffset;

+        uint bytesPerElement = 16;
+        bool optimalElementSize = false;
        bool subWindowRectCopy = dev().settings().rectLinearDMA_;
-        // 19 bit limit in HW in SI and 16 bit limit in CI+(we adjust the ElementSize to 4bytes but the packet still has 14bits)
-        size_t pitchLimit = dev().settings().ciPlus_ ? 0xFFFF : 0x7FFFF;

        srcOffset   = srcRect.offset(0, 0, 0);
        dstOffset   = dstRect.offset(0, 0, 0);

+        while (bytesPerElement >= 1) {
+            if (((srcOffset % 4) == 0) &&
+                ((dstOffset % 4) == 0) &&
+                ((size[0] % bytesPerElement) == 0) &&
+                ((srcRect.rowPitch_ % bytesPerElement) == 0) &&
+                ((srcRect.slicePitch_ % bytesPerElement) == 0) &&
+                ((dstRect.rowPitch_ % bytesPerElement) == 0) &&
+                ((dstRect.slicePitch_ % bytesPerElement) == 0)) {
+                    optimalElementSize = true;
+                    break;
+            }
+            bytesPerElement = bytesPerElement >> 1;
+        }
+
+        // 19 bit limit in HW in SI. In CI+ the packet still has only 14 bits, so the byte limit scales with the chosen ElementSize (bytesPerElement)
+        size_t pitchLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x7FFFF;
+        size_t sizeLimit = dev().settings().ciPlus_ ? (0x3FFF * bytesPerElement) | 0xF : 0x3FFF;
+
        if (subWindowRectCopy &&
-            (((srcOffset % 4) != 0) ||
-            ((dstOffset % 4) != 0) ||
-            ((size[0] % 4) != 0) ||
-            ((srcRect.rowPitch_ % 4) != 0) ||
-            ((srcRect.slicePitch_ % 4) != 0) ||
-            ((dstRect.rowPitch_ % 4) != 0) ||
-            ((dstRect.slicePitch_ % 4) != 0) ||
+            (!optimalElementSize ||
            (srcRect.rowPitch_ > pitchLimit) ||
            (dstRect.rowPitch_ > pitchLimit) ||
-            (size[0] > 0x3fff) ||   // 14 bits limit in HW
+            (size[0] > sizeLimit) ||    // See above
            (size[1] > 0x3fff) ||   // 14 bits limit in HW
            (size[2] > 0x7ff))) {    // 11 bits limit in HW
            // Restriction with rectLinearDRMDMA packet
@@ -642,7 +654,7 @@ DmaBlitManager::copyBufferRect(
            if (!gpuMem(srcMemory).partialMemCopyTo(gpu(),
                amd::Coord3D(srcOffset, srcRect.rowPitch_, srcRect.slicePitch_),
                amd::Coord3D(dstOffset, dstRect.rowPitch_, dstRect.slicePitch_),
-                size, gpuMem(dstMemory), true)) {
+                size, gpuMem(dstMemory), true, false, bytesPerElement)) {
                LogError("copyBufferRect failed!");
                return false;
            }
@@ -1158,7 +1158,8 @@ Resource::partialMemCopyTo(
    const amd::Coord3D& size,
    Resource& dstResource,
    bool enableCopyRect,
-    bool flushDMA) const
+    bool flushDMA,
+    uint bytesPerElement) const
 {
    GpuEvent    event;
    bool        result;
@@ -1198,7 +1199,7 @@ Resource::partialMemCopyTo(
    result = gpu.copyPartial(event,
        gslResource(), calSrcOrigin,
        dstResource.gslResource(), calDstOrigin,
-        calSize, static_cast<CALmemcopyflags>(syncFlags), enableCopyRect);
+        calSize, static_cast<CALmemcopyflags>(syncFlags), enableCopyRect, bytesPerElement);

    if (result) {
        // Mark source and destination as busy
@@ -232,7 +232,8 @@ public:
        const amd::Coord3D& size,       //!< Size of the region to copy
        Resource& dstResource,          //!< Destination resource
        bool enableRectCopy = false,    //!< Rectangular DMA support
-        bool flushDMA = false           //!< Flush DMA if requested
+        bool flushDMA = false,          //!< Flush DMA if requested
+        uint bytesPerElement = 1        //!< Bytes Per Element
        ) const;

    /*! \brief Copies size/4 DWORD of memory to a surface
@@ -513,7 +513,8 @@ CALGSLContext::copyPartial(GpuEvent&      event,
                           size_t*          destOffset,
                           size_t*          size,
                           CALmemcopyflags  flags,
-                           bool             enableRectCopy)
+                           bool             enableRectCopy,
+                           uint32           bytesPerElement)
 {
    uint64      surfaceSize;
    uint32      mode = GSL_SYNCUPLOAD_IGNORE_ELEMENTSIZE;
@@ -563,7 +564,7 @@ CALGSLContext::copyPartial(GpuEvent&      event,
            }
            m_cs->syncUploadRawRect(srcMem, srcOffset[0], (uint32)srcOffset[1], (uint32)srcOffset[2],
                                    destMem, destOffset[0], (uint32)destOffset[1], (uint32)destOffset[2],
-                                    size[0], (uint32)size[1], (uint32)size[2], mode);
+                                    size[0], (uint32)size[1], (uint32)size[2], mode, bytesPerElement);
        }
        else
        {
@@ -71,7 +71,7 @@ public:
    void             destroyProgramObject(gslProgramObject func);

    bool             copyPartial(GpuEvent& event, gslMemObject srcMem, size_t* srcOffset,
-                        gslMemObject destMem, size_t* destOffset, size_t* size, CALmemcopyflags flags, bool enableCopyRect);
+                        gslMemObject destMem, size_t* destOffset, size_t* size, CALmemcopyflags flags, bool enableCopyRect, uint32 bytesPerElement);

    void             setSamplerParameter(uint32 sampler, gslTexParameterPname param, CALvoid* vals);