P4 to Git Change 1968527 by gandryey@gera-win10 on 2019/07/16 14:52:45

SWDEV-195023 - [CQE OCL][Navi10][RESOLVE] corruption seen in thumbnail for mxf clip after enabling temporal denoiser in Davinci resolve app - Add a workaround for missing custom pitch in gfx10 HW. It can be disabled with GPU_IMAGE_BUFFER_WAR=0. Workaround implements double copy with an image without pitch. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#26 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#12 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#89 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#138 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#62 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#313 edit [ROCm/clr commit: 582fae6820]
2019-07-16 14:56:08 -04:00
commit c1063c0ea1
@@ -79,6 +79,10 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params
  uint allocAttempt = 0;
  // Reset the flag in case we reallocate the heap in local/remote
  flags_ &= ~HostMemoryDirectAccess;
+  
+  if (!ValidateMemory(memType)) {
+    return false;
+  }

  do {
    // Create a resource in CAL
@@ -1089,4 +1093,35 @@ void* Image::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& regi
  return mapAddress + offset;
 }

+//! Validates an image buffer allocation and applies the missing custom pitch workaround.
+//!
+//! When the workaround setting is on, the resource is an image buffer and the owner's row pitch
+//! is larger than width * element size, a native image without pitch is created as a backing
+//! store and published through CopyImageBuffer().
+//!
+//! \param memType  Memory type the resource is being created with
+//! \return false if the backing store couldn't be allocated or created, true otherwise
+bool Image::ValidateMemory(Resource::MemoryType memType) {
+  if (!dev().settings().imageBufferWar_ || (memType != ImageBuffer) || (owner() == nullptr)) {
+    return true;
+  }
+  amd::Image* image = owner()->asImage();
+  if ((image->getWidth() * image->getImageFormat().getElementSize()) >= image->getRowPitch()) {
+    // Rows are tightly packed, so no custom pitch is involved
+    return true;
+  }
+  // Create a native image without pitch as a backing store. Keep it in a local until create()
+  // succeeds, so a failed or repeated validation never leaves a stale or unusable image behind
+  pal::Image* backingStore = new pal::Image(dev(), size(), desc().width_, desc().height_,
+                                            desc().depth_, desc().format_, desc().topology_, 0);
+  if ((backingStore == nullptr) || !backingStore->create(Resource::Local)) {
+    delete backingStore;
+    return false;
+  }
+  // Release any backing store left from an earlier validation before publishing the new one
+  delete copyImageBuffer_;
+  copyImageBuffer_ = backingStore;
+  return true;
+}
+
 }  // namespace pal
@@ -62,7 +62,7 @@ class Memory : public device::Memory, public Resource {
  );

  //! Default destructor
-  ~Memory();
+  virtual ~Memory();

  //! Creates the interop memory
  bool createInterop();
@@ -156,6 +156,9 @@ class Memory : public device::Memory, public Resource {
  //! Decrement map count
  void decIndMapCount();

+  //! Pre-allocation hook for workarounds; create() fails if it returns false. Base: no-op
+  virtual bool ValidateMemory(Resource::MemoryType memType) { return true; }
+
 private:
  //! Disable copy constructor
  Memory(const Memory&);
@@ -201,7 +204,8 @@ class Image : public pal::Memory {
        cl_mem_object_type imageType,  //!< CL image type
        uint mipLevels                 //!< The number of mip levels
        )
-      : pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels) {}
+      : pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels),
+        copyImageBuffer_(nullptr) {}

  //! Image constructor
  Image(const Device& gpuDev,          //!< GPU device object
@@ -213,7 +217,10 @@ class Image : public pal::Memory {
        cl_mem_object_type imageType,  //!< CL image type
        uint mipLevels                 //!< The number of mip levels
        )
-      : pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels) {}
+      : pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels),
+        copyImageBuffer_(nullptr) {}
+
+  virtual ~Image() { delete copyImageBuffer_; }  //!< Frees the workaround backing image, if any

  //! Allocate memory for API-level maps
  virtual void* allocMapTarget(const amd::Coord3D& origin,  //!< The map location in memory
@@ -225,12 +232,19 @@ class Image : public pal::Memory {

  virtual uint64_t virtualAddress() const override { return hwSrd(); }

+  Image* CopyImageBuffer() const { return copyImageBuffer_; }  //!< Backing image or nullptr
+
+  //! Validates allocated memory for possible workarounds
+  bool ValidateMemory(Resource::MemoryType memType) final;
+
 private:
  //! Disable copy constructor
  Image(const Image&);

  //! Disable operator=
  Image& operator=(const Image&);
+
+  Image* copyImageBuffer_;
 };

 }  // namespace pal
@@ -144,6 +144,7 @@ Settings::Settings() {
  hsailExplicitXnack_ = false;
  lcWavefrontSize64_ = true;
  enableHwP2P_ = false;
+  imageBufferWar_ = false;
 }

 bool Settings::create(const Pal::DeviceProperties& palProp,
@@ -331,6 +332,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
      return false;
  }

+  if (gfx10Plus_) {
+    // GFX10 HW doesn't support custom pitch. Enable double copy workaround
+    imageBufferWar_ = GPU_IMAGE_BUFFER_WAR;
+  }
+
  splitSizeForWin7_ = false;

 #if defined(_WIN32)
@@ -62,7 +62,8 @@ class Settings : public device::Settings {
      uint rgpSqttForceDisable_ : 1;    //!< Disables SQTT
      uint splitSizeForWin7_ : 1;       //!< DMA flush split size for Win 7
      uint enableHwP2P_ : 1;            //!< Forces HW P2P path for testing
-      uint reserved_ : 10;
+      uint imageBufferWar_ : 1;         //!< Image buffer workaround for Gfx10
+      uint reserved_ : 9;
    };
    uint value_;
  };
@@ -1105,6 +1105,13 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) {
      }
    } break;
    case CL_COMMAND_READ_IMAGE:
+      if (memory->memoryType() == Resource::ImageBuffer) {
+        Image* imageBuffer = static_cast<Image*>(memory);
+        // Check if synchronization has to be performed
+        if (imageBuffer->CopyImageBuffer() != nullptr) {
+          memory = imageBuffer->CopyImageBuffer();
+        }
+      }
      if (hostMemory != nullptr) {
        // Accelerated image to buffer transfer without pinning
        amd::Coord3D dstOrigin(offset);
@@ -2398,6 +2405,25 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
    }
  }

+  // Check if image buffer write back is required
+  if (state_.imageBufferWrtBack_) {
+    // Avoid recursive write back
+    state_.imageBufferWrtBack_ = false;
+    // Make sure the original kernel execution is done
+    addBarrier(RgpSqqtBarrierReason::MemDependency);
+    for (const auto imageBuffer : wrtBackImageBuffer_) {
+      Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
+      amd::Image* image = imageBuffer->owner()->asImage();
+      amd::Coord3D offs(0);
+      // Copy memory from the backing store image into the original buffer
+      bool result = blitMgr().copyImageToBuffer(
+        *imageBuffer->CopyImageBuffer(), *buffer, offs, offs,
+        image->getRegion(), true,
+        image->getRowPitch(), image->getSlicePitch());
+    }
+    wrtBackImageBuffer_.clear();
+  }
+
  // Perform post dispatch logic for RGP traces
  if (rgpCaptureEna()) {
    dev().rgpCaptureMgr()->PostDispatch(this);
@@ -3256,6 +3282,32 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
            mem->signalWrite(&dev());
          }
          if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
+            if (gpuMem->memoryType() == Resource::ImageBuffer) {
+              Image* imageBuffer = static_cast<Image*>(gpuMem);
+              // Check if synchronization has to be performed
+              if (imageBuffer->CopyImageBuffer() != nullptr) {
+                Memory* buffer = dev().getGpuMemory(mem->parent());
+                amd::Image* image = mem->asImage();
+                amd::Coord3D offs(0);
+                // Copy memory from the original image buffer into the backing store image
+                bool result = blitMgr().copyBufferToImage(
+                  *buffer, *imageBuffer->CopyImageBuffer(), offs, offs,
+                  image->getRegion(), true, image->getRowPitch(), image->getSlicePitch());
+                // Make sure the copy operation is done
+                addBarrier(RgpSqqtBarrierReason::MemDependency);
+                // Use backing store SRD as the replacement
+                uint64_t srd = imageBuffer->CopyImageBuffer()->hwSrd();
+                WriteAqlArgAt(const_cast<address>(params), &srd, sizeof(srd), desc.offset_);
+                // Add backing store image to the list of memory handles
+                addVmMemory(imageBuffer->CopyImageBuffer());
+                // If it's not a read only resource, then runtime has to write back
+                if (!info.readOnly_) {
+                  wrtBackImageBuffer_.push_back(imageBuffer);
+                  state_.imageBufferWrtBack_ = true;
+                }
+              }
+            }
+
            //! \note Special case for the image views.
            //! Copy SRD to CB1, so blit manager will be able to release
            //! this view without a wait for SRD resource.
@@ -205,6 +205,7 @@ class VirtualGPU : public device::VirtualDevice {
      uint profileEnabled_ : 1;      //!< Profiling is enabled for WaveLimiter
      uint perfCounterEnabled_ : 1;  //!< PerfCounter is enabled
      uint rgpCaptureEnabled_ : 1;   //!< RGP capture is enabled in the runtime
+      uint imageBufferWrtBack_: 1;   //!< Enable image buffer write back
    };
    uint value_;
    State() : value_(0) {}
@@ -643,6 +644,7 @@ class VirtualGPU : public device::VirtualDevice {
  Pal::ICmdAllocator* cmdAllocator_;  //!< Command buffer allocator
  Queue* queues_[AllEngines];         //!< HW queues for all engines
  MemoryRange sdmaRange_;             //!< SDMA memory range for write access
+  std::vector<Image*> wrtBackImageBuffer_;  //!< Array of images for write back 
 };

 inline void VirtualGPU::addVmMemory(const Memory* memory) {
@@ -177,6 +177,8 @@ release(bool, GPU_ENABLE_COOP_GROUPS, false,                                  \
         "Enables cooperative group launch")                                  \
 release(uint, GPU_MAX_COMMAND_BUFFERS, 8,                                     \
         "The maximum number of command buffers allocated per queue")         \
+release(bool, GPU_IMAGE_BUFFER_WAR, true,                                     \
+        "Enables image buffer workaround")                                    \
 release(cstring, HIP_VISIBLE_DEVICES, "",                                     \
        "Only devices whose index is present in the sequence are visible to HIP")  \
 release(cstring, CUDA_VISIBLE_DEVICES, "",                                    \