From c1063c0ea1d7388a2d778073d13c7a899c1c6d89 Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 16 Jul 2019 14:56:08 -0400
Subject: [PATCH] P4 to Git Change 1968527 by gandryey@gera-win10 on 2019/07/16
14:52:45
SWDEV-195023 - [CQE OCL][Navi10][RESOLVE] corruption seen in thumbnail for mxf clip after enabling temporal denoiser in Davinci resolve app
- Add a workaround for the missing custom pitch support in gfx10 HW. It can be disabled with GPU_IMAGE_BUFFER_WAR=0. The workaround implements a double copy through an image without pitch.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#26 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#89 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#138 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#62 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#313 edit
[ROCm/clr commit: 582fae6820ca856aa020361e2ae5a561f8d8e4b2]
---
.../rocclr/runtime/device/pal/palmemory.cpp | 18 +++++++
.../rocclr/runtime/device/pal/palmemory.hpp | 20 +++++--
.../rocclr/runtime/device/pal/palsettings.cpp | 6 +++
.../rocclr/runtime/device/pal/palsettings.hpp | 3 +-
.../rocclr/runtime/device/pal/palvirtual.cpp | 52 +++++++++++++++++++
.../rocclr/runtime/device/pal/palvirtual.hpp | 2 +
projects/clr/rocclr/runtime/utils/flags.hpp | 2 +
7 files changed, 99 insertions(+), 4 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp
index c75c2e17bc..f6ca8bab8e 100644
--- a/projects/clr/rocclr/runtime/device/pal/palmemory.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palmemory.cpp
@@ -79,6 +79,10 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params
uint allocAttempt = 0;
// Reset the flag in case we reallocate the heap in local/remote
flags_ &= ~HostMemoryDirectAccess;
+
+ if (!ValidateMemory(memType)) {
+ return false;
+ }
do {
// Create a resource in CAL
@@ -1089,4 +1093,18 @@ void* Image::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& regi
return mapAddress + offset;
}
+//! Prepares the pitch-less backing store image used by the image buffer workaround.
+//!
+//! The workaround applies only when it is enabled in the device settings, the resource
+//! is an image buffer with an owner, and the owner's row pitch is larger than
+//! width * element size (i.e. the image uses a custom pitch).
+//!
+//! \param memType Type of the resource that is about to be created
+//! \return false if the backing store image couldn't be allocated or created,
+//!         true otherwise (including when the workaround doesn't apply)
+bool Image::ValidateMemory(Resource::MemoryType memType) {
+  if (!dev().settings().imageBufferWar_ || (memType != ImageBuffer) || (owner() == nullptr)) {
+    return true;
+  }
+
+  auto* image = owner()->asImage();
+  if ((image->getWidth() * image->getImageFormat().getElementSize()) >= image->getRowPitch()) {
+    // No custom pitch, hence no backing store is necessary
+    return true;
+  }
+
+  if (copyImageBuffer_ != nullptr) {
+    // If validation runs again for the same object, then reuse the existing backing
+    // store instead of overwriting (and leaking) the pointer
+    return true;
+  }
+
+  // Create a native image without pitch as a backing store
+  copyImageBuffer_ = new pal::Image(dev(), size(), desc().width_, desc().height_, desc().depth_,
+                                    desc().format_, desc().topology_, 0);
+  if ((copyImageBuffer_ == nullptr) || !copyImageBuffer_->create(Resource::Local)) {
+    // Don't keep a backing store that was never created, so CopyImageBuffer()
+    // can't hand out an unusable image
+    delete copyImageBuffer_;
+    copyImageBuffer_ = nullptr;
+    return false;
+  }
+  return true;
+}
+
} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
index 2ce3062cce..54c74f20ab 100644
--- a/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palmemory.hpp
@@ -62,7 +62,7 @@ class Memory : public device::Memory, public Resource {
);
//! Default destructor
- ~Memory();
+ virtual ~Memory();
//! Creates the interop memory
bool createInterop();
@@ -156,6 +156,9 @@ class Memory : public device::Memory, public Resource {
//! Decrement map count
void decIndMapCount();
+ //! Validates allocated memory for possible workarounds
+ virtual bool ValidateMemory(Resource::MemoryType memType) { return true; }
+
private:
//! Disable copy constructor
Memory(const Memory&);
@@ -201,7 +204,8 @@ class Image : public pal::Memory {
cl_mem_object_type imageType, //!< CL image type
uint mipLevels //!< The number of mip levels
)
- : pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels) {}
+ : pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels),
+ copyImageBuffer_(nullptr) {}
//! Image constructor
Image(const Device& gpuDev, //!< GPU device object
@@ -213,7 +217,10 @@ class Image : public pal::Memory {
cl_mem_object_type imageType, //!< CL image type
uint mipLevels //!< The number of mip levels
)
- : pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels) {}
+ : pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels),
+ copyImageBuffer_(nullptr) {}
+
+ virtual ~Image() { delete copyImageBuffer_; }  //!< Releases the backing store image (a null pointer is fine for delete)
//! Allocate memory for API-level maps
virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory
@@ -225,12 +232,19 @@ class Image : public pal::Memory {
virtual uint64_t virtualAddress() const override { return hwSrd(); }
+ Image* CopyImageBuffer() const { return copyImageBuffer_; }  //!< Backing store image created by ValidateMemory(), nullptr if none exists
+
+ //! Validates allocated memory for possible workarounds
+ bool ValidateMemory(Resource::MemoryType memType) final;
+
private:
//! Disable copy constructor
Image(const Image&);
//! Disable operator=
Image& operator=(const Image&);
+
+ Image* copyImageBuffer_;  //!< Image without pitch used as a backing store (deleted in the destructor), nullptr if unused
};
} // namespace pal
diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp
index 9b1e656e3d..893d0d1143 100644
--- a/projects/clr/rocclr/runtime/device/pal/palsettings.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palsettings.cpp
@@ -144,6 +144,7 @@ Settings::Settings() {
hsailExplicitXnack_ = false;
lcWavefrontSize64_ = true;
enableHwP2P_ = false;
+ imageBufferWar_ = false;
}
bool Settings::create(const Pal::DeviceProperties& palProp,
@@ -331,6 +332,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
return false;
}
+ if (gfx10Plus_) {
+ // GFX10 HW doesn't support custom pitch. Enable double copy workaround
+ imageBufferWar_ = GPU_IMAGE_BUFFER_WAR;
+ }
+
splitSizeForWin7_ = false;
#if defined(_WIN32)
diff --git a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp
index 716b45f307..fc6f63d416 100644
--- a/projects/clr/rocclr/runtime/device/pal/palsettings.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palsettings.hpp
@@ -62,7 +62,8 @@ class Settings : public device::Settings {
uint rgpSqttForceDisable_ : 1; //!< Disables SQTT
uint splitSizeForWin7_ : 1; //!< DMA flush split size for Win 7
uint enableHwP2P_ : 1; //!< Forces HW P2P path for testing
- uint reserved_ : 10;
+ uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
+ uint reserved_ : 9;
};
uint value_;
};
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
index 38968f29ea..ab6421822a 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.cpp
@@ -1105,6 +1105,13 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) {
}
} break;
case CL_COMMAND_READ_IMAGE:
+ if (memory->memoryType() == Resource::ImageBuffer) {
+   Image* imageBuffer = static_cast<Image*>(memory);
+   // If the image buffer workaround created a backing store image for this
+   // resource, then the read is served from that image instead.
+   // NOTE(review): the backing store isn't refreshed from the parent buffer here;
+   // confirm it can't be stale when the read is submitted
+   if (imageBuffer->CopyImageBuffer() != nullptr) {
+     memory = imageBuffer->CopyImageBuffer();
+   }
+ }
if (hostMemory != nullptr) {
// Accelerated image to buffer transfer without pinning
amd::Coord3D dstOrigin(offset);
@@ -2398,6 +2405,25 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
}
+ // Check if image buffer write back is required
+ if (state_.imageBufferWrtBack_) {
+ // Avoid recursive write back
+ state_.imageBufferWrtBack_ = false;
+ // Make sure the original kernel execution is done
+ addBarrier(RgpSqqtBarrierReason::MemDependency);
+ for (const auto imageBuffer : wrtBackImageBuffer_) {
+ Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
+ amd::Image* image = imageBuffer->owner()->asImage();
+ amd::Coord3D offs(0);
+ // Copy memory from the backing store image into the original buffer
+ bool result = blitMgr().copyImageToBuffer(
+ *imageBuffer->CopyImageBuffer(), *buffer, offs, offs,
+ image->getRegion(), true,
+ image->getRowPitch(), image->getSlicePitch());  // NOTE(review): result is never checked
+ }
+ wrtBackImageBuffer_.clear();
+ }
+
// Perform post dispatch logic for RGP traces
if (rgpCaptureEna()) {
dev().rgpCaptureMgr()->PostDispatch(this);
@@ -3256,6 +3282,32 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
mem->signalWrite(&dev());
}
if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
+ if (gpuMem->memoryType() == Resource::ImageBuffer) {
+   Image* imageBuffer = static_cast<Image*>(gpuMem);
+   // A non-null backing store means the image buffer workaround is active for
+   // this image: the kernel has to work on the copy without pitch
+   if (imageBuffer->CopyImageBuffer() != nullptr) {
+     Memory* buffer = dev().getGpuMemory(mem->parent());
+     amd::Image* image = mem->asImage();
+     amd::Coord3D offs(0);
+     // Copy memory from the original image buffer into the backing store image
+     // NOTE(review): the copy result is not checked, so a failed copy goes unnoticed
+     bool result = blitMgr().copyBufferToImage(
+         *buffer, *imageBuffer->CopyImageBuffer(), offs, offs,
+         image->getRegion(), true, image->getRowPitch(), image->getSlicePitch());
+     // Make sure the copy operation is done
+     addBarrier(RgpSqqtBarrierReason::MemDependency);
+     // Use backing store SRD as the replacement
+     uint64_t srd = imageBuffer->CopyImageBuffer()->hwSrd();
+     WriteAqlArgAt(const_cast<address>(params), &srd, sizeof(srd), desc.offset_);
+     // Add backing store image to the list of memory handles
+     addVmMemory(imageBuffer->CopyImageBuffer());
+     // If it's not a read only resource, then runtime has to write back
+     if (!info.readOnly_) {
+       wrtBackImageBuffer_.push_back(imageBuffer);
+       state_.imageBufferWrtBack_ = true;
+     }
+   }
+ }
+
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
diff --git a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
index 61242af0f2..71e7cf2e73 100644
--- a/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palvirtual.hpp
@@ -205,6 +205,7 @@ class VirtualGPU : public device::VirtualDevice {
uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter
uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled
uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime
+ uint imageBufferWrtBack_: 1; //!< Enable image buffer write back
};
uint value_;
State() : value_(0) {}
@@ -643,6 +644,7 @@ class VirtualGPU : public device::VirtualDevice {
Pal::ICmdAllocator* cmdAllocator_; //!< Command buffer allocator
Queue* queues_[AllEngines]; //!< HW queues for all engines
MemoryRange sdmaRange_; //!< SDMA memory range for write access
+ std::vector<Image*> wrtBackImageBuffer_; //!< Image buffers whose backing store must be copied back after the dispatch
};
inline void VirtualGPU::addVmMemory(const Memory* memory) {
diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp
index fcb1e10314..85e815ca95 100644
--- a/projects/clr/rocclr/runtime/utils/flags.hpp
+++ b/projects/clr/rocclr/runtime/utils/flags.hpp
@@ -177,6 +177,8 @@ release(bool, GPU_ENABLE_COOP_GROUPS, false, \
"Enables cooperative group launch") \
release(uint, GPU_MAX_COMMAND_BUFFERS, 8, \
"The maximum number of command buffers allocated per queue") \
+release(bool, GPU_IMAGE_BUFFER_WAR, true, \
+ "Enables image buffer workaround") \
release(cstring, HIP_VISIBLE_DEVICES, "", \
"Only devices whose index is present in the sequence are visible to HIP") \
release(cstring, CUDA_VISIBLE_DEVICES, "", \