P4 to Git Change 1968527 by gandryey@gera-win10 on 2019/07/16 14:52:45

SWDEV-195023 - [CQE OCL][Navi10][RESOLVE] corruption seen in thumbnail for mxf clip after enabling temporal denoiser in Davinci resolve app
	- Add a workaround for the missing custom pitch support in gfx10 HW. It can be disabled with GPU_IMAGE_BUFFER_WAR=0. The workaround implements a double copy through an image without pitch.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#26 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#12 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#89 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#138 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#62 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#313 edit


[ROCm/clr commit: 582fae6820]
This commit is contained in:
foreman
2019-07-16 14:56:08 -04:00
rodzic 4d73993230
commit c1063c0ea1
7 zmienionych plików z 99 dodań i 4 usunięć
@@ -79,6 +79,10 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params
uint allocAttempt = 0;
// Reset the flag in case we reallocate the heap in local/remote
flags_ &= ~HostMemoryDirectAccess;
if (!ValidateMemory(memType)) {
return false;
}
do {
// Create a resource in CAL
@@ -1089,4 +1093,18 @@ void* Image::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& regi
return mapAddress + offset;
}
//! Validates the requested allocation and sets up the image-buffer workaround if it is needed.
//!
//! The workaround applies when all of the following hold: the imageBufferWar_ setting is on,
//! the requested memory type is ImageBuffer, the resource has an owner, and the owner's row
//! pitch is larger than (width * element size), i.e. the rows are padded. In that case a
//! native image without pitch is created in local memory and kept in copyImageBuffer_ as a
//! backing store.
//!
//! \param memType  Memory type requested for this resource
//! \return false if the backing store image could not be created, true otherwise
bool Image::ValidateMemory(Resource::MemoryType memType) {
  // Nothing to validate unless the workaround is enabled for an owned image buffer
  if (!dev().settings().imageBufferWar_ || (memType != ImageBuffer) || (owner() == nullptr)) {
    return true;
  }

  auto image = owner()->asImage();
  // NOTE(review): an ImageBuffer owner is expected to always be an image; the null check is
  // purely defensive - confirm against callers.
  if (image == nullptr) {
    return true;
  }

  // Tightly packed rows don't need the workaround
  if ((image->getWidth() * image->getImageFormat().getElementSize()) >= image->getRowPitch()) {
    return true;
  }

  // Reuse the backing store from an earlier validation instead of overwriting the pointer,
  // which would leak the previous image
  if (copyImageBuffer_ != nullptr) {
    return true;
  }

  // Create a native image without pitch as a backing store (the last argument is mipLevels)
  pal::Image* backingStore =
      new pal::Image(dev(), size(), desc().width_, desc().height_, desc().depth_,
                     desc().format_, desc().topology_, 0);
  if (backingStore == nullptr) {
    return false;
  }
  if (!backingStore->create(Resource::Local)) {
    // Don't publish a backing store that failed to allocate, so CopyImageBuffer() never
    // returns an unusable image
    delete backingStore;
    return false;
  }

  copyImageBuffer_ = backingStore;
  return true;
}
} // namespace pal
@@ -62,7 +62,7 @@ class Memory : public device::Memory, public Resource {
);
//! Default destructor
~Memory();
virtual ~Memory();
//! Creates the interop memory
bool createInterop();
@@ -156,6 +156,9 @@ class Memory : public device::Memory, public Resource {
//! Decrement map count
void decIndMapCount();
//! Validates allocated memory for possible workarounds
virtual bool ValidateMemory(Resource::MemoryType memType) { return true; }
private:
//! Disable copy constructor
Memory(const Memory&);
@@ -201,7 +204,8 @@ class Image : public pal::Memory {
cl_mem_object_type imageType, //!< CL image type
uint mipLevels //!< The number of mip levels
)
: pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels) {}
: pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels),
copyImageBuffer_(nullptr) {}
//! Image constructor
Image(const Device& gpuDev, //!< GPU device object
@@ -213,7 +217,10 @@ class Image : public pal::Memory {
cl_mem_object_type imageType, //!< CL image type
uint mipLevels //!< The number of mip levels
)
: pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels) {}
: pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels),
copyImageBuffer_(nullptr) {}
virtual ~Image() { delete copyImageBuffer_; }
//! Allocate memory for API-level maps
virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory
@@ -225,12 +232,19 @@ class Image : public pal::Memory {
virtual uint64_t virtualAddress() const override { return hwSrd(); }
Image* CopyImageBuffer() const { return copyImageBuffer_; }
//! Validates allocated memory for possible workarounds
bool ValidateMemory(Resource::MemoryType memType) final;
private:
//! Disable copy constructor
Image(const Image&);
//! Disable operator=
Image& operator=(const Image&);
Image* copyImageBuffer_;
};
} // namespace pal
@@ -144,6 +144,7 @@ Settings::Settings() {
hsailExplicitXnack_ = false;
lcWavefrontSize64_ = true;
enableHwP2P_ = false;
imageBufferWar_ = false;
}
bool Settings::create(const Pal::DeviceProperties& palProp,
@@ -331,6 +332,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
return false;
}
if (gfx10Plus_) {
// GFX10 HW doesn't support custom pitch. Enable double copy workaround
imageBufferWar_ = GPU_IMAGE_BUFFER_WAR;
}
splitSizeForWin7_ = false;
#if defined(_WIN32)
@@ -62,7 +62,8 @@ class Settings : public device::Settings {
uint rgpSqttForceDisable_ : 1; //!< Disables SQTT
uint splitSizeForWin7_ : 1; //!< DMA flush split size for Win 7
uint enableHwP2P_ : 1; //!< Forces HW P2P path for testing
uint reserved_ : 10;
uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
uint reserved_ : 9;
};
uint value_;
};
@@ -1105,6 +1105,13 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) {
}
} break;
case CL_COMMAND_READ_IMAGE:
if (memory->memoryType() == Resource::ImageBuffer) {
Image* imageBuffer = static_cast<Image*>(memory);
// Check if synchronization has to be performed
if (imageBuffer->CopyImageBuffer() != nullptr) {
memory = imageBuffer->CopyImageBuffer();
}
}
if (hostMemory != nullptr) {
// Accelerated image to buffer transfer without pinning
amd::Coord3D dstOrigin(offset);
@@ -2398,6 +2405,25 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
}
}
// Check if image buffer write back is required
if (state_.imageBufferWrtBack_) {
// Avoid recursive write back
state_.imageBufferWrtBack_ = false;
// Make sure the original kernel execution is done
addBarrier(RgpSqqtBarrierReason::MemDependency);
for (const auto imageBuffer : wrtBackImageBuffer_) {
Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
amd::Image* image = imageBuffer->owner()->asImage();
amd::Coord3D offs(0);
// Copy memory from the backing store image into the original buffer
bool result = blitMgr().copyImageToBuffer(
*imageBuffer->CopyImageBuffer(), *buffer, offs, offs,
image->getRegion(), true,
image->getRowPitch(), image->getSlicePitch());
}
wrtBackImageBuffer_.clear();
}
// Perform post dispatch logic for RGP traces
if (rgpCaptureEna()) {
dev().rgpCaptureMgr()->PostDispatch(this);
@@ -3256,6 +3282,32 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
mem->signalWrite(&dev());
}
if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
if (gpuMem->memoryType() == Resource::ImageBuffer) {
Image* imageBuffer = static_cast<Image*>(gpuMem);
// Check if synchronization has to be performed
if (imageBuffer->CopyImageBuffer() != nullptr) {
Memory* buffer = dev().getGpuMemory(mem->parent());
amd::Image* image = mem->asImage();
amd::Coord3D offs(0);
// Copy memory from the original image buffer into the backing store image
bool result = blitMgr().copyBufferToImage(
*buffer, *imageBuffer->CopyImageBuffer(), offs, offs,
image->getRegion(), true, image->getRowPitch(), image->getSlicePitch());
// Make sure the copy operation is done
addBarrier(RgpSqqtBarrierReason::MemDependency);
// Use backing store SRD as the replacement
uint64_t srd = imageBuffer->CopyImageBuffer()->hwSrd();
WriteAqlArgAt(const_cast<address>(params), &srd, sizeof(srd), desc.offset_);
// Add backing store image to the list of memory handles
addVmMemory(imageBuffer->CopyImageBuffer());
// If it's not a read only resource, then runtime has to write back
if (!info.readOnly_) {
wrtBackImageBuffer_.push_back(imageBuffer);
state_.imageBufferWrtBack_ = true;
}
}
}
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
@@ -205,6 +205,7 @@ class VirtualGPU : public device::VirtualDevice {
uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter
uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled
uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime
uint imageBufferWrtBack_: 1; //!< Enable image buffer write back
};
uint value_;
State() : value_(0) {}
@@ -643,6 +644,7 @@ class VirtualGPU : public device::VirtualDevice {
Pal::ICmdAllocator* cmdAllocator_; //!< Command buffer allocator
Queue* queues_[AllEngines]; //!< HW queues for all engines
MemoryRange sdmaRange_; //!< SDMA memory range for write access
std::vector<Image*> wrtBackImageBuffer_; //!< Array of images for write back
};
inline void VirtualGPU::addVmMemory(const Memory* memory) {
@@ -177,6 +177,8 @@ release(bool, GPU_ENABLE_COOP_GROUPS, false, \
"Enables cooperative group launch") \
release(uint, GPU_MAX_COMMAND_BUFFERS, 8, \
"The maximum number of command buffers allocated per queue") \
release(bool, GPU_IMAGE_BUFFER_WAR, true, \
"Enables image buffer workaround") \
release(cstring, HIP_VISIBLE_DEVICES, "", \
"Only devices whose index is present in the sequence are visible to HIP") \
release(cstring, CUDA_VISIBLE_DEVICES, "", \