From bd191b9d2ea46ca3ffd1d582e62b92b38e79d120 Mon Sep 17 00:00:00 2001
From: foreman
Date: Fri, 17 Jan 2020 15:51:03 -0500
Subject: [PATCH] P4 to Git Change 2058803 by gandryey@gera-win10 on 2020/01/17
15:47:42
SWDEV-219901 - [OCL-ROCr]Add pitch workaround for Navi10
- Add pitch workaround. Allocate a native image as the backing store and perform a double copy when necessary
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#149 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#48 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#46 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.hpp#16 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#47 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#95 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#31 edit
[ROCm/clr commit: 3c137e7b198197c8b35d7ebfc764390f8bc5539a]
---
.../rocclr/runtime/device/rocm/rocdevice.cpp | 4 ++
.../rocclr/runtime/device/rocm/rocdevice.hpp | 3 +
.../rocclr/runtime/device/rocm/rocmemory.cpp | 33 +++++++++-
.../rocclr/runtime/device/rocm/rocmemory.hpp | 9 +++
.../runtime/device/rocm/rocsettings.cpp | 5 ++
.../runtime/device/rocm/rocsettings.hpp | 3 +-
.../rocclr/runtime/device/rocm/rocvirtual.cpp | 60 +++++++++++++++++++
.../rocclr/runtime/device/rocm/rocvirtual.hpp | 4 ++
8 files changed, 119 insertions(+), 2 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
index 49eadce97b..729323920a 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -969,6 +969,10 @@ Sampler::~Sampler() {
hsa_ext_sampler_destroy(dev_.getBackendDevice(), hsa_sampler);
}
+//! Returns the device memory object that \p mem holds for this device, as a roc::Memory.
+//! \p mem must not be nullptr, because it is dereferenced unconditionally.
+Memory* Device::getGpuMemory(amd::Memory* mem) const {
+  return static_cast<roc::Memory*>(mem->getDeviceMemory(*this));
+}
+
bool Device::populateOCLDeviceConstants() {
info_.available_ = true;
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
index 0e499845d4..0298c1a967 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
@@ -430,6 +430,9 @@ class Device : public NullDevice {
virtual bool findLinkTypeAndHopCount(amd::Device* other_device, uint32_t* link_type,
uint32_t* hop_count);
+ //! Returns a GPU memory object from AMD memory object
+ roc::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object
+ ) const;
private:
static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
index 9a4004e10c..1ba69a3c49 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
@@ -1022,7 +1022,10 @@ bool Image::createInteropImage() {
}
bool Image::create() {
- if (owner()->parent()) {
+ if (owner()->parent() != nullptr) {
+ if (!ValidateMemory()) {
+ return false;
+ }
// Image view creation
roc::Memory* parent = static_cast(owner()->parent()->getDeviceMemory(dev_));
@@ -1219,6 +1222,8 @@ void* Image::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& regi
Image::~Image() { destroy(); }
void Image::destroy() {
+ delete copyImageBuffer_;
+
if (hsaImageObject_.handle != 0) {
hsa_status_t status = hsa_ext_image_destroy(dev().getBackendDevice(), hsaImageObject_);
assert(status == HSA_STATUS_SUCCESS);
@@ -1241,5 +1246,31 @@ void Image::destroy() {
const_cast(dev()).updateFreeMemory(size(), true);
}
}
+
+//! Validates the image view allocation and prepares the image buffer pitch workaround.
+//!
+//! When the device has imageBufferWar_ enabled, the image view derives from a buffer
+//! (linear layout) and its row pitch is larger than a tightly packed row, a native
+//! 2D image without custom pitch is created in copyImageBuffer_ as the backing store.
+//!
+//! \return false if the backing store image couldn't be allocated or created;
+//!         true otherwise (including when no workaround is necessary)
+bool Image::ValidateMemory() {
+  amd::Memory* ancestor = owner()->parent();
+  if (ancestor == nullptr) {
+    // No parent object, hence there is no parent buffer pitch to work around
+    return true;
+  }
+
+  // Detect image view from buffer to distinguish linear paths from tiled.
+  while ((ancestor->asBuffer() == nullptr) && (ancestor->parent() != nullptr)) {
+    ancestor = ancestor->parent();
+  }
+  const bool linearLayout = (ancestor->asBuffer() != nullptr);
+
+  if (!dev().settings().imageBufferWar_ || !linearLayout) {
+    return true;
+  }
+
+  amd::Image* img = owner()->asImage();
+  const auto packedRowSize = img->getWidth() * img->getImageFormat().getElementSize();
+  if (packedRowSize >= img->getRowPitch()) {
+    // The row pitch doesn't exceed the packed row size, no backing store is needed
+    return true;
+  }
+
+  // Create a native image without pitch for validation
+  copyImageBuffer_ =
+      new (dev().context()) amd::Image(dev().context(), CL_MEM_OBJECT_IMAGE2D, img->getMemFlags(),
+                                       img->getImageFormat(), img->getWidth(), img->getHeight(), 1, 0, 0);
+
+  return (copyImageBuffer_ != nullptr) && copyImageBuffer_->create();
+}
+
}
#endif // WITHOUT_HSA_BACKEND
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
index 5e742cd9d0..494474ecb1 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp
@@ -93,6 +93,9 @@ class Memory : public device::Memory {
virtual void IpcCreate (size_t offset, size_t* mem_size, void* handle) const;
+ //! Validates allocated memory for possible workarounds.
+ //! The base implementation applies no workaround and always returns true.
+ virtual bool ValidateMemory() { return true; }
+
protected:
bool allocateMapMemory(size_t allocationSize);
@@ -179,6 +182,11 @@ class Image : public roc::Memory {
virtual const address cpuSrd() const { return reinterpret_cast(getHsaImageObject().handle); }
+ //! Validates allocated memory for possible workarounds
+ bool ValidateMemory() final;
+
+ //! Returns the backing store image allocated by ValidateMemory(), or nullptr if there is none
+ amd::Image* CopyImageBuffer() const { return copyImageBuffer_; }
+
private:
//! Disable copy constructor
Image(const Buffer&);
@@ -200,6 +208,7 @@ class Image : public roc::Memory {
hsa_ext_image_t hsaImageObject_;
void* originalDeviceMemory_;
+ amd::Image* copyImageBuffer_ = nullptr;
};
}
#endif
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
index e6f77c0caf..9ad03bdaab 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp
@@ -70,6 +70,7 @@ Settings::Settings() {
useLightning_ = (!flagIsDefault(GPU_ENABLE_LC)) ? GPU_ENABLE_LC : true;
lcWavefrontSize64_ = true;
+ imageBufferWar_ = false;
}
bool Settings::create(bool fullProfile, int gfxipVersion) {
@@ -128,6 +129,10 @@ bool Settings::create(bool fullProfile, int gfxipVersion) {
if (gfxipVersion >= 1000) {
enableWave32Mode_ = true;
enableWgpMode_ = GPU_ENABLE_WGP_MODE;
+ if (gfxipVersion == 1001) {
+ // GFX10.1 HW doesn't support custom pitch. Enable double copy workaround
+ imageBufferWar_ = GPU_IMAGE_BUFFER_WAR;
+ }
}
if (!flagIsDefault(GPU_ENABLE_WAVE32_MODE)) {
enableWave32Mode_ = GPU_ENABLE_WAVE32_MODE;
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp
index 7c925d1d4c..423d21c3e9 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp
@@ -26,7 +26,8 @@ class Settings : public device::Settings {
uint imageDMA_ : 1; //!< Enable direct image DMA transfers
uint stagedXferRead_ : 1; //!< Uses a staged buffer read
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
- uint reserved_ : 25;
+ uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
+ uint reserved_ : 24;
};
uint value_;
};
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index a288693ba1..2834104b8a 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -330,6 +330,30 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
const uint64_t image_srd = image->getHsaImageObject().handle;
assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
WriteAqlArgAt(const_cast(params), &image_srd, sizeof(image_srd), desc.offset_);
+
+ // Check if synchronization has to be performed
+ if (image->CopyImageBuffer() != nullptr) {
+ Memory* devBuf = dev().getGpuMemory(mem->parent());
+ amd::Coord3D offs(0);
+ Image* devCpImg = static_cast(dev().getGpuMemory(image->CopyImageBuffer()));
+ amd::Image* img = mem->asImage();
+
+ // Copy memory from the original image buffer into the backing store image
+ bool result = blitMgr().copyBufferToImage(
+ *devBuf, *devCpImg, offs, offs, img->getRegion(), true,
+ img->getRowPitch(), img->getSlicePitch());
+ // Make sure the copy operation is done
+ setAqlHeader(dispatchPacketHeader_);
+ // Use backing store SRD as the replacement
+ const uint64_t srd = devCpImg->getHsaImageObject().handle;
+ WriteAqlArgAt(const_cast(params), &srd, sizeof(srd), desc.offset_);
+
+ // If it's not a read only resource, then runtime has to write back
+ if (!desc.info_.readOnly_) {
+ wrtBackImageBuffer_.push_back(devCpImg);
+ imageBufferWrtBack_ = true;
+ }
+ }
}
}
}
@@ -947,6 +971,23 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
break;
}
case CL_COMMAND_READ_IMAGE: {
+ if ((cmd.source().parent() != nullptr) && (cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) {
+ Image* imageBuffer = static_cast(devMem);
+ // Check if synchronization has to be performed
+ if (nullptr != imageBuffer->CopyImageBuffer()) {
+ amd::Memory* memory = imageBuffer->CopyImageBuffer();
+ devMem = dev().getGpuMemory(memory);
+ if (nullptr == imageBuffer->owner()->getLastWriter()) {
+ Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
+ amd::Image* image = imageBuffer->owner()->asImage();
+ amd::Coord3D offs(0);
+ // Copy memory from the original image buffer into the backing store image
+ result = blitMgr().copyBufferToImage(*buffer, *devMem, offs,
+ offs, image->getRegion(), true,
+ image->getRowPitch(), image->getSlicePitch());
+ }
+ }
+ }
if (hostMemory != nullptr) {
// Accelerated image to buffer transfer without pinning
amd::Coord3D dstOrigin(offset);
@@ -2202,6 +2243,25 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_);
}
+ // Check if image buffer write back is required
+ if (imageBufferWrtBack_) {
+ // Avoid recursive write back
+ imageBufferWrtBack_ = false;
+ // Make sure the original kernel execution is done
+ releaseGpuMemoryFence();
+ for (const auto imageBuffer : wrtBackImageBuffer_) {
+ Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent());
+ amd::Image* image = imageBuffer->owner()->asImage();
+ Image* devImage = static_cast(dev().getGpuMemory(imageBuffer->owner()));
+ Memory* cpyImage = dev().getGpuMemory(devImage->CopyImageBuffer());
+ amd::Coord3D offs(0);
+ // Copy memory from the backing store image into original buffer
+ bool result = blitMgr().copyImageToBuffer(*cpyImage, *buffer, offs,
+ offs, image->getRegion(), true,
+ image->getRowPitch(), image->getSlicePitch());
+ }
+ wrtBackImageBuffer_.clear();
+ }
return true;
}
/**
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
index 2e31dc4b4f..0a53ff03c7 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
@@ -300,6 +300,10 @@ class VirtualGPU : public device::VirtualDevice {
* used to synchronized on kernel outputs.
*/
bool hasPendingDispatch_;
+
+  //! Enable image buffer write back. Default-initialized, because the flag is read
+  //! after every kernel submission, possibly before any image argument has set it.
+  bool imageBufferWrtBack_ = false;
+  std::vector<Image*> wrtBackImageBuffer_;  //!< Array of images for write back
+
Timestamp* timestamp_;
hsa_agent_t gpu_device_; //!< Physical device
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu