From bd191b9d2ea46ca3ffd1d582e62b92b38e79d120 Mon Sep 17 00:00:00 2001 From: foreman Date: Fri, 17 Jan 2020 15:51:03 -0500 Subject: [PATCH] P4 to Git Change 2058803 by gandryey@gera-win10 on 2020/01/17 15:47:42 SWDEV-219901 - [OCL-ROCr]Add pitch workaround for Navi10 - Add pitch workaround. Allocate a native image as the backing store and perform double copy when necessary Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#149 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#48 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#46 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.hpp#16 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#47 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.hpp#19 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#95 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#31 edit [ROCm/clr commit: 3c137e7b198197c8b35d7ebfc764390f8bc5539a] --- .../rocclr/runtime/device/rocm/rocdevice.cpp | 4 ++ .../rocclr/runtime/device/rocm/rocdevice.hpp | 3 + .../rocclr/runtime/device/rocm/rocmemory.cpp | 33 +++++++++- .../rocclr/runtime/device/rocm/rocmemory.hpp | 9 +++ .../runtime/device/rocm/rocsettings.cpp | 5 ++ .../runtime/device/rocm/rocsettings.hpp | 3 +- .../rocclr/runtime/device/rocm/rocvirtual.cpp | 60 +++++++++++++++++++ .../rocclr/runtime/device/rocm/rocvirtual.hpp | 4 ++ 8 files changed, 119 insertions(+), 2 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp index 49eadce97b..729323920a 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp @@ -969,6 +969,10 @@ Sampler::~Sampler() { hsa_ext_sampler_destroy(dev_.getBackendDevice(), hsa_sampler); } +Memory* 
Device::getGpuMemory(amd::Memory* mem) const { + return static_cast(mem->getDeviceMemory(*this)); +} + bool Device::populateOCLDeviceConstants() { info_.available_ = true; diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp index 0e499845d4..0298c1a967 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp @@ -430,6 +430,9 @@ class Device : public NullDevice { virtual bool findLinkTypeAndHopCount(amd::Device* other_device, uint32_t* link_type, uint32_t* hop_count); + //! Returns a GPU memory object from AMD memory object + roc::Memory* getGpuMemory(amd::Memory* mem //!< Pointer to AMD memory object + ) const; private: static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table; diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp index 9a4004e10c..1ba69a3c49 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp @@ -1022,7 +1022,10 @@ bool Image::createInteropImage() { } bool Image::create() { - if (owner()->parent()) { + if (owner()->parent() != nullptr) { + if (!ValidateMemory()) { + return false; + } // Image view creation roc::Memory* parent = static_cast(owner()->parent()->getDeviceMemory(dev_)); @@ -1219,6 +1222,8 @@ void* Image::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& regi Image::~Image() { destroy(); } void Image::destroy() { + delete copyImageBuffer_; + if (hsaImageObject_.handle != 0) { hsa_status_t status = hsa_ext_image_destroy(dev().getBackendDevice(), hsaImageObject_); assert(status == HSA_STATUS_SUCCESS); @@ -1241,5 +1246,31 @@ void Image::destroy() { const_cast(dev()).updateFreeMemory(size(), true); } } + +bool Image::ValidateMemory() { + // Detect image view from buffer to distinguish linear paths from tiled. 
+ amd::Memory* ancestor = owner()->parent(); + while ((ancestor->asBuffer() == nullptr) && (ancestor->parent() != nullptr)) { + ancestor = ancestor->parent(); + } + bool linearLayout = (ancestor->asBuffer() != nullptr); + + if (dev().settings().imageBufferWar_ && linearLayout && (owner() != nullptr) && + ((owner()->asImage()->getWidth() * owner()->asImage()->getImageFormat().getElementSize()) < + owner()->asImage()->getRowPitch())) { + constexpr bool ForceLinear = true; + amd::Image* img = owner()->asImage(); + // Create a native image without pitch for validation + copyImageBuffer_ = + new (dev().context()) amd::Image(dev().context(), CL_MEM_OBJECT_IMAGE2D, img->getMemFlags(), + img->getImageFormat(), img->getWidth(), img->getHeight(), 1, 0, 0); + + if ((copyImageBuffer_ == nullptr) || !copyImageBuffer_->create()) { + return false; + } + } + return true; +} + } #endif // WITHOUT_HSA_BACKEND diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp index 5e742cd9d0..494474ecb1 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.hpp @@ -93,6 +93,9 @@ class Memory : public device::Memory { virtual void IpcCreate (size_t offset, size_t* mem_size, void* handle) const; + //! Validates allocated memory for possible workarounds + virtual bool ValidateMemory() { return true; } + protected: bool allocateMapMemory(size_t allocationSize); @@ -179,6 +182,11 @@ class Image : public roc::Memory { virtual const address cpuSrd() const { return reinterpret_cast(getHsaImageObject().handle); } + //! Validates allocated memory for possible workarounds + bool ValidateMemory() final; + + amd::Image* CopyImageBuffer() const { return copyImageBuffer_; } + private: //! 
Disable copy constructor Image(const Buffer&); @@ -200,6 +208,7 @@ class Image : public roc::Memory { hsa_ext_image_t hsaImageObject_; void* originalDeviceMemory_; + amd::Image* copyImageBuffer_ = nullptr; }; } #endif diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp index e6f77c0caf..9ad03bdaab 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.cpp @@ -70,6 +70,7 @@ Settings::Settings() { useLightning_ = (!flagIsDefault(GPU_ENABLE_LC)) ? GPU_ENABLE_LC : true; lcWavefrontSize64_ = true; + imageBufferWar_ = false; } bool Settings::create(bool fullProfile, int gfxipVersion) { @@ -128,6 +129,10 @@ bool Settings::create(bool fullProfile, int gfxipVersion) { if (gfxipVersion >= 1000) { enableWave32Mode_ = true; enableWgpMode_ = GPU_ENABLE_WGP_MODE; + if (gfxipVersion == 1001) { + // GFX10.1 HW doesn't support custom pitch. Enable double copy workaround + imageBufferWar_ = GPU_IMAGE_BUFFER_WAR; + } } if (!flagIsDefault(GPU_ENABLE_WAVE32_MODE)) { enableWave32Mode_ = GPU_ENABLE_WAVE32_MODE; diff --git a/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp b/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp index 7c925d1d4c..423d21c3e9 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocsettings.hpp @@ -26,7 +26,8 @@ class Settings : public device::Settings { uint imageDMA_ : 1; //!< Enable direct image DMA transfers uint stagedXferRead_ : 1; //!< Uses a staged buffer read uint stagedXferWrite_ : 1; //!< Uses a staged buffer write - uint reserved_ : 25; + uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 + uint reserved_ : 24; }; uint value_; }; diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp index a288693ba1..2834104b8a 100644 --- 
a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -330,6 +330,30 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para const uint64_t image_srd = image->getHsaImageObject().handle; assert(amd::isMultipleOf(image_srd, sizeof(image_srd))); WriteAqlArgAt(const_cast
(params), &image_srd, sizeof(image_srd), desc.offset_); + + // Check if synchronization has to be performed + if (image->CopyImageBuffer() != nullptr) { + Memory* devBuf = dev().getGpuMemory(mem->parent()); + amd::Coord3D offs(0); + Image* devCpImg = static_cast(dev().getGpuMemory(image->CopyImageBuffer())); + amd::Image* img = mem->asImage(); + + // Copy memory from the original image buffer into the backing store image + bool result = blitMgr().copyBufferToImage( + *devBuf, *devCpImg, offs, offs, img->getRegion(), true, + img->getRowPitch(), img->getSlicePitch()); + // Make sure the copy operation is done + setAqlHeader(dispatchPacketHeader_); + // Use backing store SRD as the replacement + const uint64_t srd = devCpImg->getHsaImageObject().handle; + WriteAqlArgAt(const_cast
(params), &srd, sizeof(srd), desc.offset_); + + // If it's not a read only resource, then runtime has to write back + if (!desc.info_.readOnly_) { + wrtBackImageBuffer_.push_back(devCpImg); + imageBufferWrtBack_ = true; + } + } } } } @@ -947,6 +971,23 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) { break; } case CL_COMMAND_READ_IMAGE: { + if ((cmd.source().parent() != nullptr) && (cmd.source().parent()->getType() == CL_MEM_OBJECT_BUFFER)) { + Image* imageBuffer = static_cast(devMem); + // Check if synchronization has to be performed + if (nullptr != imageBuffer->CopyImageBuffer()) { + amd::Memory* memory = imageBuffer->CopyImageBuffer(); + devMem = dev().getGpuMemory(memory); + if (nullptr == imageBuffer->owner()->getLastWriter()) { + Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent()); + amd::Image* image = imageBuffer->owner()->asImage(); + amd::Coord3D offs(0); + // Copy memory from the original image buffer into the backing store image + result = blitMgr().copyBufferToImage(*buffer, *devMem, offs, + offs, image->getRegion(), true, + image->getRowPitch(), image->getSlicePitch()); + } + } + } if (hostMemory != nullptr) { // Accelerated image to buffer transfer without pinning amd::Coord3D dstOrigin(offset); @@ -2202,6 +2243,25 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const getVQVirtualAddress(), schedulerParam_, schedulerQueue_, schedulerSignal_, schedulerThreads_); } + // Check if image buffer write back is required + if (imageBufferWrtBack_) { + // Avoid recursive write back + imageBufferWrtBack_ = false; + // Make sure the original kernel execution is done + releaseGpuMemoryFence(); + for (const auto imageBuffer : wrtBackImageBuffer_) { + Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent()); + amd::Image* image = imageBuffer->owner()->asImage(); + Image* devImage = static_cast(dev().getGpuMemory(imageBuffer->owner())); + Memory* cpyImage = 
dev().getGpuMemory(devImage->CopyImageBuffer()); + amd::Coord3D offs(0); + // Copy memory from the backing store image into original buffer + bool result = blitMgr().copyImageToBuffer(*cpyImage, *buffer, offs, + offs, image->getRegion(), true, + image->getRowPitch(), image->getSlicePitch()); + } + wrtBackImageBuffer_.clear(); + } return true; } /** diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp index 2e31dc4b4f..0a53ff03c7 100644 --- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp @@ -300,6 +300,10 @@ class VirtualGPU : public device::VirtualDevice { * used to synchronized on kernel outputs. */ bool hasPendingDispatch_; + + bool imageBufferWrtBack_; //!< Enable image buffer write back + std::vector wrtBackImageBuffer_; //!< Array of images for write back + Timestamp* timestamp_; hsa_agent_t gpu_device_; //!< Physical device hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu