diff --git a/rocclr/runtime/device/pal/palmemory.cpp b/rocclr/runtime/device/pal/palmemory.cpp index c75c2e17bc..f6ca8bab8e 100644 --- a/rocclr/runtime/device/pal/palmemory.cpp +++ b/rocclr/runtime/device/pal/palmemory.cpp @@ -79,6 +79,10 @@ bool Memory::create(Resource::MemoryType memType, Resource::CreateParams* params uint allocAttempt = 0; // Reset the flag in case we reallocate the heap in local/remote flags_ &= ~HostMemoryDirectAccess; + + if (!ValidateMemory(memType)) { + return false; + } do { // Create a resource in CAL @@ -1089,4 +1093,18 @@ void* Image::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& regi return mapAddress + offset; } +bool Image::ValidateMemory(Resource::MemoryType memType) { + if (dev().settings().imageBufferWar_ && (memType == ImageBuffer) && (owner() != nullptr) && + ((owner()->asImage()->getWidth() * owner()->asImage()->getImageFormat().getElementSize()) < + owner()->asImage()->getRowPitch())) { + // Create a native image without pitch as a backing store + copyImageBuffer_ = new pal::Image(dev(), size(), desc().width_, desc().height_, desc().depth_, + desc().format_, desc().topology_, 0); + if ((copyImageBuffer_ == nullptr) || !copyImageBuffer_->create(Resource::Local)) { + return false; + } + } + return true; +} + } // namespace pal diff --git a/rocclr/runtime/device/pal/palmemory.hpp b/rocclr/runtime/device/pal/palmemory.hpp index 2ce3062cce..54c74f20ab 100644 --- a/rocclr/runtime/device/pal/palmemory.hpp +++ b/rocclr/runtime/device/pal/palmemory.hpp @@ -62,7 +62,7 @@ class Memory : public device::Memory, public Resource { ); //! Default destructor - ~Memory(); + virtual ~Memory(); //! Creates the interop memory bool createInterop(); @@ -156,6 +156,9 @@ class Memory : public device::Memory, public Resource { //! Decrement map count void decIndMapCount(); + //! Validates allocated memory for possible workarounds + virtual bool ValidateMemory(Resource::MemoryType memType) { return true; } + private: //! 
Disable copy constructor Memory(const Memory&); @@ -201,7 +204,8 @@ class Image : public pal::Memory { cl_mem_object_type imageType, //!< CL image type uint mipLevels //!< The number of mip levels ) - : pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels) {} + : pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels), + copyImageBuffer_(nullptr) {} //! Image constructor Image(const Device& gpuDev, //!< GPU device object @@ -213,7 +217,10 @@ class Image : public pal::Memory { cl_mem_object_type imageType, //!< CL image type uint mipLevels //!< The number of mip levels ) - : pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels) {} + : pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels), + copyImageBuffer_(nullptr) {} + + virtual ~Image() { delete copyImageBuffer_; } //! Allocate memory for API-level maps virtual void* allocMapTarget(const amd::Coord3D& origin, //!< The map location in memory @@ -225,12 +232,19 @@ class Image : public pal::Memory { virtual uint64_t virtualAddress() const override { return hwSrd(); } + Image* CopyImageBuffer() const { return copyImageBuffer_; } + + //! Validates allocated memory for possible workarounds + bool ValidateMemory(Resource::MemoryType memType) final; + private: //! Disable copy constructor Image(const Image&); //! 
Disable operator= Image& operator=(const Image&); + + Image* copyImageBuffer_; }; } // namespace pal diff --git a/rocclr/runtime/device/pal/palsettings.cpp b/rocclr/runtime/device/pal/palsettings.cpp index 9b1e656e3d..893d0d1143 100644 --- a/rocclr/runtime/device/pal/palsettings.cpp +++ b/rocclr/runtime/device/pal/palsettings.cpp @@ -144,6 +144,7 @@ Settings::Settings() { hsailExplicitXnack_ = false; lcWavefrontSize64_ = true; enableHwP2P_ = false; + imageBufferWar_ = false; } bool Settings::create(const Pal::DeviceProperties& palProp, @@ -331,6 +332,11 @@ bool Settings::create(const Pal::DeviceProperties& palProp, return false; } + if (gfx10Plus_) { + // GFX10 HW doesn't support custom pitch. Enable double copy workaround + imageBufferWar_ = GPU_IMAGE_BUFFER_WAR; + } + splitSizeForWin7_ = false; #if defined(_WIN32) diff --git a/rocclr/runtime/device/pal/palsettings.hpp b/rocclr/runtime/device/pal/palsettings.hpp index 716b45f307..fc6f63d416 100644 --- a/rocclr/runtime/device/pal/palsettings.hpp +++ b/rocclr/runtime/device/pal/palsettings.hpp @@ -62,7 +62,8 @@ class Settings : public device::Settings { uint rgpSqttForceDisable_ : 1; //!< Disables SQTT uint splitSizeForWin7_ : 1; //!< DMA flush split size for Win 7 uint enableHwP2P_ : 1; //!< Forces HW P2P path for testing - uint reserved_ : 10; + uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10 + uint reserved_ : 9; }; uint value_; }; diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp index 38968f29ea..ab6421822a 100644 --- a/rocclr/runtime/device/pal/palvirtual.cpp +++ b/rocclr/runtime/device/pal/palvirtual.cpp @@ -1105,6 +1105,13 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& vcmd) { } } break; case CL_COMMAND_READ_IMAGE: + if (memory->memoryType() == Resource::ImageBuffer) { + Image* imageBuffer = static_cast<Image*>(memory); + // Read from the backing store image if the workaround created one + if (imageBuffer->CopyImageBuffer() != nullptr) { + memory = 
imageBuffer->CopyImageBuffer(); + } + } if (hostMemory != nullptr) { // Accelerated image to buffer transfer without pinning amd::Coord3D dstOrigin(offset); @@ -2398,6 +2405,25 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const } } + // Check if image buffer write back is required + if (state_.imageBufferWrtBack_) { + // Avoid recursive write back + state_.imageBufferWrtBack_ = false; + // Make sure the original kernel execution is done + addBarrier(RgpSqqtBarrierReason::MemDependency); + for (const auto imageBuffer : wrtBackImageBuffer_) { + Memory* buffer = dev().getGpuMemory(imageBuffer->owner()->parent()); + amd::Image* image = imageBuffer->owner()->asImage(); + amd::Coord3D offs(0); + // Copy memory from the backing store image into original buffer + bool result = blitMgr().copyImageToBuffer( + *imageBuffer->CopyImageBuffer(), *buffer, offs, offs, + image->getRegion(), true, + image->getRowPitch(), image->getSlicePitch()); + } + wrtBackImageBuffer_.clear(); + } + // Perform post dispatch logic for RGP traces if (rgpCaptureEna()) { dev().rgpCaptureMgr()->PostDispatch(this); @@ -3256,6 +3282,32 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p mem->signalWrite(&dev()); } if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) { + if (gpuMem->memoryType() == Resource::ImageBuffer) { + Image* imageBuffer = static_cast<Image*>(gpuMem); + // Check if synchronization has to be performed + if (imageBuffer->CopyImageBuffer() != nullptr) { + Memory* buffer = dev().getGpuMemory(mem->parent()); + amd::Image* image = mem->asImage(); + amd::Coord3D offs(0); + // Copy memory from the original image buffer into the backing store image + bool result = blitMgr().copyBufferToImage( + *buffer, *imageBuffer->CopyImageBuffer(), offs, offs, + image->getRegion(), true, image->getRowPitch(), image->getSlicePitch()); + // Make sure the copy operation is done + addBarrier(RgpSqqtBarrierReason::MemDependency); + // 
Use backing store SRD as the replacement + uint64_t srd = imageBuffer->CopyImageBuffer()->hwSrd(); + WriteAqlArgAt(const_cast<address>
(params), &srd, sizeof(srd), desc.offset_); + // Add backing store image to the list of memory handles + addVmMemory(imageBuffer->CopyImageBuffer()); + // If it's not a read only resource, then runtime has to write back + if (!info.readOnly_) { + wrtBackImageBuffer_.push_back(imageBuffer); + state_.imageBufferWrtBack_ = true; + } + } + } + //! \note Special case for the image views. //! Copy SRD to CB1, so blit manager will be able to release //! this view without a wait for SRD resource. diff --git a/rocclr/runtime/device/pal/palvirtual.hpp b/rocclr/runtime/device/pal/palvirtual.hpp index 61242af0f2..71e7cf2e73 100644 --- a/rocclr/runtime/device/pal/palvirtual.hpp +++ b/rocclr/runtime/device/pal/palvirtual.hpp @@ -205,6 +205,7 @@ class VirtualGPU : public device::VirtualDevice { uint profileEnabled_ : 1; //!< Profiling is enabled for WaveLimiter uint perfCounterEnabled_ : 1; //!< PerfCounter is enabled uint rgpCaptureEnabled_ : 1; //!< RGP capture is enabled in the runtime + uint imageBufferWrtBack_: 1; //!< Enable image buffer write back }; uint value_; State() : value_(0) {} @@ -643,6 +644,7 @@ class VirtualGPU : public device::VirtualDevice { Pal::ICmdAllocator* cmdAllocator_; //!< Command buffer allocator Queue* queues_[AllEngines]; //!< HW queues for all engines MemoryRange sdmaRange_; //!< SDMA memory range for write access + std::vector<Image*> wrtBackImageBuffer_; //!< Array of images for write back }; inline void VirtualGPU::addVmMemory(const Memory* memory) { diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp index fcb1e10314..85e815ca95 100644 --- a/rocclr/runtime/utils/flags.hpp +++ b/rocclr/runtime/utils/flags.hpp @@ -177,6 +177,8 @@ release(bool, GPU_ENABLE_COOP_GROUPS, false, \ "Enables cooperative group launch") \ release(uint, GPU_MAX_COMMAND_BUFFERS, 8, \ "The maximum number of command buffers allocated per queue") \ +release(bool, GPU_IMAGE_BUFFER_WAR, true, \ + "Enables image buffer workaround") \ release(cstring, 
HIP_VISIBLE_DEVICES, "", \ "Only devices whose index is present in the sequence are visible to HIP") \ release(cstring, CUDA_VISIBLE_DEVICES, "", \