diff --git a/projects/clr/rocclr/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/device/gpu/gpuvirtual.cpp index 5073e34f01..d1ff61863f 100644 --- a/projects/clr/rocclr/device/gpu/gpuvirtual.cpp +++ b/projects/clr/rocclr/device/gpu/gpuvirtual.cpp @@ -1243,7 +1243,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) { bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern, size_t patternSize, const amd::Coord3D& origin, - const amd::Coord3D& size) { + const amd::Coord3D& size, bool forceBlit) { gpu::Memory* memory = dev().getGpuMemory(amdMemory); bool entire = amdMemory->isEntirelyCovered(origin, size); @@ -1284,7 +1284,7 @@ bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const patternSize = elemSize; } result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize, - amdMemory->isEntirelyCovered(origin, size)); + amdMemory->isEntirelyCovered(origin, size), forceBlit); if (NULL != bufferFromImage) { bufferFromImage->release(); } diff --git a/projects/clr/rocclr/device/gpu/gpuvirtual.hpp b/projects/clr/rocclr/device/gpu/gpuvirtual.hpp index 29db287919..a8791f4be2 100644 --- a/projects/clr/rocclr/device/gpu/gpuvirtual.hpp +++ b/projects/clr/rocclr/device/gpu/gpuvirtual.hpp @@ -460,6 +460,7 @@ class VirtualGPU : public device::VirtualDevice, public CALGSLContext { size_t patternSize, //!< pattern size const amd::Coord3D& origin, //!< memory origin - const amd::Coord3D& size //!< memory size for filling + const amd::Coord3D& size, //!< memory size for filling + bool forceBlit = false //!< force shader blit path ); bool copyMemory(cl_command_type type, //!< the command type diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 898367b715..0238012549 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -1791,7 +1791,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& vcmd) { bool 
VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern, size_t patternSize, const amd::Coord3D& origin, - const amd::Coord3D& size) { + const amd::Coord3D& size, bool forceBlit) { pal::Memory* memory = dev().getGpuMemory(amdMemory); bool entire = amdMemory->isEntirelyCovered(origin, size); @@ -1832,7 +1832,7 @@ bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const patternSize = elemSize; } result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize, - amdMemory->isEntirelyCovered(origin, size)); + amdMemory->isEntirelyCovered(origin, size), forceBlit); if (nullptr != bufferFromImage) { bufferFromImage->release(); } @@ -2052,16 +2052,10 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) { amd::Coord3D size(fillSize, 1, 1); assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!"); - // Synchronize memory from host if necessary - device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = dstMemory->isEntirelyCovered(origin, size); - memory->syncCacheFromHost(*this, syncFlags); if (!fillMemory(vcmd.type(), dstMemory, vcmd.pattern(), vcmd.patternSize(), origin, size)) { vcmd.setStatus(CL_INVALID_OPERATION); } - // Mark this as the most-recently written cache of the destination - dstMemory->signalWrite(&gpuDevice_); } else { // for FGS capable device, fill CPU memory directly amd::SvmBuffer::memFill(vcmd.dst(), vcmd.pattern(), vcmd.patternSize(), vcmd.times()); diff --git a/projects/clr/rocclr/device/pal/palvirtual.hpp b/projects/clr/rocclr/device/pal/palvirtual.hpp index 87538a2d4a..64ee66a3e7 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.hpp +++ b/projects/clr/rocclr/device/pal/palvirtual.hpp @@ -607,7 +607,8 @@ class VirtualGPU : public device::VirtualDevice { const void* pattern, //!< pattern to fill the memory size_t patternSize, //!< pattern size const amd::Coord3D& origin, //!< memory origin - const amd::Coord3D& size //!< 
memory size for filling + const amd::Coord3D& size, //!< memory size for filling + bool forceBlit = false //!< force shader blit path ); bool copyMemory(cl_command_type type, //!< the command type diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index db803164af..06b4d68146 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -2147,7 +2147,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) { bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const void* pattern, size_t patternSize, const amd::Coord3D& origin, - const amd::Coord3D& size) { + const amd::Coord3D& size, bool forceBlit) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); @@ -2185,7 +2185,7 @@ bool VirtualGPU::fillMemory(cl_command_type type, amd::Memory* amdMemory, const pattern = fillValue; patternSize = elemSize; } - result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize, entire); + result = blitMgr().fillBuffer(*memory, pattern, patternSize, realOrigin, realSize, entire, forceBlit); break; } case CL_COMMAND_FILL_IMAGE: { @@ -2307,18 +2307,14 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) { } else if (type == ROCCLR_COMMAND_STREAM_WRITE_VALUE) { amd::Coord3D origin(offset); amd::Coord3D size(sizeBytes); - bool entire = amdMemory->isEntirelyCovered(origin, size); // Ensure memory ordering preceding the write dispatchBarrierPacket(kBarrierPacketReleaseHeader); - // Use GPU Blit to write - bool result = blitMgr().fillBuffer(*memory, &value, sizeBytes, origin, size, entire, true); - ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writting value: 0x%lx", value); - - if (!result) { - LogError("submitStreamOperation: Write failed!"); + if (!fillMemory(CL_COMMAND_FILL_BUFFER, amdMemory, &value, sizeBytes, origin, size, true)) { + 
cmd.setStatus(CL_INVALID_OPERATION); } + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value); } else { ShouldNotReachHere(); } @@ -2348,16 +2344,10 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) { amd::Coord3D size(fillSize, 1, 1); assert((dstMemory->validateRegion(origin, size)) && "The incorrect fill size!"); - // Synchronize memory from host if necessary - device::Memory::SyncFlags syncFlags; - syncFlags.skipEntire_ = dstMemory->isEntirelyCovered(origin, size); - memory->syncCacheFromHost(*this, syncFlags); - if (!fillMemory(cmd.type(), dstMemory, cmd.pattern(), cmd.patternSize(), origin, size)) { + if (!fillMemory(cmd.type(), dstMemory, cmd.pattern(), cmd.patternSize(), origin, size, true)) { cmd.setStatus(CL_INVALID_OPERATION); } - // Mark this as the most-recently written cache of the destination - dstMemory->signalWrite(&dev()); } else { // Stall GPU for CPU access to memory releaseGpuMemoryFence(); diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index f1efa435c3..bbdc4428ad 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -415,7 +415,8 @@ class VirtualGPU : public device::VirtualDevice { const void* pattern, //!< pattern to fill the memory size_t patternSize, //!< pattern size const amd::Coord3D& origin, //!< memory origin - const amd::Coord3D& size //!< memory size for filling + const amd::Coord3D& size, //!< memory size for filling + bool forceBlit = false //!< force shader blit path ); //! Common function for memory copy used by both svm Copy and non-svm Copy