diff --git a/rocclr/device/pal/palblit.cpp b/rocclr/device/pal/palblit.cpp index 3074e11d19..9d88b5e84a 100644 --- a/rocclr/device/pal/palblit.cpp +++ b/rocclr/device/pal/palblit.cpp @@ -2325,9 +2325,12 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, bool entire) const { amd::ScopedLock k(lockXferOps_); bool result = false; - - // Use host fill if memory has direct access - if (setup_.disableFillImage_ || gpuMem(memory).isHostMemDirectAccess()) { + constexpr size_t kFillImageThreshold = 256 * 256; + + // Use host fill if memory has direct access and image is small + if (setup_.disableFillImage_ || + (gpuMem(memory).isHostMemDirectAccess() && + (size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) { gpu().releaseGpuMemoryFence(); result = HostBlitManager::fillImage(memory, pattern, origin, size, entire); diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index c1033007e4..befb7b6226 100644 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -2352,9 +2352,12 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern, amd::ScopedLock k(lockXferOps_); bool result = false; - - // Use host fill if memory has direct access - if (setup_.disableFillImage_ || memory.isHostMemDirectAccess()) { + constexpr size_t kFillImageThreshold = 256 * 256; + + // Use host fill if memory has direct access and image is small + if (setup_.disableFillImage_ || + (gpuMem(memory).isHostMemDirectAccess() && + (size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) { // Stall GPU before CPU access gpu().releaseGpuMemoryFence(); result = HostBlitManager::fillImage(memory, pattern, origin, size, entire);