From 9cdd929342ef078fd2599f7a25fab59706125a1f Mon Sep 17 00:00:00 2001 From: Rakesh Roy Date: Mon, 5 Sep 2022 23:17:30 +0530 Subject: [PATCH] SWDEV-353941 - Fix hipMemset latency issue for hipMallocManaged - In case of HMM, use blit kernel instead of CPU memcpy for hipMemset Change-Id: I89bfc96ff01a2375ed8df1b1c6bc05357dea84f7 [ROCm/clr commit: f097cda948d9c1627c6f3e84473749d8668f38fd] --- projects/clr/rocclr/device/pal/palvirtual.cpp | 12 +++++++++++- projects/clr/rocclr/device/rocm/rocvirtual.cpp | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 0a5e1484c5..dfbd198b4b 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -1872,11 +1872,21 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) { amd::BufferRect rect; rect.create(static_cast(origin), static_cast(region), pitch, 0); + + bool force_blit = false; + if (amd::IS_HIP) { + constexpr uint32_t kManagedAlloc = (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR); + // In case of HMM, use blit kernel instead of CPU memcpy + if ((cmd.memory().getMemFlags() & kManagedAlloc) == kManagedAlloc) { + force_blit = true; + } + } + for (size_t slice = 0; slice < depth; slice++) { for (size_t row = 0; row < height; row++) { const size_t rowOffset = rect.offset(0, row, slice); if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), - amd::Coord3D{rowOffset, 0, 0}, amd::Coord3D{width, 1, 1})) { + amd::Coord3D{rowOffset, 0, 0}, amd::Coord3D{width, 1, 1}, force_blit)) { cmd.setStatus(CL_INVALID_OPERATION); } } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 61daa52240..bba29a253b 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -2296,8 +2296,18 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) { amd::ScopedLock lock(execution()); profilingBegin(cmd); + + bool force_blit = false; + if (amd::IS_HIP) { + constexpr uint32_t kManagedAlloc = (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_ALLOC_HOST_PTR); + // In case of HMM, use blit kernel instead of CPU memcpy + if ((cmd.memory().getMemFlags() & kManagedAlloc) == kManagedAlloc) { + force_blit = true; + } + } + if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), - cmd.surface(), cmd.origin(), cmd.size())) { + cmd.surface(), cmd.origin(), cmd.size(), force_blit)) { cmd.setStatus(CL_INVALID_OPERATION); } profilingEnd(cmd);