SWDEV-432174 - Change the fillBuffer kernel

- Add the new fillBuffer kernel, which allows to launch a limited number of workgroups for memory fill operation - Switch fill memory to 16 bytes write by default - Allow to limit the workgroups with DEBUG_CLR_LIMIT_BLIT_WG Change-Id: Ibad1822f2d42b2fc71bcfc1917c31409c0623e8e
2023-11-14 12:49:17 -05:00
@@ -344,7 +344,7 @@ bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory&
 bool HostBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
                                        const amd::Coord3D& srcOrigin,
                                        const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
-                                        bool entire, size_t rowPitch, size_t slicePitch, 
+                                        bool entire, size_t rowPitch, size_t slicePitch,
                                        amd::CopyMetadata copyMetadata) const {
  size_t startLayer = srcOrigin[2];
  size_t numLayers = size[2];
@@ -475,7 +475,7 @@ bool HostBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memor

 bool HostBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
                                const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
-                                const amd::Coord3D& size, bool entire, 
+                                const amd::Coord3D& size, bool entire,
                                amd::CopyMetadata copyMetadata) const {
  size_t startLayer = srcOrigin[2];
  size_t numLayers = size[2];
@@ -685,49 +685,41 @@ uint32_t HostBlitManager::sRGBmap(float fc) const {
  return (uint32_t)(c * 255.0 + 0.5);
 }

-bool HostBlitManager::FillBufferInfo::ExpandPattern64(uint64_t pattern, size_t pattern_size,
-                                                      uint64_t& pattern64) {
+// ================================================================================================
+void HostBlitManager::FillBufferInfo::ExpandPattern(uint32_t pattern_size, const void* pattern) {
+  // If pattern size exceeds extended, then runtime will select the normal path
+  if (pattern_size >= kExtendedSize) {
+    return;
+  }

-  bool retval = true;
-
-  do {
-
-    // If the pattern is 0 or if the pattern_size is same as max size.
-    if (pattern == 0 ||  pattern_size == sizeof(uint64_t)) {
-      pattern64 = pattern;
-      break;
+  pattern_expanded_ = true;
+  if (pattern_size == sizeof(uint8_t)) {
+    uint8_t pattern_byte = *reinterpret_cast<const uint8_t*>(pattern);
+    for (uint32_t i = 0; i < kExtendedSize; ++i) {
+      reinterpret_cast<uint8_t*>(expanded_pattern_)[i] = pattern_byte;
    }
-
-    // Clean Curr_pattern, since it was casted off from const void* with a lesser size than size_t.
-    ClearBits64(pattern, (pattern_size * 8));
-    pattern64 = 0;
-
-    if (pattern_size == sizeof(uint8_t)) {
-      pattern = pattern & 0xff;
-      pattern64 = ((pattern << 56) | (pattern << 48) | (pattern << 40) | (pattern << 32)
-                    | (pattern << 24) | (pattern << 16) | (pattern << 8) | (pattern));
-    } else if (pattern_size == sizeof(uint16_t)) {
-      pattern = pattern & 0xffff;
-      pattern64 = ((pattern << 48) | (pattern << 32) | (pattern << 16) | (pattern));
-    } else if (pattern_size == sizeof(uint32_t)) {
-      pattern = pattern & 0xffffffff;
-      pattern64 = ((pattern << 32) | (pattern));
-    } else {
-      LogPrintfError("Unsupported Pattern size: %u \n", pattern_size);
-      retval = false;
-      break;
+  } else if (pattern_size == sizeof(uint16_t)) {
+    uint16_t pattern_word = *reinterpret_cast<const uint16_t*>(pattern);
+    for (uint32_t i = 0; i < kExtendedSize / sizeof(uint16_t); ++i) {
+      reinterpret_cast<uint16_t*>(expanded_pattern_)[i] = pattern_word;
    }
-
-  } while (0);
-
-  return retval;
+  } else if (pattern_size == sizeof(uint32_t)) {
+    uint32_t pattern_dword = *reinterpret_cast<const uint32_t*>(pattern);
+    for (uint32_t i = 0; i < kExtendedSize / sizeof(uint32_t); ++i) {
+      reinterpret_cast<uint32_t*>(expanded_pattern_)[i] = pattern_dword;
+    }
+  } else {
+    uint64_t pattern_qword = *reinterpret_cast<const uint64_t*>(pattern);
+    reinterpret_cast<uint64_t*>(expanded_pattern_)[0] = pattern_qword;
+    reinterpret_cast<uint64_t*>(expanded_pattern_)[1] = pattern_qword;
+  }
 }

-bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, size_t fill_size,
-                                           size_t fill_origin, const void* pattern_ptr,
-                                           size_t pattern_size,
-                                           std::vector<FillBufferInfo>& packed_info) {
-
+// ================================================================================================
+void HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, size_t fill_size,
+                                               size_t fill_origin, const void* pattern_ptr,
+                                               size_t pattern_size,
+                                               std::vector<FillBufferInfo>& packed_info) {
  // 1. Validate input arguments
  guarantee(fill_size >= pattern_size, "Pattern Size: %u cannot be greater than fill size: %u \n",
                                        pattern_size, fill_size);
@@ -736,60 +728,42 @@ bool HostBlitManager::FillBufferInfo::PackInfo(const device::Memory& memory, siz

  // 2. Calculate the next closest dword aligned address for faster processing
  size_t dst_addr = memory.virtualAddress() + fill_origin;
-  size_t aligned_dst_addr = amd::alignUp(dst_addr, sizeof(size_t));
+  size_t aligned_dst_addr = amd::alignUp(dst_addr, kExtendedSize);
  guarantee(aligned_dst_addr >= dst_addr, "Aligned address: %u cannot be greater than destination"
                                          "address :%u \n", aligned_dst_addr, dst_addr);

  // 3. If given address is not aligned calculate head and tail size.
  size_t head_size = std::min(aligned_dst_addr - dst_addr, fill_size);
-  size_t aligned_size = ((fill_size - head_size) / sizeof(size_t)) * sizeof(size_t);
-  size_t tail_size = (fill_size - head_size) % sizeof(size_t);
+  size_t aligned_size = ((fill_size - head_size) / kExtendedSize) * kExtendedSize;
+  size_t tail_size = (fill_size - head_size) % kExtendedSize;
  guarantee((head_size + aligned_size + tail_size) <= fill_size, "Head size, aligned size & tail"
                                                          "size together cannot cross fill size");

-  // 4. Clear unwanted bytes from the pattern if the pattern size is < sizeof(size_t).
-  uint64_t pattern = *(reinterpret_cast<uint64_t*>(const_cast<void*>(pattern_ptr)));
-  if (pattern_size < sizeof(uint64_t)) {
-    ClearBits64(pattern, (pattern_size * 8));
-  }
-
-  // 5. Fill the head, aligned, tail info if they exist.
-  FillBufferInfo fill_info;
+  // 4. Fill the head, aligned, tail info if they exist.
  if (head_size > 0) {
    // Offsetted ptrs should align with pattern size. Runtime not responsible for rotating pattern.
    guarantee((head_size % pattern_size) == 0, "Offseted ptr should align with pattern_size");

-    fill_info.fill_size_ = head_size;
+    FillBufferInfo fill_info(head_size);
    packed_info.push_back(fill_info);
  }

-  fill_info.clearInfo();
  if (aligned_size > 0) {
    // Offsetted ptrs should align with pattern size. Runtime not responsible for rotating pattern.
    guarantee((aligned_size % pattern_size) == 0, "Offseted ptr should align with pattern_size");

-    if (pattern_size < sizeof(uint64_t)) {
-      if (!ExpandPattern64(pattern, pattern_size, fill_info.expanded_pattern_)) {
-        DevLogPrintfError("Failed Expanding the pattern for pattern:%u, pattern_size: %u",
-                          pattern, pattern_size);
-        return false;
-      }
-      fill_info.pattern_expanded_ = true;
-    }
-    fill_info.fill_size_ = aligned_size;
+    FillBufferInfo fill_info(aligned_size);
+    fill_info.ExpandPattern(pattern_size, pattern_ptr);
    packed_info.push_back(fill_info);
  }
-  fill_info.clearInfo();

  if (tail_size > 0) {
    // Offsetted ptrs should align with pattern size. Runtime not responsible for rotating pattern.
    guarantee((tail_size % pattern_size) == 0, "Offseted ptr should align with pattern_size");

-    fill_info.fill_size_ = tail_size;
+    FillBufferInfo fill_info(tail_size);
    packed_info.push_back(fill_info);
  }
-  fill_info.clearInfo();
-
-  return true;
 }
+
 }  // namespace gpu
@@ -78,7 +78,7 @@ class BlitManager : public amd::HeapObject {
                          const amd::Coord3D& origin,  //!< Source origin
                          const amd::Coord3D& size,    //!< Size of the copy region
                          bool entire = false,         //!< Entire buffer will be updated
-                          amd::CopyMetadata copyMetadata = 
+                          amd::CopyMetadata copyMetadata =
                                    amd::CopyMetadata()//!< Memory copy MetaData
                          ) const = 0;

@@ -89,7 +89,7 @@ class BlitManager : public amd::HeapObject {
                              const amd::BufferRect& hostRect,  //!< Destination rectangle
                              const amd::Coord3D& size,         //!< Size of the copy region
                              bool entire = false,              //!< Entire buffer will be updated
-                              amd::CopyMetadata copyMetadata = 
+                              amd::CopyMetadata copyMetadata =
                                    amd::CopyMetadata()         //!< Memory copy MetaData
                              ) const = 0;

@@ -101,7 +101,7 @@ class BlitManager : public amd::HeapObject {
                         size_t rowPitch,             //!< Row pitch for host memory
                         size_t slicePitch,           //!< Slice pitch for host memory
                         bool entire = false,          //!< Entire buffer will be updated
-                         amd::CopyMetadata copyMetadata = 
+                         amd::CopyMetadata copyMetadata =
                                    amd::CopyMetadata()//!< Memory copy MetaData
                         ) const = 0;

@@ -111,7 +111,7 @@ class BlitManager : public amd::HeapObject {
                           const amd::Coord3D& origin,  //!< Destination origin
                           const amd::Coord3D& size,    //!< Size of the copy region
                           bool entire = false,         //!< Entire buffer will be updated
-                           amd::CopyMetadata copyMetadata = 
+                           amd::CopyMetadata copyMetadata =
                                     amd::CopyMetadata() //!< Memory copy MetaData
                           ) const = 0;

@@ -122,7 +122,7 @@ class BlitManager : public amd::HeapObject {
                               const amd::BufferRect& bufRect,   //!< Source rectangle
                               const amd::Coord3D& size,         //!< Size of the copy region
                               bool entire = false,              //!< Entire buffer will be updated
-                               amd::CopyMetadata copyMetadata = 
+                               amd::CopyMetadata copyMetadata =
                                    amd::CopyMetadata()          //!< Memory copy MetaData
                               ) const = 0;

@@ -134,7 +134,7 @@ class BlitManager : public amd::HeapObject {
                          size_t rowPitch,             //!< Row pitch for host memory
                          size_t slicePitch,           //!< Slice pitch for host memory
                          bool entire = false,          //!< Entire buffer will be updated
-                          amd::CopyMetadata copyMetadata = 
+                          amd::CopyMetadata copyMetadata =
                                    amd::CopyMetadata() //!< Memory copy MetaData
                          ) const = 0;

@@ -193,7 +193,7 @@ class BlitManager : public amd::HeapObject {
                         const amd::Coord3D& dstOrigin,  //!< Destination origin
                         const amd::Coord3D& size,       //!< Size of the copy region
                         bool entire = false,            //!< Entire buffer will be updated
-                         amd::CopyMetadata copyMetadata = 
+                         amd::CopyMetadata copyMetadata =
                                    amd::CopyMetadata() //!< Memory copy MetaData
                         ) const = 0;

@@ -301,7 +301,7 @@ class HostBlitManager : public device::BlitManager {
                         size_t rowPitch,             //!< Row pitch for host memory
                         size_t slicePitch,           //!< Slice pitch for host memory
                         bool entire = false,         //!< Entire buffer will be updated
-                         amd::CopyMetadata copyMetadata = 
+                         amd::CopyMetadata copyMetadata =
                                    amd::CopyMetadata() //!< Memory copy MetaData
                         ) const;

@@ -334,7 +334,7 @@ class HostBlitManager : public device::BlitManager {
                          size_t rowPitch,             //!< Row pitch for host memory
                          size_t slicePitch,           //!< Slice pitch for host memory
                          bool entire = false,         //!< Entire buffer will be updated
-                          amd::CopyMetadata copyMetadata = 
+                          amd::CopyMetadata copyMetadata =
                                    amd::CopyMetadata() //!< Memory copy MetaData
                          ) const;

@@ -393,7 +393,7 @@ class HostBlitManager : public device::BlitManager {
                         const amd::Coord3D& dstOrigin,  //!< Destination origin
                         const amd::Coord3D& size,       //!< Size of the copy region
                         bool entire = false,            //!< Entire buffer will be updated
-                         amd::CopyMetadata copyMetadata = 
+                         amd::CopyMetadata copyMetadata =
                                    amd::CopyMetadata()  //!< Memory copy MetaData
                         ) const;

@@ -423,36 +423,24 @@ class HostBlitManager : public device::BlitManager {
  const amd::Device& dev_;  //!< Physical device

  // Packed Fill Buffer
-  class FillBufferInfo {
-  public:
-    FillBufferInfo(): fill_size_(0), expanded_pattern_(0), pattern_expanded_(false) {}
+  struct FillBufferInfo {
+    static constexpr uint32_t kExtendedSize = 2 * sizeof(uint64_t);

-    static bool PackInfo(const device::Memory& memory, size_t fill_size,
-                  size_t fill_origin, const void* pattern, size_t pattern_size,
-                  std::vector<FillBufferInfo>& packed_info);
-
-  private:
-    static bool ExpandPattern64(uint64_t pattern, size_t pattern_size, uint64_t& pattern64);
-
-    static inline void ClearBits64(uint64_t& pattern, uint64_t num_bits) {
-      pattern &= ~(~(static_cast<uint64_t>(0)) << num_bits);
+    static void PackInfo(const device::Memory& memory, size_t fill_size,
+                         size_t fill_origin, const void* pattern, size_t pattern_size,
+                         std::vector<FillBufferInfo>& packed_info);

+    FillBufferInfo(size_t fill_size): fill_size_(fill_size), pattern_expanded_(false) {
+      memset(&expanded_pattern_, 0, sizeof(expanded_pattern_));
    }

-    void clearInfo () {
-      fill_size_ = 0;
-      expanded_pattern_ = 0;
-      pattern_expanded_ = false;
-    }
+    void ExpandPattern(uint32_t pattern_size, const void* pattern);

-  public:
-    size_t fill_size_;          // Fill size for this command
-    uint64_t expanded_pattern_; // Pattern for this command
-    bool pattern_expanded_;     // Boolean to check if pattern is expanded
+    size_t fill_size_;                        //!< Fill size for this command
+    uint8_t expanded_pattern_[kExtendedSize]; //!< Pattern for this command - 16 bytes
+    bool pattern_expanded_;                   //!< Boolean to check if pattern is expanded
  };

-
-
 private:
  //! Disable copy constructor
  HostBlitManager(const HostBlitManager&);
@@ -47,16 +47,57 @@ const char* BlitLinearSourceCode = BLIT_KERNELS(

  extern void __ockl_gws_init(uint nwm1, uint rid);

-  // Implementation
-  __kernel void __amd_rocclr_fillBufferAligned(__global uchar* bufUChar,
-                                               __global ushort* bufUShort,
-                                               __global uint* bufUInt,
-                                               __global ulong* bufULong,
-                                               __constant uchar* pattern,
-                                               uint patternSize, ulong offset,
-                                               ulong size) {
-    __amd_fillBufferAligned(bufUChar, bufUShort, bufUInt, bufULong,
-                            pattern, patternSize, offset, size);
+  __kernel void __amd_rocclr_fillBufferAligned(
+      __global uchar* bufUChar, __global ushort* bufUShort, __global uint* bufUInt,
+      __global ulong* bufULong, __global ulong2* bufULong2, __constant uchar* pattern,
+      uint pattern_size, ulong offset, ulong end_ptr, uint next_chunk) {
+    int id = get_global_id(0);
+    long cur_id = offset + id * pattern_size;
+    if (bufULong2) {
+      __global ulong2* element = &bufULong2[cur_id];
+      __constant ulong2* pt = (__constant ulong2*)pattern;
+      while ((ulong)element < end_ptr) {
+        for (uint i = 0; i < pattern_size; ++i) {
+          element[i] = pt[i];
+        }
+        element += next_chunk;
+      }
+    } else if (bufULong) {
+      __global ulong* element = &bufULong[cur_id];
+      __constant ulong* pt = (__constant ulong*)pattern;
+      while ((ulong)element < end_ptr) {
+        for (uint i = 0; i < pattern_size; ++i) {
+          element[i] = pt[i];
+        }
+        element += next_chunk;
+      }
+    } else if (bufUInt) {
+      __global uint* element = &bufUInt[cur_id];
+      __constant uint* pt = (__constant uint*)pattern;
+      while ((ulong)element < end_ptr) {
+        for (uint i = 0; i < pattern_size; ++i) {
+          element[i] = pt[i];
+        }
+        element += next_chunk;
+      }
+    } else if (bufUShort) {
+      __global ushort* element = &bufUShort[cur_id];
+      __constant ushort* pt = (__constant ushort*)pattern;
+      while ((ulong)element < end_ptr) {
+        for (uint i = 0; i < pattern_size; ++i) {
+          element[i] = pt[i];
+        }
+        element += next_chunk;
+      }
+    } else {
+      __global uchar* element = &bufUChar[cur_id];
+      while ((ulong)element < end_ptr) {
+        for (uint i = 0; i < pattern_size; ++i) {
+          element[i] = pattern[i];
+        }
+        element += next_chunk;
+      }
+    }
  }

  __kernel void __amd_rocclr_fillBufferAligned2D(__global uchar* bufUChar,
@@ -2172,63 +2172,96 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
    synchronize();
    return result;
  } else {
-    uint fillType = FillBufferAligned;
-    size_t globalWorkOffset[3] = {0, 0, 0};
-    uint64_t fillSize = size[0] / patternSize;
-    size_t globalWorkSize = amd::alignUp(fillSize, 256);
-    size_t localWorkSize = 256;
-    uint32_t alignment = (patternSize & 0x7) == 0 ?
-                          sizeof(uint64_t) :
-                          (patternSize & 0x3) == 0 ?
-                          sizeof(uint32_t) :
-                          (patternSize & 0x1) == 0 ?
-                          sizeof(uint16_t) : sizeof(uint8_t);
+    // Pack the fill buffer info, that handles unaligned memories.
+    std::vector<FillBufferInfo> packed_vector{};
+    FillBufferInfo::PackInfo(memory, size[0], origin[0], pattern, patternSize, packed_vector);

-    // Program kernels arguments for the fill operation
-    Memory* mem = &gpuMem(memory);
-    if (alignment == sizeof(uint64_t)) {
-      setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
-      setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
-      setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
-      setArgument(kernels_[fillType], 3, sizeof(cl_mem), &mem);
-    } else if (alignment == sizeof(uint32_t)) {
-      setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
-      setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
-      setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem);
-      setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
-    } else if (alignment == sizeof(uint16_t)) {
-      setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
-      setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
-      setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
-      setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
-    } else {
-      setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
-      setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
-      setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
-      setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
+    size_t overall_offset = origin[0];
+    for (auto& packed_obj : packed_vector) {
+      constexpr uint32_t kFillType = FillBufferAligned;
+      uint32_t kpattern_size = (packed_obj.pattern_expanded_)
+          ? HostBlitManager::FillBufferInfo::kExtendedSize
+          : patternSize;
+      size_t kfill_size = packed_obj.fill_size_ / kpattern_size;
+      size_t koffset = overall_offset;
+      overall_offset += packed_obj.fill_size_;
+
+      size_t globalWorkOffset[3] = {0, 0, 0};
+      uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t) :
+                           (kpattern_size & 0x7) == 0 ? sizeof(uint64_t) :
+                           (kpattern_size & 0x3) == 0 ? sizeof(uint32_t) :
+                           (kpattern_size & 0x1) == 0 ? sizeof(uint16_t) : sizeof(uint8_t);
+
+      // Program kernels arguments for the fill operation
+      Memory* mem = &gpuMem(memory);
+      if (alignment == 2 * sizeof(uint64_t)) {
+        setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 4, sizeof(cl_mem), &mem);
+      } else if (alignment == sizeof(uint64_t)) {
+        setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 3, sizeof(cl_mem), &mem);
+        setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
+      } else if (alignment == sizeof(uint32_t)) {
+        setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 2, sizeof(cl_mem), &mem);
+        setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
+      } else if (alignment == sizeof(uint16_t)) {
+        setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 1, sizeof(cl_mem), &mem);
+        setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
+      } else {
+        setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem);
+        setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
+      }
+      const size_t localWorkSize = 256;
+      size_t globalWorkSize =
+          std::min(dev().settings().limit_blit_wg_ * localWorkSize, kfill_size);
+      globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);
+
+      Memory& gpuCB = gpu().xferWrite().Acquire(patternSize);
+      void* constBuf = gpuCB.map(&gpu(), Resource::NoWait);
+      // If pattern has been expanded, use the expanded pattern, otherwise use the default pattern
+      if (packed_obj.pattern_expanded_) {
+        memcpy(constBuf, &packed_obj.expanded_pattern_, kpattern_size);
+      } else {
+        memcpy(constBuf, pattern, kpattern_size);
+      }
+      gpuCB.unmap(&gpu());
+      Memory* pGpuCB = &gpuCB;
+      setArgument(kernels_[kFillType], 5, sizeof(cl_mem), &pGpuCB);
+      uint64_t offset = origin[0];
+
+      // Adjust the pattern size in the copy type size
+      kpattern_size /= alignment;
+      setArgument(kernels_[kFillType], 6, sizeof(uint32_t), &kpattern_size);
+      koffset /= alignment;
+      setArgument(kernels_[kFillType], 7, sizeof(koffset), &koffset);
+      // Calculate max id
+      kfill_size = memory.virtualAddress() + (koffset + kfill_size * kpattern_size) * alignment;
+      setArgument(kernels_[kFillType], 8, sizeof(kfill_size), &kfill_size);
+      uint32_t next_chunk = globalWorkSize * kpattern_size;
+      setArgument(kernels_[kFillType], 9, sizeof(uint32_t), &next_chunk);
+
+      // Create ND range object for the kernel's execution
+      amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);
+
+      // Execute the blit
+      address parameters = kernels_[kFillType]->parameters().values();
+      result = gpu().submitKernelInternal(ndrange, *kernels_[kFillType], parameters);
+      gpu().xferWrite().Release(gpuCB);
    }
-    Memory& gpuCB = gpu().xferWrite().Acquire(patternSize);
-    void* constBuf = gpuCB.map(&gpu(), Resource::NoWait);
-    memcpy(constBuf, pattern, patternSize);
-    gpuCB.unmap(&gpu());
-    Memory* pGpuCB = &gpuCB;
-    setArgument(kernels_[fillType], 4, sizeof(cl_mem), &pGpuCB);
-    uint64_t offset = origin[0];
-
-    patternSize/= alignment;
-    offset /= alignment;
-
-    setArgument(kernels_[fillType], 5, sizeof(uint32_t), &patternSize);
-    setArgument(kernels_[fillType], 6, sizeof(offset), &offset);
-    setArgument(kernels_[fillType], 7, sizeof(fillSize), &fillSize);
-
-    // Create ND range object for the kernel's execution
-    amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);
-
-    // Execute the blit
-    address parameters = kernels_[fillType]->parameters().values();
-    result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters);
-    gpu().xferWrite().Release(gpuCB);
  }

  synchronize();
@@ -148,6 +148,8 @@ Settings::Settings() {
  prepinnedMinSize_ = 0;
  cpDmaCopySizeMax_ = GPU_CP_DMA_COPY_SIZE * Ki;
  useDeviceKernelArg_ = flagIsDefault(HIP_FORCE_DEV_KERNARG) ? false : HIP_FORCE_DEV_KERNARG;
+
+  limit_blit_wg_ = 16;
 }

 bool Settings::create(const Pal::DeviceProperties& palProp,
@@ -288,7 +290,7 @@ bool Settings::create(const Pal::DeviceProperties& palProp,

      // Cap at OpenCL20 for now
      if (oclVersion_ > OpenCL20) oclVersion_ = OpenCL20;
-      
+
      use64BitPtr_ = LP64_SWITCH(false, true);

      if (oclVersion_ >= OpenCL20) {
@@ -435,6 +437,10 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
    prepinnedMinSize_ = PAL_PREPINNED_MEMORY_SIZE * Ki;
  }

+  limit_blit_wg_ = enableWgpMode_
+      ? palProp.gfxipProperties.shaderCore.numAvailableCus / 2
+      : palProp.gfxipProperties.shaderCore.numAvailableCus;
+
  // Override current device settings
  override();

@@ -501,6 +507,10 @@ void Settings::override() {
  if (!flagIsDefault(PAL_ALWAYS_RESIDENT)) {
    alwaysResident_ = PAL_ALWAYS_RESIDENT;
  }
+
+  if (!flagIsDefault(DEBUG_CLR_LIMIT_BLIT_WG)) {
+    limit_blit_wg_ = std::max(DEBUG_CLR_LIMIT_BLIT_WG, 0x1U);
+  }
 }

 }  // namespace pal
@@ -118,6 +118,7 @@ class Settings : public device::Settings {
  amd::LibrarySelector libSelector_;  //!< Select linking libraries for compiler

  size_t prepinnedMinSize_;           //!< minimal memory size for prepinned transfer
+  uint32_t limit_blit_wg_;        //!< The number of workgroups for blit execution

  //! Default constructor
  Settings();
@@ -778,6 +778,9 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) {
  // Add mask array for AmdAqlWrap slots
  allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;

+  // Align size to 64 bytes for more efficient fill operation
+  allocSize = amd::alignUp(allocSize, 8 * sizeof(uint64_t));
+
  virtualQueue_ = new Memory(dev(), allocSize);
  Resource::MemoryType type = (GPU_PRINT_CHILD_KERNEL == 0) ? Resource::Local : Resource::Remote;
  if ((virtualQueue_ == nullptr) || !virtualQueue_->create(type)) {
@@ -2048,9 +2048,10 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
 }

 // ================================================================================================
-bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern, size_t patternSize,
-                                     const amd::Coord3D& surface, const amd::Coord3D& origin,
-                                     const amd::Coord3D& size, bool entire, bool forceBlit) const {
+bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern,
+                                     size_t patternSize, const amd::Coord3D& surface,
+                                     const amd::Coord3D& origin, const amd::Coord3D& size,
+                                     bool entire, bool forceBlit) const {
  amd::ScopedLock k(lockXferOps_);
  bool result = false;

@@ -2063,79 +2064,90 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
    synchronize();
    return result;
  } else {
-
    // Pack the fill buffer info, that handles unaligned memories.
    std::vector<FillBufferInfo> packed_vector{};
    FillBufferInfo::PackInfo(memory, size[0], origin[0], pattern, patternSize, packed_vector);

    size_t overall_offset = origin[0];
    for (auto& packed_obj: packed_vector) {
-      uint fillType = FillBufferAligned;
-
-      uint32_t kpattern_size32 = (packed_obj.pattern_expanded_) ? sizeof(size_t) : patternSize;
-      size_t kfill_size = packed_obj.fill_size_/kpattern_size32;
+      constexpr uint32_t kFillType = FillBufferAligned;
+      uint32_t kpattern_size = (packed_obj.pattern_expanded_) ?
+                                HostBlitManager::FillBufferInfo::kExtendedSize : patternSize;
+      size_t kfill_size = packed_obj.fill_size_ / kpattern_size;
      size_t koffset = overall_offset;
      overall_offset += packed_obj.fill_size_;

      size_t globalWorkOffset[3] = {0, 0, 0};
-      size_t globalWorkSize = amd::alignUp(kfill_size, 256);
-      size_t localWorkSize = 256;
-
-      uint32_t alignment = (kpattern_size32 & 0x7) == 0 ?
-                            sizeof(uint64_t) :
-                            (kpattern_size32 & 0x3) == 0 ?
-                            sizeof(uint32_t) :
-                            (kpattern_size32 & 0x1) == 0 ?
-                            sizeof(uint16_t) : sizeof(uint8_t);
-
+      uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t) :
+                           (kpattern_size & 0x7) == 0 ? sizeof(uint64_t) :
+                           (kpattern_size & 0x3) == 0 ? sizeof(uint32_t) :
+                           (kpattern_size & 0x1) == 0 ? sizeof(uint16_t) : sizeof(uint8_t);
      // Program kernels arguments for the fill operation
      cl_mem mem = as_cl<amd::Memory>(memory.owner());
-      if (alignment == sizeof(uint64_t)) {
-        setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
-        setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
-        setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
-        setArgument(kernels_[fillType], 3, sizeof(cl_mem), &mem);
+      if (alignment == 2 * sizeof(uint64_t)) {
+        setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 4, sizeof(cl_mem), &mem);
+      } else if (alignment == sizeof(uint64_t)) {
+        setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 3, sizeof(cl_mem), &mem);
+        setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
      } else if (alignment == sizeof(uint32_t)) {
-        setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
-        setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
-        setArgument(kernels_[fillType], 2, sizeof(cl_mem), &mem);
-        setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 2, sizeof(cl_mem), &mem);
+        setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
      } else if (alignment == sizeof(uint16_t)) {
-        setArgument(kernels_[fillType], 0, sizeof(cl_mem), nullptr);
-        setArgument(kernels_[fillType], 1, sizeof(cl_mem), &mem);
-        setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
-        setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 0, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 1, sizeof(cl_mem), &mem);
+        setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
      } else {
-        setArgument(kernels_[fillType], 0, sizeof(cl_mem), &mem);
-        setArgument(kernels_[fillType], 1, sizeof(cl_mem), nullptr);
-        setArgument(kernels_[fillType], 2, sizeof(cl_mem), nullptr);
-        setArgument(kernels_[fillType], 3, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem);
+        setArgument(kernels_[kFillType], 1, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 2, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 3, sizeof(cl_mem), nullptr);
+        setArgument(kernels_[kFillType], 4, sizeof(cl_mem), nullptr);
      }
+      const size_t localWorkSize = 256;
+      size_t globalWorkSize =
+          std::min(dev().settings().limit_blit_wg_ * localWorkSize, kfill_size);
+      globalWorkSize = amd::alignUp(globalWorkSize, localWorkSize);

      auto constBuf = gpu().allocKernArg(kCBSize, kCBAlignment);

      // If pattern has been expanded, use the expanded pattern, otherwise use the default pattern.
      if (packed_obj.pattern_expanded_) {
-        memcpy(constBuf, &packed_obj.expanded_pattern_, kpattern_size32);
+        memcpy(constBuf, &packed_obj.expanded_pattern_, kpattern_size);
      } else {
-        memcpy(constBuf, pattern, kpattern_size32);
+        memcpy(constBuf, pattern, kpattern_size);
      }
      constexpr bool kDirectVa = true;
-      setArgument(kernels_[fillType], 4, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);
+      setArgument(kernels_[kFillType], 5, sizeof(cl_mem), constBuf, 0, nullptr, kDirectVa);

+      // Adjust the pattern size in the copy type size
+      kpattern_size /= alignment;
+      setArgument(kernels_[kFillType], 6, sizeof(uint32_t), &kpattern_size);
      koffset /= alignment;
-      kpattern_size32 /= alignment;
-
-      setArgument(kernels_[fillType], 5, sizeof(uint32_t), &kpattern_size32);
-      setArgument(kernels_[fillType], 6, sizeof(koffset), &koffset);
-      setArgument(kernels_[fillType], 7, sizeof(kfill_size), &kfill_size);
+      setArgument(kernels_[kFillType], 7, sizeof(koffset), &koffset);
+      // Calculate max id
+      kfill_size = memory.virtualAddress() + (koffset + kfill_size * kpattern_size) * alignment;
+      setArgument(kernels_[kFillType], 8, sizeof(kfill_size), &kfill_size);
+      uint32_t next_chunk = globalWorkSize * kpattern_size;
+      setArgument(kernels_[kFillType], 9, sizeof(uint32_t), &next_chunk);

      // Create ND range object for the kernel's execution
      amd::NDRangeContainer ndrange(1, globalWorkOffset, &globalWorkSize, &localWorkSize);

      // Execute the blit
-      address parameters = captureArguments(kernels_[fillType]);
-      result = gpu().submitKernelInternal(ndrange, *kernels_[fillType], parameters, nullptr);
+      address parameters = captureArguments(kernels_[kFillType]);
+      result = gpu().submitKernelInternal(ndrange, *kernels_[kFillType], parameters, nullptr);
      releaseArguments(parameters);
    }
  }
@@ -769,7 +769,10 @@ bool Device::create() {
                   pciDeviceId_);
    return false;
  }
-
+  hsaSettings->limit_blit_wg_ = info().maxComputeUnits_;
+  if (!flagIsDefault(DEBUG_CLR_LIMIT_BLIT_WG)) {
+    hsaSettings->limit_blit_wg_ = std::max(DEBUG_CLR_LIMIT_BLIT_WG, 0x1U);
+  }
  amd::Context::Info info = {0};
  std::vector<amd::Device*> devices;
  devices.push_back(this);
@@ -96,6 +96,7 @@ Settings::Settings() {

  host_hdp_flush_ = true;
  gwsInitSupported_ = true;
+  limit_blit_wg_ = 16;
 }

 // ================================================================================================
@@ -76,6 +76,7 @@ class Settings : public device::Settings {
  size_t sdmaCopyThreshold_;  //!< Use SDMA to copy above this size

  uint32_t  hmmFlags_;        //!< HMM functionality control flags
+  uint32_t  limit_blit_wg_;   //!< The number of workgroups for blit execution

  //! Default constructor
  Settings();
@@ -2828,8 +2828,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
  // Add mask array for AmdAqlWrap slots
  allocSize += amd::alignUp(numSlots, DeviceQueueMaskSize) / 8;

-  // Make sure the allocation size aligns with DWORD.
-  allocSize = amd::alignUp(allocSize, sizeof(uint64_t));
+  // Align size to 64 bytes for more efficient fill operation
+  allocSize = amd::alignUp(allocSize, 8 * sizeof(uint64_t));

  // CL_MEM_ALLOC_HOST_PTR/CL_MEM_READ_WRITE
  virtualQueue_ = new (dev().context()) amd::Buffer(dev().context(), CL_MEM_READ_WRITE, allocSize);
@@ -221,6 +221,8 @@ release(uint, ROC_AQL_QUEUE_SIZE, 16384,                                      \
        "AQL queue size in AQL packets")                                      \
 release(uint, ROC_SIGNAL_POOL_SIZE, 32,                                       \
        "Initial size of HSA signal pool")                                    \
+release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16,                                    \
+        "Limit the number of workgroups in blit operations")                  \
 release(bool, ROC_SKIP_KERNEL_ARG_COPY, false,                                \
        "If true, then runtime can skip kernel arg copy")                     \
 release(bool, GPU_STREAMOPS_CP_WAIT, false,                                   \