From 738bb19835f2738aa48405d6bcd104d062ece0e6 Mon Sep 17 00:00:00 2001 From: SaleelK Date: Sat, 8 Nov 2025 18:32:43 -0800 Subject: [PATCH] clr: Increase kernelArg/managedBuffer size (#1586) * Increase the default kernarg pool size (HSA_KERNARG_POOL_SIZE) from 1 MB to 4 MB. That can help kernel launches that are limited by a deep kernel pipeline. * Increase kPoolNumSignals from 4 to 16. * Align ManagedBuffer::Acquire(size) allocations to globalMemCacheLineSize_ instead of alignUp(256, globalMemCacheLineSize_). * Create the pool signals with HSA_AMD_SIGNAL_AMD_GPU_ONLY. Co-authored-by: JeniferC99 <150404595+JeniferC99@users.noreply.github.com> --- projects/clr/rocclr/device/rocm/rocvirtual.cpp | 5 ++--- projects/clr/rocclr/device/rocm/rocvirtual.hpp | 2 +- projects/clr/rocclr/utils/flags.hpp | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 7567703b8c..5abfd73284 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -1801,7 +1801,7 @@ bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) { } hsa_agent_t agent = gpu_.dev().getBackendDevice(); for (auto& it : pool_signal_) { - if (HSA_STATUS_SUCCESS != Hsa::signal_create(0, 1, &agent, &it)) { + if (HSA_STATUS_SUCCESS != Hsa::signal_create(0, 1, &agent, HSA_AMD_SIGNAL_AMD_GPU_ONLY, &it)) { return false; } } @@ -1810,8 +1810,7 @@ bool VirtualGPU::ManagedBuffer::Create(Device::MemorySegment mem_segment) { // ================================================================================================ address VirtualGPU::ManagedBuffer::Acquire(uint32_t size) { - auto alignment = amd::alignUp(256u, gpu_.dev().info().globalMemCacheLineSize_); - return Acquire(size, alignment); + return Acquire(size, gpu_.dev().info().globalMemCacheLineSize_); } // ================================================================================================ diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp index 7f23fa4a28..ce32d8fae3 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp @@ -196,7 +196,7 @@ 
class VirtualGPU : public device::VirtualDevice { class ManagedBuffer : public amd::EmbeddedObject { public: //! The number of chunks the arg pool will be divided - static constexpr uint32_t kPoolNumSignals = 4; + static constexpr uint32_t kPoolNumSignals = 16; ManagedBuffer(VirtualGPU& gpu, uint32_t pool_size) : gpu_(gpu), pool_size_(pool_size), pool_signal_(kPoolNumSignals) {} ~ManagedBuffer(); diff --git a/projects/clr/rocclr/utils/flags.hpp b/projects/clr/rocclr/utils/flags.hpp index b31b3834e6..9b1179be36 100644 --- a/projects/clr/rocclr/utils/flags.hpp +++ b/projects/clr/rocclr/utils/flags.hpp @@ -108,7 +108,7 @@ release(uint, OPENCL_VERSION, 200, \ "Force GPU opencl version") \ release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \ "Enable HSA device local memory usage") \ -release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \ +release(uint, HSA_KERNARG_POOL_SIZE, 4 * 1024 * 1024, \ "Kernarg pool size") \ release(bool, GPU_MIPMAP, true, \ "Enables GPU mipmap extension") \