From 4583cbafee64a80a3eba07b93c73ae741ca92029 Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Tue, 2 Feb 2021 12:22:47 -0500 Subject: [PATCH] SWDEV-270013 - Allocate kernel_arguments from kern_arg & finegrain pool instead of coarse grain. Change-Id: Id4c6977934fdd6ef2311f6e75593801f1e51983c [ROCm/clr commit: 2df099df9e111c53724973a6d1c3ae633a2159aa] --- projects/clr/rocclr/device/device.hpp | 9 +++++- projects/clr/rocclr/device/gpu/gpudevice.cpp | 2 +- projects/clr/rocclr/device/gpu/gpudevice.hpp | 2 +- projects/clr/rocclr/device/pal/paldevice.cpp | 2 +- projects/clr/rocclr/device/pal/paldevice.hpp | 2 +- .../clr/rocclr/device/rocm/roccounters.cpp | 6 ++-- projects/clr/rocclr/device/rocm/rocdevice.cpp | 31 ++++++++++++++----- projects/clr/rocclr/device/rocm/rocdevice.hpp | 3 +- projects/clr/rocclr/device/rocm/rocmemory.cpp | 12 ++++--- .../clr/rocclr/device/rocm/rocvirtual.cpp | 3 +- projects/clr/rocclr/platform/context.cpp | 4 ++- 11 files changed, 54 insertions(+), 22 deletions(-) diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 3389fd3c00..d5550dfe5a 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -1486,6 +1486,12 @@ class Device : public RuntimeObject { kLinkAtomicSupport } LinkAttribute; + typedef enum MemorySegment { + kNoAtomics = 0, + kAtomics = 1, + kKernArg = 2 + } MemorySegment; + typedef std::pair LinkAttrType; static constexpr size_t kP2PStagingSize = 4 * Mi; @@ -1624,7 +1630,8 @@ class Device : public RuntimeObject { /** * @copydoc amd::Context::hostAlloc */ - virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const { + virtual void* hostAlloc(size_t size, size_t alignment, + MemorySegment mem_seg = kNoAtomics) const { ShouldNotCallThis(); return NULL; } diff --git a/projects/clr/rocclr/device/gpu/gpudevice.cpp b/projects/clr/rocclr/device/gpu/gpudevice.cpp index 57f2502dec..e7e3dcc995 100644 --- 
a/projects/clr/rocclr/device/gpu/gpudevice.cpp +++ b/projects/clr/rocclr/device/gpu/gpudevice.cpp @@ -2144,7 +2144,7 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize, hwStateSize); } -void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const { +void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const { // for discrete gpu, we only reserve,no commit yet. return amd::Os::reserveMemory(NULL, size, alignment, amd::Os::MEM_PROT_NONE); } diff --git a/projects/clr/rocclr/device/gpu/gpudevice.hpp b/projects/clr/rocclr/device/gpu/gpudevice.hpp index 91543293aa..1f31368f3d 100644 --- a/projects/clr/rocclr/device/gpu/gpudevice.hpp +++ b/projects/clr/rocclr/device/gpu/gpudevice.hpp @@ -523,7 +523,7 @@ class Device : public NullDevice, public CALGSLDevice { ) const; //! host memory alloc - virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const; + virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const; //! SVM allocation virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp index 7fca6e50c1..c69a923c30 100644 --- a/projects/clr/rocclr/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/device/pal/paldevice.cpp @@ -2112,7 +2112,7 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize, iDev()->CreateSamplerSrds(1, &samplerInfo, hwState); } -void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const { +void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const { // for discrete gpu, we only reserve,no commit yet. 
return amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE); } diff --git a/projects/clr/rocclr/device/pal/paldevice.hpp b/projects/clr/rocclr/device/pal/paldevice.hpp index 4bfa687770..281c6ea5c7 100644 --- a/projects/clr/rocclr/device/pal/paldevice.hpp +++ b/projects/clr/rocclr/device/pal/paldevice.hpp @@ -473,7 +473,7 @@ class Device : public NullDevice { ) const; //! host memory alloc - virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const; + virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const; //! SVM allocation virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, diff --git a/projects/clr/rocclr/device/rocm/roccounters.cpp b/projects/clr/rocclr/device/rocm/roccounters.cpp index 3bad8898af..0cac41ddaa 100644 --- a/projects/clr/rocclr/device/rocm/roccounters.cpp +++ b/projects/clr/rocclr/device/rocm/roccounters.cpp @@ -550,7 +550,8 @@ bool PerfCounterProfile::initialize() { } if (cmd_buf.ptr == nullptr) { - void *buf_ptr = roc_device_.hostAlloc(profile_.command_buffer.size, alignment, 1); + void *buf_ptr = roc_device_.hostAlloc(profile_.command_buffer.size, alignment, + Device::MemorySegment::kAtomics); if (buf_ptr != nullptr) { profile_.command_buffer.ptr = buf_ptr; } @@ -565,7 +566,8 @@ bool PerfCounterProfile::initialize() { } if (out_buf.ptr == nullptr) { - void *buf_ptr = roc_device_.hostAlloc(profile_.output_buffer.size, alignment, 1); + void *buf_ptr = roc_device_.hostAlloc(profile_.output_buffer.size, alignment, + Device::MemorySegment::kAtomics); if (buf_ptr != nullptr) { profile_.output_buffer.ptr = buf_ptr; } diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 4b3302f76c..a0ed5b46f0 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -1845,12 +1845,27 @@ device::Memory* Device::createMemory(amd::Memory& owner) 
const { } // ================================================================================================ -void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const { +void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const { void* ptr = nullptr; - // If runtime disables barrier, then all host allocations must have L2 disabled - const hsa_amd_memory_pool_t segment = (!atomics && settings().barrier_sync_) - ? (system_coarse_segment_.handle != 0) ? system_coarse_segment_ : system_segment_ - : system_segment_; + + hsa_amd_memory_pool_t segment; + switch (mem_seg) { + case kKernArg : + case kNoAtomics : + // If runtime disables barrier, then all host allocations must have L2 disabled + if ((settings().barrier_sync_) && (system_coarse_segment_.handle != 0)) { + segment = system_coarse_segment_; + break; + } + // Falls through on else case. + case kAtomics : + segment = system_segment_; + break; + default : + guarantee(false && "Invalid Memory Segment"); + break; + } + assert(segment.handle != 0); hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr); ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size); @@ -1900,7 +1915,8 @@ void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, bool atomi void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const { void* ptr = nullptr; #ifndef ROCCLR_SUPPORT_NUMA_POLICY - ptr = hostAlloc(size, alignment, atomics); + ptr = hostAlloc(size, alignment, atomics + ? Device::MemorySegment::kAtomics : Device::MemorySegment::kNoAtomics); #else int mode = MPOL_DEFAULT; unsigned long nodeMask = 0; @@ -1930,7 +1946,8 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const { break; default: // All other modes fall back to default mode - ptr = hostAlloc(size, alignment, atomics); + ptr = hostAlloc(size, alignment, atomics + ? 
Device::MemorySegment::kAtomics : Device::MemorySegment::kNoAtomics); } #endif // ROCCLR_SUPPORT_NUMA_POLICY return ptr; diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index b66c6f7f34..f1d50a73eb 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -363,7 +363,8 @@ class Device : public NullDevice { //! Gets free memory on a GPU device virtual bool globalFreeMemory(size_t* freeMemory) const; - virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const; + virtual void* hostAlloc(size_t size, size_t alignment, + MemorySegment mem_seg = MemorySegment::kNoAtomics) const; virtual void hostFree(void* ptr, size_t size = 0) const; diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp index 14b58bb3ea..4aeb47c42f 100644 --- a/projects/clr/rocclr/device/rocm/rocmemory.cpp +++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp @@ -696,7 +696,7 @@ void Buffer::destroy() { // ================================================================================================ bool Buffer::create() { if (owner() == nullptr) { - deviceMemory_ = dev().hostAlloc(size(), 1, false); + deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics); if (deviceMemory_ != nullptr) { flags_ |= HostMemoryDirectAccess; return true; @@ -730,12 +730,14 @@ bool Buffer::create() { // GPU accessible or prefetch memory into GPU dev().SvmAllocInit(deviceMemory_, size()); #else - deviceMemory_ = dev().hostAlloc(size(), 1, false); + deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics); #endif // AMD_HMM_SUPPORT } else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) { deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0); } else { - deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0); + deviceMemory_ = 
dev().hostAlloc(size(), 1, ((memFlags & CL_MEM_SVM_ATOMICS) != 0) + ? Device::MemorySegment::kAtomics + : Device::MemorySegment::kNoAtomics); } } else { assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!"); @@ -810,7 +812,7 @@ bool Buffer::create() { return true; } - deviceMemory_ = dev().hostAlloc(size(), 1, false); + deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics); owner()->setHostMem(deviceMemory_); if ((deviceMemory_ != nullptr) && dev().settings().apuSystem_) { @@ -1102,7 +1104,7 @@ bool Image::create() { } if (originalDeviceMemory_ == nullptr) { - originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, false); + originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, Device::MemorySegment::kNoAtomics); if ((originalDeviceMemory_ != nullptr) && dev().settings().apuSystem_) { const_cast(dev()).updateFreeMemory(alloc_size, false); } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 3efda825ac..e61ef4834d 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -1041,7 +1041,8 @@ bool VirtualGPU::create() { // ================================================================================================ bool VirtualGPU::initPool(size_t kernarg_pool_size) { kernarg_pool_size_ = kernarg_pool_size; - kernarg_pool_base_ = reinterpret_cast(roc_device_.hostAlloc(kernarg_pool_size_, false)); + kernarg_pool_base_ = reinterpret_cast(roc_device_.hostAlloc(kernarg_pool_size_, 0, + Device::MemorySegment::kKernArg)); if (kernarg_pool_base_ == nullptr) { return false; } diff --git a/projects/clr/rocclr/platform/context.cpp b/projects/clr/rocclr/platform/context.cpp index df536108f0..d322f95980 100644 --- a/projects/clr/rocclr/platform/context.cpp +++ b/projects/clr/rocclr/platform/context.cpp @@ -284,7 +284,9 @@ int Context::create(const intptr_t* properties) { void* 
Context::hostAlloc(size_t size, size_t alignment, bool atomics) const { if (customHostAllocDevice_ != NULL) { - return customHostAllocDevice_->hostAlloc(size, alignment, atomics); + return customHostAllocDevice_->hostAlloc(size, alignment, atomics + ? Device::MemorySegment::kAtomics + : Device::MemorySegment::kNoAtomics); } return AlignedMemory::allocate(size, alignment); }