From e295beb8ba1ab54dcd3fdc1c935ca9e644ffe9d4 Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Mon, 5 Dec 2022 13:49:41 -0800 Subject: [PATCH] SWDEV-371904 - Adding pseudo fine grain flag to hsa memory allocation for device fine grained memory. Change-Id: I8cada90f0e3880dfbc5bf5a3fac4554e7a0cb08e [ROCm/clr commit: e56a611b92550f5965110fe77da87ada9ab258d4] --- projects/clr/rocclr/device/rocm/rocdevice.cpp | 8 +++++--- projects/clr/rocclr/device/rocm/rocdevice.hpp | 2 +- projects/clr/rocclr/device/rocm/rocmemory.cpp | 3 ++- projects/clr/rocclr/platform/memory.hpp | 1 + 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 0f27a1b038..16fdd8d3b4 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -2081,8 +2081,10 @@ bool Device::allowPeerAccess(device::Memory* memory) const { return true; } -void* Device::deviceLocalAlloc(size_t size, bool atomics) const { - const hsa_amd_memory_pool_t& pool = (atomics)? gpu_fine_grained_segment_ : gpuvm_segment_; +void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain) const { + const hsa_amd_memory_pool_t& pool = (atomics) ? gpu_fine_grained_segment_ : gpuvm_segment_; + uint32_t hsa_mem_flags = (atomics && pseudo_fine_grain) ? HSA_AMD_MEMORY_POOL_PCIE_FLAG + : HSA_AMD_MEMORY_POOL_STANDARD_FLAG; if (pool.handle == 0 || gpuvm_segment_max_alloc_ == 0) { DevLogPrintfError("Invalid argument, pool_handle: 0x%x , max_alloc: %u \n", @@ -2091,7 +2093,7 @@ void* Device::deviceLocalAlloc(size_t size, bool atomics) const { } void* ptr = nullptr; - hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr); + hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, hsa_mem_flags, &ptr); ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa device memory %p, size 0x%zx", ptr, size); if (stat != HSA_STATUS_SUCCESS) { LogError("Fail allocation local memory"); diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index f2811134e3..b3da3783d1 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -422,7 +422,7 @@ class Device : public NullDevice { bool allowPeerAccess(device::Memory* memory) const; - void* deviceLocalAlloc(size_t size, bool atomics = false) const; + void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain=false) const; void memFree(void* ptr, size_t size) const; diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp index 9d1deb5d44..780cb74885 100644 --- a/projects/clr/rocclr/device/rocm/rocmemory.cpp +++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp @@ -796,7 +796,8 @@ bool Buffer::create(bool alloc_local) { } } else { assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!"); - deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0); + deviceMemory_ = dev().deviceLocalAlloc(size(), (memFlags & CL_MEM_SVM_ATOMICS) != 0, + (memFlags & ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN) != 0); } owner()->setSvmPtr(deviceMemory_); } else { diff --git a/projects/clr/rocclr/platform/memory.hpp b/projects/clr/rocclr/platform/memory.hpp index d1a0a49b63..3edd16cd1e 100644 --- a/projects/clr/rocclr/platform/memory.hpp +++ b/projects/clr/rocclr/platform/memory.hpp @@ -41,6 +41,7 @@ #define ROCCLR_MEM_HSA_SIGNAL_MEMORY (1u << 30) #define ROCCLR_MEM_INTERNAL_MEMORY (1u << 29) #define CL_MEM_VA_RANGE_AMD (1u << 28) +#define ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN (1u << 27) namespace device { class Memory;