diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp
index 3389fd3c00..d5550dfe5a 100644
--- a/projects/clr/rocclr/device/device.hpp
+++ b/projects/clr/rocclr/device/device.hpp
@@ -1486,6 +1486,12 @@ class Device : public RuntimeObject {
kLinkAtomicSupport
} LinkAttribute;
+ typedef enum MemorySegment {  //!< Host memory segment requested through hostAlloc()
+ kNoAtomics = 0,  //!< Default of hostAlloc(); stands in for the former atomics = false
+ kAtomics = 1,  //!< Stands in for the former atomics = true
+ kKernArg = 2  //!< Requested by VirtualGPU::initPool() for the kernel argument pool
+ } MemorySegment;
+
typedef std::pair LinkAttrType;
static constexpr size_t kP2PStagingSize = 4 * Mi;
@@ -1624,7 +1630,8 @@ class Device : public RuntimeObject {
/**
* @copydoc amd::Context::hostAlloc
*/
- virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const {
+ virtual void* hostAlloc(size_t size, size_t alignment,
+ MemorySegment mem_seg = kNoAtomics) const {  // default mirrors the former atomics = false
ShouldNotCallThis();
return NULL;
}
diff --git a/projects/clr/rocclr/device/gpu/gpudevice.cpp b/projects/clr/rocclr/device/gpu/gpudevice.cpp
index 57f2502dec..e7e3dcc995 100644
--- a/projects/clr/rocclr/device/gpu/gpudevice.cpp
+++ b/projects/clr/rocclr/device/gpu/gpudevice.cpp
@@ -2144,7 +2144,7 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize,
hwStateSize);
}
-void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {  // mem_seg is not used below
// for discrete gpu, we only reserve,no commit yet.
return amd::Os::reserveMemory(NULL, size, alignment, amd::Os::MEM_PROT_NONE);
}
diff --git a/projects/clr/rocclr/device/gpu/gpudevice.hpp b/projects/clr/rocclr/device/gpu/gpudevice.hpp
index 91543293aa..1f31368f3d 100644
--- a/projects/clr/rocclr/device/gpu/gpudevice.hpp
+++ b/projects/clr/rocclr/device/gpu/gpudevice.hpp
@@ -523,7 +523,7 @@ class Device : public NullDevice, public CALGSLDevice {
) const;
//! host memory alloc
- virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
+ virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const;
//! SVM allocation
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp
index 7fca6e50c1..c69a923c30 100644
--- a/projects/clr/rocclr/device/pal/paldevice.cpp
+++ b/projects/clr/rocclr/device/pal/paldevice.cpp
@@ -2112,7 +2112,7 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize,
iDev()->CreateSamplerSrds(1, &samplerInfo, hwState);
}
-void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {  // mem_seg is not used below
// for discrete gpu, we only reserve,no commit yet.
return amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE);
}
diff --git a/projects/clr/rocclr/device/pal/paldevice.hpp b/projects/clr/rocclr/device/pal/paldevice.hpp
index 4bfa687770..281c6ea5c7 100644
--- a/projects/clr/rocclr/device/pal/paldevice.hpp
+++ b/projects/clr/rocclr/device/pal/paldevice.hpp
@@ -473,7 +473,7 @@ class Device : public NullDevice {
) const;
//! host memory alloc
- virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
+ virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const;
//! SVM allocation
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
diff --git a/projects/clr/rocclr/device/rocm/roccounters.cpp b/projects/clr/rocclr/device/rocm/roccounters.cpp
index 3bad8898af..0cac41ddaa 100644
--- a/projects/clr/rocclr/device/rocm/roccounters.cpp
+++ b/projects/clr/rocclr/device/rocm/roccounters.cpp
@@ -550,7 +550,8 @@ bool PerfCounterProfile::initialize() {
}
if (cmd_buf.ptr == nullptr) {
- void *buf_ptr = roc_device_.hostAlloc(profile_.command_buffer.size, alignment, 1);
+ void *buf_ptr = roc_device_.hostAlloc(profile_.command_buffer.size, alignment,
+ Device::MemorySegment::kAtomics);
if (buf_ptr != nullptr) {
profile_.command_buffer.ptr = buf_ptr;
}
@@ -565,7 +566,8 @@ bool PerfCounterProfile::initialize() {
}
if (out_buf.ptr == nullptr) {
- void *buf_ptr = roc_device_.hostAlloc(profile_.output_buffer.size, alignment, 1);
+ void *buf_ptr = roc_device_.hostAlloc(profile_.output_buffer.size, alignment,
+ Device::MemorySegment::kAtomics);
if (buf_ptr != nullptr) {
profile_.output_buffer.ptr = buf_ptr;
}
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 4b3302f76c..a0ed5b46f0 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -1845,12 +1845,27 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
}
// ================================================================================================
-void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
void* ptr = nullptr;
- // If runtime disables barrier, then all host allocations must have L2 disabled
- const hsa_amd_memory_pool_t segment = (!atomics && settings().barrier_sync_)
- ? (system_coarse_segment_.handle != 0) ? system_coarse_segment_ : system_segment_
- : system_segment_;
+ // Start from a null handle so the assert below never reads an indeterminate value
+ hsa_amd_memory_pool_t segment{0};
+ switch (mem_seg) {
+ case kKernArg :  // currently shares the kNoAtomics policy
+ case kNoAtomics :
+ // If runtime disables barrier, then all host allocations must have L2 disabled
+ if ((settings().barrier_sync_) && (system_coarse_segment_.handle != 0)) {
+ segment = system_coarse_segment_;
+ break;
+ }
+ // Falls through on else case.
+ case kAtomics :
+ segment = system_segment_;
+ break;
+ default :
+ guarantee(false && "Invalid Memory Segment");
+ break;
+ }
+
assert(segment.handle != 0);
hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
@@ -1900,7 +1915,8 @@ void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, bool atomi
void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const {
void* ptr = nullptr;
#ifndef ROCCLR_SUPPORT_NUMA_POLICY
- ptr = hostAlloc(size, alignment, atomics);
+ ptr = hostAlloc(size, alignment, atomics
+ ? Device::MemorySegment::kAtomics : Device::MemorySegment::kNoAtomics);
#else
int mode = MPOL_DEFAULT;
unsigned long nodeMask = 0;
@@ -1930,7 +1946,8 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const {
break;
default:
// All other modes fall back to default mode
- ptr = hostAlloc(size, alignment, atomics);
+ ptr = hostAlloc(size, alignment, atomics
+ ? Device::MemorySegment::kAtomics : Device::MemorySegment::kNoAtomics);
}
#endif // ROCCLR_SUPPORT_NUMA_POLICY
return ptr;
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index b66c6f7f34..f1d50a73eb 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -363,7 +363,8 @@ class Device : public NullDevice {
//! Gets free memory on a GPU device
virtual bool globalFreeMemory(size_t* freeMemory) const;
- virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
+ virtual void* hostAlloc(size_t size, size_t alignment,
+ MemorySegment mem_seg = MemorySegment::kNoAtomics) const;
virtual void hostFree(void* ptr, size_t size = 0) const;
diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp
index 14b58bb3ea..4aeb47c42f 100644
--- a/projects/clr/rocclr/device/rocm/rocmemory.cpp
+++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp
@@ -696,7 +696,7 @@ void Buffer::destroy() {
// ================================================================================================
bool Buffer::create() {
if (owner() == nullptr) {
- deviceMemory_ = dev().hostAlloc(size(), 1, false);
+ deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
if (deviceMemory_ != nullptr) {
flags_ |= HostMemoryDirectAccess;
return true;
@@ -730,12 +730,14 @@ bool Buffer::create() {
// GPU accessible or prefetch memory into GPU
dev().SvmAllocInit(deviceMemory_, size());
#else
- deviceMemory_ = dev().hostAlloc(size(), 1, false);
+ deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
#endif // AMD_HMM_SUPPORT
} else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) {
deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0);
} else {
- deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0);
+ deviceMemory_ = dev().hostAlloc(size(), 1, ((memFlags & CL_MEM_SVM_ATOMICS) != 0)
+ ? Device::MemorySegment::kAtomics
+ : Device::MemorySegment::kNoAtomics);
}
} else {
assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
@@ -810,7 +812,7 @@ bool Buffer::create() {
return true;
}
- deviceMemory_ = dev().hostAlloc(size(), 1, false);
+ deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
owner()->setHostMem(deviceMemory_);
if ((deviceMemory_ != nullptr) && dev().settings().apuSystem_) {
@@ -1102,7 +1104,7 @@ bool Image::create() {
}
if (originalDeviceMemory_ == nullptr) {
- originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, false);
+ originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, Device::MemorySegment::kNoAtomics);
if ((originalDeviceMemory_ != nullptr) && dev().settings().apuSystem_) {
const_cast(dev()).updateFreeMemory(alloc_size, false);
}
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 3efda825ac..e61ef4834d 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -1041,7 +1041,8 @@ bool VirtualGPU::create() {
// ================================================================================================
bool VirtualGPU::initPool(size_t kernarg_pool_size) {
kernarg_pool_size_ = kernarg_pool_size;
- kernarg_pool_base_ = reinterpret_cast(roc_device_.hostAlloc(kernarg_pool_size_, false));
+ kernarg_pool_base_ = reinterpret_cast(roc_device_.hostAlloc(kernarg_pool_size_, 0,
+ Device::MemorySegment::kKernArg));
if (kernarg_pool_base_ == nullptr) {
return false;
}
diff --git a/projects/clr/rocclr/platform/context.cpp b/projects/clr/rocclr/platform/context.cpp
index df536108f0..d322f95980 100644
--- a/projects/clr/rocclr/platform/context.cpp
+++ b/projects/clr/rocclr/platform/context.cpp
@@ -284,7 +284,9 @@ int Context::create(const intptr_t* properties) {
void* Context::hostAlloc(size_t size, size_t alignment, bool atomics) const {
if (customHostAllocDevice_ != NULL) {
- return customHostAllocDevice_->hostAlloc(size, alignment, atomics);
+ return customHostAllocDevice_->hostAlloc(size, alignment, atomics
+ ? Device::MemorySegment::kAtomics  // caller asked for atomics
+ : Device::MemorySegment::kNoAtomics);  // otherwise the default segment
}
return AlignedMemory::allocate(size, alignment);
}