SWDEV-270013 - Allocate kernel arguments from the kern_arg & fine-grained pool instead of the coarse-grained pool.
Change-Id: Id4c6977934fdd6ef2311f6e75593801f1e51983c
[ROCm/clr commit: 2df099df9e]
This commit is contained in:
committed by
Karthik Jayaprakash
parent
4da1282882
commit
4583cbafee
@@ -1486,6 +1486,12 @@ class Device : public RuntimeObject {
|
||||
kLinkAtomicSupport
|
||||
} LinkAttribute;
|
||||
|
||||
//! Host memory segment selector taken by hostAlloc() in place of the former bool "atomics" flag
typedef enum MemorySegment {
|
||||
kNoAtomics = 0,  //!< Default for hostAlloc(); callers that used to pass atomics == false pass this
|
||||
kAtomics = 1,  //!< Callers that used to pass atomics == true (or 1) pass this
|
||||
kKernArg = 2  //!< Requested by VirtualGPU::initPool() for the kernel-argument pool
|
||||
} MemorySegment;
|
||||
|
||||
typedef std::pair<LinkAttribute, int32_t /* value */> LinkAttrType;
|
||||
|
||||
static constexpr size_t kP2PStagingSize = 4 * Mi;
|
||||
@@ -1624,7 +1630,8 @@ class Device : public RuntimeObject {
|
||||
/**
|
||||
* @copydoc amd::Context::hostAlloc
|
||||
*/
|
||||
virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const {
|
||||
virtual void* hostAlloc(size_t size, size_t alignment,
|
||||
MemorySegment mem_seg = kNoAtomics) const {
|
||||
ShouldNotCallThis();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -2144,7 +2144,7 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize,
|
||||
hwStateSize);
|
||||
}
|
||||
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, bool MemorySegment mem_seg) const {
|
||||
// for discrete gpu, we only reserve,no commit yet.
|
||||
return amd::Os::reserveMemory(NULL, size, alignment, amd::Os::MEM_PROT_NONE);
|
||||
}
|
||||
|
||||
@@ -523,7 +523,7 @@ class Device : public NullDevice, public CALGSLDevice {
|
||||
) const;
|
||||
|
||||
//! host memory alloc
|
||||
virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
|
||||
virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const;
|
||||
|
||||
//! SVM allocation
|
||||
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
|
||||
|
||||
@@ -2112,7 +2112,7 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize,
|
||||
iDev()->CreateSamplerSrds(1, &samplerInfo, hwState);
|
||||
}
|
||||
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
|
||||
// for discrete gpu, we only reserve,no commit yet.
|
||||
return amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE);
|
||||
}
|
||||
|
||||
@@ -473,7 +473,7 @@ class Device : public NullDevice {
|
||||
) const;
|
||||
|
||||
//! host memory alloc
|
||||
virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
|
||||
virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const;
|
||||
|
||||
//! SVM allocation
|
||||
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
|
||||
|
||||
@@ -550,7 +550,8 @@ bool PerfCounterProfile::initialize() {
|
||||
}
|
||||
|
||||
if (cmd_buf.ptr == nullptr) {
|
||||
void *buf_ptr = roc_device_.hostAlloc(profile_.command_buffer.size, alignment, 1);
|
||||
void *buf_ptr = roc_device_.hostAlloc(profile_.command_buffer.size, alignment,
|
||||
Device::MemorySegment::kAtomics);
|
||||
if (buf_ptr != nullptr) {
|
||||
profile_.command_buffer.ptr = buf_ptr;
|
||||
}
|
||||
@@ -565,7 +566,8 @@ bool PerfCounterProfile::initialize() {
|
||||
}
|
||||
|
||||
if (out_buf.ptr == nullptr) {
|
||||
void *buf_ptr = roc_device_.hostAlloc(profile_.output_buffer.size, alignment, 1);
|
||||
void *buf_ptr = roc_device_.hostAlloc(profile_.output_buffer.size, alignment,
|
||||
Device::MemorySegment::kAtomics);
|
||||
if (buf_ptr != nullptr) {
|
||||
profile_.output_buffer.ptr = buf_ptr;
|
||||
}
|
||||
|
||||
@@ -1845,12 +1845,27 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
|
||||
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
|
||||
void* ptr = nullptr;
|
||||
// If runtime disables barrier, then all host allocations must have L2 disabled
|
||||
const hsa_amd_memory_pool_t segment = (!atomics && settings().barrier_sync_)
|
||||
? (system_coarse_segment_.handle != 0) ? system_coarse_segment_ : system_segment_
|
||||
: system_segment_;
|
||||
|
||||
hsa_amd_memory_pool_t segment;
|
||||
switch (mem_seg) {
|
||||
case kKernArg :
|
||||
case kNoAtomics :
|
||||
// If runtime disables barrier, then all host allocations must have L2 disabled
|
||||
if ((settings().barrier_sync_) && (system_coarse_segment_.handle != 0)) {
|
||||
segment = system_coarse_segment_;
|
||||
break;
|
||||
}
|
||||
// Falls through on else case.
|
||||
case kAtomics :
|
||||
segment = system_segment_;
|
||||
break;
|
||||
default :
|
||||
guarantee(false && "Invalid Memory Segment");
|
||||
break;
|
||||
}
|
||||
|
||||
assert(segment.handle != 0);
|
||||
hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
|
||||
@@ -1900,7 +1915,8 @@ void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, bool atomi
|
||||
void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const {
|
||||
void* ptr = nullptr;
|
||||
#ifndef ROCCLR_SUPPORT_NUMA_POLICY
|
||||
ptr = hostAlloc(size, alignment, atomics);
|
||||
ptr = hostAlloc(size, alignment, atomics
|
||||
? Device::MemorySegment::kAtomics : Device::MemorySegment::kNoAtomics);
|
||||
#else
|
||||
int mode = MPOL_DEFAULT;
|
||||
unsigned long nodeMask = 0;
|
||||
@@ -1930,7 +1946,8 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const {
|
||||
break;
|
||||
default:
|
||||
// All other modes fall back to default mode
|
||||
ptr = hostAlloc(size, alignment, atomics);
|
||||
ptr = hostAlloc(size, alignment, atomics
|
||||
? Device::MemorySegment::kAtomics : Device::MemorySegment::kNoAtomics);
|
||||
}
|
||||
#endif // ROCCLR_SUPPORT_NUMA_POLICY
|
||||
return ptr;
|
||||
|
||||
@@ -363,7 +363,8 @@ class Device : public NullDevice {
|
||||
//! Gets free memory on a GPU device
|
||||
virtual bool globalFreeMemory(size_t* freeMemory) const;
|
||||
|
||||
virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
|
||||
virtual void* hostAlloc(size_t size, size_t alignment,
|
||||
MemorySegment mem_seg = MemorySegment::kNoAtomics) const;
|
||||
|
||||
virtual void hostFree(void* ptr, size_t size = 0) const;
|
||||
|
||||
|
||||
@@ -696,7 +696,7 @@ void Buffer::destroy() {
|
||||
// ================================================================================================
|
||||
bool Buffer::create() {
|
||||
if (owner() == nullptr) {
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, false);
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
|
||||
if (deviceMemory_ != nullptr) {
|
||||
flags_ |= HostMemoryDirectAccess;
|
||||
return true;
|
||||
@@ -730,12 +730,14 @@ bool Buffer::create() {
|
||||
// GPU accessible or prefetch memory into GPU
|
||||
dev().SvmAllocInit(deviceMemory_, size());
|
||||
#else
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, false);
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
|
||||
#endif // AMD_HMM_SUPPORT
|
||||
} else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) {
|
||||
deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0);
|
||||
} else {
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0);
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, ((memFlags & CL_MEM_SVM_ATOMICS) != 0)
|
||||
? Device::MemorySegment::kAtomics
|
||||
: Device::MemorySegment::kNoAtomics);
|
||||
}
|
||||
} else {
|
||||
assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
|
||||
@@ -810,7 +812,7 @@ bool Buffer::create() {
|
||||
return true;
|
||||
}
|
||||
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, false);
|
||||
deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
|
||||
owner()->setHostMem(deviceMemory_);
|
||||
|
||||
if ((deviceMemory_ != nullptr) && dev().settings().apuSystem_) {
|
||||
@@ -1102,7 +1104,7 @@ bool Image::create() {
|
||||
}
|
||||
|
||||
if (originalDeviceMemory_ == nullptr) {
|
||||
originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, false);
|
||||
originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, Device::MemorySegment::kNoAtomics);
|
||||
if ((originalDeviceMemory_ != nullptr) && dev().settings().apuSystem_) {
|
||||
const_cast<Device&>(dev()).updateFreeMemory(alloc_size, false);
|
||||
}
|
||||
|
||||
@@ -1041,7 +1041,8 @@ bool VirtualGPU::create() {
|
||||
// ================================================================================================
|
||||
bool VirtualGPU::initPool(size_t kernarg_pool_size) {
|
||||
kernarg_pool_size_ = kernarg_pool_size;
|
||||
kernarg_pool_base_ = reinterpret_cast<char*>(roc_device_.hostAlloc(kernarg_pool_size_, false));
|
||||
kernarg_pool_base_ = reinterpret_cast<char*>(roc_device_.hostAlloc(kernarg_pool_size_, 0,
|
||||
Device::MemorySegment::kKernArg));
|
||||
if (kernarg_pool_base_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -284,7 +284,9 @@ int Context::create(const intptr_t* properties) {
|
||||
|
||||
void* Context::hostAlloc(size_t size, size_t alignment, bool atomics) const {
|
||||
if (customHostAllocDevice_ != NULL) {
|
||||
return customHostAllocDevice_->hostAlloc(size, alignment, atomics);
|
||||
return customHostAllocDevice_->hostAlloc(size, alignment, atomics
|
||||
? Device::MemorySegment::kAtomics
|
||||
: Device::MemorySegment::kNoAtomics);
|
||||
}
|
||||
return AlignedMemory::allocate(size, alignment);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user