SWDEV-270013 - Allocate kernel_arguments from kern_arg & finegrain pool instead of coarse grain.

Change-Id: Id4c6977934fdd6ef2311f6e75593801f1e51983c


[ROCm/clr commit: 2df099df9e]
This commit is contained in:
kjayapra-amd
2021-02-02 12:22:47 -05:00
committed by Karthik Jayaprakash
parent 4da1282882
commit 4583cbafee
11 changed files with 54 additions and 22 deletions
+8 -1
View File
@@ -1486,6 +1486,12 @@ class Device : public RuntimeObject {
kLinkAtomicSupport
} LinkAttribute;
//! Selector handed to hostAlloc() in place of the former boolean "atomics" flag
//! (false maps to kNoAtomics, true maps to kAtomics).
enum MemorySegment {
  kNoAtomics = 0,  //!< passed when atomics are not requested; the hostAlloc() default
  kAtomics = 1,    //!< passed when atomics are requested
  kKernArg = 2     //!< passed when allocating the kernel-argument pool
};
typedef std::pair<LinkAttribute, int32_t /* value */> LinkAttrType;
static constexpr size_t kP2PStagingSize = 4 * Mi;
@@ -1624,7 +1630,8 @@ class Device : public RuntimeObject {
/**
* @copydoc amd::Context::hostAlloc
*/
virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const {
// Base-class placeholder: reports the call through ShouldNotCallThis() and
// hands back a null pointer. None of size, alignment or mem_seg is used here.
virtual void* hostAlloc(size_t size, size_t alignment,
                        MemorySegment mem_seg = kNoAtomics) const {
  ShouldNotCallThis();
  return nullptr;
}
+1 -1
View File
@@ -2144,7 +2144,7 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize,
hwStateSize);
}
void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
void* Device::hostAlloc(size_t size, size_t alignment, bool MemorySegment mem_seg) const {
// for discrete gpu, we only reserve,no commit yet.
return amd::Os::reserveMemory(NULL, size, alignment, amd::Os::MEM_PROT_NONE);
}
+1 -1
View File
@@ -523,7 +523,7 @@ class Device : public NullDevice, public CALGSLDevice {
) const;
//! host memory alloc
virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
//! host memory alloc; mem_seg (default kNoAtomics) takes the place of the
//! former `bool atomics = false` parameter
virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const;
//! SVM allocation
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
+1 -1
View File
@@ -2112,7 +2112,7 @@ void Device::fillHwSampler(uint32_t state, void* hwState, uint32_t hwStateSize,
iDev()->CreateSamplerSrds(1, &samplerInfo, hwState);
}
void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
// Host allocation entry point for this device; mem_seg does not influence
// the result in this implementation.
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
  // Discrete GPU: the range is only reserved at this point, nothing is committed yet.
  void* const reserved =
      amd::Os::reserveMemory(nullptr, size, alignment, amd::Os::MEM_PROT_NONE);
  return reserved;
}
+1 -1
View File
@@ -473,7 +473,7 @@ class Device : public NullDevice {
) const;
//! host memory alloc
virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
//! host memory alloc; mem_seg (default kNoAtomics) takes the place of the
//! former `bool atomics = false` parameter
virtual void* hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg = kNoAtomics) const;
//! SVM allocation
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
@@ -550,7 +550,8 @@ bool PerfCounterProfile::initialize() {
}
if (cmd_buf.ptr == nullptr) {
void *buf_ptr = roc_device_.hostAlloc(profile_.command_buffer.size, alignment, 1);
void *buf_ptr = roc_device_.hostAlloc(profile_.command_buffer.size, alignment,
Device::MemorySegment::kAtomics);
if (buf_ptr != nullptr) {
profile_.command_buffer.ptr = buf_ptr;
}
@@ -565,7 +566,8 @@ bool PerfCounterProfile::initialize() {
}
if (out_buf.ptr == nullptr) {
void *buf_ptr = roc_device_.hostAlloc(profile_.output_buffer.size, alignment, 1);
void *buf_ptr = roc_device_.hostAlloc(profile_.output_buffer.size, alignment,
Device::MemorySegment::kAtomics);
if (buf_ptr != nullptr) {
profile_.output_buffer.ptr = buf_ptr;
}
+24 -7
View File
@@ -1845,12 +1845,27 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
}
// ================================================================================================
void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
void* ptr = nullptr;
// If runtime disables barrier, then all host allocations must have L2 disabled
const hsa_amd_memory_pool_t segment = (!atomics && settings().barrier_sync_)
? (system_coarse_segment_.handle != 0) ? system_coarse_segment_ : system_segment_
: system_segment_;
hsa_amd_memory_pool_t segment;
switch (mem_seg) {
case kKernArg :
case kNoAtomics :
// If runtime disables barrier, then all host allocations must have L2 disabled
if ((settings().barrier_sync_) && (system_coarse_segment_.handle != 0)) {
segment = system_coarse_segment_;
break;
}
// Falls through on else case.
case kAtomics :
segment = system_segment_;
break;
default :
guarantee(false && "Invalid Memory Segment");
break;
}
assert(segment.handle != 0);
hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
@@ -1900,7 +1915,8 @@ void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, bool atomi
void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const {
void* ptr = nullptr;
#ifndef ROCCLR_SUPPORT_NUMA_POLICY
ptr = hostAlloc(size, alignment, atomics);
ptr = hostAlloc(size, alignment, atomics
? Device::MemorySegment::kAtomics : Device::MemorySegment::kNoAtomics);
#else
int mode = MPOL_DEFAULT;
unsigned long nodeMask = 0;
@@ -1930,7 +1946,8 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const {
break;
default:
// All other modes fall back to default mode
ptr = hostAlloc(size, alignment, atomics);
ptr = hostAlloc(size, alignment, atomics
? Device::MemorySegment::kAtomics : Device::MemorySegment::kNoAtomics);
}
#endif // ROCCLR_SUPPORT_NUMA_POLICY
return ptr;
@@ -363,7 +363,8 @@ class Device : public NullDevice {
//! Gets free memory on a GPU device
virtual bool globalFreeMemory(size_t* freeMemory) const;
virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
//! Host memory allocation. mem_seg (default MemorySegment::kNoAtomics) takes
//! the place of the former `bool atomics = false` parameter.
virtual void* hostAlloc(size_t size, size_t alignment,
MemorySegment mem_seg = MemorySegment::kNoAtomics) const;
virtual void hostFree(void* ptr, size_t size = 0) const;
@@ -696,7 +696,7 @@ void Buffer::destroy() {
// ================================================================================================
bool Buffer::create() {
if (owner() == nullptr) {
deviceMemory_ = dev().hostAlloc(size(), 1, false);
deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
if (deviceMemory_ != nullptr) {
flags_ |= HostMemoryDirectAccess;
return true;
@@ -730,12 +730,14 @@ bool Buffer::create() {
// GPU accessible or prefetch memory into GPU
dev().SvmAllocInit(deviceMemory_, size());
#else
deviceMemory_ = dev().hostAlloc(size(), 1, false);
deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
#endif // AMD_HMM_SUPPORT
} else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) {
deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0);
} else {
deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0);
deviceMemory_ = dev().hostAlloc(size(), 1, ((memFlags & CL_MEM_SVM_ATOMICS) != 0)
? Device::MemorySegment::kAtomics
: Device::MemorySegment::kNoAtomics);
}
} else {
assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
@@ -810,7 +812,7 @@ bool Buffer::create() {
return true;
}
deviceMemory_ = dev().hostAlloc(size(), 1, false);
deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
owner()->setHostMem(deviceMemory_);
if ((deviceMemory_ != nullptr) && dev().settings().apuSystem_) {
@@ -1102,7 +1104,7 @@ bool Image::create() {
}
if (originalDeviceMemory_ == nullptr) {
originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, false);
originalDeviceMemory_ = dev().hostAlloc(alloc_size, 1, Device::MemorySegment::kNoAtomics);
if ((originalDeviceMemory_ != nullptr) && dev().settings().apuSystem_) {
const_cast<Device&>(dev()).updateFreeMemory(alloc_size, false);
}
@@ -1041,7 +1041,8 @@ bool VirtualGPU::create() {
// ================================================================================================
bool VirtualGPU::initPool(size_t kernarg_pool_size) {
kernarg_pool_size_ = kernarg_pool_size;
kernarg_pool_base_ = reinterpret_cast<char*>(roc_device_.hostAlloc(kernarg_pool_size_, false));
kernarg_pool_base_ = reinterpret_cast<char*>(roc_device_.hostAlloc(kernarg_pool_size_, 0,
Device::MemorySegment::kKernArg));
if (kernarg_pool_base_ == nullptr) {
return false;
}
+3 -1
View File
@@ -284,7 +284,9 @@ int Context::create(const intptr_t* properties) {
// Allocates host memory for this context. If a custom host-alloc device is
// set, the request is forwarded to that device's hostAlloc(); otherwise it
// is served by AlignedMemory::allocate(size, alignment).
void* Context::hostAlloc(size_t size, size_t alignment, bool atomics) const {
if (customHostAllocDevice_ != NULL) {
// NOTE(review): the diff markers are missing in this view. Going by the
// "+3 -1" hunk header, the single-line return directly below is the removed
// pre-change call, and the three-line return after it is its replacement,
// which maps the bool onto Device::MemorySegment (true -> kAtomics,
// false -> kNoAtomics).
return customHostAllocDevice_->hostAlloc(size, alignment, atomics);
return customHostAllocDevice_->hostAlloc(size, alignment, atomics
? Device::MemorySegment::kAtomics
: Device::MemorySegment::kNoAtomics);
}
// No custom device: plain aligned host allocation.
return AlignedMemory::allocate(size, alignment);
}