P4 to Git Change 1533784 by vsytchen@vsytchen-ocl-win10 on 2018/03/28 11:22:25

SWDEV-133818 - PAL support for Linux Pro: Coarse Grain SVM for OpenCL 2.0

	This change enables Fine/Coarse Grain Buffer SVM suballocations for PAL devices

	ReviewBoardURL = http://ocltc.amd.com/reviews/r/14486/diff/

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#57 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#20 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/context.cpp#48 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/context.hpp#28 edit


[ROCm/clr commit: 3d15c543a0]
Этот коммит содержится в:
foreman
2018-03-28 11:35:51 -04:00
родитель 8edced52b8
Коммит 9bcdb4aa73
4 изменённых файлов: 184 добавлений и 80 удалений
+138 -71
Просмотреть файл
@@ -440,7 +440,7 @@ bool Resource::CreateImage(CreateParams* params)
memTypeToHeap(&createInfo);
// createInfo.priority;
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
createInfo.alignment, &subOffset_);
createInfo.alignment, nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
@@ -590,7 +590,7 @@ bool Resource::CreateImage(CreateParams* params)
memTypeToHeap(&createInfo);
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
createInfo.alignment, &subOffset_);
createInfo.alignment, nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
@@ -982,14 +982,12 @@ bool Resource::CreatePinned(CreateParams* params)
bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
{
const bool isFineGrain = (memoryType() == RemoteUSWC) || (memoryType() == Remote);
const Pal::gpusize svmAlignment = isFineGrain ? MaxGpuAlignment :
dev().properties().gpuMemoryProperties.fragmentSize;
size_t allocSize = amd::alignUp(desc().width_ * elementSize_, svmAlignment);
size_t allocSize = amd::alignUp(desc().width_ * elementSize_, MaxGpuAlignment);
if (isFineGrain) {
Pal::SvmGpuMemoryCreateInfo createInfo = {};
createInfo.isUsedForKernel = desc_.isAllocExecute_;
createInfo.size = allocSize;
createInfo.alignment = svmAlignment;
createInfo.alignment = MaxGpuAlignment;
if (svmPtr != 0) {
createInfo.flags.useReservedGpuVa = true;
createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
@@ -998,12 +996,18 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
createInfo.flags.useReservedGpuVa = false;
createInfo.pReservedGpuVaOwner = nullptr;
}
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (!dev().settings().svmFineGrainSystem_) {
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
}
if (memRef_ == nullptr) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
}
}
else {
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = allocSize;
createInfo.alignment = svmAlignment;
createInfo.alignment = MaxGpuAlignment;
createInfo.vaRange = Pal::VaRange::Svm;
createInfo.priority = Pal::GpuMemPriority::Normal;
if (svmPtr != 0) {
@@ -1011,7 +1015,12 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
}
memTypeToHeap(&createInfo);
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
if (memRef_ == nullptr) {
createInfo.alignment = dev().properties().gpuMemoryProperties.fragmentSize;
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
}
}
if (nullptr == memRef_) {
LogError("Failed PAL memory allocation!");
@@ -1020,7 +1029,9 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
desc_.cardMemory_ = false;
if ((nullptr != params) && (nullptr != params->owner_) &&
(nullptr != params->owner_->getSvmPtr())) {
params->owner_->setSvmPtr(reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr));
params->owner_->setSvmPtr(
reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_));
offset_ += static_cast<size_t>(subOffset_);
}
return true;
}
@@ -1138,7 +1149,7 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
memTypeToHeap(&createInfo);
// createInfo.priority;
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
createInfo.alignment, &subOffset_);
createInfo.alignment, nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
@@ -1198,14 +1209,16 @@ void Resource::free()
unmap(nullptr);
} else {
// Delay CPU address unmap until memRef_ destruction
assert(memRef_->cpuAddress_ == nullptr && "Memref shouldn't have a valid CPU address");
memRef_->cpuAddress_ = address_;
if (!desc_.SVMRes_) {
assert(memRef_->cpuAddress_ == nullptr && "Memref shouldn't have a valid CPU address");
memRef_->cpuAddress_ = address_;
}
}
}
// Add resource to the cache
if (!dev().resourceCache().addGpuMemory(&desc_, memRef_, subOffset_)) {
palFree();
palFree();
}
}
@@ -1717,7 +1730,12 @@ void* Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers
address_ = mapLayers(gpu, flags);
} else {
// Map current resource
address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem());
if (memRef_->cpuAddress_ != nullptr) {
// Suballocations are mapped by the memory suballocator
address_ = reinterpret_cast<uint8_t*>(memRef_->cpuAddress_) + subOffset_;
} else {
address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem());
}
if (address_ == nullptr) {
LogError("cal::ResMap failed!");
--mapCount_;
@@ -1777,68 +1795,109 @@ void Resource::unmapLayers(VirtualGPU* gpu) {
Unimplemented();
}
// ================================================================================================
bool MemorySubAllocator::InitAllocator(GpuMemoryReference* mem_ref) {
MemBuddyAllocator* allocator = new MemBuddyAllocator(
device_, device_->settings().subAllocationChunkSize_,
device_->settings().subAllocationMinSize_);
if ((allocator != nullptr) && (allocator->Init() == Pal::Result::Success)) {
heaps_.insert({mem_ref, allocator});
return true;
} else {
delete allocator;
return false;
}
return false;
}
// ================================================================================================
bool MemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = device_->settings().subAllocationChunkSize_;
createInfo.alignment = 0;
createInfo.vaRange = Pal::VaRange::Default;
createInfo.priority = Pal::GpuMemPriority::Normal;
createInfo.heapCount = 1;
createInfo.heaps[0] = Pal::GpuHeapInvisible;
GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo);
if (mem_ref != nullptr) {
return InitAllocator(mem_ref);
}
return false;
}
// ================================================================================================
bool CoarseMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = device_->settings().subAllocationChunkSize_;
createInfo.alignment = device_->properties().gpuMemoryProperties.fragmentSize;
createInfo.vaRange = Pal::VaRange::Svm;
createInfo.priority = Pal::GpuMemPriority::Normal;
createInfo.flags.useReservedGpuVa = (reserved_va != nullptr);
createInfo.pReservedGpuVaOwner = reserved_va;
createInfo.heapCount = 2;
createInfo.heaps[0] = Pal::GpuHeapInvisible;
createInfo.heaps[1] = Pal::GpuHeapLocal;
GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo);
if (mem_ref != nullptr) {
return InitAllocator(mem_ref);
}
return false;
}
// ================================================================================================
bool FineMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
Pal::SvmGpuMemoryCreateInfo createInfo = {};
createInfo.isUsedForKernel = false;
createInfo.size = device_->settings().subAllocationChunkSize_;
createInfo.alignment = MaxGpuAlignment;
createInfo.flags.useReservedGpuVa = (reserved_va != nullptr);
createInfo.pReservedGpuVaOwner = reserved_va;
GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo);
if ((mem_ref != nullptr) && InitAllocator(mem_ref)) {
mem_ref->iMem()->Map(&mem_ref->cpuAddress_);
return mem_ref->cpuAddress_ != nullptr;
}
return false;
}
// ================================================================================================
MemorySubAllocator::~MemorySubAllocator()
{
// Release memory heap for suballocations
for (auto it : mem_heap_) {
for (auto it : heaps_) {
it.first->release();
delete it.second;
}
}
// ================================================================================================
GpuMemoryReference* MemorySubAllocator::Allocate(
Pal::gpusize size, Pal::gpusize alignment, Pal::gpusize* offset)
GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize alignment,
const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset)
{
GpuMemoryReference* mem_ref = nullptr;
MemBuddyAllocator* allocator = nullptr;
// Check if resource size is allowed for suballocation
if (size < device_->settings().subAllocationMaxSize_) {
uint i = 0;
size = amd::alignUp(size, device_->settings().subAllocationMinSize_);
do {
MemBuddyAllocator* allocator = nullptr;
// Find if current heap has enough empty space
for (auto it : mem_heap_) {
for (auto it : heaps_) {
mem_ref = it.first;
allocator = it.second;
// SVM allocations may required a fixed VA, make sure we find the heap with the same VA
if (reserved_va &&
(reserved_va->Desc().gpuVirtAddr != mem_ref->iMem()->Desc().gpuVirtAddr)) {
continue;
}
// If we have found a valid chunk, then suballocate memory
if (Pal::Result::Success == allocator->Allocate(size, alignment, offset)) {
return mem_ref;
} else {
mem_ref = nullptr;
}
}
// Check if a chunk for suballocation doesn't exist
if (mem_ref == nullptr) {
// Allocate a new chunk in memory
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = device_->settings().subAllocationChunkSize_;
createInfo.alignment = 0;
createInfo.vaRange = Pal::VaRange::Default;
createInfo.priority = Pal::GpuMemPriority::Normal;
createInfo.heapCount = 1;
createInfo.heaps[0] = Pal::GpuHeapInvisible;
mem_ref = GpuMemoryReference::Create(*device_, createInfo);
// If chunk was allocated, then allocate BuddyAllocator object
if (mem_ref != nullptr) {
allocator = new MemBuddyAllocator(device_,
device_->settings().subAllocationChunkSize_,
device_->settings().subAllocationMinSize_);
if ((allocator != nullptr) &&
(Pal::Result::Success == allocator->Init())) {
// Add the chunk and suballocator into the heap
mem_heap_.insert(std::pair<GpuMemoryReference*, MemBuddyAllocator*>(
mem_ref, allocator));
} else {
delete allocator;
mem_ref->release();
return nullptr;
}
} else {
if ((mem_ref == nullptr) && !CreateChunk(reserved_va)) {
return nullptr;
}
}
i++;
} while (i < 2);
@@ -1849,24 +1908,24 @@ GpuMemoryReference* MemorySubAllocator::Allocate(
// ================================================================================================
bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset)
{
bool releaseMem = false;
bool release_mem = false;
{
amd::ScopedLock l(monitor);
// Find if current memory reference is a chunk allocation
auto it = mem_heap_.find(ref);
if (it == mem_heap_.end()) {
auto it = heaps_.find(ref);
if (it == heaps_.end()) {
return false;
}
// Free suballocation at the specified offset
it->second->Free(offset);
// If this suballocator empty, then release memory chunk
if (it->second->IsEmpty()) {
delete it->second;
mem_heap_.erase(it);
releaseMem = true;
heaps_.erase(it);
release_mem = true;
}
}
if (releaseMem) {
if (release_mem) {
ref->release();
}
return true;
@@ -1883,11 +1942,13 @@ bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,
bool result = false;
size_t size = ref->iMem()->Desc().size;
if (desc->type_ == Resource::Local) {
// Check if runtime can free suballocation in local memory
if (memSubAllocLocal_.Free(&lockCacheOps_, ref, offset)) {
return true;
}
// Check if runtime can free suballocation
if ((desc->type_ == Resource::Local) && !desc->SVMRes_) {
return mem_sub_alloc_local_.Free(&lockCacheOps_, ref, offset);
} else if ((desc->type_ == Resource::Local) && desc->SVMRes_) {
return mem_sub_alloc_coarse_.Free(&lockCacheOps_, ref, offset);
} else if (desc->SVMRes_) {
return mem_sub_alloc_fine_.Free(&lockCacheOps_, ref, offset);
}
// Make sure current allocation isn't bigger than cache
@@ -1918,23 +1979,29 @@ bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,
// ================================================================================================
GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal::gpusize size,
Pal::gpusize alignment, Pal::gpusize* offset) {
Pal::gpusize alignment, const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset) {
amd::ScopedLock l(&lockCacheOps_);
GpuMemoryReference* ref = nullptr;
// Check if the runtime can suballocate memory
if ((desc->type_ == Resource::Local) && !desc->SVMRes_) {
ref = mem_sub_alloc_local_.Allocate(size, alignment, reserved_va, offset);
} else if ((desc->type_ == Resource::Local) && desc->SVMRes_) {
ref = mem_sub_alloc_coarse_.Allocate(size, alignment, reserved_va, offset);
} else if (desc->SVMRes_) {
ref = mem_sub_alloc_fine_.Allocate(size, alignment, reserved_va, offset);
}
if (ref != nullptr) {
return ref;
}
// Early exit if resource is too big
if (size >= cacheSizeLimit_ || desc->SVMRes_) {
//! \note we may need to free the cache here to reduce memory pressure
return ref;
}
if (desc->type_ == Resource::Local) {
ref = memSubAllocLocal_.Allocate(size, alignment, offset);
if (ref != nullptr) {
return ref;
}
}
// Serach the right resource through the cache list
for (const auto& it : resCache_) {
Resource::Descriptor* entry = it.first;
+43 -8
Просмотреть файл
@@ -8,6 +8,8 @@
#include "device/pal/paldefs.hpp"
#include "util/palBuddyAllocatorImpl.h"
#include <unordered_map>
//! \namespace pal PAL Resource Implementation
namespace pal {
@@ -459,13 +461,39 @@ public:
~MemorySubAllocator();
GpuMemoryReference* Allocate(Pal::gpusize size,
Pal::gpusize alignment, Pal::gpusize* offset);
bool Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset);
//! Create suballocation
GpuMemoryReference* Allocate(Pal::gpusize size,
Pal::gpusize alignment,
const Pal::IGpuMemory* reserved_va,
Pal::gpusize* offset
);
//! Free suballocation
bool Free(amd::Monitor* monitor,
GpuMemoryReference* mem_ref,
Pal::gpusize offset
);
protected:
//! Allocate new chunk of memory
virtual bool CreateChunk(const Pal::IGpuMemory* reserved_va);
bool InitAllocator(GpuMemoryReference* mem_ref);
private:
Device* device_;
std::map<GpuMemoryReference*, MemBuddyAllocator*> mem_heap_;
std::unordered_map<GpuMemoryReference*, MemBuddyAllocator*> heaps_;
};
class CoarseMemorySubAllocator : public MemorySubAllocator {
public:
CoarseMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}
bool CreateChunk(const Pal::IGpuMemory* reservedVa) override;
};
class FineMemorySubAllocator : public MemorySubAllocator {
public:
FineMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}
bool CreateChunk(const Pal::IGpuMemory* reserved_va) override;
};
class ResourceCache : public amd::HeapObject {
@@ -475,7 +503,9 @@ class ResourceCache : public amd::HeapObject {
: lockCacheOps_("PAL resource cache", true)
, cacheSize_(0)
, cacheSizeLimit_(cacheSizeLimit)
, memSubAllocLocal_(device) {}
, mem_sub_alloc_local_(device)
, mem_sub_alloc_coarse_ (device)
, mem_sub_alloc_fine_ (device) {}
//! Default destructor
~ResourceCache();
@@ -489,7 +519,10 @@ class ResourceCache : public amd::HeapObject {
//! Finds a PAL resource from the cache
GpuMemoryReference* findGpuMemory(
Resource::Descriptor* desc, //!< Resource descriptor - cache key
Pal::gpusize size, Pal::gpusize alignment, Pal::gpusize* offset);
Pal::gpusize size,
Pal::gpusize alignment,
const Pal::IGpuMemory* reserved_va, //!< Reserved VA for SVM suballocations
Pal::gpusize* offset);
//! Destroys cache
void free(size_t minCacheEntries = 0);
@@ -512,7 +545,9 @@ class ResourceCache : public amd::HeapObject {
//! PAL resource cache
std::list<std::pair<Resource::Descriptor*, GpuMemoryReference*> > resCache_;
MemorySubAllocator memSubAllocLocal_; //!< Allocator for suballocations in Local
MemorySubAllocator mem_sub_alloc_local_; //!< Allocator for suballocations in Local
CoarseMemorySubAllocator mem_sub_alloc_coarse_; //!< Allocator for suballocations in Coarse SVM
FineMemorySubAllocator mem_sub_alloc_fine_; //!< Allocator for suballocations in Fine SVM
};
/*@}*/} // namespace pal
+2
Просмотреть файл
@@ -296,6 +296,7 @@ void* Context::svmAlloc(size_t size, size_t alignment, cl_svm_mem_flags flags) {
void* svmPtrAlloced = NULL;
void* tempPtr = NULL;
amd::ScopedLock lock(&ctxLock_);
for (const auto& dev : svmAllocDevice_) {
if (dev->type() == CL_DEVICE_TYPE_GPU) {
// check if the device support svm platform atomics,
@@ -320,6 +321,7 @@ void Context::svmFree(void* ptr) const {
return;
}
amd::ScopedLock lock(&ctxLock_);
for (const auto& dev : svmAllocDevice_) {
if (dev->type() == CL_DEVICE_TYPE_GPU) {
dev->svmFree(ptr);
+1 -1
Просмотреть файл
@@ -198,7 +198,7 @@ class Context : public RuntimeObject {
Device* customHostAllocDevice_; //!< Device responsible for host allocations
std::vector<Device*> svmAllocDevice_; //!< Devices can support SVM allocations
std::map<const Device*, DeviceQueueInfo> deviceQueues_; //!< Device queues mapping
Monitor ctxLock_; //!< Lock for the context access
mutable Monitor ctxLock_; //!< Lock for the context access
};
/*! @}