From 9bcdb4aa73d7a509e1cd32b15029c73fcf4c4d49 Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 28 Mar 2018 11:35:51 -0400
Subject: [PATCH] P4 to Git Change 1533784 by vsytchen@vsytchen-ocl-win10 on
2018/03/28 11:22:25
SWDEV-133818 - PAL support for Linux Pro: Coarse Grain SVM for OpenCL 2.0
This change enables Fine/Coarse Grain Buffer SVM suballocations for PAL devices
ReviewBoardURL = http://ocltc.amd.com/reviews/r/14486/diff/
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#57 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#20 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/context.cpp#48 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/context.hpp#28 edit
[ROCm/clr commit: 3d15c543a038b2e07ed847347d91b93b7609666c]
---
.../rocclr/runtime/device/pal/palresource.cpp | 209 ++++++++++++------
.../rocclr/runtime/device/pal/palresource.hpp | 51 ++++-
.../clr/rocclr/runtime/platform/context.cpp | 2 +
.../clr/rocclr/runtime/platform/context.hpp | 2 +-
4 files changed, 184 insertions(+), 80 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.cpp b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
index 5d81b5c33d..d247303154 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.cpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.cpp
@@ -440,7 +440,7 @@ bool Resource::CreateImage(CreateParams* params)
memTypeToHeap(&createInfo);
// createInfo.priority;
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
- createInfo.alignment, &subOffset_);
+ createInfo.alignment, nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
@@ -590,7 +590,7 @@ bool Resource::CreateImage(CreateParams* params)
memTypeToHeap(&createInfo);
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
- createInfo.alignment, &subOffset_);
+ createInfo.alignment, nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
@@ -982,14 +982,12 @@ bool Resource::CreatePinned(CreateParams* params)
bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
{
const bool isFineGrain = (memoryType() == RemoteUSWC) || (memoryType() == Remote);
- const Pal::gpusize svmAlignment = isFineGrain ? MaxGpuAlignment :
- dev().properties().gpuMemoryProperties.fragmentSize;
- size_t allocSize = amd::alignUp(desc().width_ * elementSize_, svmAlignment);
+ size_t allocSize = amd::alignUp(desc().width_ * elementSize_, MaxGpuAlignment);
if (isFineGrain) {
Pal::SvmGpuMemoryCreateInfo createInfo = {};
createInfo.isUsedForKernel = desc_.isAllocExecute_;
createInfo.size = allocSize;
- createInfo.alignment = svmAlignment;
+ createInfo.alignment = MaxGpuAlignment;
if (svmPtr != 0) {
createInfo.flags.useReservedGpuVa = true;
createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
@@ -998,12 +996,18 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
createInfo.flags.useReservedGpuVa = false;
createInfo.pReservedGpuVaOwner = nullptr;
}
- memRef_ = GpuMemoryReference::Create(dev(), createInfo);
+ if (!dev().settings().svmFineGrainSystem_) {
+ memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
+ createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
+ }
+ if (memRef_ == nullptr) {
+ memRef_ = GpuMemoryReference::Create(dev(), createInfo);
+ }
}
else {
Pal::GpuMemoryCreateInfo createInfo = {};
createInfo.size = allocSize;
- createInfo.alignment = svmAlignment;
+ createInfo.alignment = MaxGpuAlignment;
createInfo.vaRange = Pal::VaRange::Svm;
createInfo.priority = Pal::GpuMemPriority::Normal;
if (svmPtr != 0) {
@@ -1011,7 +1015,12 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
}
memTypeToHeap(&createInfo);
- memRef_ = GpuMemoryReference::Create(dev(), createInfo);
+ memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
+ createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
+ if (memRef_ == nullptr) {
+ createInfo.alignment = dev().properties().gpuMemoryProperties.fragmentSize;
+ memRef_ = GpuMemoryReference::Create(dev(), createInfo);
+ }
}
if (nullptr == memRef_) {
LogError("Failed PAL memory allocation!");
@@ -1020,7 +1029,9 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
desc_.cardMemory_ = false;
if ((nullptr != params) && (nullptr != params->owner_) &&
(nullptr != params->owner_->getSvmPtr())) {
- params->owner_->setSvmPtr(reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr));
+ params->owner_->setSvmPtr(
+ reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_));
+ offset_ += static_cast(subOffset_);
}
return true;
}
@@ -1138,7 +1149,7 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
memTypeToHeap(&createInfo);
// createInfo.priority;
memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
- createInfo.alignment, &subOffset_);
+ createInfo.alignment, nullptr, &subOffset_);
if (nullptr == memRef_) {
memRef_ = GpuMemoryReference::Create(dev(), createInfo);
if (nullptr == memRef_) {
@@ -1198,14 +1209,16 @@ void Resource::free()
unmap(nullptr);
} else {
// Delay CPU address unmap until memRef_ destruction
- assert(memRef_->cpuAddress_ == nullptr && "Memref shouldn't have a valid CPU address");
- memRef_->cpuAddress_ = address_;
+ if (!desc_.SVMRes_) {
+ assert(memRef_->cpuAddress_ == nullptr && "Memref shouldn't have a valid CPU address");
+ memRef_->cpuAddress_ = address_;
+ }
}
}
// Add resource to the cache
if (!dev().resourceCache().addGpuMemory(&desc_, memRef_, subOffset_)) {
- palFree();
+ palFree();
}
}
@@ -1717,7 +1730,12 @@ void* Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers
address_ = mapLayers(gpu, flags);
} else {
// Map current resource
- address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem());
+ if (memRef_->cpuAddress_ != nullptr) {
+ // Suballocations are mapped by the memory suballocator
+ address_ = reinterpret_cast(memRef_->cpuAddress_) + subOffset_;
+ } else {
+ address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem());
+ }
if (address_ == nullptr) {
LogError("cal::ResMap failed!");
--mapCount_;
@@ -1777,68 +1795,109 @@ void Resource::unmapLayers(VirtualGPU* gpu) {
Unimplemented();
}
+// ================================================================================================
+bool MemorySubAllocator::InitAllocator(GpuMemoryReference* mem_ref) {
+  // Creates the buddy allocator that manages suballocations inside the chunk mem_ref and
+  // registers the {chunk, allocator} pair in heaps_. Returns false, registering nothing, if
+  // the allocator can't be initialized. mem_ref is never released here, so on failure the
+  // caller is still responsible for it.
+  MemBuddyAllocator* allocator = new MemBuddyAllocator(
+      device_, device_->settings().subAllocationChunkSize_,
+      device_->settings().subAllocationMinSize_);
+  if ((allocator != nullptr) && (allocator->Init() == Pal::Result::Success)) {
+    heaps_.insert({mem_ref, allocator});
+    return true;
+  }
+  // Deleting a null pointer is a no-op, so this covers both failure cases
+  delete allocator;
+  return false;
+}
+
+// ================================================================================================
+bool MemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
+  // Allocates one chunk of subAllocationChunkSize_ bytes in Pal::GpuHeapInvisible and
+  // registers it for suballocation. reserved_va is not used by this base implementation.
+  // Returns true only if the chunk was both allocated and registered.
+  Pal::GpuMemoryCreateInfo createInfo = {};
+  createInfo.size = device_->settings().subAllocationChunkSize_;
+  createInfo.alignment = 0;
+  createInfo.vaRange = Pal::VaRange::Default;
+  createInfo.priority = Pal::GpuMemPriority::Normal;
+  createInfo.heapCount = 1;
+  createInfo.heaps[0] = Pal::GpuHeapInvisible;
+  GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo);
+  if (mem_ref == nullptr) {
+    return false;
+  }
+  if (!InitAllocator(mem_ref)) {
+    // The chunk was never added to heaps_, so nothing else would ever release it
+    mem_ref->release();
+    return false;
+  }
+  return true;
+}
+
+// ================================================================================================
+bool CoarseMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
+  // Allocates one chunk of subAllocationChunkSize_ bytes in the SVM VA range (invisible heap
+  // first, then local) and registers it for suballocation. When reserved_va is not null the
+  // chunk is created on the GPU VA owned by reserved_va.
+  // Returns true only if the chunk was both allocated and registered.
+  Pal::GpuMemoryCreateInfo createInfo = {};
+  createInfo.size = device_->settings().subAllocationChunkSize_;
+  createInfo.alignment = device_->properties().gpuMemoryProperties.fragmentSize;
+  createInfo.vaRange = Pal::VaRange::Svm;
+  createInfo.priority = Pal::GpuMemPriority::Normal;
+  createInfo.flags.useReservedGpuVa = (reserved_va != nullptr);
+  createInfo.pReservedGpuVaOwner = reserved_va;
+  createInfo.heapCount = 2;
+  createInfo.heaps[0] = Pal::GpuHeapInvisible;
+  createInfo.heaps[1] = Pal::GpuHeapLocal;
+  GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo);
+  if (mem_ref == nullptr) {
+    return false;
+  }
+  if (!InitAllocator(mem_ref)) {
+    // The chunk was never added to heaps_, so nothing else would ever release it
+    mem_ref->release();
+    return false;
+  }
+  return true;
+}
+
+// ================================================================================================
+bool FineMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
+  // Allocates one SVM chunk of subAllocationChunkSize_ bytes, maps it for CPU access and
+  // registers it for suballocation. When reserved_va is not null the chunk is created on the
+  // GPU VA owned by reserved_va.
+  // Returns true only if the chunk was allocated, mapped and registered.
+  Pal::SvmGpuMemoryCreateInfo createInfo = {};
+  createInfo.isUsedForKernel = false;
+  createInfo.size = device_->settings().subAllocationChunkSize_;
+  createInfo.alignment = MaxGpuAlignment;
+  createInfo.flags.useReservedGpuVa = (reserved_va != nullptr);
+  createInfo.pReservedGpuVaOwner = reserved_va;
+  GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo);
+  if (mem_ref == nullptr) {
+    return false;
+  }
+  // Map before registering, so that a chunk without a CPU address never ends up in heaps_
+  // where later allocations could pick it
+  const Pal::Result result = mem_ref->iMem()->Map(&mem_ref->cpuAddress_);
+  if ((result != Pal::Result::Success) || (mem_ref->cpuAddress_ == nullptr) ||
+      !InitAllocator(mem_ref)) {
+    // NOTE(review): presumably GpuMemoryReference unmaps cpuAddress_ on destruction, as the
+    // "Delay CPU address unmap until memRef_ destruction" comment in Resource::free()
+    // suggests -- confirm
+    mem_ref->release();
+    return false;
+  }
+  return true;
+}
+
// ================================================================================================
MemorySubAllocator::~MemorySubAllocator()
{
// Release memory heap for suballocations
- for (auto it : mem_heap_) {
+ for (auto it : heaps_) {
it.first->release();
delete it.second;
}
}
// ================================================================================================
-GpuMemoryReference* MemorySubAllocator::Allocate(
- Pal::gpusize size, Pal::gpusize alignment, Pal::gpusize* offset)
+// Suballocates size bytes from one of the chunks in heaps_, creating a new chunk when no
+// existing one can serve the request. On success the chunk is returned and offset is filled
+// by the chunk's buddy allocator. When reserved_va is given, only a chunk whose GPU virtual
+// address equals reserved_va's is considered.
+GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize alignment,
+ const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset)
{
GpuMemoryReference* mem_ref = nullptr;
+ MemBuddyAllocator* allocator = nullptr;
// Check if resource size is allowed for suballocation
if (size < device_->settings().subAllocationMaxSize_) {
uint i = 0;
size = amd::alignUp(size, device_->settings().subAllocationMinSize_);
do {
- MemBuddyAllocator* allocator = nullptr;
// Find if current heap has enough empty space
- for (auto it : mem_heap_) {
+ for (auto it : heaps_) {
mem_ref = it.first;
allocator = it.second;
+ // SVM allocations may require a fixed VA, make sure we find the heap with the same VA
+ if (reserved_va &&
+ (reserved_va->Desc().gpuVirtAddr != mem_ref->iMem()->Desc().gpuVirtAddr)) {
+ continue;
+ }
// If we have found a valid chunk, then suballocate memory
if (Pal::Result::Success == allocator->Allocate(size, alignment, offset)) {
return mem_ref;
- } else {
- mem_ref = nullptr;
}
}
-
- // Check if a chunk for suballocation doesn't exist
- if (mem_ref == nullptr) {
- // Allocate a new chunk in memory
- Pal::GpuMemoryCreateInfo createInfo = {};
- createInfo.size = device_->settings().subAllocationChunkSize_;
- createInfo.alignment = 0;
- createInfo.vaRange = Pal::VaRange::Default;
- createInfo.priority = Pal::GpuMemPriority::Normal;
- createInfo.heapCount = 1;
- createInfo.heaps[0] = Pal::GpuHeapInvisible;
- mem_ref = GpuMemoryReference::Create(*device_, createInfo);
- // If chunk was allocated, then allocate BuddyAllocator object
- if (mem_ref != nullptr) {
- allocator = new MemBuddyAllocator(device_,
- device_->settings().subAllocationChunkSize_,
- device_->settings().subAllocationMinSize_);
- if ((allocator != nullptr) &&
- (Pal::Result::Success == allocator->Init())) {
- // Add the chunk and suballocator into the heap
- mem_heap_.insert(std::pair(
- mem_ref, allocator));
- } else {
- delete allocator;
- mem_ref->release();
- return nullptr;
- }
- } else {
+ // mem_ref still points at the last chunk tried above, so it can't tell whether a chunk
+ // is missing. Create a new chunk whenever the search falls through, as the replaced
+ // code did (it reset mem_ref to nullptr after every failed attempt).
+ if (!CreateChunk(reserved_va)) {
return nullptr;
- }
}
i++;
} while (i < 2);
@@ -1849,24 +1908,24 @@ GpuMemoryReference* MemorySubAllocator::Allocate(
// ================================================================================================
bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset)
{
- bool releaseMem = false;
+ bool release_mem = false;
{
amd::ScopedLock l(monitor);
// Find if current memory reference is a chunk allocation
- auto it = mem_heap_.find(ref);
- if (it == mem_heap_.end()) {
+ auto it = heaps_.find(ref);
+ if (it == heaps_.end()) {
return false;
}
- // Free suballocation at the specified offset
+
it->second->Free(offset);
// If this suballocator empty, then release memory chunk
if (it->second->IsEmpty()) {
delete it->second;
- mem_heap_.erase(it);
- releaseMem = true;
+ heaps_.erase(it);
+ release_mem = true;
}
}
- if (releaseMem) {
+ if (release_mem) {
ref->release();
}
return true;
@@ -1883,11 +1942,13 @@ bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,
bool result = false;
size_t size = ref->iMem()->Desc().size;
- if (desc->type_ == Resource::Local) {
- // Check if runtime can free suballocation in local memory
- if (memSubAllocLocal_.Free(&lockCacheOps_, ref, offset)) {
- return true;
- }
+ // Check if runtime can free suballocation
+ if ((desc->type_ == Resource::Local) && !desc->SVMRes_) {
+ return mem_sub_alloc_local_.Free(&lockCacheOps_, ref, offset);
+ } else if ((desc->type_ == Resource::Local) && desc->SVMRes_) {
+ return mem_sub_alloc_coarse_.Free(&lockCacheOps_, ref, offset);
+ } else if (desc->SVMRes_) {
+ return mem_sub_alloc_fine_.Free(&lockCacheOps_, ref, offset);
}
// Make sure current allocation isn't bigger than cache
@@ -1918,23 +1979,29 @@ bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,
// ================================================================================================
GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal::gpusize size,
- Pal::gpusize alignment, Pal::gpusize* offset) {
+ Pal::gpusize alignment, const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset) {
amd::ScopedLock l(&lockCacheOps_);
GpuMemoryReference* ref = nullptr;
+ // Check if the runtime can suballocate memory
+ if ((desc->type_ == Resource::Local) && !desc->SVMRes_) {
+ ref = mem_sub_alloc_local_.Allocate(size, alignment, reserved_va, offset);
+ } else if ((desc->type_ == Resource::Local) && desc->SVMRes_) {
+ ref = mem_sub_alloc_coarse_.Allocate(size, alignment, reserved_va, offset);
+ } else if (desc->SVMRes_) {
+ ref = mem_sub_alloc_fine_.Allocate(size, alignment, reserved_va, offset);
+ }
+
+ if (ref != nullptr) {
+ return ref;
+ }
+
// Early exit if resource is too big
if (size >= cacheSizeLimit_ || desc->SVMRes_) {
//! \note we may need to free the cache here to reduce memory pressure
return ref;
}
- if (desc->type_ == Resource::Local) {
- ref = memSubAllocLocal_.Allocate(size, alignment, offset);
- if (ref != nullptr) {
- return ref;
- }
- }
-
// Serach the right resource through the cache list
for (const auto& it : resCache_) {
Resource::Descriptor* entry = it.first;
diff --git a/projects/clr/rocclr/runtime/device/pal/palresource.hpp b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
index 510c802823..47d1a6eb13 100644
--- a/projects/clr/rocclr/runtime/device/pal/palresource.hpp
+++ b/projects/clr/rocclr/runtime/device/pal/palresource.hpp
@@ -8,6 +8,8 @@
#include "device/pal/paldefs.hpp"
#include "util/palBuddyAllocatorImpl.h"
+#include <unordered_map>
+
//! \namespace pal PAL Resource Implementation
namespace pal {
@@ -459,13 +461,39 @@ public:
~MemorySubAllocator();
- GpuMemoryReference* Allocate(Pal::gpusize size,
- Pal::gpusize alignment, Pal::gpusize* offset);
- bool Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset);
+ //! Create suballocation
+ GpuMemoryReference* Allocate(Pal::gpusize size,
+ Pal::gpusize alignment,
+ const Pal::IGpuMemory* reserved_va,
+ Pal::gpusize* offset
+ );
+ //! Free suballocation
+ bool Free(amd::Monitor* monitor,
+ GpuMemoryReference* mem_ref,
+ Pal::gpusize offset
+ );
+
+protected:
+ //! Allocate new chunk of memory
+ virtual bool CreateChunk(const Pal::IGpuMemory* reserved_va);
+ bool InitAllocator(GpuMemoryReference* mem_ref);
-private:
Device* device_;
- std::map<GpuMemoryReference*, MemBuddyAllocator*> mem_heap_;
+ std::unordered_map<GpuMemoryReference*, MemBuddyAllocator*> heaps_;
+};
+
+//! Suballocator whose chunks are created in the SVM VA range (coarse-grain SVM)
+class CoarseMemorySubAllocator : public MemorySubAllocator {
+public:
+  CoarseMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}
+
+  //! Allocate new chunk of memory, on the VA owned by reserved_va when it isn't null
+  bool CreateChunk(const Pal::IGpuMemory* reserved_va) override;
+};
+
+//! Suballocator whose chunks are created as Pal SVM memory and mapped for CPU access
+//! at chunk creation (fine-grain SVM)
+class FineMemorySubAllocator : public MemorySubAllocator {
+public:
+ FineMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}
+
+ //! Allocate new chunk of memory, on the VA owned by reserved_va when it isn't null
+ bool CreateChunk(const Pal::IGpuMemory* reserved_va) override;
};
class ResourceCache : public amd::HeapObject {
@@ -475,7 +503,9 @@ class ResourceCache : public amd::HeapObject {
: lockCacheOps_("PAL resource cache", true)
, cacheSize_(0)
, cacheSizeLimit_(cacheSizeLimit)
- , memSubAllocLocal_(device) {}
+ , mem_sub_alloc_local_(device)
+ , mem_sub_alloc_coarse_ (device)
+ , mem_sub_alloc_fine_ (device) {}
//! Default destructor
~ResourceCache();
@@ -489,7 +519,10 @@ class ResourceCache : public amd::HeapObject {
//! Finds a PAL resource from the cache
GpuMemoryReference* findGpuMemory(
Resource::Descriptor* desc, //!< Resource descriptor - cache key
- Pal::gpusize size, Pal::gpusize alignment, Pal::gpusize* offset);
+ Pal::gpusize size,
+ Pal::gpusize alignment,
+ const Pal::IGpuMemory* reserved_va, //!< Reserved VA for SVM suballocations
+ Pal::gpusize* offset);
//! Destroys cache
void free(size_t minCacheEntries = 0);
@@ -512,7 +545,9 @@ class ResourceCache : public amd::HeapObject {
//! PAL resource cache
std::list > resCache_;
- MemorySubAllocator memSubAllocLocal_; //!< Allocator for suballocations in Local
+ MemorySubAllocator mem_sub_alloc_local_; //!< Allocator for suballocations in Local
+ CoarseMemorySubAllocator mem_sub_alloc_coarse_; //!< Allocator for suballocations in Coarse SVM
+ FineMemorySubAllocator mem_sub_alloc_fine_; //!< Allocator for suballocations in Fine SVM
};
/*@}*/} // namespace pal
diff --git a/projects/clr/rocclr/runtime/platform/context.cpp b/projects/clr/rocclr/runtime/platform/context.cpp
index 43e4c13216..15faac4396 100644
--- a/projects/clr/rocclr/runtime/platform/context.cpp
+++ b/projects/clr/rocclr/runtime/platform/context.cpp
@@ -296,6 +296,7 @@ void* Context::svmAlloc(size_t size, size_t alignment, cl_svm_mem_flags flags) {
void* svmPtrAlloced = NULL;
void* tempPtr = NULL;
+ amd::ScopedLock lock(&ctxLock_);
for (const auto& dev : svmAllocDevice_) {
if (dev->type() == CL_DEVICE_TYPE_GPU) {
// check if the device support svm platform atomics,
@@ -320,6 +321,7 @@ void Context::svmFree(void* ptr) const {
return;
}
+ amd::ScopedLock lock(&ctxLock_);
for (const auto& dev : svmAllocDevice_) {
if (dev->type() == CL_DEVICE_TYPE_GPU) {
dev->svmFree(ptr);
diff --git a/projects/clr/rocclr/runtime/platform/context.hpp b/projects/clr/rocclr/runtime/platform/context.hpp
index f3ae1306c6..dae6cd3449 100644
--- a/projects/clr/rocclr/runtime/platform/context.hpp
+++ b/projects/clr/rocclr/runtime/platform/context.hpp
@@ -198,7 +198,7 @@ class Context : public RuntimeObject {
Device* customHostAllocDevice_; //!< Device responsible for host allocations
std::vector svmAllocDevice_; //!< Devices can support SVM allocations
std::map deviceQueues_; //!< Device queues mapping
- Monitor ctxLock_; //!< Lock for the context access
+ mutable Monitor ctxLock_; //!< Lock for the context access
};
/*! @}