P4 to Git Change 1533784 by vsytchen@vsytchen-ocl-win10 on 2018/03/28 11:22:25

SWDEV-133818 - PAL support for Linux Pro: Coarse Grain SVM for OpenCL 2.0. This change enables Fine/Coarse Grain Buffer SVM suballocations for PAL devices. ReviewBoardURL = http://ocltc.amd.com/reviews/r/14486/diff/ Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#57 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#20 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/context.cpp#48 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/context.hpp#28 edit [ROCm/clr commit: 3d15c543a0]
2018-03-28 11:35:51 -04:00
@@ -440,7 +440,7 @@ bool Resource::CreateImage(CreateParams* params)
      memTypeToHeap(&createInfo);
      // createInfo.priority;
      memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
-        createInfo.alignment, &subOffset_);
+        createInfo.alignment, nullptr, &subOffset_);
      if (nullptr == memRef_) {
        memRef_ = GpuMemoryReference::Create(dev(), createInfo);
        if (nullptr == memRef_) {
@@ -590,7 +590,7 @@ bool Resource::CreateImage(CreateParams* params)
    memTypeToHeap(&createInfo);

    memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
-      createInfo.alignment, &subOffset_);
+      createInfo.alignment, nullptr, &subOffset_);
    if (nullptr == memRef_) {
      memRef_ = GpuMemoryReference::Create(dev(), createInfo);
      if (nullptr == memRef_) {
@@ -982,14 +982,12 @@ bool Resource::CreatePinned(CreateParams* params)
 bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
 {
  const bool isFineGrain = (memoryType() == RemoteUSWC) || (memoryType() == Remote);
-  const Pal::gpusize svmAlignment = isFineGrain ? MaxGpuAlignment :
-    dev().properties().gpuMemoryProperties.fragmentSize;
-  size_t allocSize = amd::alignUp(desc().width_ * elementSize_, svmAlignment);
+  size_t allocSize = amd::alignUp(desc().width_ * elementSize_, MaxGpuAlignment);
  if (isFineGrain) {
    Pal::SvmGpuMemoryCreateInfo createInfo = {};
    createInfo.isUsedForKernel = desc_.isAllocExecute_;
    createInfo.size = allocSize;
-    createInfo.alignment = svmAlignment;
+    createInfo.alignment = MaxGpuAlignment;
    if (svmPtr != 0) {
      createInfo.flags.useReservedGpuVa = true;
      createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
@@ -998,12 +996,18 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
      createInfo.flags.useReservedGpuVa = false;
      createInfo.pReservedGpuVaOwner = nullptr;
    }
-    memRef_ = GpuMemoryReference::Create(dev(), createInfo);
+    if (!dev().settings().svmFineGrainSystem_) {
+      memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
+        createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
+    }
+    if (memRef_ == nullptr) {
+      memRef_ = GpuMemoryReference::Create(dev(), createInfo);
+    }
  }
  else {
    Pal::GpuMemoryCreateInfo createInfo = {};
    createInfo.size = allocSize;
-    createInfo.alignment = svmAlignment;
+    createInfo.alignment = MaxGpuAlignment;
    createInfo.vaRange = Pal::VaRange::Svm;
    createInfo.priority = Pal::GpuMemPriority::Normal;
    if (svmPtr != 0) {
@@ -1011,7 +1015,12 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
      createInfo.pReservedGpuVaOwner = params->svmBase_->iMem();
    }
    memTypeToHeap(&createInfo);
-    memRef_ = GpuMemoryReference::Create(dev(), createInfo);
+    memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
+      createInfo.alignment, createInfo.pReservedGpuVaOwner, &subOffset_);
+    if (memRef_ == nullptr) {
+      createInfo.alignment = dev().properties().gpuMemoryProperties.fragmentSize;
+      memRef_ = GpuMemoryReference::Create(dev(), createInfo);
+    }
  }
  if (nullptr == memRef_) {
    LogError("Failed PAL memory allocation!");
@@ -1020,7 +1029,9 @@ bool Resource::CreateSvm(CreateParams* params, Pal::gpusize svmPtr)
  desc_.cardMemory_ = false;
  if ((nullptr != params) && (nullptr != params->owner_) &&
    (nullptr != params->owner_->getSvmPtr())) {
-    params->owner_->setSvmPtr(reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr));
+    params->owner_->setSvmPtr(
+      reinterpret_cast<void*>(memRef_->iMem()->Desc().gpuVirtAddr + subOffset_));
+    offset_ += static_cast<size_t>(subOffset_);
  }
  return true;
 }
@@ -1138,7 +1149,7 @@ bool Resource::create(MemoryType memType, CreateParams* params) {
  memTypeToHeap(&createInfo);
  // createInfo.priority;
  memRef_ = dev().resourceCache().findGpuMemory(&desc_, createInfo.size,
-    createInfo.alignment, &subOffset_);
+    createInfo.alignment, nullptr, &subOffset_);
  if (nullptr == memRef_) {
    memRef_ = GpuMemoryReference::Create(dev(), createInfo);
    if (nullptr == memRef_) {
@@ -1198,14 +1209,16 @@ void Resource::free()
        unmap(nullptr);
      } else {
        // Delay CPU address unmap until memRef_ destruction
-        assert(memRef_->cpuAddress_ == nullptr && "Memref shouldn't have a valid CPU address");
-        memRef_->cpuAddress_ = address_;
+        if (!desc_.SVMRes_) {
+          assert(memRef_->cpuAddress_ == nullptr && "Memref shouldn't have a valid CPU address");
+          memRef_->cpuAddress_ = address_;
+        }
      }
    }

    // Add resource to the cache
    if (!dev().resourceCache().addGpuMemory(&desc_, memRef_, subOffset_)) {
-    palFree();
+      palFree();
    }
  }

@@ -1717,7 +1730,12 @@ void* Resource::map(VirtualGPU* gpu, uint flags, uint startLayer, uint numLayers
      address_ = mapLayers(gpu, flags);
    } else {
      // Map current resource
-      address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem());
+      if (memRef_->cpuAddress_ != nullptr) {
+        // Suballocations are mapped by the memory suballocator
+        address_ = reinterpret_cast<uint8_t*>(memRef_->cpuAddress_) + subOffset_;
+      } else {
+        address_ = gpuMemoryMap(&desc_.pitch_, flags, iMem());
+      }
      if (address_ == nullptr) {
        LogError("cal::ResMap failed!");
        --mapCount_;
@@ -1777,68 +1795,109 @@ void Resource::unmapLayers(VirtualGPU* gpu) {
  Unimplemented();
 }

+// ================================================================================================
+bool MemorySubAllocator::InitAllocator(GpuMemoryReference* mem_ref) {
+  MemBuddyAllocator* allocator = new MemBuddyAllocator(
+    device_, device_->settings().subAllocationChunkSize_,
+    device_->settings().subAllocationMinSize_);
+  if ((allocator != nullptr) && (allocator->Init() == Pal::Result::Success)) {
+    heaps_.insert({mem_ref, allocator});
+    return true;
+  } else {
+    delete allocator;
+    return false;
+  }
+  return false;
+}
+
+// ================================================================================================
+bool MemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
+  Pal::GpuMemoryCreateInfo createInfo = {};
+  createInfo.size = device_->settings().subAllocationChunkSize_;
+  createInfo.alignment = 0;
+  createInfo.vaRange = Pal::VaRange::Default;
+  createInfo.priority = Pal::GpuMemPriority::Normal;
+  createInfo.heapCount = 1;
+  createInfo.heaps[0] = Pal::GpuHeapInvisible;
+  GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo);
+  if (mem_ref != nullptr) {
+    return InitAllocator(mem_ref);
+  }
+  return false;
+}
+
+// ================================================================================================
+bool CoarseMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
+  Pal::GpuMemoryCreateInfo createInfo = {};
+  createInfo.size = device_->settings().subAllocationChunkSize_;
+  createInfo.alignment = device_->properties().gpuMemoryProperties.fragmentSize;
+  createInfo.vaRange = Pal::VaRange::Svm;
+  createInfo.priority = Pal::GpuMemPriority::Normal;
+  createInfo.flags.useReservedGpuVa = (reserved_va != nullptr);
+  createInfo.pReservedGpuVaOwner = reserved_va;
+  createInfo.heapCount = 2;
+  createInfo.heaps[0] = Pal::GpuHeapInvisible;
+  createInfo.heaps[1] = Pal::GpuHeapLocal;
+  GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo);
+  if (mem_ref != nullptr) {
+    return InitAllocator(mem_ref);
+  }
+  return false;
+}
+
+// ================================================================================================
+bool FineMemorySubAllocator::CreateChunk(const Pal::IGpuMemory* reserved_va) {
+  Pal::SvmGpuMemoryCreateInfo createInfo = {};
+  createInfo.isUsedForKernel = false;
+  createInfo.size = device_->settings().subAllocationChunkSize_;
+  createInfo.alignment = MaxGpuAlignment;
+  createInfo.flags.useReservedGpuVa = (reserved_va != nullptr);
+  createInfo.pReservedGpuVaOwner = reserved_va;
+  GpuMemoryReference* mem_ref = GpuMemoryReference::Create(*device_, createInfo);
+  if ((mem_ref != nullptr) && InitAllocator(mem_ref)) {
+    mem_ref->iMem()->Map(&mem_ref->cpuAddress_);
+    return mem_ref->cpuAddress_ != nullptr;
+  }
+  return false;
+}
+
 // ================================================================================================
 MemorySubAllocator::~MemorySubAllocator()
 {
  // Release memory heap for suballocations
-  for (auto it : mem_heap_) {
+  for (auto it : heaps_) {
    it.first->release();
    delete it.second;
  }
 }

 // ================================================================================================
-GpuMemoryReference* MemorySubAllocator::Allocate(
-  Pal::gpusize size, Pal::gpusize alignment, Pal::gpusize* offset)
+GpuMemoryReference* MemorySubAllocator::Allocate(Pal::gpusize size, Pal::gpusize alignment,
+  const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset)
 {
  GpuMemoryReference* mem_ref = nullptr;
+  MemBuddyAllocator* allocator = nullptr;
  // Check if resource size is allowed for suballocation
  if (size < device_->settings().subAllocationMaxSize_) {
    uint i = 0;
    size = amd::alignUp(size, device_->settings().subAllocationMinSize_);
    do {
-      MemBuddyAllocator*  allocator = nullptr;
      // Find if current heap has enough empty space
-      for (auto it : mem_heap_) {
+      for (auto it : heaps_) {
        mem_ref = it.first;
        allocator = it.second;
+        // SVM allocations may require a fixed VA, make sure we find the heap with the same VA
+        if (reserved_va &&
+            (reserved_va->Desc().gpuVirtAddr != mem_ref->iMem()->Desc().gpuVirtAddr)) {
+          continue;
+        }
        // If we have found a valid chunk, then suballocate memory
        if (Pal::Result::Success == allocator->Allocate(size, alignment, offset)) {
          return mem_ref;
-        } else {
-          mem_ref = nullptr;
        }
      }
-      
-      // Check if a chunk for suballocation doesn't exist
-      if (mem_ref == nullptr) {
-        // Allocate a new chunk in memory
-        Pal::GpuMemoryCreateInfo createInfo = {};
-        createInfo.size       = device_->settings().subAllocationChunkSize_;
-        createInfo.alignment  = 0;
-        createInfo.vaRange    = Pal::VaRange::Default;
-        createInfo.priority   = Pal::GpuMemPriority::Normal;
-        createInfo.heapCount  = 1;
-        createInfo.heaps[0]   = Pal::GpuHeapInvisible;
-        mem_ref = GpuMemoryReference::Create(*device_, createInfo);
-        // If chunk was allocated, then allocate BuddyAllocator object
-        if (mem_ref != nullptr) {
-          allocator = new MemBuddyAllocator(device_,
-          device_->settings().subAllocationChunkSize_,
-          device_->settings().subAllocationMinSize_);
-          if ((allocator != nullptr) &&
-              (Pal::Result::Success == allocator->Init())) {
-            // Add the chunk and suballocator into the heap
-            mem_heap_.insert(std::pair<GpuMemoryReference*, MemBuddyAllocator*>(
-                mem_ref, allocator));
-          } else {
-            delete allocator;
-            mem_ref->release();
-            return nullptr;  
-          }
-        } else {
+      if ((mem_ref == nullptr) && !CreateChunk(reserved_va)) {
          return nullptr;
-        }
      }
      i++;
    } while (i < 2);
@@ -1849,24 +1908,24 @@ GpuMemoryReference* MemorySubAllocator::Allocate(
 // ================================================================================================
 bool MemorySubAllocator::Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset)
 {
-  bool releaseMem =  false;
+  bool release_mem = false;
  {
    amd::ScopedLock l(monitor);
    // Find if current memory reference is a chunk allocation
-    auto it = mem_heap_.find(ref);
-    if (it == mem_heap_.end()) {
+    auto it = heaps_.find(ref);
+    if (it == heaps_.end()) {
      return false;
    }
-    // Free suballocation at the specified offset
+
    it->second->Free(offset);
    // If this suballocator empty, then release memory chunk
    if (it->second->IsEmpty()) {
      delete it->second;
-      mem_heap_.erase(it);
-      releaseMem = true;
+      heaps_.erase(it);
+      release_mem = true;
    }
  }
-  if (releaseMem) {
+  if (release_mem) {
    ref->release();
  }
  return true;
@@ -1883,11 +1942,13 @@ bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,
  bool result = false;
  size_t size = ref->iMem()->Desc().size;

-  if (desc->type_ == Resource::Local) {
-    // Check if runtime can free suballocation in local memory
-    if (memSubAllocLocal_.Free(&lockCacheOps_, ref, offset)) {
-      return true;
-    }
+  // Check if runtime can free suballocation
+  if ((desc->type_ == Resource::Local) && !desc->SVMRes_) {
+    return mem_sub_alloc_local_.Free(&lockCacheOps_, ref, offset);
+  } else if ((desc->type_ == Resource::Local) && desc->SVMRes_) {
+    return mem_sub_alloc_coarse_.Free(&lockCacheOps_, ref, offset);
+  } else if (desc->SVMRes_) {
+    return mem_sub_alloc_fine_.Free(&lockCacheOps_, ref, offset);
  }

  // Make sure current allocation isn't bigger than cache
@@ -1918,23 +1979,29 @@ bool ResourceCache::addGpuMemory(Resource::Descriptor* desc,

 // ================================================================================================
 GpuMemoryReference* ResourceCache::findGpuMemory(Resource::Descriptor* desc, Pal::gpusize size,
-                                                 Pal::gpusize alignment, Pal::gpusize* offset) {
+  Pal::gpusize alignment, const Pal::IGpuMemory* reserved_va, Pal::gpusize* offset) {
  amd::ScopedLock l(&lockCacheOps_);
  GpuMemoryReference* ref = nullptr;

+  // Check if the runtime can suballocate memory
+  if ((desc->type_ == Resource::Local) && !desc->SVMRes_) {
+    ref = mem_sub_alloc_local_.Allocate(size, alignment, reserved_va, offset);
+  } else if ((desc->type_ == Resource::Local) && desc->SVMRes_) {
+    ref = mem_sub_alloc_coarse_.Allocate(size, alignment, reserved_va, offset);
+  } else if (desc->SVMRes_) {
+    ref = mem_sub_alloc_fine_.Allocate(size, alignment, reserved_va, offset);
+  }
+
+  if (ref != nullptr) {
+    return ref;
+  }
+
  // Early exit if resource is too big
  if (size >= cacheSizeLimit_ || desc->SVMRes_) {
    //! \note we may need to free the cache here to reduce memory pressure
    return ref;
  }

-  if (desc->type_ == Resource::Local) {
-    ref = memSubAllocLocal_.Allocate(size, alignment, offset);
-    if (ref != nullptr) {
-      return ref;
-    }
-  }
-
  // Serach the right resource through the cache list
  for (const auto& it : resCache_) {
    Resource::Descriptor* entry = it.first;
@@ -8,6 +8,8 @@
 #include "device/pal/paldefs.hpp"
 #include "util/palBuddyAllocatorImpl.h"

+#include <unordered_map>
+
 //! \namespace pal PAL Resource Implementation
 namespace pal {

@@ -459,13 +461,39 @@ public:

  ~MemorySubAllocator();

-  GpuMemoryReference*  Allocate(Pal::gpusize size,
-    Pal::gpusize alignment, Pal::gpusize* offset);
-  bool Free(amd::Monitor* monitor, GpuMemoryReference* ref, Pal::gpusize offset);
+  //! Create suballocation
+  GpuMemoryReference* Allocate(Pal::gpusize size,
+                               Pal::gpusize alignment,
+                               const Pal::IGpuMemory* reserved_va,
+                               Pal::gpusize* offset
+                               );
+  //! Free suballocation
+  bool Free(amd::Monitor* monitor,
+            GpuMemoryReference* mem_ref,
+            Pal::gpusize offset
+            );
+
+protected:
+  //! Allocate new chunk of memory
+  virtual bool CreateChunk(const Pal::IGpuMemory* reserved_va);
+  bool InitAllocator(GpuMemoryReference* mem_ref);

-private:
  Device* device_;
-  std::map<GpuMemoryReference*, MemBuddyAllocator*>  mem_heap_;
+  std::unordered_map<GpuMemoryReference*, MemBuddyAllocator*>  heaps_;
+};
+
+class CoarseMemorySubAllocator : public MemorySubAllocator {
+public:
+  CoarseMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}
+
+  bool CreateChunk(const Pal::IGpuMemory* reservedVa) override;
+};
+
+class FineMemorySubAllocator : public MemorySubAllocator {
+public:
+  FineMemorySubAllocator(Device* device) : MemorySubAllocator(device) {}
+
+  bool CreateChunk(const Pal::IGpuMemory* reserved_va) override;
 };

 class ResourceCache : public amd::HeapObject {
@@ -475,7 +503,9 @@ class ResourceCache : public amd::HeapObject {
      : lockCacheOps_("PAL resource cache", true)
      , cacheSize_(0)
      , cacheSizeLimit_(cacheSizeLimit)
-      , memSubAllocLocal_(device) {}
+      , mem_sub_alloc_local_(device)
+      , mem_sub_alloc_coarse_ (device)
+      , mem_sub_alloc_fine_ (device) {}

  //! Default destructor
  ~ResourceCache();
@@ -489,7 +519,10 @@ class ResourceCache : public amd::HeapObject {
  //! Finds a PAL resource from the cache
  GpuMemoryReference* findGpuMemory(
      Resource::Descriptor* desc,  //!< Resource descriptor - cache key
-      Pal::gpusize size, Pal::gpusize alignment, Pal::gpusize* offset);
+      Pal::gpusize size,
+      Pal::gpusize alignment,
+      const Pal::IGpuMemory* reserved_va, //!< Reserved VA for SVM suballocations
+      Pal::gpusize* offset);

  //! Destroys cache
  void free(size_t minCacheEntries = 0);
@@ -512,7 +545,9 @@ class ResourceCache : public amd::HeapObject {
  //! PAL resource cache
  std::list<std::pair<Resource::Descriptor*, GpuMemoryReference*> > resCache_;

-  MemorySubAllocator  memSubAllocLocal_;  //!< Allocator for suballocations in Local
+  MemorySubAllocator  mem_sub_alloc_local_;  //!< Allocator for suballocations in Local
+  CoarseMemorySubAllocator mem_sub_alloc_coarse_; //!< Allocator for suballocations in Coarse SVM
+  FineMemorySubAllocator mem_sub_alloc_fine_; //!< Allocator for suballocations in Fine SVM
 };

 /*@}*/} // namespace pal
@@ -296,6 +296,7 @@ void* Context::svmAlloc(size_t size, size_t alignment, cl_svm_mem_flags flags) {
    void* svmPtrAlloced = NULL;
    void* tempPtr = NULL;

+    amd::ScopedLock lock(&ctxLock_);
    for (const auto& dev : svmAllocDevice_) {
      if (dev->type() == CL_DEVICE_TYPE_GPU) {
        // check if the device support svm platform atomics,
@@ -320,6 +321,7 @@ void Context::svmFree(void* ptr) const {
    return;
  }

+  amd::ScopedLock lock(&ctxLock_);
  for (const auto& dev : svmAllocDevice_) {
    if (dev->type() == CL_DEVICE_TYPE_GPU) {
      dev->svmFree(ptr);
@@ -198,7 +198,7 @@ class Context : public RuntimeObject {
  Device* customHostAllocDevice_;        //!< Device responsible for host allocations
  std::vector<Device*> svmAllocDevice_;  //!< Devices can support SVM allocations
  std::map<const Device*, DeviceQueueInfo> deviceQueues_;  //!< Device queues mapping
-  Monitor ctxLock_;                                        //!< Lock for the context access
+  mutable Monitor ctxLock_;                                //!< Lock for the context access
 };

 /*! @}