P4 to Git Change 1527848 by gandryey@gera-w8 on 2018/03/15 17:11:43

SWDEV-79445 - OCL generic changes and code clean-up - Add suballocation support for local (invisible) memory. It should significantly improve memory footprint and TLB usage with 2MB pages. - Implementation uses the BuddyAllocator provided in PAL. - The chunk allocation size is 64MB, the min allocation is 4KB and the max is 4MB. GPU_MAX_SUBALLOC_SIZE controls the max size in KB. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#33 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#76 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#56 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#51 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#17 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#45 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#16 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#77 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#42 edit ... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#285 edit
2018-03-15 17:26:25 -04:00
@@ -8,6 +8,7 @@
 #include "palGpuMemory.h"
 #include "palImage.h"
 #include "palFormatInfo.h"
+#include "util/palSysMemory.h"

 //
 /// Memory Object Type
@@ -54,6 +54,10 @@ void PalDeviceUnload() { pal::Device::tearDown(); }

 namespace pal {

+Util::GenericAllocator  NullDevice::allocator_;
+char* Device::platformObj_;
+Pal::IPlatform*  Device::platform_;
+
 NullDevice::Compiler* NullDevice::compiler_;
 AppProfile Device::appProfile_;

@@ -183,6 +187,7 @@ bool NullDevice::init() {
  return true;
 }

+
 bool NullDevice::create(Pal::AsicRevision asicRevision, Pal::GfxIpLevel ipLevel,
                        uint xNACKSupported) {
  online_ = false;
@@ -736,7 +741,7 @@ bool Device::create(Pal::IDevice* device) {
  if (!amd::Device::create()) {
    return false;
  }
-  resourceList_ = new std::list<GpuMemoryReference*>();
+  resourceList_ = new std::list<Resource*>();
  if (nullptr == resourceList_) {
    return false;
  }
@@ -865,7 +870,7 @@ bool Device::create(Pal::IDevice* device) {
  size_t resourceCacheSize = settings().resourceCacheSize_;
  // Create resource cache.
  // \note Cache must be created before any resource creation to avoid nullptr check
-  resourceCache_ = new ResourceCache(resourceCacheSize);
+  resourceCache_ = new ResourceCache(this, resourceCacheSize);
  if (nullptr == resourceCache_) {
    return false;
  }
@@ -925,8 +930,6 @@ bool Device::create(Pal::IDevice* device) {
  return true;
 }

-static Pal::IPlatform* platform;
-
 bool Device::initializeHeapResources() {
  amd::ScopedLock k(lockForInitHeap_);
  if (!heapInitComplete_) {
@@ -998,7 +1001,7 @@ bool Device::initializeHeapResources() {
    xferQueue_->enableSyncedBlit();

    // Create RGP capture manager
-    rgpCaptureMgr_ = RgpCaptureMgr::Create(platform, *this);
+    rgpCaptureMgr_ = RgpCaptureMgr::Create(platform_, *this);
  }
  return true;
 }
@@ -1096,8 +1099,6 @@ static int reportHook(int reportType, char* message, int* returnValue) {
 }
 #endif  // _WIN32 & DEBUG

-static char* platformObj;
-
 bool Device::init() {
  uint32_t numDevices = 0;
  bool useDeviceList = false;
@@ -1123,7 +1124,7 @@ bool Device::init() {
 #endif  // !defined(WITH_LIGHTNING_COMPILER)

  size_t size = Pal::GetPlatformSize();
-  platformObj = new char[size];
+  platformObj_ = new char[size];
  Pal::PlatformCreateInfo info = {};
  info.flags.disableGpuTimeout = true;
 #if !defined(PAL_BUILD_DTIF)
@@ -1138,14 +1139,14 @@ bool Device::init() {
  info.maxSvmSize = static_cast<Pal::gpusize>(OCL_SET_SVM_SIZE * Mi);

  // PAL init
-  if (Pal::Result::Success != Pal::CreatePlatform(info, platformObj, &platform)) {
+  if (Pal::Result::Success != Pal::CreatePlatform(info, platformObj_, &platform_)) {
    return false;
  }

  // Get the total number of active devices
  // Count up all the devices in the system.
  Pal::IDevice* deviceList[Pal::MaxDevices] = {};
-  platform->EnumerateDevices(&numDevices, &deviceList[0]);
+  platform_->EnumerateDevices(&numDevices, &deviceList[0]);

  uint ordinal = 0;
  const char* selectDeviceByName = nullptr;
@@ -1175,8 +1176,8 @@ bool Device::init() {
 }

 void Device::tearDown() {
-  platform->Destroy();
-  delete platformObj;
+  platform_->Destroy();   // destroy the platform before releasing the buffer handed to Pal::CreatePlatform()
+  delete[] platformObj_;  // allocated with new char[size] in Device::init(), so it needs the array form

 #if !defined(WITH_LIGHTNING_COMPILER)
  if (compiler_ != nullptr) {
@@ -120,7 +120,12 @@ class NullDevice : public amd::Device {
  amd::CacheCompilation* cacheCompilation() const { return cacheCompilation_.get(); }
 #endif

+  void* Alloc(const Util::AllocInfo& allocInfo) { return allocator_.Alloc(allocInfo); }  //!< Forwards to the static PAL generic allocator
+  void  Free(const Util::FreeInfo& freeInfo) { allocator_.Free(freeInfo); }  //!< Forwards to the static PAL generic allocator
+
 protected:
+  static Util::GenericAllocator allocator_; //!< Generic memory allocator in PAL
+
  Pal::AsicRevision asicRevision_;  //!< ASIC revision
  Pal::GfxIpLevel ipLevel_;         //!< Device IP level
  const AMDDeviceInfo* hwInfo_;     //!< Device HW info structure
@@ -464,6 +469,9 @@ class Device : public NullDevice {
  //! Returns PAL device properties
  const Pal::DeviceProperties& properties() const { return properties_; }

+  //! Returns PAL platform interface
+  Pal::IPlatform* iPlat() const { return platform_; }
+
  //! Returns PAL device interface
  Pal::IDevice* iDev() const { return device_; }

@@ -496,19 +504,19 @@ class Device : public NullDevice {
  bool resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const;

  //! Adds a resource to the global list
-  void addResource(GpuMemoryReference* mem) const {
+  void addResource(Resource* res) const {
     amd::ScopedLock lock(lockResources());
-    auto findIt = std::find(resourceList_->begin(), resourceList_->end(), mem);
-    mem->events_.resize(numOfVgpus());
+    auto findIt = std::find(resourceList_->begin(), resourceList_->end(), res);
+    res->resizeGpuEvents(numOfVgpus() - 1);  // resizes to (index + 1) == numOfVgpus() event slots
     if (resourceList_->end() == findIt) {
-      resourceList_->push_back(mem);
+      resourceList_->push_back(res);
     }
   }

  //! Removes a resource from the global list
-  void removeResource(GpuMemoryReference* mem) const {
+  void removeResource(Resource* res) const {
    amd::ScopedLock lock(lockResources());
-    resourceList_->remove(mem);
+    resourceList_->remove(res);
  }

  //! Resizes global resource list to accumulate a new queue
@@ -566,6 +574,9 @@ class Device : public NullDevice {
  bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const;
  bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const;

+  static char* platformObj_;          //!< Memory allocated for PAL platform object
+  static Pal::IPlatform*  platform_;  //!< Pointer to the PAL platform object
+
  amd::Context* context_;       //!< A dummy context for internal allocations
  amd::Monitor* lockAsyncOps_;  //!< Lock to serialise all async ops on this device
  amd::Monitor*
@@ -592,7 +603,7 @@ class Device : public NullDevice {
  Pal::IDevice* device_;                 //!< PAL device object
  std::atomic<Pal::gpusize> freeMem[Pal::GpuHeap::GpuHeapCount];  //!< Free memory counter
  amd::Monitor* lockResourceOps_;        //!< Lock to serialise resource access
-  std::list<GpuMemoryReference*>* resourceList_;     //!< Active resource list
+  std::list<Resource*>* resourceList_;   //!< Active resource list
  RgpCaptureMgr*   rgpCaptureMgr_;       //!< RGP capture manager
 };

@@ -89,14 +89,14 @@ void Segment::copy(size_t offset, const void* src, size_t size) {
    amd::ScopedLock k(gpuAccess_->dev().xferMgr().lockXfer());
    VirtualGPU& gpu = *gpuAccess_->dev().xferQueue();
    Memory& xferBuf = gpuAccess_->dev().xferWrite().acquire();
-    size_t tmpSize = std::min(static_cast<size_t>(xferBuf.vmSize()), size);
+    size_t tmpSize = std::min(static_cast<size_t>(xferBuf.size()), size);
    size_t srcOffs = 0;
    while (size != 0) {
      xferBuf.hostWrite(&gpu, reinterpret_cast<const_address>(src) + srcOffs, 0, tmpSize);
      xferBuf.partialMemCopyTo(gpu, 0, (offset + srcOffs), tmpSize, *gpuAccess_, false, true);
      size -= tmpSize;
      srcOffs += tmpSize;
-      tmpSize = std::min(static_cast<size_t>(xferBuf.vmSize()), size);
+      tmpSize = std::min(static_cast<size_t>(xferBuf.size()), size);
    }
    gpu.waitAllEngines();
  }
@@ -6,6 +6,7 @@
 #include "platform/command.hpp"
 #include "platform/program.hpp"
 #include "device/pal/paldefs.hpp"
+#include "util/palBuddyAllocatorImpl.h"

 //! \namespace pal PAL Resource Implementation
 namespace pal {
@@ -16,7 +17,6 @@ class VirtualGPU;
 /*! \addtogroup PAL PAL Resource Implementation
 *  @{
 */
-
 class GpuMemoryReference : public amd::ReferenceCountedObject {
 public:
  static GpuMemoryReference* Create(const Device& dev, const Pal::GpuMemoryCreateInfo& createInfo);
@@ -36,12 +36,6 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
  //! Default constructor
  GpuMemoryReference(const Device& dev);

-  //! Resizes the events array to account the new queue
-  void resizeGpuEvents(uint index) { events_.resize(index + 1); }
-
-  //! Erase an entry in the array for provided queue index
-  void eraseGpuEvents(uint index) { events_.erase(events_.begin() + index); }
-
  //! Get PAL memory object
  Pal::IGpuMemory* iMem() const { return gpuMem_; }

@@ -50,7 +44,6 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
  const Device& device_;      //!< GPU device
  //! @note: This field is necessary for the thread safe release only
  VirtualGPU* gpu_;           //!< Resource will be used only on this queue
-  std::vector<GpuEvent> events_;  //!< GPU events associated with the resource

 protected:
  //! Default destructor
@@ -64,6 +57,8 @@ class GpuMemoryReference : public amd::ReferenceCountedObject {
  GpuMemoryReference& operator=(const GpuMemoryReference&);
 };

+static constexpr Pal::gpusize MaxGpuAlignment = 4 * Ki;
+
 //! GPU resource
 class Resource : public amd::HeapObject {
 public:
@@ -178,7 +173,7 @@ class Resource : public amd::HeapObject {
        uint imageArray_ : 1;      //!< PAL resource is an array of images
        uint buffer_ : 1;          //!< PAL resource is a buffer
        uint tiled_ : 1;           //!< PAL resource is tiled
-        uint SVMRes_ : 1;          //!< SVM flag to the cal resource
+        uint SVMRes_ : 1;          //!< SVM flag to the pal resource
        uint scratch_ : 1;         //!< Scratch buffer
        uint isAllocExecute_ : 1;  //!< SVM resource allocation attribute for shader\cmdbuf
        uint isDoppTexture_ : 1;   //!< PAL resource is for a DOPP desktop texture
@@ -205,9 +200,9 @@ class Resource : public amd::HeapObject {
  //! Destructor of the resource
  virtual ~Resource();

-  /*! \brief Creates a CAL object, associated with the resource
+  /*! \brief Creates a PAL object, associated with the resource
   *
-   *  \return True if we succesfully created a CAL resource
+   *  \return True if we succesfully created a PAL resource
   */
  virtual bool create(MemoryType memType,       //!< memory type
                      CreateParams* params = 0  //!< special parameters for resource allocation
@@ -263,7 +258,7 @@ class Resource : public amd::HeapObject {
  uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; }

  //! Returns global memory offset
-  uint64_t vmSize() const { return iMem()->Desc().size - offset_; }
+  uint64_t vmSize() const { return desc_.width_ * elementSize(); }  //!< Descriptor width times element size

  //! Returns global memory offset
  bool mipMapped() const { return (desc().mipLevels_ > 1) ? true : false; }
@@ -290,7 +285,7 @@ class Resource : public amd::HeapObject {

  //! Marks the resource as busy
  void setBusy(VirtualGPU& gpu,   //!< Virtual GPU device object
-               GpuEvent calEvent  //!< CAL event
+               GpuEvent calEvent  //!< PAL event
               ) const;

  //! Wait for the resource
@@ -326,7 +321,7 @@ class Resource : public amd::HeapObject {
  //! Get the mapped address of this resource
  address data() const { return reinterpret_cast<address>(address_); }

-  //! Frees all allocated CAL memories and resources,
+  //! Frees all allocated PAL memories and resources,
  //! associated with this objects. And also destroys all rename structures
  //! Note: doesn't destroy the object itself
  void free();
@@ -360,7 +355,42 @@ class Resource : public amd::HeapObject {
  //! Returns GPU event associated with this resource and specified queue
  GpuEvent* getGpuEvent(const VirtualGPU& gpu) const;

+  //! Resizes the events array to account the new queue
+  void resizeGpuEvents(uint index) { events_.resize(index + 1); }
+
+  //! Erase an entry in the array for provided queue index
+  void eraseGpuEvents(uint index) { events_.erase(events_.begin() + index); }
+
 protected:
+  /*! \brief Creates a PAL image object, associated with the resource
+  *
+  *  \return True if we successfully created a PAL resource
+  */
+  bool CreateImage(CreateParams* params //!< special parameters for resource allocation
+                   );
+
+  /*! \brief Creates a PAL interop object, associated with the resource
+  *
+  *  \return True if we successfully created a PAL interop resource
+  */
+  bool CreateInterop(CreateParams* params //!< special parameters for resource allocation
+                     );
+
+  /*! \brief Creates a PAL pinned object, associated with the resource
+  *
+  *  \return True if we successfully created a PAL pinned resource
+  */
+  bool CreatePinned(CreateParams* params //!< special parameters for resource allocation
+                    );
+
+  /*! \brief Creates a PAL SVM object, associated with the resource
+  *
+  *  \return True if we successfully created a PAL SVM resource
+  */
+  bool CreateSvm(CreateParams* params,  //!< special parameters for resource allocation
+                 Pal::gpusize svmPtr    //!< presumably the GPU virtual address of the SVM allocation - TODO confirm
+                 );
+
  uint elementSize_;  //!< Size of a single element in bytes

 private:
@@ -424,6 +454,7 @@ class Resource : public amd::HeapObject {
  uint32_t curRename_;          //!< Current active rename in the list
  RenameList renames_;          //!< Rename resource list
  GpuMemoryReference* memRef_;  //!< PAL resource reference
+  Pal::gpusize  subOffset_;     //!< GPU memory offset in the original resource
  const Resource* viewOwner_;   //!< GPU resource, which owns this view
  void* glInteropMbRes_;        //!< Mb Res handle
  uint32_t glType_;             //!< GL interop type
@@ -438,26 +469,50 @@ class Resource : public amd::HeapObject {

  uint32_t* hwState_;  //!< HW state for image object
  uint64_t hwSrd_;     //!< GPU pointer to HW SRD
+
+  //! Note: Access to the events are thread safe.
+  mutable std::vector<GpuEvent> events_;  //!< GPU events associated with the resource
+};
+
+typedef Util::BuddyAllocator<Device> MemBuddyAllocator;
+
+class MemorySubAllocator : public amd::HeapObject {
+public:
+  MemorySubAllocator(Device* device) : device_(device) {}  //!< Only stores \p device
+
+  ~MemorySubAllocator();
+
+  GpuMemoryReference*  Allocate(Pal::gpusize size,
+    Pal::gpusize alignment, Pal::gpusize* offset);  // presumably writes the suballocation offset to *offset - TODO confirm
+  bool Free(GpuMemoryReference* ref, Pal::gpusize offset);  // presumably false if ref is not managed here - TODO confirm
+
+private:
+  Device* device_;  //!< Device passed to the constructor
+  std::map<GpuMemoryReference*, MemBuddyAllocator*>  mem_heap_;  //!< Buddy allocator keyed by GPU memory reference
+};

 class ResourceCache : public amd::HeapObject {
 public:
  //! Default constructor
-  ResourceCache(size_t cacheSizeLimit)
-      : lockCacheOps_("PAL resource cache", true), cacheSize_(0), cacheSizeLimit_(cacheSizeLimit) {}
+  ResourceCache(Device* device, size_t cacheSizeLimit)
+      : lockCacheOps_("PAL resource cache", true)
+      , cacheSize_(0)
+      , cacheSizeLimit_(cacheSizeLimit)
+      , memSubAllocLocal_(device) {}

  //! Default destructor
  ~ResourceCache();

-  //! Adds a CAL resource to the cache
-  bool addGpuMemory(Resource::Descriptor* desc,  //!< Resource descriptor - cache key
-                    GpuMemoryReference* ref      //!< Resource reference
+  //! Adds a PAL resource to the cache
+  bool addGpuMemory(Resource::Descriptor* desc,   //!< Resource descriptor - cache key
+                    GpuMemoryReference*   ref,    //!< Resource reference
+                    Pal::gpusize          offset  //!< Original resource offset
                    );

-  //! Finds a CAL resource from the cache
+  //! Finds a PAL resource from the cache
  GpuMemoryReference* findGpuMemory(
      Resource::Descriptor* desc,  //!< Resource descriptor - cache key
-      Pal::gpusize size, Pal::gpusize alignment);
+      Pal::gpusize size, Pal::gpusize alignment, Pal::gpusize* offset);

  //! Destroys cache
  bool free(size_t minCacheEntries = 0);
@@ -477,8 +532,10 @@ class ResourceCache : public amd::HeapObject {
  size_t cacheSize_;            //!< Current cache size in bytes
  const size_t cacheSizeLimit_; //!< Cache size limit in bytes

-  //! CAL resource cache
+  //! PAL resource cache
  std::list<std::pair<Resource::Descriptor*, GpuMemoryReference*> > resCache_;
+
+  MemorySubAllocator  memSubAllocLocal_;  //!< Allocator for suballocations in Local
 };

 /*@}*/} // namespace pal
@@ -138,6 +138,12 @@ Settings::Settings() {
  rgpSqttDispCount_ = PAL_RGP_DISP_COUNT;
  rgpSqttWaitIdle_ = true;
  rgpSqttForceDisable_ = false;
+
+  // Sub allocation parameters
+  subAllocationMinSize_ = 4 * Ki;
+  subAllocationChunkSize_ = 64 * Mi;
+  subAllocationMaxSize_ =
+    std::min(static_cast<uint64_t>(GPU_MAX_SUBALLOC_SIZE) * Ki, subAllocationChunkSize_);
 }

 bool Settings::create(const Pal::DeviceProperties& palProp,
@@ -98,6 +98,10 @@ class Settings : public device::Settings {
  uint64_t maxAllocSize_;      //!< Maximum single allocation size
  uint rgpSqttDispCount_;      //!< The number of dispatches captured in SQTT

+  uint64_t subAllocationMinSize_;   //!< Minimum size allowed for suballocations
+  uint64_t subAllocationMaxSize_;   //!< Maximum size allowed with suballocations
+  uint64_t subAllocationChunkSize_; //!< Chunk size for suballocations
+  
  amd::LibrarySelector libSelector_;  //!< Select linking libraries for compiler

  //! Default constructor
@@ -409,7 +409,7 @@ void VirtualGPU::MemoryDependency::validate(VirtualGPU& gpu, const Memory* memor
  }

  uint64_t curStart = memory->vmAddress();
-  uint64_t curEnd = curStart + memory->vmSize();
+  uint64_t curEnd = curStart + memory->size();

  // Loop through all memory objects in the queue and find dependency
  // @note don't include objects from the current kernel
@@ -1974,6 +1974,7 @@ void VirtualGPU::PostDeviceEnqueue(
    uint64_t vmParentWrap,
    GpuEvent* gpuEvent)
 {
+  uint32_t id  = gpuEvent->id;
  amd::DeviceQueue* defQueue = kernel.program().context().defDeviceQueue(dev());

  // Make sure exculsive access to the device queue
@@ -2055,6 +2056,9 @@ void VirtualGPU::PostDeviceEnqueue(
    iCmd()->CmdVirtualQueueHandshake(vmParentWrap + offsetof(AmdAqlWrap, state), AQL_WRAP_DONE,
      vmParentWrap + offsetof(AmdAqlWrap, child_counter),
      signalAddr, dev().settings().useDeviceQueue_);
+    if (id != gpuEvent->id) {
+        LogError("Something is wrong. ID mismatch!\n");
+    }
    eventEnd(MainEngine, *gpuEvent);
  }

@@ -2203,6 +2207,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
    if (profiling() || state_.profileEnabled_) {
      addBarrier();
    }
+    if (id != gpuEvent.id) {
+      LogError("Something is wrong. ID mismatch!\n");
+    }
    eventEnd(MainEngine, gpuEvent);

    // Execute scheduler for device enqueue
@@ -2210,9 +2217,6 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
      PostDeviceEnqueue(kernel, hsaKernel, gpuDefQueue, vmDefQueue, vmParentWrap, &gpuEvent);
    }

-    if (id != gpuEvent.id) {
-      LogError("Something is wrong. ID mismatch!\n");
-    }
    // Update the global GPU event
    setGpuEvent(gpuEvent, needFlush);

@@ -2266,7 +2270,7 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
  }
 }

-void VirtualGPU::releaseMemory(GpuMemoryReference* mem, GpuEvent* event) {
+void VirtualGPU::releaseMemory(GpuMemoryReference* mem) {
  queues_[MainEngine]->removeCmdMemRef(mem);
  queues_[SdmaEngine]->removeCmdMemRef(mem);
 }
@@ -314,7 +314,7 @@ class VirtualGPU : public device::VirtualDevice {
  virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
  virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);

-  void releaseMemory(GpuMemoryReference* mem, GpuEvent* event);
+  void releaseMemory(GpuMemoryReference* mem);

  void flush(amd::Command* list = nullptr, bool wait = false);
  bool terminate() { return true; }
@@ -86,6 +86,8 @@ release(size_t, GPU_PINNED_MIN_XFER_SIZE, 512,                                \
        "The minimal buffer size for pinned read/write transfers in KBytes")  \
 release(size_t, GPU_RESOURCE_CACHE_SIZE, 64,                                  \
        "The resource cache size in MB")                                      \
+release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096,                                  \
+        "The maximum size accepted for suballocaitons in KB")                 \
 release(uint, GPU_ASYNC_MEM_COPY, 0,                                          \
        "Enables async memory transfers with DRM engine")                     \
 release(bool, GPU_FORCE_64BIT_PTR, 0,                                         \