P4 to Git Change 1410373 by gandryey@gera-w8 on 2017/05/16 17:16:52

SWDEV-120180 - [amdgpu-pro] OpenCL support for SSG - Add initial support of DGMA memory under ROCr backend. - The implementation requires amdgpu-pro stack initialization and memory allocation. - An interop with HSA device is created for ROCr access Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/Makefile#10 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/build/Makefile.api#153 edit ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_lqdflash_amd.cpp#18 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#285 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#351 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/build/Makefile.oclrocm#16 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/pro/lnxheaders.h#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/pro/prodevice.cpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/pro/prodevice.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/pro/prodriver.hpp#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#51 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#21 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#19 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocsettings.cpp#18 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#38 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/platform/memory.hpp#101 edit ... //depot/stg/opencl/drivers/opencl/runtime/runtimedefs#41 edit
2017-05-16 17:22:59 -04:00
@@ -157,7 +157,7 @@ static const char* OclExtensionsString[] = {"cl_khr_fp64 ",
                                            "cl_khr_mipmap_image ",
                                            "cl_khr_mipmap_image_writes ",
                                            "",
-                                            (IS_LINUX) ? "" : "cl_amd_liquid_flash ",
+                                            "cl_amd_liquid_flash ",
                                            NULL};

 namespace device {
@@ -313,6 +313,7 @@ bool Settings::create(const CALdeviceattribs& calAttr, bool reportAsOCL12Device,
      maxWorkloadTime_ = modifyMaxWorkload.time;
    }
  }
+  enableExtension(ClAMDLiquidFlash);
 #endif  // defined(_WIN32)

  // Enable atomics support
@@ -332,7 +333,6 @@ bool Settings::create(const CALdeviceattribs& calAttr, bool reportAsOCL12Device,
  // Enable some platform extensions
  enableExtension(ClAmdDeviceAttributeQuery);
  enableExtension(ClKhrSpir);
-  enableExtension(ClAMDLiquidFlash);

  hwLDSSize_ = 32 * Ki;

@@ -0,0 +1,30 @@
+//
+// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#pragma once
+
+// NOTE: Some of the Linux driver stack's headers don't wrap their C-style interface names in 'extern "C" { ... }'
+// blocks when building with a C++ compiler, so we need to add that ourselves.
+#if __cplusplus
+extern "C"
+{
+#endif
+
+#include <amdgpu.h>
+#include <amdgpu_drm.h>
+#include <amdgpu_shared.h>
+#include <xf86drm.h>
+#include <xf86drmMode.h>
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+constexpr int32_t InvalidFd = -1; // Value representing an invalid file descriptor on Linux
+
+#if __cplusplus
+} // extern "C"
+#endif
@@ -0,0 +1,162 @@
+//
+// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef WITHOUT_HSA_BACKEND
+
+#include "hsa_ext_amd.h"
+#include "lnxheaders.h"
+#include "prodevice.hpp"
+#include "amdgpu_drm.h"
+
+namespace roc {
+
+// Capacity of the device array handed to drmGetDevices() in ProDevice::Create()
+constexpr uint32_t kMaxDevices  = 32;
+// PCI vendor id that ProDevice::Create() requires of a device (the AMD/ATI id)
+constexpr uint32_t kAtiVendorId = 0x1002;
+
+// ================================================================================================
+// Factory for the amdgpu-pro device interface. Builds a ProDevice for the PCI location
+// bus/dev/func and hands ownership to the caller; yields nullptr when the object can't be
+// allocated or ProDevice::Create() reports failure.
+IProDevice* IProDevice::Init(uint32_t bus, uint32_t dev, uint32_t func)
+{
+  ProDevice* instance = new ProDevice();
+  if (nullptr == instance) {
+    return nullptr;
+  }
+
+  if (instance->Create(bus, dev, func)) {
+    return instance;
+  }
+
+  // Creation failed: the half-initialized object is of no use to the caller
+  delete instance;
+  return nullptr;
+}
+
+// ================================================================================================
+// Releases the allocation lock, then the amdgpu device handle and finally the descriptor the
+// handle was created from.
+ProDevice::~ProDevice() {
+  delete alloc_ops_;
+
+  // Tear down the amdgpu device before the descriptor it was initialized with goes away
+  if (nullptr != dev_handle_) {
+    amdgpu_device_deinitialize(dev_handle_);
+  }
+
+  // Only positive descriptors are treated as opened by this class
+  if (0 < file_desc_) {
+    close(file_desc_);
+  }
+}
+
+// Fallback definition for driver headers that don't provide the SSG capability bit.
+// NOTE(review): the value 4 presumably mirrors the amdgpu-pro kernel header - confirm.
+#ifndef AMDGPU_CAPABILITY_SSG_FLAG
+#define AMDGPU_CAPABILITY_SSG_FLAG 4
+#endif
+
+// ================================================================================================
+// Opens the DRM render node of the AMD GPU at the given PCI location, initializes the amdgpu
+// device on top of it and verifies that the driver reports both DGMA and SSG capabilities.
+//
+// \param bus     PCI bus number of the device to open
+// \param device  PCI device number of the device to open
+// \param func    PCI function number of the device to open
+// \return true only if a matching device with DGMA and SSG support was initialized and the
+//         allocation lock was created. On failure an already opened descriptor/device handle
+//         stays in the members and is released by the destructor.
+bool ProDevice::Create(uint32_t bus, uint32_t device, uint32_t func) {
+  drmDevicePtr  devices[kMaxDevices] = { };
+  const int32_t device_count = drmGetDevices(devices, kMaxDevices);
+  bool          result = false;
+
+  // A negative count is an error code from drmGetDevices(); the loop is simply skipped then
+  for (int32_t i = 0; i < device_count; i++) {
+    // The PCI members of businfo/deviceinfo are only meaningful for PCI devices
+    if (devices[i]->bustype != DRM_BUS_PCI) {
+      continue;
+    }
+    // Check if the device vendor is AMD
+    if (devices[i]->deviceinfo.pci->vendor_id != kAtiVendorId) {
+      continue;
+    }
+    if ((devices[i]->businfo.pci->bus != bus) ||
+        (devices[i]->businfo.pci->dev != device) ||
+        (devices[i]->businfo.pci->func != func)) {
+      continue;
+    }
+
+    // Using render node here so that we can do the off-screen rendering without authentication
+    file_desc_ = open(devices[i]->nodes[DRM_NODE_RENDER], O_RDWR, 0);
+    if (file_desc_ <= 0) {
+      continue;
+    }
+
+    // Initialize the amdgpu device.
+    if (amdgpu_device_initialize(file_desc_, &major_ver_, &minor_ver_, &dev_handle_) != 0) {
+      continue;
+    }
+
+    // The result isn't checked: gpu_info_ is informational only at this point
+    amdgpu_query_gpu_info(dev_handle_, &gpu_info_);
+
+    // cap stays zeroed if the query fails, which makes the capability test below fail as well
+    drm_amdgpu_capability cap = {};
+    amdgpu_query_info(dev_handle_, AMDGPU_INFO_CAPABILITY, sizeof(drm_amdgpu_capability), &cap);
+
+    // Check if DGMA and SSG are available
+    const auto required_caps = AMDGPU_CAPABILITY_DIRECT_GMA_FLAG | AMDGPU_CAPABILITY_SSG_FLAG;
+    if ((cap.flag & required_caps) == required_caps) {
+      result = true;
+      break;
+    }
+  }
+
+  // The device list was allocated by libdrm and has to be handed back to it
+  if (device_count > 0) {
+    drmFreeDevices(devices, device_count);
+  }
+
+  if (result) {
+    alloc_ops_ = new amd::Monitor("DGMA mem alloc lock", true);
+    if (nullptr == alloc_ops_) {
+      // Alloc/FreeDmaBuffer lock on alloc_ops_, so the device is unusable without it
+      return false;
+    }
+  }
+
+  return result;
+}
+
+// ================================================================================================
+// Allocates a buffer in the DGMA heap, maps it into the given HSA agent and maps it for CPU
+// access. The allocation is recorded in allocs_ so that FreeDmaBuffer() can undo every step.
+//
+// \param agent     HSA agent the buffer is mapped to
+// \param size      Requested allocation size in bytes
+// \param host_ptr  [out] Receives the CPU mapping; set to nullptr on failure
+// \return The address produced by hsa_amd_interop_map_buffer(), or nullptr if any step failed.
+//         Nothing stays allocated or mapped on failure.
+void* ProDevice::AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr) const
+{
+  amd::ScopedLock l(alloc_ops_);
+  *host_ptr = nullptr;
+
+  amdgpu_bo_alloc_request req = {0};
+  req.alloc_size = size;
+  req.phys_alignment = 64 * Ki;
+  req.preferred_heap = AMDGPU_GEM_DOMAIN_DGMA;
+
+  // Allocate buffer in DGMA heap
+  amdgpu_bo_handle buf_handle = nullptr;
+  if (0 != amdgpu_bo_alloc(dev_handle_, &req, &buf_handle)) {
+    return nullptr;
+  }
+
+  // Export the buffer as a dma-buf descriptor for the interop below
+  uint32_t shared_handle = 0;
+  if (0 != amdgpu_bo_export(buf_handle, amdgpu_bo_handle_type_dma_buf_fd, &shared_handle)) {
+    amdgpu_bo_free(buf_handle);
+    return nullptr;
+  }
+
+  // Map memory object to HSA device
+  void*           ptr = nullptr;
+  const uint32_t  flags = 0;
+  size_t          buf_size = 0;
+  if (0 != hsa_amd_interop_map_buffer(1, &agent, shared_handle,
+                                      flags, &buf_size, &ptr, nullptr, nullptr)) {
+    close(shared_handle);
+    amdgpu_bo_free(buf_handle);
+    return nullptr;
+  }
+
+  // Ask GPUPro driver to provide CPU access to allocation
+  if (0 != amdgpu_bo_cpu_map(buf_handle, host_ptr)) {
+    hsa_amd_interop_unmap_buffer(ptr);
+    close(shared_handle);
+    amdgpu_bo_free(buf_handle);
+    // The device mapping is gone, so the caller must not receive its address
+    *host_ptr = nullptr;
+    return nullptr;
+  }
+
+  allocs_.insert(std::pair<void*, std::pair<amdgpu_bo_handle, uint32_t>>(
+                 ptr, std::pair<amdgpu_bo_handle, uint32_t>(buf_handle, shared_handle)));
+  return ptr;
+}
+
+// ================================================================================================
+// Releases a buffer returned by AllocDmaBuffer(): drops the CPU mapping, the HSA mapping, the
+// exported handle and finally the buffer object. Pointers not found in allocs_ are ignored.
+void ProDevice::FreeDmaBuffer(void* ptr) const
+{
+  amd::ScopedLock l(alloc_ops_);
+
+  auto entry = allocs_.find(ptr);
+  if (entry == allocs_.end()) {
+    return;
+  }
+
+  amdgpu_bo_handle bo = entry->second.first;
+  const uint32_t   exported = entry->second.second;
+
+  amdgpu_bo_cpu_unmap(bo);
+  // Unmap memory from HSA device
+  hsa_amd_interop_unmap_buffer(ptr);
+  // Close shared handle
+  close(exported);
+  amdgpu_bo_free(bo);
+
+  allocs_.erase(entry);
+}
+
+}
+
+#endif  // WITHOUT_HSA_BACKEND
+
@@ -0,0 +1,53 @@
+//
+// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#pragma once
+
+#ifndef WITHOUT_HSA_BACKEND
+
+#include "prodriver.hpp"
+#include "thread/monitor.hpp"
+#include <map>
+
+/*! \addtogroup HSA
+ *  @{
+ */
+
+//! HSA Device Implementation
+namespace roc {
+
+//! IProDevice implementation on top of the amdgpu driver interface
+class ProDevice : public IProDevice {
+public:
+  //! Constructs an unopened device; Create() performs the actual initialization.
+  //! The device handle starts as nullptr because the destructor tests it, and Create()
+  //! may fail before amdgpu_device_initialize() ever writes it.
+  ProDevice()
+    : file_desc_(0)
+    , major_ver_(0)
+    , minor_ver_(0)
+    , cp_ver_(0)
+    , dev_handle_(nullptr)
+    , gpu_info_()
+    , heap_info_()
+    , alloc_ops_(nullptr) {}
+  virtual ~ProDevice() override;
+
+  //! Opens and initializes the device at the given PCI location.
+  //! Returns false if no suitable device could be set up.
+  bool Create(uint32_t bus, uint32_t device, uint32_t func);
+
+  //! Allocates a DGMA buffer mapped to \a agent; \a host_ptr receives the CPU mapping.
+  //! Returns nullptr on failure.
+  virtual void* AllocDmaBuffer(
+      hsa_agent_t agent, size_t size, void** host_ptr) const override;
+  //! Releases a buffer previously returned by AllocDmaBuffer()
+  virtual void FreeDmaBuffer(void* ptr) const override;
+
+private:
+  int32_t               file_desc_;   //!< File descriptor for the device
+  uint32_t              major_ver_;   //!< Major driver version
+  uint32_t              minor_ver_;   //!< Minor driver version
+  uint32_t              cp_ver_;      //!< CP ucode version
+  amdgpu_device_handle  dev_handle_;  //!< AMD gpu device handle
+  amdgpu_gpu_info       gpu_info_;    //!< GPU info structure
+  amdgpu_heap_info      heap_info_;   //!< Information about memory
+  //! Allocated memory mapping: device pointer -> (buffer object, exported shared handle)
+  mutable std::map<void*, std::pair<amdgpu_bo_handle, uint32_t>> allocs_;
+  amd::Monitor*         alloc_ops_;   //!< Serializes memory allocations/destructions
+};
+
+}  // namespace roc
+
+/**
+ * @}
+ */
+#endif /*WITHOUT_HSA_BACKEND*/
@@ -0,0 +1,35 @@
+//
+// Copyright (c) 2017 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#pragma once
+
+#ifndef WITHOUT_HSA_BACKEND
+
+#include "top.hpp"
+#include "hsa.h"
+
+/*! \addtogroup HSA
+ *  @{
+ */
+
+namespace roc {
+
+//! Interface to the amdgpu-pro device functionality used by the ROCr backend
+class IProDevice : public amd::HeapObject {
+public:
+  IProDevice() {}
+
+  //! Factory: creates the device object for the given PCI bus/device/function
+  static IProDevice* Init(uint32_t bus, uint32_t device, uint32_t func);
+
+  //! Allocates a DMA buffer for \a agent; \a host_ptr receives the host address
+  virtual void* AllocDmaBuffer(hsa_agent_t agent, size_t size, void** host_ptr) const = 0;
+
+  //! Frees a buffer obtained from AllocDmaBuffer()
+  virtual void FreeDmaBuffer(void* ptr) const = 0;
+
+  virtual ~IProDevice() {}
+};
+
+}  // namespace roc
+
+/**
+ * @}
+ */
+#endif /*WITHOUT_HSA_BACKEND*/
@@ -25,6 +25,7 @@
 #endif  // !defined(WITH_LIGHTNING_COMPILER)
 #include "device/rocm/rocmemory.hpp"
 #include "device/rocm/rocglinterop.hpp"
+#include "pro/prodriver.hpp"
 #include <cstring>
 #include <fstream>
 #include <sstream>
@@ -111,16 +112,18 @@ bool NullDevice::create(const AMDDeviceInfo& deviceInfo) {
 }

 Device::Device(hsa_agent_t bkendDevice)
-    : mapCacheOps_(nullptr),
-      mapCache_(nullptr),
-      _bkendDevice(bkendDevice),
-      gpuvm_segment_max_alloc_(0),
-      alloc_granularity_(0),
-      context_(nullptr),
-      xferQueue_(nullptr),
-      xferRead_(nullptr),
-      xferWrite_(nullptr),
-      numOfVgpus_(0) {
+    : mapCacheOps_(nullptr)
+    , mapCache_(nullptr)
+    , _bkendDevice(bkendDevice)
+    , gpuvm_segment_max_alloc_(0)
+    , alloc_granularity_(0)
+    , context_(nullptr)
+    , xferQueue_(nullptr)
+    , xferRead_(nullptr)
+    , xferWrite_(nullptr)
+    , pro_device_(nullptr)
+    , pro_ena_(false)
+    , numOfVgpus_(0) {
  group_segment_.handle = 0;
  system_segment_.handle = 0;
  system_coarse_segment_.handle = 0;
@@ -128,6 +131,10 @@ Device::Device(hsa_agent_t bkendDevice)
 }

 Device::~Device() {
+#ifdef WITH_AMDGPU_PRO
+  delete pro_device_;
+#endif
+
  // Release cached map targets
  for (uint i = 0; mapCache_ != nullptr && i < mapCache_->size(); ++i) {
    if ((*mapCache_)[i] != nullptr) {
@@ -474,16 +481,22 @@ bool Device::init() {

    roc_device->deviceInfo_.gfxipVersion_ = major * 100 + minor * 10 + stepping;

-    if (!roc_device->mapHSADeviceToOpenCLDevice(agent)) {
-      LogError("Failed mapping of HsaDevice to Device.");
-      continue;
-    }
-
    if (!roc_device->create()) {
      LogError("Error creating new instance of Device.");
      continue;
    }

+    // Setup System Memory to be Non-Coherent per user
+    // request via environment variable. By default the
+    // System Memory is setup to be Coherent
+    if (roc_device->settings().enableNCMode_) {
+      hsa_status_t err = hsa_amd_coherency_set_type(agent, HSA_AMD_COHERENCY_TYPE_NONCOHERENT);
+      if (err != HSA_STATUS_SUCCESS) {
+        LogError("Unable to set NC memory policy!");
+        continue;
+      }
+    }
+
    if (selectedDevices[ordinal++] &&
        (flagIsDefault(GPU_DEVICE_NAME) || GPU_DEVICE_NAME == 0 || GPU_DEVICE_NAME[0] == '\0' ||
         !strcmp(GPU_DEVICE_NAME, roc_device->info_.name_))) {
@@ -500,10 +513,71 @@ void Device::tearDown() {
 }

 bool Device::create() {
+  if (HSA_STATUS_SUCCESS !=
+      hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_PROFILE, &agent_profile_)) {
+    return false;
+  }
+
+  // Create HSA settings
+  settings_ = new Settings();
+  roc::Settings* hsaSettings = static_cast<roc::Settings*>(settings_);
+  if ((hsaSettings == nullptr) ||
+      !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), deviceInfo_.gfxipVersion_)) {
+    return false;
+  }
+
  if (!amd::Device::create()) {
    return false;
  }

+  uint32_t hsa_bdf_id = 0;
+  if (HSA_STATUS_SUCCESS !=
+      hsa_agent_get_info(_bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &hsa_bdf_id)) {
+    return false;
+  }
+
+  info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD;
+  info_.deviceTopology_.pcie.bus = (hsa_bdf_id & (0xFF << 8)) >> 8;
+  info_.deviceTopology_.pcie.device = (hsa_bdf_id & (0x1F << 3)) >> 3;
+  info_.deviceTopology_.pcie.function = (hsa_bdf_id & 0x07);
+
+#ifdef WITH_AMDGPU_PRO
+  // Create amdgpu-pro device interface for SSG support
+  pro_device_ = IProDevice::Init(
+      info_.deviceTopology_.pcie.bus,
+      info_.deviceTopology_.pcie.device,
+      info_.deviceTopology_.pcie.function);
+  if (pro_device_ != nullptr) {
+    pro_ena_ = true;
+    settings_->enableExtension(ClAMDLiquidFlash);
+  }
+#endif
+
+  if (populateOCLDeviceConstants() == false) {
+    return false;
+  }
+
+#if defined(WITH_LIGHTNING_COMPILER)
+  //  create compilation object with cache support
+  int gfxipMajor = deviceInfo_.gfxipVersion_ / 100;
+  int gfxipMinor = deviceInfo_.gfxipVersion_ / 10 % 10;
+  int gfxipStepping = deviceInfo_.gfxipVersion_ % 10;
+
+  // Use compute capability as target (AMD:AMDGPU:major:minor:stepping)
+  // with dash as delimiter to be compatible with Windows directory name
+  std::ostringstream cacheTarget;
+  cacheTarget << "AMD-AMDGPU-" << gfxipMajor << "-" << gfxipMinor << "-" << gfxipStepping;
+
+  amd::CacheCompilation* compObj = new amd::CacheCompilation(
+      cacheTarget.str(), "_rocm", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET);
+  if (!compObj) {
+    LogError("Unable to create cache compilation object!");
+    return false;
+  }
+
+  cacheCompilation_.reset(compObj);
+#endif
+
  amd::Context::Info info = {0};
  std::vector<amd::Device*> devices;
  devices.push_back(this);
@@ -568,59 +642,6 @@ device::Program* Device::createProgram(amd::option::Options* options) {
  return new roc::HSAILProgram(*this);
 }

-bool Device::mapHSADeviceToOpenCLDevice(hsa_agent_t dev) {
-  if (HSA_STATUS_SUCCESS !=
-      hsa_agent_get_info(_bkendDevice, HSA_AGENT_INFO_PROFILE, &agent_profile_)) {
-    return false;
-  }
-
-  // Create HSA settings
-  settings_ = new Settings();
-  roc::Settings* hsaSettings = static_cast<roc::Settings*>(settings_);
-  if ((hsaSettings == nullptr) ||
-      !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), deviceInfo_.gfxipVersion_)) {
-    return false;
-  }
-
-  if (populateOCLDeviceConstants() == false) {
-    return false;
-  }
-
-  // Setup System Memory to be Non-Coherent per user
-  // request via environment variable. By default the
-  // System Memory is setup to be Coherent
-  if (hsaSettings->enableNCMode_) {
-    hsa_status_t err = hsa_amd_coherency_set_type(dev, HSA_AMD_COHERENCY_TYPE_NONCOHERENT);
-    if (err != HSA_STATUS_SUCCESS) {
-      LogError("Unable to set NC memory policy!");
-      return false;
-    }
-  }
-
-#if defined(WITH_LIGHTNING_COMPILER)
-  //  create compilation object with cache support
-  int gfxipMajor = deviceInfo_.gfxipVersion_ / 100;
-  int gfxipMinor = deviceInfo_.gfxipVersion_ / 10 % 10;
-  int gfxipStepping = deviceInfo_.gfxipVersion_ % 10;
-
-  // Use compute capability as target (AMD:AMDGPU:major:minor:stepping)
-  // with dash as delimiter to be compatible with Windows directory name
-  std::ostringstream cacheTarget;
-  cacheTarget << "AMD-AMDGPU-" << gfxipMajor << "-" << gfxipMinor << "-" << gfxipStepping;
-
-  amd::CacheCompilation* compObj = new amd::CacheCompilation(
-      cacheTarget.str(), "_rocm", OCL_CODE_CACHE_ENABLE, OCL_CODE_CACHE_RESET);
-  if (!compObj) {
-    LogError("Unable to create cache compilation object!");
-    return false;
-  }
-
-  cacheCompilation_.reset(compObj);
-#endif
-
-  return true;
-}
-
 hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, void* data) {
  if (data == nullptr) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
@@ -734,16 +755,6 @@ bool Device::populateOCLDeviceConstants() {

  info_.type_ = CL_DEVICE_TYPE_GPU;

-  uint32_t hsa_bdf_id = 0;
-  if (HSA_STATUS_SUCCESS !=
-      hsa_agent_get_info(_bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &hsa_bdf_id)) {
-    return false;
-  }
-
-  info_.deviceTopology_.pcie.type = CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD;
-  info_.deviceTopology_.pcie.bus = (hsa_bdf_id & (0xFF << 8)) >> 8;
-  info_.deviceTopology_.pcie.device = (hsa_bdf_id & (0x1F << 3)) >> 3;
-  info_.deviceTopology_.pcie.function = (hsa_bdf_id & 0x07);
  info_.extensions_ = getExtensionString();
  info_.nativeVectorWidthDouble_ = info_.preferredVectorWidthDouble_ =
      (settings().doublePrecision_) ? 1 : 0;
@@ -58,6 +58,7 @@ class Memory;
 class Resource;
 class VirtualDevice;
 class PrintfDbg;
+class IProDevice;

 // A NULL Device type used only for offline compilation
 // Only functions that are used for compilation will be in this device
@@ -276,8 +277,6 @@ class Device : public NullDevice {
  //! Destructor for the physical HSA device
  virtual ~Device();

-  bool mapHSADeviceToOpenCLDevice(hsa_agent_t hsadevice);
-
  // Temporary, delete it later when HSA Runtime and KFD is fully fucntional.
  void fake_device();

@@ -388,6 +387,10 @@ class Device : public NullDevice {

  amd::Context& context() const { return *context_; }

+  // Returns AMD GPU Pro interfaces
+  const IProDevice& iPro() const { return *pro_device_; }
+  bool ProEna() const  { return pro_ena_; }
+
 private:
  static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;

@@ -415,6 +418,8 @@ class Device : public NullDevice {

  XferBuffers* xferRead_;   //!< Transfer buffers read
  XferBuffers* xferWrite_;  //!< Transfer buffers write
+  const IProDevice* pro_device_;  //!< AMDGPUPro device
+  bool  pro_ena_;           //!< Extra functionality with AMDGPUPro device, beyond ROCr

 public:
  amd::Atomic<uint> numOfVgpus_;  //!< Virtual gpu unique index
@@ -20,6 +20,7 @@
 #include "platform/memory.hpp"
 #include "platform/sampler.hpp"
 #include "amdocl/cl_gl_amd.hpp"
+#include "pro/prodriver.hpp"

 namespace roc {

@@ -548,7 +549,12 @@ void Buffer::destroy() {
  }

  const cl_mem_flags memFlags = owner()->getMemFlags();
-
+#ifdef WITH_AMDGPU_PRO
+  if ((memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) && dev().ProEna()) {
+    dev().iPro().FreeDmaBuffer(deviceMemory_);
+    return;
+  }
+#endif
  if ((deviceMemory_ != nullptr) && (deviceMemory_ != owner()->getHostMem())) {
    // if they are identical, the host pointer will be
    // deallocated later on => avoid double deallocation
@@ -611,6 +617,20 @@ bool Buffer::create() {

  // Allocate backing storage in device local memory unless UHP or AHP are set
  const cl_mem_flags memFlags = owner()->getMemFlags();
+
+#ifdef WITH_AMDGPU_PRO
+  if ((memFlags & CL_MEM_USE_PERSISTENT_MEM_AMD) && dev().ProEna()) {
+    void* host_ptr = nullptr;
+    deviceMemory_ = dev().iPro().AllocDmaBuffer(dev().getGpuAgents()[0], size(), &host_ptr);
+    if (deviceMemory_ == nullptr) {
+      return false;
+    }
+    flags_ |= HostMemoryDirectAccess;
+    owner()->setHostMem(host_ptr);
+    return true;
+  }
+#endif
+
  if (!(memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))) {
    deviceMemory_ = dev().deviceLocalAlloc(size());

@@ -95,6 +95,7 @@ bool Settings::create(bool fullProfile, int gfxipVersion) {
  enableExtension(ClKhr3DImageWrites);
  enableExtension(ClAmdMediaOps);
  enableExtension(ClAmdMediaOps2);
+  enableExtension(ClAMDLiquidFlash);
  if (MesaInterop::Supported()) {
    enableExtension(ClKhrGlSharing);
  }
@@ -1757,4 +1757,56 @@ amd::Memory* VirtualGPU::findPinnedMem(void* addr, size_t size) {
 }

 void VirtualGPU::enableSyncBlit() const { blitMgr_->enableSynchronization(); }
+
+//! Transfers data between a liquid flash file and a buffer through a staging buffer.
+//!
+//! CL_COMMAND_READ_SSG_FILE_AMD reads each chunk from the file into the staging buffer and
+//! then copies it into the destination buffer; CL_COMMAND_WRITE_SSG_FILE_AMD copies each chunk
+//! from the buffer into staging first and then writes it to the file. The work is split into
+//! chunks of at most TransferBufferFileCommand::StagingBufferSize bytes.
+//!
+//! On any failure (staging map, file transfer or device copy) the command status is set to
+//! CL_INVALID_OPERATION and the transfer stops; the staging buffer is never left mapped.
+void VirtualGPU::submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd) {
+  size_t copySize = cmd.size()[0];
+  size_t fileOffset = cmd.fileOffset();
+  size_t bufOffset = cmd.origin()[0];
+  Memory* mem = dev().getRocMemory(&cmd.memory());
+  const uint idx = 0;  // only the first staging buffer is used
+
+  assert((cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD) ||
+         (cmd.type() == CL_COMMAND_WRITE_SSG_FILE_AMD));
+  // Reading the file means writing the buffer
+  const bool writeBuffer(cmd.type() == CL_COMMAND_READ_SSG_FILE_AMD);
+
+  while (copySize > 0) {
+    Memory* staging = dev().getRocMemory(&cmd.staging(idx));
+    // Copy the constant into a local first: std::min takes its arguments by reference
+    size_t chunkSize = amd::TransferBufferFileCommand::StagingBufferSize;
+    chunkSize = std::min(chunkSize, copySize);
+
+    // File write: the chunk has to reach the staging buffer before it can go to the file
+    if (!writeBuffer &&
+        !blitMgr().copyBuffer(*mem, *staging, bufOffset, 0, chunkSize, false)) {
+      cmd.setStatus(CL_INVALID_OPERATION);
+      return;
+    }
+
+    void* stagingPtr = staging->cpuMap(*this);
+    if (stagingPtr == nullptr) {
+      cmd.setStatus(CL_INVALID_OPERATION);
+      return;
+    }
+    const bool transferred = cmd.file()->transferBlock(writeBuffer, stagingPtr, staging->size(),
+                                                       fileOffset, 0, chunkSize);
+    // Unmap on both the success and the failure path
+    staging->cpuUnmap(*this);
+    if (!transferred) {
+      cmd.setStatus(CL_INVALID_OPERATION);
+      return;
+    }
+
+    // File read: push the freshly read chunk from staging into the destination buffer
+    if (writeBuffer &&
+        !blitMgr().copyBuffer(*staging, *mem, 0, bufOffset, chunkSize, false)) {
+      cmd.setStatus(CL_INVALID_OPERATION);
+      return;
+    }
+
+    fileOffset += chunkSize;
+    bufOffset += chunkSize;
+    copySize -= chunkSize;
+  }
+}
 }  // End of roc namespace
@@ -190,6 +190,7 @@ class VirtualGPU : public device::VirtualDevice {
  virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
  virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
  virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
+  virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);

  void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) {}
  void submitThreadTrace(amd::ThreadTraceCommand& vcmd) {}
@@ -595,7 +595,7 @@ class SvmBuffer : AllStatic {
 //! Liquid flash extension
 class LiquidFlashFile : public RuntimeObject {
 private:
-  const wchar_t* name_;
+  std::wstring name_;
  cl_file_flags_amd flags_;
  void* handle_;
  uint32_t blockSize_;