rocm-systems/wddm/gpu_memory.cpp

#include <sys/stat.h>
#include <cinttypes>
#include <cassert>
#include "impl/wddm/gpu_memory.h"
#include "impl/wddm/device.h"
#include "util/utils.h"

using namespace std;

namespace wsl {
namespace thunk {

size_t GpuMemory::CalcChunkNumbers(gpusize size) {
  const auto chunk_size = WDDMDevice::GpuMemoryChunkSize;
  return (size + chunk_size - 1) / chunk_size;
}

gpusize GpuMemory::AdjustSize(gpusize size) const {
  const auto &device_info = device_->DeviceInfo();

  if (device_info.enable_big_page_alignment && desc_.domain == thunk_proxy::kLocal) {
    uint32_t alignment = device_info.big_page_alignment_size;
    // BigPage is only supported for allocations > bigPageMinAlignment.
    // Also, if bigPageMinAlignment == 0, BigPage optimization is not supported per KMD.
    // We do either LargePage or BigPage alignment, whichever has a higher value.
    if ((device_info.hw_big_page_min_alignment_size > 0) && (size > device_info.hw_big_page_min_alignment_size)) {
      alignment = std::max(alignment, device_info.hw_big_page_min_alignment_size);
      if (size > device_info.hw_big_page_alignment_size)
        alignment = std::max(alignment, device_info.hw_big_page_alignment_size);
    }
    if (alignment > 0)
      size = AlignUp(size, alignment);
  } else {
    const size_t min_size = 4096;
    size = AlignUp(size, min_size);
  }
  return size;
}

GpuMemory::GpuMemory(WDDMDevice *device) : device_(device) {
  num_allocations_ = 0;
  alloc_handles_ptr_ = nullptr;
  alloc_handle_ = 0;
  resource_ = 0;
  mem_fd_ = -1;
}

GpuMemory::~GpuMemory() {
  FreeGpuVirtualAddress(GpuAddress(), Size());
  FreePhysicalMemory();
  if (desc_.handle_ape_addr > 0)
    dxg_runtime->HandleApertureFree(desc_.handle_ape_addr);
}

ErrorCode GpuMemory::Init(const GpuMemoryCreateInfo &create_info) {
  desc_.domain = create_info.domain;
  desc_.adapter_luid = device_->GetLuid();
  desc_.client_size = create_info.size;
  desc_.alignment = create_info.alignment;
  desc_.mem_flags = create_info.mem_flags;
  desc_.engine_flag = create_info.engine_flag;
  desc_.flags.is_virtual = create_info.flags.virtual_alloc;
  desc_.flags.is_physical_only = create_info.flags.physical_only;
  desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
  desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer;
  desc_.flags.is_sysmem_exporter = create_info.flags.sysmem_ipc_sig_exporter;
  desc_.flags.is_va_required = create_info.flags.alloc_va;

  /* we can't tell the allocation is regular vmm or ipc mem at creation stage,
     they share same creation parameters, so forcing all vram allocations to
     sharable to support IPC mem */
  if (create_info.flags.interprocess ||
      desc_.domain == thunk_proxy::AllocDomain::kLocal)
    desc_.flags.is_shared = true;

  desc_.flags.is_locked = create_info.flags.locked;
  desc_.size = AdjustSize(desc_.client_size);

  if (IsUserMemory() || IsSystem())
    desc_.cpu_addr = create_info.user_ptr;

  num_allocations_ = CalcChunkNumbers(Size());
  if (num_allocations_ == 1)
    alloc_handles_ptr_ = &alloc_handle_;
  else
    alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];

  memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));

  auto code = ErrorCode::Success;

  if (IsPhysicalOnly()) {
    code = CreatePhysicalMemory();
    if (code == ErrorCode::Success)
      code = dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr);
    return code;
  }

  code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
  if (IsVirtual() || (code != ErrorCode::Success))
      return code;

  bool physical_created = false;

  auto guard = MakeScopeGuard([this, &physical_created, &code]() {
    if (code != ErrorCode::Success) {

      if (physical_created) {
        FreePhysicalMemory();
      }
      FreeGpuVirtualAddress(GpuAddress(), Size());
    }
  });
  (void)guard;

  code = CreatePhysicalMemory();
  if (code != ErrorCode::Success)
    return code;

  physical_created = true;

  code = MapGpuVirtualAddress(GpuAddress(), Size());
  if (code != ErrorCode::Success)
    return code;

  code = MakeResident();
  if (code != ErrorCode::Success)
    return code;

  if (!GetDevice()->WaitOnPagingFenceFromCpu())
    code = ErrorCode::Unknown;

  return code;
}

ErrorCode GpuMemory::UnmapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) {
  auto code = ErrorCode::Success;
  size_t i = 0;
  auto map_addr = addr;
  auto map_size = size;

  while (offset >= WDDMDevice::GpuMemoryChunkSize) {
    offset -= WDDMDevice::GpuMemoryChunkSize;
    i += 1;
  }

  while (map_size > 0) {
    auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize);

    D3DDDI_MAPGPUVIRTUALADDRESS args{};

    args.hPagingQueue = device_->PagingQueue();
    args.BaseAddress = map_addr;
    args.hAllocation = GetAllocationHandle(i);
    args.SizeInPages = block_size / 0x1000;
    args.Protection.NoAccess = 1;

    code = d3dthunk::MapGpuVirtualAddress(&args);

    if (code == ErrorCode::NotReady)
      device_->UpdatePageFence(args.PagingFenceValue);
    else if (code != ErrorCode::Success)
      break;

    map_addr += block_size;
    map_size -= block_size;
    offset = 0;   // reset second unmapped allocation offset to zero
    i += 1;
  }

  return code;
}

ErrorCode GpuMemory::MapGpuVirtualAddress(const gpusize addr, const gpusize size, gpusize offset) {

  auto code = ErrorCode::Success;
  size_t i = 0;
  auto map_addr = addr;
  auto map_size = size;
  const size_t _4K = 0x1000;

  while (offset >= WDDMDevice::GpuMemoryChunkSize) {
    offset -= WDDMDevice::GpuMemoryChunkSize;
    i += 1;
  }
  const size_t first_chunk = i;
  const auto first_chunk_offset = offset;
  /* Found two limitation for local vram:
   * 1. invisible vram va has to be 64K aligned, otherwise map gpu va fail
   * 2. visible vram can not be cpu mapped when command submission or after gpu mapped
   */
  while (map_size > 0) {
    auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize);

    D3DDDI_MAPGPUVIRTUALADDRESS args{};

    args.hPagingQueue = device_->PagingQueue();
    args.BaseAddress = map_addr;
    args.hAllocation = GetAllocationHandle(i);
    args.OffsetInPages = offset / _4K;
    args.SizeInPages = block_size / _4K;
    args.Protection.Write = 1;

    code = d3dthunk::MapGpuVirtualAddress(&args);

    if (code != ErrorCode::Success) {
      if (code == ErrorCode::NotReady) {
        const uint64_t fence_value = args.PagingFenceValue;
        device_->UpdatePageFence(fence_value);
        code = ErrorCode::Success;
      } else
        break;
    }

    map_addr += block_size;
    map_size -= block_size;
    offset = 0;  // reset second mapped allocation offset to zero
    i++;
  }

  if (code != ErrorCode::Success) {
    // Map failed, unmap partial mapped block
    offset = first_chunk_offset;
    map_addr = addr;
    map_size = size;
    for (size_t j = first_chunk; j < i; j++) {
      auto block_size = std::min(map_size, WDDMDevice::GpuMemoryChunkSize);

      D3DDDI_MAPGPUVIRTUALADDRESS args{};

      args.hPagingQueue = device_->PagingQueue();
      args.BaseAddress = map_addr;
      args.hAllocation = 0;
      args.OffsetInPages = offset / _4K;
      args.SizeInPages = block_size / _4K;
      args.Protection.NoAccess = 1;

      auto unmap_code = d3dthunk::MapGpuVirtualAddress(&args);
      if (unmap_code == ErrorCode::NotReady)
        device_->UpdatePageFence(args.PagingFenceValue);

      map_addr += block_size;
      map_size -= block_size;
    }
  }

  return code;
}

ErrorCode GpuMemory::ReserveGpuVirtualAddress(gpusize base_virt_addr, gpusize size, gpusize alignment) {
  ErrorCode status;
  gpusize gpu_virt_addr = 0;
  if ((desc_.flags.is_sysmem_exporter || desc_.flags.is_imported_sys_memfd)
      && desc_.domain == thunk_proxy::AllocDomain::kSystem) {
    int mfd = (mem_fd_ > -1)? mem_fd_ : -1;
    status = device_->ReserveIPCSysMem(Size(), &gpu_virt_addr, desc_.alignment, mfd, desc_.flags.is_locked);
    if (status == ErrorCode::Success)
      mem_fd_ = mfd;
  } else {
    status = device_->ReserveGpuVirtualAddress(desc_.domain, base_virt_addr, size, &gpu_virt_addr, alignment,
        desc_.flags.is_locked);
  }

  if (status == ErrorCode::Success) {
    desc_.gpu_addr = gpu_virt_addr;

    if (IsSystem())
      desc_.cpu_addr = reinterpret_cast<void *>(desc_.gpu_addr);
  }
  return status;
}

ErrorCode GpuMemory::FreeGpuVirtualAddress(gpusize base_addr, gpusize size) {
  if (mem_fd_ > -1)
    return device_->FreeIPCSysMem(GpuAddress(), Size(), mem_fd_);

  return base_addr != 0 ?
         device_->FreeGpuVirtualAddress(desc_.domain, base_addr, size) :
         ErrorCode::Success;
}

ErrorCode GpuMemory::CreatePhysicalMemory() {

  assert(!IsVirtual() && NumChunks() > 0);

  const auto num_allocations = NumChunks();
  void *priv_drv_data;
  void *alloc_priv;
  int priv_drv_data_size;
  int alloc_priv_data_size;

  if (!thunk_proxy::CreatePrivateAllocInfo(NumChunks(), &priv_drv_data, &alloc_priv,
                                          &priv_drv_data_size, &alloc_priv_data_size))
    return ErrorCode::OutOfMemory;

  auto alloc_info = reinterpret_cast<D3DDDI_ALLOCATIONINFO2*>(
       static_cast<unsigned char*>(priv_drv_data) + priv_drv_data_size * num_allocations);

  size_t size = desc_.size;
  uint64_t addr = desc_.gpu_addr;
  char *cpu_addr = static_cast<char *>(desc_.cpu_addr);
  const auto &device_info = GetDevice()->DeviceInfo();

  for (size_t i = 0; i < num_allocations; i++) {

    void* priv_data = (void*)((char*)priv_drv_data + priv_drv_data_size * i);
    size_t block_size = std::min(size, WDDMDevice::GpuMemoryChunkSize);

    if (IsUserMemory() || IsSystem()) {
      thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, 0, desc_.mem_flags, desc_.engine_flag, device_info);
      alloc_info[i].pSystemMem = static_cast<void *>(cpu_addr);
      cpu_addr += block_size;
    } else {
      thunk_proxy::SetAllocationInfo(priv_data, block_size, desc_.domain, addr, desc_.mem_flags, desc_.engine_flag, device_info);
    }

    size -= block_size;
    addr += block_size;

    alloc_info[i].pPrivateDriverData = priv_data;
    alloc_info[i].PrivateDriverDataSize = priv_drv_data_size;
    alloc_info[i].VidPnSourceId = D3DDDI_ID_UNINITIALIZED;
  }

  D3DKMT_CREATEALLOCATION args = {};
  args.hDevice = device_->DeviceHandle();
  args.pPrivateDriverData = alloc_priv;
  args.PrivateDriverDataSize = alloc_priv_data_size;
  args.NumAllocations = num_allocations;
  args.pAllocationInfo2 = alloc_info;

  /* The PhysicallyContiguous flag causes allocation failure
   * args.Flags.PhysicallyContiguous = IsPhysicalContiguous();
   */

  SharedHandleInfo shared_info;
  if (IsShared()) {
    shared_info.size = desc_.size;
    shared_info.client_size = desc_.client_size;
    shared_info.domain = desc_.domain;
    shared_info.adapter_luid = desc_.adapter_luid;
    shared_info.flags = reinterpret_cast<uint32_t>(desc_.flags.reserved);
    shared_info.mem_flags = desc_.mem_flags;
    shared_info.pid = dxg_runtime->parent_pid;
    shared_info.gpu_addr = desc_.gpu_addr;
    args.pPrivateRuntimeData = &shared_info;
    args.PrivateRuntimeDataSize = sizeof(shared_info);
    args.Flags.NtSecuritySharing = 1;
    args.Flags.CreateShared = 1;
    args.Flags.CreateResource = 1;
  }

  auto status = d3dthunk::CreateAllocation(&args);
  if (status == ErrorCode::Success) {
    for (size_t i = 0; i < num_allocations; i++)
      alloc_handles_ptr_[i] = alloc_info[i].hAllocation;

    resource_ = args.hResource;
  }
  thunk_proxy::DestroyPrivateAllocInfo(priv_drv_data, alloc_priv);
  return status;
}

ErrorCode GpuMemory::FreePhysicalMemory() {
  auto code = ErrorCode::Success;

  if (alloc_handles_ptr_ == nullptr || (NumChunks() == 1 && *alloc_handles_ptr_ == 0))
      return code;

  code = d3dthunk::DestroyAllocation(device_->DeviceHandle(),
                                  resource_,
                                  NumChunks(),
                                  alloc_handles_ptr_);
  if (NumChunks() > 1)
    delete[] alloc_handles_ptr_;

  alloc_handles_ptr_ = nullptr;
  return code;
}

ErrorCode GpuMemory::MakeResident() {

  D3DDDI_MAKERESIDENT args = {};
  args.hPagingQueue = device_->PagingQueue();
  args.NumAllocations = NumChunks();
  args.AllocationList = alloc_handles_ptr_;
  args.Flags.CantTrimFurther = 1;

  auto code = d3dthunk::MakeResident(&args);
  if (code == ErrorCode::NotReady) {
    const auto fence_value = args.PagingFenceValue;
    device_->UpdatePageFence(fence_value);
    code = ErrorCode::Success;
  }
  return code;
}

ErrorCode GpuMemory::Evict() {

  D3DKMT_EVICT args = {};
  args.hDevice = device_->DeviceHandle();
  args.NumAllocations = NumChunks();
  args.AllocationList = alloc_handles_ptr_;

  return d3dthunk::Evict(&args);
}

ErrorCode GpuMemory::ExportPhysicalHandle(int* dmabuf_fd, uint32_t flags) {
  if (mem_fd_ > -1) {
    *dmabuf_fd = mem_fd_;
    return ErrorCode::Success;
  }

  if (IsShared())
    return d3dthunk::ShareObjects(1, resource_, flags, dmabuf_fd);
  else
    return ErrorCode::UnSupported;
}


ErrorCode GpuMemory::ImportPhysicalHandle(const GpuMemoryCreateInfo &create_info, gpusize *gpu_addr) {
  D3DKMT_QUERYRESOURCEINFOFROMNTHANDLE query_args;
  int dmabuf_fd = create_info.dmabuf_fd;

  if (dmabuf_fd <= 0)
    return ErrorCode::InvalidateParams;

  if(create_info.flags.sysmem_ipc_sig_importer) {
    // the ipc signal sys mem fd will be closed in Runtime::IPCClientImport, dup to hold a reference
    mem_fd_ = dup(dmabuf_fd);
    desc_.client_size = create_info.size;
    desc_.size = AdjustSize(desc_.client_size);
    desc_.domain = thunk_proxy::AllocDomain::kSystem;
    desc_.adapter_luid = device_->GetLuid();
    desc_.alignment = 0x1000;
    desc_.mem_flags = create_info.mem_flags;
    desc_.engine_flag = create_info.engine_flag;
    desc_.flags.is_imported_sys_memfd = create_info.flags.sysmem_ipc_sig_importer;
    desc_.flags.is_va_required = create_info.flags.alloc_va;
    desc_.flags.is_virtual = create_info.flags.virtual_alloc;
    desc_.flags.is_physical_only = create_info.flags.physical_only;
    desc_.flags.is_physical_contiguous = create_info.flags.physical_contiguous;
    desc_.flags.is_locked = create_info.flags.locked;

    auto code = ReserveGpuVirtualAddress(create_info.va_hint, Size(), create_info.alignment);
    if (code != ErrorCode::Success)
      return code;

    bool physical_created = false;
    auto guard = MakeScopeGuard([this, &physical_created, &code]() {
          if (code != ErrorCode::Success) {
            if (physical_created)
              FreePhysicalMemory();
            FreeGpuVirtualAddress(GpuAddress(), Size());
          }
        });
    (void)guard;

    num_allocations_ = CalcChunkNumbers(Size());
    if (num_allocations_ == 1)
      alloc_handles_ptr_ = &alloc_handle_;
    else
      alloc_handles_ptr_ = new WinAllocationHandle[num_allocations_];

    memset(alloc_handles_ptr_, 0, num_allocations_ * sizeof(WinAllocationHandle));

    code = CreatePhysicalMemory();
    if (code != ErrorCode::Success)
      return code;

    physical_created = true;

    code = MapGpuVirtualAddress(GpuAddress(), Size());
    if (code != ErrorCode::Success)
      return code;

    code = MakeResident();
    if (code != ErrorCode::Success)
      return code;

    if (!GetDevice()->WaitOnPagingFenceFromCpu())
      code = ErrorCode::Unknown;

    return code;
  } else {
    // vmem importer / ipc vram importer
    memset(&query_args, 0, sizeof(query_args));
    query_args.hDevice = device_->DeviceHandle();
    query_args.hNtHandle = reinterpret_cast<HANDLE>(dmabuf_fd);
    auto ret = d3dthunk::QueryResourceInfoFromNtHandle(&query_args);
    if (ret != ErrorCode::Success) {
      pr_err("query resource info from nt handle failed %d\n", static_cast<int>(ret));
      return ErrorCode::InvalidateParams;
    }
    pr_debug("wsl-thunk: import from nt handle %d, get allocation number %d,"
             " runtime data size %#x total driver data size %#x resource data size=%#x\n",
             dmabuf_fd,
             query_args.NumAllocations,
             query_args.PrivateRuntimeDataSize,
             query_args.TotalPrivateDriverDataSize,
             query_args.ResourcePrivateDriverDataSize);

    SharedHandleInfo shared_info;
    if(sizeof(shared_info) != query_args.PrivateRuntimeDataSize) {
      pr_err("shared hanle info size mismatch:%d vs %ld\n",
             query_args.PrivateRuntimeDataSize, sizeof(shared_info));
      return ErrorCode::UnSupported;
    }

    uint32_t total_size = query_args.NumAllocations * sizeof(D3DDDI_OPENALLOCATIONINFO2) +
      query_args.TotalPrivateDriverDataSize +
      query_args.ResourcePrivateDriverDataSize;
    D3DDDI_OPENALLOCATIONINFO2 *open_info =
      reinterpret_cast<D3DDDI_OPENALLOCATIONINFO2*> (calloc(1, total_size));
    if (!open_info) {
      pr_err("alloc open_info failed, NumAllocations:%d\n",
             query_args.NumAllocations);
      return ErrorCode::OutOfMemory;
    }

    auto guard = MakeScopeGuard([&open_info]() { free(open_info); });

    alloc_handles_ptr_ = new WinAllocationHandle[query_args.NumAllocations];

    D3DKMT_OPENRESOURCEFROMNTHANDLE open_args;
    memset(&open_args, 0, sizeof(open_args));
    open_args.hDevice = query_args.hDevice;
    open_args.hNtHandle = query_args.hNtHandle;
    open_args.NumAllocations = query_args.NumAllocations;
    open_args.pOpenAllocationInfo2 = open_info;
    open_args.TotalPrivateDriverDataBufferSize = query_args.TotalPrivateDriverDataSize;
    open_args.pTotalPrivateDriverDataBuffer = reinterpret_cast<void*>
      (open_args.pOpenAllocationInfo2 + open_args.NumAllocations);
    open_args.ResourcePrivateDriverDataSize = query_args.ResourcePrivateDriverDataSize;
    open_args.pResourcePrivateDriverData = reinterpret_cast<void*>
      (((uint64_t)open_args.pTotalPrivateDriverDataBuffer) +
       open_args.TotalPrivateDriverDataBufferSize);
    open_args.PrivateRuntimeDataSize = query_args.PrivateRuntimeDataSize;
    open_args.pPrivateRuntimeData = reinterpret_cast<void*> (&shared_info);

    ret = d3dthunk::OpenResourceFromNtHandle(&open_args);
    if (ret != ErrorCode::Success) {
      ret = ErrorCode::InvalidateParams;
      pr_err("open resource failed %d\n", static_cast<int>(ret));
      return ret;
    }
    if (shared_info.pid == dxg_runtime->parent_pid &&
      create_info.flags.alloc_va &&
      IsSameAdapter(shared_info.adapter_luid) &&
      shared_info.gpu_addr) {
      pr_info("import from same device and samve process, va is required. "
               "a buffer can't be mapped to 2 va. delete the imported buffer, use the existing one.\n");
      if (gpu_addr)
        *gpu_addr = shared_info.gpu_addr;
      return ErrorCode::SameProcessSameDevice;
    }

    desc_.size = shared_info.size;
    desc_.client_size = shared_info.client_size;
    desc_.domain = shared_info.domain;
    desc_.flags.reserved = shared_info.flags;
    desc_.mem_flags = shared_info.mem_flags;
    desc_.adapter_luid = shared_info.adapter_luid;
    resource_ = open_args.hResource;
    num_allocations_ = open_args.NumAllocations;
    for (int i = 0; i < num_allocations_; i++)
      alloc_handles_ptr_[i] = open_info[i].hAllocation;

    desc_.flags.is_va_required = create_info.flags.alloc_va;
    if (desc_.flags.is_va_required) {
      desc_.flags.is_imported_vram_ipc = 1;
      ret = ReserveGpuVirtualAddress(create_info.va_hint, desc_.size, create_info.alignment);
      if (ret != ErrorCode::Success)
        pr_err("failed to allocate svm range, error:%d\n", static_cast<int>(ret));

      return ret;
    } else {
      desc_.flags.is_imported_vram_vmem = 1;
      return dxg_runtime->HandleApertureAlloc(desc_.size, &desc_.handle_ape_addr);
    }
  }
}

} // namespace thunk
} // namespace wsl