SWDEV-423835 - Fixing kernel launch issues on Virtual Memory Management path.

Change-Id: I9f5e8a3d83af3809b2c50b21a10697e26113dd23
2024-02-05 16:50:51 -05:00
@@ -2104,13 +2104,8 @@ class GraphMemAllocNode final : public GraphNode {
      // Retain memory object because command release will release it
      memory_->retain();
      size_ = aligned_size;
-      // Save geenric allocation info to match VM interfaces
-      memory_->getUserData().data = new hip::MemMapAllocUserData(dptr, aligned_size, va_);
      // Execute the original mapping command
      VirtualMapCommand::submit(device);
-      // Update the internal svm address to ptr
-      memory()->setSvmPtr(va_->getSvmPtr());
-      // Can't destroy VA, because it's used in mapping even if the node will be destroyed
      va_->retain();
      ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc execute: %p, %p",
          va_->getSvmPtr(), memory());
@@ -2234,24 +2229,21 @@ class GraphMemFreeNode : public GraphNode {

    virtual void submit(device::VirtualDevice& device) final {
      // Find memory object before unmap logic
-      auto alloc = amd::MemObjMap::FindMemObj(ptr());
+      auto vaddr_mem_obj = amd::MemObjMap::FindMemObj(ptr());
+      assert(vaddr_mem_obj != nullptr && vaddr_mem_obj->getUserData().phys_mem_obj != nullptr);
+      amd::Memory* phys_mem_obj = vaddr_mem_obj->getUserData().phys_mem_obj;  // cleared by unmap
      VirtualMapCommand::submit(device);
-      // Restore the original address of the generic allocation
-      auto ga = reinterpret_cast<hip::MemMapAllocUserData*>(alloc->getUserData().data);
-      alloc->setSvmPtr(ga->ptr_);
      if (!AMD_DIRECT_DISPATCH) {
        // Update the current device, since hip event, used in mem pools, requires device
        hip::setCurrentDevice(device_id_);
      }
      // Free virtual address
-      ga->va_->release();
-      alloc->getUserData().data = nullptr;
+      vaddr_mem_obj->release();
      // Release the allocation back to graph's pool
-      graph_->FreeMemory(ga->ptr_, static_cast<hip::Stream*>(queue()));
-      amd::MemObjMap::AddMemObj(ptr(), ga->va_);
-      delete ga;
+      graph_->FreeMemory(phys_mem_obj->getSvmPtr(), static_cast<hip::Stream*>(queue()));
+      amd::MemObjMap::AddMemObj(ptr(), vaddr_mem_obj);
      ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p",
-          ptr(), alloc);
+          ptr(), vaddr_mem_obj);
    }

   private:
@@ -225,6 +225,10 @@ bool MemoryPool::FreeMemory(amd::Memory* memory, Stream* stream, Event* event) {
  {
    amd::ScopedLock lock(lock_pool_ops_);

+    if (memory->getUserData().phys_mem_obj != nullptr) {
+      memory = memory->getUserData().phys_mem_obj;
+    }
+
    // If the free heap grows over the busy heap, then force release
    if (AMD_DIRECT_DISPATCH && (free_heap_.GetTotalSize() > busy_heap_.GetTotalSize())) {
      // Use event base release to reduce memory pressure
@@ -249,22 +253,14 @@ bool MemoryPool::FreeMemory(amd::Memory* memory, Stream* stream, Event* event) {
    }
    ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Pool FreeMem: %p, %p", memory->getSvmPtr(), memory);

-    auto ga = reinterpret_cast<hip::MemMapAllocUserData*>(memory->getUserData().data);
-    if (ga != nullptr) {
-      if (stream == nullptr) {
+    if (stream == nullptr) {
        stream = g_devices[memory->getUserData().deviceId]->NullStream();
-      }
-      // Unmap virtual address from memory
-      auto cmd = new amd::VirtualMapCommand(*stream, amd::Command::EventWaitList{},
-                                            memory->getSvmPtr(), ga->size_, nullptr);
-      cmd->enqueue();
-      cmd->release();
-      memory->setSvmPtr(ga->ptr_);
-      // Free virtual address and destroy generic allocation object
-      ga->va_->release();
-      delete ga;
-      memory->getUserData().data = nullptr;
    }
+    // Unmap virtual address from memory
+    auto cmd = new amd::VirtualMapCommand(*stream, amd::Command::EventWaitList{},
+                                          memory->getSvmPtr(), memory->getSize(), nullptr);
+    cmd->enqueue();
+    cmd->release();

    if (stream != nullptr) {
      // The stream of destruction is a safe stream, because the app must handle sync
@@ -120,11 +120,15 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size,

  // Add this to amd::Memory object, so this ptr is accesible for other hipmemory operations.
  size_t offset = 0; //this is ignored
-  amd::Memory* memObj = getMemoryObject(ptr, offset);
+  amd::Memory* phys_mem_obj = getMemoryObject(ptr, offset);
  //saves the current device id so that it can be accessed later
-  memObj->getUserData().deviceId = prop->location.id;
-  memObj->getUserData().data = new hip::GenericAllocation(ptr, size, *prop);
-  *handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(memObj->getUserData().data);
+  phys_mem_obj->getUserData().deviceId = prop->location.id;
+  phys_mem_obj->getUserData().data = new hip::GenericAllocation(*phys_mem_obj, size, *prop);
+  *handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(phys_mem_obj->getUserData().data);
+
+  // Remove because the entry of 0x1 is not needed in MemObjMap.
+  // We save the copy of Phy mem obj in virtual mem obj during mapping.
+  amd::MemObjMap::RemoveMemObj(ptr);

  HIP_RETURN(hipSuccess);
 }
@@ -225,9 +229,6 @@ hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocat
  cmd->awaitCompletion();
  cmd->release();

-  // update the internal svm address to ptr
-  ga->asAmdMemory().setSvmPtr(ptr);
-
  HIP_RETURN(hipSuccess);
 }

@@ -268,7 +269,8 @@ hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle,
    HIP_RETURN(hipErrorInvalidValue);
  }

-  *handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(mem->getUserData().data);
+  *handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(mem->getUserData().phys_mem_obj
+              ? mem->getUserData().phys_mem_obj->getUserData().data : nullptr);  // null if unmapped

  if (*handle == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
@@ -312,17 +314,17 @@ hipError_t hipMemUnmap(void* ptr, size_t size) {
    HIP_RETURN(hipErrorInvalidValue);
  }

-  amd::Memory* pa = amd::MemObjMap::FindMemObj(ptr);
-  if (pa == nullptr) {
+  amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(ptr);  // VA object for ptr
+  if (vaddr_mem_obj == nullptr) {  // was '&&' + size test: null deref, size never enforced
    HIP_RETURN(hipErrorInvalidValue);
  }

-  amd::Memory* va = amd::MemObjMap::FindVirtualMemObj(ptr);
-  if (va == nullptr && va->getSize() != size) {
+  amd::Memory* phys_mem_obj = vaddr_mem_obj->getUserData().phys_mem_obj;
+  if (phys_mem_obj == nullptr) {
    HIP_RETURN(hipErrorInvalidValue);
  }

-  auto& queue = *g_devices[pa->getUserData().deviceId]->NullStream();
+  auto& queue = *g_devices[phys_mem_obj->getUserData().deviceId]->NullStream();

  amd::Command* cmd = new amd::VirtualMapCommand(queue, amd::Command::EventWaitList{}, ptr, size,
                                                 nullptr);
@@ -331,9 +333,8 @@ hipError_t hipMemUnmap(void* ptr, size_t size) {
  cmd->release();

  // restore the original pa of the generic allocation
-  hip::GenericAllocation* ga = reinterpret_cast<hip::GenericAllocation*>(pa->getUserData().data);
-  pa->setSvmPtr(ga->genericAddress());
-
+  hip::GenericAllocation* ga
+    = reinterpret_cast<hip::GenericAllocation*>(phys_mem_obj->getUserData().data);
  ga->release();

  HIP_RETURN(hipSuccess);
@@ -30,35 +30,23 @@ namespace hip {

 hipError_t ihipFree(void* ptr);

-struct MemMapAllocUserData {
-  void* ptr_;       // Original pointer of the allocation
-  size_t size_;     // Aligned size of the allocation
-  amd::Memory* va_; // Memory object for the virtual address
-
-  MemMapAllocUserData(void* ptr, size_t size, amd::Memory* va) : ptr_(ptr), size_(size), va_(va) {}
-};
-
 class GenericAllocation : public amd::RuntimeObject {
-  void* ptr_;                          //<! Device ptr
+  amd::Memory& phys_mem_ref_;        //<! Physical memory object
  size_t size_;                        //<! Allocated size
  hipMemAllocationProp properties_;    //<! Allocation Properties

 public:
-  GenericAllocation(void* ptr, size_t size, const hipMemAllocationProp& prop)
-                     : ptr_(ptr), size_(size), properties_(prop) {}
-  ~GenericAllocation() {
-    hipError_t err = ihipFree(ptr_);
-  }
+  GenericAllocation(amd::Memory& phys_mem_ref, size_t size, const hipMemAllocationProp& prop)
+                    : phys_mem_ref_(phys_mem_ref), size_(size), properties_(prop) {}
+  ~GenericAllocation() {}  // empty: this class stores only a reference to the amd::Memory

  const hipMemAllocationProp& GetProperties() const { return properties_; }
  hipMemGenericAllocationHandle_t asMemGenericAllocationHandle() {
    return reinterpret_cast<hipMemGenericAllocationHandle_t>(this);
  }
  amd::Memory& asAmdMemory() {
-    size_t discardOffset;
-    return *getMemoryObject(genericAddress(), discardOffset);
+    return phys_mem_ref_;
  }
-  void* genericAddress() const { return ptr_; }

  virtual ObjectType objectType() const { return ObjectTypeVMMAlloc; }
 };
@@ -2192,18 +2192,18 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
  amd::ScopedLock lock(execution());

  profilingBegin(vcmd);
-  amd::Memory* va = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
-  if (va == nullptr || !(va->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
+  amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
+  if (vaddr_mem_obj == nullptr || !(vaddr_mem_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
    profilingEnd(vcmd);
    return;
  }
-  pal::Memory* vaRange = dev().getGpuMemory(va);
-  Pal::IGpuMemory* memory = (vcmd.memory() == nullptr) ?
+  pal::Memory* vaddr_pal_mem = dev().getGpuMemory(vaddr_mem_obj);
+  Pal::IGpuMemory* phymem_igpu_mem = (vcmd.memory() == nullptr) ?
      nullptr : dev().getGpuMemory(vcmd.memory())->iMem();
  Pal::VirtualMemoryRemapRange range{
-    vaRange->iMem(),
+    vaddr_pal_mem->iMem(),
    0,
-    memory,
+    phymem_igpu_mem,
    0,
    vcmd.size(),
    Pal::VirtualGpuMemAccessMode::NoAccess
@@ -2224,13 +2224,15 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
  setGpuEvent(event);
  if (result == Pal::Result::Success) {
    if (vcmd.memory() != nullptr) {
-      // assert the va wasn't mapped already
+      // assert the vaddr_mem_obj wasn't mapped already
      assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr);
-      amd::MemObjMap::AddMemObj(vcmd.ptr(), vcmd.memory());
+      amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_mem_obj);
+      vaddr_mem_obj->getUserData().phys_mem_obj = vcmd.memory();
    } else {
-      // assert the va is mapped and needs to be removed
+      // assert the vaddr_mem_obj is mapped and needs to be removed
      assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) != nullptr);
      amd::MemObjMap::RemoveMemObj(vcmd.ptr());
+      vaddr_mem_obj->getUserData().phys_mem_obj = nullptr;
    }
  }
  profilingEnd(vcmd);
@@ -2301,6 +2301,16 @@ uint64_t Device::deviceVmemAlloc(size_t size, uint64_t flags) const {
  return hsa_vmem_handle.handle;
 }

+void Device::deviceVmemRelease(uint64_t mem_handle) const {
+  // Rewrap the raw 64-bit value in the HSA handle struct taken by the release call.
+  hsa_amd_vmem_alloc_handle_t handle_to_release{};
+  handle_to_release.handle = mem_handle;
+  const hsa_status_t status = hsa_amd_vmem_handle_release(handle_to_release);
+  if (status != HSA_STATUS_SUCCESS) {  // failure is only logged; nothing is returned
+    LogPrintfError("Failed hsa_amd_vmem_handle_release! Failed with hsa status: %d \n", status);
+  }
+}
+
 void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain) const {
  const hsa_amd_memory_pool_t& pool = (pseudo_fine_grain) ? gpu_ext_fine_grained_segment_
                                      : (atomics) ? gpu_fine_grained_segment_ : gpuvm_segment_;
@@ -2381,7 +2391,7 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_
      return nullptr;
    }

-    if (mem->getSvmPtr() != nullptr) {
+    if (mem->getSvmPtr() != nullptr || mem->getMemFlags() & ROCCLR_MEM_PHYMEM) {
      // add the information to context so that we can use it later.
      amd::MemObjMap::AddMemObj(mem->getSvmPtr(), mem);
    }
@@ -450,6 +450,7 @@ class Device : public NullDevice {
  bool deviceAllowAccess(void* dst) const;

  bool allowPeerAccess(device::Memory* memory) const;
+  void deviceVmemRelease(uint64_t mem_handle) const;
  uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const;
  void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain=false) const;

@@ -648,6 +648,12 @@ void Buffer::destroy() {
    }
    const bool isFineGrain = memFlags & CL_MEM_SVM_FINE_GRAIN_BUFFER;

+    if (memFlags & ROCCLR_MEM_PHYMEM) {
+      // If this is physical memory, dont call hsa free function, since device mem was never created
+      dev().deviceVmemRelease(owner()->getUserData().hsa_handle);
+      return;
+    }
+
    if (kind_ != MEMORY_KIND_PTRGIVEN) {
      if (isFineGrain) {
        if (memFlags & CL_MEM_ALLOC_HOST_PTR) {
@@ -767,7 +773,10 @@ bool Buffer::create(bool alloc_local) {
    owner()->getUserData().hsa_handle = dev().deviceVmemAlloc(owner()->getSize(), 0);
    if (owner()->getUserData().hsa_handle == 0) {
      LogError("HSA Opaque Handle returned was null");
+      return false;
    }
+    deviceMemory_ = reinterpret_cast<void*>(amd::Memory::MemoryType::kPhyMemHandlePtr);
+    return true;
  }

  if ((owner()->parent() == nullptr) &&
@@ -2589,36 +2589,39 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {

  profilingBegin(vcmd);

-  // Find the amd::Memory object for virtual ptr.
-  amd::Memory* va = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
-  if (va == nullptr || !(va->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
+  // Find the amd::Memory object for virtual ptr. vcmd.ptr() is vaddr.
+  amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
+  if (vaddr_mem_obj == nullptr || !(vaddr_mem_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
    profilingEnd(vcmd);
    return;
  }

  // Get the amd::Memory object for the physical address
-  amd::Memory* pa = vcmd.memory();
+  amd::Memory* phys_mem_obj = vcmd.memory();
  hsa_status_t hsa_status = HSA_STATUS_SUCCESS;

  // If Physical address is not set, then it is map command. If set, it is unmap command.
-  if (pa != nullptr) {
+  if (phys_mem_obj != nullptr) {
    // Map the physical to virtual address the hsa api
    hsa_amd_vmem_alloc_handle_t opaque_hsa_handle;
-    opaque_hsa_handle.handle = pa->getUserData().hsa_handle;
-    if ((hsa_status = hsa_amd_vmem_map(va->getSvmPtr(), va->getSize(), va->getOffset(),
-                                       opaque_hsa_handle, 0)) == HSA_STATUS_SUCCESS) {
+    opaque_hsa_handle.handle = phys_mem_obj->getUserData().hsa_handle;
+    if ((hsa_status = hsa_amd_vmem_map(vaddr_mem_obj->getSvmPtr(), vcmd.size(),
+                        vaddr_mem_obj->getOffset(), opaque_hsa_handle, 0)) == HSA_STATUS_SUCCESS) {
      assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr);
      // Now that we have mapped physical addr to virtual addr, make an entry in the MemObjMap.
-      amd::MemObjMap::AddMemObj(vcmd.ptr(), vcmd.memory());
+      amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_mem_obj);
+      vaddr_mem_obj->getUserData().phys_mem_obj = phys_mem_obj;
    } else {
      LogError("HSA Command: hsa_amd_vmem_map failed!");
    }
  } else {
    // Unmap the object, since the physical addr is set.
-    if ((hsa_status = hsa_amd_vmem_unmap(va->getSvmPtr(), va->getSize())) == HSA_STATUS_SUCCESS) {
+    if ((hsa_status = hsa_amd_vmem_unmap(vaddr_mem_obj->getSvmPtr(), vcmd.size()))
+                        == HSA_STATUS_SUCCESS) {
      // assert the va is mapped and needs to be removed
      assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) != nullptr);
      amd::MemObjMap::RemoveMemObj(vcmd.ptr());
+      vaddr_mem_obj->getUserData().phys_mem_obj = nullptr;
    } else {
      LogError("HSA Command: hsa_amd_vmem_unmap failed");
    }
@@ -142,13 +142,15 @@ class Memory : public amd::RuntimeObject {
 public:
  enum MemoryType {
    kSvmMemoryPtr = 0x1,
-    kArenaMemoryPtr = 0x100
+    kArenaMemoryPtr = 0x100,
+    kPhyMemHandlePtr = 0x101
  };

  struct UserData
  {
     int deviceId = 0;     //!< Device ID memory is allocated on
     void* data = nullptr; //!< Opaque user data from CL or HIP or etc.
+     amd::Memory* phys_mem_obj = nullptr; //<! Physical mem obj, only set on virtual mem
     uint64_t hsa_handle = 0; //!<Opaque hsa handle saved for Virtual memories
     unsigned int flags = 0; //!< HIP memory flags
     //! hipMallocPitch allocates buffer using width & height and returns pitch & device pointer.