diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp index fb62acc06f..a80bb71c7c 100644 --- a/hipamd/src/hip_graph_internal.hpp +++ b/hipamd/src/hip_graph_internal.hpp @@ -2104,13 +2104,8 @@ class GraphMemAllocNode final : public GraphNode { // Retain memory object because command release will release it memory_->retain(); size_ = aligned_size; - // Save geenric allocation info to match VM interfaces - memory_->getUserData().data = new hip::MemMapAllocUserData(dptr, aligned_size, va_); // Execute the original mapping command VirtualMapCommand::submit(device); - // Update the internal svm address to ptr - memory()->setSvmPtr(va_->getSvmPtr()); - // Can't destroy VA, because it's used in mapping even if the node will be destroyed va_->retain(); ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc execute: %p, %p", va_->getSvmPtr(), memory()); @@ -2234,24 +2229,21 @@ class GraphMemFreeNode : public GraphNode { virtual void submit(device::VirtualDevice& device) final { // Find memory object before unmap logic - auto alloc = amd::MemObjMap::FindMemObj(ptr()); + auto vaddr_mem_obj = amd::MemObjMap::FindMemObj(ptr()); + amd::Memory* phys_mem_obj = vaddr_mem_obj->getUserData().phys_mem_obj; + assert(phys_mem_obj != nullptr); VirtualMapCommand::submit(device); - // Restore the original address of the generic allocation - auto ga = reinterpret_cast(alloc->getUserData().data); - alloc->setSvmPtr(ga->ptr_); if (!AMD_DIRECT_DISPATCH) { // Update the current device, since hip event, used in mem pools, requires device hip::setCurrentDevice(device_id_); } // Free virtual address - ga->va_->release(); - alloc->getUserData().data = nullptr; + vaddr_mem_obj->release(); // Release the allocation back to graph's pool - graph_->FreeMemory(ga->ptr_, static_cast(queue())); - amd::MemObjMap::AddMemObj(ptr(), ga->va_); - delete ga; + graph_->FreeMemory(phys_mem_obj->getSvmPtr(), static_cast(queue())); + amd::MemObjMap::AddMemObj(ptr(), vaddr_mem_obj); 
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p", - ptr(), alloc); + ptr(), vaddr_mem_obj); } private: diff --git a/hipamd/src/hip_mempool_impl.cpp b/hipamd/src/hip_mempool_impl.cpp index dc1ca68bbc..44e23a91d9 100644 --- a/hipamd/src/hip_mempool_impl.cpp +++ b/hipamd/src/hip_mempool_impl.cpp @@ -225,6 +225,10 @@ bool MemoryPool::FreeMemory(amd::Memory* memory, Stream* stream, Event* event) { { amd::ScopedLock lock(lock_pool_ops_); + if (memory->getUserData().phys_mem_obj != nullptr) { + memory = memory->getUserData().phys_mem_obj; + } + // If the free heap grows over the busy heap, then force release if (AMD_DIRECT_DISPATCH && (free_heap_.GetTotalSize() > busy_heap_.GetTotalSize())) { // Use event base release to reduce memory pressure @@ -249,22 +253,14 @@ bool MemoryPool::FreeMemory(amd::Memory* memory, Stream* stream, Event* event) { } ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Pool FreeMem: %p, %p", memory->getSvmPtr(), memory); - auto ga = reinterpret_cast(memory->getUserData().data); - if (ga != nullptr) { - if (stream == nullptr) { + if (stream == nullptr) { stream = g_devices[memory->getUserData().deviceId]->NullStream(); - } - // Unmap virtual address from memory - auto cmd = new amd::VirtualMapCommand(*stream, amd::Command::EventWaitList{}, - memory->getSvmPtr(), ga->size_, nullptr); - cmd->enqueue(); - cmd->release(); - memory->setSvmPtr(ga->ptr_); - // Free virtual address and destroy generic allocation object - ga->va_->release(); - delete ga; - memory->getUserData().data = nullptr; } + // Unmap virtual address from memory + auto cmd = new amd::VirtualMapCommand(*stream, amd::Command::EventWaitList{}, + memory->getSvmPtr(), memory->getSize(), nullptr); + cmd->enqueue(); + cmd->release(); if (stream != nullptr) { // The stream of destruction is a safe stream, because the app must handle sync diff --git a/hipamd/src/hip_vm.cpp b/hipamd/src/hip_vm.cpp index 73ba16776b..f8dd197c13 100644 --- a/hipamd/src/hip_vm.cpp +++ 
b/hipamd/src/hip_vm.cpp @@ -120,11 +120,15 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, // Add this to amd::Memory object, so this ptr is accesible for other hipmemory operations. size_t offset = 0; //this is ignored - amd::Memory* memObj = getMemoryObject(ptr, offset); + amd::Memory* phys_mem_obj = getMemoryObject(ptr, offset); //saves the current device id so that it can be accessed later - memObj->getUserData().deviceId = prop->location.id; - memObj->getUserData().data = new hip::GenericAllocation(ptr, size, *prop); - *handle = reinterpret_cast(memObj->getUserData().data); + phys_mem_obj->getUserData().deviceId = prop->location.id; + phys_mem_obj->getUserData().data = new hip::GenericAllocation(*phys_mem_obj, size, *prop); + *handle = reinterpret_cast(phys_mem_obj->getUserData().data); + + // Remove because the entry of 0x1 is not needed in MemObjMap. + // We save the copy of Phy mem obj in virtual mem obj during mapping. + amd::MemObjMap::RemoveMemObj(ptr); HIP_RETURN(hipSuccess); } @@ -225,9 +229,6 @@ hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocat cmd->awaitCompletion(); cmd->release(); - // update the internal svm address to ptr - ga->asAmdMemory().setSvmPtr(ptr); - HIP_RETURN(hipSuccess); } @@ -268,7 +269,8 @@ hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle, HIP_RETURN(hipErrorInvalidValue); } - *handle = reinterpret_cast(mem->getUserData().data); + *handle = (mem->getUserData().phys_mem_obj == nullptr) ? nullptr : reinterpret_cast<hipMemGenericAllocationHandle_t>( + mem->getUserData().phys_mem_obj->getUserData().data); if (*handle == nullptr) { HIP_RETURN(hipErrorInvalidValue); @@ -312,17 +314,17 @@ hipError_t hipMemUnmap(void* ptr, size_t size) { HIP_RETURN(hipErrorInvalidValue); } - amd::Memory* pa = amd::MemObjMap::FindMemObj(ptr); - if (pa == nullptr) { + amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(ptr); + if (vaddr_mem_obj == nullptr || size > vaddr_mem_obj->getSize()) { HIP_RETURN(hipErrorInvalidValue); } - amd::Memory* 
va = amd::MemObjMap::FindVirtualMemObj(ptr); - if (va == nullptr && va->getSize() != size) { + amd::Memory* phys_mem_obj = vaddr_mem_obj->getUserData().phys_mem_obj; + if (phys_mem_obj == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - auto& queue = *g_devices[pa->getUserData().deviceId]->NullStream(); + auto& queue = *g_devices[phys_mem_obj->getUserData().deviceId]->NullStream(); amd::Command* cmd = new amd::VirtualMapCommand(queue, amd::Command::EventWaitList{}, ptr, size, nullptr); @@ -331,9 +333,8 @@ hipError_t hipMemUnmap(void* ptr, size_t size) { cmd->release(); // restore the original pa of the generic allocation - hip::GenericAllocation* ga = reinterpret_cast(pa->getUserData().data); - pa->setSvmPtr(ga->genericAddress()); - + hip::GenericAllocation* ga + = reinterpret_cast(phys_mem_obj->getUserData().data); ga->release(); HIP_RETURN(hipSuccess); diff --git a/hipamd/src/hip_vm.hpp b/hipamd/src/hip_vm.hpp index 3302ace54f..de7bf881e4 100644 --- a/hipamd/src/hip_vm.hpp +++ b/hipamd/src/hip_vm.hpp @@ -30,35 +30,23 @@ namespace hip { hipError_t ihipFree(void* ptr); -struct MemMapAllocUserData { - void* ptr_; // Original pointer of the allocation - size_t size_; // Aligned size of the allocation - amd::Memory* va_; // Memory object for the virtual address - - MemMapAllocUserData(void* ptr, size_t size, amd::Memory* va) : ptr_(ptr), size_(size), va_(va) {} -}; - class GenericAllocation : public amd::RuntimeObject { - void* ptr_; //(this); } amd::Memory& asAmdMemory() { - size_t discardOffset; - return *getMemoryObject(genericAddress(), discardOffset); + return phys_mem_ref_; } - void* genericAddress() const { return ptr_; } virtual ObjectType objectType() const { return ObjectTypeVMMAlloc; } }; diff --git a/rocclr/device/pal/palvirtual.cpp b/rocclr/device/pal/palvirtual.cpp index 1378f2696c..a19d3aa1c6 100644 --- a/rocclr/device/pal/palvirtual.cpp +++ b/rocclr/device/pal/palvirtual.cpp @@ -2192,18 +2192,18 @@ void 
VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { amd::ScopedLock lock(execution()); profilingBegin(vcmd); - amd::Memory* va = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr()); - if (va == nullptr || !(va->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr()); + if (vaddr_mem_obj == nullptr || !(vaddr_mem_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { profilingEnd(vcmd); return; } - pal::Memory* vaRange = dev().getGpuMemory(va); - Pal::IGpuMemory* memory = (vcmd.memory() == nullptr) ? + pal::Memory* vaddr_pal_mem = dev().getGpuMemory(vaddr_mem_obj); + Pal::IGpuMemory* phymem_igpu_mem = (vcmd.memory() == nullptr) ? nullptr : dev().getGpuMemory(vcmd.memory())->iMem(); Pal::VirtualMemoryRemapRange range{ - vaRange->iMem(), + vaddr_pal_mem->iMem(), 0, - memory, + phymem_igpu_mem, 0, vcmd.size(), Pal::VirtualGpuMemAccessMode::NoAccess @@ -2224,13 +2224,15 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { setGpuEvent(event); if (result == Pal::Result::Success) { if (vcmd.memory() != nullptr) { - // assert the va wasn't mapped already + // assert the vaddr_mem_obj wasn't mapped already assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr); - amd::MemObjMap::AddMemObj(vcmd.ptr(), vcmd.memory()); + amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_mem_obj); + vaddr_mem_obj->getUserData().phys_mem_obj = vcmd.memory(); } else { - // assert the va is mapped and needs to be removed + // assert the vaddr_mem_obj is mapped and needs to be removed assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) != nullptr); amd::MemObjMap::RemoveMemObj(vcmd.ptr()); + vaddr_mem_obj->getUserData().phys_mem_obj = nullptr; } } profilingEnd(vcmd); diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 8a3fc9c54b..c4afdf9194 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -2301,6 +2301,16 @@ uint64_t Device::deviceVmemAlloc(size_t size, uint64_t 
flags) const { return hsa_vmem_handle.handle; } +void Device::deviceVmemRelease(uint64_t mem_handle) const { + hsa_amd_vmem_alloc_handle_t hsa_vmem_handle {}; + hsa_vmem_handle.handle = mem_handle; + + hsa_status_t hsa_status = hsa_amd_vmem_handle_release(hsa_vmem_handle); + if (hsa_status != HSA_STATUS_SUCCESS) { + LogPrintfError("Failed hsa_amd_vmem_handle_release! Failed with hsa status: %d \n", hsa_status); + } +} + void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain) const { const hsa_amd_memory_pool_t& pool = (pseudo_fine_grain) ? gpu_ext_fine_grained_segment_ : (atomics) ? gpu_fine_grained_segment_ : gpuvm_segment_; @@ -2381,7 +2391,7 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_ return nullptr; } - if (mem->getSvmPtr() != nullptr) { + if (mem->getSvmPtr() != nullptr || mem->getMemFlags() & ROCCLR_MEM_PHYMEM) { // add the information to context so that we can use it later. amd::MemObjMap::AddMemObj(mem->getSvmPtr(), mem); } diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index 0606c593b6..eef891f2e5 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -450,6 +450,7 @@ class Device : public NullDevice { bool deviceAllowAccess(void* dst) const; bool allowPeerAccess(device::Memory* memory) const; + void deviceVmemRelease(uint64_t mem_handle) const; uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const; void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain=false) const; diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp index 763317df22..6113262340 100644 --- a/rocclr/device/rocm/rocmemory.cpp +++ b/rocclr/device/rocm/rocmemory.cpp @@ -648,6 +648,12 @@ void Buffer::destroy() { } const bool isFineGrain = memFlags & CL_MEM_SVM_FINE_GRAIN_BUFFER; + if (memFlags & ROCCLR_MEM_PHYMEM) { + // If this is physical memory, dont call hsa free function, since device mem was never 
created + dev().deviceVmemRelease(owner()->getUserData().hsa_handle); + return; + } + if (kind_ != MEMORY_KIND_PTRGIVEN) { if (isFineGrain) { if (memFlags & CL_MEM_ALLOC_HOST_PTR) { @@ -767,7 +773,10 @@ bool Buffer::create(bool alloc_local) { owner()->getUserData().hsa_handle = dev().deviceVmemAlloc(owner()->getSize(), 0); if (owner()->getUserData().hsa_handle == 0) { LogError("HSA Opaque Handle returned was null"); + return false; } + deviceMemory_ = reinterpret_cast(amd::Memory::MemoryType::kPhyMemHandlePtr); + return true; } if ((owner()->parent() == nullptr) && diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 6f2f879e9a..7083dfbee1 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -2589,36 +2589,39 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { profilingBegin(vcmd); - // Find the amd::Memory object for virtual ptr. - amd::Memory* va = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr()); - if (va == nullptr || !(va->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + // Find the amd::Memory object for virtual ptr. vcmd.ptr() is vaddr. + amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr()); + if (vaddr_mem_obj == nullptr || !(vaddr_mem_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { profilingEnd(vcmd); return; } // Get the amd::Memory object for the physical address - amd::Memory* pa = vcmd.memory(); + amd::Memory* phys_mem_obj = vcmd.memory(); hsa_status_t hsa_status = HSA_STATUS_SUCCESS; // If Physical address is not set, then it is map command. If set, it is unmap command. 
- if (pa != nullptr) { + if (phys_mem_obj != nullptr) { // Map the physical to virtual address the hsa api hsa_amd_vmem_alloc_handle_t opaque_hsa_handle; - opaque_hsa_handle.handle = pa->getUserData().hsa_handle; - if ((hsa_status = hsa_amd_vmem_map(va->getSvmPtr(), va->getSize(), va->getOffset(), - opaque_hsa_handle, 0)) == HSA_STATUS_SUCCESS) { + opaque_hsa_handle.handle = phys_mem_obj->getUserData().hsa_handle; + if ((hsa_status = hsa_amd_vmem_map(vaddr_mem_obj->getSvmPtr(), vcmd.size(), + vaddr_mem_obj->getOffset(), opaque_hsa_handle, 0)) == HSA_STATUS_SUCCESS) { assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr); // Now that we have mapped physical addr to virtual addr, make an entry in the MemObjMap. - amd::MemObjMap::AddMemObj(vcmd.ptr(), vcmd.memory()); + amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_mem_obj); + vaddr_mem_obj->getUserData().phys_mem_obj = phys_mem_obj; } else { LogError("HSA Command: hsa_amd_vmem_map failed!"); } } else { // Unmap the object, since the physical addr is set. 
- if ((hsa_status = hsa_amd_vmem_unmap(va->getSvmPtr(), va->getSize())) == HSA_STATUS_SUCCESS) { + if ((hsa_status = hsa_amd_vmem_unmap(vaddr_mem_obj->getSvmPtr(), vcmd.size())) + == HSA_STATUS_SUCCESS) { // assert the va is mapped and needs to be removed assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) != nullptr); amd::MemObjMap::RemoveMemObj(vcmd.ptr()); + vaddr_mem_obj->getUserData().phys_mem_obj = nullptr; } else { LogError("HSA Command: hsa_amd_vmem_unmap failed"); } diff --git a/rocclr/platform/memory.hpp b/rocclr/platform/memory.hpp index e89ea28791..afda26623c 100644 --- a/rocclr/platform/memory.hpp +++ b/rocclr/platform/memory.hpp @@ -142,13 +142,15 @@ class Memory : public amd::RuntimeObject { public: enum MemoryType { kSvmMemoryPtr = 0x1, - kArenaMemoryPtr = 0x100 + kArenaMemoryPtr = 0x100, + kPhyMemHandlePtr = 0x101 }; struct UserData { int deviceId = 0; //!< Device ID memory is allocated on void* data = nullptr; //!< Opaque user data from CL or HIP or etc. + amd::Memory* phys_mem_obj = nullptr; //