SWDEV-423835 - Fixing kernel launch issues on Virtual Memory Management path.

Change-Id: I9f5e8a3d83af3809b2c50b21a10697e26113dd23
This commit is contained in:
kjayapra-amd
2024-02-05 16:50:51 -05:00
کامیت شده توسط Karthik Jayaprakash
والد dd43dc930d
کامیت f5ca620baa
10فایلهای تغییر یافته به همراه87 افزوده شده و 83 حذف شده
@@ -2104,13 +2104,8 @@ class GraphMemAllocNode final : public GraphNode {
// Retain memory object because command release will release it
memory_->retain();
size_ = aligned_size;
// Save generic allocation info to match VM interfaces
memory_->getUserData().data = new hip::MemMapAllocUserData(dptr, aligned_size, va_);
// Execute the original mapping command
VirtualMapCommand::submit(device);
// Update the internal svm address to ptr
memory()->setSvmPtr(va_->getSvmPtr());
// Can't destroy VA, because it's used in mapping even if the node will be destroyed
va_->retain();
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemAlloc execute: %p, %p",
va_->getSvmPtr(), memory());
@@ -2234,24 +2229,21 @@ class GraphMemFreeNode : public GraphNode {
virtual void submit(device::VirtualDevice& device) final {
// Find memory object before unmap logic
auto alloc = amd::MemObjMap::FindMemObj(ptr());
auto vaddr_mem_obj = amd::MemObjMap::FindMemObj(ptr());
amd::Memory* phys_mem_obj = vaddr_mem_obj->getUserData().phys_mem_obj;
assert(phys_mem_obj != nullptr);
VirtualMapCommand::submit(device);
// Restore the original address of the generic allocation
auto ga = reinterpret_cast<hip::MemMapAllocUserData*>(alloc->getUserData().data);
alloc->setSvmPtr(ga->ptr_);
if (!AMD_DIRECT_DISPATCH) {
// Update the current device, since hip event, used in mem pools, requires device
hip::setCurrentDevice(device_id_);
}
// Free virtual address
ga->va_->release();
alloc->getUserData().data = nullptr;
vaddr_mem_obj->release();
// Release the allocation back to graph's pool
graph_->FreeMemory(ga->ptr_, static_cast<hip::Stream*>(queue()));
amd::MemObjMap::AddMemObj(ptr(), ga->va_);
delete ga;
graph_->FreeMemory(phys_mem_obj->getSvmPtr(), static_cast<hip::Stream*>(queue()));
amd::MemObjMap::AddMemObj(ptr(), vaddr_mem_obj);
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Graph MemFree execute: %p, %p",
ptr(), alloc);
ptr(), vaddr_mem_obj);
}
private:
+10 -14
مشاهده پرونده
@@ -225,6 +225,10 @@ bool MemoryPool::FreeMemory(amd::Memory* memory, Stream* stream, Event* event) {
{
amd::ScopedLock lock(lock_pool_ops_);
if (memory->getUserData().phys_mem_obj != nullptr) {
memory = memory->getUserData().phys_mem_obj;
}
// If the free heap grows over the busy heap, then force release
if (AMD_DIRECT_DISPATCH && (free_heap_.GetTotalSize() > busy_heap_.GetTotalSize())) {
// Use event base release to reduce memory pressure
@@ -249,22 +253,14 @@ bool MemoryPool::FreeMemory(amd::Memory* memory, Stream* stream, Event* event) {
}
ClPrint(amd::LOG_INFO, amd::LOG_MEM_POOL, "Pool FreeMem: %p, %p", memory->getSvmPtr(), memory);
auto ga = reinterpret_cast<hip::MemMapAllocUserData*>(memory->getUserData().data);
if (ga != nullptr) {
if (stream == nullptr) {
if (stream == nullptr) {
stream = g_devices[memory->getUserData().deviceId]->NullStream();
}
// Unmap virtual address from memory
auto cmd = new amd::VirtualMapCommand(*stream, amd::Command::EventWaitList{},
memory->getSvmPtr(), ga->size_, nullptr);
cmd->enqueue();
cmd->release();
memory->setSvmPtr(ga->ptr_);
// Free virtual address and destroy generic allocation object
ga->va_->release();
delete ga;
memory->getUserData().data = nullptr;
}
// Unmap virtual address from memory
auto cmd = new amd::VirtualMapCommand(*stream, amd::Command::EventWaitList{},
memory->getSvmPtr(), memory->getSize(), nullptr);
cmd->enqueue();
cmd->release();
if (stream != nullptr) {
// The stream of destruction is a safe stream, because the app must handle sync
+17 -16
مشاهده پرونده
@@ -120,11 +120,15 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size,
// Add this to amd::Memory object, so this ptr is accessible for other hipmemory operations.
size_t offset = 0; //this is ignored
amd::Memory* memObj = getMemoryObject(ptr, offset);
amd::Memory* phys_mem_obj = getMemoryObject(ptr, offset);
//saves the current device id so that it can be accessed later
memObj->getUserData().deviceId = prop->location.id;
memObj->getUserData().data = new hip::GenericAllocation(ptr, size, *prop);
*handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(memObj->getUserData().data);
phys_mem_obj->getUserData().deviceId = prop->location.id;
phys_mem_obj->getUserData().data = new hip::GenericAllocation(*phys_mem_obj, size, *prop);
*handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(phys_mem_obj->getUserData().data);
// Remove because the entry of 0x1 is not needed in MemObjMap.
// We save the copy of Phy mem obj in virtual mem obj during mapping.
amd::MemObjMap::RemoveMemObj(ptr);
HIP_RETURN(hipSuccess);
}
@@ -225,9 +229,6 @@ hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocat
cmd->awaitCompletion();
cmd->release();
// update the internal svm address to ptr
ga->asAmdMemory().setSvmPtr(ptr);
HIP_RETURN(hipSuccess);
}
@@ -268,7 +269,8 @@ hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle,
HIP_RETURN(hipErrorInvalidValue);
}
*handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(mem->getUserData().data);
*handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(
mem->getUserData().phys_mem_obj->getUserData().data);
if (*handle == nullptr) {
HIP_RETURN(hipErrorInvalidValue);
@@ -312,17 +314,17 @@ hipError_t hipMemUnmap(void* ptr, size_t size) {
HIP_RETURN(hipErrorInvalidValue);
}
amd::Memory* pa = amd::MemObjMap::FindMemObj(ptr);
if (pa == nullptr) {
// Look up the virtual-address memory object registered for ptr.
amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(ptr);
// Reject an address with no virtual memory object. The previous guard combined the
// null test with a getSize() comparison using &&, which dereferenced the pointer
// precisely when it was null and could never be true for a non-null object.
// NOTE(review): the size comparison was therefore dead code. It is not enabled here
// because mappings may cover only part of the reserved range -- confirm the intended
// size validation before adding one.
if (vaddr_mem_obj == nullptr) {
  HIP_RETURN(hipErrorInvalidValue);
}
amd::Memory* va = amd::MemObjMap::FindVirtualMemObj(ptr);
if (va == nullptr && va->getSize() != size) {
amd::Memory* phys_mem_obj = vaddr_mem_obj->getUserData().phys_mem_obj;
if (phys_mem_obj == nullptr) {
HIP_RETURN(hipErrorInvalidValue);
}
auto& queue = *g_devices[pa->getUserData().deviceId]->NullStream();
auto& queue = *g_devices[phys_mem_obj->getUserData().deviceId]->NullStream();
amd::Command* cmd = new amd::VirtualMapCommand(queue, amd::Command::EventWaitList{}, ptr, size,
nullptr);
@@ -331,9 +333,8 @@ hipError_t hipMemUnmap(void* ptr, size_t size) {
cmd->release();
// restore the original pa of the generic allocation
hip::GenericAllocation* ga = reinterpret_cast<hip::GenericAllocation*>(pa->getUserData().data);
pa->setSvmPtr(ga->genericAddress());
hip::GenericAllocation* ga
= reinterpret_cast<hip::GenericAllocation*>(phys_mem_obj->getUserData().data);
ga->release();
HIP_RETURN(hipSuccess);
+5 -17
مشاهده پرونده
@@ -30,35 +30,23 @@ namespace hip {
hipError_t ihipFree(void* ptr);
// Per-allocation record that ties an allocation's original pointer and aligned
// size to the memory object of the virtual address it is mapped at.
struct MemMapAllocUserData {
  void* ptr_;        // Pointer the allocation had originally
  size_t size_;      // Allocation size after alignment
  amd::Memory* va_;  // Memory object describing the virtual address

  // Stores the three values as given.
  MemMapAllocUserData(void* ptr, size_t size, amd::Memory* va)
      : ptr_{ptr}, size_{size}, va_{va} {}
};
class GenericAllocation : public amd::RuntimeObject {
void* ptr_; //<! Device ptr
amd::Memory& phys_mem_ref_; //<! Physical memory object
size_t size_; //<! Allocated size
hipMemAllocationProp properties_; //<! Allocation Properties
public:
GenericAllocation(void* ptr, size_t size, const hipMemAllocationProp& prop)
: ptr_(ptr), size_(size), properties_(prop) {}
~GenericAllocation() {
hipError_t err = ihipFree(ptr_);
}
GenericAllocation(amd::Memory& phys_mem_ref, size_t size, const hipMemAllocationProp& prop)
: phys_mem_ref_(phys_mem_ref), size_(size), properties_(prop) {}
~GenericAllocation() {}
const hipMemAllocationProp& GetProperties() const { return properties_; }
hipMemGenericAllocationHandle_t asMemGenericAllocationHandle() {
return reinterpret_cast<hipMemGenericAllocationHandle_t>(this);
}
amd::Memory& asAmdMemory() {
size_t discardOffset;
return *getMemoryObject(genericAddress(), discardOffset);
return phys_mem_ref_;
}
void* genericAddress() const { return ptr_; }
virtual ObjectType objectType() const { return ObjectTypeVMMAlloc; }
};
@@ -2192,18 +2192,18 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
amd::ScopedLock lock(execution());
profilingBegin(vcmd);
amd::Memory* va = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
if (va == nullptr || !(va->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
if (vaddr_mem_obj == nullptr || !(vaddr_mem_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
profilingEnd(vcmd);
return;
}
pal::Memory* vaRange = dev().getGpuMemory(va);
Pal::IGpuMemory* memory = (vcmd.memory() == nullptr) ?
pal::Memory* vaddr_pal_mem = dev().getGpuMemory(vaddr_mem_obj);
Pal::IGpuMemory* phymem_igpu_mem = (vcmd.memory() == nullptr) ?
nullptr : dev().getGpuMemory(vcmd.memory())->iMem();
Pal::VirtualMemoryRemapRange range{
vaRange->iMem(),
vaddr_pal_mem->iMem(),
0,
memory,
phymem_igpu_mem,
0,
vcmd.size(),
Pal::VirtualGpuMemAccessMode::NoAccess
@@ -2224,13 +2224,15 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
setGpuEvent(event);
if (result == Pal::Result::Success) {
if (vcmd.memory() != nullptr) {
// assert the va wasn't mapped already
// assert the vaddr_mem_obj wasn't mapped already
assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr);
amd::MemObjMap::AddMemObj(vcmd.ptr(), vcmd.memory());
amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_mem_obj);
vaddr_mem_obj->getUserData().phys_mem_obj = vcmd.memory();
} else {
// assert the va is mapped and needs to be removed
// assert the vaddr_mem_obj is mapped and needs to be removed
assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) != nullptr);
amd::MemObjMap::RemoveMemObj(vcmd.ptr());
vaddr_mem_obj->getUserData().phys_mem_obj = nullptr;
}
}
profilingEnd(vcmd);
@@ -2301,6 +2301,16 @@ uint64_t Device::deviceVmemAlloc(size_t size, uint64_t flags) const {
return hsa_vmem_handle.handle;
}
void Device::deviceVmemRelease(uint64_t mem_handle) const {
hsa_amd_vmem_alloc_handle_t hsa_vmem_handle {};
hsa_vmem_handle.handle = mem_handle;
hsa_status_t hsa_status = hsa_amd_vmem_handle_release(hsa_vmem_handle);
if (hsa_status != HSA_STATUS_SUCCESS) {
LogPrintfError("Failed hsa_amd_vmem_handle_release! Failed with hsa status: %d \n", hsa_status);
}
}
void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain) const {
const hsa_amd_memory_pool_t& pool = (pseudo_fine_grain) ? gpu_ext_fine_grained_segment_
: (atomics) ? gpu_fine_grained_segment_ : gpuvm_segment_;
@@ -2381,7 +2391,7 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_
return nullptr;
}
if (mem->getSvmPtr() != nullptr) {
if (mem->getSvmPtr() != nullptr || mem->getMemFlags() & ROCCLR_MEM_PHYMEM) {
// add the information to context so that we can use it later.
amd::MemObjMap::AddMemObj(mem->getSvmPtr(), mem);
}
@@ -450,6 +450,7 @@ class Device : public NullDevice {
bool deviceAllowAccess(void* dst) const;
bool allowPeerAccess(device::Memory* memory) const;
void deviceVmemRelease(uint64_t mem_handle) const;
uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const;
void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain=false) const;
@@ -648,6 +648,12 @@ void Buffer::destroy() {
}
const bool isFineGrain = memFlags & CL_MEM_SVM_FINE_GRAIN_BUFFER;
if (memFlags & ROCCLR_MEM_PHYMEM) {
// If this is physical memory, don't call the hsa free function, since device memory was never created
dev().deviceVmemRelease(owner()->getUserData().hsa_handle);
return;
}
if (kind_ != MEMORY_KIND_PTRGIVEN) {
if (isFineGrain) {
if (memFlags & CL_MEM_ALLOC_HOST_PTR) {
@@ -767,7 +773,10 @@ bool Buffer::create(bool alloc_local) {
owner()->getUserData().hsa_handle = dev().deviceVmemAlloc(owner()->getSize(), 0);
if (owner()->getUserData().hsa_handle == 0) {
LogError("HSA Opaque Handle returned was null");
return false;
}
deviceMemory_ = reinterpret_cast<void*>(amd::Memory::MemoryType::kPhyMemHandlePtr);
return true;
}
if ((owner()->parent() == nullptr) &&
@@ -2589,36 +2589,39 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
profilingBegin(vcmd);
// Find the amd::Memory object for virtual ptr.
amd::Memory* va = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
if (va == nullptr || !(va->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
// Find the amd::Memory object for virtual ptr. vcmd.ptr() is vaddr.
amd::Memory* vaddr_mem_obj = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr());
if (vaddr_mem_obj == nullptr || !(vaddr_mem_obj->getMemFlags() & CL_MEM_VA_RANGE_AMD)) {
profilingEnd(vcmd);
return;
}
// Get the amd::Memory object for the physical address
amd::Memory* pa = vcmd.memory();
amd::Memory* phys_mem_obj = vcmd.memory();
hsa_status_t hsa_status = HSA_STATUS_SUCCESS;
// If the physical memory object is set, this is a map command. If it is null, it is an unmap command.
if (pa != nullptr) {
if (phys_mem_obj != nullptr) {
// Map the physical to virtual address the hsa api
hsa_amd_vmem_alloc_handle_t opaque_hsa_handle;
opaque_hsa_handle.handle = pa->getUserData().hsa_handle;
if ((hsa_status = hsa_amd_vmem_map(va->getSvmPtr(), va->getSize(), va->getOffset(),
opaque_hsa_handle, 0)) == HSA_STATUS_SUCCESS) {
opaque_hsa_handle.handle = phys_mem_obj->getUserData().hsa_handle;
if ((hsa_status = hsa_amd_vmem_map(vaddr_mem_obj->getSvmPtr(), vcmd.size(),
vaddr_mem_obj->getOffset(), opaque_hsa_handle, 0)) == HSA_STATUS_SUCCESS) {
assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr);
// Now that we have mapped physical addr to virtual addr, make an entry in the MemObjMap.
amd::MemObjMap::AddMemObj(vcmd.ptr(), vcmd.memory());
amd::MemObjMap::AddMemObj(vcmd.ptr(), vaddr_mem_obj);
vaddr_mem_obj->getUserData().phys_mem_obj = phys_mem_obj;
} else {
LogError("HSA Command: hsa_amd_vmem_map failed!");
}
} else {
// Unmap the object, since no physical memory object was supplied.
if ((hsa_status = hsa_amd_vmem_unmap(va->getSvmPtr(), va->getSize())) == HSA_STATUS_SUCCESS) {
if ((hsa_status = hsa_amd_vmem_unmap(vaddr_mem_obj->getSvmPtr(), vcmd.size()))
== HSA_STATUS_SUCCESS) {
// assert the va is mapped and needs to be removed
assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) != nullptr);
amd::MemObjMap::RemoveMemObj(vcmd.ptr());
vaddr_mem_obj->getUserData().phys_mem_obj = nullptr;
} else {
LogError("HSA Command: hsa_amd_vmem_unmap failed");
}
+3 -1
مشاهده پرونده
@@ -142,13 +142,15 @@ class Memory : public amd::RuntimeObject {
public:
enum MemoryType {
kSvmMemoryPtr = 0x1,
kArenaMemoryPtr = 0x100
kArenaMemoryPtr = 0x100,
kPhyMemHandlePtr = 0x101
};
struct UserData
{
int deviceId = 0; //!< Device ID memory is allocated on
void* data = nullptr; //!< Opaque user data from CL or HIP or etc.
amd::Memory* phys_mem_obj = nullptr; //<! Physical mem obj, only set on virtual mem
uint64_t hsa_handle = 0; //!<Opaque hsa handle saved for Virtual memories
unsigned int flags = 0; //!< HIP memory flags
//! hipMallocPitch allocates buffer using width & height and returns pitch & device pointer.