diff --git a/hipamd/src/hip_vm.cpp b/hipamd/src/hip_vm.cpp index 17287d8b41..2059419489 100644 --- a/hipamd/src/hip_vm.cpp +++ b/hipamd/src/hip_vm.cpp @@ -22,6 +22,16 @@ #include "hip_internal.hpp" #include "hip_vm.hpp" +static_assert(static_cast(hipMemAccessFlagsProtNone) + == static_cast(amd::Device::VmmAccess::kNone), + "Mem Access Flag None mismatch with ROCclr!"); +static_assert(static_cast(hipMemAccessFlagsProtRead) + == static_cast(amd::Device::VmmAccess::kReadOnly), + "Mem Access Flag Read mismatch with ROCclr!"); +static_assert(static_cast(hipMemAccessFlagsProtReadWrite) + == static_cast(amd::Device::VmmAccess::kReadWrite), + "Mem Access Flag Read Write mismatch with ROCclr!"); + hipError_t hipMemAddressFree(void* devPtr, size_t size) { HIP_INIT_API(hipMemAddressFree, devPtr, size); @@ -29,52 +39,47 @@ hipError_t hipMemAddressFree(void* devPtr, size_t size) { HIP_RETURN(hipErrorInvalidValue); } - for (auto& dev: g_devices) { - dev->devices()[0]->virtualFree(devPtr); - } + // Single call frees address range for all devices. + g_devices[0]->devices()[0]->virtualFree(devPtr); HIP_RETURN(hipSuccess); } -hipError_t hipMemAddressReserve(void** ptr, size_t size, size_t alignment, void* addr, unsigned long long flags) { +hipError_t hipMemAddressReserve(void** ptr, size_t size, size_t alignment, void* addr, + unsigned long long flags) { HIP_INIT_API(hipMemAddressReserve, ptr, size, alignment, addr, flags); - if (ptr == nullptr || - flags !=0) { + if (ptr == nullptr || flags != 0) { HIP_RETURN(hipErrorInvalidValue); } + const auto& dev_info = g_devices[0]->devices()[0]->info(); + if (size == 0 || ((size % dev_info.virtualMemAllocGranularity_) != 0)) { + HIP_RETURN(hipErrorMemoryAllocation); + } + + // Initialize the ptr, single virtual alloc call would reserve va range for all devices. 
*ptr = nullptr; + *ptr = g_devices[0]->devices()[0]->virtualAlloc(addr, size, alignment); + if (*ptr == nullptr) { + HIP_RETURN(hipErrorOutOfMemory); + } - void* startAddress = addr; - - for (auto& dev : g_devices) { - *ptr = dev->devices()[0]->virtualAlloc(startAddress, size, alignment); - - // if addr==0 we generate the va and use it for other devices - if (startAddress == nullptr) { - startAddress = *ptr; - } else if (*ptr != startAddress) { - // if we cannot reserve the same VA on other devices, just fail - for (auto& d : g_devices) { - if (d == dev) HIP_RETURN(hipErrorOutOfMemory); - d->devices()[0]->virtualFree(startAddress); - } - } + // If a specific address was requested but a different one was reserved, log the mismatch. + // The call still returns hipSuccess with the address that was actually reserved. + if (addr != nullptr && addr != *ptr) { + LogPrintfError("Requested address : %p was not allocated. Allocated address : %p ", addr, *ptr); } HIP_RETURN(hipSuccess); } -hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, const hipMemAllocationProp* prop, unsigned long long flags) { +hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, + const hipMemAllocationProp* prop, unsigned long long flags) { HIP_INIT_API(hipMemCreate, handle, size, prop, flags); - if (handle == nullptr || - size == 0 || - flags != 0 || - prop == nullptr || - prop->type != hipMemAllocationTypePinned || - prop->location.type != hipMemLocationTypeDevice || + // Currently only pinned allocations located on a valid device are accepted + if (handle == nullptr || size == 0 || flags != 0 || prop == nullptr || + prop->type != hipMemAllocationTypePinned || prop->location.type != hipMemLocationTypeDevice || prop->location.id >= g_devices.size()) { HIP_RETURN(hipErrorInvalidValue); } @@ -84,6 +89,7 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, co HIP_RETURN(hipErrorNotSupported); } + // Device info validation const auto& dev_info = g_devices[prop->location.id]->devices()[0]->info(); if (dev_info.maxPhysicalMemAllocSize_ < size) { @@ -95,34 +101,39 @@ 
hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, co amd::Context* amdContext = g_devices[prop->location.id]->asContext(); - void* ptr = amd::SvmBuffer::malloc(*amdContext, 0, size, dev_info.memBaseAddrAlign_, - nullptr); + // When ROCCLR_MEM_PHYMEM is set, ROCr impl gets and stores unique hsa handle. Flag no-op on PAL. + void* ptr = amd::SvmBuffer::malloc(*amdContext, ROCCLR_MEM_PHYMEM, size, + dev_info.memBaseAddrAlign_, nullptr); + // Handle out of memory cases, if (ptr == nullptr) { size_t free = 0, total =0; - hipError_t err = hipMemGetInfo(&free, &total); - if (err == hipSuccess) { - LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu | total :%zu \n", size, free, total); + hipError_t hip_error = hipMemGetInfo(&free, &total); + if (hip_error == hipSuccess) { + LogPrintfError("Allocation failed : Device memory : required :%zu | free :%zu" + "| total :%zu \n", size, free, total); } HIP_RETURN(hipErrorOutOfMemory); } + + // Add this to amd::Memory object, so this ptr is accesible for other hipmemory operations. 
size_t offset = 0; //this is ignored amd::Memory* memObj = getMemoryObject(ptr, offset); //saves the current device id so that it can be accessed later memObj->getUserData().deviceId = prop->location.id; memObj->getUserData().data = new hip::GenericAllocation(ptr, size, *prop); - *handle = reinterpret_cast(memObj->getUserData().data); HIP_RETURN(hipSuccess); } -hipError_t hipMemExportToShareableHandle(void* shareableHandle, hipMemGenericAllocationHandle_t handle, hipMemAllocationHandleType handleType, unsigned long long flags) { +hipError_t hipMemExportToShareableHandle(void* shareableHandle, + hipMemGenericAllocationHandle_t handle, + hipMemAllocationHandleType handleType, + unsigned long long flags) { HIP_INIT_API(hipMemExportToShareableHandle, shareableHandle, handle, handleType, flags); - if (flags != 0 || - handle == nullptr || - shareableHandle == nullptr) { + if (flags != 0 || handle == nullptr || shareableHandle == nullptr) { HIP_RETURN(hipErrorInvalidValue); } @@ -132,23 +143,30 @@ hipError_t hipMemExportToShareableHandle(void* shareableHandle, hipMemGenericAll hipError_t hipMemGetAccess(unsigned long long* flags, const hipMemLocation* location, void* ptr) { HIP_INIT_API(hipMemGetAccess, flags, location, ptr); - if (flags == nullptr || - location == nullptr || - ptr == nullptr) { + if (flags == nullptr || location == nullptr || ptr == nullptr + || location->type != hipMemLocationTypeDevice || location->id >= g_devices.size()) { + HIP_RETURN(hipErrorInvalidValue) + } + + // Convert the access flags to amd::Device access flag + auto& dev = g_devices[location->id]; + amd::Device::VmmAccess access_flags = static_cast(0); + + if (!dev->devices()[0]->GetMemAccess(ptr, &access_flags)) { HIP_RETURN(hipErrorInvalidValue); } + *flags = static_cast(access_flags); + HIP_RETURN(hipSuccess); } -hipError_t hipMemGetAllocationGranularity(size_t* granularity, const hipMemAllocationProp* prop, hipMemAllocationGranularity_flags option) { +hipError_t 
hipMemGetAllocationGranularity(size_t* granularity, const hipMemAllocationProp* prop, + hipMemAllocationGranularity_flags option) { HIP_INIT_API(hipMemGetAllocationGranularity, granularity, prop, option); - if (granularity == nullptr || - prop == nullptr || - prop->type != hipMemAllocationTypePinned || - prop->location.type != hipMemLocationTypeDevice || - prop->location.id >= g_devices.size()) { + if (granularity == nullptr || prop == nullptr || prop->type != hipMemAllocationTypePinned || + prop->location.type != hipMemLocationTypeDevice || prop->location.id >= g_devices.size()) { HIP_RETURN(hipErrorInvalidValue); } @@ -171,7 +189,8 @@ hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop, h HIP_RETURN(hipSuccess); } -hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* handle, void* osHandle, hipMemAllocationHandleType shHandleType) { +hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* handle, void* osHandle, + hipMemAllocationHandleType shHandleType) { HIP_INIT_API(hipMemImportFromShareableHandle, handle, osHandle, shHandleType); if (handle == nullptr || osHandle == nullptr) { @@ -181,22 +200,23 @@ hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* hand HIP_RETURN(hipErrorNotSupported); } -hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocationHandle_t handle, unsigned long long flags) { +hipError_t hipMemMap(void* ptr, size_t size, size_t offset, hipMemGenericAllocationHandle_t handle, + unsigned long long flags) { HIP_INIT_API(hipMemMap, ptr, size, offset, handle, flags); - if (ptr == nullptr || - handle == nullptr || - size == 0 || - offset != 0 || - flags != 0) { + if (ptr == nullptr || handle == nullptr || size == 0 || offset != 0 || flags != 0) { HIP_RETURN(hipErrorInvalidValue); } + // Re-interpret the ga handle and set the mapped flag hip::GenericAllocation* ga = reinterpret_cast(handle); + ga->retain(); auto& queue = 
*g_devices[ga->GetProperties().location.id]->NullStream(); - amd::Command* cmd = new amd::VirtualMapCommand(queue, amd::Command::EventWaitList{}, ptr, size, &ga->asAmdMemory()); + // Map the physical address to virtual address + amd::Command* cmd = new amd::VirtualMapCommand(queue, amd::Command::EventWaitList{}, ptr, size, + &ga->asAmdMemory()); cmd->enqueue(); cmd->awaitCompletion(); cmd->release(); @@ -220,11 +240,13 @@ hipError_t hipMemMapArrayAsync(hipArrayMapInfo* mapInfoList, unsigned int count hipError_t hipMemRelease(hipMemGenericAllocationHandle_t handle) { HIP_INIT_API(hipMemRelease, handle); - if (handle == nullptr) HIP_RETURN(hipErrorInvalidValue); + if (handle == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + // Re-interpret the ga handle and make sure it is not already released. hip::GenericAllocation* ga = reinterpret_cast(handle); - - delete ga; + ga->release(); HIP_RETURN(hipSuccess); } @@ -232,7 +254,9 @@ hipError_t hipMemRelease(hipMemGenericAllocationHandle_t handle) { hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle, void* addr) { HIP_INIT_API(hipMemRetainAllocationHandle, handle, addr); - if (handle == nullptr || addr == nullptr) HIP_RETURN(hipErrorInvalidValue); + if (handle == nullptr || addr == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } amd::Memory* mem = amd::MemObjMap::FindMemObj(addr); @@ -252,10 +276,18 @@ hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle, hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc, size_t count) { HIP_INIT_API(hipMemSetAccess, ptr, size, desc, count); - if (ptr == nullptr || - size == 0 || - desc == nullptr || - count == 0) { + if (ptr == nullptr || size == 0 || desc == nullptr || count == 0) { + HIP_RETURN(hipErrorInvalidValue); + } + + if (desc->location.id >= g_devices.size()) { + HIP_RETURN(hipErrorInvalidValue) + } + + auto& dev = g_devices[desc->location.id]; + amd::Device::VmmAccess access_flags = 
static_cast(desc->flags); + + if (!dev->devices()[0]->SetMemAccess(ptr, size, access_flags, count)) { HIP_RETURN(hipErrorInvalidValue); } @@ -265,13 +297,15 @@ hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc, hipError_t hipMemUnmap(void* ptr, size_t size) { HIP_INIT_API(hipMemUnmap, ptr, size); - if (ptr == nullptr) HIP_RETURN(hipErrorInvalidValue); + if (ptr == nullptr || size == 0) { + HIP_RETURN(hipErrorInvalidValue); + } amd::Memory* va = amd::MemObjMap::FindMemObj(ptr); - auto& queue = *g_devices[va->getUserData().deviceId]->NullStream(); - amd::Command* cmd = new amd::VirtualMapCommand(queue, amd::Command::EventWaitList{}, ptr, size, nullptr); + amd::Command* cmd = new amd::VirtualMapCommand(queue, amd::Command::EventWaitList{}, ptr, size, + nullptr); cmd->enqueue(); cmd->awaitCompletion(); cmd->release(); @@ -280,6 +314,7 @@ hipError_t hipMemUnmap(void* ptr, size_t size) { hip::GenericAllocation* ga = reinterpret_cast(va->getUserData().data); va->setSvmPtr(ga->genericAddress()); - HIP_RETURN(hipSuccess); -} + ga->release(); + HIP_RETURN(hipSuccess); +} \ No newline at end of file diff --git a/hipamd/src/hip_vm.hpp b/hipamd/src/hip_vm.hpp index 1b8db6a1cb..241c07dda7 100644 --- a/hipamd/src/hip_vm.hpp +++ b/hipamd/src/hip_vm.hpp @@ -24,6 +24,8 @@ #include #include "hip_internal.hpp" +#include "platform/object.hpp" + hipError_t ihipFree(void* ptr); namespace hip { @@ -36,22 +38,29 @@ struct MemMapAllocUserData { MemMapAllocUserData(void* ptr, size_t size, amd::Memory* va) : ptr_(ptr), size_(size), va_(va) {} }; -class GenericAllocation { - void* ptr_; - size_t size_; - hipMemAllocationProp properties_; +class GenericAllocation : public amd::RuntimeObject { + void* ptr_; //(this); } + hipMemGenericAllocationHandle_t asMemGenericAllocationHandle() { + return reinterpret_cast(this); + } amd::Memory& asAmdMemory() { size_t discardOffset; return *getMemoryObject(genericAddress(), discardOffset); } void* genericAddress() const { 
return ptr_; } + + virtual ObjectType objectType() const { return ObjectTypeVMMAlloc; } }; }; diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index a033fcd7eb..32a321962c 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -1584,6 +1584,14 @@ class Device : public RuntimeObject { kCacheStateSystem = 2 } CacheState; + // LinkAttrType; static constexpr size_t kP2PStagingSize = 4 * Mi; @@ -1784,6 +1792,25 @@ class Device : public RuntimeObject { */ virtual void* virtualAlloc(void* addr, size_t size, size_t alignment) = 0; + /** + * Set Access permisions for a virtual memory object. + * + * @param va_addr Virtual Address ptr + * @param va_size Virtual Address Size + * @param access_flags Access permissions + * @param count Number of access permissions + */ + virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, + size_t count) = 0; + + /** + * Get Access permisions for a virtual memory object. + * + * @param va_addr Virtual Address ptr + * @param access_flags_ptr Access permissions to be filled + */ + virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) = 0; + /** * Free a VA range * @@ -1966,6 +1993,7 @@ class Device : public RuntimeObject { virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0) { return nullptr; } + #if defined(__clang__) #if __has_feature(address_sanitizer) virtual device::UriLocator* createUriLocator() const = 0; diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp index d47346c922..fbeaa9ba68 100644 --- a/rocclr/device/pal/paldevice.cpp +++ b/rocclr/device/pal/paldevice.cpp @@ -2385,7 +2385,11 @@ void* Device::virtualAlloc(void* addr, size_t size, size_t alignment) { return nullptr; } - if (!mem->create(nullptr, false)) { + constexpr bool kSysMemAlloc = false; + constexpr bool kSkipAlloc = false; + constexpr bool kForceAlloc = true; + // Force the alloc now for VA_Range reservation. 
+ if (!mem->create(nullptr, kSysMemAlloc, kSkipAlloc, kForceAlloc)) { LogError("failed to create a va range mem object"); mem->release(); return nullptr; diff --git a/rocclr/device/pal/paldevice.hpp b/rocclr/device/pal/paldevice.hpp index 764bea98ab..1dab8b22d6 100644 --- a/rocclr/device/pal/paldevice.hpp +++ b/rocclr/device/pal/paldevice.hpp @@ -147,6 +147,14 @@ class NullDevice : public amd::Device { virtual void* virtualAlloc(void* addr, size_t size, size_t alignment) { return nullptr; }; virtual void virtualFree(void* addr) { }; + virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, size_t count) { + return true; + } + + virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) { + return true; + } + virtual bool importExtSemaphore(void** extSemaphore,const amd::Os::FileDesc& handle, amd::ExternalSemaphoreHandleType sem_handle_type) override { return false; @@ -535,6 +543,14 @@ class Device : public NullDevice { virtual void* virtualAlloc(void* addr, size_t size, size_t alignment); virtual void virtualFree(void* addr); + virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, size_t count) { + return true; + } + + virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) { + return true; + } + //! Returns SRD manger object SrdManager& srds() const { return *srdManager_; } diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 58a6838948..14b1827b47 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -64,6 +64,20 @@ #define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." XSTR(OPENCL_MINOR) #define OPENCL_C_VERSION_STR XSTR(OPENCL_C_MAJOR) "." 
XSTR(OPENCL_C_MINOR) + +static_assert(static_cast(amd::Device::VmmAccess::kNone) + == static_cast(HSA_ACCESS_PERMISSION_NONE), + "Vmm Access Flag None mismatch with ROC-runtime!"); +static_assert(static_cast(amd::Device::VmmAccess::kReadOnly) + == static_cast(HSA_ACCESS_PERMISSION_RO), + "Vmm Access Flag Read mismatch with ROCr-runtime!"); +static_assert(static_cast(amd::Device::VmmAccess::kWriteOnly) + == static_cast(HSA_ACCESS_PERMISSION_WO), + "Vmm Access Flag Write mismatch with ROC-runtime!"); +static_assert(static_cast(amd::Device::VmmAccess::kReadWrite) + == static_cast(HSA_ACCESS_PERMISSION_RW), + "Vmm Access Flag Read Write mismatch with ROC-runtime!"); + #ifndef WITHOUT_HSA_BACKEND namespace { @@ -935,6 +949,14 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo } else { dev->info_.largeBar_ = ROC_ENABLE_LARGE_BAR; } + + // Query the recommended granularity for this pool. + stat = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, + &(dev->info_.virtualMemAllocGranularity_)); + if (stat != HSA_STATUS_SUCCESS) { + LogPrintfError("Cannot query HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE info" + "failed with hsa_status: %d \n", stat); + } } if (dev->gpuvm_segment_.handle == 0) { @@ -1720,7 +1742,17 @@ bool Device::populateOCLDeviceConstants() { maxSdmaReadMask_, maxSdmaWriteMask_); info_.globalCUMask_ = {}; + + // Virtual memory Management Support, if set to true then the HW and SW Stack supports VMM. 
info_.virtualMemoryManagement_ = false; + if (HIP_VMEM_MANAGE_SUPPORT) { + if (HSA_STATUS_SUCCESS != hsa_system_get_info( + static_cast(HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED), + &info_.virtualMemoryManagement_)) { + LogError("HSA_AMD_SYSTEM_INFO_VIRTUAL_MEM_API_SUPPORTED query failed "); + } + } + switch (isa().versionMajor()) { case (11): if (isa().versionMinor() == 0) { @@ -2212,6 +2244,19 @@ bool Device::allowPeerAccess(device::Memory* memory) const { return true; } +uint64_t Device::deviceVmemAlloc(size_t size, uint64_t flags) const { + hsa_amd_vmem_alloc_handle_t hsa_vmem_handle {}; + + // We only allow pinned memory at this time. + hsa_status_t hsa_status = hsa_amd_vmem_handle_create(gpuvm_segment_, size, MEMORY_TYPE_PINNED, + flags, &hsa_vmem_handle); + if (hsa_status != HSA_STATUS_SUCCESS) { + LogPrintfError("Failed hsa_amd_vmem_handle_create! Failed with hsa status: %d \n", hsa_status); + } + + return hsa_vmem_handle.handle; +} + void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain) const { const hsa_amd_memory_pool_t& pool = (pseudo_fine_grain) ? gpu_ext_fine_grained_segment_ : (atomics) ? gpu_fine_grained_segment_ : gpuvm_segment_; @@ -2311,13 +2356,83 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_ return svmPtr; } -void* Device::virtualAlloc(void* addr, size_t size, size_t alignment) -{ - return nullptr; +void* Device::virtualAlloc(void* req_addr, size_t size, size_t alignment) { + void* vptr = nullptr; + // Reserves the address using HSA APIs, with requested address. + // There is no guarantee that we will get the requested address. + hsa_status_t hsa_status = hsa_amd_vmem_address_reserve(&vptr, size, + reinterpret_cast(req_addr), 0); + if (hsa_status != HSA_STATUS_SUCCESS) { + LogPrintfError("Failed hsa_amd_vmem_address_reserve. 
Failed with status: %d \n", hsa_status); + return nullptr; + } + + // This mem->create() does not create an actual memory but stores the memory info with given vptr. + auto mem = new (context()) amd::Buffer(context(), CL_MEM_VA_RANGE_AMD, size, vptr); + if (mem == nullptr) { + LogError("failed to new a va range mem object!"); + return nullptr; + } + + if (!mem->create(nullptr, false)) { + LogError("failed to create a va range mem object"); + mem->release(); + return nullptr; + } + + // Assert to make sure that amd::Memory object has set the right ptr. + guarantee(vptr == mem->getSvmPtr(), "amd::Memory object does not have the right ptr"); + + return mem->getSvmPtr(); } -void Device::virtualFree(void* addr) -{ +void Device::virtualFree(void* addr) { + amd::Memory* memObj = amd::MemObjMap::FindVirtualMemObj(addr); + if (memObj == nullptr) { + // No virtual memory object is registered for this address, so there is nothing to free. + LogPrintfError("Cannot find the Virtual MemObj entry for this addr 0x%x", addr); + return; + } + + hsa_status_t hsa_status = hsa_amd_vmem_address_free(memObj->getSvmPtr(), memObj->getSize()); + if (hsa_status != HSA_STATUS_SUCCESS) { + LogPrintfError("Failed hsa_amd_vmem_address_free. Failed with status:%d \n", hsa_status); + } +} + +bool Device::SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, size_t count) { + hsa_status_t hsa_status = HSA_STATUS_SUCCESS; + hsa_amd_memory_access_desc_t desc; + desc.permissions = static_cast(access_flags); + desc.agent_handle = getBackendDevice(); + + // Exactly one descriptor is built above, so the descriptor count handed to HSA must be 1. + // Forwarding count would make HSA read past the single stack descriptor when count > 1. + constexpr size_t kDescCount = 1; + if ((hsa_status = hsa_amd_vmem_set_access(va_addr, va_size, &desc, kDescCount)) + != HSA_STATUS_SUCCESS) { + LogPrintfError("Failed hsa_amd_vmem_set_access. 
Failed with status:%d \n", hsa_status); + return false; + } + + return true; +} + +bool Device::GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) { + hsa_status_t hsa_status = HSA_STATUS_SUCCESS; + hsa_access_permission_t perms; + + size_t discard_offset = 0; + amd::Memory* va_mem_obj = amd::MemObjMap::FindMemObj(va_addr, &discard_offset); + if (va_mem_obj == nullptr) { + LogPrintfError("Failed to get Memory Object for va_addr: 0x%x", va_addr); + return false; + } + + if ((hsa_status = hsa_amd_vmem_get_access(va_mem_obj->getSvmPtr(), &perms, getBackendDevice())) + != HSA_STATUS_SUCCESS) { + LogPrintfError("Failed hsa_amd_vmem_get_access. Failed with status:%d \n", hsa_status); + return false; + } + + *access_flags_ptr = static_cast(perms); + + return true; } // ================================================================================================ diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index 9cfe9d3b09..141449328d 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -221,7 +221,7 @@ class NullDevice : public amd::Device { ShouldNotReachHere(); return; } - void* virtualAlloc(void* addr, size_t size, size_t alignment) override { + void* virtualAlloc(void* req_addr, size_t size, size_t alignment) override { ShouldNotReachHere(); return nullptr; } @@ -231,6 +231,17 @@ class NullDevice : public amd::Device { return; } + virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, size_t count) + override { + ShouldNotReachHere(); + return false; + } + + virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) override { + ShouldNotReachHere(); + return false; + } + //! 
Determine if we can use device memory for SVM const bool forceFineGrain(amd::Memory* memory) const { return (memory->getContext().devices().size() > 1); @@ -439,7 +450,7 @@ class Device : public NullDevice { bool deviceAllowAccess(void* dst) const; bool allowPeerAccess(device::Memory* memory) const; - + uint64_t deviceVmemAlloc(size_t size, uint64_t flags) const; void* deviceLocalAlloc(size_t size, bool atomics = false, bool pseudo_fine_grain=false) const; void memFree(void* ptr, size_t size) const; @@ -454,9 +465,12 @@ class Device : public NullDevice { virtual bool GetSvmAttributes(void** data, size_t* data_sizes, int* attributes, size_t num_attributes, const void* dev_ptr, size_t count) const; - virtual void* virtualAlloc(void* addr, size_t size, size_t alignment); + virtual void* virtualAlloc(void* req_addr, size_t size, size_t alignment); virtual void virtualFree(void* addr); + virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, size_t count); + virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr); + virtual bool SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput); diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp index 0a84bae241..757aafcc68 100644 --- a/rocclr/device/rocm/rocmemory.cpp +++ b/rocclr/device/rocm/rocmemory.cpp @@ -748,9 +748,19 @@ bool Buffer::create(bool alloc_local) { owner()->setSvmPtr(orig_dev_ptr); } + + // Allocate backing storage in device local memory unless UHP or AHP are set cl_mem_flags memFlags = owner()->getMemFlags(); + if (memFlags & ROCCLR_MEM_PHYMEM) { + // If this is physical memory request, then get an handle and store it in user data + owner()->getUserData().hsa_handle = dev().deviceVmemAlloc(owner()->getSize(), 0); + if (owner()->getUserData().hsa_handle == 0) { + LogError("HSA Opaque Handle returned was null"); + } + } + if ((owner()->parent() == nullptr) && 
(owner()->getSvmPtr() != nullptr)) { if (dev().forceFineGrain(owner()) || dev().isFineGrainedSystem(true)) { diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 697d2c0df6..876e727ca3 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -2539,6 +2539,51 @@ void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) { profilingEnd(cmd); } +// ================================================================================================ +void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { + // Make sure VirtualGPU has an exclusive access to the resources + amd::ScopedLock lock(execution()); + + profilingBegin(vcmd); + + // Find the amd::Memory object for virtual ptr. + amd::Memory* va = amd::MemObjMap::FindVirtualMemObj(vcmd.ptr()); + if (va == nullptr || !(va->getMemFlags() & CL_MEM_VA_RANGE_AMD)) { + profilingEnd(vcmd); + return; + } + + // Get the amd::Memory object for the physical address + amd::Memory* pa = vcmd.memory(); + hsa_status_t hsa_status = HSA_STATUS_SUCCESS; + + // If the physical memory object is set, this is a map command. If it is null, it is an unmap command. + if (pa != nullptr) { + // Map the physical allocation to the virtual address using the HSA API + hsa_amd_vmem_alloc_handle_t opaque_hsa_handle; + opaque_hsa_handle.handle = pa->getUserData().hsa_handle; + if ((hsa_status = hsa_amd_vmem_map(va->getSvmPtr(), va->getSize(), va->getOffset(), + opaque_hsa_handle, 0)) == HSA_STATUS_SUCCESS) { + assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) == nullptr); + // Now that we have mapped physical addr to virtual addr, make an entry in the MemObjMap. + amd::MemObjMap::AddMemObj(vcmd.ptr(), vcmd.memory()); + } else { + LogError("HSA Command: hsa_amd_vmem_map failed!"); + } + } else { + // Unmap the object, since the physical addr is not set. 
+ if ((hsa_status = hsa_amd_vmem_unmap(va->getSvmPtr(), va->getSize())) == HSA_STATUS_SUCCESS) { + // assert the va is mapped and needs to be removed + assert(amd::MemObjMap::FindMemObj(vcmd.ptr()) != nullptr); + amd::MemObjMap::RemoveMemObj(vcmd.ptr()); + } else { + LogError("HSA Command: hsa_amd_vmem_unmap failed"); + } + } + + profilingEnd(vcmd); +} + // ================================================================================================ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources diff --git a/rocclr/device/rocm/rocvirtual.hpp b/rocclr/device/rocm/rocvirtual.hpp index bb6d7f0c7e..c2638cadab 100644 --- a/rocclr/device/rocm/rocvirtual.hpp +++ b/rocclr/device/rocm/rocvirtual.hpp @@ -336,6 +336,7 @@ class VirtualGPU : public device::VirtualDevice { void flush(amd::Command* list = nullptr, bool wait = false); void submitFillMemory(amd::FillMemoryCommand& cmd); void submitStreamOperation(amd::StreamOperationCommand& cmd); + void submitVirtualMap(amd::VirtualMapCommand& cmd); void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd); void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd); diff --git a/rocclr/platform/memory.hpp b/rocclr/platform/memory.hpp index 850a7db290..e89ea28791 100644 --- a/rocclr/platform/memory.hpp +++ b/rocclr/platform/memory.hpp @@ -43,6 +43,7 @@ #define CL_MEM_VA_RANGE_AMD (1u << 28) #define ROCCLR_MEM_HSA_UNCACHED (1u << 27) #define ROCCLR_MEM_INTERPROCESS (1u << 26) +#define ROCCLR_MEM_PHYMEM (1u << 25) namespace device { class Memory; @@ -148,6 +149,7 @@ class Memory : public amd::RuntimeObject { { int deviceId = 0; //!< Device ID memory is allocated on void* data = nullptr; //!< Opaque user data from CL or HIP or etc. + uint64_t hsa_handle = 0; //!