From 35629e433d54ad4d91e1da3e4b1b2abab51d1b54 Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras <38722728+iassiour@users.noreply.github.com> Date: Wed, 10 Sep 2025 23:06:20 +0100 Subject: [PATCH] SWDEV-546146 - Added support for hipMemLocationTypeHost in hipMemSetAccess (#682) --- projects/clr/hipamd/src/hip_vm.cpp | 70 +++++++---- projects/clr/rocclr/device/device.cpp | 3 +- projects/clr/rocclr/device/device.hpp | 7 +- projects/clr/rocclr/device/pal/paldevice.cpp | 5 +- projects/clr/rocclr/device/pal/paldevice.hpp | 8 +- projects/clr/rocclr/device/pal/palvirtual.cpp | 2 +- projects/clr/rocclr/device/rocm/rocdevice.cpp | 8 +- projects/clr/rocclr/device/rocm/rocdevice.hpp | 6 +- .../clr/rocclr/device/rocm/rocvirtual.cpp | 2 +- projects/clr/rocclr/platform/memory.hpp | 4 +- .../hipMemSetGetAccess.cc | 119 ++++++++++++++++++ 11 files changed, 197 insertions(+), 37 deletions(-) diff --git a/projects/clr/hipamd/src/hip_vm.cpp b/projects/clr/hipamd/src/hip_vm.cpp index 48ccc9ca62..69f96d4488 100644 --- a/projects/clr/hipamd/src/hip_vm.cpp +++ b/projects/clr/hipamd/src/hip_vm.cpp @@ -87,7 +87,8 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, // Currently we do not support Pinned memory if (handle == nullptr || size == 0 || flags != 0 || prop == nullptr || (prop->type != hipMemAllocationTypePinned && prop->type != hipMemAllocationTypeUncached) || - prop->location.type != hipMemLocationTypeDevice) { + (prop->location.type != hipMemLocationTypeDevice && + prop->location.type != hipMemLocationTypeHost)) { HIP_RETURN(hipErrorInvalidValue); } @@ -106,8 +107,15 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, ihipFlags |= CL_MEM_SVM_ATOMICS | ROCCLR_MEM_HSA_UNCACHED; } - // Device info validation - const auto& dev_info = g_devices[prop->location.id]->devices()[0]->info(); + bool useHostDevice = (prop->location.type == hipMemLocationTypeHost); + amd::Context* curDevContext = hip::getCurrentDevice()->asContext(); + amd::Context* amdContext = useHostDevice ? hip::host_context : curDevContext; + + if (amdContext == nullptr) { + return hipErrorOutOfMemory; + } + + const auto& dev_info = amdContext->devices()[0]->info(); if (dev_info.maxPhysicalMemAllocSize_ < size) { HIP_RETURN(hipErrorOutOfMemory); @@ -116,10 +124,8 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, HIP_RETURN(hipErrorInvalidValue); } - amd::Context* amdContext = g_devices[prop->location.id]->asContext(); - - void* ptr = amd::SvmBuffer::malloc(*amdContext, ihipFlags, size, - dev_info.memBaseAddrAlign_, nullptr); + void* ptr = amd::SvmBuffer::malloc(*amdContext, ihipFlags, size, dev_info.memBaseAddrAlign_, + useHostDevice ? curDevContext->svmDevices()[0] : nullptr); // Handle out of memory cases, if (ptr == nullptr) { @@ -139,6 +145,7 @@ hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle, size_t size, amd::Memory* phys_mem_obj = getMemoryObject(ptr, offset); // saves the current device id so that it can be accessed later phys_mem_obj->getUserData().deviceId = prop->location.id; + phys_mem_obj->getUserData().locationType = prop->location.type; phys_mem_obj->getUserData().data = new hip::GenericAllocation(*phys_mem_obj, size, *prop); *handle = reinterpret_cast(phys_mem_obj->getUserData().data); @@ -203,17 +210,23 @@ hipError_t hipMemGetAccess(unsigned long long* flags, const hipMemLocation* loca hipError_t hipMemGetAllocationGranularity(size_t* granularity, const hipMemAllocationProp* prop, hipMemAllocationGranularity_flags option) { + HIP_INIT_API(hipMemGetAllocationGranularity, granularity, prop, option); if (granularity == nullptr || prop == nullptr || (prop->type != hipMemAllocationTypePinned && prop->type != hipMemAllocationTypeUncached) || - prop->location.type != hipMemLocationTypeDevice || prop->location.id >= g_devices.size() || + (prop->location.type != hipMemLocationTypeDevice && + prop->location.type != hipMemLocationTypeHost) || + prop->location.id >= g_devices.size() || (option != hipMemAllocationGranularityMinimum && option != hipMemAllocationGranularityRecommended)) { HIP_RETURN(hipErrorInvalidValue); } - const auto& dev_info = g_devices[prop->location.id]->devices()[0]->info(); + bool useHostDevice = (prop->location.type == hipMemLocationTypeHost); + amd::Context* curDevContext = hip::getCurrentDevice()->asContext(); + amd::Context* amdContext = useHostDevice ? hip::host_context : curDevContext; + const auto& dev_info = amdContext->devices()[0]->info(); *granularity = dev_info.virtualMemAllocGranularity_; @@ -347,26 +360,38 @@ hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc, // Ensure that the specified size parameter matches the total size of a complete set of // sub-buffers, disallowing partial sub-buffer coverage auto mem_object = amd::MemObjMap::FindMemObj(ptr); - if (mem_object && mem_object->parent()) { - size_t accumulated_buffer_size = 0; - for (auto sub_buffer : mem_object->parent()->subBuffers()) { - accumulated_buffer_size += sub_buffer->getSize(); - if (accumulated_buffer_size > size) { + hipMemLocationType memLocationType = hipMemLocationTypeNone; + + if (mem_object) { + memLocationType = static_cast(mem_object->getUserData().locationType); + if (mem_object->parent()) { + size_t accumulated_buffer_size = 0; + for (auto sub_buffer : mem_object->parent()->subBuffers()) { + accumulated_buffer_size += sub_buffer->getSize(); + if (accumulated_buffer_size > size) { + HIP_RETURN(hipErrorInvalidValue); + } else if (accumulated_buffer_size == size) { + break; + } + } + + if (accumulated_buffer_size != size) { HIP_RETURN(hipErrorInvalidValue); - } else if (accumulated_buffer_size == size) { - break; } } - - if (accumulated_buffer_size != size) { - HIP_RETURN(hipErrorInvalidValue); - } + } else { + HIP_RETURN(hipErrorInvalidValue); } for (size_t desc_idx = 0; desc_idx < count; ++desc_idx) { - if (desc[desc_idx].location.type != hipMemLocationTypeDevice) { + hipMemLocationType accessLocationType = desc[desc_idx].location.type; + if (accessLocationType != hipMemLocationTypeDevice && accessLocationType != hipMemLocationTypeHost) { HIP_RETURN(hipErrorInvalidValue); } + if (accessLocationType == hipMemLocationTypeHost && + memLocationType != hipMemLocationTypeHost) { + HIP_RETURN(hipErrorInvalidValue) + } if (desc[desc_idx].location.id >= g_devices.size()) { HIP_RETURN(hipErrorInvalidValue) @@ -380,7 +405,8 @@ hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc, HIP_RETURN(hipErrorInvalidValue); } - if (!dev->devices()[0]->SetMemAccess(ptr, size, access_flags)) { + if (!dev->devices()[0]->SetMemAccess(ptr, size, access_flags, + static_cast(accessLocationType))) { HIP_RETURN(hipErrorInvalidValue); } } diff --git a/projects/clr/rocclr/device/device.cpp b/projects/clr/rocclr/device/device.cpp index 6beb967129..6cddf83921 100644 --- a/projects/clr/rocclr/device/device.cpp +++ b/projects/clr/rocclr/device/device.cpp @@ -526,7 +526,7 @@ bool Device::ValidateVirtualAddressRange(amd::Memory* vaddr_base_obj, amd::Memor //================================================================================================== amd::Memory* Device::CreateVirtualBuffer(amd::Context& device_context, void* vptr, size_t size, - int deviceId, bool parent, bool kForceAlloc) { + int deviceId, int locationType, bool parent, bool kForceAlloc) { amd::Memory* vaddr_base_obj = nullptr; amd::Memory* vaddr_sub_obj = nullptr; constexpr bool kSysMemAlloc = false; @@ -572,6 +572,7 @@ amd::Memory* Device::CreateVirtualBuffer(amd::Context& device_context, void* vpt } vaddr_sub_obj->getUserData().deviceId = deviceId; + vaddr_sub_obj->getUserData().locationType = locationType; if (!ValidateVirtualAddressRange(vaddr_base_obj, vaddr_sub_obj)) { LogError("Validation failed on address range, returning nullptr"); diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 84558efcba..c6660bc30e 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -1658,6 +1658,8 @@ class Device : public RuntimeObject { // LinkAttrType; @@ -1892,7 +1894,7 @@ class Device : public RuntimeObject { * @param ForceAlloc force_alloc */ amd::Memory* CreateVirtualBuffer(Context& device_context, void* vptr, size_t size, int deviceId, - bool parent, bool kForceAlloc = false); + int locationType, bool parent, bool kForceAlloc = false); /** * Deletes Virtual Buffer and creates memob @@ -1918,7 +1920,8 @@ class Device : public RuntimeObject { * @param access_flags Access permissions * @param count Number of access permissions */ - virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags) = 0; + virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, + VmmLocationType = VmmLocationType::kDevice) = 0; /** * Get Access permisions for a virtual memory object. diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp index 0003cf04ea..fb6f3914eb 100644 --- a/projects/clr/rocclr/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/device/pal/paldevice.cpp @@ -2528,7 +2528,7 @@ void Device::svmFree(void* ptr) const { void* Device::virtualAlloc(void* addr, size_t size, size_t alignment) { constexpr bool kParent = true; constexpr bool kForceAlloc = true; - amd::Memory* mem = CreateVirtualBuffer(context(), addr, size, -1, kParent, kForceAlloc); + amd::Memory* mem = CreateVirtualBuffer(context(), addr, size, -1, -1, kParent, kForceAlloc); assert(mem != nullptr); return mem->getSvmPtr(); } @@ -2549,7 +2549,8 @@ bool Device::virtualFree(void* addr) { } // ================================================================================================ -bool Device::SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags) { +bool Device::SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, + VmmLocationType access_location) { amd::Memory* amd_mem_obj = amd::MemObjMap::FindMemObj(va_addr); if (amd_mem_obj == nullptr) { // If the amd_mem_obj is null, the check if this is a valid va_addr, but not-mapped, diff --git a/projects/clr/rocclr/device/pal/paldevice.hpp b/projects/clr/rocclr/device/pal/paldevice.hpp index bff5fe15a7..b6f959c3aa 100644 --- a/projects/clr/rocclr/device/pal/paldevice.hpp +++ b/projects/clr/rocclr/device/pal/paldevice.hpp @@ -145,7 +145,10 @@ class NullDevice : public amd::Device { virtual void* virtualAlloc(void* addr, size_t size, size_t alignment) { return nullptr; }; virtual bool virtualFree(void* addr) { return true; } - virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags) { return true; } + virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, + VmmLocationType = VmmLocationType::kDevice) { + return true; + } virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) const { return true; } @@ -555,7 +558,8 @@ class Device : public NullDevice { virtual bool virtualFree(void* addr); //! Set/Get memory access set by the app - virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags); + virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, + VmmLocationType = VmmLocationType::kDevice); virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) const; virtual bool ValidateMemAccess(amd::Memory& mem, bool read_write) const; diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 610c926601..243f2ceca2 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -2285,7 +2285,7 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { constexpr bool kParent = false; vaddr_sub_obj = phys_mem_obj->getContext().devices()[0]->CreateVirtualBuffer( phys_mem_obj->getContext(), const_cast(vcmd.ptr()), vcmd.size(), - phys_mem_obj->getUserData().deviceId, kParent); + phys_mem_obj->getUserData().deviceId, phys_mem_obj->getUserData().locationType, kParent); // Calculate the offset from the original pointer. vaddr_offset = (reinterpret_cast
(vaddr_sub_obj->getSvmPtr()) - diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 29f5e45d39..2894c2dc31 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -2290,7 +2290,7 @@ void* Device::virtualAlloc(void* req_addr, size_t size, size_t alignment) { } constexpr bool kParent = true; - amd::Memory* mem = CreateVirtualBuffer(context(), vptr, size, -1, kParent); + amd::Memory* mem = CreateVirtualBuffer(context(), vptr, size, -1, -1, kParent); if (mem == nullptr) { LogPrintfError("Cannot create Virtual Buffer for vptr: %p of size: %u", vptr, size); } @@ -2316,11 +2316,13 @@ bool Device::virtualFree(void* addr) { return true; } -bool Device::SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags) { +bool Device::SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, + VmmLocationType access_location) { hsa_status_t hsa_status = HSA_STATUS_SUCCESS; hsa_amd_memory_access_desc_t desc; desc.permissions = static_cast(access_flags); - desc.agent_handle = getBackendDevice(); + desc.agent_handle = + access_location == VmmLocationType::kDevice ? getBackendDevice() : getCpuAgent(); if ((hsa_status = hsa_amd_vmem_set_access(va_addr, va_size, &desc, 1)) != HSA_STATUS_SUCCESS) { LogPrintfError("Failed hsa_amd_vmem_set_access. Failed with status:%d \n", hsa_status); diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index af709ff252..0847537444 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -230,7 +230,8 @@ class NullDevice : public amd::Device { return true; } - virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags) override { + virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, + VmmLocationType = VmmLocationType::kDevice) override { ShouldNotReachHere(); return false; } @@ -442,7 +443,8 @@ class Device : public NullDevice { virtual void* virtualAlloc(void* req_addr, size_t size, size_t alignment); virtual bool virtualFree(void* addr); - virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags); + virtual bool SetMemAccess(void* va_addr, size_t va_size, VmmAccess access_flags, + VmmLocationType = VmmLocationType::kDevice); virtual bool GetMemAccess(void* va_addr, VmmAccess* access_flags_ptr) const; virtual bool ValidateMemAccess(amd::Memory& mem, bool read_write) const { return true; } diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 5135a49afc..4913c05a61 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -2948,7 +2948,7 @@ void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) { constexpr bool kParent = false; amd::Memory* vaddr_sub_obj = phys_mem_obj->getContext().devices()[0]->CreateVirtualBuffer( phys_mem_obj->getContext(), const_cast(vcmd.ptr()), vcmd.size(), - phys_mem_obj->getUserData().deviceId, kParent); + phys_mem_obj->getUserData().deviceId, phys_mem_obj->getUserData().locationType, kParent); // Map the physical to virtual address the hsa api hsa_amd_vmem_alloc_handle_t opaque_hsa_handle; opaque_hsa_handle.handle = phys_mem_obj->getUserData().hsa_handle; diff --git a/projects/clr/rocclr/platform/memory.hpp b/projects/clr/rocclr/platform/memory.hpp index d77e9df51c..1a7d820982 100644 --- a/projects/clr/rocclr/platform/memory.hpp +++ b/projects/clr/rocclr/platform/memory.hpp @@ -146,7 +146,9 @@ class Memory : public amd::RuntimeObject { }; struct UserData { - int deviceId = 0; //!< Device ID memory is allocated on + int deviceId = 0; //!< Device ID memory is allocated on + int locationType = + 0; //!< The type of the location (i.e. device or host) memory is allocated on void* data = nullptr; //!< Opaque user data from CL or HIP or etc. amd::Memory* phys_mem_obj = nullptr; //(addr); + for (size_t i = 0; i < N; ++i) hostPtr[i] = static_cast(i); + + // Device output buffer + int* dOut = nullptr; + HIP_CHECK(hipMalloc(&dOut, bytes)); + + // Launch kernel to read host memory and write to device buffer + dim3 block(256), grid((N + block.x - 1) / block.x); + hipLaunchKernelGGL(copyFromHostMem, grid, block, 0, 0, reinterpret_cast(addr), dOut, + static_cast(N)); + HIP_CHECK(hipDeviceSynchronize()); + + // Verify + std::vector out(N, -1); + HIP_CHECK(hipMemcpy(out.data(), dOut, bytes, hipMemcpyDeviceToHost)); + for (size_t i = 0; i < N; ++i) { + REQUIRE(out[i] == static_cast(i)); + } + + // Cleanup + HIP_CHECK(hipFree(dOut)); + HIP_CHECK(hipMemUnmap(addr, mapSize)); + HIP_CHECK(hipMemAddressFree(addr, mapSize)); + HIP_CHECK(hipMemRelease(handle)); +} + +TEST_CASE("Unit_hipMemSetAccessHost_devicealloc") { + // Ensure device 0 is selected + REQUIRE(hipSetDevice(0) == hipSuccess); + + // ---- Describe a DEVICE-backed allocation + hipMemAllocationProp prop{}; + prop.type = hipMemAllocationTypePinned; // pinned system RAM + prop.location.type = hipMemLocationTypeDevice; // generic host + prop.location.id = 0; // host id must be 0 + prop.requestedHandleType = hipMemHandleTypeNone; + + constexpr size_t N = 1024; + constexpr size_t bytes = N * sizeof(int); + + //get minimum granularity + size_t gran = 0; + HIP_CHECK(hipMemGetAllocationGranularity(&gran, &prop, hipMemAllocationGranularityMinimum)); + size_t mapSize = ((bytes + gran - 1) / gran) * gran; + + // Create host-backed allocation handle + hipMemGenericAllocationHandle_t handle{}; + HIP_CHECK(hipMemCreate(&handle, mapSize, &prop, 0 /*flags*/)); + + // Reserve VA and map + void* addr = nullptr; + HIP_CHECK(hipMemAddressReserve(&addr, mapSize, 0 /*align*/, 0 /*addr*/, 0 /*flags*/)); + + HIP_CHECK(hipMemMap(addr, mapSize, 0 /*offset*/, handle, 0 /*flags*/)); + + // Grant HOST access. + hipMemAccessDesc accHost{}; + accHost.flags = hipMemAccessFlagsProtReadWrite; + accHost.location.type = hipMemLocationTypeHost; + accHost.location.id = 0; + HIP_CHECK_ERROR(hipMemSetAccess(addr, mapSize, &accHost, 1), hipErrorInvalidValue); + + HIP_CHECK(hipMemUnmap(addr, mapSize)); + HIP_CHECK(hipMemAddressFree(addr, mapSize)); + HIP_CHECK(hipMemRelease(handle)); +} + /** * End doxygen group VirtualMemoryManagementTest. * @}