* rocr: Add support for VMM and RDMA

Add extra CPU mapping so that kernel-mode drivers can look up the memory
mapping by virtual address.

* Update projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp

Co-authored-by: Yiannis Papadopoulos <102817138+ypapadop-amd@users.noreply.github.com>

* Update projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h

Co-authored-by: Yiannis Papadopoulos <102817138+ypapadop-amd@users.noreply.github.com>

* rocr: Honor uncache flag in memory_lock_to_pool()

Also, combine several flag options used in the APIs into a
single integer.

Signed-off-by: Chris Freehill <cfreehil@amd.com>

* rocr: Fix hsa_amd_pointer_info on CPU agents

Fix hsa_amd_pointer_info query reporting CPU agents as allowed on VMM
pointers when the CPU mapping was mapped with PROT_NONE.

---------

Signed-off-by: Chris Freehill <cfreehil@amd.com>
Co-authored-by: Yiannis Papadopoulos <102817138+ypapadop-amd@users.noreply.github.com>
Co-authored-by: Chris Freehill <cfreehil@amd.com>
Co-authored-by: cfreeamd <166262151+cfreeamd@users.noreply.github.com>
Этот коммит содержится в:
David Yat Sin
2025-10-21 12:19:02 -04:00
коммит произвёл GitHub
родитель 65d4ff9d04
Коммит e2f3bd2429
8 изменённых файлов: 76 добавлений и 47 удалений
+11 -21
Просмотреть файл
@@ -1061,9 +1061,7 @@ static HsaMemFlags fmm_translate_ioc_to_hsa_flags(uint32_t ioc_flags)
}
static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
uint64_t size,
bool coarse_grain,
bool ext_coherent)
uint64_t size, HsaMemFlags flags)
{
struct kfd_ioctl_svm_args *args;
size_t s_attr;
@@ -1080,10 +1078,11 @@ static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
args->size = aligned_size;
args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
args->nattr = 2;
args->attrs[0].type = coarse_grain ?
args->attrs[0].type = flags.ui32.CoarseGrain ?
HSA_SVM_ATTR_CLR_FLAGS : HSA_SVM_ATTR_SET_FLAGS;
args->attrs[0].value = HSA_SVM_FLAG_COHERENT;
args->attrs[1].type = ext_coherent ? HSA_SVM_ATTR_SET_FLAGS : HSA_SVM_ATTR_CLR_FLAGS ;
args->attrs[1].type = flags.ui32.ExtendedCoherent ?
HSA_SVM_ATTR_SET_FLAGS : HSA_SVM_ATTR_CLR_FLAGS;
args->attrs[1].value = HSA_SVM_FLAG_EXT_COHERENT;
pr_debug("Registering to SVM %p size: %ld\n", (void*)aligned_addr,
aligned_size);
@@ -3748,8 +3747,7 @@ bool hsakmt_fmm_get_handle(void *address, uint64_t *handle, uint64_t *size_offse
static HSAKMT_STATUS fmm_register_user_memory(void *addr,
HSAuint64 size,
vm_object_t **obj_ret,
bool coarse_grain,
bool ext_coherent)
HsaMemFlags flags)
{
manageable_aperture_t *aperture = svm.dgpu_aperture;
HSAuint32 page_offset = (HSAuint64)addr & (PAGE_SIZE-1);
@@ -3774,8 +3772,8 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr,
&aligned_addr, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR |
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
(coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) |
(ext_coherent ? KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0),
(flags.ui32.CoarseGrain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) |
(flags.ui32.ExtendedCoherent ? KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0),
0,
&obj);
if (!svm_addr)
@@ -3813,8 +3811,7 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr,
HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
uint32_t *gpu_id_array,
uint32_t gpu_id_array_size,
bool coarse_grain,
bool ext_coherent)
HsaMemFlags flags)
{
manageable_aperture_t *aperture = NULL;
vm_object_t *object = NULL;
@@ -3823,7 +3820,7 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
if (gpu_id_array_size > 0 && !gpu_id_array)
return HSAKMT_STATUS_INVALID_PARAMETER;
if (coarse_grain && ext_coherent)
if (flags.ui32.CoarseGrain && flags.ui32.ExtendedCoherent)
return HSAKMT_STATUS_INVALID_PARAMETER;
object = vm_find_object(address, size_in_bytes, &aperture);
@@ -3834,19 +3831,12 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
/* Register a new user ptr */
if (hsakmt_is_svm_api_supported) {
ret = fmm_register_mem_svm_api(address,
size_in_bytes,
coarse_grain,
ext_coherent);
ret = fmm_register_mem_svm_api(address, size_in_bytes, flags);
if (ret == HSAKMT_STATUS_SUCCESS)
return ret;
pr_debug("SVM failed, falling back to old registration\n");
}
ret = fmm_register_user_memory(address,
size_in_bytes,
&object,
coarse_grain,
ext_coherent);
ret = fmm_register_user_memory(address, size_in_bytes, &object, flags);
if (ret != HSAKMT_STATUS_SUCCESS)
return ret;
+1 -2
Просмотреть файл
@@ -75,8 +75,7 @@ HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(aperture_type_e aperture_ty
HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
uint32_t *gpu_id_array,
uint32_t gpu_id_array_size,
bool coarse_grain,
bool ext_coherent);
HsaMemFlags flags);
HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle,
HsaGraphicsResourceInfo *GraphicsResourceInfo,
uint32_t *gpu_id_array,
+10 -3
Просмотреть файл
@@ -268,8 +268,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
/* TODO: support mixed APU and dGPU configurations */
return HSAKMT_STATUS_SUCCESS;
HsaMemFlags flags;
flags.ui32.CoarseGrain = 1;
flags.ui32.ExtendedCoherent = 0;
return hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes,
NULL, 0, true, false);
NULL, 0, flags);
}
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
@@ -292,10 +295,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
NumberOfNodes, NodeArray);
if (ret == HSAKMT_STATUS_SUCCESS) {
HsaMemFlags flags;
flags.ui32.CoarseGrain = 1;
flags.ui32.ExtendedCoherent = 0;
ret = hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes,
gpu_id_array,
NumberOfNodes*sizeof(uint32_t),
true, false);
flags);
if (ret != HSAKMT_STATUS_SUCCESS)
free(gpu_id_array);
}
@@ -325,7 +332,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress,
return HSAKMT_STATUS_NOT_SUPPORTED;
ret = hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes,
NULL, 0, MemFlags.ui32.CoarseGrain, MemFlags.ui32.ExtendedCoherent);
NULL, 0, MemFlags);
return ret;
}
+2 -1
Просмотреть файл
@@ -105,7 +105,8 @@ class MemoryRegion : public core::MemoryRegion {
hsa_status_t Migrate(uint32_t flag, const void* ptr) const;
hsa_status_t Lock(uint32_t num_agents, const hsa_agent_t* agents,
void* host_ptr, size_t size, void** agent_ptr) const;
void* host_ptr, size_t size, uint32_t flags,
void** agent_ptr) const;
hsa_status_t Unlock(void* host_ptr) const;
+4 -5
Просмотреть файл
@@ -879,12 +879,11 @@ class Runtime {
};
struct MappedHandle {
MappedHandle(MemoryHandle *mem_handle, AddressHandle *address_handle,
MappedHandle(MemoryHandle* mem_handle, AddressHandle* address_handle, void* va,
uint64_t offset, size_t size, int drm_fd, void *drm_cpu_addr,
hsa_access_permission_t perm, ShareableHandle shareable_handle)
: mem_handle(mem_handle), address_handle(address_handle),
offset(offset), size(size), drm_fd(drm_fd),
drm_cpu_addr(drm_cpu_addr), shareable_handle(shareable_handle) {}
hsa_access_permission_t perm, ShareableHandle shareable_handle);
MappedHandle() {}
__forceinline core::Agent* agentOwner() const { return mem_handle->region->owner(); }
+8 -2
Просмотреть файл
@@ -538,7 +538,7 @@ hsa_status_t MemoryRegion::Migrate(uint32_t flag, const void* ptr) const {
}
hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
void* host_ptr, size_t size,
void* host_ptr, size_t size, uint32_t flags,
void** agent_ptr) const {
if (!IsSystem()) {
return HSA_STATUS_ERROR;
@@ -581,9 +581,15 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
*agent_ptr = host_ptr;
return HSA_STATUS_SUCCESS;
}
HsaMemFlags local_mem_flag = mem_flag_;
if (flags & HSA_AMD_MEMORY_POOL_UNCACHED_FLAG) {
local_mem_flag.ui32.Uncached = 1;
local_mem_flag.ui32.CoarseGrain = 0;
local_mem_flag.ui32.ExtendedCoherent = 0;
}
// Call kernel driver to register and pin the memory.
if (owner()->driver().RegisterMemory(host_ptr, size, const_cast<HsaMemFlags&>(mem_flag_)) ==
if (owner()->driver().RegisterMemory(host_ptr, size, local_mem_flag) ==
HSA_STATUS_SUCCESS) {
uint64_t alternate_va = 0;
if (owner()->driver().MakeMemoryResident(host_ptr, size, &alternate_va, &map_flag_,
+3 -3
Просмотреть файл
@@ -761,7 +761,7 @@ hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
const AMD::MemoryRegion* system_region = static_cast<const AMD::MemoryRegion*>(
core::Runtime::runtime_singleton_->system_regions_coarse()[0]);
return system_region->Lock(num_agent, agents, host_ptr, size, agent_ptr);
return system_region->Lock(num_agent, agents, host_ptr, size, 0, agent_ptr);
CATCH;
}
@@ -771,7 +771,7 @@ hsa_status_t hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_
TRY;
IS_OPEN();
if (size == 0 || host_ptr == nullptr || agent_ptr == nullptr || flags != 0) {
if (size == 0 || host_ptr == nullptr || agent_ptr == nullptr) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
@@ -789,7 +789,7 @@ hsa_status_t hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_
if (mem_region->owner()->device_type() != core::Agent::kAmdCpuDevice)
return (hsa_status_t)HSA_STATUS_ERROR_INVALID_MEMORY_POOL;
return mem_region->Lock(num_agent, agents, host_ptr, size, agent_ptr);
return mem_region->Lock(num_agent, agents, host_ptr, size, flags, agent_ptr);
CATCH;
}
+37 -10
Просмотреть файл
@@ -543,7 +543,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
const auto& locked_copy = [&](void*& ptr, core::Agent* locking_agent) {
void* tmp;
hsa_agent_t agent = locking_agent->public_handle();
hsa_status_t err = system_region->Lock(1, &agent, ptr, size, &tmp);
hsa_status_t err = system_region->Lock(1, &agent, ptr, 0, size, &tmp);
if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "Lock failed in hsa_memory_copy.");
gpuPtr = ptr;
ptr = tmp;
@@ -977,7 +977,8 @@ hsa_status_t Runtime::VMemoryPtrInfo(const void* ptr, hsa_amd_pointer_info_t* in
for (auto agentPermsIt = mappedHandleIt->second.allowed_agents.begin();
agentPermsIt != mappedHandleIt->second.allowed_agents.end(); agentPermsIt++) {
allowed_agents.push_back((*agentPermsIt).second.targetAgent->public_handle());
if ((*agentPermsIt).second.permissions != HSA_ACCESS_PERMISSION_NONE)
allowed_agents.push_back((*agentPermsIt).second.targetAgent->public_handle());
}
AMD::callback_t<decltype(alloc)> Alloc(alloc);
@@ -3519,7 +3520,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset,
mapped_handle_map_.emplace(
std::piecewise_construct, std::forward_as_tuple(va),
std::forward_as_tuple(&memoryHandleIt->second, addressHandle, offset, size, drm_fd,
std::forward_as_tuple(&memoryHandleIt->second, addressHandle, va, offset, size, drm_fd,
reinterpret_cast<void*>(drm_cpu_addr), HSA_ACCESS_PERMISSION_NONE,
shareable_handle));
@@ -3655,17 +3656,43 @@ hsa_status_t Runtime::MappedHandleAllowedAgent::EnableAccess(hsa_access_permissi
hsa_status_t Runtime::MappedHandleAllowedAgent::RemoveAccess() {
if (targetAgent->device_type() == core::Agent::DeviceType::kAmdCpuDevice) {
#if defined(__linux__)
if (munmap(va, size) != 0)
return HSA_STATUS_ERROR;
#else
assert(!"Unimplemented!");
#endif
return HSA_STATUS_SUCCESS;
if (permissions != HSA_ACCESS_PERMISSION_NONE) {
if (munmap(va, size) != 0) return HSA_STATUS_ERROR;
/* We need to keep the CPU mapping. So change it to PROT_NONE */
void* mapped_ptr = mmap(va, mappedHandle->size, PROT_NONE, MAP_SHARED | MAP_FIXED,
mappedHandle->drm_fd,
reinterpret_cast<uint64_t>(mappedHandle->drm_cpu_addr));
if (mapped_ptr != va)
return HSA_STATUS_ERROR;
permissions = HSA_ACCESS_PERMISSION_NONE;
}
} else {
return targetAgent->driver().Unmap(
shareable_handle, va, mappedHandle->offset, mappedHandle->size);
}
return HSA_STATUS_SUCCESS;
}
// Constructs a MappedHandle and registers a default, no-access CPU entry for it.
//
// The members mem_handle, address_handle, offset, size, drm_fd, drm_cpu_addr and
// shareable_handle are initialized directly from the same-named parameters.
// `va` is not stored on this object here; it is only forwarded to the
// allowed_agents entry created below. `perm` is not referenced by this
// constructor.
//
// Throws AMD::hsa_exception (carrying the failing status) when EnableAccess()
// on the default CPU entry does not return HSA_STATUS_SUCCESS.
Runtime::MappedHandle::MappedHandle(MemoryHandle *mem_handle, AddressHandle *address_handle,
void* va, uint64_t offset, size_t size, int drm_fd, void *drm_cpu_addr,
hsa_access_permission_t perm, ShareableHandle shareable_handle)
: mem_handle(mem_handle), address_handle(address_handle), offset(offset),
size(size), drm_fd(drm_fd), drm_cpu_addr(drm_cpu_addr),
shareable_handle(shareable_handle)
{
/* Create a CPU mapping with PROT_NONE */
// Per the commit description, this extra CPU mapping exists so kernel-mode
// drivers can look up the memory mapping by virtual address.
// NOTE(review): the static_cast is unchecked, so this assumes the owning agent
// of mem_handle's region is always an AMD::GpuAgent -- confirm against callers.
auto cpu_agent = static_cast<AMD::GpuAgent*>(agentOwner())->GetNearestCpuAgent();
// The new entry is keyed by the CPU agent and is handed `this`.
// NOTE(review): presumably the entry keeps that pointer, which would require
// this object to stay at a stable address -- confirm.
auto agentPermsIt = allowed_agents.emplace(std::piecewise_construct,
std::forward_as_tuple(cpu_agent),
std::forward_as_tuple(this, cpu_agent, va,
size, HSA_ACCESS_PERMISSION_NONE))
.first;
// Enable the entry with HSA_ACCESS_PERMISSION_NONE; a failure here aborts
// construction by throwing.
auto ret = agentPermsIt->second.EnableAccess(HSA_ACCESS_PERMISSION_NONE);
if (ret != HSA_STATUS_SUCCESS)
throw AMD::hsa_exception(ret, "Failed to create default CPU mapping");
}
// Note: VMemorySetAccessPerHandle should be called with &memory_lock_ held