Changes for RDMA with VMM (#801)
* rocr: Add support for VMM and RDMA. Add an extra CPU mapping so that kernel-mode drivers can look up the memory mapping by virtual address. * Update projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp Co-authored-by: Yiannis Papadopoulos <102817138+ypapadop-amd@users.noreply.github.com> * Update projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h Co-authored-by: Yiannis Papadopoulos <102817138+ypapadop-amd@users.noreply.github.com> * rocr: Honor the uncached flag in memory_lock_to_pool(). Also, combined several flag options used in APIs into a single integer. Signed-off-by: Chris Freehill <cfreehil@amd.com> * rocr: Fix hsa_amd_pointer_info on CPU agents. Fix the hsa_amd_pointer_info query reporting CPU agents as allowed on VMM pointers when the CPU mapping was mapped with PROT_NONE. --------- Signed-off-by: Chris Freehill <cfreehil@amd.com> Co-authored-by: Yiannis Papadopoulos <102817138+ypapadop-amd@users.noreply.github.com> Co-authored-by: Chris Freehill <cfreehil@amd.com> Co-authored-by: cfreeamd <166262151+cfreeamd@users.noreply.github.com>
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
65d4ff9d04
Коммит
e2f3bd2429
@@ -1061,9 +1061,7 @@ static HsaMemFlags fmm_translate_ioc_to_hsa_flags(uint32_t ioc_flags)
|
||||
}
|
||||
|
||||
static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
|
||||
uint64_t size,
|
||||
bool coarse_grain,
|
||||
bool ext_coherent)
|
||||
uint64_t size, HsaMemFlags flags)
|
||||
{
|
||||
struct kfd_ioctl_svm_args *args;
|
||||
size_t s_attr;
|
||||
@@ -1080,10 +1078,11 @@ static HSAKMT_STATUS fmm_register_mem_svm_api(void *address,
|
||||
args->size = aligned_size;
|
||||
args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
|
||||
args->nattr = 2;
|
||||
args->attrs[0].type = coarse_grain ?
|
||||
args->attrs[0].type = flags.ui32.CoarseGrain ?
|
||||
HSA_SVM_ATTR_CLR_FLAGS : HSA_SVM_ATTR_SET_FLAGS;
|
||||
args->attrs[0].value = HSA_SVM_FLAG_COHERENT;
|
||||
args->attrs[1].type = ext_coherent ? HSA_SVM_ATTR_SET_FLAGS : HSA_SVM_ATTR_CLR_FLAGS ;
|
||||
args->attrs[1].type = flags.ui32.ExtendedCoherent ?
|
||||
HSA_SVM_ATTR_SET_FLAGS : HSA_SVM_ATTR_CLR_FLAGS;
|
||||
args->attrs[1].value = HSA_SVM_FLAG_EXT_COHERENT;
|
||||
pr_debug("Registering to SVM %p size: %ld\n", (void*)aligned_addr,
|
||||
aligned_size);
|
||||
@@ -3748,8 +3747,7 @@ bool hsakmt_fmm_get_handle(void *address, uint64_t *handle, uint64_t *size_offse
|
||||
static HSAKMT_STATUS fmm_register_user_memory(void *addr,
|
||||
HSAuint64 size,
|
||||
vm_object_t **obj_ret,
|
||||
bool coarse_grain,
|
||||
bool ext_coherent)
|
||||
HsaMemFlags flags)
|
||||
{
|
||||
manageable_aperture_t *aperture = svm.dgpu_aperture;
|
||||
HSAuint32 page_offset = (HSAuint64)addr & (PAGE_SIZE-1);
|
||||
@@ -3774,8 +3772,8 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr,
|
||||
&aligned_addr, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR |
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
|
||||
(coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) |
|
||||
(ext_coherent ? KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0),
|
||||
(flags.ui32.CoarseGrain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) |
|
||||
(flags.ui32.ExtendedCoherent ? KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0),
|
||||
0,
|
||||
&obj);
|
||||
if (!svm_addr)
|
||||
@@ -3813,8 +3811,7 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr,
|
||||
HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
|
||||
uint32_t *gpu_id_array,
|
||||
uint32_t gpu_id_array_size,
|
||||
bool coarse_grain,
|
||||
bool ext_coherent)
|
||||
HsaMemFlags flags)
|
||||
{
|
||||
manageable_aperture_t *aperture = NULL;
|
||||
vm_object_t *object = NULL;
|
||||
@@ -3823,7 +3820,7 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
|
||||
if (gpu_id_array_size > 0 && !gpu_id_array)
|
||||
return HSAKMT_STATUS_INVALID_PARAMETER;
|
||||
|
||||
if (coarse_grain && ext_coherent)
|
||||
if (flags.ui32.CoarseGrain && flags.ui32.ExtendedCoherent)
|
||||
return HSAKMT_STATUS_INVALID_PARAMETER;
|
||||
|
||||
object = vm_find_object(address, size_in_bytes, &aperture);
|
||||
@@ -3834,19 +3831,12 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
|
||||
|
||||
/* Register a new user ptr */
|
||||
if (hsakmt_is_svm_api_supported) {
|
||||
ret = fmm_register_mem_svm_api(address,
|
||||
size_in_bytes,
|
||||
coarse_grain,
|
||||
ext_coherent);
|
||||
ret = fmm_register_mem_svm_api(address, size_in_bytes, flags);
|
||||
if (ret == HSAKMT_STATUS_SUCCESS)
|
||||
return ret;
|
||||
pr_debug("SVM failed, falling back to old registration\n");
|
||||
}
|
||||
ret = fmm_register_user_memory(address,
|
||||
size_in_bytes,
|
||||
&object,
|
||||
coarse_grain,
|
||||
ext_coherent);
|
||||
ret = fmm_register_user_memory(address, size_in_bytes, &object, flags);
|
||||
|
||||
if (ret != HSAKMT_STATUS_SUCCESS)
|
||||
return ret;
|
||||
|
||||
@@ -75,8 +75,7 @@ HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(aperture_type_e aperture_ty
|
||||
HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes,
|
||||
uint32_t *gpu_id_array,
|
||||
uint32_t gpu_id_array_size,
|
||||
bool coarse_grain,
|
||||
bool ext_coherent);
|
||||
HsaMemFlags flags);
|
||||
HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle,
|
||||
HsaGraphicsResourceInfo *GraphicsResourceInfo,
|
||||
uint32_t *gpu_id_array,
|
||||
|
||||
@@ -268,8 +268,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
|
||||
/* TODO: support mixed APU and dGPU configurations */
|
||||
return HSAKMT_STATUS_SUCCESS;
|
||||
|
||||
HsaMemFlags flags;
|
||||
flags.ui32.CoarseGrain = 1;
|
||||
flags.ui32.ExtendedCoherent = 0;
|
||||
return hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes,
|
||||
NULL, 0, true, false);
|
||||
NULL, 0, flags);
|
||||
}
|
||||
|
||||
HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
|
||||
@@ -292,10 +295,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress,
|
||||
NumberOfNodes, NodeArray);
|
||||
|
||||
if (ret == HSAKMT_STATUS_SUCCESS) {
|
||||
HsaMemFlags flags;
|
||||
flags.ui32.CoarseGrain = 1;
|
||||
flags.ui32.ExtendedCoherent = 0;
|
||||
|
||||
ret = hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes,
|
||||
gpu_id_array,
|
||||
NumberOfNodes*sizeof(uint32_t),
|
||||
true, false);
|
||||
flags);
|
||||
if (ret != HSAKMT_STATUS_SUCCESS)
|
||||
free(gpu_id_array);
|
||||
}
|
||||
@@ -325,7 +332,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress,
|
||||
return HSAKMT_STATUS_NOT_SUPPORTED;
|
||||
|
||||
ret = hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes,
|
||||
NULL, 0, MemFlags.ui32.CoarseGrain, MemFlags.ui32.ExtendedCoherent);
|
||||
NULL, 0, MemFlags);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -105,7 +105,8 @@ class MemoryRegion : public core::MemoryRegion {
|
||||
hsa_status_t Migrate(uint32_t flag, const void* ptr) const;
|
||||
|
||||
hsa_status_t Lock(uint32_t num_agents, const hsa_agent_t* agents,
|
||||
void* host_ptr, size_t size, void** agent_ptr) const;
|
||||
void* host_ptr, size_t size, uint32_t flags,
|
||||
void** agent_ptr) const;
|
||||
|
||||
hsa_status_t Unlock(void* host_ptr) const;
|
||||
|
||||
|
||||
@@ -879,12 +879,11 @@ class Runtime {
|
||||
};
|
||||
|
||||
struct MappedHandle {
|
||||
MappedHandle(MemoryHandle *mem_handle, AddressHandle *address_handle,
|
||||
MappedHandle(MemoryHandle* mem_handle, AddressHandle* address_handle, void* va,
|
||||
uint64_t offset, size_t size, int drm_fd, void *drm_cpu_addr,
|
||||
hsa_access_permission_t perm, ShareableHandle shareable_handle)
|
||||
: mem_handle(mem_handle), address_handle(address_handle),
|
||||
offset(offset), size(size), drm_fd(drm_fd),
|
||||
drm_cpu_addr(drm_cpu_addr), shareable_handle(shareable_handle) {}
|
||||
hsa_access_permission_t perm, ShareableHandle shareable_handle);
|
||||
|
||||
MappedHandle() {}
|
||||
|
||||
__forceinline core::Agent* agentOwner() const { return mem_handle->region->owner(); }
|
||||
|
||||
|
||||
@@ -538,7 +538,7 @@ hsa_status_t MemoryRegion::Migrate(uint32_t flag, const void* ptr) const {
|
||||
}
|
||||
|
||||
hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
|
||||
void* host_ptr, size_t size,
|
||||
void* host_ptr, size_t size, uint32_t flags,
|
||||
void** agent_ptr) const {
|
||||
if (!IsSystem()) {
|
||||
return HSA_STATUS_ERROR;
|
||||
@@ -581,9 +581,15 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
|
||||
*agent_ptr = host_ptr;
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
HsaMemFlags local_mem_flag = mem_flag_;
|
||||
if (flags & HSA_AMD_MEMORY_POOL_UNCACHED_FLAG) {
|
||||
local_mem_flag.ui32.Uncached = 1;
|
||||
local_mem_flag.ui32.CoarseGrain = 0;
|
||||
local_mem_flag.ui32.ExtendedCoherent = 0;
|
||||
}
|
||||
|
||||
// Call kernel driver to register and pin the memory.
|
||||
if (owner()->driver().RegisterMemory(host_ptr, size, const_cast<HsaMemFlags&>(mem_flag_)) ==
|
||||
if (owner()->driver().RegisterMemory(host_ptr, size, local_mem_flag) ==
|
||||
HSA_STATUS_SUCCESS) {
|
||||
uint64_t alternate_va = 0;
|
||||
if (owner()->driver().MakeMemoryResident(host_ptr, size, &alternate_va, &map_flag_,
|
||||
|
||||
@@ -761,7 +761,7 @@ hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
|
||||
const AMD::MemoryRegion* system_region = static_cast<const AMD::MemoryRegion*>(
|
||||
core::Runtime::runtime_singleton_->system_regions_coarse()[0]);
|
||||
|
||||
return system_region->Lock(num_agent, agents, host_ptr, size, agent_ptr);
|
||||
return system_region->Lock(num_agent, agents, host_ptr, size, 0, agent_ptr);
|
||||
CATCH;
|
||||
}
|
||||
|
||||
@@ -771,7 +771,7 @@ hsa_status_t hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_
|
||||
TRY;
|
||||
IS_OPEN();
|
||||
|
||||
if (size == 0 || host_ptr == nullptr || agent_ptr == nullptr || flags != 0) {
|
||||
if (size == 0 || host_ptr == nullptr || agent_ptr == nullptr) {
|
||||
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
@@ -789,7 +789,7 @@ hsa_status_t hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_
|
||||
if (mem_region->owner()->device_type() != core::Agent::kAmdCpuDevice)
|
||||
return (hsa_status_t)HSA_STATUS_ERROR_INVALID_MEMORY_POOL;
|
||||
|
||||
return mem_region->Lock(num_agent, agents, host_ptr, size, agent_ptr);
|
||||
return mem_region->Lock(num_agent, agents, host_ptr, size, flags, agent_ptr);
|
||||
CATCH;
|
||||
}
|
||||
|
||||
|
||||
@@ -543,7 +543,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
|
||||
const auto& locked_copy = [&](void*& ptr, core::Agent* locking_agent) {
|
||||
void* tmp;
|
||||
hsa_agent_t agent = locking_agent->public_handle();
|
||||
hsa_status_t err = system_region->Lock(1, &agent, ptr, size, &tmp);
|
||||
// MemoryRegion::Lock takes (num_agents, agents, host_ptr, size, flags, agent_ptr):
// pass the byte count as the size and 0 as the flags. With the two swapped, the
// call would pass a size of 0 and interpret the byte count as a flag bitmask.
hsa_status_t err = system_region->Lock(1, &agent, ptr, size, 0, &tmp);
|
||||
if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "Lock failed in hsa_memory_copy.");
|
||||
gpuPtr = ptr;
|
||||
ptr = tmp;
|
||||
@@ -977,7 +977,8 @@ hsa_status_t Runtime::VMemoryPtrInfo(const void* ptr, hsa_amd_pointer_info_t* in
|
||||
|
||||
for (auto agentPermsIt = mappedHandleIt->second.allowed_agents.begin();
|
||||
agentPermsIt != mappedHandleIt->second.allowed_agents.end(); agentPermsIt++) {
|
||||
allowed_agents.push_back((*agentPermsIt).second.targetAgent->public_handle());
|
||||
if ((*agentPermsIt).second.permissions != HSA_ACCESS_PERMISSION_NONE)
|
||||
allowed_agents.push_back((*agentPermsIt).second.targetAgent->public_handle());
|
||||
}
|
||||
|
||||
AMD::callback_t<decltype(alloc)> Alloc(alloc);
|
||||
@@ -3519,7 +3520,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset,
|
||||
|
||||
mapped_handle_map_.emplace(
|
||||
std::piecewise_construct, std::forward_as_tuple(va),
|
||||
std::forward_as_tuple(&memoryHandleIt->second, addressHandle, offset, size, drm_fd,
|
||||
std::forward_as_tuple(&memoryHandleIt->second, addressHandle, va, offset, size, drm_fd,
|
||||
reinterpret_cast<void*>(drm_cpu_addr), HSA_ACCESS_PERMISSION_NONE,
|
||||
shareable_handle));
|
||||
|
||||
@@ -3655,17 +3656,43 @@ hsa_status_t Runtime::MappedHandleAllowedAgent::EnableAccess(hsa_access_permissi
|
||||
|
||||
hsa_status_t Runtime::MappedHandleAllowedAgent::RemoveAccess() {
|
||||
if (targetAgent->device_type() == core::Agent::DeviceType::kAmdCpuDevice) {
|
||||
#if defined(__linux__)
|
||||
if (munmap(va, size) != 0)
|
||||
return HSA_STATUS_ERROR;
|
||||
#else
|
||||
assert(!"Unimplemented!");
|
||||
#endif
|
||||
return HSA_STATUS_SUCCESS;
|
||||
if (permissions != HSA_ACCESS_PERMISSION_NONE) {
|
||||
if (munmap(va, size) != 0) return HSA_STATUS_ERROR;
|
||||
|
||||
/* We need to keep the CPU mapping. So change it to PROT_NONE */
|
||||
void* mapped_ptr = mmap(va, mappedHandle->size, PROT_NONE, MAP_SHARED | MAP_FIXED,
|
||||
mappedHandle->drm_fd,
|
||||
reinterpret_cast<uint64_t>(mappedHandle->drm_cpu_addr));
|
||||
if (mapped_ptr != va)
|
||||
return HSA_STATUS_ERROR;
|
||||
|
||||
permissions = HSA_ACCESS_PERMISSION_NONE;
|
||||
}
|
||||
} else {
|
||||
return targetAgent->driver().Unmap(
|
||||
shareable_handle, va, mappedHandle->offset, mappedHandle->size);
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
Runtime::MappedHandle::MappedHandle(MemoryHandle *mem_handle, AddressHandle *address_handle,
|
||||
void* va, uint64_t offset, size_t size, int drm_fd, void *drm_cpu_addr,
|
||||
hsa_access_permission_t perm, ShareableHandle shareable_handle)
|
||||
: mem_handle(mem_handle), address_handle(address_handle), offset(offset),
|
||||
size(size), drm_fd(drm_fd), drm_cpu_addr(drm_cpu_addr),
|
||||
shareable_handle(shareable_handle)
|
||||
{
|
||||
/* Create a CPU mapping with PROT_NONE */
|
||||
auto cpu_agent = static_cast<AMD::GpuAgent*>(agentOwner())->GetNearestCpuAgent();
|
||||
auto agentPermsIt = allowed_agents.emplace(std::piecewise_construct,
|
||||
std::forward_as_tuple(cpu_agent),
|
||||
std::forward_as_tuple(this, cpu_agent, va,
|
||||
size, HSA_ACCESS_PERMISSION_NONE))
|
||||
.first;
|
||||
|
||||
auto ret = agentPermsIt->second.EnableAccess(HSA_ACCESS_PERMISSION_NONE);
|
||||
if (ret != HSA_STATUS_SUCCESS)
|
||||
throw AMD::hsa_exception(ret, "Failed to create default CPU mapping");
|
||||
}
|
||||
|
||||
// Note: VMemorySetAccessPerHandle should be called with &memory_lock_ held
|
||||
|
||||
Ссылка в новой задаче
Block a user