diff --git a/projects/rocr-runtime/libhsakmt/src/fmm.c b/projects/rocr-runtime/libhsakmt/src/fmm.c index adb5dddaa6..75b9b481de 100644 --- a/projects/rocr-runtime/libhsakmt/src/fmm.c +++ b/projects/rocr-runtime/libhsakmt/src/fmm.c @@ -1061,9 +1061,7 @@ static HsaMemFlags fmm_translate_ioc_to_hsa_flags(uint32_t ioc_flags) } static HSAKMT_STATUS fmm_register_mem_svm_api(void *address, - uint64_t size, - bool coarse_grain, - bool ext_coherent) + uint64_t size, HsaMemFlags flags) { struct kfd_ioctl_svm_args *args; size_t s_attr; @@ -1080,10 +1078,11 @@ static HSAKMT_STATUS fmm_register_mem_svm_api(void *address, args->size = aligned_size; args->op = KFD_IOCTL_SVM_OP_SET_ATTR; args->nattr = 2; - args->attrs[0].type = coarse_grain ? + args->attrs[0].type = flags.ui32.CoarseGrain ? HSA_SVM_ATTR_CLR_FLAGS : HSA_SVM_ATTR_SET_FLAGS; args->attrs[0].value = HSA_SVM_FLAG_COHERENT; - args->attrs[1].type = ext_coherent ? HSA_SVM_ATTR_SET_FLAGS : HSA_SVM_ATTR_CLR_FLAGS ; + args->attrs[1].type = flags.ui32.ExtendedCoherent ? + HSA_SVM_ATTR_SET_FLAGS : HSA_SVM_ATTR_CLR_FLAGS; args->attrs[1].value = HSA_SVM_FLAG_EXT_COHERENT; pr_debug("Registering to SVM %p size: %ld\n", (void*)aligned_addr, aligned_size); @@ -3748,8 +3747,7 @@ bool hsakmt_fmm_get_handle(void *address, uint64_t *handle, uint64_t *size_offse static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAuint64 size, vm_object_t **obj_ret, - bool coarse_grain, - bool ext_coherent) + HsaMemFlags flags) { manageable_aperture_t *aperture = svm.dgpu_aperture; HSAuint32 page_offset = (HSAuint64)addr & (PAGE_SIZE-1); @@ -3774,8 +3772,8 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, &aligned_addr, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | - (coarse_grain ? 0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) | - (ext_coherent ? KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0), + (flags.ui32.CoarseGrain ? 
0 : KFD_IOC_ALLOC_MEM_FLAGS_COHERENT) | + (flags.ui32.ExtendedCoherent ? KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT : 0), 0, &obj); if (!svm_addr) @@ -3813,8 +3811,7 @@ static HSAKMT_STATUS fmm_register_user_memory(void *addr, HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes, uint32_t *gpu_id_array, uint32_t gpu_id_array_size, - bool coarse_grain, - bool ext_coherent) + HsaMemFlags flags) { manageable_aperture_t *aperture = NULL; vm_object_t *object = NULL; @@ -3823,7 +3820,7 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes, if (gpu_id_array_size > 0 && !gpu_id_array) return HSAKMT_STATUS_INVALID_PARAMETER; - if (coarse_grain && ext_coherent) + if (flags.ui32.CoarseGrain && flags.ui32.ExtendedCoherent) return HSAKMT_STATUS_INVALID_PARAMETER; object = vm_find_object(address, size_in_bytes, &aperture); @@ -3834,19 +3831,12 @@ HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes, /* Register a new user ptr */ if (hsakmt_is_svm_api_supported) { - ret = fmm_register_mem_svm_api(address, - size_in_bytes, - coarse_grain, - ext_coherent); + ret = fmm_register_mem_svm_api(address, size_in_bytes, flags); if (ret == HSAKMT_STATUS_SUCCESS) return ret; pr_debug("SVM failed, falling back to old registration\n"); } - ret = fmm_register_user_memory(address, - size_in_bytes, - &object, - coarse_grain, - ext_coherent); + ret = fmm_register_user_memory(address, size_in_bytes, &object, flags); if (ret != HSAKMT_STATUS_SUCCESS) return ret; diff --git a/projects/rocr-runtime/libhsakmt/src/fmm.h b/projects/rocr-runtime/libhsakmt/src/fmm.h index 9cb3a8c220..f98b129b5d 100644 --- a/projects/rocr-runtime/libhsakmt/src/fmm.h +++ b/projects/rocr-runtime/libhsakmt/src/fmm.h @@ -75,8 +75,7 @@ HSAKMT_STATUS hsakmt_fmm_get_aperture_base_and_limit(aperture_type_e aperture_ty HSAKMT_STATUS hsakmt_fmm_register_memory(void *address, uint64_t size_in_bytes, uint32_t *gpu_id_array, uint32_t gpu_id_array_size, - bool 
coarse_grain, - bool ext_coherent); + HsaMemFlags flags); HSAKMT_STATUS hsakmt_fmm_register_graphics_handle(HSAuint64 GraphicsResourceHandle, HsaGraphicsResourceInfo *GraphicsResourceInfo, uint32_t *gpu_id_array, diff --git a/projects/rocr-runtime/libhsakmt/src/memory.c b/projects/rocr-runtime/libhsakmt/src/memory.c index e7230c1015..fb317f71d6 100644 --- a/projects/rocr-runtime/libhsakmt/src/memory.c +++ b/projects/rocr-runtime/libhsakmt/src/memory.c @@ -268,8 +268,11 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress, /* TODO: support mixed APU and dGPU configurations */ return HSAKMT_STATUS_SUCCESS; + HsaMemFlags flags = {0}; /* clear every flag bit; only the two below are set explicitly */ + flags.ui32.CoarseGrain = 1; + flags.ui32.ExtendedCoherent = 0; return hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes, - NULL, 0, true, false); + NULL, 0, flags); } HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress, @@ -292,10 +295,14 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryToNodes(void *MemoryAddress, NumberOfNodes, NodeArray); if (ret == HSAKMT_STATUS_SUCCESS) { + HsaMemFlags flags = {0}; /* clear every flag bit; only the two below are set explicitly */ + flags.ui32.CoarseGrain = 1; + flags.ui32.ExtendedCoherent = 0; + ret = hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes, gpu_id_array, NumberOfNodes*sizeof(uint32_t), - true, false); + flags); if (ret != HSAKMT_STATUS_SUCCESS) free(gpu_id_array); } @@ -325,7 +332,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemoryWithFlags(void *MemoryAddress, return HSAKMT_STATUS_NOT_SUPPORTED; ret = hsakmt_fmm_register_memory(MemoryAddress, MemorySizeInBytes, - NULL, 0, MemFlags.ui32.CoarseGrain, MemFlags.ui32.ExtendedCoherent); + NULL, 0, MemFlags); return ret; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h index 13e8313c45..82b110d70d 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_memory_region.h @@ 
-105,7 +105,8 @@ class MemoryRegion : public core::MemoryRegion { hsa_status_t Migrate(uint32_t flag, const void* ptr) const; hsa_status_t Lock(uint32_t num_agents, const hsa_agent_t* agents, - void* host_ptr, size_t size, void** agent_ptr) const; + void* host_ptr, size_t size, uint32_t flags, + void** agent_ptr) const; hsa_status_t Unlock(void* host_ptr) const; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h index 24715d4c24..4df2d2050a 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/runtime.h @@ -879,12 +879,11 @@ class Runtime { }; struct MappedHandle { - MappedHandle(MemoryHandle *mem_handle, AddressHandle *address_handle, + MappedHandle(MemoryHandle* mem_handle, AddressHandle* address_handle, void* va, uint64_t offset, size_t size, int drm_fd, void *drm_cpu_addr, - hsa_access_permission_t perm, ShareableHandle shareable_handle) - : mem_handle(mem_handle), address_handle(address_handle), - offset(offset), size(size), drm_fd(drm_fd), - drm_cpu_addr(drm_cpu_addr), shareable_handle(shareable_handle) {} + hsa_access_permission_t perm, ShareableHandle shareable_handle); + + MappedHandle() {} __forceinline core::Agent* agentOwner() const { return mem_handle->region->owner(); } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index 591ff98fd1..842ef96165 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -538,7 +538,7 @@ hsa_status_t MemoryRegion::Migrate(uint32_t flag, const void* ptr) const { } hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, - void* host_ptr, size_t size, + void* host_ptr, size_t size, uint32_t flags, void** 
agent_ptr) const { if (!IsSystem()) { return HSA_STATUS_ERROR; @@ -581,9 +581,15 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, *agent_ptr = host_ptr; return HSA_STATUS_SUCCESS; } + HsaMemFlags local_mem_flag = mem_flag_; + if (flags & HSA_AMD_MEMORY_POOL_UNCACHED_FLAG) { + local_mem_flag.ui32.Uncached = 1; + local_mem_flag.ui32.CoarseGrain = 0; + local_mem_flag.ui32.ExtendedCoherent = 0; + } // Call kernel driver to register and pin the memory. - if (owner()->driver().RegisterMemory(host_ptr, size, const_cast(mem_flag_)) == + if (owner()->driver().RegisterMemory(host_ptr, size, local_mem_flag) == HSA_STATUS_SUCCESS) { uint64_t alternate_va = 0; if (owner()->driver().MakeMemoryResident(host_ptr, size, &alternate_va, &map_flag_, diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index cf342e41bd..310bf0a16a 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -761,7 +761,7 @@ hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size, const AMD::MemoryRegion* system_region = static_cast( core::Runtime::runtime_singleton_->system_regions_coarse()[0]); - return system_region->Lock(num_agent, agents, host_ptr, size, agent_ptr); + return system_region->Lock(num_agent, agents, host_ptr, size, 0, agent_ptr); CATCH; } @@ -771,7 +771,7 @@ hsa_status_t hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_ TRY; IS_OPEN(); - if (size == 0 || host_ptr == nullptr || agent_ptr == nullptr || flags != 0) { + if (size == 0 || host_ptr == nullptr || agent_ptr == nullptr) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -789,7 +789,7 @@ hsa_status_t hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_ if (mem_region->owner()->device_type() != core::Agent::kAmdCpuDevice) return 
(hsa_status_t)HSA_STATUS_ERROR_INVALID_MEMORY_POOL; - return mem_region->Lock(num_agent, agents, host_ptr, size, agent_ptr); + return mem_region->Lock(num_agent, agents, host_ptr, size, flags, agent_ptr); CATCH; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp index 518b7f125b..263c2cbbd1 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -543,7 +543,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) { const auto& locked_copy = [&](void*& ptr, core::Agent* locking_agent) { void* tmp; hsa_agent_t agent = locking_agent->public_handle(); - hsa_status_t err = system_region->Lock(1, &agent, ptr, size, &tmp); + hsa_status_t err = system_region->Lock(1, &agent, ptr, size, /*flags=*/0, &tmp); if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "Lock failed in hsa_memory_copy."); gpuPtr = ptr; ptr = tmp; @@ -977,7 +977,8 @@ hsa_status_t Runtime::VMemoryPtrInfo(const void* ptr, hsa_amd_pointer_info_t* in for (auto agentPermsIt = mappedHandleIt->second.allowed_agents.begin(); agentPermsIt != mappedHandleIt->second.allowed_agents.end(); agentPermsIt++) { - allowed_agents.push_back((*agentPermsIt).second.targetAgent->public_handle()); + if ((*agentPermsIt).second.permissions != HSA_ACCESS_PERMISSION_NONE) + allowed_agents.push_back((*agentPermsIt).second.targetAgent->public_handle()); } AMD::callback_t Alloc(alloc); @@ -3519,7 +3520,7 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset, mapped_handle_map_.emplace( std::piecewise_construct, std::forward_as_tuple(va), - std::forward_as_tuple(&memoryHandleIt->second, addressHandle, offset, size, drm_fd, + std::forward_as_tuple(&memoryHandleIt->second, addressHandle, va, offset, size, drm_fd, reinterpret_cast(drm_cpu_addr), HSA_ACCESS_PERMISSION_NONE, shareable_handle)); 
@@ -3655,17 +3656,43 @@ hsa_status_t Runtime::MappedHandleAllowedAgent::EnableAccess(hsa_access_permissi hsa_status_t Runtime::MappedHandleAllowedAgent::RemoveAccess() { if (targetAgent->device_type() == core::Agent::DeviceType::kAmdCpuDevice) { - #if defined(__linux__) - if (munmap(va, size) != 0) - return HSA_STATUS_ERROR; - #else - assert(!"Unimplemented!"); - #endif - return HSA_STATUS_SUCCESS; + if (permissions != HSA_ACCESS_PERMISSION_NONE) { + if (munmap(va, size) != 0) return HSA_STATUS_ERROR; + + /* Keep the CPU mapping in place: map the DRM fd again at the same va with PROT_NONE (MAP_SHARED | MAP_FIXED) instead of leaving the range unmapped. NOTE(review): MAP_FIXED is documented to replace an existing mapping, so the munmap above may be redundant and leaves the range briefly unmapped - confirm. NOTE(review): the previous __linux__ guard around munmap is no longer present - confirm non-Linux builds. */ + void* mapped_ptr = mmap(va, mappedHandle->size, PROT_NONE, MAP_SHARED | MAP_FIXED, + mappedHandle->drm_fd, + reinterpret_cast(mappedHandle->drm_cpu_addr)); + if (mapped_ptr != va) + return HSA_STATUS_ERROR; + + permissions = HSA_ACCESS_PERMISSION_NONE; + } } else { return targetAgent->driver().Unmap( shareable_handle, va, mappedHandle->offset, mappedHandle->size); } + return HSA_STATUS_SUCCESS; +} + +Runtime::MappedHandle::MappedHandle(MemoryHandle *mem_handle, AddressHandle *address_handle, + void* va, uint64_t offset, size_t size, int drm_fd, void *drm_cpu_addr, + hsa_access_permission_t perm, ShareableHandle shareable_handle) + : mem_handle(mem_handle), address_handle(address_handle), offset(offset), + size(size), drm_fd(drm_fd), drm_cpu_addr(drm_cpu_addr), + shareable_handle(shareable_handle) +{ + /* Add an allowed_agents entry for the owner's nearest CPU agent with HSA_ACCESS_PERMISSION_NONE, then call EnableAccess on it to create the default CPU mapping; a failure is reported by throwing AMD::hsa_exception. The perm argument is not read by this constructor. */ + auto cpu_agent = static_cast(agentOwner())->GetNearestCpuAgent(); + auto agentPermsIt = allowed_agents.emplace(std::piecewise_construct, + std::forward_as_tuple(cpu_agent), + std::forward_as_tuple(this, cpu_agent, va, + size, HSA_ACCESS_PERMISSION_NONE)) + .first; + + auto ret = agentPermsIt->second.EnableAccess(HSA_ACCESS_PERMISSION_NONE); + if (ret != HSA_STATUS_SUCCESS) + throw AMD::hsa_exception(ret, "Failed to create default CPU mapping"); } // Note: VMemorySetAccessPerHandle should be called with &memory_lock_ held