Use ptrinfo rather than apertures in hsa_memory_copy
Apertures now overlap with the change to 48bit addressing which precludes using aperture checks to discover buffer ownership. Switches to ptrinfo to decide which device a buffer owned by. This corrects faults in the legacy hsa_memory_copy api. Change-Id: I5c7ce0216e1cdc96f836fc6fec9c3defdf4b9d90
This commit is contained in:
@@ -409,21 +409,6 @@ class Runtime {
|
||||
// @brief Binds virtual memory access fault handler to this node.
|
||||
void BindVmFaultHandler();
|
||||
|
||||
/// @brief Blocking memory copy from src to dst. One of the src or dst
|
||||
/// is user pointer. A particular setup need to be made if the DMA queue
|
||||
/// for the memory copy belongs to a dGPU agent. E.g: pin the user pointer
|
||||
/// before copying, or using a staging buffer.
|
||||
///
|
||||
/// @param [in] dst Memory address of the destination.
|
||||
/// @param [in] src Memory address of the source.
|
||||
/// @param [in] size Copy size in bytes.
|
||||
/// @param [in] dst_malloc If true, then @p dst is the user pointer. Otherwise
|
||||
/// @p src is the user pointer.
|
||||
///
|
||||
/// @retval ::HSA_STATUS_SUCCESS if memory copy is successful and completed.
|
||||
hsa_status_t CopyMemoryHostAlloc(void* dst, const void* src, size_t size,
|
||||
bool dst_malloc);
|
||||
|
||||
/// @brief Get the index of ::link_matrix_.
|
||||
/// @param [in] node_id_from Node id of the source node.
|
||||
/// @param [in] node_id_to Node id of the destination node.
|
||||
|
||||
@@ -344,61 +344,84 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
|
||||
assert(dst != NULL && src != NULL && size != 0);
|
||||
|
||||
// Choose agents from pointer info
|
||||
hsa_amd_pointer_info_t info;
|
||||
bool is_src_system = false;
|
||||
bool is_dst_system = false;
|
||||
const uintptr_t src_uptr = reinterpret_cast<uintptr_t>(src);
|
||||
const uintptr_t dst_uptr = reinterpret_cast<uintptr_t>(dst);
|
||||
core::Agent* src_agent;
|
||||
core::Agent* dst_agent;
|
||||
info.size = sizeof(info);
|
||||
|
||||
if ((reinterpret_cast<amd::GpuAgentInt*>(blit_agent_)->profile() ==
|
||||
HSA_PROFILE_FULL)) {
|
||||
is_src_system = (src_uptr < end_svm_address_);
|
||||
is_dst_system = (dst_uptr < end_svm_address_);
|
||||
// Fetch ownership
|
||||
hsa_status_t err = PtrInfo(const_cast<void*>(src), &info, nullptr, nullptr, nullptr);
|
||||
if (err != HSA_STATUS_SUCCESS) return err;
|
||||
ptrdiff_t endPtr = (ptrdiff_t)src + size;
|
||||
if (info.agentBaseAddress <= src &&
|
||||
endPtr <= (ptrdiff_t)info.agentBaseAddress + info.sizeInBytes) {
|
||||
src_agent = core::Agent::Convert(info.agentOwner);
|
||||
is_src_system = (src_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice);
|
||||
} else {
|
||||
is_src_system =
|
||||
((src_uptr < start_svm_address_) || (src_uptr >= end_svm_address_));
|
||||
is_dst_system =
|
||||
((dst_uptr < start_svm_address_) || (dst_uptr >= end_svm_address_));
|
||||
|
||||
if ((is_src_system && !is_dst_system) ||
|
||||
(!is_src_system && is_dst_system)) {
|
||||
// Use staging buffer or pin if either src or dst is gpuvm and the other
|
||||
// is system memory allocated via OS or C/C++ allocator.
|
||||
return CopyMemoryHostAlloc(dst, src, size, is_dst_system);
|
||||
}
|
||||
src_agent = cpu_agents_[0];
|
||||
is_src_system = true;
|
||||
}
|
||||
|
||||
err = PtrInfo(const_cast<void*>(dst), &info, nullptr, nullptr, nullptr);
|
||||
if (err != HSA_STATUS_SUCCESS) return err;
|
||||
endPtr = (ptrdiff_t)dst + size;
|
||||
if (info.agentBaseAddress <= dst &&
|
||||
endPtr <= (ptrdiff_t)info.agentBaseAddress + info.sizeInBytes) {
|
||||
dst_agent = core::Agent::Convert(info.agentOwner);
|
||||
is_dst_system = (dst_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice);
|
||||
} else {
|
||||
dst_agent = cpu_agents_[0];
|
||||
is_dst_system = true;
|
||||
}
|
||||
|
||||
// CPU-CPU
|
||||
if (is_src_system && is_dst_system) {
|
||||
memmove(dst, src, size);
|
||||
memcpy(dst, src, size);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
return blit_agent_->DmaCopy(dst, src, size);
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::CopyMemoryHostAlloc(void* dst, const void* src,
|
||||
size_t size, bool dst_malloc) {
|
||||
void* usrptr = (dst_malloc) ? dst : const_cast<void*>(src);
|
||||
void* agent_ptr = NULL;
|
||||
|
||||
hsa_agent_t blit_agent = core::Agent::Convert(blit_agent_);
|
||||
// Same GPU
|
||||
if (src_agent->node_id() == dst_agent->node_id()) return dst_agent->DmaCopy(dst, src, size);
|
||||
|
||||
// GPU-CPU
|
||||
// Must ensure that system memory is visible to the GPU during the copy.
|
||||
const amd::MemoryRegion* system_region =
|
||||
reinterpret_cast<const amd::MemoryRegion*>(system_regions_fine_[0]);
|
||||
hsa_status_t stat =
|
||||
system_region->Lock(1, &blit_agent, usrptr, size, &agent_ptr);
|
||||
|
||||
if (stat != HSA_STATUS_SUCCESS) {
|
||||
return stat;
|
||||
static_cast<const amd::MemoryRegion*>(system_regions_fine_[0]);
|
||||
if (is_src_system) {
|
||||
void* gpuPtr;
|
||||
hsa_agent_t agent = dst_agent->public_handle();
|
||||
err = system_region->Lock(1, &agent, const_cast<void*>(src), size, &gpuPtr);
|
||||
if (err != HSA_STATUS_SUCCESS) return err;
|
||||
MAKE_SCOPE_GUARD([&]() { system_region->Unlock(const_cast<void*>(src)); });
|
||||
return dst_agent->DmaCopy(dst, gpuPtr, size);
|
||||
}
|
||||
|
||||
stat = blit_agent_->DmaCopy((dst_malloc) ? agent_ptr : dst,
|
||||
(dst_malloc) ? src : agent_ptr, size);
|
||||
if (is_dst_system) {
|
||||
void* gpuPtr;
|
||||
hsa_agent_t agent = src_agent->public_handle();
|
||||
err = system_region->Lock(1, &agent, dst, size, &gpuPtr);
|
||||
if (err != HSA_STATUS_SUCCESS) return err;
|
||||
MAKE_SCOPE_GUARD([&]() { system_region->Unlock(dst); });
|
||||
return src_agent->DmaCopy(gpuPtr, src, size);
|
||||
}
|
||||
|
||||
system_region->Unlock(usrptr);
|
||||
|
||||
return stat;
|
||||
/*
|
||||
GPU-GPU - functional support, not a performance path.
|
||||
|
||||
This goes through system memory because we have to support copying between non-peer GPUs
|
||||
and we can't use P2P pointers even if the GPUs are peers. Because hsa_amd_agents_allow_access
|
||||
requires the caller to specify all allowed agents we can't assume that a peer mapped pointer
|
||||
would remain mapped for the duration of the copy.
|
||||
*/
|
||||
void* temp = nullptr;
|
||||
system_region->Allocate(size, core::MemoryRegion::AllocateNoFlags, &temp);
|
||||
MAKE_SCOPE_GUARD([&]() { system_region->Free(temp, size); });
|
||||
err = src_agent->DmaCopy(temp, src, size);
|
||||
if (err == HSA_STATUS_SUCCESS) err = dst_agent->DmaCopy(dst, temp, size);
|
||||
return err;
|
||||
}
|
||||
|
||||
hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,
|
||||
|
||||
Verwijs in nieuw issue
Block a user