Use ptrinfo rather than apertures in hsa_memory_copy

Apertures now overlap with the change to 48-bit addressing, which
precludes using aperture checks to discover buffer ownership.
Switches to ptrinfo to decide which device a buffer is owned by.

This corrects faults in the legacy hsa_memory_copy API.

Change-Id: I5c7ce0216e1cdc96f836fc6fec9c3defdf4b9d90
This commit is contained in:
Sean Keely
2018-10-05 01:28:09 -05:00
bovenliggende 386874da55
commit 1e0d690948
2 gewijzigde bestanden met toevoegingen van 63 en 55 verwijderingen
@@ -409,21 +409,6 @@ class Runtime {
// @brief Binds virtual memory access fault handler to this node.
void BindVmFaultHandler();
/// @brief Blocking memory copy from src to dst. One of src or dst is a
/// user pointer. A particular setup needs to be made if the DMA queue
/// for the memory copy belongs to a dGPU agent, e.g. pinning the user pointer
/// before copying, or using a staging buffer.
///
/// @param [in] dst Memory address of the destination.
/// @param [in] src Memory address of the source.
/// @param [in] size Copy size in bytes.
/// @param [in] dst_malloc If true, then @p dst is the user pointer. Otherwise
/// @p src is the user pointer.
///
/// @retval ::HSA_STATUS_SUCCESS if the memory copy is successful and completed.
hsa_status_t CopyMemoryHostAlloc(void* dst, const void* src, size_t size,
bool dst_malloc);
/// @brief Get the index of ::link_matrix_.
/// @param [in] node_id_from Node id of the source node.
/// @param [in] node_id_to Node id of the destination node.
@@ -344,61 +344,84 @@ hsa_status_t Runtime::FreeMemory(void* ptr) {
}
hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) {
assert(dst != NULL && src != NULL && size != 0);
// Choose agents from pointer info
hsa_amd_pointer_info_t info;
bool is_src_system = false;
bool is_dst_system = false;
const uintptr_t src_uptr = reinterpret_cast<uintptr_t>(src);
const uintptr_t dst_uptr = reinterpret_cast<uintptr_t>(dst);
core::Agent* src_agent;
core::Agent* dst_agent;
info.size = sizeof(info);
if ((reinterpret_cast<amd::GpuAgentInt*>(blit_agent_)->profile() ==
HSA_PROFILE_FULL)) {
is_src_system = (src_uptr < end_svm_address_);
is_dst_system = (dst_uptr < end_svm_address_);
// Fetch ownership
hsa_status_t err = PtrInfo(const_cast<void*>(src), &info, nullptr, nullptr, nullptr);
if (err != HSA_STATUS_SUCCESS) return err;
ptrdiff_t endPtr = (ptrdiff_t)src + size;
if (info.agentBaseAddress <= src &&
endPtr <= (ptrdiff_t)info.agentBaseAddress + info.sizeInBytes) {
src_agent = core::Agent::Convert(info.agentOwner);
is_src_system = (src_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice);
} else {
is_src_system =
((src_uptr < start_svm_address_) || (src_uptr >= end_svm_address_));
is_dst_system =
((dst_uptr < start_svm_address_) || (dst_uptr >= end_svm_address_));
if ((is_src_system && !is_dst_system) ||
(!is_src_system && is_dst_system)) {
// Use staging buffer or pin if either src or dst is gpuvm and the other
// is system memory allocated via OS or C/C++ allocator.
return CopyMemoryHostAlloc(dst, src, size, is_dst_system);
}
src_agent = cpu_agents_[0];
is_src_system = true;
}
err = PtrInfo(const_cast<void*>(dst), &info, nullptr, nullptr, nullptr);
if (err != HSA_STATUS_SUCCESS) return err;
endPtr = (ptrdiff_t)dst + size;
if (info.agentBaseAddress <= dst &&
endPtr <= (ptrdiff_t)info.agentBaseAddress + info.sizeInBytes) {
dst_agent = core::Agent::Convert(info.agentOwner);
is_dst_system = (dst_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice);
} else {
dst_agent = cpu_agents_[0];
is_dst_system = true;
}
// CPU-CPU
if (is_src_system && is_dst_system) {
memmove(dst, src, size);
memcpy(dst, src, size);
return HSA_STATUS_SUCCESS;
}
return blit_agent_->DmaCopy(dst, src, size);
}
hsa_status_t Runtime::CopyMemoryHostAlloc(void* dst, const void* src,
size_t size, bool dst_malloc) {
void* usrptr = (dst_malloc) ? dst : const_cast<void*>(src);
void* agent_ptr = NULL;
hsa_agent_t blit_agent = core::Agent::Convert(blit_agent_);
// Same GPU
if (src_agent->node_id() == dst_agent->node_id()) return dst_agent->DmaCopy(dst, src, size);
// GPU-CPU
// Must ensure that system memory is visible to the GPU during the copy.
const amd::MemoryRegion* system_region =
reinterpret_cast<const amd::MemoryRegion*>(system_regions_fine_[0]);
hsa_status_t stat =
system_region->Lock(1, &blit_agent, usrptr, size, &agent_ptr);
if (stat != HSA_STATUS_SUCCESS) {
return stat;
static_cast<const amd::MemoryRegion*>(system_regions_fine_[0]);
if (is_src_system) {
void* gpuPtr;
hsa_agent_t agent = dst_agent->public_handle();
err = system_region->Lock(1, &agent, const_cast<void*>(src), size, &gpuPtr);
if (err != HSA_STATUS_SUCCESS) return err;
MAKE_SCOPE_GUARD([&]() { system_region->Unlock(const_cast<void*>(src)); });
return dst_agent->DmaCopy(dst, gpuPtr, size);
}
stat = blit_agent_->DmaCopy((dst_malloc) ? agent_ptr : dst,
(dst_malloc) ? src : agent_ptr, size);
if (is_dst_system) {
void* gpuPtr;
hsa_agent_t agent = src_agent->public_handle();
err = system_region->Lock(1, &agent, dst, size, &gpuPtr);
if (err != HSA_STATUS_SUCCESS) return err;
MAKE_SCOPE_GUARD([&]() { system_region->Unlock(dst); });
return src_agent->DmaCopy(gpuPtr, src, size);
}
system_region->Unlock(usrptr);
return stat;
/*
GPU-GPU - functional support, not a performance path.
This goes through system memory because we have to support copying between non-peer GPUs,
and we can't use P2P pointers even if the GPUs are peers. Because hsa_amd_agents_allow_access
requires the caller to specify all allowed agents, we can't assume that a peer-mapped pointer
would remain mapped for the duration of the copy.
*/
void* temp = nullptr;
system_region->Allocate(size, core::MemoryRegion::AllocateNoFlags, &temp);
MAKE_SCOPE_GUARD([&]() { system_region->Free(temp, size); });
err = src_agent->DmaCopy(temp, src, size);
if (err == HSA_STATUS_SUCCESS) err = dst_agent->DmaCopy(dst, temp, size);
return err;
}
hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent,