From 2b409b7e57785d67ab3980f0d63e2c47378d429d Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Thu, 13 May 2021 16:03:36 -0400 Subject: [PATCH] SWDEV-240804 - Remove AMD_HMM_SUPPORT define Use dynamic logic for HMM based on its availability Change-Id: I63751d94571d5af6eb57bef2cb0e071120bfa103 [ROCm/clr commit: 3f7a6b01e31919a1792b8c2800d4019c88ef98e6] --- .../clr/rocclr/device/rocm/CMakeLists.txt | 5 - projects/clr/rocclr/device/rocm/rocdevice.cpp | 394 +++++++++--------- projects/clr/rocclr/device/rocm/rocmemory.cpp | 39 +- .../clr/rocclr/device/rocm/rocvirtual.cpp | 46 +- 4 files changed, 251 insertions(+), 233 deletions(-) diff --git a/projects/clr/rocclr/device/rocm/CMakeLists.txt b/projects/clr/rocclr/device/rocm/CMakeLists.txt index 66af474e7c..7d6a699040 100644 --- a/projects/clr/rocclr/device/rocm/CMakeLists.txt +++ b/projects/clr/rocclr/device/rocm/CMakeLists.txt @@ -52,11 +52,6 @@ target_include_directories(oclrocm ${ROCM_OCL_INCLUDES} $) -option(BUILD_HMM "Build HMM support" ON) -if (BUILD_HMM) - target_compile_definitions(oclrocm - PRIVATE AMD_HMM_SUPPORT) -endif() if(USE_COMGR_LIBRARY) if(${BUILD_SHARED_LIBS}) diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index c7b30dd6a3..ae9046652b 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -1547,7 +1547,6 @@ bool Device::populateOCLDeviceConstants() { : 0; } -#if AMD_HMM_SUPPORT // Generic support for HMM interfaces if (HSA_STATUS_SUCCESS != hsa_system_get_info(HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED, &info_.hmmSupported_)) { @@ -1559,7 +1558,8 @@ bool Device::populateOCLDeviceConstants() { &info_.hmmCpuMemoryAccessible_)) { LogError("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT query failed."); } -#endif // AMD_HMM_SUPPORT + LogPrintfInfo("HMM support: %d, xnack: %d\n", + info_.hmmSupported_, info_.hmmCpuMemoryAccessible_); info_.globalCUMask_ = {}; @@ -2216,63 +2216,66 @@ bool 
Device::SetSvmAttributesInt(const void* dev_ptr, size_t count, return false; } } -#if AMD_HMM_SUPPORT - std::vector attr; - if (first_alloc) { - attr.push_back({HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG, HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED}); - } + if (info().hmmSupported_) { + std::vector attr; + if (first_alloc) { + attr.push_back({HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG, HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED}); + } - switch (advice) { - case amd::MemoryAdvice::SetReadMostly: - attr.push_back({HSA_AMD_SVM_ATTRIB_READ_ONLY, true}); - break; - case amd::MemoryAdvice::UnsetReadMostly: - attr.push_back({HSA_AMD_SVM_ATTRIB_READ_ONLY, false}); - break; - case amd::MemoryAdvice::SetPreferredLocation: - if (use_cpu) { - attr.push_back({HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION, getCpuAgent().handle}); - } else { - attr.push_back({HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION, getBackendDevice().handle}); - } - break; - case amd::MemoryAdvice::UnsetPreferredLocation: - // @note: 0 may cause a failure on old runtimes - attr.push_back({HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION, 0}); - break; - case amd::MemoryAdvice::SetAccessedBy: - if (use_cpu) { - attr.push_back({HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE, getCpuAgent().handle}); - } else { - if (first_alloc) { - // Provide access to all possible devices. - //! @note: HMM should support automatic page table update with xnack enabled, - //! 
but currently it doesn't and runtime explicitly enables access from all devices - for (const auto dev : devices()) { - attr.push_back({HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE, - static_cast(dev)->getBackendDevice().handle}); - } + switch (advice) { + case amd::MemoryAdvice::SetReadMostly: + attr.push_back({HSA_AMD_SVM_ATTRIB_READ_ONLY, true}); + break; + case amd::MemoryAdvice::UnsetReadMostly: + attr.push_back({HSA_AMD_SVM_ATTRIB_READ_ONLY, false}); + break; + case amd::MemoryAdvice::SetPreferredLocation: + if (use_cpu) { + attr.push_back({HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION, getCpuAgent().handle}); } else { - attr.push_back({HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE, getBackendDevice().handle}); + attr.push_back({HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION, getBackendDevice().handle}); } - } + break; + case amd::MemoryAdvice::UnsetPreferredLocation: + // @note: 0 may cause a failure on old runtimes + attr.push_back({HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION, 0}); + break; + case amd::MemoryAdvice::SetAccessedBy: + if (use_cpu) { + attr.push_back({HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE, getCpuAgent().handle}); + } else { + if (first_alloc) { + // Provide access to all possible devices. + //! @note: HMM should support automatic page table update with xnack enabled, + //! 
but currently it doesn't and runtime explicitly enables access from all devices + for (const auto dev : devices()) { + attr.push_back({HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE, + static_cast(dev)->getBackendDevice().handle}); + } + } else { + attr.push_back({HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE, + getBackendDevice().handle}); + } + } + break; + case amd::MemoryAdvice::UnsetAccessedBy: + // @note: 0 may cause a failure on old runtimes + attr.push_back({HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE, 0}); + break; + default: + return false; break; - case amd::MemoryAdvice::UnsetAccessedBy: - // @note: 0 may cause a failure on old runtimes - attr.push_back({HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE, 0}); - break; - default: - return false; - break; - } + } - hsa_status_t status = hsa_amd_svm_attributes_set(const_cast(dev_ptr), count, - attr.data(), attr.size()); - if (status != HSA_STATUS_SUCCESS) { - LogPrintfError("hsa_amd_svm_attributes_set() failed. Advice: %d", advice); - return false; + hsa_status_t status = hsa_amd_svm_attributes_set(const_cast(dev_ptr), count, + attr.data(), attr.size()); + if (status != HSA_STATUS_SUCCESS) { + LogPrintfError("hsa_amd_svm_attributes_set() failed. 
Advice: %d", advice); + return false; + } + } else { + LogWarning("hsa_amd_svm_attributes_set() is ignored, because no HMM support"); } -#endif // AMD_HMM_SUPPORT return true; } @@ -2296,131 +2299,139 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes, return false; } } -#if AMD_HMM_SUPPORT - uint32_t accessed_by = 0; - std::vector attr; + if (info().hmmSupported_) { + uint32_t accessed_by = 0; + std::vector attr; - for (size_t i = 0; i < num_attributes; ++i) { - switch (attributes[i]) { - case amd::MemRangeAttribute::ReadMostly: - attr.push_back({HSA_AMD_SVM_ATTRIB_READ_ONLY, 0}); + for (size_t i = 0; i < num_attributes; ++i) { + switch (attributes[i]) { + case amd::MemRangeAttribute::ReadMostly: + attr.push_back({HSA_AMD_SVM_ATTRIB_READ_ONLY, 0}); + break; + case amd::MemRangeAttribute::PreferredLocation: + attr.push_back({HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION, 0}); + break; + case amd::MemRangeAttribute::AccessedBy: + accessed_by = attr.size(); + // Add all GPU devices into the query + for (const auto agent : getGpuAgents()) { + attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent.handle}); + } + // Add CPU devices + for (const auto agent_info : getCpuAgents()) { + attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent_info.agent.handle}); + } + accessed_by = attr.size() - accessed_by; + break; + case amd::MemRangeAttribute::LastPrefetchLocation: + attr.push_back({HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION, 0}); + break; + default: + return false; break; - case amd::MemRangeAttribute::PreferredLocation: - attr.push_back({HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION, 0}); - break; - case amd::MemRangeAttribute::AccessedBy: - accessed_by = attr.size(); - // Add all GPU devices into the query - for (const auto agent : getGpuAgents()) { - attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent.handle}); - } - // Add CPU devices - for (const auto agent_info : getCpuAgents()) { - attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, 
agent_info.agent.handle}); - } - accessed_by = attr.size() - accessed_by; - break; - case amd::MemRangeAttribute::LastPrefetchLocation: - attr.push_back({HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION, 0}); - break; - default: - return false; - break; + } } - } - hsa_status_t status = hsa_amd_svm_attributes_get(const_cast(dev_ptr), count, - attr.data(), attr.size()); - if (status != HSA_STATUS_SUCCESS) { - LogError("hsa_amd_svm_attributes_get() failed"); + hsa_status_t status = hsa_amd_svm_attributes_get(const_cast(dev_ptr), count, + attr.data(), attr.size()); + if (status != HSA_STATUS_SUCCESS) { + LogError("hsa_amd_svm_attributes_get() failed"); + return false; + } + + uint32_t idx = 0; + uint32_t rocr_attr = 0; + for (size_t i = 0; i < num_attributes; ++i) { + const auto& it = attr[rocr_attr]; + switch (attributes[i]) { + case amd::MemRangeAttribute::ReadMostly: + if (data_sizes[idx] != sizeof(uint32_t)) { + return false; + } + // Cast ROCr value into the hip format + *reinterpret_cast(data[idx]) = + (static_cast(it.value) > 0) ? 
true : false; + break; + // The logic should be identical for the both queries + case amd::MemRangeAttribute::PreferredLocation: + case amd::MemRangeAttribute::LastPrefetchLocation: + if (data_sizes[idx] != sizeof(uint32_t)) { + return false; + } + *reinterpret_cast(data[idx]) = static_cast(amd::InvalidDeviceId); + // Find device agent returned by ROCr + for (auto& device : devices()) { + if (static_cast(device)->getBackendDevice().handle == it.value) { + *reinterpret_cast(data[idx]) = static_cast(device->index()); + } + } + // Find CPU agent returned by ROCr + for (auto& agent_info : getCpuAgents()) { + if (agent_info.agent.handle == it.value) { + *reinterpret_cast(data[idx]) = static_cast(amd::CpuDeviceId); + } + } + break; + case amd::MemRangeAttribute::AccessedBy: { + uint32_t entry = 0; + uint32_t device_count = data_sizes[idx] / 4; + // Make sure it's multiple of 4 + if (data_sizes[idx] % 4 != 0) { + return false; + } + for (uint32_t att = 0; att < accessed_by; ++att) { + const auto& it = attr[rocr_attr + att]; + if (entry >= device_count) { + // The size of the array is less than the amount of available devices + break; + } + switch (it.attribute) { + case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE: + case HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS: + break; + case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE: + reinterpret_cast(data[idx])[entry] = + static_cast(amd::InvalidDeviceId); + // Find device agent returned by ROCr + for (auto& device : devices()) { + if (static_cast(device)->getBackendDevice().handle == it.value) { + reinterpret_cast(data[idx])[entry] = + static_cast(device->index()); + } + } + // Find CPU agent returned by ROCr + for (auto& agent_info : getCpuAgents()) { + if (agent_info.agent.handle == it.value) { + reinterpret_cast(data[idx])[entry] = + static_cast(amd::CpuDeviceId); + } + } + ++entry; + break; + default: + LogWarning("Unexpected result from HSA_AMD_SVM_ATTRIB_ACCESS_QUERY"); + break; + } + } + rocr_attr += accessed_by; + for (uint32_t idx = 
entry; idx < device_count; ++idx) { + reinterpret_cast(data[idx])[idx] = + static_cast(amd::InvalidDeviceId); + } + break; + } + default: + return false; + break; + } + // Find the next location in the query + ++idx; + } + } else { + LogError("GetSvmAttributes() failed, because no HMM support"); return false; } - uint32_t idx = 0; - uint32_t rocr_attr = 0; - for (size_t i = 0; i < num_attributes; ++i) { - const auto& it = attr[rocr_attr]; - switch (attributes[i]) { - case amd::MemRangeAttribute::ReadMostly: - if (data_sizes[idx] != sizeof(uint32_t)) { - return false; - } - // Cast ROCr value into the hip format - *reinterpret_cast(data[idx]) = - (static_cast(it.value) > 0) ? true : false; - break; - // The logic should be identical for the both queries - case amd::MemRangeAttribute::PreferredLocation: - case amd::MemRangeAttribute::LastPrefetchLocation: - if (data_sizes[idx] != sizeof(uint32_t)) { - return false; - } - *reinterpret_cast(data[idx]) = static_cast(amd::InvalidDeviceId); - // Find device agent returned by ROCr - for (auto& device : devices()) { - if (static_cast(device)->getBackendDevice().handle == it.value) { - *reinterpret_cast(data[idx]) = static_cast(device->index()); - } - } - // Find CPU agent returned by ROCr - for (auto& agent_info : getCpuAgents()) { - if (agent_info.agent.handle == it.value) { - *reinterpret_cast(data[idx]) = static_cast(amd::CpuDeviceId); - } - } - break; - case amd::MemRangeAttribute::AccessedBy: { - uint32_t entry = 0; - uint32_t device_count = data_sizes[idx] / 4; - // Make sure it's multiple of 4 - if (data_sizes[idx] % 4 != 0) { - return false; - } - for (uint32_t att = 0; att < accessed_by; ++att) { - const auto& it = attr[rocr_attr + att]; - if (entry >= device_count) { - // The size of the array is less than the amount of available devices - break; - } - switch (it.attribute) { - case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE: - case HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS: - break; - case 
HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE: - reinterpret_cast(data[idx])[entry] = static_cast(amd::InvalidDeviceId); - // Find device agent returned by ROCr - for (auto& device : devices()) { - if (static_cast(device)->getBackendDevice().handle == it.value) { - reinterpret_cast(data[idx])[entry] = static_cast(device->index()); - } - } - // Find CPU agent returned by ROCr - for (auto& agent_info : getCpuAgents()) { - if (agent_info.agent.handle == it.value) { - reinterpret_cast(data[idx])[entry] = static_cast(amd::CpuDeviceId); - } - } - ++entry; - break; - default: - LogWarning("Unexpected result from HSA_AMD_SVM_ATTRIB_ACCESS_QUERY"); - break; - } - } - rocr_attr += accessed_by; - for (uint32_t idx = entry; idx < device_count; ++idx) { - reinterpret_cast(data[idx])[idx] = static_cast(amd::InvalidDeviceId); - } - break; - } - default: - return false; - break; - } - // Find the next location in the query - ++idx; - } -#endif // AMD_HMM_SUPPORT return true; } @@ -2428,35 +2439,42 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes, bool Device::SvmAllocInit(void* memory, size_t size) const { amd::MemoryAdvice advice = amd::MemoryAdvice::SetAccessedBy; constexpr bool kFirstAlloc = true; - SetSvmAttributesInt(memory, size, advice, kFirstAlloc); + if (!SetSvmAttributesInt(memory, size, advice, kFirstAlloc)) { + return false; + } if (settings().hmmFlags_ & Settings::Hmm::EnableSystemMemory) { advice = amd::MemoryAdvice::UnsetPreferredLocation; - SetSvmAttributesInt(memory, size, advice); + if (!SetSvmAttributesInt(memory, size, advice)) { + return false; + } } if ((settings().hmmFlags_ & Settings::Hmm::EnableMallocPrefetch) == 0) { return true; } -#if AMD_HMM_SUPPORT - // Initialize signal for the barrier - hsa_signal_store_relaxed(prefetch_signal_, kInitSignalValueOne); + if (info().hmmSupported_) { + // Initialize signal for the barrier + hsa_signal_store_relaxed(prefetch_signal_, kInitSignalValueOne); - // Initiate a prefetch command 
which should force memory update in HMM - hsa_status_t status = hsa_amd_svm_prefetch_async(memory, size, getBackendDevice(), - 0, nullptr, prefetch_signal_); - if (status != HSA_STATUS_SUCCESS) { - LogError("hsa_amd_svm_attributes_get() failed"); - return false; + // Initiate a prefetch command which should force memory update in HMM + hsa_status_t status = hsa_amd_svm_prefetch_async(memory, size, getBackendDevice(), + 0, nullptr, prefetch_signal_); + if (status != HSA_STATUS_SUCCESS) { + LogError("hsa_amd_svm_prefetch_async() failed"); + return false; + } + + // Wait for the prefetch + if (!WaitForSignal(prefetch_signal_)) { + LogError("Barrier packet submission failed"); + return false; + } + } else { + LogWarning("Early prefetch failed, because no HMM support"); } - // Wait for the prefetch - if (!WaitForSignal(prefetch_signal_)) { - LogError("Barrier packet submission failed"); - return false; - } -#endif // AMD_HMM_SUPPORT return true; } diff --git a/projects/clr/rocclr/device/rocm/rocmemory.cpp b/projects/clr/rocclr/device/rocm/rocmemory.cpp index e34eaa8785..c9c2d7bc6f 100644 --- a/projects/clr/rocclr/device/rocm/rocmemory.cpp +++ b/projects/clr/rocclr/device/rocm/rocmemory.cpp @@ -632,13 +632,13 @@ void Buffer::destroy() { if (kind_ != MEMORY_KIND_PTRGIVEN) { if (isFineGrain) { if (memFlags & CL_MEM_ALLOC_HOST_PTR) { - #if AMD_HMM_SUPPORT - // AMD HMM path. Destroy system memory - amd::Os::uncommitMemory(deviceMemory_, size()); - amd::Os::releaseMemory(deviceMemory_, size()); - #else - dev().hostFree(deviceMemory_, size());; - #endif // AMD_HMM_SUPPORT + if (dev().info().hmmSupported_) { + // AMD HMM path. 
Destroy system memory + amd::Os::uncommitMemory(deviceMemory_, size()); + amd::Os::releaseMemory(deviceMemory_, size()); + } else { + dev().hostFree(deviceMemory_, size()); + } } else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) { if (HSA_STATUS_SUCCESS != hsa_signal_destroy(signal_)) { ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, @@ -727,17 +727,20 @@ bool Buffer::create() { if (owner()->getSvmPtr() == reinterpret_cast(1)) { if (isFineGrain) { if (memFlags & CL_MEM_ALLOC_HOST_PTR) { -#if AMD_HMM_SUPPORT - // AMD HMM path. Just allocate system memory and KFD will manage it - deviceMemory_ = amd::Os::reserveMemory( - 0, size(), amd::Os::pageSize(), amd::Os::MEM_PROT_RW); - amd::Os::commitMemory(deviceMemory_, size(), amd::Os::MEM_PROT_RW); - // Currently HMM requires cirtain initial calls to mark sysmem allocation as - // GPU accessible or prefetch memory into GPU - dev().SvmAllocInit(deviceMemory_, size()); -#else - deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics); -#endif // AMD_HMM_SUPPORT + if (dev().info().hmmSupported_) { + // AMD HMM path. 
Just allocate system memory and KFD will manage it + deviceMemory_ = amd::Os::reserveMemory( + 0, size(), amd::Os::pageSize(), amd::Os::MEM_PROT_RW); + amd::Os::commitMemory(deviceMemory_, size(), amd::Os::MEM_PROT_RW); + // Currently HMM requires cirtain initial calls to mark sysmem allocation as + // GPU accessible or prefetch memory into GPU + if (!dev().SvmAllocInit(deviceMemory_, size())) { + ClPrint(amd::LOG_ERROR, amd::LOG_MEM, "SVM init in ROCr failed!"); + return false; + } + } else { + deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics); + } } else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) { deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) != 0); } else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) { diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp index 5b361beaf1..4e355e4759 100644 --- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp +++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp @@ -1463,34 +1463,36 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd) { void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); -#if AMD_HMM_SUPPORT profilingBegin(cmd); - // Initialize signal for the barrier - hsa_signal_t* wait_event = Barriers().WaitingSignal(HwQueueEngine::Unknown); - hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); - uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1; - // Find the requested agent for the transfer - hsa_agent_t agent = (cmd.cpu_access() || - (dev().settings().hmmFlags_ & Settings::Hmm::EnableSystemMemory)) ? 
- dev().getCpuAgent() : gpu_device(); + if (dev().info().hmmSupported_) { + // Initialize signal for the barrier + hsa_signal_t* wait_event = Barriers().WaitingSignal(HwQueueEngine::Unknown); + hsa_signal_t active = Barriers().ActiveSignal(kInitSignalValueOne, timestamp_); + uint32_t num_wait_events = (wait_event == nullptr) ? 0 : 1; - // Initiate a prefetch command - hsa_status_t status = hsa_amd_svm_prefetch_async( - const_cast(cmd.dev_ptr()), cmd.count(), agent, num_wait_events, wait_event, active); + // Find the requested agent for the transfer + hsa_agent_t agent = (cmd.cpu_access() || + (dev().settings().hmmFlags_ & Settings::Hmm::EnableSystemMemory)) ? + dev().getCpuAgent() : gpu_device(); - // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution - if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) { - Barriers().ResetCurrentSignal(); - LogError("hsa_amd_svm_prefetch_async failed"); - cmd.setStatus(CL_INVALID_OPERATION); + // Initiate a prefetch command + hsa_status_t status = hsa_amd_svm_prefetch_async( + const_cast(cmd.dev_ptr()), cmd.count(), agent, num_wait_events, wait_event, active); + + // Wait for the prefetch. Should skip wait, but may require extra tracking for kernel execution + if ((status != HSA_STATUS_SUCCESS) || !Barriers().WaitCurrent()) { + Barriers().ResetCurrentSignal(); + LogError("hsa_amd_svm_prefetch_async failed"); + cmd.setStatus(CL_INVALID_OPERATION); + } + + // Add system scope, since the prefetch scope is unclear + addSystemScope(); + } else { + LogWarning("hsa_amd_svm_prefetch_async is ignored, because no HMM support"); } - - // Add system scope, since the prefetch scope is unclear - addSystemScope(); - profilingEnd(cmd); -#endif // AMD_HMM_SUPPORT } // ================================================================================================