clr: Allow all engines but prefer recommended engines (#1750)

* Also honor ROC_P2P_SDMA_SIZE for IPC, since IPC can also mean P2P
Этот коммит содержится в:
SaleelK
2025-11-10 13:10:46 -08:00
коммит произвёл GitHub
родитель 3883bd3e93
Коммит 5e418ca256
3 изменённых файлов: 50 добавлений и 41 удалений
+47 -31
Просмотреть файл
@@ -470,6 +470,21 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem
return result;
}
// Pick exactly one SDMA engine out of the set that is currently free.
//   freeMask      - bitmask of engines that are free right now
//   preferredMask - bitmask of engines to favor (the high-bandwidth engines)
// Returns a single-bit mask: the lowest free engine that is also preferred, or, when no
// preferred engine is free, the lowest free engine of any kind. Returns 0 if nothing is free.
static inline uint32_t selectSdmaEngine(uint32_t freeMask, uint32_t preferredMask) {
  // Narrow the candidates to the preferred engines whenever at least one of them is free;
  // otherwise every free engine (including the slower ones) stays a candidate
  const uint32_t preferredAndFree = freeMask & preferredMask;
  const uint32_t candidates = (preferredAndFree != 0) ? preferredAndFree : freeMask;

  // (candidates & (candidates - 1)) clears the lowest set bit; subtracting that from the
  // original isolates the lowest set bit. An empty mask naturally produces 0.
  return candidates - (candidates & (candidates - 1));
}
// ================================================================================================
inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, const_address src,
hsa_agent_t& srcAgent, size_t size,
@@ -508,31 +523,33 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
copyMask = gpu().getLastUsedSdmaEngine();
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask);
copyMask &= (engine == HwQueueEngine::SdmaRead ? sdmaEngineReadMask_ : sdmaEngineWriteMask_);
if (copyMask == 0) {
// Check SDMA engine status
status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
// Get the mask of valid engines for this operation (read or write)
uint32_t validEngineMask =
(engine == HwQueueEngine::SdmaRead ? sdmaEngineReadMask_ : sdmaEngineWriteMask_);
if (status == HSA_STATUS_SUCCESS) {
status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &recIdMask);
}
// Check SDMA engine status to get currently free engines
status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
if (status == HSA_STATUS_SUCCESS) {
status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &recIdMask);
}
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"Query copy engine status %x, srcAgent %p, "
"dstAgent %p, free_engine_mask 0x%x, rec_engine_mask 0x%x",
status, srcAgent.handle, dstAgent.handle, freeEngineMask, recIdMask);
// Constrain to valid engines for this operation
freeEngineMask &= validEngineMask;
recIdMask &= validEngineMask;
if (freeEngineMask != 0) {
// Use priority-based scheduling: prefer high-bandwidth engines (recIdMask)
copyMask = selectSdmaEngine(freeEngineMask, recIdMask);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"Query copy engine status %x, srcAgent %p, "
"dstAgent %p, free_engine_mask 0x%x, rec_engine_mask 0x%x",
status, srcAgent.handle, dstAgent.handle, freeEngineMask, recIdMask);
// If requested engine is valid and available, use it
if (recIdMask != 0 && (freeEngineMask & recIdMask) != 0) {
copyMask = recIdMask - (recIdMask & (recIdMask - 1));
} else {
// Otherwise use first available engine
copyMask = freeEngineMask - (freeEngineMask & (freeEngineMask - 1));
}
gpu().setLastUsedSdmaEngine(copyMask);
"Selected SDMA engine: free_mask=0x%x, preferred_mask=0x%x, selected_mask=0x%x",
freeEngineMask, recIdMask, copyMask);
}
if (copyMask != 0 && status == HSA_STATUS_SUCCESS) {
@@ -2259,16 +2276,16 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
amd::CopyMetadata copyMetadata) const {
amd::ScopedLock k(lockXferOps_);
bool result = false;
bool p2p = false;
uint32_t blitWg = dev().settings().limit_blit_wg_;
if (&gpuMem(srcMemory).dev() != &gpuMem(dstMemory).dev()) {
if (sizeIn[0] > dev().settings().sdma_p2p_threshold_) {
p2p = true;
} else {
constexpr uint32_t kLimitWgForKernelP2p = 16;
blitWg = kLimitWgForKernelP2p;
}
bool isP2pOrIpc = (&gpuMem(srcMemory).dev() != &gpuMem(dstMemory).dev()) ||
srcMemory.owner()->ipcShared() || dstMemory.owner()->ipcShared();
// Use SDMA for large P2P/IPC transfers, shader for small ones
if (isP2pOrIpc && sizeIn[0] <= dev().settings().sdma_p2p_threshold_) {
constexpr uint32_t kLimitWgForKernelP2p = 16;
blitWg = kLimitWgForKernelP2p;
isP2pOrIpc = false;
}
// Determine if we should use shader copy path based on various conditions
@@ -2281,7 +2298,6 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::BLIT;
// Check memory access patterns
bool isP2pOrIpc = p2p || srcMemory.owner()->ipcShared() || dstMemory.owner()->ipcShared();
bool neitherMemoryIsHostDirectAccess =
!srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess();
+3 -7
Просмотреть файл
@@ -233,9 +233,6 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
auto gpu = ts->gpu();
gpu->QueuedAsyncHandlers()--;
// Reset last used SDMA engine mask
gpu->setLastUsedSdmaEngine(0);
bool isBlocking = ts->GetBlocking();
// Update the batch, since signal is complete
@@ -1612,10 +1609,9 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_),
cuMask_(cuMask),
priority_(priority),
copy_command_type_(0),
fence_state_(Device::CacheState::kCacheStateInvalid),
fence_dirty_(false),
lastUsedSdmaEngineMask_(0) {
copy_command_type_(0),
fence_state_(Device::CacheState::kCacheStateInvalid),
fence_dirty_(false) {
index_ = device.numOfVgpus_++;
gpu_device_ = device.getBackendDevice();
printfdbg_ = nullptr;
-3
Просмотреть файл
@@ -453,8 +453,6 @@ class VirtualGPU : public device::VirtualDevice {
void WaitCompleteSignal(hsa_signal_t signal);
void HiddenHeapInit();
//! Stores \p mask as the last used SDMA engine mask (atomic store)
void setLastUsedSdmaEngine(uint32_t mask) {
  lastUsedSdmaEngineMask_.store(mask);
}
//! Returns the currently stored last used SDMA engine mask (atomic load)
uint32_t getLastUsedSdmaEngine() const {
  const uint32_t storedMask = lastUsedSdmaEngineMask_.load();
  return storedMask;
}
uint64_t getQueueID();
//! Analyzes a crashed AQL queue to find a broken AQL packet
@@ -623,7 +621,6 @@ class VirtualGPU : public device::VirtualDevice {
//!< kUnknown/kFlushedToDevice/kFlushedToSystem
std::atomic<bool> fence_dirty_; //!< Fence modified flag
std::atomic<uint> lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask
uint64_t last_write_index_ = 0; //!< The last HW queue write index for any packet
uint64_t last_barrier_index_ = 0; //!< The last HW queue write index for a packet
//!< with a completion signal