clr: Allow all engines but prefer recommended engines (#1750)
* Also honor ROC_P2P_SDMA_SIZE for IPC, since IPC can also mean P2P
Этот коммит содержится в:
@@ -470,6 +470,21 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem
|
||||
return result;
|
||||
}
|
||||
|
||||
// Select a single SDMA engine out of the currently free engines.
//
// Engines are represented as bits in a mask. A free engine that is also present in
// preferredMask (the recommended, high-bandwidth engines) wins; if no preferred engine is
// free, any free engine is used instead. Within the chosen set the lowest set bit is picked.
//
// freeMask       Bit mask of engines that are currently free
// preferredMask  Bit mask of engines that should be tried first
//
// Returns a mask with exactly one bit set for the selected engine, or 0 if freeMask is empty.
static inline uint32_t selectSdmaEngine(uint32_t freeMask, uint32_t preferredMask) {
  // No free engine at all: report an empty selection
  if (freeMask == 0) return 0;

  // Try preferred engines first, fall back to the remaining (slower) free engines
  const uint32_t preferredFree = freeMask & preferredMask;
  const uint32_t candidates = (preferredFree != 0) ? preferredFree : freeMask;

  // x & (~x + 1) isolates the lowest set bit of x (two's complement negate in unsigned math)
  return candidates & (~candidates + 1);
}
|
||||
|
||||
// ================================================================================================
|
||||
inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, const_address src,
|
||||
hsa_agent_t& srcAgent, size_t size,
|
||||
@@ -508,31 +523,33 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
|
||||
if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
|
||||
copyMask = gpu().getLastUsedSdmaEngine();
|
||||
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask);
|
||||
copyMask &= (engine == HwQueueEngine::SdmaRead ? sdmaEngineReadMask_ : sdmaEngineWriteMask_);
|
||||
if (copyMask == 0) {
|
||||
// Check SDMA engine status
|
||||
status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
|
||||
// Get the mask of valid engines for this operation (read or write)
|
||||
uint32_t validEngineMask =
|
||||
(engine == HwQueueEngine::SdmaRead ? sdmaEngineReadMask_ : sdmaEngineWriteMask_);
|
||||
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &recIdMask);
|
||||
}
|
||||
// Check SDMA engine status to get currently free engines
|
||||
status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
|
||||
|
||||
if (status == HSA_STATUS_SUCCESS) {
|
||||
status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &recIdMask);
|
||||
}
|
||||
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
|
||||
"Query copy engine status %x, srcAgent %p, "
|
||||
"dstAgent %p, free_engine_mask 0x%x, rec_engine_mask 0x%x",
|
||||
status, srcAgent.handle, dstAgent.handle, freeEngineMask, recIdMask);
|
||||
|
||||
// Constrain to valid engines for this operation
|
||||
freeEngineMask &= validEngineMask;
|
||||
recIdMask &= validEngineMask;
|
||||
|
||||
if (freeEngineMask != 0) {
|
||||
// Use priority-based scheduling: prefer high-bandwidth engines (recIdMask)
|
||||
copyMask = selectSdmaEngine(freeEngineMask, recIdMask);
|
||||
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
|
||||
"Query copy engine status %x, srcAgent %p, "
|
||||
"dstAgent %p, free_engine_mask 0x%x, rec_engine_mask 0x%x",
|
||||
status, srcAgent.handle, dstAgent.handle, freeEngineMask, recIdMask);
|
||||
|
||||
// If requested engine is valid and available, use it
|
||||
if (recIdMask != 0 && (freeEngineMask & recIdMask) != 0) {
|
||||
copyMask = recIdMask - (recIdMask & (recIdMask - 1));
|
||||
} else {
|
||||
// Otherwise use first available engine
|
||||
copyMask = freeEngineMask - (freeEngineMask & (freeEngineMask - 1));
|
||||
}
|
||||
|
||||
gpu().setLastUsedSdmaEngine(copyMask);
|
||||
"Selected SDMA engine: free_mask=0x%x, preferred_mask=0x%x, selected_mask=0x%x",
|
||||
freeEngineMask, recIdMask, copyMask);
|
||||
}
|
||||
|
||||
if (copyMask != 0 && status == HSA_STATUS_SUCCESS) {
|
||||
@@ -2259,16 +2276,16 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
|
||||
amd::CopyMetadata copyMetadata) const {
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
bool p2p = false;
|
||||
uint32_t blitWg = dev().settings().limit_blit_wg_;
|
||||
|
||||
if (&gpuMem(srcMemory).dev() != &gpuMem(dstMemory).dev()) {
|
||||
if (sizeIn[0] > dev().settings().sdma_p2p_threshold_) {
|
||||
p2p = true;
|
||||
} else {
|
||||
constexpr uint32_t kLimitWgForKernelP2p = 16;
|
||||
blitWg = kLimitWgForKernelP2p;
|
||||
}
|
||||
bool isP2pOrIpc = (&gpuMem(srcMemory).dev() != &gpuMem(dstMemory).dev()) ||
|
||||
srcMemory.owner()->ipcShared() || dstMemory.owner()->ipcShared();
|
||||
|
||||
// Use SDMA for large P2P/IPC transfers, shader for small ones
|
||||
if (isP2pOrIpc && sizeIn[0] <= dev().settings().sdma_p2p_threshold_) {
|
||||
constexpr uint32_t kLimitWgForKernelP2p = 16;
|
||||
blitWg = kLimitWgForKernelP2p;
|
||||
isP2pOrIpc = false;
|
||||
}
|
||||
|
||||
// Determine if we should use shader copy path based on various conditions
|
||||
@@ -2281,7 +2298,6 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
|
||||
copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::BLIT;
|
||||
|
||||
// Check memory access patterns
|
||||
bool isP2pOrIpc = p2p || srcMemory.owner()->ipcShared() || dstMemory.owner()->ipcShared();
|
||||
bool neitherMemoryIsHostDirectAccess =
|
||||
!srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess();
|
||||
|
||||
|
||||
@@ -233,9 +233,6 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
|
||||
auto gpu = ts->gpu();
|
||||
gpu->QueuedAsyncHandlers()--;
|
||||
|
||||
// Reset last used SDMA engine mask
|
||||
gpu->setLastUsedSdmaEngine(0);
|
||||
|
||||
bool isBlocking = ts->GetBlocking();
|
||||
|
||||
// Update the batch, since signal is complete
|
||||
@@ -1612,10 +1609,9 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
managed_kernarg_buffer_(*this, device.settings().kernargPoolSize_),
|
||||
cuMask_(cuMask),
|
||||
priority_(priority),
|
||||
copy_command_type_(0),
|
||||
fence_state_(Device::CacheState::kCacheStateInvalid),
|
||||
fence_dirty_(false),
|
||||
lastUsedSdmaEngineMask_(0) {
|
||||
copy_command_type_(0),
|
||||
fence_state_(Device::CacheState::kCacheStateInvalid),
|
||||
fence_dirty_(false) {
|
||||
index_ = device.numOfVgpus_++;
|
||||
gpu_device_ = device.getBackendDevice();
|
||||
printfdbg_ = nullptr;
|
||||
|
||||
@@ -453,8 +453,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
void WaitCompleteSignal(hsa_signal_t signal);
|
||||
|
||||
void HiddenHeapInit();
|
||||
void setLastUsedSdmaEngine(uint32_t mask) { lastUsedSdmaEngineMask_ = mask; }
|
||||
uint32_t getLastUsedSdmaEngine() const { return lastUsedSdmaEngineMask_.load(); }
|
||||
uint64_t getQueueID();
|
||||
|
||||
//! Analyzes a crashed AQL queue to find a broken AQL packet
|
||||
@@ -623,7 +621,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//!< kUnknown/kFlushedToDevice/kFlushedToSystem
|
||||
std::atomic<bool> fence_dirty_; //!< Fence modified flag
|
||||
|
||||
std::atomic<uint> lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask
|
||||
uint64_t last_write_index_ = 0; //!< The last HW queue write index for any packet
|
||||
uint64_t last_barrier_index_ = 0; //!< The last HW queue write index for a packet
|
||||
//!< with a completion signal
|
||||
|
||||
Ссылка в новой задаче
Block a user