SWDEV-301667 - Track last used SDMA engine per queue
- Track the last used SDMA engine per queue; this results in better scheduling
- Reset the last used SDMA engine upon batch completion. That ensures we don't get
blocked if the same engine is used by another concurrent copy.
Change-Id: Id53111980da7ee41d5c932fb44e4aab5b1e065a3
[ROCm/clr commit: bf8baeecb3]
Этот коммит содержится в:
@@ -705,12 +705,18 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
|
||||
if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
|
||||
if (copyMask == 0) {
|
||||
// Check SDMA engine status
|
||||
status = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Query copy engine status %x, free_engine mask 0x%x",
|
||||
status, freeEngineMask);
|
||||
// Return a mask with the rightmost bit set
|
||||
copyMask = freeEngineMask - (freeEngineMask & (freeEngineMask - 1));
|
||||
// Check if there is a recently used SDMA engine for the stream
|
||||
copyMask = gpu().getLastUsedSdmaEngine();
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask);
|
||||
if (copyMask == 0) {
|
||||
// Check SDMA engine status
|
||||
status = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Query copy engine status %x, free_engine mask 0x%x",
|
||||
status, freeEngineMask);
|
||||
// Return a mask with the rightmost bit set
|
||||
copyMask = freeEngineMask - (freeEngineMask & (freeEngineMask - 1));
|
||||
gpu().setLastUsedSdmaEngine(copyMask);
|
||||
}
|
||||
}
|
||||
|
||||
if (copyMask != 0 && status == HSA_STATUS_SUCCESS) {
|
||||
@@ -718,7 +724,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
|
||||
hsa_amd_sdma_engine_id_t copyEngine = static_cast<hsa_amd_sdma_engine_id_t>(copyMask);
|
||||
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
|
||||
"HSA Async Copy on copy_engine=%x, dst=0x%zx, src=0x%zx, "
|
||||
"HSA Async Copy on copy_engine=0x%x, dst=0x%zx, src=0x%zx, "
|
||||
"size=%ld, wait_event=0x%zx, completion_signal=0x%zx", copyEngine,
|
||||
dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0,
|
||||
active.handle);
|
||||
@@ -2357,10 +2363,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
|
||||
amd::ScopedLock k(lockXferOps_);
|
||||
bool result = false;
|
||||
constexpr size_t kFillImageThreshold = 256 * 256;
|
||||
|
||||
|
||||
// Use host fill if memory has direct access and image is small
|
||||
if (setup_.disableFillImage_ ||
|
||||
(gpuMem(memory).isHostMemDirectAccess() &&
|
||||
(gpuMem(memory).isHostMemDirectAccess() &&
|
||||
(size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
@@ -214,6 +214,9 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
|
||||
// Update the batch, since signal is complete
|
||||
ts->gpu()->updateCommandsState(ts->command().GetBatchHead());
|
||||
|
||||
// Reset last used SDMA engine mask
|
||||
ts->gpu()->setLastUsedSdmaEngine(0);
|
||||
|
||||
// Reset API callback signal. It will release AQL queue and start commands processing
|
||||
if (callback_signal.handle != 0) {
|
||||
hsa_signal_subtract_relaxed(callback_signal, 1);
|
||||
@@ -1184,7 +1187,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
priority_(priority),
|
||||
copy_command_type_(0),
|
||||
fence_state_(Device::CacheState::kCacheStateInvalid),
|
||||
fence_dirty_(false)
|
||||
fence_dirty_(false),
|
||||
lastUsedSdmaEngineMask_(0)
|
||||
{
|
||||
index_ = device.numOfVgpus_++;
|
||||
gpu_device_ = device.getBackendDevice();
|
||||
|
||||
@@ -419,6 +419,8 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
void* allocKernArg(size_t size, size_t alignment);
|
||||
//! Reports whether the fence-modified flag (fence_dirty_) is currently set
bool isFenceDirty() const {
  const bool dirty = fence_dirty_;
  return dirty;
}
|
||||
//! Clears the fence-modified flag (fence_dirty_)
void resetFenceDirty() {
  fence_dirty_ = false;
}
|
||||
//! Records \p mask as the SDMA engine mask last used by this queue.
//! A value of 0 clears the record, so the next copy queries the engine status again.
void setLastUsedSdmaEngine(uint32_t mask) {
  // Explicit atomic store; default (sequentially consistent) ordering
  lastUsedSdmaEngineMask_.store(mask);
}
|
||||
//! Returns the SDMA engine mask last recorded for this queue.
//! A value of 0 means no engine is currently recorded.
uint32_t getLastUsedSdmaEngine() const {
  // Atomic read with the default (sequentially consistent) ordering
  const uint32_t lastMask = lastUsedSdmaEngineMask_.load();
  return lastMask;
}
|
||||
// } roc OpenCL integration
|
||||
private:
|
||||
//! Dispatches a barrier with blocking HSA signals
|
||||
@@ -567,5 +569,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
int fence_state_; //!< Fence scope
|
||||
//!< kUnknown/kFlushedToDevice/kFlushedToSystem
|
||||
bool fence_dirty_; //!< Fence modified flag
|
||||
|
||||
std::atomic<uint> lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask
|
||||
};
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user