SWDEV-301667 - Track last used SDMA engine per queue

- Track the last used SDMA engine per queue; this results in better scheduling
- Reset the last used SDMA engine upon batch completion. This ensures we don't get
blocked if the same engine is used by another concurrent copy

Change-Id: Id53111980da7ee41d5c932fb44e4aab5b1e065a3


[ROCm/clr commit: bf8baeecb3]
Этот коммит содержится в:
Saleel Kudchadker
2023-10-09 23:35:11 +00:00
родитель 661b3381da
Коммит 0ec2d8847d
3 изменённых файлов: 24 добавлений и 10 удалений
+15 -9
Просмотреть файл
@@ -705,12 +705,18 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
if (copyMask == 0) {
// Check SDMA engine status
status = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Query copy engine status %x, free_engine mask 0x%x",
status, freeEngineMask);
// Return a mask with the rightmost bit set
copyMask = freeEngineMask - (freeEngineMask & (freeEngineMask - 1));
// Check if there is a recently used SDMA engine for the stream
copyMask = gpu().getLastUsedSdmaEngine();
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask);
if (copyMask == 0) {
// Check SDMA engine status
status = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Query copy engine status %x, free_engine mask 0x%x",
status, freeEngineMask);
// Return a mask with the rightmost bit set
copyMask = freeEngineMask - (freeEngineMask & (freeEngineMask - 1));
gpu().setLastUsedSdmaEngine(copyMask);
}
}
if (copyMask != 0 && status == HSA_STATUS_SUCCESS) {
@@ -718,7 +724,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
hsa_amd_sdma_engine_id_t copyEngine = static_cast<hsa_amd_sdma_engine_id_t>(copyMask);
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"HSA Async Copy on copy_engine=%x, dst=0x%zx, src=0x%zx, "
"HSA Async Copy on copy_engine=0x%x, dst=0x%zx, src=0x%zx, "
"size=%ld, wait_event=0x%zx, completion_signal=0x%zx", copyEngine,
dst, src, size[0], (wait_events.size() != 0) ? wait_events[0].handle : 0,
active.handle);
@@ -2357,10 +2363,10 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
amd::ScopedLock k(lockXferOps_);
bool result = false;
constexpr size_t kFillImageThreshold = 256 * 256;
// Use host fill if memory has direct access and image is small
if (setup_.disableFillImage_ ||
(gpuMem(memory).isHostMemDirectAccess() &&
(gpuMem(memory).isHostMemDirectAccess() &&
(size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) {
// Stall GPU before CPU access
gpu().releaseGpuMemoryFence();
+5 -1
Просмотреть файл
@@ -214,6 +214,9 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
// Update the batch, since signal is complete
ts->gpu()->updateCommandsState(ts->command().GetBatchHead());
// Reset last used SDMA engine mask
ts->gpu()->setLastUsedSdmaEngine(0);
// Reset API callback signal. It will release AQL queue and start commands processing
if (callback_signal.handle != 0) {
hsa_signal_subtract_relaxed(callback_signal, 1);
@@ -1184,7 +1187,8 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
priority_(priority),
copy_command_type_(0),
fence_state_(Device::CacheState::kCacheStateInvalid),
fence_dirty_(false)
fence_dirty_(false),
lastUsedSdmaEngineMask_(0)
{
index_ = device.numOfVgpus_++;
gpu_device_ = device.getBackendDevice();
+4
Просмотреть файл
@@ -419,6 +419,8 @@ class VirtualGPU : public device::VirtualDevice {
void* allocKernArg(size_t size, size_t alignment);
//! Returns the current value of the fence modified flag
bool isFenceDirty() const {
  return fence_dirty_;
}
//! Clears the fence modified flag
void resetFenceDirty() {
  fence_dirty_ = false;
}
//! Stores the last used SDMA engine mask for this queue.
//! A mask of 0 means no engine is remembered.
void setLastUsedSdmaEngine(uint32_t mask) {
  lastUsedSdmaEngineMask_.store(mask);
}
//! Returns the last used SDMA engine mask (read atomically); 0 if none is recorded
uint32_t getLastUsedSdmaEngine() const {
  const uint32_t lastMask = lastUsedSdmaEngineMask_;
  return lastMask;
}
// } roc OpenCL integration
private:
//! Dispatches a barrier with blocking HSA signals
@@ -567,5 +569,7 @@ class VirtualGPU : public device::VirtualDevice {
int fence_state_; //!< Fence scope
//!< kUnknown/kFlushedToDevice/kFlushedToSystem
bool fence_dirty_; //!< Fence modified flag
std::atomic<uint> lastUsedSdmaEngineMask_; //!< Last Used SDMA Engine mask
};
}