SWDEV-272496 - Fix multiple timing issues

- Don't notify if the batch is empty, because that means
the current command was processed already.
- Disable pinning optimization to avoid a race condition on stall.
- TS marker submition requires extra AQL barrier
to track the status.

Change-Id: I17eff4ad12ac66cfe1bb44048bebb1891805279d


[ROCm/clr commit: 24299e25bd]
This commit is contained in:
German Andryeyev
2021-02-26 16:17:30 -05:00
committato da Saleel Kudchadker
parent d82e14bf2c
commit f6cc68deac
4 ha cambiato i file con 43 aggiunte e 29 eliminazioni
@@ -750,7 +750,7 @@ bool Buffer::create() {
"[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal creation failed");
return false;
}
volatile hsa_signal_value_t* signalValuePtr;
volatile hsa_signal_value_t* signalValuePtr = nullptr;
if (HSA_STATUS_SUCCESS != hsa_amd_signal_value_pointer(signal_, &signalValuePtr)) {
ClPrint(amd::LOG_ERROR, amd::LOG_MEM,
"[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY pointer query failed");
@@ -146,15 +146,13 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
((thread = new amd::HostThread()) != nullptr && thread == amd::Thread::current()))) {
return false;
}
amd::ScopedLock sl(ts->gpu()->execution());
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(%lx)\n",
static_cast<uint32_t>(value), arg, ts->HwProfiling() ? ts->Signals()[0]->signal_.handle : 0);
// Update the batch, since signal is complete
if (ts->Signals().size() > 0) {
amd::ScopedLock sl(ts->gpu()->execution());
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(%lx)\n",
static_cast<uint32_t>(value), arg, ts->Signals()[0]->signal_.handle);
ts->gpu()->updateCommandsState(ts->command().GetBatchHead());
} else {
LogError("Error: ROCr handler was called for untracked signal!");
}
ts->gpu()->updateCommandsState(ts->command().GetBatchHead());
// Return false, so the callback will not be called again for this signal
return false;
}
@@ -331,6 +329,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
prof_signal->done_ = false;
prof_signal->engine_ = engine_;
if (ts != 0) {
// Save HSA signal earlier to make sure the possible callback will have a valid
// value for processing
prof_signal->ts_ = ts;
ts->AddProfilingSignal(prof_signal);
// If direct dispatch is enabled and the batch head isn't null, then it's a marker and
// requires the batch update upon HSA signal completion
if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) {
@@ -347,8 +349,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
hsa_amd_profiling_async_copy_enable(true);
sdma_profiling_ = true;
}
prof_signal->ts_ = ts;
ts->AddProfilingSignal(prof_signal);
}
return prof_signal->signal_;
}
@@ -1127,7 +1127,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
}
// ================================================================================================
void VirtualGPU::updateCommandsState(amd::Command* list) {
void VirtualGPU::updateCommandsState(amd::Command* list) const {
Timestamp* ts = nullptr;
amd::Command* current = list;
@@ -2802,6 +2802,7 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
// std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
}
// ================================================================================================
void VirtualGPU::submitMarker(amd::Marker& vcmd) {
if (vcmd.profilingInfo().marker_ts_) {
profilingBegin(vcmd);
@@ -2811,11 +2812,16 @@ void VirtualGPU::submitMarker(amd::Marker& vcmd) {
// for cache flushes explicitly and helps wall time
uint16_t header = kNopPacketHeader;
dispatchBarrierPacket(&barrier_packet_, header);
// Direct dispatch requires a barrier with callback and hasPendingDispatch_ triggers that
if (AMD_DIRECT_DISPATCH) {
hasPendingDispatch_ = true;
}
}
profilingEnd(vcmd);
}
}
// ================================================================================================
void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
@@ -2856,8 +2862,9 @@ void VirtualGPU::flush(amd::Command* list, bool wait) {
// If runtime didn't submit a barrier, then it can't track the completion of the batch.
// Hence runtime either has to insert a barrier unconditionally or have a CPU wait.
// Due to performance impact of extra barriers CPU wait is selected.
// CPU wait is also forced if pinned buffers were used
skip_cpu_wait &= hasPendingDispatch_ && (pinnedMems_.size() == 0);
// Note: if callback will be selected to update the batch status,
// then the host thread can't update it also, otherwise double free may occur
skip_cpu_wait &= hasPendingDispatch_;
releaseGpuMemoryFence(force_barrier, skip_cpu_wait);
profilingEnd(*current);
@@ -2910,14 +2917,18 @@ void VirtualGPU::addPinnedMem(amd::Memory* mem) {
//! @note: ROCr backend doesn't have per resource busy tracking, hence runtime has to wait
//! unconditionally, before it can release pinned memory
releaseGpuMemoryFence();
if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
if (pinnedMems_.size() > 7) {
pinnedMems_.front()->release();
pinnedMems_.erase(pinnedMems_.begin());
}
if (!AMD_DIRECT_DISPATCH) {
if (nullptr == findPinnedMem(mem->getHostMem(), mem->getSize())) {
if (pinnedMems_.size() > 7) {
pinnedMems_.front()->release();
pinnedMems_.erase(pinnedMems_.begin());
}
// Delay destruction
pinnedMems_.push_back(mem);
// Delay destruction
pinnedMems_.push_back(mem);
}
} else {
mem->release();
}
}
@@ -36,7 +36,6 @@ class Device;
class Memory;
class Timestamp;
struct ProfilingSignal : public amd::HeapObject {
hsa_signal_t signal_; //!< HSA signal to track profiling information
Timestamp* ts_; //!< Timestamp object associated with the signal
@@ -98,7 +97,7 @@ class Timestamp : public amd::HeapObject {
, gpu_(gpu)
, command_(command) {}
virtual ~Timestamp() {}
~Timestamp() {}
uint64_t getStart() {
checkGpuTime();
@@ -236,7 +235,7 @@ class VirtualGPU : public device::VirtualDevice {
void profilingBegin(amd::Command& command, bool drmProfiling = false);
void profilingEnd(amd::Command& command);
void updateCommandsState(amd::Command* list);
void updateCommandsState(amd::Command* list) const;
void submitReadMemory(amd::ReadMemoryCommand& cmd);
void submitWriteMemory(amd::WriteMemoryCommand& cmd);
+9 -5
Vedi File
@@ -213,13 +213,17 @@ bool Event::awaitCompletion() {
// ================================================================================================
bool Event::notifyCmdQueue() {
HostQueue* queue = command().queue();
if ((status() > CL_COMPLETE) &&
// Don't need to notify any marker with direct dispatch,
// because all markers are blocking
if ((status() > CL_COMPLETE) && (nullptr != queue) &&
(!AMD_DIRECT_DISPATCH ||
// Don't need to notify any marker with direct dispatch,
// because all markers are blocking.
((command().type() != CL_COMMAND_MARKER) &&
(command().type() != 0))) &&
(nullptr != queue) && !notified_.test_and_set()) {
(command().type() != 0)) ||
// Don't need to notify if the current batch is empty,
// because that means the command was processed and extra notification
// will cause a stall on the host.
(queue->GetSubmittionBatch() != nullptr)) &&
!notified_.test_and_set()) {
// Make sure the queue is draining the enqueued commands.
amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
if (command == NULL) {