SWDEV-486602 - Optimize HSA callback performance

- Don't generate callbacks for HIP events
- Don't process profiling info in the callback for HIP events
- Wait for the CPU status update of the submitted commands
on every 50th stream synchronize call (DEBUG_HIP_BLOCK_SYNC).
That allows the runtime to drain the commands and destroy HSA signals.

Change-Id: Ib601a350e7e7c2b6c6209a172385389baccf73a9


[ROCm/clr commit: 364dfb0ed1]
Этот коммит содержится в:
German Andryeyev
2024-09-20 19:19:51 -04:00
родитель 043271a3e6
Коммит faea40cbb3
12 изменённых файлов: 58 добавлений и 54 удалений
+12 -14
Просмотреть файл
@@ -64,6 +64,7 @@ hipError_t Event::query() {
return ready() ? hipSuccess : hipErrorNotReady;
}
// ================================================================================================
hipError_t Event::synchronize() {
amd::ScopedLock lock(lock_);
@@ -76,19 +77,12 @@ hipError_t Event::synchronize() {
// Check HW status of the ROCclr event. Note: not all ROCclr modes support HW status
static constexpr bool kWaitCompletion = true;
if (!hip_device->devices()[0]->IsHwEventReady(*event_, kWaitCompletion, flags_)) {
if (event_->HwEvent() != nullptr) {
amd::Command* command = nullptr;
hipError_t status = recordCommand(command, event_->command().queue(), flags_);
command->enqueue();
hip_device->devices()[0]->IsHwEventReady(command->event(), kWaitCompletion, flags_);
command->release();
} else {
event_->awaitCompletion();
}
event_->awaitCompletion();
}
return hipSuccess;
}
// ================================================================================================
// Forwards to awaitCompletion() on the stored event_ and hands its result back to the caller.
bool Event::awaitEventCompletion() {
  const bool completed = event_->awaitCompletion();
  return completed;
}
@@ -222,8 +216,9 @@ hipError_t Event::streamWait(hipStream_t stream, uint flags) {
return hipSuccess;
}
// ================================================================================================
hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
uint32_t ext_flags ) {
uint32_t ext_flags, bool batch_flush) {
if (command == nullptr) {
int32_t releaseFlags = ((ext_flags == 0) ? flags_ : ext_flags) &
(hipEventReleaseToDevice | hipEventReleaseToSystem |
@@ -234,11 +229,12 @@ hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
releaseFlags = amd::Device::kCacheStateInvalid;
}
// Always submit a EventMarker.
command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags);
command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags, batch_flush);
}
return hipSuccess;
}
// ================================================================================================
hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
command->enqueue();
if (event_ == &command->event()) return hipSuccess;
@@ -251,11 +247,13 @@ hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command
return hipSuccess;
}
hipError_t Event::addMarker(hipStream_t stream, amd::Command* command, bool record) {
// ================================================================================================
hipError_t Event::addMarker(hipStream_t stream, amd::Command* command,
bool record, bool batch_flush) {
hip::Stream* hip_stream = hip::getStream(stream);
// Keep the lock always at the beginning of this to avoid a race. SWDEV-277847
amd::ScopedLock lock(lock_);
hipError_t status = recordCommand(command, hip_stream);
hipError_t status = recordCommand(command, hip_stream, 0, batch_flush);
if (status != hipSuccess) {
return hipSuccess;
}
@@ -415,7 +413,7 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream) {
if (g_devices[e->deviceId()]->devices()[0] != &hip_stream->device()) {
return hipErrorInvalidHandle;
}
status = e->addMarker(stream, nullptr, true);
status = e->addMarker(stream, nullptr, true, !hip::Event::kBatchFlush);
}
return status;
}
+9 -4
Просмотреть файл
@@ -81,10 +81,11 @@ typedef struct ihipIpcEventShmem_s {
class EventMarker : public amd::Marker {
public:
EventMarker(amd::HostQueue& stream, bool disableFlush, bool markerTs = false,
int32_t scope = amd::Device::kCacheStateInvalid)
int32_t scope = amd::Device::kCacheStateInvalid, bool batch_flush = true)
: amd::Marker(stream, disableFlush) {
profilingInfo_.enabled_ = true;
profilingInfo_.marker_ts_ = markerTs;
profilingInfo_.batch_flush_ = batch_flush;
profilingInfo_.clear();
setEventScope(scope);
}
@@ -101,6 +102,8 @@ class Event {
}
public:
constexpr static bool kBatchFlush = true; //!< Flushes CPU command batch in direct dispatch mode
Event(uint32_t flags) : flags_(flags), lock_(true) /* hipEvent_t lock*/,
event_(nullptr), unrecorded_(false), stream_(nullptr) {
// No need to init event_ here as addMarker does that
@@ -123,9 +126,10 @@ class Event {
virtual hipError_t streamWait(hipStream_t stream, uint flags);
virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* stream,
uint32_t flags = 0);
uint32_t flags = 0, bool batch_flush = true);
virtual hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
hipError_t addMarker(hipStream_t stream, amd::Command* command, bool record);
hipError_t addMarker(hipStream_t stream, amd::Command* command,
bool record, bool batch_flush = true);
void BindCommand(amd::Command& command, bool record) {
amd::ScopedLock lock(lock_);
@@ -217,7 +221,8 @@ class IPCEvent : public Event {
hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command);
hipError_t streamWait(hipStream_t stream, uint flags);
hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue, uint32_t flags = 0);
hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue,
uint32_t flags = 0, bool batch_flush = true) override;
hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
};
+5 -2
Просмотреть файл
@@ -139,16 +139,19 @@ hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) {
return hipSuccess;
}
// ================================================================================================
// Creates the command that records this IPC event on the given stream.
//
// command     - [out] receives the command created for the record operation.
// stream      - host queue the command is created for.
// flags       - extra event flags, handed to Event::recordCommand unchanged.
// batch_flush - handed to Event::recordCommand unchanged.
//
// Returns hipSuccess when a plain marker was created for an unrecorded event; otherwise returns
// whatever Event::recordCommand reports.
hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream,
                                   uint32_t flags, bool batch_flush) {
  if (isUnRecorded()) {
    command = new amd::Marker(*stream, kMarkerDisableFlush);
    return hipSuccess;
  }
  // Both trailing arguments are passed explicitly. With a three-argument call the bool would bind
  // to the uint32_t flags parameter of Event::recordCommand, and its batch_flush parameter would
  // silently fall back to the default value.
  return Event::recordCommand(command, stream, flags, batch_flush);
}
// ================================================================================================
hipError_t IPCEvent::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
bool unrecorded = isUnRecorded();
if (unrecorded) {
+8 -3
Просмотреть файл
@@ -23,6 +23,7 @@
#include "hip_event.hpp"
#include "thread/monitor.hpp"
#include "hip_prof_api.h"
#include <atomic>
namespace hip {
@@ -358,11 +359,15 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) {
}
}
bool wait = (stream == nullptr || stream == hipStreamLegacy) ? true : false;
constexpr bool kDontWaitForCpu = false;
auto hip_stream = hip::getStream(stream, wait);
bool wait_for_cpu = false;
// Force a blocking wait if requested. This avoids a buildup of unreleased CPU commands
if (DEBUG_HIP_BLOCK_SYNC != 0) {
static std::atomic<uint64_t> flush = 0;
wait_for_cpu = ((++flush % DEBUG_HIP_BLOCK_SYNC) == 0) ? true : false;
}
// Wait for the current host queue
hip_stream->finish(kDontWaitForCpu);
hip_stream->finish(wait_for_cpu);
if (stream == nullptr) {
// null stream will sync with other streams.
ReleaseGraphExec(hip_stream->DeviceId());
-3
Просмотреть файл
@@ -1323,9 +1323,6 @@ class VirtualDevice : public amd::HeapObject {
//! Returns true if device has active wait setting
bool ActiveWait() const;
//! Returns the status of queue handler callback
virtual bool isHandlerPending() const = 0;
//! Returns fence state of the VirtualGPU
virtual bool isFenceDirty() const = 0;
//! Init hidden heap for device memory allocations
-2
Просмотреть файл
@@ -353,8 +353,6 @@ class VirtualGPU : public device::VirtualDevice {
void profilerAttach(bool enable = false) {}
bool isHandlerPending() const { return false; }
bool isFenceDirty() const { return false; }
void HiddenHeapInit() {}
+4 -3
Просмотреть файл
@@ -127,6 +127,9 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
// ================================================================================================
void Timestamp::checkGpuTime() {
if (amd::IS_HIP && !amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) {
return;
}
amd::ScopedLock s(lock_);
if (HwProfiling()) {
uint64_t start = std::numeric_limits<uint64_t>::max();
@@ -479,12 +482,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
prof_signal->signal_.handle, prof_signal);
}
SetHandlerPending(false);
// Update the current command/marker with HW event
prof_signal->retain();
ts->command().SetHwEvent(prof_signal);
} else if (ts->command().profilingInfo().marker_ts_) {
SetHandlerPending(true);
// Update the current command/marker with HW event
prof_signal->retain();
ts->command().SetHwEvent(prof_signal);
@@ -1652,7 +1653,7 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const {
// also true for any command B, which falls between A and C.
current = list;
while (current != nullptr) {
if (current->profilingInfo().enabled_) {
if (current->profilingInfo().enabled_) {
if (!current->data().empty()) {
for (auto i = 0; i < current->data().size(); i++) {
// Since this is a valid command to get a timestamp, we use the
+1 -12
Просмотреть файл
@@ -248,7 +248,7 @@ class VirtualGPU : public device::VirtualDevice {
class HwQueueTracker : public amd::EmbeddedObject {
public:
HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu), handlerPending_(false) {}
HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu) {}
~HwQueueTracker();
@@ -289,12 +289,6 @@ class VirtualGPU : public device::VirtualDevice {
//! Empty check for external signals
bool IsExternalSignalListEmpty() const { return external_signals_.empty(); }
//! Set the status to indicate a pending handler
void SetHandlerPending(bool pending) { handlerPending_ = pending; }
//! Check if callback has been queued
bool IsHandlerPending() const { return handlerPending_; }
//! Get/Set SDMA profiling
bool GetSDMAProfiling() { return sdma_profiling_; }
void SetSDMAProfiling(bool profile) {
@@ -319,7 +313,6 @@ class VirtualGPU : public device::VirtualDevice {
const VirtualGPU& gpu_; //!< VirtualGPU, associated with this tracker
std::vector<ProfilingSignal*> external_signals_; //!< External signals for a wait in this queue
std::vector<hsa_signal_t> waiting_signals_; //!< Current waiting signals in this queue
bool handlerPending_; //!< This indicates if we have queued a callback handler
};
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
@@ -434,10 +427,6 @@ class VirtualGPU : public device::VirtualDevice {
Timestamp* timestamp() const { return timestamp_; }
//! Indicates the status of the callback handler. The callback would process the commands
//! and would collect profiling data, update refcounts
bool isHandlerPending() const { return barriers_.IsHandlerPending(); }
void* allocKernArg(size_t size, size_t alignment);
bool isFenceDirty() const { return fence_dirty_; }
void HiddenHeapInit();
+3 -1
Просмотреть файл
@@ -366,7 +366,9 @@ void Command::enqueue() {
ScopedLock sl(queue_->vdev()->execution());
queue_->FormSubmissionBatch(this);
if (type() == CL_COMMAND_MARKER || type() == 0 || type() == CL_COMMAND_TASK) {
// Enqueue flushes, except profiling markers to avoid frequent expensive callbacks
if (((type() == 0) && profilingInfo().batch_flush_) ||
(type() == CL_COMMAND_MARKER) || (type() == CL_COMMAND_TASK)) {
// The current HSA signal tracking logic requires profiling enabled for the markers
EnableProfiling();
// Update batch head for the current marker. Hence the status of all commands can be
+1
Просмотреть файл
@@ -114,6 +114,7 @@ class Event : public RuntimeObject {
uint64_t correlation_id_;
bool enabled_; //!< Profiling enabled for the wave limiter
bool marker_ts_; //!< TS marker
bool batch_flush_ = true; //!< Command can flush the batch in direct dispatch mode
void clear() {
queued_ = 0ULL;
+13 -10
Просмотреть файл
@@ -65,6 +65,15 @@ bool HostQueue::terminate() {
// destroyed.
Command* lastCommand = getLastQueuedCommand(true);
if (lastCommand != nullptr) {
// Check if CPU batch wasn't flushed for completion with the last command
if (GetSubmittionBatch() != nullptr) {
auto command = new Marker(*this, false);
if (command != nullptr) {
ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to ensure finish");
command->enqueue();
lastCommand = command;
}
}
lastCommand->awaitCompletion();
// Note that if lastCommand isn't a marker, it may not be lastEnqueueCommand_ now
// after lastCommand->awaitCompletion() is called.
@@ -128,19 +137,13 @@ void HostQueue::finish(bool cpu_wait) {
if (IS_HIP) {
command = getLastQueuedCommand(true);
if (command == nullptr) {
assert(GetSubmittionBatch() == nullptr &&
"Can't claim the queue is finished with the active batch!");
return;
}
}
// If command doesn't contain HW event and runtime didn't request CPU wait,
// then force marker submit
bool force_marker = false;
if (AMD_DIRECT_DISPATCH && (command != nullptr) && !cpu_wait) {
void* hw_event =
(command->NotifyEvent() != nullptr) ? command->NotifyEvent()->HwEvent() : command->HwEvent();
force_marker = (hw_event == nullptr);
}
if (nullptr == command || force_marker ||
vdev()->isHandlerPending() || vdev()->isFenceDirty()) {
// Force marker if the batch wasn't sent for CPU update or fence is dirty
if (nullptr == command || (GetSubmittionBatch() != nullptr) || vdev()->isFenceDirty()) {
if (nullptr != command) {
command->release();
}
+2
Просмотреть файл
@@ -257,6 +257,8 @@ release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4, \
"Forces the number of streams for the graph parallel execution") \
release(bool, HIP_ALWAYS_USE_NEW_COMGR_UNBUNDLING_ACTION, false, \
"Force to always use new comgr unbundling action") \
release(uint, DEBUG_HIP_BLOCK_SYNC, 50, \
"Blocks synchronization on CPU until the callback processing is done")\
release(bool, DEBUG_HIP_KERNARG_COPY_OPT, true, \
"Enable/Disable multiple kern arg copies") \
release(bool, DEBUG_CLR_USE_STDMUTEX_IN_AMD_MONITOR, false, \