SWDEV-486602 - Optimize HSA callback performance
- Don't generate callbacks for HIP events
- Don't process profiling info in the callback for HIP events
- Wait for the CPU status update of the submitted commands
every 50 calls. That allows the commands to be drained and
the HSA signals to be destroyed.
Change-Id: Ib601a350e7e7c2b6c6209a172385389baccf73a9
[ROCm/clr commit: 364dfb0ed1]
Этот коммит содержится в:
@@ -64,6 +64,7 @@ hipError_t Event::query() {
|
||||
return ready() ? hipSuccess : hipErrorNotReady;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t Event::synchronize() {
|
||||
amd::ScopedLock lock(lock_);
|
||||
|
||||
@@ -76,19 +77,12 @@ hipError_t Event::synchronize() {
|
||||
// Check HW status of the ROCcrl event. Note: not all ROCclr modes support HW status
|
||||
static constexpr bool kWaitCompletion = true;
|
||||
if (!hip_device->devices()[0]->IsHwEventReady(*event_, kWaitCompletion, flags_)) {
|
||||
if (event_->HwEvent() != nullptr) {
|
||||
amd::Command* command = nullptr;
|
||||
hipError_t status = recordCommand(command, event_->command().queue(), flags_);
|
||||
command->enqueue();
|
||||
hip_device->devices()[0]->IsHwEventReady(command->event(), kWaitCompletion, flags_);
|
||||
command->release();
|
||||
} else {
|
||||
event_->awaitCompletion();
|
||||
}
|
||||
event_->awaitCompletion();
|
||||
}
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
// Forwards to awaitCompletion() on the underlying event_ and returns its result unchanged.
// event_ is dereferenced without a null check and lock_ is not taken in this function.
// NOTE(review): presumably this blocks the caller until the event completes - confirm against
// the awaitCompletion() implementation.
bool Event::awaitEventCompletion() {
|
||||
return event_->awaitCompletion();
|
||||
}
|
||||
@@ -222,8 +216,9 @@ hipError_t Event::streamWait(hipStream_t stream, uint flags) {
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
|
||||
uint32_t ext_flags ) {
|
||||
uint32_t ext_flags, bool batch_flush) {
|
||||
if (command == nullptr) {
|
||||
int32_t releaseFlags = ((ext_flags == 0) ? flags_ : ext_flags) &
|
||||
(hipEventReleaseToDevice | hipEventReleaseToSystem |
|
||||
@@ -234,11 +229,12 @@ hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
|
||||
releaseFlags = amd::Device::kCacheStateInvalid;
|
||||
}
|
||||
// Always submit a EventMarker.
|
||||
command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags);
|
||||
command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags, batch_flush);
|
||||
}
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
|
||||
command->enqueue();
|
||||
if (event_ == &command->event()) return hipSuccess;
|
||||
@@ -251,11 +247,13 @@ hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
hipError_t Event::addMarker(hipStream_t stream, amd::Command* command, bool record) {
|
||||
// ================================================================================================
|
||||
hipError_t Event::addMarker(hipStream_t stream, amd::Command* command,
|
||||
bool record, bool batch_flush) {
|
||||
hip::Stream* hip_stream = hip::getStream(stream);
|
||||
// Keep the lock always at the beginning of this to avoid a race. SWDEV-277847
|
||||
amd::ScopedLock lock(lock_);
|
||||
hipError_t status = recordCommand(command, hip_stream);
|
||||
hipError_t status = recordCommand(command, hip_stream, 0, batch_flush);
|
||||
if (status != hipSuccess) {
|
||||
return hipSuccess;
|
||||
}
|
||||
@@ -415,7 +413,7 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream) {
|
||||
if (g_devices[e->deviceId()]->devices()[0] != &hip_stream->device()) {
|
||||
return hipErrorInvalidHandle;
|
||||
}
|
||||
status = e->addMarker(stream, nullptr, true);
|
||||
status = e->addMarker(stream, nullptr, true, !hip::Event::kBatchFlush);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -81,10 +81,11 @@ typedef struct ihipIpcEventShmem_s {
|
||||
class EventMarker : public amd::Marker {
|
||||
public:
|
||||
EventMarker(amd::HostQueue& stream, bool disableFlush, bool markerTs = false,
|
||||
int32_t scope = amd::Device::kCacheStateInvalid)
|
||||
int32_t scope = amd::Device::kCacheStateInvalid, bool batch_flush = true)
|
||||
: amd::Marker(stream, disableFlush) {
|
||||
profilingInfo_.enabled_ = true;
|
||||
profilingInfo_.marker_ts_ = markerTs;
|
||||
profilingInfo_.batch_flush_ = batch_flush;
|
||||
profilingInfo_.clear();
|
||||
setEventScope(scope);
|
||||
}
|
||||
@@ -101,6 +102,8 @@ class Event {
|
||||
}
|
||||
|
||||
public:
|
||||
constexpr static bool kBatchFlush = true; //!< Flushes CPU command batch in direct dispatch mode
|
||||
|
||||
Event(uint32_t flags) : flags_(flags), lock_(true) /* hipEvent_t lock*/,
|
||||
event_(nullptr), unrecorded_(false), stream_(nullptr) {
|
||||
// No need to init event_ here as addMarker does that
|
||||
@@ -123,9 +126,10 @@ class Event {
|
||||
virtual hipError_t streamWait(hipStream_t stream, uint flags);
|
||||
|
||||
virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* stream,
|
||||
uint32_t flags = 0);
|
||||
uint32_t flags = 0, bool batch_flush = true);
|
||||
virtual hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
|
||||
hipError_t addMarker(hipStream_t stream, amd::Command* command, bool record);
|
||||
hipError_t addMarker(hipStream_t stream, amd::Command* command,
|
||||
bool record, bool batch_flush = true);
|
||||
|
||||
void BindCommand(amd::Command& command, bool record) {
|
||||
amd::ScopedLock lock(lock_);
|
||||
@@ -217,7 +221,8 @@ class IPCEvent : public Event {
|
||||
hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command);
|
||||
hipError_t streamWait(hipStream_t stream, uint flags);
|
||||
|
||||
hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue, uint32_t flags = 0);
|
||||
hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue,
|
||||
uint32_t flags = 0, bool batch_flush = true) override;
|
||||
hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
|
||||
};
|
||||
|
||||
|
||||
@@ -139,16 +139,19 @@ hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) {
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
// ================================================================================================
// Creates the command that records this IPC event on the given queue.
//
// command     - [out] receives the created command. In the unrecorded case it is overwritten
//               unconditionally with a new amd::Marker.
// stream      - host queue the command is created for.
// flags       - extra event flags, forwarded to Event::recordCommand() in the recorded case.
// batch_flush - forwarded to Event::recordCommand() in the recorded case; unused for the plain
//               marker created in the unrecorded case.
//
// Returns hipSuccess in the unrecorded case, otherwise the status of Event::recordCommand().
hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream,
                                   uint32_t flags, bool batch_flush) {
  if (isUnRecorded()) {
    // The event has not been recorded yet: a plain marker with flush disabled is used.
    command = new amd::Marker(*stream, kMarkerDisableFlush);
    return hipSuccess;
  }

  // Forward both trailing arguments explicitly. Passing batch_flush as the third argument would
  // implicitly convert the bool to the uint32_t flags parameter (0 or 1) and leave the base
  // class batch_flush at its default value, silently ignoring the caller's request.
  return Event::recordCommand(command, stream, flags, batch_flush);
}
|
||||
|
||||
// ================================================================================================
|
||||
hipError_t IPCEvent::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
|
||||
bool unrecorded = isUnRecorded();
|
||||
if (unrecorded) {
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#include "hip_event.hpp"
|
||||
#include "thread/monitor.hpp"
|
||||
#include "hip_prof_api.h"
|
||||
#include <atomic>
|
||||
|
||||
namespace hip {
|
||||
|
||||
@@ -358,11 +359,15 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) {
|
||||
}
|
||||
}
|
||||
bool wait = (stream == nullptr || stream == hipStreamLegacy) ? true : false;
|
||||
constexpr bool kDontWaitForCpu = false;
|
||||
|
||||
auto hip_stream = hip::getStream(stream, wait);
|
||||
bool wait_for_cpu = false;
|
||||
// Force blocking wait if requested. That allows to avoid a build up of unreleased CPU commands
|
||||
if (DEBUG_HIP_BLOCK_SYNC != 0) {
|
||||
static std::atomic<uint64_t> flush = 0;
|
||||
wait_for_cpu = ((++flush % DEBUG_HIP_BLOCK_SYNC) == 0) ? true : false;
|
||||
}
|
||||
// Wait for the current host queue
|
||||
hip_stream->finish(kDontWaitForCpu);
|
||||
hip_stream->finish(wait_for_cpu);
|
||||
if (stream == nullptr) {
|
||||
// null stream will sync with other streams.
|
||||
ReleaseGraphExec(hip_stream->DeviceId());
|
||||
|
||||
@@ -1323,9 +1323,6 @@ class VirtualDevice : public amd::HeapObject {
|
||||
//! Returns true if device has active wait setting
|
||||
bool ActiveWait() const;
|
||||
|
||||
//! Returns the status of queue handler callback
|
||||
virtual bool isHandlerPending() const = 0;
|
||||
|
||||
//! Returns fence state of the VirtualGPU
|
||||
virtual bool isFenceDirty() const = 0;
|
||||
//! Init hidden heap for device memory allocations
|
||||
|
||||
@@ -353,8 +353,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
void profilerAttach(bool enable = false) {}
|
||||
|
||||
bool isHandlerPending() const { return false; }
|
||||
|
||||
bool isFenceDirty() const { return false; }
|
||||
|
||||
void HiddenHeapInit() {}
|
||||
|
||||
@@ -127,6 +127,9 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
|
||||
|
||||
// ================================================================================================
|
||||
void Timestamp::checkGpuTime() {
|
||||
if (amd::IS_HIP && !amd::activity_prof::IsEnabled(OP_ID_DISPATCH)) {
|
||||
return;
|
||||
}
|
||||
amd::ScopedLock s(lock_);
|
||||
if (HwProfiling()) {
|
||||
uint64_t start = std::numeric_limits<uint64_t>::max();
|
||||
@@ -479,12 +482,10 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
|
||||
prof_signal->signal_.handle, prof_signal);
|
||||
}
|
||||
SetHandlerPending(false);
|
||||
// Update the current command/marker with HW event
|
||||
prof_signal->retain();
|
||||
ts->command().SetHwEvent(prof_signal);
|
||||
} else if (ts->command().profilingInfo().marker_ts_) {
|
||||
SetHandlerPending(true);
|
||||
// Update the current command/marker with HW event
|
||||
prof_signal->retain();
|
||||
ts->command().SetHwEvent(prof_signal);
|
||||
@@ -1652,7 +1653,7 @@ void VirtualGPU::updateCommandsState(amd::Command* list) const {
|
||||
// also true for any command B, which falls between A and C.
|
||||
current = list;
|
||||
while (current != nullptr) {
|
||||
if (current->profilingInfo().enabled_) {
|
||||
if (current->profilingInfo().enabled_) {
|
||||
if (!current->data().empty()) {
|
||||
for (auto i = 0; i < current->data().size(); i++) {
|
||||
// Since this is a valid command to get a timestamp, we use the
|
||||
|
||||
@@ -248,7 +248,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
class HwQueueTracker : public amd::EmbeddedObject {
|
||||
public:
|
||||
HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu), handlerPending_(false) {}
|
||||
HwQueueTracker(const VirtualGPU& gpu): gpu_(gpu) {}
|
||||
|
||||
~HwQueueTracker();
|
||||
|
||||
@@ -289,12 +289,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
//! Empty check for external signals
|
||||
bool IsExternalSignalListEmpty() const { return external_signals_.empty(); }
|
||||
|
||||
//! Set the status to indicate a pending handler
|
||||
void SetHandlerPending(bool pending) { handlerPending_ = pending; }
|
||||
|
||||
//! Check if callback has been queued
|
||||
bool IsHandlerPending() const { return handlerPending_; }
|
||||
|
||||
//! Get/Set SDMA profiling
|
||||
bool GetSDMAProfiling() { return sdma_profiling_; }
|
||||
void SetSDMAProfiling(bool profile) {
|
||||
@@ -319,7 +313,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
const VirtualGPU& gpu_; //!< VirtualGPU, associated with this tracker
|
||||
std::vector<ProfilingSignal*> external_signals_; //!< External signals for a wait in this queue
|
||||
std::vector<hsa_signal_t> waiting_signals_; //!< Current waiting signals in this queue
|
||||
bool handlerPending_; //!< This indicates if we have queued a callback handler
|
||||
};
|
||||
|
||||
VirtualGPU(Device& device, bool profiling = false, bool cooperative = false,
|
||||
@@ -434,10 +427,6 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
Timestamp* timestamp() const { return timestamp_; }
|
||||
|
||||
//! Indicates the status of the callback handler. The callback would process the commands
|
||||
//! and would collect profiling data, update refcounts
|
||||
bool isHandlerPending() const { return barriers_.IsHandlerPending(); }
|
||||
|
||||
void* allocKernArg(size_t size, size_t alignment);
|
||||
bool isFenceDirty() const { return fence_dirty_; }
|
||||
void HiddenHeapInit();
|
||||
|
||||
@@ -366,7 +366,9 @@ void Command::enqueue() {
|
||||
ScopedLock sl(queue_->vdev()->execution());
|
||||
queue_->FormSubmissionBatch(this);
|
||||
|
||||
if (type() == CL_COMMAND_MARKER || type() == 0 || type() == CL_COMMAND_TASK) {
|
||||
// Enqueue flushes, except profiling markers to avoid frequent expensive callbacks
|
||||
if (((type() == 0) && profilingInfo().batch_flush_) ||
|
||||
(type() == CL_COMMAND_MARKER) || (type() == CL_COMMAND_TASK)) {
|
||||
// The current HSA signal tracking logic requires profiling enabled for the markers
|
||||
EnableProfiling();
|
||||
// Update batch head for the current marker. Hence the status of all commands can be
|
||||
|
||||
@@ -114,6 +114,7 @@ class Event : public RuntimeObject {
|
||||
uint64_t correlation_id_;
|
||||
bool enabled_; //!< Profiling enabled for the wave limiter
|
||||
bool marker_ts_; //!< TS marker
|
||||
bool batch_flush_ = true; //!< Command can flush the batch in direct dispatch mode
|
||||
|
||||
void clear() {
|
||||
queued_ = 0ULL;
|
||||
|
||||
@@ -65,6 +65,15 @@ bool HostQueue::terminate() {
|
||||
// destroyed.
|
||||
Command* lastCommand = getLastQueuedCommand(true);
|
||||
if (lastCommand != nullptr) {
|
||||
// Check if CPU batch wasn't flushed for completion with the last command
|
||||
if (GetSubmittionBatch() != nullptr) {
|
||||
auto command = new Marker(*this, false);
|
||||
if (command != nullptr) {
|
||||
ClPrint(LOG_DEBUG, LOG_CMD, "Marker queued to ensure finish");
|
||||
command->enqueue();
|
||||
lastCommand = command;
|
||||
}
|
||||
}
|
||||
lastCommand->awaitCompletion();
|
||||
// Note that if lastCommand isn't a marker, it may not be lastEnqueueCommand_ now
|
||||
// after lastCommand->awaitCompletion() is called.
|
||||
@@ -128,19 +137,13 @@ void HostQueue::finish(bool cpu_wait) {
|
||||
if (IS_HIP) {
|
||||
command = getLastQueuedCommand(true);
|
||||
if (command == nullptr) {
|
||||
assert(GetSubmittionBatch() == nullptr &&
|
||||
"Can't claim the queue is finished with the active batch!");
|
||||
return;
|
||||
}
|
||||
}
|
||||
// If command doesn't contain HW event and runtime didn't request CPU wait,
|
||||
// then force marker submit
|
||||
bool force_marker = false;
|
||||
if (AMD_DIRECT_DISPATCH && (command != nullptr) && !cpu_wait) {
|
||||
void* hw_event =
|
||||
(command->NotifyEvent() != nullptr) ? command->NotifyEvent()->HwEvent() : command->HwEvent();
|
||||
force_marker = (hw_event == nullptr);
|
||||
}
|
||||
if (nullptr == command || force_marker ||
|
||||
vdev()->isHandlerPending() || vdev()->isFenceDirty()) {
|
||||
// Force marker if the batch wasn't sent for CPU update or fence is dirty
|
||||
if (nullptr == command || (GetSubmittionBatch() != nullptr) || vdev()->isFenceDirty()) {
|
||||
if (nullptr != command) {
|
||||
command->release();
|
||||
}
|
||||
|
||||
@@ -257,6 +257,8 @@ release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4, \
|
||||
"Forces the number of streams for the graph parallel execution") \
|
||||
release(bool, HIP_ALWAYS_USE_NEW_COMGR_UNBUNDLING_ACTION, false, \
|
||||
"Force to always use new comgr unbundling action") \
|
||||
release(uint, DEBUG_HIP_BLOCK_SYNC, 50, \
|
||||
"Blocks synchronization on CPU until the callback processing is done")\
|
||||
release(bool, DEBUG_HIP_KERNARG_COPY_OPT, true, \
|
||||
"Enable/Disable multiple kern arg copies") \
|
||||
release(bool, DEBUG_CLR_USE_STDMUTEX_IN_AMD_MONITOR, false, \
|
||||
|
||||
Ссылка в новой задаче
Block a user