clr: Avoid saving all ProfilingSignals at once (#2108)

* While reusing signals, it's possible to come across a timestamp
  that contains several signals, such as when profiling a graph. Reading
  timestamps from all of those signals can make the call severely CPU bound.
  Instead, cache the timing of only the signal being reused, so that the
  overhead is avoided on the critical path.
Bu işleme şunda yer alıyor:
SaleelK
2025-12-08 11:32:16 -08:00
işlemeyi yapan: GitHub
ebeveyn eb357fcd45
işleme acc236fd89
4 değiştirilmiş dosya ile 184 ekleme ve 55 silme
+5
Dosyayı Görüntüle
@@ -41,4 +41,9 @@ enum HwQueueEngine : uint32_t {
Unknown = 5
};
//! Returns true if the engine is an SDMA engine (any type).
//!
//! Every SDMA enumerator is matched explicitly rather than through a numeric range, so the
//! result does not depend on the order in which HwQueueEngine declares its values and no
//! SDMA engine type (read, write, inter-device, intra-device) can fall outside the check.
inline bool IsSdmaEngine(HwQueueEngine engine) {
  switch (engine) {
    case HwQueueEngine::SdmaRead:
    case HwQueueEngine::SdmaWrite:
    case HwQueueEngine::SdmaInter:
    case HwQueueEngine::SdmaIntra:
      return true;
    default:
      // Compute, Unknown and any other non-SDMA value
      return false;
  }
}
} // namespace amd::roc
+29
Dosyayı Görüntüle
@@ -89,6 +89,14 @@ class ProfilingSignal : public amd::ReferenceCountedObject {
Flags flags_;
//! Per-signal copy of the HSA start/end timestamps. It is filled in once after the signal
//! completes so later readers do not have to call into HSA again.
struct CachedTiming {
  uint64_t start_{0};   //!< Start timestamp last read from HSA (0 until cached)
  uint64_t end_{0};     //!< End timestamp last read from HSA (0 until cached)
  bool valid_{false};   //!< Set once start_/end_ hold cached data
};
CachedTiming cached_timing_;
ProfilingSignal()
: ts_(nullptr),
engine_(HwQueueEngine::Compute),
@@ -101,6 +109,27 @@ class ProfilingSignal : public amd::ReferenceCountedObject {
virtual ~ProfilingSignal();
amd::Monitor& LockSignalOps() { return lock_; }
//! Cache timing data from HSA for this signal (called once when signal completes)
void CacheTimingData(hsa_agent_t gpu_device);
//! Drops any cached timestamps and marks the cache invalid, so that a reused signal
//! gets its timing queried again. Takes the signal lock for the duration of the update.
void ResetCachedTiming() {
  amd::ScopedLock guard(lock_);
  cached_timing_.start_ = cached_timing_.end_ = 0;
  cached_timing_.valid_ = false;
}
//! Check if timing is already cached
bool IsTimingCached() const { return cached_timing_.valid_; }
//! Copies the cached start/end timestamps into the output arguments under the signal lock.
//! The valid flag is not consulted here; the stored values are returned as they are.
//! \param start receives the cached start timestamp
//! \param end   receives the cached end timestamp
void GetCachedTiming(uint64_t& start, uint64_t& end) {
  amd::ScopedLock guard(lock_);
  const auto& cached = cached_timing_;
  start = cached.start_;
  end = cached.end_;
}
};
class Sampler : public device::Sampler {
+140 -45
Dosyayı Görüntüle
@@ -122,70 +122,145 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
};
// ================================================================================================
void Timestamp::checkGpuTime() {
// Reads this signal's start/end timestamps from HSA and stores them in cached_timing_.
// Does nothing when the cache is already valid. gpu_device is only used for the
// dispatch-time query (non-SDMA engines).
void ProfilingSignal::CacheTimingData(hsa_agent_t gpu_device) {
  // The async handler thread can touch this structure as well, so hold the signal lock
  amd::ScopedLock guard(lock_);

  if (!cached_timing_.valid_) {
    // A positive signal value means the signal has not completed yet, so wait on it first
    const bool still_pending = Hsa::signal_load_relaxed(signal_) > 0;
    if (still_pending) {
      WaitForSignal(signal_);
    }

    // Pick the HSA query that matches the engine which produced the signal
    if (!IsSdmaEngine(engine_)) {
      hsa_amd_profiling_dispatch_time_t dispatch_time = {};
      Hsa::profiling_get_dispatch_time(gpu_device, signal_, &dispatch_time);
      cached_timing_.start_ = dispatch_time.start;
      cached_timing_.end_ = dispatch_time.end;
    } else {
      hsa_amd_profiling_async_copy_time_t copy_time = {};
      Hsa::profiling_get_async_copy_time(signal_, &copy_time);
      cached_timing_.start_ = copy_time.start;
      cached_timing_.end_ = copy_time.end;
    }

    cached_timing_.valid_ = true;
  }
}
// ================================================================================================
// Process GPU timing for signals
// If single_signal is nullptr, processes all signals and clears the list
// If single_signal is provided, processes only that signal with merge enabled
void Timestamp::checkGpuTime(ProfilingSignal* single_signal) {
amd::ScopedLock s(lock_);
// For single signal mode, validate it exists in the list
if (single_signal != nullptr) {
auto it = std::find(signals_.begin(), signals_.end(), single_signal);
if (it == signals_.end()) {
return;
}
}
if (HwProfiling()) {
uint64_t start = std::numeric_limits<uint64_t>::max();
uint64_t end = 0;
uint64_t sdmaStart = std::numeric_limits<uint64_t>::max();
uint64_t sdmaEnd = 0;
// Process either single signal or all signals
auto process_signal = [&](ProfilingSignal* sig) {
// Skip signals already processed
if (sig->flags_.done_) {
return;
}
for (auto it : signals_) {
amd::ScopedLock lock(it->LockSignalOps());
// Ignore the wait if runtime processes API callback, because the signal value is bigger
// than expected and the value reset will occur after API callback is done
if (GetCallbackSignal().handle == 0 || GetBlocking() == false) {
WaitForSignal(it->signal_);
ExtractSignalTiming(sig, start, end, sdmaStart, sdmaEnd);
}
// Avoid profiling data for the sync barrier, in tiny performance tests the first call
// to ROCr is very slow and that also affects the overall performance of the callback thread
if (command().GetBatchHead() == nullptr || command().profilingInfo().marker_ts_ ||
command().type() == CL_COMMAND_TASK) {
hsa_amd_profiling_dispatch_time_t time = {};
hsa_amd_profiling_async_copy_time_t timeSdma = {};
amd_signal_t* amdSignal = reinterpret_cast<amd_signal_t*>(it->signal_.handle);
if (it->engine_ == HwQueueEngine::SdmaInter || it->engine_ == HwQueueEngine::SdmaRead ||
it->engine_ == HwQueueEngine::SdmaWrite || it->engine_ == HwQueueEngine::SdmaIntra) {
Hsa::profiling_get_async_copy_time(it->signal_, &timeSdma);
sdmaStart = std::min(timeSdma.start, sdmaStart);
sdmaEnd = std::max(timeSdma.end, sdmaEnd);
// set dispatch time to be used in logging.
time.start = timeSdma.start;
time.end = timeSdma.end;
} else {
Hsa::profiling_get_dispatch_time(gpu()->gpu_device(), it->signal_, &time);
start = std::min(time.start, start);
end = std::max(time.end, end);
}
if ((command().type() == CL_COMMAND_TASK) && (it->flags_.isPacketDispatch_ == true)) {
static_cast<amd::AccumulateCommand&>(command()).addTimestamps(time.start, time.end);
}
uint64_t sig_start, sig_end;
sig->GetCachedTiming(sig_start, sig_end);
amd_signal_t* amdSignal = reinterpret_cast<amd_signal_t*>(sig->signal_.handle);
ClPrint(amd::LOG_INFO, amd::LOG_TS,
"Signal = (0x%lx), Translated start/end = %ld / %ld, Elapsed = %ld ns, "
"ticks start/end = %ld / %ld, Ticks elapsed = %ld, Engine = %u",
it->signal_.handle, time.start, time.end, time.end - time.start,
sig->signal_.handle, sig_start, sig_end, sig_end - sig_start,
amdSignal->start_ts, amdSignal->end_ts, amdSignal->end_ts - amdSignal->start_ts,
it->engine_);
sig->engine_);
}
it->flags_.done_ = true;
};
if (single_signal != nullptr) {
process_signal(single_signal);
} else {
for (auto it : signals_) {
process_signal(it);
}
signals_.clear();
}
signals_.clear();
// Update member timing variables from local accumulators
// When processing single signal, merge with existing timing
// When processing all signals, replace timing
if (end != 0 || sdmaEnd != 0) {
// Check if it's the first execution and update start time
const bool merge_with_existing = (single_signal != nullptr);
uint64_t final_start = ((sdmaEnd != 0) ? sdmaStart : start) * ticksToTime_;
uint64_t final_end = ((sdmaEnd != 0) ? sdmaEnd : end) * ticksToTime_;
if (!accum_ena_) {
start_ = ((sdmaEnd != 0) ? sdmaStart : start) * ticksToTime_;
start_ = final_start;
accum_ena_ = true;
} else if (merge_with_existing) {
start_ = std::min(start_, final_start);
}
// Progress the end time always
end_ = ((sdmaEnd != 0) ? sdmaEnd : end) * ticksToTime_;
end_ = merge_with_existing ? std::max(end_, final_end) : final_end;
}
}
}
// ================================================================================================
// Folds one signal's cached timestamps into the caller's running min/max accumulators.
// SDMA engines update sdmaStart/sdmaEnd; every other engine updates start/end.
// Marks the signal as done before returning.
void Timestamp::ExtractSignalTiming(ProfilingSignal* signal,
                                    uint64_t& start, uint64_t& end,
                                    uint64_t& sdmaStart, uint64_t& sdmaEnd) {
  // Populate the per-signal cache on first use
  if (!signal->IsTimingCached()) {
    signal->CacheTimingData(gpu()->gpu_device());
  }

  uint64_t begin_ts;
  uint64_t finish_ts;
  signal->GetCachedTiming(begin_ts, finish_ts);

  // engine_ and flags_ are accessed under the signal lock
  amd::ScopedLock guard(signal->LockSignalOps());

  // Select the accumulator pair that matches the engine type, then widen it
  const bool on_sdma = IsSdmaEngine(signal->engine_);
  uint64_t& earliest = on_sdma ? sdmaStart : start;
  uint64_t& latest = on_sdma ? sdmaEnd : end;
  earliest = std::min(begin_ts, earliest);
  latest = std::max(finish_ts, latest);

  // Accumulate commands additionally record the timestamps of each packet dispatch
  if ((signal->flags_.isPacketDispatch_ == true) && (command().type() == CL_COMMAND_TASK)) {
    static_cast<amd::AccumulateCommand&>(command()).addTimestamps(begin_ts, finish_ts);
  }

  signal->flags_.done_ = true;
}
// ================================================================================================
bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
Timestamp* ts = reinterpret_cast<Timestamp*>(arg);
@@ -514,7 +589,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(hsa_signal_value_t init_va
prof_signal->flags_.done_ = false;
prof_signal->engine_ = engine_;
prof_signal->flags_.isPacketDispatch_ = false;
prof_signal->ResetCachedTiming();
if (nullptr != cmd) {
// Release any existing HwEvent before setting new one for the same command
@@ -628,26 +703,46 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
// ================================================================================================
bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
// Wait for the current signal
if (signal->ts_ != nullptr) {
// Update timestamp values if requested
auto ts = signal->ts_;
ts->checkGpuTime();
ts->release();
signal->ts_ = nullptr;
} else if (Hsa::signal_load_relaxed(signal->signal_) > 0) {
amd::ScopedLock lock(signal->LockSignalOps());
// Wait for the current signal to complete
if (Hsa::signal_load_relaxed(signal->signal_) > 0) {
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Host wait on completion_signal=0x%zx",
signal->signal_.handle);
if (!WaitForSignal(signal->signal_, gpu_.ActiveWait())) {
LogPrintfError("Failed signal [0x%lx] wait", signal->signal_);
return false;
}
}
// Process this signal's timing before signal reuse
// This copies timing to the Timestamp
if (signal->ts_ != nullptr) {
signal->ts_->checkGpuTime(signal);
signal->ts_->release();
signal->ts_ = nullptr;
} else {
// No timestamp - just mark signal as done
amd::ScopedLock lock(signal->LockSignalOps());
signal->flags_.done_ = true;
}
return true;
}
// ================================================================================================
// Waits on the signal at the current position of the signal list.
// Returns whatever CpuWaitForSignal reports for that signal.
bool VirtualGPU::HwQueueTracker::WaitCurrent() {
  return CpuWaitForSignal(signal_list_[current_id_]);
}
// ================================================================================================
// Waits on the signal that follows the current one in the signal list (wrapping around).
void VirtualGPU::HwQueueTracker::WaitNext() {
  const size_t following_id = (current_id_ + 1) % signal_list_.size();
  // This is a plain wait: the timestamp of the next signal is not saved here,
  // it gets saved when that signal is actually used
  WaitForSignal(signal_list_[following_id]->signal_, gpu_.ActiveWait());
}
// ================================================================================================
void VirtualGPU::HwQueueTracker::ResetCurrentSignal() {
// Reset the signal and return
+10 -10
Dosyayı Görüntüle
@@ -118,6 +118,11 @@ class Timestamp : public amd::ReferenceCountedObject {
bool hasHwProfiling_ = false; //!< If TRUE then HwProfiling is enabled for the command
bool blocking_ = true; //!< If TRUE callback is blocking
//! Extract timing from a single signal and update accumulators
void ExtractSignalTiming(ProfilingSignal* signal,
uint64_t& start, uint64_t& end,
uint64_t& sdmaStart, uint64_t& sdmaEnd);
Timestamp(const Timestamp&) = delete;
Timestamp& operator=(const Timestamp&) = delete;
@@ -149,7 +154,9 @@ class Timestamp : public amd::ReferenceCountedObject {
const bool HwProfiling() const { return hasHwProfiling_; }
//! Finds execution ticks on GPU
void checkGpuTime();
//! If single_signal is nullptr, processes all signals and clears the list
//! If single_signal is provided, processes only that signal with merge enabled
void checkGpuTime(ProfilingSignal* single_signal = nullptr);
// Start a timestamp (get timestamp from OS)
void start() { start_ = amd::Os::timeNanos(); }
@@ -272,10 +279,7 @@ class VirtualGPU : public device::VirtualDevice {
Timestamp* ts = nullptr, bool attach_signal = true);
//! Wait for the curent active signal. Can idle the queue
bool WaitCurrent() {
ProfilingSignal* signal = signal_list_[current_id_];
return CpuWaitForSignal(signal);
}
bool WaitCurrent();
//! Update current active engine
void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; }
@@ -311,11 +315,7 @@ class VirtualGPU : public device::VirtualDevice {
bool CreateSignal(ProfilingSignal* signal, bool interrupt = false) const;
//! Wait for the next active signal
void WaitNext() {
size_t next = (current_id_ + 1) % signal_list_.size();
ProfilingSignal* signal = signal_list_[next];
CpuWaitForSignal(signal);
}
void WaitNext();
//! Wait for the provided signal
bool CpuWaitForSignal(ProfilingSignal* signal);