SWDEV-504383 - Cleaned up kForcedTimeout10us and removed IsHwEventReadyForcedWait

Also removed active_wait_timeout

Change-Id: I7a429f003c09a4df267b5c0983050704260094c6


[ROCm/clr commit: 4872b420c9]
Этот коммит содержится в:
Jimbo Xie
2025-01-07 18:30:49 -05:00
коммит произвёл Saleel Kudchadker
родитель 40df900647
Коммит cc229f251f
7 изменённых файлов: 14 добавлений и 87 удалений
-6
Просмотреть файл
@@ -1981,12 +1981,6 @@ class Device : public RuntimeObject {
return false;
};
// Returns the status of HW event, associated with amd::Event
virtual bool IsHwEventReadyForcedWait(
const amd::Event& event) const { //!< AMD event for HW status validation
return false;
};
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
virtual const uint32_t getPreferredNumaNode() const { return 0; }
+2 -28
Просмотреть файл
@@ -518,14 +518,8 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
}
gpu().Barriers().SetActiveEngine(engine);
// Check if host wait has to be forced
bool forceHostWait = forceHostWaitFunc(size);
constexpr bool kIgnoreHostWait = false;
// Ignore waiting on any previous kernel dispatch and queue a signal to ROCr copy api instead
auto wait_events = gpu().Barriers().WaitingSignal(engine, kIgnoreHostWait);
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp(),
forceHostWait);
auto wait_events = gpu().Barriers().WaitingSignal(engine);
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
if (copyMask == 0) {
@@ -2635,26 +2629,6 @@ amd::Memory* DmaBlitManager::pinHostMemory(const void* hostMem, size_t pinSize,
return amdMemory;
}
bool DmaBlitManager::forceHostWaitFunc(size_t copy_size) const {
// 10us wait is true for all other targets.
bool forceHostWait = true;
// Based on the profiled results, do not wait for copy size > 24 KB.
static constexpr size_t kGfx90aCopyThreshold = 24;
if ((dev().isa().versionMajor() == 9 && dev().isa().versionMinor() == 0
&& dev().isa().versionStepping() == 10) && (copy_size >= kGfx90aCopyThreshold * Ki)) {
// Check if this is gfx90a and restrict small copy to 24K.
forceHostWait = false;
} else if ((dev().isa().versionMajor() == 9) && (dev().isa().versionMinor() == 4)
&& (dev().isa().versionStepping() == 0 || dev().isa().versionStepping() == 1
|| dev().isa().versionStepping() == 2)) {
// for gfx940, gfx941, gfx942, dependency signal resolution is fast, so no Host wait at all.
forceHostWait = false;
}
return forceHostWait;
}
Memory* KernelBlitManager::createView(const Memory& parent, cl_image_format format,
cl_mem_flags flags) const {
assert((parent.owner()->asBuffer() == nullptr) && "View supports images only");
-2
Просмотреть файл
@@ -260,8 +260,6 @@ class DmaBlitManager : public device::HostBlitManager {
bool hostToDev, //!< True if data is copied from H2D
amd::CopyMetadata& copyMetadata //!< Memory copy MetaData
) const;
bool forceHostWaitFunc(size_t copy_size) const;
};
//! Kernel Blit Manager
-12
Просмотреть файл
@@ -2914,18 +2914,6 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI
return result;
}
// ================================================================================================
bool Device::IsHwEventReadyForcedWait(const amd::Event& event) const {
void* hw_event =
(event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent();
if (hw_event == nullptr) {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event");
return false;
}
static constexpr bool Timeout = true;
return WaitForSignal<Timeout>(reinterpret_cast<ProfilingSignal*>(hw_event)->signal_, false, true);
}
// ================================================================================================
bool Device::IsHwEventReady(const amd::Event& event, bool wait, uint32_t hip_event_flags) const {
void* hw_event =
+1 -5
Просмотреть файл
@@ -86,10 +86,9 @@ public:
typedef union {
struct {
uint32_t done_ : 1; //!< True if signal is done
uint32_t forceHostWait_ : 1; //!< Force Host Wait for dependency signals
uint32_t isPacketDispatch_: 1; //!< True if the packet, used with the signal, is dispatch
uint32_t interrupt_ : 1; //!< True if the signal will trigger an interrupt
uint32_t reserved_ : 28;
uint32_t reserved_ : 29;
};
uint32_t data_;
} Flags;
@@ -104,7 +103,6 @@ public:
signal_.handle = 0;
flags_.data_ = 0;
flags_.done_ = true;
flags_.forceHostWait_ = true;
}
virtual ~ProfilingSignal();
@@ -298,7 +296,6 @@ class NullDevice : public amd::Device {
return false;
}
bool IsHwEventReadyForcedWait(const amd::Event& event) const override { return false; }
void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const override{};
void ReleaseGlobalSignal(void* signal) const override {}
@@ -498,7 +495,6 @@ class Device : public NullDevice {
virtual bool IsHwEventReady(const amd::Event& event, bool wait = false,
uint32_t hip_event_flags = 0) const;
virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const;
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
virtual void ReleaseGlobalSignal(void* signal) const;
+8 -15
Просмотреть файл
@@ -422,7 +422,7 @@ bool VirtualGPU::HwQueueTracker::Create() {
// ================================================================================================
hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
hsa_signal_value_t init_val, Timestamp* ts, bool forceHostWait) {
hsa_signal_value_t init_val, Timestamp* ts) {
bool new_signal = false;
// Peep signal +2 ahead to see if its done
@@ -502,7 +502,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
prof_signal->flags_.done_ = false;
prof_signal->engine_ = engine_;
prof_signal->flags_.isPacketDispatch_ = false;
prof_signal->flags_.forceHostWait_ = forceHostWait;
if (ts != 0) {
// Save HSA signal earlier to make sure the possible callback will have a valid
// value for processing
@@ -548,8 +547,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
}
// ================================================================================================
std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine,
bool forceHostWait) {
std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
bool explicit_wait = false;
// Reset all current waiting signals
waiting_signals_.clear();
@@ -595,17 +593,12 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
// Early signal status check
if (hsa_signal_load_relaxed(external_signals_[i]->signal_) > 0) {
const Settings& settings = gpu_.dev().settings();
// Actively wait on CPU to avoid extra overheads of signal tracking on GPU.
// For small copies set forced wait
if (!WaitForSignal<true>(external_signals_[i]->signal_, false, forceHostWait ?
external_signals_[i]->flags_.forceHostWait_ : false)) {
if (settings.cpu_wait_for_signal_) {
// Wait on CPU for completion if requested
CpuWaitForSignal(external_signals_[i]);
} else {
// Add HSA signal for tracking on GPU
waiting_signals_.push_back(external_signals_[i]->signal_);
}
if (settings.cpu_wait_for_signal_) {
// Wait on CPU for completion if requested
CpuWaitForSignal(external_signals_[i]);
} else {
// Add HSA signal for tracking on GPU
waiting_signals_.push_back(external_signals_[i]->signal_);
}
}
}
+3 -19
Просмотреть файл
@@ -46,24 +46,12 @@ constexpr static hsa_signal_value_t kInitSignalValueOne = 1;
constexpr static uint64_t kTimeout100us = 100 * K;
constexpr static uint64_t kUnlimitedWait = std::numeric_limits<uint64_t>::max();
// Active wait time out incase same sdma engine is used again,
// then just wait instead of adding dependency wait signal.
constexpr static uint64_t kForcedTimeout10us = 10;
template <bool active_wait_timeout = false>
inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool forced_wait = false) {
inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false) {
if (hsa_signal_load_relaxed(signal) > 0) {
uint64_t timeout = kTimeout100us;
if (active_wait) {
timeout = kUnlimitedWait;
}
if (active_wait_timeout) {
// If forced wait is set, then wait for 10us, else dont wait. (ns * K = us)
timeout = (forced_wait ? kForcedTimeout10us : ROC_ACTIVE_WAIT_TIMEOUT) * K;
if (timeout == 0) {
return false;
}
}
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host active wait for Signal = (0x%lx) for %d ns",
signal.handle, timeout);
@@ -71,9 +59,6 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool fo
// Active wait with a timeout
if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
timeout, HSA_WAIT_STATE_ACTIVE) != 0) {
if (active_wait_timeout) {
return false;
}
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host blocked wait for Signal = (0x%lx)",
signal.handle);
@@ -269,7 +254,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Finds a free signal for the upcomming operation
hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
Timestamp* ts = nullptr, bool forceHostWait = true);
Timestamp* ts = nullptr);
//! Wait for the curent active signal. Can idle the queue
bool WaitCurrent() {
@@ -282,8 +267,7 @@ class VirtualGPU : public device::VirtualDevice {
HwQueueEngine GetActiveEngine() const { return engine_; }
//! Returns the last submitted signal for a wait
std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute,
bool forceHostWait = true);
std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
//! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
void ResetCurrentSignal();