SWDEV-504383 - Cleaned up kForcedTimeout10us and removed IsHwEventReadyForcedWait
Also removed active_wait_timeout Change-Id: I7a429f003c09a4df267b5c0983050704260094c6
Этот коммит содержится в:
коммит произвёл
Saleel Kudchadker
родитель
32daa8f384
Коммит
4872b420c9
@@ -1981,12 +1981,6 @@ class Device : public RuntimeObject {
|
||||
return false;
|
||||
};
|
||||
|
||||
// Returns the status of HW event, associated with amd::Event
|
||||
virtual bool IsHwEventReadyForcedWait(
|
||||
const amd::Event& event) const { //!< AMD event for HW status validation
|
||||
return false;
|
||||
};
|
||||
|
||||
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
|
||||
|
||||
virtual const uint32_t getPreferredNumaNode() const { return 0; }
|
||||
|
||||
@@ -518,14 +518,8 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent,
|
||||
}
|
||||
|
||||
gpu().Barriers().SetActiveEngine(engine);
|
||||
// Check if host wait has to be forced
|
||||
bool forceHostWait = forceHostWaitFunc(size);
|
||||
|
||||
constexpr bool kIgnoreHostWait = false;
|
||||
// Ignore waiting on any previous kernel dispatch and queue a signal to ROCr copy api instead
|
||||
auto wait_events = gpu().Barriers().WaitingSignal(engine, kIgnoreHostWait);
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp(),
|
||||
forceHostWait);
|
||||
auto wait_events = gpu().Barriers().WaitingSignal(engine);
|
||||
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());
|
||||
|
||||
if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
|
||||
if (copyMask == 0) {
|
||||
@@ -2635,26 +2629,6 @@ amd::Memory* DmaBlitManager::pinHostMemory(const void* hostMem, size_t pinSize,
|
||||
return amdMemory;
|
||||
}
|
||||
|
||||
bool DmaBlitManager::forceHostWaitFunc(size_t copy_size) const {
|
||||
// 10us wait is true for all other targets.
|
||||
bool forceHostWait = true;
|
||||
// Based on the profiled results, do not wait for copy size > 24 KB.
|
||||
static constexpr size_t kGfx90aCopyThreshold = 24;
|
||||
|
||||
if ((dev().isa().versionMajor() == 9 && dev().isa().versionMinor() == 0
|
||||
&& dev().isa().versionStepping() == 10) && (copy_size >= kGfx90aCopyThreshold * Ki)) {
|
||||
// Check if this is gfx90a and restrict small copy to 24K.
|
||||
forceHostWait = false;
|
||||
} else if ((dev().isa().versionMajor() == 9) && (dev().isa().versionMinor() == 4)
|
||||
&& (dev().isa().versionStepping() == 0 || dev().isa().versionStepping() == 1
|
||||
|| dev().isa().versionStepping() == 2)) {
|
||||
// for gfx940, gfx941, gfx942, dependency signal resolution is fast, so no Host wait at all.
|
||||
forceHostWait = false;
|
||||
}
|
||||
|
||||
return forceHostWait;
|
||||
}
|
||||
|
||||
Memory* KernelBlitManager::createView(const Memory& parent, cl_image_format format,
|
||||
cl_mem_flags flags) const {
|
||||
assert((parent.owner()->asBuffer() == nullptr) && "View supports images only");
|
||||
|
||||
@@ -260,8 +260,6 @@ class DmaBlitManager : public device::HostBlitManager {
|
||||
bool hostToDev, //!< True if data is copied from H2D
|
||||
amd::CopyMetadata& copyMetadata //!< Memory copy MetaData
|
||||
) const;
|
||||
|
||||
bool forceHostWaitFunc(size_t copy_size) const;
|
||||
};
|
||||
|
||||
//! Kernel Blit Manager
|
||||
|
||||
@@ -2914,18 +2914,6 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI
|
||||
return result;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Device::IsHwEventReadyForcedWait(const amd::Event& event) const {
|
||||
void* hw_event =
|
||||
(event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent();
|
||||
if (hw_event == nullptr) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event");
|
||||
return false;
|
||||
}
|
||||
static constexpr bool Timeout = true;
|
||||
return WaitForSignal<Timeout>(reinterpret_cast<ProfilingSignal*>(hw_event)->signal_, false, true);
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
bool Device::IsHwEventReady(const amd::Event& event, bool wait, uint32_t hip_event_flags) const {
|
||||
void* hw_event =
|
||||
|
||||
@@ -86,10 +86,9 @@ public:
|
||||
typedef union {
|
||||
struct {
|
||||
uint32_t done_ : 1; //!< True if signal is done
|
||||
uint32_t forceHostWait_ : 1; //!< Force Host Wait for dependency signals
|
||||
uint32_t isPacketDispatch_: 1; //!< True if the packet, used with the signal, is dispatch
|
||||
uint32_t interrupt_ : 1; //!< True if the signal will trigger an interrupt
|
||||
uint32_t reserved_ : 28;
|
||||
uint32_t reserved_ : 29;
|
||||
};
|
||||
uint32_t data_;
|
||||
} Flags;
|
||||
@@ -104,7 +103,6 @@ public:
|
||||
signal_.handle = 0;
|
||||
flags_.data_ = 0;
|
||||
flags_.done_ = true;
|
||||
flags_.forceHostWait_ = true;
|
||||
}
|
||||
|
||||
virtual ~ProfilingSignal();
|
||||
@@ -298,7 +296,6 @@ class NullDevice : public amd::Device {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool IsHwEventReadyForcedWait(const amd::Event& event) const override { return false; }
|
||||
void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const override{};
|
||||
void ReleaseGlobalSignal(void* signal) const override {}
|
||||
|
||||
@@ -498,7 +495,6 @@ class Device : public NullDevice {
|
||||
|
||||
virtual bool IsHwEventReady(const amd::Event& event, bool wait = false,
|
||||
uint32_t hip_event_flags = 0) const;
|
||||
virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const;
|
||||
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
|
||||
virtual void ReleaseGlobalSignal(void* signal) const;
|
||||
|
||||
|
||||
@@ -422,7 +422,7 @@ bool VirtualGPU::HwQueueTracker::Create() {
|
||||
|
||||
// ================================================================================================
|
||||
hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
|
||||
hsa_signal_value_t init_val, Timestamp* ts, bool forceHostWait) {
|
||||
hsa_signal_value_t init_val, Timestamp* ts) {
|
||||
bool new_signal = false;
|
||||
|
||||
// Peep signal +2 ahead to see if its done
|
||||
@@ -502,7 +502,6 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
|
||||
prof_signal->flags_.done_ = false;
|
||||
prof_signal->engine_ = engine_;
|
||||
prof_signal->flags_.isPacketDispatch_ = false;
|
||||
prof_signal->flags_.forceHostWait_ = forceHostWait;
|
||||
if (ts != 0) {
|
||||
// Save HSA signal earlier to make sure the possible callback will have a valid
|
||||
// value for processing
|
||||
@@ -548,8 +547,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine,
|
||||
bool forceHostWait) {
|
||||
std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngine engine) {
|
||||
bool explicit_wait = false;
|
||||
// Reset all current waiting signals
|
||||
waiting_signals_.clear();
|
||||
@@ -595,17 +593,12 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
|
||||
// Early signal status check
|
||||
if (hsa_signal_load_relaxed(external_signals_[i]->signal_) > 0) {
|
||||
const Settings& settings = gpu_.dev().settings();
|
||||
// Actively wait on CPU to avoid extra overheads of signal tracking on GPU.
|
||||
// For small copies set forced wait
|
||||
if (!WaitForSignal<true>(external_signals_[i]->signal_, false, forceHostWait ?
|
||||
external_signals_[i]->flags_.forceHostWait_ : false)) {
|
||||
if (settings.cpu_wait_for_signal_) {
|
||||
// Wait on CPU for completion if requested
|
||||
CpuWaitForSignal(external_signals_[i]);
|
||||
} else {
|
||||
// Add HSA signal for tracking on GPU
|
||||
waiting_signals_.push_back(external_signals_[i]->signal_);
|
||||
}
|
||||
if (settings.cpu_wait_for_signal_) {
|
||||
// Wait on CPU for completion if requested
|
||||
CpuWaitForSignal(external_signals_[i]);
|
||||
} else {
|
||||
// Add HSA signal for tracking on GPU
|
||||
waiting_signals_.push_back(external_signals_[i]->signal_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,24 +46,12 @@ constexpr static hsa_signal_value_t kInitSignalValueOne = 1;
|
||||
constexpr static uint64_t kTimeout100us = 100 * K;
|
||||
constexpr static uint64_t kUnlimitedWait = std::numeric_limits<uint64_t>::max();
|
||||
|
||||
// Active wait time out incase same sdma engine is used again,
|
||||
// then just wait instead of adding dependency wait signal.
|
||||
constexpr static uint64_t kForcedTimeout10us = 10;
|
||||
|
||||
template <bool active_wait_timeout = false>
|
||||
inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool forced_wait = false) {
|
||||
inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false) {
|
||||
if (hsa_signal_load_relaxed(signal) > 0) {
|
||||
uint64_t timeout = kTimeout100us;
|
||||
if (active_wait) {
|
||||
timeout = kUnlimitedWait;
|
||||
}
|
||||
if (active_wait_timeout) {
|
||||
// If forced wait is set, then wait for 10us, else dont wait. (ns * K = us)
|
||||
timeout = (forced_wait ? kForcedTimeout10us : ROC_ACTIVE_WAIT_TIMEOUT) * K;
|
||||
if (timeout == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host active wait for Signal = (0x%lx) for %d ns",
|
||||
signal.handle, timeout);
|
||||
@@ -71,9 +59,6 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool fo
|
||||
// Active wait with a timeout
|
||||
if (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
|
||||
timeout, HSA_WAIT_STATE_ACTIVE) != 0) {
|
||||
if (active_wait_timeout) {
|
||||
return false;
|
||||
}
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Host blocked wait for Signal = (0x%lx)",
|
||||
signal.handle);
|
||||
|
||||
@@ -269,7 +254,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
//! Finds a free signal for the upcomming operation
|
||||
hsa_signal_t ActiveSignal(hsa_signal_value_t init_val = kInitSignalValueOne,
|
||||
Timestamp* ts = nullptr, bool forceHostWait = true);
|
||||
Timestamp* ts = nullptr);
|
||||
|
||||
//! Wait for the curent active signal. Can idle the queue
|
||||
bool WaitCurrent() {
|
||||
@@ -282,8 +267,7 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
HwQueueEngine GetActiveEngine() const { return engine_; }
|
||||
|
||||
//! Returns the last submitted signal for a wait
|
||||
std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute,
|
||||
bool forceHostWait = true);
|
||||
std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
|
||||
|
||||
//! Resets current signal back to the previous one. It's necessary in a case of ROCr failure.
|
||||
void ResetCurrentSignal();
|
||||
|
||||
Ссылка в новой задаче
Block a user