SWDEV-287137 - Add blocking signal logic
With a HIP API callback, the runtime has to stall the queue until the
callback is done. Rocclr will introduce a SW blocking HSA signal,
which will be released after the callback is done.
Change-Id: I6411f3efab31b468e3b87ebb5c8d155e116b613d
[ROCm/clr commit: d93df7037c]
Este cometimento está contido em:
cometido por
Maneesh Gupta
ascendente
ed789dd3dc
cometimento
a2248eca4c
@@ -115,7 +115,9 @@ void Timestamp::checkGpuTime() {
|
||||
uint64_t end = 0;
|
||||
|
||||
for (auto it : signals_) {
|
||||
if (hsa_signal_load_relaxed(it->signal_) > 0) {
|
||||
// Ignore the wait if runtime processes API callback, because the signal value is bigger
|
||||
// than expected and the value reset will occur after API callback is done
|
||||
if (GetCallbackSignal().handle == 0) {
|
||||
WaitForSignal(it->signal_);
|
||||
}
|
||||
// Avoid profiling data for the sync barrier, in tiny performance tests the first call
|
||||
@@ -153,7 +155,7 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
|
||||
((thread = new amd::HostThread()) != nullptr && thread == amd::Thread::current()))) {
|
||||
return false;
|
||||
}
|
||||
amd::ScopedLock sl(ts->gpu()->execution());
|
||||
|
||||
if (ts->gpu()->isProfilerAttached()) {
|
||||
amd::Command* head = ts->getParsedCommand();
|
||||
if (head == nullptr) {
|
||||
@@ -185,9 +187,17 @@ bool HsaAmdSignalHandler(hsa_signal_value_t value, void* arg) {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Handler: value(%d), timestamp(%p), handle(0x%lx)",
|
||||
static_cast<uint32_t>(value), arg, ts->HwProfiling() ? ts->Signals()[0]->signal_.handle : 0);
|
||||
|
||||
// Save callback signal
|
||||
hsa_signal_t callback_signal = ts->GetCallbackSignal();
|
||||
|
||||
// Update the batch, since signal is complete
|
||||
ts->gpu()->updateCommandsState(ts->command().GetBatchHead());
|
||||
|
||||
// Reset API callback signal. It will release AQL queue and start commands processing
|
||||
if (callback_signal.handle != 0) {
|
||||
hsa_signal_subtract_relaxed(callback_signal, 1);
|
||||
}
|
||||
|
||||
// Return false, so the callback will not be called again for this signal
|
||||
return false;
|
||||
}
|
||||
@@ -371,8 +381,17 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
|
||||
// If direct dispatch is enabled and the batch head isn't null, then it's a marker and
|
||||
// requires the batch update upon HSA signal completion
|
||||
if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) {
|
||||
uint32_t init_value = kInitSignalValueOne;
|
||||
// If API callback is enabled, then use a blocking signal for AQL queue.
|
||||
// HSA signal will be acquired in SW and released after HSA signal callback
|
||||
if (ts->command().Callback() != nullptr) {
|
||||
ts->SetCallbackSignal(prof_signal->signal_);
|
||||
// Blocks AQL queue from further processing
|
||||
hsa_signal_add_relaxed(prof_signal->signal_, 1);
|
||||
init_value += 1;
|
||||
}
|
||||
hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
|
||||
HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne, &HsaAmdSignalHandler, ts);
|
||||
HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts);
|
||||
if (HSA_STATUS_SUCCESS != result) {
|
||||
LogError("hsa_amd_signal_async_handler() failed to set the handler!");
|
||||
} else {
|
||||
@@ -396,7 +415,7 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
|
||||
|
||||
// Does runtime switch the active engine?
|
||||
if (engine != engine_) {
|
||||
// Yes, return the signla from the previous operation for a wait
|
||||
// Yes, return the signal from the previous operation for a wait
|
||||
engine_ = engine;
|
||||
explicit_wait = true;
|
||||
} else {
|
||||
@@ -404,8 +423,8 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
|
||||
if (engine == HwQueueEngine::Unknown) {
|
||||
explicit_wait = true;
|
||||
} else {
|
||||
// Check if skip wait optimizaiton is enabled. It will try to predice the same engine in ROCr
|
||||
// and ignore signal wait, relying on in-order engine execution
|
||||
// Check if skip wait optimization is enabled. It will try to predict the same engine in ROCr
|
||||
// and ignore the signal wait, relying on in-order engine execution
|
||||
const Settings& settings = gpu_.dev().settings();
|
||||
if (!settings.skip_copy_sync_ && (engine != HwQueueEngine::Compute)) {
|
||||
explicit_wait = true;
|
||||
@@ -414,24 +433,33 @@ std::vector<hsa_signal_t>& VirtualGPU::HwQueueTracker::WaitingSignal(HwQueueEngi
|
||||
}
|
||||
// Check if a wait is required
|
||||
if (explicit_wait) {
|
||||
ProfilingSignal** prof_signal;
|
||||
bool skip_internal_signal = false;
|
||||
|
||||
for (uint32_t i = 0; i < external_signals_.size(); ++i) {
|
||||
// If external signal matches internal one, then skip it
|
||||
if (external_signals_[i]->signal_.handle ==
|
||||
signal_list_[current_id_]->signal_.handle) {
|
||||
skip_internal_signal = true;
|
||||
}
|
||||
}
|
||||
// Add the oldest signal into the tracking for a wait
|
||||
external_signals_.push_back(signal_list_[current_id_]);
|
||||
prof_signal = &external_signals_[0];
|
||||
if (!skip_internal_signal) {
|
||||
external_signals_.push_back(signal_list_[current_id_]);
|
||||
}
|
||||
|
||||
// Validate all signals for the wait and skip already completed
|
||||
for (uint32_t i = 0; i < external_signals_.size(); ++i) {
|
||||
// Early signal status check
|
||||
if (hsa_signal_load_relaxed(prof_signal[i]->signal_) > 0) {
|
||||
if (hsa_signal_load_relaxed(external_signals_[i]->signal_) > 0) {
|
||||
const Settings& settings = gpu_.dev().settings();
|
||||
// Actively wait on CPU for 750 us to avoid extra overheads of signal tracking on GPU
|
||||
if (!WaitForSignal<kTimeout750us>(prof_signal[i]->signal_)) {
|
||||
if (!WaitForSignal<kTimeout750us>(external_signals_[i]->signal_)) {
|
||||
if (settings.cpu_wait_for_signal_) {
|
||||
// Wait on CPU for completion if requested
|
||||
CpuWaitForSignal(prof_signal[i]);
|
||||
CpuWaitForSignal(external_signals_[i]);
|
||||
} else {
|
||||
// Add HSA signal for tracking on GPU
|
||||
waiting_signals_.push_back(prof_signal[i]->signal_);
|
||||
waiting_signals_.push_back(external_signals_[i]->signal_);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -891,12 +919,6 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
|
||||
__atomic_store_n(reinterpret_cast<uint32_t*>(aql_loc), packetHeader, __ATOMIC_RELEASE);
|
||||
|
||||
hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index);
|
||||
// Clear dependent signals for the next packet
|
||||
barrier_packet_.dep_signal[0] = hsa_signal_t{};
|
||||
barrier_packet_.dep_signal[1] = hsa_signal_t{};
|
||||
barrier_packet_.dep_signal[2] = hsa_signal_t{};
|
||||
barrier_packet_.dep_signal[3] = hsa_signal_t{};
|
||||
barrier_packet_.dep_signal[4] = hsa_signal_t{};
|
||||
ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
|
||||
"[%zx] HWq=0x%zx, BarrierAND Header = 0x%x (type=%d, barrier=%d, acquire=%d,"
|
||||
" release=%d), "
|
||||
@@ -913,6 +935,12 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
|
||||
barrier_packet_.dep_signal[0], barrier_packet_.dep_signal[1],
|
||||
barrier_packet_.dep_signal[2], barrier_packet_.dep_signal[3],
|
||||
barrier_packet_.dep_signal[4], barrier_packet_.completion_signal);
|
||||
// Clear dependent signals for the next packet
|
||||
barrier_packet_.dep_signal[0] = hsa_signal_t{};
|
||||
barrier_packet_.dep_signal[1] = hsa_signal_t{};
|
||||
barrier_packet_.dep_signal[2] = hsa_signal_t{};
|
||||
barrier_packet_.dep_signal[3] = hsa_signal_t{};
|
||||
barrier_packet_.dep_signal[4] = hsa_signal_t{};
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
|
||||
@@ -90,7 +90,11 @@ class Timestamp : public amd::HeapObject {
|
||||
VirtualGPU* gpu_; //!< Virtual GPU, associated with this timestamp
|
||||
const amd::Command& command_; //!< Command, associated with this timestamp
|
||||
amd::Command* parsedCommand_; //!< Command down the list, considering command_ as head
|
||||
std::vector<ProfilingSignal*> signals_;
|
||||
std::vector<ProfilingSignal*> signals_; //!< The list of all signals, associated with the TS
|
||||
hsa_signal_t callback_signal_; //!< Signal associated with a callback for possible later update
|
||||
|
||||
Timestamp(const Timestamp&) = delete;
|
||||
Timestamp& operator=(const Timestamp&) = delete;
|
||||
|
||||
public:
|
||||
Timestamp(VirtualGPU* gpu, const amd::Command& command)
|
||||
@@ -98,7 +102,8 @@ class Timestamp : public amd::HeapObject {
|
||||
, end_(0)
|
||||
, gpu_(gpu)
|
||||
, command_(command)
|
||||
, parsedCommand_(nullptr) {}
|
||||
, parsedCommand_(nullptr)
|
||||
, callback_signal_(hsa_signal_t{}) {}
|
||||
|
||||
~Timestamp() {}
|
||||
|
||||
@@ -141,6 +146,14 @@ class Timestamp : public amd::HeapObject {
|
||||
|
||||
//! Returns virtual GPU device, used with this timestamp
|
||||
VirtualGPU* gpu() const { return gpu_; }
|
||||
|
||||
//! Updates the callback signal
|
||||
void SetCallbackSignal(hsa_signal_t callback_signal) {
|
||||
callback_signal_ = callback_signal;
|
||||
}
|
||||
|
||||
//! Returns the callback signal
|
||||
hsa_signal_t GetCallbackSignal() const { return callback_signal_; }
|
||||
};
|
||||
|
||||
class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
Criar uma nova questão referindo esta
Bloquear um utilizador