SWDEV-290160 - Switch to global HSA signals

The runtime can't assign internal HSA signals to HIP events, because a
HIP application can destroy the HIP stream, or the signal may be reused
internally. Switch to global HSA signals for HIP events.

Change-Id: Ieaea2d6b039e492b2e7c5112782a8f4e601e50a1


[ROCm/clr commit: ce8dad2ecc]
This commit is contained in:
German Andryeyev
2021-06-18 17:07:40 -04:00
committed by Maneesh Gupta
parent bfc21e4a59
commit 9f03f68a8a
7 changed files with 162 additions and 120 deletions
+2
View File
@@ -49,6 +49,7 @@
#include <map>
#include <list>
#include <set>
#include <unordered_set>
#include <utility>
namespace amd {
@@ -1705,6 +1706,7 @@ class Device : public RuntimeObject {
) const {
return false;
};
virtual void ReleaseGlobalSignal(void* signal) const {}
//! Returns TRUE if the device is available for computations
bool isOnline() const { return online_; }
@@ -2919,6 +2919,7 @@ bool Device::findLinkInfo(const hsa_amd_memory_pool_t& pool,
return true;
}
// ================================================================================================
void Device::getGlobalCUMask(std::string cuMaskStr) {
if (cuMaskStr.length() != 0) {
std::string pre = cuMaskStr.substr(0, 2);
@@ -2971,10 +2972,12 @@ void Device::getGlobalCUMask(std::string cuMaskStr) {
}
}
// ================================================================================================
// Allocates a roc::Signal with new and returns it as a device::Signal pointer.
// Nothing in this function releases the object.
device::Signal* Device::createSignal() const {
  device::Signal* signal = new roc::Signal();
  return signal;
}
// ================================================================================================
amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) {
// If arena_mem_obj_ is null, then HMM and Xnack is disabled. Return nullptr.
if (arena_mem_obj_ == nullptr) {
@@ -2989,5 +2992,39 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) {
return arena_mem_obj_;
}
// ================================================================================================
// Allocates a ProfilingSignal together with a newly created HSA signal whose initial value is
// kInitSignalValueOne and, when ts is non-null, links the pair to that timestamp.
//
// If settings().system_scope_signal_ is set, the HSA signal is created with an empty consumer
// list (0 agents, null array); otherwise this device's agent is the only listed consumer.
//
// Returns the new object, or nullptr when the HSA signal can't be created. On failure nothing is
// registered with ts. ReleaseGlobalSignal() is the matching teardown for the returned pointer.
ProfilingSignal* Device::GetGlobalSignal(Timestamp* ts) const {
  std::unique_ptr<ProfilingSignal> prof_signal(new ProfilingSignal());
  if (prof_signal == nullptr) {
    return nullptr;
  }

  const bool system_scope = settings().system_scope_signal_;
  hsa_agent_t agent = getBackendDevice();
  hsa_agent_t* agents = system_scope ? nullptr : &agent;
  const uint32_t num_agents = system_scope ? 0 : 1;

  if (HSA_STATUS_SUCCESS != hsa_signal_create(kInitSignalValueOne,
      num_agents, agents, &prof_signal->signal_)) {
    // The object is freed by unique_ptr on return and was never published to the timestamp
    return nullptr;
  }

  // Link to the timestamp only once the HSA signal exists. Registering before the creation
  // would leave the timestamp with a pointer to an object that is freed if the creation fails.
  // NOTE(review): the only caller visible here installs the async handler after this function
  // returns, so the link is still in place before any callback can run - confirm for others.
  if (ts != nullptr) {
    prof_signal->ts_ = ts;
    ts->AddProfilingSignal(prof_signal.get());
  }
  return prof_signal.release();
}
// ================================================================================================
void Device::ReleaseGlobalSignal(void* signal) const {
if (signal != nullptr) {
ProfilingSignal* prof_signal = reinterpret_cast<ProfilingSignal*>(signal);
if (prof_signal->signal_.handle != 0) {
hsa_signal_destroy(prof_signal->signal_);
}
delete prof_signal;
}
}
} // namespace roc
#endif // WITHOUT_HSA_BACKEND
@@ -77,6 +77,21 @@ class VirtualDevice;
class PrintfDbg;
class IProDevice;
//! HSA signal bundled with the timestamp, engine, completion flag and lock that accompany it
struct ProfilingSignal : public amd::HeapObject {
  hsa_signal_t signal_;   //!< HSA signal to track profiling information
  Timestamp* ts_;         //!< Timestamp object associated with the signal
  HwQueueEngine engine_;  //!< Engine used with this signal
  bool done_;             //!< True if signal is done
  amd::Monitor lock_;     //!< Signal lock for update

  //! Starts with a zero signal handle, no timestamp, the compute engine and done_ set
  ProfilingSignal()
      : signal_{0},
        ts_(nullptr),
        engine_(HwQueueEngine::Compute),
        done_(true),
        lock_("Signal Ops Lock", true) {}

  //! Gives access to the lock stored in this object
  amd::Monitor& LockSignalOps() { return lock_; }
};
class Sampler : public device::Sampler {
public:
//! Constructor
@@ -237,6 +252,7 @@ class NullDevice : public amd::Device {
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; }
virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; }
virtual void ReleaseGlobalSignal(void* signal) const {}
protected:
//! Initialize compiler instance and handle
@@ -405,6 +421,7 @@ class Device : public NullDevice {
cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const;
virtual void ReleaseGlobalSignal(void* signal) const;
//! Allocate host memory in terms of numa policy set by user
void* hostNumaAlloc(size_t size, size_t alignment, bool atomics = false) const;
@@ -505,6 +522,8 @@ class Device : public NullDevice {
virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset);
ProfilingSignal* GetGlobalSignal(Timestamp* ts) const;
private:
bool create();
+60 -77
View File
@@ -109,7 +109,7 @@ static unsigned extractAqlBits(unsigned v, unsigned pos, unsigned width) {
};
// ================================================================================================
void Timestamp::checkGpuTime(bool event_recycle) {
void Timestamp::checkGpuTime() {
if (HwProfiling()) {
uint64_t start = std::numeric_limits<uint64_t>::max();
uint64_t end = 0;
@@ -140,10 +140,6 @@ void Timestamp::checkGpuTime(bool event_recycle) {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Signal = (0x%lx), start = %ld, "
"end = %ld", it->signal_.handle, start, end);
}
// The signal is reused and the upper layer can't rely on it.
if (event_recycle) {
const_cast<amd::Command&>(it->ts_->command_).SetHwEvent(nullptr);
}
it->ts_ = nullptr;
it->done_ = true;
}
@@ -390,23 +386,7 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
// If direct dispatch is enabled and the batch head isn't null, then it's a marker and
// requires the batch update upon HSA signal completion
if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) {
uint32_t init_value = kInitSignalValueOne;
// If API callback is enabled, then use a blocking signal for AQL queue.
// HSA signal will be acquired in SW and released after HSA signal callback
if (ts->command().Callback() != nullptr) {
ts->SetCallbackSignal(prof_signal->signal_);
// Blocks AQL queue from further processing
hsa_signal_add_relaxed(prof_signal->signal_, 1);
init_value += 1;
}
hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts);
if (HSA_STATUS_SUCCESS != result) {
LogError("hsa_amd_signal_async_handler() failed to set the handler!");
} else {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
prof_signal->signal_.handle, prof_signal);
}
assert(false && "Runtime should not have batch command in ActiveSignal!");
}
if (!sdma_profiling_) {
hsa_amd_profiling_async_copy_enable(true);
@@ -486,8 +466,7 @@ bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
if (!signal->done_) {
// Update timestamp values if requested
if (signal->ts_ != nullptr) {
static constexpr bool kEventRecycle = true;
signal->ts_->checkGpuTime(kEventRecycle);
signal->ts_->checkGpuTime();
} else {
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "[%zx]!\t Host wait on completion_signal=0x%zx",
std::this_thread::get_id(), signal->signal_.handle);
@@ -892,7 +871,8 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
}
// ================================================================================================
void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader,
bool skipSignal, const ProfilingSignal* global_signal) {
const uint32_t queueSize = gpu_queue_->size;
const uint32_t queueMask = queueSize - 1;
@@ -915,12 +895,16 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
barrier_packet_.completion_signal.handle = 0;
if (!skipSignal) {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
if (global_signal != nullptr) {
barrier_packet_.completion_signal = global_signal->signal_;
} else {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
// Get active signal for current dispatch if profiling is necessary
barrier_packet_.completion_signal =
Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
// Get active signal for current dispatch if profiling is necessary
barrier_packet_.completion_signal =
Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
}
}
while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
@@ -1218,7 +1202,8 @@ void VirtualGPU::profilingBegin(amd::Command& command, bool drmProfiling) {
(*it)->NotifyEvent()->HwEvent() : (*it)->HwEvent();
if (hw_event != nullptr) {
Barriers().AddExternalSignal(reinterpret_cast<ProfilingSignal*>(hw_event));
} else if (static_cast<amd::Command*>(*it)->queue() != command.queue()) {
} else if (static_cast<amd::Command*>(*it)->queue() != command.queue() &&
((*it)->status() != CL_COMPLETE)) {
LogPrintfError("Waiting event(%p) doesn't have a HSA signal!\n", *it);
} else {
// Assume serialization on the same queue...
@@ -1239,10 +1224,7 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
timestamp_->end();
}
command.setData(timestamp_);
// Update HW event only for batches
if ((AMD_DIRECT_DISPATCH) && (command.GetBatchHead() != nullptr)) {
command.SetHwEvent(timestamp_->Signals().back());
}
timestamp_ = nullptr;
}
}
@@ -2926,13 +2908,46 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
// ================================================================================================
// Submits a marker command.
//
// With direct dispatch enabled, a global HSA signal is requested from the device for the current
// timestamp, HsaAmdSignalHandler is installed on it, the timestamp's most recent signal is
// stored as the command's HW event, and the global signal is handed to the barrier packet as
// its completion signal.
//
// NOTE(review): this span comes from a rendered diff whose +/- markers were stripped, so a few
// statements appear in two variants; they are reproduced as found.
void VirtualGPU::submitMarker(amd::Marker& vcmd) {
  if (vcmd.profilingInfo().marker_ts_) {
  if (AMD_DIRECT_DISPATCH || vcmd.profilingInfo().marker_ts_) {
    profilingBegin(vcmd);
    if (timestamp_ != nullptr) {
      ProfilingSignal* prof_signal = nullptr;
      // If direct dispatch is enabled and the batch head isn't null, then it's a marker and
      // requires the batch update upon HSA signal completion
      if (AMD_DIRECT_DISPATCH) {
        assert(vcmd.GetBatchHead() != nullptr && "Marker doesn't have batch!");
        prof_signal = dev().GetGlobalSignal(timestamp_);
        // Check the allocation before the first dereference, otherwise the assert can never
        // report the failure.
        // NOTE(review): assert() is compiled out under NDEBUG, so such builds still dereference
        // a null pointer here if the allocation fails; a recoverable path needs a decision on
        // how the batch should complete without the global signal.
        assert(prof_signal != nullptr && "Failed to allocate the global HSA signal!");
        prof_signal->done_ = false;
        uint32_t init_value = kInitSignalValueOne;
        // If API callback is enabled, then use a blocking signal for AQL queue.
        // HSA signal will be acquired in SW and released after HSA signal callback
        if (vcmd.Callback() != nullptr) {
          timestamp_->SetCallbackSignal(prof_signal->signal_);
          // Blocks AQL queue from further processing
          hsa_signal_add_relaxed(prof_signal->signal_, 1);
          init_value += 1;
        }
        // The handler is requested for the moment the signal value drops below init_value
        hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
            HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, timestamp_);
        if (HSA_STATUS_SUCCESS != result) {
          LogError("hsa_amd_signal_async_handler() failed to set the handler!");
        } else {
          ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
                  prof_signal->signal_.handle, prof_signal);
        }
        // Update HW event only for batches
        vcmd.SetHwEvent(timestamp_->Signals().back());
      }
      // Submit a barrier with a cache flushes.
      dispatchBarrierPacket(kBarrierPacketHeader);
      // Reset this flag since we already enable system scope for kBarrierPacketHeader
      hasPendingDispatch_ = false;
      dispatchBarrierPacket(kBarrierPacketHeader, false, prof_signal);
      // Don't reset the flag for direct dispatch, because the global signals are out of scope
      // for internal barrier tracking and SDMA could lose a wait for compute
      hasPendingDispatch_ = AMD_DIRECT_DISPATCH;
    }
    profilingEnd(vcmd);
  }
@@ -2958,45 +2973,13 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
// ================================================================================================
// Flushes this virtual GPU for the given command list: the lines below release the GPU memory
// fence, update the command states for list and release pinned memory.
// The wait parameter is not referenced in the lines shown.
// NOTE(review): this span comes from a rendered diff whose +/- markers were stripped, so
// statements from two variants of the function are interleaved; the code is reproduced
// unchanged and should be checked against the real file.
void VirtualGPU::flush(amd::Command* list, bool wait) {
// Direct dispatch relies on HSA signal callback
bool skip_cpu_wait = AMD_DIRECT_DISPATCH;
// If barrier is requested, then wait for everything, otherwise
// a per dispatch wait will occur later in updateCommandsState()
releaseGpuMemoryFence();
updateCommandsState(list);
if (skip_cpu_wait) {
// Search for the last command in the batch to track GPU state
amd::Command* current = list;
assert(current != nullptr && "Empty batch for processing!");
// Find the last command
while (current->getNext() != nullptr) {
current = current->getNext();
}
// Always insert a barrier. Some tests require async SDMA wait
hasPendingDispatch_ = true;
// Enable profiling, so runtime can track TS
profilingBegin(*current);
// If runtime didn't submit a barrier, then it can't track the completion of the batch.
// Hence runtime either has to insert a barrier unconditionally or have a CPU wait.
// Due to performance impact of extra barriers CPU wait is selected.
// Note: if callback will be selected to update the batch status,
// then the host thread can't update it also, otherwise double free may occur
skip_cpu_wait &= hasPendingDispatch_;
releaseGpuMemoryFence(skip_cpu_wait);
profilingEnd(*current);
} else {
// If barrier is requested, then wait for everything, otherwise
// a per dispatch wait will occur later in updateCommandsState()
releaseGpuMemoryFence();
}
// If CPU waited for GPU, then the queue is idle
if (!skip_cpu_wait) {
updateCommandsState(list);
// Release all pinned memory
releasePinnedMem();
}
// Release all pinned memory
releasePinnedMem();
}
// ================================================================================================
+4 -21
View File
@@ -34,23 +34,9 @@
namespace roc {
class Device;
class Memory;
struct ProfilingSignal;
class Timestamp;
//! HSA signal bundled with the timestamp, engine, completion flag and lock that accompany it.
// NOTE(review): a second definition of this struct appears earlier in this page and a forward
// declaration precedes this one, so this copy is presumably the one being moved out - confirm.
struct ProfilingSignal : public amd::HeapObject {
amd::Monitor lock_; //!< Signal lock for update
hsa_signal_t signal_; //!< HSA signal to track profiling information
Timestamp* ts_; //!< Timestamp object associated with the signal
HwQueueEngine engine_; //!< Engine used with this signal
bool done_; //!< True if signal is done
//! Starts with a zero signal handle, no timestamp, the compute engine and done_ set
ProfilingSignal()
: lock_("Signal Ops Lock", true)
, ts_(nullptr)
, engine_(HwQueueEngine::Compute)
, done_(true)
{ signal_.handle = 0; }
//! Gives access to the lock stored in this object
amd::Monitor& LockSignalOps() { return lock_; }
};
// Initial HSA signal value
constexpr static hsa_signal_value_t kInitSignalValueOne = 1;
@@ -139,7 +125,7 @@ class Timestamp : public amd::HeapObject {
const bool HwProfiling() const { return !signals_.empty(); }
//! Finds execution ticks on GPU
void checkGpuTime(bool event_recycle = false);
void checkGpuTime();
// Start a timestamp (get timestamp from OS)
void start() { start_ = amd::Os::timeNanos(); }
@@ -225,8 +211,6 @@ class VirtualGPU : public device::VirtualDevice {
//! Wait for the current active signal (the entry of signal_list_ at current_id_). Can idle the
//! queue. Returns the result of CpuWaitForSignal() for that signal.
bool WaitCurrent() {
ProfilingSignal* signal = signal_list_[current_id_];
ClPrint(amd::LOG_DEBUG, amd::LOG_MISC, "[%zx]!\t WaitCurret completion_signal=0x%zx",
std::this_thread::get_id(), signal->signal_.handle);
return CpuWaitForSignal(signal);
}
@@ -253,8 +237,6 @@ class VirtualGPU : public device::VirtualDevice {
//! Waits on the signal stored one slot after current_id_, wrapping around at the end of
//! signal_list_. The result of CpuWaitForSignal() is ignored.
void WaitNext() {
size_t next = (current_id_ + 1) % signal_list_.size();
ProfilingSignal* signal = signal_list_[next];
ClPrint(amd::LOG_DEBUG, amd::LOG_MISC, "[%zx]!\t WaitNext completion_signal=0x%zx",
std::this_thread::get_id(), signal->signal_.handle);
CpuWaitForSignal(signal);
}
@@ -396,7 +378,8 @@ class VirtualGPU : public device::VirtualDevice {
template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
uint16_t rest, bool blocking,
size_t size = 1);
void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false);
void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
const ProfilingSignal* global_signal = nullptr);
bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
void dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t* packet,
+39 -22
View File
@@ -48,6 +48,7 @@ Event::Event(HostQueue& queue)
status_(CL_INT_MAX),
hw_event_(nullptr),
notify_event_(nullptr),
device_(&queue.device()),
profilingInfo_(IS_PROFILER_ON || queue.properties().test(CL_QUEUE_PROFILING_ENABLE) ||
Agent::shouldPostEventEvents()) {
notified_.clear();
@@ -55,7 +56,7 @@ Event::Event(HostQueue& queue)
// ================================================================================================
// Default constructor: no callbacks, status CL_SUBMITTED, no HW event and no notify event;
// the notified_ flag is cleared in the body.
// NOTE(review): the initializer tail appears twice because this is a rendered diff without +/-
// markers; the two variants differ only in the device_(nullptr) initializer.
Event::Event() : callbacks_(NULL), status_(CL_SUBMITTED),
hw_event_(nullptr), notify_event_(nullptr) { notified_.clear(); }
hw_event_(nullptr), notify_event_(nullptr), device_(nullptr) { notified_.clear(); }
// ================================================================================================
Event::~Event() {
@@ -69,6 +70,10 @@ Event::~Event() {
if (notify_event_ != nullptr) {
notify_event_->release();
}
// Destroy global HW event if available
if ((hw_event_ != nullptr) && (device_ != nullptr)) {
device_->ReleaseGlobalSignal(hw_event_);
}
}
// ================================================================================================
@@ -259,21 +264,35 @@ bool Event::awaitCompletion() {
// ================================================================================================
// Enqueues a marker, constructed with this event, on the command's queue so the queue drains
// its enqueued commands. The notified_ flag limits this to one marker per event.
//
// In the AMD_DIRECT_DISPATCH branch the check runs under lock_, additionally requires that no
// HW event is assigned, and keeps the marker in notify_event_. The other branch releases the
// marker right after enqueueing it.
//
// Returns false only when the marker allocation yields NULL (notified_ is cleared first);
// returns true otherwise, including when no marker was needed.
// NOTE(review): this span comes from a rendered diff whose +/- markers were stripped, so
// statements from two variants of the function are interleaved; the code is reproduced
// unchanged and should be checked against the real file.
bool Event::notifyCmdQueue() {
HostQueue* queue = command().queue();
if ((status() > CL_COMPLETE) && (nullptr != queue) &&
(!AMD_DIRECT_DISPATCH ||
// If HW event was assigned, then notification can be ignored, since a barrier was issued
(HwEvent() == nullptr)) &&
!notified_.test_and_set()) {
// Make sure the queue is draining the enqueued commands.
amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
if (command == NULL) {
notified_.clear();
return false;
if (AMD_DIRECT_DISPATCH) {
ScopedLock l(lock_);
if ((status() > CL_COMPLETE) && (nullptr != queue) &&
// If HW event was assigned, then notification can be ignored, since a barrier was issued
(HwEvent() == nullptr) &&
!notified_.test_and_set()) {
// Make sure the queue is draining the enqueued commands.
amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
if (command == NULL) {
notified_.clear();
return false;
}
ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
command->enqueue();
// Save notification, associated with the current event
notify_event_ = command;
}
} else {
if ((status() > CL_COMPLETE) && (nullptr != queue) && !notified_.test_and_set()) {
// Make sure the queue is draining the enqueued commands.
amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
if (command == NULL) {
notified_.clear();
return false;
}
ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
command->enqueue();
command->release();
}
ClPrint(LOG_DEBUG, LOG_CMD, "queue marker to command queue: %p", queue);
command->enqueue();
// Save notification, associated with the current event
notify_event_ = command;
}
return true;
}
@@ -318,6 +337,7 @@ void Command::enqueue() {
// update will occur later after flush() with a wait
if (AMD_DIRECT_DISPATCH) {
setStatus(CL_QUEUED);
// Notify all commands about the waiter. Barrier will be sent in order to obtain
// HSA signal for a wait on the current queue
std::for_each(eventWaitList().begin(), eventWaitList().end(),
@@ -333,13 +353,10 @@ void Command::enqueue() {
// Update batch head for the current marker. Hence the status of all commands can be
// updated upon the marker completion
SetBatchHead(queue_->GetSubmittionBatch());
if (profilingInfo().marker_ts_) {
setStatus(CL_SUBMITTED);
submit(*queue_->vdev());
} else {
// Flush the current batch, but skip the wait on CPU if possible to avoid a stall
queue_->vdev()->flush(queue_->GetSubmittionBatch());
}
setStatus(CL_SUBMITTED);
submit(*queue_->vdev());
// The batch will be tracked with the marker now
queue_->ResetSubmissionBatch();
} else {
+1
View File
@@ -95,6 +95,7 @@ class Event : public RuntimeObject {
std::atomic_flag notified_; //!< Command queue was notified
void* hw_event_; //!< HW event ID associated with SW event
Event* notify_event_; //!< Notify event, which should contain HW signal
const Device* device_; //!< Device, this event associated with
protected:
static const EventWaitList nullWaitList;