SWDEV-292018 - Switch to internal signals for markers

Add reference counting to the ProfilingSignal class to track the last release.
If a signal was used in a marker, don't reuse it;
create a new one for internal usage instead.
Don't rely on the HSA callback for the command status update if there
are no pending dispatches.

Change-Id: I19f14ed9d80acfe79993b343b2187635f8428a20


[ROCm/clr commit: ff15c0893e]
Этот коммит содержится в:
German Andryeyev
2021-07-14 19:56:39 -04:00
коммит произвёл Maneesh Gupta
родитель 76e04c2cf5
Коммит 3393396aaa
6 изменённых файлов: 98 добавлений и 93 удалений
+11 -25
Просмотреть файл
@@ -3012,36 +3012,22 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset) {
}
// ================================================================================================
ProfilingSignal* Device::GetGlobalSignal(Timestamp* ts) const {
std::unique_ptr<ProfilingSignal> prof_signal(new ProfilingSignal());
if (prof_signal != nullptr) {
hsa_agent_t agent = getBackendDevice();
hsa_agent_t* agents = (settings().system_scope_signal_) ? nullptr : &agent;
uint32_t num_agents = (settings().system_scope_signal_) ? 0 : 1;
if (ts != 0) {
// Save HSA signal earlier to make sure the possible callback will have a valid
// value for processing
prof_signal->ts_ = ts;
ts->AddProfilingSignal(prof_signal.get());
}
if (HSA_STATUS_SUCCESS == hsa_signal_create(kInitSignalValueOne,
num_agents, agents, &prof_signal->signal_)) {
return prof_signal.release();
}
void Device::ReleaseGlobalSignal(void* signal) const {
if (signal != nullptr) {
reinterpret_cast<ProfilingSignal*>(signal)->release();
}
return nullptr;
}
// ================================================================================================
void Device::ReleaseGlobalSignal(void* signal) const {
if (signal != nullptr) {
ProfilingSignal* prof_signal = reinterpret_cast<ProfilingSignal*>(signal);
if (prof_signal->signal_.handle != 0) {
hsa_signal_destroy(prof_signal->signal_);
ProfilingSignal::~ProfilingSignal() {
if (signal_.handle != 0) {
if (hsa_signal_load_relaxed(signal_) > 0) {
LogError("Runtime shouldn't destroy a signal that is still busy!");
if (hsa_signal_wait_scacquire(signal_, HSA_SIGNAL_CONDITION_LT, kInitSignalValueOne,
kUnlimitedWait, HSA_WAIT_STATE_BLOCKED) != 0) {
}
}
delete prof_signal;
hsa_signal_destroy(signal_);
}
}
+4 -3
Просмотреть файл
@@ -77,7 +77,8 @@ class VirtualDevice;
class PrintfDbg;
class IProDevice;
struct ProfilingSignal : public amd::HeapObject {
class ProfilingSignal : public amd::ReferenceCountedObject {
public:
hsa_signal_t signal_; //!< HSA signal to track profiling information
Timestamp* ts_; //!< Timestamp object associated with the signal
HwQueueEngine engine_; //!< Engine used with this signal
@@ -89,6 +90,8 @@ struct ProfilingSignal : public amd::HeapObject {
, done_(true)
, lock_("Signal Ops Lock", true)
{ signal_.handle = 0; }
virtual ~ProfilingSignal();
amd::Monitor& LockSignalOps() { return lock_; }
};
@@ -531,8 +534,6 @@ class Device : public NullDevice {
virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset);
ProfilingSignal* GetGlobalSignal(Timestamp* ts) const;
private:
bool create();
+65 -55
Просмотреть файл
@@ -319,10 +319,7 @@ void VirtualGPU::MemoryDependency::clear(bool all) {
// ================================================================================================
VirtualGPU::HwQueueTracker::~HwQueueTracker() {
for (auto& signal: signal_list_) {
if (signal->signal_.handle != 0) {
hsa_signal_destroy(signal->signal_);
}
delete signal;
signal->release();
}
}
@@ -374,6 +371,26 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
// a GPU waiter(which may be not triggered yet) and CPU signal reset below
WaitNext();
if (signal_list_[current_id_]->referenceCount() > 1) {
// The signal was assigned to the global marker's event, hence runtime can't reuse it
// and needs a new signal
std::unique_ptr<ProfilingSignal> signal(new ProfilingSignal());
if (signal != nullptr) {
hsa_agent_t agent = gpu_.gpu_device();
const Settings& settings = gpu_.dev().settings();
hsa_agent_t* agents = (settings.system_scope_signal_) ? nullptr : &agent;
uint32_t num_agents = (settings.system_scope_signal_) ? 0 : 1;
if (HSA_STATUS_SUCCESS == hsa_signal_create(0, num_agents, agents, &signal->signal_)) {
signal_list_[current_id_]->release();
signal_list_[current_id_] = signal.release();
} else {
assert(!"ProfilingSignal reallocaiton failed! Marker has a conflict with signal reuse!");
}
} else {
assert(!"ProfilingSignal reallocaiton failed! Marker has a conflict with signal reuse!");
}
}
ProfilingSignal* prof_signal = signal_list_[current_id_];
// Reset the signal and return
hsa_signal_silent_store_relaxed(prof_signal->signal_, init_val);
@@ -387,7 +404,23 @@ hsa_signal_t VirtualGPU::HwQueueTracker::ActiveSignal(
// If direct dispatch is enabled and the batch head isn't null, then it's a marker and
// requires the batch update upon HSA signal completion
if (AMD_DIRECT_DISPATCH && (ts->command().GetBatchHead() != nullptr)) {
assert(false && "Runtime should not have batch command in ActiveSignal!");
uint32_t init_value = kInitSignalValueOne;
// If API callback is enabled, then use a blocking signal for AQL queue.
// HSA signal will be acquired in SW and released after HSA signal callback
if (ts->command().Callback() != nullptr) {
ts->SetCallbackSignal(prof_signal->signal_);
// Blocks AQL queue from further processing
hsa_signal_add_relaxed(prof_signal->signal_, 1);
init_value += 1;
}
hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, ts);
if (HSA_STATUS_SUCCESS != result) {
LogError("hsa_amd_signal_async_handler() failed to set the handler!");
} else {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
prof_signal->signal_.handle, prof_signal);
}
}
if (!sdma_profiling_) {
hsa_amd_profiling_async_copy_enable(true);
@@ -872,8 +905,7 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
}
// ================================================================================================
void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader,
bool skipSignal, const ProfilingSignal* global_signal) {
void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal) {
const uint32_t queueSize = gpu_queue_->size;
const uint32_t queueMask = queueSize - 1;
@@ -896,16 +928,12 @@ void VirtualGPU::dispatchBarrierPacket(uint16_t packetHeader,
barrier_packet_.completion_signal.handle = 0;
if (!skipSignal) {
if (global_signal != nullptr) {
barrier_packet_.completion_signal = global_signal->signal_;
} else {
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
// Pool size must grow to the size of pending AQL packets
const uint32_t pool_size = index - read;
// Get active signal for current dispatch if profiling is necessary
barrier_packet_.completion_signal =
Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
}
// Get active signal for current dispatch if profiling is necessary
barrier_packet_.completion_signal =
Barriers().ActiveSignal(kInitSignalValueOne, timestamp_, pool_size);
}
while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
@@ -1226,6 +1254,12 @@ void VirtualGPU::profilingEnd(amd::Command& command) {
}
command.setData(timestamp_);
// Update HW event only for batches
if ((AMD_DIRECT_DISPATCH) && (command.GetBatchHead() != nullptr)) {
timestamp_->Signals().back()->retain();
command.SetHwEvent(timestamp_->Signals().back());
}
timestamp_ = nullptr;
}
}
@@ -2889,7 +2923,7 @@ void VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) {
queue->profilingEnd(vcmd);
} else {
// Make sure VirtualGPU has an exclusive access to the resources
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
profilingBegin(vcmd);
@@ -2913,47 +2947,23 @@ void VirtualGPU::submitNativeFn(amd::NativeFnCommand& cmd) {
// ================================================================================================
void VirtualGPU::submitMarker(amd::Marker& vcmd) {
if (AMD_DIRECT_DISPATCH || vcmd.profilingInfo().marker_ts_) {
profilingBegin(vcmd);
if (timestamp_ != nullptr) {
ProfilingSignal* prof_signal = nullptr;
// If direct dispatch is enabled and the batch head isn't null, then it's a marker and
// requires the batch update upon HSA signal completion
if (AMD_DIRECT_DISPATCH) {
assert(vcmd.GetBatchHead() != nullptr && "Marker doesn't have batch!");
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
if (vcmd.CpuWaitRequested() && hasPendingDispatch_ == false) {
// If there are no pending dispatches, it should be safe to call flush directly,
// without waiting for the HSA signal callback
flush(vcmd.GetBatchHead());
} else {
profilingBegin(vcmd);
if (timestamp_ != nullptr) {
// Submit a barrier with cache flushes.
dispatchBarrierPacket(kBarrierPacketHeader, false);
prof_signal = dev().GetGlobalSignal(timestamp_);
prof_signal->done_ = false;
assert(prof_signal != nullptr && "Failed to allocate the global HSA signal!");
uint32_t init_value = kInitSignalValueOne;
// If API callback is enabled, then use a blocking signal for AQL queue.
// HSA signal will be acquired in SW and released after HSA signal callback
if (vcmd.Callback() != nullptr) {
timestamp_->SetCallbackSignal(prof_signal->signal_);
// Blocks AQL queue from further processing
hsa_signal_add_relaxed(prof_signal->signal_, 1);
init_value += 1;
}
hsa_status_t result = hsa_amd_signal_async_handler(prof_signal->signal_,
HSA_SIGNAL_CONDITION_LT, init_value, &HsaAmdSignalHandler, timestamp_);
if (HSA_STATUS_SUCCESS != result) {
LogError("hsa_amd_signal_async_handler() failed to set the handler!");
} else {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "Set Handler: handle(0x%lx), timestamp(%p)",
prof_signal->signal_.handle, prof_signal);
}
// Update HW event only for batches
vcmd.SetHwEvent(timestamp_->Signals().back());
hasPendingDispatch_ = false;
}
// Submit a barrier with cache flushes.
dispatchBarrierPacket(kBarrierPacketHeader, false, prof_signal);
// Don't reset the flag for direct dispatch, because the global signals are out of scope
// for internal barrier tracking and SDMA could lose a wait for compute
hasPendingDispatch_ = AMD_DIRECT_DISPATCH;
profilingEnd(vcmd);
}
profilingEnd(vcmd);
}
}
+2 -2
Просмотреть файл
@@ -223,6 +223,7 @@ class VirtualGPU : public device::VirtualDevice {
//! Update current active engine
void SetActiveEngine(HwQueueEngine engine = HwQueueEngine::Compute) { engine_ = engine; }
HwQueueEngine GetActiveEngine() const { return engine_; }
//! Returns the last submitted signal for a wait
std::vector<hsa_signal_t>& WaitingSignal(HwQueueEngine engine = HwQueueEngine::Compute);
@@ -385,8 +386,7 @@ class VirtualGPU : public device::VirtualDevice {
template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
uint16_t rest, bool blocking,
size_t size = 1);
void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
const ProfilingSignal* global_signal = nullptr);
void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false);
bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
void dispatchBarrierValuePacket(const hsa_amd_barrier_value_packet_t* packet,
+6 -5
Просмотреть файл
@@ -232,11 +232,12 @@ void Event::processCallbacks(int32_t status) const {
}
}
static constexpr bool kCpuWait = true;
// ================================================================================================
bool Event::awaitCompletion() {
if (status() > CL_COMPLETE) {
// Notifies current command queue about waiting
if (!notifyCmdQueue()) {
// Notifies the current command queue about waiting
if (!notifyCmdQueue(kCpuWait)) {
return false;
}
@@ -262,7 +263,7 @@ bool Event::awaitCompletion() {
}
// ================================================================================================
bool Event::notifyCmdQueue() {
bool Event::notifyCmdQueue(bool cpu_wait) {
HostQueue* queue = command().queue();
if (AMD_DIRECT_DISPATCH) {
ScopedLock l(notify_lock_);
@@ -271,7 +272,7 @@ bool Event::notifyCmdQueue() {
(HwEvent() == nullptr) &&
!notified_.test_and_set()) {
// Make sure the queue is draining the enqueued commands.
amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this);
amd::Command* command = new amd::Marker(*queue, false, nullWaitList, this, cpu_wait);
if (command == NULL) {
notified_.clear();
return false;
@@ -341,7 +342,7 @@ void Command::enqueue() {
// Notify all commands about the waiter. Barrier will be sent in order to obtain
// HSA signal for a wait on the current queue
std::for_each(eventWaitList().begin(), eventWaitList().end(),
std::mem_fun(&Command::notifyCmdQueue));
std::bind2nd(std::mem_fun(&Command::notifyCmdQueue), !kCpuWait));
// The batch update must be lock protected to avoid a race condition
// when multiple threads submit/flush/update the batch at the same time
+10 -3
Просмотреть файл
@@ -207,7 +207,7 @@ class Event : public RuntimeObject {
/*! \brief Notifies current command queue about execution status
*/
bool notifyCmdQueue();
bool notifyCmdQueue(bool cpu_wait = false);
//! RTTI internal implementation
virtual ObjectType objectType() const { return ObjectTypeEvent; }
@@ -998,15 +998,22 @@ class ExternalSemaphoreCmd : public Command {
class Marker : public Command {
private:
bool cpu_wait_; //!< If true, then the marker was issued for CPU/GPU sync
public:
//! Create a new Marker
Marker(HostQueue& queue, bool userVisible, const EventWaitList& eventWaitList = nullWaitList,
const Event* waitingEvent = nullptr)
: Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent) {}
const Event* waitingEvent = nullptr, bool cpu_wait = false)
: Command(queue, userVisible ? CL_COMMAND_MARKER : 0, eventWaitList, 0, waitingEvent)
, cpu_wait_(cpu_wait) {}
//! The actual command implementation.
virtual void submit(device::VirtualDevice& device) { device.submitMarker(*this); }
//! Check if this marker requires CPU wait
bool CpuWaitRequested() const { return cpu_wait_; }
};
/*! \brief Maps CL objects created from external ones and syncs the contents (blocking).