[SDK] Fix for hang in hsa_barrier (#316)
* SWDEV-515895: Fix for hang in hsa_barrier
- Root cause of hang is enqueue_packet using a completed barrier
- This occurs because set_barrier does not correctly mark the barrier
as complete.
- Fixed by consolidating duplicate code for how a barrier is:
- Marked as complete
- Added common function and trace message
- Checked as complete
- Removed internal completion tracking variable in favor of signal
- Added some log messages which were useful in tracking this hang
- Destroying an hsa_barrier now calls the provided completion callback
* formatting
* Fix hsa_barrier test
- Removes assumption that enqueue_packet always generates packets
- Add assertion that a complete barrier does not generate packets
* formatting
* Address review comments
此提交包含在:
@@ -31,7 +31,7 @@ namespace rocprofiler
|
||||
namespace hsa
|
||||
{
|
||||
hsa_barrier::hsa_barrier(std::function<void()>&& finished, CoreApiTable core_api)
|
||||
: _barried_finished(std::move(finished))
|
||||
: _barrier_finished(std::move(finished))
|
||||
, _core_api(core_api)
|
||||
{
|
||||
// Create the barrier signal
|
||||
@@ -40,18 +40,33 @@ hsa_barrier::hsa_barrier(std::function<void()>&& finished, CoreApiTable core_api
|
||||
|
||||
hsa_barrier::~hsa_barrier()
|
||||
{
|
||||
// Destroy the barrier signal
|
||||
if(registration::get_fini_status() < 1)
|
||||
if(registration::get_fini_status() > 0)
|
||||
{
|
||||
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 0);
|
||||
_core_api.hsa_signal_destroy_fn(_barrier_signal);
|
||||
return;
|
||||
}
|
||||
|
||||
// If this barrier is destroyed and has queued packets, a future signal creation may
|
||||
// inadvertently block these packets if the same handle is re-used by HSA.
|
||||
if(!complete())
|
||||
{
|
||||
_barrier_enqueued.rlock([&](auto& barrier_enqueued) {
|
||||
if(!barrier_enqueued.empty())
|
||||
{
|
||||
ROCP_WARNING << "An incomplete hsa_barrier which was enqueued is being destroyed.";
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Clear and destroy the barrier signal
|
||||
clear_barrier();
|
||||
_core_api.hsa_signal_destroy_fn(_barrier_signal);
|
||||
}
|
||||
|
||||
void
|
||||
hsa_barrier::set_barrier(const queue_map_ptr_t& q)
|
||||
{
|
||||
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 1);
|
||||
ROCP_TRACE << "Barrier (handle: " << _barrier_signal.handle << ") is now set.";
|
||||
_queue_waiting.wlock([&](auto& queue_waiting) {
|
||||
for(const auto& [_, queue] : q)
|
||||
{
|
||||
@@ -64,8 +79,7 @@ hsa_barrier::set_barrier(const queue_map_ptr_t& q)
|
||||
}
|
||||
if(queue_waiting.empty())
|
||||
{
|
||||
_barried_finished();
|
||||
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 0);
|
||||
clear_barrier();
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -73,7 +87,7 @@ hsa_barrier::set_barrier(const queue_map_ptr_t& q)
|
||||
std::optional<rocprofiler_packet>
|
||||
hsa_barrier::enqueue_packet(const Queue* queue)
|
||||
{
|
||||
if(_complete) return std::nullopt;
|
||||
if(complete()) return std::nullopt;
|
||||
bool return_block = false;
|
||||
_barrier_enqueued.wlock([&](auto& barrier_enqueued) {
|
||||
if(barrier_enqueued.find(queue->get_id().handle) == barrier_enqueued.end())
|
||||
@@ -88,7 +102,8 @@ hsa_barrier::enqueue_packet(const Queue* queue)
|
||||
rocprofiler_packet barrier{};
|
||||
barrier.barrier_and.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
|
||||
barrier.barrier_and.dep_signal[0] = _barrier_signal;
|
||||
ROCP_INFO << "Barrier Added: " << _barrier_signal.handle;
|
||||
ROCP_INFO << "Barrier (handle: " << _barrier_signal.handle
|
||||
<< ") added to queue (handle: " << queue->get_id().handle << ")";
|
||||
return barrier;
|
||||
}
|
||||
|
||||
@@ -100,9 +115,7 @@ hsa_barrier::remove_queue(const Queue* queue)
|
||||
queue_waiting.erase(queue->get_id().handle);
|
||||
if(queue_waiting.empty())
|
||||
{
|
||||
_barried_finished();
|
||||
_complete = true;
|
||||
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 0);
|
||||
clear_barrier();
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -120,14 +133,33 @@ hsa_barrier::register_completion(const Queue* queue)
|
||||
queue_waiting.erase(queue->get_id().handle);
|
||||
if(queue_waiting.empty())
|
||||
{
|
||||
_barried_finished();
|
||||
// We are done, release the barrier
|
||||
_complete = true;
|
||||
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 0);
|
||||
clear_barrier();
|
||||
}
|
||||
}
|
||||
});
|
||||
return found;
|
||||
}
|
||||
|
||||
bool
|
||||
hsa_barrier::complete() const
|
||||
{
|
||||
return _core_api.hsa_signal_load_scacquire_fn(_barrier_signal) == 0;
|
||||
}
|
||||
|
||||
void
|
||||
hsa_barrier::clear_barrier()
|
||||
{
|
||||
if(complete())
|
||||
{
|
||||
return;
|
||||
}
|
||||
if(_barrier_finished)
|
||||
{
|
||||
_barrier_finished();
|
||||
}
|
||||
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 0);
|
||||
ROCP_TRACE << "Barrier (handle: " << _barrier_signal.handle << ") is now cleared.";
|
||||
}
|
||||
|
||||
} // namespace hsa
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -50,22 +50,35 @@ public:
|
||||
hsa_barrier(std::function<void()>&& finished, CoreApiTable core_api);
|
||||
~hsa_barrier();
|
||||
|
||||
// If a queue in q currently has active async packets, sets the barrier and saves how many
|
||||
// packets are currently in each queue to track barrier completion. If there are no queues with
|
||||
// packets, clears the barrier and marks it as complete.
|
||||
void set_barrier(const queue_map_ptr_t& q);
|
||||
|
||||
// Returns the packets needed to place this barrier in a queue.
|
||||
// If the barrier has already been enqueued in the queue given or the barrier is complete, no
|
||||
// packets are returned.
|
||||
std::optional<rocprofiler_packet> enqueue_packet(const Queue* queue);
|
||||
bool register_completion(const Queue* queue);
|
||||
|
||||
bool complete() const { return _core_api.hsa_signal_load_scacquire_fn(_barrier_signal) == 0; }
|
||||
// To be called when 1 async packet is finished in the given queue to decrement the remaining
|
||||
// packets counter. If all packets in all queues given in set_barrier are completed, clears the
|
||||
// barrier and marks it as complete. Returns true if the given queue had async packets waiting.
|
||||
bool register_completion(const Queue* queue);
|
||||
|
||||
// Removes a queue from the barrier dependency list
|
||||
// Checks if this barrier is complete
|
||||
bool complete() const;
|
||||
|
||||
// Removes a queue from the barrier dependency list.
|
||||
// If this is the last queue waiting, clears the barrier and marks it as complete.
|
||||
void remove_queue(const Queue* queue);
|
||||
|
||||
private:
|
||||
std::function<void()> _barried_finished = {};
|
||||
std::function<void()> _barrier_finished = {};
|
||||
CoreApiTable _core_api = {};
|
||||
common::Synchronized<std::unordered_map<int64_t, int64_t>> _queue_waiting = {};
|
||||
common::Synchronized<std::unordered_set<int64_t>> _barrier_enqueued = {};
|
||||
std::atomic<bool> _complete = {false};
|
||||
|
||||
void clear_barrier();
|
||||
|
||||
// Blocks all queues from executing until the barrier is lifted
|
||||
hsa_signal_t _barrier_signal = {};
|
||||
|
||||
@@ -142,10 +142,16 @@ inject_barriers(hsa_barrier& barrier, QueueController::queue_map_t& queues)
|
||||
|
||||
for(auto& [hsa_queue, fq] : queues)
|
||||
{
|
||||
auto _pkt = barrier.enqueue_packet(fq.get());
|
||||
ASSERT_EQ(_pkt.has_value(), true);
|
||||
auto complete = barrier.complete();
|
||||
auto _pkt = barrier.enqueue_packet(fq.get());
|
||||
// If barrier is complete, no packets should be generated
|
||||
ASSERT_NE(complete, _pkt.has_value());
|
||||
|
||||
hsa_barrier_and_packet_t* _packets = (hsa_barrier_and_packet_t*) hsa_queue->base_address;
|
||||
enqueue_pkt(hsa_queue, _packets, _pkt->barrier_and);
|
||||
if(_pkt.has_value())
|
||||
{
|
||||
enqueue_pkt(hsa_queue, _packets, _pkt->barrier_and);
|
||||
}
|
||||
|
||||
// Construct packet that will trigger async handler after barrier is released
|
||||
rocprofiler_packet post_barrier{};
|
||||
|
||||
新增問題並參考
封鎖使用者