[SDK] Fix for hang in hsa_barrier (#316)

* SWDEV-515895: Fix for hang in hsa_barrier

- Root cause of hang is enqueue_packet using a completed barrier
  - This occurs because set_barrier does not correctly mark the barrier
    as complete.
- Fixed by consolidating duplicate code for how a barrier is:
  - Marked as complete
    - Added common function and trace message
  - Checked as complete
    - Removed internal completion tracking variable in favor of signal
- Added some log messages which were useful in tracking this hang
- Destroying an hsa_barrier now calls the provided completion callback

* formatting

* Fix hsa_barrier test

- Removes assumption that enqueue_packet always generates packets
- Add assertion that a complete barrier does not generate packets

* formatting

* Address review comments
此提交包含在:
Meserve, Mark
2025-04-27 19:55:32 -05:00
提交者 GitHub
父節點 a8f3397069
當前提交 61307442f0
共有 3 個檔案被更改,包括 75 行新增24 行删除
+48 -16
查看文件
@@ -31,7 +31,7 @@ namespace rocprofiler
namespace hsa
{
hsa_barrier::hsa_barrier(std::function<void()>&& finished, CoreApiTable core_api)
: _barried_finished(std::move(finished))
: _barrier_finished(std::move(finished))
, _core_api(core_api)
{
// Create the barrier signal
@@ -40,18 +40,33 @@ hsa_barrier::hsa_barrier(std::function<void()>&& finished, CoreApiTable core_api
hsa_barrier::~hsa_barrier()
{
// Destroy the barrier signal
if(registration::get_fini_status() < 1)
if(registration::get_fini_status() > 0)
{
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 0);
_core_api.hsa_signal_destroy_fn(_barrier_signal);
return;
}
// If this barrier is destroyed and has queued packets, a future signal creation may
// inadvertently block these packets if the same handle is re-used by HSA.
if(!complete())
{
_barrier_enqueued.rlock([&](auto& barrier_enqueued) {
if(!barrier_enqueued.empty())
{
ROCP_WARNING << "An incomplete hsa_barrier which was enqueued is being destroyed.";
}
});
}
// Clear and destroy the barrier signal
clear_barrier();
_core_api.hsa_signal_destroy_fn(_barrier_signal);
}
void
hsa_barrier::set_barrier(const queue_map_ptr_t& q)
{
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 1);
ROCP_TRACE << "Barrier (handle: " << _barrier_signal.handle << ") is now set.";
_queue_waiting.wlock([&](auto& queue_waiting) {
for(const auto& [_, queue] : q)
{
@@ -64,8 +79,7 @@ hsa_barrier::set_barrier(const queue_map_ptr_t& q)
}
if(queue_waiting.empty())
{
_barried_finished();
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 0);
clear_barrier();
}
});
}
@@ -73,7 +87,7 @@ hsa_barrier::set_barrier(const queue_map_ptr_t& q)
std::optional<rocprofiler_packet>
hsa_barrier::enqueue_packet(const Queue* queue)
{
if(_complete) return std::nullopt;
if(complete()) return std::nullopt;
bool return_block = false;
_barrier_enqueued.wlock([&](auto& barrier_enqueued) {
if(barrier_enqueued.find(queue->get_id().handle) == barrier_enqueued.end())
@@ -88,7 +102,8 @@ hsa_barrier::enqueue_packet(const Queue* queue)
rocprofiler_packet barrier{};
barrier.barrier_and.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
barrier.barrier_and.dep_signal[0] = _barrier_signal;
ROCP_INFO << "Barrier Added: " << _barrier_signal.handle;
ROCP_INFO << "Barrier (handle: " << _barrier_signal.handle
<< ") added to queue (handle: " << queue->get_id().handle << ")";
return barrier;
}
@@ -100,9 +115,7 @@ hsa_barrier::remove_queue(const Queue* queue)
queue_waiting.erase(queue->get_id().handle);
if(queue_waiting.empty())
{
_barried_finished();
_complete = true;
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 0);
clear_barrier();
}
});
}
@@ -120,14 +133,33 @@ hsa_barrier::register_completion(const Queue* queue)
queue_waiting.erase(queue->get_id().handle);
if(queue_waiting.empty())
{
_barried_finished();
// We are done, release the barrier
_complete = true;
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 0);
clear_barrier();
}
}
});
return found;
}
bool
hsa_barrier::complete() const
{
return _core_api.hsa_signal_load_scacquire_fn(_barrier_signal) == 0;
}
void
hsa_barrier::clear_barrier()
{
if(complete())
{
return;
}
if(_barrier_finished)
{
_barrier_finished();
}
_core_api.hsa_signal_store_screlease_fn(_barrier_signal, 0);
ROCP_TRACE << "Barrier (handle: " << _barrier_signal.handle << ") is now cleared.";
}
} // namespace hsa
} // namespace rocprofiler
+18 -5
查看文件
@@ -50,22 +50,35 @@ public:
hsa_barrier(std::function<void()>&& finished, CoreApiTable core_api);
~hsa_barrier();
// If a queue in q currently has active async packets, sets the barrier and saves how many
// packets are currently in each queue to track barrier completion. If there are no queues with
// packets, clears the barrier and marks it as complete.
void set_barrier(const queue_map_ptr_t& q);
// Returns the packets needed to place this barrier in a queue.
// If the barrier has already been enqueued in the queue given or the barrier is complete, no
// packets are returned.
std::optional<rocprofiler_packet> enqueue_packet(const Queue* queue);
bool register_completion(const Queue* queue);
bool complete() const { return _core_api.hsa_signal_load_scacquire_fn(_barrier_signal) == 0; }
// To be called when 1 async packet is finished in the given queue to decrement the remaining
// packets counter. If all packets in all queues given in set_barrier are completed, clears the
// barrier and marks it as complete. Returns true if the given queue had async packets waiting.
bool register_completion(const Queue* queue);
// Removes a queue from the barrier dependency list
// Checks if this barrier is complete
bool complete() const;
// Removes a queue from the barrier dependency list.
// If this is the last queue waiting, clears the barrier and marks it as complete.
void remove_queue(const Queue* queue);
private:
std::function<void()> _barried_finished = {};
std::function<void()> _barrier_finished = {};
CoreApiTable _core_api = {};
common::Synchronized<std::unordered_map<int64_t, int64_t>> _queue_waiting = {};
common::Synchronized<std::unordered_set<int64_t>> _barrier_enqueued = {};
std::atomic<bool> _complete = {false};
void clear_barrier();
// Blocks all queues from executing until the barrier is lifted
hsa_signal_t _barrier_signal = {};
+9 -3
查看文件
@@ -142,10 +142,16 @@ inject_barriers(hsa_barrier& barrier, QueueController::queue_map_t& queues)
for(auto& [hsa_queue, fq] : queues)
{
auto _pkt = barrier.enqueue_packet(fq.get());
ASSERT_EQ(_pkt.has_value(), true);
auto complete = barrier.complete();
auto _pkt = barrier.enqueue_packet(fq.get());
// If barrier is complete, no packets should be generated
ASSERT_NE(complete, _pkt.has_value());
hsa_barrier_and_packet_t* _packets = (hsa_barrier_and_packet_t*) hsa_queue->base_address;
enqueue_pkt(hsa_queue, _packets, _pkt->barrier_and);
if(_pkt.has_value())
{
enqueue_pkt(hsa_queue, _packets, _pkt->barrier_and);
}
// Construct packet that will trigger async handler after barrier is released
rocprofiler_packet post_barrier{};