Limit the number of HSA signals that are active (#140)

* Limit the number of HSA signals that are active

There is currently a hard limit on the number of
signals that HSA allows to be created (beyond it, weird stuff
happens, such as hangs or outright crashes in HSA). While
there is some work going on to fix this in HSA/AQL, let's limit the
number we create.

Increased the counter collection example to 200K launches, which
with this change no longer hangs/crashes randomly in HSA.

* source formatting (clang-format v11) (#142)

Co-authored-by: bwelton <bwelton@users.noreply.github.com>

* Up timeout

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: bwelton <bwelton@users.noreply.github.com>
Dieser Commit ist enthalten in:
Benjamin Welton
2023-10-19 11:18:42 -07:00
committet von GitHub
Ursprung 5819ca589f
Commit de685246a7
4 geänderte Dateien mit 75 neuen und 2 gelöschten Zeilen
+1 -1
Datei anzeigen
@@ -47,7 +47,7 @@ set_tests_properties(
counter-collection
PROPERTIES
TIMEOUT
45
300
LABELS
"samples"
ENVIRONMENT
+1 -1
Datei anzeigen
@@ -40,7 +40,7 @@ kernelC(T* C_d, const T* A_d, size_t N)
void
launchKernals()
{
const int NUM_LAUNCH = 1000;
const int NUM_LAUNCH = 200000;
// Normal HIP Calls
int* gpuMem;
[[maybe_unused]] hipDeviceProp_t devProp;
+54
Datei anzeigen
@@ -25,7 +25,9 @@
#include <sys/syscall.h>
#include <unistd.h>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <mutex>
#include <string>
#include <vector>
@@ -97,5 +99,57 @@ private:
L _destroy_func;
};
/**
 * Blocking gate that bounds how many items may be active at once.
 *
 * Callers reserve slots with add_active() and hand them back with
 * remove_active(). A reservation is admitted only while the running total
 * stays strictly below the capacity given to the constructor; otherwise the
 * caller blocks until remove_active() has released enough slots.
 */
class active_capacity_gate
{
public:
    /// @param capacity bound that the active count must stay strictly below.
    active_capacity_gate(size_t capacity)
    : _capacity(capacity)
    {}

    /**
     * Reserve @p size slots, blocking for as long as they do not fit.
     *
     * @param size number of slots to reserve.
     * @throws std::runtime_error when @p size is greater than or equal to the
     *         capacity, since such a request could never be admitted.
     */
    void add_active(size_t size)
    {
        if(size >= _capacity) throw std::runtime_error("Size exceeds gate capacity");

        std::unique_lock guard(_m);
        const auto       fits = [&]() { return _count + size < _capacity; };
        if(!fits())
        {
            // Register as a waiter so remove_active() knows a notification is needed.
            ++_waiters;
            _cv.wait(guard, fits);
            --_waiters;
        }
        _count += size;
    }

    /**
     * Release @p size slots and wake any blocked add_active() callers.
     *
     * Releasing more than is currently reserved clamps the active count to zero.
     *
     * @param size number of slots to release.
     */
    void remove_active(size_t size)
    {
        std::unique_lock guard(_m);
        _count = (_count > size) ? (_count - size) : 0;
        // Only pay for a notification when somebody is actually blocked.
        if(_waiters != 0) _cv.notify_all();
    }

private:
    size_t                  _count{0};     // slots currently reserved
    size_t                  _capacity{0};  // exclusive upper bound for _count
    size_t                  _waiters{0};   // callers blocked inside add_active()
    std::mutex              _m;            // guards _count and _waiters
    std::condition_variable _cv;           // signalled by remove_active()
};
} // namespace common
} // namespace rocprofiler
+19
Datei anzeigen
@@ -31,6 +31,15 @@ namespace hsa
{
namespace
{
/**
 * Accessor for the gate that bounds how many HSA signals exist at once.
 *
 * HSA has a hard limit on the number of signals that can exist, so signal
 * creation reserves capacity on this gate first. The gate is a
 * function-local static, so every caller receives the same instance.
 */
common::active_capacity_gate&
signal_limiter()
{
    static common::active_capacity_gate gate{1024};
    return gate;
}
bool
AsyncSignalHandler(hsa_signal_value_t, void* data)
{
@@ -59,15 +68,24 @@ AsyncSignalHandler(hsa_signal_value_t, void* data)
}
});
size_t signals_to_remove = 0;
// Delete signals and packets, signal we have completed.
if(queue_info_session.interrupt_signal.handle != 0u)
{
signals_to_remove++;
queue_info_session.queue.core_api().hsa_signal_destroy_fn(
queue_info_session.interrupt_signal);
}
if(queue_info_session.kernel_pkt.completion_signal.handle != 0u)
{
signals_to_remove++;
queue_info_session.queue.core_api().hsa_signal_destroy_fn(
queue_info_session.kernel_pkt.completion_signal);
}
if(signals_to_remove > 0)
{
signal_limiter().remove_active(signals_to_remove);
}
queue_info_session.queue.async_complete();
delete static_cast<Queue::queue_info_session_t*>(data);
@@ -267,6 +285,7 @@ Queue::signal_async_handler(const hsa_signal_t& signal, Queue::queue_info_sessio
void
Queue::create_signal(uint32_t attribute, hsa_signal_t* signal) const
{
signal_limiter().add_active(1);
hsa_status_t status = _ext_api.hsa_amd_signal_create_fn(1, 0, nullptr, attribute, signal);
LOG_IF(FATAL, status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
<< "Error: hsa_amd_signal_create failed";