Limit the number of HSA signals that are active (#140)

* Limit the number of HSA signals that are active

There is currently a hard limit on the number of
signals that HSA allows to be created (before weird stuff
happens, such as hangs or outright crashes in HSA). While
there is some work going on to fix this in HSA/AQL, let's limit the
number we create.

Increased the counter collection example to 200K launches, which
with this change no longer hangs/crashes randomly in HSA.

* source formatting (clang-format v11) (#142)

Co-authored-by: bwelton <bwelton@users.noreply.github.com>

* Up timeout

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: bwelton <bwelton@users.noreply.github.com>

[ROCm/rocprofiler-sdk commit: de685246a7]
Этот коммит содержится в:
Benjamin Welton
2023-10-19 11:18:42 -07:00
коммит произвёл GitHub
родитель 162c77d349
Коммит d05031db89
4 изменённых файлов: 75 добавлений и 2 удалений
+1 -1
Просмотреть файл
@@ -47,7 +47,7 @@ set_tests_properties(
counter-collection
PROPERTIES
TIMEOUT
45
300
LABELS
"samples"
ENVIRONMENT
+1 -1
Просмотреть файл
@@ -40,7 +40,7 @@ kernelC(T* C_d, const T* A_d, size_t N)
void
launchKernals()
{
const int NUM_LAUNCH = 1000;
const int NUM_LAUNCH = 200000;
// Normal HIP Calls
int* gpuMem;
[[maybe_unused]] hipDeviceProp_t devProp;
+54
Просмотреть файл
@@ -25,7 +25,9 @@
#include <sys/syscall.h>
#include <unistd.h>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <mutex>
#include <string>
#include <vector>
@@ -97,5 +99,57 @@ private:
L _destroy_func;
};
/**
 * Limits the number of concurrently active items to `capacity`.
 *
 * Callers reserve slots with add_active() and release them with
 * remove_active(). If a reservation would push the active count past
 * the capacity, add_active() blocks until other callers have released
 * enough slots. All state is guarded by an internal mutex.
 */
class active_capacity_gate
{
public:
    /// @param capacity Maximum number of items that may be active at once.
    explicit active_capacity_gate(size_t capacity)
    : _capacity(capacity)
    {}

    /**
     * Reserve `size` slots, blocking until they are all available.
     *
     * @param size Number of slots to reserve.
     * @throws std::runtime_error if `size` is larger than the capacity,
     *         because such a request could never be satisfied.
     */
    void add_active(size_t size)
    {
        // _capacity never changes after construction, so no lock is needed here.
        // A request of exactly `capacity` is valid: it fits once the gate is empty.
        if(size > _capacity)
        {
            throw std::runtime_error("Size exceeds gate capacity");
        }

        std::unique_lock<std::mutex> lock(_m);
        if(!fits(size))
        {
            // Register as a waiter only while blocked so remove_active()
            // can skip the notification when nobody is waiting.
            ++_waiters;
            _cv.wait(lock, [&]() { return fits(size); });
            --_waiters;
        }
        _count += size;
    }

    /**
     * Release `size` previously reserved slots and wake any blocked callers.
     *
     * @param size Number of slots to release. Releasing more than is
     *        currently active resets the active count to zero.
     */
    void remove_active(size_t size)
    {
        std::lock_guard<std::mutex> lock(_m);
        // Clamp at zero instead of wrapping the unsigned counter.
        _count = (_count > size) ? (_count - size) : 0;
        if(_waiters > 0)
        {
            // Waiters may want different sizes, so let each re-check its predicate.
            _cv.notify_all();
        }
    }

private:
    // True when `size` more slots fit under the capacity. Caller must hold _m.
    // Written as a subtraction so it cannot overflow: _count never exceeds
    // _capacity because it only grows when this check passes.
    bool fits(size_t size) const { return size <= _capacity - _count; }

    size_t                  _count{0};     // slots currently reserved
    const size_t            _capacity{0};  // maximum slots that may be reserved
    size_t                  _waiters{0};   // callers blocked in add_active()
    std::mutex              _m;
    std::condition_variable _cv;
};
} // namespace common
} // namespace rocprofiler
+19
Просмотреть файл
@@ -31,6 +31,15 @@ namespace hsa
{
namespace
{
/// Returns the single shared gate used in this file to throttle HSA signal
/// creation. The gate is constructed on first use.
common::active_capacity_gate&
signal_limiter()
{
    // HSA has a hard limit on how many signals can exist at once, so cap
    // the number we create. The gate is constructed with a capacity of 1024.
    constexpr size_t                    max_active_signals = 1024;
    static common::active_capacity_gate limiter(max_active_signals);
    return limiter;
}
bool
AsyncSignalHandler(hsa_signal_value_t, void* data)
{
@@ -59,15 +68,24 @@ AsyncSignalHandler(hsa_signal_value_t, void* data)
}
});
size_t signals_to_remove = 0;
// Delete signals and packets, signal we have completed.
if(queue_info_session.interrupt_signal.handle != 0u)
{
signals_to_remove++;
queue_info_session.queue.core_api().hsa_signal_destroy_fn(
queue_info_session.interrupt_signal);
}
if(queue_info_session.kernel_pkt.completion_signal.handle != 0u)
{
signals_to_remove++;
queue_info_session.queue.core_api().hsa_signal_destroy_fn(
queue_info_session.kernel_pkt.completion_signal);
}
if(signals_to_remove > 0)
{
signal_limiter().remove_active(signals_to_remove);
}
queue_info_session.queue.async_complete();
delete static_cast<Queue::queue_info_session_t*>(data);
@@ -267,6 +285,7 @@ Queue::signal_async_handler(const hsa_signal_t& signal, Queue::queue_info_sessio
void
Queue::create_signal(uint32_t attribute, hsa_signal_t* signal) const
{
signal_limiter().add_active(1);
hsa_status_t status = _ext_api.hsa_amd_signal_create_fn(1, 0, nullptr, attribute, signal);
LOG_IF(FATAL, status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
<< "Error: hsa_amd_signal_create failed";