Limit the number of HSA signals that are active (#140)
* Limit the number of HSA signals that are active
There is a hard limit currently to the number of
signals that HSA allows to be created (before weird stuff
happens such as hangs or straight up crashes in HSA). While
there is some work going on to fix this in HSA/AQL. Lets limit the
number we create.
Increased the counter colleciton example to 200K launches, which
with this change no longer hangs/crashes randomly in HSA.
* source formatting (clang-format v11) (#142)
Co-authored-by: bwelton <bwelton@users.noreply.github.com>
* Up timout
---------
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: bwelton <bwelton@users.noreply.github.com>
[ROCm/rocprofiler-sdk commit: de685246a7]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
162c77d349
Коммит
d05031db89
@@ -47,7 +47,7 @@ set_tests_properties(
|
||||
counter-collection
|
||||
PROPERTIES
|
||||
TIMEOUT
|
||||
45
|
||||
300
|
||||
LABELS
|
||||
"samples"
|
||||
ENVIRONMENT
|
||||
|
||||
@@ -40,7 +40,7 @@ kernelC(T* C_d, const T* A_d, size_t N)
|
||||
void
|
||||
launchKernals()
|
||||
{
|
||||
const int NUM_LAUNCH = 1000;
|
||||
const int NUM_LAUNCH = 200000;
|
||||
// Normal HIP Calls
|
||||
int* gpuMem;
|
||||
[[maybe_unused]] hipDeviceProp_t devProp;
|
||||
|
||||
@@ -25,7 +25,9 @@
|
||||
#include <sys/syscall.h>
|
||||
#include <unistd.h>
|
||||
#include <chrono>
|
||||
#include <condition_variable>
|
||||
#include <cstdint>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@@ -97,5 +99,57 @@ private:
|
||||
L _destroy_func;
|
||||
};
|
||||
|
||||
/**
|
||||
* Limits the number of active items to those set in capacity.
|
||||
* If capacity is reached, will block until another caller
|
||||
* removes active capacity.
|
||||
*/
|
||||
class active_capacity_gate
|
||||
{
|
||||
public:
|
||||
active_capacity_gate(size_t capacity)
|
||||
: _capacity(capacity)
|
||||
{}
|
||||
void add_active(size_t size)
|
||||
{
|
||||
if(size >= _capacity)
|
||||
{
|
||||
throw std::runtime_error("Size exceeds gate capacity");
|
||||
}
|
||||
|
||||
std::unique_lock lock(_m);
|
||||
if(_count + size < _capacity)
|
||||
{
|
||||
_count += size;
|
||||
return;
|
||||
}
|
||||
_waiters++;
|
||||
_cv.wait(lock, [&]() { return _count + size < _capacity; });
|
||||
_waiters--;
|
||||
_count += size;
|
||||
}
|
||||
|
||||
void remove_active(size_t size)
|
||||
{
|
||||
std::unique_lock lock(_m);
|
||||
if(_count > size)
|
||||
_count -= size;
|
||||
else
|
||||
_count = 0;
|
||||
|
||||
if(_waiters > 0)
|
||||
{
|
||||
_cv.notify_all();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
size_t _count{0};
|
||||
size_t _capacity{0};
|
||||
size_t _waiters{0};
|
||||
std::mutex _m;
|
||||
std::condition_variable _cv;
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -31,6 +31,15 @@ namespace hsa
|
||||
{
|
||||
namespace
|
||||
{
|
||||
common::active_capacity_gate&
|
||||
signal_limiter()
|
||||
{
|
||||
// Limit the maximun number of HSA signals created.
|
||||
// There is a hard limit to the maximum that can exist.
|
||||
static common::active_capacity_gate _gate(1024);
|
||||
return _gate;
|
||||
}
|
||||
|
||||
bool
|
||||
AsyncSignalHandler(hsa_signal_value_t, void* data)
|
||||
{
|
||||
@@ -59,15 +68,24 @@ AsyncSignalHandler(hsa_signal_value_t, void* data)
|
||||
}
|
||||
});
|
||||
|
||||
size_t signals_to_remove = 0;
|
||||
// Delete signals and packets, signal we have completed.
|
||||
if(queue_info_session.interrupt_signal.handle != 0u)
|
||||
{
|
||||
signals_to_remove++;
|
||||
queue_info_session.queue.core_api().hsa_signal_destroy_fn(
|
||||
queue_info_session.interrupt_signal);
|
||||
}
|
||||
if(queue_info_session.kernel_pkt.completion_signal.handle != 0u)
|
||||
{
|
||||
signals_to_remove++;
|
||||
queue_info_session.queue.core_api().hsa_signal_destroy_fn(
|
||||
queue_info_session.kernel_pkt.completion_signal);
|
||||
}
|
||||
if(signals_to_remove > 0)
|
||||
{
|
||||
signal_limiter().remove_active(signals_to_remove);
|
||||
}
|
||||
queue_info_session.queue.async_complete();
|
||||
|
||||
delete static_cast<Queue::queue_info_session_t*>(data);
|
||||
@@ -267,6 +285,7 @@ Queue::signal_async_handler(const hsa_signal_t& signal, Queue::queue_info_sessio
|
||||
void
|
||||
Queue::create_signal(uint32_t attribute, hsa_signal_t* signal) const
|
||||
{
|
||||
signal_limiter().add_active(1);
|
||||
hsa_status_t status = _ext_api.hsa_amd_signal_create_fn(1, 0, nullptr, attribute, signal);
|
||||
LOG_IF(FATAL, status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
|
||||
<< "Error: hsa_amd_signal_create failed";
|
||||
|
||||
Ссылка в новой задаче
Block a user