Improve testing stability (#796)
- Update tests/bin/reproducible-runtime - tweak algorithm for sleeping - Update lib/rocprofiler-sdk/hsa/async_copy - handle egregious skews for async copy times - Update kernel tracing - handle egregious skews for kernel dispatch times - Update lib/rocprofiler-sdk/hsa/code_object - use static object wrappers for code object info - Update lib/rocprofiler-sdk-tool/config.cpp - fix data race in output_keys / get_local_datetime
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
b6c0b50d3e
Коммит
b953774580
@@ -48,20 +48,27 @@ namespace tool
|
||||
{
|
||||
namespace
|
||||
{
|
||||
const auto launch_time = new std::time_t{std::time(nullptr)};
|
||||
const auto env_regexes =
|
||||
std::string*
|
||||
get_local_datetime(const std::string& dt_format);
|
||||
|
||||
const auto* launch_datetime = get_local_datetime(get_env("ROCP_TIME_FORMAT", "%F_%H.%M"));
|
||||
const auto env_regexes =
|
||||
new std::array<std::regex, 2>{std::regex{"(.*)%(env|ENV)\\{([A-Z0-9_]+)\\}%(.*)"},
|
||||
std::regex{"(.*)\\$(env|ENV)\\{([A-Z0-9_]+)\\}(.*)"}};
|
||||
|
||||
std::string
|
||||
get_local_datetime(const char* dt_format, std::time_t* dt_curr)
|
||||
std::string*
|
||||
get_local_datetime(const std::string& dt_format)
|
||||
{
|
||||
char mbstr[512];
|
||||
if(!dt_curr) dt_curr = launch_time;
|
||||
constexpr auto strsize = 512;
|
||||
auto dt_curr = std::time_t{std::time(nullptr)};
|
||||
|
||||
if(std::strftime(mbstr, sizeof(mbstr), dt_format, std::localtime(dt_curr)) != 0)
|
||||
return std::string{mbstr};
|
||||
return std::string{};
|
||||
char mbstr[strsize];
|
||||
memset(mbstr, '\0', sizeof(mbstr) * sizeof(char));
|
||||
|
||||
if(std::strftime(mbstr, sizeof(mbstr) - 1, dt_format.c_str(), std::localtime(&dt_curr)) != 0)
|
||||
return new std::string{mbstr};
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
inline bool
|
||||
@@ -270,9 +277,6 @@ output_keys(std::string _tag)
|
||||
}
|
||||
}
|
||||
|
||||
auto* _launch_time = launch_time;
|
||||
auto _time_format = get_env<std::string>("ROCP_TIME_FORMAT", "%F_%H.%M");
|
||||
|
||||
auto _mpi_size = get_mpi_size();
|
||||
auto _mpi_rank = get_mpi_rank();
|
||||
|
||||
@@ -286,7 +290,6 @@ output_keys(std::string _tag)
|
||||
auto _pwd_string = get_env<std::string>("PWD", ".");
|
||||
auto _slurm_job_id = get_env<std::string>("SLURM_JOB_ID", "0");
|
||||
auto _slurm_proc_id = get_env("SLURM_PROCID", _dmp_rank);
|
||||
auto _launch_string = get_local_datetime(_time_format.c_str(), _launch_time);
|
||||
|
||||
auto _uniq_id = _proc_id;
|
||||
if(get_env<int32_t>("SLURM_PROCID", -1) >= 0)
|
||||
@@ -318,6 +321,8 @@ output_keys(std::string _tag)
|
||||
}
|
||||
}
|
||||
|
||||
auto _launch_time = (launch_datetime) ? *launch_datetime : std::string{".UNKNOWN_LAUNCH_TIME."};
|
||||
|
||||
for(auto&& itr : std::initializer_list<output_key>{
|
||||
{"%pid%", _proc_id, "Process identifier"},
|
||||
{"%ppid%", _parent_id, "Parent process identifier"},
|
||||
@@ -328,7 +333,7 @@ output_keys(std::string _tag)
|
||||
{"%rank%", _slurm_proc_id, "MPI/UPC++ rank"},
|
||||
{"%size%", _dmp_size, "MPI/UPC++ size"},
|
||||
{"%nid%", _uniq_id, "%rank% if possible, otherwise %pid%"},
|
||||
{"%launch_time%", _launch_string, "Data and/or time of run according to time format"},
|
||||
{"%launch_time%", _launch_time, "Data and/or time of run according to time format"},
|
||||
})
|
||||
{
|
||||
_options.emplace_back(itr);
|
||||
|
||||
@@ -175,6 +175,7 @@ struct async_copy_data
|
||||
rocprofiler_agent_id_t src_agent = null_rocp_agent_id;
|
||||
rocprofiler_memory_copy_operation_t direction = ROCPROFILER_MEMORY_COPY_NONE;
|
||||
uint64_t bytes_copied = 0;
|
||||
uint64_t start_ts = 0;
|
||||
context::correlation_id* correlation_id = nullptr;
|
||||
tracing::tracing_data tracing_data = {};
|
||||
|
||||
@@ -331,6 +332,22 @@ convert_hsa_handle(Up _hsa_object)
|
||||
return reinterpret_cast<Tp*>(_hsa_object.handle);
|
||||
}
|
||||
|
||||
hsa_amd_profiling_async_copy_time_t&
|
||||
operator+=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs)
|
||||
{
|
||||
lhs.start += rhs;
|
||||
lhs.end += rhs;
|
||||
return lhs;
|
||||
}
|
||||
|
||||
hsa_amd_profiling_async_copy_time_t&
|
||||
operator*=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs)
|
||||
{
|
||||
lhs.start *= rhs;
|
||||
lhs.end *= rhs;
|
||||
return lhs;
|
||||
}
|
||||
|
||||
bool
|
||||
async_copy_handler(hsa_signal_value_t signal_value, void* arg)
|
||||
{
|
||||
@@ -351,14 +368,23 @@ async_copy_handler(hsa_signal_value_t signal_value, void* arg)
|
||||
return (nanosec / sysclock_hz);
|
||||
}();
|
||||
|
||||
auto ts = common::timestamp_ns();
|
||||
auto* _data = static_cast<async_copy_data*>(arg);
|
||||
auto copy_time = hsa_amd_profiling_async_copy_time_t{};
|
||||
auto copy_time_status = get_amd_ext_table()->hsa_amd_profiling_get_async_copy_time_fn(
|
||||
_data->rocp_signal, ©_time);
|
||||
|
||||
// normalize
|
||||
copy_time.start *= sysclock_period;
|
||||
copy_time.end *= sysclock_period;
|
||||
copy_time *= sysclock_period;
|
||||
|
||||
// below is a hack for clock skew issues:
|
||||
// the timestamp of the function call triggering the copy will always be before when the copy
|
||||
// started
|
||||
if(copy_time.start < _data->start_ts) copy_time += (_data->start_ts - copy_time.start);
|
||||
|
||||
// below is a hack for clock skew issues:
|
||||
// the timestamp of this handler for the copy will always be after when the copy ended
|
||||
if(copy_time.end < ts) copy_time += (ts - copy_time.end);
|
||||
|
||||
// if we encounter this in CI, it will cause test to fail
|
||||
ROCP_CI_LOG_IF(ERROR, copy_time_status == HSA_STATUS_SUCCESS && copy_time.end < copy_time.start)
|
||||
@@ -650,12 +676,13 @@ async_copy_impl(Args... args)
|
||||
|
||||
// if we constructed a correlation id, this decrements the reference count after the underlying
|
||||
// function returns
|
||||
auto _corr_id_dtor = common::scope_destructor{[_corr_id_pop]() {
|
||||
auto _corr_id_dtor = common::scope_destructor{[_corr_id_pop, _data]() {
|
||||
if(_corr_id_pop)
|
||||
{
|
||||
context::pop_latest_correlation_id(_corr_id_pop);
|
||||
_corr_id_pop->sub_ref_count();
|
||||
}
|
||||
_data->start_ts = common::timestamp_ns();
|
||||
}};
|
||||
|
||||
auto thr_id = _data->correlation_id->thread_idx;
|
||||
|
||||
@@ -505,25 +505,27 @@ shutdown(hsa_executable_t executable);
|
||||
|
||||
bool is_shutdown = false;
|
||||
|
||||
auto&
|
||||
auto*
|
||||
get_executables()
|
||||
{
|
||||
static auto _v = common::Synchronized<executable_array_t>{};
|
||||
static auto*& _v = common::static_object<common::Synchronized<executable_array_t>>::construct();
|
||||
return _v;
|
||||
}
|
||||
|
||||
auto&
|
||||
auto*
|
||||
get_code_objects()
|
||||
{
|
||||
static auto _v = common::Synchronized<code_object_array_t>{};
|
||||
static auto*& _v =
|
||||
common::static_object<common::Synchronized<code_object_array_t>>::construct();
|
||||
static auto _dtor = common::scope_destructor{[]() { code_object_shutdown(); }};
|
||||
return _v;
|
||||
}
|
||||
|
||||
auto&
|
||||
auto*
|
||||
get_kernel_object_map()
|
||||
{
|
||||
static auto _v = common::Synchronized<kernel_object_map_t>{};
|
||||
static auto*& _v =
|
||||
common::static_object<common::Synchronized<kernel_object_map_t>>::construct();
|
||||
return _v;
|
||||
}
|
||||
|
||||
@@ -616,12 +618,13 @@ executable_iterate_agent_symbols_load_callback(hsa_executable_t executabl
|
||||
// generate a unique kernel symbol id
|
||||
data.kernel_id = ++get_kernel_symbol_id();
|
||||
|
||||
get_kernel_object_map().wlock(
|
||||
[](kernel_object_map_t& object_map, uint64_t _kern_obj, uint64_t _kern_id) {
|
||||
object_map[_kern_obj] = _kern_id;
|
||||
},
|
||||
data.kernel_object,
|
||||
data.kernel_id);
|
||||
CHECK_NOTNULL(get_kernel_object_map())
|
||||
->wlock(
|
||||
[](kernel_object_map_t& object_map, uint64_t _kern_obj, uint64_t _kern_id) {
|
||||
object_map[_kern_obj] = _kern_id;
|
||||
},
|
||||
data.kernel_object,
|
||||
data.kernel_id);
|
||||
|
||||
code_obj_v->symbols.emplace_back(std::make_unique<kernel_symbol>(std::move(symbol_v)));
|
||||
|
||||
@@ -809,11 +812,11 @@ code_object_unload_callback(hsa_executable_t executable,
|
||||
|
||||
CHECK_NOTNULL(code_obj_arr);
|
||||
|
||||
// auto _size = get_code_objects().rlock([](const auto& data) { return data.size(); });
|
||||
// ROCP_INFO << "[inp] executable=" << executable.handle
|
||||
// auto _size = CHECK_NOTNULL(get_code_objects())->rlock([](const auto& data) { return
|
||||
// data.size(); }); ROCP_INFO << "[inp] executable=" << executable.handle
|
||||
// << ", code_object=" << loaded_code_object.handle << " vs. " << _size;
|
||||
|
||||
get_code_objects().rlock([&](const code_object_array_t& arr) {
|
||||
CHECK_NOTNULL(get_code_objects())->rlock([&](const code_object_array_t& arr) {
|
||||
for(const auto& itr : arr)
|
||||
{
|
||||
// ROCP_INFO << "[cmp] executable=" << itr->hsa_executable.handle
|
||||
@@ -856,11 +859,12 @@ executable_freeze(hsa_executable_t executable, const char* options)
|
||||
|
||||
ROCP_INFO << "running " << __FUNCTION__ << " (executable=" << executable.handle << ")...";
|
||||
|
||||
get_executables().wlock(
|
||||
[executable](executable_array_t& data) { data.emplace_back(executable); });
|
||||
CHECK_NOTNULL(get_executables())->wlock([executable](executable_array_t& data) {
|
||||
data.emplace_back(executable);
|
||||
});
|
||||
|
||||
auto& code_obj_vec = get_code_objects();
|
||||
code_obj_vec.wlock([executable](code_object_array_t& _vec) {
|
||||
auto* code_obj_vec = get_code_objects();
|
||||
CHECK_NOTNULL(code_obj_vec)->wlock([executable](code_object_array_t& _vec) {
|
||||
hsa::get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
|
||||
executable, code_object_load_callback, &_vec);
|
||||
});
|
||||
@@ -881,7 +885,7 @@ executable_freeze(hsa_executable_t executable, const char* options)
|
||||
|
||||
if(!ctxs.empty())
|
||||
{
|
||||
code_obj_vec.rlock([](const code_object_array_t& data) {
|
||||
code_obj_vec->rlock([](const code_object_array_t& data) {
|
||||
auto tidx = common::get_tid();
|
||||
// set the contexts for each code object
|
||||
for(const auto& ditr : data)
|
||||
@@ -960,34 +964,43 @@ executable_destroy(hsa_executable_t executable)
|
||||
|
||||
auto _unloaded = shutdown(executable);
|
||||
|
||||
get_kernel_object_map().wlock([_unloaded](kernel_object_map_t& data) {
|
||||
for(const auto& uitr : _unloaded)
|
||||
{
|
||||
for(const auto& sitr : uitr.symbols)
|
||||
if(get_kernel_object_map())
|
||||
{
|
||||
CHECK_NOTNULL(get_kernel_object_map())->wlock([_unloaded](kernel_object_map_t& data) {
|
||||
for(const auto& uitr : _unloaded)
|
||||
{
|
||||
data.erase(sitr->rocp_data.kernel_id);
|
||||
for(const auto& sitr : uitr.symbols)
|
||||
{
|
||||
data.erase(sitr->rocp_data.kernel_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
get_code_objects().wlock([executable](code_object_array_t& data) {
|
||||
for(auto& itr : data)
|
||||
{
|
||||
if(itr->hsa_executable.handle == executable.handle) itr.reset();
|
||||
}
|
||||
data.erase(
|
||||
std::remove_if(data.begin(), data.end(), [](auto& itr) { return (itr == nullptr); }),
|
||||
data.end());
|
||||
});
|
||||
if(get_code_objects())
|
||||
{
|
||||
CHECK_NOTNULL(get_code_objects())->wlock([executable](code_object_array_t& data) {
|
||||
for(auto& itr : data)
|
||||
{
|
||||
if(itr->hsa_executable.handle == executable.handle) itr.reset();
|
||||
}
|
||||
data.erase(std::remove_if(
|
||||
data.begin(), data.end(), [](auto& itr) { return (itr == nullptr); }),
|
||||
data.end());
|
||||
});
|
||||
}
|
||||
|
||||
get_executables().wlock([executable](executable_array_t& data) {
|
||||
data.erase(std::remove_if(data.begin(),
|
||||
data.end(),
|
||||
[executable](hsa_executable_t itr) {
|
||||
return (itr.handle == executable.handle);
|
||||
}),
|
||||
data.end());
|
||||
});
|
||||
if(get_executables())
|
||||
{
|
||||
CHECK_NOTNULL(get_executables())->wlock([executable](executable_array_t& data) {
|
||||
data.erase(std::remove_if(data.begin(),
|
||||
data.end(),
|
||||
[executable](hsa_executable_t itr) {
|
||||
return (itr.handle == executable.handle);
|
||||
}),
|
||||
data.end());
|
||||
});
|
||||
}
|
||||
|
||||
return CHECK_NOTNULL(get_destroy_function())(executable);
|
||||
}
|
||||
@@ -1100,27 +1113,28 @@ code_object_init(HsaApiTable* table)
|
||||
uint64_t
|
||||
get_kernel_id(uint64_t kernel_object)
|
||||
{
|
||||
return get_kernel_object_map().rlock(
|
||||
[](const kernel_object_map_t& object_map, uint64_t _kern_obj) -> uint64_t {
|
||||
auto itr = object_map.find(_kern_obj);
|
||||
return (itr == object_map.end()) ? 0 : itr->second;
|
||||
},
|
||||
kernel_object);
|
||||
return CHECK_NOTNULL(get_kernel_object_map())
|
||||
->rlock(
|
||||
[](const kernel_object_map_t& object_map, uint64_t _kern_obj) -> uint64_t {
|
||||
auto itr = object_map.find(_kern_obj);
|
||||
return (itr == object_map.end()) ? 0 : itr->second;
|
||||
},
|
||||
kernel_object);
|
||||
}
|
||||
|
||||
void
|
||||
code_object_shutdown()
|
||||
{
|
||||
if(is_shutdown) return;
|
||||
if(is_shutdown || !get_executables() || !get_code_objects()) return;
|
||||
|
||||
get_executables().rlock([](const executable_array_t& edata) {
|
||||
CHECK_NOTNULL(get_executables())->rlock([](const executable_array_t& edata) {
|
||||
auto tmp = edata;
|
||||
std::reverse(tmp.begin(), tmp.end());
|
||||
for(auto itr : tmp)
|
||||
shutdown(itr);
|
||||
});
|
||||
|
||||
get_code_objects().wlock([](code_object_array_t& data) { data.clear(); });
|
||||
CHECK_NOTNULL(get_code_objects())->wlock([](code_object_array_t& data) { data.clear(); });
|
||||
|
||||
is_shutdown = true;
|
||||
}
|
||||
|
||||
@@ -416,6 +416,7 @@ WriteInterceptor(const void* packets,
|
||||
.inst_pkt = std::move(inst_pkt),
|
||||
.interrupt_signal = interrupt_signal,
|
||||
.tid = thr_id,
|
||||
.enqueue_ts = common::timestamp_ns(),
|
||||
.user_data = user_data,
|
||||
.correlation_id = corr_id,
|
||||
.kernel_pkt = kernel_pkt,
|
||||
|
||||
@@ -56,6 +56,7 @@ struct queue_info_session
|
||||
inst_pkt_t inst_pkt = {};
|
||||
hsa_signal_t interrupt_signal = {};
|
||||
rocprofiler_thread_id_t tid = common::get_tid();
|
||||
rocprofiler_timestamp_t enqueue_ts = 0;
|
||||
rocprofiler_user_data_t user_data = {.value = 0};
|
||||
context::correlation_id* correlation_id = nullptr;
|
||||
rocprofiler_packet kernel_pkt = {};
|
||||
|
||||
@@ -43,6 +43,15 @@
|
||||
# define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(NON_CI_LEVEL)
|
||||
#endif
|
||||
|
||||
#define ROCP_HSA_TABLE_CALL(SEVERITY, EXPR) \
|
||||
auto ROCPROFILER_VARIABLE(rocp_hsa_table_call_, __LINE__) = (EXPR); \
|
||||
LOG_IF(SEVERITY, ROCPROFILER_VARIABLE(rocp_hsa_table_call_, __LINE__) != HSA_STATUS_SUCCESS) \
|
||||
<< #EXPR << " returned non-zero status code " \
|
||||
<< ROCPROFILER_VARIABLE(rocp_hsa_table_call_, __LINE__) << " :: " \
|
||||
<< ::rocprofiler::hsa::get_hsa_status_string( \
|
||||
ROCPROFILER_VARIABLE(rocp_hsa_table_call_, __LINE__)) \
|
||||
<< " "
|
||||
|
||||
namespace rocprofiler
|
||||
{
|
||||
namespace kernel_dispatch
|
||||
@@ -51,11 +60,38 @@ namespace
|
||||
{
|
||||
using queue_info_session_t = hsa::queue_info_session;
|
||||
using kernel_dispatch_record_t = rocprofiler_buffer_tracing_kernel_dispatch_record_t;
|
||||
|
||||
hsa_amd_profiling_dispatch_time_t&
|
||||
operator+=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs)
|
||||
{
|
||||
lhs.start += rhs;
|
||||
lhs.end += rhs;
|
||||
return lhs;
|
||||
}
|
||||
|
||||
hsa_amd_profiling_dispatch_time_t&
|
||||
operator*=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs)
|
||||
{
|
||||
lhs.start *= rhs;
|
||||
lhs.end *= rhs;
|
||||
return lhs;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
dispatch_complete(queue_info_session_t& session)
|
||||
{
|
||||
auto ts = common::timestamp_ns();
|
||||
|
||||
static auto sysclock_period = []() -> uint64_t {
|
||||
constexpr auto nanosec = 1000000000UL;
|
||||
uint64_t sysclock_hz = 0;
|
||||
ROCP_HSA_TABLE_CALL(ERROR,
|
||||
hsa::get_core_table()->hsa_system_get_info_fn(
|
||||
HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz));
|
||||
return (nanosec / sysclock_hz);
|
||||
}();
|
||||
|
||||
// get the contexts that were active when the signal was created
|
||||
auto& tracing_data_v = session.tracing_data;
|
||||
if(tracing_data_v.callback_contexts.empty() && tracing_data_v.buffered_contexts.empty()) return;
|
||||
@@ -80,6 +116,20 @@ dispatch_complete(queue_info_session_t& session)
|
||||
|
||||
if(dispatch_time_status == HSA_STATUS_SUCCESS)
|
||||
{
|
||||
// normalize
|
||||
dispatch_time *= sysclock_period;
|
||||
|
||||
// below is a hack for clock skew issues:
|
||||
// the timestamp of the packet rewriter for the kernel packet will always be before when the
|
||||
// kernel started
|
||||
if(dispatch_time.start < session.enqueue_ts)
|
||||
dispatch_time += (session.enqueue_ts - dispatch_time.start);
|
||||
|
||||
// below is a hack for clock skew issues:
|
||||
// the timestamp of this handler for the kernel dispatch will always be after when the
|
||||
// kernel completed
|
||||
if(dispatch_time.end < ts) dispatch_time += (ts - dispatch_time.end);
|
||||
|
||||
callback_record.start_timestamp = dispatch_time.start;
|
||||
callback_record.end_timestamp = dispatch_time.end;
|
||||
}
|
||||
|
||||
@@ -58,7 +58,7 @@ namespace
|
||||
using auto_lock_t = std::unique_lock<std::mutex>;
|
||||
auto print_lock = std::mutex{};
|
||||
double nruntime = 500.0; // ms
|
||||
uint32_t nspin = 1000000;
|
||||
uint32_t nspin = 256 * 10000;
|
||||
size_t nthreads = 2;
|
||||
|
||||
void
|
||||
@@ -81,17 +81,17 @@ main(int argc, char** argv)
|
||||
{
|
||||
fprintf(stderr,
|
||||
"usage: reproducible-runtime [KERNEL RUNTIME PER THREAD (default: %f msec)] "
|
||||
"[SPIN CYCLES PER KERNEL LAUNCH (default: %u)] [NUM_THREADS (default: %zu)]\n",
|
||||
"[NUM_THREADS (default: %zu)] [SPIN CYCLES PER KERNEL LAUNCH (default: %u)]\n",
|
||||
nruntime,
|
||||
nspin,
|
||||
nthreads);
|
||||
nthreads,
|
||||
nspin);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
if(argc > 1) nruntime = std::stod(argv[1]);
|
||||
if(argc > 2) nspin = std::stoll(argv[2]);
|
||||
if(argc > 3) nthreads = std::stoll(argv[3]);
|
||||
if(argc > 2) nthreads = std::stoll(argv[2]);
|
||||
if(argc > 3) nspin = std::stoll(argv[3]);
|
||||
|
||||
printf("[reproducible-runtime] Kernel runtime per thread: %.3f msec\n", nruntime);
|
||||
printf("[reproducible-runtime] Spin time per kernel: %u cycles\n", nspin);
|
||||
@@ -115,11 +115,11 @@ main(int argc, char** argv)
|
||||
__global__ void
|
||||
reproducible_runtime(uint32_t nspin_v)
|
||||
{
|
||||
for(uint32_t i = 0; i < nspin_v / 2048; i++)
|
||||
asm volatile("s_sleep 32"); // ~2048 cycles -> ~1us
|
||||
uint32_t remainder = nspin_v % 2048;
|
||||
for(uint32_t i = 0; i < remainder / 64; i++)
|
||||
for(uint32_t i = 0; i < nspin_v / 64; i++)
|
||||
asm volatile("s_sleep 1");
|
||||
if(nspin_v > 64)
|
||||
for(uint32_t i = 0; i < nspin_v % 64; i++)
|
||||
asm volatile("s_sleep 1");
|
||||
}
|
||||
|
||||
void
|
||||
@@ -130,10 +130,11 @@ run(int tid, int devid)
|
||||
constexpr int min_avail_simd = 128;
|
||||
dim3 grid(min_avail_simd);
|
||||
dim3 block(32);
|
||||
double time = 0.0;
|
||||
hipStream_t stream = {};
|
||||
hipEvent_t start = {};
|
||||
hipEvent_t stop = {};
|
||||
double time = 0.0;
|
||||
hipStream_t stream = {};
|
||||
hipEvent_t start = {};
|
||||
hipEvent_t stop = {};
|
||||
uint64_t nlaunch = 0;
|
||||
|
||||
HIP_API_CALL(hipSetDevice(devid));
|
||||
HIP_API_CALL(hipStreamCreate(&stream));
|
||||
@@ -152,6 +153,7 @@ run(int tid, int devid)
|
||||
float elapsed = 0.0f;
|
||||
HIP_API_CALL(hipEventElapsedTime(&elapsed, start, stop));
|
||||
time += static_cast<double>(elapsed);
|
||||
++nlaunch;
|
||||
} while(time < nruntime);
|
||||
|
||||
HIP_API_CALL(hipStreamSynchronize(stream));
|
||||
@@ -162,7 +164,7 @@ run(int tid, int devid)
|
||||
auto _msg = std::stringstream{};
|
||||
_msg << '[' << getpid() << "][" << tid << "] Runtime of reproducible-runtime is "
|
||||
<< std::setprecision(2) << std::fixed << time << " ms (" << std::setprecision(3)
|
||||
<< (time / 1000.0f) << " sec)\n";
|
||||
<< (time / 1000.0f) << " sec). Kernels dispatched: " << nlaunch << "\n";
|
||||
auto_lock_t _lk{print_lock};
|
||||
std::cout << _msg.str() << std::flush;
|
||||
}
|
||||
@@ -171,6 +173,15 @@ run(int tid, int devid)
|
||||
HIP_API_CALL(hipStreamDestroy(stream));
|
||||
|
||||
roctxRangeStop(roctx_range_id);
|
||||
|
||||
constexpr auto scale = 1.1;
|
||||
if(time > scale * nruntime)
|
||||
{
|
||||
auto _msg = std::stringstream{};
|
||||
_msg << "total kernel runtime exceeded (" << scale << " * " << nruntime << " = "
|
||||
<< (scale * nruntime) << ") :: " << time << " ms";
|
||||
throw std::runtime_error{_msg.str()};
|
||||
}
|
||||
}
|
||||
|
||||
namespace
|
||||
|
||||
Ссылка в новой задаче
Block a user