Improve testing stability (#796)

- Update tests/bin/reproducible-runtime
  - tweak algorithm for sleeping
- Update lib/rocprofiler-sdk/hsa/async_copy
  - handle egregious skews for async copy times
- Update kernel tracing
  - handle egregious skews for kernel dispatch times
- Update lib/rocprofiler-sdk/hsa/code_object
  - use static object wrappers for code object info
- Update lib/rocprofiler-sdk-tool/config.cpp
  - fix data race in output_keys / get_local_datetime

[ROCm/rocprofiler-sdk commit: b953774580]
This commit is contained in:
Jonathan R. Madsen
2024-04-18 11:42:51 -05:00
کامیت شده توسط GitHub
والد 37fa6d25d3
کامیت acfc8a9999
7فایلهای تغییر یافته به همراه194 افزوده شده و 85 حذف شده
@@ -48,20 +48,27 @@ namespace tool
{
namespace
{
const auto launch_time = new std::time_t{std::time(nullptr)};
const auto env_regexes =
std::string*
get_local_datetime(const std::string& dt_format);
const auto* launch_datetime = get_local_datetime(get_env("ROCP_TIME_FORMAT", "%F_%H.%M"));
const auto env_regexes =
new std::array<std::regex, 2>{std::regex{"(.*)%(env|ENV)\\{([A-Z0-9_]+)\\}%(.*)"},
std::regex{"(.*)\\$(env|ENV)\\{([A-Z0-9_]+)\\}(.*)"}};
std::string
get_local_datetime(const char* dt_format, std::time_t* dt_curr)
std::string*
get_local_datetime(const std::string& dt_format)
{
char mbstr[512];
if(!dt_curr) dt_curr = launch_time;
constexpr auto strsize = 512;
auto dt_curr = std::time_t{std::time(nullptr)};
if(std::strftime(mbstr, sizeof(mbstr), dt_format, std::localtime(dt_curr)) != 0)
return std::string{mbstr};
return std::string{};
char mbstr[strsize];
memset(mbstr, '\0', sizeof(mbstr) * sizeof(char));
if(std::strftime(mbstr, sizeof(mbstr) - 1, dt_format.c_str(), std::localtime(&dt_curr)) != 0)
return new std::string{mbstr};
return nullptr;
}
inline bool
@@ -270,9 +277,6 @@ output_keys(std::string _tag)
}
}
auto* _launch_time = launch_time;
auto _time_format = get_env<std::string>("ROCP_TIME_FORMAT", "%F_%H.%M");
auto _mpi_size = get_mpi_size();
auto _mpi_rank = get_mpi_rank();
@@ -286,7 +290,6 @@ output_keys(std::string _tag)
auto _pwd_string = get_env<std::string>("PWD", ".");
auto _slurm_job_id = get_env<std::string>("SLURM_JOB_ID", "0");
auto _slurm_proc_id = get_env("SLURM_PROCID", _dmp_rank);
auto _launch_string = get_local_datetime(_time_format.c_str(), _launch_time);
auto _uniq_id = _proc_id;
if(get_env<int32_t>("SLURM_PROCID", -1) >= 0)
@@ -318,6 +321,8 @@ output_keys(std::string _tag)
}
}
auto _launch_time = (launch_datetime) ? *launch_datetime : std::string{".UNKNOWN_LAUNCH_TIME."};
for(auto&& itr : std::initializer_list<output_key>{
{"%pid%", _proc_id, "Process identifier"},
{"%ppid%", _parent_id, "Parent process identifier"},
@@ -328,7 +333,7 @@ output_keys(std::string _tag)
{"%rank%", _slurm_proc_id, "MPI/UPC++ rank"},
{"%size%", _dmp_size, "MPI/UPC++ size"},
{"%nid%", _uniq_id, "%rank% if possible, otherwise %pid%"},
{"%launch_time%", _launch_string, "Data and/or time of run according to time format"},
{"%launch_time%", _launch_time, "Data and/or time of run according to time format"},
})
{
_options.emplace_back(itr);
@@ -175,6 +175,7 @@ struct async_copy_data
rocprofiler_agent_id_t src_agent = null_rocp_agent_id;
rocprofiler_memory_copy_operation_t direction = ROCPROFILER_MEMORY_COPY_NONE;
uint64_t bytes_copied = 0;
uint64_t start_ts = 0;
context::correlation_id* correlation_id = nullptr;
tracing::tracing_data tracing_data = {};
@@ -331,6 +332,22 @@ convert_hsa_handle(Up _hsa_object)
return reinterpret_cast<Tp*>(_hsa_object.handle);
}
hsa_amd_profiling_async_copy_time_t&
operator+=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs)
{
lhs.start += rhs;
lhs.end += rhs;
return lhs;
}
hsa_amd_profiling_async_copy_time_t&
operator*=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs)
{
lhs.start *= rhs;
lhs.end *= rhs;
return lhs;
}
bool
async_copy_handler(hsa_signal_value_t signal_value, void* arg)
{
@@ -351,14 +368,23 @@ async_copy_handler(hsa_signal_value_t signal_value, void* arg)
return (nanosec / sysclock_hz);
}();
auto ts = common::timestamp_ns();
auto* _data = static_cast<async_copy_data*>(arg);
auto copy_time = hsa_amd_profiling_async_copy_time_t{};
auto copy_time_status = get_amd_ext_table()->hsa_amd_profiling_get_async_copy_time_fn(
_data->rocp_signal, &copy_time);
// normalize
copy_time.start *= sysclock_period;
copy_time.end *= sysclock_period;
copy_time *= sysclock_period;
// below is a hack for clock skew issues:
// the timestamp of the function call triggering the copy will always be before when the copy
// started
if(copy_time.start < _data->start_ts) copy_time += (_data->start_ts - copy_time.start);
// below is a hack for clock skew issues:
// the timestamp of this handler for the copy will always be after when the copy ended
if(copy_time.end < ts) copy_time += (ts - copy_time.end);
// if we encounter this in CI, it will cause test to fail
ROCP_CI_LOG_IF(ERROR, copy_time_status == HSA_STATUS_SUCCESS && copy_time.end < copy_time.start)
@@ -650,12 +676,13 @@ async_copy_impl(Args... args)
// if we constructed a correlation id, this decrements the reference count after the underlying
// function returns
auto _corr_id_dtor = common::scope_destructor{[_corr_id_pop]() {
auto _corr_id_dtor = common::scope_destructor{[_corr_id_pop, _data]() {
if(_corr_id_pop)
{
context::pop_latest_correlation_id(_corr_id_pop);
_corr_id_pop->sub_ref_count();
}
_data->start_ts = common::timestamp_ns();
}};
auto thr_id = _data->correlation_id->thread_idx;
@@ -505,25 +505,27 @@ shutdown(hsa_executable_t executable);
bool is_shutdown = false;
auto&
auto*
get_executables()
{
static auto _v = common::Synchronized<executable_array_t>{};
static auto*& _v = common::static_object<common::Synchronized<executable_array_t>>::construct();
return _v;
}
auto&
auto*
get_code_objects()
{
static auto _v = common::Synchronized<code_object_array_t>{};
static auto*& _v =
common::static_object<common::Synchronized<code_object_array_t>>::construct();
static auto _dtor = common::scope_destructor{[]() { code_object_shutdown(); }};
return _v;
}
auto&
auto*
get_kernel_object_map()
{
static auto _v = common::Synchronized<kernel_object_map_t>{};
static auto*& _v =
common::static_object<common::Synchronized<kernel_object_map_t>>::construct();
return _v;
}
@@ -616,12 +618,13 @@ executable_iterate_agent_symbols_load_callback(hsa_executable_t executabl
// generate a unique kernel symbol id
data.kernel_id = ++get_kernel_symbol_id();
get_kernel_object_map().wlock(
[](kernel_object_map_t& object_map, uint64_t _kern_obj, uint64_t _kern_id) {
object_map[_kern_obj] = _kern_id;
},
data.kernel_object,
data.kernel_id);
CHECK_NOTNULL(get_kernel_object_map())
->wlock(
[](kernel_object_map_t& object_map, uint64_t _kern_obj, uint64_t _kern_id) {
object_map[_kern_obj] = _kern_id;
},
data.kernel_object,
data.kernel_id);
code_obj_v->symbols.emplace_back(std::make_unique<kernel_symbol>(std::move(symbol_v)));
@@ -809,11 +812,11 @@ code_object_unload_callback(hsa_executable_t executable,
CHECK_NOTNULL(code_obj_arr);
// auto _size = get_code_objects().rlock([](const auto& data) { return data.size(); });
// ROCP_INFO << "[inp] executable=" << executable.handle
// auto _size = CHECK_NOTNULL(get_code_objects())->rlock([](const auto& data) { return
// data.size(); }); ROCP_INFO << "[inp] executable=" << executable.handle
// << ", code_object=" << loaded_code_object.handle << " vs. " << _size;
get_code_objects().rlock([&](const code_object_array_t& arr) {
CHECK_NOTNULL(get_code_objects())->rlock([&](const code_object_array_t& arr) {
for(const auto& itr : arr)
{
// ROCP_INFO << "[cmp] executable=" << itr->hsa_executable.handle
@@ -856,11 +859,12 @@ executable_freeze(hsa_executable_t executable, const char* options)
ROCP_INFO << "running " << __FUNCTION__ << " (executable=" << executable.handle << ")...";
get_executables().wlock(
[executable](executable_array_t& data) { data.emplace_back(executable); });
CHECK_NOTNULL(get_executables())->wlock([executable](executable_array_t& data) {
data.emplace_back(executable);
});
auto& code_obj_vec = get_code_objects();
code_obj_vec.wlock([executable](code_object_array_t& _vec) {
auto* code_obj_vec = get_code_objects();
CHECK_NOTNULL(code_obj_vec)->wlock([executable](code_object_array_t& _vec) {
hsa::get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
executable, code_object_load_callback, &_vec);
});
@@ -881,7 +885,7 @@ executable_freeze(hsa_executable_t executable, const char* options)
if(!ctxs.empty())
{
code_obj_vec.rlock([](const code_object_array_t& data) {
code_obj_vec->rlock([](const code_object_array_t& data) {
auto tidx = common::get_tid();
// set the contexts for each code object
for(const auto& ditr : data)
@@ -960,34 +964,43 @@ executable_destroy(hsa_executable_t executable)
auto _unloaded = shutdown(executable);
get_kernel_object_map().wlock([_unloaded](kernel_object_map_t& data) {
for(const auto& uitr : _unloaded)
{
for(const auto& sitr : uitr.symbols)
if(get_kernel_object_map())
{
CHECK_NOTNULL(get_kernel_object_map())->wlock([_unloaded](kernel_object_map_t& data) {
for(const auto& uitr : _unloaded)
{
data.erase(sitr->rocp_data.kernel_id);
for(const auto& sitr : uitr.symbols)
{
data.erase(sitr->rocp_data.kernel_id);
}
}
}
});
});
}
get_code_objects().wlock([executable](code_object_array_t& data) {
for(auto& itr : data)
{
if(itr->hsa_executable.handle == executable.handle) itr.reset();
}
data.erase(
std::remove_if(data.begin(), data.end(), [](auto& itr) { return (itr == nullptr); }),
data.end());
});
if(get_code_objects())
{
CHECK_NOTNULL(get_code_objects())->wlock([executable](code_object_array_t& data) {
for(auto& itr : data)
{
if(itr->hsa_executable.handle == executable.handle) itr.reset();
}
data.erase(std::remove_if(
data.begin(), data.end(), [](auto& itr) { return (itr == nullptr); }),
data.end());
});
}
get_executables().wlock([executable](executable_array_t& data) {
data.erase(std::remove_if(data.begin(),
data.end(),
[executable](hsa_executable_t itr) {
return (itr.handle == executable.handle);
}),
data.end());
});
if(get_executables())
{
CHECK_NOTNULL(get_executables())->wlock([executable](executable_array_t& data) {
data.erase(std::remove_if(data.begin(),
data.end(),
[executable](hsa_executable_t itr) {
return (itr.handle == executable.handle);
}),
data.end());
});
}
return CHECK_NOTNULL(get_destroy_function())(executable);
}
@@ -1100,27 +1113,28 @@ code_object_init(HsaApiTable* table)
uint64_t
get_kernel_id(uint64_t kernel_object)
{
return get_kernel_object_map().rlock(
[](const kernel_object_map_t& object_map, uint64_t _kern_obj) -> uint64_t {
auto itr = object_map.find(_kern_obj);
return (itr == object_map.end()) ? 0 : itr->second;
},
kernel_object);
return CHECK_NOTNULL(get_kernel_object_map())
->rlock(
[](const kernel_object_map_t& object_map, uint64_t _kern_obj) -> uint64_t {
auto itr = object_map.find(_kern_obj);
return (itr == object_map.end()) ? 0 : itr->second;
},
kernel_object);
}
void
code_object_shutdown()
{
if(is_shutdown) return;
if(is_shutdown || !get_executables() || !get_code_objects()) return;
get_executables().rlock([](const executable_array_t& edata) {
CHECK_NOTNULL(get_executables())->rlock([](const executable_array_t& edata) {
auto tmp = edata;
std::reverse(tmp.begin(), tmp.end());
for(auto itr : tmp)
shutdown(itr);
});
get_code_objects().wlock([](code_object_array_t& data) { data.clear(); });
CHECK_NOTNULL(get_code_objects())->wlock([](code_object_array_t& data) { data.clear(); });
is_shutdown = true;
}
@@ -416,6 +416,7 @@ WriteInterceptor(const void* packets,
.inst_pkt = std::move(inst_pkt),
.interrupt_signal = interrupt_signal,
.tid = thr_id,
.enqueue_ts = common::timestamp_ns(),
.user_data = user_data,
.correlation_id = corr_id,
.kernel_pkt = kernel_pkt,
@@ -56,6 +56,7 @@ struct queue_info_session
inst_pkt_t inst_pkt = {};
hsa_signal_t interrupt_signal = {};
rocprofiler_thread_id_t tid = common::get_tid();
rocprofiler_timestamp_t enqueue_ts = 0;
rocprofiler_user_data_t user_data = {.value = 0};
context::correlation_id* correlation_id = nullptr;
rocprofiler_packet kernel_pkt = {};
@@ -43,6 +43,15 @@
# define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(NON_CI_LEVEL)
#endif
#define ROCP_HSA_TABLE_CALL(SEVERITY, EXPR) \
auto ROCPROFILER_VARIABLE(rocp_hsa_table_call_, __LINE__) = (EXPR); \
LOG_IF(SEVERITY, ROCPROFILER_VARIABLE(rocp_hsa_table_call_, __LINE__) != HSA_STATUS_SUCCESS) \
<< #EXPR << " returned non-zero status code " \
<< ROCPROFILER_VARIABLE(rocp_hsa_table_call_, __LINE__) << " :: " \
<< ::rocprofiler::hsa::get_hsa_status_string( \
ROCPROFILER_VARIABLE(rocp_hsa_table_call_, __LINE__)) \
<< " "
namespace rocprofiler
{
namespace kernel_dispatch
@@ -51,11 +60,38 @@ namespace
{
using queue_info_session_t = hsa::queue_info_session;
using kernel_dispatch_record_t = rocprofiler_buffer_tracing_kernel_dispatch_record_t;
hsa_amd_profiling_dispatch_time_t&
operator+=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs)
{
lhs.start += rhs;
lhs.end += rhs;
return lhs;
}
hsa_amd_profiling_dispatch_time_t&
operator*=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs)
{
lhs.start *= rhs;
lhs.end *= rhs;
return lhs;
}
} // namespace
void
dispatch_complete(queue_info_session_t& session)
{
auto ts = common::timestamp_ns();
static auto sysclock_period = []() -> uint64_t {
constexpr auto nanosec = 1000000000UL;
uint64_t sysclock_hz = 0;
ROCP_HSA_TABLE_CALL(ERROR,
hsa::get_core_table()->hsa_system_get_info_fn(
HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz));
return (nanosec / sysclock_hz);
}();
// get the contexts that were active when the signal was created
auto& tracing_data_v = session.tracing_data;
if(tracing_data_v.callback_contexts.empty() && tracing_data_v.buffered_contexts.empty()) return;
@@ -80,6 +116,20 @@ dispatch_complete(queue_info_session_t& session)
if(dispatch_time_status == HSA_STATUS_SUCCESS)
{
// normalize
dispatch_time *= sysclock_period;
// below is a hack for clock skew issues:
// the timestamp of the packet rewriter for the kernel packet will always be before when the
// kernel started
if(dispatch_time.start < session.enqueue_ts)
dispatch_time += (session.enqueue_ts - dispatch_time.start);
// below is a hack for clock skew issues:
// the timestamp of this handler for the kernel dispatch will always be after when the
// kernel completed
if(dispatch_time.end < ts) dispatch_time += (ts - dispatch_time.end);
callback_record.start_timestamp = dispatch_time.start;
callback_record.end_timestamp = dispatch_time.end;
}
@@ -58,7 +58,7 @@ namespace
using auto_lock_t = std::unique_lock<std::mutex>;
auto print_lock = std::mutex{};
double nruntime = 500.0; // ms
uint32_t nspin = 1000000;
uint32_t nspin = 256 * 10000;
size_t nthreads = 2;
void
@@ -81,17 +81,17 @@ main(int argc, char** argv)
{
fprintf(stderr,
"usage: reproducible-runtime [KERNEL RUNTIME PER THREAD (default: %f msec)] "
"[SPIN CYCLES PER KERNEL LAUNCH (default: %u)] [NUM_THREADS (default: %zu)]\n",
"[NUM_THREADS (default: %zu)] [SPIN CYCLES PER KERNEL LAUNCH (default: %u)]\n",
nruntime,
nspin,
nthreads);
nthreads,
nspin);
exit(EXIT_SUCCESS);
}
}
if(argc > 1) nruntime = std::stod(argv[1]);
if(argc > 2) nspin = std::stoll(argv[2]);
if(argc > 3) nthreads = std::stoll(argv[3]);
if(argc > 2) nthreads = std::stoll(argv[2]);
if(argc > 3) nspin = std::stoll(argv[3]);
printf("[reproducible-runtime] Kernel runtime per thread: %.3f msec\n", nruntime);
printf("[reproducible-runtime] Spin time per kernel: %u cycles\n", nspin);
@@ -115,11 +115,11 @@ main(int argc, char** argv)
__global__ void
reproducible_runtime(uint32_t nspin_v)
{
for(uint32_t i = 0; i < nspin_v / 2048; i++)
asm volatile("s_sleep 32"); // ~2048 cycles -> ~1us
uint32_t remainder = nspin_v % 2048;
for(uint32_t i = 0; i < remainder / 64; i++)
for(uint32_t i = 0; i < nspin_v / 64; i++)
asm volatile("s_sleep 1");
if(nspin_v > 64)
for(uint32_t i = 0; i < nspin_v % 64; i++)
asm volatile("s_sleep 1");
}
void
@@ -130,10 +130,11 @@ run(int tid, int devid)
constexpr int min_avail_simd = 128;
dim3 grid(min_avail_simd);
dim3 block(32);
double time = 0.0;
hipStream_t stream = {};
hipEvent_t start = {};
hipEvent_t stop = {};
double time = 0.0;
hipStream_t stream = {};
hipEvent_t start = {};
hipEvent_t stop = {};
uint64_t nlaunch = 0;
HIP_API_CALL(hipSetDevice(devid));
HIP_API_CALL(hipStreamCreate(&stream));
@@ -152,6 +153,7 @@ run(int tid, int devid)
float elapsed = 0.0f;
HIP_API_CALL(hipEventElapsedTime(&elapsed, start, stop));
time += static_cast<double>(elapsed);
++nlaunch;
} while(time < nruntime);
HIP_API_CALL(hipStreamSynchronize(stream));
@@ -162,7 +164,7 @@ run(int tid, int devid)
auto _msg = std::stringstream{};
_msg << '[' << getpid() << "][" << tid << "] Runtime of reproducible-runtime is "
<< std::setprecision(2) << std::fixed << time << " ms (" << std::setprecision(3)
<< (time / 1000.0f) << " sec)\n";
<< (time / 1000.0f) << " sec). Kernels dispatched: " << nlaunch << "\n";
auto_lock_t _lk{print_lock};
std::cout << _msg.str() << std::flush;
}
@@ -171,6 +173,15 @@ run(int tid, int devid)
HIP_API_CALL(hipStreamDestroy(stream));
roctxRangeStop(roctx_range_id);
constexpr auto scale = 1.1;
if(time > scale * nruntime)
{
auto _msg = std::stringstream{};
_msg << "total kernel runtime exceeded (" << scale << " * " << nruntime << " = "
<< (scale * nruntime) << ") :: " << time << " ms";
throw std::runtime_error{_msg.str()};
}
}
namespace