diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp index e6463cc78a..72415fa486 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp @@ -48,20 +48,27 @@ namespace tool { namespace { -const auto launch_time = new std::time_t{std::time(nullptr)}; -const auto env_regexes = +std::string* +get_local_datetime(const std::string& dt_format); + +const auto* launch_datetime = get_local_datetime(get_env("ROCP_TIME_FORMAT", "%F_%H.%M")); +const auto env_regexes = new std::array{std::regex{"(.*)%(env|ENV)\\{([A-Z0-9_]+)\\}%(.*)"}, std::regex{"(.*)\\$(env|ENV)\\{([A-Z0-9_]+)\\}(.*)"}}; -std::string -get_local_datetime(const char* dt_format, std::time_t* dt_curr) +std::string* +get_local_datetime(const std::string& dt_format) { - char mbstr[512]; - if(!dt_curr) dt_curr = launch_time; + constexpr auto strsize = 512; + auto dt_curr = std::time_t{std::time(nullptr)}; - if(std::strftime(mbstr, sizeof(mbstr), dt_format, std::localtime(dt_curr)) != 0) - return std::string{mbstr}; - return std::string{}; + char mbstr[strsize]; + memset(mbstr, '\0', sizeof(mbstr) * sizeof(char)); + + if(std::strftime(mbstr, sizeof(mbstr) - 1, dt_format.c_str(), std::localtime(&dt_curr)) != 0) + return new std::string{mbstr}; + + return nullptr; } inline bool @@ -270,9 +277,6 @@ output_keys(std::string _tag) } } - auto* _launch_time = launch_time; - auto _time_format = get_env("ROCP_TIME_FORMAT", "%F_%H.%M"); - auto _mpi_size = get_mpi_size(); auto _mpi_rank = get_mpi_rank(); @@ -286,7 +290,6 @@ output_keys(std::string _tag) auto _pwd_string = get_env("PWD", "."); auto _slurm_job_id = get_env("SLURM_JOB_ID", "0"); auto _slurm_proc_id = get_env("SLURM_PROCID", _dmp_rank); - auto _launch_string = get_local_datetime(_time_format.c_str(), _launch_time); auto _uniq_id = _proc_id; if(get_env("SLURM_PROCID", -1) >= 0) @@ -318,6 +321,8 @@ output_keys(std::string _tag) } } + auto _launch_time = (launch_datetime) ? *launch_datetime : std::string{".UNKNOWN_LAUNCH_TIME."}; + for(auto&& itr : std::initializer_list{ {"%pid%", _proc_id, "Process identifier"}, {"%ppid%", _parent_id, "Parent process identifier"}, @@ -328,7 +333,7 @@ output_keys(std::string _tag) {"%rank%", _slurm_proc_id, "MPI/UPC++ rank"}, {"%size%", _dmp_size, "MPI/UPC++ size"}, {"%nid%", _uniq_id, "%rank% if possible, otherwise %pid%"}, - {"%launch_time%", _launch_string, "Data and/or time of run according to time format"}, + {"%launch_time%", _launch_time, "Data and/or time of run according to time format"}, }) { _options.emplace_back(itr); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.cpp index 460765928a..6c876f35da 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/async_copy.cpp @@ -175,6 +175,7 @@ struct async_copy_data rocprofiler_agent_id_t src_agent = null_rocp_agent_id; rocprofiler_memory_copy_operation_t direction = ROCPROFILER_MEMORY_COPY_NONE; uint64_t bytes_copied = 0; + uint64_t start_ts = 0; context::correlation_id* correlation_id = nullptr; tracing::tracing_data tracing_data = {}; @@ -331,6 +332,22 @@ convert_hsa_handle(Up _hsa_object) return reinterpret_cast(_hsa_object.handle); } +hsa_amd_profiling_async_copy_time_t& +operator+=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs) +{ + lhs.start += rhs; + lhs.end += rhs; + return lhs; +} + +hsa_amd_profiling_async_copy_time_t& +operator*=(hsa_amd_profiling_async_copy_time_t& lhs, uint64_t rhs) +{ + lhs.start *= rhs; + lhs.end *= rhs; + return lhs; +} + bool async_copy_handler(hsa_signal_value_t signal_value, void* arg) { @@ -351,14 +368,23 @@ async_copy_handler(hsa_signal_value_t signal_value, void* arg) return (nanosec / sysclock_hz); }(); + auto ts = common::timestamp_ns(); auto* _data = static_cast(arg); auto copy_time = hsa_amd_profiling_async_copy_time_t{}; auto copy_time_status = get_amd_ext_table()->hsa_amd_profiling_get_async_copy_time_fn( _data->rocp_signal, ©_time); // normalize - copy_time.start *= sysclock_period; - copy_time.end *= sysclock_period; + copy_time *= sysclock_period; + + // below is a hack for clock skew issues: + // the timestamp of the function call triggering the copy will always be before when the copy + // started + if(copy_time.start < _data->start_ts) copy_time += (_data->start_ts - copy_time.start); + + // below is a hack for clock skew issues: + // the timestamp of this handler for the copy will always be after when the copy ended + if(copy_time.end < ts) copy_time += (ts - copy_time.end); // if we encounter this in CI, it will cause test to fail ROCP_CI_LOG_IF(ERROR, copy_time_status == HSA_STATUS_SUCCESS && copy_time.end < copy_time.start) @@ -650,12 +676,13 @@ async_copy_impl(Args... args) // if we constructed a correlation id, this decrements the reference count after the underlying // function returns - auto _corr_id_dtor = common::scope_destructor{[_corr_id_pop]() { + auto _corr_id_dtor = common::scope_destructor{[_corr_id_pop, _data]() { if(_corr_id_pop) { context::pop_latest_correlation_id(_corr_id_pop); _corr_id_pop->sub_ref_count(); } + _data->start_ts = common::timestamp_ns(); }}; auto thr_id = _data->correlation_id->thread_idx; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/code_object.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/code_object.cpp index 75cac1e2a3..990491aecb 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/code_object.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/code_object.cpp @@ -505,25 +505,27 @@ shutdown(hsa_executable_t executable); bool is_shutdown = false; -auto& +auto* get_executables() { - static auto _v = common::Synchronized{}; + static auto*& _v = common::static_object>::construct(); return _v; } -auto& +auto* get_code_objects() { - static auto _v = common::Synchronized{}; + static auto*& _v = + common::static_object>::construct(); static auto _dtor = common::scope_destructor{[]() { code_object_shutdown(); }}; return _v; } -auto& +auto* get_kernel_object_map() { - static auto _v = common::Synchronized{}; + static auto*& _v = + common::static_object>::construct(); return _v; } @@ -616,12 +618,13 @@ executable_iterate_agent_symbols_load_callback(hsa_executable_t executabl // generate a unique kernel symbol id data.kernel_id = ++get_kernel_symbol_id(); - get_kernel_object_map().wlock( - [](kernel_object_map_t& object_map, uint64_t _kern_obj, uint64_t _kern_id) { - object_map[_kern_obj] = _kern_id; - }, - data.kernel_object, - data.kernel_id); + CHECK_NOTNULL(get_kernel_object_map()) + ->wlock( + [](kernel_object_map_t& object_map, uint64_t _kern_obj, uint64_t _kern_id) { + object_map[_kern_obj] = _kern_id; + }, + data.kernel_object, + data.kernel_id); code_obj_v->symbols.emplace_back(std::make_unique(std::move(symbol_v))); @@ -809,11 +812,11 @@ code_object_unload_callback(hsa_executable_t executable, CHECK_NOTNULL(code_obj_arr); - // auto _size = get_code_objects().rlock([](const auto& data) { return data.size(); }); - // ROCP_INFO << "[inp] executable=" << executable.handle + // auto _size = CHECK_NOTNULL(get_code_objects())->rlock([](const auto& data) { return + // data.size(); }); ROCP_INFO << "[inp] executable=" << executable.handle // << ", code_object=" << loaded_code_object.handle << " vs. " << _size; - get_code_objects().rlock([&](const code_object_array_t& arr) { + CHECK_NOTNULL(get_code_objects())->rlock([&](const code_object_array_t& arr) { for(const auto& itr : arr) { // ROCP_INFO << "[cmp] executable=" << itr->hsa_executable.handle @@ -856,11 +859,12 @@ executable_freeze(hsa_executable_t executable, const char* options) ROCP_INFO << "running " << __FUNCTION__ << " (executable=" << executable.handle << ")..."; - get_executables().wlock( - [executable](executable_array_t& data) { data.emplace_back(executable); }); + CHECK_NOTNULL(get_executables())->wlock([executable](executable_array_t& data) { + data.emplace_back(executable); + }); - auto& code_obj_vec = get_code_objects(); - code_obj_vec.wlock([executable](code_object_array_t& _vec) { + auto* code_obj_vec = get_code_objects(); + CHECK_NOTNULL(code_obj_vec)->wlock([executable](code_object_array_t& _vec) { hsa::get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects( executable, code_object_load_callback, &_vec); }); @@ -881,7 +885,7 @@ executable_freeze(hsa_executable_t executable, const char* options) if(!ctxs.empty()) { - code_obj_vec.rlock([](const code_object_array_t& data) { + code_obj_vec->rlock([](const code_object_array_t& data) { auto tidx = common::get_tid(); // set the contexts for each code object for(const auto& ditr : data) @@ -960,34 +964,43 @@ executable_destroy(hsa_executable_t executable) auto _unloaded = shutdown(executable); - get_kernel_object_map().wlock([_unloaded](kernel_object_map_t& data) { - for(const auto& uitr : _unloaded) - { - for(const auto& sitr : uitr.symbols) + if(get_kernel_object_map()) + { + CHECK_NOTNULL(get_kernel_object_map())->wlock([_unloaded](kernel_object_map_t& data) { + for(const auto& uitr : _unloaded) { - data.erase(sitr->rocp_data.kernel_id); + for(const auto& sitr : uitr.symbols) + { + data.erase(sitr->rocp_data.kernel_id); + } } - } - }); + }); + } - get_code_objects().wlock([executable](code_object_array_t& data) { - for(auto& itr : data) - { - if(itr->hsa_executable.handle == executable.handle) itr.reset(); - } - data.erase( - std::remove_if(data.begin(), data.end(), [](auto& itr) { return (itr == nullptr); }), - data.end()); - }); + if(get_code_objects()) + { + CHECK_NOTNULL(get_code_objects())->wlock([executable](code_object_array_t& data) { + for(auto& itr : data) + { + if(itr->hsa_executable.handle == executable.handle) itr.reset(); + } + data.erase(std::remove_if( + data.begin(), data.end(), [](auto& itr) { return (itr == nullptr); }), + data.end()); + }); + } - get_executables().wlock([executable](executable_array_t& data) { - data.erase(std::remove_if(data.begin(), - data.end(), - [executable](hsa_executable_t itr) { - return (itr.handle == executable.handle); - }), - data.end()); - }); + if(get_executables()) + { + CHECK_NOTNULL(get_executables())->wlock([executable](executable_array_t& data) { + data.erase(std::remove_if(data.begin(), + data.end(), + [executable](hsa_executable_t itr) { + return (itr.handle == executable.handle); + }), + data.end()); + }); + } return CHECK_NOTNULL(get_destroy_function())(executable); } @@ -1100,27 +1113,28 @@ code_object_init(HsaApiTable* table) uint64_t get_kernel_id(uint64_t kernel_object) { - return get_kernel_object_map().rlock( - [](const kernel_object_map_t& object_map, uint64_t _kern_obj) -> uint64_t { - auto itr = object_map.find(_kern_obj); - return (itr == object_map.end()) ? 0 : itr->second; - }, - kernel_object); + return CHECK_NOTNULL(get_kernel_object_map()) + ->rlock( + [](const kernel_object_map_t& object_map, uint64_t _kern_obj) -> uint64_t { + auto itr = object_map.find(_kern_obj); + return (itr == object_map.end()) ? 0 : itr->second; + }, + kernel_object); } void code_object_shutdown() { - if(is_shutdown) return; + if(is_shutdown || !get_executables() || !get_code_objects()) return; - get_executables().rlock([](const executable_array_t& edata) { + CHECK_NOTNULL(get_executables())->rlock([](const executable_array_t& edata) { auto tmp = edata; std::reverse(tmp.begin(), tmp.end()); for(auto itr : tmp) shutdown(itr); }); - get_code_objects().wlock([](code_object_array_t& data) { data.clear(); }); + CHECK_NOTNULL(get_code_objects())->wlock([](code_object_array_t& data) { data.clear(); }); is_shutdown = true; } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp index a562e863c3..004abc1f4c 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp @@ -416,6 +416,7 @@ WriteInterceptor(const void* packets, .inst_pkt = std::move(inst_pkt), .interrupt_signal = interrupt_signal, .tid = thr_id, + .enqueue_ts = common::timestamp_ns(), .user_data = user_data, .correlation_id = corr_id, .kernel_pkt = kernel_pkt, diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_info_session.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_info_session.hpp index cb3ab287d3..09878d3a58 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_info_session.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_info_session.hpp @@ -56,6 +56,7 @@ struct queue_info_session inst_pkt_t inst_pkt = {}; hsa_signal_t interrupt_signal = {}; rocprofiler_thread_id_t tid = common::get_tid(); + rocprofiler_timestamp_t enqueue_ts = 0; rocprofiler_user_data_t user_data = {.value = 0}; context::correlation_id* correlation_id = nullptr; rocprofiler_packet kernel_pkt = {}; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kernel_dispatch/tracing.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kernel_dispatch/tracing.cpp index 8824014e21..07e8db562f 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kernel_dispatch/tracing.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/kernel_dispatch/tracing.cpp @@ -43,6 +43,15 @@ # define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(NON_CI_LEVEL) #endif +#define ROCP_HSA_TABLE_CALL(SEVERITY, EXPR) \ + auto ROCPROFILER_VARIABLE(rocp_hsa_table_call_, __LINE__) = (EXPR); \ + LOG_IF(SEVERITY, ROCPROFILER_VARIABLE(rocp_hsa_table_call_, __LINE__) != HSA_STATUS_SUCCESS) \ + << #EXPR << " returned non-zero status code " \ + << ROCPROFILER_VARIABLE(rocp_hsa_table_call_, __LINE__) << " :: " \ + << ::rocprofiler::hsa::get_hsa_status_string( \ + ROCPROFILER_VARIABLE(rocp_hsa_table_call_, __LINE__)) \ + << " " + namespace rocprofiler { namespace kernel_dispatch @@ -51,11 +60,38 @@ namespace { using queue_info_session_t = hsa::queue_info_session; using kernel_dispatch_record_t = rocprofiler_buffer_tracing_kernel_dispatch_record_t; + +hsa_amd_profiling_dispatch_time_t& +operator+=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs) +{ + lhs.start += rhs; + lhs.end += rhs; + return lhs; +} + +hsa_amd_profiling_dispatch_time_t& +operator*=(hsa_amd_profiling_dispatch_time_t& lhs, uint64_t rhs) +{ + lhs.start *= rhs; + lhs.end *= rhs; + return lhs; +} } // namespace void dispatch_complete(queue_info_session_t& session) { + auto ts = common::timestamp_ns(); + + static auto sysclock_period = []() -> uint64_t { + constexpr auto nanosec = 1000000000UL; + uint64_t sysclock_hz = 0; + ROCP_HSA_TABLE_CALL(ERROR, + hsa::get_core_table()->hsa_system_get_info_fn( + HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz)); + return (nanosec / sysclock_hz); + }(); + // get the contexts that were active when the signal was created auto& tracing_data_v = session.tracing_data; if(tracing_data_v.callback_contexts.empty() && tracing_data_v.buffered_contexts.empty()) return; @@ -80,6 +116,20 @@ dispatch_complete(queue_info_session_t& session) if(dispatch_time_status == HSA_STATUS_SUCCESS) { + // normalize + dispatch_time *= sysclock_period; + + // below is a hack for clock skew issues: + // the timestamp of the packet rewriter for the kernel packet will always be before when the + // kernel started + if(dispatch_time.start < session.enqueue_ts) + dispatch_time += (session.enqueue_ts - dispatch_time.start); + + // below is a hack for clock skew issues: + // the timestamp of this handler for the kernel dispatch will always be after when the + // kernel completed + if(dispatch_time.end < ts) dispatch_time += (ts - dispatch_time.end); + callback_record.start_timestamp = dispatch_time.start; callback_record.end_timestamp = dispatch_time.end; } diff --git a/projects/rocprofiler-sdk/tests/bin/reproducible-runtime/reproducible-runtime.cpp b/projects/rocprofiler-sdk/tests/bin/reproducible-runtime/reproducible-runtime.cpp index 82ef726fd0..ee8641fd86 100644 --- a/projects/rocprofiler-sdk/tests/bin/reproducible-runtime/reproducible-runtime.cpp +++ b/projects/rocprofiler-sdk/tests/bin/reproducible-runtime/reproducible-runtime.cpp @@ -58,7 +58,7 @@ namespace using auto_lock_t = std::unique_lock; auto print_lock = std::mutex{}; double nruntime = 500.0; // ms -uint32_t nspin = 1000000; +uint32_t nspin = 256 * 10000; size_t nthreads = 2; void @@ -81,17 +81,17 @@ main(int argc, char** argv) { fprintf(stderr, "usage: reproducible-runtime [KERNEL RUNTIME PER THREAD (default: %f msec)] " - "[SPIN CYCLES PER KERNEL LAUNCH (default: %u)] [NUM_THREADS (default: %zu)]\n", + "[NUM_THREADS (default: %zu)] [SPIN CYCLES PER KERNEL LAUNCH (default: %u)]\n", nruntime, - nspin, - nthreads); + nthreads, + nspin); exit(EXIT_SUCCESS); } } if(argc > 1) nruntime = std::stod(argv[1]); - if(argc > 2) nspin = std::stoll(argv[2]); - if(argc > 3) nthreads = std::stoll(argv[3]); + if(argc > 2) nthreads = std::stoll(argv[2]); + if(argc > 3) nspin = std::stoll(argv[3]); printf("[reproducible-runtime] Kernel runtime per thread: %.3f msec\n", nruntime); printf("[reproducible-runtime] Spin time per kernel: %u cycles\n", nspin); @@ -115,11 +115,11 @@ main(int argc, char** argv) __global__ void reproducible_runtime(uint32_t nspin_v) { - for(uint32_t i = 0; i < nspin_v / 2048; i++) - asm volatile("s_sleep 32"); // ~2048 cycles -> ~1us - uint32_t remainder = nspin_v % 2048; - for(uint32_t i = 0; i < remainder / 64; i++) + for(uint32_t i = 0; i < nspin_v / 64; i++) asm volatile("s_sleep 1"); + if(nspin_v > 64) + for(uint32_t i = 0; i < nspin_v % 64; i++) + asm volatile("s_sleep 1"); } void @@ -130,10 +130,11 @@ run(int tid, int devid) constexpr int min_avail_simd = 128; dim3 grid(min_avail_simd); dim3 block(32); - double time = 0.0; - hipStream_t stream = {}; - hipEvent_t start = {}; - hipEvent_t stop = {}; + double time = 0.0; + hipStream_t stream = {}; + hipEvent_t start = {}; + hipEvent_t stop = {}; + uint64_t nlaunch = 0; HIP_API_CALL(hipSetDevice(devid)); HIP_API_CALL(hipStreamCreate(&stream)); @@ -152,6 +153,7 @@ run(int tid, int devid) float elapsed = 0.0f; HIP_API_CALL(hipEventElapsedTime(&elapsed, start, stop)); time += static_cast(elapsed); + ++nlaunch; } while(time < nruntime); HIP_API_CALL(hipStreamSynchronize(stream)); @@ -162,7 +164,7 @@ run(int tid, int devid) auto _msg = std::stringstream{}; _msg << '[' << getpid() << "][" << tid << "] Runtime of reproducible-runtime is " << std::setprecision(2) << std::fixed << time << " ms (" << std::setprecision(3) - << (time / 1000.0f) << " sec)\n"; + << (time / 1000.0f) << " sec). Kernels dispatched: " << nlaunch << "\n"; auto_lock_t _lk{print_lock}; std::cout << _msg.str() << std::flush; } @@ -171,6 +173,15 @@ run(int tid, int devid) HIP_API_CALL(hipStreamDestroy(stream)); roctxRangeStop(roctx_range_id); + + constexpr auto scale = 1.1; + if(time > scale * nruntime) + { + auto _msg = std::stringstream{}; + _msg << "total kernel runtime exceeded (" << scale << " * " << nruntime << " = " + << (scale * nruntime) << ") :: " << time << " ms"; + throw std::runtime_error{_msg.str()}; + } } namespace