diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index fb28364924..999163fc5d 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -48,8 +48,8 @@ SOFTWARE. #include #include -#define ROCPROFILER_VERSION_MAJOR 2 -#define ROCPROFILER_VERSION_MINOR 1 +#define ROCPROFILER_VERSION_MAJOR 3 +#define ROCPROFILER_VERSION_MINOR 0 #ifdef __cplusplus extern "C" { diff --git a/src/core/context.h b/src/core/context.h index db3b76e39d..687272d3f4 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -29,6 +29,9 @@ SOFTWARE. #include #include +#include // usleep +#include +#include #include #include #include @@ -49,27 +52,7 @@ class Context; inline unsigned align_size(unsigned size, unsigned alignment) { return ((size + alignment - 1) & ~(alignment - 1)); } -#if 0 -// Block descriptor -struct block_des_t { - uint32_t id; - uint32_t index; -}; -// block_des_t less-then functor -struct lt_block_des { - bool operator()(const block_des_t& a1, const block_des_t& a2) const { - return (a1.id < a2.id) || ((a1.id == a2.id) && (a1.index < a2.index)); - } -}; - -// Block status -struct block_status_t { - uint32_t max_counters; - uint32_t counter_index; - uint32_t group_index; -}; -#endif // Metrics arguments template class MetricArgs : public xml::args_cache_t { public: @@ -97,6 +80,9 @@ template class MetricArgs : public xml::args_cache_t { // Profiling group class Group { public: + typedef uint32_t refs_t; + typedef std::atomic atomic_refs_t; + Group(const util::AgentInfo* agent_info, Context* context, const uint32_t& index) : pmc_profile_(agent_info), sqtt_profile_(agent_info), @@ -146,10 +132,10 @@ class Group { Context* GetContext() { return context_; } uint32_t GetIndex() const { return index_; } - void ResetRefs() { refs_ = n_profiles_; } - uint32_t DecrRefs() { - return (refs_ > 0) ? --refs_ : 0; - } + atomic_refs_t* AtomicRefsCount() { return reinterpret_cast(&refs_); } + void ResetRefsCount() { AtomicRefsCount()->store(n_profiles_, std::memory_order_release); } + void IncrRefsCount() { AtomicRefsCount()->fetch_add(1, std::memory_order_acq_rel); } + uint32_t FetchDecrRefsCount() { return AtomicRefsCount()->fetch_sub(1, std::memory_order_acq_rel); } private: PmcProfile pmc_profile_; @@ -159,7 +145,7 @@ class Group { pkt_vector_t stop_vector_; pkt_vector_t read_vector_; uint32_t n_profiles_; - uint32_t refs_; + refs_t refs_; Context* const context_; const uint32_t index_; }; @@ -167,7 +153,6 @@ class Group { // Profiling context class Context { public: - typedef std::mutex mutex_t; typedef std::map info_map_t; Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, @@ -180,6 +165,8 @@ class Context { handler_(handler), handler_arg_(handler_arg) { + if (info_count == 0) return; + metrics_ = MetricsDict::Create(agent_info); if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); if (Initialize(info, info_count) == false) { @@ -192,7 +179,7 @@ class Context { if (handler != NULL) { for (unsigned group_index = 0; group_index < set_.size(); ++group_index) { - set_[group_index].ResetRefs(); + set_[group_index].ResetRefsCount(); const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { // Handler for stop packet completion @@ -307,11 +294,11 @@ class Context { } } - void Reset(const uint32_t& group_index) { set_[group_index].ResetRefs(); } + void Reset(const uint32_t& group_index) { set_[group_index].ResetRefsCount(); } uint32_t GetGroupCount() const { return set_.size(); } - rocprofiler_group_t GetGroupInfo(Group* g) { + inline rocprofiler_group_t GetGroupInfo(Group* g) { rocprofiler::info_vector_t& info_vector = g->GetInfoVector(); rocprofiler_group_t group = {}; group.index = g->GetIndex(); @@ -320,8 +307,14 @@ class Context { group.feature_count = info_vector.size(); return group; } - rocprofiler_group_t GetGroupInfo(const uint32_t& index) { - return GetGroupInfo(&set_[index]); + inline rocprofiler_group_t GetGroupInfo(const uint32_t& index) { + rocprofiler_group_t group = {}; + if (set_.empty()) { + group.context = reinterpret_cast(this); + } else { + group = GetGroupInfo(&set_[index]); + } + return group; } const pkt_vector_t& StartPackets(const uint32_t& group_index) const { @@ -368,14 +361,7 @@ class Context { const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { // Wait for stop packet to complete - const uint64_t timeout = timeout_; - bool complete = false; - while (!complete) { - const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, timeout, - HSA_WAIT_STATE_BLOCKED); - complete = (signal_value < 1); - if (!complete) WARN_LOGGING("timeout"); - } + hsa_rsrc_->SignalWaitRestore(tuple.completion_signal, 1); for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT; callback_data_t callback_data{tuple.profile, tuple.info_vector, tuple.info_vector->size(), NULL}; const hsa_status_t status = @@ -411,8 +397,19 @@ class Context { } } - static void SetTimeout(uint64_t timeout) { timeout_ = timeout; } - static uint64_t GetTimeout() { return timeout_; } + static bool Handler(hsa_signal_value_t value, void* arg) { + Group* group = reinterpret_cast(arg); + Context* context = group->GetContext(); + auto r = group->FetchDecrRefsCount(); + if (r == 1) { + const rocprofiler_group_t group_info = context->GetGroupInfo(group); + context->handler_(group_info, context->handler_arg_); + } + return false; + } + + Group* GetGroup(const uint32_t& index) { return &set_[index]; } + rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } private: // Getting profling packets @@ -425,18 +422,6 @@ class Context { return vec; } - static bool Handler(hsa_signal_value_t value, void* arg) { - Group* group = reinterpret_cast(arg); - Context* context = group->GetContext(); - context->mutex_.lock(); - uint32_t r = group->DecrRefs(); - context->mutex_.unlock(); - if (r == 0) { - return context->handler_(context->GetGroupInfo(group), context->handler_arg_); - } - return false; - } - static hsa_status_t DataCallback(hsa_ven_amd_aqlprofile_info_type_t ainfo_type, hsa_ven_amd_aqlprofile_info_data_t* ainfo_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; @@ -526,9 +511,6 @@ class Context { return info; } - // Profiling data waiting timeout - static uint64_t timeout_; - // GPU handel const hsa_agent_t agent_; const util::AgentInfo* agent_info_; @@ -551,7 +533,6 @@ class Context { // Context completion handler rocprofiler_handler_t handler_; void* handler_arg_; - mutex_t mutex_; }; } // namespace rocprofiler diff --git a/src/core/intercept_queue.cpp b/src/core/intercept_queue.cpp index e67c34dcb7..b85ec1cce5 100644 --- a/src/core/intercept_queue.cpp +++ b/src/core/intercept_queue.cpp @@ -36,7 +36,6 @@ InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL; void* InterceptQueue::callback_data_ = NULL; InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL; const char* InterceptQueue::kernel_none_ = ""; -uint64_t InterceptQueue::timeout_ = UINT64_MAX; Tracker* InterceptQueue::tracker_ = NULL; bool InterceptQueue::tracker_on_ = false; bool InterceptQueue::in_constr_call_ = false; diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index 733f0fc33a..90fd0cb576 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -50,6 +50,7 @@ class InterceptQueue { typedef std::recursive_mutex mutex_t; typedef std::map obj_map_t; typedef hsa_status_t (*queue_callback_t)(hsa_queue_t*, void* data); + typedef void (*queue_event_callback_t)(hsa_status_t status, hsa_queue_t *queue, void *arg); static void HsaIntercept(HsaApiTable* table); @@ -65,12 +66,12 @@ class InterceptQueue { if (in_constr_call_) EXC_ABORT(status, "recursive InterceptQueueCreate()"); in_constr_call_ = true; - ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, callback, data, private_segment_size, + ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, queue_event_callback, data, private_segment_size, group_segment_size, queue, &status); if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "ProxyQueue::Create()"); if (tracker_on || tracker_on_) { - if (tracker_ == NULL) tracker_ = new Tracker(timeout_); + if (tracker_ == NULL) tracker_ = new Tracker; status = hsa_amd_profiling_set_profiler_enabled(*queue, true); if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_set_profiler_enabled()"); } @@ -79,6 +80,7 @@ class InterceptQueue { InterceptQueue* obj = new InterceptQueue(agent, *queue, proxy); (*obj_map_)[(uint64_t)(*queue)] = obj; status = proxy->SetInterceptCB(OnSubmitCB, obj); + obj->queue_event_callback_ = callback; in_constr_call_ = false; return status; @@ -132,11 +134,10 @@ class InterceptQueue { reinterpret_cast(packet); // Adding kernel timing tracker - const rocprofiler_dispatch_record_t* record = NULL; + Tracker::entry_t* tracker_entry = NULL; if (tracker_ != NULL) { - const auto* entry = tracker_->Add(obj->agent_info_->dev_id, dispatch_packet->completion_signal); - const_cast(dispatch_packet)->completion_signal = entry->signal; - record = entry->record; + tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal); + const_cast(dispatch_packet)->completion_signal = tracker_entry->signal; } // Prepareing dispatch callback data @@ -147,27 +148,44 @@ class InterceptQueue { user_que_idx, dispatch_packet, kernel_name, - record}; + (tracker_entry) ? tracker_entry->record : NULL}; // Calling dispatch callback rocprofiler_group_t group = {}; hsa_status_t status = dispatch_callback_(&data, callback_data_, &group); free(const_cast(kernel_name)); // Injecting profiling start/stop packets - if ((status == HSA_STATUS_SUCCESS) && (group.context != NULL)) { + if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { + if (tracker_entry != NULL) tracker_->Delete(tracker_entry); + } else { Context* context = reinterpret_cast(group.context); - const pkt_vector_t& start_vector = context->StartPackets(group.index); - const pkt_vector_t& stop_vector = context->StopPackets(group.index); - pkt_vector_t packets = start_vector; - packets.insert(packets.end(), *packet); - packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); - if (writer != NULL) { - writer(&packets[0], packets.size()); + if (group.feature_count != 0) { + if (tracker_entry != NULL) { + Group* context_group = context->GetGroup(group.index); + context_group->IncrRefsCount(); + tracker_->Enable(tracker_entry, Context::Handler, reinterpret_cast(context_group)); + } + + const pkt_vector_t& start_vector = context->StartPackets(group.index); + const pkt_vector_t& stop_vector = context->StopPackets(group.index); + pkt_vector_t packets = start_vector; + packets.insert(packets.end(), *packet); + packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + if (writer != NULL) { + writer(&packets[0], packets.size()); + } else { + proxy->Submit(&packets[0], packets.size()); + } + to_submit = false; } else { - proxy->Submit(&packets[0], packets.size()); + if (tracker_entry != NULL) { + void* context_handler_arg = NULL; + rocprofiler_handler_t context_handler_fun = context->GetHandler(&context_handler_arg); + tracker_->Enable(tracker_entry, context_handler_fun, context_handler_arg); + rocprofiler_close(context); + } } - to_submit = false; } } @@ -189,11 +207,16 @@ class InterceptQueue { destroy_callback_ = destroy_callback; } - static void SetTimeout(uint64_t timeout) { timeout_ = timeout; } static void TrackerOn(bool on) { tracker_on_ = on; } static bool IsTrackerOn() { return tracker_on_; } private: + static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) { + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "queue error handling is not supported"); + InterceptQueue* obj = GetObj(queue); + if (obj->queue_event_callback_) obj->queue_event_callback_(status, obj->queue_, arg); + } + static hsa_packet_type_t GetHeaderType(const packet_t* packet) { const packet_word_t* header = reinterpret_cast(packet); return static_cast((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask); @@ -258,8 +281,12 @@ class InterceptQueue { proxy_(proxy) { agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + queue_event_callback_ = NULL; + } + + ~InterceptQueue() { + ProxyQueue::Destroy(proxy_); } - ~InterceptQueue() { ProxyQueue::Destroy(proxy_); } static mutex_t mutex_; static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1; @@ -268,7 +295,6 @@ class InterceptQueue { static void* callback_data_; static obj_map_t* obj_map_; static const char* kernel_none_; - static uint64_t timeout_; static Tracker* tracker_; static bool tracker_on_; static bool in_constr_call_; @@ -276,6 +302,7 @@ class InterceptQueue { hsa_queue_t* const queue_; ProxyQueue* const proxy_; const util::AgentInfo* agent_info_; + queue_event_callback_t queue_event_callback_; }; } // namespace rocprofiler diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 183d4f3b2a..ec7bd06868 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -154,7 +154,7 @@ bool LoadTool() { settings.intercept_mode = (intercept_mode) ? 1 : 0; settings.sqtt_size = SqttProfile::GetSize(); settings.sqtt_local = SqttProfile::IsLocal() ? 1: 0; - settings.timeout = Context::GetTimeout(); + settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 1 : 0; if (handler) handler(); @@ -163,8 +163,7 @@ bool LoadTool() { intercept_mode = (settings.intercept_mode != 0); SqttProfile::SetSize(settings.sqtt_size); SqttProfile::SetLocal(settings.sqtt_local != 0); - Context::SetTimeout(settings.timeout); - InterceptQueue::SetTimeout(settings.timeout); + util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); } @@ -190,8 +189,8 @@ CONSTRUCTOR_API void constructor() { } DESTRUCTOR_API void destructor() { - util::HsaRsrcFactory::Destroy(); rocprofiler::MetricsDict::Destroy(); + util::HsaRsrcFactory::Destroy(); util::Logger::Destroy(); } @@ -213,10 +212,12 @@ hsa_status_t GetExcStatus(const std::exception& e) { } rocprofiler_properties_t rocprofiler_properties; -uint64_t Context::timeout_ = UINT64_MAX; +#if DEFERRED_HANDLERS +Context::deferred_handler_list_t* Context::deferred_handler_list_ = NULL; +Context::recur_mutex_t Context::mutex_static_; +#endif uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M bool SqttProfile::output_buffer_local_ = true; -Tracker::mutex_t Tracker::mutex_; util::Logger::mutex_t util::Logger::mutex_; util::Logger* util::Logger::instance_ = NULL; } diff --git a/src/core/tracker.h b/src/core/tracker.h index ee22e3d592..5fafa19214 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -30,9 +30,11 @@ SOFTWARE. #include #include +#include #include #include +#include "util/hsa_rsrc_factory.h" #include "inc/rocprofiler.h" #include "util/exception.h" #include "util/logger.h" @@ -41,12 +43,13 @@ namespace rocprofiler { class Tracker { public: - typedef uint64_t timestamp_t; - typedef long double freq_t; typedef std::mutex mutex_t; + typedef util::HsaRsrcFactory::timestamp_t timestamp_t; typedef rocprofiler_dispatch_record_t record_t; struct entry_t; typedef std::list sig_list_t; + typedef sig_list_t::iterator sig_list_it_t; + struct entry_t { Tracker* tracker; sig_list_t::iterator it; @@ -54,75 +57,59 @@ class Tracker { hsa_signal_t orig; hsa_signal_t signal; record_t* record; + std::atomic handler; + void* arg; + bool context_active; }; - Tracker(uint64_t timeout = UINT64_MAX) : timeout_(timeout), outstanding(0) { - timestamp_t timestamp_hz = 0; - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, ×tamp_hz); - if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)"); - timestamp_factor_ = (freq_t)1000000000 / (freq_t)timestamp_hz; - } + Tracker() : + outstanding_(0), + hsa_rsrc_(&(util::HsaRsrcFactory::Instance())) + {} + ~Tracker() { - mutex_.lock(); - for (entry_t* entry : sig_list_) { - assert(entry != NULL); - while (1) { - const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire( - entry->signal, - HSA_SIGNAL_CONDITION_LT, - 1, - timeout_, - HSA_WAIT_STATE_BLOCKED); - if (signal_value < 1) break; - else WARN_LOGGING("tracker timeout"); - } - Del(entry); + auto it = sig_list_.begin(); + auto end = sig_list_.end(); + while (it != end) { + auto cur = it++; + hsa_rsrc_->SignalWait((*cur)->signal); + Erase(cur); } - mutex_.unlock(); } // Add tracker entry - entry_t* Add(const hsa_agent_t& agent, const hsa_signal_t& orig) { + entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig) { hsa_status_t status = HSA_STATUS_ERROR; + + // Creating a new tracker entry entry_t* entry = new entry_t{}; assert(entry); entry->tracker = this; + entry->agent = agent; + entry->orig = orig; + + // Creating a record with the dispatch timestamps + record_t* record = new record_t{}; + assert(record); + record->dispatch = hsa_rsrc_->TimestampNs(); + entry->record = record; + + // Creating a proxy signal + status = hsa_signal_create(1, 0, NULL, &(entry->signal)); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); + status = hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + + // Adding antry to the list mutex_.lock(); entry->it = sig_list_.insert(sig_list_.begin(), entry); mutex_.unlock(); - entry->agent = agent; - entry->orig = orig; - status = hsa_signal_create(1, 0, NULL, &(entry->signal)); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); - - record_t* record = new record_t{}; - assert(record); - entry->record = record; - - timestamp_t dispatch_timestamp = 0; - status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &dispatch_timestamp); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)"); - - record->dispatch = timestamp2ns(dispatch_timestamp); - - status = hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); - - if (trace_on_) { - mutex_.lock(); - entry->tracker->outstanding++; - fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding); - fflush(stdout); - mutex_.unlock(); - } - return entry; } - private: // Delete tracker entry - void Del(entry_t* entry) { + void Delete(entry_t* entry) { hsa_signal_destroy(entry->signal); mutex_.lock(); sig_list_.erase(entry->it); @@ -130,31 +117,53 @@ class Tracker { delete entry; } - // Handler for packet completion - static bool Handler(hsa_signal_value_t value, void* arg) { - entry_t* entry = reinterpret_cast(arg); + // Enable tracker entry + void Enable(entry_t* entry, void* handler, void* arg) { + // Set entry handler and release the entry + entry->arg = arg; + entry->handler.store(handler, std::memory_order_release); + + // Debug trace + if (trace_on_) { + auto outstanding = outstanding_.fetch_add(1); + fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); + fflush(stdout); + } + } + + void Enable(entry_t* entry, hsa_amd_signal_handler handler, void* arg) { + entry->context_active = true; + Enable(entry, reinterpret_cast(handler), arg); + } + void Enable(entry_t* entry, rocprofiler_handler_t handler, void* arg) { + Enable(entry, reinterpret_cast(handler), arg); + } + + private: + // Delete an entry by iterator + void Erase(const sig_list_it_t& it) { Delete(*it); } + + // Entry completion + void Complete(entry_t* entry) { record_t* record = entry->record; + // Debug trace if (trace_on_) { - mutex_.lock(); - entry->tracker->outstanding--; - fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding); + auto outstanding = outstanding_.fetch_sub(1); + fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); fflush(stdout); - mutex_.unlock(); } - timestamp_t complete_timestamp = 0; + // Query begin/end and complete timestamps hsa_amd_profiling_dispatch_time_t dispatch_time{}; - - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &complete_timestamp); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)"); - status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); + hsa_status_t status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); - record->complete = entry->tracker->timestamp2ns(complete_timestamp); - record->begin = entry->tracker->timestamp2ns(dispatch_time.start); - record->end = entry->tracker->timestamp2ns(dispatch_time.end); + record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start); + record->end = hsa_rsrc_->SysclockToNs(dispatch_time.end); + record->complete = hsa_rsrc_->TimestampNs(); + // Original intercepted signal completion hsa_signal_t orig = entry->orig; if (orig.handle) { amd_signal_t* orig_signal_ptr = reinterpret_cast(orig.handle); @@ -165,26 +174,41 @@ class Tracker { const hsa_signal_value_t value = hsa_signal_load_relaxed(orig); hsa_signal_store_screlease(orig, value - 1); } - entry->tracker->Del(entry); + } + + // Handler for packet completion + static bool Handler(hsa_signal_value_t, void* arg) { + // Acquire entry + entry_t* entry = reinterpret_cast(arg); + volatile std::atomic* ptr = &entry->handler; + while (ptr->load(std::memory_order_acquire) == NULL) sched_yield(); + + // Complete entry + entry->tracker->Complete(entry); + + // Call entry handler + void* handler = static_cast(entry->handler); + if (entry->context_active) { + reinterpret_cast(handler)(0, entry->arg); + } else { + rocprofiler_group_t group{}; + reinterpret_cast(handler)(group, entry->arg); + } + + // Delete tracker entry + entry->tracker->Delete(entry); return false; } - inline timestamp_t timestamp2ns(const timestamp_t& timestamp) const { - const freq_t timestamp_ns = (freq_t)timestamp * timestamp_factor_; - return (timestamp_t)timestamp_ns; - } - - // Timestamp frequency factor - freq_t timestamp_factor_; - // Timeout for wait on destruction - timestamp_t timeout_; // Tracked signals list sig_list_t sig_list_; // Inter-thread synchronization - static mutex_t mutex_; + mutex_t mutex_; // Outstanding dispatches - uint64_t outstanding; + std::atomic outstanding_; + // HSA resources factory + util::HsaRsrcFactory* hsa_rsrc_; // Enable tracing static const bool trace_on_ = false; }; diff --git a/src/core/types.h b/src/core/types.h index bb53f86fa6..17f7fdc2ba 100644 --- a/src/core/types.h +++ b/src/core/types.h @@ -34,6 +34,7 @@ typedef hsa_ven_amd_aqlprofile_parameter_t parameter_t; typedef hsa_ven_amd_aqlprofile_profile_t profile_t; typedef hsa_ext_amd_aql_pm4_packet_t packet_t; typedef uint32_t packet_word_t; +typedef uint64_t timestamp_t; } // namespace rocprofiler #endif // SRC_CORE_TYPES_H_ diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index 5dccc8a031..d773c301c1 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -44,6 +44,9 @@ SOFTWARE. #include #include +#include "util/exception.h" +#include "util/logger.h" + namespace rocprofiler { namespace util { @@ -110,11 +113,13 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { // Constructor of the class HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { hsa_status_t status; + // Initialize the Hsa Runtime if (initialize_hsa_) { status = hsa_init(); CHECK_STATUS("Error in hsa_init", status); } + // Discover the set of Gpu devices available on the platform status = hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); @@ -132,10 +137,19 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize loader_api_ = {0}; status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_); CHECK_STATUS("loader API table query failed", status); + + // Instantiate HSA timer + timer_ = new HsaTimer; + CHECK_STATUS("HSA timer allocation failed", + (timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + + // System timeout + timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); } // Destructor of the class HsaRsrcFactory::~HsaRsrcFactory() { + delete timer_; for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; if (initialize_hsa_) { @@ -403,22 +417,38 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s return ptr; } +// Wait signal +void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { + while (1) { + const hsa_signal_value_t signal_value = + hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); + if (signal_value == 0) { + break; + } else { + if (signal_value == 1) WARN_LOGGING("signal waiting..."); + else EXC_RAISING(HSA_STATUS_ERROR, "hsa_signal_wait_scacquire (" << signal_value << ")"); + } + } +} + +// Wait signal with signal value restore +void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { + SignalWait(signal); + hsa_signal_store_relaxed(const_cast(signal), signal_value); +} + // Copy data from GPU to host memory bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; if (!cpu_agents_.empty()) { hsa_signal_t s = {}; status = hsa_signal_create(1, 0, NULL, &s); - if (status == HSA_STATUS_SUCCESS) { - status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); - if (status == HSA_STATUS_SUCCESS) { - if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) { - status = HSA_STATUS_ERROR; - } - } - status = hsa_signal_destroy(s); - } + CHECK_STATUS("hsa_signal_create()", status); + status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + CHECK_STATUS("hsa_amd_memory_async_copy()", status); + SignalWait(s); + status = hsa_signal_destroy(s); + CHECK_STATUS("hsa_signal_destroy()", status); } return (status == HSA_STATUS_SUCCESS); } @@ -561,6 +591,7 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; +HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; } // namespace util } // namespace rocprofiler diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index 93005610d4..08aeb7d13c 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -118,9 +118,42 @@ struct AgentInfo { uint32_t shader_arrays_per_se; }; +// HSA timer class +// Provides current HSA timestampa and system-clock/ns conversion API +class HsaTimer { + public: + typedef uint64_t timestamp_t; + static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; + typedef long double freq_t; + + HsaTimer() { + timestamp_t sysclock_hz = 0; + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status); + sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; + } + + // Methids for system-clock/ns conversion + timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } + timestamp_t ns_to_sysclock(const timestamp_t& time) const { return timestamp_t((freq_t)time / sysclock_factor_); } + + // Return timestamp in 'ns' + timestamp_t timestamp_ns() const { + timestamp_t sysclock; + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status); + return sysclock_to_ns(sysclock); + } + + private: + // Timestamp frequency factor + freq_t sysclock_factor_; +}; + class HsaRsrcFactory { public: typedef std::recursive_mutex mutex_t; + typedef HsaTimer::timestamp_t timestamp_t; static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); @@ -206,6 +239,12 @@ class HsaRsrcFactory { // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); + // Wait signal + void SignalWait(const hsa_signal_t& signal) const; + + // Wait signal with signal value restore + void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; + // Copy data from GPU to host memory bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); @@ -237,6 +276,18 @@ class HsaRsrcFactory { // Return Loader API table const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; } + // Methods for system-clock/ns conversion and timestamp in 'ns' + timestamp_t SysclockToNs(const timestamp_t& sysclock) const { return timer_->sysclock_to_ns(sysclock); } + timestamp_t NsToSysclock(const timestamp_t& time) const { return timer_->ns_to_sysclock(time); } + timestamp_t TimestampNs() const { return timer_->timestamp_ns(); } + timestamp_t GetSysTimeout() const { return timeout_; } + static timestamp_t GetTimeoutNs() { return timeout_ns_; } + static void SetTimeoutNs(const timestamp_t& time) { + std::lock_guard lck(mutex_); + timeout_ns_ = time; + if (instance_ != NULL) instance_->timeout_ = instance_->timer_->ns_to_sysclock(time); + } + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); @@ -282,6 +333,14 @@ class HsaRsrcFactory { // Loader API table hsa_ven_amd_loader_1_00_pfn_t loader_api_; + + // System timeout, ns + static timestamp_t timeout_ns_; + // System timeout, sysclock + timestamp_t timeout_; + + // HSA timer + HsaTimer* timer_; }; } // namespace util diff --git a/src/util/logger.h b/src/util/logger.h index 47767d162c..0a5a29533d 100644 --- a/src/util/logger.h +++ b/src/util/logger.h @@ -191,33 +191,33 @@ class Logger { } // namespace rocprofiler #define ERR_LOGGING(stream) \ - { \ + do { \ rocprofiler::util::Logger::Instance() << rocprofiler::util::Logger::errm \ << "error: " << rocprofiler::util::Logger::begm \ << stream << rocprofiler::util::Logger::endl; \ - } + } while(0) #define INFO_LOGGING(stream) \ - { \ + do { \ rocprofiler::util::Logger::Instance() << "info: " << rocprofiler::util::Logger::begm << stream \ << rocprofiler::util::Logger::endl; \ - } + } while(0) #define WARN_LOGGING(stream) \ - { \ - std::cerr << "ROCProfiler: " << stream << std::endl; \ + do { \ + std::cerr << "ROCProfiler: " << stream << std::endl; \ rocprofiler::util::Logger::Instance() << "warning: " << rocprofiler::util::Logger::begm << stream \ << rocprofiler::util::Logger::endl; \ - } + } while(0) #ifdef DEBUG #define DBG_LOGGING(stream) \ - { \ + do { \ rocprofiler::util::Logger::Instance() << rocprofiler::util::Logger::begm << "debug: \"" \ << stream << "\"" < < < < \ " in " << __FUNCTION__ << " at " << __FILE__ << " line " << __LINE__ \ << rocprofiler::util::Logger::endl; \ - } + } while(0) #endif #endif // SRC_UTIL_LOGGER_H_ diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 0c658d5f68..c142526a3c 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -39,11 +39,14 @@ SOFTWARE. #include #include +#include +#include #include #include #include #include #include +#include #include #include "inc/rocprofiler.h" @@ -70,7 +73,8 @@ struct callbacks_data_t { // Context stored entry type struct context_entry_t { - uint32_t valid; + bool valid; + bool active; uint32_t index; hsa_agent_t agent; rocprofiler_group_t group; @@ -95,8 +99,6 @@ callbacks_data_t* callbacks_data = NULL; // Stored contexts array typedef std::map context_array_t; context_array_t* context_array = NULL; -typedef std::list wait_list_t; -wait_list_t* wait_list = NULL; // Contexts collected count volatile uint32_t context_count = 0; volatile uint32_t context_collected = 0; @@ -171,9 +173,11 @@ std::string filtr_kernel_name(const std::string name) { } ++rit; } - while (((*rit == ' ') || (*rit == ' ')) && (rit != rend)) rit++; +// while (((*rit == ' ') || (*rit == ' ')) && (rit != rend)) rit++; + while (rit != rend) if ((*rit == ' ') || (*rit == ' ')) rit++; else break; auto rbeg = rit; - while ((*rit != ' ') && (*rit != ':') && (rit != rend)) rit++; +// while ((*rit != ' ') && (*rit != ':') && (rit != rend)) rit++; + while (rit != rend) if ((*rit != ' ') && (*rit != ':')) rit++; else break; const uint32_t pos = rend - rit; const uint32_t length = rit - rbeg; return name.substr(pos, length); @@ -384,11 +388,12 @@ void output_group(const context_entry_t* entry, const char* label) { } } -// Dump stored context profiling output data -bool dump_context(context_entry_t* entry) { +// Dump stored context entry +bool dump_context_entry(context_entry_t* entry) { hsa_status_t status = HSA_STATUS_ERROR; - if (entry->valid == 0) return true; + volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); + while (valid->load() == false) sched_yield(); const rocprofiler_dispatch_record_t* record = entry->data.record; if (record) { @@ -438,65 +443,48 @@ bool dump_context(context_entry_t* entry) { rocprofiler_close(group.context); } - entry->valid = 0; return true; } -// Dump and clean a given context entry -static inline bool dump_context_entry(context_entry_t* entry) { - const bool ret = dump_context(entry); - if (ret) dealloc_context_entry(entry); - return ret; -} - -// Dump waiting entries -static inline void dump_wait_list() { - if (pthread_mutex_lock(&mutex) != 0) { - perror("pthread_mutex_lock"); - abort(); - } - - auto it = wait_list->begin(); - auto end = wait_list->end(); - while (it != end) { - auto cur = it++; - if (dump_context_entry(*cur)) { - wait_list->erase(cur); +// Wait for and dump all stored contexts for a given queue if not NULL +void dump_context_array(hsa_queue_t* queue) { + bool done = false; + while (done == false) { + done = true; + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); } - } - - if (pthread_mutex_unlock(&mutex) != 0) { - perror("pthread_mutex_unlock"); - abort(); - } -} - -// Dump all stored contexts profiling output data -void dump_context_array() { - if (pthread_mutex_lock(&mutex) != 0) { - perror("pthread_mutex_lock"); - abort(); - } - - if (context_array) { - if (!wait_list->empty()) dump_wait_list(); - - auto it = context_array->begin(); - auto end = context_array->end(); - while (it != end) { - auto cur = it++; - dump_context(&(cur->second)); + + if (context_array) { + auto it = context_array->begin(); + auto end = context_array->end(); + while (it != end) { + auto cur = it++; + context_entry_t* entry = &(cur->second); + volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); + while (valid->load() == false) sched_yield(); + if ((queue == NULL) || (entry->data.queue == queue)) { + if (entry->active == true) { + if (dump_context_entry(&(cur->second)) == false) done = false; + else entry->active = false; + } + } + } + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + if (done == false) sched_yield(); } } - - if (pthread_mutex_unlock(&mutex) != 0) { - perror("pthread_mutex_unlock"); - abort(); - } } // Profiling completion handler -bool handler(rocprofiler_group_t group, void* arg) { +// Dump and delete the context entry +// Return true if the context was dumped successfully +bool context_handler(rocprofiler_group_t group, void* arg) { context_entry_t* entry = reinterpret_cast(arg); if (pthread_mutex_lock(&mutex) != 0) { @@ -504,11 +492,15 @@ bool handler(rocprofiler_group_t group, void* arg) { abort(); } - if (!wait_list->empty()) dump_wait_list(); - - if (!dump_context_entry(entry)) { - wait_list->push_back(entry); + bool ret = true; + if (entry->active == true) { + ret = dump_context_entry(entry); + if (ret == false) { + fprintf(stderr, "tool error: context is not complete\n"); + abort(); + } } + if (ret) dealloc_context_entry(entry); if (trace_on) { fprintf(stdout, "tool::handler: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); @@ -579,7 +571,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, context_entry_t* entry = alloc_context_entry(); // context properties rocprofiler_properties_t properties{}; - properties.handler = (result_prefix != NULL) ? handler : NULL; + properties.handler = (result_prefix != NULL) ? context_handler : NULL; properties.handler_arg = (void*)entry; rocprofiler_feature_t* features = tool_data->features; @@ -600,22 +592,20 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, feature_count = next_offset - set_offset; } - if (feature_count > 0) { - // Open profiling context - status = rocprofiler_open(callback_data->agent, features, feature_count, - &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); - check_status(status); + // Open profiling context + status = rocprofiler_open(callback_data->agent, features, feature_count, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); - // Check that we have only one profiling group - uint32_t group_count = 0; - status = rocprofiler_group_count(context, &group_count); - check_status(status); - assert(group_count == 1); - // Get group[0] - const uint32_t group_index = 0; - status = rocprofiler_get_group(context, group_index, group); - check_status(status); - } + // Check that we have only one profiling group + uint32_t group_count = 0; + status = rocprofiler_group_count(context, &group_count); + check_status(status); + assert(group_count == 1); + // Get group[0] + const uint32_t group_index = 0; + status = rocprofiler_get_group(context, group_index, group); + check_status(status); // Fill profiling context entry entry->agent = callback_data->agent; @@ -625,7 +615,8 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, entry->data = *callback_data; entry->data.kernel_name = strdup(callback_data->kernel_name); entry->file_handle = tool_data->file_handle; - entry->valid = 1; + entry->active = true; + reinterpret_cast*>(&entry->valid)->store(true); if (trace_on) { fprintf(stdout, "tool::dispatch: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); @@ -637,7 +628,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, hsa_status_t destroy_callback(hsa_queue_t* queue, void*) { if (result_file_opened == false) printf("\nROCProfiler results:\n"); - dump_context_array(); + dump_context_array(queue); return HSA_STATUS_SUCCESS; } @@ -969,7 +960,6 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Context array aloocation context_array = new context_array_t; - wait_list = new wait_list_t; // Adding dispatch observer rocprofiler_queue_callbacks_t callbacks_ptrs{0}; @@ -1020,21 +1010,13 @@ extern "C" PUBLIC_API void OnUnloadTool() { rocprofiler_remove_queue_callbacks(); // Dump stored profiling output data - printf("\nROCPRofiler: %u contexts collected", context_collected); - if (result_file_opened) printf(", output directory %s", result_prefix); - printf("\n"); fflush(stdout); - dump_context_array(); - if (wait_list) { - if (!wait_list->empty()) { - printf("\nWaiting for pending kernels ..."); fflush(stdout); - while (wait_list->size() != 0) { - usleep(1000); - dump_wait_list(); - } - printf(".done\n"); fflush(stdout); - } + printf("\nROCPRofiler: %u contexts collected", context_collected); fflush(stdout); + dump_context_array(NULL); + if (result_file_opened) { + fclose(result_file_handle); + printf(", output directory %s", result_prefix); } - if (result_file_opened) fclose(result_file_handle); + printf("\n"); fflush(stdout); // Cleanup if (callbacks_data != NULL) { @@ -1052,8 +1034,6 @@ extern "C" PUBLIC_API void OnUnloadTool() { range_vec = NULL; delete context_array; context_array = NULL; - delete wait_list; - wait_list = NULL; } extern "C" DESTRUCTOR_API void destructor() {