diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index 53ec083644..2c8853e571 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -41,9 +41,11 @@ class InterceptQueue { group_segment_size, queue, &status); if (status != HSA_STATUS_SUCCESS) abort(); - if (tracker_on_ && (tracker_ == NULL)) tracker_ = new Tracker(timeout_); - status = hsa_amd_profiling_set_profiler_enabled(*queue, true); - if (status != HSA_STATUS_SUCCESS) abort(); + if (tracker_on_ && (tracker_ == NULL)) { + tracker_ = new Tracker(timeout_); + status = hsa_amd_profiling_set_profiler_enabled(*queue, true); + if (status != HSA_STATUS_SUCCESS) abort(); + } if (!obj_map_) obj_map_ = new obj_map_t; InterceptQueue* obj = new InterceptQueue(agent, *queue, proxy); diff --git a/src/core/proxy_queue.cpp b/src/core/proxy_queue.cpp index 54c93801c8..59ab280bd1 100644 --- a/src/core/proxy_queue.cpp +++ b/src/core/proxy_queue.cpp @@ -19,7 +19,7 @@ ProxyQueue* ProxyQueue::Create(hsa_agent_t agent, uint32_t size, hsa_queue_type3 hsa_status_t suc = HSA_STATUS_ERROR; #ifdef ROCP_HSA_PROXY ProxyQueue* instance = - (rocp_type_) ? (ProxyQueue*)new SimpleProxyQueue() : (ProxyQueue*)new HsaProxyQueue(); + (rocp_type_) ? (ProxyQueue*) new SimpleProxyQueue() : (ProxyQueue*) new HsaProxyQueue(); #else ProxyQueue* instance = new SimpleProxyQueue(); #endif diff --git a/src/core/simple_proxy_queue.h b/src/core/simple_proxy_queue.h index e556644c10..c1c18862bd 100644 --- a/src/core/simple_proxy_queue.h +++ b/src/core/simple_proxy_queue.h @@ -144,6 +144,7 @@ class SimpleProxyQueue : public ProxyQueue { on_submit_cb_data_(0) { printf("ROCProfiler: SimpleProxyQueue is enabled\n"); + fflush(stdout); } ~SimpleProxyQueue() {} diff --git a/src/core/tracker.h b/src/core/tracker.h index 98ae3fe19e..76b24257bc 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -1,6 +1,7 @@ #ifndef SRC_CORE_TRACKER_H_ #define SRC_CORE_TRACKER_H_ +#include #include #include #include @@ -29,7 +30,7 @@ class Tracker { record_t* record; }; - Tracker(uint64_t timeout = UINT64_MAX) : timeout_(timeout) {} + Tracker(uint64_t timeout = UINT64_MAX) : timeout_(timeout), outstanding(0) {} ~Tracker() { mutex_.lock(); for (entry_t* entry : sig_list_) { @@ -51,6 +52,7 @@ class Tracker { // Add tracker entry entry_t* Add(const hsa_agent_t& agent, const hsa_signal_t& orig) { + hsa_status_t status = HSA_STATUS_ERROR; entry_t* entry = new entry_t{}; assert(entry); entry->tracker = this; @@ -60,7 +62,7 @@ class Tracker { entry->agent = agent; entry->orig = orig; - hsa_status_t status = hsa_signal_create(1, 0, NULL, &(entry->signal)); + status = hsa_signal_create(1, 0, NULL, &(entry->signal)); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); record_t* record = new record_t{}; @@ -72,6 +74,14 @@ class Tracker { hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + if (trace_on_) { + mutex_.lock(); + entry->tracker->outstanding++; + fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding); + fflush(stdout); + mutex_.unlock(); + } + return entry; } @@ -90,6 +100,14 @@ class Tracker { entry_t* entry = reinterpret_cast(arg); record_t* record = entry->record; + if (trace_on_) { + mutex_.lock(); + entry->tracker->outstanding--; + fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding); + fflush(stdout); + mutex_.unlock(); + } + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &record->complete); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)"); if (record->complete == 0) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP), time is zero"); @@ -103,8 +121,13 @@ class Tracker { hsa_signal_t orig = entry->orig; if (orig.handle) { + amd_signal_t* orig_signal_ptr = reinterpret_cast(orig.handle); + amd_signal_t* prof_signal_ptr = reinterpret_cast(entry->signal.handle); + orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; + orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; + const hsa_signal_value_t value = hsa_signal_load_relaxed(orig); - hsa_signal_store_relaxed(orig, value - 1); + hsa_signal_store_screlease(orig, value - 1); } entry->tracker->Del(entry); @@ -117,6 +140,10 @@ class Tracker { sig_list_t sig_list_; // Inter-thread synchronization static mutex_t mutex_; + // Outstanding dispatches + uint64_t outstanding; + // Enable tracing + static const bool trace_on_ = false; }; } // namespace rocprofiler diff --git a/test/ctrl/tool.cpp b/test/ctrl/tool.cpp index b53324d7c0..6074886da8 100644 --- a/test/ctrl/tool.cpp +++ b/test/ctrl/tool.cpp @@ -8,12 +8,12 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -40,7 +40,6 @@ struct callbacks_data_t { // Context stored entry type struct context_entry_t { - int valid; uint32_t index; rocprofiler_group_t group; rocprofiler_feature_t* features; @@ -49,8 +48,10 @@ struct context_entry_t { FILE* file_handle; }; +// Enable tracing +static const bool trace_on = false; // Tool is unloaded -bool is_unloaded = false; +bool is_loaded = false; // Dispatch callbacks and context handlers synchronization pthread_mutex_t mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; // Dispatch callback data @@ -58,6 +59,8 @@ callbacks_data_t* callbacks_data = NULL; // Stored contexts array typedef std::map context_array_t; context_array_t* context_array = NULL; +typedef std::list wait_list_t; +wait_list_t* wait_list = NULL; // Contexts collected count uint32_t context_count = 0; uint32_t context_collected = 0; @@ -106,7 +109,6 @@ context_entry_t* alloc_context_entry() { exit(1); } - if (context_array == NULL) context_array = new context_array_t; const uint32_t index = context_count; auto ret = context_array->insert({index, context_entry_t{}}); if (ret.second == false) { @@ -247,58 +249,76 @@ void output_group(FILE* file, const rocprofiler_group_t* group, const char* str) bool dump_context(context_entry_t* entry) { hsa_status_t status = HSA_STATUS_ERROR; - if (entry->valid) { - const rocprofiler_dispatch_record_t* record = entry->data.record; - if (record) { - if (record->complete == 0) return false; - } - - ++context_collected; - entry->valid = 0; - const uint32_t index = entry->index; - FILE* file_handle = entry->file_handle; - const rocprofiler_feature_t* features = entry->features; - const unsigned feature_count = entry->feature_count; - - fprintf(file_handle, "dispatch[%u], queue_index(%lu), kernel_name(\"%s\")", - index, - entry->data.queue_index, - entry->data.kernel_name); - if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", - record->dispatch, - record->begin, - record->end, - record->complete); - fprintf(file_handle, "\n"); - fflush(file_handle); - - if (record) { - delete record; - entry->data.record = NULL; - } - - rocprofiler_group_t& group = entry->group; - if (group.context != NULL) { - status = rocprofiler_group_get_data(&group); - check_status(status); - // output_group(file, group, "Group[0] data"); - - status = rocprofiler_get_metrics(group.context); - check_status(status); - std::ostringstream oss; - oss << index << "__" << entry->data.kernel_name; - output_results(file_handle, features, feature_count, group.context, oss.str().substr(0, KERNEL_NAME_LEN_MAX).c_str()); - free(const_cast(entry->data.kernel_name)); - - // Finishing cleanup - // Deleting profiling context will delete all allocated resources - rocprofiler_close(group.context); + const rocprofiler_dispatch_record_t* record = entry->data.record; + if (record) { + if (record->complete == 0) { + return false; } } + ++context_collected; + const uint32_t index = entry->index; + FILE* file_handle = entry->file_handle; + const rocprofiler_feature_t* features = entry->features; + const unsigned feature_count = entry->feature_count; + + fprintf(file_handle, "dispatch[%u], queue_index(%lu), kernel_name(\"%s\")", + index, + entry->data.queue_index, + entry->data.kernel_name); + if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", + record->dispatch, + record->begin, + record->end, + record->complete); + fprintf(file_handle, "\n"); + fflush(file_handle); + + if (record) { + delete record; + entry->data.record = NULL; + } + + rocprofiler_group_t& group = entry->group; + if (group.context != NULL) { + status = rocprofiler_group_get_data(&group); + check_status(status); + // output_group(file, group, "Group[0] data"); + + status = rocprofiler_get_metrics(group.context); + check_status(status); + std::ostringstream oss; + oss << index << "__" << entry->data.kernel_name; + output_results(file_handle, features, feature_count, group.context, oss.str().substr(0, KERNEL_NAME_LEN_MAX).c_str()); + free(const_cast(entry->data.kernel_name)); + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + rocprofiler_close(group.context); + } + return true; } +// Profiling completion handler +static inline bool dump_context_entry(context_entry_t* entry) { + const bool ret = dump_context(entry); + if (ret) dealloc_context_entry(entry); + return ret; +} + +// Dump waiting entries +static inline void dump_wait_list() { + auto it = wait_list->begin(); + auto end = wait_list->begin(); + while (it != end) { + auto cur = it++; + if (dump_context_entry(*cur)) { + wait_list->erase(cur); + } + } +} + // Dump all stored contexts profiling output data void dump_context_array() { if (pthread_mutex_lock(&mutex) != 0) { @@ -306,7 +326,16 @@ void dump_context_array() { exit(1); } - if (context_array) for (auto& v : *context_array) dump_context(&v.second); + if (!wait_list->empty()) dump_wait_list(); + + if (context_array) { + auto it = context_array->begin(); + auto end = context_array->end(); + while (it != end) { + auto cur = it++; + dump_context_entry(&(cur->second)); + } + } if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); @@ -317,16 +346,21 @@ void dump_context_array() { // Profiling completion handler bool handler(rocprofiler_group_t group, void* arg) { context_entry_t* entry = reinterpret_cast(arg); - bool ret = false; if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); exit(1); } - if (context_array->find(entry->index) != context_array->end()) { - if (dump_context(entry)) dealloc_context_entry(entry); - else ret = true; + if (!wait_list->empty()) dump_wait_list(); + + if (!dump_context_entry(entry)) { + wait_list->push_back(entry); + } + + if (trace_on) { + fprintf(stdout, "tool::handler: context_array %d\n", (int)(context_array->size())); + fflush(stdout); } if (pthread_mutex_unlock(&mutex) != 0) { @@ -334,7 +368,7 @@ bool handler(rocprofiler_group_t group, void* arg) { exit(1); } - return ret; + return false; } // Kernel disoatch callback @@ -414,7 +448,11 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, entry->data.kernel_name = strdup(callback_data->kernel_name); entry->file_handle = tool_data->file_handle; entry->index = next_context_count(); - entry->valid = 1; + + if (trace_on) { + fprintf(stdout, "tool::dispatch: context_array %d\n", (int)(context_array->size())); + fflush(stdout); + } return status; } @@ -468,6 +506,17 @@ void get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& fie // Tool constructor extern "C" PUBLIC_API void OnLoadTool() { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + exit(1); + } + if (is_loaded) return; + is_loaded = true; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + exit(1); + } + std::map parameters_dict; parameters_dict["COMPUTE_UNIT_TARGET"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; @@ -602,6 +651,9 @@ extern "C" PUBLIC_API void OnLoadTool() } fflush(stdout); + // Context array aloocation + context_array = new context_array_t; + wait_list = new wait_list_t; // Adding dispatch observer rocprofiler_queue_callbacks_t callbacks_ptrs{0}; @@ -624,7 +676,16 @@ extern "C" PUBLIC_API void OnLoadTool() // Tool destructor extern "C" PUBLIC_API void OnUnloadTool() { - is_unloaded = true; + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + exit(1); + } + if (!is_loaded) return; + is_loaded = false; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + exit(1); + } // Unregister dispatch callback rocprofiler_remove_queue_callbacks(); @@ -648,8 +709,12 @@ extern "C" PUBLIC_API void OnUnloadTool() { kernel_string_vec = NULL; delete range_vec; range_vec = NULL; + delete context_array; + context_array = NULL; + delete wait_list; + wait_list = NULL; } extern "C" DESTRUCTOR_API void destructor() { - if (is_unloaded == false) OnUnloadTool(); + if (is_loaded == true) OnUnloadTool(); }