From 6eef0003a5d242fb48c0d48b694d3e3704e10cfe Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 21 Feb 2018 10:07:59 -0600 Subject: [PATCH] constructor/destructor changes, OnLoad/OnUnload functions, removed hsa_init()/hsa_shut_down() [ROCm/rocprofiler commit: a9a5119399f9af60a55c53f580748424c59e3364] --- projects/rocprofiler/src/core/context.h | 10 +++- .../rocprofiler/src/core/intercept_queue.cpp | 3 +- .../rocprofiler/src/core/intercept_queue.h | 30 +++-------- projects/rocprofiler/src/core/rocprofiler.cpp | 53 +++++++++++++++++-- .../rocprofiler/src/core/simple_proxy_queue.h | 5 +- .../rocprofiler/src/util/hsa_rsrc_factory.cpp | 11 ++-- .../rocprofiler/src/util/hsa_rsrc_factory.h | 8 ++- projects/rocprofiler/test/ctrl/test.cpp | 2 +- projects/rocprofiler/test/ctrl/test_hsa.cpp | 5 +- projects/rocprofiler/test/ctrl/tool.cpp | 18 +++---- projects/rocprofiler/test/metrics.xml | 10 ++-- 11 files changed, 97 insertions(+), 58 deletions(-) diff --git a/projects/rocprofiler/src/core/context.h b/projects/rocprofiler/src/core/context.h index ac68bf6199..acda717c00 100644 --- a/projects/rocprofiler/src/core/context.h +++ b/projects/rocprofiler/src/core/context.h @@ -329,8 +329,14 @@ class Context { const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { // Wait for stop packet to complete - hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1, - HSA_WAIT_STATE_BLOCKED); + const uint64_t timeout = UINT64_MAX; + bool complete = false; + while (!complete) { + const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, timeout, + HSA_WAIT_STATE_BLOCKED); + complete = (signal_value == 0); + if (!complete) printf("ROCProfiler: Signal timeout, signal(%d) timeout(%lx)\n", (int)signal_value, timeout); + } for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT; callback_data_t 
callback_data{tuple.info_vector, tuple.info_vector->size(), NULL}; const hsa_status_t status = diff --git a/projects/rocprofiler/src/core/intercept_queue.cpp b/projects/rocprofiler/src/core/intercept_queue.cpp index 06821ff0ee..6712e60dab 100644 --- a/projects/rocprofiler/src/core/intercept_queue.cpp +++ b/projects/rocprofiler/src/core/intercept_queue.cpp @@ -9,7 +9,6 @@ void InterceptQueue::HsaIntercept(HsaApiTable* table) { InterceptQueue::mutex_t InterceptQueue::mutex_; rocprofiler_callback_t InterceptQueue::on_dispatch_cb_ = NULL; void* InterceptQueue::on_dispatch_cb_data_ = NULL; -const char* InterceptQueue::tool_lib_ = NULL; -void* InterceptQueue::tool_handle_ = NULL; InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL; +const char* InterceptQueue::kernel_none_ = ""; } // namespace rocprofiler diff --git a/projects/rocprofiler/src/core/intercept_queue.h b/projects/rocprofiler/src/core/intercept_queue.h index 76dc0fd163..2cc976b55e 100644 --- a/projects/rocprofiler/src/core/intercept_queue.h +++ b/projects/rocprofiler/src/core/intercept_queue.h @@ -26,30 +26,13 @@ class InterceptQueue { static void HsaIntercept(HsaApiTable* table); - static void SetTool(const char* tool) { tool_lib_ = tool; } - - static void UnloadTool() { - if (tool_handle_) dlclose(tool_handle_); - } - static hsa_status_t QueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data, uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue) { - std::lock_guard lck(mutex_); - hsa_status_t status = HSA_STATUS_ERROR; - - if (tool_lib_) { - tool_handle_ = dlopen(tool_lib_, RTLD_NOW); - if (tool_handle_ == NULL) { - fprintf(stderr, "ROCProfiler: can't load tool library \"%s\"\n", tool_lib_); - fprintf(stderr, "%s\n", dlerror()); - exit(1); - } - tool_lib_ = NULL; - } + std::lock_guard lck(mutex_); if (!obj_map_) obj_map_ = new obj_map_t; @@ -152,7 +135,7 @@ class 
InterceptQueue { return (*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask; } - static char* GetKernelName(const hsa_kernel_dispatch_packet_t* dispatch_packet) { + static const char* GetKernelName(const hsa_kernel_dispatch_packet_t* dispatch_packet) { const amd_kernel_code_t* kernel_code = NULL; hsa_status_t status = util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address( @@ -167,13 +150,15 @@ class InterceptQueue { // Kernel name is mangled name // apply __cxa_demangle() to demangle it - char* funcname = NULL; + const char* funcname = NULL; if (kernel_name != NULL) { size_t funcnamesize = 0; int status; - char* ret = abi::__cxa_demangle(kernel_name, NULL, &funcnamesize, &status); + const char* ret = abi::__cxa_demangle(kernel_name, NULL, &funcnamesize, &status); funcname = (ret != 0) ? ret : strdup(kernel_name); } + if (funcname == NULL) funcname = strdup(kernel_none_); + return funcname; } @@ -181,9 +166,8 @@ class InterceptQueue { static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1; static rocprofiler_callback_t on_dispatch_cb_; static void* on_dispatch_cb_data_; - static const char* tool_lib_; - static void* tool_handle_; static obj_map_t* obj_map_; + static const char* kernel_none_; ProxyQueue* const proxy_; const util::AgentInfo* agent_info_; diff --git a/projects/rocprofiler/src/core/rocprofiler.cpp b/projects/rocprofiler/src/core/rocprofiler.cpp index b0430cf69d..d6fa2e4555 100644 --- a/projects/rocprofiler/src/core/rocprofiler.cpp +++ b/projects/rocprofiler/src/core/rocprofiler.cpp @@ -99,14 +99,53 @@ void RestoreHsaApi() { #endif } +typedef void (*tool_handler_t)(); +void * kTtoolHandle = NULL; + +void LoadTool(const char* tool_lib) { + if (tool_lib) { + kTtoolHandle = dlopen(tool_lib, RTLD_NOW); + if (kTtoolHandle == NULL) { + fprintf(stderr, "ROCProfiler: can't load tool library \"%s\"\n", tool_lib); + fprintf(stderr, "%s\n", dlerror()); + exit(1); + } + tool_handler_t handler = 
reinterpret_cast(dlsym(kTtoolHandle, "OnLoadTool")); + if (handler == NULL) { + fprintf(stderr, "ROCProfiler: tool library corrupted, OnLoadTool() method is expected\n"); + fprintf(stderr, "%s\n", dlerror()); + exit(1); + } + tool_handler_t on_unload_handler = reinterpret_cast(dlsym(kTtoolHandle, "OnUnloadTool")); + if (on_unload_handler == NULL) { + fprintf(stderr, "ROCProfiler: tool library corrupted, OnUnloadTool() method is expected\n"); + fprintf(stderr, "%s\n", dlerror()); + exit(1); + } + handler(); + } +} + +void UnloadTool() { + if (kTtoolHandle) { + tool_handler_t handler = reinterpret_cast(dlsym(kTtoolHandle, "OnUnloadTool")); + if (handler == NULL) { + fprintf(stderr, "ROCProfiler error: tool library corrupted, OnUnloadTool() method is expected\n"); + fprintf(stderr, "%s\n", dlerror()); + exit(1); + } + handler(); + dlclose(kTtoolHandle); + } +} + CONSTRUCTOR_API void constructor() { util::Logger::Create(); - util::HsaRsrcFactory::Create(); } DESTRUCTOR_API void destructor() { - rocprofiler::MetricsDict::Destroy(); util::HsaRsrcFactory::Destroy(); + rocprofiler::MetricsDict::Destroy(); util::Logger::Destroy(); } @@ -139,19 +178,23 @@ extern "C" { // HSA-runtime tool on-load method PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, const char* const* failed_tool_names) { + const bool intercept_mode = (getenv("ROCP_HSA_INTERCEPT") != NULL); rocprofiler::SaveHsaApi(table); rocprofiler::ProxyQueue::InitFactory(); - rocprofiler::InterceptQueue::SetTool(getenv("ROCP_TOOL_LIB")); // HSA intercepting - if (getenv("ROCP_HSA_INTERCEPT") != NULL) { + if (intercept_mode) { rocprofiler::InterceptQueue::HsaIntercept(table); rocprofiler::ProxyQueue::HsaIntercept(table); } + rocprofiler::LoadTool(getenv("ROCP_TOOL_LIB")); return true; } // HSA-runtime tool on-unload method -PUBLIC_API void OnUnload() { rocprofiler::RestoreHsaApi(); } +PUBLIC_API void OnUnload() { + rocprofiler::UnloadTool(); + rocprofiler::RestoreHsaApi(); 
+} // Returns library vesrion PUBLIC_API uint32_t rocprofiler_version_major() { return ROCPROFILER_VERSION_MAJOR; } diff --git a/projects/rocprofiler/src/core/simple_proxy_queue.h b/projects/rocprofiler/src/core/simple_proxy_queue.h index 6b3f73b7d1..e556644c10 100644 --- a/projects/rocprofiler/src/core/simple_proxy_queue.h +++ b/projects/rocprofiler/src/core/simple_proxy_queue.h @@ -141,7 +141,10 @@ class SimpleProxyQueue : public ProxyQueue { queue_mask_(0), submit_index_(0), on_submit_cb_(0), - on_submit_cb_data_(0) {} + on_submit_cb_data_(0) + { + printf("ROCProfiler: SimpleProxyQueue is enabled\n"); + } ~SimpleProxyQueue() {} diff --git a/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp b/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp index 0bed0abcf2..23c26265c1 100644 --- a/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp +++ b/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp @@ -81,10 +81,13 @@ hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* d // Constructor of the class HsaRsrcFactory::HsaRsrcFactory() { + hsa_status_t status; +#if 0 // Initialize the Hsa Runtime - hsa_status_t status = hsa_init(); + printf("ROCProfiler: HSA init\n"); + status = hsa_init(); CHECK_STATUS("Error in hsa_init", status); - +#endif // Discover the set of Gpu devices available on the platform status = hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); @@ -108,9 +111,11 @@ HsaRsrcFactory::HsaRsrcFactory() { HsaRsrcFactory::~HsaRsrcFactory() { for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; - +#if 0 + printf("ROCProfiler: HSA shutdown\n"); hsa_status_t status = hsa_shut_down(); CHECK_STATUS("Error in hsa_shut_down", status); +#endif } hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { diff --git a/projects/rocprofiler/src/util/hsa_rsrc_factory.h b/projects/rocprofiler/src/util/hsa_rsrc_factory.h index be6f27766c..db86b617a4 100644 --- 
a/projects/rocprofiler/src/util/hsa_rsrc_factory.h +++ b/projects/rocprofiler/src/util/hsa_rsrc_factory.h @@ -114,9 +114,7 @@ class HsaRsrcFactory { public: typedef std::recursive_mutex mutex_t; - static HsaRsrcFactory* Create() { return NULL; } - - static HsaRsrcFactory* CreateInstance() { + static HsaRsrcFactory* Create() { std::lock_guard lck(mutex_); if (instance_ == NULL) { instance_ = new HsaRsrcFactory(); @@ -125,9 +123,9 @@ class HsaRsrcFactory { } static HsaRsrcFactory& Instance() { - CreateInstance(); + if (instance_ == NULL) instance_ = Create(); hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; - CHECK_STATUS("HsaRsrcFactory::Instance() is not found", status); + CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); return *instance_; } diff --git a/projects/rocprofiler/test/ctrl/test.cpp b/projects/rocprofiler/test/ctrl/test.cpp index 780e0d010a..ce6af33285 100644 --- a/projects/rocprofiler/test/ctrl/test.cpp +++ b/projects/rocprofiler/test/ctrl/test.cpp @@ -13,6 +13,6 @@ int main(int argc, char** argv) { const int diter = (diter_s != NULL) ? 
atol(diter_s) : 1; TestHsa::HsaInstantiate(); for (int i = 0; i < kiter; ++i) RunKernel(argc, argv, diter); - //TestHsa::HsaShutdown(); + TestHsa::HsaShutdown(); return 0; } diff --git a/projects/rocprofiler/test/ctrl/test_hsa.cpp b/projects/rocprofiler/test/ctrl/test_hsa.cpp index 7de33c9d92..58dd38efb9 100644 --- a/projects/rocprofiler/test/ctrl/test_hsa.cpp +++ b/projects/rocprofiler/test/ctrl/test_hsa.cpp @@ -69,7 +69,10 @@ HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) { } void TestHsa::HsaShutdown() { - if (hsa_queue_ != NULL) hsa_queue_destroy(hsa_queue_); + if (hsa_queue_ != NULL) { + hsa_queue_destroy(hsa_queue_); + hsa_queue_ = NULL; + } if (hsa_rsrc_) hsa_rsrc_->Destroy(); } diff --git a/projects/rocprofiler/test/ctrl/tool.cpp b/projects/rocprofiler/test/ctrl/tool.cpp index 4b245a9522..2ea0f119ff 100644 --- a/projects/rocprofiler/test/ctrl/tool.cpp +++ b/projects/rocprofiler/test/ctrl/tool.cpp @@ -338,7 +338,7 @@ static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg } // Tool constructor -CONSTRUCTOR_API void constructor() +extern "C" PUBLIC_API void OnLoadTool() { std::map parameters_dict; parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET"] = @@ -487,23 +487,21 @@ CONSTRUCTOR_API void constructor() } // Tool destructor -DESTRUCTOR_API void destructor() { - const bool result_file_opened = (result_prefix != NULL) && (result_file_handle != NULL); +extern "C" PUBLIC_API void OnUnloadTool() { + // Unregister dispatch callback + rocprofiler_remove_dispatch_callback(); + // Dump stored profiling output data + const bool result_file_opened = (result_prefix != NULL) && (result_file_handle != NULL); printf("\nROCPRofiler: %u contexts collected", context_count); if (result_file_opened) printf(", output directory %s", result_prefix); printf("\n"); - - // Dump stored profiling output data dump_context_array(); + if (result_file_opened) fclose(result_file_handle); - // Unregister dispatch callback 
and free callback data - rocprofiler_remove_dispatch_callback(); + // Cleanup if (dispatch_data != NULL) { delete[] dispatch_data->features; delete dispatch_data; } - - // Close output file - if (result_file_opened) fclose(result_file_handle); } diff --git a/projects/rocprofiler/test/metrics.xml b/projects/rocprofiler/test/metrics.xml index a666384fc4..e84aa28728 100644 --- a/projects/rocprofiler/test/metrics.xml +++ b/projects/rocprofiler/test/metrics.xml @@ -213,14 +213,14 @@ # VALUBusy The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). # SALUBusy The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). @@ -248,14 +248,14 @@ # MemUnitBusy The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). # MemUnitStalled The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). @@ -276,7 +276,7 @@ # LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad).