From 5a7dec797e0ec1e41da5d98be81da1b5e51fce2e Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Sat, 27 May 2023 09:34:42 +0000 Subject: [PATCH] SWDEV-403050: Multiple fixes for Memory Leaks in profiler Change-Id: Ib720a81105af13898ff745ce0cbc2a48c1f4a980 [ROCm/rocprofiler commit: 08fc21ac310bebb22638ae5a8a48879330422713] --- projects/rocprofiler/CMakeLists.txt | 3 + projects/rocprofiler/bin/rocprofv2 | 18 - .../include/rocprofiler/v2/rocprofiler.h | 20 +- projects/rocprofiler/plugin/file/file.cpp | 3 +- .../rocprofiler/samples/common/helper.cpp | 3 +- projects/rocprofiler/src/api/CMakeLists.txt | 6 +- projects/rocprofiler/src/core/context.h | 3 +- .../core/counters/metrics/eval_metrics.cpp | 45 +- .../src/core/counters/metrics/metrics.h | 8 +- .../src/core/hardware/hsa_info.cpp | 4 +- .../rocprofiler/src/core/hardware/hsa_info.h | 6 +- .../rocprofiler/src/core/hsa/hsa_common.cpp | 27 +- .../rocprofiler/src/core/hsa/hsa_common.h | 5 +- .../core/hsa/packets/packets_generator.cpp | 543 +++++++++--------- .../src/core/hsa/packets/packets_generator.h | 2 +- .../rocprofiler/src/core/hsa/queues/queue.cpp | 369 ++++++------ .../rocprofiler/src/core/hsa/queues/queue.h | 4 +- .../src/core/memory/generic_buffer.cpp | 5 +- .../src/core/memory/generic_buffer.h | 6 +- projects/rocprofiler/src/core/metrics.h | 8 +- .../src/core/session/profiler/profiler.cpp | 49 +- .../src/core/session/profiler/profiler.h | 7 +- .../rocprofiler/src/core/session/session.cpp | 26 +- .../rocprofiler/src/core/session/session.h | 2 +- projects/rocprofiler/src/tools/tool.cpp | 90 ++- projects/rocprofiler/src/utils/helper.cpp | 3 +- 26 files changed, 673 insertions(+), 592 deletions(-) diff --git a/projects/rocprofiler/CMakeLists.txt b/projects/rocprofiler/CMakeLists.txt index fbeab25369..a5a0a9d05f 100644 --- a/projects/rocprofiler/CMakeLists.txt +++ b/projects/rocprofiler/CMakeLists.txt @@ -116,6 +116,9 @@ find_package( PATHS ${ROCM_PATH}) +find_library(NUMA NAME numa REQUIRED) +link_libraries(${NUMA}) + get_property( HSA_RUNTIME_INCLUDE_DIRECTORIES TARGET hsa-runtime64::hsa-runtime64 diff --git a/projects/rocprofiler/bin/rocprofv2 b/projects/rocprofiler/bin/rocprofv2 index 319e70833f..5d08c11942 100755 --- a/projects/rocprofiler/bin/rocprofv2 +++ b/projects/rocprofiler/bin/rocprofv2 @@ -302,24 +302,6 @@ get_pmc_results_txt_path(){ done } -if [ -n "$COUNTERS_PMC_DIRS" ]; then - COUNTERS_RESULTS_TXT="" - for PMC_DIR in $COUNTERS_PMC_DIRS; do - COUNTERS_RESULTS_TXT="$COUNTERS_RESULTS_TXT $(get_pmc_results_txt_path $PMC_DIR)" - done - CSV_RESULTS=$OUTPUT_PATH_INTERNAL/results.csv - if [ -n "$OUT_FILE_NAME" ]; then - CSV_RESULTS=$OUTPUT_PATH_INTERNAL/$OUT_FILE_NAME - fi - echo "csv results path: $CSV_RESULTS" - export ROCP_MERGE_PIDS=1 #required for tblextr.py to work correctly for counters - if [ $RUN_FROM_BUILD == 1 ]; then - eval "python3 $ROCM_DIR/bin/tblextr.py $CSV_RESULTS $COUNTERS_RESULTS_TXT" - else - eval "python3 $ROCPROFV2_DIR/../libexec/rocprofiler/tblextr.py $CSV_RESULTS $COUNTERS_RESULTS_TXT" - fi -fi - if [ -n "$ATT_PATH" ]; then if [ -n "$ATT_ARGV" ]; then eval "python3 $ATT_PATH $ATT_ARGV" diff --git a/projects/rocprofiler/include/rocprofiler/v2/rocprofiler.h b/projects/rocprofiler/include/rocprofiler/v2/rocprofiler.h index 097233836a..f2f23a80e7 100644 --- a/projects/rocprofiler/include/rocprofiler/v2/rocprofiler.h +++ b/projects/rocprofiler/include/rocprofiler/v2/rocprofiler.h @@ -110,7 +110,7 @@ THE SOFTWARE. #if !defined(ROCPROFILER) #if defined(ROCPROFILER_EXPORTS) #define ROCPROFILER_API ROCPROFILER_EXPORT -#else /* !defined (ROCPROFILER_EXPORTS) */ +#else /* !defined (ROCPROFILER_EXPORTS) */ #define ROCPROFILER_API ROCPROFILER_IMPORT #endif /* !defined (ROCPROFILER_EXPORTS) */ #endif /* !defined (ROCPROFILER) */ @@ -949,6 +949,14 @@ typedef struct { uint64_t signal_handle; } rocprofiler_kernel_properties_t; + +/** + * Correlation ID + */ +typedef struct { + uint64_t value; +} rocprofiler_correlation_id_t; + /** * Profiling record, this will represent all the information reported by the * profiler regarding kernel dispatches and their counters that were collected @@ -986,7 +994,11 @@ typedef struct { * Counters, including identifiers to get counter information and Counters * values */ - rocprofiler_record_counter_instance_t* counters; + const rocprofiler_record_counter_instance_t* counters; + /** + * The count of the counters that were collected by the profiler + */ + rocprofiler_record_counters_instances_count_t counters_count; /* Counters Count */ /** * kernel properties, including the grid size, work group size, * registers count, wave size and completion signal @@ -1001,9 +1013,9 @@ typedef struct { */ rocprofiler_queue_index_t queue_idx; /** - * The count of the counters that were collected by the profiler + * Correlation id */ - rocprofiler_record_counters_instances_count_t counters_count; /* Counters Count */ + rocprofiler_correlation_id_t correlation_id; } rocprofiler_record_profiler_t; typedef struct { diff --git a/projects/rocprofiler/plugin/file/file.cpp b/projects/rocprofiler/plugin/file/file.cpp index 51fd054a6f..1e82441496 100644 --- a/projects/rocprofiler/plugin/file/file.cpp +++ b/projects/rocprofiler/plugin/file/file.cpp @@ -233,7 +233,6 @@ class file_plugin_t { tracer_record.api_data_handle, tracer_record.operation_id, &function_name_c)); } } - //return; output_file_t* output_file = get_output_file(output_type_t::TRACER, tracer_record.domain); *output_file << "Record(" << tracer_record.header.id.handle << "), Domain(" << GetDomainName(tracer_record.domain) << "),"; @@ -336,7 +335,7 @@ class file_plugin_t { session_id, ROCPROFILER_COUNTER_NAME, profiler_record->counters[i].counter_handler, &counter_name_length)); if (counter_name_length > 1) { - const char* name_c = static_cast(malloc(name_length * sizeof(char))); + const char* name_c = nullptr; CHECK_ROCPROFILER(rocprofiler_query_counter_info( session_id, ROCPROFILER_COUNTER_NAME, profiler_record->counters[i].counter_handler, &name_c)); diff --git a/projects/rocprofiler/samples/common/helper.cpp b/projects/rocprofiler/samples/common/helper.cpp index d70677d8a7..16b0380bd3 100644 --- a/projects/rocprofiler/samples/common/helper.cpp +++ b/projects/rocprofiler/samples/common/helper.cpp @@ -87,7 +87,8 @@ std::string string_printf(const char* format, ...) { std::string errmsg("ROCProfiler: fatal error: " + message); fputs(errmsg.c_str(), stderr); - throw(errmsg); + // throw(errmsg); + abort(); } /* The function extracts the kernel name from diff --git a/projects/rocprofiler/src/api/CMakeLists.txt b/projects/rocprofiler/src/api/CMakeLists.txt index c300be9656..6926eaaa97 100644 --- a/projects/rocprofiler/src/api/CMakeLists.txt +++ b/projects/rocprofiler/src/api/CMakeLists.txt @@ -245,7 +245,7 @@ set_target_properties(rocprofiler-v2 PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/v2 VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR}) - + # Add custom command to copy the v2 library to buil-dir as well add_custom_command(TARGET rocprofiler-v2 POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/v2/librocprofiler64.so.2 ${CMAKE_BINARY_DIR} @@ -271,10 +271,10 @@ target_include_directories(rocprofiler-v2 if(ASAN) target_compile_options(rocprofiler-v2 PRIVATE -fsanitize=address) target_link_options(rocprofiler-v2 PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap -Wl,--no-undefined,-fsanitize=address) - target_link_libraries(rocprofiler-v2 PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 Threads::Threads atomic asan dl c stdc++ stdc++fs amd_comgr ${PCIACCESS_LIBRARIES}) + target_link_libraries(rocprofiler-v2 PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 Threads::Threads atomic numa asan dl c stdc++ stdc++fs amd_comgr ${PCIACCESS_LIBRARIES}) else() target_link_options(rocprofiler-v2 PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap -Wl,--no-undefined) - target_link_libraries(rocprofiler-v2 PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 Threads::Threads atomic dl c stdc++ stdc++fs amd_comgr ${PCIACCESS_LIBRARIES}) + target_link_libraries(rocprofiler-v2 PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 Threads::Threads atomic numa dl c stdc++ stdc++fs amd_comgr ${PCIACCESS_LIBRARIES}) endif() ## Install libraries: Non versioned lib file in dev package # install(TARGETS rocprofiler-v2 LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT dev) diff --git a/projects/rocprofiler/src/core/context.h b/projects/rocprofiler/src/core/context.h index 4b999b84de..f366f2ae81 100644 --- a/projects/rocprofiler/src/core/context.h +++ b/projects/rocprofiler/src/core/context.h @@ -207,7 +207,8 @@ class Context { } catch(...) { delete obj; obj = NULL; - throw; + std::cerr << "Error: Context Create failed" << std::endl; + abort(); } return obj; } diff --git a/projects/rocprofiler/src/core/counters/metrics/eval_metrics.cpp b/projects/rocprofiler/src/core/counters/metrics/eval_metrics.cpp index 9bc894fe71..4efa7aae91 100644 --- a/projects/rocprofiler/src/core/counters/metrics/eval_metrics.cpp +++ b/projects/rocprofiler/src/core/counters/metrics/eval_metrics.cpp @@ -46,8 +46,10 @@ hsa_status_t pmcCallback(hsa_ven_amd_aqlprofile_info_type_t info_type, if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA) { if (IsEventMatch(info_data->pmc_data.event, (*data_it)->event)) { uint32_t xcc_index = floor(passed_data->index / passed_data->single_xcc_buff_size); - (*data_it)->xcc_vals[xcc_index] += info_data->pmc_data.result; // stores event result from each xcc separately - (*data_it)->val_double += info_data->pmc_data.result; // stores accumulated event result from all xccs + (*data_it)->xcc_vals[xcc_index] += + info_data->pmc_data.result; // stores event result from each xcc separately + (*data_it)->val_double += + info_data->pmc_data.result; // stores accumulated event result from all xccs } } } @@ -98,19 +100,16 @@ bool metrics::ExtractMetricEvents( // const Metric* metric = metrics_dict->GetMetricByName(metric_names[i]); const Metric* metric = metrics_dict->Get(metric_names[i]); if (metric == nullptr) { - Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle); - fatal("input metric'%s' not supported on this hardware: %s ", metric_names[i].c_str(), - agentInfo.getName().data()); - + Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle); + fatal("input metric'%s' not supported on this hardware: %s ", metric_names[i].c_str(), + agentInfo.getName().data()); } // adding result object for derived metric std::lock_guard lock(extract_metric_events_lock); if (results_map.find(metric_names[i]) == results_map.end()) { results_map[metric_names[i]] = new results_t(metric_names[i], {}, xcc_count); - } // else { - // continue; - // } + } counters_vec = metric->GetCounters(); if (counters_vec.empty()) @@ -129,7 +128,8 @@ bool metrics::ExtractMetricEvents( } else { // result object for base metric // std::cout << "Metric : " << metric->GetName() << " : " << counter->name << std::endl; - result = new results_t(counter->name, {}, xcc_count); // TODO: set correct initial value + result = + new results_t(counter->name, {}, xcc_count); // TODO: set correct initial value results_map[counter->name] = result; } } else { @@ -188,7 +188,7 @@ bool metrics::ExtractMetricEvents( bool metrics::GetCounterData(hsa_ven_amd_aqlprofile_profile_t* profile, hsa_agent_t gpu_agent, std::vector& results_list) { uint32_t xcc_count = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle).getXccCount(); - uint32_t single_xcc_buff_size = profile->output_buffer.size /(sizeof(uint64_t) * xcc_count); + uint32_t single_xcc_buff_size = profile->output_buffer.size / (sizeof(uint64_t) * xcc_count); callback_data_t callback_data{&results_list, 0, single_xcc_buff_size}; hsa_status_t status = hsa_ven_amd_aqlprofile_iterate_data(profile, pmcCallback, &callback_data); return (status == HSA_STATUS_SUCCESS); @@ -210,16 +210,19 @@ bool metrics::GetMetricsData(std::map& results_map, return true; } -void metrics::GetCountersAndMetricResultsByXcc(uint32_t xcc_index, std::vector& results_list, - std::map& results_map, - std::vector& metrics_list){ - for(auto it = results_list.begin(); it != results_list.end(); it++){ - (*it)->val_double = (*it)->xcc_vals[xcc_index]; // set val_double to hold value for specific xcc - } +void metrics::GetCountersAndMetricResultsByXcc(uint32_t xcc_index, + std::vector& results_list, + std::map& results_map, + std::vector& metrics_list) { + for (auto it = results_list.begin(); it != results_list.end(); it++) { + (*it)->val_double = + (*it)->xcc_vals[xcc_index]; // set val_double to hold value for specific xcc + } - for(auto it = results_map.begin(); it != results_map.end(); it++){ - it->second->val_double = it->second->xcc_vals[xcc_index]; // set val_double to hold value for specific xcc - } + for (auto it = results_map.begin(); it != results_map.end(); it++) { + it->second->val_double = + it->second->xcc_vals[xcc_index]; // set val_double to hold value for specific xcc + } - GetMetricsData(results_map, metrics_list); + GetMetricsData(results_map, metrics_list); } diff --git a/projects/rocprofiler/src/core/counters/metrics/metrics.h b/projects/rocprofiler/src/core/counters/metrics/metrics.h index 062285d344..d21e7d105a 100755 --- a/projects/rocprofiler/src/core/counters/metrics/metrics.h +++ b/projects/rocprofiler/src/core/counters/metrics/metrics.h @@ -282,10 +282,12 @@ class MetricsDict { try { expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_)); } catch (const xml::exception_t& exc) { - if (do_lookup) + if (do_lookup) { metrics_list.push_back(node); - else - throw(exc); + } else { + std::cerr << "Error: " << exc.what() << std::endl; + abort(); + } } if (expr_obj) { #if 0 diff --git a/projects/rocprofiler/src/core/hardware/hsa_info.cpp b/projects/rocprofiler/src/core/hardware/hsa_info.cpp index e2255f839f..035ba9e601 100644 --- a/projects/rocprofiler/src/core/hardware/hsa_info.cpp +++ b/projects/rocprofiler/src/core/hardware/hsa_info.cpp @@ -89,7 +89,7 @@ AgentInfo::AgentInfo(const hsa_agent_t agent, ::CoreApiTable* table) : handle_(a xcc_num_ = 1; } -int AgentInfo::getIndex() const { return index_; } +uint64_t AgentInfo::getIndex() const { return index_; } hsa_device_type_t AgentInfo::getType() const { return type_; } uint64_t AgentInfo::getHandle() const { return handle_; } const std::string_view AgentInfo::getName() const { return name_; } @@ -107,7 +107,7 @@ uint32_t AgentInfo::getPCIDomain() const { return pci_domain_; } uint32_t AgentInfo::getPCILocationID() const { return pci_location_id_; } uint32_t AgentInfo::getXccCount() const { return xcc_num_; } -void AgentInfo::setIndex(int index) { index_ = index; } +void AgentInfo::setIndex(uint64_t index) { index_ = index; } void AgentInfo::setType(hsa_device_type_t type) { type_ = type; } void AgentInfo::setHandle(uint64_t handle) { handle_ = handle; } void AgentInfo::setName(const std::string& name) { strcpy(name_, name.c_str()); } diff --git a/projects/rocprofiler/src/core/hardware/hsa_info.h b/projects/rocprofiler/src/core/hardware/hsa_info.h index b73c6197a7..b0645f323d 100644 --- a/projects/rocprofiler/src/core/hardware/hsa_info.h +++ b/projects/rocprofiler/src/core/hardware/hsa_info.h @@ -43,7 +43,7 @@ class AgentInfo { AgentInfo(); AgentInfo(const hsa_agent_t agent, ::CoreApiTable* table); - int getIndex() const; + uint64_t getIndex() const; hsa_device_type_t getType() const; uint64_t getHandle() const; const std::string_view getName() const; @@ -62,7 +62,7 @@ class AgentInfo { uint32_t getPCILocationID() const; uint32_t getXccCount() const; - void setIndex(int index); + void setIndex(uint64_t index); void setType(hsa_device_type_t type); void setHandle(uint64_t handle); void setName(const std::string& name); @@ -78,7 +78,7 @@ class AgentInfo { hsa_amd_memory_pool_t gpu_pool; private: - int index_; + uint64_t index_; hsa_device_type_t type_; // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 uint64_t handle_; char name_[64]; diff --git a/projects/rocprofiler/src/core/hsa/hsa_common.cpp b/projects/rocprofiler/src/core/hsa/hsa_common.cpp index 2687f3b651..7b423b46ab 100644 --- a/projects/rocprofiler/src/core/hsa/hsa_common.cpp +++ b/projects/rocprofiler/src/core/hsa/hsa_common.cpp @@ -30,26 +30,39 @@ std::mutex agents_map_lock; std::map agent_info_map; Agent::AgentInfo& GetAgentInfo(decltype(hsa_agent_t::handle) handle) { std::lock_guard lock(agents_map_lock); - if (agent_info_map.find(handle) != agent_info_map.end()) + if (agent_info_map.find(handle) != agent_info_map.end()) { return agent_info_map.at(handle); - else - throw(std::string("Error: Can't find Agent with handle(") + std::to_string(handle) + - ") in this system"); + } else { + std::cerr << std::string("Error: Can't find Agent with handle(") << std::to_string(handle) << + ") in this system" << std::endl; + abort(); + } } + +std::vector cpu_agents_list; + void SetAgentInfo(decltype(hsa_agent_t::handle) handle, const Agent::AgentInfo& agent_info) { std::lock_guard lock(agents_map_lock); agent_info_map.emplace(handle, agent_info); + if (agent_info.getType() == HSA_DEVICE_TYPE_GPU) { + cpu_agents_list.emplace_back(hsa_agent_t{handle}); + } } -hsa_agent_t GetAgentByIndex(int agent_index) { +std::vector& GetCPUAgentList() { + return cpu_agents_list; +} + +hsa_agent_t GetAgentByIndex(uint64_t agent_index) { std::lock_guard lock(agents_map_lock); for (auto& agent_info : agent_info_map) { if (agent_info.second.getIndex() == agent_index) { return hsa_agent_t{agent_info.second.getHandle()}; } } - throw(std::string("Error: Can't find Agent with Index(") + std::to_string(agent_index) + - ") in this system"); + std::cerr << std::string("Error: Can't find Agent with Index(") << std::to_string(agent_index) << + ") in this system" << std::endl; + abort(); } CoreApiTable saved_core_api{}; diff --git a/projects/rocprofiler/src/core/hsa/hsa_common.h b/projects/rocprofiler/src/core/hsa/hsa_common.h index 3b3dbcf532..1fb20422ef 100644 --- a/projects/rocprofiler/src/core/hsa/hsa_common.h +++ b/projects/rocprofiler/src/core/hsa/hsa_common.h @@ -38,9 +38,12 @@ namespace rocmtools { namespace hsa_support { + +std::vector& GetCPUAgentList(); + Agent::AgentInfo& GetAgentInfo(decltype(hsa_agent_t::handle) handle); void SetAgentInfo(decltype(hsa_agent_t::handle) handle, const Agent::AgentInfo& agent_info); -hsa_agent_t GetAgentByIndex(int agent_index); +hsa_agent_t GetAgentByIndex(uint64_t agent_index); CoreApiTable& GetCoreApiTable(); void SetCoreApiTable(const CoreApiTable& table); diff --git a/projects/rocprofiler/src/core/hsa/packets/packets_generator.cpp b/projects/rocprofiler/src/core/hsa/packets/packets_generator.cpp index 0cb19013b6..338809d006 100644 --- a/projects/rocprofiler/src/core/hsa/packets/packets_generator.cpp +++ b/projects/rocprofiler/src/core/hsa/packets/packets_generator.cpp @@ -25,8 +25,11 @@ #include #include #include +#include #include +#include +#include #include #include #include @@ -47,23 +50,22 @@ #define CHECK_HSA_STATUS(msg, status) \ do { \ - if ((status) != HSA_STATUS_INFO_BREAK) { \ - const char* emsg = 0; \ - hsa_status_string(status, &emsg); \ - printf("%s: %s\n", msg, emsg ? emsg : ""); \ + if ((status) != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) { \ + try { \ + const char* emsg = nullptr; \ + hsa_status_string(status, &emsg); \ + if (!emsg) emsg = ""; \ + std::cerr << msg << std::endl; \ + std::cerr << emsg << std::endl; \ + } catch (std::exception & e) { \ + } \ abort(); \ } \ } while (0) namespace Packet { - static const size_t MEM_PAGE_BYTES = 0x1000; static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; -// hsa_amd_memory_pool_t command_pool; -// hsa_amd_memory_pool_t output_pool; - -// hsa_amd_memory_pool_t& GetCommandPool() { return command_pool; } -// hsa_amd_memory_pool_t& GetOutputPool() { return output_pool; } // This function checks to see if the provided // pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true, @@ -116,11 +118,11 @@ void InitializePools(hsa_agent_t cpu_agent, Agent::AgentInfo* agent_info) { hsa_status_t status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( cpu_agent, FindStandardPool, &(agent_info->cpu_pool)); - if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Command Buffer Pool is not initialized\n"); + CHECK_HSA_STATUS("Error: Command Buffer Pool is not initialized", status); status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( cpu_agent, FindKernArgPool, &(agent_info->kernarg_pool)); - if ((status != HSA_STATUS_INFO_BREAK)) printf("Error: Output Buffer Pool is not initialized\n"); + CHECK_HSA_STATUS("Error: Output Buffer Pool is not initialized", status); } void InitializeGPUPool(hsa_agent_t gpu_agent, Agent::AgentInfo* agent_info) { @@ -135,6 +137,7 @@ struct block_des_t { }; std::map metricsDict; +static std::atomic counters_added{false}; void CheckPacketReqiurements(std::vector& gpu_agents) { for (auto& gpu_agent : gpu_agents) { @@ -147,253 +150,240 @@ void CheckPacketReqiurements(std::vector& gpu_agents) { // Initialize the PM4 commands with having the CPU&GPU agents, the counters, // counters count to output three packets which are start, stop and read // packets -std::vector>* +std::vector> InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, std::vector& counter_names, bool is_spm) { hsa_status_t status = HSA_STATUS_SUCCESS; + if (!counters_added.load(std::memory_order_acquire)) { + for (auto& name : counter_names) { + if (rocmtools::GetROCMToolObj()->HasActiveSession()) { + rocmtools::GetROCMToolObj() + ->GetSession(rocmtools::GetROCMToolObj()->GetCurrentSessionId()) + ->GetProfiler() + ->AddCounterName(name); + } + } + counters_added.exchange(true, std::memory_order_release); + } + Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle); std::map results_map; std::vector events_list; std::vector results_list; std::map, uint64_t> event_to_max_block_count; std::map> metrics_counters; - uint32_t counters_count = 0; - for (auto& name : counter_names) { - // std::cout << "Counter from Counter Names: " << name << std::endl; - - if (rocmtools::GetROCMToolObj()->HasActiveSession()) { - rocmtools::GetROCMToolObj() - ->GetSession(rocmtools::GetROCMToolObj()->GetCurrentSessionId()) - ->GetProfiler() - ->AddCounterName(name); - } - - counters_count++; + if (!rocmtools::metrics::ExtractMetricEvents( + counter_names, gpu_agent, metricsDict[gpu_agent.handle], results_map, events_list, + results_list, event_to_max_block_count, metrics_counters)) { + std::cerr << "Error: Failed to extract metric events" << std::endl; + abort(); } - rocmtools::metrics::ExtractMetricEvents(counter_names, gpu_agent, metricsDict[gpu_agent.handle], - results_map, events_list, results_list, - event_to_max_block_count, metrics_counters); - // TODO: validate needs to be called on each events_list[i] // Validating the events array for the specified gpu agent - bool result; - hsa_ven_amd_aqlprofile_validate_event(gpu_agent, &events_list[0], &result); - if (!result) { - printf("Error: Events are not valid for the current gpu agent\n"); - throw("Error: Events are not valid for the current gpu agent"); + bool validate_event_result; + status = + hsa_ven_amd_aqlprofile_validate_event(gpu_agent, &events_list[0], &validate_event_result); + CHECK_HSA_STATUS("Error: Validating Counters", status); + if (!validate_event_result) { + std::cerr << "Error: Events are not valid for the current gpu agent" << std::endl; + abort(); } - std::vector>* - profiles = new std::vector< + std::vector> + profiles = std::vector< std::pair>(); - do { - rocmtools::profiling_context_t* context = new rocmtools::profiling_context_t(); - context->gpu_agent = gpu_agent; - uint64_t i = 0; - uint32_t counter_val_iteration = 0; - auto result = results_list.begin(); - std::map, uint32_t> block_max_events_count; - std::set block_names_taken; - for (auto event = events_list.begin(); event != events_list.end();) { - if (block_max_events_count[std::make_pair( - static_cast(event->block_name), - static_cast(event->block_index))] < - event_to_max_block_count[std::make_pair( - static_cast(event->block_name), - static_cast(event->block_index))]) { - context->events_list.push_back(*event); - context->results_list.emplace_back(*result); - counter_val_iteration++; - block_max_events_count[std::make_pair( - static_cast(event->block_name), static_cast(event->block_index))]++; - results_list.erase(result); - events_list.erase(event); - } else { - event++; - result++; - } - i++; - } - - std::set counters_taken; - - std::set metrics_counters_taken; - - for (auto result : context->results_list) { - rocmtools::Metric* metric; - if (std::find(counter_names.begin(), counter_names.end(), result->name) != - counter_names.end()) { - // std::cout << "Counter from Result List: " << result->name << std::endl; - counters_taken.insert(result->name); - metric = const_cast(metricsDict[gpu_agent.handle]->Get(result->name)); - if (metric == nullptr) std::cout << result->name << " not found in metricsDict\n"; - context->metrics_list.push_back(metric); - } else { - metrics_counters_taken.insert(result->name); - // std::cout << "Counter Added: " << result->name << std::endl; - } - } - - std::set metrics_taken; - - for (auto result : results_map) { - if (counters_taken.find(result.first) == counters_taken.end() && - std::find(counter_names.begin(), counter_names.end(), result.first) != - counter_names.end()) { - bool flag = true; - for (auto result_basic : results_list) { - if (result_basic->name.compare(result.first)) { - flag = false; - break; - } - } - if (flag) metrics_taken.insert(result.first); - } - } - - for (auto metric_name : metrics_taken) { - bool flag = true; - if (metrics_counters.find(metric_name) == metrics_counters.end()) continue; - for (auto metric_counter_name : metrics_counters.at(metric_name)) { - if (metrics_counters_taken.find(metric_counter_name) == metrics_counters_taken.end() && - counters_taken.find(metric_counter_name) == counters_taken.end()) { - flag = false; - continue; - } - } - if (flag) { - // std::cout << "Counter from Result Map: " << metric_name << std::endl; - counters_taken.insert(metric_name); - rocmtools::Metric* metric = - const_cast(metricsDict[gpu_agent.handle]->Get(metric_name)); - if (metric == nullptr) std::cout << metric_name << " not found in metricsDict\n"; - context->metrics_list.push_back(metric); - } - } - - context->results_map = results_map; - context->metrics_dict = metricsDict[gpu_agent.handle]; - - hsa_ven_amd_aqlprofile_parameter_t* params = {}; - - packet_t* start_packet = new packet_t(); - packet_t* stop_packet = new packet_t(); - packet_t* read_packet = new packet_t(); - - if (context->events_list.size() <= 0) { - continue; - } - -// Preparing the profile structure to get the packets -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion-null" - hsa_ven_amd_aqlprofile_event_type_t profile_type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; - if (is_spm) profile_type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE; - hsa_ven_amd_aqlprofile_profile_t* profile = - new hsa_ven_amd_aqlprofile_profile_t{gpu_agent, - profile_type, - &(context->events_list[0]), - static_cast(context->events_list.size()), - params, - 0, - NULL, - NULL}; -#pragma GCC diagnostic pop - - // Preparing an Getting the size of the command and output buffers - status = hsa_ven_amd_aqlprofile_start(profile, NULL); - if (status != HSA_STATUS_SUCCESS) { - const char* hsa_err_str = nullptr; - if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = "Unknown"; - printf("Error: %s\n", hsa_err_str); - continue; + // do { + rocmtools::profiling_context_t* context = new rocmtools::profiling_context_t(); + context->gpu_agent = gpu_agent; + auto result = results_list.begin(); + std::map, uint32_t> block_max_events_count; + std::set block_names_taken; + for (auto event = events_list.begin(); event != events_list.end();) { + if (block_max_events_count[std::make_pair( + static_cast(event->block_name), static_cast(event->block_index))] < + event_to_max_block_count[std::make_pair( + static_cast(event->block_name), static_cast(event->block_index))]) { + context->events_list.push_back(*event); + context->results_list.emplace_back(*result); + block_max_events_count[std::make_pair( + static_cast(event->block_name), static_cast(event->block_index))]++; + results_list.erase(result); + events_list.erase(event); } else { - status = HSA_STATUS_ERROR; - size_t size = profile->command_buffer.size; - size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( - agentInfo.cpu_pool, size, 0, reinterpret_cast(&(profile->command_buffer.ptr))); + event++; + result++; + } + } - // Both the CPU and GPU can access the memory - if (status == HSA_STATUS_SUCCESS) { - hsa_agent_t ag_list[1] = {gpu_agent}; - status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn( - 1, ag_list, NULL, profile->command_buffer.ptr); + std::set counters_taken; - if (status != HSA_STATUS_SUCCESS) { - printf("Error: Can't allow access for both agents to Command Buffer\n"); - continue; - } else if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { - printf("Error: Ran out of GPU memory to allocate Command Buffer\n"); - continue; + std::set metrics_counters_taken; + + for (auto result : context->results_list) { + rocmtools::Metric* metric; + if (std::find(counter_names.begin(), counter_names.end(), result->name) != + counter_names.end()) { + // std::cout << "Counter from Result List: " << result->name << std::endl; + counters_taken.insert(result->name); + metric = const_cast(metricsDict[gpu_agent.handle]->Get(result->name)); + if (metric == nullptr) std::cout << result->name << " not found in metricsDict\n"; + context->metrics_list.push_back(metric); + } else { + metrics_counters_taken.insert(result->name); + // std::cout << "Counter Added: " << result->name << std::endl; + } + } + + std::set metrics_taken; + + for (auto result : results_map) { + if (counters_taken.find(result.first) == counters_taken.end() && + std::find(counter_names.begin(), counter_names.end(), result.first) != + counter_names.end()) { + bool flag = true; + for (auto result_basic : results_list) { + if (result_basic->name.compare(result.first)) { + flag = false; + break; } - } else { - const char* hsa_err_str = NULL; - if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) hsa_err_str = "Unknown"; - printf("Error: Allocating command Buffer (Size=%lu) (%s)\n", size, hsa_err_str); } + if (flag) metrics_taken.insert(result.first); + } + } - if (!is_spm) { - status = HSA_STATUS_ERROR; - size = profile->output_buffer.size; - size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( - agentInfo.kernarg_pool, size, 0, reinterpret_cast(&profile->output_buffer.ptr)); - - if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { - printf("Error: Ran out of GPU memory to allocate Output Buffer\n"); - continue; - } - - if (status == HSA_STATUS_SUCCESS) { - hsa_agent_t ag_list[1] = {gpu_agent}; - status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn( - 1, ag_list, NULL, profile->output_buffer.ptr); - - if (status == HSA_STATUS_SUCCESS) { - memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size); - - // Creating the start/stop/read packets - status = hsa_ven_amd_aqlprofile_start(profile, start_packet); - status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet); - status = hsa_ven_amd_aqlprofile_read(profile, read_packet); - - context->start_packet = start_packet; - context->stop_packet = stop_packet; - context->read_packet = read_packet; - - // add profiles - profiles->emplace_back(std::make_pair(context, profile)); - } else { - printf("Error: Can't allow access for both agents to output Buffer\n"); - continue; - } - } else { - const char* hsa_err_str = NULL; - if (hsa_status_string(status, &hsa_err_str) != HSA_STATUS_SUCCESS) - hsa_err_str = "Unknown"; - printf("Error: Allocating output Buffer (%s)\n", hsa_err_str); - continue; - } - } else { - profile->output_buffer.size = 0; - status = hsa_ven_amd_aqlprofile_start(profile, start_packet); - status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet); - status = hsa_ven_amd_aqlprofile_read(profile, read_packet); - - context->start_packet = start_packet; - context->stop_packet = stop_packet; - context->read_packet = read_packet; - - // add profiles - profiles->emplace_back(std::make_pair(context, profile)); + for (auto metric_name : metrics_taken) { + bool flag = true; + if (metrics_counters.find(metric_name) == metrics_counters.end()) continue; + for (auto metric_counter_name : metrics_counters.at(metric_name)) { + if (metrics_counters_taken.find(metric_counter_name) == metrics_counters_taken.end() && + counters_taken.find(metric_counter_name) == counters_taken.end()) { + flag = false; + continue; } } - } while (events_list.size() > 0); + if (flag) { + // std::cout << "Counter from Result Map: " << metric_name << std::endl; + counters_taken.insert(metric_name); + rocmtools::Metric* metric = + const_cast(metricsDict[gpu_agent.handle]->Get(metric_name)); + if (metric == nullptr) std::cout << metric_name << " not found in metricsDict\n"; + context->metrics_list.push_back(metric); + } + } + + context->results_map = results_map; + context->metrics_dict = metricsDict[gpu_agent.handle]; + + hsa_ven_amd_aqlprofile_parameter_t* params = {}; + + packet_t* start_packet = new packet_t(); + packet_t* stop_packet = new packet_t(); + packet_t* read_packet = new packet_t(); + + if (context->events_list.size() <= 0) { + std::cerr << "Error: No events to profile" << std::endl; + abort(); + } + + // Preparing the profile structure to get the packets + hsa_ven_amd_aqlprofile_event_type_t profile_type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + if (is_spm) profile_type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE; + hsa_ven_amd_aqlprofile_profile_t* profile = + new hsa_ven_amd_aqlprofile_profile_t{gpu_agent, + profile_type, + &(context->events_list[0]), + static_cast(context->events_list.size()), + params, + 0, + 0, + 0}; + + size_t ag_list_count = 1; // rocmtools::hsa_support::GetCPUAgentList().size(); + hsa_agent_t ag_list[ag_list_count]; + ag_list[0] = gpu_agent; + + // Preparing an Getting the size of the command and output buffers + status = hsa_ven_amd_aqlprofile_start(profile, NULL); + // CHECK_HSA_STATUS("Error: Getting Buffers Size", status); + + if (profile->command_buffer.size > 0 && profile->output_buffer.size > 0) { + status = HSA_STATUS_ERROR; + size_t size = profile->command_buffer.size; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + if (size <= 0) { + std::cerr << __FILE__ << ":" << __LINE__ << " " + << "Error: Command buffer given size is " << size << std::endl; + abort(); + } + status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( + agentInfo.cpu_pool, size, 0, reinterpret_cast(&(profile->command_buffer.ptr))); + if (status != HSA_STATUS_SUCCESS) { + profile->command_buffer.ptr = malloc(size); + /*numa_alloc_onnode( + size, + rocmtools::hsa_support::GetAgentInfo(agentInfo.getNearCpuAgent().handle).getNumaNode());*/ + if (profile->command_buffer.ptr == NULL) { + std::cerr << __FILE__ << ":" << __LINE__ << " " + << "Error: allocating memory for command buffer using NUMA" << std::endl; + abort(); + } + } else { + // Both the CPU and GPU can access the memory + status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn( + ag_list_count, ag_list, NULL, profile->command_buffer.ptr); + CHECK_HSA_STATUS("Error: Allowing access to Command Buffer", status); + } + + if (!is_spm) { + status = HSA_STATUS_ERROR; + size_t size = profile->output_buffer.size; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + if (size <= 0) { + std::cerr << __FILE__ << ":" << __LINE__ << " " + << "Error: Output buffer given size is " << size << std::endl; + abort(); + } + status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( + agentInfo.kernarg_pool, size, 0, reinterpret_cast(&profile->output_buffer.ptr)); + if (status != HSA_STATUS_SUCCESS) { + profile->output_buffer.ptr = malloc(size); + /*numa_alloc_onnode( + size, + rocmtools::hsa_support::GetAgentInfo(agentInfo.getNearCpuAgent().handle) + .getNumaNode());*/ + if (profile->output_buffer.ptr == NULL) { + std::cerr << __FILE__ << ":" << __LINE__ << " " + << "Error: allocating memory for output buffer using NUMA" << std::endl; + abort(); + } + } else { + status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn( + ag_list_count, ag_list, NULL, profile->output_buffer.ptr); + CHECK_HSA_STATUS("Error: GPU Agent can't have output buffer access", status); + memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size); + } + } else { + profile->output_buffer.size = 0; + } + status = hsa_ven_amd_aqlprofile_start(profile, start_packet); + // CHECK_HSA_STATUS("Error: Creating Start Packet\n", status); + status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet); + // CHECK_HSA_STATUS("Error: Creating Stop Packet\n", status); + status = hsa_ven_amd_aqlprofile_read(profile, read_packet); + // CHECK_HSA_STATUS("Error: Creating Read Packet\n", status); + + context->start_packet = start_packet; + context->stop_packet = stop_packet; + context->read_packet = read_packet; + + // add profiles + profiles.emplace_back(std::make_pair(context, profile)); + } + // } while (events_list.size() > 0); return profiles; } @@ -407,67 +397,71 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets( // Validating the events array for the specified gpu agent bool result; - hsa_ven_amd_aqlprofile_validate_event(gpu_agent, events, &result); - if (!result) { - printf("Error: Events are not valid for the current gpu agent\n"); - throw("Error: Events are not valid for the current gpu agent"); - } + status = hsa_ven_amd_aqlprofile_validate_event(gpu_agent, events, &result); + CHECK_HSA_STATUS("Error: Events are not valid for the current gpu agent\n", status); hsa_ven_amd_aqlprofile_parameter_t* params = {}; - uint8_t* command_buffer = nullptr; - uint8_t* output_buffer = nullptr; - -// Preparing the profile structure to get the packets -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion-null" + // Preparing the profile structure to get the packets hsa_ven_amd_aqlprofile_profile_t* profile = new hsa_ven_amd_aqlprofile_profile_t{ - gpu_agent, HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC, events, event_count, params, 0, NULL, NULL}; -#pragma GCC diagnostic pop + gpu_agent, HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC, events, event_count, params, 0, 0, 0}; // Preparing an Getting the size of the command and output buffers status = hsa_ven_amd_aqlprofile_start(profile, NULL); Agent::AgentInfo& agentInfo = rocmtools::hsa_support::GetAgentInfo(gpu_agent.handle); + size_t ag_list_count = 1; + hsa_agent_t ag_list[ag_list_count]; + ag_list[0] = gpu_agent; // Allocating Command Buffer status = HSA_STATUS_ERROR; size_t size = profile->command_buffer.size; + profile->command_buffer.ptr = nullptr; + if (size <= 0) return nullptr; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( - agentInfo.cpu_pool, size, 0, reinterpret_cast(&command_buffer)); + agentInfo.cpu_pool, size, 0, reinterpret_cast(&(profile->command_buffer.ptr))); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { - hsa_agent_t ag_list[1] = {gpu_agent}; status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn( - 1, ag_list, NULL, command_buffer); + ag_list_count, ag_list, NULL, profile->command_buffer.ptr); + CHECK_HSA_STATUS("Error: GPU Agent can't have command buffer access", status); + } else { + profile->command_buffer.ptr = numa_alloc_onnode( + profile->command_buffer.size, + rocmtools::hsa_support::GetAgentInfo(agentInfo.getNearCpuAgent().handle).getNumaNode()); + if (profile->command_buffer.ptr != nullptr) { + status = HSA_STATUS_SUCCESS; + } else { + CHECK_HSA_STATUS("Error: Allocating Command Buffer", status); + } } - profile->command_buffer.ptr = (status == HSA_STATUS_SUCCESS) ? command_buffer : nullptr; - if (status != HSA_STATUS_SUCCESS) printf("Error: Allocating Command Buffer\n"); // Allocating Output Buffer status = HSA_STATUS_ERROR; size = profile->output_buffer.size; + profile->output_buffer.ptr = nullptr; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( - agentInfo.kernarg_pool, size, 0, reinterpret_cast(&output_buffer)); + agentInfo.gpu_pool, size, 0, reinterpret_cast(&(profile->output_buffer.ptr))); + CHECK_HSA_STATUS("Error: Can't Allocate Output Buffer", status); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { - hsa_agent_t ag_list[1] = {gpu_agent}; status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn( - 1, ag_list, NULL, output_buffer); - } - if (status == HSA_STATUS_SUCCESS) { - profile->output_buffer.ptr = output_buffer; + ag_list_count, ag_list, NULL, profile->output_buffer.ptr); + CHECK_HSA_STATUS("Error: Can't allow access on the Output Buffer for the GPU", status); memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size); - } else { - profile->output_buffer.ptr = nullptr; } + // Creating the start/stop/read packets status = hsa_ven_amd_aqlprofile_start(profile, start_packet); + CHECK_HSA_STATUS("Error: Creating Start Packet\n", status); status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet); + CHECK_HSA_STATUS("Error: Creating Stop Packet\n", status); status = hsa_ven_amd_aqlprofile_read(profile, read_packet); + CHECK_HSA_STATUS("Error: Creating Read Packet\n", status); if (status == HSA_STATUS_ERROR) return nullptr; return profile; @@ -479,17 +473,19 @@ bool g_output_buffer_local = true; // Allocate system memory accessible by both CPU and GPU uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool) { + size_t ag_list_count = 1; // rocmtools::hsa_support::GetCPUAgentList().size(); + hsa_agent_t ag_list[ag_list_count]; + ag_list[0] = gpu_agent; hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - // if (!cpu_agents_.empty()) { - status = hsa_amd_memory_pool_allocate(*cpu_pool, size, 0, reinterpret_cast(&buffer)); + status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( + *cpu_pool, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { - hsa_agent_t ag_list[1] = {gpu_agent}; - status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_agents_allow_access_fn( + ag_list_count, ag_list, NULL, buffer); } - // } uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; return ptr; } @@ -542,18 +538,6 @@ att_mem_pools_map_t* GetAttMemPoolsMap() { return agent_att_mem_pools_map; } - -att_memory_pools_t* GetAttMemPools(hsa_agent_t gpu_agent) { - auto it = GetAttMemPoolsMap()->find(gpu_agent.handle); - if (it != GetAttMemPoolsMap()->end()) { - return it->second; - } - printf("Error: att_memory_pools_t instance not found for given gpu agent handle: %lu\n", - gpu_agent.handle); - - return nullptr; -} - // Generate start and stop packets for collecting ATT traces // Also generate and return the profile object which has the PM4 // command buffer and the output buffer for retrieving the traces @@ -573,7 +557,7 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets( // Check the profile buffer sizes hsa_status_t status = hsa_ven_amd_aqlprofile_start(profile, NULL); - if (status != HSA_STATUS_SUCCESS) printf("Error: aqlprofile_start(NULL)"); + CHECK_HSA_STATUS("Error: Getting PM4 Start Packet", status); // TODO: create a separate class for memory allocations // Maintain pools per device // handle allocation and resource cleanup @@ -582,14 +566,13 @@ hsa_ven_amd_aqlprofile_profile_t* GenerateATTPackets( // command buffer -> from CPU memory pool // output buffer -> from GPU memory pool status = Allocate(gpu_agent, profile); - if (status != HSA_STATUS_SUCCESS) printf("Error: Allocate()"); + CHECK_HSA_STATUS("Error: Att Buffers Allocation", status); // Generate start/stop/read profiling packets status = hsa_ven_amd_aqlprofile_start(profile, start_packet); - if (status != HSA_STATUS_SUCCESS) printf("Error: aqlprofile_start"); + CHECK_HSA_STATUS("Error: Creating Start PM4 Packet", status); status = hsa_ven_amd_aqlprofile_stop(profile, stop_packet); - if (status != HSA_STATUS_SUCCESS) printf("Error: aqlprofile_stop"); - if (status == HSA_STATUS_ERROR) return nullptr; + CHECK_HSA_STATUS("Error: Creating Stop PM4 Packet", status); return profile; } diff --git a/projects/rocprofiler/src/core/hsa/packets/packets_generator.h b/projects/rocprofiler/src/core/hsa/packets/packets_generator.h index df3b2ef0a5..54797a63f2 100644 --- a/projects/rocprofiler/src/core/hsa/packets/packets_generator.h +++ b/projects/rocprofiler/src/core/hsa/packets/packets_generator.h @@ -39,7 +39,7 @@ namespace Packet { typedef hsa_ext_amd_aql_pm4_packet_t packet_t; -std::vector>* +std::vector> InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, std::vector& counter_names, bool is_spm = false); uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_pool_t* cpu_pool); diff --git a/projects/rocprofiler/src/core/hsa/queues/queue.cpp b/projects/rocprofiler/src/core/hsa/queues/queue.cpp index 647599aea9..e4bff8d4a7 100644 --- a/projects/rocprofiler/src/core/hsa/queues/queue.cpp +++ b/projects/rocprofiler/src/core/hsa/queues/queue.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include "rocprofiler.h" #include "src/api/rocmtool.h" @@ -33,6 +34,21 @@ #include "src/core/hsa/hsa_support.h" #include "src/utils/helper.h" +#define CHECK_HSA_STATUS(msg, status) \ + do { \ + if ((status) != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) { \ + try { \ + const char* emsg = nullptr; \ + hsa_status_string(status, &emsg); \ + if (!emsg) emsg = ""; \ + std::cerr << msg << std::endl; \ + std::cerr << emsg << std::endl; \ + } catch (std::exception & e) { \ + } \ + abort(); \ + } \ + } while (0) + #define __NR_gettid 186 #define MAX_ATT_PROFILES 16 @@ -290,27 +306,41 @@ hsa_status_t attTraceDataCallback(hsa_ven_amd_aqlprofile_info_type_t info_type, return status; } -void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_signal_t& pending) { - std::vector counters_vec; - for (size_t i = 0; i < pending.context->metrics_list.size(); i++) { - const rocmtools::Metric* metric = pending.context->metrics_list[i]; +void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_signal_t* pending) { + record->counters_count = + rocprofiler_record_counters_instances_count_t{pending->context->metrics_list.size()}; + size_t counters_list_size = + record->counters_count.value * sizeof(rocprofiler_record_counter_instance_t); + rocprofiler_record_counter_instance_t* counters = + static_cast(malloc(counters_list_size)); + for (size_t i = 0; i < pending->context->metrics_list.size(); i++) { + const rocmtools::Metric* metric = pending->context->metrics_list[i]; double value = 0; std::string metric_name = metric->GetName(); - auto it = pending.context->results_map.find(metric_name); - if (it != pending.context->results_map.end()) { + auto it = pending->context->results_map.find(metric_name); + if (it != pending->context->results_map.end()) { value = it->second->val_double; } - counters_vec.emplace_back(rocprofiler_record_counter_instance_t{ + counters[i] = (rocprofiler_record_counter_instance_t{ // TODO(aelwazir): Moving to span once C++20 is adopted, strdup can be // removed after that rocprofiler_counter_id_t{rocmtools::profiler::GetCounterID(metric_name)}, rocprofiler_record_counter_value_t{value}}); } - record->counters = static_cast( - malloc(counters_vec.size() * sizeof(rocprofiler_record_counter_instance_t))); - ::memcpy(record->counters, &(counters_vec)[0], - counters_vec.size() * sizeof(rocprofiler_record_counter_instance_t)); - record->counters_count = rocprofiler_record_counters_instances_count_t{counters_vec.size()}; + record->counters = counters; + rocmtools::Session* session = GetROCMToolObj()->GetSession(pending->session_id); + void* initial_handle = const_cast(record->counters); + if (session->FindBuffer(pending->buffer_id)) { + Memory::GenericBuffer* buffer = session->GetBuffer(pending->buffer_id); + buffer->AddRecord(*record, record->counters, counters_list_size, + [initial_handle](auto& record, const void* data) { + if (record.counters == initial_handle && data != initial_handle) { + free(initial_handle); + } + record.counters = + static_cast(data); + }); + } } void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent, @@ -330,7 +360,6 @@ void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent for (trace_data_it = data.begin(); trace_data_it != data.end(); trace_data_it++) { const void* data_ptr = trace_data_it->trace_data.ptr; const uint32_t data_size = trace_data_it->trace_data.size; - // fprintf(arg->file, " SE(%u) size(%u)\n", data.sample_id, data_size); void* buffer = NULL; if (data_size != 0) { @@ -359,23 +388,22 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { !GetROCMToolObj()->GetSession(queue_info_session->session_id)->GetProfiler()) return true; rocmtools::Session* session = GetROCMToolObj()->GetSession(queue_info_session->session_id); + std::lock_guard lock(session->GetSessionLock()); rocmtools::profiler::Profiler* profiler = session->GetProfiler(); - std::vector& pending_signals = const_cast&>( + std::vector pending_signals = const_cast&>( profiler->GetPendingSignals(queue_info_session->writer_id)); if (!pending_signals.empty()) { for (auto it = pending_signals.begin(); it != pending_signals.end(); it = pending_signals.erase(it)) { auto& pending = *it; - std::lock_guard lock(session->GetSessionLock()); - if (hsa_support::GetCoreApiTable().hsa_signal_load_relaxed_fn(pending.signal)) return true; + if (hsa_support::GetCoreApiTable().hsa_signal_load_relaxed_fn(pending->signal)) return true; hsa_amd_profiling_dispatch_time_t time; hsa_support::GetAmdExtTable().hsa_amd_profiling_get_dispatch_time_fn( - queue_info_session->agent, pending.signal, &time); + queue_info_session->agent, pending->signal, &time); uint32_t record_count = 1; bool is_individual_xcc_mode = false; - uint32_t xcc_count = - hsa_support::GetAgentInfo(queue_info_session->agent.handle).getXccCount(); + uint32_t xcc_count = queue_info_session->xcc_count; if (xcc_count > 1) { // for MI300 const char* str = getenv("ROCPROFILER_INDIVIDUAL_XCC_MODE"); if (str != NULL) is_individual_xcc_mode = (atol(str) > 0); @@ -387,70 +415,62 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { rocprofiler_record_profiler_t record{}; // TODO: (sauverma) gpu-id will need to support xcc like so- 1.1, 1.2, 1.3 ... 1.5 for // different xcc - record.gpu_id = rocprofiler_agent_id_t{ - (uint64_t)hsa_support::GetAgentInfo(queue_info_session->agent.handle).getIndex()}; - record.kernel_properties = pending.kernel_properties; - record.thread_id = rocprofiler_thread_id_t{pending.thread_id}; - record.queue_idx = rocprofiler_queue_index_t{pending.queue_index}; + record.gpu_id = rocprofiler_agent_id_t{(uint64_t)queue_info_session->gpu_index}; + record.kernel_properties = pending->kernel_properties; + record.thread_id = rocprofiler_thread_id_t{pending->thread_id}; + record.queue_idx = rocprofiler_queue_index_t{pending->queue_index}; record.timestamps = rocprofiler_record_header_timestamp_t{time.start, time.end}; record.queue_id = rocprofiler_queue_id_t{queue_info_session->queue_id}; - if (pending.counters_count > 0 && pending.context->metrics_list.size() > 0 && - pending.profile) { - if (xcc_id == 0) // call to GetCounterData() is required only once for a dispatch - rocmtools::metrics::GetCounterData(pending.profile, queue_info_session->agent, - pending.context->results_list); - if (is_individual_xcc_mode) - rocmtools::metrics::GetCountersAndMetricResultsByXcc( - xcc_id, pending.context->results_list, pending.context->results_map, - pending.context->metrics_list); - else - rocmtools::metrics::GetMetricsData(pending.context->results_map, - pending.context->metrics_list); - AddRecordCounters(&record, pending); - } // Kernel Descriptor is the right record id generated in the WriteInterceptor function and // will be used to handle the kernel name of that dispatch - record.header = {ROCPROFILER_PROFILER_RECORD, - rocprofiler_record_id_t{pending.kernel_descriptor}}; - record.kernel_id = rocprofiler_kernel_id_t{pending.kernel_descriptor}; + record.header = rocprofiler_record_header_t{ + ROCPROFILER_PROFILER_RECORD, rocprofiler_record_id_t{pending->kernel_descriptor}}; + record.kernel_id = rocprofiler_kernel_id_t{pending->kernel_descriptor}; + record.correlation_id = rocprofiler_correlation_id_t{pending->correlation_id}; - if (pending.session_id.handle == 0) { - pending.session_id = GetROCMToolObj()->GetCurrentSessionId(); + if (pending->session_id.handle == 0) { + pending->session_id = GetROCMToolObj()->GetCurrentSessionId(); } - if (session->FindBuffer(pending.buffer_id)) { - Memory::GenericBuffer* buffer = session->GetBuffer(pending.buffer_id); - if (pending.profile && pending.counters_count > 0) { - rocprofiler_record_counter_instance_t* record_counters = record.counters; - buffer->AddRecord( - record, record.counters, - (record.counters_count.value * (sizeof(rocprofiler_record_counter_instance_t) + 1)), - [](auto& record, const void* data) { - record.counters = const_cast( - static_cast(data)); - }); - free(record_counters); - } else { + if (pending->counters_count > 0 && pending->context->metrics_list.size() > 0 && + pending->profile) { + if (xcc_id == 0) // call to GetCounterData() is required only once for a dispatch + rocmtools::metrics::GetCounterData(pending->profile, queue_info_session->agent, + pending->context->results_list); + if (is_individual_xcc_mode) + rocmtools::metrics::GetCountersAndMetricResultsByXcc( + xcc_id, pending->context->results_list, pending->context->results_map, + pending->context->metrics_list); + else + rocmtools::metrics::GetMetricsData(pending->context->results_map, + pending->context->metrics_list); + AddRecordCounters(&record, pending); + } else { + if (session->FindBuffer(pending->buffer_id)) { + Memory::GenericBuffer* buffer = session->GetBuffer(pending->buffer_id); buffer->AddRecord(record); } } } - if (pending.counters_count > 0 && pending.profile && pending.profile->events) { + if (pending->counters_count > 0 && pending->profile && pending->profile->events) { // TODO(aelwazir): we need a better way of distributing events and free them - // free(const_cast(pending.profile->events)); + // if (pending->profile->output_buffer.ptr) + // numa_free(pending->profile->output_buffer.ptr, pending->profile->output_buffer.size); hsa_status_t status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn( - (pending.profile->output_buffer.ptr)); - if (status != HSA_STATUS_SUCCESS) { - printf("Error: Couldn't free output buffer memory\n"); - } + (pending->profile->output_buffer.ptr)); + CHECK_HSA_STATUS("Error: Couldn't free output buffer memory", status); + // if (pending->profile->command_buffer.ptr) + // numa_free(pending->profile->command_buffer.ptr, pending->profile->command_buffer.size); status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn( - (pending.profile->command_buffer.ptr)); - if (status != HSA_STATUS_SUCCESS) { - printf("Error: Couldn't free command buffer memory\n"); + (pending->profile->command_buffer.ptr)); + CHECK_HSA_STATUS("Error: Couldn't free command buffer memory", status); + delete pending->profile; + for (auto& it : pending->context->results_map) { + delete it.second; } - delete pending.profile; + delete pending->context; } - if (pending.signal.handle) - hsa_support::GetCoreApiTable().hsa_signal_destroy_fn(pending.signal); + if (pending->signal.handle) + hsa_support::GetCoreApiTable().hsa_signal_destroy_fn(pending->signal); if (queue_info_session->interrupt_signal.handle) hsa_support::GetCoreApiTable().hsa_signal_destroy_fn(queue_info_session->interrupt_signal); } @@ -483,8 +503,7 @@ bool AsyncSignalHandlerATT(hsa_signal_value_t /* signal */, void* data) { if (hsa_support::GetCoreApiTable().hsa_signal_load_relaxed_fn(pending.signal)) return true; rocprofiler_record_att_tracer_t record{}; record.kernel_id = rocprofiler_kernel_id_t{pending.kernel_descriptor}; - record.gpu_id = rocprofiler_agent_id_t{ - (uint64_t)hsa_support::GetAgentInfo(queue_info_session->agent.handle).getIndex()}; + record.gpu_id = rocprofiler_agent_id_t{(uint64_t)queue_info_session->gpu_index}; record.kernel_properties = pending.kernel_properties; record.thread_id = rocprofiler_thread_id_t{pending.thread_id}; record.queue_idx = rocprofiler_queue_index_t{pending.queue_index}; @@ -506,14 +525,10 @@ bool AsyncSignalHandlerATT(hsa_signal_value_t /* signal */, void* data) { } hsa_status_t status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn( (pending.profile->output_buffer.ptr)); - if (status != HSA_STATUS_SUCCESS) { - printf("Error: Couldn't free output buffer memory\n"); - } + CHECK_HSA_STATUS("Error: Couldn't free output buffer memory", status); status = rocmtools::hsa_support::GetAmdExtTable().hsa_amd_memory_pool_free_fn( (pending.profile->command_buffer.ptr)); - if (status != HSA_STATUS_SUCCESS) { - printf("Error: Couldn't free command buffer memory\n"); - } + CHECK_HSA_STATUS("Error: Couldn't free command buffer memory", status); delete pending.profile; } } @@ -540,19 +555,19 @@ void AddVendorSpecificPacket(const Packet::packet_t* packet, void SignalAsyncHandler(const hsa_signal_t& signal, void* data) { hsa_status_t status = hsa_support::GetAmdExtTable().hsa_amd_signal_async_handler_fn( signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandler, data); - if (status != HSA_STATUS_SUCCESS) fatal("hsa_amd_signal_async_handler failed"); + CHECK_HSA_STATUS("Error: hsa_amd_signal_async_handler failed", status); } void signalAsyncHandlerATT(const hsa_signal_t& signal, void* data) { hsa_status_t status = hsa_support::GetAmdExtTable().hsa_amd_signal_async_handler_fn( signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandlerATT, data); - if (status != HSA_STATUS_SUCCESS) fatal("hsa_amd_signal_async_handler failed"); + CHECK_HSA_STATUS("Error: hsa_amd_signal_async_handler for ATT failed", status); } void CreateSignal(uint32_t attribute, hsa_signal_t* signal) { hsa_status_t status = hsa_support::GetAmdExtTable().hsa_amd_signal_create_fn(1, 0, nullptr, attribute, signal); - if (status != HSA_STATUS_SUCCESS) fatal("hsa_amd_signal_create failed"); + CHECK_HSA_STATUS("Error: hsa_amd_signal_create failed", status); } template constexpr Integral bit_mask(int first, int last) { @@ -659,13 +674,13 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt is_pc_sampling_collection_mode) && session) { // Getting Queue Data and Information - auto& queue_info = *static_cast(data); + Queue& queue_info = *reinterpret_cast(data); std::lock_guard lk(queue_info.qw_mutex); // hsa_ven_amd_aqlprofile_profile_t* profile; - std::vector>* - profiles = nullptr; + std::vector> + profiles; // Searching accross all the packets given during this write @@ -686,106 +701,109 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt // Get the PM4 Packets using packets_generator profiles = Packet::InitializeAqlPackets(queue_info.GetCPUAgent(), queue_info.GetGPUAgent(), session_data); - replay_mode_count = profiles->size(); + replay_mode_count = profiles.size(); } uint32_t profile_id = 0; - hsa_signal_t interrupt_signal; - do { - std::pair profile; - if (profiles && replay_mode_count > 0) profile = profiles->at(profile_id); + // do { + std::pair profile; + if (profiles.size() > 0 && replay_mode_count > 0) profile = profiles.at(profile_id); - uint32_t writer_id = WRITER_ID.fetch_add(1, std::memory_order_release); + uint32_t writer_id = WRITER_ID.fetch_add(1, std::memory_order_release); - if (session_data_count > 0 && is_counter_collection_mode && profiles && - replay_mode_count > 0) { - // Adding start packet and its barrier with a dummy signal - hsa_signal_t dummy_signal{}; - dummy_signal.handle = 0; - profile.first->start_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC - << HSA_PACKET_HEADER_TYPE; - AddVendorSpecificPacket(profile.first->start_packet, &transformed_packets, dummy_signal); + if (session_data_count > 0 && is_counter_collection_mode && profiles.size() > 0 && + replay_mode_count > 0) { + // Adding start packet and its barrier with a dummy signal + hsa_signal_t dummy_signal{}; + dummy_signal.handle = 0; + profile.first->start_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC + << HSA_PACKET_HEADER_TYPE; + AddVendorSpecificPacket(profile.first->start_packet, &transformed_packets, dummy_signal); - CreateBarrierPacket(profile.first->start_packet->completion_signal, &transformed_packets); - } + CreateBarrierPacket(profile.first->start_packet->completion_signal, &transformed_packets); + } - auto& packet = transformed_packets.emplace_back(packets_arr[i]); - auto& dispatch_packet = reinterpret_cast(packet); + auto& packet = transformed_packets.emplace_back(packets_arr[i]); + auto& dispatch_packet = reinterpret_cast(packet); - /* - * Only PC sampling relies on this right now, so it would be better to - * only generate an ID if PC sampling is active to conserve IDs, but it's - * unlikely 64 bits' worth of identifiers will be exhausted during the - * lifetime of the ROCMToolObj. - */ - dispatch_packet.reserved2 = GetROCMToolObj()->GetUniqueKernelDispatchId(); + /* + * Only PC sampling relies on this right now, so it would be better to + * only generate an ID if PC sampling is active to conserve IDs, but it's + * unlikely 64 bits' worth of identifiers will be exhausted during the + * lifetime of the ROCMToolObj. + */ + uint64_t correlation_id = dispatch_packet.reserved2; + // dispatch_packet.reserved2 = GetROCMToolObj()->GetUniqueKernelDispatchId(); - CreateSignal(HSA_AMD_SIGNAL_AMD_GPU_ONLY, &packet.completion_signal); - // Adding the dispatch packet newly created signal to the pending signals - // list to be processed by the signal interrupt - rocprofiler_kernel_properties_t kernel_properties = - set_kernel_properties(dispatch_packet, queue_info.GetGPUAgent()); - if (session) { - uint64_t record_id = GetROCMToolObj()->GetUniqueRecordId(); - AddKernelNameWithDispatchID(GetKernelNameFromKsymbols(dispatch_packet.kernel_object), - record_id); - if (profiles && replay_mode_count > 0) { - session->GetProfiler()->AddPendingSignals( - writer_id, record_id, dispatch_packet.completion_signal, session_id, buffer_id, - profile.first, profile.first->metrics_list.size(), profile.second, - kernel_properties, (uint32_t)syscall(__NR_gettid), user_pkt_index); - } else { - session->GetProfiler()->AddPendingSignals( - writer_id, record_id, dispatch_packet.completion_signal, session_id, buffer_id, - nullptr, 0, nullptr, kernel_properties, (uint32_t)syscall(__NR_gettid), - user_pkt_index); - } - } - - // Make a copy of the original packet, adding its signal to a barrier - // packet and create a new signal for it to get timestamps - if (original_packet.completion_signal.handle) { - hsa_barrier_and_packet_t barrier{0}; - barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; - Packet::packet_t* __attribute__((__may_alias__)) pkt = - (reinterpret_cast(&barrier)); - transformed_packets.emplace_back(*pkt).completion_signal = - original_packet.completion_signal; - } - - // Adding a barrier packet with the original packet's completion signal. - CreateSignal(0, &interrupt_signal); - - // Adding Stop and Read PM4 Packets - if (session_data_count > 0 && is_counter_collection_mode && profiles && - replay_mode_count > 0) { - hsa_signal_t dummy_signal{}; - profile.first->stop_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC - << HSA_PACKET_HEADER_TYPE; - AddVendorSpecificPacket(profile.first->stop_packet, &transformed_packets, dummy_signal); - profile.first->read_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC - << HSA_PACKET_HEADER_TYPE; - AddVendorSpecificPacket(profile.first->read_packet, &transformed_packets, - interrupt_signal); - - // Added Interrupt Signal with barrier and provided handler for it - CreateBarrierPacket(interrupt_signal, &transformed_packets); + CreateSignal(HSA_AMD_SIGNAL_AMD_GPU_ONLY, &packet.completion_signal); + // Adding the dispatch packet newly created signal to the pending signals + // list to be processed by the signal interrupt + rocprofiler_kernel_properties_t kernel_properties = + set_kernel_properties(dispatch_packet, queue_info.GetGPUAgent()); + if (session) { + uint64_t record_id = GetROCMToolObj()->GetUniqueRecordId(); + AddKernelNameWithDispatchID(GetKernelNameFromKsymbols(dispatch_packet.kernel_object), + record_id); + if (profiles.size() > 0 && replay_mode_count > 0) { + session->GetProfiler()->AddPendingSignals( + writer_id, record_id, dispatch_packet.completion_signal, session_id, buffer_id, + profile.first, profile.first->metrics_list.size(), profile.second, kernel_properties, + (uint32_t)syscall(__NR_gettid), user_pkt_index, correlation_id); } else { - hsa_barrier_and_packet_t barrier{0}; - barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; - barrier.completion_signal = interrupt_signal; - Packet::packet_t* __attribute__((__may_alias__)) pkt = - (reinterpret_cast(&barrier)); - transformed_packets.emplace_back(*pkt); + session->GetProfiler()->AddPendingSignals( + writer_id, record_id, dispatch_packet.completion_signal, session_id, buffer_id, + nullptr, 0, nullptr, kernel_properties, (uint32_t)syscall(__NR_gettid), + user_pkt_index, correlation_id); } - // Creating Async Handler to be called every time the interrupt signal is - // marked complete - SignalAsyncHandler(interrupt_signal, - new queue_info_session_t{queue_info.GetGPUAgent(), session_id, - queue_info.GetQueueID(), writer_id}); - ACTIVE_INTERRUPT_SIGNAL_COUNT.fetch_add(1, std::memory_order_relaxed); - profile_id++; - } while (replay_mode_count > 0 && profile_id < replay_mode_count); // Profiles loop end + } + + // Make a copy of the original packet, adding its signal to a barrier + // packet and create a new signal for it to get timestamps + if (original_packet.completion_signal.handle) { + hsa_barrier_and_packet_t barrier{0}; + barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; + Packet::packet_t* __attribute__((__may_alias__)) pkt = + (reinterpret_cast(&barrier)); + transformed_packets.emplace_back(*pkt).completion_signal = + original_packet.completion_signal; + } + + hsa_signal_t interrupt_signal{}; + // Adding a barrier packet with the original packet's completion signal. + CreateSignal(0, &interrupt_signal); + + // Adding Stop and Read PM4 Packets + if (session_data_count > 0 && is_counter_collection_mode) { + hsa_signal_t dummy_signal{}; + profile.first->stop_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC + << HSA_PACKET_HEADER_TYPE; + AddVendorSpecificPacket(profile.first->stop_packet, &transformed_packets, dummy_signal); + profile.first->read_packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC + << HSA_PACKET_HEADER_TYPE; + AddVendorSpecificPacket(profile.first->read_packet, &transformed_packets, interrupt_signal); + + // Added Interrupt Signal with barrier and provided handler for it + CreateBarrierPacket(interrupt_signal, &transformed_packets); + } else { + hsa_barrier_and_packet_t barrier{0}; + barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; + barrier.completion_signal = interrupt_signal; + Packet::packet_t* __attribute__((__may_alias__)) pkt = + (reinterpret_cast(&barrier)); + transformed_packets.emplace_back(*pkt); + } + Agent::AgentInfo& agentInfo = + rocmtools::hsa_support::GetAgentInfo(queue_info.GetGPUAgent().handle); + // Creating Async Handler to be called every time the interrupt signal is + // marked complete + SignalAsyncHandler( + interrupt_signal, + new queue_info_session_t{queue_info.GetGPUAgent(), session_id, queue_info.GetQueueID(), + writer_id, interrupt_signal, agentInfo.getIndex(), + agentInfo.getXccCount()}); + ACTIVE_INTERRUPT_SIGNAL_COUNT.fetch_add(1, std::memory_order_relaxed); + // profile_id++; + // } while (replay_mode_count > 0 && profile_id < replay_mode_count); // Profiles loop end } /* Write the transformed packets to the hardware queue. */ writer(&transformed_packets[0], transformed_packets.size()); @@ -795,7 +813,7 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt // Getting Queue Data and Information auto& queue_info = *static_cast(data); std::lock_guard lk(queue_info.qw_mutex); - Agent::AgentInfo* agentInfo = &(hsa_support::GetAgentInfo(queue_info.GetGPUAgent().handle)); + Agent::AgentInfo agentInfo = hsa_support::GetAgentInfo(queue_info.GetGPUAgent().handle); bool can_profile_anypacket = false; std::vector can_profile_packet; @@ -858,7 +876,7 @@ void WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt } if (att_counters_names.size() > 0) { - MetricsDict* metrics_dict_ = MetricsDict::Create(agentInfo); + MetricsDict* metrics_dict_ = MetricsDict::Create(&agentInfo); for (const std::string& counter_name : att_counters_names) { const Metric* metric = metrics_dict_->Get(counter_name); @@ -1007,6 +1025,11 @@ Queue::Queue(const hsa_agent_t& cpu_agent, const hsa_agent_t& gpu_agent, uint32_ *queue = intercept_queue_; } +Queue::~Queue() { + while (ACTIVE_INTERRUPT_SIGNAL_COUNT.load(std::memory_order_acquire) > 0) { + } +} + hsa_queue_t* Queue::GetCurrentInterceptQueue() { return intercept_queue_; } hsa_agent_t Queue::GetGPUAgent() { return gpu_agent_; } diff --git a/projects/rocprofiler/src/core/hsa/queues/queue.h b/projects/rocprofiler/src/core/hsa/queues/queue.h index 39aacd78f5..606c4d5d94 100644 --- a/projects/rocprofiler/src/core/hsa/queues/queue.h +++ b/projects/rocprofiler/src/core/hsa/queues/queue.h @@ -56,7 +56,7 @@ class Queue { hsa_queue_type32_t type, void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data, uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue); - ~Queue() {} + ~Queue(); hsa_queue_t* GetCurrentInterceptQueue(); hsa_agent_t GetGPUAgent(); @@ -82,6 +82,8 @@ struct queue_info_session_t { uint64_t queue_id; uint32_t writer_id; hsa_signal_t interrupt_signal; + uint64_t gpu_index; + uint32_t xcc_count; }; void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_signal_t& pending); diff --git a/projects/rocprofiler/src/core/memory/generic_buffer.cpp b/projects/rocprofiler/src/core/memory/generic_buffer.cpp index 3fb71797bb..284f982fc8 100644 --- a/projects/rocprofiler/src/core/memory/generic_buffer.cpp +++ b/projects/rocprofiler/src/core/memory/generic_buffer.cpp @@ -23,6 +23,7 @@ #include #include +#include "rocprofiler.h" #include "src/api/rocmtool.h" namespace Memory { @@ -68,8 +69,8 @@ GenericBuffer::GenericBuffer(rocprofiler_session_id_t session_id, rocprofiler_bu GenericBuffer::~GenericBuffer() { if (is_valid_.load(std::memory_order_release)) { std::lock_guard lock(buffer_lock_); - if (rocmtools::GetROCMToolObj()->GetSession(session_id_)) - rocmtools::GetROCMToolObj()->GetSession(session_id_)->DisableTools(id_); + // if (rocmtools::GetROCMToolObj()->GetSession(session_id_)) + // rocmtools::GetROCMToolObj()->GetSession(session_id_)->DisableTools(id_); Flush(); diff --git a/projects/rocprofiler/src/core/memory/generic_buffer.h b/projects/rocprofiler/src/core/memory/generic_buffer.h index c8dedb0e1a..c44e6d3d8d 100644 --- a/projects/rocprofiler/src/core/memory/generic_buffer.h +++ b/projects/rocprofiler/src/core/memory/generic_buffer.h @@ -75,8 +75,7 @@ class GenericBuffer { } // Store data in the record. Copy the data first if it fits in the buffer - // (reserve_data_size != 0). - if (reserve_data_size) { + if (reserve_data_size != 0) { data_ptr_ -= data_size; ::memcpy(data_ptr_, data, data_size); store_data(record, data_ptr_); @@ -160,7 +159,8 @@ class GenericBuffer { std::mutex buffer_lock_; }; -bool GetNextRecord(const rocprofiler_record_header_t* record, const rocprofiler_record_header_t** next); +bool GetNextRecord(const rocprofiler_record_header_t* record, + const rocprofiler_record_header_t** next); } // namespace Memory #endif // SRC_CORE_MEMORY_GENERIC_BUFFER_H_ diff --git a/projects/rocprofiler/src/core/metrics.h b/projects/rocprofiler/src/core/metrics.h index 02c2c36714..4756e11605 100755 --- a/projects/rocprofiler/src/core/metrics.h +++ b/projects/rocprofiler/src/core/metrics.h @@ -281,10 +281,12 @@ class MetricsDict { try { expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_)); } catch (const xml::exception_t& exc) { - if (do_lookup) + if (do_lookup) { metrics_list.push_back(node); - else - throw(exc); + } else { + std::cerr << "Error: " << exc.what() << std::endl; + abort(); + } } if (expr_obj) { #if 0 diff --git a/projects/rocprofiler/src/core/session/profiler/profiler.cpp b/projects/rocprofiler/src/core/session/profiler/profiler.cpp index baeb62ad2a..aa22071894 100644 --- a/projects/rocprofiler/src/core/session/profiler/profiler.cpp +++ b/projects/rocprofiler/src/core/session/profiler/profiler.cpp @@ -50,8 +50,17 @@ uint64_t GetCounterID(std::string& counter_name) { Profiler::Profiler(rocprofiler_buffer_id_t buffer_id, rocprofiler_filter_id_t filter_id, rocprofiler_session_id_t session_id) - : buffer_id_(buffer_id), filter_id_(filter_id), session_id_(session_id) {} -Profiler::~Profiler() {} + : buffer_id_(buffer_id), filter_id_(filter_id), session_id_(session_id) { + sessions_pending_signals_ = new std::map>(); +} +Profiler::~Profiler() { + for (auto& [thread_id, pending_signals] : *sessions_pending_signals_) { + for (auto& pending_signal : pending_signals) { + delete pending_signal; + } + } + delete sessions_pending_signals_; +} void Profiler::AddCounterName(rocprofiler_counter_id_t counter_id, std::string counter_name) { std::lock_guard lock(counter_names_lock_); @@ -106,39 +115,39 @@ const char* Profiler::GetCounterInfo(rocprofiler_counter_info_kind_t kind, return nullptr; } -void Profiler::StartReplayPass(rocprofiler_session_id_t session_id) { warning("Not yet supported!"); } +void Profiler::StartReplayPass(rocprofiler_session_id_t session_id) { + warning("Not yet supported!"); +} void Profiler::EndReplayPass() { warning("Not yet supported!"); } bool Profiler::HasActivePass() { warning("Not yet supported!"); return true; } -void Profiler::AddPendingSignals(uint32_t writer_id, uint64_t kernel_object, - const hsa_signal_t& completion_signal, - rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id, - rocmtools::profiling_context_t* context, - uint64_t session_data_count, - hsa_ven_amd_aqlprofile_profile_t* profile, - rocprofiler_kernel_properties_t kernel_properties, - uint32_t thread_id, uint64_t queue_index) { +void Profiler::AddPendingSignals( + uint32_t writer_id, uint64_t kernel_object, const hsa_signal_t& completion_signal, + rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id, + rocmtools::profiling_context_t* context, uint64_t session_data_count, + hsa_ven_amd_aqlprofile_profile_t* profile, rocprofiler_kernel_properties_t kernel_properties, + uint32_t thread_id, uint64_t queue_index, uint64_t correlation_id) { std::lock_guard lock(sessions_pending_signals_lock_); - if (sessions_pending_signals_.find(writer_id) == sessions_pending_signals_.end()) - sessions_pending_signals_.emplace(writer_id, std::vector()); - sessions_pending_signals_.at(writer_id).emplace_back( - pending_signal_t{kernel_object, completion_signal, session_id_, buffer_id, context, - session_data_count, profile, kernel_properties, thread_id, queue_index}); + if (sessions_pending_signals_->find(writer_id) == sessions_pending_signals_->end()) + sessions_pending_signals_->emplace(writer_id, std::vector()); + sessions_pending_signals_->at(writer_id).emplace_back(new pending_signal_t{ + kernel_object, completion_signal, session_id_, buffer_id, context, session_data_count, + profile, kernel_properties, thread_id, queue_index, correlation_id}); } -const std::vector& Profiler::GetPendingSignals(uint32_t writer_id) { +const std::vector& Profiler::GetPendingSignals(uint32_t writer_id) { std::lock_guard lock(sessions_pending_signals_lock_); - assert(sessions_pending_signals_.find(writer_id) != sessions_pending_signals_.end() && + assert(sessions_pending_signals_->find(writer_id) != sessions_pending_signals_->end() && "writer_id is not found in the pending_signals"); - return sessions_pending_signals_.at(writer_id); + return sessions_pending_signals_->at(writer_id); } bool Profiler::CheckPendingSignalsIsEmpty() { std::lock_guard lock(sessions_pending_signals_lock_); - return sessions_pending_signals_.empty(); + return sessions_pending_signals_->empty(); } } // namespace profiler diff --git a/projects/rocprofiler/src/core/session/profiler/profiler.h b/projects/rocprofiler/src/core/session/profiler/profiler.h index 1fc1ee69af..911e5450be 100644 --- a/projects/rocprofiler/src/core/session/profiler/profiler.h +++ b/projects/rocprofiler/src/core/session/profiler/profiler.h @@ -53,6 +53,7 @@ typedef struct { rocprofiler_kernel_properties_t kernel_properties; uint32_t thread_id; uint64_t queue_index; + uint64_t correlation_id; } pending_signal_t; namespace profiler { @@ -71,9 +72,9 @@ class Profiler { rocmtools::profiling_context_t* context, uint64_t session_data_count, hsa_ven_amd_aqlprofile_profile_t* profile, rocprofiler_kernel_properties_t kernel_properties, uint32_t thread_id, - uint64_t queue_index); + uint64_t queue_index, uint64_t correlation_id); - const std::vector& GetPendingSignals(uint32_t writer_id); + const std::vector& GetPendingSignals(uint32_t writer_id); bool CheckPendingSignalsIsEmpty(); void AddCounterName(rocprofiler_counter_id_t handler, std::string counter_name); @@ -97,7 +98,7 @@ class Profiler { rocprofiler_session_id_t session_id_; std::mutex sessions_pending_signals_lock_; - std::map> sessions_pending_signals_; + std::map>* sessions_pending_signals_; }; } // namespace profiler diff --git a/projects/rocprofiler/src/core/session/session.cpp b/projects/rocprofiler/src/core/session/session.cpp index bcf30b5f77..d9bfeed9e2 100644 --- a/projects/rocprofiler/src/core/session/session.cpp +++ b/projects/rocprofiler/src/core/session/session.cpp @@ -41,7 +41,9 @@ namespace rocmtools { Session::Session(rocprofiler_replay_mode_t replay_mode, rocprofiler_session_id_t session_id) - : session_id_(session_id), is_active_(false), replay_mode_(replay_mode) {} + : session_id_(session_id), is_active_(false), replay_mode_(replay_mode) { + buffers_ = new std::map(); +} Session::~Session() { while (GetCurrentActiveInterruptSignalsCount() > 0) { @@ -63,6 +65,7 @@ Session::~Session() { // std::lock_guard lock(filters_lock_); // buffers_.clear(); // } + delete buffers_; } void Session::DisableTools(rocprofiler_buffer_id_t buffer_id) { @@ -171,6 +174,8 @@ void Session::Start() { void Session::Terminate() { if (is_active_) { + while (GetCurrentActiveInterruptSignalsCount() > 0) { + } rocmtools::queue::ResetSessionID(); std::lock_guard lock(session_lock_); if (FindFilterWithKind(ROCPROFILER_SPM_COLLECTION)) { @@ -204,6 +209,11 @@ void Session::Terminate() { } } + for (auto& buffer : *buffers_) { + buffer.second->Flush(); + delete buffer.second; + } + is_active_ = false; } } @@ -289,7 +299,7 @@ rocprofiler_filter_id_t Session::GetFilterIdWithKind(rocprofiler_filter_kind_t k return rocprofiler_filter_id_t{0}; } -bool Session::HasBuffer() { return buffers_.size() > 0; } +bool Session::HasBuffer() { return buffers_->size() > 0; } rocprofiler_buffer_id_t Session::CreateBuffer(rocprofiler_buffer_callback_t buffer_callback, size_t buffer_size) { @@ -297,8 +307,8 @@ rocprofiler_buffer_id_t Session::CreateBuffer(rocprofiler_buffer_callback_t buff rocprofiler_buffer_id_t{buffers_counter_.fetch_add(1, std::memory_order_release)}; { std::lock_guard lock(buffers_lock_); - buffers_.emplace(id.value, - new Memory::GenericBuffer(session_id_, id, buffer_size, buffer_callback)); + buffers_->emplace(id.value, + new Memory::GenericBuffer(session_id_, id, buffer_size, buffer_callback)); } return id; } @@ -306,7 +316,7 @@ rocprofiler_buffer_id_t Session::CreateBuffer(rocprofiler_buffer_callback_t buff bool Session::FindBuffer(rocprofiler_buffer_id_t buffer_id) { { std::lock_guard lock(buffers_lock_); - return buffers_.find(buffer_id.value) != buffers_.end(); + return buffers_->find(buffer_id.value) != buffers_->end(); } } @@ -316,8 +326,8 @@ void Session::DestroyTracer() { /* tracer_.reset(); */ void Session::DestroyBuffer(rocprofiler_buffer_id_t buffer_id) { { std::lock_guard lock(filters_lock_); - delete buffers_.at(buffer_id.value); - buffers_.erase(buffer_id.value); + delete buffers_->at(buffer_id.value); + buffers_->erase(buffer_id.value); // if (buffers_.find(buffer_id.value) != buffers_.end() && // buffers_.at(buffer_id.value)->IsValid()) // buffers_.at(buffer_id.value).reset(); @@ -347,7 +357,7 @@ rocprofiler_status_t Session::stopSpm() { Memory::GenericBuffer* Session::GetBuffer(rocprofiler_buffer_id_t buffer_id) { { std::lock_guard lock(buffers_lock_); - return buffers_.at(buffer_id.value); + return buffers_->at(buffer_id.value); } } diff --git a/projects/rocprofiler/src/core/session/session.h b/projects/rocprofiler/src/core/session/session.h index a0da53ce17..1a832b4167 100644 --- a/projects/rocprofiler/src/core/session/session.h +++ b/projects/rocprofiler/src/core/session/session.h @@ -124,7 +124,7 @@ class Session { std::atomic buffers_counter_{1}; std::mutex buffers_lock_; - std::map buffers_; + std::map* buffers_; std::atomic records_counter_{1}; diff --git a/projects/rocprofiler/src/tools/tool.cpp b/projects/rocprofiler/src/tools/tool.cpp index d778bf9132..6c58c45b46 100644 --- a/projects/rocprofiler/src/tools/tool.cpp +++ b/projects/rocprofiler/src/tools/tool.cpp @@ -300,13 +300,13 @@ att_parsed_input_t GetATTParams() { } else if (param_name == "PERFCOUNTER") { counters_names.push_back(line.substr(pos + 1)); continue; - } else { // param_value is a number + } else { // param_value is a number try { - auto hexa_pos = line.find("0x", pos); // Is it hex? + auto hexa_pos = line.find("0x", pos); // Is it hex? if (hexa_pos != std::string::npos) param_value = stoi(line.substr(hexa_pos + 2), 0, 16); // hexadecimal else - param_value = stoi(line.substr(pos + 1), 0, 10); // decimal + param_value = stoi(line.substr(pos + 1), 0, 10); // decimal } catch (...) { printf("Error: Invalid parameter value %s - (%s)\n", line.substr(pos + 1, line.size()).c_str(), line.c_str()); @@ -364,6 +364,9 @@ att_parsed_input_t GetATTParams() { } void finish() { + for ([[maybe_unused]] rocprofiler_buffer_id_t buffer_id : buffer_ids) { + CHECK_ROCPROFILER(rocprofiler_flush_data(session_id, buffer_id)); + } if (amd_sys_handler.load(std::memory_order_release)) { amd_sys_handler.exchange(false, std::memory_order_release); wait_for_start_shm.join(); @@ -371,11 +374,8 @@ void finish() { } if (session_created.load(std::memory_order_relaxed)) { session_created.exchange(false, std::memory_order_release); - CHECK_ROCPROFILER(rocprofiler_terminate_session(session_id)); rocprofiler::TraceBufferBase::FlushAll(); - for ([[maybe_unused]] rocprofiler_buffer_id_t buffer_id : buffer_ids) { - CHECK_ROCPROFILER(rocprofiler_flush_data(session_id, buffer_id)); - } + CHECK_ROCPROFILER(rocprofiler_terminate_session(session_id)); } } @@ -646,49 +646,59 @@ ROCPROFILER_EXPORT bool OnLoad(void* table, uint64_t runtime_version, uint64_t f if (apis_requested.size() > 0) filters_requested.emplace_back(ROCPROFILER_API_TRACE); if (parameters.size() > 0) filters_requested.emplace_back(ROCPROFILER_ATT_TRACE_COLLECTION); - rocprofiler_buffer_id_t buffer_id; - CHECK_ROCPROFILER(rocprofiler_create_buffer( - session_id, - [](const rocprofiler_record_header_t* record, const rocprofiler_record_header_t* end_record, - rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) { - if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id); - }, - 1 << 20, &buffer_id)); - buffer_ids.emplace_back(buffer_id); - - rocprofiler_buffer_id_t buffer_id_1; - CHECK_ROCPROFILER(rocprofiler_create_buffer( - session_id, - [](const rocprofiler_record_header_t* record, const rocprofiler_record_header_t* end_record, - rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id_1) { - if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id_1); - }, - 1 << 20, &buffer_id_1)); - buffer_ids.emplace_back(buffer_id_1); - for (rocprofiler_filter_kind_t filter_kind : filters_requested) { switch (filter_kind) { case ROCPROFILER_COUNTERS_COLLECTION: { + rocprofiler_buffer_id_t buffer_id; + CHECK_ROCPROFILER(rocprofiler_create_buffer( + session_id, + [](const rocprofiler_record_header_t* record, + const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id) { + if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id); + }, + 1 << 20, &buffer_id)); + buffer_ids.emplace_back(buffer_id); printf("Enabling Counter Collection\n"); rocprofiler_filter_id_t filter_id; [[maybe_unused]] rocprofiler_filter_property_t property = {}; CHECK_ROCPROFILER(rocprofiler_create_filter( session_id, filter_kind, rocprofiler_filter_data_t{.counters_names = &counters_[0]}, counters_.size(), &filter_id, property)); - CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id_1)); + CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id)); filter_ids.emplace_back(filter_id); break; } case ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION: { + rocprofiler_buffer_id_t buffer_id; + CHECK_ROCPROFILER(rocprofiler_create_buffer( + session_id, + [](const rocprofiler_record_header_t* record, + const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id) { + if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id); + }, + 1 << 20, &buffer_id)); + buffer_ids.emplace_back(buffer_id); rocprofiler_filter_id_t filter_id; [[maybe_unused]] rocprofiler_filter_property_t property = {}; CHECK_ROCPROFILER(rocprofiler_create_filter( session_id, filter_kind, rocprofiler_filter_data_t{}, 0, &filter_id, property)); - CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id_1)); + CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id)); filter_ids.emplace_back(filter_id); break; } case ROCPROFILER_API_TRACE: { + rocprofiler_buffer_id_t buffer_id; + CHECK_ROCPROFILER(rocprofiler_create_buffer( + session_id, + [](const rocprofiler_record_header_t* record, + const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id) { + if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id); + }, + 1 << 20, &buffer_id)); + buffer_ids.emplace_back(buffer_id); printf("Enabling API Tracing\n"); rocprofiler_filter_id_t filter_id; [[maybe_unused]] rocprofiler_filter_property_t property = {}; @@ -702,6 +712,16 @@ ROCPROFILER_EXPORT bool OnLoad(void* table, uint64_t runtime_version, uint64_t f break; } case ROCPROFILER_ATT_TRACE_COLLECTION: { + rocprofiler_buffer_id_t buffer_id; + CHECK_ROCPROFILER(rocprofiler_create_buffer( + session_id, + [](const rocprofiler_record_header_t* record, + const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id) { + if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id); + }, + 1 << 20, &buffer_id)); + buffer_ids.emplace_back(buffer_id); printf("Enabling ATT Tracing\n"); rocprofiler_filter_id_t filter_id; @@ -717,11 +737,21 @@ ROCPROFILER_EXPORT bool OnLoad(void* table, uint64_t runtime_version, uint64_t f rocprofiler_create_filter(session_id, ROCPROFILER_ATT_TRACE_COLLECTION, rocprofiler_filter_data_t{.att_parameters = ¶meters[0]}, parameters.size(), &filter_id, property)); - CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id_1)); + CHECK_ROCPROFILER(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id)); filter_ids.emplace_back(filter_id); break; } case ROCPROFILER_PC_SAMPLING_COLLECTION: { + rocprofiler_buffer_id_t buffer_id; + CHECK_ROCPROFILER(rocprofiler_create_buffer( + session_id, + [](const rocprofiler_record_header_t* record, + const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id) { + if (plugin) plugin->write_buffer_records(record, end_record, session_id, buffer_id); + }, + 1 << 20, &buffer_id)); + buffer_ids.emplace_back(buffer_id); puts("Enabling PC sampling"); rocprofiler_filter_id_t filter_id; [[maybe_unused]] rocprofiler_filter_property_t property = {}; diff --git a/projects/rocprofiler/src/utils/helper.cpp b/projects/rocprofiler/src/utils/helper.cpp index cf8e933824..1ac92b009f 100644 --- a/projects/rocprofiler/src/utils/helper.cpp +++ b/projects/rocprofiler/src/utils/helper.cpp @@ -145,7 +145,8 @@ std::string string_printf(const char* format, ...) { std::string errmsg("ROCMTools: fatal error: " + message); fputs(errmsg.c_str(), stderr); - throw(errmsg); + std::cerr << errmsg << std::endl; + abort(); } /* The function extracts the kernel name from