diff --git a/projects/rocprofiler-sdk/samples/CMakeLists.txt b/projects/rocprofiler-sdk/samples/CMakeLists.txt index 4884d85270..ac7619b7b6 100644 --- a/projects/rocprofiler-sdk/samples/CMakeLists.txt +++ b/projects/rocprofiler-sdk/samples/CMakeLists.txt @@ -20,3 +20,4 @@ add_subdirectory(api_callback_tracing) add_subdirectory(api_buffered_tracing) add_subdirectory(counter_collection) add_subdirectory(intercept_table) +add_subdirectory(code_object_tracing) diff --git a/projects/rocprofiler-sdk/samples/api_buffered_tracing/client.cpp b/projects/rocprofiler-sdk/samples/api_buffered_tracing/client.cpp index 64f8fc1007..20ed09ee9f 100644 --- a/projects/rocprofiler-sdk/samples/api_buffered_tracing/client.cpp +++ b/projects/rocprofiler-sdk/samples/api_buffered_tracing/client.cpp @@ -19,7 +19,7 @@ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. - +// // undefine NDEBUG so asserts are implemented #ifdef NDEBUG # undef NDEBUG @@ -34,11 +34,14 @@ #include "client.hpp" #include +#include +#include #include #include #include #include +#include #include #include #include @@ -61,8 +64,14 @@ rocprofiler_status_t CHECKSTATUS = result; \ if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ { \ - std::cerr << #result << " failed with error code " << CHECKSTATUS << std::endl; \ - throw std::runtime_error(#result " failure"); \ + std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ + std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \ + << " failed with error code " << CHECKSTATUS << ": " << status_msg \ + << std::endl; \ + std::stringstream errmsg{}; \ + errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \ + << status_msg << ")"; \ + throw std::runtime_error(errmsg.str()); \ } \ } @@ -82,6 +91,8 @@ using call_stack_t = std::vector; using buffer_kind_names_t = std::map; using buffer_kind_operation_names_t = std::map>; +using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; +using kernel_symbol_map_t = std::unordered_map; struct buffer_name_info { @@ -93,6 +104,8 @@ rocprofiler_client_id_t* client_id = nullptr; rocprofiler_client_finalize_t client_fini_func = nullptr; rocprofiler_context_id_t client_ctx = {}; rocprofiler_buffer_id_t client_buffer = {}; +buffer_name_info client_name_info = {}; +kernel_symbol_map_t client_kernels = {}; void print_call_stack(const call_stack_t& _call_stack) @@ -127,9 +140,9 @@ print_call_stack(const call_stack_t& _call_stack) size_t n = 0; for(const auto& itr : _call_stack) { - *ofs << std::setw(2) << ++n << "/" << std::setw(2) << _call_stack.size() << " "; - *ofs << "[" << fs::path{itr.file}.filename() << ":" << itr.line << "] " << std::setw(20) - << std::left << itr.function; + *ofs << std::left << std::setw(2) << ++n << "/" << std::setw(2) << _call_stack.size() + << " [" << fs::path{itr.file}.filename() << ":" << itr.line << "] " << std::setw(20) + << itr.function; if(!itr.context.empty()) *ofs << " :: " << itr.context; *ofs << "\n"; } @@ -188,6 +201,42 @@ get_buffer_tracing_names() return cb_name_info; } +void +tool_code_object_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data) +{ + if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD) + { + if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) + { + // flush the buffer to ensure that any lookups for the client kernel names for the code + // object are completed + auto flush_status = rocprofiler_flush_buffer(client_buffer); + if(flush_status != ROCPROFILER_STATUS_ERROR_BUFFER_BUSY) + ROCPROFILER_CALL(flush_status, "buffer flush"); + } + } + else if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) + { + auto* data = static_cast(record.payload); + if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) + { + client_kernels.emplace(data->kernel_id, *data); + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) + { + client_kernels.erase(data->kernel_id); + } + } + + (void) user_data; + (void) callback_data; +} + void tool_tracing_callback(rocprofiler_context_id_t context, rocprofiler_buffer_id_t buffer_id, @@ -197,6 +246,7 @@ tool_tracing_callback(rocprofiler_context_id_t context, uint64_t drop_count) { assert(user_data != nullptr); + assert(drop_count == 0 && "drop count should be zero for lossless policy"); if(num_headers == 0) throw std::runtime_error{ @@ -227,12 +277,49 @@ tool_tracing_callback(rocprofiler_context_id_t context, auto info = std::stringstream{}; info << "tid=" << record->thread_id << ", context=" << context.handle << ", buffer_id=" << buffer_id.handle - << ", cid=" << record->correlation_id.internal << ", kind=" << record->kind - << ", operation=" << record->operation << ", drop_count=" << drop_count - << ", start=" << record->start_timestamp << ", stop=" << record->end_timestamp; + << ", cid=" << record->correlation_id.internal + << ", extern_cid=" << record->correlation_id.external.value + << ", kind=" << record->kind << ", operation=" << record->operation + << ", start=" << record->start_timestamp << ", stop=" << record->end_timestamp + << ", name=" << client_name_info.operation_names[record->kind][record->operation]; if(record->start_timestamp > record->end_timestamp) - throw std::runtime_error("start > end"); + { + auto msg = std::stringstream{}; + msg << "hsa api: start > end (" << record->start_timestamp << " > " + << record->end_timestamp + << "). diff = " << (record->start_timestamp - record->end_timestamp); + std::cerr << "threw an exception " << msg.str() << "\n" << std::flush; + // throw std::runtime_error{msg.str()}; + } + + static_cast(user_data)->emplace_back( + source_location{__FUNCTION__, __FILE__, __LINE__, info.str()}); + } + else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && + header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) + { + auto* record = + static_cast(header->payload); + + auto info = std::stringstream{}; + + info << "agent_id=" << record->agent_id.handle + << ", queue_id=" << record->queue_id.handle << ", kernel_id=" << record->kernel_id + << ", kernel=" << client_kernels.at(record->kernel_id).kernel_name + << ", context=" << context.handle << ", buffer_id=" << buffer_id.handle + << ", cid=" << record->correlation_id.internal + << ", extern_cid=" << record->correlation_id.external.value + << ", kind=" << record->kind << ", start=" << record->start_timestamp + << ", stop=" << record->end_timestamp + << ", private_segment_size=" << record->private_segment_size + << ", group_segment_size=" << record->group_segment_size << ", workgroup_size=(" + << record->workgroup_size.x << "," << record->workgroup_size.y << "," + << record->workgroup_size.z << "), grid_size=(" << record->grid_size.x << "," + << record->grid_size.y << "," << record->grid_size.z << ")"; + + if(record->start_timestamp > record->end_timestamp) + throw std::runtime_error("kernel dispatch: start > end"); static_cast(user_data)->emplace_back( source_location{__FUNCTION__, __FILE__, __LINE__, info.str()}); @@ -275,9 +362,9 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) call_stack_v->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, ""}); - buffer_name_info name_info = get_buffer_tracing_names(); + client_name_info = get_buffer_tracing_names(); - for(const auto& itr : name_info.operation_names) + for(const auto& itr : client_name_info.operation_names) { auto name_idx = std::stringstream{}; name_idx << " [" << std::setw(3) << static_cast(itr.first) << "]"; @@ -285,7 +372,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) source_location{"rocprofiler_buffer_tracing_kind_names " + name_idx.str(), __FILE__, __LINE__, - name_info.kind_names.at(itr.first)}); + client_name_info.kind_names.at(itr.first)}); for(const auto& ditr : itr.second) { @@ -301,7 +388,16 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) client_fini_func = fini_func; - ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation failed"); + ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation"); + + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(client_ctx, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + nullptr, + 0, + tool_code_object_callback, + nullptr), + "code object tracing service configure"); ROCPROFILER_CALL(rocprofiler_create_buffer(client_ctx, 4096, @@ -310,22 +406,32 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) tool_tracing_callback, tool_data, &client_buffer), - "buffer creation failed"); + "buffer creation"); ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( client_ctx, ROCPROFILER_BUFFER_TRACING_HSA_API, nullptr, 0, client_buffer), - "buffer tracing service failed to configure"); + "buffer tracing service configure"); + + ROCPROFILER_CALL( + rocprofiler_configure_buffer_tracing_service( + client_ctx, ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, nullptr, 0, client_buffer), + "buffer tracing service for kernel dispatch configure"); + + ROCPROFILER_CALL( + rocprofiler_configure_buffer_tracing_service( + client_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0, client_buffer), + "buffer tracing service for memory copy configure"); auto client_thread = rocprofiler_callback_thread_t{}; ROCPROFILER_CALL(rocprofiler_create_callback_thread(&client_thread), - "failure creating callback thread"); + "creating callback thread"); ROCPROFILER_CALL(rocprofiler_assign_callback_thread(client_buffer, client_thread), - "failed to assign thread for buffer"); + "assignment of thread for buffer"); int valid_ctx = 0; ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx), - "failure checking context validity"); + "context validity check"); if(valid_ctx == 0) { // notify rocprofiler that initialization failed @@ -334,7 +440,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) return -1; } - ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "rocprofiler context start failed"); + ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "rocprofiler context start"); // no errors return 0; @@ -357,8 +463,7 @@ tool_fini(void* tool_data) void setup() { - ROCPROFILER_CALL(rocprofiler_force_configure(&rocprofiler_configure), - "failed to force configuration"); + ROCPROFILER_CALL(rocprofiler_force_configure(&rocprofiler_configure), "force configuration"); } void @@ -366,20 +471,7 @@ shutdown() { if(client_id) { - auto status = ROCPROFILER_STATUS_SUCCESS; - while((status = rocprofiler_flush_buffer(client_buffer)) == - ROCPROFILER_STATUS_ERROR_BUFFER_BUSY) - { - std::this_thread::yield(); - std::this_thread::sleep_for(std::chrono::milliseconds{10}); - } - ROCPROFILER_CALL(status, "rocprofiler_flush_buffer failed"); - while((status = rocprofiler_flush_buffer(client_buffer)) == - ROCPROFILER_STATUS_ERROR_BUFFER_BUSY) - { - std::this_thread::yield(); - std::this_thread::sleep_for(std::chrono::milliseconds{10}); - } + ROCPROFILER_CALL(rocprofiler_flush_buffer(client_buffer), "buffer flush"); client_fini_func(*client_id); } } @@ -387,13 +479,22 @@ shutdown() void start() { - ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "rocprofiler context start failed"); + ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "context start"); +} + +void +identify(uint64_t val) +{ + auto _tid = rocprofiler_thread_id_t{}; + rocprofiler_get_thread_id(&_tid); + rocprofiler_push_external_correlation_id( + client_ctx, _tid, rocprofiler_user_data_t{.value = val}); } void stop() { - ROCPROFILER_CALL(rocprofiler_stop_context(client_ctx), "rocprofiler context stop failed"); + ROCPROFILER_CALL(rocprofiler_stop_context(client_ctx), "context stop"); } } // namespace client @@ -435,7 +536,7 @@ rocprofiler_configure(uint32_t version, ROCPROFILER_LIBRARY | ROCPROFILER_HSA_LIBRARY | ROCPROFILER_HIP_LIBRARY | ROCPROFILER_MARKER_LIBRARY, static_cast(client_tool_data)), - "failed to register for thread creation notifications"); + "registration for thread creation notifications"); // create configure data static auto cfg = diff --git a/projects/rocprofiler-sdk/samples/api_buffered_tracing/client.hpp b/projects/rocprofiler-sdk/samples/api_buffered_tracing/client.hpp index c58ea04b07..d6bcb52c36 100644 --- a/projects/rocprofiler-sdk/samples/api_buffered_tracing/client.hpp +++ b/projects/rocprofiler-sdk/samples/api_buffered_tracing/client.hpp @@ -28,6 +28,8 @@ # define CLIENT_API #endif +#include + namespace client { void @@ -41,4 +43,7 @@ start() CLIENT_API; void stop() CLIENT_API; + +void +identify(uint64_t corr_id) CLIENT_API; } // namespace client diff --git a/projects/rocprofiler-sdk/samples/api_buffered_tracing/main.cpp b/projects/rocprofiler-sdk/samples/api_buffered_tracing/main.cpp index 21d556dc55..6a721350ec 100644 --- a/projects/rocprofiler-sdk/samples/api_buffered_tracing/main.cpp +++ b/projects/rocprofiler-sdk/samples/api_buffered_tracing/main.cpp @@ -64,7 +64,7 @@ verify(int* in, int* out, int M, int N); } // namespace __global__ void -transpose_a(int* in, int* out, int M, int N); +transpose_a(const int* in, int* out, int M, int N); void run(int rank, int tid, hipStream_t stream, int argc, char** argv); @@ -74,6 +74,7 @@ main(int argc, char** argv) { client::setup(); // forces rocprofiler to configure/initialize client::start(); // starts context before any API tables are available + client::identify(1); int rank = 0; for(int i = 1; i < argc; ++i) @@ -133,7 +134,7 @@ main(int argc, char** argv) } __global__ void -transpose_a(int* in, int* out, int M, int N) +transpose_a(const int* in, int* out, int M, int N) { __shared__ int tile[shared_mem_tile_dim][shared_mem_tile_dim]; @@ -147,6 +148,7 @@ transpose_a(int* in, int* out, int M, int N) void run(int rank, int tid, hipStream_t stream, int argc, char** argv) { + client::identify(tid + 1); unsigned int M = 4960 * 2; unsigned int N = 4960 * 2; if(argc > 2) nitr = atoll(argv[2]); @@ -180,6 +182,18 @@ run(int rank, int tid, hipStream_t stream, int argc, char** argv) dim3 grid(M / 32, N / 32, 1); dim3 block(32, 32, 1); // transpose_a + print_lock.lock(); + printf("[%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n", + rank, + tid, + grid.x, + grid.y, + grid.z, + block.x, + block.y, + block.z); + print_lock.unlock(); + auto t1 = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < nitr; ++i) { diff --git a/projects/rocprofiler-sdk/samples/api_callback_tracing/client.cpp b/projects/rocprofiler-sdk/samples/api_callback_tracing/client.cpp index 29cf00b43c..cfc0299654 100644 --- a/projects/rocprofiler-sdk/samples/api_callback_tracing/client.cpp +++ b/projects/rocprofiler-sdk/samples/api_callback_tracing/client.cpp @@ -58,8 +58,14 @@ rocprofiler_status_t CHECKSTATUS = result; \ if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ { \ - std::cerr << #result << " failed with error code " << CHECKSTATUS << std::endl; \ - throw std::runtime_error(#result " failure"); \ + std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ + std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \ + << " failed with error code " << CHECKSTATUS << ": " << status_msg \ + << std::endl; \ + std::stringstream errmsg{}; \ + errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \ + << status_msg << ")"; \ + throw std::runtime_error(errmsg.str()); \ } \ } @@ -124,8 +130,8 @@ print_call_stack(const call_stack_t& _call_stack) *ofs << std::left; for(const auto& itr : _call_stack) { - *ofs << std::setw(2) << ++n << "/" << std::setw(2) << _call_stack.size() << " "; - *ofs << "[" << fs::path{itr.file}.filename() << ":" << itr.line << "] " << std::setw(20) + *ofs << std::left << std::setw(2) << ++n << "/" << std::setw(2) << _call_stack.size() + << " [" << fs::path{itr.file}.filename() << ":" << itr.line << "] " << std::setw(20) << itr.function; if(!itr.context.empty()) *ofs << " :: " << itr.context; *ofs << "\n"; diff --git a/projects/rocprofiler-sdk/samples/code_object_tracing/CMakeLists.txt b/projects/rocprofiler-sdk/samples/code_object_tracing/CMakeLists.txt new file mode 100644 index 0000000000..a1d1df215c --- /dev/null +++ b/projects/rocprofiler-sdk/samples/code_object_tracing/CMakeLists.txt @@ -0,0 +1,60 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-samples-code-object-tracing LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +if(NOT TARGET rocprofiler::rocprofiler) + find_package(rocprofiler REQUIRED) +endif() + +add_library(code-object-tracing-client SHARED) +target_sources(code-object-tracing-client PRIVATE client.cpp) +target_link_libraries( + code-object-tracing-client + PRIVATE rocprofiler::rocprofiler + $) + +set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) +find_package(Threads REQUIRED) + +add_executable(code-object-tracing) +target_sources(code-object-tracing PRIVATE main.cpp) +target_link_libraries( + code-object-tracing PRIVATE code-object-tracing-client Threads::Threads + $) + +add_test(NAME code-object-tracing COMMAND $) + +set_tests_properties( + code-object-tracing + PROPERTIES + TIMEOUT + 45 + LABELS + "samples" + ENVIRONMENT + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$" + FAIL_REGULAR_EXPRESSION + "threw an exception") diff --git a/projects/rocprofiler-sdk/samples/code_object_tracing/client.cpp b/projects/rocprofiler-sdk/samples/code_object_tracing/client.cpp new file mode 100644 index 0000000000..3f435659dc --- /dev/null +++ b/projects/rocprofiler-sdk/samples/code_object_tracing/client.cpp @@ -0,0 +1,405 @@ +// MIT License +// +// Copyright (c) 2023 ROCm Developer Tools +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// undefine NDEBUG so asserts are implemented +#include +#ifdef NDEBUG +# undef NDEBUG +#endif + +/** + * @file samples/code_object_tracing/client.cpp + * + * @brief Example rocprofiler client (tool) + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ROCPROFILER_CALL(result, msg) \ + { \ + rocprofiler_status_t CHECKSTATUS = result; \ + if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ + { \ + std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ + std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \ + << " failed with error code " << CHECKSTATUS << ": " << status_msg \ + << std::endl; \ + std::stringstream errmsg{}; \ + errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \ + << status_msg << ")"; \ + throw std::runtime_error(errmsg.str()); \ + } \ + } + +namespace client +{ +namespace +{ +struct source_location +{ + std::string function = {}; + std::string file = {}; + uint32_t line = 0; + std::string context = {}; +}; + +using call_stack_t = std::vector; +using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t; +using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; +using kernel_symbol_map_t = std::unordered_map; + +rocprofiler_client_id_t* client_id = nullptr; +rocprofiler_client_finalize_t client_fini_func = nullptr; +rocprofiler_context_id_t client_ctx = {}; +kernel_symbol_map_t client_kernels = {}; + +std::string +cxa_demangle(std::string_view _mangled_name, int* _status) +{ + constexpr size_t buffer_len = 4096; + // return the mangled since there is no buffer + if(_mangled_name.empty()) + { + *_status = -2; + return std::string{}; + } + + auto _demangled_name = std::string{_mangled_name}; + + // PARAMETERS to __cxa_demangle + // mangled_name: + // A NULL-terminated character string containing the name to be demangled. + // buffer: + // A region of memory, allocated with malloc, of *length bytes, into which the + // demangled name is stored. If output_buffer is not long enough, it is expanded + // using realloc. output_buffer may instead be NULL; in that case, the demangled + // name is placed in a region of memory allocated with malloc. + // _buflen: + // If length is non-NULL, the length of the buffer containing the demangled name + // is placed in *length. + // status: + // *status is set to one of the following values + size_t _demang_len = 0; + char* _demang = abi::__cxa_demangle(_demangled_name.c_str(), nullptr, &_demang_len, _status); + switch(*_status) + { + // 0 : The demangling operation succeeded. + // -1 : A memory allocation failure occurred. + // -2 : mangled_name is not a valid name under the C++ ABI mangling rules. + // -3 : One of the arguments is invalid. + case 0: + { + if(_demang) _demangled_name = std::string{_demang}; + break; + } + case -1: + { + char _msg[buffer_len]; + ::memset(_msg, '\0', buffer_len * sizeof(char)); + ::snprintf(_msg, + buffer_len, + "memory allocation failure occurred demangling %s", + _demangled_name.c_str()); + ::perror(_msg); + break; + } + case -2: break; + case -3: + { + char _msg[buffer_len]; + ::memset(_msg, '\0', buffer_len * sizeof(char)); + ::snprintf(_msg, + buffer_len, + "Invalid argument in: (\"%s\", nullptr, nullptr, %p)", + _demangled_name.c_str(), + (void*) _status); + ::perror(_msg); + break; + } + default: break; + }; + + // if it "demangled" but the length is zero, set the status to -2 + if(_demang_len == 0 && *_status == 0) *_status = -2; + + // free allocated buffer + ::free(_demang); + return _demangled_name; +} + +void +print_call_stack(const call_stack_t& _call_stack) +{ + namespace fs = ::std::filesystem; + + auto ofname = std::string{"code_object_trace.log"}; + if(auto* eofname = getenv("ROCPROFILER_SAMPLE_OUTPUT_FILE")) ofname = eofname; + + std::ostream* ofs = nullptr; + auto cleanup = std::function{}; + + if(ofname == "stdout") + ofs = &std::cout; + else if(ofname == "stderr") + ofs = &std::cerr; + else + { + ofs = new std::ofstream{ofname}; + if(ofs && *ofs) + cleanup = [](std::ostream*& _os) { delete _os; }; + else + { + std::cerr << "Error outputting to " << ofname << ". Redirecting to stderr...\n"; + ofname = "stderr"; + ofs = &std::cerr; + } + } + + std::cout << "Outputting collected data to " << ofname << "...\n" << std::flush; + + size_t n = 0; + for(const auto& itr : _call_stack) + { + *ofs << std::left << std::setw(2) << ++n << "/" << std::setw(2) << _call_stack.size() + << " [" << fs::path{itr.file}.filename() << ":" << itr.line << "] " << std::setw(20) + << itr.function; + if(!itr.context.empty()) *ofs << " :: " << itr.context; + *ofs << "\n"; + } + + *ofs << std::flush; + + if(cleanup) cleanup(ofs); +} + +template +std::string +as_hex(Tp _v, size_t _width = 16) +{ + auto _ss = std::stringstream{}; + _ss.fill('0'); + _ss << "0x" << std::hex << std::setw(_width) << _v; + return _ss.str(); +} + +void +tool_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data) +{ + if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD) + { + auto* data = static_cast(record.payload); + auto* call_stack_v = static_cast(callback_data); + auto info = std::stringstream{}; + + if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) + { + info << "code object load :: "; + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) + { + info << "code object unload :: "; + } + + info << "code_object_id=" << data->code_object_id + << ", rocp_agent=" << data->rocp_agent.handle << ", uri=" << data->uri + << ", load_base=" << as_hex(data->load_base) << ", load_size=" << data->load_size + << ", load_delta=" << as_hex(data->load_delta); + if(data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE) + info << ", storage_file_descr=" << data->storage_file; + else if(data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_MEMORY) + info << ", storage_memory_base=" << as_hex(data->memory_base) + << ", storage_memory_size=" << data->memory_size; + + call_stack_v->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, info.str()}); + } + if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) + { + auto* data = static_cast(record.payload); + auto* call_stack_v = static_cast(callback_data); + auto info = std::stringstream{}; + + if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) + { + info << "kernel symbol load :: "; + client_kernels.emplace(data->kernel_id, *data); + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) + { + info << "kernel symbol unload :: "; + client_kernels.erase(data->kernel_id); + } + + auto kernel_name = std::regex_replace(data->kernel_name, std::regex{"(\\.kd)$"}, ""); + int demangle_status = 0; + kernel_name = cxa_demangle(kernel_name, &demangle_status); + + info << "code_object_id=" << data->code_object_id << ", kernel_id=" << data->kernel_id + << ", kernel_object=" << as_hex(data->kernel_object) + << ", kernarg_segment_size=" << data->kernarg_segment_size + << ", kernarg_segment_alignment=" << data->kernarg_segment_alignment + << ", group_segment_size=" << data->group_segment_size + << ", private_segment_size=" << data->private_segment_size + << ", kernel_name=" << kernel_name; + + call_stack_v->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, info.str()}); + } + + (void) user_data; +} + +int +tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) +{ + assert(tool_data != nullptr); + + auto* call_stack_v = static_cast(tool_data); + + call_stack_v->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, ""}); + + client_fini_func = fini_func; + + ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation"); + + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(client_ctx, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + nullptr, + 0, + tool_tracing_callback, + tool_data), + "code object tracing service configure"); + + int valid_ctx = 0; + ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx), + "context validity check"); + if(valid_ctx == 0) + { + // notify rocprofiler that initialization failed + // and all the contexts, buffers, etc. created + // should be ignored + return -1; + } + + ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "context start"); + + // no errors + return 0; +} + +void +tool_fini(void* tool_data) +{ + assert(tool_data != nullptr); + + auto* _call_stack = static_cast(tool_data); + _call_stack->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, ""}); + + print_call_stack(*_call_stack); + + delete _call_stack; +} + +void +setup() +{ + if(int status = 0; + rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0) + { + ROCPROFILER_CALL(rocprofiler_force_configure(&rocprofiler_configure), + "force configuration"); + } +} +} // namespace + +// force configuration when library is loaded +bool cfg_on_load = (client::setup(), true); +} // namespace client + +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* id) +{ + // only activate if main tool + if(priority > 0) return nullptr; + + // set the client name + id->name = "ExampleTool"; + + // store client info + client::client_id = id; + + // compute major/minor/patch version info + uint32_t major = version / 10000; + uint32_t minor = (version % 10000) / 100; + uint32_t patch = version % 100; + + // generate info string + auto info = std::stringstream{}; + info << id->name << " is using rocprofiler v" << major << "." << minor << "." << patch << " (" + << runtime_version << ")"; + + std::clog << info.str() << std::endl; + + auto* client_tool_data = new std::vector{}; + + client_tool_data->emplace_back( + client::source_location{__FUNCTION__, __FILE__, __LINE__, info.str()}); + + // create configure data + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &client::tool_init, + &client::tool_fini, + static_cast(client_tool_data)}; + + // return pointer to configure data + return &cfg; +} diff --git a/projects/rocprofiler-sdk/samples/code_object_tracing/main.cpp b/projects/rocprofiler-sdk/samples/code_object_tracing/main.cpp new file mode 100644 index 0000000000..89db164b81 --- /dev/null +++ b/projects/rocprofiler-sdk/samples/code_object_tracing/main.cpp @@ -0,0 +1,247 @@ +// MIT License +// +// Copyright (c) 2023 ROCm Developer Tools +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "hip/hip_runtime.h" + +#include +#include +#include +#include +#include +#include +#include + +#define HIP_API_CALL(CALL) \ + { \ + hipError_t error_ = (CALL); \ + if(error_ != hipSuccess) \ + { \ + auto _hip_api_print_lk = auto_lock_t{print_lock}; \ + fprintf(stderr, \ + "%s:%d :: HIP error : %s\n", \ + __FILE__, \ + __LINE__, \ + hipGetErrorString(error_)); \ + throw std::runtime_error("hip_api_call"); \ + } \ + } + +namespace +{ +using auto_lock_t = std::unique_lock; +auto print_lock = std::mutex{}; +size_t nthreads = 2; +size_t nitr = 500; +size_t nsync = 10; +constexpr unsigned shared_mem_tile_dim = 32; + +void +check_hip_error(void); + +void +verify(int* in, int* out, int M, int N); +} // namespace + +__global__ void +transpose_a(const int* in, int* out, int M, int N); + +void +run(int rank, int tid, hipStream_t stream, int argc, char** argv); + +int +main(int argc, char** argv) +{ + int rank = 0; + for(int i = 1; i < argc; ++i) + { + auto _arg = std::string{argv[i]}; + if(_arg == "?" || _arg == "-h" || _arg == "--help") + { + fprintf(stderr, + "usage: transpose [NUM_THREADS (%zu)] [NUM_ITERATION (%zu)] " + "[SYNC_EVERY_N_ITERATIONS (%zu)]\n", + nthreads, + nitr, + nsync); + exit(EXIT_SUCCESS); + } + } + if(argc > 1) nthreads = atoll(argv[1]); + if(argc > 2) nitr = atoll(argv[2]); + if(argc > 3) nsync = atoll(argv[3]); + + printf("[transpose] Number of threads: %zu\n", nthreads); + printf("[transpose] Number of iterations: %zu\n", nitr); + printf("[transpose] Syncing every %zu iterations\n", nsync); + + // this is a temporary workaround in omnitrace when HIP + MPI is enabled + int ndevice = 0; + int devid = rank; + HIP_API_CALL(hipGetDeviceCount(&ndevice)); + printf("[transpose] Number of devices found: %i\n", ndevice); + if(ndevice > 0) + { + devid = rank % ndevice; + HIP_API_CALL(hipSetDevice(devid)); + printf("[transpose] Rank %i assigned to device %i\n", rank, devid); + } + if(rank == devid && rank < ndevice) + { + std::vector _threads{}; + std::vector _streams(nthreads); + for(size_t i = 0; i < nthreads; ++i) + HIP_API_CALL(hipStreamCreate(&_streams.at(i))); + for(size_t i = 1; i < nthreads; ++i) + _threads.emplace_back(run, rank, i, _streams.at(i), argc, argv); + run(rank, 0, _streams.at(0), argc, argv); + for(auto& itr : _threads) + itr.join(); + for(size_t i = 0; i < nthreads; ++i) + HIP_API_CALL(hipStreamDestroy(_streams.at(i))); + } + HIP_API_CALL(hipDeviceSynchronize()); + HIP_API_CALL(hipDeviceReset()); + + return 0; +} + +__global__ void +transpose_a(const int* in, int* out, int M, int N) +{ + __shared__ int tile[shared_mem_tile_dim][shared_mem_tile_dim]; + + int idx = (blockIdx.y * blockDim.y + threadIdx.y) * M + blockIdx.x * blockDim.x + threadIdx.x; + tile[threadIdx.y][threadIdx.x] = in[idx]; + __syncthreads(); + idx = (blockIdx.x * blockDim.x + threadIdx.y) * N + blockIdx.y * blockDim.y + threadIdx.x; + out[idx] = tile[threadIdx.x][threadIdx.y]; +} + +void +run(int rank, int tid, hipStream_t stream, int argc, char** argv) +{ + unsigned int M = 4960 * 2; + unsigned int N = 4960 * 2; + if(argc > 2) nitr = atoll(argv[2]); + if(argc > 3) nsync = atoll(argv[3]); + + auto_lock_t _lk{print_lock}; + std::cout << "[" << rank << "][" << tid << "] M: " << M << " N: " << N << std::endl; + _lk.unlock(); + + std::default_random_engine _engine{std::random_device{}() * (rank + 1) * (tid + 1)}; + std::uniform_int_distribution _dist{0, 1000}; + + size_t size = sizeof(int) * M * N; + int* inp_matrix = new int[size]; + int* out_matrix = new int[size]; + for(size_t i = 0; i < M * N; i++) + { + inp_matrix[i] = _dist(_engine); + out_matrix[i] = 0; + } + int* in = nullptr; + int* out = nullptr; + + HIP_API_CALL(hipMalloc(&in, size)); + HIP_API_CALL(hipMalloc(&out, size)); + HIP_API_CALL(hipMemsetAsync(in, 0, size, stream)); + HIP_API_CALL(hipMemsetAsync(out, 0, size, stream)); + HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, size, hipMemcpyHostToDevice, stream)); + HIP_API_CALL(hipStreamSynchronize(stream)); + + dim3 grid(M / 32, N / 32, 1); + dim3 block(32, 32, 1); // transpose_a + + print_lock.lock(); + printf("[%i][%i] grid=(%i,%i,%i), block=(%i,%i,%i)\n", + rank, + tid, + grid.x, + grid.y, + grid.z, + block.x, + block.y, + block.z); + print_lock.unlock(); + + auto t1 = std::chrono::high_resolution_clock::now(); + for(size_t i = 0; i < nitr; ++i) + { + transpose_a<<>>(in, out, M, N); + check_hip_error(); + if(i % nsync == (nsync - 1)) HIP_API_CALL(hipStreamSynchronize(stream)); + } + auto t2 = std::chrono::high_resolution_clock::now(); + HIP_API_CALL(hipStreamSynchronize(stream)); + HIP_API_CALL(hipMemcpyAsync(out_matrix, out, size, hipMemcpyDeviceToHost, stream)); + double time = std::chrono::duration_cast>(t2 - t1).count(); + float GB = (float) size * nitr * 2 / (1 << 30); + + print_lock.lock(); + std::cout << "[" << rank << "][" << tid << "] Runtime of transpose is " << time << " sec\n" + << "The average performance of transpose is " << GB / time << " GBytes/sec" + << std::endl; + print_lock.unlock(); + + HIP_API_CALL(hipStreamSynchronize(stream)); + + // cpu_transpose(matrix, out_matrix, M, N); + verify(inp_matrix, out_matrix, M, N); + + HIP_API_CALL(hipFree(in)); + HIP_API_CALL(hipFree(out)); + + delete[] inp_matrix; + delete[] out_matrix; +} + +namespace +{ +void +check_hip_error(void) +{ + hipError_t err = hipGetLastError(); + if(err != hipSuccess) + { + auto_lock_t _lk{print_lock}; + std::cerr << "Error: " << hipGetErrorString(err) << std::endl; + throw std::runtime_error("hip_api_call"); + } +} + +void +verify(int* in, int* out, int M, int N) +{ + for(int i = 0; i < 10; i++) + { + int row = rand() % M; + int col = rand() % N; + if(in[row * N + col] != out[col * M + row]) + { + auto_lock_t _lk{print_lock}; + std::cout << "mismatch: " << row << ", " << col << " : " << in[row * N + col] << " | " + << out[col * M + row] << "\n"; + } + } +} +} // namespace diff --git a/projects/rocprofiler-sdk/samples/counter_collection/client.cpp b/projects/rocprofiler-sdk/samples/counter_collection/client.cpp index a9a261635d..a0a486fd3d 100644 --- a/projects/rocprofiler-sdk/samples/counter_collection/client.cpp +++ b/projects/rocprofiler-sdk/samples/counter_collection/client.cpp @@ -12,8 +12,14 @@ rocprofiler_status_t CHECKSTATUS = result; \ if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ { \ - std::cerr << #result << " failed with error code " << CHECKSTATUS << std::endl; \ - throw std::runtime_error(#result " failure"); \ + std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ + std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \ + << " failed with error code " << CHECKSTATUS << ": " << status_msg \ + << std::endl; \ + std::stringstream errmsg{}; \ + errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \ + << status_msg << ")"; \ + throw std::runtime_error(errmsg.str()); \ } \ } diff --git a/projects/rocprofiler-sdk/samples/intercept_table/client.cpp b/projects/rocprofiler-sdk/samples/intercept_table/client.cpp index 96f317b843..9adf7dae9d 100644 --- a/projects/rocprofiler-sdk/samples/intercept_table/client.cpp +++ b/projects/rocprofiler-sdk/samples/intercept_table/client.cpp @@ -58,8 +58,14 @@ rocprofiler_status_t CHECKSTATUS = result; \ if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ { \ - std::cerr << #result << " failed with error code " << CHECKSTATUS << std::endl; \ - throw std::runtime_error(#result " failure"); \ + std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ + std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \ + << " failed with error code " << CHECKSTATUS << ": " << status_msg \ + << std::endl; \ + std::stringstream errmsg{}; \ + errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \ + << status_msg << ")"; \ + throw std::runtime_error(errmsg.str()); \ } \ } @@ -118,8 +124,8 @@ print_call_stack(const call_stack_t& _call_stack) *ofs << std::left; for(const auto& itr : _call_stack) { - *ofs << std::setw(2) << ++n << "/" << std::setw(2) << _call_stack.size() << " "; - *ofs << "[" << fs::path{itr.file}.filename() << ":" << itr.line << "] " << std::setw(20) + *ofs << std::left << std::setw(2) << ++n << "/" << std::setw(2) << _call_stack.size() + << " [" << fs::path{itr.file}.filename() << ":" << itr.line << "] " << std::setw(20) << itr.function; if(!itr.context.empty()) *ofs << " :: " << itr.context; *ofs << "\n"; diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler/buffer_tracing.h b/projects/rocprofiler-sdk/source/include/rocprofiler/buffer_tracing.h index f898e1d4a7..664717ddfc 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler/buffer_tracing.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler/buffer_tracing.h @@ -86,14 +86,11 @@ typedef struct uint64_t size; rocprofiler_service_buffer_tracing_kind_t kind; rocprofiler_correlation_id_t correlation_id; - /** - * Memory copy operation that can be derived from - * ::rocprofiler_tracing_operation_t - */ - uint32_t operation; - rocprofiler_timestamp_t start_timestamp; - rocprofiler_timestamp_t end_timestamp; - rocprofiler_queue_id_t queue_id; + rocprofiler_timestamp_t start_timestamp; + rocprofiler_timestamp_t end_timestamp; + rocprofiler_agent_id_t agent_id; + rocprofiler_queue_id_t queue_id; + rocprofiler_kernel_id_t kernel_id; } rocprofiler_buffer_tracing_memory_copy_record_t; /** diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler/callback_tracing.h b/projects/rocprofiler-sdk/source/include/rocprofiler/callback_tracing.h index 62a5036949..845cde17ec 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler/callback_tracing.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler/callback_tracing.h @@ -97,7 +97,7 @@ typedef struct ///< the code object that is loaded. Note that any non-loaded segments ///< before the first loaded segment are ignored. uint64_t load_size; ///< The byte size of the loaded code objects contiguous memory allocation. - uint64_t load_delta; ///< The signed byte address difference of the memory address at which the + int64_t load_delta; ///< The signed byte address difference of the memory address at which the ///< code object is loaded minus the virtual address specified in the code ///< object that is loaded. rocprofiler_code_object_storage_type_t @@ -131,12 +131,11 @@ typedef struct */ typedef struct { - uint64_t size; ///< size of this struct - uint64_t kernel_id; ///< unique symbol identifier value - uint64_t code_object_id; ///< parent unique code object identifier - rocprofiler_agent_id_t rocp_agent; ///< Agent associated with this symbol - const char* kernel_name; ///< name of the kernel - uint64_t kernel_object; ///< kernel object handle, used in the kernel dispatch packet + uint64_t size; ///< size of this struct + uint64_t kernel_id; ///< unique symbol identifier value + uint64_t code_object_id; ///< parent unique code object identifier + const char* kernel_name; ///< name of the kernel + uint64_t kernel_object; ///< kernel object handle, used in the kernel dispatch packet uint32_t kernarg_segment_size; ///< size of memory (in bytes) allocated for kernel arguments. ///< Will be multiple of 16 uint32_t kernarg_segment_alignment; ///< Alignment (in bytes) of the buffer used to pass diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler/external_correlation.h b/projects/rocprofiler-sdk/source/include/rocprofiler/external_correlation.h index 94419bd603..8ed07ac6ae 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler/external_correlation.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler/external_correlation.h @@ -37,6 +37,13 @@ ROCPROFILER_EXTERN_C_INIT /** * @brief Push default value for `external` field in @ref rocprofiler_correlation_id_t onto stack. * + * External correlation ids are thread-local values. However, if rocprofiler internally requests an + * external correlation id on a non-main thread and an external correlation id has not been pushed + * for this thread, the external correlation ID will default to the latest external correlation id + * on the main thread -- this allows tools to push an external correlation id once on the main + * thread for, say, the MPI rank or process-wide UUID and this value will be used by all subsequent + * child threads. + * * @param [in] context Associated context * @param [in] tid thread identifier. @see rocprofiler_get_thread_id * @param [in] external_correlation_id User data to place in external field in @ref diff --git a/projects/rocprofiler-sdk/source/lib/common/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/common/CMakeLists.txt index 8cb86e34d6..89963ab8f1 100644 --- a/projects/rocprofiler-sdk/source/lib/common/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/common/CMakeLists.txt @@ -4,13 +4,22 @@ rocprofiler_activate_clang_tidy() set(common_sources config.cpp environment.cpp demangle.cpp utility.cpp xml.cpp) -set(common_headers config.hpp defines.hpp environment.hpp demangle.hpp mpl.hpp - synchronized.hpp utility.hpp xml.hpp) +set(common_headers + config.hpp + defines.hpp + environment.hpp + demangle.hpp + mpl.hpp + scope_destructor.hpp + synchronized.hpp + utility.hpp + xml.hpp) add_library(rocprofiler-common-library STATIC) add_library(rocprofiler::rocprofiler-common-library ALIAS rocprofiler-common-library) add_subdirectory(container) +add_subdirectory(memory) target_sources(rocprofiler-common-library PRIVATE ${common_sources} ${common_headers}) target_include_directories(rocprofiler-common-library diff --git a/projects/rocprofiler-sdk/source/lib/common/memory/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/common/memory/CMakeLists.txt new file mode 100644 index 0000000000..2aa6c04a8f --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/common/memory/CMakeLists.txt @@ -0,0 +1,7 @@ +# +# add container sources and headers to common library target +# +set(memory_headers deleter.hpp pool.hpp pool_allocator.hpp stateless_allocator.hpp) +set(memory_sources) + +target_sources(rocprofiler-common-library PRIVATE ${memory_sources} ${memory_headers}) diff --git a/projects/rocprofiler-sdk/source/lib/common/memory/deleter.hpp b/projects/rocprofiler-sdk/source/lib/common/memory/deleter.hpp new file mode 100644 index 0000000000..01ca522000 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/common/memory/deleter.hpp @@ -0,0 +1,46 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include "lib/common/defines.hpp" + +namespace rocprofiler +{ +namespace common +{ +namespace memory +{ +// this type is a template parameter for allocators to execute a function when the allocator +// destroys an object. In the rocprofiler library, this is used to ensure +// rocprofiler::registration::finalize is called on the first instance of a data structure being +// destroyed after main exits +template +struct deleter; + +// default deleter type +template <> +struct deleter +{ + constexpr void operator()() const {} +}; +} // namespace memory +} // namespace common +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/common/memory/pool.hpp b/projects/rocprofiler-sdk/source/lib/common/memory/pool.hpp new file mode 100644 index 0000000000..5d3854a18f --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/common/memory/pool.hpp @@ -0,0 +1,101 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include "lib/common/defines.hpp" + +#include +#include +#include +#include +#include +#include + +namespace rocprofiler +{ +namespace common +{ +namespace memory +{ +template +class pool +{ +public: + explicit pool(size_t size) + : m_size(size) + { + for(size_t i = 0; i < ReservedBlocks; i++) + { + append(); + } + } + + void* allocate() + { + if(m_addrs.empty()) + { + append(); + } + + auto* ptr = m_addrs.top(); + m_addrs.pop(); + return ptr; + } + + void deallocate(void* ptr) { m_addrs.push(ptr); } + + void rebind(size_t size) + { + if(!(m_addrs.empty() && m_blocks.empty())) + { + throw std::runtime_error{"cannot call pool::rebind() after alloc"}; + ::abort(); + } + + m_size = size; + } + +private: + // Refill the address stack by allocating another block of memory + void append() + { + auto block = std::make_unique(BlockSize); + auto total_size = BlockSize % m_size == 0 ? BlockSize : BlockSize - m_size; + + // Divide the block into chunks of m_size bytes, and add their addrs + for(size_t i = 0; i < total_size; i += m_size) + { + m_addrs.push(&block.get()[i]); + } + + // Keep the memory of the block alive by adding it to our stack + m_blocks.push(std::move(block)); + } + +private: + size_t m_size = {}; + std::stack m_addrs = {}; + std::stack> m_blocks = {}; +}; + +} // namespace memory +} // namespace common +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/common/memory/pool_allocator.hpp b/projects/rocprofiler-sdk/source/lib/common/memory/pool_allocator.hpp new file mode 100644 index 0000000000..1eb1d45f1c --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/common/memory/pool_allocator.hpp @@ -0,0 +1,174 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include "lib/common/defines.hpp" +#include "lib/common/memory/deleter.hpp" +#include "lib/common/memory/pool.hpp" + +#include +#include +#include +#include + +namespace rocprofiler +{ +namespace common +{ +namespace memory +{ +// template pool_allocator:: +template > +class pool_allocator +{ +public: + using value_type = Tp; + using pointer = Tp*; + using const_pointer = const Tp*; + using reference = Tp&; + using const_reference = const Tp&; + using size_type = size_t; + using difference_type = ptrdiff_t; + using propagate_on_container_move_assignment = std::false_type; + using is_always_equal = value_type; + + pool_allocator() = default; + + // Rebind copy constructor + template + pool_allocator(const pool_allocator& rhs); + + pool_allocator(const pool_allocator& rhs) = default; + pool_allocator(pool_allocator&& rhs) noexcept = default; + pool_allocator& operator=(const pool_allocator& rhs) = default; + pool_allocator& operator=(pool_allocator&& rhs) noexcept = default; + + value_type* allocate(size_t n); + void deallocate(value_type* ptr, size_t n); + void construct(value_type* const _p, const value_type& _v) const; + void construct(value_type* const _p, value_type&& _v) const; + void construct_at(value_type* const _p, const value_type& _v) const; + void construct_at(value_type* const _p, value_type&& _v) const; + void destroy(value_type* const _p) const; + void destroy_at(value_type* const _p) const; + + template + struct rebind + { + using other = pool_allocator; + }; + +private: + using pool_type = pool; + + std::shared_ptr m_pool = std::make_shared(sizeof(value_type)); +}; + +template +template +pool_allocator::pool_allocator( + const pool_allocator& rhs) +: m_pool{rhs.m_pool} +{ + m_pool->rebind(sizeof(value_type)); +} + +template +typename pool_allocator::value_type* +pool_allocator::allocate(size_t n) +{ + if(n > 1) + { + return static_cast(::aligned_alloc(AlignV, sizeof(value_type) * n)); + } + + return static_cast(m_pool->allocate()); +} + +template +void +pool_allocator::deallocate(value_type* ptr, size_t n) +{ + DeleterT{}(); + if(n > 1) + { + ::free(ptr); + return; + } + + m_pool->deallocate(ptr); +} + +template +void +pool_allocator::construct(value_type* const _p, + const value_type& _v) const +{ + ::new((void*) _p) value_type{_v}; +} + +template +void +pool_allocator::construct(value_type* const _p, + value_type&& _v) const +{ + ::new((void*) _p) value_type{std::move(_v)}; +} + +template +void +pool_allocator::construct_at(value_type* const _p, + const value_type& _v) const +{ + ::new((void*) _p) value_type{_v}; +} + +template +void +pool_allocator::construct_at(value_type* const _p, + value_type&& _v) const +{ + ::new((void*) _p) value_type{std::move(_v)}; +} + +template +void +pool_allocator::destroy(value_type* const _p) const +{ + DeleterT{}(); + _p->~value_type(); +} + +template +void +pool_allocator::destroy_at(value_type* const _p) const +{ + DeleterT{}(); + _p->~value_type(); +} +} // namespace memory +} // namespace common +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/common/memory/stateless_allocator.hpp b/projects/rocprofiler-sdk/source/lib/common/memory/stateless_allocator.hpp new file mode 100644 index 0000000000..429ffda5e4 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/common/memory/stateless_allocator.hpp @@ -0,0 +1,181 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include "lib/common/defines.hpp" +#include "lib/common/memory/deleter.hpp" + +#include +#include +#include +#include +#include + +namespace rocprofiler +{ +namespace common +{ +namespace memory +{ +template > +class stateless_allocator +{ +public: + using value_type = Tp; + using pointer = Tp*; + using const_pointer = const Tp*; + using reference = Tp&; + using const_reference = const Tp&; + using size_type = size_t; + using difference_type = ptrdiff_t; + using propagate_on_container_move_assignment = std::true_type; + + template + struct rebind + { + using other = stateless_allocator; + }; + + stateless_allocator() = default; + stateless_allocator(const stateless_allocator& rhs) = default; + stateless_allocator(stateless_allocator&& rhs) noexcept = default; + stateless_allocator& operator=(const stateless_allocator& rhs) = default; + stateless_allocator& operator=(stateless_allocator&& rhs) noexcept = default; + + template + stateless_allocator(const stateless_allocator& rhs); + + static Tp* allocate(size_t n); + static void deallocate(Tp* ptr, size_t n); + static void construct(value_type* const _p, const value_type& _v); + static void construct(value_type* const _p, value_type&& _v); + static void construct_at(value_type* const _p, const value_type& _v); + static void construct_at(value_type* const _p, value_type&& _v); + static void destroy(value_type* const _p); + static void destroy_at(value_type* const _p); +}; + +template +template +stateless_allocator::stateless_allocator( + const stateless_allocator& rhs) +{ + (void) rhs; +} + +template +Tp* +stateless_allocator::allocate(size_t n) +{ + constexpr auto alignment_v = Alignment / sizeof(void*); + Tp* ptr = nullptr; + + if constexpr(sizeof(Tp) >= alignment_v && sizeof(Tp) % alignment_v == 0) + ptr = static_cast(::aligned_alloc(Alignment / sizeof(void*), sizeof(Tp) * n)); + else + ptr = static_cast(::malloc(sizeof(Tp) * n)); + + if(ptr) return ptr; + + throw std::bad_alloc{}; +} + +template +void +stateless_allocator::deallocate(Tp* ptr, size_t n) +{ + (void) n; + ::free(ptr); +} + +template +void +stateless_allocator::construct(value_type* const _p, const value_type& _v) +{ + ::new((void*) _p) value_type{_v}; +} + +template +void +stateless_allocator::construct(value_type* const _p, value_type&& _v) +{ + ::new((void*) _p) value_type{std::move(_v)}; +} + +template +void +stateless_allocator::construct_at(value_type* const _p, + const value_type& _v) +{ + ::new((void*) _p) value_type{_v}; +} + +template +void +stateless_allocator::construct_at(value_type* const _p, value_type&& _v) +{ + ::new((void*) _p) value_type{std::move(_v)}; +} + +template +void +stateless_allocator::destroy(value_type* const _p) +{ + DeleterT{}(); + _p->~value_type(); +} + +template +void +stateless_allocator::destroy_at(value_type* const _p) +{ + DeleterT{}(); + _p->~value_type(); +} + +template +constexpr bool +operator==(const stateless_allocator&, + const stateless_allocator&) +{ + return true; +} + +template +constexpr bool +operator!=(const stateless_allocator&, + const stateless_allocator&) +{ + return false; +} +} // namespace memory +} // namespace common +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/common/scope_destructor.hpp b/projects/rocprofiler-sdk/source/lib/common/scope_destructor.hpp new file mode 100644 index 0000000000..76312dfa77 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/common/scope_destructor.hpp @@ -0,0 +1,86 @@ +// MIT License +// +// Copyright (c) 2023 ROCm Developer Tools +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "lib/common/defines.hpp" + +#include +#include + +namespace rocprofiler +{ +namespace common +{ +struct scope_destructor +{ + /// \fn scope_destructor(FuncT&& _fini, InitT&& _init) + /// \tparam FuncT "std::function or void (*)()" + /// \tparam InitT "std::function or void (*)()" + /// \param _fini Function to execute when object is destroyed + /// \param _init Function to execute when object is created (optional) + /// + /// \brief Provides a utility to perform an operation when exiting a scope. + template + scope_destructor( + FuncT&& _fini, + InitT&& _init = []() {}); + + ~scope_destructor() { m_functor(); } + + // delete copy operations + scope_destructor(const scope_destructor&) = delete; + scope_destructor& operator=(const scope_destructor&) = delete; + + // allow move operations + scope_destructor(scope_destructor&& rhs) noexcept; + scope_destructor& operator=(scope_destructor&& rhs) noexcept; + +private: + std::function m_functor = []() {}; +}; + +template +scope_destructor::scope_destructor(FuncT&& _fini, InitT&& _init) +: m_functor{std::forward(_fini)} +{ + _init(); +} + +inline scope_destructor::scope_destructor(scope_destructor&& rhs) noexcept +: m_functor{std::move(rhs.m_functor)} +{ + rhs.m_functor = []() {}; +} + +inline scope_destructor& +scope_destructor::operator=(scope_destructor&& rhs) noexcept +{ + if(this != &rhs) + { + m_functor = std::move(rhs.m_functor); + rhs.m_functor = []() {}; + } + return *this; +} +} // namespace common +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler/CMakeLists.txt index 96e7dd5d5a..c75c89aa73 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/CMakeLists.txt @@ -3,10 +3,11 @@ # rocprofiler_activate_clang_tidy() -set(ROCPROFILER_LIB_HEADERS agent.hpp buffer.hpp external_correlation.hpp +set(ROCPROFILER_LIB_HEADERS agent.hpp allocator.hpp buffer.hpp external_correlation.hpp intercept_table.hpp internal_threading.hpp registration.hpp) set(ROCPROFILER_LIB_SOURCES agent.cpp + allocator.cpp buffer.cpp buffer_tracing.cpp callback_tracing.cpp diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/allocator.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/allocator.cpp new file mode 100644 index 0000000000..6434a4b8f8 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/allocator.cpp @@ -0,0 +1,49 @@ +// MIT License +// +// Copyright (c) 2023 ROCm Developer Tools +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "lib/rocprofiler/allocator.hpp" +#include "lib/rocprofiler/registration.hpp" + +#include + +#include + +namespace rocprofiler +{ +namespace common +{ +namespace memory +{ +void +deleter::operator()() const +{ + // if fully initialized and not yet finalized + if(registration::get_init_status() > 0 && registration ::get_fini_status() == 0) + { + static auto _once = std::atomic_flag{}; + if(!_once.test_and_set()) registration::finalize(); + // above returns false for only first invocation + } +} +} // namespace memory +} // namespace common +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/allocator.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/allocator.hpp new file mode 100644 index 0000000000..217bd2cd37 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/allocator.hpp @@ -0,0 +1,79 @@ +// MIT License +// +// Copyright (c) 2023 ROCm Developer Tools +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include "lib/common/defines.hpp" +#include "lib/common/memory/deleter.hpp" +#include "lib/common/memory/stateless_allocator.hpp" + +namespace rocprofiler +{ +namespace allocator +{ +// declare this trivial type for common::memory::deleter specialization +struct static_data; +} // namespace allocator + +namespace common +{ +namespace memory +{ +template <> +struct deleter +{ + // specialize the deleter call operator to invoke registration::finalize + void operator()() const; +}; +} // namespace memory +} // namespace common + +namespace allocator +{ +// use this allocator for static data which only gets deleted at the end of the application +template +using static_data_allocator = + common::memory::stateless_allocator>; + +// use this for unique_ptr +template +struct static_data_deleter +{ + void operator()(Tp* ptr) const + { + common::memory::deleter{}(); + delete ptr; + } +}; + +template +using unique_static_ptr_t = std::unique_ptr>; + +template +decltype(auto) +make_unique_static(Args&&... args) +{ + return unique_static_ptr_t{new Tp{std::forward(args)...}}; +} +} // namespace allocator +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/buffer.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/buffer.cpp index df1a8d7601..b4c778c145 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/buffer.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/buffer.cpp @@ -88,7 +88,7 @@ allocate_buffer() // create an entry in the registered auto& _cfg_v = get_buffers().back(); - _cfg_v = std::make_unique(); + _cfg_v = allocator::make_unique_static(); auto* _cfg = _cfg_v.get(); if(!_cfg) return std::nullopt; @@ -99,6 +99,8 @@ allocate_buffer() rocprofiler_status_t flush(rocprofiler_buffer_id_t buffer_id, bool wait) { + if(registration::get_fini_status() > 0) return ROCPROFILER_STATUS_SUCCESS; + if(buffer_id.handle >= get_buffers().size()) return ROCPROFILER_STATUS_ERROR_BUFFER_NOT_FOUND; auto& buff = get_buffers().at(buffer_id.handle); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/buffer.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/buffer.hpp index ef6ea1115c..fdd784589e 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/buffer.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/buffer.hpp @@ -28,6 +28,7 @@ #include "lib/common/container/record_header_buffer.hpp" #include "lib/common/container/stable_vector.hpp" #include "lib/common/demangle.hpp" +#include "lib/rocprofiler/allocator.hpp" #include #include @@ -61,7 +62,8 @@ struct instance buffer_t& get_internal_buffer(size_t); }; -using unique_buffer_vec_t = common::container::stable_vector, 4>; +using unique_buffer_vec_t = + common::container::stable_vector, 4>; std::optional allocate_buffer(); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/context.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/context.cpp index 20bd685d0a..81776c8c53 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/context.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/context.cpp @@ -83,7 +83,7 @@ rocprofiler_context_is_active(rocprofiler_context_id_t context_id, int* status) if(context_id.handle == rocprofiler_context_none.handle) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_FOUND; - auto ctxs = std::vector{}; + auto ctxs = rocprofiler::context::context_array_t{}; for(const auto* itr : rocprofiler::context::get_active_contexts(ctxs)) { if(itr && itr->context_idx == context_id.handle) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/context/context.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/context/context.cpp index 13bba780bd..80a36f3de2 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/context/context.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/context/context.cpp @@ -140,8 +140,8 @@ get_registered_contexts() return _v; } -std::vector& -get_active_contexts(std::vector& data, context_filter_t filter) +context_array_t& +get_active_contexts(context_array_t& data, context_filter_t filter) { data.clear(); auto num_ctx = get_num_active_contexts().load(std::memory_order_acquire); @@ -169,10 +169,10 @@ get_active_contexts(std::vector& data, context_filter_t filter) return data; } -std::vector +context_array_t get_active_contexts(context_filter_t filter) { - auto data = std::vector{}; + auto data = context_array_t{}; get_active_contexts(data, filter); return data; } @@ -213,7 +213,7 @@ allocate_context() // create an entry in the registered auto& _cfg_v = get_registered_contexts().back(); - _cfg_v = std::make_unique(); + _cfg_v = allocator::make_unique_static(); auto* _cfg = _cfg_v.get(); // ... @@ -260,7 +260,7 @@ start_context(rocprofiler_context_id_t context_id) return ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID; } - auto current_contexts = std::vector{}; + auto current_contexts = context_array_t{}; for(const auto* itr : get_active_contexts(current_contexts)) { if(cfg->context_idx == itr->context_idx) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/context/context.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/context/context.hpp index ecac17504d..de32bd536e 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/context/context.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/context/context.hpp @@ -26,8 +26,10 @@ #include #include +#include "lib/common/container/small_vector.hpp" #include "lib/common/container/stable_vector.hpp" #include "lib/common/synchronized.hpp" +#include "lib/rocprofiler/allocator.hpp" #include "lib/rocprofiler/context/domain.hpp" #include "lib/rocprofiler/counters/core.hpp" #include "lib/rocprofiler/external_correlation.hpp" @@ -174,8 +176,10 @@ start_context(rocprofiler_context_id_t id); /// \brief disable the contexturation. rocprofiler_status_t stop_context(rocprofiler_context_id_t); -using unique_context_vec_t = common::container::stable_vector, 8>; +using unique_context_vec_t = + common::container::stable_vector, 8>; using active_context_vec_t = common::container::stable_vector, 8>; +using context_array_t = common::container::small_vector; unique_context_vec_t& get_registered_contexts(); @@ -188,11 +192,10 @@ default_context_filter(const context* val) return (val != nullptr); } -std::vector& -get_active_contexts(std::vector& data, - context_filter_t filter = default_context_filter); +context_array_t& +get_active_contexts(context_array_t& data, context_filter_t filter = default_context_filter); -std::vector +context_array_t get_active_contexts(context_filter_t filter = default_context_filter); void deactivate_client_contexts(rocprofiler_client_id_t); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/external_correlation.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/external_correlation.cpp index 41c7089fd0..2b97c53c53 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/external_correlation.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/external_correlation.cpp @@ -24,6 +24,7 @@ #include #include "lib/common/synchronized.hpp" +#include "lib/common/utility.hpp" #include "lib/rocprofiler/context/context.hpp" #include "lib/rocprofiler/external_correlation.hpp" @@ -33,17 +34,43 @@ namespace rocprofiler { namespace external_correlation { +namespace +{ +auto +get_default_tid() +{ + static auto _v = common::get_tid(); + return _v; +} + +constexpr auto empty_user_data = rocprofiler_user_data_t{.value = 0}; + +auto& +get_default_data_impl() +{ + static auto _v = std::atomic{0}; + return _v; +} + +auto +get_default_data() +{ + return rocprofiler_user_data_t{.value = + get_default_data_impl().load(std::memory_order_relaxed)}; +} + +auto f_default_tid = get_default_tid(); // make sure it is initialized +} // namespace + rocprofiler_user_data_t external_correlation::get(rocprofiler_thread_id_t tid) const { - static constexpr auto empty_user_data = rocprofiler_user_data_t{.value = 0}; - return data.rlock( [](const external_correlation_map_t& _data, rocprofiler_thread_id_t tid_v) { - if(_data.count(tid_v) == 0) return empty_user_data; + if(_data.count(tid_v) == 0) return get_default_data(); const auto& itr = _data.at(tid_v); return itr.rlock([](const external_correlation_stack_t& data_stack) { - if(data_stack.empty()) return empty_user_data; + if(data_stack.empty()) return get_default_data(); return data_stack.back(); }); }, @@ -53,6 +80,8 @@ external_correlation::get(rocprofiler_thread_id_t tid) const void external_correlation::push(rocprofiler_thread_id_t tid, rocprofiler_user_data_t user_data) { + static auto default_tid = get_default_tid(); + // ensure that data contains key for provided thread id while(!data.ulock( [](const external_correlation_map_t& _data, rocprofiler_thread_id_t tid_v) { @@ -78,6 +107,9 @@ external_correlation::push(rocprofiler_thread_id_t tid, rocprofiler_user_data_t itr.wlock([](external_correlation_stack_t& data_stack, rocprofiler_user_data_t value) { data_stack.emplace_back(value); }, user_data_v); + // child threads inherit the current value on default thread + if(tid_v == default_tid) + get_default_data_impl().store(user_data_v.value, std::memory_order_relaxed); }, tid, user_data); @@ -86,16 +118,22 @@ external_correlation::push(rocprofiler_thread_id_t tid, rocprofiler_user_data_t rocprofiler_user_data_t external_correlation::pop(rocprofiler_thread_id_t tid) { - static constexpr auto empty_user_data = rocprofiler_user_data_t{.value = 0}; + static auto default_tid = get_default_tid(); return data.wlock( [](external_correlation_map_t& _data, rocprofiler_thread_id_t tid_v) { if(_data.count(tid_v) == 0) return empty_user_data; auto& itr = _data.at(tid_v); - return itr.wlock([](external_correlation_stack_t& data_stack) { + return itr.wlock([tid_v](external_correlation_stack_t& data_stack) { if(data_stack.empty()) return empty_user_data; auto ret = data_stack.back(); data_stack.pop_back(); + // child threads inherit the current value on default thread + if(tid_v == default_tid) + { + uint64_t value = (!data_stack.empty()) ? data_stack.back().value : 0; + get_default_data_impl().store(value, std::memory_order_relaxed); + } return ret; }); }, diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/CMakeLists.txt index 7a1b724f29..8e3368efa0 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/CMakeLists.txt @@ -1,7 +1,15 @@ -set(ROCPROFILER_LIB_HSA_SOURCES hsa.cpp queue.cpp queue_controller.cpp agent_cache.cpp - aql_packet.cpp) -set(ROCPROFILER_LIB_HSA_HEADERS hsa.hpp defines.hpp types.hpp utils.hpp queue.hpp - queue_controller.hpp agent_cache.hpp aql_packet.hpp) +set(ROCPROFILER_LIB_HSA_SOURCES agent_cache.cpp aql_packet.cpp code_object.cpp hsa.cpp + queue_controller.cpp queue.cpp) +set(ROCPROFILER_LIB_HSA_HEADERS + agent_cache.hpp + aql_packet.hpp + code_object.hpp + defines.hpp + hsa.hpp + queue_controller.hpp + queue.hpp + types.hpp + utils.hpp) target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_HSA_SOURCES} ${ROCPROFILER_LIB_HSA_HEADERS}) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/code_object.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/code_object.cpp new file mode 100644 index 0000000000..3a3252881a --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/code_object.cpp @@ -0,0 +1,834 @@ +// Copyright (c) 2018-2023 Advanced Micro Devices, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "lib/rocprofiler/hsa/code_object.hpp" +#include "lib/common/scope_destructor.hpp" +#include "lib/common/synchronized.hpp" +#include "lib/common/utility.hpp" +#include "lib/rocprofiler/agent.hpp" +#include "lib/rocprofiler/context/context.hpp" +#include "lib/rocprofiler/hsa/hsa.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#if defined(ROCPROFILER_CI) +# define ROCP_CI_LOG_IF(NON_CI_LEVEL, ...) LOG_IF(FATAL, __VA_ARGS__) +# define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(FATAL) +#else +# define ROCP_CI_LOG_IF(NON_CI_LEVEL, ...) LOG_IF(NON_CI_LEVEL, __VA_ARGS__) +# define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(NON_CI_LEVEL) +#endif + +namespace rocprofiler +{ +namespace hsa +{ +namespace +{ +using hsa_loader_table_t = hsa_ven_amd_loader_1_01_pfn_t; +using context_t = context::context; +using user_data_t = rocprofiler_user_data_t; +using context_array_t = context::context_array_t; +using context_user_data_map_t = std::unordered_map; + +template +auto +consume_args(Tp&&...) +{} + +hsa_loader_table_t& +get_loader_table() +{ + static auto _v = []() { + auto _val = hsa_loader_table_t{}; + memset(&_val, 0, sizeof(hsa_loader_table_t)); + return _val; + }(); + return _v; +} + +struct kernel_symbol +{ + using kernel_symbol_data_t = + rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; + + kernel_symbol() = default; + ~kernel_symbol() = default; + + kernel_symbol(const kernel_symbol&) = delete; + kernel_symbol(kernel_symbol&&) noexcept; + + kernel_symbol& operator=(const kernel_symbol&) = delete; + kernel_symbol& operator =(kernel_symbol&&) noexcept; + + bool beg_notified = false; + bool end_notified = false; + std::string name = {}; + hsa_executable_t hsa_executable = {}; + hsa_agent_t hsa_agent = {}; + hsa_executable_symbol_t hsa_symbol = {}; + kernel_symbol_data_t rocp_data = common::init_public_api_struct(kernel_symbol_data_t{}); + context_user_data_map_t user_data = {}; +}; + +kernel_symbol::kernel_symbol(kernel_symbol&& rhs) noexcept { operator=(std::move(rhs)); } + +kernel_symbol& +kernel_symbol::operator=(kernel_symbol&& rhs) noexcept +{ + if(this != &rhs) + { + beg_notified = rhs.beg_notified; + end_notified = rhs.end_notified; + name = std::move(rhs.name); + hsa_executable = rhs.hsa_executable; + hsa_agent = rhs.hsa_agent; + hsa_symbol = rhs.hsa_symbol; + rocp_data = rhs.rocp_data; + user_data = std::move(rhs.user_data); + rocp_data.kernel_name = name.c_str(); + } + + return *this; +} + +bool +operator==(const kernel_symbol& lhs, const kernel_symbol& rhs) +{ + return std::tie(lhs.hsa_executable.handle, lhs.hsa_agent.handle, lhs.hsa_symbol.handle) == + std::tie(rhs.hsa_executable.handle, rhs.hsa_agent.handle, rhs.hsa_symbol.handle); +} + +struct code_object +{ + using code_object_data_t = rocprofiler_callback_tracing_code_object_load_data_t; + using symbol_array_t = std::vector>; + + code_object() = default; + ~code_object() = default; + + code_object(const code_object&) = delete; + code_object(code_object&&) noexcept; + + code_object& operator=(const code_object&) = delete; + code_object& operator =(code_object&&) noexcept; + + bool beg_notified = false; + bool end_notified = false; + std::string uri = {}; + hsa_executable_t hsa_executable = {}; + hsa_loaded_code_object_t hsa_code_object = {}; + code_object_data_t rocp_data = common::init_public_api_struct(code_object_data_t{}); + symbol_array_t symbols = {}; + context_array_t contexts = {}; + context_user_data_map_t user_data = {}; +}; + +code_object::code_object(code_object&& rhs) noexcept { operator=(std::move(rhs)); } + +code_object& +code_object::operator=(code_object&& rhs) noexcept +{ + if(this != &rhs) + { + beg_notified = rhs.beg_notified; + end_notified = rhs.end_notified; + uri = std::move(rhs.uri); + hsa_executable = rhs.hsa_executable; + hsa_code_object = rhs.hsa_code_object; + rocp_data = rhs.rocp_data; + user_data = std::move(rhs.user_data); + rocp_data.uri = uri.c_str(); + symbols = std::move(rhs.symbols); + } + + return *this; +} + +bool +operator==(const code_object& lhs, const code_object& rhs) +{ + return std::tie(lhs.hsa_executable.handle, lhs.hsa_code_object.handle) == + std::tie(rhs.hsa_executable.handle, rhs.hsa_code_object.handle); +} + +struct code_object_unload +{ + code_object* object = nullptr; + std::vector symbols = {}; +}; + +auto& +get_code_object_id() +{ + static auto _v = std::atomic{}; + return _v; +} + +auto& +get_kernel_symbol_id() +{ + static auto _v = std::atomic{}; + return _v; +} + +using code_object_array_t = std::vector>; +using kernel_object_map_t = std::unordered_map; +using executable_array_t = std::vector; +using code_object_unload_array_t = std::vector; + +std::vector +shutdown(hsa_executable_t executable); + +bool is_shutdown = false; + +auto& +get_executables() +{ + static auto _v = common::Synchronized{}; + return _v; +} + +auto& +get_code_objects() +{ + static auto _v = common::Synchronized{}; + static auto _dtor = common::scope_destructor{[]() { code_object_shutdown(); }}; + return _v; +} + +auto& +get_kernel_object_map() +{ + static auto _v = common::Synchronized{}; + return _v; +} + +hsa_status_t +executable_iterate_agent_symbols_load_callback(hsa_executable_t executable, + hsa_agent_t agent, + hsa_executable_symbol_t symbol, + void* args) +{ +#define ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO(...) \ + { \ + auto _status = core_table.hsa_executable_symbol_get_info_fn(symbol, __VA_ARGS__); \ + LOG_IF(ERROR, _status != HSA_STATUS_SUCCESS) \ + << "core_table.hsa_executable_symbol_get_info_fn(hsa_executable_symbol_t{.handle=" \ + << symbol.handle << "}, " << #__VA_ARGS__ << " failed"; \ + if(_status != HSA_STATUS_SUCCESS) return _status; \ + } + + auto& core_table = *get_table().core_; + auto* code_obj_v = static_cast(args); + auto symbol_v = kernel_symbol{}; + auto& data = symbol_v.rocp_data; + + symbol_v.hsa_executable = executable; + symbol_v.hsa_agent = agent; + symbol_v.hsa_symbol = symbol; + + auto exists = std::any_of(code_obj_v->symbols.begin(), + code_obj_v->symbols.end(), + [&symbol_v](auto& itr) { return (itr && symbol_v == *itr); }); + + // if there is an existing matching kernel symbol, return success and move onto next symbol + if(exists) return HSA_STATUS_SUCCESS; + + LOG_IF(FATAL, data.size == 0) << "kernel symbol did not properly initialized the size field " + "upon construction (this is likely a compiler bug)"; + + auto type = hsa_symbol_kind_t{}; + ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO(HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &type); + + if(type != HSA_SYMBOL_KIND_KERNEL) return HSA_STATUS_SUCCESS; + + // set the code object id + data.code_object_id = code_obj_v->rocp_data.code_object_id; + + // compute the kernel name length + constexpr auto name_length_max = std::numeric_limits::max(); + uint32_t _name_length = 0; + ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO(HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &_name_length); + + ROCP_CI_LOG_IF(WARNING, _name_length > name_length_max / 2) + << "kernel symbol name length is extremely large: " << _name_length; + + // set the kernel name + if(_name_length > 0 && _name_length < name_length_max) + { + auto _name = std::string(_name_length + 1, '\0'); + ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO(HSA_EXECUTABLE_SYMBOL_INFO_NAME, _name.data()); + + symbol_v.name = _name.substr(0, _name.find_first_of('\0')); + } + data.kernel_name = symbol_v.name.c_str(); + + // these should all be self-explanatory + ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &data.kernel_object); + ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, + &data.kernarg_segment_size); + ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, + &data.kernarg_segment_alignment); + ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, + &data.group_segment_size); + ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &data.private_segment_size); + + // if we have reached this point (i.e. there were no HSA errors returned within macro) then we + // generate a unique kernel symbol id + data.kernel_id = ++get_kernel_symbol_id(); + + get_kernel_object_map().wlock( + [](kernel_object_map_t& object_map, uint64_t _kern_obj, uint64_t _kern_id) { + object_map[_kern_obj] = _kern_id; + }, + data.kernel_object, + data.kernel_id); + + code_obj_v->symbols.emplace_back(std::make_unique(std::move(symbol_v))); + + return HSA_STATUS_SUCCESS; + +#undef ROCP_HSA_CORE_GET_EXE_SYMBOL_INFO +} + +hsa_status_t +executable_iterate_agent_symbols_unload_callback(hsa_executable_t executable, + hsa_agent_t agent, + hsa_executable_symbol_t symbol, + void* args) +{ + auto symbol_v = kernel_symbol{}; + symbol_v.hsa_executable = executable; + symbol_v.hsa_agent = agent; + symbol_v.hsa_symbol = symbol; + + auto* code_obj_v = static_cast(args); + CHECK_NOTNULL(code_obj_v); + CHECK_NOTNULL(code_obj_v->object); + + for(const auto& itr : code_obj_v->object->symbols) + { + if(itr && *itr == symbol_v) code_obj_v->symbols.emplace_back(itr.get()); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t +code_object_load_callback(hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void* cb_data) +{ +#define ROCP_HSA_VEN_LOADER_GET_CODE_OBJECT_INFO(...) \ + { \ + auto _status = loader_table.hsa_ven_amd_loader_loaded_code_object_get_info( \ + loaded_code_object, __VA_ARGS__); \ + LOG_IF(ERROR, _status != HSA_STATUS_SUCCESS) \ + << "loader_table.hsa_ven_amd_loader_loaded_code_object_get_info(loaded_code_object, " \ + << #__VA_ARGS__ << " failed"; \ + if(_status != HSA_STATUS_SUCCESS) return _status; \ + } + + auto& loader_table = get_loader_table(); + auto code_obj_v = code_object{}; + auto& data = code_obj_v.rocp_data; + int _storage_type = ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_NONE; + + LOG_IF(FATAL, data.size == 0) << "code object did not properly initialized the size field upon " + "construction (this is likely a compiler bug)"; + + code_obj_v.hsa_executable = executable; + code_obj_v.hsa_code_object = loaded_code_object; + + auto* code_obj_vec = static_cast(cb_data); + auto exists = std::any_of(code_obj_vec->begin(), code_obj_vec->end(), [&code_obj_v](auto& itr) { + return (itr && code_obj_v == *itr); + }); + + // if there is an existing matching code object, check for any new symbols and then return + // success and move onto next code object + if(exists) + { + for(auto& itr : *code_obj_vec) + { + if(itr && *itr == code_obj_v) + { + get_table().core_->hsa_executable_iterate_agent_symbols_fn( + executable, + data.hsa_agent, + executable_iterate_agent_symbols_load_callback, + itr.get()); + } + } + + return HSA_STATUS_SUCCESS; + } + + ROCP_HSA_VEN_LOADER_GET_CODE_OBJECT_INFO( + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE, &_storage_type); + + LOG_IF(FATAL, _storage_type >= ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_LAST) + << "HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE returned an " + "unsupported code object storage type. Expected 0=none, 1=file, or 2=memory but " + "received a value of " + << _storage_type; + + data.storage_type = static_cast(_storage_type); + + if(_storage_type == HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE) + { + ROCP_HSA_VEN_LOADER_GET_CODE_OBJECT_INFO( + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_FILE, + &data.storage_file); + } + else if(_storage_type == HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY) + { + ROCP_HSA_VEN_LOADER_GET_CODE_OBJECT_INFO( + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_BASE, + &data.memory_base); + ROCP_HSA_VEN_LOADER_GET_CODE_OBJECT_INFO( + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_SIZE, + &data.memory_size); + } + else if(_storage_type == HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE) + { + LOG(WARNING) << "Code object storage type of none was ignored"; + return HSA_STATUS_SUCCESS; + } + + ROCP_HSA_VEN_LOADER_GET_CODE_OBJECT_INFO(HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, + &data.load_base); + + ROCP_HSA_VEN_LOADER_GET_CODE_OBJECT_INFO(HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, + &data.load_size); + + ROCP_HSA_VEN_LOADER_GET_CODE_OBJECT_INFO(HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA, + &data.load_delta); + + constexpr auto uri_length_max = std::numeric_limits::max(); + auto _uri_length = uint32_t{0}; + ROCP_HSA_VEN_LOADER_GET_CODE_OBJECT_INFO(HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH, + &_uri_length); + + ROCP_CI_LOG_IF(WARNING, _uri_length > uri_length_max / 2) + << "code object uri length is extremely large: " << _uri_length; + + if(_uri_length > 0 && _uri_length < uri_length_max) + { + auto _uri = std::string(_uri_length + 1, '\0'); + ROCP_HSA_VEN_LOADER_GET_CODE_OBJECT_INFO(HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI, + _uri.data()); + + code_obj_v.uri = _uri; + } + data.uri = code_obj_v.uri.data(); + + auto _hsa_agent = hsa_agent_t{}; + ROCP_HSA_VEN_LOADER_GET_CODE_OBJECT_INFO(HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_AGENT, + &data.hsa_agent); + + const auto* _rocp_agent = agent::get_rocprofiler_agent(data.hsa_agent); + if(!_rocp_agent) + { + ROCP_CI_LOG(ERROR) << "hsa agent (handle=" << _hsa_agent.handle + << ") did not map to a rocprofiler agent"; + return HSA_STATUS_ERROR_INVALID_AGENT; + } + data.rocp_agent = _rocp_agent->id; + + // if we have reached this point (i.e. there were no HSA errors returned within macro) then we + // generate a unique code object id + data.code_object_id = ++get_code_object_id(); + + auto _status = get_table().core_->hsa_executable_iterate_agent_symbols_fn( + executable, data.hsa_agent, executable_iterate_agent_symbols_load_callback, &code_obj_v); + + if(_status == HSA_STATUS_SUCCESS) + { + code_obj_vec->emplace_back(std::make_unique(std::move(code_obj_v))); + } + else + { + LOG(ERROR) << "hsa_executable_iterate_agent_symbols failed for " << data.uri; + } + + return _status; + +#undef ROCP_HSA_VEN_LOADER_GET_CODE_OBJECT_INFO +} + +hsa_status_t +code_object_unload_callback(hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void* args) +{ + auto code_obj_v = code_object{}; + code_obj_v.hsa_executable = executable; + code_obj_v.hsa_code_object = loaded_code_object; + + auto* code_obj_arr = static_cast(args); + + CHECK_NOTNULL(code_obj_arr); + + // auto _size = get_code_objects().rlock([](const auto& data) { return data.size(); }); + // LOG(INFO) << "[inp] executable=" << executable.handle + // << ", code_object=" << loaded_code_object.handle << " vs. " << _size; + + get_code_objects().rlock([&](const code_object_array_t& arr) { + for(const auto& itr : arr) + { + // LOG(INFO) << "[cmp] executable=" << itr->hsa_executable.handle + // << ", code_object=" << itr->hsa_code_object.handle; + if(itr->hsa_executable.handle == executable.handle && + itr->hsa_code_object.handle == loaded_code_object.handle) + // if(itr && *itr == code_obj_v) + { + auto& _last = code_obj_arr->emplace_back(code_object_unload{.object = itr.get()}); + + auto agent = itr->rocp_data.hsa_agent; + get_table().core_->hsa_executable_iterate_agent_symbols_fn( + executable, agent, executable_iterate_agent_symbols_unload_callback, &_last); + } + } + }); + + return HSA_STATUS_SUCCESS; +} + +auto& +get_freeze_function() +{ + static decltype(::hsa_executable_freeze)* _v = nullptr; + return _v; +} + +auto& +get_destroy_function() +{ + static decltype(::hsa_executable_destroy)* _v = nullptr; + return _v; +} + +hsa_status_t +executable_freeze(hsa_executable_t executable, const char* options) +{ + hsa_status_t status = CHECK_NOTNULL(get_freeze_function())(executable, options); + if(status != HSA_STATUS_SUCCESS) return status; + + LOG(INFO) << "running " << __FUNCTION__ << " (executable=" << executable.handle << ")..."; + + get_executables().wlock( + [executable](executable_array_t& data) { data.emplace_back(executable); }); + + auto& code_obj_vec = get_code_objects(); + code_obj_vec.wlock([executable](code_object_array_t& _vec) { + hsa::get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects( + executable, code_object_load_callback, &_vec); + }); + + constexpr auto CODE_OBJECT_KIND = ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT; + constexpr auto CODE_OBJECT_LOAD = ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD; + constexpr auto CODE_OBJECT_KERNEL_SYMBOL = + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER; + + auto&& context_filter = [](const context_t* ctx) { + return (ctx->callback_tracer && ctx->callback_tracer->domains(CODE_OBJECT_KIND) && + (ctx->callback_tracer->domains(CODE_OBJECT_KIND, CODE_OBJECT_LOAD) || + ctx->callback_tracer->domains(CODE_OBJECT_KIND, CODE_OBJECT_KERNEL_SYMBOL))); + }; + + static thread_local auto ctxs = context_array_t{}; + context::get_active_contexts(ctxs, std::move(context_filter)); + + if(!ctxs.empty()) + { + code_obj_vec.rlock([](const code_object_array_t& data) { + auto tidx = common::get_tid(); + // set the contexts for each code object + for(const auto& ditr : data) + ditr->contexts = ctxs; + + for(const auto& ditr : data) + { + for(const auto* citr : ditr->contexts) + { + if(citr->callback_tracer->domains(CODE_OBJECT_KIND, CODE_OBJECT_LOAD)) + { + if(!ditr->beg_notified) + { + auto co_data = ditr->rocp_data; + auto record = rocprofiler_callback_tracing_record_t{ + .context_id = rocprofiler_context_id_t{citr->context_idx}, + .thread_id = tidx, + .correlation_id = rocprofiler_correlation_id_t{}, + .kind = CODE_OBJECT_KIND, + .operation = CODE_OBJECT_LOAD, + .phase = ROCPROFILER_CALLBACK_PHASE_LOAD, + .payload = static_cast(&co_data)}; + + // invoke callback + auto& cb_data = + citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND); + auto& user_data = ditr->user_data[citr]; + cb_data.callback(record, &user_data, cb_data.data); + } + } + + for(const auto& sitr : ditr->symbols) + { + if(sitr && citr->callback_tracer->domains(CODE_OBJECT_KIND, + CODE_OBJECT_KERNEL_SYMBOL)) + { + if(!sitr->beg_notified) + { + auto sym_data = sitr->rocp_data; + auto record = rocprofiler_callback_tracing_record_t{ + .context_id = rocprofiler_context_id_t{citr->context_idx}, + .thread_id = tidx, + .correlation_id = rocprofiler_correlation_id_t{}, + .kind = CODE_OBJECT_KIND, + .operation = CODE_OBJECT_KERNEL_SYMBOL, + .phase = ROCPROFILER_CALLBACK_PHASE_LOAD, + .payload = static_cast(&sym_data)}; + + // invoke callback + auto& cb_data = + citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND); + auto& user_data = sitr->user_data[citr]; + cb_data.callback(record, &user_data, cb_data.data); + } + } + } + } + } + + for(const auto& ditr : data) + { + ditr->beg_notified = true; + for(auto& sitr : ditr->symbols) + sitr->beg_notified = true; + } + }); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t +executable_destroy(hsa_executable_t executable) +{ + if(is_shutdown) return HSA_STATUS_SUCCESS; + + auto _unloaded = shutdown(executable); + + get_kernel_object_map().wlock([_unloaded](kernel_object_map_t& data) { + for(const auto& uitr : _unloaded) + { + for(const auto& sitr : uitr.symbols) + { + data.erase(sitr->rocp_data.kernel_id); + } + } + }); + + get_code_objects().wlock([executable](code_object_array_t& data) { + for(auto& itr : data) + { + if(itr->hsa_executable.handle == executable.handle) itr.reset(); + } + data.erase( + std::remove_if(data.begin(), data.end(), [](auto& itr) { return (itr == nullptr); }), + data.end()); + }); + + get_executables().wlock([executable](executable_array_t& data) { + data.erase(std::remove_if(data.begin(), + data.end(), + [executable](hsa_executable_t itr) { + return (itr.handle == executable.handle); + }), + data.end()); + }); + + return CHECK_NOTNULL(get_destroy_function())(executable); +} + +std::vector +shutdown(hsa_executable_t executable) +{ + LOG(INFO) << "running " << __FUNCTION__ << " (executable=" << executable.handle << ")..."; + + auto _unloaded = std::vector{}; + hsa::get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects( + executable, code_object_unload_callback, &_unloaded); + + constexpr auto CODE_OBJECT_KIND = ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT; + constexpr auto CODE_OBJECT_LOAD = ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD; + constexpr auto CODE_OBJECT_KERNEL_SYMBOL = + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER; + + auto tidx = common::get_tid(); + for(auto& itr : _unloaded) + { + LOG_IF(FATAL, itr.object == nullptr); + for(const auto* citr : itr.object->contexts) + { + if(citr->callback_tracer->domains(CODE_OBJECT_KIND, CODE_OBJECT_LOAD)) + { + if(!itr.object->end_notified) + { + auto record = rocprofiler_callback_tracing_record_t{ + .context_id = rocprofiler_context_id_t{citr->context_idx}, + .thread_id = tidx, + .correlation_id = rocprofiler_correlation_id_t{}, + .kind = CODE_OBJECT_KIND, + .operation = CODE_OBJECT_LOAD, + .phase = ROCPROFILER_CALLBACK_PHASE_UNLOAD, + .payload = static_cast(&itr.object->rocp_data)}; + + // invoke callback + auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND); + auto& user_data = itr.object->user_data.at(citr); + cb_data.callback(record, &user_data, cb_data.data); + } + } + + // generate callbacks for kernel symbols after the callback for code object + // unloading so the code object unload can be used to flush the buffer before the + // symbol information is removed + if(citr->callback_tracer->domains(CODE_OBJECT_KIND, CODE_OBJECT_KERNEL_SYMBOL)) + { + for(auto& sitr : itr.symbols) + { + if(!sitr->end_notified) + { + auto record = rocprofiler_callback_tracing_record_t{ + .context_id = rocprofiler_context_id_t{citr->context_idx}, + .thread_id = tidx, + .correlation_id = rocprofiler_correlation_id_t{}, + .kind = CODE_OBJECT_KIND, + .operation = CODE_OBJECT_KERNEL_SYMBOL, + .phase = ROCPROFILER_CALLBACK_PHASE_UNLOAD, + .payload = static_cast(&sitr->rocp_data)}; + + // invoke callback + auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND); + auto& user_data = sitr->user_data.at(citr); + cb_data.callback(record, &user_data, cb_data.data); + } + } + } + } + } + + for(auto& itr : _unloaded) + { + itr.object->end_notified = true; + for(auto& sitr : itr.symbols) + sitr->end_notified = true; + } + + return _unloaded; +} +} // namespace + +void +code_object_init(HsaApiTable* table) +{ + auto& core_table = *table->core_; + + auto _status = core_table.hsa_system_get_major_extension_table_fn( + HSA_EXTENSION_AMD_LOADER, 1, sizeof(hsa_loader_table_t), &get_loader_table()); + + LOG_IF(ERROR, _status != HSA_STATUS_SUCCESS) << "hsa_system_get_major_extension_table failed"; + + if(_status == HSA_STATUS_SUCCESS) + { + get_freeze_function() = CHECK_NOTNULL(core_table.hsa_executable_freeze_fn); + get_destroy_function() = CHECK_NOTNULL(core_table.hsa_executable_destroy_fn); + core_table.hsa_executable_freeze_fn = executable_freeze; + core_table.hsa_executable_destroy_fn = executable_destroy; + LOG_IF(FATAL, get_freeze_function() == core_table.hsa_executable_freeze_fn) + << "infinite recursion"; + LOG_IF(FATAL, get_destroy_function() == core_table.hsa_executable_destroy_fn) + << "infinite recursion"; + } +} + +uint64_t +get_kernel_id(uint64_t kernel_object) +{ + // return get_code_objects().rlock([kernel_object](const code_object_array_t& _data) -> uint64_t + // { + // for(const auto& itr : _data) + // { + // for(const auto& ditr : itr->symbols) + // { + // if(kernel_object == ditr->rocp_data.kernel_object) return + // ditr->rocp_data.kernel_id; + // } + // } + // return 0; + // }); + + return get_kernel_object_map().rlock( + [](const kernel_object_map_t& object_map, uint64_t _kern_obj) -> uint64_t { + auto itr = object_map.find(_kern_obj); + return (itr == object_map.end()) ? 0 : itr->second; + // return object_map.at(_kern_obj); + }, + kernel_object); +} + +void +code_object_shutdown() +{ + if(is_shutdown) return; + + get_executables().rlock([](const executable_array_t& edata) { + auto tmp = edata; + std::reverse(tmp.begin(), tmp.end()); + for(auto itr : tmp) + shutdown(itr); + }); + + get_code_objects().wlock([](code_object_array_t& data) { data.clear(); }); + + is_shutdown = true; +} +} // namespace hsa +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/code_object.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/code_object.hpp new file mode 100644 index 0000000000..8980a5d236 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/code_object.hpp @@ -0,0 +1,38 @@ +// Copyright (c) 2018-2023 Advanced Micro Devices, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include + +namespace rocprofiler +{ +namespace hsa +{ +void +code_object_init(HsaApiTable* table); + +uint64_t +get_kernel_id(uint64_t kernel_object); + +void +code_object_shutdown(); +} // namespace hsa +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/hsa.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/hsa.cpp index 182477f2a6..725ed52f87 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/hsa.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/hsa.cpp @@ -188,7 +188,7 @@ hsa_api_impl::functor(Args&&... args) rocprofiler_user_data_t external_correlation = {}; }; - static thread_local auto active_contexts = std::vector{}; + static thread_local auto active_contexts = context::context_array_t{}; auto thr_id = common::get_tid(); auto callback_contexts = std::vector{}; auto buffered_contexts = std::vector{}; @@ -250,6 +250,9 @@ hsa_api_impl::functor(Args&&... args) buffer_record.thread_id = thr_id; } + tracer_data.size = sizeof(rocprofiler_callback_tracing_hsa_api_data_t); + set_data_args(info_type::get_api_data_args(tracer_data.args), std::forward(args)...); + // invoke the callbacks if(!callback_contexts.empty()) { @@ -448,41 +451,37 @@ get_names(std::vector& _name_list, std::index_sequence) (_emplace(_name_list, hsa_api_info::name), ...); } +bool +should_wrap_functor(rocprofiler_service_callback_tracing_kind_t _callback_domain, + rocprofiler_service_buffer_tracing_kind_t _buffered_domain, + int _operation) +{ + // we loop over all the *registered* contexts and see if any of them, at any point in time, + // might require callback or buffered API tracing + for(const auto& itr : context::get_registered_contexts()) + { + if(!itr) continue; + + // if there is a callback tracer enabled for the given domain and op, we need to wrap + if(itr->callback_tracer && itr->callback_tracer->domains(_callback_domain) && + itr->callback_tracer->domains(_callback_domain, _operation)) + return true; + + // if there is a buffered tracer enabled for the given domain and op, we need to wrap + if(itr->buffered_tracer && itr->buffered_tracer->domains(_buffered_domain) && + itr->buffered_tracer->domains(_buffered_domain, _operation)) + return true; + } + return false; +} + template void update_table(hsa_api_table_t* _orig, std::index_sequence) { - static auto _should_wrap_functor = - [](auto _callback_domain, auto _buffered_domain, auto _operation) { - for(const auto& itr : context::get_registered_contexts()) - { - if(!itr) continue; - - if(itr->callback_tracer) - { - // domain not enabled so skip to next callback_tracer - if(!itr->callback_tracer->domains(_callback_domain)) continue; - - // if the given domain + op is enabled, we need to wrap - if(itr->callback_tracer->domains(_callback_domain, _operation)) return true; - } - - if(itr->buffered_tracer) - { - // domain not enabled so skip to next callback_tracer - if(!itr->buffered_tracer->domains(_buffered_domain)) continue; - - // if the given domain + op is enabled, we need to wrap - if(itr->buffered_tracer->domains(_buffered_domain, _operation)) return true; - } - } - return false; - }; - (void) _should_wrap_functor; - auto _update = [](hsa_api_table_t* _orig_v, auto _info) { // check to see if there are any contexts which enable this operation in the HSA API domain - if(!_should_wrap_functor( + if(!should_wrap_functor( _info.callback_domain_idx, _info.buffered_domain_idx, _info.operation_idx)) return; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/queue.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/queue.cpp index 950c1a4dae..45fe4f4631 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/queue.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/queue.cpp @@ -22,6 +22,7 @@ #include "lib/common/utility.hpp" #include "lib/rocprofiler/buffer.hpp" #include "lib/rocprofiler/context/context.hpp" +#include "lib/rocprofiler/hsa/code_object.hpp" #include #include @@ -32,6 +33,31 @@ #include #include +// static assert for rocprofiler_packet ABI compatibility +static_assert(sizeof(hsa_ext_amd_aql_pm4_packet_t) == sizeof(hsa_kernel_dispatch_packet_t), + "unexpected ABI incompatibility"); +static_assert(sizeof(hsa_ext_amd_aql_pm4_packet_t) == sizeof(hsa_barrier_and_packet_t), + "unexpected ABI incompatibility"); +static_assert(sizeof(hsa_ext_amd_aql_pm4_packet_t) == sizeof(hsa_barrier_or_packet_t), + "unexpected ABI incompatibility"); +static_assert(offsetof(hsa_ext_amd_aql_pm4_packet_t, completion_signal) == + offsetof(hsa_kernel_dispatch_packet_t, completion_signal), + "unexpected ABI incompatibility"); +static_assert(offsetof(hsa_ext_amd_aql_pm4_packet_t, completion_signal) == + offsetof(hsa_barrier_and_packet_t, completion_signal), + "unexpected ABI incompatibility"); +static_assert(offsetof(hsa_ext_amd_aql_pm4_packet_t, completion_signal) == + offsetof(hsa_barrier_or_packet_t, completion_signal), + "unexpected ABI incompatibility"); + +#if defined(ROCPROFILER_CI) +# define ROCP_CI_LOG_IF(NON_CI_LEVEL, ...) LOG_IF(FATAL, __VA_ARGS__) +# define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(FATAL) +#else +# define ROCP_CI_LOG_IF(NON_CI_LEVEL, ...) LOG_IF(NON_CI_LEVEL, __VA_ARGS__) +# define ROCP_CI_LOG(NON_CI_LEVEL, ...) LOG(NON_CI_LEVEL) +#endif + namespace rocprofiler { namespace hsa @@ -43,16 +69,138 @@ signal_limiter() { // Limit the maximun number of HSA signals created. // There is a hard limit to the maximum that can exist. - static common::active_capacity_gate _gate(1024); + static common::active_capacity_gate _gate{96}; return _gate; } bool -AsyncSignalHandler(hsa_signal_value_t, void* data) +context_filter(const context::context* ctx) { + return (ctx->buffered_tracer && + (ctx->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) || + ctx->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY))); +} + +bool +AsyncSignalHandler(hsa_signal_value_t /*signal_v*/, void* data) +{ + // LOG(ERROR) << "signal value is " << signal_v; + if(!data) return true; auto& queue_info_session = *static_cast(data); + // we need to decrement this reference count at the end of the functions + auto* _corr_id = queue_info_session.correlation_id; + // get the contexts that were active when the signal was created + const auto& ctxs = queue_info_session.contexts; + if(!ctxs.empty()) + { + // only do the following work if there are contexts that require this info + const auto* _rocp_agent = queue_info_session.queue.get_agent().get_rocp_agent(); + auto _hsa_agent = queue_info_session.queue.get_agent().get_hsa_agent(); + auto _queue_id = queue_info_session.queue.get_id(); + auto _signal = queue_info_session.interrupt_signal; + auto _kern_id = queue_info_session.kernel_id; + const auto& _extern_corr_ids = queue_info_session.extern_corr_ids; + + auto dispatch_time = hsa_amd_profiling_dispatch_time_t{}; + auto dispatch_time_status = + queue_info_session.queue.ext_api().hsa_amd_profiling_get_dispatch_time_fn( + _hsa_agent, _signal, &dispatch_time); + + // if we encounter this in CI, it will cause test to fail + ROCP_CI_LOG_IF( + ERROR, + dispatch_time_status == HSA_STATUS_SUCCESS && dispatch_time.end < dispatch_time.start) + << "hsa_amd_profiling_get_dispatch_time for kernel_id=" << _kern_id + << " on rocprofiler_agent=" << _rocp_agent->id.handle + << " returned dispatch times where the end time (" << dispatch_time.end + << ") was less than the start time (" << dispatch_time.start << ")"; + + // try to extract the async copy time. this will return HSA_STATUS_ERROR if there + // is not an async copy agent associated with the signal so we just predicate + // putting something into the buffer based on whether or not + // hsa_amd_profiling_get_async_copy_time returns HSA_STATUS_SUCCESS. + auto copy_time = hsa_amd_profiling_async_copy_time_t{}; + auto copy_time_status = + queue_info_session.queue.ext_api().hsa_amd_profiling_get_async_copy_time_fn(_signal, + ©_time); + + // if we encounter this in CI, it will cause test to fail + ROCP_CI_LOG_IF(ERROR, + copy_time_status == HSA_STATUS_SUCCESS && copy_time.end < copy_time.start) + << "hsa_amd_profiling_get_async_copy_time for kernel_id=" << _kern_id + << " on rocprofiler_agent=" << _rocp_agent->id.handle + << " returned async times where the end time (" << copy_time.end + << ") was less than the start time (" << copy_time.start << ")"; + + for(const auto* itr : ctxs) + { + auto* _buffer = buffer::get_buffer( + itr->buffered_tracer->buffer_data.at(ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH)); + + // go ahead and create the correlation id value since we expect at least one of these + // domains will require it + auto _corr_id_v = + rocprofiler_correlation_id_t{.internal = 0, .external = context::null_user_data}; + if(_corr_id) + { + _corr_id_v.internal = _corr_id->internal; + _corr_id_v.external = _extern_corr_ids.at(itr); + } + + if(itr->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH)) + { + if(dispatch_time_status == HSA_STATUS_SUCCESS) + { + const auto& dispatch_packet = queue_info_session.kernel_pkt.kernel_dispatch; + + auto record = rocprofiler_buffer_tracing_kernel_dispatch_record_t{ + sizeof(rocprofiler_buffer_tracing_kernel_dispatch_record_t), + ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, + _corr_id_v, + dispatch_time.start, + dispatch_time.end, + _rocp_agent->id, + _queue_id, + _kern_id, + dispatch_packet.private_segment_size, + dispatch_packet.group_segment_size, + rocprofiler_dim3_t{dispatch_packet.workgroup_size_x, + dispatch_packet.workgroup_size_y, + dispatch_packet.workgroup_size_z}, + rocprofiler_dim3_t{dispatch_packet.grid_size_x, + dispatch_packet.grid_size_y, + dispatch_packet.grid_size_z}}; + + _buffer->emplace(ROCPROFILER_BUFFER_CATEGORY_TRACING, + ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, + record); + } + } + + if(itr->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)) + { + if(copy_time_status == HSA_STATUS_SUCCESS) + { + auto record = rocprofiler_buffer_tracing_memory_copy_record_t{ + sizeof(rocprofiler_buffer_tracing_memory_copy_record_t), + ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + _corr_id_v, + copy_time.start, + copy_time.end, + _rocp_agent->id, + _queue_id, + _kern_id}; + + _buffer->emplace(ROCPROFILER_BUFFER_CATEGORY_TRACING, + ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + record); + } + } + } + } + // Calls our internal callbacks to callers who need to be notified post // kernel execution. queue_info_session.queue.signal_callback([&](const auto& map) { @@ -95,6 +243,14 @@ AsyncSignalHandler(hsa_signal_value_t, void* data) } queue_info_session.queue.async_complete(); + if(_corr_id) + { + LOG_IF(FATAL, _corr_id->ref_count.load() == 0) + << "reference counter for correlation id " << _corr_id->internal << " from thread " + << _corr_id->thread_idx << " has no reference count"; + _corr_id->ref_count.fetch_sub(1); + } + delete static_cast(data); return false; } @@ -133,6 +289,8 @@ WriteInterceptor(const void* packets, void* data, hsa_amd_queue_intercept_packet_writer writer) { + using context_array_t = Queue::context_array_t; + auto&& AddVendorSpecificPacket = [](hsa_ext_amd_aql_pm4_packet_t _packet, hsa_signal_t _signal, std::vector& _packets) { @@ -149,19 +307,39 @@ WriteInterceptor(const void* packets, LOG_IF(FATAL, data == nullptr) << "WriteInterceptor was not passed a pointer to the queue"; - auto& queue = *static_cast(data); - auto thr_id = common::get_tid(); - auto* corr_id = context::get_latest_correlation_id(); - // increase the reference count to denote that this correlation id is being used in a kernel - if(corr_id) corr_id->ref_count.fetch_add(1); + static thread_local auto ctxs = context_array_t{}; + context::get_active_contexts(ctxs, context_filter); + + auto& queue = *static_cast(data); // We have no packets or no one who needs to be notified, do nothing. - if(pkt_count == 0 || queue.get_notifiers() == 0) + if(pkt_count == 0 || (queue.get_notifiers() == 0 && ctxs.empty())) { writer(packets, pkt_count); return; } + auto thr_id = common::get_tid(); + auto* corr_id = context::get_latest_correlation_id(); + + // use thread-local value to reuse allocation + static thread_local auto extern_corr_ids_tl = + Queue::queue_info_session_t::external_corr_id_map_t{}; + + // increase the reference count to denote that this correlation id is being used in a kernel + if(corr_id) + { + extern_corr_ids_tl.clear(); // clear it so that it only contains the current contexts + extern_corr_ids_tl.reserve(ctxs.size()); // reserve for performance + for(const auto* ctx : ctxs) + extern_corr_ids_tl.emplace(ctx, + ctx->correlation_tracer.external_correlator.get(thr_id)); + corr_id->ref_count.fetch_add(1); + } + + // move to local variable + auto extern_corr_ids = std::move(extern_corr_ids_tl); + // hsa_ext_amd_aql_pm4_packet_t const auto* packets_arr = static_cast(packets); auto transformed_packets = std::vector{}; @@ -248,8 +426,9 @@ WriteInterceptor(const void* packets, transformed_packets.emplace_back(barrier); } - // TODO(jrmadsen): fetch kernel identifier from code object loading - uint64_t kernel_id = 0; + LOG_IF(FATAL, packet_type != HSA_PACKET_TYPE_KERNEL_DISPATCH) + << "get_kernel_id below might need to be updated"; + uint64_t kernel_id = get_kernel_id(kernel_pkt.kernel_dispatch.kernel_object); // Enqueue the signal into the handler. Will call completed_cb when // signal completes. @@ -263,7 +442,9 @@ WriteInterceptor(const void* packets, .tid = thr_id, .kernel_id = kernel_id, .correlation_id = corr_id, - .kernel_pkt = kernel_pkt}); + .kernel_pkt = kernel_pkt, + .contexts = ctxs, + .extern_corr_ids = extern_corr_ids}); } writer(transformed_packets.data(), transformed_packets.size()); @@ -332,6 +513,21 @@ Queue::Queue(const AgentCache& agent, LOG_IF(FATAL, _ext_api.hsa_amd_queue_intercept_register_fn(_intercept_queue, WriteInterceptor, this)) << "Could not register interceptor"; + + bool enable_async_copy = false; + for(const auto& itr : context::get_registered_contexts()) + { + if(itr->buffered_tracer && + itr->buffered_tracer->domains(ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)) + enable_async_copy = true; + } + + if(enable_async_copy) + { + LOG_IF(FATAL, _ext_api.hsa_amd_profiling_async_copy_enable_fn(true) != HSA_STATUS_SUCCESS) + << "Could not enable async copy timing"; + } + *queue = _intercept_queue; } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/queue.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/queue.hpp index 6c97ed3c4e..bc7c1d118c 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/queue.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/hsa/queue.hpp @@ -23,6 +23,7 @@ #include #include +#include "lib/common/container/small_vector.hpp" #include "lib/common/synchronized.hpp" #include "lib/common/utility.hpp" #include "lib/rocprofiler/hsa/agent_cache.hpp" @@ -47,8 +48,9 @@ namespace rocprofiler { namespace context { +struct context; struct correlation_id; -} +} // namespace context namespace hsa { using ClientID = int64_t; @@ -92,7 +94,9 @@ union rocprofiler_packet class Queue { public: - using callback_t = void (*)(hsa_status_t status, hsa_queue_t* source, void* data); + using context_t = context::context; + using context_array_t = common::container::small_vector; + using callback_t = void (*)(hsa_status_t status, hsa_queue_t* source, void* data); // Function prototype used to notify consumers that a kernel has been // enqueued. An AQL packet can be returned that will be injected into // the queue. @@ -107,6 +111,9 @@ public: // to track state of the intercepted kernel. struct queue_info_session_t { + using external_corr_id_map_t = + std::unordered_map; + Queue& queue; std::unique_ptr inst_pkt = {}; ClientID inst_pkt_id = 0; @@ -115,6 +122,8 @@ public: rocprofiler_kernel_id_t kernel_id = 0; context::correlation_id* correlation_id = nullptr; rocprofiler_packet kernel_pkt = {}; + context_array_t contexts = {}; + external_corr_id_map_t extern_corr_ids = {}; }; Queue(const AgentCache& agent, diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/internal_threading.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/internal_threading.cpp index 16148bf304..c3b71d1d6d 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/internal_threading.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/internal_threading.cpp @@ -26,6 +26,7 @@ #include "lib/common/container/stable_vector.hpp" #include "lib/common/utility.hpp" +#include "lib/rocprofiler/allocator.hpp" #include "lib/rocprofiler/buffer.hpp" #include "lib/rocprofiler/context/context.hpp" #include "lib/rocprofiler/internal_threading.hpp" @@ -41,6 +42,22 @@ namespace rocprofiler { namespace internal_threading { +using thread_pool_vec_t = std::vector>; +// Note: task_group maintains a shared_ptr copy to thread_pool to ensure it is not destroyed +// before the task can be sync'd. +using task_group_vec_t = std::vector>; + +TaskGroup::TaskGroup(std::shared_ptr pool) +: parent_type{static_cast(pool.get())} +, m_pool{std::move(pool)} +{} + +ThreadPool::ThreadPool(const parent_type::Config& cfg) +: parent_type{cfg} +{} + +ThreadPool::~ThreadPool() { parent_type::destroy_threadpool(); } + namespace { template @@ -152,15 +169,10 @@ get_thread_pools() return _v; } -auto& +auto*& get_task_groups() { - static auto _v = task_group_vec_t([](auto& data) { - for(auto& itr : data) - itr.first->join(); - data.clear(); - }); - + static auto* _v = new task_group_vec_t{}; return _v; } } // namespace @@ -184,8 +196,14 @@ finalize() { // PLT::ThreadPool::f_thread_ids() is not destruction order safe // if it does become safe, these two calls could be removed. - get_task_groups().destroy(); - get_thread_pools().clear(); + if(get_task_groups()) + { + for(auto& itr : *get_task_groups()) + itr->join(); + get_task_groups()->clear(); + delete get_task_groups(); + get_task_groups() = nullptr; + } } void @@ -209,13 +227,13 @@ create_callback_thread() // this will be index after emplace_back auto idx = get_thread_pools().size(); - auto& thr_pool = get_thread_pools().emplace_back(std::make_shared( - std::make_unique(thread_pool_config_t{.pool_size = 1}), - [](auto& tp) { tp->destroy_threadpool(); })); + auto& thr_pool = get_thread_pools().emplace_back( + std::make_shared(thread_pool_config_t{.pool_size = 1})); + + if(!get_task_groups()) get_task_groups() = new task_group_vec_t{}; // construct the task group to use the newly created thread pool - get_task_groups().get().emplace_back(std::make_unique(thr_pool->get().get()), - thr_pool); + get_task_groups()->emplace_back(allocator::make_unique_static(thr_pool)); // notify that rocprofiler library finished creating an internal thread notify_post_internal_thread_create(ROCPROFILER_LIBRARY); @@ -227,9 +245,8 @@ create_callback_thread() task_group_t* get_task_group(rocprofiler_callback_thread_t cb_tid) { - return (!get_task_groups().get().empty()) - ? get_task_groups().get().at(cb_tid.handle).first.get() - : nullptr; + if(!get_task_groups() || get_task_groups()->empty()) return nullptr; + return get_task_groups()->at(cb_tid.handle).get(); } } // namespace internal_threading } // namespace rocprofiler @@ -269,7 +286,10 @@ rocprofiler_status_t ROCPROFILER_API rocprofiler_assign_callback_thread(rocprofiler_buffer_id_t buffer_id, rocprofiler_callback_thread_t cb_thread_id) { - if(cb_thread_id.handle >= rocprofiler::internal_threading::get_task_groups().get().size()) + if(!rocprofiler::internal_threading::get_task_groups()) + return ROCPROFILER_STATUS_ERROR_THREAD_NOT_FOUND; + + if(cb_thread_id.handle >= rocprofiler::internal_threading::get_task_groups()->size()) return ROCPROFILER_STATUS_ERROR_THREAD_NOT_FOUND; for(auto& bitr : rocprofiler::buffer::get_buffers()) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/internal_threading.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/internal_threading.hpp index 3411731d55..22b8568efb 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/internal_threading.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/internal_threading.hpp @@ -27,6 +27,7 @@ #include "lib/common/container/stable_vector.hpp" #include "lib/common/defines.hpp" #include "lib/common/utility.hpp" +#include "lib/rocprofiler/allocator.hpp" #include #include @@ -39,16 +40,28 @@ namespace rocprofiler { namespace internal_threading { -using thread_pool_t = PTL::ThreadPool; -using task_group_t = PTL::TaskGroup; -using thread_pool_cleanup_t = common::static_cleanup_wrapper>; -using task_group_cleanup_t = - std::pair, std::shared_ptr>; -using thread_pool_vec_t = std::vector>; +class ThreadPool : public PTL::ThreadPool +{ +public: + using parent_type = PTL::ThreadPool; -// Note: task_group maintains a shared_ptr copy to thread_pool to ensure it is not destroyed -// before the task can be sync'd. -using task_group_vec_t = common::static_cleanup_wrapper>; + ThreadPool(const parent_type::Config&); + ~ThreadPool(); +}; + +class TaskGroup : public PTL::TaskGroup +{ +public: + using parent_type = PTL::TaskGroup; + + TaskGroup(std::shared_ptr); + +private: + std::shared_ptr m_pool = {}; +}; + +using thread_pool_t = ThreadPool; +using task_group_t = TaskGroup; void notify_pre_internal_thread_create(rocprofiler_runtime_library_t); void notify_post_internal_thread_create(rocprofiler_runtime_library_t); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler/registration.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler/registration.cpp index 94db1e9493..593ab86614 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler/registration.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler/registration.cpp @@ -22,7 +22,9 @@ #include "lib/rocprofiler/registration.hpp" #include "lib/rocprofiler/agent.hpp" +#include "lib/rocprofiler/allocator.hpp" #include "lib/rocprofiler/context/context.hpp" +#include "lib/rocprofiler/hsa/code_object.hpp" #include "lib/rocprofiler/hsa/hsa.hpp" #include "lib/rocprofiler/hsa/queue.hpp" #include "lib/rocprofiler/hsa/queue_controller.hpp" @@ -131,18 +133,30 @@ get_link_map() struct client_library { - std::string name = {}; - void* dlhandle = nullptr; - decltype(::rocprofiler_configure)* configure_func = nullptr; - std::unique_ptr configure_result = {}; - rocprofiler_client_id_t internal_client_id = {}; - rocprofiler_client_id_t mutable_client_id = {}; + client_library() = default; + ~client_library() { delete configure_result; } + + client_library(const client_library&) = delete; + client_library(client_library&&) noexcept = default; + + client_library& operator=(const client_library&) = delete; + client_library& operator=(client_library&&) noexcept = delete; + + std::string name = {}; + void* dlhandle = nullptr; + decltype(::rocprofiler_configure)* configure_func = nullptr; + rocprofiler_tool_configure_result_t* configure_result = nullptr; + rocprofiler_client_id_t internal_client_id = {}; + rocprofiler_client_id_t mutable_client_id = {}; }; -std::vector +using client_library_vec_t = + std::vector>; + +client_library_vec_t find_clients() { - auto data = std::vector{}; + auto data = client_library_vec_t{}; auto priority_offset = get_client_offset(); if(get_forced_configure()) @@ -227,7 +241,7 @@ find_clients() return data; } -std::vector& +client_library_vec_t& get_clients() { static auto _v = find_clients(); @@ -290,7 +304,7 @@ invoke_client_configures() if(_result) { - itr.configure_result = std::make_unique(*_result); + itr.configure_result = new rocprofiler_tool_configure_result_t{*_result}; } else { @@ -467,10 +481,15 @@ finalize() { if(get_fini_status() != 0) return; + static auto _sync = std::atomic_flag{}; + if(_sync.test_and_set()) return; + // above returns true for all invocations after the first one + static auto _once = std::once_flag{}; std::call_once(_once, []() { set_fini_status(-1); - hsa_shut_down(); + ::hsa_shut_down(); + hsa::code_object_shutdown(); if(get_init_status() > 0) { invoke_client_finalizers(); @@ -550,6 +569,7 @@ rocprofiler_set_api_table(const char* name, // need to construct agent mappings before initializing the queue controller rocprofiler::agent::construct_agent_cache(hsa_api_table); rocprofiler::hsa::queue_controller_init(hsa_api_table); + rocprofiler::hsa::code_object_init(hsa_api_table); // any internal modifications to the HsaApiTable need to be done before we make the // copy or else those modifications will be lost when HSA API tracing is enabled diff --git a/projects/rocprofiler-sdk/source/scripts/leak-sanitizer-suppr.txt b/projects/rocprofiler-sdk/source/scripts/leak-sanitizer-suppr.txt index 693edcc463..771bf14a08 100644 --- a/projects/rocprofiler-sdk/source/scripts/leak-sanitizer-suppr.txt +++ b/projects/rocprofiler-sdk/source/scripts/leak-sanitizer-suppr.txt @@ -7,3 +7,4 @@ leak:hsa-runtime leak:amdhip leak:python leak:hsa-amd-aqlprofile +leak:__new_exitfn diff --git a/projects/rocprofiler-sdk/samples/apps/transpose/CMakeLists.txt b/projects/rocprofiler-sdk/tests/apps/transpose/CMakeLists.txt similarity index 100% rename from projects/rocprofiler-sdk/samples/apps/transpose/CMakeLists.txt rename to projects/rocprofiler-sdk/tests/apps/transpose/CMakeLists.txt diff --git a/projects/rocprofiler-sdk/samples/apps/transpose/transpose.cpp b/projects/rocprofiler-sdk/tests/apps/transpose/transpose.cpp similarity index 100% rename from projects/rocprofiler-sdk/samples/apps/transpose/transpose.cpp rename to projects/rocprofiler-sdk/tests/apps/transpose/transpose.cpp