From 69b8a43dc6cb5929ce43ee2fcac4dff07fa6e7b2 Mon Sep 17 00:00:00 2001 From: Giovanni Lenzi Baraldi Date: Mon, 8 Apr 2024 16:43:02 -0300 Subject: [PATCH] Gbaraldi/threadtrace2 (#724) * Added first ATT API * Finalizing thread trace API * Fixing more rebase conflicts * Added codeobj disassembly sample * Fixing merge issues with rebase [2] * Adding ATT packets * Implemented thread trace intercept * Moved codeobj parser to same repo as rocprofiler * Moved thread trace to new API * Fixing merge conflicts * Fixing more merge conflicts * Adding thread trace packet reuse * Merged aql_profile_v2 headers * Linked ATT sample to aqlprofile * Updated decoder to include non-loaded codeobjs * Implemented ISA decoder into ATT sample * Added marker_id to vaddr * Updating aql_profile_v2 API to memcpy * Updating thread trace API to include 64bit markers. Using the result of ISA matching. * Added instruction type and cycles summary * Updated sample with selection of kernel by kernel_object * Added option to copy from memory kernels * Moved tool_data in thread_trace to dynamic alloc * Restoring hsa.cpp * Fixed ATT sample crash. General improvements. * Moved codeobj library to outside src/ * Updated license header * Moved codeobj_capture to camelcase * Solving some more merge conflicts * Update samples/advanced_thread_trace/CMakeLists.txt Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Update samples/advanced_thread_trace/CMakeLists.txt Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Update samples/code_object_isa_decode/CMakeLists.txt Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Update source/lib/rocprofiler-sdk/thread_trace/CMakeLists.txt * Removing unused parameter check * Adding const to isEmpty * Removing unused warning * Adding libdw-dev to requirements * Running clang-format * Commenting out new aql calls * Clang format * Unused variable fix * Adding codeobj-decoder coverage * Commenting out threadtrace * Update samples/CMakeLists.txt Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * P * WOverloaded * Addressing clang-tidy * Virtual destructor on ttracer class * Corr id * Fixing code source format * Update CMakeLists.txt * Build fixes * Update source/lib/rocprofiler-sdk-codeobj/code_object_track.cpp Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Fix shadowing * Update CMakeLists.txt * Update samples/CMakeLists.txt Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Ammar ELWazir Co-authored-by: Ammar ELWazir Co-authored-by: Benjamin Welton --- .github/workflows/continuous_integration.yml | 4 +- samples/CMakeLists.txt | 1 + samples/advanced_thread_trace/CMakeLists.txt | 59 ++ samples/advanced_thread_trace/client.cpp | 578 ++++++++++++++++ samples/advanced_thread_trace/main.cpp | 245 +++++++ .../transpose_kernels.hpp | 84 +++ samples/code_object_isa_decode/CMakeLists.txt | 52 ++ samples/code_object_isa_decode/client.cpp | 251 +++++++ samples/code_object_isa_decode/main.cpp | 246 +++++++ .../transpose_kernels.hpp | 84 +++ source/include/rocprofiler-sdk/CMakeLists.txt | 1 + source/include/rocprofiler-sdk/rocprofiler.h | 1 + source/include/rocprofiler-sdk/thread_trace.h | 108 +++ source/lib/CMakeLists.txt | 2 + .../rocprofiler-sdk-codeobj/CMakeLists.txt | 58 ++ source/lib/rocprofiler-sdk-codeobj/LICENSE | 21 + source/lib/rocprofiler-sdk-codeobj/README.md | 1 + .../code_object_track.cpp | 196 ++++++ .../code_object_track.hpp | 158 +++++ .../rocprofiler-sdk-codeobj/code_printing.cpp | 278 ++++++++ .../rocprofiler-sdk-codeobj/code_printing.hpp | 257 +++++++ .../rocprofiler-sdk-codeobj/disassembly.cpp | 372 ++++++++++ .../rocprofiler-sdk-codeobj/disassembly.hpp | 68 ++ .../lib/rocprofiler-sdk-codeobj/segment.hpp | 162 +++++ source/lib/rocprofiler-sdk/CMakeLists.txt | 1 + .../lib/rocprofiler-sdk/aql/aql_profile_v2.h | 651 +++++++++++++++++- .../rocprofiler-sdk/aql/packet_construct.cpp | 111 ++- .../rocprofiler-sdk/aql/packet_construct.hpp | 34 +- .../rocprofiler-sdk/aql/tests/aql_test.cpp | 49 +- .../lib/rocprofiler-sdk/context/context.cpp | 4 + .../lib/rocprofiler-sdk/context/context.hpp | 2 + source/lib/rocprofiler-sdk/counters/core.cpp | 4 +- source/lib/rocprofiler-sdk/counters/core.hpp | 2 +- .../rocprofiler-sdk/counters/dimensions.cpp | 4 +- .../rocprofiler-sdk/counters/evaluate_ast.cpp | 6 +- .../rocprofiler-sdk/counters/evaluate_ast.hpp | 4 +- .../counters/tests/dimension.cpp | 4 +- source/lib/rocprofiler-sdk/hsa/aql_packet.cpp | 65 +- source/lib/rocprofiler-sdk/hsa/aql_packet.hpp | 86 ++- .../lib/rocprofiler-sdk/hsa/code_object.cpp | 8 +- source/lib/rocprofiler-sdk/hsa/queue.cpp | 2 - .../rocprofiler-sdk/hsa/queue_controller.cpp | 25 +- .../rocprofiler-sdk/hsa/queue_controller.hpp | 4 + .../thread_trace/CMakeLists.txt | 6 + .../rocprofiler-sdk/thread_trace/att_core.cpp | 238 +++++++ .../rocprofiler-sdk/thread_trace/att_core.hpp | 75 ++ .../thread_trace/att_service.cpp | 81 +++ .../reproducible-runtime.cpp | 5 +- 48 files changed, 4649 insertions(+), 109 deletions(-) create mode 100644 samples/advanced_thread_trace/CMakeLists.txt create mode 100644 samples/advanced_thread_trace/client.cpp create mode 100644 samples/advanced_thread_trace/main.cpp create mode 100644 samples/advanced_thread_trace/transpose_kernels.hpp create mode 100644 samples/code_object_isa_decode/CMakeLists.txt create mode 100644 samples/code_object_isa_decode/client.cpp create mode 100644 samples/code_object_isa_decode/main.cpp create mode 100644 samples/code_object_isa_decode/transpose_kernels.hpp create mode 100644 source/include/rocprofiler-sdk/thread_trace.h create mode 100644 source/lib/rocprofiler-sdk-codeobj/CMakeLists.txt create mode 100644 source/lib/rocprofiler-sdk-codeobj/LICENSE create mode 100644 source/lib/rocprofiler-sdk-codeobj/README.md create mode 100644 source/lib/rocprofiler-sdk-codeobj/code_object_track.cpp create mode 100644 source/lib/rocprofiler-sdk-codeobj/code_object_track.hpp create mode 100644 source/lib/rocprofiler-sdk-codeobj/code_printing.cpp create mode 100644 source/lib/rocprofiler-sdk-codeobj/code_printing.hpp create mode 100644 source/lib/rocprofiler-sdk-codeobj/disassembly.cpp create mode 100644 source/lib/rocprofiler-sdk-codeobj/disassembly.hpp create mode 100644 source/lib/rocprofiler-sdk-codeobj/segment.hpp create mode 100644 source/lib/rocprofiler-sdk/thread_trace/CMakeLists.txt create mode 100644 source/lib/rocprofiler-sdk/thread_trace/att_core.cpp create mode 100644 source/lib/rocprofiler-sdk/thread_trace/att_core.hpp create mode 100644 source/lib/rocprofiler-sdk/thread_trace/att_service.cpp diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 95e8735fde..b1408c21bd 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -200,7 +200,7 @@ jobs: run: | git config --global --add safe.directory '*' apt-get update - apt-get install -y cmake python3-pip gcovr wkhtmltopdf xvfb xfonts-base xfonts-75dpi xfonts-100dpi xfonts-utils xfonts-encodings libfontconfig + apt-get install -y cmake python3-pip gcovr wkhtmltopdf xvfb xfonts-base xfonts-75dpi xfonts-100dpi xfonts-utils xfonts-encodings libfontconfig libdw-dev python3 -m pip install -r requirements.txt python3 -m pip install pytest pycobertura @@ -388,7 +388,7 @@ jobs: python3 -m pip install pytest add-apt-repository ppa:ubuntu-toolchain-r/test apt-get update - apt-get install -y g++-13 + apt-get install -y g++-13 libdw-dev update-alternatives --install $(which gcc) gcc $(which gcc-13) 100 --slave $(which g++) g++ $(which g++-13) realpath $(which gcc) realpath $(which g++) diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 4787374dac..fbd6032b66 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -29,3 +29,4 @@ add_subdirectory(api_buffered_tracing) add_subdirectory(code_object_tracing) add_subdirectory(counter_collection) add_subdirectory(intercept_table) +# add_subdirectory(code_object_isa_decode) add_subdirectory(advanced_thread_trace) diff --git a/samples/advanced_thread_trace/CMakeLists.txt b/samples/advanced_thread_trace/CMakeLists.txt new file mode 100644 index 0000000000..3bfd9a8d91 --- /dev/null +++ b/samples/advanced_thread_trace/CMakeLists.txt @@ -0,0 +1,59 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-sdk-samples-advanced-thread-trace LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +find_package(rocprofiler-sdk REQUIRED) + +add_library(advanced-thread-trace-client SHARED) +target_sources(advanced-thread-trace-client PRIVATE client.cpp) +target_link_libraries( + advanced-thread-trace-client + PRIVATE rocprofiler::rocprofiler rocprofiler::samples-build-flags + rocprofiler-sdk-codeobj rocprofiler::samples-common-library) + +set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) +set_source_files_properties(main.cpp PROPERTIES COMPILE_FLAGS "-g") +find_package(Threads REQUIRED) + +add_executable(advanced-thread-trace) +target_sources(advanced-thread-trace PRIVATE main.cpp) +target_link_libraries( + advanced-thread-trace PRIVATE advanced-thread-trace-client Threads::Threads + rocprofiler::samples-build-flags) + +add_test(NAME advanced-thread-trace COMMAND $) + +set_tests_properties( + advanced-thread-trace + PROPERTIES + TIMEOUT + 45 + LABELS + "samples" + ENVIRONMENT + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$" + FAIL_REGULAR_EXPRESSION + "threw an exception") diff --git a/samples/advanced_thread_trace/client.cpp b/samples/advanced_thread_trace/client.cpp new file mode 100644 index 0000000000..6befe9ac96 --- /dev/null +++ b/samples/advanced_thread_trace/client.cpp @@ -0,0 +1,578 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// undefine NDEBUG so asserts are implemented +#ifdef NDEBUG +# undef NDEBUG +#endif + +/** + * @file samples/code_object_isa_decode/client.cpp + * + * @brief Example rocprofiler client (tool) + */ + +#include +#include +#include +#include +#include +#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h" + +#include "code_object_track.hpp" +#include "common/defines.hpp" +#include "common/filesystem.hpp" +#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TARGET_CU 1 +#define SIMD_SELECT 0x3 +#define BUFFER_SIZE 0x6000000 +#define NUM_SE 2 +constexpr bool COPY_MEMORY_CODEOBJ = false; + +template <> +struct std::hash +{ + uint64_t operator()(const pcinfo_t& info) const + { + return info.addr ^ (info.marker_id << 32ul) ^ (info.marker_id >> 32ul); + } +}; + +bool +operator==(const pcinfo_t& a, const pcinfo_t& b) +{ + return a.addr == b.addr && a.marker_id == b.marker_id; +}; + +bool +operator<(const pcinfo_t& a, const pcinfo_t& b) +{ + if(a.marker_id == b.marker_id) return a.addr < b.addr; + return a.marker_id < b.marker_id; +}; + +namespace client +{ +using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t; +using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; + +rocprofiler_client_id_t* client_id = nullptr; +rocprofiler_context_id_t client_ctx = {}; + +struct isa_map_elem_t +{ + std::atomic hitcount{0}; + std::atomic latency{0}; + std::shared_ptr code_line{nullptr}; +}; + +struct ToolData +{ + std::shared_mutex isa_map_mut; + std::mutex output_mut; + CodeobjAddressTranslate codeobjTranslate; + std::map> isa_map; + std::unordered_map kernels_in_codeobj = {}; + std::unordered_map kernel_object_to_kernel_name = {}; + std::stringstream output; + + std::stringstream printKernel(uint64_t vaddr) + { + std::stringstream ss; + try + { + ss << '\n' << std::hex; + SymbolInfo& info = kernels_in_codeobj.at(vaddr); + + ss << std::hex << "Found: " << info.name << " at addr: 0x" << vaddr << " with offset 0x" + << info.faddr << " vaddr 0x" << info.vaddr << std::dec << '\n'; + } catch(std::exception& e) + { + ss << e.what() << '\n'; + } + return ss; + } +}; + +struct source_location +{ + std::string function = {}; + std::string file = {}; + uint32_t line = 0; + std::string context = {}; +}; + +struct trace_data_t +{ + int64_t id; + uint8_t* data; + uint64_t size; + ToolData* tool; +}; + +std::atomic TRACE_DATA_ID{-1}; +std::atomic KERNEL_ADDR_ID{-1}; +std::atomic OCCUPANCY_ID{-1}; + +void +tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data) +{ + if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return; + if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) return; + + assert(callback_data && "Shader callback passed null!"); + ToolData& tool = *reinterpret_cast(callback_data); + + if(record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) + { + std::unique_lock lg(tool.isa_map_mut); + auto* data = static_cast(record.payload); + tool.kernel_object_to_kernel_name.emplace(data->kernel_object, data->kernel_name); + } + + if(record.operation != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD) return; + + std::unique_lock lg(tool.isa_map_mut); + auto* data = static_cast(record.payload); + + if(std::string_view(data->uri).find("file:///") == 0) + { + tool.codeobjTranslate.addDecoder( + data->uri, data->code_object_id, data->load_delta, data->load_size); + auto symbolmap = tool.codeobjTranslate.getSymbolMap(data->code_object_id); + for(auto& [vaddr, symbol] : symbolmap) + tool.kernels_in_codeobj[vaddr] = symbol; + } + else if(COPY_MEMORY_CODEOBJ) + { + tool.codeobjTranslate.addDecoder(reinterpret_cast(data->memory_base), + data->memory_size, + data->code_object_id, + data->load_delta, + data->load_size); + auto symbolmap = tool.codeobjTranslate.getSymbolMap(data->code_object_id); + for(auto& [vaddr, symbol] : symbolmap) + tool.kernels_in_codeobj[vaddr] = symbol; + } + + (void) user_data; + (void) callback_data; +} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" + +rocprofiler_att_control_flags_t +dispatch_callback(rocprofiler_queue_id_t queue_id, + const rocprofiler_agent_t* agent, + rocprofiler_correlation_id_t correlation_id, + const hsa_kernel_dispatch_packet_t* dispatch_packet, + uint64_t kernel_id, + void* userdata) +{ + assert(userdata && "Dispatch callback passed null!"); + ToolData& tool = *reinterpret_cast(userdata); + + std::shared_lock lg(tool.isa_map_mut); + + constexpr int desired_call_iteration = 1; + static std::atomic call_id{0}; + static std::string_view desired_func_name = "transposeLdsSwapInplace"; + + try + { + auto& kernel_name = tool.kernel_object_to_kernel_name.at(dispatch_packet->kernel_object); + if(kernel_name.find(desired_func_name) == std::string::npos) + return ROCPROFILER_ATT_CONTROL_NONE; + + if(call_id.fetch_add(1) == desired_call_iteration) + return ROCPROFILER_ATT_CONTROL_START_AND_STOP; + } catch(...) + { + std::cerr << "Could not find kernel object: " << dispatch_packet->kernel_object + << std::endl; + } + + return ROCPROFILER_ATT_CONTROL_NONE; +} + +void +iterate_trace_types(int id, const char* metadata, void*) +{ + if(std::string_view(metadata).find("occupancy") == 0) + OCCUPANCY_ID.store(id); + else if(std::string_view(metadata).find("kernel_ids_addr") == 0) + KERNEL_ADDR_ID.store(id); + else if(std::string_view(metadata).find("tracedata") == 0) + TRACE_DATA_ID.store(id); +} + +hsa_status_t +get_trace_data(int trace_type_id, + int correlation_id, + void* trace_events, + uint64_t trace_size, + void* userdata) +{ + assert(userdata && "ISA callback passed null!"); + trace_data_t& trace_data = *reinterpret_cast(userdata); + assert(trace_data.tool && "ISA callback passed null!"); + ToolData& tool = *reinterpret_cast(trace_data.tool); + + std::stringstream ss; + std::shared_lock shared_lock(tool.isa_map_mut); + + if(trace_type_id == OCCUPANCY_ID) + { + ss << "Num waves: " << trace_size / 2 << '\n'; + // auto* occ = reinterpret_cast(trace_events); + } + else if(trace_type_id == KERNEL_ADDR_ID) + { + ss << "Num KRN events: " << trace_size << std::hex << '\n'; + auto* kaddr = reinterpret_cast(trace_events); + for(size_t i = 0; i < trace_size; i++) + if(kaddr[i].addr != 0) + { + ss << " - ADDR: " << kaddr[i].addr << ' ' << tool.printKernel(kaddr[i].addr).str() + << '\n'; + } + ss << std::dec; + } + else if(trace_type_id == TRACE_DATA_ID) + { + ss << "Trace Length: " << trace_size << '\n'; + auto* tracedata = reinterpret_cast(trace_events); + + for(size_t i = 0; i < trace_size; i++) + { + pcinfo_t pc = tracedata[i].pc; + auto it = tool.isa_map.find(pc); + if(it == tool.isa_map.end()) + { + shared_lock.unlock(); + { + std::unique_lock unique_lock(tool.isa_map_mut); + auto ptr = std::make_unique(); + ptr->code_line = tool.codeobjTranslate.get(pc.marker_id, pc.addr); + it = tool.isa_map.emplace(pc, std::move(ptr)).first; + } + shared_lock.lock(); + } + + it->second->hitcount.fetch_add(tracedata[i].hitcount, std::memory_order_relaxed); + it->second->latency.fetch_add(tracedata[i].latency, std::memory_order_relaxed); + } + } + + std::unique_lock lk(tool.output_mut); + tool.output << ss.str(); + + return HSA_STATUS_SUCCESS; +} + +uint64_t +copy_trace_data(int* seid, uint8_t** buffer, uint64_t* buffer_size, void* userdata) +{ + trace_data_t& data = *reinterpret_cast(userdata); + *seid = data.id; + *buffer_size = data.size; + *buffer = data.data; + data.size = 0; + return *buffer_size; +} + +hsa_status_t +isa_callback(char* isa_instruction, + char* source_reference, + uint64_t* isa_memory_size, + uint64_t* isa_size, + uint64_t* source_size, + uint64_t marker_id, + uint64_t offset, + void* userdata) +{ + assert(userdata && "ISA callback passed null!"); + trace_data_t& trace_data = *reinterpret_cast(userdata); + assert(trace_data.tool && "ISA callback passed null!"); + ToolData& tool = *reinterpret_cast(trace_data.tool); + + std::shared_ptr instruction; + + { + std::unique_lock unique_lock(tool.isa_map_mut); + instruction = tool.codeobjTranslate.get(marker_id, offset); + } + + if(!instruction.get()) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + { + size_t tmp_isa_size = *isa_size; + size_t tmp_source_size = *source_size; + *isa_size = instruction->inst.size(); + *source_size = instruction->comment.size(); + + if(*isa_size > tmp_isa_size || *source_size > tmp_source_size) + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + memcpy(isa_instruction, instruction->inst.data(), *isa_size); + memcpy(source_reference, instruction->comment.data(), *source_size); + *isa_memory_size = instruction->size; + + auto ptr = std::make_unique(); + ptr->code_line = std::move(instruction); + tool.isa_map.emplace(pcinfo_t{offset, marker_id}, std::move(ptr)); + + return HSA_STATUS_SUCCESS; +} + +void +shader_data_callback(int64_t se_id, + int64_t data_type_id, + const char* data_type_name, + void* se_data, + size_t data_size, + void* userdata) +{ + assert(userdata && "Shader callback passed null!"); + ToolData& tool = *reinterpret_cast(userdata); + + { + std::unique_lock lk(tool.output_mut); + tool.output << "SE ID: " << se_id << " with size " << data_size << std::hex << '\n'; + } + trace_data_t data{.id = se_id, .data = (uint8_t*) se_data, .size = data_size, .tool = &tool}; + auto status = aqlprofile_att_parse_data(copy_trace_data, get_trace_data, isa_callback, &data); + (void) status; +} + +#pragma GCC diagnostic pop + +int +tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) +{ + (void) fini_func; + aqlprofile_att_parser_iterate_event_list(iterate_trace_types, nullptr); + ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation"); + + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(client_ctx, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + nullptr, + 0, + tool_codeobj_tracing_callback, + tool_data), + "code object tracing service configure"); + + rocprofiler_att_parameters_t parameters{}; + parameters.target_cu = TARGET_CU; + parameters.simd_select = SIMD_SELECT; + parameters.buffer_size = BUFFER_SIZE; + + std::vector shaders; + for(size_t i = 0; i < NUM_SE; i++) + shaders.push_back(2 * i); // use shader engines 0, 2 + + parameters.shader_ids = shaders.data(); + parameters.shader_num = shaders.size(); + + ROCPROFILER_CALL( + rocprofiler_configure_thread_trace_service( + client_ctx, parameters, dispatch_callback, shader_data_callback, tool_data), + "thread trace service configure"); + + int valid_ctx = 0; + ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx), + "context validity check"); + if(valid_ctx == 0) + { + // notify rocprofiler that initialization failed + // and all the contexts, buffers, etc. created + // should be ignored + return -1; + } + + ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "context start"); + + // no errors + return 0; +} + +void +tool_fini(void* tool_data) +{ + assert(tool_data && "tool_fini callback passed null!"); + ToolData& tool = *reinterpret_cast(tool_data); + + std::unique_lock isa_lk(tool.isa_map_mut); + std::unique_lock out_lk(tool.output_mut); + + // Find largest instruction + size_t max_inst_size = 0; + for(auto& [addr, lines] : tool.isa_map) + if(lines.get()) max_inst_size = std::max(max_inst_size, lines->code_line->inst.size()); + + std::string empty_space; + empty_space.resize(max_inst_size, ' '); + + size_t vmc_latency = 0; + size_t lgk_latency = 0; + size_t scalar_latency = 0; + size_t vector_latency = 0; + size_t other_latency = 0; + + size_t scalar_exec = 0; + size_t vector_exec = 0; + size_t other_exec = 0; + + for(auto& [addr, line] : tool.isa_map) + if(line.get()) + { + size_t hitcount = line->hitcount.load(std::memory_order_relaxed); + size_t latency = line->latency.load(std::memory_order_relaxed); + auto& code_line = line->code_line->inst; + + tool.output << std::hex << "0x" << addr.addr << std::dec << ' ' << code_line + << empty_space.substr(0, max_inst_size - code_line.size()) + << " Hit: " << hitcount << " - Latency: " << latency << '\n'; + + if(code_line.find("s_waitcnt") == 0) + { + other_exec += hitcount; + if(code_line.find("lgkmcnt") != std::string::npos) + lgk_latency += latency; + else + vmc_latency += latency; + } + else if(code_line.find("v_") == 0) + { + vector_exec += hitcount; + vector_latency += latency; + } + else if(code_line.find("s_") == 0) + { + scalar_exec += hitcount; + scalar_latency += latency; + } + else + { + other_exec += hitcount; + other_latency += latency; + } + } + + size_t total_exec = vector_exec + scalar_exec + other_exec; + size_t memory_latency = vmc_latency + lgk_latency; + size_t total_latency = memory_latency + vector_latency + scalar_latency + other_latency; + float vmc_fraction = 100 * vmc_latency / float(total_latency); + float lgk_fraction = 100 * lgk_latency / float(total_latency); + + tool.output << "Total executed instructions: " << total_exec << '\n' + << "Total executed vector instructions: " << vector_exec << " with average " + << vector_latency / float(vector_exec) << " cycles.\n" + << "Total executed scalar instructions: " << scalar_exec << " with average " + << scalar_latency / float(scalar_exec) << " cycles.\n" + << "Vector memory ops occupied: " << vmc_fraction << "% of cycles.\n" + << "Scalar and LDS memory ops occupied: " << lgk_fraction << "% of cycles.\n"; + + std::cout << tool.output.str(); +} + +void +setup() +{ + if(int status = 0; + rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0) + { + ROCPROFILER_CALL(rocprofiler_force_configure(&rocprofiler_configure), + "force configuration"); + } +} + +// force configuration when library is loaded +bool cfg_on_load = (client::setup(), true); +} // namespace client + +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* id) +{ + // only activate if main tool + if(priority > 0) return nullptr; + + // set the client name + id->name = "Adv_Thread_Trace_Sample"; + + // store client info + client::client_id = id; + + // compute major/minor/patch version info + uint32_t major = version / 10000; + uint32_t minor = (version % 10000) / 100; + uint32_t patch = version % 100; + + // generate info string + auto info = std::stringstream{}; + info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch + << " (" << runtime_version << ")"; + + std::clog << info.str() << std::endl; + + auto* data = new client::ToolData{}; + + // create configure data + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &client::tool_init, + &client::tool_fini, + reinterpret_cast(data)}; + + // return pointer to configure data + return &cfg; +} diff --git a/samples/advanced_thread_trace/main.cpp b/samples/advanced_thread_trace/main.cpp new file mode 100644 index 0000000000..b9c60b3314 --- /dev/null +++ b/samples/advanced_thread_trace/main.cpp @@ -0,0 +1,245 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "hip/hip_runtime.h" + +#include +#include +#include +#include +#include +#include +#include +#include "transpose_kernels.hpp" + +#define PRINT_ALIGN 36 + +namespace +{ +using lock_guard_t = std::lock_guard; +auto print_lock = std::mutex{}; +} // namespace + +enum TransposeType +{ + TRANSPOSE_NAIVE, + TRANSPOSE_INPLACE_LDS, + TRANSPOSE_NO_BANK_CONFLICTS +}; + +class ITranspose +{ +public: + virtual void run(TransposeType ttype, int numThreadsY, int num_iter) = 0; + virtual ~ITranspose(){}; +}; + +template +class Transpose : public ITranspose +{ +public: + Transpose(int dev, size_t _M) + : devID(dev) + , M(_M) + , databytes(_M * _M * sizeof(T)) + { + HIP_API_CALL(hipSetDevice(devID)); + HIP_API_CALL(hipStreamCreate(&stream)); + + std::default_random_engine _engine{std::random_device{}() * rand()}; + std::uniform_int_distribution _dist{0, 1000}; + + inp_matrix = new T[M * M]; + out_matrix = new T[M * M]; + + for(size_t i = 0; i < M * M; i++) + inp_matrix[i] = static_cast(_dist(_engine)); + memset(out_matrix, 0, databytes); + + HIP_API_CALL(hipMalloc(&in, databytes)); + HIP_API_CALL(hipMalloc(&out, databytes)); + HIP_API_CALL(hipMemsetAsync(in, 0, databytes, stream)); + HIP_API_CALL(hipMemsetAsync(out, 0, databytes, stream)); + HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, databytes, hipMemcpyDefault, stream)); + + HIP_API_CALL(hipEventCreate(&start)); + HIP_API_CALL(hipEventCreate(&stop)); + } + + void run(TransposeType ttype, int numThreadsY, int num_iter) override + { + HIP_API_CALL(hipSetDevice(devID)); + dim3 grid(M / TILE_DIM, M / TILE_DIM, 1); + dim3 block(TILE_DIM, numThreadsY, 1); + + auto Kernel = transposeNaive; + std::string KernelName = "transposeNaive"; + if(ttype == TransposeType::TRANSPOSE_NO_BANK_CONFLICTS) + { + Kernel = transposeLdsNoBankConflicts; + KernelName = "transposeLdsNoBankConflicts"; + } + else if(ttype == TransposeType::TRANSPOSE_INPLACE_LDS) + { + Kernel = transposeLdsSwapInplace; + KernelName = "transposeLdsSwapInplace"; + } + + { + std::string functypeid = __PRETTY_FUNCTION__; + auto it_beg = functypeid.rfind("[T = "); + auto it_end = functypeid.rfind(']'); + + if(it_beg != std::string::npos) it_beg += std::string("[T = ").size(); + + if(it_beg < it_end && it_end != std::string::npos) + KernelName += '<' + functypeid.substr(it_beg, it_end - it_beg) + '>'; + } + + HIP_API_CALL(hipStreamSynchronize(stream)); + HIP_API_CALL(hipEventRecord(start, stream)); + + for(int i = 0; i < num_iter; i++) + { + Kernel<<>>(out, in, M); + HIP_API_CALL(hipGetLastError()); + } + + HIP_API_CALL(hipEventRecord(stop, stream)); + HIP_API_CALL(hipMemcpyAsync(out_matrix, out, databytes, hipMemcpyDefault, stream)); + HIP_API_CALL(hipEventSynchronize(stop)); + + float time; + HIP_API_CALL(hipEventElapsedTime(&time, start, stop)); + float GB = databytes * num_iter * 2 / float(1 << 30); + + { + lock_guard_t _lk{print_lock}; + std::cout << "The average performance of " << std::setw(38) << KernelName << " : " + << (1000 * GB / time) << " GB/s" << std::endl; + } + + verify(); + } + + void verify() const + { + HIP_API_CALL(hipStreamSynchronize(stream)); + for(int i = 0; i < 10; i++) + { + int row = rand() % M; + int col = rand() % M; + if(inp_matrix[row * M + col] != out_matrix[col * M + row]) + { + lock_guard_t _lk{print_lock}; + std::cout << "mismatch: " << row << ", " << col << " : " + << inp_matrix[row * M + col] << " | " << out_matrix[col * M + row] + << std::endl; + } + } + } + + virtual ~Transpose() + { + HIP_API_CALL(hipSetDevice(devID)); + HIP_API_CALL(hipEventDestroy(start)); + HIP_API_CALL(hipEventDestroy(stop)); + + HIP_API_CALL(hipFree(in)); + HIP_API_CALL(hipFree(out)); + HIP_API_CALL(hipStreamDestroy(stream)); + + delete[] inp_matrix; + delete[] out_matrix; + } + + const int devID; + const size_t M; + const size_t databytes; + + hipStream_t stream; + hipEvent_t start, stop; + + T* inp_matrix = nullptr; + T* out_matrix = nullptr; + + T* in = nullptr; + T* out = nullptr; +}; + +int +main(int argc, char** argv) +{ + int deviceId = 0; + int blockDimY = 8; + int num_iter = 1; + int mat_size = 8192; + + for(int i = 1; i < argc; ++i) + { + auto _arg = std::string{argv[i]}; + if(_arg == "?" || _arg == "-h" || _arg == "--help") + { + std::cout << "usage: transpose " + << "[MatrixSize (" << mat_size << ")] " + << "[numIter (" << num_iter << ")] " + << "[blockDimY (" << blockDimY << ")] " + << "[DEVICE_ID (" << deviceId << ")] " << std::endl; + exit(EXIT_SUCCESS); + } + } + if(argc > 1) mat_size = atoll(argv[1]); + if(argc > 2) num_iter = atoll(argv[2]); + if(argc > 3) blockDimY = atoll(argv[3]); + if(argc > 4) deviceId = atoll(argv[4]); + + printf("[transpose] Matrix size: %d, device ID: %d, num iter: %d, blockDimY: %d\n", + mat_size, + deviceId, + num_iter, + blockDimY); + + int ndevice = 0; + HIP_API_CALL(hipGetDeviceCount(&ndevice)); + printf("[transpose] Number of devices found: %i\n", ndevice); + assert(ndevice > 0); + + if(deviceId >= ndevice) exit(EXIT_FAILURE); + + { + std::vector> kernels; + kernels.push_back(std::make_unique>(deviceId, mat_size)); + kernels.push_back(std::make_unique>(deviceId, mat_size)); + kernels.push_back(std::make_unique>(deviceId, mat_size)); + + for(auto& kernel : kernels) + { + kernel->run(TransposeType::TRANSPOSE_NAIVE, blockDimY, num_iter); + kernel->run(TransposeType::TRANSPOSE_INPLACE_LDS, blockDimY, num_iter); + kernel->run(TransposeType::TRANSPOSE_NO_BANK_CONFLICTS, blockDimY, num_iter); + } + } + + HIP_API_CALL(hipDeviceSynchronize()); + + return 0; +} diff --git a/samples/advanced_thread_trace/transpose_kernels.hpp b/samples/advanced_thread_trace/transpose_kernels.hpp new file mode 100644 index 0000000000..bc813c56ae --- /dev/null +++ b/samples/advanced_thread_trace/transpose_kernels.hpp @@ -0,0 +1,84 @@ +#pragma once + +#include "hip/hip_runtime.h" + +#define HIP_API_CALL(CALL) \ + { \ + hipError_t error_ = (CALL); \ + if(error_ != hipSuccess) \ + { \ + lock_guard_t _hip_api_print_lk{print_lock}; \ + fprintf(stderr, \ + "%s:%d :: HIP error : %s\n", \ + __FILE__, \ + __LINE__, \ + hipGetErrorString(error_)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define TILE_DIM 64 + +template +__global__ void +transposeNaive(T* odata, const T* idata, size_t size) +{ + size_t idx = blockIdx.x * TILE_DIM + threadIdx.x; + size_t block_posy = blockIdx.y * TILE_DIM; + + for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y) + odata[size * idx + block_posy + idy] = idata[idx + (block_posy + idy) * size]; +} + +template +__global__ void +transposeLdsNoBankConflicts(T* odata, const T* idata, size_t size) +{ + __shared__ T tile[TILE_DIM][TILE_DIM + 1]; + + size_t idx_in = blockIdx.x * TILE_DIM + threadIdx.x; + size_t idy_in = blockIdx.y * TILE_DIM + threadIdx.y; + size_t index_in = idx_in + idy_in * size; + + size_t idx_out = blockIdx.y * TILE_DIM + threadIdx.x; + size_t idy_out = blockIdx.x * TILE_DIM + threadIdx.y; + size_t index_out = idx_out + idy_out * size; + + for(size_t y = 0; y < TILE_DIM; y += blockDim.y) + tile[threadIdx.y + y][threadIdx.x] = idata[index_in + y * size]; + + __syncthreads(); + + for(size_t y = 0; y < TILE_DIM; y += blockDim.y) + odata[index_out + y * size] = tile[threadIdx.x][threadIdx.y + y]; +} + +// Generates more interesting ISA +template +__global__ void +transposeLdsSwapInplace(T* odata, const T* idata, size_t size) +{ + __shared__ T tile[TILE_DIM][TILE_DIM]; + + const size_t idx_in = blockIdx.x * TILE_DIM + threadIdx.x; + + for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y) + tile[idy][threadIdx.x] = idata[idx_in + (idy + blockIdx.y * TILE_DIM) * size]; + + __syncthreads(); + + for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y) + if(idy < threadIdx.x) + { + T temp = tile[idy][threadIdx.x]; + tile[idy][threadIdx.x] = tile[threadIdx.x][idy]; + tile[threadIdx.x][idy] = temp; + } + + __syncthreads(); + + const size_t idx_out = blockIdx.y * TILE_DIM + threadIdx.x; + + for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y) + odata[(blockIdx.x * TILE_DIM + idy) * size + idx_out] = tile[idy][threadIdx.x]; +} diff --git a/samples/code_object_isa_decode/CMakeLists.txt b/samples/code_object_isa_decode/CMakeLists.txt new file mode 100644 index 0000000000..ffc4bae201 --- /dev/null +++ b/samples/code_object_isa_decode/CMakeLists.txt @@ -0,0 +1,52 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-sdk-samples-code-object-isa-decode LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +find_package(rocprofiler-sdk REQUIRED) +find_package(Threads REQUIRED) + +add_executable(code-object-isa-decode) +target_sources(code-object-isa-decode PRIVATE main.cpp client.cpp) +set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) +set_source_files_properties(main.cpp PROPERTIES COMPILE_FLAGS "-g") + +target_link_libraries( + code-object-isa-decode PRIVATE Threads::Threads rocprofiler::samples-common-library + rocprofiler-sdk-codeobj rocprofiler::rocprofiler) + +add_test(NAME code-object-isa-decode COMMAND $) + +set_tests_properties( + code-object-isa-decode + PROPERTIES + TIMEOUT + 45 + LABELS + "samples" + ENVIRONMENT + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$" + FAIL_REGULAR_EXPRESSION + "threw an exception") diff --git a/samples/code_object_isa_decode/client.cpp b/samples/code_object_isa_decode/client.cpp new file mode 100644 index 0000000000..53134bee0d --- /dev/null +++ b/samples/code_object_isa_decode/client.cpp @@ -0,0 +1,251 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// undefine NDEBUG so asserts are implemented +#ifdef NDEBUG +# undef NDEBUG +#endif + +/** + * @file samples/code_object_isa_decode/client.cpp + * + * @brief Example rocprofiler client (tool) + */ + +#include +#include +#include +#include +#include +#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp" + +#include "common/defines.hpp" +#include "common/filesystem.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "code_object_track.hpp" + +namespace client +{ +namespace +{ +using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t; +using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; +using kernel_symbol_map_t = std::unordered_map>; + +rocprofiler_client_id_t* client_id = nullptr; +rocprofiler_client_finalize_t client_fini_func = nullptr; +rocprofiler_context_id_t client_ctx = {}; +kernel_symbol_map_t registered_kernels = {}; + +CodeobjAddressTranslate codeobjTranslate; + +void +tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data) +{ + if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return; + if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) return; + + if(record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD) + { + auto* data = static_cast(record.payload); + + if(std::string_view(data->uri).find("file:///") == 0) + { + codeobjTranslate.addDecoder( + data->uri, data->code_object_id, data->load_base, data->load_size); + auto symbolmap = codeobjTranslate.getSymbolMap(); + for(auto& [vaddr, symbol] : symbolmap) + registered_kernels.insert({symbol.name, {vaddr, vaddr + symbol.mem_size}}); + } + } + else if(record.operation == + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) + { + std::cout << std::hex; + auto* data = static_cast(record.payload); + auto kernel_name = std::regex_replace(data->kernel_name, std::regex{"(\\.kd)$"}, ""); + + if(registered_kernels.find(kernel_name) == registered_kernels.end()) + { + std::cout << "Not Found: " << kernel_name << " in codeobj." << std::endl; + return; + } + + auto& begin_end = registered_kernels.at(kernel_name); + + std::cout << std::hex << "Found: " << kernel_name << " at addr: 0x" << begin_end.first + << std::dec << ". Printing first 64 bytes:" << std::endl; + + std::unordered_set references{}; + int num_waitcnts = 0; + int num_scalar = 0; + int num_vector = 0; + int num_other = 0; + + size_t vaddr = begin_end.first; + while(vaddr < begin_end.second) + { + auto inst = codeobjTranslate.get(vaddr); + std::string_view source = inst->comment; + if(source.rfind('/') < source.size()) source = source.substr(source.rfind('/')); + if(vaddr < begin_end.first + 64) std::cout << '\t' << inst->inst << '\n'; + + if(source.rfind(':') < source.size()) source = source.substr(0, source.rfind(':')); + + references.insert(std::string(source)); + if(inst->inst.find("v_") == 0) + num_vector++; + else if(inst->inst.find("s_waitcnt") == 0) + num_waitcnts++; + else if(inst->inst.find("s_") == 0) + num_scalar++; + else + num_other++; + + vaddr += inst->size; + } + + std::cout << " --- Num Scalar: " << num_scalar << "\n --- Num Vector: " << num_vector + << "\n --- Num Waitcnts: " << num_waitcnts + << "\n --- Other instructions: " << num_other + << "\nKernel has source references to: " << std::endl; + for(auto& ref : references) + std::cout << '\t' << ref << std::endl; + } + + (void) user_data; + (void) callback_data; +} + +int +tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) +{ + client_fini_func = fini_func; + + ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation"); + + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(client_ctx, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + nullptr, + 0, + tool_codeobj_tracing_callback, + tool_data), + "code object tracing service configure"); + + int valid_ctx = 0; + ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx), + "context validity check"); + if(valid_ctx == 0) + { + // notify rocprofiler that initialization failed + // and all the contexts, buffers, etc. created + // should be ignored + return -1; + } + + ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "context start"); + + // no errors + return 0; +} + +void +tool_fini(void* tool_data) +{ + (void) tool_data; +} + +void +setup() +{ + if(int status = 0; + rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0) + { + ROCPROFILER_CALL(rocprofiler_force_configure(&rocprofiler_configure), + "force configuration"); + } +} +} // namespace + +// force configuration when library is loaded +bool cfg_on_load = (client::setup(), true); +} // namespace client + +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* id) +{ + // only activate if main tool + if(priority > 0) return nullptr; + + // set the client name + id->name = "ExampleTool"; + + // store client info + client::client_id = id; + + // compute major/minor/patch version info + uint32_t major = version / 10000; + uint32_t minor = (version % 10000) / 100; + uint32_t patch = version % 100; + + // generate info string + auto info = std::stringstream{}; + info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch + << " (" << runtime_version << ")"; + + std::clog << info.str() << std::endl; + + // create configure data + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &client::tool_init, + &client::tool_fini, + nullptr}; + + // return pointer to configure data + return &cfg; +} diff --git a/samples/code_object_isa_decode/main.cpp b/samples/code_object_isa_decode/main.cpp new file mode 100644 index 0000000000..f72cf7a14c --- /dev/null +++ b/samples/code_object_isa_decode/main.cpp @@ -0,0 +1,246 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "hip/hip_runtime.h" + +#include +#include +#include +#include +#include +#include +#include +#include "transpose_kernels.hpp" + +#define PRINT_ALIGN 36 + +namespace +{ +using lock_guard_t = std::lock_guard; +auto print_lock = std::mutex{}; +} // namespace + +enum TransposeType +{ + TRANSPOSE_NAIVE, + TRANSPOSE_INPLACE_LDS, + TRANSPOSE_NO_BANK_CONFLICTS +}; + +class ITranspose +{ +public: + virtual void run(TransposeType ttype, int numThreadsY, int num_iter) = 0; + virtual ~ITranspose(){}; +}; + +template +class Transpose : public ITranspose +{ +public: + Transpose(int dev, size_t _M) + : devID(dev) + , M(_M) + , databytes(_M * _M * sizeof(T)) + { + HIP_API_CALL(hipSetDevice(devID)); + HIP_API_CALL(hipStreamCreate(&stream)); + + std::default_random_engine _engine{std::random_device{}() * rand()}; + std::uniform_int_distribution _dist{0, 1000}; + + inp_matrix = new T[M * M]; + out_matrix = new T[M * M]; + + for(size_t i = 0; i < M * M; i++) + inp_matrix[i] = static_cast(_dist(_engine)); + memset(out_matrix, 0, databytes); + + HIP_API_CALL(hipMalloc(&in, databytes)); + HIP_API_CALL(hipMalloc(&out, databytes)); + HIP_API_CALL(hipMemsetAsync(in, 0, databytes, stream)); + HIP_API_CALL(hipMemsetAsync(out, 0, databytes, stream)); + HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, databytes, hipMemcpyDefault, stream)); + + HIP_API_CALL(hipEventCreate(&start)); + HIP_API_CALL(hipEventCreate(&stop)); + } + + void run(TransposeType ttype, int numThreadsY, int num_iter) override + { + HIP_API_CALL(hipSetDevice(devID)); + dim3 grid(M / TILE_DIM, M / TILE_DIM, 1); + dim3 block(TILE_DIM, numThreadsY, 1); + + auto Kernel = transposeNaive; + std::string KernelName = "transposeNaive"; + if(ttype == TransposeType::TRANSPOSE_NO_BANK_CONFLICTS) + { + Kernel = transposeLdsNoBankConflicts; + KernelName = "transposeLdsNoBankConflicts"; + } + else if(ttype == TransposeType::TRANSPOSE_INPLACE_LDS) + { + Kernel = transposeLdsSwapInplace; + KernelName = "transposeLdsSwapInplace"; + } + + { + std::string functypeid = __PRETTY_FUNCTION__; + auto it_beg = functypeid.rfind("[T = "); + auto it_end = functypeid.rfind(']'); + + if(it_beg != std::string::npos) it_beg += std::string("[T = ").size(); + + if(it_beg < it_end && it_end != std::string::npos) + KernelName += '<' + functypeid.substr(it_beg, it_end - it_beg) + '>'; + } + + HIP_API_CALL(hipStreamSynchronize(stream)); + HIP_API_CALL(hipEventRecord(start, stream)); + + for(int i = 0; i < num_iter; i++) + { + Kernel<<>>(out, in, M); + HIP_API_CALL(hipGetLastError()); + } + + HIP_API_CALL(hipEventRecord(stop, stream)); + HIP_API_CALL(hipMemcpyAsync(out_matrix, out, databytes, hipMemcpyDefault, stream)); + HIP_API_CALL(hipEventSynchronize(stop)); + + float time; + HIP_API_CALL(hipEventElapsedTime(&time, start, stop)); + float GB = databytes * num_iter * 2 / float(1 << 30); + + { + lock_guard_t _lk{print_lock}; + std::cout << "The average performance of " << std::setw(38) << KernelName << " : " + << (1000 * GB / time) << " GB/s" << std::endl; + } + + verify(); + } + + void verify() const + { + HIP_API_CALL(hipStreamSynchronize(stream)); + for(int i = 0; i < 10; i++) + { + int row = rand() % M; + int col = rand() % M; + if(inp_matrix[row * M + col] != out_matrix[col * M + row]) + { + lock_guard_t _lk{print_lock}; + std::cout << "mismatch: " << row << ", " << col << " : " + << inp_matrix[row * M + col] << " | " << out_matrix[col * M + row] + << std::endl; + } + } + } + + virtual ~Transpose() + { + HIP_API_CALL(hipSetDevice(devID)); + HIP_API_CALL(hipEventDestroy(start)); + HIP_API_CALL(hipEventDestroy(stop)); + + HIP_API_CALL(hipFree(in)); + HIP_API_CALL(hipFree(out)); + HIP_API_CALL(hipStreamDestroy(stream)); + + delete[] inp_matrix; + delete[] out_matrix; + } + + const int devID; + const size_t M; + const size_t databytes; + + hipStream_t stream; + hipEvent_t start, stop; + + T* inp_matrix = nullptr; + T* out_matrix = nullptr; + + T* in = nullptr; + T* out = nullptr; +}; + +int +main(int argc, char** argv) +{ + int deviceId = 0; + int blockDimY = 8; + int num_iter = 3; + int mat_size = 8192; + + for(int i = 1; i < argc; ++i) + { + auto _arg = std::string{argv[i]}; + if(_arg == "?" || _arg == "-h" || _arg == "--help") + { + std::cout << "usage: transpose " + << "[MatrixSize (" << mat_size << ")] " + << "[numIter (" << num_iter << ")] " + << "[blockDimY (" << blockDimY << ")] " + << "[DEVICE_ID (" << deviceId << ")] " << std::endl; + exit(EXIT_SUCCESS); + } + } + if(argc > 1) mat_size = atoll(argv[1]); + if(argc > 2) num_iter = atoll(argv[2]); + if(argc > 3) blockDimY = atoll(argv[3]); + if(argc > 4) deviceId = atoll(argv[4]); + + printf("[transpose] Matrix size: %d, device ID: %d, num iter: %d, blockDimY: %d\n", + mat_size, + deviceId, + num_iter, + blockDimY); + + int ndevice = 0; + HIP_API_CALL(hipGetDeviceCount(&ndevice)); + printf("[transpose] Number of devices found: %i\n", ndevice); + assert(ndevice > 0); + + if(deviceId >= ndevice) exit(EXIT_FAILURE); + + { + std::vector> kernels; + kernels.push_back(std::make_unique>(deviceId, mat_size)); + kernels.push_back(std::make_unique>(deviceId, mat_size)); + kernels.push_back(std::make_unique>(deviceId, mat_size)); + + for(auto& kernel : kernels) + { + kernel->run(TransposeType::TRANSPOSE_NAIVE, blockDimY, num_iter); + kernel->run(TransposeType::TRANSPOSE_INPLACE_LDS, blockDimY, num_iter); + kernel->run(TransposeType::TRANSPOSE_NO_BANK_CONFLICTS, blockDimY, num_iter); + } + } + + HIP_API_CALL(hipDeviceSynchronize()); + HIP_API_CALL(hipDeviceReset()); + + return 0; +} diff --git a/samples/code_object_isa_decode/transpose_kernels.hpp b/samples/code_object_isa_decode/transpose_kernels.hpp new file mode 100644 index 0000000000..bc813c56ae --- /dev/null +++ b/samples/code_object_isa_decode/transpose_kernels.hpp @@ -0,0 +1,84 @@ +#pragma once + +#include "hip/hip_runtime.h" + +#define HIP_API_CALL(CALL) \ + { \ + hipError_t error_ = (CALL); \ + if(error_ != hipSuccess) \ + { \ + lock_guard_t _hip_api_print_lk{print_lock}; \ + fprintf(stderr, \ + "%s:%d :: HIP error : %s\n", \ + __FILE__, \ + __LINE__, \ + hipGetErrorString(error_)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define TILE_DIM 64 + +template +__global__ void +transposeNaive(T* odata, const T* idata, size_t size) +{ + size_t idx = blockIdx.x * TILE_DIM + threadIdx.x; + size_t block_posy = blockIdx.y * TILE_DIM; + + for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y) + odata[size * idx + block_posy + idy] = idata[idx + (block_posy + idy) * size]; +} + +template +__global__ void +transposeLdsNoBankConflicts(T* odata, const T* idata, size_t size) +{ + __shared__ T tile[TILE_DIM][TILE_DIM + 1]; + + size_t idx_in = blockIdx.x * TILE_DIM + threadIdx.x; + size_t idy_in = blockIdx.y * TILE_DIM + threadIdx.y; + size_t index_in = idx_in + idy_in * size; + + size_t idx_out = blockIdx.y * TILE_DIM + threadIdx.x; + size_t idy_out = blockIdx.x * TILE_DIM + threadIdx.y; + size_t index_out = idx_out + idy_out * size; + + for(size_t y = 0; y < TILE_DIM; y += blockDim.y) + tile[threadIdx.y + y][threadIdx.x] = idata[index_in + y * size]; + + __syncthreads(); + + for(size_t y = 0; y < TILE_DIM; y += blockDim.y) + odata[index_out + y * size] = tile[threadIdx.x][threadIdx.y + y]; +} + +// Generates more interesting ISA +template +__global__ void +transposeLdsSwapInplace(T* odata, const T* idata, size_t size) +{ + __shared__ T tile[TILE_DIM][TILE_DIM]; + + const size_t idx_in = blockIdx.x * TILE_DIM + threadIdx.x; + + for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y) + tile[idy][threadIdx.x] = idata[idx_in + (idy + blockIdx.y * TILE_DIM) * size]; + + __syncthreads(); + + for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y) + if(idy < threadIdx.x) + { + T temp = tile[idy][threadIdx.x]; + tile[idy][threadIdx.x] = tile[threadIdx.x][idy]; + tile[threadIdx.x][idy] = temp; + } + + __syncthreads(); + + const size_t idx_out = blockIdx.y * TILE_DIM + threadIdx.x; + + for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y) + odata[(blockIdx.x * TILE_DIM + idy) * size + idx_out] = tile[idy][threadIdx.x]; +} diff --git a/source/include/rocprofiler-sdk/CMakeLists.txt b/source/include/rocprofiler-sdk/CMakeLists.txt index 7a65469918..fcf3c53bc4 100644 --- a/source/include/rocprofiler-sdk/CMakeLists.txt +++ b/source/include/rocprofiler-sdk/CMakeLists.txt @@ -30,6 +30,7 @@ set(ROCPROFILER_HEADER_FILES profile_config.h registration.h spm.h + thread_trace.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) install( diff --git a/source/include/rocprofiler-sdk/rocprofiler.h b/source/include/rocprofiler-sdk/rocprofiler.h index 096d488b64..dd27304c9f 100644 --- a/source/include/rocprofiler-sdk/rocprofiler.h +++ b/source/include/rocprofiler-sdk/rocprofiler.h @@ -80,6 +80,7 @@ ROCPROFILER_EXTERN_C_FINI // #include "rocprofiler-sdk/marker.h" #include "rocprofiler-sdk/pc_sampling.h" #include "rocprofiler-sdk/profile_config.h" +#include "rocprofiler-sdk/thread_trace.h" // #include "rocprofiler-sdk/spm.h" ROCPROFILER_EXTERN_C_INIT diff --git a/source/include/rocprofiler-sdk/thread_trace.h b/source/include/rocprofiler-sdk/thread_trace.h new file mode 100644 index 0000000000..1cf9dd66ce --- /dev/null +++ b/source/include/rocprofiler-sdk/thread_trace.h @@ -0,0 +1,108 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include +#include + +ROCPROFILER_EXTERN_C_INIT + +/** + * @defgroup THREAD_TRACE Thread Trace Service + * @brief Provides API calls to enable and handle thread trace data + * + * @{ + */ + +typedef void (*rocprofiler_att_data_types_callback_t)(int64_t data_type_id, + const char* data_type_name, + void* userdata); + +rocprofiler_status_t +rocprofiler_att_iterate_data_types(rocprofiler_att_data_types_callback_t callback, + void* userdata) ROCPROFILER_API; + +typedef union +{ + struct + { + uint32_t enable_async_queue : 1; + uint32_t enable_occupancy_mode : 1; + uint32_t enable_double_buffering : 1; + uint32_t disable_att_markers : 1; + uint32_t disable_software_header : 1; + }; + uint32_t raw; +} rocprofiler_att_parameter_flag_t; + +typedef struct +{ + rocprofiler_att_parameter_flag_t flags; + int shader_num; + int* shader_ids; + uint64_t buffer_size; + uint8_t target_cu; + uint8_t simd_select; + uint8_t reserved; + uint8_t vmid_mask; + uint16_t perfcounter_mask; + uint8_t perfcounter_ctrl; + uint8_t perfcounter_num; + const char** perfcounter; +} rocprofiler_att_parameters_t; + +typedef enum +{ + ROCPROFILER_ATT_CONTROL_NONE = 0, + ROCPROFILER_ATT_CONTROL_START = 1, + ROCPROFILER_ATT_CONTROL_STOP = 2, + ROCPROFILER_ATT_CONTROL_START_AND_STOP = 3 +} rocprofiler_att_control_flags_t; + +typedef rocprofiler_att_control_flags_t (*rocprofiler_att_dispatch_callback_t)( + rocprofiler_queue_id_t queue_id, + const rocprofiler_agent_t* agent, + rocprofiler_correlation_id_t correlation_id, + const hsa_kernel_dispatch_packet_t* dispatch_packet, + uint64_t kernel_id, + void* userdata); + +typedef void (*rocprofiler_att_shader_data_callback_t)(int64_t shader_engine_id, + int64_t data_type_id, + const char* data_type_name, + void* data, + size_t data_size, + void* userdata); + +rocprofiler_status_t +rocprofiler_configure_thread_trace_service(rocprofiler_context_id_t context_id, + rocprofiler_att_parameters_t parameters, + rocprofiler_att_dispatch_callback_t dispatch_callback, + rocprofiler_att_shader_data_callback_t shader_callback, + void* callback_userdata) ROCPROFILER_API; + +/** @} */ + +ROCPROFILER_EXTERN_C_FINI diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index 59dbcf99a3..d69bab7869 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -9,6 +9,8 @@ set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "tools") add_subdirectory(rocprofiler-sdk-tool) add_subdirectory(rocprofiler-sdk-roctx) +add_subdirectory(rocprofiler-sdk-codeobj) + if(ROCPROFILER_BUILD_TESTS) add_subdirectory(tests) endif() diff --git a/source/lib/rocprofiler-sdk-codeobj/CMakeLists.txt b/source/lib/rocprofiler-sdk-codeobj/CMakeLists.txt new file mode 100644 index 0000000000..3ccfd8cbc9 --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/CMakeLists.txt @@ -0,0 +1,58 @@ +# ############################################################################## +# # Copyright (c) 2024 Advanced Micro Devices, Inc. # # Permission is hereby +# granted, free of charge, to any person obtaining a copy # of this software and +# associated documentation files (the "Software"), to # deal in the Software +# without restriction, including without limitation the # rights to use, copy, +# modify, merge, publish, distribute, sublicense, and/or # sell copies of the +# Software, and to permit persons to whom the Software is # furnished to do so, +# subject to the following conditions: # # The above copyright notice and this +# permission notice shall be included in # all copies or substantial portions of +# the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO +# EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE +# USE OR OTHER DEALINGS # IN THE SOFTWARE. +# ############################################################################## + +set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "codeobj") +find_package(amd_comgr REQUIRED) + +set(LIB_CODEOBJ_PARSER_SOURCES code_object_track.cpp code_printing.cpp disassembly.cpp) +set(LIB_CODEOBJ_PARSER_HEADERS code_object_track.hpp code_printing.hpp disassembly.hpp + segment.hpp) + +add_library(rocprofiler-sdk-codeobj STATIC) +target_sources(rocprofiler-sdk-codeobj PRIVATE ${LIB_CODEOBJ_PARSER_SOURCES}) +target_link_libraries( + rocprofiler-sdk-codeobj + PRIVATE amd_comgr dw elf rocprofiler::rocprofiler-build-flags + rocprofiler::rocprofiler-memcheck rocprofiler::rocprofiler-common-library) + +target_include_directories(rocprofiler-sdk-codeobj PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +if(ROCPROFILER_BUILD_CODECOV) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -coverage") + target_link_libraries(rocprofiler-sdk-codeobj PUBLIC gcov) +endif() + +set_target_properties( + rocprofiler-sdk-codeobj + PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}/rocprofiler-sdk-codeobj + SOVERSION ${PROJECT_VERSION_MAJOR} + VERSION ${PROJECT_VERSION} + BUILD_RPATH "\$ORIGIN:\$ORIGIN/.." + INSTALL_RPATH "\$ORIGIN:\$ORIGIN/..") + +install( + TARGETS rocprofiler-sdk-codeobj + DESTINATION ${CMAKE_INSTALL_LIBDIR}/rocprofiler-sdk-codeobj + COMPONENT tools + EXPORT rocprofiler-sdk-codeobj-targets) + +install( + FILES ${LIB_CODEOBJ_PARSER_HEADERS} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk-codeobj + COMPONENT tools) diff --git a/source/lib/rocprofiler-sdk-codeobj/LICENSE b/source/lib/rocprofiler-sdk-codeobj/LICENSE new file mode 100644 index 0000000000..b69c259b1a --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 AMD ROCmâ„¢ Software + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/source/lib/rocprofiler-sdk-codeobj/README.md b/source/lib/rocprofiler-sdk-codeobj/README.md new file mode 100644 index 0000000000..bda6c9da14 --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/README.md @@ -0,0 +1 @@ +# rocprofiler-codeobj-parser \ No newline at end of file diff --git a/source/lib/rocprofiler-sdk-codeobj/code_object_track.cpp b/source/lib/rocprofiler-sdk-codeobj/code_object_track.cpp new file mode 100644 index 0000000000..5ce74d62f8 --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/code_object_track.cpp @@ -0,0 +1,196 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "code_object_track.hpp" + +void +CodeobjRecorder::Load(uint64_t addr, + uint64_t load_size, + const std::string& URI, + uint64_t mem_addr, + uint64_t mem_size, + uint64_t id) +{ + Load(std::make_shared( + addr, load_size, URI, mem_addr, mem_size, id, capture_mode)); +} + +void +CodeobjCaptureInstance::copyCodeobjFromFile(uint64_t offset, uint64_t size, const std::string& path) +{ + std::ifstream file(path, std::ios::in | std::ios::binary); + if(!file) + { + printf("could not open `%s'\n", path.c_str()); + return; + } + + if(!size) + { + file.ignore(std::numeric_limits::max()); + size_t bytes = file.gcount(); + file.clear(); + + if(bytes < offset) + { + printf("invalid uri `%s' (file size < offset)\n", path.c_str()); + return; + } + size = bytes - offset; + } + + file.seekg(offset, std::ios_base::beg); + buffer.resize(size); + file.read(&buffer[0], size); +} + +void CodeobjCaptureInstance::copyCodeobjFromMemory(uint64_t, uint64_t) +{ + // buffer.resize(mem_size); + // std::memcpy(buffer.data(), (uint64_t*)mem_addr, mem_size); +} + +std::pair +CodeobjCaptureInstance::parse_uri() +{ + const std::string protocol_delim{"://"}; + + size_t protocol_end = URI.find(protocol_delim); + protocol = URI.substr(0, protocol_end); + protocol_end += protocol_delim.length(); + + std::transform(protocol.begin(), protocol.end(), protocol.begin(), [](unsigned char c) { + return std::tolower(c); + }); + + std::string path; + size_t path_end = URI.find_first_of("#?", protocol_end); + if(path_end != std::string::npos) + { + path = URI.substr(protocol_end, path_end++ - protocol_end); + } + else + { + path = URI.substr(protocol_end); + } + + /* %-decode the string. */ + decoded_path = std::string{}; + decoded_path.reserve(path.length()); + for(size_t i = 0; i < path.length(); ++i) + { + if(path[i] == '%' && std::isxdigit(path[i + 1]) && std::isxdigit(path[i + 2])) + { + decoded_path += std::stoi(path.substr(i + 1, 2), 0, 16); + i += 2; + } + else + { + decoded_path += path[i]; + } + } + + /* Tokenize the query/fragment. */ + std::vector tokens; + size_t pos, last = path_end; + while((pos = URI.find('&', last)) != std::string::npos) + { + tokens.emplace_back(URI.substr(last, pos - last)); + last = pos + 1; + } + if(last != std::string::npos) tokens.emplace_back(URI.substr(last)); + + /* Create a tag-value map from the tokenized query/fragment. */ + std::unordered_map params; + std::for_each(tokens.begin(), tokens.end(), [&](std::string& token) { + size_t delim = token.find('='); + if(delim != std::string::npos) + { + params.emplace(token.substr(0, delim), token.substr(delim + 1)); + } + }); + + size_t offset = 0; + size_t size = 0; + + if(auto offset_it = params.find("offset"); offset_it != params.end()) + offset = std::stoul(offset_it->second, nullptr, 0); + + if(auto size_it = params.find("size"); size_it != params.end()) + { + if(!(size = std::stoul(size_it->second, nullptr, 0))) throw std::exception(); + } + + return {offset, size}; +} + +void +CodeobjCaptureInstance::reset(codeobj_capture_mode_t mode) +{ + if(static_cast(mode) <= static_cast(capture_mode)) return; + + capture_mode = mode; + if(!buffer.empty()) return; + + size_t offset, size; + try + { + std::tie(offset, size) = parse_uri(); + } catch(...) + { + std::cerr << "Error parsing URI " << URI << std::endl; + return; + } + + if(protocol == "file") + { + if(mode == ROCPROFILER_CODEOBJ_CAPTURE_COPY_FILE_AND_MEMORY) + copyCodeobjFromFile(offset, size, decoded_path); + } + else if(protocol == "memory") + { + if(mode != ROCPROFILER_CODEOBJ_CAPTURE_SYMBOLS_ONLY) + copyCodeobjFromMemory(mem_addr, mem_size); + } + else + { + printf("\"%s\" protocol not supported\n", protocol.c_str()); + } +} diff --git a/source/lib/rocprofiler-sdk-codeobj/code_object_track.hpp b/source/lib/rocprofiler-sdk-codeobj/code_object_track.hpp new file mode 100644 index 0000000000..f279b004b4 --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/code_object_track.hpp @@ -0,0 +1,158 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include +#include +#include +#include + +/** + * Enum defines how code object is captured for ATT and PC Sampling + */ +enum codeobj_capture_mode_t +{ + /** + * Capture file and memory paths for the loaded code object + */ + ROCPROFILER_CODEOBJ_CAPTURE_SYMBOLS_ONLY = 0, + /** + * Capture symbols for file:// and memory:// type objects, + * and generate a copy of all kernel code for objects under memory:// + */ + ROCPROFILER_CODEOBJ_CAPTURE_COPY_MEMORY = 1, + /** + * Capture symbols and all kernel code for file:// and memory:// type objects + */ + ROCPROFILER_CODEOBJ_CAPTURE_COPY_FILE_AND_MEMORY = 2, + ROCPROFILER_CODEOBJ_CAPTURE_LAST = 3, +}; + +/** + * A class to keep track of currently loaded code objects. + * Only the public static methods are thread-safe and expected to be used. + */ +class CodeobjCaptureInstance +{ +public: + CodeobjCaptureInstance(uint64_t _addr, + uint64_t _load_size, + const std::string& _uri, + uint64_t _mem_addr, + uint64_t _mem_size, + uint64_t id, + codeobj_capture_mode_t mode) + : addr(_addr) + , load_size(_load_size) + , load_id(id) + , URI(_uri) + , mem_addr(_mem_addr) + , mem_size(_mem_size) + { + reset(mode); + }; + + const uint64_t addr; + const uint64_t load_size; + const uint64_t load_id; + +private: + void reset(codeobj_capture_mode_t mode); + + std::pair parse_uri(); + void DecodePath(); + void copyCodeobjFromFile(uint64_t offset, uint64_t size, const std::string& path); + void copyCodeobjFromMemory(uint64_t, uint64_t); + + std::string URI{}; + std::string decoded_path{}; + std::string protocol{}; + std::vector buffer{}; + + uint64_t mem_addr = 0; + uint64_t mem_size = 0; + codeobj_capture_mode_t capture_mode = ROCPROFILER_CODEOBJ_CAPTURE_SYMBOLS_ONLY; +}; + +typedef std::shared_ptr CodeobjPtr; + +template <> +struct std::hash +{ + uint64_t operator()(const CodeobjPtr& p) const { return p->load_id; } +}; + +template <> +struct std::equal_to +{ + bool operator()(const CodeobjPtr& a, const CodeobjPtr& b) const + { + return (a->addr == b->addr) && (a->load_id == b->load_id); + }; +}; + +/** + * A class to keep track of the history of loaded code objets. + * Only the public static methods are thread-safe and expected to be used. + */ +class CodeobjRecorder +{ +public: + CodeobjRecorder(codeobj_capture_mode_t mode) + : capture_mode(mode){}; + + void Load(uint64_t _addr, + uint64_t _load_size, + const std::string& _uri, + uint64_t mem_addr, + uint64_t mem_size, + uint64_t id); + void Load(CodeobjPtr capture) + { + std::lock_guard lk(mutex); + captures[capture->load_id] = capture; + } + void Unload(uint64_t id) + { + std::lock_guard lk(mutex); + captures.erase(id); + }; + +public: + std::shared_mutex mutex; + + std::vector get() + { + std::vector vec; + std::shared_lock lk(mutex); + for(auto& [k, v] : captures) + vec.push_back(v); + return vec; + }; + +private: + codeobj_capture_mode_t capture_mode; + std::unordered_map captures; +}; diff --git a/source/lib/rocprofiler-sdk-codeobj/code_printing.cpp b/source/lib/rocprofiler-sdk-codeobj/code_printing.cpp new file mode 100644 index 0000000000..cdda075ece --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/code_printing.cpp @@ -0,0 +1,278 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define C_API_BEGIN \ + try \ + { +#define C_API_END(returndata) \ + } \ + catch(std::exception & e) \ + { \ + std::string s = e.what(); \ + if(s.find("memory protocol not supported!") == std::string::npos) \ + std::cerr << "Codeobj API lookup: " << e.what() << std::endl; \ + return returndata; \ + } \ + catch(std::string & s) \ + { \ + if(s.find("memory protocol not supported!") == std::string::npos) \ + std::cerr << "Codeobj API lookup: " << s << std::endl; \ + return returndata; \ + } \ + catch(...) { return returndata; } + +CodeobjDecoderComponent::CodeobjDecoderComponent(const char* codeobj_data, uint64_t codeobj_size) +{ + m_fd = -1; +#if defined(_GNU_SOURCE) && defined(MFD_ALLOW_SEALING) && defined(MFD_CLOEXEC) + m_fd = ::memfd_create(m_uri.c_str(), MFD_ALLOW_SEALING | MFD_CLOEXEC); +#endif + if(m_fd == -1) // If fail, attempt under /tmp + m_fd = ::open("/tmp", O_TMPFILE | O_RDWR, 0666); + + if(m_fd == -1) + { + printf("could not create a temporary file for code object\n"); + return; + } + + if(size_t size = ::write(m_fd, codeobj_data, codeobj_size); size != codeobj_size) + { + printf("could not write to the temporary file\n"); + return; + } + ::lseek(m_fd, 0, SEEK_SET); + fsync(m_fd); + + m_line_number_map = {}; + + std::unique_ptr dbg(dwarf_begin(m_fd, DWARF_C_READ), + [](Dwarf* _dbg) { dwarf_end(_dbg); }); + + /*if (!dbg) { + rocprofiler::warning("Error opening Dwarf!\n"); + return; + } */ + + if(dbg) + { + Dwarf_Off cu_offset{0}, next_offset; + size_t header_size; + + std::unordered_set used_addrs; + + while(!dwarf_nextcu( + dbg.get(), cu_offset, &next_offset, &header_size, nullptr, nullptr, nullptr)) + { + Dwarf_Die die; + if(!dwarf_offdie(dbg.get(), cu_offset + header_size, &die)) continue; + + Dwarf_Lines* lines; + size_t line_count; + if(dwarf_getsrclines(&die, &lines, &line_count)) continue; + + for(size_t i = 0; i < line_count; ++i) + { + Dwarf_Addr addr; + int line_number; + Dwarf_Line* line = dwarf_onesrcline(lines, i); + + if(line && !dwarf_lineaddr(line, &addr) && !dwarf_lineno(line, &line_number) && + line_number) + { + std::string src = dwarf_linesrc(line, nullptr, nullptr); + auto dwarf_line = src + ':' + std::to_string(line_number); + + if(used_addrs.find(addr) != used_addrs.end()) + { + size_t pos = m_line_number_map.lower_bound(addr); + m_line_number_map.data()[pos].str += ' ' + dwarf_line; + continue; + } + + used_addrs.insert(addr); + m_line_number_map.insert(DSourceLine{addr, 0, std::move(dwarf_line)}); + } + } + cu_offset = next_offset; + } + } + + // Can throw + disassembly = std::make_unique(codeobj_data, codeobj_size); + if(m_line_number_map.size()) + { + size_t total_size = 0; + for(size_t i = 0; i < m_line_number_map.size() - 1; i++) + { + size_t s = m_line_number_map.get(i + 1).vaddr - m_line_number_map.get(i).vaddr; + m_line_number_map.data()[i].size = s; + total_size += s; + } + m_line_number_map.back().size = std::max(total_size, codeobj_size) - total_size; + } + try + { + m_symbol_map = disassembly->GetKernelMap(); // Can throw + } catch(...) + {} + + // disassemble_kernels(); +} + +CodeobjDecoderComponent::~CodeobjDecoderComponent() +{ + if(m_fd) ::close(m_fd); +} + +std::shared_ptr +CodeobjDecoderComponent::disassemble_instruction(uint64_t faddr, uint64_t vaddr) +{ + if(!disassembly) throw std::exception(); + + const char* cpp_line = nullptr; + + try + { + const DSourceLine& it = m_line_number_map.find_obj(vaddr); + cpp_line = it.str.data(); + } catch(...) + {} + + auto pair = disassembly->ReadInstruction(faddr); + auto inst = std::make_shared(std::move(pair.first), pair.second); + inst->faddr = faddr; + inst->vaddr = vaddr; + + if(cpp_line) inst->comment = cpp_line; + return inst; +} + +LoadedCodeobjDecoder::LoadedCodeobjDecoder(const char* filepath, + uint64_t _load_addr, + uint64_t mem_size) +: load_addr(_load_addr) +, load_end(load_addr + mem_size) +{ + if(!filepath) throw "Empty filepath."; + + std::string_view fpath(filepath); + + if(fpath.rfind(".out") + 4 == fpath.size()) + { + std::ifstream file(filepath, std::ios::in | std::ios::binary); + + if(!file.is_open()) throw "Invalid filename " + std::string(filepath); + + std::vector buffer; + file.seekg(0, file.end); + buffer.resize(file.tellg()); + file.seekg(0, file.beg); + file.read(buffer.data(), buffer.size()); + + decoder = std::make_unique(buffer.data(), buffer.size()); + } + else + { + std::unique_ptr binary = std::make_unique(filepath); + auto& buffer = binary->buffer; + decoder = std::make_unique(buffer.data(), buffer.size()); + } + + elf_segments = decoder->disassembly->getSegments(); +} + +LoadedCodeobjDecoder::LoadedCodeobjDecoder(const void* data, + size_t size, + uint64_t _load_addr, + uint64_t mem_size) +: load_addr(_load_addr) +, load_end(load_addr + mem_size) +{ + decoder = std::make_unique(reinterpret_cast(data), size); + elf_segments = decoder->disassembly->getSegments(); +} + +std::shared_ptr +LoadedCodeobjDecoder::add_to_map(uint64_t ld_addr) +{ + if(!decoder || ld_addr < load_addr) throw std::out_of_range("Addr not in decoder"); + + uint64_t voffset = ld_addr - load_addr; + auto faddr = decoder->disassembly->va2fo(voffset); + if(!faddr) throw std::out_of_range("Could not find file offset"); + + auto shared = decoder->disassemble_instruction(*faddr, voffset); + shared->ld_addr = ld_addr; + decoded_map[ld_addr] = shared; + return shared; +} + +std::shared_ptr +LoadedCodeobjDecoder::get(uint64_t addr) +{ + if(decoded_map.find(addr) != decoded_map.end()) return decoded_map[addr]; + + try + { + return add_to_map(addr); + } catch(std::exception& e) + { + std::cerr << e.what() << " at addr " << std::hex << addr << std::dec << std::endl; + } + throw std::out_of_range("Invalid address"); + return nullptr; +} + +#define PUBLIC_API __attribute__((visibility("default"))) diff --git a/source/lib/rocprofiler-sdk-codeobj/code_printing.hpp b/source/lib/rocprofiler-sdk-codeobj/code_printing.hpp new file mode 100644 index 0000000000..8183a182aa --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/code_printing.hpp @@ -0,0 +1,257 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "disassembly.hpp" +#include "segment.hpp" + +struct Instruction +{ + Instruction() = default; + Instruction(std::string&& _inst, size_t _size) + : inst(std::move(_inst)) + , size(_size) + {} + std::string inst; + std::string comment; + uint64_t faddr; + uint64_t vaddr; + uint64_t ld_addr; + size_t size; +}; + +struct DSourceLine +{ + uint64_t vaddr; + uint64_t size; + std::string str; + uint64_t begin() const { return vaddr; } + bool inrange(uint64_t addr) const { return addr >= vaddr && addr < vaddr + size; } +}; + +class CodeobjDecoderComponent +{ +public: + CodeobjDecoderComponent(const char* codeobj_data, uint64_t codeobj_size); + ~CodeobjDecoderComponent(); + + std::shared_ptr disassemble_instruction(uint64_t faddr, uint64_t vaddr); + int m_fd; + + cached_ordered_vector m_line_number_map; + std::map m_symbol_map{}; + + std::string m_uri; + std::vector> instructions{}; + std::unique_ptr disassembly{}; +}; + +class LoadedCodeobjDecoder +{ +public: + LoadedCodeobjDecoder(const char* filepath, uint64_t load_addr, uint64_t memsize); + LoadedCodeobjDecoder(const void* data, uint64_t size, uint64_t load_addr, size_t memsize); + std::shared_ptr add_to_map(uint64_t ld_addr); + + std::shared_ptr get(uint64_t addr); + uint64_t begin() const { return load_addr; }; + uint64_t end() const { return load_end; } + uint64_t size() const { return load_end - load_addr; } + bool inrange(uint64_t addr) const { return addr >= begin() && addr < end(); } + + const char* getSymbolName(uint64_t addr) const + { + if(!decoder) return nullptr; + + auto it = decoder->m_symbol_map.find(addr - load_addr); + if(it != decoder->m_symbol_map.end()) return it->second.name.data(); + + return nullptr; + } + + std::map& getSymbolMap() const + { + if(!decoder) throw std::exception(); + return decoder->m_symbol_map; + } + std::vector> elf_segments{}; + const uint64_t load_addr; + +private: + uint64_t load_end = 0; + + std::unordered_map> decoded_map; + std::unique_ptr decoder{nullptr}; +}; + +/** + * @brief Maps ID and offsets into instructions + */ +class CodeobjMap +{ +public: + CodeobjMap() = default; + + virtual void addDecoder(const char* filepath, + codeobj_marker_id_t id, + uint64_t load_addr, + uint64_t memsize) + { + decoders[id] = std::make_shared(filepath, load_addr, memsize); + } + + virtual void addDecoder(const void* data, + size_t memory_size, + codeobj_marker_id_t id, + uint64_t load_addr, + uint64_t memsize) + { + decoders[id] = + std::make_shared(data, memory_size, load_addr, memsize); + } + + virtual bool removeDecoderbyId(codeobj_marker_id_t id) { return decoders.erase(id) != 0; } + + std::shared_ptr get(codeobj_marker_id_t id, uint64_t offset) + { + auto& decoder = decoders.at(id); + return decoder->get(decoder->begin() + offset); + } + + const char* getSymbolName(codeobj_marker_id_t id, uint64_t offset) + { + auto& decoder = decoders.at(id); + uint64_t vaddr = decoder->begin() + offset; + if(decoder->inrange(vaddr)) return decoder->getSymbolName(vaddr); + return nullptr; + } + +protected: + std::unordered_map> decoders{}; +}; + +/** + * @brief Translates virtual addresses to elf file offsets + */ +class CodeobjAddressTranslate : protected CodeobjMap +{ + using Super = CodeobjMap; + +public: + CodeobjAddressTranslate() = default; + + virtual void addDecoder(const char* filepath, + codeobj_marker_id_t id, + uint64_t load_addr, + uint64_t memsize) override + { + this->Super::addDecoder(filepath, id, load_addr, memsize); + auto ptr = decoders.at(id); + table.insert({ptr->begin(), ptr->size(), id, 0}); + } + + virtual void addDecoder(const void* data, + size_t memory_size, + codeobj_marker_id_t id, + uint64_t load_addr, + uint64_t memsize) override + { + this->Super::addDecoder(data, memory_size, id, load_addr, memsize); + auto ptr = decoders.at(id); + table.insert({ptr->begin(), ptr->size(), id, 0}); + } + + virtual bool removeDecoder(codeobj_marker_id_t id, uint64_t load_addr) + { + return table.remove(load_addr) && this->Super::removeDecoderbyId(id); + } + + std::shared_ptr get(uint64_t vaddr) + { + auto& addr_range = table.find_codeobj_in_range(vaddr); + return this->Super::get(addr_range.id, vaddr - addr_range.vbegin); + } + + std::shared_ptr get(codeobj_marker_id_t id, uint64_t offset) + { + if(id == 0) + return get(offset); + else + return this->Super::get(id, offset); + } + + const char* getSymbolName(uint64_t vaddr) + { + for(auto& [_, decoder] : decoders) + { + if(!decoder->inrange(vaddr)) continue; + return decoder->getSymbolName(vaddr); + } + return nullptr; + } + + void getSymbolMap(const std::shared_ptr& dec, + std::unordered_map& symbols) const + { + try + { + auto& smap = dec->getSymbolMap(); + for(auto& [vaddr, sym] : smap) + symbols[vaddr + dec->load_addr] = sym; + } catch(std::exception& e) + { + return; + }; + } + + std::unordered_map getSymbolMap() const + { + std::unordered_map symbols; + + for(auto& [_, dec] : decoders) + this->getSymbolMap(dec, symbols); + + return symbols; + } + + std::unordered_map getSymbolMap(codeobj_marker_id_t id) const + { + std::unordered_map symbols; + + auto it = decoders.find(id); + if(it == decoders.end()) return symbols; + + this->getSymbolMap(it->second, symbols); + return symbols; + } + +private: + CodeobjTableTranslator table; +}; diff --git a/source/lib/rocprofiler-sdk-codeobj/disassembly.cpp b/source/lib/rocprofiler-sdk-codeobj/disassembly.cpp new file mode 100644 index 0000000000..a184c72127 --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/disassembly.cpp @@ -0,0 +1,372 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#if !defined(_GNU_SOURCE) || !defined(_XOPEN_SOURCE) +# define _XOPEN_SOURCE 700 +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp" + +#define THROW_COMGR(call) \ + if(amd_comgr_status_s status = call) \ + { \ + const char* reason = ""; \ + amd_comgr_status_string(status, &reason); \ + std::cerr << __FILE__ << ':' << __LINE__ << " code: " << status << " failed: " << reason \ + << std::endl; \ + throw std::exception(); \ + } + +#define RETURN_COMGR(call) \ + if(amd_comgr_status_s status = call) \ + { \ + const char* reason = ""; \ + amd_comgr_status_string(status, &reason); \ + std::cerr << __FILE__ << ':' << __LINE__ << " code: " << status << " failed: " << reason \ + << std::endl; \ + return AMD_COMGR_STATUS_ERROR; \ + } + +CodeObjectBinary::CodeObjectBinary(const std::string& uri) +: m_uri(uri) +{ + const std::string protocol_delim{"://"}; + + size_t protocol_end = m_uri.find(protocol_delim); + std::string protocol = m_uri.substr(0, protocol_end); + protocol_end += protocol_delim.length(); + + std::transform(protocol.begin(), protocol.end(), protocol.begin(), [](unsigned char c) { + return std::tolower(c); + }); + + std::string path; + size_t path_end = m_uri.find_first_of("#?", protocol_end); + if(path_end != std::string::npos) + { + path = m_uri.substr(protocol_end, path_end++ - protocol_end); + } + else + { + path = m_uri.substr(protocol_end); + } + + /* %-decode the string. */ + std::string decoded_path; + decoded_path.reserve(path.length()); + for(size_t i = 0; i < path.length(); ++i) + { + if(path[i] == '%' && std::isxdigit(path[i + 1]) && std::isxdigit(path[i + 2])) + { + decoded_path += std::stoi(path.substr(i + 1, 2), 0, 16); + i += 2; + } + else + { + decoded_path += path[i]; + } + } + + /* Tokenize the query/fragment. */ + std::vector tokens; + size_t pos, last = path_end; + while((pos = m_uri.find('&', last)) != std::string::npos) + { + tokens.emplace_back(m_uri.substr(last, pos - last)); + last = pos + 1; + } + if(last != std::string::npos) + { + tokens.emplace_back(m_uri.substr(last)); + } + + /* Create a tag-value map from the tokenized query/fragment. */ + std::unordered_map params; + std::for_each(tokens.begin(), tokens.end(), [&](std::string& token) { + size_t delim = token.find('='); + if(delim != std::string::npos) + { + params.emplace(token.substr(0, delim), token.substr(delim + 1)); + } + }); + + buffer = std::vector{}; + size_t offset = 0; + size_t size = 0; + + if(auto offset_it = params.find("offset"); offset_it != params.end()) + { + offset = std::stoul(offset_it->second, nullptr, 0); + } + + if(auto size_it = params.find("size"); size_it != params.end()) + { + if(!(size = std::stoul(size_it->second, nullptr, 0))) return; + } + + if(protocol != "file") throw protocol + " protocol not supported!"; + + std::ifstream file(decoded_path, std::ios::in | std::ios::binary); + if(!file || !file.is_open()) throw "could not open " + decoded_path; + + if(!size) + { + file.ignore(std::numeric_limits::max()); + size_t bytes = file.gcount(); + file.clear(); + + if(bytes < offset) throw "invalid uri " + decoded_path + " (file size < offset)"; + + size = bytes - offset; + } + + file.seekg(offset, std::ios_base::beg); + buffer.resize(size); + file.read(&buffer[0], size); +} + +DisassemblyInstance::DisassemblyInstance(const char* codeobj_data, uint64_t codeobj_size) +{ + buffer = std::vector(codeobj_size, 0); + std::memcpy(buffer.data(), codeobj_data, codeobj_size); + + THROW_COMGR(amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, &data)); + THROW_COMGR(amd_comgr_set_data(data, buffer.size(), buffer.data())); + + size_t isa_size = 128; + std::string input_isa{}; + input_isa.resize(isa_size); + THROW_COMGR(amd_comgr_get_data_isa_name(data, &isa_size, input_isa.data())); + + THROW_COMGR(amd_comgr_create_disassembly_info( + input_isa.data(), + &DisassemblyInstance::memory_callback, + &DisassemblyInstance::inst_callback, + [](uint64_t, void*) {}, + &info)); +} + +amd_comgr_status_t +DisassemblyInstance::symbol_callback(amd_comgr_symbol_t symbol, void* user_data) +{ + amd_comgr_symbol_type_t type; + RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_TYPE, &type)); + + if(type != AMD_COMGR_SYMBOL_TYPE_FUNC) return AMD_COMGR_STATUS_SUCCESS; + + uint64_t vaddr = 0; + uint64_t mem_size = 0; + uint64_t name_size = 0; + RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_VALUE, &vaddr)); + RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_SIZE, &mem_size)); + RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME_LENGTH, &name_size)); + + std::string name; + name.resize(name_size); + + RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME, name.data())); + + DisassemblyInstance& instance = *static_cast(user_data); + std::optional faddr = instance.va2fo(vaddr); + + if(faddr) instance.symbol_map[vaddr] = {name, *faddr, vaddr, mem_size}; + return AMD_COMGR_STATUS_SUCCESS; +} + +std::map& +DisassemblyInstance::GetKernelMap() +{ + symbol_map = {}; + THROW_COMGR(amd_comgr_iterate_symbols(data, &DisassemblyInstance::symbol_callback, this)); + + return symbol_map; +} + +DisassemblyInstance::~DisassemblyInstance() +{ + amd_comgr_release_data(data); + amd_comgr_destroy_disassembly_info(info); +} + +std::pair +DisassemblyInstance::ReadInstruction(uint64_t faddr) +{ + uint64_t size_read; + uint64_t addr_in_buffer = reinterpret_cast(buffer.data()) + faddr; + + THROW_COMGR(amd_comgr_disassemble_instruction(info, addr_in_buffer, (void*) this, &size_read)); + return {std::move(this->last_instruction), size_read}; +} + +uint64_t +DisassemblyInstance::memory_callback(uint64_t from, char* to, uint64_t size, void* user_data) +{ + DisassemblyInstance& instance = *static_cast(user_data); + int64_t copysize = reinterpret_cast(instance.buffer.data()) + instance.buffer.size() - + static_cast(from); + copysize = std::min(size, copysize); + std::memcpy(to, (char*) from, copysize); + return copysize; +} + +void +DisassemblyInstance::inst_callback(const char* instruction, void* user_data) +{ + DisassemblyInstance& instance = *static_cast(user_data); + + if(!instruction) return; + + while(*instruction == '\t' || *instruction == ' ') + instruction++; + instance.last_instruction = instruction; +} + +#define CHECK_VA2FO(x, msg) \ + if(!(x)) \ + { \ + std::cerr << __FILE__ << ' ' << __LINE__ << ' ' << msg << std::endl; \ + return std::nullopt; \ + } + +// mem - input argument, start of the elf +// va - input argument, virtual address +// return file offset, if found +std::optional +DisassemblyInstance::va2fo(uint64_t va) +{ + CHECK_VA2FO(buffer.size() > sizeof(Elf64_Ehdr), "buffer is not large enough"); + + uint8_t* e_ident = (uint8_t*) buffer.data(); + CHECK_VA2FO(e_ident, "e_ident is nullptr"); + + CHECK_VA2FO(e_ident[EI_MAG0] == ELFMAG0 || e_ident[EI_MAG1] == ELFMAG1 || + e_ident[EI_MAG2] == ELFMAG2 || e_ident[EI_MAG3] == ELFMAG3, + "unexpected ei_mag"); + + CHECK_VA2FO(e_ident[EI_CLASS] == ELFCLASS64, "unexpected ei_class"); + CHECK_VA2FO(e_ident[EI_DATA] == ELFDATA2LSB, "unexpected ei_data"); + CHECK_VA2FO(e_ident[EI_VERSION] == EV_CURRENT, "unexpected ei_version"); + CHECK_VA2FO(e_ident[EI_OSABI] == 64, "unexpected ei_osabi"); // ELFOSABI_AMDGPU_HSA + + CHECK_VA2FO(e_ident[EI_ABIVERSION] == 2 || // ELFABIVERSION_AMDGPU_HSA_V4 + e_ident[EI_ABIVERSION] == 3, + "unexpected ei_abiversion"); // ELFABIVERSION_AMDGPU_HSA_V5 + + Elf64_Ehdr* ehdr = (Elf64_Ehdr*) buffer.data(); + CHECK_VA2FO(ehdr, "ehdr is nullptr"); + CHECK_VA2FO(ehdr->e_type == ET_DYN, "unexpected e_type"); + CHECK_VA2FO(ehdr->e_machine == ELF::EM_AMDGPU, "unexpected e_machine"); + CHECK_VA2FO(ehdr->e_phoff != 0, "unexpected e_phoff"); + + CHECK_VA2FO(buffer.size() > ehdr->e_phoff + sizeof(Elf64_Phdr), "buffer is not large enough"); + + Elf64_Phdr* phdr = (Elf64_Phdr*) ((uint8_t*) buffer.data() + ehdr->e_phoff); + CHECK_VA2FO(phdr, "phdr is nullptr"); + + for(uint16_t i = 0; i < ehdr->e_phnum; ++i) + { + if(phdr[i].p_type != PT_LOAD) continue; + if(va < phdr[i].p_vaddr || va >= (phdr[i].p_vaddr + phdr[i].p_memsz)) continue; + + return va + phdr[i].p_offset - phdr[i].p_vaddr; + } + return std::nullopt; +} + +#undef CHECK_VA2FO +#define CHECK_VA2FO(x, msg) \ + if(!(x)) \ + { \ + std::cerr << __FILE__ << ' ' << __LINE__ << ' ' << msg << std::endl; \ + return {}; \ + } + +std::vector> +DisassemblyInstance::getSegments() +{ + CHECK_VA2FO(buffer.size() > sizeof(Elf64_Ehdr), "buffer is not large enough"); + + uint8_t* e_ident = (uint8_t*) buffer.data(); + CHECK_VA2FO(e_ident, "e_ident is nullptr"); + + CHECK_VA2FO(e_ident[EI_MAG0] == ELFMAG0 || e_ident[EI_MAG1] == ELFMAG1 || + e_ident[EI_MAG2] == ELFMAG2 || e_ident[EI_MAG3] == ELFMAG3, + "unexpected ei_mag"); + + CHECK_VA2FO(e_ident[EI_CLASS] == ELFCLASS64, "unexpected ei_class"); + CHECK_VA2FO(e_ident[EI_DATA] == ELFDATA2LSB, "unexpected ei_data"); + CHECK_VA2FO(e_ident[EI_VERSION] == EV_CURRENT, "unexpected ei_version"); + CHECK_VA2FO(e_ident[EI_OSABI] == 64, "unexpected ei_osabi"); // ELFOSABI_AMDGPU_HSA + + CHECK_VA2FO(e_ident[EI_ABIVERSION] == 2 || // ELFABIVERSION_AMDGPU_HSA_V4 + e_ident[EI_ABIVERSION] == 3, + "unexpected ei_abiversion"); // ELFABIVERSION_AMDGPU_HSA_V5 + + Elf64_Ehdr* ehdr = (Elf64_Ehdr*) buffer.data(); + CHECK_VA2FO(ehdr, "ehdr is nullptr"); + CHECK_VA2FO(ehdr->e_type == ET_DYN, "unexpected e_type"); + CHECK_VA2FO(ehdr->e_machine == ELF::EM_AMDGPU, "unexpected e_machine"); + CHECK_VA2FO(ehdr->e_phoff != 0, "unexpected e_phoff"); + + CHECK_VA2FO(buffer.size() > ehdr->e_phoff + sizeof(Elf64_Phdr), "buffer is not large enough"); + + Elf64_Phdr* phdr = (Elf64_Phdr*) ((uint8_t*) buffer.data() + ehdr->e_phoff); + CHECK_VA2FO(phdr, "phdr is nullptr"); + + std::vector> segments; + for(Elf64_Half i = 0; i < ehdr->e_phnum; ++i) + { + if(phdr[i].p_type != PT_LOAD) continue; + + segments.push_back({phdr[i].p_vaddr - phdr[i].p_offset, phdr[i].p_memsz}); + } + + return segments; +} diff --git a/source/lib/rocprofiler-sdk-codeobj/disassembly.hpp b/source/lib/rocprofiler-sdk-codeobj/disassembly.hpp new file mode 100644 index 0000000000..4e06442db6 --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/disassembly.hpp @@ -0,0 +1,68 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include +#include +#include + +class CodeObjectBinary +{ +public: + CodeObjectBinary(const std::string& uri); + std::string m_uri; + std::vector buffer; +}; + +struct SymbolInfo +{ + std::string name{}; + uint64_t faddr = 0; + uint64_t vaddr = 0; + uint64_t mem_size = 0; +}; + +class DisassemblyInstance +{ +public: + DisassemblyInstance(const char* codeobj_data, uint64_t codeobj_size); + ~DisassemblyInstance(); + + std::pair ReadInstruction(uint64_t faddr); + std::map& GetKernelMap(); + + static uint64_t memory_callback(uint64_t from, char* to, uint64_t size, void* user_data); + static void inst_callback(const char* instruction, void* user_data); + static amd_comgr_status_t symbol_callback(amd_comgr_symbol_t symbol, void* user_data); + + std::optional va2fo(uint64_t va); + std::vector> getSegments(); + + std::vector buffer; + std::string last_instruction; + amd_comgr_disassembly_info_t info; + amd_comgr_data_t data; + std::map symbol_map; +}; diff --git a/source/lib/rocprofiler-sdk-codeobj/segment.hpp b/source/lib/rocprofiler-sdk-codeobj/segment.hpp new file mode 100644 index 0000000000..f80f4ead95 --- /dev/null +++ b/source/lib/rocprofiler-sdk-codeobj/segment.hpp @@ -0,0 +1,162 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once +#include +#include +#include +#include +#include +#include + +using codeobj_marker_id_t = size_t; + +template +class ordered_vector : public std::vector +{ + using Super = std::vector; + +public: + void insert(const Type& elem) + { + size_t loc = lower_bound(elem.begin()); + if(this->size() && get(loc).begin() < elem.begin()) loc++; + this->Super::insert(this->begin() + loc, elem); + } + bool remove(const Type& elem) + { + if(!this->size()) return false; + size_t loc = lower_bound(elem.begin()); + if(get(loc) != elem) return false; + + this->Super::erase(this->begin() + loc); + return true; + } + bool remove(uint64_t elem_begin) + { + if(!this->size()) return false; + size_t loc = lower_bound(elem_begin); + if(get(loc).begin() != elem_begin) return false; + + this->Super::erase(this->begin() + loc); + return true; + } + size_t lower_bound(size_t addr) const + { + if(!this->size()) return 0; + return binary_search(addr, 0, this->size() - 1); + } + + size_t binary_search(size_t addr, size_t s, size_t e) const + { + if(s >= e) + return s; + else if(s + 1 == e) + return (get(e).begin() <= addr) ? e : s; + + size_t mid = (s + e) / 2; + if(get(mid).begin() <= addr) + return binary_search(addr, mid, e); + else + return binary_search(addr, s, mid); + } + const Type& get(size_t i) const { return this->operator[](i); } +}; + +/** + * @brief Finds a candidate codeobj for the given vaddr + */ +template +class cached_ordered_vector : public ordered_vector +{ + using Super = ordered_vector; + +public: + cached_ordered_vector() { reset(); } + + const Type& find_obj(uint64_t addr) + { + if(testCache(addr)) return get(cached_segment); + + size_t lb = this->lower_bound(addr); + if(lb >= this->size() || !get(lb).inrange(addr)) + throw std::string("segment addr out of range"); + + cached_segment = lb; + return get(cached_segment); + } + + uint64_t find_addr(uint64_t addr) { return find_obj(addr).begin(); } + + bool testCache(uint64_t addr) const + { + return this->cached_segment < this->size() && get(cached_segment).inrange(addr); + } + + const Type& get(size_t index) const { return this->data()[index]; } + + void insert(const Type& elem) { this->Super::insert(elem); } + void insert_list(std::vector arange) + { + for(auto& elem : arange) + push_back(elem); + std::sort(this->begin(), this->end(), [](const Type& a, const Type& b) { + return a.begin() < b.begin(); + }); + }; + + void reset() { cached_segment = ~0; } + void clear() + { + reset(); + this->Super::clear(); + } + bool remove(uint64_t addr) + { + reset(); + return this->Super::remove(addr); + } + +private: + size_t cached_segment = ~0; +}; + +struct address_range_t +{ + uint64_t vbegin; + uint64_t size; + codeobj_marker_id_t id; + uint64_t offset; + + bool operator<(const address_range_t& other) const { return vbegin < other.vbegin; } + bool inrange(uint64_t addr) const { return addr >= vbegin && addr < vbegin + size; }; + uint64_t begin() const { return vbegin; } +}; + +/** + * @brief Finds a candidate codeobj for the given vaddr + */ +class CodeobjTableTranslator : public cached_ordered_vector +{ +public: + const address_range_t& find_codeobj_in_range(uint64_t addr) { return this->find_obj(addr); } +}; diff --git a/source/lib/rocprofiler-sdk/CMakeLists.txt b/source/lib/rocprofiler-sdk/CMakeLists.txt index 9ae2c3aba8..59f50b498a 100644 --- a/source/lib/rocprofiler-sdk/CMakeLists.txt +++ b/source/lib/rocprofiler-sdk/CMakeLists.txt @@ -42,6 +42,7 @@ add_subdirectory(counters) add_subdirectory(aql) add_subdirectory(pc_sampling) add_subdirectory(marker) +add_subdirectory(thread_trace) target_link_libraries( rocprofiler-object-library diff --git a/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h b/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h index 88d4ce925a..2d6a6da1e6 100644 --- a/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h +++ b/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h @@ -1,17 +1,352 @@ -// This file should be removed when it appears in AQL Profile -#pragma ONCE +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once #include +#include -#define PUBLIC_API __attribute__((visibility("default"))) - +#ifdef __cplusplus extern "C" { +#endif + +typedef struct +{ + uint64_t handle; +} aqlprofile_handle_t; + +typedef enum +{ + AQLPROFILE_MEMORY_HINT_NONE = 0, + AQLPROFILE_MEMORY_HINT_HOST = 1, + AQLPROFILE_MEMORY_HINT_DEVICE_UNCACHED = 2, + AQLPROFILE_MEMORY_HINT_DEVICE_COHERENT = 3, + AQLPROFILE_MEMORY_HINT_DEVICE_NONCOHERENT = 4, + AQLPROFILE_MEMORY_HINT_LAST +} aqlprofile_memory_hint_t; + +/** + * @brief Flags to describe which agents can access given buffer. + */ +typedef union +{ + uint32_t raw; + struct + { + uint32_t device_access : 1; + uint32_t host_access : 1; + uint32_t memory_hint : 6; // One of aqlprofile_memory_hint_t + uint32_t _reserved : 24; + }; +} aqlprofile_buffer_desc_flags_t; + +/** + * @brief Callback to request a memory buffer, which will be tied to a profile. + * The user is responsible for clearing up memory after the profile is no longer needed. + * @param[out] ptr The pointer containing memory. + * @param[in] size Minimum requested buffer size. + * @param[in] flags Access flags, requesting which agents need to read/write to the buffer. + * @param[in] userdata Data to be passed back to user. + * @retval HSA_STATUS_SUCCESS if successful + * @retval HSA_STATUS_ERROR if memory could not be allocated + */ +typedef hsa_status_t (*aqlprofile_memory_alloc_callback_t)(void** ptr, + uint64_t size, + aqlprofile_buffer_desc_flags_t flags, + void* userdata); + +/** + * @brief Callback to dealloc memory requested via aqlprofile_memory_alloc_callback_t + * @param[in] ptr The pointer containing memory. + * @param[in] userdata Data to be passed back to user. + * @retval HSA_STATUS_SUCCESS if successful + * @retval HSA_STATUS_ERROR if memory could not be allocated + */ +typedef void (*aqlprofile_memory_dealloc_callback_t)(void* ptr, void* userdata); + +typedef enum +{ + AQLPROFILE_ACCUMULATION_NONE = 0, /** Do not accumulate event */ + AQLPROFILE_ACCUMULATION_LO_RES, /**< The event should be integrated over quad-cycles */ + AQLPROFILE_ACCUMULATION_HI_RES, /**< The event should be integrated every cycle */ + AQLPROFILE_ACCUMULATION_LAST, +} aqlprofile_accumulation_type_t; + +/** + * @brief Special flags indicating additional properties to a counter. E.g. Accumulation metrics + */ +typedef union +{ + uint32_t raw; + struct + { + uint32_t accum : 3; /**< One of aqlprofile_accumulation_type_t */ + uint32_t _reserved : 29; + } sq_flags; +} aqlprofile_pmc_event_flags_t; + +/** + * @brief Struct containing all necessary information of an event (counter). + */ +typedef struct +{ + uint32_t block_index; /**< Block channel. */ + uint32_t event_id; /**< Event ID as fined by XML */ + aqlprofile_pmc_event_flags_t flags; /**< Special event flags e.g. accumulation */ + hsa_ven_amd_aqlprofile_block_name_t block_name; /**< Block name as defined by block indexes */ +} aqlprofile_pmc_event_t; + +/** + * @brief Struct containing information about the agent. User code sets these values + * to the describe the agent to profile. Information can be obtained either from HSA + * (if loaded) or the KFD topology. + */ +typedef struct +{ + const char* agent_gfxip; /**< Agent GFXIP (HSA_AGENT_INFO_NAME or KFD.product_name) */ + uint32_t xcc_num; /**< XCC's on the agent (HSA_AMD_AGENT_INFO_NUM_XCC or KFD.num_xcc) */ + uint32_t se_num; /**< SE's on the agent (HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES or + KFD.num_shader_banks) */ + uint32_t + cu_num; /**< CU's on the agent (HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT or KFD.cu_count) */ + uint32_t shader_arrays_per_se; /**< Shader arrays per SE of agent + (HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE or + KFD.simd_arrays_per_engine)*/ +} aqlprofile_agent_info_t; + +/** + * @brief Struct containing a handle to a registered agent + * + */ +typedef struct +{ + uint64_t handle; +} aqlprofile_agent_handle_t; + +/** + * @brief Registers an agent to be used with AQL profile. + * @param[out] agent_id Handle to newly registered agent + * @param[in] agent_info Info to register a new agent with AQL Profiler + * @retval HSA_STATUS_SUCCESS registration ok + * @retval HSA_STATUS_ERROR registration failed + */ +hsa_status_t +aqlprofile_register_agent(aqlprofile_agent_handle_t* agent_id, + const aqlprofile_agent_info_t* agent_info); + +/** + * @brief AQLprofile struct containing information for perfmon events + */ +typedef struct +{ + aqlprofile_agent_handle_t agent; + const aqlprofile_pmc_event_t* events; + uint32_t event_count; +} aqlprofile_pmc_profile_t; + +// Profile attributes +typedef enum +{ + AQLPROFILE_INFO_COMMAND_BUFFER_SIZE = 0, // get_info returns uint32_t value + AQLPROFILE_INFO_PMC_DATA_SIZE = 1, // get_info returns uint32_t value + AQLPROFILE_INFO_PMC_DATA = 2, // get_info returns PMC uint64_t value + // in info_data object + AQLPROFILE_INFO_BLOCK_COUNTERS = 4, // get_info returns number of block counter + AQLPROFILE_INFO_BLOCK_ID = 5, // get_info returns block id, instances + // by name string using _id_query_t + AQLPROFILE_INFO_ENABLE_CMD = 6, // get_info returns size/pointer for + // counters enable command buffer + AQLPROFILE_INFO_DISABLE_CMD = 7, // get_info returns size/pointer for + // counters disable command buffer +} aqlprofile_pmc_info_type_t; + +hsa_status_t +aqlprofile_get_pmc_info(const aqlprofile_pmc_profile_t* profile, + aqlprofile_pmc_info_type_t attribute, + void* value); + +/** + * @brief AQLprofile struct containing information for Advanced Thread Trace + */ +typedef struct +{ + hsa_agent_t agent; + const hsa_ven_amd_aqlprofile_parameter_t* parameters; + uint32_t parameter_count; +} aqlprofile_att_profile_t; + +/** + * @brief Data callback for perfmon events. Each event will call this once per coordinate + * @param[in] event The event information passed in from aqlprofile_pmc_profile_t + * @param[in] counter_id Internal ID of the counter + * @param[in] counter_value The event value, as incremented from start() to stop() + * @param[in] userdata Data returned to user + * @retval HSA_STATUS_SUCCESS to continue iteration + * @retval HSA_STATUS_ERROR to stop callback iteration + */ +typedef hsa_status_t (*aqlprofile_pmc_data_callback_t)(aqlprofile_pmc_event_t event, + uint64_t counter_id, + uint64_t counter_value, + void* userdata); + +/** + * @brief Data callback for thread trace. This will be called at least once per shader engine + * @param[in] shader Shader Engine ID + * @param[in] buffer Pointer containing the data + * @param[in] size Amount of bytes used by thread trace + * @param[in] callback_data Data returned to user + * @retval HSA_STATUS_SUCCESS to continue iteration + * @retval HSA_STATUS_ERROR to stop callback iteration + */ +typedef hsa_status_t (*aqlprofile_att_data_callback_t)(uint32_t shader, + void* buffer, + uint64_t size, + void* callback_data); + +/** + * @brief Memory copy fn for aqlprofile to copy data. + * @param[in] dst Destination pointer to copy data to. + * @param[in] src Source pointer where data is to be copied from. + * @param[in] size Amount of bytes to be copied. + * @param[in] userdata Data returned to user + * @retval HSA_STATUS_SUCCESS on success + * @retval HSA_STATUS_ERROR on failure + */ +typedef hsa_status_t (*aqlprofile_memory_copy_t)(void* dst, + const void* src, + size_t size, + void* userdata); + +/** + * @brief Validates the event for the agent. + * @param[in] agent The agent to validate the event for. + * @param[in] event The event to validate. + * @param[out] result True if the event is valid for the agent, false otherwise. + * @retval HSA_STATUS_SUCCESS if the event was validated. + * @retval HSA_STATUS_ERROR if the event was not validated. + */ +hsa_status_t +aqlprofile_validate_pmc_event(aqlprofile_agent_handle_t agent, + const aqlprofile_pmc_event_t* event, + bool* result); + +/** + * @brief Iterate_data() will parse the event data and call @callback with the resulting event data + * @param[in] handle The handle returned from aqlprofile_pmc_create_packets() + * @param[in] callback CB where the resulting event values are going to be returned + * @param[in] userdata Data sent back to user + * @retval HSA_STATUS_SUCCESS all operations exited succesfully + * @retval HSA_STATUS_ERROR if some callback returns an error + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if invalid handle is given + */ +hsa_status_t +aqlprofile_pmc_iterate_data(aqlprofile_handle_t handle, + aqlprofile_pmc_data_callback_t callback, + void* userdata); + +/** + * @brief Struct to be returned by aqlprofile_pmc_create_packets + */ +typedef struct +{ + hsa_ext_amd_aql_pm4_packet_t start_packet; /**< Reset counters and start incrementing */ + hsa_ext_amd_aql_pm4_packet_t stop_packet; /**< Pause counters from incrementing */ + hsa_ext_amd_aql_pm4_packet_t read_packet; /**< Retrieve results from device */ +} aqlprofile_pmc_aql_packets_t; + +/** + * @brief Function to create AQL packets to be inserted into the queue. + * @param[out] handle To be passed to iterate_data() + * @param[out] packets Pointer to where the start, stop and read packets will be written to + * @param[in] profile Agent and events information + * @param[in] alloc_cb Memory allocation, which may request cpu or gpu memory for internal use + * @param[in] dealloc_cb Function to free memory allocated by alloc_cb + * @param[in] userdata Data passed back to user via memory alloc callback + */ +hsa_status_t +aqlprofile_pmc_create_packets(aqlprofile_handle_t* handle, + aqlprofile_pmc_aql_packets_t* packets, + aqlprofile_pmc_profile_t profile, + aqlprofile_memory_alloc_callback_t alloc_cb, + aqlprofile_memory_dealloc_callback_t dealloc_cb, + aqlprofile_memory_copy_t memcpy_cb, + void* userdata); + +/** + * @brief Function to delete AQL packets after creation by aqlprofile_pmc_create_packets + * @param[in] handle Returned by aqlprofile_pmc_create_packets() + */ +void +aqlprofile_pmc_delete_packets(aqlprofile_handle_t handle); + +/** + * @brief Iterates over thread trace data and the data to user + * @param[in] handle The handle returned from aqlprofile_att_create_packets() + * @param[in] callback CB where the resulting data is going to be returned + * @param[in] userdata Data sent back to user + * @retval HSA_STATUS_SUCCESS all operations exited succesfully + * @retval HSA_STATUS_ERROR if some callback returns an error + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if invalid handle is given + */ +hsa_status_t +aqlprofile_att_iterate_data(aqlprofile_handle_t handle, + aqlprofile_att_data_callback_t callback, + void* userdata); + +/** + * @brief Struct containing AQLpackets to start and stop thread trace + */ +typedef struct +{ + hsa_ext_amd_aql_pm4_packet_t start_packet; /**< Packet to start thread trace */ + hsa_ext_amd_aql_pm4_packet_t stop_packet; /**< Packet to stop thread trace and flush data */ +} aqlprofile_att_control_aql_packets_t; + +/** + * @brief Fn to create start and stop thread trace packets + * @param[out] handle To be passed to iterate_data() + * @param[out] packets Packets returned by this function to start and stop thread trace + * @param[in] profile Agent information and extra parameters for thread trace + * @param[in] callback Memory allocation fn which may request cpu or gpu memory + * @retval HSA_STATUS_SUCCESS if all packets created succesfully + * @retval HSA_STATUS_ERROR otherwise + */ +hsa_status_t +aqlprofile_att_create_packets(aqlprofile_handle_t* handle, + aqlprofile_att_control_aql_packets_t* packets, + aqlprofile_att_profile_t profile, + aqlprofile_memory_alloc_callback_t alloc_cb, + aqlprofile_memory_dealloc_callback_t dealloc_cb, + aqlprofile_memory_copy_t memcpy_cb, + void* userdata); + +void +aqlprofile_att_delete_packets(aqlprofile_handle_t handle); + /** * @brief Callback for iteration of all possible event coordinate IDs and coordinate names. * @param [in] id Integer identifying the dimension. * @param [in] name Name of the dimension * @param [in] data User data supplied to @ref aqlprofile_iterate_event_ids - * @return hsa_status_t * @retval HSA_STATUS_SUCCESS Continues iteration * @retval OTHERS Any other HSA return values stops iteration, passing back this value through * @ref aqlprofile_iterate_event_ids @@ -22,12 +357,314 @@ typedef hsa_status_t (*aqlprofile_eventname_callback_t)(int id, const char* name * @brief Iterate over all possible event coordinate IDs and their names. * @param [in] callback Callback to use for iteration of dimensions * @param [in] user_data Data to supply to callback @ref aqlprofile_eventname_callback_t - * @return hsa_status_t * @retval HSA_STATUS_SUCCESS if successful * @retval HSA_STATUS_ERROR if error on interation * @retval OTHERS If @ref aqlprofile_eventname_callback_t returns non-HSA_STATUS_SUCCESS, * that value is returned. */ -PUBLIC_API hsa_status_t +hsa_status_t aqlprofile_iterate_event_ids(aqlprofile_eventname_callback_t callback, void* user_data); + +/** + * @brief Iterate over all event coordinates for a given agent_t and event_t. + * @param position A counting sequence indicating callback number. + * @param id Coordinate ID as in _iterate_event_ids. + * @param extent Coordinate extent indicating maximum allowed instances. + * @param coordinate The coordinate, in the range [0,extent-1]. + * @param name Coordinate name as in _iterate_event_ids. + * @param userdata Userdata returned from _iterate_event_coord function. + */ +typedef hsa_status_t (*aqlprofile_coordinate_callback_t)(int position, + int id, + int extent, + int coordinate, + const char* name, + void* userdata); + +/** + * @brief Iterate over all event coordinates for a given agent_t and event_t. + * @param[in] agent HSA agent. + * @param[in] event The event ID and block ID to iterate for. + * @param[in] sample_id aqlprofile_info_data_t.sample_id returned from _aqlprofile_iterate_data. + * @param[in] callback Callback function to return the coordinates. + * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. + */ +hsa_status_t +aqlprofile_iterate_event_coord(aqlprofile_agent_handle_t agent, + aqlprofile_pmc_event_t event, + uint64_t sample_id, + aqlprofile_coordinate_callback_t callback, + void* userdata); + +typedef union +{ + uint64_t raw; + struct + { + uint64_t isValid : 1; + uint64_t isNavi : 1; + uint64_t npiWaveData : 1; + uint64_t version : 13; + }; +} att_output_flags_t; + +typedef struct +{ + int64_t time; + uint16_t events0; + uint16_t events1; + uint16_t events2; + uint16_t events3; + uint8_t CU; + uint8_t bank; +} att_perfevent_t; + +typedef struct +{ + uint64_t kernel_id : 12; + uint64_t simd : 2; + uint64_t slot : 4; + uint64_t enable : 1; + uint64_t cu : 4; + uint64_t time : 41; // Time_value/8 +} att_occupancy_info_t; + +typedef struct +{ + int32_t type; + int32_t duration; +} wave_state_t; + +typedef struct +{ + int64_t time; + int64_t duration; +} wave_instruction_t; + +enum WAVESLOT_STATE +{ + WS_EMPTY = 0, + WS_IDLE = 1, + WS_EXEC = 2, + WS_WAIT = 3, + WS_STALL = 4, + WS_UNKNOWN = 5, +}; + +enum WaveInstCategory +{ + NONE = 0, + SMEM = 1, + SALU = 2, + VMEM = 3, + FLAT = 4, + LDS = 5, + VALU = 6, + JUMP = 7, + NEXT = 8, + IMMED = 9, + TRAP = 10, + PCINFO = 15, + WAVE_NOT_FINISHED, +}; + +enum WaveTrapStatus +{ + TRAP_RESTORED = 0, + TRAP_REQUEST = 1, + TRAP_SAVED = 1, + TRAP_STANDBY = 2 +}; + +typedef struct +{ + size_t addr; + size_t marker_id; +} pcinfo_t; + +typedef struct __attribute__((packed)) +{ + uint64_t category : 8; + uint64_t hitcount : 56; + uint64_t latency; + pcinfo_t pc; +} att_trace_event_t; + +typedef struct +{ + uint8_t simd; + uint8_t wave_id; + uint8_t trap_status; + uint8_t reserved; + + // VMEM Pipeline: instrs and stalls + int num_vmem_instrs; + int num_vmem_stalls; + // FLAT instrs and stalls + int num_flat_instrs; + int num_flat_stalls; + + // LDS instr and stalls + int num_lds_instrs; + int num_lds_stalls; + + // SCA instrs stalls + int num_salu_instrs; + int num_smem_instrs; + int num_salu_stalls; + int num_smem_stalls; + + // Branch + int num_branch_instrs; + int num_branch_taken_instrs; + int num_branch_stalls; + + // total VMEM/FLAT/LDS/SMEM instructions issued + int num_mem_instrs; // total issued memory instructions + int num_valu_stalls; + size_t num_valu_instrs; + size_t num_issued_instrs; // total issued instructions (compute + memory) + + int64_t begin_time; // Begin and end cycle + int64_t end_time; + int64_t traceID; + + size_t timeline_size; + size_t instructions_size; + wave_state_t* timeline_array; + wave_instruction_t* instructions_array; +} wave_data_t; + +/** + * @brief Callback for rocprofiler to return ISA to aqlprofile ATT parser. + * The caller must copy a desired instruction on isa_instruction and source_reference, + * while obeying the max length passed by the caller. + * If the caller's length is insufficient, then this function writes the minimum sizes to isa_size + * and source_size and returns HSA_STATUS_ERROR_OUT_OF_RESOURCES. + * If call returns HSA_STATUS_SUCCESS, isa_size and source_size are written with bytes used. + * @param[out] isa_instruction Where to copy the ISA line to. + * @param[out] source_reference Reference to source line and/or additional comments in the binary. + * @param[out] isa_memory_size (Auto) The number of bytes to next instruction. 0 for custom ISA. + * @param[inout] isa_size Size of returned ISA string. + * @param[inout] source_size Size of returned reference/comment string. + * @param[in] marker_id The generated ATT marker for given codeobject ID. + * @param[in] offset The offset from base vaddr for given codeobj ID. + * If marker_id == 0, this parameter is raw virtual address with no codeobj ID information. + * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. + * @retval HSA_STATUS_SUCCESS on success. + * @retval HSA_STATUS_ERROR on generic error. + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT for invalid offset or invalid marker_id. + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES for insufficient isa_size or source_size. + */ +typedef hsa_status_t (*aqlprofile_att_isa_callback_t)(char* isa_instruction, + char* source_reference, + uint64_t* isa_memory_size, + uint64_t* isa_size, + uint64_t* source_size, + uint64_t marker_id, + uint64_t offset, + void* userdata); + +/** + * @brief Callback for rocprofiler to return traces back to rocprofiler. + * @param[in] trace_type_id The type of this trace as in _iterate_event_ids(). + * @param[in] correlation_id The ID of shader engine or trace callback number. + * @param[in] trace_events A pointer to sequence of events, of size trace_size. + * @param[in] trace_size The number of events in the trace. + * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. + */ +typedef hsa_status_t (*aqlprofile_att_trace_callback_t)(int trace_type_id, + int correlation_id, + void* trace_events, + uint64_t trace_size, + void* userdata); + +/** + * @brief Callback for the ATT parser to retrieve Shader Engine data. + * Returns the amount of data filled. If no more data is available, then callback return 0 + * If the space available in the buffer is less than required for parsing the full data, + * the full data is transfered over multiple calls. + * When all data has been transfered from current shader_engine_id, the caller has the option to + * 1) Return -1 on shader_engine ID and parsing terminates + * 2) Move to the next shader engine. + * @param[out] shader_engine_id The ID of given shader engine. + * @param[out] buffer The buffer to fill up with SE data. + * @param[out] buffer_size The space available in the buffer. + * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. + * @returns Number of bytes remaining in shader engine. + * @retval 0 if no more SE data is available. Parsing will stop. + * @retval buffer_size if the buffer does not hold enough data for the current shader engine. + * @retval 0 > ret > buffer_size for partially filled buffer, and caller moves over to next SE. + */ +typedef uint64_t (*aqlprofile_att_se_data_callback_t)(int* shader_engine_id, + uint8_t** buffer, + uint64_t* buffer_size, + void* userdata); + +/** + * @brief Callback returning from aqlprofile_att_parser_iterate_event_list + * @param[in] trace_event_id ID of the event. + * @param[in] trace_event_metadata Null-terminated string, entries separated by ';' + * @param[in] userdata userdata. + */ +typedef void (*aqlprofile_att_parser_iterate_event_cb_t)(int trace_event_id, + const char* trace_event_metadata, + void* userdata); + +/** + * @brief Iterate over all available event types. + * @param[in] callback Callback where events are returned to. + * @param[in] userdata userdata. + */ +void +aqlprofile_att_parser_iterate_event_list(aqlprofile_att_parser_iterate_event_cb_t callback, + void* userdata); + +/** + * @brief Iterate over all event coordinates for a given agent_t and event_t. + * @param[in] se_data_callback Callback to return shader engine data from. + * @param[in] trace_callback Callback where the trace data is returned to. + * Each trace will be marked by the ID returned on aqlprofile_att_parser_iterate_event_list. + * @param[in] isa_callback Callback to return ISA lines. + * @param[in] userdata Userdata passed back to caller via callback. + */ +hsa_status_t +aqlprofile_att_parse_data(aqlprofile_att_se_data_callback_t se_data_callback, + aqlprofile_att_trace_callback_t trace_callback, + aqlprofile_att_isa_callback_t isa_callback, + void* userdata); + +/** + * @brief Contains flags for how code objects are interpreted + */ +typedef union +{ + struct + { + uint32_t isUnload : 1; // 0 if code object is being loaded, 1 for unload + uint32_t bFromStart : 1; // Has this code object been loaded before thread trace started? + uint32_t legacy_id : 30; // Legacy code object ID, if it fits in 30 bits. + }; + uint32_t raw; +} aqlprofile_att_header_marker_t; + +/** + * @brief Creates an AQL packet for marking code objects + * @param[out] packets Returned packet + * @param[in] handle The handle created from aqlprofile_att_create_packets() + * @param[in] header Header containing code object information created from profiler + * @param[in] id To be passed back to isa_string_callback in marker_id + * @param[in] addr Code object loaded address. + * @param[in] size Code object loaded size. + */ +hsa_status_t +aqlprofile_att_codeobj_load_marker(hsa_ext_amd_aql_pm4_packet_t* packets, + aqlprofile_handle_t handle, + aqlprofile_att_header_marker_t header, + uint64_t id, + uint64_t addr, + uint64_t size); + +#ifdef __cplusplus } +#endif diff --git a/source/lib/rocprofiler-sdk/aql/packet_construct.cpp b/source/lib/rocprofiler-sdk/aql/packet_construct.cpp index 4c1ac85251..32a380cd4a 100644 --- a/source/lib/rocprofiler-sdk/aql/packet_construct.cpp +++ b/source/lib/rocprofiler-sdk/aql/packet_construct.cpp @@ -26,12 +26,22 @@ #include #include "glog/logging.h" +#define CHECK_HSA(fn, message) \ + { \ + auto status = (fn); \ + if(status != HSA_STATUS_SUCCESS) \ + { \ + std::cerr << "HSA Err: " << status << "\n"; \ + exit(1); \ + } \ + } + namespace rocprofiler { namespace aql { -AQLPacketConstruct::AQLPacketConstruct(const hsa::AgentCache& agent, - const std::vector& metrics) +CounterPacketConstruct::CounterPacketConstruct(const hsa::AgentCache& agent, + const std::vector& metrics) : _agent(agent) { // Validate that the counter exists and construct the block instances @@ -67,12 +77,11 @@ AQLPacketConstruct::AQLPacketConstruct(const hsa::AgentCache& agen _events = get_all_events(); } -std::unique_ptr -AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const +std::unique_ptr +CounterPacketConstruct::construct_packet(const AmdExtTable& ext) { - const size_t MEM_PAGE_MASK = 0x1000 - 1; - auto pkt_ptr = std::make_unique(ext.hsa_amd_memory_pool_free_fn); - auto& pkt = *pkt_ptr; + auto pkt_ptr = std::make_unique(ext.hsa_amd_memory_pool_free_fn); + auto& pkt = *pkt_ptr; if(_events.empty()) { return pkt_ptr; @@ -104,15 +113,7 @@ AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const _agent.get_hsa_agent().handle)); } - auto throw_if_failed = [](auto status, auto& message) { - if(status != HSA_STATUS_SUCCESS) - { - throw std::runtime_error(message); - } - }; - - throw_if_failed(hsa_ven_amd_aqlprofile_start(&profile, nullptr), - "could not generate packet sizes"); + CHECK_HSA(hsa_ven_amd_aqlprofile_start(&profile, nullptr), "could not generate packet sizes"); if(profile.command_buffer.size == 0 || profile.output_buffer.size == 0) { @@ -125,7 +126,7 @@ AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const // Allocate buffers and check the results auto alloc_and_check = [&](auto& pool, auto** mem_loc, auto size) -> bool { bool malloced = false; - size_t page_aligned = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + size_t page_aligned = getPageAligned(size); if(ext.hsa_amd_memory_pool_allocate_fn( pool, page_aligned, 0, static_cast(mem_loc)) != HSA_STATUS_SUCCESS) { @@ -153,22 +154,74 @@ AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const _agent.kernarg_pool(), &profile.output_buffer.ptr, profile.output_buffer.size); memset(profile.output_buffer.ptr, 0x0, profile.output_buffer.size); - // throw if we do not construct the packets correctly. - throw_if_failed(hsa_ven_amd_aqlprofile_start(&profile, &pkt.start), - "could not generate start packet"); - throw_if_failed(hsa_ven_amd_aqlprofile_stop(&profile, &pkt.stop), - "could not generate stop packet"); - throw_if_failed(hsa_ven_amd_aqlprofile_read(&profile, &pkt.read), - "could not generate read packet"); + CHECK_HSA(hsa_ven_amd_aqlprofile_start(&profile, &pkt.start), "failed to create start packet"); + CHECK_HSA(hsa_ven_amd_aqlprofile_stop(&profile, &pkt.stop), "failed to create stop packet"); + CHECK_HSA(hsa_ven_amd_aqlprofile_read(&profile, &pkt.read), "failed to create read packet"); pkt.start.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; pkt.stop.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; pkt.read.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; - return pkt_ptr; } +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wnarrowing" + +ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory( + const hsa::AgentCache& agent, + std::shared_ptr& params, + const CoreApiTable& coreapi, + const AmdExtTable& ext) +{ + this->tracepool = std::make_shared(); + this->tracepool->allocate_fn = ext.hsa_amd_memory_pool_allocate_fn; + this->tracepool->allow_access_fn = ext.hsa_amd_agents_allow_access_fn; + this->tracepool->free_fn = ext.hsa_amd_memory_pool_free_fn; + this->tracepool->api_copy_fn = coreapi.hsa_memory_copy_fn; + this->tracepool->gpu_agent = agent.get_hsa_agent(); + this->tracepool->cpu_pool_ = agent.cpu_pool(); + this->tracepool->gpu_pool_ = agent.gpu_pool(); + + this->aql_params.clear(); + auto& p = this->aql_params; + p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET, params->target_cu}); + p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK, params->shader_engine_mask}); + p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION, params->simd_select}); + p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, params->buffer_size}); + + this->profile = aqlprofile_att_profile_t{agent.get_hsa_agent(), p.data(), p.size()}; +} + +#pragma GCC diagnostic pop + +std::unique_ptr +ThreadTraceAQLPacketFactory::construct_packet() +{ + auto packet = std::make_unique(this->tracepool); + /*hsa_status_t _status = aqlprofile_att_create_packets(&packet->handle, + &packet->packets, + this->profile, + &hsa::TraceAQLPacket::Alloc, + &hsa::TraceAQLPacket::Free, + &hsa::TraceAQLPacket::Copy, + packet.get()); + CHECK_HSA(_status, "failed to create ATT packet");*/ + + packet->before_krn_pkt.clear(); + packet->after_krn_pkt.clear(); + packet->packets.start_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; + packet->packets.stop_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE; + + packet->empty = false; + packet->start = packet->packets.start_packet; + packet->stop = packet->packets.stop_packet; + packet->before_krn_pkt.push_back(packet->start); + packet->after_krn_pkt.push_back(packet->stop); + + return packet; +} + std::vector -AQLPacketConstruct::get_all_events() const +CounterPacketConstruct::get_all_events() const { std::vector ret; for(const auto& metric : _metrics) @@ -179,7 +232,7 @@ AQLPacketConstruct::get_all_events() const } const counters::Metric* -AQLPacketConstruct::event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event) const +CounterPacketConstruct::event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event) const { if(const auto* ptr = rocprofiler::common::get_val( _event_to_metric, @@ -191,7 +244,7 @@ AQLPacketConstruct::event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event) } const std::vector& -AQLPacketConstruct::get_counter_events(const counters::Metric& metric) const +CounterPacketConstruct::get_counter_events(const counters::Metric& metric) const { for(const auto& prof_metric : _metrics) { @@ -204,7 +257,7 @@ AQLPacketConstruct::get_counter_events(const counters::Metric& metric) const } void -AQLPacketConstruct::can_collect() +CounterPacketConstruct::can_collect() { // Verify that the counters fit within harrdware limits std::map, int64_t> counter_count; diff --git a/source/lib/rocprofiler-sdk/aql/packet_construct.hpp b/source/lib/rocprofiler-sdk/aql/packet_construct.hpp index c38b5e8326..f2d944c819 100644 --- a/source/lib/rocprofiler-sdk/aql/packet_construct.hpp +++ b/source/lib/rocprofiler-sdk/aql/packet_construct.hpp @@ -30,10 +30,12 @@ #include #include +#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h" #include "lib/rocprofiler-sdk/aql/helpers.hpp" #include "lib/rocprofiler-sdk/counters/metrics.hpp" #include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" #include "lib/rocprofiler-sdk/hsa/queue.hpp" +#include "lib/rocprofiler-sdk/thread_trace/att_core.hpp" namespace rocprofiler { @@ -47,21 +49,27 @@ namespace aql * consturcted start/stop/read packets along with allocated buffers needed * to collect the counter data. */ -class AQLPacketConstruct +class CounterPacketConstruct { public: - AQLPacketConstruct(const hsa::AgentCache& agent, const std::vector& metrics); - std::unique_ptr construct_packet(const AmdExtTable&) const; + CounterPacketConstruct(const hsa::AgentCache& agent, + const std::vector& metrics); + std::unique_ptr construct_packet(const AmdExtTable&); const counters::Metric* event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event) const; std::vector get_all_events() const; + hsa_agent_t hsa_agent() const { return _agent.get_hsa_agent(); } const std::vector& get_counter_events( const counters::Metric&) const; - hsa_agent_t hsa_agent() const { return _agent.get_hsa_agent(); } - private: + const hsa::AgentCache& _agent; + static constexpr size_t MEM_PAGE_ALIGN = 0x1000; + static constexpr size_t MEM_PAGE_MASK = MEM_PAGE_ALIGN - 1; + static size_t getPageAligned(size_t p) { return (p + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; } + +protected: struct AQLProfileMetric { counters::Metric metric; @@ -70,12 +78,26 @@ private: void can_collect(); - const hsa::AgentCache& _agent; std::vector _metrics; std::vector _events; std::map, counters::Metric> _event_to_metric; }; +class ThreadTraceAQLPacketFactory +{ +public: + ThreadTraceAQLPacketFactory(const hsa::AgentCache& agent, + std::shared_ptr& params, + const CoreApiTable& coreapi, + const AmdExtTable& ext); + std::unique_ptr construct_packet(); + +private: + std::shared_ptr tracepool; + std::vector aql_params; + aqlprofile_att_profile_t profile; +}; + } // namespace aql } // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/aql/tests/aql_test.cpp b/source/lib/rocprofiler-sdk/aql/tests/aql_test.cpp index 0d0f13f00b..5f5179cb40 100644 --- a/source/lib/rocprofiler-sdk/aql/tests/aql_test.cpp +++ b/source/lib/rocprofiler-sdk/aql/tests/aql_test.cpp @@ -122,7 +122,7 @@ TEST(aql_profile, construct_packets) LOG(WARNING) << fmt::format("Found Agent: {}", agent.get_hsa_agent().handle); auto metrics = rocprofiler::findDeviceMetrics(agent, {"SQ_WAVES"}); ASSERT_EQ(metrics.size(), 1); - AQLPacketConstruct(agent, metrics); + CounterPacketConstruct(agent, metrics); } hsa_shut_down(); } @@ -142,7 +142,7 @@ TEST(aql_profile, too_many_counters) { try { - AQLPacketConstruct(agent, metrics); + CounterPacketConstruct(agent, metrics); } catch(const std::exception& e) { EXPECT_NE(e.what(), nullptr) << e.what(); @@ -163,9 +163,9 @@ TEST(aql_profile, packet_generation_single) ASSERT_GT(agents.size(), 0); for(const auto& [_, agent] : agents) { - auto metrics = rocprofiler::findDeviceMetrics(agent, {"SQ_WAVES"}); - AQLPacketConstruct pkt(agent, metrics); - auto test_pkt = pkt.construct_packet(rocprofiler::get_ext_table()); + auto metrics = rocprofiler::findDeviceMetrics(agent, {"SQ_WAVES"}); + CounterPacketConstruct pkt(agent, metrics); + auto test_pkt = pkt.construct_packet(rocprofiler::get_ext_table()); EXPECT_TRUE(test_pkt); } @@ -183,14 +183,30 @@ TEST(aql_profile, packet_generation_multi) { auto metrics = rocprofiler::findDeviceMetrics(agent, {"SQ_WAVES", "TA_FLAT_READ_WAVEFRONTS"}); - AQLPacketConstruct pkt(agent, metrics); - auto test_pkt = pkt.construct_packet(rocprofiler::get_ext_table()); + CounterPacketConstruct pkt(agent, metrics); + auto test_pkt = pkt.construct_packet(rocprofiler::get_ext_table()); EXPECT_TRUE(test_pkt); } hsa_shut_down(); } +class TestAqlPacket : public rocprofiler::hsa::CounterAQLPacket +{ +public: + TestAqlPacket(bool mallocd) + : rocprofiler::hsa::CounterAQLPacket([](void* x) -> hsa_status_t { + ::free(x); + return HSA_STATUS_SUCCESS; + }) + { + this->profile.output_buffer.ptr = malloc(sizeof(double)); + this->profile.command_buffer.ptr = malloc(sizeof(double)); + this->command_buf_mallocd = mallocd; + this->output_buffer_malloced = mallocd; + } +}; + TEST(aql_profile, test_aql_packet) { auto check_null = [](auto& val) { @@ -201,25 +217,12 @@ TEST(aql_profile, test_aql_packet) val.completion_signal.handle == null_val.completion_signal.handle; }; - rocprofiler::hsa::AQLPacket test_pkt([](void* x) -> hsa_status_t { - ::free(x); - return HSA_STATUS_SUCCESS; - }); + TestAqlPacket test_pkt(true); EXPECT_TRUE(check_null(test_pkt.start)) << "Start packet not null"; EXPECT_TRUE(check_null(test_pkt.stop)) << "Stop packet not null"; EXPECT_TRUE(check_null(test_pkt.read)) << "Read packet not null"; - // If this leaks, then AQLPacket is not freeing data correctly. - test_pkt.profile.output_buffer.ptr = malloc(sizeof(double)); - test_pkt.profile.command_buffer.ptr = malloc(sizeof(double)); - test_pkt.command_buf_mallocd = true; - test_pkt.output_buffer_malloced = true; - // test custom destructor as well - rocprofiler::hsa::AQLPacket test_pkt2([](void* x) -> hsa_status_t { - ::free(x); - return HSA_STATUS_SUCCESS; - }); - test_pkt2.profile.output_buffer.ptr = malloc(sizeof(double)); - test_pkt2.profile.command_buffer.ptr = malloc(sizeof(double)); + // Why is this valid? + TestAqlPacket test_pkt2(false); } diff --git a/source/lib/rocprofiler-sdk/context/context.cpp b/source/lib/rocprofiler-sdk/context/context.cpp index 8c55cee2c2..97de3c75c2 100644 --- a/source/lib/rocprofiler-sdk/context/context.cpp +++ b/source/lib/rocprofiler-sdk/context/context.cpp @@ -32,6 +32,7 @@ #include "lib/rocprofiler-sdk/buffer.hpp" #include "lib/rocprofiler-sdk/context/context.hpp" #include "lib/rocprofiler-sdk/counters/core.hpp" +#include "lib/rocprofiler-sdk/thread_trace/att_core.hpp" #include @@ -446,6 +447,7 @@ start_context(rocprofiler_context_id_t context_id) } if(cfg->counter_collection) rocprofiler::counters::start_context(cfg); + if(cfg->thread_trace) cfg->thread_trace->start_context(); return ROCPROFILER_STATUS_SUCCESS; } @@ -472,6 +474,8 @@ stop_context(rocprofiler_context_id_t idx) if(_expected->counter_collection) rocprofiler::counters::stop_context(const_cast(_expected)); + else if(_expected->thread_trace) + _expected->thread_trace->stop_context(); return ROCPROFILER_STATUS_SUCCESS; } } diff --git a/source/lib/rocprofiler-sdk/context/context.hpp b/source/lib/rocprofiler-sdk/context/context.hpp index ead31fb51f..b3cb238e14 100644 --- a/source/lib/rocprofiler-sdk/context/context.hpp +++ b/source/lib/rocprofiler-sdk/context/context.hpp @@ -33,6 +33,7 @@ #include "lib/rocprofiler-sdk/context/domain.hpp" #include "lib/rocprofiler-sdk/counters/core.hpp" #include "lib/rocprofiler-sdk/external_correlation.hpp" +#include "lib/rocprofiler-sdk/thread_trace/att_core.hpp" #include #include @@ -153,6 +154,7 @@ struct context std::unique_ptr callback_tracer = {}; std::unique_ptr buffered_tracer = {}; std::unique_ptr counter_collection = {}; + std::shared_ptr thread_trace = {}; }; // set the client index needs to be called before allocate_context() diff --git a/source/lib/rocprofiler-sdk/counters/core.cpp b/source/lib/rocprofiler-sdk/counters/core.cpp index db04529377..b41ebe807d 100644 --- a/source/lib/rocprofiler-sdk/counters/core.cpp +++ b/source/lib/rocprofiler-sdk/counters/core.cpp @@ -218,7 +218,7 @@ counter_callback_info::setup_profile_config(const hsa::AgentCache& age } } - profile->pkt_generator = std::make_unique( + profile->pkt_generator = std::make_unique( agent, std::vector{profile->reqired_hw_counters.begin(), profile->reqired_hw_counters.end()}); @@ -288,7 +288,7 @@ queue_cb(const context::context* ctx, // Packet generated when no instrumentation is performed. May contain serialization // packets/barrier packets (and can be empty). auto no_instrumentation = [&]() { - auto ret_pkt = std::make_unique(nullptr); + auto ret_pkt = std::make_unique(nullptr); // If we have a counter collection context but it is not enabled, we still might need // to add barrier packets to transition from serialized -> unserialized execution. This // transition is coordinated by the serializer. diff --git a/source/lib/rocprofiler-sdk/counters/core.hpp b/source/lib/rocprofiler-sdk/counters/core.hpp index 4b288f145e..c979aa3434 100644 --- a/source/lib/rocprofiler-sdk/counters/core.hpp +++ b/source/lib/rocprofiler-sdk/counters/core.hpp @@ -59,7 +59,7 @@ struct profile_config std::vector asts{}; rocprofiler_profile_config_id_t id{.handle = 0}; // Packet generator to create AQL packets for insertion - std::unique_ptr pkt_generator{nullptr}; + std::unique_ptr pkt_generator{nullptr}; // A packet cache of AQL packets. This allows reuse of AQL packets (preventing costly // allocation of new packets/destruction). rocprofiler::common::Synchronized>> diff --git a/source/lib/rocprofiler-sdk/counters/dimensions.cpp b/source/lib/rocprofiler-sdk/counters/dimensions.cpp index 7421200222..1a0df8e8c8 100644 --- a/source/lib/rocprofiler-sdk/counters/dimensions.cpp +++ b/source/lib/rocprofiler-sdk/counters/dimensions.cpp @@ -61,8 +61,8 @@ getBlockDimensions(std::string_view agent, const Metric& metric) { if(maybe_agent.name() == agent) { - aql::AQLPacketConstruct pkt_gen(maybe_agent, {metric}); - const auto& events = pkt_gen.get_counter_events(metric); + aql::CounterPacketConstruct pkt_gen(maybe_agent, {metric}); + const auto& events = pkt_gen.get_counter_events(metric); for(const auto& event : events) { diff --git a/source/lib/rocprofiler-sdk/counters/evaluate_ast.cpp b/source/lib/rocprofiler-sdk/counters/evaluate_ast.cpp index 8178dd5693..f0a84ef4bb 100644 --- a/source/lib/rocprofiler-sdk/counters/evaluate_ast.cpp +++ b/source/lib/rocprofiler-sdk/counters/evaluate_ast.cpp @@ -450,16 +450,16 @@ EvaluateAST::read_special_counters( } std::unordered_map> -EvaluateAST::read_pkt(const aql::AQLPacketConstruct* pkt_gen, hsa::AQLPacket& pkt) +EvaluateAST::read_pkt(const aql::CounterPacketConstruct* pkt_gen, hsa::AQLPacket& pkt) { struct it_data { std::unordered_map>* data; - const aql::AQLPacketConstruct* pkt_gen; + const aql::CounterPacketConstruct* pkt_gen; }; std::unordered_map> ret; - if(pkt.empty) return ret; + if(pkt.isEmpty()) return ret; it_data aql_data{.data = &ret, .pkt_gen = pkt_gen}; ; hsa_status_t status = hsa_ven_amd_aqlprofile_iterate_data( diff --git a/source/lib/rocprofiler-sdk/counters/evaluate_ast.hpp b/source/lib/rocprofiler-sdk/counters/evaluate_ast.hpp index 8bafb1f31b..95ccf1ed0c 100644 --- a/source/lib/rocprofiler-sdk/counters/evaluate_ast.hpp +++ b/source/lib/rocprofiler-sdk/counters/evaluate_ast.hpp @@ -136,8 +136,8 @@ public: * */ static std::unordered_map> read_pkt( - const aql::AQLPacketConstruct* pkt_gen, - hsa::AQLPacket& pkt); + const aql::CounterPacketConstruct* pkt_gen, + hsa::AQLPacket& pkt); /** * @brief Insert special counter values, such as constants of the agent (i.e. max waves) diff --git a/source/lib/rocprofiler-sdk/counters/tests/dimension.cpp b/source/lib/rocprofiler-sdk/counters/tests/dimension.cpp index da57745df2..7d6dc6c89a 100644 --- a/source/lib/rocprofiler-sdk/counters/tests/dimension.cpp +++ b/source/lib/rocprofiler-sdk/counters/tests/dimension.cpp @@ -251,8 +251,8 @@ TEST(dimension, block_dim_test) } else { - aql::AQLPacketConstruct pkt_gen(agent, {metric}); - const auto& events = pkt_gen.get_counter_events(metric); + aql::CounterPacketConstruct pkt_gen(agent, {metric}); + const auto& events = pkt_gen.get_counter_events(metric); for(const auto& event : events) { std::map dims; diff --git a/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp b/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp index 82da333b7f..2ff93eb961 100644 --- a/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp +++ b/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp @@ -21,14 +21,22 @@ // THE SOFTWARE. #include "lib/rocprofiler-sdk/hsa/aql_packet.hpp" - #include +#include +#include + +#define CHECK_HSA(fn, message) \ + if((fn) != HSA_STATUS_SUCCESS) \ + { \ + std::cerr << __FILE__ << ':' << __LINE__ << ' ' << message; \ + exit(1); \ + } namespace rocprofiler { namespace hsa { -AQLPacket::~AQLPacket() +CounterAQLPacket::~CounterAQLPacket() { if(!profile.command_buffer.ptr) { @@ -36,7 +44,7 @@ AQLPacket::~AQLPacket() } else if(!command_buf_mallocd) { - free_func(profile.command_buffer.ptr); + CHECK_HSA(free_func(profile.command_buffer.ptr), "freeing memory"); } else { @@ -49,12 +57,61 @@ AQLPacket::~AQLPacket() } else if(!output_buffer_malloced) { - free_func(profile.output_buffer.ptr); + CHECK_HSA(free_func(profile.output_buffer.ptr), "freeing memory"); } else { ::free(profile.output_buffer.ptr); } } + +TraceAQLPacket::~TraceAQLPacket() = default; +/* +TraceAQLPacket::~TraceAQLPacket() +{ + aqlprofile_att_delete_packets(this->handle); +} +*/ + +hsa_status_t +TraceAQLPacket::Alloc(void** ptr, size_t size, aqlprofile_buffer_desc_flags_t flags, void* data) +{ + if(!data) return HSA_STATUS_ERROR; + if(!reinterpret_cast(data)->tracepool) return HSA_STATUS_ERROR; + + auto& pool = *reinterpret_cast(data)->tracepool; + + if(!pool.allocate_fn || !pool.free_fn || !pool.allow_access_fn) return HSA_STATUS_ERROR; + + if(flags.host_access) + { + hsa_status_t status = pool.allocate_fn(pool.cpu_pool_, size, 0, ptr); + if(!flags.device_access || status != HSA_STATUS_SUCCESS) return status; + return pool.allow_access_fn(1, &pool.gpu_agent, nullptr, *ptr); + } + return pool.allocate_fn(pool.gpu_pool_, size, 0, ptr); +} + +void +TraceAQLPacket::Free(void* ptr, void* data) +{ + auto* pool = reinterpret_cast(data)->tracepool.get(); + if(!pool || !pool->free_fn) return; + + pool->free_fn(ptr); +} + +hsa_status_t +TraceAQLPacket::Copy(void* dst, const void* src, size_t size, void* data) +{ + auto* pool = reinterpret_cast(data)->tracepool.get(); + if(!pool || !pool->api_copy_fn) return HSA_STATUS_ERROR; + + return pool->api_copy_fn(dst, src, size); +} + +TraceAQLPacket::TraceAQLPacket(std::shared_ptr& _tracepool) +: tracepool(_tracepool){}; + } // namespace hsa } // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp b/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp index 93e41d4916..286a275a40 100644 --- a/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp +++ b/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp @@ -23,6 +23,7 @@ #pragma once #include "lib/common/container/small_vector.hpp" +#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h" #include #include @@ -30,6 +31,12 @@ namespace rocprofiler { +namespace aql +{ +class CounterPacketConstruct; +class ThreadTraceAQLPacketFactory; +} // namespace aql + namespace hsa { constexpr hsa_ext_amd_aql_pm4_packet_t null_amd_aql_pm4_packet = { @@ -41,12 +48,11 @@ constexpr hsa_ext_amd_aql_pm4_packet_t null_amd_aql_pm4_packet = { * Struct containing AQL packet information. Including start/stop/read * packets along with allocated buffers */ -struct AQLPacket +class AQLPacket { - using memory_pool_free_func_t = decltype(::hsa_amd_memory_pool_free)*; - - AQLPacket(memory_pool_free_func_t func); - ~AQLPacket(); +public: + AQLPacket() = default; + virtual ~AQLPacket() = default; // Keep move constuctors (i.e. std::move()) AQLPacket(AQLPacket&& other) = default; @@ -56,20 +62,68 @@ struct AQLPacket AQLPacket(const AQLPacket&) = delete; AQLPacket& operator=(const AQLPacket&) = delete; - bool command_buf_mallocd = false; - bool output_buffer_malloced = false; - bool empty = {true}; - hsa_ven_amd_aqlprofile_profile_t profile = {}; - hsa_ext_amd_aql_pm4_packet_t start = null_amd_aql_pm4_packet; - hsa_ext_amd_aql_pm4_packet_t stop = null_amd_aql_pm4_packet; - hsa_ext_amd_aql_pm4_packet_t read = null_amd_aql_pm4_packet; - memory_pool_free_func_t free_func = nullptr; + hsa_ven_amd_aqlprofile_profile_t profile = {}; + hsa_ext_amd_aql_pm4_packet_t start = null_amd_aql_pm4_packet; + hsa_ext_amd_aql_pm4_packet_t stop = null_amd_aql_pm4_packet; + hsa_ext_amd_aql_pm4_packet_t read = null_amd_aql_pm4_packet; + common::container::small_vector before_krn_pkt = {}; common::container::small_vector after_krn_pkt = {}; + + bool isEmpty() const { return empty; } + bool empty = true; +}; + +class CounterAQLPacket : public AQLPacket +{ + friend class rocprofiler::aql::CounterPacketConstruct; + using memory_pool_free_func_t = decltype(::hsa_amd_memory_pool_free)*; + +public: + CounterAQLPacket(memory_pool_free_func_t func) + : free_func{func} {}; + ~CounterAQLPacket() override; + +protected: + bool command_buf_mallocd = false; + bool output_buffer_malloced = false; + memory_pool_free_func_t free_func = nullptr; +}; + +struct TraceMemoryPool +{ + hsa_agent_t gpu_agent; + hsa_amd_memory_pool_t cpu_pool_; + hsa_amd_memory_pool_t gpu_pool_; + decltype(hsa_amd_memory_pool_allocate)* allocate_fn; + decltype(hsa_amd_agents_allow_access)* allow_access_fn; + decltype(hsa_amd_memory_pool_free)* free_fn; + decltype(hsa_memory_copy)* api_copy_fn; +}; + +class TraceAQLPacket : public AQLPacket +{ + friend class rocprofiler::aql::ThreadTraceAQLPacketFactory; + +public: + TraceAQLPacket(std::shared_ptr& _tracepool); + TraceMemoryPool& GetPool() const { return *tracepool; } + aqlprofile_handle_t GetHandle() const { return handle; } + uint64_t GetAgent() const { return tracepool->gpu_agent.handle; } + ~TraceAQLPacket() override; + +protected: + std::shared_ptr tracepool; + aqlprofile_att_control_aql_packets_t packets; + aqlprofile_handle_t handle; + + static hsa_status_t Alloc(void** ptr, + size_t size, + aqlprofile_buffer_desc_flags_t flags, + void* data); + static void Free(void* ptr, void* data); + static hsa_status_t Copy(void* dst, const void* src, size_t size, void* data); }; -inline AQLPacket::AQLPacket(memory_pool_free_func_t func) -: free_func{func} -{} } // namespace hsa } // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/hsa/code_object.cpp b/source/lib/rocprofiler-sdk/hsa/code_object.cpp index bc076c8352..28b936084e 100644 --- a/source/lib/rocprofiler-sdk/hsa/code_object.cpp +++ b/source/lib/rocprofiler-sdk/hsa/code_object.cpp @@ -672,10 +672,10 @@ code_object_load_callback(hsa_executable_t executable, if(_status != HSA_STATUS_SUCCESS) return _status; \ } - auto& loader_table = get_loader_table(); - auto code_obj_v = code_object{}; - auto& data = code_obj_v.rocp_data; - int _storage_type = ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_NONE; + auto& loader_table = get_loader_table(); + auto code_obj_v = code_object{}; + auto& data = code_obj_v.rocp_data; + uint32_t _storage_type = ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_NONE; LOG_IF(FATAL, data.size == 0) << "code object did not properly initialized the size field upon " "construction (this is likely a compiler bug)"; diff --git a/source/lib/rocprofiler-sdk/hsa/queue.cpp b/source/lib/rocprofiler-sdk/hsa/queue.cpp index e3883c6f90..5dabf93ca7 100644 --- a/source/lib/rocprofiler-sdk/hsa/queue.cpp +++ b/source/lib/rocprofiler-sdk/hsa/queue.cpp @@ -342,8 +342,6 @@ WriteInterceptor(const void* packets, { hsa_barrier_and_packet_t barrier{}; barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; - // barrier.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE; - // barrier.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE; barrier.header |= 1 << HSA_PACKET_HEADER_BARRIER; barrier.completion_signal = original_packet.completion_signal; transformed_packets.emplace_back(barrier); diff --git a/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp b/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp index 51fc03c4e2..a8a224a950 100644 --- a/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp +++ b/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp @@ -90,6 +90,9 @@ constexpr rocprofiler_agent_t default_agent = void QueueController::add_queue(hsa_queue_t* id, std::unique_ptr queue) { + for(auto& pre_initialize_fn : pre_initialize) + pre_initialize_fn(queue->get_agent(), get_core_table(), get_ext_table()); + CHECK(queue); _callback_cache.wlock([&](auto& callbacks) { _queues.wlock([&](auto& map) { @@ -111,6 +114,11 @@ void QueueController::destroy_queue(hsa_queue_t* id) { if(!id) return; + _queues.wlock([&](auto& map) { + for(auto& deinitialize_fn : pre_deinitialize) + if(map.find(id) != map.end()) + deinitialize_fn(map.at(id)->get_agent(), get_core_table(), get_ext_table()); + }); const auto* queue = get_queue(*id); @@ -195,7 +203,8 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table) { constexpr auto expected_context_size = 160UL; static_assert( - sizeof(context::context) == expected_context_size, + sizeof(context::context) == + expected_context_size + sizeof(std::shared_ptr), "If you added a new field to context struct, make sure there is a check here if it " "requires queue interception. Once you have done so, increment expected_context_size"); @@ -212,6 +221,20 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table) break; } } + else if(itr->thread_trace) + { + enable_intercepter = true; + std::weak_ptr trace = itr->thread_trace; + pre_initialize.emplace_back( + [trace](const AgentCache& cache, const CoreApiTable& core, const AmdExtTable& ext) { + if(auto locked = trace.lock()) locked->resource_init(cache, core, ext); + }); + pre_deinitialize.emplace_back( + [trace](const AgentCache& cache, const CoreApiTable&, const AmdExtTable&) { + if(auto locked = trace.lock()) locked->resource_deinit(cache); + }); + break; + } } if(enable_intercepter) diff --git a/source/lib/rocprofiler-sdk/hsa/queue_controller.hpp b/source/lib/rocprofiler-sdk/hsa/queue_controller.hpp index 59a14f5c6c..d719390413 100644 --- a/source/lib/rocprofiler-sdk/hsa/queue_controller.hpp +++ b/source/lib/rocprofiler-sdk/hsa/queue_controller.hpp @@ -103,6 +103,7 @@ public: private: using client_id_map_t = std::unordered_map; using agent_cache_map_t = std::unordered_map; + using resource_alloc_t = void(const AgentCache&, const CoreApiTable&, const AmdExtTable&); CoreApiTable _core_table = {}; AmdExtTable _ext_table = {}; @@ -110,6 +111,9 @@ private: common::Synchronized _callback_cache = {}; agent_cache_map_t _supported_agents = {}; common::Synchronized _profiler_serializer; + + std::vector> pre_initialize; + std::vector> pre_deinitialize; }; QueueController* diff --git a/source/lib/rocprofiler-sdk/thread_trace/CMakeLists.txt b/source/lib/rocprofiler-sdk/thread_trace/CMakeLists.txt new file mode 100644 index 0000000000..8d54dc8e30 --- /dev/null +++ b/source/lib/rocprofiler-sdk/thread_trace/CMakeLists.txt @@ -0,0 +1,6 @@ +set(ROCPROFILER_LIB_THREAD_TRACE_SOURCES att_core.cpp att_service.cpp) +set(ROCPROFILER_LIB_THREAD_TRACE_HEADERS att_core.hpp) +target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_THREAD_TRACE_SOURCES} + ${ROCPROFILER_LIB_THREAD_TRACE_HEADERS}) + +# if(ROCPROFILER_BUILD_TESTS) add_subdirectory(tests) endif() diff --git a/source/lib/rocprofiler-sdk/thread_trace/att_core.cpp b/source/lib/rocprofiler-sdk/thread_trace/att_core.cpp new file mode 100644 index 0000000000..3ba762425a --- /dev/null +++ b/source/lib/rocprofiler-sdk/thread_trace/att_core.cpp @@ -0,0 +1,238 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include + +#include "lib/common/container/stable_vector.hpp" +#include "lib/common/utility.hpp" +#include "lib/rocprofiler-sdk/buffer.hpp" +#include "lib/rocprofiler-sdk/context/context.hpp" +#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" +#include "lib/rocprofiler-sdk/internal_threading.hpp" +#include "lib/rocprofiler-sdk/registration.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#define CHECK_HSA(fn, message) \ + { \ + auto _status = (fn); \ + if(_status != HSA_STATUS_SUCCESS) \ + { \ + LOG(ERROR) << "HSA Err: " << _status << '\n'; \ + throw std::runtime_error(message); \ + } \ + } + +namespace rocprofiler +{ +using AQLPacketOwner = std::unique_ptr; +using inst_pkt_t = common::container::small_vector, 4>; +using corr_id_map_t = hsa::Queue::queue_info_session_t::external_corr_id_map_t; + +struct cbdata_t +{ + void* tool_userdata; + rocprofiler_att_shader_data_callback_t cb_fn; + std::vector* memory_space; +}; + +/** + * Callback we get from HSA interceptor when a kernel packet is being enqueued. + * We return an AQLPacket containing the start/stop/read packets for injection. + */ +AQLPacketOwner +pre_kernel_call(ThreadTracer& tracer, + const hsa::Queue& queue, + const hsa::rocprofiler_packet& kern_pkt, + uint64_t kernel_id, + const corr_id_map_t& extern_corr_ids, + const context::correlation_id* corr_id) +{ + (void) extern_corr_ids; + (void) corr_id; + + rocprofiler_correlation_id_t temp_corr_id; + temp_corr_id.internal = 0; + temp_corr_id.external.value = 0; + temp_corr_id.external.ptr = nullptr; + + auto control_flags = tracer.params->dispatch_cb_fn(queue.get_id(), + queue.get_agent().get_rocp_agent(), + temp_corr_id, + &kern_pkt.kernel_dispatch, + kernel_id, + tracer.params->callback_userdata); + + if(control_flags == ROCPROFILER_ATT_CONTROL_NONE) return nullptr; + + assert(control_flags == ROCPROFILER_ATT_CONTROL_START_AND_STOP && "Error: Not implemented"); + + uint64_t agent = queue.get_agent().get_hsa_agent().handle; + std::lock_guard lk(tracer.trace_resources_mut); + + try + { + auto moved = std::move(tracer.resources.at(agent)); + tracer.resources.erase(agent); + return moved; + } catch(std::out_of_range& e) + { + LOG(WARNING) << "Attempt to initialize ATT without allocated resources!\n"; + return nullptr; + } +} + +hsa_status_t +thread_trace_callback(uint32_t shader, void* buffer, uint64_t size, void* callback_data) +{ + void* tool_userdata = static_cast(callback_data)->tool_userdata; + auto callback_fn = *static_cast(callback_data)->cb_fn; + std::vector& cpu_data = *static_cast(callback_data)->memory_space; + + // TODO(gbaraldi): Handle parallel callbacks + static std::mutex mut; + std::lock_guard lk(mut); + + if(size > cpu_data.size()) cpu_data.resize(size + cpu_data.size()); + + auto status = hsa::get_queue_controller()->get_core_table().hsa_memory_copy_fn( + cpu_data.data(), buffer, size); + if(status != HSA_STATUS_SUCCESS) + { + LOG(WARNING) << "Failed to copy hsa memory!"; + return HSA_STATUS_SUCCESS; + } + + callback_fn(shader, 0, "", cpu_data.data(), size, tool_userdata); + return HSA_STATUS_SUCCESS; +} + +void +post_kernel_call(ThreadTracer& tracer, inst_pkt_t& aql) +{ + std::vector cpu_data{}; + auto pair = cbdata_t{tracer.params->callback_userdata, tracer.params->shader_cb_fn, &cpu_data}; + (void) pair; + + for(auto& aql_pkt : aql) + { + auto* pkt = dynamic_cast(aql_pkt.first.get()); + if(!pkt) continue; + + // auto status = aqlprofile_att_iterate_data(pkt->GetHandle(), thread_trace_callback, + // &pair); CHECK_HSA(status, "Failed to iterate ATT data"); + + std::lock_guard lk(tracer.trace_resources_mut); + if(tracer.agent_active_queues.find(pkt->GetAgent()) != tracer.agent_active_queues.end()) + tracer.resources[pkt->GetAgent()] = std::move(aql_pkt.first); + } +} + +common::Synchronized> client; + +void +ThreadTracer::start_context() +{ + // Only one thread should be attempting to enable/disable this context + client.wlock([&](auto& client_id) { + if(client_id) return; + + client_id = hsa::get_queue_controller()->add_callback( + std::nullopt, + [=](const hsa::Queue& q, + const hsa::rocprofiler_packet& kern_pkt, + uint64_t kernel_id, + rocprofiler_user_data_t* user_data, + const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids, + const context::correlation_id* corr_id) { + (void) user_data; + return pre_kernel_call(*this, q, kern_pkt, kernel_id, extern_corr_ids, corr_id); + }, + [=](const hsa::Queue& q, + hsa::rocprofiler_packet kern_pkt, + const hsa::Queue::queue_info_session_t& session, + inst_pkt_t& aql) { + post_kernel_call(*this, aql); + (void) session; + (void) kern_pkt; + (void) q; + }); + }); +} + +void +ThreadTracer::stop_context() +{ + client.wlock([&](auto& client_id) { + if(!client_id) return; + + // Remove our callbacks from HSA's queue controller + hsa::get_queue_controller()->remove_callback(*client_id); + client_id = std::nullopt; + }); +} + +void +ThreadTracer::resource_init(const hsa::AgentCache& cache, + const CoreApiTable& coreapi, + const AmdExtTable& ext) +{ + uint64_t agent = cache.get_hsa_agent().handle; + std::lock_guard lk(trace_resources_mut); + + if(agent_active_queues.find(agent) != agent_active_queues.end()) + { + agent_active_queues.at(agent).fetch_add(1); + return; + } + + auto factory = aql::ThreadTraceAQLPacketFactory(cache, this->params, coreapi, ext); + resources[agent] = factory.construct_packet(); + agent_active_queues[agent] = 1; +} + +void +ThreadTracer::resource_deinit(const hsa::AgentCache& cache) +{ + uint64_t agent = cache.get_hsa_agent().handle; + std::lock_guard lk(trace_resources_mut); + + try + { + if(agent_active_queues.at(agent).fetch_add(-1) > 1) return; + } catch(std::out_of_range&) + {} + + agent_active_queues.erase(agent); + resources.erase(agent); +} + +} // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/thread_trace/att_core.hpp b/source/lib/rocprofiler-sdk/thread_trace/att_core.hpp new file mode 100644 index 0000000000..0fa37010c3 --- /dev/null +++ b/source/lib/rocprofiler-sdk/thread_trace/att_core.hpp @@ -0,0 +1,75 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include "include/rocprofiler-sdk/thread_trace.h" + +namespace rocprofiler +{ +struct thread_trace_parameters +{ + rocprofiler_context_id_t context_id; + rocprofiler_att_dispatch_callback_t dispatch_cb_fn; + rocprofiler_att_shader_data_callback_t shader_cb_fn; + void* callback_userdata; + + // Parameters + rocprofiler_att_parameter_flag_t flags; + uint64_t buffer_size; + uint8_t target_cu; + uint8_t simd_select; + uint8_t reserved; + uint8_t vmid_mask; + uint16_t perfcounter_mask; + uint8_t perfcounter_ctrl; + uint64_t shader_engine_mask; + + // GFX9 Only + std::vector perfcounters; +}; + +namespace hsa +{ +class AQLPacket; +}; + +class ThreadTracer +{ +public: + ThreadTracer(std::shared_ptr& _params) + : params(_params){}; + virtual void start_context(); + virtual void stop_context(); + virtual void resource_init(const hsa::AgentCache&, const CoreApiTable&, const AmdExtTable&); + virtual void resource_deinit(const hsa::AgentCache&); + virtual ~ThreadTracer() = default; + + std::shared_ptr params; + std::mutex trace_resources_mut; + std::unordered_map> resources; + std::unordered_map> agent_active_queues; +}; // namespace thread_trace + +} // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/thread_trace/att_service.cpp b/source/lib/rocprofiler-sdk/thread_trace/att_service.cpp new file mode 100644 index 0000000000..eb5b209ab2 --- /dev/null +++ b/source/lib/rocprofiler-sdk/thread_trace/att_service.cpp @@ -0,0 +1,81 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include + +#include "lib/rocprofiler-sdk/aql/helpers.hpp" +#include "lib/rocprofiler-sdk/context/context.hpp" +#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" + +extern "C" { +/** + * @brief Configure buffered dispatch profile Counting Service. + * Collects the counters in dispatch packets and stores them + * in buffer_id. The buffer may contain packets from more than + * one dispatch (denoted by correlation id). Will trigger the + * callback based on the parameters setup in buffer_id_t. + * + * @param [in] context_id context id + * @param [in] buffer_id id of the buffer to use for the counting service + * @param [in] profile profile config to use for dispatch + * @return ::rocprofiler_status_t + */ +rocprofiler_status_t ROCPROFILER_API +rocprofiler_configure_thread_trace_service(rocprofiler_context_id_t context_id, + rocprofiler_att_parameters_t parameters, + rocprofiler_att_dispatch_callback_t dispatch_callback, + rocprofiler_att_shader_data_callback_t shader_callback, + void* callback_userdata) +{ + auto* ctx = rocprofiler::context::get_mutable_registered_context(context_id); + if(!ctx) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED; + if(ctx->thread_trace) return ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED; + if(parameters.flags.raw != 0) return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; + + auto thread_tracer = std::make_shared(); + + thread_tracer->context_id = context_id; + thread_tracer->dispatch_cb_fn = dispatch_callback; + thread_tracer->shader_cb_fn = shader_callback; + thread_tracer->callback_userdata = callback_userdata; + + thread_tracer->flags = parameters.flags; + thread_tracer->buffer_size = parameters.buffer_size; + thread_tracer->target_cu = parameters.target_cu; + thread_tracer->simd_select = parameters.simd_select; + thread_tracer->vmid_mask = parameters.vmid_mask; + + thread_tracer->perfcounter_mask = parameters.perfcounter_mask; + thread_tracer->perfcounter_ctrl = parameters.perfcounter_ctrl; + + for(int i = 0; i < parameters.perfcounter_num; i++) + thread_tracer->perfcounters.emplace_back(parameters.perfcounter[i]); + + thread_tracer->shader_engine_mask = 0; + for(int i = 0; i < parameters.shader_num; i++) + thread_tracer->shader_engine_mask |= 1ul << parameters.shader_ids[i]; + + ctx->thread_trace = std::make_shared(thread_tracer); + + return ROCPROFILER_STATUS_SUCCESS; +} +} diff --git a/tests/bin/reproducible-runtime/reproducible-runtime.cpp b/tests/bin/reproducible-runtime/reproducible-runtime.cpp index 7926c8b312..82ef726fd0 100644 --- a/tests/bin/reproducible-runtime/reproducible-runtime.cpp +++ b/tests/bin/reproducible-runtime/reproducible-runtime.cpp @@ -127,9 +127,8 @@ run(int tid, int devid) { auto roctx_range_id = roctxRangeStart("run"); - constexpr int min_sa = 8; - constexpr int min_avail_simd = 24; - dim3 grid(min_sa * min_avail_simd); + constexpr int min_avail_simd = 128; + dim3 grid(min_avail_simd); dim3 block(32); double time = 0.0; hipStream_t stream = {};