diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index b2f2e9633a..1790a85cde 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -47,3 +47,4 @@ add_subdirectory(code_object_isa_decode) add_subdirectory(external_correlation_id_request) add_subdirectory(pc_sampling) add_subdirectory(openmp_target) +add_subdirectory(thread_trace) diff --git a/samples/thread_trace/CMakeLists.txt b/samples/thread_trace/CMakeLists.txt new file mode 100644 index 0000000000..0c9c805631 --- /dev/null +++ b/samples/thread_trace/CMakeLists.txt @@ -0,0 +1,79 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-sdk-samples-thread-trace LANGUAGES CXX HIP) + +find_package(rocprofiler-sdk REQUIRED) +find_package(rocprofiler-sdk-roctx REQUIRED) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +set_source_files_properties( + main.cpp PROPERTIES LANGUAGE HIP COMPILE_FLAGS "${CMAKE_HIP_FLAGS_RELWITHDEBINFO}") + +add_executable(thread-trace-sample) +target_sources(thread-trace-sample PRIVATE main.cpp agent.cpp) + +target_link_libraries( + thread-trace-sample + PUBLIC rocprofiler-sdk::samples-build-flags + PRIVATE rocprofiler-sdk::rocprofiler-sdk + rocprofiler-sdk::samples-common-library + rocprofiler-sdk::rocprofiler-sdk-dw + rocprofiler-sdk-roctx::rocprofiler-sdk-roctx + rocprofiler-sdk::rocprofiler-sdk-amd-comgr) + +add_test(NAME thread-trace-sample COMMAND $) + +find_library( + attdecoder_LIBRARY + NAMES rocprof-trace-decoder + HINTS ${ROCM_PATH} + PATHS ${ROCM_PATH} + PATH_SUFFIXES lib) + +if(attdecoder_LIBRARY) + cmake_path(GET attdecoder_LIBRARY PARENT_PATH attdecoder_LIB_DIR) +endif() + +find_package_handle_standard_args(attdecoder REQUIRED_VARS attdecoder_LIB_DIR + attdecoder_LIBRARY) + +set(IS_DISABLED ON) +if(attdecoder_FOUND) + set(IS_DISABLED OFF) +endif() + +rocprofiler_samples_get_preload_env(PRELOAD_ENV) +list(APPEND PRELOAD_ENV "ROCPROFILER_TRACE_DECODER_LIB_PATH=${attdecoder_LIB_DIR}") + +set_tests_properties( + thread-trace-sample PROPERTIES TIMEOUT 60 ENVIRONMENT ${PRELOAD_ENV} LABELS + "samples;thread-trace" DISABLED ${IS_DISABLED}) diff --git a/samples/thread_trace/README.md b/samples/thread_trace/README.md new file mode 100644 index 0000000000..48528a75a0 --- /dev/null +++ b/samples/thread_trace/README.md @@ -0,0 +1,26 @@ +# Thread Trace and ROCprof Trace Decoder + +## Services + +- Thread trace in device profiling mode +- ROCprof Trace Decoder decodes the received thread trace data +- Thread trace start/stop using roctx + +## Properties + +### [agent.cpp](agent.cpp): + +- Configures thread trace in all GPU agents found with `rocprofiler_configure_device_thread_trace_service` +- Waits until `roctxProfilerResume` is called to start thread trace +- Stops tracing at `roctxProfilerPause` +- Receives the trace data in `shader_data_callback` and calls `rocprofiler_trace_decode` to decode the data +- `rocprofiler_trace_decode` calls `parse` (a lambda) +- `parse` receives the dedecoded data and increments hitcount/latencies by pc address +- At application end, `tool_fini` calls `gen_output_stream` to write the top hotspots into `thread_trace.log` + +### [main.cpp](main.cpp): + +- Defines a few different kernels and runs them +- The first loop iteration warms up the kernels +- The second iteration calls `roctxProfilerResume` to start thread trace +- After the loop ends, `roctxProfilerPause` is called to stop tracing diff --git a/samples/thread_trace/agent.cpp b/samples/thread_trace/agent.cpp new file mode 100644 index 0000000000..e8d9240e29 --- /dev/null +++ b/samples/thread_trace/agent.cpp @@ -0,0 +1,391 @@ +// MIT License +// +// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// undefine NDEBUG so asserts are implemented +#ifdef NDEBUG +# undef NDEBUG +#endif + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define ROCPROFILER_CALL(result, msg) \ + if(auto ec = (result); ec != ROCPROFILER_STATUS_SUCCESS) \ + { \ + std::cerr << "rocprofiler-sdk error at " << __FILE__ << ":" << __LINE__ \ + << " :: " << #result << std::endl; \ + std::cerr << "rocprofiler-sdk error code " << ec << ": " \ + << rocprofiler_get_status_string(ec) << " :: " << msg << std::endl; \ + abort(); \ + } + +#define DECODER_CALL(result) \ + if(auto ec = (result); ec != ROCPROFILER_STATUS_SUCCESS) \ + { \ + std::cerr << "Decoder error at " << __FILE__ << ":" << __LINE__ << std::endl; \ + std::cerr << "rocprofiler-sdk error code " << ec << ": " \ + << rocprofiler_get_status_string(ec) << std::endl; \ + } + +#define CHECK_NOTNULL(x) \ + if(!(x)) \ + { \ + abort(); \ + }; + +namespace +{ +constexpr uint64_t TARGET_CU = 1; // CU (gfx9) or WGP (gfx10+) +constexpr uint64_t SHADER_MASK = 0x1; // Only enable SE=0 +constexpr uint64_t BUFFER_SIZE = 0x10000000; // 256MB +}; // namespace + +namespace Results +{ +using pcinfo_t = rocprofiler_thread_trace_decoder_pc_t; + +struct address_sort_t +{ + bool operator()(const pcinfo_t& a, const pcinfo_t& b) const + { + if(a.marker_id == b.marker_id) return a.addr < b.addr; + return a.marker_id < b.marker_id; + } +}; + +struct Latency +{ + uint64_t latency{0}; + uint64_t hitcount{0}; +}; + +// Maps address to latency +using LatencyTable = std::map; +// Used to disassemble instructions at (id, vaddr) pair +using AddressTable = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate; + +AddressTable* table{nullptr}; +LatencyTable* latencies{nullptr}; + +// used to calculate mean wave lifetime +int64_t wave_lifetime = 0; +int64_t waves_started = 0; +int64_t waves_ended = 0; + +void +gen_output_stream() +{ + CHECK_NOTNULL(Results::latencies); + CHECK_NOTNULL(Results::table); + + const char* OUTPUT_OFSTREAM = "thread_trace.log"; + std::ofstream file(OUTPUT_OFSTREAM); + + if(!file.is_open()) + std::cout << "Could not open log file: " << OUTPUT_OFSTREAM << ", writing to stdout\n"; + else + std::cout << "Writing log to: " << OUTPUT_OFSTREAM << std::endl; + + std::ostream& output = file.is_open() ? file : std::cout; + + // Sort map by instruction cost + using Element = std::pair; + + std::vector sorted(latencies->begin(), latencies->end()); + std::stable_sort(sorted.begin(), sorted.end(), [](const Element& a, const Element& b) { + return a.second.latency > b.second.latency; + }); + + output << "Top 50 hotspots for trace (cycles):\n"; + for(size_t i = 0; i < sorted.size() && i < 50; i++) + { + auto& addr = sorted.at(i).first; + auto& latency = sorted.at(i).second; + auto inst = table->get(addr.marker_id, addr.addr); + + auto comment = inst->comment; + size_t pos = comment.rfind('/'); + if(pos != std::string::npos && pos + 1 < comment.size()) comment = comment.substr(pos + 1); + + output << "Latency:" << latency.latency << "\tHit:" << latency.hitcount << " \t" + << inst->inst << " [" << comment << "]\n"; + } + + if(waves_started != waves_ended) + std::cerr << "Error: Some waves have not ended!" << std::endl; + else if(waves_started == 0) + std::cerr << "Error: No waves started!" << std::endl; + else + output << "\nMean wave lifetime: " << wave_lifetime / waves_started << " cycles"; + + output << "\nWaves started: " << waves_started << "\nWaves ended: " << waves_ended << "\n"; +}; +} // namespace Results + +namespace Decoder +{ +rocprofiler_thread_trace_decoder_id_t decoder{}; + +void +tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* /* user_data */, + void* /* userdata */) +{ + if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return; + if(record.operation != ROCPROFILER_CODE_OBJECT_LOAD) return; + + CHECK_NOTNULL(Results::table); + auto* data = static_cast(record.payload); + + if(data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE) + { + Results::table->addDecoder( + data->uri, data->code_object_id, data->load_delta, data->load_size); + return; + } + + auto* memorybase = reinterpret_cast(data->memory_base); + CHECK_NOTNULL(memorybase); + + DECODER_CALL(rocprofiler_thread_trace_decoder_codeobj_load(decoder, + data->code_object_id, + data->load_delta, + data->load_size, + memorybase, + data->memory_size)); + + Results::table->addDecoder( + memorybase, data->memory_size, data->code_object_id, data->load_delta, data->load_size); +} + +void +shader_data_callback(rocprofiler_agent_id_t /* agent */, + int64_t /* se_id */, + void* se_data, + size_t data_size, + rocprofiler_user_data_t /* userdata */) +{ + CHECK_NOTNULL(Results::latencies); + + auto parse = [](rocprofiler_thread_trace_decoder_record_type_t record_type_id, + void* events, + uint64_t num_events, + void* /* userdata */) { + if(record_type_id == ROCPROFILER_THREAD_TRACE_DECODER_RECORD_OCCUPANCY) + { + for(size_t i = 0; i < num_events; i++) + { + auto& event = static_cast(events)[i]; + + if(event.start) + { + Results::wave_lifetime -= static_cast(event.time); + Results::waves_started++; + } + else + { + Results::wave_lifetime += static_cast(event.time); + Results::waves_ended++; + } + } + } + + if(record_type_id != ROCPROFILER_THREAD_TRACE_DECODER_RECORD_WAVE) return; + + for(size_t w = 0; w < num_events; w++) + { + auto* wave = static_cast(events); + for(size_t i = 0; i < wave->instructions_size; i++) + { + auto& inst = wave->instructions_array[i]; + auto& latency = (*Results::latencies)[inst.pc]; + latency.latency += inst.duration; + latency.hitcount += 1; + } + } + }; + + DECODER_CALL(rocprofiler_trace_decode(decoder, parse, se_data, data_size, nullptr)); +} + +} // namespace Decoder + +namespace ThreadTracer +{ +rocprofiler_client_id_t* client_id = nullptr; +rocprofiler_context_id_t agent_ctx = {}; +rocprofiler_context_id_t tracing_ctx = {}; + +rocprofiler_status_t +query_available_agents(rocprofiler_agent_version_t /* version */, + const void** agents, + size_t num_agents, + void* user_data) +{ + rocprofiler_user_data_t user{}; + user.ptr = user_data; + + for(size_t idx = 0; idx < num_agents; idx++) + { + const auto* agent = static_cast(agents[idx]); + if(agent->type != ROCPROFILER_AGENT_TYPE_GPU) continue; + + auto parameters = std::vector{}; + parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_TARGET_CU, TARGET_CU}); + parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_BUFFER_SIZE, BUFFER_SIZE}); + parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SHADER_ENGINE_MASK, SHADER_MASK}); + + ROCPROFILER_CALL( + rocprofiler_configure_device_thread_trace_service(agent_ctx, + agent->id, + parameters.data(), + parameters.size(), + Decoder::shader_data_callback, + user), + "thread trace service configure"); + } + return ROCPROFILER_STATUS_SUCCESS; +} + +void +cntrl_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* /* user_data */, + void* /* cb_data */) +{ + if(record.kind != ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API) return; + + if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER && + record.operation == ROCPROFILER_MARKER_CONTROL_API_ID_roctxProfilerPause) + { + ROCPROFILER_CALL(rocprofiler_stop_context(agent_ctx), "stopping context"); + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT && + record.operation == ROCPROFILER_MARKER_CONTROL_API_ID_roctxProfilerResume) + { + ROCPROFILER_CALL(rocprofiler_start_context(agent_ctx), "starting context"); + } +} + +int +tool_init(rocprofiler_client_finalize_t /* fini_func */, void* /* tool_data */) +{ + Results::latencies = new Results::LatencyTable{}; + Results::table = new Results::AddressTable{}; + + // This is set by ctests: TODO: move to client.cpp + // If nullptr, searches rocprofiler-sdk install location + const char* lib_path = std::getenv("ROCPROFILER_TRACE_DECODER_LIB_PATH"); + + DECODER_CALL(rocprofiler_thread_trace_decoder_create(&Decoder::decoder, lib_path)); + + ROCPROFILER_CALL(rocprofiler_create_context(&tracing_ctx), "context creation"); + ROCPROFILER_CALL(rocprofiler_create_context(&agent_ctx), "context creation"); + + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service(tracing_ctx, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + nullptr, + 0, + Decoder::tool_codeobj_tracing_callback, + nullptr), + "code object tracing service configure"); + + ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( + tracing_ctx, + ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API, + nullptr, + 0, + cntrl_tracing_callback, + nullptr), + "marker tracing callback service configure"); + + ROCPROFILER_CALL(rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, + &query_available_agents, + sizeof(rocprofiler_agent_t), + nullptr), + "Failed to find GPU agents"); + + int valid_ctx = 0; + ROCPROFILER_CALL(rocprofiler_context_is_valid(agent_ctx, &valid_ctx), "validity check"); + assert(valid_ctx != 0); + ROCPROFILER_CALL(rocprofiler_context_is_valid(tracing_ctx, &valid_ctx), "validity check"); + assert(valid_ctx != 0); + + ROCPROFILER_CALL(rocprofiler_start_context(tracing_ctx), "context start"); + + // no errors + return 0; +} + +void +tool_fini(void* /* tool_data */) +{ + rocprofiler_thread_trace_decoder_destroy(Decoder::decoder); + + Results::gen_output_stream(); + + delete Results::latencies; + delete Results::table; +} + +} // namespace ThreadTracer + +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t /* version */, + const char* /* runtime_version */, + uint32_t priority, + rocprofiler_client_id_t* id) +{ + // only activate if main tool + if(priority > 0) return nullptr; + + // set the client name + id->name = "Thread Trace Sample"; + + // store client info + ThreadTracer::client_id = id; + + // create configure data + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &ThreadTracer::tool_init, + &ThreadTracer::tool_fini, + nullptr}; + + // return pointer to configure data + return &cfg; +} diff --git a/samples/thread_trace/main.cpp b/samples/thread_trace/main.cpp new file mode 100644 index 0000000000..e189b41472 --- /dev/null +++ b/samples/thread_trace/main.cpp @@ -0,0 +1,178 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// undefine NDEBUG so asserts are implemented +#ifdef NDEBUG +# undef NDEBUG +#endif + +#include +#include +#include +#include + +#include + +#include "hip/hip_runtime.h" + +// Two waves per SIMD on MI300 +#define DATA_SIZE (304 * 64 * 4 * 2) +#define HIP_API_CALL(CALL) \ + if((CALL) != hipSuccess) \ + { \ + abort(); \ + } + +#define LDS_SIZE 1024 + +__global__ void +divide_kernel(float* a, const float* b, const float* c, int /* unused */) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + + if(index >= DATA_SIZE) return; + + a[index] = (b[index] - c[index]) / abs(c[index] + b[index]) + 1; +} + +__global__ void +looping_lds_kernel(float* a, const float* b, const float* c, int loopcount) +{ + __shared__ float interm[LDS_SIZE]; + + size_t index = blockDim.x * blockIdx.x + threadIdx.x; + + for(size_t i = index; i < DATA_SIZE; i += blockDim.x * gridDim.x) + interm[threadIdx.x % LDS_SIZE] = b[index] + threadIdx.x; + + for(int it = 0; it < loopcount; it++) + { + __syncthreads(); + float value = interm[(it + threadIdx.x + LDS_SIZE / 2) % LDS_SIZE]; + __syncthreads(); + interm[threadIdx.x % LDS_SIZE] += value; + } + + a[index] = interm[threadIdx.x % LDS_SIZE] + c[index]; +} + +__global__ void +fifo_kernel(float* /* a */, const float* /* b */, const float* /* c */, int loops) +{ + using _float4 = __attribute__((__vector_size__(4 * sizeof(float)))) float; + + __shared__ _float4 lds[LDS_SIZE]; + lds[threadIdx.x] = _float4{float(threadIdx.x)}; + lds[threadIdx.x + 512] = _float4{float(threadIdx.x)}; + + __syncthreads(); + + _float4 dst[16]; + + float res1 = 0, res2 = 0; + + for(int l = 0; l < loops; l++) + { +#pragma unroll 16 + for(int i = 0; i < 16; i++) + dst[i] = lds[threadIdx.x + i * 8]; + + __syncthreads(); + +#pragma unroll 16 + for(int i = 0; i < 16; i++) + { + res1 += dst[i][0] + dst[i][1]; + res2 += dst[i][2] + dst[i][3]; + } + asm volatile("v_add_f32 %0, %1, %2" : "=v"(res1) : "v"(res1), "v"(res2)); + } +}; + +class hipMemory +{ +public: + hipMemory(size_t size = DATA_SIZE) + { + HIP_API_CALL(hipMalloc(&ptr, size * sizeof(float))); + HIP_API_CALL(hipMemset(ptr, 0, size * sizeof(float))); + } + ~hipMemory() + { + if(ptr) HIP_API_CALL(hipFree(ptr)); + } + hipMemory(hipMemory&& other) + { + ptr = other.ptr; + other.ptr = nullptr; + } + float* ptr = nullptr; +}; + +class HipStream +{ +public: + HipStream() { HIP_API_CALL(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); } + ~HipStream() { HIP_API_CALL(hipStreamDestroy(stream)); } + + hipStream_t stream; + + hipMemory src1{}; + hipMemory src2{}; + hipMemory dst{}; +}; + +#define Launch(kernel, stream, arglast) \ + hipLaunchKernelGGL( \ + kernel, DATA_SIZE / 512, 512, 0, 0, stream.dst.ptr, stream.src1.ptr, stream.src2.ptr, 6); + +int +main(int /*argc*/, char** /*argv*/) +{ + std::array streams{}; + std::vector kernels{}; + + kernels.push_back(divide_kernel); + kernels.push_back(looping_lds_kernel); + kernels.push_back(fifo_kernel); + + for(size_t i = 0; i < streams.size() * kernels.size(); i++) + { + // Warmup then start + if(i == streams.size()) + { + HIP_API_CALL(hipDeviceSynchronize()); + roctxProfilerResume(0); + } + + auto& stream = streams.at(i % streams.size()); + auto& kernel = kernels.at(i % kernels.size()); + + Launch(kernel, stream, 3); + HIP_API_CALL(hipGetLastError()); + } + + HIP_API_CALL(hipDeviceSynchronize()); + roctxProfilerPause(0); + + return 0; +} diff --git a/source/docs/api-reference/thread_trace.rst b/source/docs/api-reference/thread_trace.rst index 78e1343076..6531fb9fdd 100644 --- a/source/docs/api-reference/thread_trace.rst +++ b/source/docs/api-reference/thread_trace.rst @@ -188,7 +188,7 @@ To decode the raw thread trace data, create and initialize a Trace Decoder: .. code-block:: cpp - rocprofiler_thread_trace_decoder_handle_t decoder{}; + rocprofiler_thread_trace_decoder_id_t decoder{}; // Create the Trace Decoder with the path to the decoder library ROCPROFILER_CALL( diff --git a/source/include/rocprofiler-sdk/cxx/hash.hpp b/source/include/rocprofiler-sdk/cxx/hash.hpp index 6ba6eade8f..200a48f26f 100644 --- a/source/include/rocprofiler-sdk/cxx/hash.hpp +++ b/source/include/rocprofiler-sdk/cxx/hash.hpp @@ -23,6 +23,7 @@ #pragma once +#include #include #include #include @@ -70,6 +71,7 @@ ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_signal_t) ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_executable_t) ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_region_t) ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_amd_memory_pool_t) +ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(rocprofiler_thread_trace_decoder_id_t) #undef ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER } // namespace std diff --git a/source/include/rocprofiler-sdk/cxx/operators.hpp b/source/include/rocprofiler-sdk/cxx/operators.hpp index 2eaecd4c49..d09956e124 100644 --- a/source/include/rocprofiler-sdk/cxx/operators.hpp +++ b/source/include/rocprofiler-sdk/cxx/operators.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -148,6 +149,7 @@ ROCPROFILER_CXX_DECLARE_OPERATORS(const rocprofiler_counter_record_dimension_inf ROCPROFILER_CXX_DECLARE_OPERATORS(const rocprofiler_counter_record_dimension_instance_info_t&) ROCPROFILER_CXX_DECLARE_OPERATORS(const rocprofiler_counter_dimension_info_t&) ROCPROFILER_CXX_DECLARE_OPERATORS(rocprofiler_version_triplet_t) +ROCPROFILER_CXX_DECLARE_OPERATORS(rocprofiler_thread_trace_decoder_id_t) // definitions of operator== ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(rocprofiler_context_id_t) @@ -164,6 +166,7 @@ ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_signal_t) ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_executable_t) ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_region_t) ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_amd_memory_pool_t) +ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(rocprofiler_thread_trace_decoder_id_t) inline bool operator==(const rocprofiler_agent_v0_t& lhs, const rocprofiler_agent_v0_t& rhs) @@ -241,6 +244,7 @@ ROCPROFILER_CXX_DEFINE_NE_OPERATOR(rocprofiler_dim3_t) ROCPROFILER_CXX_DEFINE_NE_OPERATOR(hsa_region_t) ROCPROFILER_CXX_DEFINE_NE_OPERATOR(hsa_amd_memory_pool_t) ROCPROFILER_CXX_DEFINE_NE_OPERATOR(rocprofiler_version_triplet_t) +ROCPROFILER_CXX_DEFINE_NE_OPERATOR(rocprofiler_thread_trace_decoder_id_t) // definitions of operator< ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(rocprofiler_context_id_t) @@ -257,6 +261,7 @@ ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_signal_t) ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_executable_t) ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_region_t) ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_amd_memory_pool_t) +ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(rocprofiler_thread_trace_decoder_id_t) inline bool operator<(const rocprofiler_counter_record_dimension_info_t& lhs, @@ -334,6 +339,7 @@ ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(rocprofiler_dim3_t) ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(hsa_region_t) ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(hsa_amd_memory_pool_t) ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(rocprofiler_version_triplet_t) +ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(rocprofiler_thread_trace_decoder_id_t) // cleanup defines #undef ROCPROFILER_CXX_DECLARE_OPERATORS diff --git a/source/include/rocprofiler-sdk/experimental/thread-trace/trace_decoder.h b/source/include/rocprofiler-sdk/experimental/thread-trace/trace_decoder.h index 746e21278b..b2e34f2c1e 100644 --- a/source/include/rocprofiler-sdk/experimental/thread-trace/trace_decoder.h +++ b/source/include/rocprofiler-sdk/experimental/thread-trace/trace_decoder.h @@ -36,10 +36,10 @@ ROCPROFILER_EXTERN_C_INIT /** * @brief Handle containing a loaded rocprof-trace-decoder and a decoder state. */ -typedef struct rocprofiler_thread_trace_decoder_handle_t +typedef struct rocprofiler_thread_trace_decoder_id_t { uint64_t handle; -} rocprofiler_thread_trace_decoder_handle_t; +} rocprofiler_thread_trace_decoder_id_t; /** * @brief Initializes Trace Decoder library with a library search path @@ -51,7 +51,7 @@ typedef struct rocprofiler_thread_trace_decoder_handle_t * @retval ::ROCPROFILER_STATUS_SUCCESS Handle created */ rocprofiler_status_t -rocprofiler_thread_trace_decoder_create(rocprofiler_thread_trace_decoder_handle_t* handle, +rocprofiler_thread_trace_decoder_create(rocprofiler_thread_trace_decoder_id_t* handle, const char* path) ROCPROFILER_API ROCPROFILER_NONNULL(1, 2); /** @@ -59,7 +59,7 @@ rocprofiler_thread_trace_decoder_create(rocprofiler_thread_trace_decoder_handle_ * @param[in] handle Handle to destroy */ void -rocprofiler_thread_trace_decoder_destroy(rocprofiler_thread_trace_decoder_handle_t handle) +rocprofiler_thread_trace_decoder_destroy(rocprofiler_thread_trace_decoder_id_t handle) ROCPROFILER_API; /** @@ -83,11 +83,11 @@ rocprofiler_thread_trace_decoder_destroy(rocprofiler_thread_trace_decoder_handle * @retval ::ROCPROFILER_STATUS_SUCCESS Code object loaded */ rocprofiler_status_t -rocprofiler_thread_trace_decoder_codeobj_load(rocprofiler_thread_trace_decoder_handle_t handle, - uint64_t load_id, - uint64_t load_addr, - uint64_t load_size, - const void* data, +rocprofiler_thread_trace_decoder_codeobj_load(rocprofiler_thread_trace_decoder_id_t handle, + uint64_t load_id, + uint64_t load_addr, + uint64_t load_size, + const void* data, uint64_t size) ROCPROFILER_API ROCPROFILER_NONNULL(5); /** @@ -100,7 +100,7 @@ rocprofiler_thread_trace_decoder_codeobj_load(rocprofiler_thread_trace_decoder_h * @retval ::ROCPROFILER_STATUS_SUCCESS Code object unloaded */ rocprofiler_status_t -rocprofiler_thread_trace_decoder_codeobj_unload(rocprofiler_thread_trace_decoder_handle_t handle, +rocprofiler_thread_trace_decoder_codeobj_unload(rocprofiler_thread_trace_decoder_id_t handle, uint64_t load_id) ROCPROFILER_API; /** @@ -135,7 +135,7 @@ typedef void (*rocprofiler_thread_trace_decoder_callback_t)( * @retval ::ROCPROFILER_STATUS_SUCCESS on success */ rocprofiler_status_t -rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_handle_t handle, +rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_id_t handle, rocprofiler_thread_trace_decoder_callback_t callback, void* data, uint64_t size, @@ -148,8 +148,8 @@ rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_handle_t handle, * @retval null terminated string as description of "info". */ const char* -rocprofiler_thread_trace_decoder_info_string(rocprofiler_thread_trace_decoder_handle_t handle, - rocprofiler_thread_trace_decoder_info_t info) +rocprofiler_thread_trace_decoder_info_string(rocprofiler_thread_trace_decoder_id_t handle, + rocprofiler_thread_trace_decoder_info_t info) ROCPROFILER_API; /** @} */ diff --git a/source/lib/att-tool/att_lib_wrapper.cpp b/source/lib/att-tool/att_lib_wrapper.cpp index e2bf7842c0..d07800b36f 100644 --- a/source/lib/att-tool/att_lib_wrapper.cpp +++ b/source/lib/att-tool/att_lib_wrapper.cpp @@ -41,9 +41,9 @@ namespace rocprofiler { namespace att_wrapper { -ATTFileMgr::ATTFileMgr(Fspath _dir, - std::vector _counters, - rocprofiler_thread_trace_decoder_handle_t _decoder) +ATTFileMgr::ATTFileMgr(Fspath _dir, + std::vector _counters, + rocprofiler_thread_trace_decoder_id_t _decoder) : dir(std::move(_dir)) , decoder(_decoder) { diff --git a/source/lib/att-tool/att_lib_wrapper.hpp b/source/lib/att-tool/att_lib_wrapper.hpp index 58e3f6deb6..b5a8dcb493 100644 --- a/source/lib/att-tool/att_lib_wrapper.hpp +++ b/source/lib/att-tool/att_lib_wrapper.hpp @@ -71,7 +71,7 @@ public: bool valid() const; protected: - rocprofiler_thread_trace_decoder_handle_t decoder{}; + rocprofiler_thread_trace_decoder_id_t decoder{}; }; class ATTFileMgr @@ -79,9 +79,9 @@ class ATTFileMgr using AddressTable = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate; public: - ATTFileMgr(Fspath _dir, - std::vector _counters, - rocprofiler_thread_trace_decoder_handle_t _decoder); + ATTFileMgr(Fspath _dir, + std::vector _counters, + rocprofiler_thread_trace_decoder_id_t _decoder); ~ATTFileMgr(); void addDecoder(const char* filepath, uint64_t id, uint64_t load_addr, uint64_t memsize); @@ -95,7 +95,7 @@ public: std::shared_ptr table{nullptr}; std::map> occupancy{}; std::vector codeobjs_to_delete{}; - rocprofiler_thread_trace_decoder_handle_t decoder{}; + rocprofiler_thread_trace_decoder_id_t decoder{}; std::array, ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_LAST> wstates; diff --git a/source/lib/att-tool/profile_interface.cpp b/source/lib/att-tool/profile_interface.cpp index f20348617d..1d50c422df 100644 --- a/source/lib/att-tool/profile_interface.cpp +++ b/source/lib/att-tool/profile_interface.cpp @@ -103,9 +103,9 @@ get_trace_data(rocprofiler_thread_trace_decoder_record_type_t trace_id, C_API_END } -ToolData::ToolData(std::vector& _data, - WaveConfig& _config, - rocprofiler_thread_trace_decoder_handle_t _decoder) +ToolData::ToolData(std::vector& _data, + WaveConfig& _config, + rocprofiler_thread_trace_decoder_id_t _decoder) : cfile(_config.code) , config(_config) , decoder(_decoder) diff --git a/source/lib/att-tool/profile_interface.hpp b/source/lib/att-tool/profile_interface.hpp index dc08f1c28d..54d38835b1 100644 --- a/source/lib/att-tool/profile_interface.hpp +++ b/source/lib/att-tool/profile_interface.hpp @@ -60,9 +60,9 @@ using SymbolInfo = rocprofiler::sdk::codeobj::disassembly::SymbolInfo; struct ToolData { - ToolData(std::vector& data, - WaveConfig& config, - rocprofiler_thread_trace_decoder_handle_t decoder); + ToolData(std::vector& data, + WaveConfig& config, + rocprofiler_thread_trace_decoder_id_t decoder); ~ToolData(); CodeLine& get(pcinfo_t pc); @@ -73,7 +73,7 @@ struct ToolData std::vector shader_data{}; size_t num_waves = 0; - rocprofiler_thread_trace_decoder_handle_t decoder{}; + rocprofiler_thread_trace_decoder_id_t decoder{}; }; } // namespace att_wrapper diff --git a/source/lib/rocprofiler-sdk/thread_trace/decode.cpp b/source/lib/rocprofiler-sdk/thread_trace/decode.cpp index 16ef7bfd15..f0e0980d7e 100644 --- a/source/lib/rocprofiler-sdk/thread_trace/decode.cpp +++ b/source/lib/rocprofiler-sdk/thread_trace/decode.cpp @@ -21,6 +21,7 @@ // SOFTWARE. #include "lib/common/static_object.hpp" +#include "lib/common/synchronized.hpp" #include "lib/rocprofiler-sdk/aql/helpers.hpp" #include "lib/rocprofiler-sdk/context/context.hpp" #include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" @@ -29,15 +30,19 @@ #include #include +#include +#include #include +#include #include namespace { using DL = rocprofiler::thread_trace::DL; using AddressTable = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate; +using LockedTable = rocprofiler::common::Synchronized; class DecoderInstance { @@ -46,72 +51,72 @@ public: : dl(std::move(_dl)) {} - std::unique_ptr
dl{nullptr}; - AddressTable table{}; + const std::unique_ptr dl{nullptr}; + + LockedTable table{}; }; -std::mutex map_mut; +using DecoderMap = + std::unordered_map>; +using LockedMap = rocprofiler::common::Synchronized; auto& -get_dlopens() +get_dlmap() { - static auto*& _v = rocprofiler::common::static_object< - std::unordered_map>>::construct(); + static auto*& _v = rocprofiler::common::static_object::construct(); return *CHECK_NOTNULL(_v); } std::shared_ptr -get_dl(rocprofiler_thread_trace_decoder_handle_t handle) +get_dl(rocprofiler_thread_trace_decoder_id_t handle) { - auto lk = std::unique_lock{map_mut}; - auto it = get_dlopens().find(handle.handle); - if(it == get_dlopens().end()) return nullptr; - - return it->second; + return get_dlmap().rlock([&](const DecoderMap& map) -> std::shared_ptr { + if(auto it = map.find(handle); it != map.end()) return it->second; + return nullptr; + }); } } // namespace extern "C" { rocprofiler_status_t -rocprofiler_thread_trace_decoder_create(rocprofiler_thread_trace_decoder_handle_t* handle, - const char* path) +rocprofiler_thread_trace_decoder_create(rocprofiler_thread_trace_decoder_id_t* handle, + const char* path) { auto dl = std::make_unique
(path); if(dl->handle == nullptr) return ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE; if(!dl->valid()) return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI; - auto lk = std::unique_lock{map_mut}; - static uint64_t count = 1; + static std::atomic count{1}; + handle->handle = count.fetch_add(1); - auto instance = std::make_shared(std::move(dl)); - - handle->handle = count++; - get_dlopens()[handle->handle] = std::move(instance); + get_dlmap().wlock( + [&](DecoderMap& map) { map[*handle] = std::make_shared(std::move(dl)); }); return ROCPROFILER_STATUS_SUCCESS; } void -rocprofiler_thread_trace_decoder_destroy(rocprofiler_thread_trace_decoder_handle_t handle) +rocprofiler_thread_trace_decoder_destroy(rocprofiler_thread_trace_decoder_id_t handle) { - auto lk = std::unique_lock{map_mut}; - get_dlopens().erase(handle.handle); + get_dlmap().wlock([&](DecoderMap& map) { map.erase(handle); }); } rocprofiler_status_t -rocprofiler_thread_trace_decoder_codeobj_load(rocprofiler_thread_trace_decoder_handle_t handle, - uint64_t load_id, - uint64_t load_addr, - uint64_t load_size, - const void* data, - uint64_t size) +rocprofiler_thread_trace_decoder_codeobj_load(rocprofiler_thread_trace_decoder_id_t handle, + uint64_t load_id, + uint64_t load_addr, + uint64_t load_size, + const void* data, + uint64_t size) { auto decoder = get_dl(handle); if(decoder == nullptr) return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; try { - decoder->table.addDecoder(data, size, load_id, load_addr, load_size); + decoder->table.wlock([&](AddressTable& table) { + table.addDecoder(data, size, load_id, load_addr, load_size); + }); } catch(...) { return ROCPROFILER_STATUS_ERROR; @@ -120,15 +125,17 @@ rocprofiler_thread_trace_decoder_codeobj_load(rocprofiler_thread_trace_decoder_h } rocprofiler_status_t -rocprofiler_thread_trace_decoder_codeobj_unload(rocprofiler_thread_trace_decoder_handle_t handle, - uint64_t load_id) +rocprofiler_thread_trace_decoder_codeobj_unload(rocprofiler_thread_trace_decoder_id_t handle, + uint64_t load_id) { auto decoder = get_dl(handle); if(decoder == nullptr) return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; try { - if(decoder->table.removeDecoder(load_id)) return ROCPROFILER_STATUS_SUCCESS; + bool result = + decoder->table.wlock([&](AddressTable& table) { return table.removeDecoder(load_id); }); + if(result) return ROCPROFILER_STATUS_SUCCESS; } catch(std::exception&) {} @@ -169,32 +176,32 @@ isa_callback(char* isa_instruction, void* userdata) { ROCP_FATAL_IF(userdata == nullptr) << "Userdata is null!"; - auto& table = static_cast(userdata)->decoder->table; - - std::unique_ptr instruction{nullptr}; + auto decoder = static_cast(userdata)->decoder; + ROCP_FATAL_IF(decoder == nullptr) << "decoder is null"; try { - instruction = table.get(pc.marker_id, pc.addr); + auto instruction = decoder->table.wlock( + [&](AddressTable& table) { return table.get(pc.marker_id, pc.addr); }); + + if(!instruction) return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_ARGUMENT; + + { + size_t tmp_isa_size = *isa_size; + *isa_size = instruction->inst.size(); + + if(*isa_size > tmp_isa_size) + return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_OUT_OF_RESOURCES; + } + + memcpy(isa_instruction, instruction->inst.data(), *isa_size); + *isa_memory_size = instruction->size; + } catch(std::exception& e) { - ROCP_WARNING << pc.marker_id << ":" << pc.addr << ' ' << e.what(); + ROCP_CI_LOG(INFO) << pc.marker_id << ":" << pc.addr << ' ' << e.what(); return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR; } - - if(!instruction) return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_ARGUMENT; - - { - size_t tmp_isa_size = *isa_size; - *isa_size = instruction->inst.size(); - - if(*isa_size > tmp_isa_size) - return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_OUT_OF_RESOURCES; - } - - memcpy(isa_instruction, instruction->inst.data(), *isa_size); - *isa_memory_size = instruction->size; - return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS; } @@ -214,7 +221,7 @@ trace_callback(rocprofiler_thread_trace_decoder_record_type_t record_type_id, extern "C" { rocprofiler_status_t -rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_handle_t handle, +rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_id_t handle, rocprofiler_thread_trace_decoder_callback_t user_callback, void* data, uint64_t size, @@ -249,8 +256,8 @@ rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_handle_t handle, } const char* -rocprofiler_thread_trace_decoder_info_string(rocprofiler_thread_trace_decoder_handle_t handle, - rocprofiler_thread_trace_decoder_info_t info) +rocprofiler_thread_trace_decoder_info_string(rocprofiler_thread_trace_decoder_id_t handle, + rocprofiler_thread_trace_decoder_info_t info) { auto decoder = get_dl(handle); if(decoder == nullptr) return nullptr; diff --git a/tests/rocprofv3/advanced-thread-trace/CMakeLists.txt b/tests/rocprofv3/advanced-thread-trace/CMakeLists.txt index d48dc04315..7a938b5dde 100644 --- a/tests/rocprofv3/advanced-thread-trace/CMakeLists.txt +++ b/tests/rocprofv3/advanced-thread-trace/CMakeLists.txt @@ -42,8 +42,6 @@ rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.p find_package(rocprofiler-sdk REQUIRED) -set(IS_DISABLED ON) - find_library( attdecoder_LIBRARY NAMES rocprof-trace-decoder @@ -58,43 +56,43 @@ endif() find_package_handle_standard_args(attdecoder REQUIRED_VARS attdecoder_LIB_DIR attdecoder_LIBRARY) +set(COMMON_PARAMS_NO_LIB ${PRELOAD_ARGS} --att --output-format json --log-level env -d + ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/) + +set(IS_DISABLED ON) if(attdecoder_FOUND) set(IS_DISABLED OFF) - set(LIB_PATH_ENV "ROCPROF_ATT_LIBRARY_PATH=${attdecoder_LIB_DIR}") + set(COMMON_PARAMS --att-library-path ${attdecoder_LIB_DIR} ${COMMON_PARAMS_NO_LIB}) endif() # hsa multiqueue dependency test with lib path add_test( - NAME rocprofv3-test-hsa-multiqueue-att-cmd-env-att-lib-path-execute + NAME rocprofv3-test-att-hsa-multiqueue-cmd-env-att-lib-path-execute COMMAND - $ --log-level env --advanced-thread-trace - 1 --att-target-cu 1 --att-shader-engine-mask 0x11 --kernel-include-regex copyD - --att-buffer-size 0x6000000 --att-simd-select 0x3 --att-serialize-all 1 -d - ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o out --output-format json - ${PRELOAD_ARGS} --att-library-path ${attdecoder_LIB_DIR} --att-activity 8 -- + $ ${COMMON_PARAMS}/cmd_input + --att-target-cu 1 --att-shader-engine-mask 0x11 --kernel-include-regex copyD + --att-buffer-size 0x6000000 --att-simd-select 0x3 --att-serialize-all 1 -o out -- $) set_tests_properties( - rocprofv3-test-hsa-multiqueue-att-cmd-env-att-lib-path-execute + rocprofv3-test-att-hsa-multiqueue-cmd-env-att-lib-path-execute PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED}) # hsa multiqueue dependency test with json input add_test( - NAME rocprofv3-test-hsa-multiqueue-att-json-execute + NAME rocprofv3-test-att-hsa-multiqueue-json-execute COMMAND - $ --log-level env --att-library-path - ${attdecoder_LIB_DIR} -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/json_input -i - ${CMAKE_CURRENT_BINARY_DIR}/att_input.json ${PRELOAD_ARGS} -- + $ ${COMMON_PARAMS}/json_input -i + ${CMAKE_CURRENT_BINARY_DIR}/att_input.json -- $) set_tests_properties( - rocprofv3-test-hsa-multiqueue-att-json-execute - PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED} ENVIRONMENT - "${LIB_PATH_ENV}") + rocprofv3-test-att-hsa-multiqueue-json-execute + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED}) # validate output add_test( - NAME rocprofv3-test-hsa-multiqueue-att-cmd-validate + NAME rocprofv3-test-att-hsa-multiqueue-cmd-validate COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input ${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input/out_results.json @@ -102,7 +100,7 @@ add_test( ${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input) add_test( - NAME rocprofv3-test-hsa-multiqueue-att-json-validate + NAME rocprofv3-test-att-hsa-multiqueue-json-validate COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input ${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input/out_results.json @@ -110,46 +108,46 @@ add_test( ${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input) set_tests_properties( - rocprofv3-test-hsa-multiqueue-att-cmd-validate + rocprofv3-test-att-hsa-multiqueue-cmd-validate PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS - "rocprofv3-test-hsa-multiqueue-att-cmd-ld-lib-path-execute" + "rocprofv3-test-att-hsa-multiqueue-cmd-ld-lib-path-execute" FAIL_REGULAR_EXPRESSION "AssertionError" DISABLED ${IS_DISABLED}) set_tests_properties( - rocprofv3-test-hsa-multiqueue-att-json-validate + rocprofv3-test-att-hsa-multiqueue-json-validate PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS - "rocprofv3-test-hsa-multiqueue-att-json-execute" + "rocprofv3-test-att-hsa-multiqueue-json-execute" FAIL_REGULAR_EXPRESSION "AssertionError" DISABLED ${IS_DISABLED}) -function(configure_att_input _FILENAME _OUTDIR) - set(LIBRARY_OUTPUT_DIR ${_OUTDIR}) - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/att_input.yml.in +function(configure_att_input _FILENAME) + set(LIB_PATH_LOC "${attdecoder_LIB_DIR}") + configure_file(${CMAKE_CURRENT_SOURCE_DIR}//${_FILENAME}.in ${CMAKE_CURRENT_BINARY_DIR}/${_FILENAME} @ONLY) endfunction() -configure_att_input(att_input.yml "${attdecoder_LIB_DIR}") -configure_att_input(att_input_will_fail.yml "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") +configure_att_input(att_input.yml) +configure_att_input(att_input_will_fail.yml) # test yaml input add_test( NAME rocprofv3-test-att-yaml-input COMMAND - $ -i - ${CMAKE_CURRENT_BINARY_DIR}/att_input.yml --log-level env --echo -- + $ ${COMMON_PARAMS_NO_LIB}/yaml -i + ${CMAKE_CURRENT_BINARY_DIR}/att_input.yml --echo -- $) set_tests_properties( @@ -166,7 +164,7 @@ add_test( set_tests_properties( rocprofv3-test-att-yaml-input-will-fail - PROPERTIES TIMEOUT 45 LABELS "integration-tests" WILL_FAIL ON DISABLED True) + PROPERTIES TIMEOUT 45 LABELS "integration-tests" WILL_FAIL ON DISABLED ${IS_DISABLED}) add_test(NAME rocprofv3-test-att-library-path-cmd-line-will-fail COMMAND $ --att --att-library-path . @@ -174,11 +172,12 @@ add_test(NAME rocprofv3-test-att-library-path-cmd-line-will-fail set_tests_properties( rocprofv3-test-att-library-path-cmd-line-will-fail - PROPERTIES TIMEOUT 45 LABELS "integration-tests" WILL_FAIL ON DISABLED True) + PROPERTIES TIMEOUT 45 LABELS "integration-tests" WILL_FAIL ON DISABLED ${IS_DISABLED}) -add_test(NAME rocprofv3-test-att-library-path-env-var-will-fail - COMMAND $ --att --log-level env --echo - -- $) +add_test( + NAME rocprofv3-test-att-library-path-env-var-will-fail + COMMAND $ ${COMMON_PARAMS_NO_LIB}/envfail + --att-library-path . --echo -- $) set_tests_properties( rocprofv3-test-att-library-path-env-var-will-fail @@ -187,65 +186,70 @@ set_tests_properties( LABELS "integration-tests" ENVIRONMENT - "ROCPROF_ATT_LIBRARY_PATH=." + "ROCPROF_ATT_LIBRARY_PATH=${attdecoder_LIB_DIR}" WILL_FAIL ON DISABLED - True) + ${IS_DISABLED}) + +add_test(NAME rocprofv3-test-att-env-var + COMMAND $ ${COMMON_PARAMS_NO_LIB}/envvar + -- $) + +set_tests_properties( + rocprofv3-test-att-env-var + PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT + "ROCPROF_ATT_LIBRARY_PATH=${attdecoder_LIB_DIR}" DISABLED ${IS_DISABLED}) # # Uses ATT and Counter Collection at the same time # -add_test( - NAME rocprofv3-test-hsa-multiqueue-att-plus-pmc-execute - COMMAND - $ --log-level env --pmc SQ_WAVES - --advanced-thread-trace -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o - out --output-format json ${PRELOAD_ARGS} -- $) +add_test(NAME rocprofv3-test-att-hsa-multiqueue-plus-pmc-execute + COMMAND $ ${COMMON_PARAMS}/cmd_input + --pmc SQ_WAVES -o out -- $) set_tests_properties( - rocprofv3-test-hsa-multiqueue-att-plus-pmc-execute - PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED} ENVIRONMENT - "${LIB_PATH_ENV}") + rocprofv3-test-att-hsa-multiqueue-plus-pmc-execute + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED}) # Check for conflict PMC + activity -add_test( - NAME rocprofv3-test-hsa-multiqueue-att-activity-pmc-will-fail - COMMAND - $ --log-level env --pmc SQ_WAVES - --advanced-thread-trace -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o - out --att-activity 8 ${PRELOAD_ARGS} -- $) +add_test(NAME rocprofv3-test-att-hsa-multiqueue-activity-pmc-will-fail + COMMAND $ ${COMMON_PARAMS}/cmd_input + --pmc SQ_WAVES -o out --att-activity 8 -- $) set_tests_properties( - rocprofv3-test-hsa-multiqueue-att-activity-pmc-will-fail - PROPERTIES TIMEOUT - 45 - LABELS - "integration-tests" - DISABLED - ${IS_DISABLED} - ENVIRONMENT - "${LIB_PATH_ENV}" - WILL_FAIL - ON) + rocprofv3-test-att-hsa-multiqueue-activity-pmc-will-fail + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED} WILL_FAIL ON) # Check for conflict Perfcounters + activity add_test( - NAME rocprofv3-test-hsa-multiqueue-att-activity-perf-will-fail - COMMAND - $ --log-level env --att - --att-perfcounter-ctrl 8 -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o - out --att-activity 8 ${PRELOAD_ARGS} -- $) + NAME rocprofv3-test-att-hsa-multiqueue-activity-perf-will-fail + COMMAND $ ${COMMON_PARAMS}/cmd_input + --att-perfcounter-ctrl 8 -o out --att-activity 8 -- $) set_tests_properties( - rocprofv3-test-hsa-multiqueue-att-activity-perf-will-fail - PROPERTIES TIMEOUT - 45 - LABELS - "integration-tests" - DISABLED - ${IS_DISABLED} - ENVIRONMENT - "${LIB_PATH_ENV}" - WILL_FAIL - ON) + rocprofv3-test-att-hsa-multiqueue-activity-perf-will-fail + PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED} WILL_FAIL ON) + +# ATT plus PC sampling + +rocprofiler_sdk_pc_sampling_stochastic_disabled(IS_PC_SAMPLING_STOCHASTIC_DISABLED) + +set(ATT_PLUS_PCS_DISABLE ${IS_DISABLED}) +if(${IS_PC_SAMPLING_STOCHASTIC_DISABLED}) + set(ATT_PLUS_PCS_DISABLE ON) +endif() +if(${ROCPROFILER_DISABLE_UNSTABLE_CTESTS}) + set(ATT_PLUS_PCS_DISABLE ON) +endif() +set(PCS_ARGS --pc-sampling-beta-enabled 1 --pc-sampling-unit cycles + --pc-sampling-interval 16384 --pc-sampling-method stochastic) + +add_test(NAME rocprofv3-test-att-plus-pc-sampling + COMMAND $ ${COMMON_PARAMS}/cmd_input + ${PCS_ARGS} -o out -- $) + +set_tests_properties( + rocprofv3-test-att-plus-pc-sampling + PROPERTIES TIMEOUT 90 LABELS "integration-tests;thread-trace;pc-sampling" DISABLED + ${ATT_PLUS_PCS_DISABLE}) diff --git a/tests/rocprofv3/advanced-thread-trace/att_input.yml.in b/tests/rocprofv3/advanced-thread-trace/att_input.yml.in index d80dd3e1f6..e46bfc1c48 100644 --- a/tests/rocprofv3/advanced-thread-trace/att_input.yml.in +++ b/tests/rocprofv3/advanced-thread-trace/att_input.yml.in @@ -23,9 +23,9 @@ jobs: - advanced_thread_trace: True att_library_path: - - @LIBRARY_OUTPUT_DIR@/att - - @LIBRARY_OUTPUT_DIR@ + - @LIB_PATH_LOC@/att + - @LIB_PATH_LOC@ - advanced_thread_trace: True att_library_path: - - @LIBRARY_OUTPUT_DIR@ + - @LIB_PATH_LOC@ diff --git a/tests/rocprofv3/advanced-thread-trace/att_input_will_fail.yml.in b/tests/rocprofv3/advanced-thread-trace/att_input_will_fail.yml.in new file mode 100644 index 0000000000..b3bc337db3 --- /dev/null +++ b/tests/rocprofv3/advanced-thread-trace/att_input_will_fail.yml.in @@ -0,0 +1,26 @@ +# MIT License +# +# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +jobs: + - advanced_thread_trace: True + att_library_path: + - @LIB_PATH_LOC@/att diff --git a/tests/thread-trace/agent.cpp b/tests/thread-trace/agent.cpp index 4d57af5945..708ba44968 100644 --- a/tests/thread-trace/agent.cpp +++ b/tests/thread-trace/agent.cpp @@ -29,17 +29,6 @@ #include -#define C_API_BEGIN \ - try \ - { -#define C_API_END \ - } \ - catch(std::exception & e) \ - { \ - std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << ' ' << e.what() << std::endl; \ - } \ - catch(...) { std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << std::endl; } - namespace ATTTest { namespace Agent @@ -135,9 +124,10 @@ query_available_agents(rocprofiler_agent_version_t /* version */, } int -tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) +tool_init(rocprofiler_client_finalize_t /* fini_func */, void* /* tool_data */) { - (void) fini_func; + Callbacks::init(); + ROCPROFILER_CALL(rocprofiler_create_context(&tracing_ctx), "context creation"); ROCPROFILER_CALL(rocprofiler_create_context(&agent_ctx), "context creation"); @@ -147,7 +137,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) nullptr, 0, Callbacks::tool_codeobj_tracing_callback, - tool_data), + nullptr), "code object tracing service configure"); ROCPROFILER_CALL( @@ -156,13 +146,13 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) nullptr, 0, dispatch_tracing_callback, - tool_data), + nullptr), "dispatch tracing service configure"); ROCPROFILER_CALL(rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, &query_available_agents, sizeof(rocprofiler_agent_t), - tool_data), + nullptr), "Failed to find GPU agents"); int valid_ctx = 0; @@ -177,13 +167,6 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) return 0; } -void -tool_fini(void* tool_data) -{ - Callbacks::finalize_json(tool_data); - delete static_cast(tool_data); -} - } // namespace Agent } // namespace ATTTest @@ -206,8 +189,8 @@ rocprofiler_configure(uint32_t /* version */, static auto cfg = rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), &ATTTest::Agent::tool_init, - &ATTTest::Agent::tool_fini, - new Callbacks::ToolData{"att_agent_test/"}}; + &Callbacks::finalize, + nullptr}; // return pointer to configure data return &cfg; diff --git a/tests/thread-trace/multi_dispatch.cpp b/tests/thread-trace/multi_dispatch.cpp index bf47704b5e..234024e9d7 100644 --- a/tests/thread-trace/multi_dispatch.cpp +++ b/tests/thread-trace/multi_dispatch.cpp @@ -48,15 +48,17 @@ dispatch_callback(rocprofiler_agent_id_t /* agent */, static std::atomic count{0}; if(count.fetch_add(1) > NUM_KERNELS) return ROCPROFILER_THREAD_TRACE_CONTROL_NONE; - assert(userdata && "Dispatch callback passed null!"); + assert(dispatch_userdata && "Dispatch callback passed null!"); dispatch_userdata->ptr = userdata; return ROCPROFILER_THREAD_TRACE_CONTROL_START_AND_STOP; } int -tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data) +tool_init(rocprofiler_client_finalize_t /* fini_func */, void* /* tool_data */) { + Callbacks::init(); + static rocprofiler_context_id_t client_ctx = {0}; ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation"); @@ -67,7 +69,7 @@ tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data) nullptr, 0, Callbacks::tool_codeobj_tracing_callback, - tool_data), + nullptr), "code object tracing service configure"); std::vector params{}; @@ -100,7 +102,7 @@ tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data) params.size(), dispatch_callback, Callbacks::shader_data_callback, - tool_data), + nullptr), "thread trace service configure"); } @@ -121,13 +123,6 @@ tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data) return 0; } -void -tool_fini(void* tool_data) -{ - Callbacks::finalize_json(tool_data); - delete static_cast(tool_data); -} - } // namespace Multi } // namespace ATTTest @@ -147,11 +142,11 @@ rocprofiler_configure(uint32_t /* version */, ATTTest::Multi::client_id = id; // create configure data - static auto cfg = rocprofiler_tool_configure_result_t{ - sizeof(rocprofiler_tool_configure_result_t), - &ATTTest::Multi::tool_init, - &ATTTest::Multi::tool_fini, - reinterpret_cast(new Callbacks::ToolData{"att_multi_test/"})}; + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &ATTTest::Multi::tool_init, + &Callbacks::finalize, + nullptr}; // return pointer to configure data return &cfg; diff --git a/tests/thread-trace/single_dispatch.cpp b/tests/thread-trace/single_dispatch.cpp index 4feb8b44d1..f09e9c500d 100644 --- a/tests/thread-trace/single_dispatch.cpp +++ b/tests/thread-trace/single_dispatch.cpp @@ -41,35 +41,20 @@ dispatch_callback(rocprofiler_agent_id_t /* agent */, rocprofiler_async_correlation_id_t /* correlation_id */, rocprofiler_kernel_id_t kernel_id, rocprofiler_dispatch_id_t /* dispatch_id */, - void* userdata, - rocprofiler_user_data_t* dispatch_userdata) + void* /* userdata */, + rocprofiler_user_data_t* /* dispatch_userdata */) { - C_API_BEGIN - assert(userdata && "Dispatch callback passed null!"); - auto& tool = *reinterpret_cast(userdata); - dispatch_userdata->ptr = userdata; + static rocprofiler_kernel_id_t target_kernel_id = kernel_id; + if(target_kernel_id == kernel_id) return ROCPROFILER_THREAD_TRACE_CONTROL_START_AND_STOP; - static std::string_view desired_func_name = "branching_kernel"; - - try - { - auto& kernel_name = tool.kernel_id_to_kernel_name.at(kernel_id); - if(kernel_name.find(desired_func_name) == std::string::npos) - return ROCPROFILER_THREAD_TRACE_CONTROL_NONE; - - return ROCPROFILER_THREAD_TRACE_CONTROL_START_AND_STOP; - } catch(...) - { - std::cerr << "Could not find kernel id: " << kernel_id << std::endl; - } - - C_API_END return ROCPROFILER_THREAD_TRACE_CONTROL_NONE; } int -tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data) +tool_init(rocprofiler_client_finalize_t /* fini_func */, void* /* tool_data */) { + Callbacks::init(); + static rocprofiler_context_id_t client_ctx = {0}; ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation"); @@ -80,7 +65,7 @@ tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data) nullptr, 0, Callbacks::tool_codeobj_tracing_callback, - tool_data), + nullptr), "code object tracing service configure"); std::vector agents{}; @@ -110,7 +95,7 @@ tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data) 0, dispatch_callback, Callbacks::shader_data_callback, - tool_data), + nullptr), "thread trace service configure"); } @@ -131,13 +116,6 @@ tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data) return 0; } -void -tool_fini(void* tool_data) -{ - Callbacks::finalize_json(tool_data); - delete static_cast(tool_data); -} - } // namespace Single } // namespace ATTTest @@ -157,11 +135,11 @@ rocprofiler_configure(uint32_t /* version */, ATTTest::Single::client_id = id; // create configure data - static auto cfg = rocprofiler_tool_configure_result_t{ - sizeof(rocprofiler_tool_configure_result_t), - &ATTTest::Single::tool_init, - &ATTTest::Single::tool_fini, - reinterpret_cast(new Callbacks::ToolData{"att_single_test/"})}; + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &ATTTest::Single::tool_init, + &Callbacks::finalize, + nullptr}; // return pointer to configure data return &cfg; diff --git a/tests/thread-trace/trace_callbacks.cpp b/tests/thread-trace/trace_callbacks.cpp index 3224c57b4c..d40377badb 100644 --- a/tests/thread-trace/trace_callbacks.cpp +++ b/tests/thread-trace/trace_callbacks.cpp @@ -26,11 +26,6 @@ #endif #include "trace_callbacks.hpp" -#include - -#ifdef ENABLE_ATT_FILES -# include -#endif #include #include @@ -38,135 +33,78 @@ namespace Callbacks { -using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t; -using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; +rocprofiler_thread_trace_decoder_id_t decoder{}; +std::atomic latency{0}; void tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* /* user_data */, - void* userdata) + void* /* userdata */) { - C_API_BEGIN if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return; - if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) return; - - assert(userdata && "Dispatch callback passed null!"); - auto& tool = *reinterpret_cast(userdata); - - if(record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) - { - auto* data = static_cast(record.payload); - tool.kernel_id_to_kernel_name.emplace(data->kernel_id, data->kernel_name); - } - if(record.operation != ROCPROFILER_CODE_OBJECT_LOAD) return; - auto* data = static_cast(record.payload); + auto* data = static_cast(record.payload); + if(data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE) return; - static std::atomic filecnt{0}; - std::string name = "codeobj_" + std::to_string(filecnt.fetch_add(1)) + ".out"; - -#ifdef ENABLE_ATT_FILES - if(std::string_view(data->uri).find("file:///") == 0) + if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) { - rocprofiler::sdk::codeobj::disassembly::CodeObjectBinary binary(data->uri); - - std::ofstream file(tool.out_dir + name, std::ios::binary); - assert(file.is_open() && "Could not open codeobj file for writing"); - file.write((char*) binary.buffer.data(), binary.buffer.size()); + DECODER_CALL( + rocprofiler_thread_trace_decoder_codeobj_unload(decoder, data->code_object_id)); + return; } - else - { - std::ofstream file(tool.out_dir + name, std::ios::binary); - file.write((char*) data->memory_base, data->memory_size); - } -#endif - auto _lk = std::unique_lock{tool.mut}; - tool.codeobjs.push_back( - {data->load_delta, data->load_size, data->code_object_id, name, data->uri}); + DECODER_CALL(rocprofiler_thread_trace_decoder_codeobj_load( + decoder, + data->code_object_id, + data->load_delta, + data->load_size, + reinterpret_cast(data->memory_base), + data->memory_size)); +} - C_API_END +typedef void (*rocprofiler_thread_trace_decoder_callback_t)( + rocprofiler_thread_trace_decoder_record_type_t record_type_id, + void* trace_events, + uint64_t trace_size, + void* userdata); + +void +shader_data_callback(rocprofiler_agent_id_t /* agent */, + int64_t /* se_id */, + void* se_data, + size_t data_size, + rocprofiler_user_data_t /* userdata */) +{ + auto parse = [](rocprofiler_thread_trace_decoder_record_type_t record_type_id, + void* trace_events, + uint64_t trace_size, + void* /* userdata */) { + if(record_type_id != ROCPROFILER_THREAD_TRACE_DECODER_RECORD_WAVE) return; + + for(size_t w = 0; w < trace_size; w++) + { + auto* wave = static_cast(trace_events); + for(size_t i = 0; i < wave->instructions_size; i++) + latency += wave->instructions_array[i].duration; + } + }; + DECODER_CALL(rocprofiler_trace_decode(decoder, parse, se_data, data_size, nullptr)); } void -shader_data_callback(rocprofiler_agent_id_t agent, - int64_t se_id, - void* se_data, - size_t data_size, - rocprofiler_user_data_t userdata) +init() { - C_API_BEGIN - - assert(userdata.ptr && "Dispatch callback passed null!"); - auto& tool = *reinterpret_cast(userdata.ptr); - - std::string name = "agent_" + std::to_string(agent.handle) + "_shader_engine_" + - std::to_string(se_id) + "_" + std::to_string(agent.handle) + ".att"; - -#ifdef ENABLE_ATT_FILES - { - std::ofstream file(tool.out_dir + name, std::ios::binary); - assert(file.is_open() && "Could not open ATT file for writing"); - file.write((char*) se_data, data_size); - } -#endif - - assert(se_data); - assert(data_size); - - auto _lk = std::unique_lock{tool.mut}; - tool.att_files.push_back(name); - - C_API_END + // const char* decoder_lib = std::getenv("ROCPROF_TRACE_DECODER_PATH"); + DECODER_CALL(rocprofiler_thread_trace_decoder_create(&decoder, "/opt/rocm/lib")); } void -finalize_json(void* userdata) +finalize(void* /* tool_data */) { - assert(userdata && "Dispatch callback passed null!"); + rocprofiler_thread_trace_decoder_destroy(decoder); - auto& tool = *reinterpret_cast(userdata); - auto _lk = std::unique_lock{tool.mut}; - assert(!tool.att_files.empty()); - -#ifdef ENABLE_ATT_FILES - nlohmann::json att_json; - for(auto& file : tool.att_files) - att_json.push_back(file); - - nlohmann::json codeobj_json; - nlohmann::json snapshot_json; - for(auto& file : tool.codeobjs) - { - nlohmann::json codeobj; - codeobj["code_object_id"] = file.id; - codeobj["load_delta"] = file.addr; - codeobj["load_size"] = file.size; - codeobj["uri"] = file.uri; - codeobj["filename"] = file.filename; - codeobj_json.push_back(codeobj); - - nlohmann::json pair_json; - pair_json["key"] = file.id; - pair_json["value"] = file.filename; - snapshot_json.push_back(pair_json); - } - nlohmann::json tool_json; - tool_json["strings"]["att_files"] = att_json; - tool_json["code_objects"] = codeobj_json; - tool_json["strings"]["code_object_snapshot_files"] = snapshot_json; - - nlohmann::json array; - array.push_back(tool_json); - - nlohmann::json sdk_json; - sdk_json["rocprofiler-sdk-tool"] = array; - - std::ofstream json_file(tool.out_dir + (std::to_string(getpid()) + "_results.json")); - assert(json_file.is_open() && "Could not open json file for writing!"); - json_file << sdk_json; -#endif + if(latency.load() == 0) std::cerr << "Error: No latency was assigned to the trace!"; } } // namespace Callbacks diff --git a/tests/thread-trace/trace_callbacks.hpp b/tests/thread-trace/trace_callbacks.hpp index 414d6c984a..9c175782b0 100644 --- a/tests/thread-trace/trace_callbacks.hpp +++ b/tests/thread-trace/trace_callbacks.hpp @@ -40,60 +40,21 @@ #include #include -#define ROCPROFILER_VAR_NAME_COMBINE(X, Y) X##Y -#define ROCPROFILER_VARIABLE(X, Y) ROCPROFILER_VAR_NAME_COMBINE(X, Y) - -#define C_API_BEGIN \ - try \ - { -#define C_API_END \ - } \ - catch(std::exception & e) \ - { \ - std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << ' ' << e.what() << std::endl; \ - } \ - catch(...) { std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << std::endl; } - #define ROCPROFILER_CALL(result, msg) \ + if((result) != ROCPROFILER_STATUS_SUCCESS) \ { \ - rocprofiler_status_t CHECKSTATUS = result; \ - if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ - { \ - std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ - std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \ - << " failed with error code " << CHECKSTATUS << ": " << status_msg \ - << std::endl; \ - std::stringstream errmsg{}; \ - errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \ - << status_msg << ")"; \ - throw std::runtime_error(errmsg.str()); \ - } \ + std::cerr << "Error: " << msg << std::endl; \ + abort(); \ + } + +#define DECODER_CALL(result) \ + if((result) != ROCPROFILER_STATUS_SUCCESS) \ + { \ + std::cerr << "Error: Generic decoder error" << std::endl; \ } namespace Callbacks { -struct CodeobjInfo -{ - int64_t addr = 0; - size_t size = 0; - size_t id = 0; - std::string filename{}; - std::string uri{}; -}; - -struct ToolData -{ - ToolData(const char* out) - : out_dir(out){}; - - std::string out_dir{}; - std::mutex mut{}; - std::vector codeobjs{}; - std::vector att_files{}; - - std::unordered_map kernel_id_to_kernel_name = {}; -}; - void tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t*, @@ -107,6 +68,9 @@ shader_data_callback(rocprofiler_agent_id_t agent, rocprofiler_user_data_t userdata); void -finalize_json(void* userdata); +init(); + +void +finalize(void* /* tool_data */); } // namespace Callbacks