Thread trace and Trace Decoder API tests and samples (#416)

* Adding test and samples to decoder

* Fix sample

* Formatting

* Fix multi test

* Disable sample

* Fix tests

* Format

* Version fix

* Locking the decoder

* Add atomic

* Review comments

* Format

* Adding readme

* merge conflict and adding PCS+ATT test

* Review comments

* Properly disable PCS test

* Update tests/rocprofv3/advanced-thread-trace/CMakeLists.txt

* Adding back env var test

* Name fix

* Preload sample

* Addressing review comments

* Update docs

---------

Co-authored-by: Giovanni Baraldi <gbaraldi@amd.com>
Этот коммит содержится в:
Baraldi, Giovanni
2025-07-23 03:08:12 +02:00
коммит произвёл GitHub
родитель bf0fad1d54
Коммит e898079a13
22 изменённых файлов: 982 добавлений и 404 удалений
+1
Просмотреть файл
@@ -47,3 +47,4 @@ add_subdirectory(code_object_isa_decode)
add_subdirectory(external_correlation_id_request)
add_subdirectory(pc_sampling)
add_subdirectory(openmp_target)
add_subdirectory(thread_trace)
+79
Просмотреть файл
@@ -0,0 +1,79 @@
#
#
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
if(NOT CMAKE_HIP_COMPILER)
find_program(
amdclangpp_EXECUTABLE
NAMES amdclang++
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATH_SUFFIXES bin llvm/bin NO_CACHE)
mark_as_advanced(amdclangpp_EXECUTABLE)
if(amdclangpp_EXECUTABLE)
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
endif()
endif()
project(rocprofiler-sdk-samples-thread-trace LANGUAGES CXX HIP)
find_package(rocprofiler-sdk REQUIRED)
find_package(rocprofiler-sdk-roctx REQUIRED)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_EXTENSIONS OFF)
set(CMAKE_HIP_STANDARD_REQUIRED ON)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
endif()
endforeach()
set_source_files_properties(
main.cpp PROPERTIES LANGUAGE HIP COMPILE_FLAGS "${CMAKE_HIP_FLAGS_RELWITHDEBINFO}")
add_executable(thread-trace-sample)
target_sources(thread-trace-sample PRIVATE main.cpp agent.cpp)
target_link_libraries(
thread-trace-sample
PUBLIC rocprofiler-sdk::samples-build-flags
PRIVATE rocprofiler-sdk::rocprofiler-sdk
rocprofiler-sdk::samples-common-library
rocprofiler-sdk::rocprofiler-sdk-dw
rocprofiler-sdk-roctx::rocprofiler-sdk-roctx
rocprofiler-sdk::rocprofiler-sdk-amd-comgr)
add_test(NAME thread-trace-sample COMMAND $<TARGET_FILE:thread-trace-sample>)
find_library(
attdecoder_LIBRARY
NAMES rocprof-trace-decoder
HINTS ${ROCM_PATH}
PATHS ${ROCM_PATH}
PATH_SUFFIXES lib)
if(attdecoder_LIBRARY)
cmake_path(GET attdecoder_LIBRARY PARENT_PATH attdecoder_LIB_DIR)
endif()
find_package_handle_standard_args(attdecoder REQUIRED_VARS attdecoder_LIB_DIR
attdecoder_LIBRARY)
set(IS_DISABLED ON)
if(attdecoder_FOUND)
set(IS_DISABLED OFF)
endif()
rocprofiler_samples_get_preload_env(PRELOAD_ENV)
list(APPEND PRELOAD_ENV "ROCPROFILER_TRACE_DECODER_LIB_PATH=${attdecoder_LIB_DIR}")
set_tests_properties(
thread-trace-sample PROPERTIES TIMEOUT 60 ENVIRONMENT ${PRELOAD_ENV} LABELS
"samples;thread-trace" DISABLED ${IS_DISABLED})
+26
Просмотреть файл
@@ -0,0 +1,26 @@
# Thread Trace and ROCprof Trace Decoder
## Services
- Thread trace in device profiling mode
- ROCprof Trace Decoder decodes the received thread trace data
- Thread trace start/stop using roctx
## Properties
### [agent.cpp](agent.cpp):
- Configures thread trace in all GPU agents found with `rocprofiler_configure_device_thread_trace_service`
- Waits until `roctxProfilerResume` is called to start thread trace
- Stops tracing at `roctxProfilerPause`
- Receives the trace data in `shader_data_callback` and calls `rocprofiler_trace_decode` to decode the data
- `rocprofiler_trace_decode` calls `parse` (a lambda)
- `parse` receives the dedecoded data and increments hitcount/latencies by pc address
- At application end, `tool_fini` calls `gen_output_stream` to write the top hotspots into `thread_trace.log`
### [main.cpp](main.cpp):
- Defines a few different kernels and runs them
- The first loop iteration warms up the kernels
- The second iteration calls `roctxProfilerResume` to start thread trace
- After the loop ends, `roctxProfilerPause` is called to stop tracing
+391
Просмотреть файл
@@ -0,0 +1,391 @@
// MIT License
//
// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// undefine NDEBUG so asserts are implemented
#ifdef NDEBUG
# undef NDEBUG
#endif
#include <rocprofiler-sdk/cxx/codeobj/code_printing.hpp>
#include <rocprofiler-sdk/buffer.h>
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/experimental/thread_trace.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <string>
#include <vector>
#define ROCPROFILER_CALL(result, msg) \
if(auto ec = (result); ec != ROCPROFILER_STATUS_SUCCESS) \
{ \
std::cerr << "rocprofiler-sdk error at " << __FILE__ << ":" << __LINE__ \
<< " :: " << #result << std::endl; \
std::cerr << "rocprofiler-sdk error code " << ec << ": " \
<< rocprofiler_get_status_string(ec) << " :: " << msg << std::endl; \
abort(); \
}
#define DECODER_CALL(result) \
if(auto ec = (result); ec != ROCPROFILER_STATUS_SUCCESS) \
{ \
std::cerr << "Decoder error at " << __FILE__ << ":" << __LINE__ << std::endl; \
std::cerr << "rocprofiler-sdk error code " << ec << ": " \
<< rocprofiler_get_status_string(ec) << std::endl; \
}
#define CHECK_NOTNULL(x) \
if(!(x)) \
{ \
abort(); \
};
namespace
{
constexpr uint64_t TARGET_CU = 1; // CU (gfx9) or WGP (gfx10+)
constexpr uint64_t SHADER_MASK = 0x1; // Only enable SE=0
constexpr uint64_t BUFFER_SIZE = 0x10000000; // 256MB
}; // namespace
namespace Results
{
using pcinfo_t = rocprofiler_thread_trace_decoder_pc_t;
struct address_sort_t
{
bool operator()(const pcinfo_t& a, const pcinfo_t& b) const
{
if(a.marker_id == b.marker_id) return a.addr < b.addr;
return a.marker_id < b.marker_id;
}
};
struct Latency
{
uint64_t latency{0};
uint64_t hitcount{0};
};
// Maps address to latency
using LatencyTable = std::map<rocprofiler_thread_trace_decoder_pc_t, Latency, address_sort_t>;
// Used to disassemble instructions at (id, vaddr) pair
using AddressTable = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate;
AddressTable* table{nullptr};
LatencyTable* latencies{nullptr};
// used to calculate mean wave lifetime
int64_t wave_lifetime = 0;
int64_t waves_started = 0;
int64_t waves_ended = 0;
void
gen_output_stream()
{
CHECK_NOTNULL(Results::latencies);
CHECK_NOTNULL(Results::table);
const char* OUTPUT_OFSTREAM = "thread_trace.log";
std::ofstream file(OUTPUT_OFSTREAM);
if(!file.is_open())
std::cout << "Could not open log file: " << OUTPUT_OFSTREAM << ", writing to stdout\n";
else
std::cout << "Writing log to: " << OUTPUT_OFSTREAM << std::endl;
std::ostream& output = file.is_open() ? file : std::cout;
// Sort map by instruction cost
using Element = std::pair<pcinfo_t, Latency>;
std::vector<Element> sorted(latencies->begin(), latencies->end());
std::stable_sort(sorted.begin(), sorted.end(), [](const Element& a, const Element& b) {
return a.second.latency > b.second.latency;
});
output << "Top 50 hotspots for trace (cycles):\n";
for(size_t i = 0; i < sorted.size() && i < 50; i++)
{
auto& addr = sorted.at(i).first;
auto& latency = sorted.at(i).second;
auto inst = table->get(addr.marker_id, addr.addr);
auto comment = inst->comment;
size_t pos = comment.rfind('/');
if(pos != std::string::npos && pos + 1 < comment.size()) comment = comment.substr(pos + 1);
output << "Latency:" << latency.latency << "\tHit:" << latency.hitcount << " \t"
<< inst->inst << " [" << comment << "]\n";
}
if(waves_started != waves_ended)
std::cerr << "Error: Some waves have not ended!" << std::endl;
else if(waves_started == 0)
std::cerr << "Error: No waves started!" << std::endl;
else
output << "\nMean wave lifetime: " << wave_lifetime / waves_started << " cycles";
output << "\nWaves started: " << waves_started << "\nWaves ended: " << waves_ended << "\n";
};
} // namespace Results
namespace Decoder
{
rocprofiler_thread_trace_decoder_id_t decoder{};
void
tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* /* user_data */,
void* /* userdata */)
{
if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return;
if(record.operation != ROCPROFILER_CODE_OBJECT_LOAD) return;
CHECK_NOTNULL(Results::table);
auto* data = static_cast<rocprofiler_callback_tracing_code_object_load_data_t*>(record.payload);
if(data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE)
{
Results::table->addDecoder(
data->uri, data->code_object_id, data->load_delta, data->load_size);
return;
}
auto* memorybase = reinterpret_cast<const void*>(data->memory_base);
CHECK_NOTNULL(memorybase);
DECODER_CALL(rocprofiler_thread_trace_decoder_codeobj_load(decoder,
data->code_object_id,
data->load_delta,
data->load_size,
memorybase,
data->memory_size));
Results::table->addDecoder(
memorybase, data->memory_size, data->code_object_id, data->load_delta, data->load_size);
}
void
shader_data_callback(rocprofiler_agent_id_t /* agent */,
int64_t /* se_id */,
void* se_data,
size_t data_size,
rocprofiler_user_data_t /* userdata */)
{
CHECK_NOTNULL(Results::latencies);
auto parse = [](rocprofiler_thread_trace_decoder_record_type_t record_type_id,
void* events,
uint64_t num_events,
void* /* userdata */) {
if(record_type_id == ROCPROFILER_THREAD_TRACE_DECODER_RECORD_OCCUPANCY)
{
for(size_t i = 0; i < num_events; i++)
{
auto& event = static_cast<rocprofiler_thread_trace_decoder_occupancy_t*>(events)[i];
if(event.start)
{
Results::wave_lifetime -= static_cast<int64_t>(event.time);
Results::waves_started++;
}
else
{
Results::wave_lifetime += static_cast<int64_t>(event.time);
Results::waves_ended++;
}
}
}
if(record_type_id != ROCPROFILER_THREAD_TRACE_DECODER_RECORD_WAVE) return;
for(size_t w = 0; w < num_events; w++)
{
auto* wave = static_cast<rocprofiler_thread_trace_decoder_wave_t*>(events);
for(size_t i = 0; i < wave->instructions_size; i++)
{
auto& inst = wave->instructions_array[i];
auto& latency = (*Results::latencies)[inst.pc];
latency.latency += inst.duration;
latency.hitcount += 1;
}
}
};
DECODER_CALL(rocprofiler_trace_decode(decoder, parse, se_data, data_size, nullptr));
}
} // namespace Decoder
namespace ThreadTracer
{
rocprofiler_client_id_t* client_id = nullptr;
rocprofiler_context_id_t agent_ctx = {};
rocprofiler_context_id_t tracing_ctx = {};
rocprofiler_status_t
query_available_agents(rocprofiler_agent_version_t /* version */,
const void** agents,
size_t num_agents,
void* user_data)
{
rocprofiler_user_data_t user{};
user.ptr = user_data;
for(size_t idx = 0; idx < num_agents; idx++)
{
const auto* agent = static_cast<const rocprofiler_agent_v0_t*>(agents[idx]);
if(agent->type != ROCPROFILER_AGENT_TYPE_GPU) continue;
auto parameters = std::vector<rocprofiler_thread_trace_parameter_t>{};
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_TARGET_CU, TARGET_CU});
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_BUFFER_SIZE, BUFFER_SIZE});
parameters.push_back({ROCPROFILER_THREAD_TRACE_PARAMETER_SHADER_ENGINE_MASK, SHADER_MASK});
ROCPROFILER_CALL(
rocprofiler_configure_device_thread_trace_service(agent_ctx,
agent->id,
parameters.data(),
parameters.size(),
Decoder::shader_data_callback,
user),
"thread trace service configure");
}
return ROCPROFILER_STATUS_SUCCESS;
}
void
cntrl_tracing_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* /* user_data */,
void* /* cb_data */)
{
if(record.kind != ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API) return;
if(record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER &&
record.operation == ROCPROFILER_MARKER_CONTROL_API_ID_roctxProfilerPause)
{
ROCPROFILER_CALL(rocprofiler_stop_context(agent_ctx), "stopping context");
}
else if(record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT &&
record.operation == ROCPROFILER_MARKER_CONTROL_API_ID_roctxProfilerResume)
{
ROCPROFILER_CALL(rocprofiler_start_context(agent_ctx), "starting context");
}
}
int
tool_init(rocprofiler_client_finalize_t /* fini_func */, void* /* tool_data */)
{
Results::latencies = new Results::LatencyTable{};
Results::table = new Results::AddressTable{};
// This is set by ctests: TODO: move to client.cpp
// If nullptr, searches rocprofiler-sdk install location
const char* lib_path = std::getenv("ROCPROFILER_TRACE_DECODER_LIB_PATH");
DECODER_CALL(rocprofiler_thread_trace_decoder_create(&Decoder::decoder, lib_path));
ROCPROFILER_CALL(rocprofiler_create_context(&tracing_ctx), "context creation");
ROCPROFILER_CALL(rocprofiler_create_context(&agent_ctx), "context creation");
ROCPROFILER_CALL(
rocprofiler_configure_callback_tracing_service(tracing_ctx,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
nullptr,
0,
Decoder::tool_codeobj_tracing_callback,
nullptr),
"code object tracing service configure");
ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service(
tracing_ctx,
ROCPROFILER_CALLBACK_TRACING_MARKER_CONTROL_API,
nullptr,
0,
cntrl_tracing_callback,
nullptr),
"marker tracing callback service configure");
ROCPROFILER_CALL(rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0,
&query_available_agents,
sizeof(rocprofiler_agent_t),
nullptr),
"Failed to find GPU agents");
int valid_ctx = 0;
ROCPROFILER_CALL(rocprofiler_context_is_valid(agent_ctx, &valid_ctx), "validity check");
assert(valid_ctx != 0);
ROCPROFILER_CALL(rocprofiler_context_is_valid(tracing_ctx, &valid_ctx), "validity check");
assert(valid_ctx != 0);
ROCPROFILER_CALL(rocprofiler_start_context(tracing_ctx), "context start");
// no errors
return 0;
}
void
tool_fini(void* /* tool_data */)
{
rocprofiler_thread_trace_decoder_destroy(Decoder::decoder);
Results::gen_output_stream();
delete Results::latencies;
delete Results::table;
}
} // namespace ThreadTracer
extern "C" rocprofiler_tool_configure_result_t*
rocprofiler_configure(uint32_t /* version */,
const char* /* runtime_version */,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// only activate if main tool
if(priority > 0) return nullptr;
// set the client name
id->name = "Thread Trace Sample";
// store client info
ThreadTracer::client_id = id;
// create configure data
static auto cfg =
rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t),
&ThreadTracer::tool_init,
&ThreadTracer::tool_fini,
nullptr};
// return pointer to configure data
return &cfg;
}
+178
Просмотреть файл
@@ -0,0 +1,178 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// undefine NDEBUG so asserts are implemented
#ifdef NDEBUG
# undef NDEBUG
#endif
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <rocprofiler-sdk-roctx/roctx.h>
#include "hip/hip_runtime.h"
// Two waves per SIMD on MI300
#define DATA_SIZE (304 * 64 * 4 * 2)
#define HIP_API_CALL(CALL) \
if((CALL) != hipSuccess) \
{ \
abort(); \
}
#define LDS_SIZE 1024
__global__ void
divide_kernel(float* a, const float* b, const float* c, int /* unused */)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if(index >= DATA_SIZE) return;
a[index] = (b[index] - c[index]) / abs(c[index] + b[index]) + 1;
}
__global__ void
looping_lds_kernel(float* a, const float* b, const float* c, int loopcount)
{
__shared__ float interm[LDS_SIZE];
size_t index = blockDim.x * blockIdx.x + threadIdx.x;
for(size_t i = index; i < DATA_SIZE; i += blockDim.x * gridDim.x)
interm[threadIdx.x % LDS_SIZE] = b[index] + threadIdx.x;
for(int it = 0; it < loopcount; it++)
{
__syncthreads();
float value = interm[(it + threadIdx.x + LDS_SIZE / 2) % LDS_SIZE];
__syncthreads();
interm[threadIdx.x % LDS_SIZE] += value;
}
a[index] = interm[threadIdx.x % LDS_SIZE] + c[index];
}
__global__ void
fifo_kernel(float* /* a */, const float* /* b */, const float* /* c */, int loops)
{
using _float4 = __attribute__((__vector_size__(4 * sizeof(float)))) float;
__shared__ _float4 lds[LDS_SIZE];
lds[threadIdx.x] = _float4{float(threadIdx.x)};
lds[threadIdx.x + 512] = _float4{float(threadIdx.x)};
__syncthreads();
_float4 dst[16];
float res1 = 0, res2 = 0;
for(int l = 0; l < loops; l++)
{
#pragma unroll 16
for(int i = 0; i < 16; i++)
dst[i] = lds[threadIdx.x + i * 8];
__syncthreads();
#pragma unroll 16
for(int i = 0; i < 16; i++)
{
res1 += dst[i][0] + dst[i][1];
res2 += dst[i][2] + dst[i][3];
}
asm volatile("v_add_f32 %0, %1, %2" : "=v"(res1) : "v"(res1), "v"(res2));
}
};
class hipMemory
{
public:
hipMemory(size_t size = DATA_SIZE)
{
HIP_API_CALL(hipMalloc(&ptr, size * sizeof(float)));
HIP_API_CALL(hipMemset(ptr, 0, size * sizeof(float)));
}
~hipMemory()
{
if(ptr) HIP_API_CALL(hipFree(ptr));
}
hipMemory(hipMemory&& other)
{
ptr = other.ptr;
other.ptr = nullptr;
}
float* ptr = nullptr;
};
class HipStream
{
public:
HipStream() { HIP_API_CALL(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); }
~HipStream() { HIP_API_CALL(hipStreamDestroy(stream)); }
hipStream_t stream;
hipMemory src1{};
hipMemory src2{};
hipMemory dst{};
};
#define Launch(kernel, stream, arglast) \
hipLaunchKernelGGL( \
kernel, DATA_SIZE / 512, 512, 0, 0, stream.dst.ptr, stream.src1.ptr, stream.src2.ptr, 6);
int
main(int /*argc*/, char** /*argv*/)
{
std::array<HipStream, 3> streams{};
std::vector<decltype(divide_kernel)*> kernels{};
kernels.push_back(divide_kernel);
kernels.push_back(looping_lds_kernel);
kernels.push_back(fifo_kernel);
for(size_t i = 0; i < streams.size() * kernels.size(); i++)
{
// Warmup then start
if(i == streams.size())
{
HIP_API_CALL(hipDeviceSynchronize());
roctxProfilerResume(0);
}
auto& stream = streams.at(i % streams.size());
auto& kernel = kernels.at(i % kernels.size());
Launch(kernel, stream, 3);
HIP_API_CALL(hipGetLastError());
}
HIP_API_CALL(hipDeviceSynchronize());
roctxProfilerPause(0);
return 0;
}
+1 -1
Просмотреть файл
@@ -188,7 +188,7 @@ To decode the raw thread trace data, create and initialize a Trace Decoder:
.. code-block:: cpp
rocprofiler_thread_trace_decoder_handle_t decoder{};
rocprofiler_thread_trace_decoder_id_t decoder{};
// Create the Trace Decoder with the path to the decoder library
ROCPROFILER_CALL(
+2
Просмотреть файл
@@ -23,6 +23,7 @@
#pragma once
#include <rocprofiler-sdk/experimental/thread-trace/trace_decoder.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/hsa.h>
#include <rocprofiler-sdk/internal_threading.h>
@@ -70,6 +71,7 @@ ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_signal_t)
ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_executable_t)
ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_region_t)
ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(hsa_amd_memory_pool_t)
ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER(rocprofiler_thread_trace_decoder_id_t)
#undef ROCPROFILER_CXX_SPECIALIZE_HANDLE_HASHER
} // namespace std
+6
Просмотреть файл
@@ -26,6 +26,7 @@
#include <rocprofiler-sdk/agent.h>
#include <rocprofiler-sdk/counters.h>
#include <rocprofiler-sdk/defines.h>
#include <rocprofiler-sdk/experimental/thread-trace/trace_decoder.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/hsa.h>
#include <rocprofiler-sdk/internal_threading.h>
@@ -148,6 +149,7 @@ ROCPROFILER_CXX_DECLARE_OPERATORS(const rocprofiler_counter_record_dimension_inf
ROCPROFILER_CXX_DECLARE_OPERATORS(const rocprofiler_counter_record_dimension_instance_info_t&)
ROCPROFILER_CXX_DECLARE_OPERATORS(const rocprofiler_counter_dimension_info_t&)
ROCPROFILER_CXX_DECLARE_OPERATORS(rocprofiler_version_triplet_t)
ROCPROFILER_CXX_DECLARE_OPERATORS(rocprofiler_thread_trace_decoder_id_t)
// definitions of operator==
ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(rocprofiler_context_id_t)
@@ -164,6 +166,7 @@ ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_signal_t)
ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_executable_t)
ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_region_t)
ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(hsa_amd_memory_pool_t)
ROCPROFILER_CXX_DEFINE_EQ_HANDLE_OPERATOR(rocprofiler_thread_trace_decoder_id_t)
inline bool
operator==(const rocprofiler_agent_v0_t& lhs, const rocprofiler_agent_v0_t& rhs)
@@ -241,6 +244,7 @@ ROCPROFILER_CXX_DEFINE_NE_OPERATOR(rocprofiler_dim3_t)
ROCPROFILER_CXX_DEFINE_NE_OPERATOR(hsa_region_t)
ROCPROFILER_CXX_DEFINE_NE_OPERATOR(hsa_amd_memory_pool_t)
ROCPROFILER_CXX_DEFINE_NE_OPERATOR(rocprofiler_version_triplet_t)
ROCPROFILER_CXX_DEFINE_NE_OPERATOR(rocprofiler_thread_trace_decoder_id_t)
// definitions of operator<
ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(rocprofiler_context_id_t)
@@ -257,6 +261,7 @@ ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_signal_t)
ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_executable_t)
ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_region_t)
ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(hsa_amd_memory_pool_t)
ROCPROFILER_CXX_DEFINE_LT_HANDLE_OPERATOR(rocprofiler_thread_trace_decoder_id_t)
inline bool
operator<(const rocprofiler_counter_record_dimension_info_t& lhs,
@@ -334,6 +339,7 @@ ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(rocprofiler_dim3_t)
ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(hsa_region_t)
ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(hsa_amd_memory_pool_t)
ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(rocprofiler_version_triplet_t)
ROCPROFILER_CXX_DEFINE_COMPARE_OPERATORS(rocprofiler_thread_trace_decoder_id_t)
// cleanup defines
#undef ROCPROFILER_CXX_DECLARE_OPERATORS
+13 -13
Просмотреть файл
@@ -36,10 +36,10 @@ ROCPROFILER_EXTERN_C_INIT
/**
* @brief Handle containing a loaded rocprof-trace-decoder and a decoder state.
*/
typedef struct rocprofiler_thread_trace_decoder_handle_t
typedef struct rocprofiler_thread_trace_decoder_id_t
{
uint64_t handle;
} rocprofiler_thread_trace_decoder_handle_t;
} rocprofiler_thread_trace_decoder_id_t;
/**
* @brief Initializes Trace Decoder library with a library search path
@@ -51,7 +51,7 @@ typedef struct rocprofiler_thread_trace_decoder_handle_t
* @retval ::ROCPROFILER_STATUS_SUCCESS Handle created
*/
rocprofiler_status_t
rocprofiler_thread_trace_decoder_create(rocprofiler_thread_trace_decoder_handle_t* handle,
rocprofiler_thread_trace_decoder_create(rocprofiler_thread_trace_decoder_id_t* handle,
const char* path) ROCPROFILER_API ROCPROFILER_NONNULL(1, 2);
/**
@@ -59,7 +59,7 @@ rocprofiler_thread_trace_decoder_create(rocprofiler_thread_trace_decoder_handle_
* @param[in] handle Handle to destroy
*/
void
rocprofiler_thread_trace_decoder_destroy(rocprofiler_thread_trace_decoder_handle_t handle)
rocprofiler_thread_trace_decoder_destroy(rocprofiler_thread_trace_decoder_id_t handle)
ROCPROFILER_API;
/**
@@ -83,11 +83,11 @@ rocprofiler_thread_trace_decoder_destroy(rocprofiler_thread_trace_decoder_handle
* @retval ::ROCPROFILER_STATUS_SUCCESS Code object loaded
*/
rocprofiler_status_t
rocprofiler_thread_trace_decoder_codeobj_load(rocprofiler_thread_trace_decoder_handle_t handle,
uint64_t load_id,
uint64_t load_addr,
uint64_t load_size,
const void* data,
rocprofiler_thread_trace_decoder_codeobj_load(rocprofiler_thread_trace_decoder_id_t handle,
uint64_t load_id,
uint64_t load_addr,
uint64_t load_size,
const void* data,
uint64_t size) ROCPROFILER_API ROCPROFILER_NONNULL(5);
/**
@@ -100,7 +100,7 @@ rocprofiler_thread_trace_decoder_codeobj_load(rocprofiler_thread_trace_decoder_h
* @retval ::ROCPROFILER_STATUS_SUCCESS Code object unloaded
*/
rocprofiler_status_t
rocprofiler_thread_trace_decoder_codeobj_unload(rocprofiler_thread_trace_decoder_handle_t handle,
rocprofiler_thread_trace_decoder_codeobj_unload(rocprofiler_thread_trace_decoder_id_t handle,
uint64_t load_id) ROCPROFILER_API;
/**
@@ -135,7 +135,7 @@ typedef void (*rocprofiler_thread_trace_decoder_callback_t)(
* @retval ::ROCPROFILER_STATUS_SUCCESS on success
*/
rocprofiler_status_t
rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_handle_t handle,
rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_id_t handle,
rocprofiler_thread_trace_decoder_callback_t callback,
void* data,
uint64_t size,
@@ -148,8 +148,8 @@ rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_handle_t handle,
* @retval null terminated string as description of "info".
*/
const char*
rocprofiler_thread_trace_decoder_info_string(rocprofiler_thread_trace_decoder_handle_t handle,
rocprofiler_thread_trace_decoder_info_t info)
rocprofiler_thread_trace_decoder_info_string(rocprofiler_thread_trace_decoder_id_t handle,
rocprofiler_thread_trace_decoder_info_t info)
ROCPROFILER_API;
/** @} */
+3 -3
Просмотреть файл
@@ -41,9 +41,9 @@ namespace rocprofiler
{
namespace att_wrapper
{
ATTFileMgr::ATTFileMgr(Fspath _dir,
std::vector<std::string> _counters,
rocprofiler_thread_trace_decoder_handle_t _decoder)
ATTFileMgr::ATTFileMgr(Fspath _dir,
std::vector<std::string> _counters,
rocprofiler_thread_trace_decoder_id_t _decoder)
: dir(std::move(_dir))
, decoder(_decoder)
{
+5 -5
Просмотреть файл
@@ -71,7 +71,7 @@ public:
bool valid() const;
protected:
rocprofiler_thread_trace_decoder_handle_t decoder{};
rocprofiler_thread_trace_decoder_id_t decoder{};
};
class ATTFileMgr
@@ -79,9 +79,9 @@ class ATTFileMgr
using AddressTable = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate;
public:
ATTFileMgr(Fspath _dir,
std::vector<std::string> _counters,
rocprofiler_thread_trace_decoder_handle_t _decoder);
ATTFileMgr(Fspath _dir,
std::vector<std::string> _counters,
rocprofiler_thread_trace_decoder_id_t _decoder);
~ATTFileMgr();
void addDecoder(const char* filepath, uint64_t id, uint64_t load_addr, uint64_t memsize);
@@ -95,7 +95,7 @@ public:
std::shared_ptr<AddressTable> table{nullptr};
std::map<size_t, std::vector<occupancy_t>> occupancy{};
std::vector<uint64_t> codeobjs_to_delete{};
rocprofiler_thread_trace_decoder_handle_t decoder{};
rocprofiler_thread_trace_decoder_id_t decoder{};
std::array<std::shared_ptr<class WstatesFile>, ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_LAST>
wstates;
+3 -3
Просмотреть файл
@@ -103,9 +103,9 @@ get_trace_data(rocprofiler_thread_trace_decoder_record_type_t trace_id,
C_API_END
}
ToolData::ToolData(std::vector<char>& _data,
WaveConfig& _config,
rocprofiler_thread_trace_decoder_handle_t _decoder)
ToolData::ToolData(std::vector<char>& _data,
WaveConfig& _config,
rocprofiler_thread_trace_decoder_id_t _decoder)
: cfile(_config.code)
, config(_config)
, decoder(_decoder)
+4 -4
Просмотреть файл
@@ -60,9 +60,9 @@ using SymbolInfo = rocprofiler::sdk::codeobj::disassembly::SymbolInfo;
struct ToolData
{
ToolData(std::vector<char>& data,
WaveConfig& config,
rocprofiler_thread_trace_decoder_handle_t decoder);
ToolData(std::vector<char>& data,
WaveConfig& config,
rocprofiler_thread_trace_decoder_id_t decoder);
~ToolData();
CodeLine& get(pcinfo_t pc);
@@ -73,7 +73,7 @@ struct ToolData
std::vector<char> shader_data{};
size_t num_waves = 0;
rocprofiler_thread_trace_decoder_handle_t decoder{};
rocprofiler_thread_trace_decoder_id_t decoder{};
};
} // namespace att_wrapper
+62 -55
Просмотреть файл
@@ -21,6 +21,7 @@
// SOFTWARE.
#include "lib/common/static_object.hpp"
#include "lib/common/synchronized.hpp"
#include "lib/rocprofiler-sdk/aql/helpers.hpp"
#include "lib/rocprofiler-sdk/context/context.hpp"
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
@@ -29,15 +30,19 @@
#include <rocprofiler-sdk/experimental/thread-trace/trace_decoder.h>
#include <rocprofiler-sdk/experimental/thread_trace.h>
#include <rocprofiler-sdk/cxx/hash.hpp>
#include <rocprofiler-sdk/cxx/operators.hpp>
#include <glog/logging.h>
#include <atomic>
#include <cstdint>
namespace
{
using DL = rocprofiler::thread_trace::DL;
using AddressTable = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate;
using LockedTable = rocprofiler::common::Synchronized<AddressTable>;
class DecoderInstance
{
@@ -46,72 +51,72 @@ public:
: dl(std::move(_dl))
{}
std::unique_ptr<DL> dl{nullptr};
AddressTable table{};
const std::unique_ptr<const DL> dl{nullptr};
LockedTable table{};
};
std::mutex map_mut;
using DecoderMap =
std::unordered_map<rocprofiler_thread_trace_decoder_id_t, std::shared_ptr<DecoderInstance>>;
using LockedMap = rocprofiler::common::Synchronized<DecoderMap>;
auto&
get_dlopens()
get_dlmap()
{
static auto*& _v = rocprofiler::common::static_object<
std::unordered_map<uint64_t, std::shared_ptr<DecoderInstance>>>::construct();
static auto*& _v = rocprofiler::common::static_object<LockedMap>::construct();
return *CHECK_NOTNULL(_v);
}
std::shared_ptr<DecoderInstance>
get_dl(rocprofiler_thread_trace_decoder_handle_t handle)
get_dl(rocprofiler_thread_trace_decoder_id_t handle)
{
auto lk = std::unique_lock{map_mut};
auto it = get_dlopens().find(handle.handle);
if(it == get_dlopens().end()) return nullptr;
return it->second;
return get_dlmap().rlock([&](const DecoderMap& map) -> std::shared_ptr<DecoderInstance> {
if(auto it = map.find(handle); it != map.end()) return it->second;
return nullptr;
});
}
} // namespace
extern "C" {
rocprofiler_status_t
rocprofiler_thread_trace_decoder_create(rocprofiler_thread_trace_decoder_handle_t* handle,
const char* path)
rocprofiler_thread_trace_decoder_create(rocprofiler_thread_trace_decoder_id_t* handle,
const char* path)
{
auto dl = std::make_unique<DL>(path);
if(dl->handle == nullptr) return ROCPROFILER_STATUS_ERROR_NOT_AVAILABLE;
if(!dl->valid()) return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI;
auto lk = std::unique_lock{map_mut};
static uint64_t count = 1;
static std::atomic<uint64_t> count{1};
handle->handle = count.fetch_add(1);
auto instance = std::make_shared<DecoderInstance>(std::move(dl));
handle->handle = count++;
get_dlopens()[handle->handle] = std::move(instance);
get_dlmap().wlock(
[&](DecoderMap& map) { map[*handle] = std::make_shared<DecoderInstance>(std::move(dl)); });
return ROCPROFILER_STATUS_SUCCESS;
}
void
rocprofiler_thread_trace_decoder_destroy(rocprofiler_thread_trace_decoder_handle_t handle)
rocprofiler_thread_trace_decoder_destroy(rocprofiler_thread_trace_decoder_id_t handle)
{
auto lk = std::unique_lock{map_mut};
get_dlopens().erase(handle.handle);
get_dlmap().wlock([&](DecoderMap& map) { map.erase(handle); });
}
rocprofiler_status_t
rocprofiler_thread_trace_decoder_codeobj_load(rocprofiler_thread_trace_decoder_handle_t handle,
uint64_t load_id,
uint64_t load_addr,
uint64_t load_size,
const void* data,
uint64_t size)
rocprofiler_thread_trace_decoder_codeobj_load(rocprofiler_thread_trace_decoder_id_t handle,
uint64_t load_id,
uint64_t load_addr,
uint64_t load_size,
const void* data,
uint64_t size)
{
auto decoder = get_dl(handle);
if(decoder == nullptr) return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT;
try
{
decoder->table.addDecoder(data, size, load_id, load_addr, load_size);
decoder->table.wlock([&](AddressTable& table) {
table.addDecoder(data, size, load_id, load_addr, load_size);
});
} catch(...)
{
return ROCPROFILER_STATUS_ERROR;
@@ -120,15 +125,17 @@ rocprofiler_thread_trace_decoder_codeobj_load(rocprofiler_thread_trace_decoder_h
}
rocprofiler_status_t
rocprofiler_thread_trace_decoder_codeobj_unload(rocprofiler_thread_trace_decoder_handle_t handle,
uint64_t load_id)
rocprofiler_thread_trace_decoder_codeobj_unload(rocprofiler_thread_trace_decoder_id_t handle,
uint64_t load_id)
{
auto decoder = get_dl(handle);
if(decoder == nullptr) return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT;
try
{
if(decoder->table.removeDecoder(load_id)) return ROCPROFILER_STATUS_SUCCESS;
bool result =
decoder->table.wlock([&](AddressTable& table) { return table.removeDecoder(load_id); });
if(result) return ROCPROFILER_STATUS_SUCCESS;
} catch(std::exception&)
{}
@@ -169,32 +176,32 @@ isa_callback(char* isa_instruction,
void* userdata)
{
ROCP_FATAL_IF(userdata == nullptr) << "Userdata is null!";
auto& table = static_cast<trace_data_t*>(userdata)->decoder->table;
std::unique_ptr<Instruction> instruction{nullptr};
auto decoder = static_cast<trace_data_t*>(userdata)->decoder;
ROCP_FATAL_IF(decoder == nullptr) << "decoder is null";
try
{
instruction = table.get(pc.marker_id, pc.addr);
auto instruction = decoder->table.wlock(
[&](AddressTable& table) { return table.get(pc.marker_id, pc.addr); });
if(!instruction) return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_ARGUMENT;
{
size_t tmp_isa_size = *isa_size;
*isa_size = instruction->inst.size();
if(*isa_size > tmp_isa_size)
return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_OUT_OF_RESOURCES;
}
memcpy(isa_instruction, instruction->inst.data(), *isa_size);
*isa_memory_size = instruction->size;
} catch(std::exception& e)
{
ROCP_WARNING << pc.marker_id << ":" << pc.addr << ' ' << e.what();
ROCP_CI_LOG(INFO) << pc.marker_id << ":" << pc.addr << ' ' << e.what();
return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR;
}
if(!instruction) return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_ARGUMENT;
{
size_t tmp_isa_size = *isa_size;
*isa_size = instruction->inst.size();
if(*isa_size > tmp_isa_size)
return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_OUT_OF_RESOURCES;
}
memcpy(isa_instruction, instruction->inst.data(), *isa_size);
*isa_memory_size = instruction->size;
return ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS;
}
@@ -214,7 +221,7 @@ trace_callback(rocprofiler_thread_trace_decoder_record_type_t record_type_id,
extern "C" {
rocprofiler_status_t
rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_handle_t handle,
rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_id_t handle,
rocprofiler_thread_trace_decoder_callback_t user_callback,
void* data,
uint64_t size,
@@ -249,8 +256,8 @@ rocprofiler_trace_decode(rocprofiler_thread_trace_decoder_handle_t handle,
}
const char*
rocprofiler_thread_trace_decoder_info_string(rocprofiler_thread_trace_decoder_handle_t handle,
rocprofiler_thread_trace_decoder_info_t info)
rocprofiler_thread_trace_decoder_info_string(rocprofiler_thread_trace_decoder_id_t handle,
rocprofiler_thread_trace_decoder_info_t info)
{
auto decoder = get_dl(handle);
if(decoder == nullptr) return nullptr;
+83 -79
Просмотреть файл
@@ -42,8 +42,6 @@ rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.p
find_package(rocprofiler-sdk REQUIRED)
set(IS_DISABLED ON)
find_library(
attdecoder_LIBRARY
NAMES rocprof-trace-decoder
@@ -58,43 +56,43 @@ endif()
find_package_handle_standard_args(attdecoder REQUIRED_VARS attdecoder_LIB_DIR
attdecoder_LIBRARY)
set(COMMON_PARAMS_NO_LIB ${PRELOAD_ARGS} --att --output-format json --log-level env -d
${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/)
set(IS_DISABLED ON)
if(attdecoder_FOUND)
set(IS_DISABLED OFF)
set(LIB_PATH_ENV "ROCPROF_ATT_LIBRARY_PATH=${attdecoder_LIB_DIR}")
set(COMMON_PARAMS --att-library-path ${attdecoder_LIB_DIR} ${COMMON_PARAMS_NO_LIB})
endif()
# hsa multiqueue dependency test with lib path
add_test(
NAME rocprofv3-test-hsa-multiqueue-att-cmd-env-att-lib-path-execute
NAME rocprofv3-test-att-hsa-multiqueue-cmd-env-att-lib-path-execute
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --log-level env --advanced-thread-trace
1 --att-target-cu 1 --att-shader-engine-mask 0x11 --kernel-include-regex copyD
--att-buffer-size 0x6000000 --att-simd-select 0x3 --att-serialize-all 1 -d
${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o out --output-format json
${PRELOAD_ARGS} --att-library-path ${attdecoder_LIB_DIR} --att-activity 8 --
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> ${COMMON_PARAMS}/cmd_input
--att-target-cu 1 --att-shader-engine-mask 0x11 --kernel-include-regex copyD
--att-buffer-size 0x6000000 --att-simd-select 0x3 --att-serialize-all 1 -o out --
$<TARGET_FILE:hsa_code_object_testapp>)
set_tests_properties(
rocprofv3-test-hsa-multiqueue-att-cmd-env-att-lib-path-execute
rocprofv3-test-att-hsa-multiqueue-cmd-env-att-lib-path-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED})
# hsa multiqueue dependency test with json input
add_test(
NAME rocprofv3-test-hsa-multiqueue-att-json-execute
NAME rocprofv3-test-att-hsa-multiqueue-json-execute
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --log-level env --att-library-path
${attdecoder_LIB_DIR} -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/json_input -i
${CMAKE_CURRENT_BINARY_DIR}/att_input.json ${PRELOAD_ARGS} --
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> ${COMMON_PARAMS}/json_input -i
${CMAKE_CURRENT_BINARY_DIR}/att_input.json --
$<TARGET_FILE:hsa_code_object_testapp>)
set_tests_properties(
rocprofv3-test-hsa-multiqueue-att-json-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED} ENVIRONMENT
"${LIB_PATH_ENV}")
rocprofv3-test-att-hsa-multiqueue-json-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED})
# validate output
add_test(
NAME rocprofv3-test-hsa-multiqueue-att-cmd-validate
NAME rocprofv3-test-att-hsa-multiqueue-cmd-validate
COMMAND
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input/out_results.json
@@ -102,7 +100,7 @@ add_test(
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input)
add_test(
NAME rocprofv3-test-hsa-multiqueue-att-json-validate
NAME rocprofv3-test-att-hsa-multiqueue-json-validate
COMMAND
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input/out_results.json
@@ -110,46 +108,46 @@ add_test(
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input)
set_tests_properties(
rocprofv3-test-hsa-multiqueue-att-cmd-validate
rocprofv3-test-att-hsa-multiqueue-cmd-validate
PROPERTIES TIMEOUT
45
LABELS
"integration-tests"
DEPENDS
"rocprofv3-test-hsa-multiqueue-att-cmd-ld-lib-path-execute"
"rocprofv3-test-att-hsa-multiqueue-cmd-ld-lib-path-execute"
FAIL_REGULAR_EXPRESSION
"AssertionError"
DISABLED
${IS_DISABLED})
set_tests_properties(
rocprofv3-test-hsa-multiqueue-att-json-validate
rocprofv3-test-att-hsa-multiqueue-json-validate
PROPERTIES TIMEOUT
45
LABELS
"integration-tests"
DEPENDS
"rocprofv3-test-hsa-multiqueue-att-json-execute"
"rocprofv3-test-att-hsa-multiqueue-json-execute"
FAIL_REGULAR_EXPRESSION
"AssertionError"
DISABLED
${IS_DISABLED})
function(configure_att_input _FILENAME _OUTDIR)
set(LIBRARY_OUTPUT_DIR ${_OUTDIR})
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/att_input.yml.in
function(configure_att_input _FILENAME)
set(LIB_PATH_LOC "${attdecoder_LIB_DIR}")
configure_file(${CMAKE_CURRENT_SOURCE_DIR}//${_FILENAME}.in
${CMAKE_CURRENT_BINARY_DIR}/${_FILENAME} @ONLY)
endfunction()
configure_att_input(att_input.yml "${attdecoder_LIB_DIR}")
configure_att_input(att_input_will_fail.yml "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
configure_att_input(att_input.yml)
configure_att_input(att_input_will_fail.yml)
# test yaml input
add_test(
NAME rocprofv3-test-att-yaml-input
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> -i
${CMAKE_CURRENT_BINARY_DIR}/att_input.yml --log-level env --echo --
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> ${COMMON_PARAMS_NO_LIB}/yaml -i
${CMAKE_CURRENT_BINARY_DIR}/att_input.yml --echo --
$<TARGET_FILE:hsa_code_object_testapp>)
set_tests_properties(
@@ -166,7 +164,7 @@ add_test(
set_tests_properties(
rocprofv3-test-att-yaml-input-will-fail
PROPERTIES TIMEOUT 45 LABELS "integration-tests" WILL_FAIL ON DISABLED True)
PROPERTIES TIMEOUT 45 LABELS "integration-tests" WILL_FAIL ON DISABLED ${IS_DISABLED})
add_test(NAME rocprofv3-test-att-library-path-cmd-line-will-fail
COMMAND $<TARGET_FILE:rocprofiler-sdk::rocprofv3> --att --att-library-path .
@@ -174,11 +172,12 @@ add_test(NAME rocprofv3-test-att-library-path-cmd-line-will-fail
set_tests_properties(
rocprofv3-test-att-library-path-cmd-line-will-fail
PROPERTIES TIMEOUT 45 LABELS "integration-tests" WILL_FAIL ON DISABLED True)
PROPERTIES TIMEOUT 45 LABELS "integration-tests" WILL_FAIL ON DISABLED ${IS_DISABLED})
add_test(NAME rocprofv3-test-att-library-path-env-var-will-fail
COMMAND $<TARGET_FILE:rocprofiler-sdk::rocprofv3> --att --log-level env --echo
-- $<TARGET_FILE:hsa_code_object_testapp>)
add_test(
NAME rocprofv3-test-att-library-path-env-var-will-fail
COMMAND $<TARGET_FILE:rocprofiler-sdk::rocprofv3> ${COMMON_PARAMS_NO_LIB}/envfail
--att-library-path . --echo -- $<TARGET_FILE:hsa_code_object_testapp>)
set_tests_properties(
rocprofv3-test-att-library-path-env-var-will-fail
@@ -187,65 +186,70 @@ set_tests_properties(
LABELS
"integration-tests"
ENVIRONMENT
"ROCPROF_ATT_LIBRARY_PATH=."
"ROCPROF_ATT_LIBRARY_PATH=${attdecoder_LIB_DIR}"
WILL_FAIL
ON
DISABLED
True)
${IS_DISABLED})
add_test(NAME rocprofv3-test-att-env-var
COMMAND $<TARGET_FILE:rocprofiler-sdk::rocprofv3> ${COMMON_PARAMS_NO_LIB}/envvar
-- $<TARGET_FILE:hsa_code_object_testapp>)
set_tests_properties(
rocprofv3-test-att-env-var
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT
"ROCPROF_ATT_LIBRARY_PATH=${attdecoder_LIB_DIR}" DISABLED ${IS_DISABLED})
#
# Uses ATT and Counter Collection at the same time
#
add_test(
NAME rocprofv3-test-hsa-multiqueue-att-plus-pmc-execute
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --log-level env --pmc SQ_WAVES
--advanced-thread-trace -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o
out --output-format json ${PRELOAD_ARGS} -- $<TARGET_FILE:vector-ops>)
add_test(NAME rocprofv3-test-att-hsa-multiqueue-plus-pmc-execute
COMMAND $<TARGET_FILE:rocprofiler-sdk::rocprofv3> ${COMMON_PARAMS}/cmd_input
--pmc SQ_WAVES -o out -- $<TARGET_FILE:vector-ops>)
set_tests_properties(
rocprofv3-test-hsa-multiqueue-att-plus-pmc-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED} ENVIRONMENT
"${LIB_PATH_ENV}")
rocprofv3-test-att-hsa-multiqueue-plus-pmc-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED})
# Check for conflict PMC + activity
add_test(
NAME rocprofv3-test-hsa-multiqueue-att-activity-pmc-will-fail
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --log-level env --pmc SQ_WAVES
--advanced-thread-trace -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o
out --att-activity 8 ${PRELOAD_ARGS} -- $<TARGET_FILE:vector-ops>)
add_test(NAME rocprofv3-test-att-hsa-multiqueue-activity-pmc-will-fail
COMMAND $<TARGET_FILE:rocprofiler-sdk::rocprofv3> ${COMMON_PARAMS}/cmd_input
--pmc SQ_WAVES -o out --att-activity 8 -- $<TARGET_FILE:vector-ops>)
set_tests_properties(
rocprofv3-test-hsa-multiqueue-att-activity-pmc-will-fail
PROPERTIES TIMEOUT
45
LABELS
"integration-tests"
DISABLED
${IS_DISABLED}
ENVIRONMENT
"${LIB_PATH_ENV}"
WILL_FAIL
ON)
rocprofv3-test-att-hsa-multiqueue-activity-pmc-will-fail
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED} WILL_FAIL ON)
# Check for conflict Perfcounters + activity
add_test(
NAME rocprofv3-test-hsa-multiqueue-att-activity-perf-will-fail
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --log-level env --att
--att-perfcounter-ctrl 8 -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o
out --att-activity 8 ${PRELOAD_ARGS} -- $<TARGET_FILE:vector-ops>)
NAME rocprofv3-test-att-hsa-multiqueue-activity-perf-will-fail
COMMAND $<TARGET_FILE:rocprofiler-sdk::rocprofv3> ${COMMON_PARAMS}/cmd_input
--att-perfcounter-ctrl 8 -o out --att-activity 8 -- $<TARGET_FILE:vector-ops>)
set_tests_properties(
rocprofv3-test-hsa-multiqueue-att-activity-perf-will-fail
PROPERTIES TIMEOUT
45
LABELS
"integration-tests"
DISABLED
${IS_DISABLED}
ENVIRONMENT
"${LIB_PATH_ENV}"
WILL_FAIL
ON)
rocprofv3-test-att-hsa-multiqueue-activity-perf-will-fail
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DISABLED ${IS_DISABLED} WILL_FAIL ON)
# ATT plus PC sampling
rocprofiler_sdk_pc_sampling_stochastic_disabled(IS_PC_SAMPLING_STOCHASTIC_DISABLED)
set(ATT_PLUS_PCS_DISABLE ${IS_DISABLED})
if(${IS_PC_SAMPLING_STOCHASTIC_DISABLED})
set(ATT_PLUS_PCS_DISABLE ON)
endif()
if(${ROCPROFILER_DISABLE_UNSTABLE_CTESTS})
set(ATT_PLUS_PCS_DISABLE ON)
endif()
set(PCS_ARGS --pc-sampling-beta-enabled 1 --pc-sampling-unit cycles
--pc-sampling-interval 16384 --pc-sampling-method stochastic)
add_test(NAME rocprofv3-test-att-plus-pc-sampling
COMMAND $<TARGET_FILE:rocprofiler-sdk::rocprofv3> ${COMMON_PARAMS}/cmd_input
${PCS_ARGS} -o out -- $<TARGET_FILE:vector-ops>)
set_tests_properties(
rocprofv3-test-att-plus-pc-sampling
PROPERTIES TIMEOUT 90 LABELS "integration-tests;thread-trace;pc-sampling" DISABLED
${ATT_PLUS_PCS_DISABLE})
+3 -3
Просмотреть файл
@@ -23,9 +23,9 @@
jobs:
- advanced_thread_trace: True
att_library_path:
- @LIBRARY_OUTPUT_DIR@/att
- @LIBRARY_OUTPUT_DIR@
- @LIB_PATH_LOC@/att
- @LIB_PATH_LOC@
- advanced_thread_trace: True
att_library_path:
- @LIBRARY_OUTPUT_DIR@
- @LIB_PATH_LOC@
+26
Просмотреть файл
@@ -0,0 +1,26 @@
# MIT License
#
# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
jobs:
- advanced_thread_trace: True
att_library_path:
- @LIB_PATH_LOC@/att
+8 -25
Просмотреть файл
@@ -29,17 +29,6 @@
#include <set>
#define C_API_BEGIN \
try \
{
#define C_API_END \
} \
catch(std::exception & e) \
{ \
std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << ' ' << e.what() << std::endl; \
} \
catch(...) { std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << std::endl; }
namespace ATTTest
{
namespace Agent
@@ -135,9 +124,10 @@ query_available_agents(rocprofiler_agent_version_t /* version */,
}
int
tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
tool_init(rocprofiler_client_finalize_t /* fini_func */, void* /* tool_data */)
{
(void) fini_func;
Callbacks::init();
ROCPROFILER_CALL(rocprofiler_create_context(&tracing_ctx), "context creation");
ROCPROFILER_CALL(rocprofiler_create_context(&agent_ctx), "context creation");
@@ -147,7 +137,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
nullptr,
0,
Callbacks::tool_codeobj_tracing_callback,
tool_data),
nullptr),
"code object tracing service configure");
ROCPROFILER_CALL(
@@ -156,13 +146,13 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
nullptr,
0,
dispatch_tracing_callback,
tool_data),
nullptr),
"dispatch tracing service configure");
ROCPROFILER_CALL(rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0,
&query_available_agents,
sizeof(rocprofiler_agent_t),
tool_data),
nullptr),
"Failed to find GPU agents");
int valid_ctx = 0;
@@ -177,13 +167,6 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
return 0;
}
void
tool_fini(void* tool_data)
{
Callbacks::finalize_json(tool_data);
delete static_cast<Callbacks::ToolData*>(tool_data);
}
} // namespace Agent
} // namespace ATTTest
@@ -206,8 +189,8 @@ rocprofiler_configure(uint32_t /* version */,
static auto cfg =
rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t),
&ATTTest::Agent::tool_init,
&ATTTest::Agent::tool_fini,
new Callbacks::ToolData{"att_agent_test/"}};
&Callbacks::finalize,
nullptr};
// return pointer to configure data
return &cfg;
+11 -16
Просмотреть файл
@@ -48,15 +48,17 @@ dispatch_callback(rocprofiler_agent_id_t /* agent */,
static std::atomic<size_t> count{0};
if(count.fetch_add(1) > NUM_KERNELS) return ROCPROFILER_THREAD_TRACE_CONTROL_NONE;
assert(userdata && "Dispatch callback passed null!");
assert(dispatch_userdata && "Dispatch callback passed null!");
dispatch_userdata->ptr = userdata;
return ROCPROFILER_THREAD_TRACE_CONTROL_START_AND_STOP;
}
int
tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data)
tool_init(rocprofiler_client_finalize_t /* fini_func */, void* /* tool_data */)
{
Callbacks::init();
static rocprofiler_context_id_t client_ctx = {0};
ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation");
@@ -67,7 +69,7 @@ tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data)
nullptr,
0,
Callbacks::tool_codeobj_tracing_callback,
tool_data),
nullptr),
"code object tracing service configure");
std::vector<rocprofiler_thread_trace_parameter_t> params{};
@@ -100,7 +102,7 @@ tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data)
params.size(),
dispatch_callback,
Callbacks::shader_data_callback,
tool_data),
nullptr),
"thread trace service configure");
}
@@ -121,13 +123,6 @@ tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data)
return 0;
}
void
tool_fini(void* tool_data)
{
Callbacks::finalize_json(tool_data);
delete static_cast<Callbacks::ToolData*>(tool_data);
}
} // namespace Multi
} // namespace ATTTest
@@ -147,11 +142,11 @@ rocprofiler_configure(uint32_t /* version */,
ATTTest::Multi::client_id = id;
// create configure data
static auto cfg = rocprofiler_tool_configure_result_t{
sizeof(rocprofiler_tool_configure_result_t),
&ATTTest::Multi::tool_init,
&ATTTest::Multi::tool_fini,
reinterpret_cast<void*>(new Callbacks::ToolData{"att_multi_test/"})};
static auto cfg =
rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t),
&ATTTest::Multi::tool_init,
&Callbacks::finalize,
nullptr};
// return pointer to configure data
return &cfg;
+14 -36
Просмотреть файл
@@ -41,35 +41,20 @@ dispatch_callback(rocprofiler_agent_id_t /* agent */,
rocprofiler_async_correlation_id_t /* correlation_id */,
rocprofiler_kernel_id_t kernel_id,
rocprofiler_dispatch_id_t /* dispatch_id */,
void* userdata,
rocprofiler_user_data_t* dispatch_userdata)
void* /* userdata */,
rocprofiler_user_data_t* /* dispatch_userdata */)
{
C_API_BEGIN
assert(userdata && "Dispatch callback passed null!");
auto& tool = *reinterpret_cast<Callbacks::ToolData*>(userdata);
dispatch_userdata->ptr = userdata;
static rocprofiler_kernel_id_t target_kernel_id = kernel_id;
if(target_kernel_id == kernel_id) return ROCPROFILER_THREAD_TRACE_CONTROL_START_AND_STOP;
static std::string_view desired_func_name = "branching_kernel";
try
{
auto& kernel_name = tool.kernel_id_to_kernel_name.at(kernel_id);
if(kernel_name.find(desired_func_name) == std::string::npos)
return ROCPROFILER_THREAD_TRACE_CONTROL_NONE;
return ROCPROFILER_THREAD_TRACE_CONTROL_START_AND_STOP;
} catch(...)
{
std::cerr << "Could not find kernel id: " << kernel_id << std::endl;
}
C_API_END
return ROCPROFILER_THREAD_TRACE_CONTROL_NONE;
}
int
tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data)
tool_init(rocprofiler_client_finalize_t /* fini_func */, void* /* tool_data */)
{
Callbacks::init();
static rocprofiler_context_id_t client_ctx = {0};
ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation");
@@ -80,7 +65,7 @@ tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data)
nullptr,
0,
Callbacks::tool_codeobj_tracing_callback,
tool_data),
nullptr),
"code object tracing service configure");
std::vector<rocprofiler_agent_id_t> agents{};
@@ -110,7 +95,7 @@ tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data)
0,
dispatch_callback,
Callbacks::shader_data_callback,
tool_data),
nullptr),
"thread trace service configure");
}
@@ -131,13 +116,6 @@ tool_init(rocprofiler_client_finalize_t /* fini_func */, void* tool_data)
return 0;
}
void
tool_fini(void* tool_data)
{
Callbacks::finalize_json(tool_data);
delete static_cast<Callbacks::ToolData*>(tool_data);
}
} // namespace Single
} // namespace ATTTest
@@ -157,11 +135,11 @@ rocprofiler_configure(uint32_t /* version */,
ATTTest::Single::client_id = id;
// create configure data
static auto cfg = rocprofiler_tool_configure_result_t{
sizeof(rocprofiler_tool_configure_result_t),
&ATTTest::Single::tool_init,
&ATTTest::Single::tool_fini,
reinterpret_cast<void*>(new Callbacks::ToolData{"att_single_test/"})};
static auto cfg =
rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t),
&ATTTest::Single::tool_init,
&Callbacks::finalize,
nullptr};
// return pointer to configure data
return &cfg;
+50 -112
Просмотреть файл
@@ -26,11 +26,6 @@
#endif
#include "trace_callbacks.hpp"
#include <rocprofiler-sdk/cxx/codeobj/code_printing.hpp>
#ifdef ENABLE_ATT_FILES
# include <nlohmann/json.hpp>
#endif
#include <unistd.h>
#include <cassert>
@@ -38,135 +33,78 @@
namespace Callbacks
{
using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t;
using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
rocprofiler_thread_trace_decoder_id_t decoder{};
std::atomic<size_t> latency{0};
void
tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* /* user_data */,
void* userdata)
void* /* userdata */)
{
C_API_BEGIN
if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return;
if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) return;
assert(userdata && "Dispatch callback passed null!");
auto& tool = *reinterpret_cast<Callbacks::ToolData*>(userdata);
if(record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER)
{
auto* data = static_cast<kernel_symbol_data_t*>(record.payload);
tool.kernel_id_to_kernel_name.emplace(data->kernel_id, data->kernel_name);
}
if(record.operation != ROCPROFILER_CODE_OBJECT_LOAD) return;
auto* data = static_cast<code_obj_load_data_t*>(record.payload);
auto* data = static_cast<rocprofiler_callback_tracing_code_object_load_data_t*>(record.payload);
if(data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE) return;
static std::atomic<int> filecnt{0};
std::string name = "codeobj_" + std::to_string(filecnt.fetch_add(1)) + ".out";
#ifdef ENABLE_ATT_FILES
if(std::string_view(data->uri).find("file:///") == 0)
if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD)
{
rocprofiler::sdk::codeobj::disassembly::CodeObjectBinary binary(data->uri);
std::ofstream file(tool.out_dir + name, std::ios::binary);
assert(file.is_open() && "Could not open codeobj file for writing");
file.write((char*) binary.buffer.data(), binary.buffer.size());
DECODER_CALL(
rocprofiler_thread_trace_decoder_codeobj_unload(decoder, data->code_object_id));
return;
}
else
{
std::ofstream file(tool.out_dir + name, std::ios::binary);
file.write((char*) data->memory_base, data->memory_size);
}
#endif
auto _lk = std::unique_lock{tool.mut};
tool.codeobjs.push_back(
{data->load_delta, data->load_size, data->code_object_id, name, data->uri});
DECODER_CALL(rocprofiler_thread_trace_decoder_codeobj_load(
decoder,
data->code_object_id,
data->load_delta,
data->load_size,
reinterpret_cast<const void*>(data->memory_base),
data->memory_size));
}
C_API_END
typedef void (*rocprofiler_thread_trace_decoder_callback_t)(
rocprofiler_thread_trace_decoder_record_type_t record_type_id,
void* trace_events,
uint64_t trace_size,
void* userdata);
void
shader_data_callback(rocprofiler_agent_id_t /* agent */,
int64_t /* se_id */,
void* se_data,
size_t data_size,
rocprofiler_user_data_t /* userdata */)
{
auto parse = [](rocprofiler_thread_trace_decoder_record_type_t record_type_id,
void* trace_events,
uint64_t trace_size,
void* /* userdata */) {
if(record_type_id != ROCPROFILER_THREAD_TRACE_DECODER_RECORD_WAVE) return;
for(size_t w = 0; w < trace_size; w++)
{
auto* wave = static_cast<rocprofiler_thread_trace_decoder_wave_t*>(trace_events);
for(size_t i = 0; i < wave->instructions_size; i++)
latency += wave->instructions_array[i].duration;
}
};
DECODER_CALL(rocprofiler_trace_decode(decoder, parse, se_data, data_size, nullptr));
}
void
shader_data_callback(rocprofiler_agent_id_t agent,
int64_t se_id,
void* se_data,
size_t data_size,
rocprofiler_user_data_t userdata)
init()
{
C_API_BEGIN
assert(userdata.ptr && "Dispatch callback passed null!");
auto& tool = *reinterpret_cast<Callbacks::ToolData*>(userdata.ptr);
std::string name = "agent_" + std::to_string(agent.handle) + "_shader_engine_" +
std::to_string(se_id) + "_" + std::to_string(agent.handle) + ".att";
#ifdef ENABLE_ATT_FILES
{
std::ofstream file(tool.out_dir + name, std::ios::binary);
assert(file.is_open() && "Could not open ATT file for writing");
file.write((char*) se_data, data_size);
}
#endif
assert(se_data);
assert(data_size);
auto _lk = std::unique_lock{tool.mut};
tool.att_files.push_back(name);
C_API_END
// const char* decoder_lib = std::getenv("ROCPROF_TRACE_DECODER_PATH");
DECODER_CALL(rocprofiler_thread_trace_decoder_create(&decoder, "/opt/rocm/lib"));
}
void
finalize_json(void* userdata)
finalize(void* /* tool_data */)
{
assert(userdata && "Dispatch callback passed null!");
rocprofiler_thread_trace_decoder_destroy(decoder);
auto& tool = *reinterpret_cast<Callbacks::ToolData*>(userdata);
auto _lk = std::unique_lock{tool.mut};
assert(!tool.att_files.empty());
#ifdef ENABLE_ATT_FILES
nlohmann::json att_json;
for(auto& file : tool.att_files)
att_json.push_back(file);
nlohmann::json codeobj_json;
nlohmann::json snapshot_json;
for(auto& file : tool.codeobjs)
{
nlohmann::json codeobj;
codeobj["code_object_id"] = file.id;
codeobj["load_delta"] = file.addr;
codeobj["load_size"] = file.size;
codeobj["uri"] = file.uri;
codeobj["filename"] = file.filename;
codeobj_json.push_back(codeobj);
nlohmann::json pair_json;
pair_json["key"] = file.id;
pair_json["value"] = file.filename;
snapshot_json.push_back(pair_json);
}
nlohmann::json tool_json;
tool_json["strings"]["att_files"] = att_json;
tool_json["code_objects"] = codeobj_json;
tool_json["strings"]["code_object_snapshot_files"] = snapshot_json;
nlohmann::json array;
array.push_back(tool_json);
nlohmann::json sdk_json;
sdk_json["rocprofiler-sdk-tool"] = array;
std::ofstream json_file(tool.out_dir + (std::to_string(getpid()) + "_results.json"));
assert(json_file.is_open() && "Could not open json file for writing!");
json_file << sdk_json;
#endif
if(latency.load() == 0) std::cerr << "Error: No latency was assigned to the trace!";
}
} // namespace Callbacks
+13 -49
Просмотреть файл
@@ -40,60 +40,21 @@
#include <unordered_map>
#include <vector>
#define ROCPROFILER_VAR_NAME_COMBINE(X, Y) X##Y
#define ROCPROFILER_VARIABLE(X, Y) ROCPROFILER_VAR_NAME_COMBINE(X, Y)
#define C_API_BEGIN \
try \
{
#define C_API_END \
} \
catch(std::exception & e) \
{ \
std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << ' ' << e.what() << std::endl; \
} \
catch(...) { std::cerr << "Error in " << __FILE__ << ':' << __LINE__ << std::endl; }
#define ROCPROFILER_CALL(result, msg) \
if((result) != ROCPROFILER_STATUS_SUCCESS) \
{ \
rocprofiler_status_t CHECKSTATUS = result; \
if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \
{ \
std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \
std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \
<< " failed with error code " << CHECKSTATUS << ": " << status_msg \
<< std::endl; \
std::stringstream errmsg{}; \
errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \
<< status_msg << ")"; \
throw std::runtime_error(errmsg.str()); \
} \
std::cerr << "Error: " << msg << std::endl; \
abort(); \
}
#define DECODER_CALL(result) \
if((result) != ROCPROFILER_STATUS_SUCCESS) \
{ \
std::cerr << "Error: Generic decoder error" << std::endl; \
}
namespace Callbacks
{
struct CodeobjInfo
{
int64_t addr = 0;
size_t size = 0;
size_t id = 0;
std::string filename{};
std::string uri{};
};
struct ToolData
{
ToolData(const char* out)
: out_dir(out){};
std::string out_dir{};
std::mutex mut{};
std::vector<CodeobjInfo> codeobjs{};
std::vector<std::string> att_files{};
std::unordered_map<uint64_t, std::string> kernel_id_to_kernel_name = {};
};
void
tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t*,
@@ -107,6 +68,9 @@ shader_data_callback(rocprofiler_agent_id_t agent,
rocprofiler_user_data_t userdata);
void
finalize_json(void* userdata);
init();
void
finalize(void* /* tool_data */);
} // namespace Callbacks