* Added first ATT API

* Finalizing thread trace API

* Fixing more rebase conflicts

* Added codeobj disassembly sample

* Fixing merge issues with rebase [2]

* Adding ATT packets

* Implemented thread trace intercept

* Moved codeobj parser to same repo as rocprofiler

* Moved thread trace to new API

* Fixing merge conflicts

* Fixing more merge conflicts

* Adding thread trace packet reuse

* Merged aql_profile_v2 headers

* Linked ATT sample to aqlprofile

* Updated decoder to include non-loaded codeobjs

* Implemented ISA decoder into ATT sample

* Added marker_id to vaddr

* Updating aql_profile_v2 API to memcpy

* Updating thread trace API to include 64bit markers. Using the result of ISA matching.

* Added instruction type and cycles summary

* Updated sample with selection of kernel by kernel_object

* Added option to copy from memory kernels

* Moved tool_data in thread_trace to dynamic alloc

* Restoring hsa.cpp

* Fixed ATT sample crash. General improvements.

* Moved codeobj library to outside src/

* Updated license header

* Moved codeobj_capture to camelcase

* Solving some more merge conflicts

* Update samples/advanced_thread_trace/CMakeLists.txt

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update samples/advanced_thread_trace/CMakeLists.txt

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update samples/code_object_isa_decode/CMakeLists.txt

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Update source/lib/rocprofiler-sdk/thread_trace/CMakeLists.txt

* Removing unused parameter check

* Adding const to isEmpty

* Removing unused warning

* Adding libdw-dev to requirements

* Running clang-format

* Commenting out new aql calls

* Clang format

* Unused variable fix

* Adding codeobj-decoder coverage

* Commenting out threadtrace

* Update samples/CMakeLists.txt

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* P

* WOverloaded

* Addressing clang-tidy

* Virtual destructor on ttracer class

* Corr id

* Fixing code source format

* Update CMakeLists.txt

* Build fixes

* Update source/lib/rocprofiler-sdk-codeobj/code_object_track.cpp

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* Fix shadowing

* Update CMakeLists.txt

* Update samples/CMakeLists.txt

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Ammar ELWazir <ammar.elwazir@amd.com>
Co-authored-by: Ammar ELWazir <aelwazir@amd.com>
Co-authored-by: Benjamin Welton <bewelton@amd.com>

[ROCm/rocprofiler-sdk commit: 69b8a43dc6]
Этот коммит содержится в:
Giovanni Lenzi Baraldi
2024-04-08 16:43:02 -03:00
коммит произвёл GitHub
родитель b501f25ba0
Коммит 2cd198a7e7
48 изменённых файлов: 4649 добавлений и 109 удалений
+2 -2
Просмотреть файл
@@ -200,7 +200,7 @@ jobs:
run: |
git config --global --add safe.directory '*'
apt-get update
apt-get install -y cmake python3-pip gcovr wkhtmltopdf xvfb xfonts-base xfonts-75dpi xfonts-100dpi xfonts-utils xfonts-encodings libfontconfig
apt-get install -y cmake python3-pip gcovr wkhtmltopdf xvfb xfonts-base xfonts-75dpi xfonts-100dpi xfonts-utils xfonts-encodings libfontconfig libdw-dev
python3 -m pip install -r requirements.txt
python3 -m pip install pytest pycobertura
@@ -388,7 +388,7 @@ jobs:
python3 -m pip install pytest
add-apt-repository ppa:ubuntu-toolchain-r/test
apt-get update
apt-get install -y g++-13
apt-get install -y g++-13 libdw-dev
update-alternatives --install $(which gcc) gcc $(which gcc-13) 100 --slave $(which g++) g++ $(which g++-13)
realpath $(which gcc)
realpath $(which g++)
+1
Просмотреть файл
@@ -29,3 +29,4 @@ add_subdirectory(api_buffered_tracing)
add_subdirectory(code_object_tracing)
add_subdirectory(counter_collection)
add_subdirectory(intercept_table)
# add_subdirectory(code_object_isa_decode) add_subdirectory(advanced_thread_trace)
+59
Просмотреть файл
@@ -0,0 +1,59 @@
#
#
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
if(NOT CMAKE_HIP_COMPILER)
find_program(
amdclangpp_EXECUTABLE
NAMES amdclang++
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATH_SUFFIXES bin llvm/bin NO_CACHE)
mark_as_advanced(amdclangpp_EXECUTABLE)
if(amdclangpp_EXECUTABLE)
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
endif()
endif()
project(rocprofiler-sdk-samples-advanced-thread-trace LANGUAGES CXX HIP)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
endif()
endforeach()
find_package(rocprofiler-sdk REQUIRED)
add_library(advanced-thread-trace-client SHARED)
target_sources(advanced-thread-trace-client PRIVATE client.cpp)
target_link_libraries(
advanced-thread-trace-client
PRIVATE rocprofiler::rocprofiler rocprofiler::samples-build-flags
rocprofiler-sdk-codeobj rocprofiler::samples-common-library)
set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP)
set_source_files_properties(main.cpp PROPERTIES COMPILE_FLAGS "-g")
find_package(Threads REQUIRED)
add_executable(advanced-thread-trace)
target_sources(advanced-thread-trace PRIVATE main.cpp)
target_link_libraries(
advanced-thread-trace PRIVATE advanced-thread-trace-client Threads::Threads
rocprofiler::samples-build-flags)
add_test(NAME advanced-thread-trace COMMAND $<TARGET_FILE:advanced-thread-trace>)
set_tests_properties(
advanced-thread-trace
PROPERTIES
TIMEOUT
45
LABELS
"samples"
ENVIRONMENT
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
FAIL_REGULAR_EXPRESSION
"threw an exception")
+578
Просмотреть файл
@@ -0,0 +1,578 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// undefine NDEBUG so asserts are implemented
#ifdef NDEBUG
# undef NDEBUG
#endif
/**
* @file samples/code_object_isa_decode/client.cpp
*
* @brief Example rocprofiler client (tool)
*/
#include <rocprofiler-sdk/buffer.h>
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h"
#include "code_object_track.hpp"
#include "common/defines.hpp"
#include "common/filesystem.hpp"
#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp"
#include <cxxabi.h>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <map>
#include <mutex>
#include <regex>
#include <string>
#include <string_view>
#include <thread>
#include <unordered_map>
#include <vector>
#define TARGET_CU 1
#define SIMD_SELECT 0x3
#define BUFFER_SIZE 0x6000000
#define NUM_SE 2
constexpr bool COPY_MEMORY_CODEOBJ = false;
template <>
struct std::hash<pcinfo_t>
{
uint64_t operator()(const pcinfo_t& info) const
{
return info.addr ^ (info.marker_id << 32ul) ^ (info.marker_id >> 32ul);
}
};
bool
operator==(const pcinfo_t& a, const pcinfo_t& b)
{
return a.addr == b.addr && a.marker_id == b.marker_id;
};
bool
operator<(const pcinfo_t& a, const pcinfo_t& b)
{
if(a.marker_id == b.marker_id) return a.addr < b.addr;
return a.marker_id < b.marker_id;
};
namespace client
{
using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t;
using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
rocprofiler_client_id_t* client_id = nullptr;
rocprofiler_context_id_t client_ctx = {};
struct isa_map_elem_t
{
std::atomic<size_t> hitcount{0};
std::atomic<size_t> latency{0};
std::shared_ptr<Instruction> code_line{nullptr};
};
struct ToolData
{
std::shared_mutex isa_map_mut;
std::mutex output_mut;
CodeobjAddressTranslate codeobjTranslate;
std::map<pcinfo_t, std::unique_ptr<isa_map_elem_t>> isa_map;
std::unordered_map<uint64_t, SymbolInfo> kernels_in_codeobj = {};
std::unordered_map<uint64_t, std::string> kernel_object_to_kernel_name = {};
std::stringstream output;
std::stringstream printKernel(uint64_t vaddr)
{
std::stringstream ss;
try
{
ss << '\n' << std::hex;
SymbolInfo& info = kernels_in_codeobj.at(vaddr);
ss << std::hex << "Found: " << info.name << " at addr: 0x" << vaddr << " with offset 0x"
<< info.faddr << " vaddr 0x" << info.vaddr << std::dec << '\n';
} catch(std::exception& e)
{
ss << e.what() << '\n';
}
return ss;
}
};
struct source_location
{
std::string function = {};
std::string file = {};
uint32_t line = 0;
std::string context = {};
};
struct trace_data_t
{
int64_t id;
uint8_t* data;
uint64_t size;
ToolData* tool;
};
std::atomic<int> TRACE_DATA_ID{-1};
std::atomic<int> KERNEL_ADDR_ID{-1};
std::atomic<int> OCCUPANCY_ID{-1};
void
tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* user_data,
void* callback_data)
{
if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return;
if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) return;
assert(callback_data && "Shader callback passed null!");
ToolData& tool = *reinterpret_cast<ToolData*>(callback_data);
if(record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER)
{
std::unique_lock<std::shared_mutex> lg(tool.isa_map_mut);
auto* data = static_cast<kernel_symbol_data_t*>(record.payload);
tool.kernel_object_to_kernel_name.emplace(data->kernel_object, data->kernel_name);
}
if(record.operation != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD) return;
std::unique_lock<std::shared_mutex> lg(tool.isa_map_mut);
auto* data = static_cast<code_obj_load_data_t*>(record.payload);
if(std::string_view(data->uri).find("file:///") == 0)
{
tool.codeobjTranslate.addDecoder(
data->uri, data->code_object_id, data->load_delta, data->load_size);
auto symbolmap = tool.codeobjTranslate.getSymbolMap(data->code_object_id);
for(auto& [vaddr, symbol] : symbolmap)
tool.kernels_in_codeobj[vaddr] = symbol;
}
else if(COPY_MEMORY_CODEOBJ)
{
tool.codeobjTranslate.addDecoder(reinterpret_cast<const void*>(data->memory_base),
data->memory_size,
data->code_object_id,
data->load_delta,
data->load_size);
auto symbolmap = tool.codeobjTranslate.getSymbolMap(data->code_object_id);
for(auto& [vaddr, symbol] : symbolmap)
tool.kernels_in_codeobj[vaddr] = symbol;
}
(void) user_data;
(void) callback_data;
}
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
rocprofiler_att_control_flags_t
dispatch_callback(rocprofiler_queue_id_t queue_id,
const rocprofiler_agent_t* agent,
rocprofiler_correlation_id_t correlation_id,
const hsa_kernel_dispatch_packet_t* dispatch_packet,
uint64_t kernel_id,
void* userdata)
{
assert(userdata && "Dispatch callback passed null!");
ToolData& tool = *reinterpret_cast<ToolData*>(userdata);
std::shared_lock<std::shared_mutex> lg(tool.isa_map_mut);
constexpr int desired_call_iteration = 1;
static std::atomic<int> call_id{0};
static std::string_view desired_func_name = "transposeLdsSwapInplace";
try
{
auto& kernel_name = tool.kernel_object_to_kernel_name.at(dispatch_packet->kernel_object);
if(kernel_name.find(desired_func_name) == std::string::npos)
return ROCPROFILER_ATT_CONTROL_NONE;
if(call_id.fetch_add(1) == desired_call_iteration)
return ROCPROFILER_ATT_CONTROL_START_AND_STOP;
} catch(...)
{
std::cerr << "Could not find kernel object: " << dispatch_packet->kernel_object
<< std::endl;
}
return ROCPROFILER_ATT_CONTROL_NONE;
}
void
iterate_trace_types(int id, const char* metadata, void*)
{
if(std::string_view(metadata).find("occupancy") == 0)
OCCUPANCY_ID.store(id);
else if(std::string_view(metadata).find("kernel_ids_addr") == 0)
KERNEL_ADDR_ID.store(id);
else if(std::string_view(metadata).find("tracedata") == 0)
TRACE_DATA_ID.store(id);
}
hsa_status_t
get_trace_data(int trace_type_id,
int correlation_id,
void* trace_events,
uint64_t trace_size,
void* userdata)
{
assert(userdata && "ISA callback passed null!");
trace_data_t& trace_data = *reinterpret_cast<trace_data_t*>(userdata);
assert(trace_data.tool && "ISA callback passed null!");
ToolData& tool = *reinterpret_cast<ToolData*>(trace_data.tool);
std::stringstream ss;
std::shared_lock<std::shared_mutex> shared_lock(tool.isa_map_mut);
if(trace_type_id == OCCUPANCY_ID)
{
ss << "Num waves: " << trace_size / 2 << '\n';
// auto* occ = reinterpret_cast<att_occupancy_info_t*>(trace_events);
}
else if(trace_type_id == KERNEL_ADDR_ID)
{
ss << "Num KRN events: " << trace_size << std::hex << '\n';
auto* kaddr = reinterpret_cast<pcinfo_t*>(trace_events);
for(size_t i = 0; i < trace_size; i++)
if(kaddr[i].addr != 0)
{
ss << " - ADDR: " << kaddr[i].addr << ' ' << tool.printKernel(kaddr[i].addr).str()
<< '\n';
}
ss << std::dec;
}
else if(trace_type_id == TRACE_DATA_ID)
{
ss << "Trace Length: " << trace_size << '\n';
auto* tracedata = reinterpret_cast<att_trace_event_t*>(trace_events);
for(size_t i = 0; i < trace_size; i++)
{
pcinfo_t pc = tracedata[i].pc;
auto it = tool.isa_map.find(pc);
if(it == tool.isa_map.end())
{
shared_lock.unlock();
{
std::unique_lock<std::shared_mutex> unique_lock(tool.isa_map_mut);
auto ptr = std::make_unique<isa_map_elem_t>();
ptr->code_line = tool.codeobjTranslate.get(pc.marker_id, pc.addr);
it = tool.isa_map.emplace(pc, std::move(ptr)).first;
}
shared_lock.lock();
}
it->second->hitcount.fetch_add(tracedata[i].hitcount, std::memory_order_relaxed);
it->second->latency.fetch_add(tracedata[i].latency, std::memory_order_relaxed);
}
}
std::unique_lock<std::mutex> lk(tool.output_mut);
tool.output << ss.str();
return HSA_STATUS_SUCCESS;
}
uint64_t
copy_trace_data(int* seid, uint8_t** buffer, uint64_t* buffer_size, void* userdata)
{
trace_data_t& data = *reinterpret_cast<trace_data_t*>(userdata);
*seid = data.id;
*buffer_size = data.size;
*buffer = data.data;
data.size = 0;
return *buffer_size;
}
hsa_status_t
isa_callback(char* isa_instruction,
char* source_reference,
uint64_t* isa_memory_size,
uint64_t* isa_size,
uint64_t* source_size,
uint64_t marker_id,
uint64_t offset,
void* userdata)
{
assert(userdata && "ISA callback passed null!");
trace_data_t& trace_data = *reinterpret_cast<trace_data_t*>(userdata);
assert(trace_data.tool && "ISA callback passed null!");
ToolData& tool = *reinterpret_cast<ToolData*>(trace_data.tool);
std::shared_ptr<Instruction> instruction;
{
std::unique_lock<std::shared_mutex> unique_lock(tool.isa_map_mut);
instruction = tool.codeobjTranslate.get(marker_id, offset);
}
if(!instruction.get()) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
{
size_t tmp_isa_size = *isa_size;
size_t tmp_source_size = *source_size;
*isa_size = instruction->inst.size();
*source_size = instruction->comment.size();
if(*isa_size > tmp_isa_size || *source_size > tmp_source_size)
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
memcpy(isa_instruction, instruction->inst.data(), *isa_size);
memcpy(source_reference, instruction->comment.data(), *source_size);
*isa_memory_size = instruction->size;
auto ptr = std::make_unique<isa_map_elem_t>();
ptr->code_line = std::move(instruction);
tool.isa_map.emplace(pcinfo_t{offset, marker_id}, std::move(ptr));
return HSA_STATUS_SUCCESS;
}
void
shader_data_callback(int64_t se_id,
int64_t data_type_id,
const char* data_type_name,
void* se_data,
size_t data_size,
void* userdata)
{
assert(userdata && "Shader callback passed null!");
ToolData& tool = *reinterpret_cast<ToolData*>(userdata);
{
std::unique_lock<std::mutex> lk(tool.output_mut);
tool.output << "SE ID: " << se_id << " with size " << data_size << std::hex << '\n';
}
trace_data_t data{.id = se_id, .data = (uint8_t*) se_data, .size = data_size, .tool = &tool};
auto status = aqlprofile_att_parse_data(copy_trace_data, get_trace_data, isa_callback, &data);
(void) status;
}
#pragma GCC diagnostic pop
int
tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
{
(void) fini_func;
aqlprofile_att_parser_iterate_event_list(iterate_trace_types, nullptr);
ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation");
ROCPROFILER_CALL(
rocprofiler_configure_callback_tracing_service(client_ctx,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
nullptr,
0,
tool_codeobj_tracing_callback,
tool_data),
"code object tracing service configure");
rocprofiler_att_parameters_t parameters{};
parameters.target_cu = TARGET_CU;
parameters.simd_select = SIMD_SELECT;
parameters.buffer_size = BUFFER_SIZE;
std::vector<int> shaders;
for(size_t i = 0; i < NUM_SE; i++)
shaders.push_back(2 * i); // use shader engines 0, 2
parameters.shader_ids = shaders.data();
parameters.shader_num = shaders.size();
ROCPROFILER_CALL(
rocprofiler_configure_thread_trace_service(
client_ctx, parameters, dispatch_callback, shader_data_callback, tool_data),
"thread trace service configure");
int valid_ctx = 0;
ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx),
"context validity check");
if(valid_ctx == 0)
{
// notify rocprofiler that initialization failed
// and all the contexts, buffers, etc. created
// should be ignored
return -1;
}
ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "context start");
// no errors
return 0;
}
void
tool_fini(void* tool_data)
{
assert(tool_data && "tool_fini callback passed null!");
ToolData& tool = *reinterpret_cast<ToolData*>(tool_data);
std::unique_lock<std::shared_mutex> isa_lk(tool.isa_map_mut);
std::unique_lock<std::mutex> out_lk(tool.output_mut);
// Find largest instruction
size_t max_inst_size = 0;
for(auto& [addr, lines] : tool.isa_map)
if(lines.get()) max_inst_size = std::max(max_inst_size, lines->code_line->inst.size());
std::string empty_space;
empty_space.resize(max_inst_size, ' ');
size_t vmc_latency = 0;
size_t lgk_latency = 0;
size_t scalar_latency = 0;
size_t vector_latency = 0;
size_t other_latency = 0;
size_t scalar_exec = 0;
size_t vector_exec = 0;
size_t other_exec = 0;
for(auto& [addr, line] : tool.isa_map)
if(line.get())
{
size_t hitcount = line->hitcount.load(std::memory_order_relaxed);
size_t latency = line->latency.load(std::memory_order_relaxed);
auto& code_line = line->code_line->inst;
tool.output << std::hex << "0x" << addr.addr << std::dec << ' ' << code_line
<< empty_space.substr(0, max_inst_size - code_line.size())
<< " Hit: " << hitcount << " - Latency: " << latency << '\n';
if(code_line.find("s_waitcnt") == 0)
{
other_exec += hitcount;
if(code_line.find("lgkmcnt") != std::string::npos)
lgk_latency += latency;
else
vmc_latency += latency;
}
else if(code_line.find("v_") == 0)
{
vector_exec += hitcount;
vector_latency += latency;
}
else if(code_line.find("s_") == 0)
{
scalar_exec += hitcount;
scalar_latency += latency;
}
else
{
other_exec += hitcount;
other_latency += latency;
}
}
size_t total_exec = vector_exec + scalar_exec + other_exec;
size_t memory_latency = vmc_latency + lgk_latency;
size_t total_latency = memory_latency + vector_latency + scalar_latency + other_latency;
float vmc_fraction = 100 * vmc_latency / float(total_latency);
float lgk_fraction = 100 * lgk_latency / float(total_latency);
tool.output << "Total executed instructions: " << total_exec << '\n'
<< "Total executed vector instructions: " << vector_exec << " with average "
<< vector_latency / float(vector_exec) << " cycles.\n"
<< "Total executed scalar instructions: " << scalar_exec << " with average "
<< scalar_latency / float(scalar_exec) << " cycles.\n"
<< "Vector memory ops occupied: " << vmc_fraction << "% of cycles.\n"
<< "Scalar and LDS memory ops occupied: " << lgk_fraction << "% of cycles.\n";
std::cout << tool.output.str();
}
void
setup()
{
if(int status = 0;
rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0)
{
ROCPROFILER_CALL(rocprofiler_force_configure(&rocprofiler_configure),
"force configuration");
}
}
// force configuration when library is loaded
bool cfg_on_load = (client::setup(), true);
} // namespace client
extern "C" rocprofiler_tool_configure_result_t*
rocprofiler_configure(uint32_t version,
const char* runtime_version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// only activate if main tool
if(priority > 0) return nullptr;
// set the client name
id->name = "Adv_Thread_Trace_Sample";
// store client info
client::client_id = id;
// compute major/minor/patch version info
uint32_t major = version / 10000;
uint32_t minor = (version % 10000) / 100;
uint32_t patch = version % 100;
// generate info string
auto info = std::stringstream{};
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
<< " (" << runtime_version << ")";
std::clog << info.str() << std::endl;
auto* data = new client::ToolData{};
// create configure data
static auto cfg =
rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t),
&client::tool_init,
&client::tool_fini,
reinterpret_cast<void*>(data)};
// return pointer to configure data
return &cfg;
}
+245
Просмотреть файл
@@ -0,0 +1,245 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "hip/hip_runtime.h"
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <random>
#include <stdexcept>
#include "transpose_kernels.hpp"
#define PRINT_ALIGN 36
namespace
{
using lock_guard_t = std::lock_guard<std::mutex>;
auto print_lock = std::mutex{};
} // namespace
enum TransposeType
{
TRANSPOSE_NAIVE,
TRANSPOSE_INPLACE_LDS,
TRANSPOSE_NO_BANK_CONFLICTS
};
class ITranspose
{
public:
virtual void run(TransposeType ttype, int numThreadsY, int num_iter) = 0;
virtual ~ITranspose(){};
};
template <typename T>
class Transpose : public ITranspose
{
public:
Transpose(int dev, size_t _M)
: devID(dev)
, M(_M)
, databytes(_M * _M * sizeof(T))
{
HIP_API_CALL(hipSetDevice(devID));
HIP_API_CALL(hipStreamCreate(&stream));
std::default_random_engine _engine{std::random_device{}() * rand()};
std::uniform_int_distribution<int> _dist{0, 1000};
inp_matrix = new T[M * M];
out_matrix = new T[M * M];
for(size_t i = 0; i < M * M; i++)
inp_matrix[i] = static_cast<T>(_dist(_engine));
memset(out_matrix, 0, databytes);
HIP_API_CALL(hipMalloc(&in, databytes));
HIP_API_CALL(hipMalloc(&out, databytes));
HIP_API_CALL(hipMemsetAsync(in, 0, databytes, stream));
HIP_API_CALL(hipMemsetAsync(out, 0, databytes, stream));
HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, databytes, hipMemcpyDefault, stream));
HIP_API_CALL(hipEventCreate(&start));
HIP_API_CALL(hipEventCreate(&stop));
}
void run(TransposeType ttype, int numThreadsY, int num_iter) override
{
HIP_API_CALL(hipSetDevice(devID));
dim3 grid(M / TILE_DIM, M / TILE_DIM, 1);
dim3 block(TILE_DIM, numThreadsY, 1);
auto Kernel = transposeNaive<T>;
std::string KernelName = "transposeNaive";
if(ttype == TransposeType::TRANSPOSE_NO_BANK_CONFLICTS)
{
Kernel = transposeLdsNoBankConflicts<T>;
KernelName = "transposeLdsNoBankConflicts";
}
else if(ttype == TransposeType::TRANSPOSE_INPLACE_LDS)
{
Kernel = transposeLdsSwapInplace<T>;
KernelName = "transposeLdsSwapInplace";
}
{
std::string functypeid = __PRETTY_FUNCTION__;
auto it_beg = functypeid.rfind("[T = ");
auto it_end = functypeid.rfind(']');
if(it_beg != std::string::npos) it_beg += std::string("[T = ").size();
if(it_beg < it_end && it_end != std::string::npos)
KernelName += '<' + functypeid.substr(it_beg, it_end - it_beg) + '>';
}
HIP_API_CALL(hipStreamSynchronize(stream));
HIP_API_CALL(hipEventRecord(start, stream));
for(int i = 0; i < num_iter; i++)
{
Kernel<<<grid, block, 0, stream>>>(out, in, M);
HIP_API_CALL(hipGetLastError());
}
HIP_API_CALL(hipEventRecord(stop, stream));
HIP_API_CALL(hipMemcpyAsync(out_matrix, out, databytes, hipMemcpyDefault, stream));
HIP_API_CALL(hipEventSynchronize(stop));
float time;
HIP_API_CALL(hipEventElapsedTime(&time, start, stop));
float GB = databytes * num_iter * 2 / float(1 << 30);
{
lock_guard_t _lk{print_lock};
std::cout << "The average performance of " << std::setw(38) << KernelName << " : "
<< (1000 * GB / time) << " GB/s" << std::endl;
}
verify();
}
void verify() const
{
HIP_API_CALL(hipStreamSynchronize(stream));
for(int i = 0; i < 10; i++)
{
int row = rand() % M;
int col = rand() % M;
if(inp_matrix[row * M + col] != out_matrix[col * M + row])
{
lock_guard_t _lk{print_lock};
std::cout << "mismatch: " << row << ", " << col << " : "
<< inp_matrix[row * M + col] << " | " << out_matrix[col * M + row]
<< std::endl;
}
}
}
virtual ~Transpose()
{
HIP_API_CALL(hipSetDevice(devID));
HIP_API_CALL(hipEventDestroy(start));
HIP_API_CALL(hipEventDestroy(stop));
HIP_API_CALL(hipFree(in));
HIP_API_CALL(hipFree(out));
HIP_API_CALL(hipStreamDestroy(stream));
delete[] inp_matrix;
delete[] out_matrix;
}
const int devID;
const size_t M;
const size_t databytes;
hipStream_t stream;
hipEvent_t start, stop;
T* inp_matrix = nullptr;
T* out_matrix = nullptr;
T* in = nullptr;
T* out = nullptr;
};
int
main(int argc, char** argv)
{
int deviceId = 0;
int blockDimY = 8;
int num_iter = 1;
int mat_size = 8192;
for(int i = 1; i < argc; ++i)
{
auto _arg = std::string{argv[i]};
if(_arg == "?" || _arg == "-h" || _arg == "--help")
{
std::cout << "usage: transpose "
<< "[MatrixSize (" << mat_size << ")] "
<< "[numIter (" << num_iter << ")] "
<< "[blockDimY (" << blockDimY << ")] "
<< "[DEVICE_ID (" << deviceId << ")] " << std::endl;
exit(EXIT_SUCCESS);
}
}
if(argc > 1) mat_size = atoll(argv[1]);
if(argc > 2) num_iter = atoll(argv[2]);
if(argc > 3) blockDimY = atoll(argv[3]);
if(argc > 4) deviceId = atoll(argv[4]);
printf("[transpose] Matrix size: %d, device ID: %d, num iter: %d, blockDimY: %d\n",
mat_size,
deviceId,
num_iter,
blockDimY);
int ndevice = 0;
HIP_API_CALL(hipGetDeviceCount(&ndevice));
printf("[transpose] Number of devices found: %i\n", ndevice);
assert(ndevice > 0);
if(deviceId >= ndevice) exit(EXIT_FAILURE);
{
std::vector<std::unique_ptr<ITranspose>> kernels;
kernels.push_back(std::make_unique<Transpose<int>>(deviceId, mat_size));
kernels.push_back(std::make_unique<Transpose<float>>(deviceId, mat_size));
kernels.push_back(std::make_unique<Transpose<double>>(deviceId, mat_size));
for(auto& kernel : kernels)
{
kernel->run(TransposeType::TRANSPOSE_NAIVE, blockDimY, num_iter);
kernel->run(TransposeType::TRANSPOSE_INPLACE_LDS, blockDimY, num_iter);
kernel->run(TransposeType::TRANSPOSE_NO_BANK_CONFLICTS, blockDimY, num_iter);
}
}
HIP_API_CALL(hipDeviceSynchronize());
return 0;
}
+84
Просмотреть файл
@@ -0,0 +1,84 @@
#pragma once
#include "hip/hip_runtime.h"
#define HIP_API_CALL(CALL) \
{ \
hipError_t error_ = (CALL); \
if(error_ != hipSuccess) \
{ \
lock_guard_t _hip_api_print_lk{print_lock}; \
fprintf(stderr, \
"%s:%d :: HIP error : %s\n", \
__FILE__, \
__LINE__, \
hipGetErrorString(error_)); \
exit(EXIT_FAILURE); \
} \
}
#define TILE_DIM 64
template <typename T>
__global__ void
transposeNaive(T* odata, const T* idata, size_t size)
{
size_t idx = blockIdx.x * TILE_DIM + threadIdx.x;
size_t block_posy = blockIdx.y * TILE_DIM;
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
odata[size * idx + block_posy + idy] = idata[idx + (block_posy + idy) * size];
}
template <typename T>
__global__ void
transposeLdsNoBankConflicts(T* odata, const T* idata, size_t size)
{
__shared__ T tile[TILE_DIM][TILE_DIM + 1];
size_t idx_in = blockIdx.x * TILE_DIM + threadIdx.x;
size_t idy_in = blockIdx.y * TILE_DIM + threadIdx.y;
size_t index_in = idx_in + idy_in * size;
size_t idx_out = blockIdx.y * TILE_DIM + threadIdx.x;
size_t idy_out = blockIdx.x * TILE_DIM + threadIdx.y;
size_t index_out = idx_out + idy_out * size;
for(size_t y = 0; y < TILE_DIM; y += blockDim.y)
tile[threadIdx.y + y][threadIdx.x] = idata[index_in + y * size];
__syncthreads();
for(size_t y = 0; y < TILE_DIM; y += blockDim.y)
odata[index_out + y * size] = tile[threadIdx.x][threadIdx.y + y];
}
// Generates more interesting ISA
template <typename T>
__global__ void
transposeLdsSwapInplace(T* odata, const T* idata, size_t size)
{
__shared__ T tile[TILE_DIM][TILE_DIM];
const size_t idx_in = blockIdx.x * TILE_DIM + threadIdx.x;
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
tile[idy][threadIdx.x] = idata[idx_in + (idy + blockIdx.y * TILE_DIM) * size];
__syncthreads();
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
if(idy < threadIdx.x)
{
T temp = tile[idy][threadIdx.x];
tile[idy][threadIdx.x] = tile[threadIdx.x][idy];
tile[threadIdx.x][idy] = temp;
}
__syncthreads();
const size_t idx_out = blockIdx.y * TILE_DIM + threadIdx.x;
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
odata[(blockIdx.x * TILE_DIM + idy) * size + idx_out] = tile[idy][threadIdx.x];
}
+52
Просмотреть файл
@@ -0,0 +1,52 @@
#
#
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
if(NOT CMAKE_HIP_COMPILER)
find_program(
amdclangpp_EXECUTABLE
NAMES amdclang++
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATH_SUFFIXES bin llvm/bin NO_CACHE)
mark_as_advanced(amdclangpp_EXECUTABLE)
if(amdclangpp_EXECUTABLE)
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
endif()
endif()
project(rocprofiler-sdk-samples-code-object-isa-decode LANGUAGES CXX HIP)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
endif()
endforeach()
find_package(rocprofiler-sdk REQUIRED)
find_package(Threads REQUIRED)
add_executable(code-object-isa-decode)
target_sources(code-object-isa-decode PRIVATE main.cpp client.cpp)
set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP)
set_source_files_properties(main.cpp PROPERTIES COMPILE_FLAGS "-g")
target_link_libraries(
code-object-isa-decode PRIVATE Threads::Threads rocprofiler::samples-common-library
rocprofiler-sdk-codeobj rocprofiler::rocprofiler)
add_test(NAME code-object-isa-decode COMMAND $<TARGET_FILE:code-object-isa-decode>)
set_tests_properties(
code-object-isa-decode
PROPERTIES
TIMEOUT
45
LABELS
"samples"
ENVIRONMENT
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV};HSA_TOOLS_LIB=$<TARGET_FILE:rocprofiler::rocprofiler-shared-library>"
FAIL_REGULAR_EXPRESSION
"threw an exception")
+251
Просмотреть файл
@@ -0,0 +1,251 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// undefine NDEBUG so asserts are implemented
#ifdef NDEBUG
# undef NDEBUG
#endif
/**
* @file samples/code_object_isa_decode/client.cpp
*
* @brief Example rocprofiler client (tool)
*/
#include <rocprofiler-sdk/buffer.h>
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp"
#include "common/defines.hpp"
#include "common/filesystem.hpp"
#include <cxxabi.h>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <map>
#include <mutex>
#include <regex>
#include <string>
#include <string_view>
#include <thread>
#include <vector>
#include "code_object_track.hpp"
namespace client
{
namespace
{
using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t;
using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
using kernel_symbol_map_t = std::unordered_map<std::string, std::pair<uint64_t, size_t>>;
rocprofiler_client_id_t* client_id = nullptr;
rocprofiler_client_finalize_t client_fini_func = nullptr;
rocprofiler_context_id_t client_ctx = {};
kernel_symbol_map_t registered_kernels = {};
CodeobjAddressTranslate codeobjTranslate;
void
tool_codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* user_data,
void* callback_data)
{
if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return;
if(record.phase != ROCPROFILER_CALLBACK_PHASE_LOAD) return;
if(record.operation == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_LOAD)
{
auto* data = static_cast<code_obj_load_data_t*>(record.payload);
if(std::string_view(data->uri).find("file:///") == 0)
{
codeobjTranslate.addDecoder(
data->uri, data->code_object_id, data->load_base, data->load_size);
auto symbolmap = codeobjTranslate.getSymbolMap();
for(auto& [vaddr, symbol] : symbolmap)
registered_kernels.insert({symbol.name, {vaddr, vaddr + symbol.mem_size}});
}
}
else if(record.operation ==
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER)
{
std::cout << std::hex;
auto* data = static_cast<kernel_symbol_data_t*>(record.payload);
auto kernel_name = std::regex_replace(data->kernel_name, std::regex{"(\\.kd)$"}, "");
if(registered_kernels.find(kernel_name) == registered_kernels.end())
{
std::cout << "Not Found: " << kernel_name << " in codeobj." << std::endl;
return;
}
auto& begin_end = registered_kernels.at(kernel_name);
std::cout << std::hex << "Found: " << kernel_name << " at addr: 0x" << begin_end.first
<< std::dec << ". Printing first 64 bytes:" << std::endl;
std::unordered_set<std::string> references{};
int num_waitcnts = 0;
int num_scalar = 0;
int num_vector = 0;
int num_other = 0;
size_t vaddr = begin_end.first;
while(vaddr < begin_end.second)
{
auto inst = codeobjTranslate.get(vaddr);
std::string_view source = inst->comment;
if(source.rfind('/') < source.size()) source = source.substr(source.rfind('/'));
if(vaddr < begin_end.first + 64) std::cout << '\t' << inst->inst << '\n';
if(source.rfind(':') < source.size()) source = source.substr(0, source.rfind(':'));
references.insert(std::string(source));
if(inst->inst.find("v_") == 0)
num_vector++;
else if(inst->inst.find("s_waitcnt") == 0)
num_waitcnts++;
else if(inst->inst.find("s_") == 0)
num_scalar++;
else
num_other++;
vaddr += inst->size;
}
std::cout << " --- Num Scalar: " << num_scalar << "\n --- Num Vector: " << num_vector
<< "\n --- Num Waitcnts: " << num_waitcnts
<< "\n --- Other instructions: " << num_other
<< "\nKernel has source references to: " << std::endl;
for(auto& ref : references)
std::cout << '\t' << ref << std::endl;
}
(void) user_data;
(void) callback_data;
}
int
tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
{
client_fini_func = fini_func;
ROCPROFILER_CALL(rocprofiler_create_context(&client_ctx), "context creation");
ROCPROFILER_CALL(
rocprofiler_configure_callback_tracing_service(client_ctx,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
nullptr,
0,
tool_codeobj_tracing_callback,
tool_data),
"code object tracing service configure");
int valid_ctx = 0;
ROCPROFILER_CALL(rocprofiler_context_is_valid(client_ctx, &valid_ctx),
"context validity check");
if(valid_ctx == 0)
{
// notify rocprofiler that initialization failed
// and all the contexts, buffers, etc. created
// should be ignored
return -1;
}
ROCPROFILER_CALL(rocprofiler_start_context(client_ctx), "context start");
// no errors
return 0;
}
void
tool_fini(void* tool_data)
{
(void) tool_data;
}
void
setup()
{
if(int status = 0;
rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0)
{
ROCPROFILER_CALL(rocprofiler_force_configure(&rocprofiler_configure),
"force configuration");
}
}
} // namespace
// force configuration when library is loaded
bool cfg_on_load = (client::setup(), true);
} // namespace client
extern "C" rocprofiler_tool_configure_result_t*
rocprofiler_configure(uint32_t version,
const char* runtime_version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// only activate if main tool
if(priority > 0) return nullptr;
// set the client name
id->name = "ExampleTool";
// store client info
client::client_id = id;
// compute major/minor/patch version info
uint32_t major = version / 10000;
uint32_t minor = (version % 10000) / 100;
uint32_t patch = version % 100;
// generate info string
auto info = std::stringstream{};
info << id->name << " is using rocprofiler-sdk v" << major << "." << minor << "." << patch
<< " (" << runtime_version << ")";
std::clog << info.str() << std::endl;
// create configure data
static auto cfg =
rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t),
&client::tool_init,
&client::tool_fini,
nullptr};
// return pointer to configure data
return &cfg;
}
+246
Просмотреть файл
@@ -0,0 +1,246 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "hip/hip_runtime.h"
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <random>
#include <stdexcept>
#include "transpose_kernels.hpp"
#define PRINT_ALIGN 36
namespace
{
using lock_guard_t = std::lock_guard<std::mutex>;
auto print_lock = std::mutex{};
} // namespace
enum TransposeType
{
TRANSPOSE_NAIVE,
TRANSPOSE_INPLACE_LDS,
TRANSPOSE_NO_BANK_CONFLICTS
};
class ITranspose
{
public:
virtual void run(TransposeType ttype, int numThreadsY, int num_iter) = 0;
virtual ~ITranspose(){};
};
template <typename T>
class Transpose : public ITranspose
{
public:
Transpose(int dev, size_t _M)
: devID(dev)
, M(_M)
, databytes(_M * _M * sizeof(T))
{
HIP_API_CALL(hipSetDevice(devID));
HIP_API_CALL(hipStreamCreate(&stream));
std::default_random_engine _engine{std::random_device{}() * rand()};
std::uniform_int_distribution<int> _dist{0, 1000};
inp_matrix = new T[M * M];
out_matrix = new T[M * M];
for(size_t i = 0; i < M * M; i++)
inp_matrix[i] = static_cast<T>(_dist(_engine));
memset(out_matrix, 0, databytes);
HIP_API_CALL(hipMalloc(&in, databytes));
HIP_API_CALL(hipMalloc(&out, databytes));
HIP_API_CALL(hipMemsetAsync(in, 0, databytes, stream));
HIP_API_CALL(hipMemsetAsync(out, 0, databytes, stream));
HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, databytes, hipMemcpyDefault, stream));
HIP_API_CALL(hipEventCreate(&start));
HIP_API_CALL(hipEventCreate(&stop));
}
void run(TransposeType ttype, int numThreadsY, int num_iter) override
{
HIP_API_CALL(hipSetDevice(devID));
dim3 grid(M / TILE_DIM, M / TILE_DIM, 1);
dim3 block(TILE_DIM, numThreadsY, 1);
auto Kernel = transposeNaive<T>;
std::string KernelName = "transposeNaive";
if(ttype == TransposeType::TRANSPOSE_NO_BANK_CONFLICTS)
{
Kernel = transposeLdsNoBankConflicts<T>;
KernelName = "transposeLdsNoBankConflicts";
}
else if(ttype == TransposeType::TRANSPOSE_INPLACE_LDS)
{
Kernel = transposeLdsSwapInplace<T>;
KernelName = "transposeLdsSwapInplace";
}
{
std::string functypeid = __PRETTY_FUNCTION__;
auto it_beg = functypeid.rfind("[T = ");
auto it_end = functypeid.rfind(']');
if(it_beg != std::string::npos) it_beg += std::string("[T = ").size();
if(it_beg < it_end && it_end != std::string::npos)
KernelName += '<' + functypeid.substr(it_beg, it_end - it_beg) + '>';
}
HIP_API_CALL(hipStreamSynchronize(stream));
HIP_API_CALL(hipEventRecord(start, stream));
for(int i = 0; i < num_iter; i++)
{
Kernel<<<grid, block, 0, stream>>>(out, in, M);
HIP_API_CALL(hipGetLastError());
}
HIP_API_CALL(hipEventRecord(stop, stream));
HIP_API_CALL(hipMemcpyAsync(out_matrix, out, databytes, hipMemcpyDefault, stream));
HIP_API_CALL(hipEventSynchronize(stop));
float time;
HIP_API_CALL(hipEventElapsedTime(&time, start, stop));
float GB = databytes * num_iter * 2 / float(1 << 30);
{
lock_guard_t _lk{print_lock};
std::cout << "The average performance of " << std::setw(38) << KernelName << " : "
<< (1000 * GB / time) << " GB/s" << std::endl;
}
verify();
}
void verify() const
{
HIP_API_CALL(hipStreamSynchronize(stream));
for(int i = 0; i < 10; i++)
{
int row = rand() % M;
int col = rand() % M;
if(inp_matrix[row * M + col] != out_matrix[col * M + row])
{
lock_guard_t _lk{print_lock};
std::cout << "mismatch: " << row << ", " << col << " : "
<< inp_matrix[row * M + col] << " | " << out_matrix[col * M + row]
<< std::endl;
}
}
}
virtual ~Transpose()
{
HIP_API_CALL(hipSetDevice(devID));
HIP_API_CALL(hipEventDestroy(start));
HIP_API_CALL(hipEventDestroy(stop));
HIP_API_CALL(hipFree(in));
HIP_API_CALL(hipFree(out));
HIP_API_CALL(hipStreamDestroy(stream));
delete[] inp_matrix;
delete[] out_matrix;
}
const int devID;
const size_t M;
const size_t databytes;
hipStream_t stream;
hipEvent_t start, stop;
T* inp_matrix = nullptr;
T* out_matrix = nullptr;
T* in = nullptr;
T* out = nullptr;
};
int
main(int argc, char** argv)
{
int deviceId = 0;
int blockDimY = 8;
int num_iter = 3;
int mat_size = 8192;
for(int i = 1; i < argc; ++i)
{
auto _arg = std::string{argv[i]};
if(_arg == "?" || _arg == "-h" || _arg == "--help")
{
std::cout << "usage: transpose "
<< "[MatrixSize (" << mat_size << ")] "
<< "[numIter (" << num_iter << ")] "
<< "[blockDimY (" << blockDimY << ")] "
<< "[DEVICE_ID (" << deviceId << ")] " << std::endl;
exit(EXIT_SUCCESS);
}
}
if(argc > 1) mat_size = atoll(argv[1]);
if(argc > 2) num_iter = atoll(argv[2]);
if(argc > 3) blockDimY = atoll(argv[3]);
if(argc > 4) deviceId = atoll(argv[4]);
printf("[transpose] Matrix size: %d, device ID: %d, num iter: %d, blockDimY: %d\n",
mat_size,
deviceId,
num_iter,
blockDimY);
int ndevice = 0;
HIP_API_CALL(hipGetDeviceCount(&ndevice));
printf("[transpose] Number of devices found: %i\n", ndevice);
assert(ndevice > 0);
if(deviceId >= ndevice) exit(EXIT_FAILURE);
{
std::vector<std::unique_ptr<ITranspose>> kernels;
kernels.push_back(std::make_unique<Transpose<int>>(deviceId, mat_size));
kernels.push_back(std::make_unique<Transpose<float>>(deviceId, mat_size));
kernels.push_back(std::make_unique<Transpose<double>>(deviceId, mat_size));
for(auto& kernel : kernels)
{
kernel->run(TransposeType::TRANSPOSE_NAIVE, blockDimY, num_iter);
kernel->run(TransposeType::TRANSPOSE_INPLACE_LDS, blockDimY, num_iter);
kernel->run(TransposeType::TRANSPOSE_NO_BANK_CONFLICTS, blockDimY, num_iter);
}
}
HIP_API_CALL(hipDeviceSynchronize());
HIP_API_CALL(hipDeviceReset());
return 0;
}
+84
Просмотреть файл
@@ -0,0 +1,84 @@
#pragma once
#include "hip/hip_runtime.h"
#define HIP_API_CALL(CALL) \
{ \
hipError_t error_ = (CALL); \
if(error_ != hipSuccess) \
{ \
lock_guard_t _hip_api_print_lk{print_lock}; \
fprintf(stderr, \
"%s:%d :: HIP error : %s\n", \
__FILE__, \
__LINE__, \
hipGetErrorString(error_)); \
exit(EXIT_FAILURE); \
} \
}
#define TILE_DIM 64
template <typename T>
__global__ void
transposeNaive(T* odata, const T* idata, size_t size)
{
size_t idx = blockIdx.x * TILE_DIM + threadIdx.x;
size_t block_posy = blockIdx.y * TILE_DIM;
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
odata[size * idx + block_posy + idy] = idata[idx + (block_posy + idy) * size];
}
template <typename T>
__global__ void
transposeLdsNoBankConflicts(T* odata, const T* idata, size_t size)
{
__shared__ T tile[TILE_DIM][TILE_DIM + 1];
size_t idx_in = blockIdx.x * TILE_DIM + threadIdx.x;
size_t idy_in = blockIdx.y * TILE_DIM + threadIdx.y;
size_t index_in = idx_in + idy_in * size;
size_t idx_out = blockIdx.y * TILE_DIM + threadIdx.x;
size_t idy_out = blockIdx.x * TILE_DIM + threadIdx.y;
size_t index_out = idx_out + idy_out * size;
for(size_t y = 0; y < TILE_DIM; y += blockDim.y)
tile[threadIdx.y + y][threadIdx.x] = idata[index_in + y * size];
__syncthreads();
for(size_t y = 0; y < TILE_DIM; y += blockDim.y)
odata[index_out + y * size] = tile[threadIdx.x][threadIdx.y + y];
}
// Generates more interesting ISA
template <typename T>
__global__ void
transposeLdsSwapInplace(T* odata, const T* idata, size_t size)
{
__shared__ T tile[TILE_DIM][TILE_DIM];
const size_t idx_in = blockIdx.x * TILE_DIM + threadIdx.x;
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
tile[idy][threadIdx.x] = idata[idx_in + (idy + blockIdx.y * TILE_DIM) * size];
__syncthreads();
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
if(idy < threadIdx.x)
{
T temp = tile[idy][threadIdx.x];
tile[idy][threadIdx.x] = tile[threadIdx.x][idy];
tile[threadIdx.x][idy] = temp;
}
__syncthreads();
const size_t idx_out = blockIdx.y * TILE_DIM + threadIdx.x;
for(size_t idy = threadIdx.y; idy < TILE_DIM; idy += blockDim.y)
odata[(blockIdx.x * TILE_DIM + idy) * size + idx_out] = tile[idy][threadIdx.x];
}
+1
Просмотреть файл
@@ -30,6 +30,7 @@ set(ROCPROFILER_HEADER_FILES
profile_config.h
registration.h
spm.h
thread_trace.h
${CMAKE_CURRENT_BINARY_DIR}/version.h)
install(
+1
Просмотреть файл
@@ -80,6 +80,7 @@ ROCPROFILER_EXTERN_C_FINI
// #include "rocprofiler-sdk/marker.h"
#include "rocprofiler-sdk/pc_sampling.h"
#include "rocprofiler-sdk/profile_config.h"
#include "rocprofiler-sdk/thread_trace.h"
// #include "rocprofiler-sdk/spm.h"
ROCPROFILER_EXTERN_C_INIT
+108
Просмотреть файл
@@ -0,0 +1,108 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <rocprofiler-sdk/agent.h>
#include <rocprofiler-sdk/defines.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/hsa.h>
ROCPROFILER_EXTERN_C_INIT
/**
* @defgroup THREAD_TRACE Thread Trace Service
* @brief Provides API calls to enable and handle thread trace data
*
* @{
*/
typedef void (*rocprofiler_att_data_types_callback_t)(int64_t data_type_id,
const char* data_type_name,
void* userdata);
rocprofiler_status_t
rocprofiler_att_iterate_data_types(rocprofiler_att_data_types_callback_t callback,
void* userdata) ROCPROFILER_API;
typedef union
{
struct
{
uint32_t enable_async_queue : 1;
uint32_t enable_occupancy_mode : 1;
uint32_t enable_double_buffering : 1;
uint32_t disable_att_markers : 1;
uint32_t disable_software_header : 1;
};
uint32_t raw;
} rocprofiler_att_parameter_flag_t;
typedef struct
{
rocprofiler_att_parameter_flag_t flags;
int shader_num;
int* shader_ids;
uint64_t buffer_size;
uint8_t target_cu;
uint8_t simd_select;
uint8_t reserved;
uint8_t vmid_mask;
uint16_t perfcounter_mask;
uint8_t perfcounter_ctrl;
uint8_t perfcounter_num;
const char** perfcounter;
} rocprofiler_att_parameters_t;
typedef enum
{
ROCPROFILER_ATT_CONTROL_NONE = 0,
ROCPROFILER_ATT_CONTROL_START = 1,
ROCPROFILER_ATT_CONTROL_STOP = 2,
ROCPROFILER_ATT_CONTROL_START_AND_STOP = 3
} rocprofiler_att_control_flags_t;
typedef rocprofiler_att_control_flags_t (*rocprofiler_att_dispatch_callback_t)(
rocprofiler_queue_id_t queue_id,
const rocprofiler_agent_t* agent,
rocprofiler_correlation_id_t correlation_id,
const hsa_kernel_dispatch_packet_t* dispatch_packet,
uint64_t kernel_id,
void* userdata);
typedef void (*rocprofiler_att_shader_data_callback_t)(int64_t shader_engine_id,
int64_t data_type_id,
const char* data_type_name,
void* data,
size_t data_size,
void* userdata);
rocprofiler_status_t
rocprofiler_configure_thread_trace_service(rocprofiler_context_id_t context_id,
rocprofiler_att_parameters_t parameters,
rocprofiler_att_dispatch_callback_t dispatch_callback,
rocprofiler_att_shader_data_callback_t shader_callback,
void* callback_userdata) ROCPROFILER_API;
/** @} */
ROCPROFILER_EXTERN_C_FINI
+2
Просмотреть файл
@@ -9,6 +9,8 @@ set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "tools")
add_subdirectory(rocprofiler-sdk-tool)
add_subdirectory(rocprofiler-sdk-roctx)
add_subdirectory(rocprofiler-sdk-codeobj)
if(ROCPROFILER_BUILD_TESTS)
add_subdirectory(tests)
endif()
+58
Просмотреть файл
@@ -0,0 +1,58 @@
# ##############################################################################
# # Copyright (c) 2024 Advanced Micro Devices, Inc. # # Permission is hereby
# granted, free of charge, to any person obtaining a copy # of this software and
# associated documentation files (the "Software"), to # deal in the Software
# without restriction, including without limitation the # rights to use, copy,
# modify, merge, publish, distribute, sublicense, and/or # sell copies of the
# Software, and to permit persons to whom the Software is # furnished to do so,
# subject to the following conditions: # # The above copyright notice and this
# permission notice shall be included in # all copies or substantial portions of
# the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
# KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
# EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
# USE OR OTHER DEALINGS # IN THE SOFTWARE.
# ##############################################################################
set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "codeobj")
find_package(amd_comgr REQUIRED)
set(LIB_CODEOBJ_PARSER_SOURCES code_object_track.cpp code_printing.cpp disassembly.cpp)
set(LIB_CODEOBJ_PARSER_HEADERS code_object_track.hpp code_printing.hpp disassembly.hpp
segment.hpp)
add_library(rocprofiler-sdk-codeobj STATIC)
target_sources(rocprofiler-sdk-codeobj PRIVATE ${LIB_CODEOBJ_PARSER_SOURCES})
target_link_libraries(
rocprofiler-sdk-codeobj
PRIVATE amd_comgr dw elf rocprofiler::rocprofiler-build-flags
rocprofiler::rocprofiler-memcheck rocprofiler::rocprofiler-common-library)
target_include_directories(rocprofiler-sdk-codeobj PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
if(ROCPROFILER_BUILD_CODECOV)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -coverage")
target_link_libraries(rocprofiler-sdk-codeobj PUBLIC gcov)
endif()
set_target_properties(
rocprofiler-sdk-codeobj
PROPERTIES LIBRARY_OUTPUT_DIRECTORY
${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}/rocprofiler-sdk-codeobj
SOVERSION ${PROJECT_VERSION_MAJOR}
VERSION ${PROJECT_VERSION}
BUILD_RPATH "\$ORIGIN:\$ORIGIN/.."
INSTALL_RPATH "\$ORIGIN:\$ORIGIN/..")
install(
TARGETS rocprofiler-sdk-codeobj
DESTINATION ${CMAKE_INSTALL_LIBDIR}/rocprofiler-sdk-codeobj
COMPONENT tools
EXPORT rocprofiler-sdk-codeobj-targets)
install(
FILES ${LIB_CODEOBJ_PARSER_HEADERS}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocprofiler-sdk-codeobj
COMPONENT tools)
+21
Просмотреть файл
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 AMD ROCm™ Software
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+1
Просмотреть файл
@@ -0,0 +1 @@
# rocprofiler-codeobj-parser
+196
Просмотреть файл
@@ -0,0 +1,196 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <sys/mman.h>
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <functional>
#include <iostream>
#include <map>
#include <mutex>
#include <optional>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>
#include "code_object_track.hpp"
void
CodeobjRecorder::Load(uint64_t addr,
uint64_t load_size,
const std::string& URI,
uint64_t mem_addr,
uint64_t mem_size,
uint64_t id)
{
Load(std::make_shared<CodeobjCaptureInstance>(
addr, load_size, URI, mem_addr, mem_size, id, capture_mode));
}
void
CodeobjCaptureInstance::copyCodeobjFromFile(uint64_t offset, uint64_t size, const std::string& path)
{
std::ifstream file(path, std::ios::in | std::ios::binary);
if(!file)
{
printf("could not open `%s'\n", path.c_str());
return;
}
if(!size)
{
file.ignore(std::numeric_limits<std::streamsize>::max());
size_t bytes = file.gcount();
file.clear();
if(bytes < offset)
{
printf("invalid uri `%s' (file size < offset)\n", path.c_str());
return;
}
size = bytes - offset;
}
file.seekg(offset, std::ios_base::beg);
buffer.resize(size);
file.read(&buffer[0], size);
}
void CodeobjCaptureInstance::copyCodeobjFromMemory(uint64_t, uint64_t)
{
// buffer.resize(mem_size);
// std::memcpy(buffer.data(), (uint64_t*)mem_addr, mem_size);
}
std::pair<size_t, size_t>
CodeobjCaptureInstance::parse_uri()
{
const std::string protocol_delim{"://"};
size_t protocol_end = URI.find(protocol_delim);
protocol = URI.substr(0, protocol_end);
protocol_end += protocol_delim.length();
std::transform(protocol.begin(), protocol.end(), protocol.begin(), [](unsigned char c) {
return std::tolower(c);
});
std::string path;
size_t path_end = URI.find_first_of("#?", protocol_end);
if(path_end != std::string::npos)
{
path = URI.substr(protocol_end, path_end++ - protocol_end);
}
else
{
path = URI.substr(protocol_end);
}
/* %-decode the string. */
decoded_path = std::string{};
decoded_path.reserve(path.length());
for(size_t i = 0; i < path.length(); ++i)
{
if(path[i] == '%' && std::isxdigit(path[i + 1]) && std::isxdigit(path[i + 2]))
{
decoded_path += std::stoi(path.substr(i + 1, 2), 0, 16);
i += 2;
}
else
{
decoded_path += path[i];
}
}
/* Tokenize the query/fragment. */
std::vector<std::string> tokens;
size_t pos, last = path_end;
while((pos = URI.find('&', last)) != std::string::npos)
{
tokens.emplace_back(URI.substr(last, pos - last));
last = pos + 1;
}
if(last != std::string::npos) tokens.emplace_back(URI.substr(last));
/* Create a tag-value map from the tokenized query/fragment. */
std::unordered_map<std::string, std::string> params;
std::for_each(tokens.begin(), tokens.end(), [&](std::string& token) {
size_t delim = token.find('=');
if(delim != std::string::npos)
{
params.emplace(token.substr(0, delim), token.substr(delim + 1));
}
});
size_t offset = 0;
size_t size = 0;
if(auto offset_it = params.find("offset"); offset_it != params.end())
offset = std::stoul(offset_it->second, nullptr, 0);
if(auto size_it = params.find("size"); size_it != params.end())
{
if(!(size = std::stoul(size_it->second, nullptr, 0))) throw std::exception();
}
return {offset, size};
}
void
CodeobjCaptureInstance::reset(codeobj_capture_mode_t mode)
{
if(static_cast<int>(mode) <= static_cast<int>(capture_mode)) return;
capture_mode = mode;
if(!buffer.empty()) return;
size_t offset, size;
try
{
std::tie(offset, size) = parse_uri();
} catch(...)
{
std::cerr << "Error parsing URI " << URI << std::endl;
return;
}
if(protocol == "file")
{
if(mode == ROCPROFILER_CODEOBJ_CAPTURE_COPY_FILE_AND_MEMORY)
copyCodeobjFromFile(offset, size, decoded_path);
}
else if(protocol == "memory")
{
if(mode != ROCPROFILER_CODEOBJ_CAPTURE_SYMBOLS_ONLY)
copyCodeobjFromMemory(mem_addr, mem_size);
}
else
{
printf("\"%s\" protocol not supported\n", protocol.c_str());
}
}
+158
Просмотреть файл
@@ -0,0 +1,158 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <fstream>
#include <memory>
#include <shared_mutex>
#include <string>
#include <unordered_map>
#include <vector>
/**
* Enum defines how code object is captured for ATT and PC Sampling
*/
enum codeobj_capture_mode_t
{
/**
* Capture file and memory paths for the loaded code object
*/
ROCPROFILER_CODEOBJ_CAPTURE_SYMBOLS_ONLY = 0,
/**
* Capture symbols for file:// and memory:// type objects,
* and generate a copy of all kernel code for objects under memory://
*/
ROCPROFILER_CODEOBJ_CAPTURE_COPY_MEMORY = 1,
/**
* Capture symbols and all kernel code for file:// and memory:// type objects
*/
ROCPROFILER_CODEOBJ_CAPTURE_COPY_FILE_AND_MEMORY = 2,
ROCPROFILER_CODEOBJ_CAPTURE_LAST = 3,
};
/**
* A class to keep track of currently loaded code objects.
* Only the public static methods are thread-safe and expected to be used.
*/
class CodeobjCaptureInstance
{
public:
CodeobjCaptureInstance(uint64_t _addr,
uint64_t _load_size,
const std::string& _uri,
uint64_t _mem_addr,
uint64_t _mem_size,
uint64_t id,
codeobj_capture_mode_t mode)
: addr(_addr)
, load_size(_load_size)
, load_id(id)
, URI(_uri)
, mem_addr(_mem_addr)
, mem_size(_mem_size)
{
reset(mode);
};
const uint64_t addr;
const uint64_t load_size;
const uint64_t load_id;
private:
void reset(codeobj_capture_mode_t mode);
std::pair<size_t, size_t> parse_uri();
void DecodePath();
void copyCodeobjFromFile(uint64_t offset, uint64_t size, const std::string& path);
void copyCodeobjFromMemory(uint64_t, uint64_t);
std::string URI{};
std::string decoded_path{};
std::string protocol{};
std::vector<char> buffer{};
uint64_t mem_addr = 0;
uint64_t mem_size = 0;
codeobj_capture_mode_t capture_mode = ROCPROFILER_CODEOBJ_CAPTURE_SYMBOLS_ONLY;
};
typedef std::shared_ptr<CodeobjCaptureInstance> CodeobjPtr;
template <>
struct std::hash<CodeobjPtr>
{
uint64_t operator()(const CodeobjPtr& p) const { return p->load_id; }
};
template <>
struct std::equal_to<CodeobjPtr>
{
bool operator()(const CodeobjPtr& a, const CodeobjPtr& b) const
{
return (a->addr == b->addr) && (a->load_id == b->load_id);
};
};
/**
* A class to keep track of the history of loaded code objets.
* Only the public static methods are thread-safe and expected to be used.
*/
class CodeobjRecorder
{
public:
CodeobjRecorder(codeobj_capture_mode_t mode)
: capture_mode(mode){};
void Load(uint64_t _addr,
uint64_t _load_size,
const std::string& _uri,
uint64_t mem_addr,
uint64_t mem_size,
uint64_t id);
void Load(CodeobjPtr capture)
{
std::lock_guard<std::shared_mutex> lk(mutex);
captures[capture->load_id] = capture;
}
void Unload(uint64_t id)
{
std::lock_guard<std::shared_mutex> lk(mutex);
captures.erase(id);
};
public:
std::shared_mutex mutex;
std::vector<CodeobjPtr> get()
{
std::vector<CodeobjPtr> vec;
std::shared_lock<std::shared_mutex> lk(mutex);
for(auto& [k, v] : captures)
vec.push_back(v);
return vec;
};
private:
codeobj_capture_mode_t capture_mode;
std::unordered_map<uint64_t, CodeobjPtr> captures;
};
+278
Просмотреть файл
@@ -0,0 +1,278 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp"
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <cxxabi.h>
#include <elfutils/libdw.h>
#include <hsa/amd_hsa_elf.h>
#include <sys/mman.h>
#include <atomic>
#define C_API_BEGIN \
try \
{
#define C_API_END(returndata) \
} \
catch(std::exception & e) \
{ \
std::string s = e.what(); \
if(s.find("memory protocol not supported!") == std::string::npos) \
std::cerr << "Codeobj API lookup: " << e.what() << std::endl; \
return returndata; \
} \
catch(std::string & s) \
{ \
if(s.find("memory protocol not supported!") == std::string::npos) \
std::cerr << "Codeobj API lookup: " << s << std::endl; \
return returndata; \
} \
catch(...) { return returndata; }
CodeobjDecoderComponent::CodeobjDecoderComponent(const char* codeobj_data, uint64_t codeobj_size)
{
m_fd = -1;
#if defined(_GNU_SOURCE) && defined(MFD_ALLOW_SEALING) && defined(MFD_CLOEXEC)
m_fd = ::memfd_create(m_uri.c_str(), MFD_ALLOW_SEALING | MFD_CLOEXEC);
#endif
if(m_fd == -1) // If fail, attempt under /tmp
m_fd = ::open("/tmp", O_TMPFILE | O_RDWR, 0666);
if(m_fd == -1)
{
printf("could not create a temporary file for code object\n");
return;
}
if(size_t size = ::write(m_fd, codeobj_data, codeobj_size); size != codeobj_size)
{
printf("could not write to the temporary file\n");
return;
}
::lseek(m_fd, 0, SEEK_SET);
fsync(m_fd);
m_line_number_map = {};
std::unique_ptr<Dwarf, void (*)(Dwarf*)> dbg(dwarf_begin(m_fd, DWARF_C_READ),
[](Dwarf* _dbg) { dwarf_end(_dbg); });
/*if (!dbg) {
rocprofiler::warning("Error opening Dwarf!\n");
return;
} */
if(dbg)
{
Dwarf_Off cu_offset{0}, next_offset;
size_t header_size;
std::unordered_set<uint64_t> used_addrs;
while(!dwarf_nextcu(
dbg.get(), cu_offset, &next_offset, &header_size, nullptr, nullptr, nullptr))
{
Dwarf_Die die;
if(!dwarf_offdie(dbg.get(), cu_offset + header_size, &die)) continue;
Dwarf_Lines* lines;
size_t line_count;
if(dwarf_getsrclines(&die, &lines, &line_count)) continue;
for(size_t i = 0; i < line_count; ++i)
{
Dwarf_Addr addr;
int line_number;
Dwarf_Line* line = dwarf_onesrcline(lines, i);
if(line && !dwarf_lineaddr(line, &addr) && !dwarf_lineno(line, &line_number) &&
line_number)
{
std::string src = dwarf_linesrc(line, nullptr, nullptr);
auto dwarf_line = src + ':' + std::to_string(line_number);
if(used_addrs.find(addr) != used_addrs.end())
{
size_t pos = m_line_number_map.lower_bound(addr);
m_line_number_map.data()[pos].str += ' ' + dwarf_line;
continue;
}
used_addrs.insert(addr);
m_line_number_map.insert(DSourceLine{addr, 0, std::move(dwarf_line)});
}
}
cu_offset = next_offset;
}
}
// Can throw
disassembly = std::make_unique<DisassemblyInstance>(codeobj_data, codeobj_size);
if(m_line_number_map.size())
{
size_t total_size = 0;
for(size_t i = 0; i < m_line_number_map.size() - 1; i++)
{
size_t s = m_line_number_map.get(i + 1).vaddr - m_line_number_map.get(i).vaddr;
m_line_number_map.data()[i].size = s;
total_size += s;
}
m_line_number_map.back().size = std::max(total_size, codeobj_size) - total_size;
}
try
{
m_symbol_map = disassembly->GetKernelMap(); // Can throw
} catch(...)
{}
// disassemble_kernels();
}
CodeobjDecoderComponent::~CodeobjDecoderComponent()
{
if(m_fd) ::close(m_fd);
}
std::shared_ptr<Instruction>
CodeobjDecoderComponent::disassemble_instruction(uint64_t faddr, uint64_t vaddr)
{
if(!disassembly) throw std::exception();
const char* cpp_line = nullptr;
try
{
const DSourceLine& it = m_line_number_map.find_obj(vaddr);
cpp_line = it.str.data();
} catch(...)
{}
auto pair = disassembly->ReadInstruction(faddr);
auto inst = std::make_shared<Instruction>(std::move(pair.first), pair.second);
inst->faddr = faddr;
inst->vaddr = vaddr;
if(cpp_line) inst->comment = cpp_line;
return inst;
}
LoadedCodeobjDecoder::LoadedCodeobjDecoder(const char* filepath,
uint64_t _load_addr,
uint64_t mem_size)
: load_addr(_load_addr)
, load_end(load_addr + mem_size)
{
if(!filepath) throw "Empty filepath.";
std::string_view fpath(filepath);
if(fpath.rfind(".out") + 4 == fpath.size())
{
std::ifstream file(filepath, std::ios::in | std::ios::binary);
if(!file.is_open()) throw "Invalid filename " + std::string(filepath);
std::vector<char> buffer;
file.seekg(0, file.end);
buffer.resize(file.tellg());
file.seekg(0, file.beg);
file.read(buffer.data(), buffer.size());
decoder = std::make_unique<CodeobjDecoderComponent>(buffer.data(), buffer.size());
}
else
{
std::unique_ptr<CodeObjectBinary> binary = std::make_unique<CodeObjectBinary>(filepath);
auto& buffer = binary->buffer;
decoder = std::make_unique<CodeobjDecoderComponent>(buffer.data(), buffer.size());
}
elf_segments = decoder->disassembly->getSegments();
}
LoadedCodeobjDecoder::LoadedCodeobjDecoder(const void* data,
size_t size,
uint64_t _load_addr,
uint64_t mem_size)
: load_addr(_load_addr)
, load_end(load_addr + mem_size)
{
decoder = std::make_unique<CodeobjDecoderComponent>(reinterpret_cast<const char*>(data), size);
elf_segments = decoder->disassembly->getSegments();
}
std::shared_ptr<Instruction>
LoadedCodeobjDecoder::add_to_map(uint64_t ld_addr)
{
if(!decoder || ld_addr < load_addr) throw std::out_of_range("Addr not in decoder");
uint64_t voffset = ld_addr - load_addr;
auto faddr = decoder->disassembly->va2fo(voffset);
if(!faddr) throw std::out_of_range("Could not find file offset");
auto shared = decoder->disassemble_instruction(*faddr, voffset);
shared->ld_addr = ld_addr;
decoded_map[ld_addr] = shared;
return shared;
}
std::shared_ptr<Instruction>
LoadedCodeobjDecoder::get(uint64_t addr)
{
if(decoded_map.find(addr) != decoded_map.end()) return decoded_map[addr];
try
{
return add_to_map(addr);
} catch(std::exception& e)
{
std::cerr << e.what() << " at addr " << std::hex << addr << std::dec << std::endl;
}
throw std::out_of_range("Invalid address");
return nullptr;
}
#define PUBLIC_API __attribute__((visibility("default")))
+257
Просмотреть файл
@@ -0,0 +1,257 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>
#include "disassembly.hpp"
#include "segment.hpp"
struct Instruction
{
Instruction() = default;
Instruction(std::string&& _inst, size_t _size)
: inst(std::move(_inst))
, size(_size)
{}
std::string inst;
std::string comment;
uint64_t faddr;
uint64_t vaddr;
uint64_t ld_addr;
size_t size;
};
struct DSourceLine
{
uint64_t vaddr;
uint64_t size;
std::string str;
uint64_t begin() const { return vaddr; }
bool inrange(uint64_t addr) const { return addr >= vaddr && addr < vaddr + size; }
};
class CodeobjDecoderComponent
{
public:
CodeobjDecoderComponent(const char* codeobj_data, uint64_t codeobj_size);
~CodeobjDecoderComponent();
std::shared_ptr<Instruction> disassemble_instruction(uint64_t faddr, uint64_t vaddr);
int m_fd;
cached_ordered_vector<DSourceLine> m_line_number_map;
std::map<uint64_t, SymbolInfo> m_symbol_map{};
std::string m_uri;
std::vector<std::shared_ptr<Instruction>> instructions{};
std::unique_ptr<DisassemblyInstance> disassembly{};
};
class LoadedCodeobjDecoder
{
public:
LoadedCodeobjDecoder(const char* filepath, uint64_t load_addr, uint64_t memsize);
LoadedCodeobjDecoder(const void* data, uint64_t size, uint64_t load_addr, size_t memsize);
std::shared_ptr<Instruction> add_to_map(uint64_t ld_addr);
std::shared_ptr<Instruction> get(uint64_t addr);
uint64_t begin() const { return load_addr; };
uint64_t end() const { return load_end; }
uint64_t size() const { return load_end - load_addr; }
bool inrange(uint64_t addr) const { return addr >= begin() && addr < end(); }
const char* getSymbolName(uint64_t addr) const
{
if(!decoder) return nullptr;
auto it = decoder->m_symbol_map.find(addr - load_addr);
if(it != decoder->m_symbol_map.end()) return it->second.name.data();
return nullptr;
}
std::map<uint64_t, SymbolInfo>& getSymbolMap() const
{
if(!decoder) throw std::exception();
return decoder->m_symbol_map;
}
std::vector<std::pair<uint64_t, uint64_t>> elf_segments{};
const uint64_t load_addr;
private:
uint64_t load_end = 0;
std::unordered_map<uint64_t, std::shared_ptr<Instruction>> decoded_map;
std::unique_ptr<CodeobjDecoderComponent> decoder{nullptr};
};
/**
* @brief Maps ID and offsets into instructions
*/
class CodeobjMap
{
public:
CodeobjMap() = default;
virtual void addDecoder(const char* filepath,
codeobj_marker_id_t id,
uint64_t load_addr,
uint64_t memsize)
{
decoders[id] = std::make_shared<LoadedCodeobjDecoder>(filepath, load_addr, memsize);
}
virtual void addDecoder(const void* data,
size_t memory_size,
codeobj_marker_id_t id,
uint64_t load_addr,
uint64_t memsize)
{
decoders[id] =
std::make_shared<LoadedCodeobjDecoder>(data, memory_size, load_addr, memsize);
}
virtual bool removeDecoderbyId(codeobj_marker_id_t id) { return decoders.erase(id) != 0; }
std::shared_ptr<Instruction> get(codeobj_marker_id_t id, uint64_t offset)
{
auto& decoder = decoders.at(id);
return decoder->get(decoder->begin() + offset);
}
const char* getSymbolName(codeobj_marker_id_t id, uint64_t offset)
{
auto& decoder = decoders.at(id);
uint64_t vaddr = decoder->begin() + offset;
if(decoder->inrange(vaddr)) return decoder->getSymbolName(vaddr);
return nullptr;
}
protected:
std::unordered_map<codeobj_marker_id_t, std::shared_ptr<LoadedCodeobjDecoder>> decoders{};
};
/**
* @brief Translates virtual addresses to elf file offsets
*/
class CodeobjAddressTranslate : protected CodeobjMap
{
using Super = CodeobjMap;
public:
CodeobjAddressTranslate() = default;
virtual void addDecoder(const char* filepath,
codeobj_marker_id_t id,
uint64_t load_addr,
uint64_t memsize) override
{
this->Super::addDecoder(filepath, id, load_addr, memsize);
auto ptr = decoders.at(id);
table.insert({ptr->begin(), ptr->size(), id, 0});
}
virtual void addDecoder(const void* data,
size_t memory_size,
codeobj_marker_id_t id,
uint64_t load_addr,
uint64_t memsize) override
{
this->Super::addDecoder(data, memory_size, id, load_addr, memsize);
auto ptr = decoders.at(id);
table.insert({ptr->begin(), ptr->size(), id, 0});
}
virtual bool removeDecoder(codeobj_marker_id_t id, uint64_t load_addr)
{
return table.remove(load_addr) && this->Super::removeDecoderbyId(id);
}
std::shared_ptr<Instruction> get(uint64_t vaddr)
{
auto& addr_range = table.find_codeobj_in_range(vaddr);
return this->Super::get(addr_range.id, vaddr - addr_range.vbegin);
}
std::shared_ptr<Instruction> get(codeobj_marker_id_t id, uint64_t offset)
{
if(id == 0)
return get(offset);
else
return this->Super::get(id, offset);
}
const char* getSymbolName(uint64_t vaddr)
{
for(auto& [_, decoder] : decoders)
{
if(!decoder->inrange(vaddr)) continue;
return decoder->getSymbolName(vaddr);
}
return nullptr;
}
void getSymbolMap(const std::shared_ptr<LoadedCodeobjDecoder>& dec,
std::unordered_map<uint64_t, SymbolInfo>& symbols) const
{
try
{
auto& smap = dec->getSymbolMap();
for(auto& [vaddr, sym] : smap)
symbols[vaddr + dec->load_addr] = sym;
} catch(std::exception& e)
{
return;
};
}
std::unordered_map<uint64_t, SymbolInfo> getSymbolMap() const
{
std::unordered_map<uint64_t, SymbolInfo> symbols;
for(auto& [_, dec] : decoders)
this->getSymbolMap(dec, symbols);
return symbols;
}
std::unordered_map<uint64_t, SymbolInfo> getSymbolMap(codeobj_marker_id_t id) const
{
std::unordered_map<uint64_t, SymbolInfo> symbols;
auto it = decoders.find(id);
if(it == decoders.end()) return symbols;
this->getSymbolMap(it->second, symbols);
return symbols;
}
private:
CodeobjTableTranslator table;
};
+372
Просмотреть файл
@@ -0,0 +1,372 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#if !defined(_GNU_SOURCE) || !defined(_XOPEN_SOURCE)
# define _XOPEN_SOURCE 700
#endif
#include <cxxabi.h>
#include <elf.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>
#include <cassert>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <elfutils/libdw.h>
#include <hsa/amd_hsa_elf.h>
#include "lib/rocprofiler-sdk-codeobj/code_printing.hpp"
#define THROW_COMGR(call) \
if(amd_comgr_status_s status = call) \
{ \
const char* reason = ""; \
amd_comgr_status_string(status, &reason); \
std::cerr << __FILE__ << ':' << __LINE__ << " code: " << status << " failed: " << reason \
<< std::endl; \
throw std::exception(); \
}
#define RETURN_COMGR(call) \
if(amd_comgr_status_s status = call) \
{ \
const char* reason = ""; \
amd_comgr_status_string(status, &reason); \
std::cerr << __FILE__ << ':' << __LINE__ << " code: " << status << " failed: " << reason \
<< std::endl; \
return AMD_COMGR_STATUS_ERROR; \
}
CodeObjectBinary::CodeObjectBinary(const std::string& uri)
: m_uri(uri)
{
const std::string protocol_delim{"://"};
size_t protocol_end = m_uri.find(protocol_delim);
std::string protocol = m_uri.substr(0, protocol_end);
protocol_end += protocol_delim.length();
std::transform(protocol.begin(), protocol.end(), protocol.begin(), [](unsigned char c) {
return std::tolower(c);
});
std::string path;
size_t path_end = m_uri.find_first_of("#?", protocol_end);
if(path_end != std::string::npos)
{
path = m_uri.substr(protocol_end, path_end++ - protocol_end);
}
else
{
path = m_uri.substr(protocol_end);
}
/* %-decode the string. */
std::string decoded_path;
decoded_path.reserve(path.length());
for(size_t i = 0; i < path.length(); ++i)
{
if(path[i] == '%' && std::isxdigit(path[i + 1]) && std::isxdigit(path[i + 2]))
{
decoded_path += std::stoi(path.substr(i + 1, 2), 0, 16);
i += 2;
}
else
{
decoded_path += path[i];
}
}
/* Tokenize the query/fragment. */
std::vector<std::string> tokens;
size_t pos, last = path_end;
while((pos = m_uri.find('&', last)) != std::string::npos)
{
tokens.emplace_back(m_uri.substr(last, pos - last));
last = pos + 1;
}
if(last != std::string::npos)
{
tokens.emplace_back(m_uri.substr(last));
}
/* Create a tag-value map from the tokenized query/fragment. */
std::unordered_map<std::string, std::string> params;
std::for_each(tokens.begin(), tokens.end(), [&](std::string& token) {
size_t delim = token.find('=');
if(delim != std::string::npos)
{
params.emplace(token.substr(0, delim), token.substr(delim + 1));
}
});
buffer = std::vector<char>{};
size_t offset = 0;
size_t size = 0;
if(auto offset_it = params.find("offset"); offset_it != params.end())
{
offset = std::stoul(offset_it->second, nullptr, 0);
}
if(auto size_it = params.find("size"); size_it != params.end())
{
if(!(size = std::stoul(size_it->second, nullptr, 0))) return;
}
if(protocol != "file") throw protocol + " protocol not supported!";
std::ifstream file(decoded_path, std::ios::in | std::ios::binary);
if(!file || !file.is_open()) throw "could not open " + decoded_path;
if(!size)
{
file.ignore(std::numeric_limits<std::streamsize>::max());
size_t bytes = file.gcount();
file.clear();
if(bytes < offset) throw "invalid uri " + decoded_path + " (file size < offset)";
size = bytes - offset;
}
file.seekg(offset, std::ios_base::beg);
buffer.resize(size);
file.read(&buffer[0], size);
}
DisassemblyInstance::DisassemblyInstance(const char* codeobj_data, uint64_t codeobj_size)
{
buffer = std::vector<char>(codeobj_size, 0);
std::memcpy(buffer.data(), codeobj_data, codeobj_size);
THROW_COMGR(amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, &data));
THROW_COMGR(amd_comgr_set_data(data, buffer.size(), buffer.data()));
size_t isa_size = 128;
std::string input_isa{};
input_isa.resize(isa_size);
THROW_COMGR(amd_comgr_get_data_isa_name(data, &isa_size, input_isa.data()));
THROW_COMGR(amd_comgr_create_disassembly_info(
input_isa.data(),
&DisassemblyInstance::memory_callback,
&DisassemblyInstance::inst_callback,
[](uint64_t, void*) {},
&info));
}
amd_comgr_status_t
DisassemblyInstance::symbol_callback(amd_comgr_symbol_t symbol, void* user_data)
{
amd_comgr_symbol_type_t type;
RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_TYPE, &type));
if(type != AMD_COMGR_SYMBOL_TYPE_FUNC) return AMD_COMGR_STATUS_SUCCESS;
uint64_t vaddr = 0;
uint64_t mem_size = 0;
uint64_t name_size = 0;
RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_VALUE, &vaddr));
RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_SIZE, &mem_size));
RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME_LENGTH, &name_size));
std::string name;
name.resize(name_size);
RETURN_COMGR(amd_comgr_symbol_get_info(symbol, AMD_COMGR_SYMBOL_INFO_NAME, name.data()));
DisassemblyInstance& instance = *static_cast<DisassemblyInstance*>(user_data);
std::optional<uint64_t> faddr = instance.va2fo(vaddr);
if(faddr) instance.symbol_map[vaddr] = {name, *faddr, vaddr, mem_size};
return AMD_COMGR_STATUS_SUCCESS;
}
std::map<uint64_t, SymbolInfo>&
DisassemblyInstance::GetKernelMap()
{
symbol_map = {};
THROW_COMGR(amd_comgr_iterate_symbols(data, &DisassemblyInstance::symbol_callback, this));
return symbol_map;
}
DisassemblyInstance::~DisassemblyInstance()
{
amd_comgr_release_data(data);
amd_comgr_destroy_disassembly_info(info);
}
std::pair<std::string, size_t>
DisassemblyInstance::ReadInstruction(uint64_t faddr)
{
uint64_t size_read;
uint64_t addr_in_buffer = reinterpret_cast<uint64_t>(buffer.data()) + faddr;
THROW_COMGR(amd_comgr_disassemble_instruction(info, addr_in_buffer, (void*) this, &size_read));
return {std::move(this->last_instruction), size_read};
}
uint64_t
DisassemblyInstance::memory_callback(uint64_t from, char* to, uint64_t size, void* user_data)
{
DisassemblyInstance& instance = *static_cast<DisassemblyInstance*>(user_data);
int64_t copysize = reinterpret_cast<int64_t>(instance.buffer.data()) + instance.buffer.size() -
static_cast<int64_t>(from);
copysize = std::min<int64_t>(size, copysize);
std::memcpy(to, (char*) from, copysize);
return copysize;
}
void
DisassemblyInstance::inst_callback(const char* instruction, void* user_data)
{
DisassemblyInstance& instance = *static_cast<DisassemblyInstance*>(user_data);
if(!instruction) return;
while(*instruction == '\t' || *instruction == ' ')
instruction++;
instance.last_instruction = instruction;
}
#define CHECK_VA2FO(x, msg) \
if(!(x)) \
{ \
std::cerr << __FILE__ << ' ' << __LINE__ << ' ' << msg << std::endl; \
return std::nullopt; \
}
// mem - input argument, start of the elf
// va - input argument, virtual address
// return file offset, if found
std::optional<uint64_t>
DisassemblyInstance::va2fo(uint64_t va)
{
CHECK_VA2FO(buffer.size() > sizeof(Elf64_Ehdr), "buffer is not large enough");
uint8_t* e_ident = (uint8_t*) buffer.data();
CHECK_VA2FO(e_ident, "e_ident is nullptr");
CHECK_VA2FO(e_ident[EI_MAG0] == ELFMAG0 || e_ident[EI_MAG1] == ELFMAG1 ||
e_ident[EI_MAG2] == ELFMAG2 || e_ident[EI_MAG3] == ELFMAG3,
"unexpected ei_mag");
CHECK_VA2FO(e_ident[EI_CLASS] == ELFCLASS64, "unexpected ei_class");
CHECK_VA2FO(e_ident[EI_DATA] == ELFDATA2LSB, "unexpected ei_data");
CHECK_VA2FO(e_ident[EI_VERSION] == EV_CURRENT, "unexpected ei_version");
CHECK_VA2FO(e_ident[EI_OSABI] == 64, "unexpected ei_osabi"); // ELFOSABI_AMDGPU_HSA
CHECK_VA2FO(e_ident[EI_ABIVERSION] == 2 || // ELFABIVERSION_AMDGPU_HSA_V4
e_ident[EI_ABIVERSION] == 3,
"unexpected ei_abiversion"); // ELFABIVERSION_AMDGPU_HSA_V5
Elf64_Ehdr* ehdr = (Elf64_Ehdr*) buffer.data();
CHECK_VA2FO(ehdr, "ehdr is nullptr");
CHECK_VA2FO(ehdr->e_type == ET_DYN, "unexpected e_type");
CHECK_VA2FO(ehdr->e_machine == ELF::EM_AMDGPU, "unexpected e_machine");
CHECK_VA2FO(ehdr->e_phoff != 0, "unexpected e_phoff");
CHECK_VA2FO(buffer.size() > ehdr->e_phoff + sizeof(Elf64_Phdr), "buffer is not large enough");
Elf64_Phdr* phdr = (Elf64_Phdr*) ((uint8_t*) buffer.data() + ehdr->e_phoff);
CHECK_VA2FO(phdr, "phdr is nullptr");
for(uint16_t i = 0; i < ehdr->e_phnum; ++i)
{
if(phdr[i].p_type != PT_LOAD) continue;
if(va < phdr[i].p_vaddr || va >= (phdr[i].p_vaddr + phdr[i].p_memsz)) continue;
return va + phdr[i].p_offset - phdr[i].p_vaddr;
}
return std::nullopt;
}
#undef CHECK_VA2FO
#define CHECK_VA2FO(x, msg) \
if(!(x)) \
{ \
std::cerr << __FILE__ << ' ' << __LINE__ << ' ' << msg << std::endl; \
return {}; \
}
std::vector<std::pair<uint64_t, uint64_t>>
DisassemblyInstance::getSegments()
{
CHECK_VA2FO(buffer.size() > sizeof(Elf64_Ehdr), "buffer is not large enough");
uint8_t* e_ident = (uint8_t*) buffer.data();
CHECK_VA2FO(e_ident, "e_ident is nullptr");
CHECK_VA2FO(e_ident[EI_MAG0] == ELFMAG0 || e_ident[EI_MAG1] == ELFMAG1 ||
e_ident[EI_MAG2] == ELFMAG2 || e_ident[EI_MAG3] == ELFMAG3,
"unexpected ei_mag");
CHECK_VA2FO(e_ident[EI_CLASS] == ELFCLASS64, "unexpected ei_class");
CHECK_VA2FO(e_ident[EI_DATA] == ELFDATA2LSB, "unexpected ei_data");
CHECK_VA2FO(e_ident[EI_VERSION] == EV_CURRENT, "unexpected ei_version");
CHECK_VA2FO(e_ident[EI_OSABI] == 64, "unexpected ei_osabi"); // ELFOSABI_AMDGPU_HSA
CHECK_VA2FO(e_ident[EI_ABIVERSION] == 2 || // ELFABIVERSION_AMDGPU_HSA_V4
e_ident[EI_ABIVERSION] == 3,
"unexpected ei_abiversion"); // ELFABIVERSION_AMDGPU_HSA_V5
Elf64_Ehdr* ehdr = (Elf64_Ehdr*) buffer.data();
CHECK_VA2FO(ehdr, "ehdr is nullptr");
CHECK_VA2FO(ehdr->e_type == ET_DYN, "unexpected e_type");
CHECK_VA2FO(ehdr->e_machine == ELF::EM_AMDGPU, "unexpected e_machine");
CHECK_VA2FO(ehdr->e_phoff != 0, "unexpected e_phoff");
CHECK_VA2FO(buffer.size() > ehdr->e_phoff + sizeof(Elf64_Phdr), "buffer is not large enough");
Elf64_Phdr* phdr = (Elf64_Phdr*) ((uint8_t*) buffer.data() + ehdr->e_phoff);
CHECK_VA2FO(phdr, "phdr is nullptr");
std::vector<std::pair<uint64_t, uint64_t>> segments;
for(Elf64_Half i = 0; i < ehdr->e_phnum; ++i)
{
if(phdr[i].p_type != PT_LOAD) continue;
segments.push_back({phdr[i].p_vaddr - phdr[i].p_offset, phdr[i].p_memsz});
}
return segments;
}
+68
Просмотреть файл
@@ -0,0 +1,68 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <amd_comgr/amd_comgr.h>
#include <limits>
#include <memory>
#include <string>
#include <vector>
class CodeObjectBinary
{
public:
CodeObjectBinary(const std::string& uri);
std::string m_uri;
std::vector<char> buffer;
};
struct SymbolInfo
{
std::string name{};
uint64_t faddr = 0;
uint64_t vaddr = 0;
uint64_t mem_size = 0;
};
class DisassemblyInstance
{
public:
DisassemblyInstance(const char* codeobj_data, uint64_t codeobj_size);
~DisassemblyInstance();
std::pair<std::string, size_t> ReadInstruction(uint64_t faddr);
std::map<uint64_t, SymbolInfo>& GetKernelMap();
static uint64_t memory_callback(uint64_t from, char* to, uint64_t size, void* user_data);
static void inst_callback(const char* instruction, void* user_data);
static amd_comgr_status_t symbol_callback(amd_comgr_symbol_t symbol, void* user_data);
std::optional<uint64_t> va2fo(uint64_t va);
std::vector<std::pair<uint64_t, uint64_t>> getSegments();
std::vector<char> buffer;
std::string last_instruction;
amd_comgr_disassembly_info_t info;
amd_comgr_data_t data;
std::map<uint64_t, SymbolInfo> symbol_map;
};
+162
Просмотреть файл
@@ -0,0 +1,162 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <algorithm>
#include <iostream>
#include <random>
#include <string>
#include <unordered_set>
#include <vector>
using codeobj_marker_id_t = size_t;
template <typename Type>
class ordered_vector : public std::vector<Type>
{
using Super = std::vector<Type>;
public:
void insert(const Type& elem)
{
size_t loc = lower_bound(elem.begin());
if(this->size() && get(loc).begin() < elem.begin()) loc++;
this->Super::insert(this->begin() + loc, elem);
}
bool remove(const Type& elem)
{
if(!this->size()) return false;
size_t loc = lower_bound(elem.begin());
if(get(loc) != elem) return false;
this->Super::erase(this->begin() + loc);
return true;
}
bool remove(uint64_t elem_begin)
{
if(!this->size()) return false;
size_t loc = lower_bound(elem_begin);
if(get(loc).begin() != elem_begin) return false;
this->Super::erase(this->begin() + loc);
return true;
}
size_t lower_bound(size_t addr) const
{
if(!this->size()) return 0;
return binary_search(addr, 0, this->size() - 1);
}
size_t binary_search(size_t addr, size_t s, size_t e) const
{
if(s >= e)
return s;
else if(s + 1 == e)
return (get(e).begin() <= addr) ? e : s;
size_t mid = (s + e) / 2;
if(get(mid).begin() <= addr)
return binary_search(addr, mid, e);
else
return binary_search(addr, s, mid);
}
const Type& get(size_t i) const { return this->operator[](i); }
};
/**
* @brief Finds a candidate codeobj for the given vaddr
*/
template <typename Type>
class cached_ordered_vector : public ordered_vector<Type>
{
using Super = ordered_vector<Type>;
public:
cached_ordered_vector<Type>() { reset(); }
const Type& find_obj(uint64_t addr)
{
if(testCache(addr)) return get(cached_segment);
size_t lb = this->lower_bound(addr);
if(lb >= this->size() || !get(lb).inrange(addr))
throw std::string("segment addr out of range");
cached_segment = lb;
return get(cached_segment);
}
uint64_t find_addr(uint64_t addr) { return find_obj(addr).begin(); }
bool testCache(uint64_t addr) const
{
return this->cached_segment < this->size() && get(cached_segment).inrange(addr);
}
const Type& get(size_t index) const { return this->data()[index]; }
void insert(const Type& elem) { this->Super::insert(elem); }
void insert_list(std::vector<Type> arange)
{
for(auto& elem : arange)
push_back(elem);
std::sort(this->begin(), this->end(), [](const Type& a, const Type& b) {
return a.begin() < b.begin();
});
};
void reset() { cached_segment = ~0; }
void clear()
{
reset();
this->Super::clear();
}
bool remove(uint64_t addr)
{
reset();
return this->Super::remove(addr);
}
private:
size_t cached_segment = ~0;
};
struct address_range_t
{
uint64_t vbegin;
uint64_t size;
codeobj_marker_id_t id;
uint64_t offset;
bool operator<(const address_range_t& other) const { return vbegin < other.vbegin; }
bool inrange(uint64_t addr) const { return addr >= vbegin && addr < vbegin + size; };
uint64_t begin() const { return vbegin; }
};
/**
* @brief Finds a candidate codeobj for the given vaddr
*/
class CodeobjTableTranslator : public cached_ordered_vector<address_range_t>
{
public:
const address_range_t& find_codeobj_in_range(uint64_t addr) { return this->find_obj(addr); }
};
+1
Просмотреть файл
@@ -42,6 +42,7 @@ add_subdirectory(counters)
add_subdirectory(aql)
add_subdirectory(pc_sampling)
add_subdirectory(marker)
add_subdirectory(thread_trace)
target_link_libraries(
rocprofiler-object-library
+644 -7
Просмотреть файл
@@ -1,17 +1,352 @@
// This file should be removed when it appears in AQL Profile
#pragma ONCE
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <hsa/hsa.h>
#include <hsa/hsa_ven_amd_aqlprofile.h>
#define PUBLIC_API __attribute__((visibility("default")))
#ifdef __cplusplus
extern "C" {
#endif
typedef struct
{
uint64_t handle;
} aqlprofile_handle_t;
typedef enum
{
AQLPROFILE_MEMORY_HINT_NONE = 0,
AQLPROFILE_MEMORY_HINT_HOST = 1,
AQLPROFILE_MEMORY_HINT_DEVICE_UNCACHED = 2,
AQLPROFILE_MEMORY_HINT_DEVICE_COHERENT = 3,
AQLPROFILE_MEMORY_HINT_DEVICE_NONCOHERENT = 4,
AQLPROFILE_MEMORY_HINT_LAST
} aqlprofile_memory_hint_t;
/**
* @brief Flags to describe which agents can access given buffer.
*/
typedef union
{
uint32_t raw;
struct
{
uint32_t device_access : 1;
uint32_t host_access : 1;
uint32_t memory_hint : 6; // One of aqlprofile_memory_hint_t
uint32_t _reserved : 24;
};
} aqlprofile_buffer_desc_flags_t;
/**
* @brief Callback to request a memory buffer, which will be tied to a profile.
* The user is responsible for clearing up memory after the profile is no longer needed.
* @param[out] ptr The pointer containing memory.
* @param[in] size Minimum requested buffer size.
* @param[in] flags Access flags, requesting which agents need to read/write to the buffer.
* @param[in] userdata Data to be passed back to user.
* @retval HSA_STATUS_SUCCESS if successful
* @retval HSA_STATUS_ERROR if memory could not be allocated
*/
typedef hsa_status_t (*aqlprofile_memory_alloc_callback_t)(void** ptr,
uint64_t size,
aqlprofile_buffer_desc_flags_t flags,
void* userdata);
/**
* @brief Callback to dealloc memory requested via aqlprofile_memory_alloc_callback_t
* @param[in] ptr The pointer containing memory.
* @param[in] userdata Data to be passed back to user.
* @retval HSA_STATUS_SUCCESS if successful
* @retval HSA_STATUS_ERROR if memory could not be allocated
*/
typedef void (*aqlprofile_memory_dealloc_callback_t)(void* ptr, void* userdata);
typedef enum
{
AQLPROFILE_ACCUMULATION_NONE = 0, /** Do not accumulate event */
AQLPROFILE_ACCUMULATION_LO_RES, /**< The event should be integrated over quad-cycles */
AQLPROFILE_ACCUMULATION_HI_RES, /**< The event should be integrated every cycle */
AQLPROFILE_ACCUMULATION_LAST,
} aqlprofile_accumulation_type_t;
/**
* @brief Special flags indicating additional properties to a counter. E.g. Accumulation metrics
*/
typedef union
{
uint32_t raw;
struct
{
uint32_t accum : 3; /**< One of aqlprofile_accumulation_type_t */
uint32_t _reserved : 29;
} sq_flags;
} aqlprofile_pmc_event_flags_t;
/**
* @brief Struct containing all necessary information of an event (counter).
*/
typedef struct
{
uint32_t block_index; /**< Block channel. */
uint32_t event_id; /**< Event ID as fined by XML */
aqlprofile_pmc_event_flags_t flags; /**< Special event flags e.g. accumulation */
hsa_ven_amd_aqlprofile_block_name_t block_name; /**< Block name as defined by block indexes */
} aqlprofile_pmc_event_t;
/**
* @brief Struct containing information about the agent. User code sets these values
* to the describe the agent to profile. Information can be obtained either from HSA
* (if loaded) or the KFD topology.
*/
typedef struct
{
const char* agent_gfxip; /**< Agent GFXIP (HSA_AGENT_INFO_NAME or KFD.product_name) */
uint32_t xcc_num; /**< XCC's on the agent (HSA_AMD_AGENT_INFO_NUM_XCC or KFD.num_xcc) */
uint32_t se_num; /**< SE's on the agent (HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES or
KFD.num_shader_banks) */
uint32_t
cu_num; /**< CU's on the agent (HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT or KFD.cu_count) */
uint32_t shader_arrays_per_se; /**< Shader arrays per SE of agent
(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE or
KFD.simd_arrays_per_engine)*/
} aqlprofile_agent_info_t;
/**
* @brief Struct containing a handle to a registered agent
*
*/
typedef struct
{
uint64_t handle;
} aqlprofile_agent_handle_t;
/**
* @brief Registers an agent to be used with AQL profile.
* @param[out] agent_id Handle to newly registered agent
* @param[in] agent_info Info to register a new agent with AQL Profiler
* @retval HSA_STATUS_SUCCESS registration ok
* @retval HSA_STATUS_ERROR registration failed
*/
hsa_status_t
aqlprofile_register_agent(aqlprofile_agent_handle_t* agent_id,
const aqlprofile_agent_info_t* agent_info);
/**
* @brief AQLprofile struct containing information for perfmon events
*/
typedef struct
{
aqlprofile_agent_handle_t agent;
const aqlprofile_pmc_event_t* events;
uint32_t event_count;
} aqlprofile_pmc_profile_t;
// Profile attributes
typedef enum
{
AQLPROFILE_INFO_COMMAND_BUFFER_SIZE = 0, // get_info returns uint32_t value
AQLPROFILE_INFO_PMC_DATA_SIZE = 1, // get_info returns uint32_t value
AQLPROFILE_INFO_PMC_DATA = 2, // get_info returns PMC uint64_t value
// in info_data object
AQLPROFILE_INFO_BLOCK_COUNTERS = 4, // get_info returns number of block counter
AQLPROFILE_INFO_BLOCK_ID = 5, // get_info returns block id, instances
// by name string using _id_query_t
AQLPROFILE_INFO_ENABLE_CMD = 6, // get_info returns size/pointer for
// counters enable command buffer
AQLPROFILE_INFO_DISABLE_CMD = 7, // get_info returns size/pointer for
// counters disable command buffer
} aqlprofile_pmc_info_type_t;
hsa_status_t
aqlprofile_get_pmc_info(const aqlprofile_pmc_profile_t* profile,
aqlprofile_pmc_info_type_t attribute,
void* value);
/**
* @brief AQLprofile struct containing information for Advanced Thread Trace
*/
typedef struct
{
hsa_agent_t agent;
const hsa_ven_amd_aqlprofile_parameter_t* parameters;
uint32_t parameter_count;
} aqlprofile_att_profile_t;
/**
* @brief Data callback for perfmon events. Each event will call this once per coordinate
* @param[in] event The event information passed in from aqlprofile_pmc_profile_t
* @param[in] counter_id Internal ID of the counter
* @param[in] counter_value The event value, as incremented from start() to stop()
* @param[in] userdata Data returned to user
* @retval HSA_STATUS_SUCCESS to continue iteration
* @retval HSA_STATUS_ERROR to stop callback iteration
*/
typedef hsa_status_t (*aqlprofile_pmc_data_callback_t)(aqlprofile_pmc_event_t event,
uint64_t counter_id,
uint64_t counter_value,
void* userdata);
/**
* @brief Data callback for thread trace. This will be called at least once per shader engine
* @param[in] shader Shader Engine ID
* @param[in] buffer Pointer containing the data
* @param[in] size Amount of bytes used by thread trace
* @param[in] callback_data Data returned to user
* @retval HSA_STATUS_SUCCESS to continue iteration
* @retval HSA_STATUS_ERROR to stop callback iteration
*/
typedef hsa_status_t (*aqlprofile_att_data_callback_t)(uint32_t shader,
void* buffer,
uint64_t size,
void* callback_data);
/**
* @brief Memory copy fn for aqlprofile to copy data.
* @param[in] dst Destination pointer to copy data to.
* @param[in] src Source pointer where data is to be copied from.
* @param[in] size Amount of bytes to be copied.
* @param[in] userdata Data returned to user
* @retval HSA_STATUS_SUCCESS on success
* @retval HSA_STATUS_ERROR on failure
*/
typedef hsa_status_t (*aqlprofile_memory_copy_t)(void* dst,
const void* src,
size_t size,
void* userdata);
/**
* @brief Validates the event for the agent.
* @param[in] agent The agent to validate the event for.
* @param[in] event The event to validate.
* @param[out] result True if the event is valid for the agent, false otherwise.
* @retval HSA_STATUS_SUCCESS if the event was validated.
* @retval HSA_STATUS_ERROR if the event was not validated.
*/
hsa_status_t
aqlprofile_validate_pmc_event(aqlprofile_agent_handle_t agent,
const aqlprofile_pmc_event_t* event,
bool* result);
/**
* @brief Iterate_data() will parse the event data and call @callback with the resulting event data
* @param[in] handle The handle returned from aqlprofile_pmc_create_packets()
* @param[in] callback CB where the resulting event values are going to be returned
* @param[in] userdata Data sent back to user
* @retval HSA_STATUS_SUCCESS all operations exited succesfully
* @retval HSA_STATUS_ERROR if some callback returns an error
* @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if invalid handle is given
*/
hsa_status_t
aqlprofile_pmc_iterate_data(aqlprofile_handle_t handle,
aqlprofile_pmc_data_callback_t callback,
void* userdata);
/**
* @brief Struct to be returned by aqlprofile_pmc_create_packets
*/
typedef struct
{
hsa_ext_amd_aql_pm4_packet_t start_packet; /**< Reset counters and start incrementing */
hsa_ext_amd_aql_pm4_packet_t stop_packet; /**< Pause counters from incrementing */
hsa_ext_amd_aql_pm4_packet_t read_packet; /**< Retrieve results from device */
} aqlprofile_pmc_aql_packets_t;
/**
* @brief Function to create AQL packets to be inserted into the queue.
* @param[out] handle To be passed to iterate_data()
* @param[out] packets Pointer to where the start, stop and read packets will be written to
* @param[in] profile Agent and events information
* @param[in] alloc_cb Memory allocation, which may request cpu or gpu memory for internal use
* @param[in] dealloc_cb Function to free memory allocated by alloc_cb
* @param[in] userdata Data passed back to user via memory alloc callback
*/
hsa_status_t
aqlprofile_pmc_create_packets(aqlprofile_handle_t* handle,
aqlprofile_pmc_aql_packets_t* packets,
aqlprofile_pmc_profile_t profile,
aqlprofile_memory_alloc_callback_t alloc_cb,
aqlprofile_memory_dealloc_callback_t dealloc_cb,
aqlprofile_memory_copy_t memcpy_cb,
void* userdata);
/**
* @brief Function to delete AQL packets after creation by aqlprofile_pmc_create_packets
* @param[in] handle Returned by aqlprofile_pmc_create_packets()
*/
void
aqlprofile_pmc_delete_packets(aqlprofile_handle_t handle);
/**
* @brief Iterates over thread trace data and the data to user
* @param[in] handle The handle returned from aqlprofile_att_create_packets()
* @param[in] callback CB where the resulting data is going to be returned
* @param[in] userdata Data sent back to user
* @retval HSA_STATUS_SUCCESS all operations exited succesfully
* @retval HSA_STATUS_ERROR if some callback returns an error
* @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if invalid handle is given
*/
hsa_status_t
aqlprofile_att_iterate_data(aqlprofile_handle_t handle,
aqlprofile_att_data_callback_t callback,
void* userdata);
/**
* @brief Struct containing AQLpackets to start and stop thread trace
*/
typedef struct
{
hsa_ext_amd_aql_pm4_packet_t start_packet; /**< Packet to start thread trace */
hsa_ext_amd_aql_pm4_packet_t stop_packet; /**< Packet to stop thread trace and flush data */
} aqlprofile_att_control_aql_packets_t;
/**
* @brief Fn to create start and stop thread trace packets
* @param[out] handle To be passed to iterate_data()
* @param[out] packets Packets returned by this function to start and stop thread trace
* @param[in] profile Agent information and extra parameters for thread trace
* @param[in] callback Memory allocation fn which may request cpu or gpu memory
* @retval HSA_STATUS_SUCCESS if all packets created succesfully
* @retval HSA_STATUS_ERROR otherwise
*/
hsa_status_t
aqlprofile_att_create_packets(aqlprofile_handle_t* handle,
aqlprofile_att_control_aql_packets_t* packets,
aqlprofile_att_profile_t profile,
aqlprofile_memory_alloc_callback_t alloc_cb,
aqlprofile_memory_dealloc_callback_t dealloc_cb,
aqlprofile_memory_copy_t memcpy_cb,
void* userdata);
void
aqlprofile_att_delete_packets(aqlprofile_handle_t handle);
/**
* @brief Callback for iteration of all possible event coordinate IDs and coordinate names.
* @param [in] id Integer identifying the dimension.
* @param [in] name Name of the dimension
* @param [in] data User data supplied to @ref aqlprofile_iterate_event_ids
* @return hsa_status_t
* @retval HSA_STATUS_SUCCESS Continues iteration
* @retval OTHERS Any other HSA return values stops iteration, passing back this value through
* @ref aqlprofile_iterate_event_ids
@@ -22,12 +357,314 @@ typedef hsa_status_t (*aqlprofile_eventname_callback_t)(int id, const char* name
* @brief Iterate over all possible event coordinate IDs and their names.
* @param [in] callback Callback to use for iteration of dimensions
* @param [in] user_data Data to supply to callback @ref aqlprofile_eventname_callback_t
* @return hsa_status_t
* @retval HSA_STATUS_SUCCESS if successful
* @retval HSA_STATUS_ERROR if error on interation
* @retval OTHERS If @ref aqlprofile_eventname_callback_t returns non-HSA_STATUS_SUCCESS,
* that value is returned.
*/
PUBLIC_API hsa_status_t
hsa_status_t
aqlprofile_iterate_event_ids(aqlprofile_eventname_callback_t callback, void* user_data);
/**
* @brief Iterate over all event coordinates for a given agent_t and event_t.
* @param position A counting sequence indicating callback number.
* @param id Coordinate ID as in _iterate_event_ids.
* @param extent Coordinate extent indicating maximum allowed instances.
* @param coordinate The coordinate, in the range [0,extent-1].
* @param name Coordinate name as in _iterate_event_ids.
* @param userdata Userdata returned from _iterate_event_coord function.
*/
typedef hsa_status_t (*aqlprofile_coordinate_callback_t)(int position,
int id,
int extent,
int coordinate,
const char* name,
void* userdata);
/**
* @brief Iterate over all event coordinates for a given agent_t and event_t.
* @param[in] agent HSA agent.
* @param[in] event The event ID and block ID to iterate for.
* @param[in] sample_id aqlprofile_info_data_t.sample_id returned from _aqlprofile_iterate_data.
* @param[in] callback Callback function to return the coordinates.
* @param[in] userdata Arbitrary data pointer to be sent back to the user via callback.
*/
hsa_status_t
aqlprofile_iterate_event_coord(aqlprofile_agent_handle_t agent,
aqlprofile_pmc_event_t event,
uint64_t sample_id,
aqlprofile_coordinate_callback_t callback,
void* userdata);
typedef union
{
uint64_t raw;
struct
{
uint64_t isValid : 1;
uint64_t isNavi : 1;
uint64_t npiWaveData : 1;
uint64_t version : 13;
};
} att_output_flags_t;
typedef struct
{
int64_t time;
uint16_t events0;
uint16_t events1;
uint16_t events2;
uint16_t events3;
uint8_t CU;
uint8_t bank;
} att_perfevent_t;
typedef struct
{
uint64_t kernel_id : 12;
uint64_t simd : 2;
uint64_t slot : 4;
uint64_t enable : 1;
uint64_t cu : 4;
uint64_t time : 41; // Time_value/8
} att_occupancy_info_t;
typedef struct
{
int32_t type;
int32_t duration;
} wave_state_t;
typedef struct
{
int64_t time;
int64_t duration;
} wave_instruction_t;
enum WAVESLOT_STATE
{
WS_EMPTY = 0,
WS_IDLE = 1,
WS_EXEC = 2,
WS_WAIT = 3,
WS_STALL = 4,
WS_UNKNOWN = 5,
};
enum WaveInstCategory
{
NONE = 0,
SMEM = 1,
SALU = 2,
VMEM = 3,
FLAT = 4,
LDS = 5,
VALU = 6,
JUMP = 7,
NEXT = 8,
IMMED = 9,
TRAP = 10,
PCINFO = 15,
WAVE_NOT_FINISHED,
};
enum WaveTrapStatus
{
TRAP_RESTORED = 0,
TRAP_REQUEST = 1,
TRAP_SAVED = 1,
TRAP_STANDBY = 2
};
typedef struct
{
size_t addr;
size_t marker_id;
} pcinfo_t;
typedef struct __attribute__((packed))
{
uint64_t category : 8;
uint64_t hitcount : 56;
uint64_t latency;
pcinfo_t pc;
} att_trace_event_t;
typedef struct
{
uint8_t simd;
uint8_t wave_id;
uint8_t trap_status;
uint8_t reserved;
// VMEM Pipeline: instrs and stalls
int num_vmem_instrs;
int num_vmem_stalls;
// FLAT instrs and stalls
int num_flat_instrs;
int num_flat_stalls;
// LDS instr and stalls
int num_lds_instrs;
int num_lds_stalls;
// SCA instrs stalls
int num_salu_instrs;
int num_smem_instrs;
int num_salu_stalls;
int num_smem_stalls;
// Branch
int num_branch_instrs;
int num_branch_taken_instrs;
int num_branch_stalls;
// total VMEM/FLAT/LDS/SMEM instructions issued
int num_mem_instrs; // total issued memory instructions
int num_valu_stalls;
size_t num_valu_instrs;
size_t num_issued_instrs; // total issued instructions (compute + memory)
int64_t begin_time; // Begin and end cycle
int64_t end_time;
int64_t traceID;
size_t timeline_size;
size_t instructions_size;
wave_state_t* timeline_array;
wave_instruction_t* instructions_array;
} wave_data_t;
/**
* @brief Callback for rocprofiler to return ISA to aqlprofile ATT parser.
* The caller must copy a desired instruction on isa_instruction and source_reference,
* while obeying the max length passed by the caller.
* If the caller's length is insufficient, then this function writes the minimum sizes to isa_size
* and source_size and returns HSA_STATUS_ERROR_OUT_OF_RESOURCES.
* If call returns HSA_STATUS_SUCCESS, isa_size and source_size are written with bytes used.
* @param[out] isa_instruction Where to copy the ISA line to.
* @param[out] source_reference Reference to source line and/or additional comments in the binary.
* @param[out] isa_memory_size (Auto) The number of bytes to next instruction. 0 for custom ISA.
* @param[inout] isa_size Size of returned ISA string.
* @param[inout] source_size Size of returned reference/comment string.
* @param[in] marker_id The generated ATT marker for given codeobject ID.
* @param[in] offset The offset from base vaddr for given codeobj ID.
* If marker_id == 0, this parameter is raw virtual address with no codeobj ID information.
* @param[in] userdata Arbitrary data pointer to be sent back to the user via callback.
* @retval HSA_STATUS_SUCCESS on success.
* @retval HSA_STATUS_ERROR on generic error.
* @retval HSA_STATUS_ERROR_INVALID_ARGUMENT for invalid offset or invalid marker_id.
* @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES for insufficient isa_size or source_size.
*/
typedef hsa_status_t (*aqlprofile_att_isa_callback_t)(char* isa_instruction,
char* source_reference,
uint64_t* isa_memory_size,
uint64_t* isa_size,
uint64_t* source_size,
uint64_t marker_id,
uint64_t offset,
void* userdata);
/**
* @brief Callback for rocprofiler to return traces back to rocprofiler.
* @param[in] trace_type_id The type of this trace as in _iterate_event_ids().
* @param[in] correlation_id The ID of shader engine or trace callback number.
* @param[in] trace_events A pointer to sequence of events, of size trace_size.
* @param[in] trace_size The number of events in the trace.
* @param[in] userdata Arbitrary data pointer to be sent back to the user via callback.
*/
typedef hsa_status_t (*aqlprofile_att_trace_callback_t)(int trace_type_id,
int correlation_id,
void* trace_events,
uint64_t trace_size,
void* userdata);
/**
* @brief Callback for the ATT parser to retrieve Shader Engine data.
* Returns the amount of data filled. If no more data is available, then callback return 0
* If the space available in the buffer is less than required for parsing the full data,
* the full data is transfered over multiple calls.
* When all data has been transfered from current shader_engine_id, the caller has the option to
* 1) Return -1 on shader_engine ID and parsing terminates
* 2) Move to the next shader engine.
* @param[out] shader_engine_id The ID of given shader engine.
* @param[out] buffer The buffer to fill up with SE data.
* @param[out] buffer_size The space available in the buffer.
* @param[in] userdata Arbitrary data pointer to be sent back to the user via callback.
* @returns Number of bytes remaining in shader engine.
* @retval 0 if no more SE data is available. Parsing will stop.
* @retval buffer_size if the buffer does not hold enough data for the current shader engine.
* @retval 0 > ret > buffer_size for partially filled buffer, and caller moves over to next SE.
*/
typedef uint64_t (*aqlprofile_att_se_data_callback_t)(int* shader_engine_id,
uint8_t** buffer,
uint64_t* buffer_size,
void* userdata);
/**
* @brief Callback returning from aqlprofile_att_parser_iterate_event_list
* @param[in] trace_event_id ID of the event.
* @param[in] trace_event_metadata Null-terminated string, entries separated by ';'
* @param[in] userdata userdata.
*/
typedef void (*aqlprofile_att_parser_iterate_event_cb_t)(int trace_event_id,
const char* trace_event_metadata,
void* userdata);
/**
* @brief Iterate over all available event types.
* @param[in] callback Callback where events are returned to.
* @param[in] userdata userdata.
*/
void
aqlprofile_att_parser_iterate_event_list(aqlprofile_att_parser_iterate_event_cb_t callback,
void* userdata);
/**
* @brief Iterate over all event coordinates for a given agent_t and event_t.
* @param[in] se_data_callback Callback to return shader engine data from.
* @param[in] trace_callback Callback where the trace data is returned to.
* Each trace will be marked by the ID returned on aqlprofile_att_parser_iterate_event_list.
* @param[in] isa_callback Callback to return ISA lines.
* @param[in] userdata Userdata passed back to caller via callback.
*/
hsa_status_t
aqlprofile_att_parse_data(aqlprofile_att_se_data_callback_t se_data_callback,
aqlprofile_att_trace_callback_t trace_callback,
aqlprofile_att_isa_callback_t isa_callback,
void* userdata);
/**
* @brief Contains flags for how code objects are interpreted
*/
typedef union
{
struct
{
uint32_t isUnload : 1; // 0 if code object is being loaded, 1 for unload
uint32_t bFromStart : 1; // Has this code object been loaded before thread trace started?
uint32_t legacy_id : 30; // Legacy code object ID, if it fits in 30 bits.
};
uint32_t raw;
} aqlprofile_att_header_marker_t;
/**
* @brief Creates an AQL packet for marking code objects
* @param[out] packets Returned packet
* @param[in] handle The handle created from aqlprofile_att_create_packets()
* @param[in] header Header containing code object information created from profiler
* @param[in] id To be passed back to isa_string_callback in marker_id
* @param[in] addr Code object loaded address.
* @param[in] size Code object loaded size.
*/
hsa_status_t
aqlprofile_att_codeobj_load_marker(hsa_ext_amd_aql_pm4_packet_t* packets,
aqlprofile_handle_t handle,
aqlprofile_att_header_marker_t header,
uint64_t id,
uint64_t addr,
uint64_t size);
#ifdef __cplusplus
}
#endif
+82 -29
Просмотреть файл
@@ -26,12 +26,22 @@
#include <hsa/hsa_ext_amd.h>
#include "glog/logging.h"
#define CHECK_HSA(fn, message) \
{ \
auto status = (fn); \
if(status != HSA_STATUS_SUCCESS) \
{ \
std::cerr << "HSA Err: " << status << "\n"; \
exit(1); \
} \
}
namespace rocprofiler
{
namespace aql
{
AQLPacketConstruct::AQLPacketConstruct(const hsa::AgentCache& agent,
const std::vector<counters::Metric>& metrics)
CounterPacketConstruct::CounterPacketConstruct(const hsa::AgentCache& agent,
const std::vector<counters::Metric>& metrics)
: _agent(agent)
{
// Validate that the counter exists and construct the block instances
@@ -67,12 +77,11 @@ AQLPacketConstruct::AQLPacketConstruct(const hsa::AgentCache& agen
_events = get_all_events();
}
std::unique_ptr<hsa::AQLPacket>
AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const
std::unique_ptr<hsa::CounterAQLPacket>
CounterPacketConstruct::construct_packet(const AmdExtTable& ext)
{
const size_t MEM_PAGE_MASK = 0x1000 - 1;
auto pkt_ptr = std::make_unique<hsa::AQLPacket>(ext.hsa_amd_memory_pool_free_fn);
auto& pkt = *pkt_ptr;
auto pkt_ptr = std::make_unique<hsa::CounterAQLPacket>(ext.hsa_amd_memory_pool_free_fn);
auto& pkt = *pkt_ptr;
if(_events.empty())
{
return pkt_ptr;
@@ -104,15 +113,7 @@ AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const
_agent.get_hsa_agent().handle));
}
auto throw_if_failed = [](auto status, auto& message) {
if(status != HSA_STATUS_SUCCESS)
{
throw std::runtime_error(message);
}
};
throw_if_failed(hsa_ven_amd_aqlprofile_start(&profile, nullptr),
"could not generate packet sizes");
CHECK_HSA(hsa_ven_amd_aqlprofile_start(&profile, nullptr), "could not generate packet sizes");
if(profile.command_buffer.size == 0 || profile.output_buffer.size == 0)
{
@@ -125,7 +126,7 @@ AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const
// Allocate buffers and check the results
auto alloc_and_check = [&](auto& pool, auto** mem_loc, auto size) -> bool {
bool malloced = false;
size_t page_aligned = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
size_t page_aligned = getPageAligned(size);
if(ext.hsa_amd_memory_pool_allocate_fn(
pool, page_aligned, 0, static_cast<void**>(mem_loc)) != HSA_STATUS_SUCCESS)
{
@@ -153,22 +154,74 @@ AQLPacketConstruct::construct_packet(const AmdExtTable& ext) const
_agent.kernarg_pool(), &profile.output_buffer.ptr, profile.output_buffer.size);
memset(profile.output_buffer.ptr, 0x0, profile.output_buffer.size);
// throw if we do not construct the packets correctly.
throw_if_failed(hsa_ven_amd_aqlprofile_start(&profile, &pkt.start),
"could not generate start packet");
throw_if_failed(hsa_ven_amd_aqlprofile_stop(&profile, &pkt.stop),
"could not generate stop packet");
throw_if_failed(hsa_ven_amd_aqlprofile_read(&profile, &pkt.read),
"could not generate read packet");
CHECK_HSA(hsa_ven_amd_aqlprofile_start(&profile, &pkt.start), "failed to create start packet");
CHECK_HSA(hsa_ven_amd_aqlprofile_stop(&profile, &pkt.stop), "failed to create stop packet");
CHECK_HSA(hsa_ven_amd_aqlprofile_read(&profile, &pkt.read), "failed to create read packet");
pkt.start.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
pkt.stop.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
pkt.read.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
return pkt_ptr;
}
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wnarrowing"
ThreadTraceAQLPacketFactory::ThreadTraceAQLPacketFactory(
const hsa::AgentCache& agent,
std::shared_ptr<thread_trace_parameters>& params,
const CoreApiTable& coreapi,
const AmdExtTable& ext)
{
this->tracepool = std::make_shared<hsa::TraceMemoryPool>();
this->tracepool->allocate_fn = ext.hsa_amd_memory_pool_allocate_fn;
this->tracepool->allow_access_fn = ext.hsa_amd_agents_allow_access_fn;
this->tracepool->free_fn = ext.hsa_amd_memory_pool_free_fn;
this->tracepool->api_copy_fn = coreapi.hsa_memory_copy_fn;
this->tracepool->gpu_agent = agent.get_hsa_agent();
this->tracepool->cpu_pool_ = agent.cpu_pool();
this->tracepool->gpu_pool_ = agent.gpu_pool();
this->aql_params.clear();
auto& p = this->aql_params;
p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET, params->target_cu});
p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK, params->shader_engine_mask});
p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION, params->simd_select});
p.push_back({HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, params->buffer_size});
this->profile = aqlprofile_att_profile_t{agent.get_hsa_agent(), p.data(), p.size()};
}
#pragma GCC diagnostic pop
std::unique_ptr<hsa::TraceAQLPacket>
ThreadTraceAQLPacketFactory::construct_packet()
{
auto packet = std::make_unique<hsa::TraceAQLPacket>(this->tracepool);
/*hsa_status_t _status = aqlprofile_att_create_packets(&packet->handle,
&packet->packets,
this->profile,
&hsa::TraceAQLPacket::Alloc,
&hsa::TraceAQLPacket::Free,
&hsa::TraceAQLPacket::Copy,
packet.get());
CHECK_HSA(_status, "failed to create ATT packet");*/
packet->before_krn_pkt.clear();
packet->after_krn_pkt.clear();
packet->packets.start_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
packet->packets.stop_packet.header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
packet->empty = false;
packet->start = packet->packets.start_packet;
packet->stop = packet->packets.stop_packet;
packet->before_krn_pkt.push_back(packet->start);
packet->after_krn_pkt.push_back(packet->stop);
return packet;
}
std::vector<hsa_ven_amd_aqlprofile_event_t>
AQLPacketConstruct::get_all_events() const
CounterPacketConstruct::get_all_events() const
{
std::vector<hsa_ven_amd_aqlprofile_event_t> ret;
for(const auto& metric : _metrics)
@@ -179,7 +232,7 @@ AQLPacketConstruct::get_all_events() const
}
const counters::Metric*
AQLPacketConstruct::event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event) const
CounterPacketConstruct::event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event) const
{
if(const auto* ptr = rocprofiler::common::get_val(
_event_to_metric,
@@ -191,7 +244,7 @@ AQLPacketConstruct::event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event)
}
const std::vector<hsa_ven_amd_aqlprofile_event_t>&
AQLPacketConstruct::get_counter_events(const counters::Metric& metric) const
CounterPacketConstruct::get_counter_events(const counters::Metric& metric) const
{
for(const auto& prof_metric : _metrics)
{
@@ -204,7 +257,7 @@ AQLPacketConstruct::get_counter_events(const counters::Metric& metric) const
}
void
AQLPacketConstruct::can_collect()
CounterPacketConstruct::can_collect()
{
// Verify that the counters fit within harrdware limits
std::map<std::pair<hsa_ven_amd_aqlprofile_block_name_t, uint32_t>, int64_t> counter_count;
+28 -6
Просмотреть файл
@@ -30,10 +30,12 @@
#include <hsa/hsa_api_trace.h>
#include <hsa/hsa_ven_amd_aqlprofile.h>
#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h"
#include "lib/rocprofiler-sdk/aql/helpers.hpp"
#include "lib/rocprofiler-sdk/counters/metrics.hpp"
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
#include "lib/rocprofiler-sdk/hsa/queue.hpp"
#include "lib/rocprofiler-sdk/thread_trace/att_core.hpp"
namespace rocprofiler
{
@@ -47,21 +49,27 @@ namespace aql
* consturcted start/stop/read packets along with allocated buffers needed
* to collect the counter data.
*/
class AQLPacketConstruct
class CounterPacketConstruct
{
public:
AQLPacketConstruct(const hsa::AgentCache& agent, const std::vector<counters::Metric>& metrics);
std::unique_ptr<hsa::AQLPacket> construct_packet(const AmdExtTable&) const;
CounterPacketConstruct(const hsa::AgentCache& agent,
const std::vector<counters::Metric>& metrics);
std::unique_ptr<hsa::CounterAQLPacket> construct_packet(const AmdExtTable&);
const counters::Metric* event_to_metric(const hsa_ven_amd_aqlprofile_event_t& event) const;
std::vector<hsa_ven_amd_aqlprofile_event_t> get_all_events() const;
hsa_agent_t hsa_agent() const { return _agent.get_hsa_agent(); }
const std::vector<hsa_ven_amd_aqlprofile_event_t>& get_counter_events(
const counters::Metric&) const;
hsa_agent_t hsa_agent() const { return _agent.get_hsa_agent(); }
private:
const hsa::AgentCache& _agent;
static constexpr size_t MEM_PAGE_ALIGN = 0x1000;
static constexpr size_t MEM_PAGE_MASK = MEM_PAGE_ALIGN - 1;
static size_t getPageAligned(size_t p) { return (p + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; }
protected:
struct AQLProfileMetric
{
counters::Metric metric;
@@ -70,12 +78,26 @@ private:
void can_collect();
const hsa::AgentCache& _agent;
std::vector<AQLProfileMetric> _metrics;
std::vector<hsa_ven_amd_aqlprofile_event_t> _events;
std::map<std::tuple<hsa_ven_amd_aqlprofile_block_name_t, uint32_t, uint32_t>, counters::Metric>
_event_to_metric;
};
class ThreadTraceAQLPacketFactory
{
public:
ThreadTraceAQLPacketFactory(const hsa::AgentCache& agent,
std::shared_ptr<thread_trace_parameters>& params,
const CoreApiTable& coreapi,
const AmdExtTable& ext);
std::unique_ptr<hsa::TraceAQLPacket> construct_packet();
private:
std::shared_ptr<hsa::TraceMemoryPool> tracepool;
std::vector<hsa_ven_amd_aqlprofile_parameter_t> aql_params;
aqlprofile_att_profile_t profile;
};
} // namespace aql
} // namespace rocprofiler
+26 -23
Просмотреть файл
@@ -122,7 +122,7 @@ TEST(aql_profile, construct_packets)
LOG(WARNING) << fmt::format("Found Agent: {}", agent.get_hsa_agent().handle);
auto metrics = rocprofiler::findDeviceMetrics(agent, {"SQ_WAVES"});
ASSERT_EQ(metrics.size(), 1);
AQLPacketConstruct(agent, metrics);
CounterPacketConstruct(agent, metrics);
}
hsa_shut_down();
}
@@ -142,7 +142,7 @@ TEST(aql_profile, too_many_counters)
{
try
{
AQLPacketConstruct(agent, metrics);
CounterPacketConstruct(agent, metrics);
} catch(const std::exception& e)
{
EXPECT_NE(e.what(), nullptr) << e.what();
@@ -163,9 +163,9 @@ TEST(aql_profile, packet_generation_single)
ASSERT_GT(agents.size(), 0);
for(const auto& [_, agent] : agents)
{
auto metrics = rocprofiler::findDeviceMetrics(agent, {"SQ_WAVES"});
AQLPacketConstruct pkt(agent, metrics);
auto test_pkt = pkt.construct_packet(rocprofiler::get_ext_table());
auto metrics = rocprofiler::findDeviceMetrics(agent, {"SQ_WAVES"});
CounterPacketConstruct pkt(agent, metrics);
auto test_pkt = pkt.construct_packet(rocprofiler::get_ext_table());
EXPECT_TRUE(test_pkt);
}
@@ -183,14 +183,30 @@ TEST(aql_profile, packet_generation_multi)
{
auto metrics =
rocprofiler::findDeviceMetrics(agent, {"SQ_WAVES", "TA_FLAT_READ_WAVEFRONTS"});
AQLPacketConstruct pkt(agent, metrics);
auto test_pkt = pkt.construct_packet(rocprofiler::get_ext_table());
CounterPacketConstruct pkt(agent, metrics);
auto test_pkt = pkt.construct_packet(rocprofiler::get_ext_table());
EXPECT_TRUE(test_pkt);
}
hsa_shut_down();
}
class TestAqlPacket : public rocprofiler::hsa::CounterAQLPacket
{
public:
TestAqlPacket(bool mallocd)
: rocprofiler::hsa::CounterAQLPacket([](void* x) -> hsa_status_t {
::free(x);
return HSA_STATUS_SUCCESS;
})
{
this->profile.output_buffer.ptr = malloc(sizeof(double));
this->profile.command_buffer.ptr = malloc(sizeof(double));
this->command_buf_mallocd = mallocd;
this->output_buffer_malloced = mallocd;
}
};
TEST(aql_profile, test_aql_packet)
{
auto check_null = [](auto& val) {
@@ -201,25 +217,12 @@ TEST(aql_profile, test_aql_packet)
val.completion_signal.handle == null_val.completion_signal.handle;
};
rocprofiler::hsa::AQLPacket test_pkt([](void* x) -> hsa_status_t {
::free(x);
return HSA_STATUS_SUCCESS;
});
TestAqlPacket test_pkt(true);
EXPECT_TRUE(check_null(test_pkt.start)) << "Start packet not null";
EXPECT_TRUE(check_null(test_pkt.stop)) << "Stop packet not null";
EXPECT_TRUE(check_null(test_pkt.read)) << "Read packet not null";
// If this leaks, then AQLPacket is not freeing data correctly.
test_pkt.profile.output_buffer.ptr = malloc(sizeof(double));
test_pkt.profile.command_buffer.ptr = malloc(sizeof(double));
test_pkt.command_buf_mallocd = true;
test_pkt.output_buffer_malloced = true;
// test custom destructor as well
rocprofiler::hsa::AQLPacket test_pkt2([](void* x) -> hsa_status_t {
::free(x);
return HSA_STATUS_SUCCESS;
});
test_pkt2.profile.output_buffer.ptr = malloc(sizeof(double));
test_pkt2.profile.command_buffer.ptr = malloc(sizeof(double));
// Why is this valid?
TestAqlPacket test_pkt2(false);
}
+4
Просмотреть файл
@@ -32,6 +32,7 @@
#include "lib/rocprofiler-sdk/buffer.hpp"
#include "lib/rocprofiler-sdk/context/context.hpp"
#include "lib/rocprofiler-sdk/counters/core.hpp"
#include "lib/rocprofiler-sdk/thread_trace/att_core.hpp"
#include <glog/logging.h>
@@ -446,6 +447,7 @@ start_context(rocprofiler_context_id_t context_id)
}
if(cfg->counter_collection) rocprofiler::counters::start_context(cfg);
if(cfg->thread_trace) cfg->thread_trace->start_context();
return ROCPROFILER_STATUS_SUCCESS;
}
@@ -472,6 +474,8 @@ stop_context(rocprofiler_context_id_t idx)
if(_expected->counter_collection)
rocprofiler::counters::stop_context(const_cast<context*>(_expected));
else if(_expected->thread_trace)
_expected->thread_trace->stop_context();
return ROCPROFILER_STATUS_SUCCESS;
}
}
+2
Просмотреть файл
@@ -33,6 +33,7 @@
#include "lib/rocprofiler-sdk/context/domain.hpp"
#include "lib/rocprofiler-sdk/counters/core.hpp"
#include "lib/rocprofiler-sdk/external_correlation.hpp"
#include "lib/rocprofiler-sdk/thread_trace/att_core.hpp"
#include <array>
#include <atomic>
@@ -153,6 +154,7 @@ struct context
std::unique_ptr<callback_tracing_service> callback_tracer = {};
std::unique_ptr<buffer_tracing_service> buffered_tracer = {};
std::unique_ptr<counter_collection_service> counter_collection = {};
std::shared_ptr<rocprofiler::ThreadTracer> thread_trace = {};
};
// set the client index needs to be called before allocate_context()
+2 -2
Просмотреть файл
@@ -218,7 +218,7 @@ counter_callback_info::setup_profile_config(const hsa::AgentCache& age
}
}
profile->pkt_generator = std::make_unique<rocprofiler::aql::AQLPacketConstruct>(
profile->pkt_generator = std::make_unique<rocprofiler::aql::CounterPacketConstruct>(
agent,
std::vector<counters::Metric>{profile->reqired_hw_counters.begin(),
profile->reqired_hw_counters.end()});
@@ -288,7 +288,7 @@ queue_cb(const context::context* ctx,
// Packet generated when no instrumentation is performed. May contain serialization
// packets/barrier packets (and can be empty).
auto no_instrumentation = [&]() {
auto ret_pkt = std::make_unique<rocprofiler::hsa::AQLPacket>(nullptr);
auto ret_pkt = std::make_unique<rocprofiler::hsa::CounterAQLPacket>(nullptr);
// If we have a counter collection context but it is not enabled, we still might need
// to add barrier packets to transition from serialized -> unserialized execution. This
// transition is coordinated by the serializer.
+1 -1
Просмотреть файл
@@ -59,7 +59,7 @@ struct profile_config
std::vector<counters::EvaluateAST> asts{};
rocprofiler_profile_config_id_t id{.handle = 0};
// Packet generator to create AQL packets for insertion
std::unique_ptr<rocprofiler::aql::AQLPacketConstruct> pkt_generator{nullptr};
std::unique_ptr<rocprofiler::aql::CounterPacketConstruct> pkt_generator{nullptr};
// A packet cache of AQL packets. This allows reuse of AQL packets (preventing costly
// allocation of new packets/destruction).
rocprofiler::common::Synchronized<std::vector<std::unique_ptr<rocprofiler::hsa::AQLPacket>>>
+2 -2
Просмотреть файл
@@ -61,8 +61,8 @@ getBlockDimensions(std::string_view agent, const Metric& metric)
{
if(maybe_agent.name() == agent)
{
aql::AQLPacketConstruct pkt_gen(maybe_agent, {metric});
const auto& events = pkt_gen.get_counter_events(metric);
aql::CounterPacketConstruct pkt_gen(maybe_agent, {metric});
const auto& events = pkt_gen.get_counter_events(metric);
for(const auto& event : events)
{
+3 -3
Просмотреть файл
@@ -450,16 +450,16 @@ EvaluateAST::read_special_counters(
}
std::unordered_map<uint64_t, std::vector<rocprofiler_record_counter_t>>
EvaluateAST::read_pkt(const aql::AQLPacketConstruct* pkt_gen, hsa::AQLPacket& pkt)
EvaluateAST::read_pkt(const aql::CounterPacketConstruct* pkt_gen, hsa::AQLPacket& pkt)
{
struct it_data
{
std::unordered_map<uint64_t, std::vector<rocprofiler_record_counter_t>>* data;
const aql::AQLPacketConstruct* pkt_gen;
const aql::CounterPacketConstruct* pkt_gen;
};
std::unordered_map<uint64_t, std::vector<rocprofiler_record_counter_t>> ret;
if(pkt.empty) return ret;
if(pkt.isEmpty()) return ret;
it_data aql_data{.data = &ret, .pkt_gen = pkt_gen};
;
hsa_status_t status = hsa_ven_amd_aqlprofile_iterate_data(
+2 -2
Просмотреть файл
@@ -136,8 +136,8 @@ public:
*
*/
static std::unordered_map<uint64_t, std::vector<rocprofiler_record_counter_t>> read_pkt(
const aql::AQLPacketConstruct* pkt_gen,
hsa::AQLPacket& pkt);
const aql::CounterPacketConstruct* pkt_gen,
hsa::AQLPacket& pkt);
/**
* @brief Insert special counter values, such as constants of the agent (i.e. max waves)
+2 -2
Просмотреть файл
@@ -251,8 +251,8 @@ TEST(dimension, block_dim_test)
}
else
{
aql::AQLPacketConstruct pkt_gen(agent, {metric});
const auto& events = pkt_gen.get_counter_events(metric);
aql::CounterPacketConstruct pkt_gen(agent, {metric});
const auto& events = pkt_gen.get_counter_events(metric);
for(const auto& event : events)
{
std::map<int, uint64_t> dims;
+61 -4
Просмотреть файл
@@ -21,14 +21,22 @@
// THE SOFTWARE.
#include "lib/rocprofiler-sdk/hsa/aql_packet.hpp"
#include <cstdlib>
#include <iostream>
#include <string>
#define CHECK_HSA(fn, message) \
if((fn) != HSA_STATUS_SUCCESS) \
{ \
std::cerr << __FILE__ << ':' << __LINE__ << ' ' << message; \
exit(1); \
}
namespace rocprofiler
{
namespace hsa
{
AQLPacket::~AQLPacket()
CounterAQLPacket::~CounterAQLPacket()
{
if(!profile.command_buffer.ptr)
{
@@ -36,7 +44,7 @@ AQLPacket::~AQLPacket()
}
else if(!command_buf_mallocd)
{
free_func(profile.command_buffer.ptr);
CHECK_HSA(free_func(profile.command_buffer.ptr), "freeing memory");
}
else
{
@@ -49,12 +57,61 @@ AQLPacket::~AQLPacket()
}
else if(!output_buffer_malloced)
{
free_func(profile.output_buffer.ptr);
CHECK_HSA(free_func(profile.output_buffer.ptr), "freeing memory");
}
else
{
::free(profile.output_buffer.ptr);
}
}
TraceAQLPacket::~TraceAQLPacket() = default;
/*
TraceAQLPacket::~TraceAQLPacket()
{
aqlprofile_att_delete_packets(this->handle);
}
*/
hsa_status_t
TraceAQLPacket::Alloc(void** ptr, size_t size, aqlprofile_buffer_desc_flags_t flags, void* data)
{
if(!data) return HSA_STATUS_ERROR;
if(!reinterpret_cast<TraceAQLPacket*>(data)->tracepool) return HSA_STATUS_ERROR;
auto& pool = *reinterpret_cast<TraceAQLPacket*>(data)->tracepool;
if(!pool.allocate_fn || !pool.free_fn || !pool.allow_access_fn) return HSA_STATUS_ERROR;
if(flags.host_access)
{
hsa_status_t status = pool.allocate_fn(pool.cpu_pool_, size, 0, ptr);
if(!flags.device_access || status != HSA_STATUS_SUCCESS) return status;
return pool.allow_access_fn(1, &pool.gpu_agent, nullptr, *ptr);
}
return pool.allocate_fn(pool.gpu_pool_, size, 0, ptr);
}
void
TraceAQLPacket::Free(void* ptr, void* data)
{
auto* pool = reinterpret_cast<TraceAQLPacket*>(data)->tracepool.get();
if(!pool || !pool->free_fn) return;
pool->free_fn(ptr);
}
hsa_status_t
TraceAQLPacket::Copy(void* dst, const void* src, size_t size, void* data)
{
auto* pool = reinterpret_cast<TraceAQLPacket*>(data)->tracepool.get();
if(!pool || !pool->api_copy_fn) return HSA_STATUS_ERROR;
return pool->api_copy_fn(dst, src, size);
}
TraceAQLPacket::TraceAQLPacket(std::shared_ptr<TraceMemoryPool>& _tracepool)
: tracepool(_tracepool){};
} // namespace hsa
} // namespace rocprofiler
+70 -16
Просмотреть файл
@@ -23,6 +23,7 @@
#pragma once
#include "lib/common/container/small_vector.hpp"
#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h"
#include <hsa/hsa_ext_amd.h>
#include <hsa/hsa_ven_amd_aqlprofile.h>
@@ -30,6 +31,12 @@
namespace rocprofiler
{
namespace aql
{
class CounterPacketConstruct;
class ThreadTraceAQLPacketFactory;
} // namespace aql
namespace hsa
{
constexpr hsa_ext_amd_aql_pm4_packet_t null_amd_aql_pm4_packet = {
@@ -41,12 +48,11 @@ constexpr hsa_ext_amd_aql_pm4_packet_t null_amd_aql_pm4_packet = {
* Struct containing AQL packet information. Including start/stop/read
* packets along with allocated buffers
*/
struct AQLPacket
class AQLPacket
{
using memory_pool_free_func_t = decltype(::hsa_amd_memory_pool_free)*;
AQLPacket(memory_pool_free_func_t func);
~AQLPacket();
public:
AQLPacket() = default;
virtual ~AQLPacket() = default;
// Keep move constuctors (i.e. std::move())
AQLPacket(AQLPacket&& other) = default;
@@ -56,20 +62,68 @@ struct AQLPacket
AQLPacket(const AQLPacket&) = delete;
AQLPacket& operator=(const AQLPacket&) = delete;
bool command_buf_mallocd = false;
bool output_buffer_malloced = false;
bool empty = {true};
hsa_ven_amd_aqlprofile_profile_t profile = {};
hsa_ext_amd_aql_pm4_packet_t start = null_amd_aql_pm4_packet;
hsa_ext_amd_aql_pm4_packet_t stop = null_amd_aql_pm4_packet;
hsa_ext_amd_aql_pm4_packet_t read = null_amd_aql_pm4_packet;
memory_pool_free_func_t free_func = nullptr;
hsa_ven_amd_aqlprofile_profile_t profile = {};
hsa_ext_amd_aql_pm4_packet_t start = null_amd_aql_pm4_packet;
hsa_ext_amd_aql_pm4_packet_t stop = null_amd_aql_pm4_packet;
hsa_ext_amd_aql_pm4_packet_t read = null_amd_aql_pm4_packet;
common::container::small_vector<hsa_ext_amd_aql_pm4_packet_t, 3> before_krn_pkt = {};
common::container::small_vector<hsa_ext_amd_aql_pm4_packet_t, 2> after_krn_pkt = {};
bool isEmpty() const { return empty; }
bool empty = true;
};
class CounterAQLPacket : public AQLPacket
{
friend class rocprofiler::aql::CounterPacketConstruct;
using memory_pool_free_func_t = decltype(::hsa_amd_memory_pool_free)*;
public:
CounterAQLPacket(memory_pool_free_func_t func)
: free_func{func} {};
~CounterAQLPacket() override;
protected:
bool command_buf_mallocd = false;
bool output_buffer_malloced = false;
memory_pool_free_func_t free_func = nullptr;
};
struct TraceMemoryPool
{
hsa_agent_t gpu_agent;
hsa_amd_memory_pool_t cpu_pool_;
hsa_amd_memory_pool_t gpu_pool_;
decltype(hsa_amd_memory_pool_allocate)* allocate_fn;
decltype(hsa_amd_agents_allow_access)* allow_access_fn;
decltype(hsa_amd_memory_pool_free)* free_fn;
decltype(hsa_memory_copy)* api_copy_fn;
};
class TraceAQLPacket : public AQLPacket
{
friend class rocprofiler::aql::ThreadTraceAQLPacketFactory;
public:
TraceAQLPacket(std::shared_ptr<TraceMemoryPool>& _tracepool);
TraceMemoryPool& GetPool() const { return *tracepool; }
aqlprofile_handle_t GetHandle() const { return handle; }
uint64_t GetAgent() const { return tracepool->gpu_agent.handle; }
~TraceAQLPacket() override;
protected:
std::shared_ptr<TraceMemoryPool> tracepool;
aqlprofile_att_control_aql_packets_t packets;
aqlprofile_handle_t handle;
static hsa_status_t Alloc(void** ptr,
size_t size,
aqlprofile_buffer_desc_flags_t flags,
void* data);
static void Free(void* ptr, void* data);
static hsa_status_t Copy(void* dst, const void* src, size_t size, void* data);
};
inline AQLPacket::AQLPacket(memory_pool_free_func_t func)
: free_func{func}
{}
} // namespace hsa
} // namespace rocprofiler
+4 -4
Просмотреть файл
@@ -672,10 +672,10 @@ code_object_load_callback(hsa_executable_t executable,
if(_status != HSA_STATUS_SUCCESS) return _status; \
}
auto& loader_table = get_loader_table();
auto code_obj_v = code_object{};
auto& data = code_obj_v.rocp_data;
int _storage_type = ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_NONE;
auto& loader_table = get_loader_table();
auto code_obj_v = code_object{};
auto& data = code_obj_v.rocp_data;
uint32_t _storage_type = ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_NONE;
LOG_IF(FATAL, data.size == 0) << "code object did not properly initialized the size field upon "
"construction (this is likely a compiler bug)";
-2
Просмотреть файл
@@ -342,8 +342,6 @@ WriteInterceptor(const void* packets,
{
hsa_barrier_and_packet_t barrier{};
barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
// barrier.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE;
// barrier.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE;
barrier.header |= 1 << HSA_PACKET_HEADER_BARRIER;
barrier.completion_signal = original_packet.completion_signal;
transformed_packets.emplace_back(barrier);
+24 -1
Просмотреть файл
@@ -90,6 +90,9 @@ constexpr rocprofiler_agent_t default_agent =
void
QueueController::add_queue(hsa_queue_t* id, std::unique_ptr<Queue> queue)
{
for(auto& pre_initialize_fn : pre_initialize)
pre_initialize_fn(queue->get_agent(), get_core_table(), get_ext_table());
CHECK(queue);
_callback_cache.wlock([&](auto& callbacks) {
_queues.wlock([&](auto& map) {
@@ -111,6 +114,11 @@ void
QueueController::destroy_queue(hsa_queue_t* id)
{
if(!id) return;
_queues.wlock([&](auto& map) {
for(auto& deinitialize_fn : pre_deinitialize)
if(map.find(id) != map.end())
deinitialize_fn(map.at(id)->get_agent(), get_core_table(), get_ext_table());
});
const auto* queue = get_queue(*id);
@@ -195,7 +203,8 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table)
{
constexpr auto expected_context_size = 160UL;
static_assert(
sizeof(context::context) == expected_context_size,
sizeof(context::context) ==
expected_context_size + sizeof(std::shared_ptr<rocprofiler::ThreadTracer>),
"If you added a new field to context struct, make sure there is a check here if it "
"requires queue interception. Once you have done so, increment expected_context_size");
@@ -212,6 +221,20 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table)
break;
}
}
else if(itr->thread_trace)
{
enable_intercepter = true;
std::weak_ptr<rocprofiler::ThreadTracer> trace = itr->thread_trace;
pre_initialize.emplace_back(
[trace](const AgentCache& cache, const CoreApiTable& core, const AmdExtTable& ext) {
if(auto locked = trace.lock()) locked->resource_init(cache, core, ext);
});
pre_deinitialize.emplace_back(
[trace](const AgentCache& cache, const CoreApiTable&, const AmdExtTable&) {
if(auto locked = trace.lock()) locked->resource_deinit(cache);
});
break;
}
}
if(enable_intercepter)
+4
Просмотреть файл
@@ -103,6 +103,7 @@ public:
private:
using client_id_map_t = std::unordered_map<ClientID, agent_callback_tuple_t>;
using agent_cache_map_t = std::unordered_map<uint32_t, AgentCache>;
using resource_alloc_t = void(const AgentCache&, const CoreApiTable&, const AmdExtTable&);
CoreApiTable _core_table = {};
AmdExtTable _ext_table = {};
@@ -110,6 +111,9 @@ private:
common::Synchronized<client_id_map_t> _callback_cache = {};
agent_cache_map_t _supported_agents = {};
common::Synchronized<hsa::profiler_serializer> _profiler_serializer;
std::vector<std::function<resource_alloc_t>> pre_initialize;
std::vector<std::function<resource_alloc_t>> pre_deinitialize;
};
QueueController*
+6
Просмотреть файл
@@ -0,0 +1,6 @@
set(ROCPROFILER_LIB_THREAD_TRACE_SOURCES att_core.cpp att_service.cpp)
set(ROCPROFILER_LIB_THREAD_TRACE_HEADERS att_core.hpp)
target_sources(rocprofiler-object-library PRIVATE ${ROCPROFILER_LIB_THREAD_TRACE_SOURCES}
${ROCPROFILER_LIB_THREAD_TRACE_HEADERS})
# if(ROCPROFILER_BUILD_TESTS) add_subdirectory(tests) endif()
+238
Просмотреть файл
@@ -0,0 +1,238 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/intercept_table.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include "lib/common/container/stable_vector.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler-sdk/buffer.hpp"
#include "lib/rocprofiler-sdk/context/context.hpp"
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
#include "lib/rocprofiler-sdk/internal_threading.hpp"
#include "lib/rocprofiler-sdk/registration.hpp"
#include <hsa/hsa_api_trace.h>
#include <atomic>
#include <cstdint>
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>
#define CHECK_HSA(fn, message) \
{ \
auto _status = (fn); \
if(_status != HSA_STATUS_SUCCESS) \
{ \
LOG(ERROR) << "HSA Err: " << _status << '\n'; \
throw std::runtime_error(message); \
} \
}
namespace rocprofiler
{
using AQLPacketOwner = std::unique_ptr<hsa::AQLPacket>;
using inst_pkt_t = common::container::small_vector<std::pair<AQLPacketOwner, int64_t>, 4>;
using corr_id_map_t = hsa::Queue::queue_info_session_t::external_corr_id_map_t;
struct cbdata_t
{
void* tool_userdata;
rocprofiler_att_shader_data_callback_t cb_fn;
std::vector<uint8_t>* memory_space;
};
/**
* Callback we get from HSA interceptor when a kernel packet is being enqueued.
* We return an AQLPacket containing the start/stop/read packets for injection.
*/
AQLPacketOwner
pre_kernel_call(ThreadTracer& tracer,
const hsa::Queue& queue,
const hsa::rocprofiler_packet& kern_pkt,
uint64_t kernel_id,
const corr_id_map_t& extern_corr_ids,
const context::correlation_id* corr_id)
{
(void) extern_corr_ids;
(void) corr_id;
rocprofiler_correlation_id_t temp_corr_id;
temp_corr_id.internal = 0;
temp_corr_id.external.value = 0;
temp_corr_id.external.ptr = nullptr;
auto control_flags = tracer.params->dispatch_cb_fn(queue.get_id(),
queue.get_agent().get_rocp_agent(),
temp_corr_id,
&kern_pkt.kernel_dispatch,
kernel_id,
tracer.params->callback_userdata);
if(control_flags == ROCPROFILER_ATT_CONTROL_NONE) return nullptr;
assert(control_flags == ROCPROFILER_ATT_CONTROL_START_AND_STOP && "Error: Not implemented");
uint64_t agent = queue.get_agent().get_hsa_agent().handle;
std::lock_guard<std::mutex> lk(tracer.trace_resources_mut);
try
{
auto moved = std::move(tracer.resources.at(agent));
tracer.resources.erase(agent);
return moved;
} catch(std::out_of_range& e)
{
LOG(WARNING) << "Attempt to initialize ATT without allocated resources!\n";
return nullptr;
}
}
hsa_status_t
thread_trace_callback(uint32_t shader, void* buffer, uint64_t size, void* callback_data)
{
void* tool_userdata = static_cast<cbdata_t*>(callback_data)->tool_userdata;
auto callback_fn = *static_cast<cbdata_t*>(callback_data)->cb_fn;
std::vector<uint8_t>& cpu_data = *static_cast<cbdata_t*>(callback_data)->memory_space;
// TODO(gbaraldi): Handle parallel callbacks
static std::mutex mut;
std::lock_guard<std::mutex> lk(mut);
if(size > cpu_data.size()) cpu_data.resize(size + cpu_data.size());
auto status = hsa::get_queue_controller()->get_core_table().hsa_memory_copy_fn(
cpu_data.data(), buffer, size);
if(status != HSA_STATUS_SUCCESS)
{
LOG(WARNING) << "Failed to copy hsa memory!";
return HSA_STATUS_SUCCESS;
}
callback_fn(shader, 0, "", cpu_data.data(), size, tool_userdata);
return HSA_STATUS_SUCCESS;
}
void
post_kernel_call(ThreadTracer& tracer, inst_pkt_t& aql)
{
std::vector<uint8_t> cpu_data{};
auto pair = cbdata_t{tracer.params->callback_userdata, tracer.params->shader_cb_fn, &cpu_data};
(void) pair;
for(auto& aql_pkt : aql)
{
auto* pkt = dynamic_cast<hsa::TraceAQLPacket*>(aql_pkt.first.get());
if(!pkt) continue;
// auto status = aqlprofile_att_iterate_data(pkt->GetHandle(), thread_trace_callback,
// &pair); CHECK_HSA(status, "Failed to iterate ATT data");
std::lock_guard<std::mutex> lk(tracer.trace_resources_mut);
if(tracer.agent_active_queues.find(pkt->GetAgent()) != tracer.agent_active_queues.end())
tracer.resources[pkt->GetAgent()] = std::move(aql_pkt.first);
}
}
common::Synchronized<std::optional<int64_t>> client;
void
ThreadTracer::start_context()
{
// Only one thread should be attempting to enable/disable this context
client.wlock([&](auto& client_id) {
if(client_id) return;
client_id = hsa::get_queue_controller()->add_callback(
std::nullopt,
[=](const hsa::Queue& q,
const hsa::rocprofiler_packet& kern_pkt,
uint64_t kernel_id,
rocprofiler_user_data_t* user_data,
const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids,
const context::correlation_id* corr_id) {
(void) user_data;
return pre_kernel_call(*this, q, kern_pkt, kernel_id, extern_corr_ids, corr_id);
},
[=](const hsa::Queue& q,
hsa::rocprofiler_packet kern_pkt,
const hsa::Queue::queue_info_session_t& session,
inst_pkt_t& aql) {
post_kernel_call(*this, aql);
(void) session;
(void) kern_pkt;
(void) q;
});
});
}
void
ThreadTracer::stop_context()
{
client.wlock([&](auto& client_id) {
if(!client_id) return;
// Remove our callbacks from HSA's queue controller
hsa::get_queue_controller()->remove_callback(*client_id);
client_id = std::nullopt;
});
}
void
ThreadTracer::resource_init(const hsa::AgentCache& cache,
const CoreApiTable& coreapi,
const AmdExtTable& ext)
{
uint64_t agent = cache.get_hsa_agent().handle;
std::lock_guard<std::mutex> lk(trace_resources_mut);
if(agent_active_queues.find(agent) != agent_active_queues.end())
{
agent_active_queues.at(agent).fetch_add(1);
return;
}
auto factory = aql::ThreadTraceAQLPacketFactory(cache, this->params, coreapi, ext);
resources[agent] = factory.construct_packet();
agent_active_queues[agent] = 1;
}
void
ThreadTracer::resource_deinit(const hsa::AgentCache& cache)
{
uint64_t agent = cache.get_hsa_agent().handle;
std::lock_guard<std::mutex> lk(trace_resources_mut);
try
{
if(agent_active_queues.at(agent).fetch_add(-1) > 1) return;
} catch(std::out_of_range&)
{}
agent_active_queues.erase(agent);
resources.erase(agent);
}
} // namespace rocprofiler
+75
Просмотреть файл
@@ -0,0 +1,75 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <rocprofiler-sdk/intercept_table.h>
#include <tuple>
#include "include/rocprofiler-sdk/thread_trace.h"
namespace rocprofiler
{
struct thread_trace_parameters
{
rocprofiler_context_id_t context_id;
rocprofiler_att_dispatch_callback_t dispatch_cb_fn;
rocprofiler_att_shader_data_callback_t shader_cb_fn;
void* callback_userdata;
// Parameters
rocprofiler_att_parameter_flag_t flags;
uint64_t buffer_size;
uint8_t target_cu;
uint8_t simd_select;
uint8_t reserved;
uint8_t vmid_mask;
uint16_t perfcounter_mask;
uint8_t perfcounter_ctrl;
uint64_t shader_engine_mask;
// GFX9 Only
std::vector<std::string> perfcounters;
};
namespace hsa
{
class AQLPacket;
};
class ThreadTracer
{
public:
ThreadTracer(std::shared_ptr<thread_trace_parameters>& _params)
: params(_params){};
virtual void start_context();
virtual void stop_context();
virtual void resource_init(const hsa::AgentCache&, const CoreApiTable&, const AmdExtTable&);
virtual void resource_deinit(const hsa::AgentCache&);
virtual ~ThreadTracer() = default;
std::shared_ptr<thread_trace_parameters> params;
std::mutex trace_resources_mut;
std::unordered_map<uint64_t, std::unique_ptr<hsa::AQLPacket>> resources;
std::unordered_map<uint64_t, std::atomic<int>> agent_active_queues;
}; // namespace thread_trace
} // namespace rocprofiler
+81
Просмотреть файл
@@ -0,0 +1,81 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <rocprofiler-sdk/rocprofiler.h>
#include "lib/rocprofiler-sdk/aql/helpers.hpp"
#include "lib/rocprofiler-sdk/context/context.hpp"
#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp"
extern "C" {
/**
* @brief Configure buffered dispatch profile Counting Service.
* Collects the counters in dispatch packets and stores them
* in buffer_id. The buffer may contain packets from more than
* one dispatch (denoted by correlation id). Will trigger the
* callback based on the parameters setup in buffer_id_t.
*
* @param [in] context_id context id
* @param [in] buffer_id id of the buffer to use for the counting service
* @param [in] profile profile config to use for dispatch
* @return ::rocprofiler_status_t
*/
rocprofiler_status_t ROCPROFILER_API
rocprofiler_configure_thread_trace_service(rocprofiler_context_id_t context_id,
rocprofiler_att_parameters_t parameters,
rocprofiler_att_dispatch_callback_t dispatch_callback,
rocprofiler_att_shader_data_callback_t shader_callback,
void* callback_userdata)
{
auto* ctx = rocprofiler::context::get_mutable_registered_context(context_id);
if(!ctx) return ROCPROFILER_STATUS_ERROR_CONTEXT_NOT_STARTED;
if(ctx->thread_trace) return ROCPROFILER_STATUS_ERROR_SERVICE_ALREADY_CONFIGURED;
if(parameters.flags.raw != 0) return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED;
auto thread_tracer = std::make_shared<rocprofiler::thread_trace_parameters>();
thread_tracer->context_id = context_id;
thread_tracer->dispatch_cb_fn = dispatch_callback;
thread_tracer->shader_cb_fn = shader_callback;
thread_tracer->callback_userdata = callback_userdata;
thread_tracer->flags = parameters.flags;
thread_tracer->buffer_size = parameters.buffer_size;
thread_tracer->target_cu = parameters.target_cu;
thread_tracer->simd_select = parameters.simd_select;
thread_tracer->vmid_mask = parameters.vmid_mask;
thread_tracer->perfcounter_mask = parameters.perfcounter_mask;
thread_tracer->perfcounter_ctrl = parameters.perfcounter_ctrl;
for(int i = 0; i < parameters.perfcounter_num; i++)
thread_tracer->perfcounters.emplace_back(parameters.perfcounter[i]);
thread_tracer->shader_engine_mask = 0;
for(int i = 0; i < parameters.shader_num; i++)
thread_tracer->shader_engine_mask |= 1ul << parameters.shader_ids[i];
ctx->thread_trace = std::make_shared<rocprofiler::ThreadTracer>(thread_tracer);
return ROCPROFILER_STATUS_SUCCESS;
}
}
+2 -3
Просмотреть файл
@@ -127,9 +127,8 @@ run(int tid, int devid)
{
auto roctx_range_id = roctxRangeStart("run");
constexpr int min_sa = 8;
constexpr int min_avail_simd = 24;
dim3 grid(min_sa * min_avail_simd);
constexpr int min_avail_simd = 128;
dim3 grid(min_avail_simd);
dim3 block(32);
double time = 0.0;
hipStream_t stream = {};